commit 80005ea368ebe99d2bfb19900d69b81e4a74f4d4
parent 40e1479ff5d396c027259692510e8993632e4b6c
Author: Kyle Milz <kyle@getaddrinfo.net>
Date: Sun, 8 Feb 2015 21:34:12 -0700
product_scraper: shuffle code around, simplify
move the db insert/update check up out of the scrape_thumbnail function and
into the calling loop. move the per-thumbnail sleep up as well. new email output.
Diffstat:
M | product_scraper.pl | | | 90 | +++++++++++++++++++++++++++++++++++++++++++++---------------------------------- |
1 file changed, 51 insertions(+), 39 deletions(-)
diff --git a/product_scraper.pl b/product_scraper.pl
@@ -51,8 +51,8 @@ my $insert_sth = $dbh->prepare($sql);
$sql = "update products set last_seen = ? where part_num = ?";
my $update_sth = $dbh->prepare($sql);
-my $summary .= "type scraped total new errors time (s)\n";
-$summary .= "----------- ------- ----- --- ------ --------\n";
+my $summary .= "type ok percent errors new duration\n";
+$summary .= "--------------- ------- ------- ------ --- --------\n";
my $new_products;
while (my ($type, $name) = each %product_map) {
@@ -89,7 +89,7 @@ while (my ($type, $name) = each %product_map) {
# slow this down a bit
my $sleep = int(rand(5));
print "$pager_hdr: $_/$pages: $sleep s wait\n" if ($args{v});
- sleep $sleep;
+ sleep $sleep unless ($args{t});
$dom = get_dom($class_url . "$_", $ua, $args{v});
next if (!defined $dom);
@@ -98,29 +98,54 @@ while (my ($type, $name) = each %product_map) {
my @temp_thumbs = $dom->find(".PIV_Regular")->html_array();
printf "$pager_hdr: $_/$pages: %i thumbs found\n", scalar @temp_thumbs if ($args{v});
push @thumbnails, @temp_thumbs;
+
+ last if ($args{t});
}
my $total = scalar @thumbnails;
print "$info_hdr: $total total\n" if ($args{v});
- # extract part number, brand, and description
- my ($new, $old, $errors, $start, $i) = (0, 0, 0, time, 0);
+ # extract and store part number, brand, and description
+ my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0);
for my $thumbnail_html (@thumbnails) {
$i++;
-
- my $ret = scrape_thumbnail($type, $thumbnail_html, "$i/$total");
- if (!defined $ret) {
- $errors++;
- last if ($args{t});
+ my $thumb_hdr = "$info_hdr: $i/$total";
+
+ # look less suspicious
+ my $sleep = int(rand(20));
+ printf "$thumb_hdr (%ss wait)\n", $sleep if ($args{v});
+ sleep $sleep unless ($args{t});
+
+ # attempt to extract information from thumbnail_html
+ my ($brand, $part_num, $desc, $tmp_desc) =
+ scrape_thumbnail("$type: $i/$total", $thumbnail_html);
+ if (!defined $brand) {
+ $err++;
next;
}
- $ret ? $new++ : $old++;
+
+ # extraction looks good, insert or update the database
+ my $sql = "select * from products where part_num = ?";
+ if ($dbh->selectrow_arrayref($sql, undef, $part_num)) {
+ # also check description and manufacturer are consistent?
+ $update_sth->execute(time, $part_num);
+ print "$thumb_hdr: db updated\n" if ($args{v});
+ $old++;
+ }
+ else {
+ $insert_sth->execute($part_num, $brand, $desc, $type,
+ time, time, 0);
+ print "$thumb_hdr db inserted\n" if ($args{v});
+ $new_products .= "$brand $part_num: $tmp_desc\n";
+ $new++;
+ }
last if ($args{t});
}
- $summary .= sprintf("%-11s %7s %5s %3s %6s %8s\n", $type, $new + $old,
- $total, $new, $errors, time - $start);
+ my $ok = $new + $old;
+ $summary .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type,
+ "$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start);
}
$dbh->disconnect();
@@ -146,22 +171,22 @@ if ($args{v}) {
exit 0;
}
-my $sender = Email::Send->new({mailer => 'SMTP'});
+my $sender = Email::Send->new({mailer => "SMTP"});
$sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]);
$sender->send($email->as_string()) || print "Couldn't send email\n";
+
+#
+# this checks the input html for 3 things: part number, manufacturer, and
+# description. if any one of these isn't found, the scrape fails.
+#
sub scrape_thumbnail
{
- my $type = shift;
+ my $thumb_hdr = shift;
my $html = shift;
- my $count = shift;
- my $error_hdr = "error: $type: $count";
- my $info_hdr = "info: $type: $count";
-
- my $sleep = int(rand(20));
- printf "$info_hdr (%ss wait)\n", $sleep if ($args{v});
- sleep $sleep;
+ my $error_hdr = "error: $thumb_hdr";
+ my $info_hdr = "info: $thumb_hdr";
# make new html grabber instance with the thumbnail html
my $dom = HTML::Grabber->new(html => $html);
@@ -209,25 +234,12 @@ sub scrape_thumbnail
print "$info_hdr: $brand $part_num\n" if ($args{v});
print "$info_hdr: $tmp_desc\n" if ($args{v});
- # use existence of part_num to decide on update or insert new
- my $sql = "select * from products where part_num = ?";
- if ($dbh->selectrow_arrayref($sql, undef, $part_num)) {
- # update
- $update_sth->execute(time, $part_num);
- print "$info_hdr: db updated\n" if ($args{v});
- return 0;
- }
- else {
- # insert new
- $insert_sth->execute($part_num, $brand, $desc, $type, time,
- time, 0);
- print "$info_hdr: db inserted\n" if ($args{v});
- $new_products .= "$brand $part_num: $tmp_desc\n";
- return 1;
- }
- # $scrapes_sth->execute("thumbnail", time - $start, time);
+ return ($brand, $part_num, $desc, $tmp_desc);
}
+#
+# unwrap the plain text inside of an html tag
+#
sub get_tag_text
{
my $dom = shift;