pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 80005ea368ebe99d2bfb19900d69b81e4a74f4d4
parent 40e1479ff5d396c027259692510e8993632e4b6c
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Sun,  8 Feb 2015 21:34:12 -0700

product_scraper: shuffle code around, simplify

Move the db insert/update check out of the scrape_thumbnail function and up
into the main per-thumbnail loop. Move the sleep up as well. New email summary output format.

Diffstat:
M product_scraper.pl | 90 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
1 file changed, 51 insertions(+), 39 deletions(-)

diff --git a/product_scraper.pl b/product_scraper.pl @@ -51,8 +51,8 @@ my $insert_sth = $dbh->prepare($sql); $sql = "update products set last_seen = ? where part_num = ?"; my $update_sth = $dbh->prepare($sql); -my $summary .= "type scraped total new errors time (s)\n"; -$summary .= "----------- ------- ----- --- ------ --------\n"; +my $summary .= "type ok percent errors new duration\n"; +$summary .= "--------------- ------- ------- ------ --- --------\n"; my $new_products; while (my ($type, $name) = each %product_map) { @@ -89,7 +89,7 @@ while (my ($type, $name) = each %product_map) { # slow this down a bit my $sleep = int(rand(5)); print "$pager_hdr: $_/$pages: $sleep s wait\n" if ($args{v}); - sleep $sleep; + sleep $sleep unless ($args{t}); $dom = get_dom($class_url . "$_", $ua, $args{v}); next if (!defined $dom); @@ -98,29 +98,54 @@ while (my ($type, $name) = each %product_map) { my @temp_thumbs = $dom->find(".PIV_Regular")->html_array(); printf "$pager_hdr: $_/$pages: %i thumbs found\n", scalar @temp_thumbs if ($args{v}); push @thumbnails, @temp_thumbs; + + last if ($args{t}); } my $total = scalar @thumbnails; print "$info_hdr: $total total\n" if ($args{v}); - # extract part number, brand, and description - my ($new, $old, $errors, $start, $i) = (0, 0, 0, time, 0); + # extract and store part number, brand, and description + my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0); for my $thumbnail_html (@thumbnails) { $i++; - - my $ret = scrape_thumbnail($type, $thumbnail_html, "$i/$total"); - if (!defined $ret) { - $errors++; - last if ($args{t}); + my $thumb_hdr = "$info_hdr: $i/$total"; + + # look less suspicious + my $sleep = int(rand(20)); + printf "$thumb_hdr (%ss wait)\n", $sleep if ($args{v}); + sleep $sleep unless ($args{t}); + + # attempt to extract information from thumbnail_html + my ($brand, $part_num, $desc, $tmp_desc) = + scrape_thumbnail("$type: $i/$total", $thumbnail_html); + if (!defined $brand) { + $err++; next; } - $ret ? 
$new++ : $old++; + + # extraction looks good, insert or update the database + my $sql = "select * from products where part_num = ?"; + if ($dbh->selectrow_arrayref($sql, undef, $part_num)) { + # also check description and manufacturer are consistent? + $update_sth->execute(time, $part_num); + print "$thumb_hdr: db updated\n" if ($args{v}); + $old++; + } + else { + $insert_sth->execute($part_num, $brand, $desc, $type, + time, time, 0); + print "$thumb_hdr db inserted\n" if ($args{v}); + $new_products .= "$brand $part_num: $tmp_desc\n"; + $new++; + } last if ($args{t}); } - $summary .= sprintf("%-11s %7s %5s %3s %6s %8s\n", $type, $new + $old, - $total, $new, $errors, time - $start); + my $ok = $new + $old; + $summary .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type, + "$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start); } $dbh->disconnect(); @@ -146,22 +171,22 @@ if ($args{v}) { exit 0; } -my $sender = Email::Send->new({mailer => 'SMTP'}); +my $sender = Email::Send->new({mailer => "SMTP"}); $sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]); $sender->send($email->as_string()) || print "Couldn't send email\n"; + +# +# this checks the input html for 3 things, part num, manufacturer, and +# description. if any of these aren't found, fail. 
+# sub scrape_thumbnail { - my $type = shift; + my $thumb_hdr = shift; my $html = shift; - my $count = shift; - my $error_hdr = "error: $type: $count"; - my $info_hdr = "info: $type: $count"; - - my $sleep = int(rand(20)); - printf "$info_hdr (%ss wait)\n", $sleep if ($args{v}); - sleep $sleep; + my $error_hdr = "error: $thumb_hdr"; + my $info_hdr = "info: $thumb_hdr"; # make new html grabber instance with the thumbnail html my $dom = HTML::Grabber->new(html => $html); @@ -209,25 +234,12 @@ sub scrape_thumbnail print "$info_hdr: $brand $part_num\n" if ($args{v}); print "$info_hdr: $tmp_desc\n" if ($args{v}); - # use existence of part_num to decide on update or insert new - my $sql = "select * from products where part_num = ?"; - if ($dbh->selectrow_arrayref($sql, undef, $part_num)) { - # update - $update_sth->execute(time, $part_num); - print "$info_hdr: db updated\n" if ($args{v}); - return 0; - } - else { - # insert new - $insert_sth->execute($part_num, $brand, $desc, $type, time, - time, 0); - print "$info_hdr: db inserted\n" if ($args{v}); - $new_products .= "$brand $part_num: $tmp_desc\n"; - return 1; - } - # $scrapes_sth->execute("thumbnail", time - $start, time); + return ($brand, $part_num, $desc, $tmp_desc); } +# +# unwrap the plain text inside of an html tag +# sub get_tag_text { my $dom = shift;