pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 0b16990609dd8aa6abdd13f01a98bdc5e8d3ddee
parent 64a99096e32686e3faac7c6e48cbd11d9db1dd4b
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Sun,  1 Feb 2015 16:04:07 -0700

product_scraper: simplify and improve logging

Move the thumbnail scraping into its own function, mostly to reduce
indentation. Add logging for search results pre-scraping. Simplify, but kind of
break, error printing; this will have to be fixed at a later date.

Diffstat:
Mproduct_scraper.pl | 218++++++++++++++++++++++++++++++++++++++++---------------------------------------
1 file changed, 110 insertions(+), 108 deletions(-)

diff --git a/product_scraper.pl b/product_scraper.pl @@ -31,6 +31,8 @@ $dbh->do("create table if not exists products(" . "last_seen int, " . "last_scraped int)") or die $DBI::errstr; +# $dbh->do("create table if not exists scrapes"); + # # Memory Express # @@ -52,10 +54,10 @@ my $update_sth = $dbh->prepare($sql); my $summary .= "type scraped total new errors time (s)\n"; $summary .= "----------- ------- ----- --- ------ --------\n"; -my ($new_products, $errors); - +my $new_products; while (my ($type, $name) = each %product_map) { - print "Enumerating $type\n"; + my $info_hdr = "info: $type"; + print "$info_hdr\n"; # this returns a search results page, link found through trial and error my $class_url = "http://www.memoryexpress.com/Category/" . @@ -65,10 +67,12 @@ while (my ($type, $name) = each %product_map) { my $dom = get_dom($class_url . "1", $ua, $args{v}); next if (!defined $dom); + my $pager_hdr = "$info_hdr: .AJAX_List_Pager"; + # extract the first of two pager widgets on the page my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array(); next if (!defined $pager_html); - print "info: .AJAX_List_Pager found\n" if ($args{v}); + print "$pager_hdr found\n" if ($args{v}); # find how many pages of results we have, each page is one <li> element my $pager = HTML::Grabber->new(html => $pager_html); @@ -77,135 +81,56 @@ while (my ($type, $name) = each %product_map) { # if more than 1 <li> is found, one <li> is always a "next" arrow $pages-- if ($pages > 1); - print "info: .AJAX_List_Pager: $pages pages\n" if ($args{v}); + print "$pager_hdr: $pages pages\n" if ($args{v}); # loop over results pages and append product thumbnails my @thumbnails; for (1..$pages) { # slow this down a bit - sleep int(rand(5)); + my $sleep = int(rand(5)); + print "$pager_hdr: $_/$pages: $sleep s wait\n" if ($args{v}); + sleep $sleep; $dom = get_dom($class_url . 
"$_", $ua, $args{v}); next if (!defined $dom); # each product thumbnail has class=PIV_Regular - push @thumbnails, $dom->find(".PIV_Regular")->html_array(); - - next if ($args{t}); + my @temp_thumbs = $dom->find(".PIV_Regular")->html_array(); + printf "$pager_hdr: $_/$pages: %i thumbs found\n", scalar @temp_thumbs if ($args{v}); + push @thumbnails, @temp_thumbs; } my $total = scalar @thumbnails; - print "info: found $total $type, scraping individually\n" if ($args{v}); + print "$info_hdr: $total total\n" if ($args{v}); # extract part number, brand, and description - my ($new, $old, $start, $i) = (0, 0, time, 0); + my ($new, $old, $errors, $start, $i) = (0, 0, 0, time, 0); for my $thumbnail_html (@thumbnails) { $i++; - my $hdr = "$type: $i/$total"; - - my $sleep = int(rand(20)); - print "info: $hdr ($sleep s wait)\n" if ($args{v}); - sleep $sleep; - - # make new html grabber instance with the thumbnail html - my $thumbnail_dom = HTML::Grabber->new(html => $thumbnail_html); - - # has to be found otherwise we can't do anything - my $product_id = get_tag_text($thumbnail_dom, ".ProductId"); - if (!defined $product_id) { - print "error: $hdr: .ProductId not found\n"; - next; - } - else { - print "info: $hdr: .ProductId = $product_id\n" if ($args{v}); - } - - # visit the extended description page - my $product_url = "http://www.memoryexpress.com/Products/"; - my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}); - # the part number is inside of id=ProductAdd always - my $part_num = get_tag_text($product_dom, "#ProductAdd"); - if (!defined $part_num) { - print "error: $hdr: #ProductAdd not found\n"; + my $ret = scrape_thumbnail($type, $thumbnail_html, "$i/$total"); + if (!defined $ret) { + $errors++; + last if ($args{t}); next; } + $ret ? 
$new++ : $old++; - # extract the part number, always is text inside of the tag - ($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/); - if (!defined $part_num || $part_num eq "") { - print "error: $hdr: part num regex failed\n"; - next; - } - else { - print "info: $hdr: part_num = $part_num\n" if ($args{v}); - } - - # extract the product tile - my $desc = get_tag_text($thumbnail_dom, ".ProductTitle"); - if (!defined $desc) { - print "error: $hdr: .ProductTitle was not found.\n"; - next; - } - else { - my $tmp_desc = $desc; - if (length($tmp_desc) > 35) { - $tmp_desc = substr($tmp_desc, 0, 40) . "..."; - } - print "info: $hdr: .ProductTitle = $tmp_desc\n" if ($args{v}); - } - - # extract the brand, sometimes shows up as text - my $brand = $thumbnail_dom->find(".ProductBrand")->text(); - if ($brand eq "") { - print "info: $hdr: .ProductBrand not text\n" if ($args{v}); - # and sometimes shows up inside the tag attributes - $brand = $thumbnail_dom->find(".ProductBrand")->html(); - ($brand) = ($brand =~ m/Brand: ([A-Za-z]+)/); - } - if (!defined $brand || $brand eq "") { - print "error: $hdr: .ProductBrand not found, html:\n"; - print "$thumbnail_html\n"; - next; - } - else { - print "info: $hdr: .ProductBrand = $brand\n" if ($args{v}); - } - - # use existence of part_num to decide on update or insert new - my $sql = "select * from products where part_num = ?"; - if ($dbh->selectrow_arrayref($sql, undef, $part_num)) { - # update - $update_sth->execute(time, $part_num); - print "info: $hdr: db updated\n" if ($args{v}); - $old++; - } - else { - # insert new - $insert_sth->execute($part_num, $brand, $desc, - $type, time, time, 0); - print "info: $hdr: db inserted\n" if ($args{v}); - $new_products .= "$brand $desc ($part_num)\n"; - $new++; - } last if ($args{t}); } $summary .= sprintf("%-11s %7s %5s %3s %6s %8s\n", $type, $new + $old, - $total, $new, $total - ($new + $old), time - $start); - print "\n" if ($args{v}); + $total, $new, $errors, time - $start); } 
$dbh->disconnect(); -my $mail; -$mail .= "$vendor\n"; +my $mail = "$vendor\n"; $mail .= "=" for (1..length $vendor); $mail .= "\n\n"; $mail .= "$summary\n" if ($summary); $mail .= "$new_products\n" if ($new_products); -$mail .= $errors if ($errors); my $email = Email::Simple->create( header => [ @@ -218,26 +143,103 @@ my $email = Email::Simple->create( if ($args{v}) { print $email->as_string(); + exit 0; } -else { - my $sender = Email::Send->new({mailer => 'SMTP'}); - $sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]); - $sender->send($email->as_string()) || print "Couldn't send email\n"; + +my $sender = Email::Send->new({mailer => 'SMTP'}); +$sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]); +$sender->send($email->as_string()) || print "Couldn't send email\n"; + +sub scrape_thumbnail +{ + my $type = shift; + my $html = shift; + my $count = shift; + + my $error_hdr = "error: $type: $count"; + my $info_hdr = "info: $type: $count"; + + my $sleep = int(rand(20)); + printf "$info_hdr (%ss wait)\n", $sleep if ($args{v}); + sleep $sleep; + + # make new html grabber instance with the thumbnail html + my $dom = HTML::Grabber->new(html => $html); + + # has to be found otherwise we can't do anything + my $product_id = get_tag_text($dom, ".ProductId", $error_hdr); + return undef unless defined $product_id; + + # visit the extended description page + my $product_url = "http://www.memoryexpress.com/Products/"; + my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}); + + # the part number is inside of id=ProductAdd always + my $part_num = get_tag_text($product_dom, "#ProductAdd", $error_hdr); + return undef unless defined $part_num; + + # extract the part number, always is text inside of the tag + ($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/); + if (!defined $part_num) { + print "$error_hdr: part num regex failed\n"; + return undef; + } + + # extract the product description + my $desc = get_tag_text($dom, ".ProductTitle", $error_hdr); + return 
undef unless defined $desc; + + # extract the brand, sometimes shows up as text + my $brand = $dom->find(".ProductBrand")->text(); + if ($brand eq "") { + # and sometimes shows up inside the tag attributes + $brand = $dom->find(".ProductBrand")->html(); + ($brand) = ($brand =~ m/Brand: ([A-Za-z]+)/); + } + if (!defined $brand || $brand eq "") { + print "$error_hdr: .ProductBrand not found, html was:\n"; + print "$html\n"; + return undef; + } + + my $tmp_desc = $desc; + if (length($tmp_desc) > 50) { + $tmp_desc = substr($tmp_desc, 0, 50) . "..."; + } + print "$info_hdr: $brand $part_num\n" if ($args{v}); + print "$info_hdr: $tmp_desc\n" if ($args{v}); + + # use existence of part_num to decide on update or insert new + my $sql = "select * from products where part_num = ?"; + if ($dbh->selectrow_arrayref($sql, undef, $part_num)) { + # update + $update_sth->execute(time, $part_num); + print "$info_hdr: db updated\n" if ($args{v}); + return 0; + } + else { + # insert new + $insert_sth->execute($part_num, $brand, $desc, $type, time, + time, 0); + print "$info_hdr: db inserted\n" if ($args{v}); + $new_products .= "$brand $desc ($part_num)\n"; + return 1; + } + # $scrapes_sth->execute("thumbnail", time - $start, time); } sub get_tag_text { my $dom = shift; my $tag = shift; + my $error_hdr = shift; my $field = $dom->find($tag)->text(); if (!defined $field || $field eq "") { - $errors .= "error: could not find $tag, html was:\n"; - $errors .= $dom->html(); - $errors .= "\n\n"; - print $errors if ($args{v}); - + print "$error_hdr: $tag not found or empty, html was:\n"; + print $dom->html() . "\n"; return undef; } + return $field; }