pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 6012f05c4d5c900f0d9c84c35c71da1a6dc23b13
parent 7c66858a5de2f8f277a5c66aa5321ddce0210f3c
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Thu, 12 Mar 2015 01:27:44 -0600

product_scraper: lift db insert/update into its own function

Add sleep_rand helper, and fix annoying SIGBUS error by setting the statement
handles to undefined before disconnecting from the db.

Diffstat:
M product_scraper.pl | 121 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
1 file changed, 70 insertions(+), 51 deletions(-)

diff --git a/product_scraper.pl b/product_scraper.pl @@ -4,7 +4,6 @@ use strict; use warnings; use Config::Grammar; -use DBI; use Email::Simple; use Email::Send; use Getopt::Std; @@ -60,6 +59,8 @@ while (my ($type, $name) = each %product_map) { mem_exp_scrape_class($type, $name); } +$update_sth = undef; +$insert_sth = undef; $dbh->disconnect(); send_email($mail, $args{v}); @@ -72,9 +73,59 @@ sub mem_exp_scrape_class { my $type = shift; my $name = shift; - my $info_hdr = "info: $type"; - print "$info_hdr\n" if ($args{v}); + + my $thumbnails = mem_exp_get_thumbnails($name, $info_hdr); + return undef unless defined $thumbnails; + + my $total = scalar @$thumbnails; + print "$info_hdr: $total total\n" if ($args{v}); + + # extract and store part number, brand, and description + my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0); + for my $thumbnail_html (@$thumbnails) { + $i++; + my $thumb_hdr = "$info_hdr: $i/$total"; + + # look less suspicious + sleep_rand($thumb_hdr, 20); + + # attempt to extract information from thumbnail html + my ($brand, $part_num, $desc) = + mem_exp_scrape_thumbnail("$type: $i/$total", $thumbnail_html); + unless (defined $brand && defined $part_num && defined $desc) { + $err++; + next; + } + + # extraction looks good, insert or update the database + $sql = "select * from products where part_num = ?"; + if ($dbh->selectrow_arrayref($sql, undef, $part_num)) { + # also check description and manufacturer are consistent? 
+ $update_sth->execute(time, $part_num) or die $dbh->errstr(); + print "$thumb_hdr: updated db\n" if ($args{v}); + $old++; + } + else { + $insert_sth->execute($part_num, $brand, $desc, $type, + time, time, 0) or die $dbh->errstr(); + print "$thumb_hdr: inserted into db\n" if ($args{v}); + $new++; + } + } + + my $ok = $new + $old; + $mail .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type, + "$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start); +} + +# +# get all thumbnails from generic unfiltered search page +# +sub mem_exp_get_thumbnails +{ + my $name = shift; + my $info_hdr = shift; # this returns a search results page, link found through trial and error my $class_url = "http://www.memoryexpress.com/Category/" . @@ -104,17 +155,17 @@ sub mem_exp_scrape_class my @thumbnails; for (1..$pages) { my $page_hdr = "$pager_hdr: $_/$pages"; + sleep_rand($page_hdr, 5); - # slow this down a bit - my $sleep = int(rand(5)); - printf "$page_hdr: (%is wait)\n", $sleep if ($args{v}); - sleep $sleep unless ($args{t}); - + # get a search pages dom $dom = get_dom($class_url . 
"$_", $ua, $args{v}); next if (!defined $dom); # each product thumbnail has class=PIV_Regular my @temp_thumbs = $dom->find(".PIV_Regular")->html_array(); + if ($args{t}) { + @temp_thumbs = ($temp_thumbs[0]); + } my $num_thumbs = scalar @temp_thumbs; print "$page_hdr: $num_thumbs thumbs found\n" if ($args{v}); push @thumbnails, @temp_thumbs; @@ -122,49 +173,7 @@ sub mem_exp_scrape_class last if ($args{t}); } - my $total = scalar @thumbnails; - print "$info_hdr: $total total\n" if ($args{v}); - - # extract and store part number, brand, and description - my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0); - for my $thumbnail_html (@thumbnails) { - $i++; - my $thumb_hdr = "$info_hdr: $i/$total"; - - # look less suspicious - my $sleep = int(rand(20)); - printf "$thumb_hdr (%ss wait)\n", $sleep if ($args{v}); - sleep $sleep unless ($args{t}); - - # attempt to extract information from thumbnail_html - my ($brand, $part_num, $desc) = - mem_exp_scrape_thumbnail("$type: $i/$total", $thumbnail_html); - if (!defined $brand) { - $err++; - next; - } - - # extraction looks good, insert or update the database - my $sql = "select * from products where part_num = ?"; - if ($dbh->selectrow_arrayref($sql, undef, $part_num)) { - # also check description and manufacturer are consistent? 
- $update_sth->execute(time, $part_num); - print "$thumb_hdr: updated db\n" if ($args{v}); - $old++; - } - else { - $insert_sth->execute($part_num, $brand, $desc, $type, - time, time, 0); - print "$thumb_hdr: inserted into db\n" if ($args{v}); - $new++; - } - - last if ($args{t}); - } - - my $ok = $new + $old; - $mail .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type, - "$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start); + return \@thumbnails; } # @@ -270,3 +279,13 @@ sub send_email $sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]); $sender->send($email->as_string()) || print "Couldn't send email\n"; } + +sub sleep_rand +{ + my $header = shift; + my $upper_limit = shift || 0; + + my $sleep = int(rand($upper_limit)); + printf "$header: (%ss wait)\n", $sleep if ($args{v}); + sleep $sleep unless ($args{t}); +}