pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit b142f5f806fa976b8852d3ba23b8a042dbca9771
parent 459ca65b91eb3fcad6aeabe0c0c3c8199eccfb07
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Sun,  4 Jan 2015 02:49:17 -0700

product_scraper: document scraping technique

Clean up and document scraping technique for Memory Express. While here, change
shared module name to not use a subdirectory.

Diffstat:
APriceChart.pm | 76++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
DShared.pm | 73-------------------------------------------------------------------------
Mgen_index.pl | 2+-
Mgen_svg.pl | 2+-
Mprice_scraper.pl | 5++---
Mproduct_scraper.pl | 69++++++++++++++++++++++++++++++++++++---------------------------------
Msearch.pl | 2+-
7 files changed, 117 insertions(+), 112 deletions(-)

diff --git a/PriceChart.pm b/PriceChart.pm @@ -0,0 +1,76 @@ +package PriceChart; + +use DBI; +use Exporter; +use POSIX; + +@ISA = ("Exporter"); +@EXPORT = qw(get_config get_dom get_ua get_log get_dbh); + + +sub get_config +{ + my $parser = Config::Grammar->new({ + _vars => [ + 'user_agent', + 'email', + 'smtp', + ], + }); + my $cfg_file = "/etc/pricechart.cfg"; + return $parser->parse($cfg_file) or die "error: $parser->{err}\n"; +} + +sub get_dbh +{ + my $db_dir = "/var/www/db"; + mkdir $db_dir; + + my $dbh = DBI->connect( + "dbi:SQLite:dbname=$db_dir/pricechart.db", + "", + "", + { RaiseError => 1 } + ) or die $DBI::errstr; + return $dbh; +} + +sub get_dom +{ + my $url = shift; + my $ua = shift; + my $verbose = shift; + + my $resp = $ua->get($url); + if ($resp->is_success) { + print "GET $url " . $resp->status_line . "\n" if ($verbose); + return HTML::Grabber->new(html => $resp->decoded_content); + } + + print "GET $url " . $resp->status_line . "\n"; + return undef; +} + +sub get_ua +{ + my $cfg = shift; + + my $ua = LWP::UserAgent->new(agent => $cfg->{user_agent}); + $ua->default_header("Accept" => "*/*"); + + return $ua; +} + +sub get_log +{ + my $file = shift; + my $log_dir = "/var/www/logs/pricechart"; + + mkdir $log_dir; + open my $log, ">>", "$log_dir/$file.log" || die "$!"; + + print $log strftime "%b %e %Y %H:%M ", localtime; + return $log; +} + +1; diff --git a/Shared.pm b/Shared.pm @@ -1,73 +0,0 @@ -package PriceChart::Shared; - -use DBI; -use Exporter; -use POSIX; - -@ISA = ("Exporter"); -@EXPORT = qw(get_config get_dom get_ua get_log get_dbh); - - -sub get_config -{ - my $parser = Config::Grammar->new({ - _vars => [ - 'user_agent', - 'email', - 'smtp', - ], - }); - my $cfg_file = "/etc/pricechart.cfg"; - return $parser->parse($cfg_file) or die "error: $parser->{err}\n"; -} - -sub get_dbh -{ - my $db_dir = "/var/www/db"; - mkdir $db_dir; - - my $dbh = DBI->connect( - "dbi:SQLite:dbname=$db_dir/pricechart.db", - "", - "", - { RaiseError => 1 } - ) or die $DBI::errstr; - return $dbh; -} - -sub get_dom -{ - my $url = shift; - my $ua = shift; - - my $resp = $ua->get($url); - if (! $resp->is_success) { - print "getting $url failed: " . $resp->status_line . "\n"; - return undef; - } - return HTML::Grabber->new(html => $resp->decoded_content); -} - -sub get_ua -{ - my $cfg = shift; - - my $ua = LWP::UserAgent->new(agent => $cfg->{user_agent}); - $ua->default_header("Accept" => "*/*"); - - return $ua; -} - -sub get_log -{ - my $file = shift; - my $log_dir = "/var/www/logs/pricechart"; - - mkdir $log_dir; - open my $log, ">>", "$log_dir/$file.log" || die "$!"; - - print $log strftime "%b %e %Y %H:%M ", localtime; - return $log; -} - -1; diff --git a/gen_index.pl b/gen_index.pl @@ -4,7 +4,7 @@ use strict; use warnings; use File::Copy; -use PriceChart::Shared; +use PriceChart; use Template; diff --git a/gen_svg.pl b/gen_svg.pl @@ -6,7 +6,7 @@ use warnings; use Getopt::Std; use SVG; use POSIX; -use PriceChart::Shared; +use PriceChart; my %args; diff --git a/price_scraper.pl b/price_scraper.pl @@ -7,7 +7,7 @@ use Config::Grammar; use Getopt::Std; use HTML::Grabber; use LWP::Simple; -use PriceChart::Shared; +use PriceChart; my %args; @@ -59,12 +59,11 @@ while (my ($vendor, $url, $price_tag, $sale_tag) = $vendor_sth->fetchrow_array) my $start = time; print "$vendor:\n" if ($args{v}); - my $dom = get_dom($url . $part_num, $ua); + my $dom = get_dom($url . $part_num, $ua, $args{v}); if (!defined $dom) { msg("e", "error: dom"); next; } - print "\turl GET ok\n" if ($args{v}); my $price = get_price($price_tag, $dom); if ($sale_tag) { diff --git a/product_scraper.pl b/product_scraper.pl @@ -9,7 +9,7 @@ use Email::Send; use Getopt::Std; use HTML::Grabber; use LWP::Simple; -use PriceChart::Shared; +use PriceChart; my %args; @@ -40,57 +40,60 @@ my $vendor = "Memory Express"; my $product_url = "http://www.memoryexpress.com/Products/"; my %product_map = ("televisions" => "Televisions", "laptops" => "LaptopsNotebooks", - "hard_drives" => "HardDrives"); + "hard drives" => "HardDrives"); -my $qry = "insert into products(part_num, manufacturer, description, type, " . +my $sql = "insert into products(part_num, manufacturer, description, type, " . "first_seen, last_seen, last_scraped) values (?, ?, ?, ?, ?, ?, ?)"; -my $insert_sth = $dbh->prepare($qry); +my $insert_sth = $dbh->prepare($sql); # also update description, manufacturer? -$qry = "update products set last_seen = ? where part_num = ?"; -my $update_sth = $dbh->prepare($qry); +$sql = "update products set last_seen = ? where part_num = ?"; +my $update_sth = $dbh->prepare($sql); my $summary .= "type scraped total new errors time (s)\n"; $summary .= "----------- ------- ----- --- ------ --------\n"; -my $new_products; -my $errors; +my ($new_products, $errors); + +while (my ($type, $name) = each %product_map) { + print "Enumerating $type\n"; -for my $type (keys %product_map) { my $class_url = "http://www.memoryexpress.com/Category/" . - "$product_map{$type}?PageSize=120&Page="; - my $dom = get_dom($class_url . "1", $ua); - next if (! defined $dom); - - print "GET " . $class_url . "1 OK\n" if ($args{v}); - - $dom = $dom->find(".AJAX_List_Pager"); - my @elements = $dom->find("li")->html_array(); - my $pages; - if (@elements == 2) { - $pages = 1; - } else { - $pages = (@elements / 2) - 1; - } + "$name?PageSize=40&Page="; + + # Get first page of results + my $dom = get_dom($class_url . "1", $ua, $args{v}); + next if (!defined $dom); - print "$pages pages of products found\n" if ($args{v}); + # Extract the first of two pager widgets on the page + my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array(); + next if (!defined $pager_html); + print "Found .AJAX_List_Pager\n" if ($args{v}); + # Find how many pages of results we have + my $pager = HTML::Grabber->new(html => $pager_html); + my $pages = $pager->find("li")->html_array(); + next unless ($pages); + + # If more than 1 page of results are found, the pager contains a "next" + # arrow that needs to be accounted for + $pages-- if ($pages > 1); + print "Found $pages pages\n" if ($args{v}); + + # Loop over all results pages and append all products my @thumbnails; for (1..$pages) { - $dom = get_dom($class_url . "$_", $ua); - return if (! defined $dom); - - print "GET " . $class_url . "$_ OK\n" if ($args{v}); + $dom = get_dom($class_url . "$_", $ua, $args{v}); + next if (!defined $dom); - # $dom->filter(".AJAX_List_Body"); + # Each product is contained inside of their own PIV_Regular push @thumbnails, $dom->find(".PIV_Regular")->html_array(); } my $total = scalar @thumbnails; - print "\nprocessing $type: ($total)\n" if ($args{v}); + print "Found $total $type\n" if ($args{v}); - my ($new, $old) = (0, 0); - my $start = time; + my ($new, $old, $start) = (0, 0, time); for my $thumbnail_html (@thumbnails) { sleep int(rand(10)); @@ -101,7 +104,7 @@ for my $type (keys %product_map) { next unless (defined $product_id); # get the part number from the product page as early as possible - my $product_dom = get_dom("$product_url$product_id", $ua); + my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}); my $part_num = get_tag_text($product_dom, "#ProductAdd"); next unless (defined $part_num); diff --git a/search.pl b/search.pl @@ -6,7 +6,7 @@ use warnings; use FCGI; use Getopt::Std; use Template; -use PriceChart::Shared; +use PriceChart; use Unix::Syslog qw(:macros :subs); use URI::Escape;