pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit b7fb366a469387a1dd8dc3c0eb9bb7c79894832b
parent 589c8eab646ebb6122a0753e64b03cbbb8892731
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Thu, 29 Jan 2015 01:08:25 -0700

product_scraper: add comments and debugging

Diffstat:
M PriceChart.pm      |  35 ++++++++++++++++++++++++++---------
M product_scraper.pl | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
2 files changed, 112 insertions(+), 45 deletions(-)

diff --git a/PriceChart.pm b/PriceChart.pm @@ -10,11 +10,26 @@ use Exporter; sub get_config { my $parser = Config::Grammar->new({ - _vars => [ - 'user_agent', - 'email', - 'smtp', - ], + _sections => ["general", "vendors"], + general => { + _vars => [ + 'user_agent', + 'email', + 'smtp' + ], + }, + vendors => { + _sections => ["/[A-Za-z ]+/"], + "/[A-Za-z ]+/" => { + _vars => [ + "search_url", + "regular_price_tag", + "sale_price_tag", + "color", + "title" + ] + } + } }); my $cfg_file = "/etc/pricechart.cfg"; return $parser->parse($cfg_file) or die "error: $parser->{err}\n"; @@ -22,7 +37,9 @@ sub get_config sub get_dbh { - my $db_dir = "/var/www/db"; + # XXX: needs to be changed in production! + # my $db_dir = "/var/www/db"; + my $db_dir = "./"; mkdir $db_dir; my $dbh = DBI->connect( @@ -42,10 +59,10 @@ sub get_dom my $resp = $ua->get($url); if ($resp->is_success) { - if (length($url) > 60) { - $url = "..." . substr($url, length($url) - 60); + if (length($url) > 55) { + $url = "..." . substr($url, length($url) - 55); } - print "GET $url " . $resp->status_line . "\n" if ($verbose); + print "info: GET $url " . $resp->status_line . "\n" if ($verbose); return HTML::Grabber->new(html => $resp->decoded_content); } diff --git a/product_scraper.pl b/product_scraper.pl @@ -13,12 +13,12 @@ use PriceChart; my %args; -getopts("v", \%args); +getopts("tv", \%args); $| = 1 if ($args{v}); my $cfg = get_config(); -my $ua = get_ua($cfg); +my $ua = get_ua($cfg->{"general"}); my $dbh = get_dbh(); srand; @@ -36,9 +36,8 @@ $dbh->do("create table if not exists products(" . 
# my $vendor = "Memory Express"; -# use this to look up individual products -my $product_url = "http://www.memoryexpress.com/Products/"; -my %product_map = ("televisions" => "Televisions", +my %product_map = ( + "televisions" => "Televisions", "laptops" => "LaptopsNotebooks", "hard drives" => "HardDrives"); @@ -58,88 +57,138 @@ my ($new_products, $errors); while (my ($type, $name) = each %product_map) { print "Enumerating $type\n"; + # this returns a search results page, link found through trial and error my $class_url = "http://www.memoryexpress.com/Category/" . "$name?PageSize=40&Page="; - # Get first page of results + # get first page of results my $dom = get_dom($class_url . "1", $ua, $args{v}); next if (!defined $dom); - # Extract the first of two pager widgets on the page + # extract the first of two pager widgets on the page my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array(); next if (!defined $pager_html); - print "Found .AJAX_List_Pager\n" if ($args{v}); + print "info: .AJAX_List_Pager found\n" if ($args{v}); - # Find how many pages of results we have + # find how many pages of results we have, each page is one <li> element my $pager = HTML::Grabber->new(html => $pager_html); my $pages = $pager->find("li")->html_array(); next unless ($pages); - # If more than 1 page of results are found, the pager contains a "next" - # arrow that needs to be accounted for + # if more than 1 <li> is found, one <li> is always a "next" arrow $pages-- if ($pages > 1); - print "Found $pages pages\n" if ($args{v}); + print "info: .AJAX_List_Pager: $pages pages\n" if ($args{v}); - # Loop over all results pages and append all products + # loop over results pages and append product thumbnails my @thumbnails; for (1..$pages) { + # slow this down a bit + sleep int(rand(5)); + $dom = get_dom($class_url . 
"$_", $ua, $args{v}); next if (!defined $dom); - # Each product is contained inside of their own PIV_Regular + # each product thumbnail has class=PIV_Regular push @thumbnails, $dom->find(".PIV_Regular")->html_array(); + + next if ($args{t}); } my $total = scalar @thumbnails; - print "Found $total $type\n" if ($args{v}); + print "info: found $total $type, scraping individually\n" if ($args{v}); - my ($new, $old, $start) = (0, 0, time); + # extract part number, brand, and description + my ($new, $old, $start, $i) = (0, 0, time, 0); for my $thumbnail_html (@thumbnails) { - sleep int(rand(10)); + $i++; + my $hdr = "$type: $i/$total"; + + my $sleep = int(rand(20)); + print "info: $hdr ($sleep s wait)\n" if ($args{v}); + sleep $sleep; + # make new html grabber instance with the thumbnail html my $thumbnail_dom = HTML::Grabber->new(html => $thumbnail_html); - # used to visit the actual product page + # has to be found otherwise we can't do anything my $product_id = get_tag_text($thumbnail_dom, ".ProductId"); - next unless (defined $product_id); + if (!defined $product_id) { + print "error: $hdr: .ProductId not found\n"; + next; + } + else { + print "info: $hdr: .ProductId = $product_id\n" if ($args{v}); + } - # get the part number from the product page as early as possible + # visit the extended description page + my $product_url = "http://www.memoryexpress.com/Products/"; my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}); + + # the part number is inside of id=ProductAdd always my $part_num = get_tag_text($product_dom, "#ProductAdd"); - next unless (defined $part_num); + if (!defined $part_num) { + print "error: $hdr: #ProductAdd not found\n"; + next; + } + # extract the part number, always is text inside of the tag ($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/); - next unless (defined $part_num && $part_num ne ""); + if (!defined $part_num || $part_num eq "") { + print "error: $hdr: part num regex failed\n"; + next; + } + else { + print "info: $hdr: 
part_num = $part_num\n" if ($args{v}); + } - my $description = get_tag_text($thumbnail_dom, ".ProductTitle"); - next unless (defined $description); + # extract the product tile + my $desc = get_tag_text($thumbnail_dom, ".ProductTitle"); + if (!defined $desc) { + print "error: $hdr: .ProductTitle was not found.\n"; + next; + } + else { + my $tmp_desc = $desc; + if (length($tmp_desc) > 35) { + $tmp_desc = substr($tmp_desc, 0, 40) . "..."; + } + print "info: $hdr: .ProductTitle = $tmp_desc\n" if ($args{v}); + } - # brand sometimes shows up as text + # extract the brand, sometimes shows up as text my $brand = $thumbnail_dom->find(".ProductBrand")->text(); if ($brand eq "") { + print "info: $hdr: .ProductBrand not text\n" if ($args{v}); + # and sometimes shows up inside the tag attributes $brand = $thumbnail_dom->find(".ProductBrand")->html(); ($brand) = ($brand =~ m/Brand: ([A-Za-z]+)/); } if (!defined $brand || $brand eq "") { - $errors .= "could not find .ProductBrand, html was:\n"; - $errors .= "$thumbnail_html\n\n"; - print $errors if ($args{v}); + print "error: $hdr: .ProductBrand not found, html:\n"; + print "$thumbnail_html\n"; next; } + else { + print "info: $hdr: .ProductBrand = $brand\n" if ($args{v}); + } + # use existence of part_num to decide on update or insert new my $sql = "select * from products where part_num = ?"; if ($dbh->selectrow_arrayref($sql, undef, $part_num)) { + # update $update_sth->execute(time, $part_num); - print "updated $part_num\n" if ($args{v}); + print "info: $hdr: db updated\n" if ($args{v}); $old++; } else { - $insert_sth->execute($part_num, $brand, $description, + # insert new + $insert_sth->execute($part_num, $brand, $desc, $type, time, time, 0); - print "inserted $part_num\n" if ($args{v}); - $new_products .= "$brand $description ($part_num)\n"; + print "info: $hdr: db inserted\n" if ($args{v}); + $new_products .= "$brand $desc ($part_num)\n"; $new++; } + last if ($args{t}); } $summary .= sprintf("%-11s %7s %5s %3s %6s %8s\n", 
$type, $new + $old, @@ -161,17 +210,18 @@ $mail .= $errors if ($errors); my $email = Email::Simple->create( header => [ From => "Santa Claus <sc\@np.com>", - To => $cfg->{email}, + To => $cfg->{"general"}{"email"}, Subject => "PriceChart product scrape", ], - body => $mail); + body => $mail +); if ($args{v}) { print $email->as_string(); } else { my $sender = Email::Send->new({mailer => 'SMTP'}); - $sender->mailer_args([Host => $cfg->{smtp}]); + $sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]); $sender->send($email->as_string()) || print "Couldn't send email\n"; } @@ -182,7 +232,7 @@ sub get_tag_text my $field = $dom->find($tag)->text(); if (!defined $field || $field eq "") { - $errors .= "could not find $tag, html was:\n"; + $errors .= "error: could not find $tag, html was:\n"; $errors .= $dom->html(); $errors .= "\n\n"; print $errors if ($args{v});