pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 589c8eab646ebb6122a0753e64b03cbbb8892731
parent 8e49f263dd73c6190bc5110678990924de90a801
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Tue, 27 Jan 2015 23:07:28 -0700

price_scraper: re-factor to add robustness

Make the price_scraper more robust by adding information debug messages and
doing more verification on the search results page.

Print out how many duplicate price tags we have, and how many candidate prices
we get from the first tag.

Diffstat:
MPriceChart.pm | 14++++++++++----
Mprice_scraper.pl | 129++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
2 files changed, 86 insertions(+), 57 deletions(-)

diff --git a/PriceChart.pm b/PriceChart.pm @@ -2,7 +2,6 @@ package PriceChart; use DBI; use Exporter; -use POSIX; @ISA = ("Exporter"); @EXPORT = qw(get_config get_dom get_ua get_log get_dbh); @@ -43,6 +42,9 @@ sub get_dom my $resp = $ua->get($url); if ($resp->is_success) { + if (length($url) > 60) { + $url = "..." . substr($url, length($url) - 60); + } print "GET $url " . $resp->status_line . "\n" if ($verbose); return HTML::Grabber->new(html => $resp->decoded_content); } @@ -64,12 +66,16 @@ sub get_ua sub get_log { my $file = shift; + my $verbose = shift; my $log_dir = "/var/www/logs/pricechart"; - mkdir $log_dir; - open my $log, ">>", "$log_dir/$file.log" || die "$!"; + if ($verbose) { + open my $log, '>&', STDOUT or die "$!"; + return $log; + } - print $log strftime "%b %e %Y %H:%M ", localtime; + mkdir $log_dir; + open my $log, ">>", "$log_dir/$file.log" or die "$!"; return $log; } diff --git a/price_scraper.pl b/price_scraper.pl @@ -6,26 +6,36 @@ use warnings; use Config::Grammar; use Getopt::Std; use HTML::Grabber; +use List::Util qw(min); use LWP::Simple; use PriceChart; +use POSIX; +use URI::Escape; my %args; -getopts("nv", \%args); +getopts("m:np:v", \%args); $| = 1 if ($args{v}); +my $log = get_log("scrapes", $args{v}); my $cfg = get_config(); my $ua = get_ua($cfg); my $dbh = get_dbh(); -# pick the oldest product +# allow products to go out of stock. if we haven't seen them for > 30 days +# chances are retailers aren't carrying them anymore my $cutoff = time - (30 * 24 * 60 * 60); my $sql = "select part_num, manufacturer from products " . "where last_seen > $cutoff order by last_scraped asc"; my ($part_num, $manufacturer) = $dbh->selectrow_array($sql); +if ($args{p} && $args{m}) { + $part_num = $args{p}; + $manufacturer = $args{m}; +} exit unless (defined $part_num); +# keep track of when we last tried to scrape this product $dbh->do("update products set last_scraped = ? where part_num = ?", undef, time, $part_num); @@ -35,12 +45,11 @@ $dbh->do("create table if not exists prices(" . "vendor text not null, " . "price int not null, " . "duration int, " . - "primary key(date, part_num, vendor, price))") or die $DBI::errstr; + "primary key(date, part_num, vendor, price))" +) or die $DBI::errstr; -my $log = get_log("scrapes"); -printf $log "%-25s [", $manufacturer . " " . $part_num; -print "$manufacturer $part_num\n" if ($args{v}); +print "info: $manufacturer $part_num\n" if ($args{v}); $sql = "insert into prices(date, part_num, vendor, price, duration) " . "values (?, ?, ?, ?, ?)"; @@ -52,74 +61,88 @@ my $products_sth = $dbh->prepare($sql); $sql = "select * from vendors order by name"; my $vendor_sth = $dbh->prepare($sql); -my $date = time; +my ($start, @status, $i) = (time, "", -1); $vendor_sth->execute(); while (my ($vendor, $url, $price_tag, $sale_tag) = $vendor_sth->fetchrow_array) { - - my $start = time; - print "$vendor:\n" if ($args{v}); - - my $dom = get_dom($url . $part_num, $ua, $args{v}); - if (!defined $dom) { - msg("e", "error: dom"); - next; + my $vendor_start = time; + $status[++$i] = " "; + + # for products with short part numbers, also search manufacturer + my $search; + if (length($part_num) < 6) { + $search = uri_escape("$manufacturer $part_num"); + } else { + $search = uri_escape($part_num); } - my $price = get_price($price_tag, $dom); - if ($sale_tag) { - my $sale_price = get_price($sale_tag, $dom); - $price = $sale_price if (defined $sale_price); - } - if (! $price) { - msg(" ", "error: price not found"); + # get a page of search results from a vendor + my $search_results = get_dom($url . $search, $ua, $args{v}); + if (!defined $search_results) { + print $log "error: $vendor: couldn't GET search results\n"; next; } - my @prices = ($price =~ m/(\d[\d,]+)/); - if (@prices != 1) { - msg("r", "error: " . @prices . " regex matches, expecting 1"); - next; - } + # search search_results for particular html tags that should be prices + my $price_r = get_valid_price($price_tag, $search_results, $vendor); + my $price_s = get_valid_price($sale_tag, $search_results, $vendor); + next unless ($price_r || $price_s); - $price = $prices[0]; - $price =~ s/,//; - if ($price <= 0 || $price > 10000) { - msg("o", "error: price \$$price out of range"); - next; - } + # choose the lowest that exists + my $price; + $price = $price_r if ($price_r); + $price = $price_s if ($price_s); + $price = min($price_r, $price_s) if ($price_r && $price_s); - msg(substr($vendor, 0, 1), "price = \$$price"); + # everything looks good + $status[$i] = substr($vendor, 0, 1); + print "info: $vendor: final = \$$price\n" if ($args{v}); next if ($args{n}); + $prices_sth->execute($start, $part_num, $vendor, $price, time - $vendor_start); + $products_sth->execute($start, $part_num); - $prices_sth->execute($date, $part_num, $vendor, $price, time - $start); - $products_sth->execute($date, $part_num); - - print "\tdb updated\n" if ($args{v}); + print "info: $vendor: db updated\n" if ($args{v}); } -my $duration = time - $date; -print $log "] ($duration s)\n"; +printf $log "%s %-10s %-15s [%s] (%i s)\n", strftime("%F %T", localtime), + $manufacturer, $part_num, join("", @status), time - $start; close $log; $dbh->disconnect(); -sub get_price +exit 0; + +sub get_valid_price { - my $dom_element = shift; - my $dom = shift; + my $dom_tag = shift; + my $search_results = shift; + my $vendor = shift; + return undef unless (defined $dom_tag); - my @prices = $dom->find($dom_element)->text_array(); - printf "\t%s = %i\n", $dom_element, scalar @prices if ($args{v}); + # break the search_results page down into individual results + my @search_prices = $search_results->find($dom_tag)->text_array(); + my $num_prices = @search_prices; + return undef if ($num_prices == 0); - return $prices[0]; -} + print "info: $vendor: $dom_tag ($num_prices)\n" if ($args{v}); -sub msg -{ - my $log_char = shift; - my $verbose_msg = shift; + # do a fuzzy search for digit combinations that look like a price + # XXX: use the first found price in the page + my ($price, @others) = ($search_prices[0] =~ m/(\d[\d,]+)/); + return undef unless defined $price; + + # print total regex matches we had above + $num_prices = @others + 1; + print "info: $vendor: $dom_tag" . "[0] ($num_prices)\n" if ($args{v}); + return undef if (@others); + + # sanity check on the numerical value of the price + $price =~ s/,//; + if ($price <= 0 || $price > 10000) { + print $log "error: $vendor: price \$$price out of range\n"; + return undef; + } - print $log $log_char; - print "\t$verbose_msg\n" if ($args{v}); + print "info: $vendor: $dom_tag" . "[0]: \$$price\n" if ($args{v}); + return $price; }