pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 81e0dd305731c931b3682e12fe1271d96efaa205
parent a466d6fcee688efcdf4ca0650da9d35b661b663c
Author: kyle <kyle@getaddrinfo.net>
Date:   Sun,  8 Nov 2015 11:18:37 -0700

ps_scrape: move price scraper functions to top

Diffstat:
M ps_scrape | 318 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
1 file changed, 160 insertions(+), 158 deletions(-)

diff --git a/ps_scrape b/ps_scrape @@ -35,6 +35,166 @@ else { scrape_prices(); } +sub scrape_prices +{ + my $log_path = $cfg->{general}{log_dir} . "/pricesloth"; + my $log = get_log($log_path, $args{v}); + + # allow products to go out of stock. if we haven't seen them for > 30 days + # chances are retailers aren't carrying them anymore + my $cutoff = time - (30 * 24 * 60 * 60); + my $sql = "select part_num, manufacturer, type from products " . + "where last_seen > $cutoff order by last_scraped asc"; + my ($part_num, $manufacturer, $type) = $dbh->selectrow_array($sql); + + unless (defined $part_num && defined $manufacturer) { + print "error: no parts seen in the last 30 days\n"; + print " run a product scrape to freshen the part numbers\n"; + exit 1; + } + + # prevent races with other scrapers, claim ownership as soon as possible + $dbh->do("update products set last_scraped = ? where part_num = ? and manufacturer = ?", + undef, time, $part_num, $manufacturer); + + print "info: scraping $manufacturer $part_num\n" if ($args{v}); + + $sql = qq{insert into prices(date, manufacturer, part_num, retailer, + price, duration) values (?, ?, ?, ?, ?, ?)}; + my $prices_sth = $dbh->prepare($sql); + + $sql = qq{update products set last_seen = ?, svg_stale = 1 + where part_num = ? 
and manufacturer = ?}; + my $products_sth = $dbh->prepare($sql); + + $sql = "insert or replace into retailers(name, color, url) values (?, ?, ?)"; + my $retailer_sth = $dbh->prepare($sql); + + $sql = qq{insert or replace into descriptions(manufacturer, part_num, + retailer, description, date) values (?, ?, ?, ?, ?)}; + my $descriptions_sth = $dbh->prepare($sql); + + my $timestamp = strftime("%F %T> ", localtime); + my ($start, @status, $i) = (time, "", -1); + for my $retailer (sort keys %{$cfg->{retailers}}) { + my %props = %{$cfg->{retailers}{$retailer}}; + # this could probably be done smarter + my $url = $props{url}; + my $color = $props{color}; + my $price_tag = $props{reg_tag}; + my $sale_tag = $props{sale_tag}; + my $desc_tag = $props{title}; + + my $retailer_start = time; + $status[++$i] = " "; + + # for products with short part numbers, also search manufacturer + my $search; + if (length($part_num) < 6) { + $search = uri_escape("$manufacturer $part_num"); + } else { + $search = uri_escape($part_num); + } + + # get a page of search results from a retailer + my $search_results = get_dom($url . 
$search, $ua, $args{v}, $log); + next unless defined $search_results; + + # search search_results for particular html tags that should be prices + my $price_r = get_valid_price($price_tag, $search_results, $retailer, $log); + my $price_s = get_valid_price($sale_tag, $search_results, $retailer, $log); + next unless ($price_r || $price_s); + + # choose the lowest that exists + my $price; + $price = $price_r if ($price_r); + $price = $price_s if ($price_s); + $price = min($price_r, $price_s) if ($price_r && $price_s); + + # opportunistically scrape descriptions + my ($found_descr, $descr); + if ($desc_tag) { + # scrape description, use first one found on page + ($descr) = $search_results->find($desc_tag)->text_array(); + if (defined $descr && $descr ne "") { + $descr =~ s/^\s+//; + $descr =~ s/\s+$//; + $descr =~ s/$manufacturer//; + $descr =~ s/$part_num//; + + my $descr_s = trunc_line($descr, length($retailer) + 8); + print "info: $retailer: $descr_s\n" if ($args{v}); + $found_descr = 1; + } + } + + # everything looks good + $status[$i] = substr($retailer, 0, 1); + + next if ($args{n}); + $dbh->begin_work; + $retailer_sth->execute($retailer, $color, $url); + $prices_sth->execute($start, $manufacturer, $part_num, $retailer, $price, + time - $retailer_start); + $products_sth->execute($start, $part_num, $manufacturer); + $descriptions_sth->execute($manufacturer, $part_num, $retailer, + $descr, time) if (defined $found_descr); + $dbh->commit; + + print "info: $retailer: db: inserted \$$price\n" if ($args{v}); + } + + printf $log "%s %-12s %-10s %-20s [%s] (%i s)\n", $timestamp, $type, + $manufacturer, $part_num, join("", @status), time - $start; + + $log->close(); + $retailer_sth = undef; + $prices_sth = undef; + $products_sth = undef; + $descriptions_sth = undef; + $dbh->disconnect(); + + exit 0; +} + +sub get_valid_price +{ + my $dom_tag = shift || return undef; + my $search_results = shift; + my $retailer = shift; + my $log = shift; + + # break the search_results 
page down into individual results + my @search_prices = $search_results->find($dom_tag)->text_array(); + my $num_prices = @search_prices; + return undef if ($num_prices == 0); + + print "info: $retailer: $dom_tag: $num_prices elements\n" if ($args{v}); + my $hdr = "$retailer: $dom_tag" . "[0]"; + + # do a fuzzy search for digit combinations that look like a price + # XXX: uses the first found price in the page + # XXX: this does not work on single digit prices, ie $7.00 + my ($price, @others) = ($search_prices[0] =~ m/(\d[\d,]+)/); + if (!defined $price || @others) { + print $log "error: $hdr: wrong number of regexs\n"; + return undef; + } + + # sanity check the numerical price value + $price =~ s/,//; + if ($price <= 0 || $price > 10000) { + print $log "error: $hdr: price $price out of range\n"; + return undef; + } + + print "info: $hdr: \$$price\n" if ($args{v}); + return $price; +} + + +# --- PRODUCT SCRAPE --- + sub mem_exp_scrape_products { my $sql = qq{insert into products(part_num, manufacturer, retailer, type, @@ -334,161 +494,3 @@ sub sleep_rand printf "$header: (%ss wait)\n", $sleep if ($args{v}); sleep $sleep unless ($args{t}); } - -sub scrape_prices -{ - - my $log_path = $cfg->{general}{log_dir} . "/pricesloth"; - my $log = get_log($log_path, $args{v}); - - # allow products to go out of stock. if we haven't seen them for > 30 days - # chances are retailers aren't carrying them anymore - my $cutoff = time - (30 * 24 * 60 * 60); - my $sql = "select part_num, manufacturer, type from products " . - "where last_seen > $cutoff order by last_scraped asc"; - my ($part_num, $manufacturer, $type) = $dbh->selectrow_array($sql); - - unless (defined $part_num && defined $manufacturer) { - print "error: no parts seen in the last 30 days\n"; - print " run a product scrape to freshen the part numbers\n"; - exit 1; - } - - # prevent races with other scrapers, claim ownership as soon as possible - $dbh->do("update products set last_scraped = ? where part_num = ? 
and manufacturer = ?", - undef, time, $part_num, $manufacturer); - - print "info: scraping $manufacturer $part_num\n" if ($args{v}); - - $sql = qq{insert into prices(date, manufacturer, part_num, retailer, - price, duration) values (?, ?, ?, ?, ?, ?)}; - my $prices_sth = $dbh->prepare($sql); - - $sql = qq{update products set last_seen = ?, svg_stale = 1 - where part_num = ? and manufacturer = ?}; - my $products_sth = $dbh->prepare($sql); - - $sql = "insert or replace into retailers(name, color, url) values (?, ?, ?)"; - my $retailer_sth = $dbh->prepare($sql); - - $sql = qq{insert or replace into descriptions(manufacturer, part_num, - retailer, description, date) values (?, ?, ?, ?, ?)}; - my $descriptions_sth = $dbh->prepare($sql); - - my $timestamp = strftime("%F %T> ", localtime); - my ($start, @status, $i) = (time, "", -1); - for my $retailer (sort keys %{$cfg->{retailers}}) { - my %props = %{$cfg->{retailers}{$retailer}}; - # this could probably be done smarter - my $url = $props{url}; - my $color = $props{color}; - my $price_tag = $props{reg_tag}; - my $sale_tag = $props{sale_tag}; - my $desc_tag = $props{title}; - - my $retailer_start = time; - $status[++$i] = " "; - - # for products with short part numbers, also search manufacturer - my $search; - if (length($part_num) < 6) { - $search = uri_escape("$manufacturer $part_num"); - } else { - $search = uri_escape($part_num); - } - - # get a page of search results from a retailer - my $search_results = get_dom($url . 
$search, $ua, $args{v}, $log); - next unless defined $search_results; - - # search search_results for particular html tags that should be prices - my $price_r = get_valid_price($price_tag, $search_results, $retailer, $log); - my $price_s = get_valid_price($sale_tag, $search_results, $retailer, $log); - next unless ($price_r || $price_s); - - # choose the lowest that exists - my $price; - $price = $price_r if ($price_r); - $price = $price_s if ($price_s); - $price = min($price_r, $price_s) if ($price_r && $price_s); - - # opportunistically scrape descriptions - my ($found_descr, $descr); - if ($desc_tag) { - # scrape description, use first one found on page - ($descr) = $search_results->find($desc_tag)->text_array(); - if (defined $descr && $descr ne "") { - $descr =~ s/^\s+//; - $descr =~ s/\s+$//; - $descr =~ s/$manufacturer//; - $descr =~ s/$part_num//; - - my $descr_s = trunc_line($descr, length($retailer) + 8); - print "info: $retailer: $descr_s\n" if ($args{v}); - $found_descr = 1; - } - } - - # everything looks good - $status[$i] = substr($retailer, 0, 1); - - next if ($args{n}); - $dbh->begin_work; - $retailer_sth->execute($retailer, $color, $url); - $prices_sth->execute($start, $manufacturer, $part_num, $retailer, $price, - time - $retailer_start); - $products_sth->execute($start, $part_num, $manufacturer); - $descriptions_sth->execute($manufacturer, $part_num, $retailer, - $descr, time) if (defined $found_descr); - $dbh->commit; - - print "info: $retailer: db: inserted \$$price\n" if ($args{v}); - } - - printf $log "%s %-12s %-10s %-20s [%s] (%i s)\n", $timestamp, $type, - $manufacturer, $part_num, join("", @status), time - $start; - - $log->close(); - $retailer_sth = undef; - $prices_sth = undef; - $products_sth = undef; - $descriptions_sth = undef; - $dbh->disconnect(); - - exit 0; -} - -sub get_valid_price -{ - my $dom_tag = shift || return undef; - my $search_results = shift; - my $retailer = shift; - my $log = shift; - - # break the search_results 
page down into individual results - my @search_prices = $search_results->find($dom_tag)->text_array(); - my $num_prices = @search_prices; - return undef if ($num_prices == 0); - - print "info: $retailer: $dom_tag: $num_prices elements\n" if ($args{v}); - my $hdr = "$retailer: $dom_tag" . "[0]"; - - # do a fuzzy search for digit combinations that look like a price - # XXX: uses the first found price in the page - # XXX: this does not work on single digit prices, ie $7.00 - my ($price, @others) = ($search_prices[0] =~ m/(\d[\d,]+)/); - if (!defined $price || @others) { - print $log "error: $hdr: wrong number of regexs\n"; - return undef; - } - - # sanity check the numerical price value - $price =~ s/,//; - if ($price <= 0 || $price > 10000) { - print $log "error: $hdr: price $price out of range\n"; - return undef; - } - - print "info: $hdr: \$$price\n" if ($args{v}); - return $price; -}