commit 81e0dd305731c931b3682e12fe1271d96efaa205
parent a466d6fcee688efcdf4ca0650da9d35b661b663c
Author: kyle <kyle@getaddrinfo.net>
Date: Sun, 8 Nov 2015 11:18:37 -0700
ps_scrape: move price scraper functions to top
Diffstat:
M | ps_scrape | | | 318 | ++++++++++++++++++++++++++++++++++++++++--------------------------------------- |
1 file changed, 160 insertions(+), 158 deletions(-)
diff --git a/ps_scrape b/ps_scrape
@@ -35,6 +35,166 @@ else {
scrape_prices();
}
+# Scrape current prices for one product from every configured retailer.
+# Picks the product seen in the last 30 days with the oldest last_scraped,
+# stores one price row per retailer that yields a usable price, writes a
+# one-line summary to the log and exits (0 when done, 1 if nothing to scrape).
+sub scrape_prices
+{
+    my $log_path = $cfg->{general}{log_dir} . "/pricesloth";
+    my $log = get_log($log_path, $args{v});
+
+    # allow products to go out of stock. if we haven't seen them for > 30 days
+    # chances are retailers aren't carrying them anymore
+    my $cutoff = time - (30 * 24 * 60 * 60);
+    my $sql = "select part_num, manufacturer, type from products " .
+        "where last_seen > $cutoff order by last_scraped asc";
+    my ($part_num, $manufacturer, $type) = $dbh->selectrow_array($sql);
+
+    unless (defined $part_num && defined $manufacturer) {
+        print "error: no parts seen in the last 30 days\n";
+        print " run a product scrape to freshen the part numbers\n";
+        exit 1;
+    }
+
+    # prevent races with other scrapers, claim ownership as soon as possible
+    $dbh->do("update products set last_scraped = ? where part_num = ? and manufacturer = ?",
+        undef, time, $part_num, $manufacturer);
+
+    print "info: scraping $manufacturer $part_num\n" if ($args{v});
+
+    $sql = qq{insert into prices(date, manufacturer, part_num, retailer,
+        price, duration) values (?, ?, ?, ?, ?, ?)};
+    my $prices_sth = $dbh->prepare($sql);
+
+    $sql = qq{update products set last_seen = ?, svg_stale = 1
+        where part_num = ? and manufacturer = ?};
+    my $products_sth = $dbh->prepare($sql);
+
+    $sql = "insert or replace into retailers(name, color, url) values (?, ?, ?)";
+    my $retailer_sth = $dbh->prepare($sql);
+
+    $sql = qq{insert or replace into descriptions(manufacturer, part_num,
+        retailer, description, date) values (?, ?, ?, ?, ?)};
+    my $descriptions_sth = $dbh->prepare($sql);
+
+    my $timestamp = strftime("%F %T> ", localtime);
+    my $start = time;
+    # one status character per retailer: blank until that retailer succeeds
+    my @status;
+    for my $retailer (sort keys %{$cfg->{retailers}}) {
+        my ($url, $color, $price_tag, $sale_tag, $desc_tag) =
+            @{$cfg->{retailers}{$retailer}}{qw(url color reg_tag sale_tag title)};
+
+        my $retailer_start = time;
+        push @status, " ";
+
+        # for products with short part numbers, also search manufacturer
+        my $search = uri_escape(length($part_num) < 6
+            ? "$manufacturer $part_num" : $part_num);
+
+        # get a page of search results from a retailer
+        my $search_results = get_dom($url . $search, $ua, $args{v}, $log);
+        next unless defined $search_results;
+
+        # search search_results for particular html tags that should be prices
+        my $price_r = get_valid_price($price_tag, $search_results, $retailer, $log);
+        my $price_s = get_valid_price($sale_tag, $search_results, $retailer, $log);
+        next unless ($price_r || $price_s);
+
+        # choose the lowest that exists
+        my $price;
+        $price = $price_r if ($price_r);
+        $price = $price_s if ($price_s);
+        $price = min($price_r, $price_s) if ($price_r && $price_s);
+
+        # opportunistically scrape descriptions
+        my ($found_descr, $descr);
+        if ($desc_tag) {
+            # scrape description, use first one found on page
+            ($descr) = $search_results->find($desc_tag)->text_array();
+            if (defined $descr && $descr ne "") {
+                # \Q..\E: names are literal text, not patterns ("+", "(", ...)
+                $descr =~ s/\Q$manufacturer\E//;
+                $descr =~ s/\Q$part_num\E//;
+                # trim last so removing a leading name leaves no stray space
+                $descr =~ s/^\s+//;
+                $descr =~ s/\s+$//;
+
+                my $descr_s = trunc_line($descr, length($retailer) + 8);
+                print "info: $retailer: $descr_s\n" if ($args{v});
+                $found_descr = 1;
+            }
+        }
+
+        # everything looks good
+        $status[-1] = substr($retailer, 0, 1);
+
+        # when $args{n} is set, skip the database writes for this retailer
+        next if ($args{n});
+        $dbh->begin_work;
+        $retailer_sth->execute($retailer, $color, $url);
+        $prices_sth->execute($start, $manufacturer, $part_num, $retailer, $price,
+            time - $retailer_start);
+        $products_sth->execute($start, $part_num, $manufacturer);
+        $descriptions_sth->execute($manufacturer, $part_num, $retailer,
+            $descr, time) if (defined $found_descr);
+        $dbh->commit;
+
+        print "info: $retailer: db: inserted \$$price\n" if ($args{v});
+    }
+
+    printf $log "%s %-12s %-10s %-20s [%s] (%i s)\n", $timestamp, $type,
+        $manufacturer, $part_num, join("", @status), time - $start;
+
+    $log->close();
+    $retailer_sth = undef;
+    $prices_sth = undef;
+    $products_sth = undef;
+    $descriptions_sth = undef;
+    $dbh->disconnect();
+
+    exit 0;
+}
+
+# Pull a price out of the first element in $search_results matching $dom_tag.
+# Returns it with all "," removed, or undef if $dom_tag is false, nothing
+# matches, or the number is outside (0, 10000]; parse errors go to $log.
+sub get_valid_price
+{
+    my $dom_tag = shift || return undef;
+    my ($search_results, $retailer, $log) = @_;
+
+    # break the search_results page down into individual results
+    my @search_prices = $search_results->find($dom_tag)->text_array();
+    my $num_prices = @search_prices;
+    return undef if ($num_prices == 0);
+
+    print "info: $retailer: $dom_tag: $num_prices elements\n" if ($args{v});
+    my $hdr = "$retailer: $dom_tag" . "[0]";
+
+    # do a fuzzy search for digit combinations that look like a price
+    # XXX: uses the first found price in the page
+    # XXX: this does not work on single digit prices, ie $7.00
+    my ($price) = ($search_prices[0] =~ m/(\d[\d,]+)/);
+    unless (defined $price) {
+        print $log "error: $hdr: wrong number of regexs\n";
+        return undef;
+    }
+
+    # drop every "," so "1,000,000" is range-checked as 1000000, not 1000
+    $price =~ tr/,//d;
+    if ($price <= 0 || $price > 10000) {
+        print $log "error: $hdr: price $price out of range\n";
+        return undef;
+    }
+    print "info: $hdr: \$$price\n" if ($args{v});
+    return $price;
+}
+
+
+# --- PRODUCT SCRAPE ---
+
sub mem_exp_scrape_products
{
my $sql = qq{insert into products(part_num, manufacturer, retailer, type,
@@ -334,161 +494,3 @@ sub sleep_rand
printf "$header: (%ss wait)\n", $sleep if ($args{v});
sleep $sleep unless ($args{t});
}
-
-sub scrape_prices
-{
-
- my $log_path = $cfg->{general}{log_dir} . "/pricesloth";
- my $log = get_log($log_path, $args{v});
-
- # allow products to go out of stock. if we haven't seen them for > 30 days
- # chances are retailers aren't carrying them anymore
- my $cutoff = time - (30 * 24 * 60 * 60);
- my $sql = "select part_num, manufacturer, type from products " .
- "where last_seen > $cutoff order by last_scraped asc";
- my ($part_num, $manufacturer, $type) = $dbh->selectrow_array($sql);
-
- unless (defined $part_num && defined $manufacturer) {
- print "error: no parts seen in the last 30 days\n";
- print " run a product scrape to freshen the part numbers\n";
- exit 1;
- }
-
- # prevent races with other scrapers, claim ownership as soon as possible
- $dbh->do("update products set last_scraped = ? where part_num = ? and manufacturer = ?",
- undef, time, $part_num, $manufacturer);
-
- print "info: scraping $manufacturer $part_num\n" if ($args{v});
-
- $sql = qq{insert into prices(date, manufacturer, part_num, retailer,
- price, duration) values (?, ?, ?, ?, ?, ?)};
- my $prices_sth = $dbh->prepare($sql);
-
- $sql = qq{update products set last_seen = ?, svg_stale = 1
- where part_num = ? and manufacturer = ?};
- my $products_sth = $dbh->prepare($sql);
-
- $sql = "insert or replace into retailers(name, color, url) values (?, ?, ?)";
- my $retailer_sth = $dbh->prepare($sql);
-
- $sql = qq{insert or replace into descriptions(manufacturer, part_num,
- retailer, description, date) values (?, ?, ?, ?, ?)};
- my $descriptions_sth = $dbh->prepare($sql);
-
- my $timestamp = strftime("%F %T> ", localtime);
- my ($start, @status, $i) = (time, "", -1);
- for my $retailer (sort keys %{$cfg->{retailers}}) {
- my %props = %{$cfg->{retailers}{$retailer}};
- # this could probably be done smarter
- my $url = $props{url};
- my $color = $props{color};
- my $price_tag = $props{reg_tag};
- my $sale_tag = $props{sale_tag};
- my $desc_tag = $props{title};
-
- my $retailer_start = time;
- $status[++$i] = " ";
-
- # for products with short part numbers, also search manufacturer
- my $search;
- if (length($part_num) < 6) {
- $search = uri_escape("$manufacturer $part_num");
- } else {
- $search = uri_escape($part_num);
- }
-
- # get a page of search results from a retailer
- my $search_results = get_dom($url . $search, $ua, $args{v}, $log);
- next unless defined $search_results;
-
- # search search_results for particular html tags that should be prices
- my $price_r = get_valid_price($price_tag, $search_results, $retailer, $log);
- my $price_s = get_valid_price($sale_tag, $search_results, $retailer, $log);
- next unless ($price_r || $price_s);
-
- # choose the lowest that exists
- my $price;
- $price = $price_r if ($price_r);
- $price = $price_s if ($price_s);
- $price = min($price_r, $price_s) if ($price_r && $price_s);
-
- # opportunistically scrape descriptions
- my ($found_descr, $descr);
- if ($desc_tag) {
- # scrape description, use first one found on page
- ($descr) = $search_results->find($desc_tag)->text_array();
- if (defined $descr && $descr ne "") {
- $descr =~ s/^\s+//;
- $descr =~ s/\s+$//;
- $descr =~ s/$manufacturer//;
- $descr =~ s/$part_num//;
-
- my $descr_s = trunc_line($descr, length($retailer) + 8);
- print "info: $retailer: $descr_s\n" if ($args{v});
- $found_descr = 1;
- }
- }
-
- # everything looks good
- $status[$i] = substr($retailer, 0, 1);
-
- next if ($args{n});
- $dbh->begin_work;
- $retailer_sth->execute($retailer, $color, $url);
- $prices_sth->execute($start, $manufacturer, $part_num, $retailer, $price,
- time - $retailer_start);
- $products_sth->execute($start, $part_num, $manufacturer);
- $descriptions_sth->execute($manufacturer, $part_num, $retailer,
- $descr, time) if (defined $found_descr);
- $dbh->commit;
-
- print "info: $retailer: db: inserted \$$price\n" if ($args{v});
- }
-
- printf $log "%s %-12s %-10s %-20s [%s] (%i s)\n", $timestamp, $type,
- $manufacturer, $part_num, join("", @status), time - $start;
-
- $log->close();
- $retailer_sth = undef;
- $prices_sth = undef;
- $products_sth = undef;
- $descriptions_sth = undef;
- $dbh->disconnect();
-
- exit 0;
-}
-
-sub get_valid_price
-{
- my $dom_tag = shift || return undef;
- my $search_results = shift;
- my $retailer = shift;
- my $log = shift;
-
- # break the search_results page down into individual results
- my @search_prices = $search_results->find($dom_tag)->text_array();
- my $num_prices = @search_prices;
- return undef if ($num_prices == 0);
-
- print "info: $retailer: $dom_tag: $num_prices elements\n" if ($args{v});
- my $hdr = "$retailer: $dom_tag" . "[0]";
-
- # do a fuzzy search for digit combinations that look like a price
- # XXX: uses the first found price in the page
- # XXX: this does not work on single digit prices, ie $7.00
- my ($price, @others) = ($search_prices[0] =~ m/(\d[\d,]+)/);
- if (!defined $price || @others) {
- print $log "error: $hdr: wrong number of regexs\n";
- return undef;
- }
-
- # sanity check the numerical price value
- $price =~ s/,//;
- if ($price <= 0 || $price > 10000) {
- print $log "error: $hdr: price $price out of range\n";
- return undef;
- }
-
- print "info: $hdr: \$$price\n" if ($args{v});
- return $price;
-}