pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 64a99096e32686e3faac7c6e48cbd11d9db1dd4b
parent b7fb366a469387a1dd8dc3c0eb9bb7c79894832b
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Sun,  1 Feb 2015 15:59:50 -0700

price_scraper: use config file for vendors, again

Don't put vendor information in the db. It's unmaintainable, and the only thing
we need to carry forward is the color of that vendor. Just store the color
inside the price scrape for later graph generation.

Diffstat:
M PriceChart.pm | 11 +++++++----
M price_scraper.pl | 43 ++++++++++++++++++++++++-------------------
D update_vendors.pl | 43 -------------------------------------------
3 files changed, 31 insertions(+), 66 deletions(-)

diff --git a/PriceChart.pm b/PriceChart.pm @@ -23,16 +23,19 @@ sub get_config "/[A-Za-z ]+/" => { _vars => [ "search_url", - "regular_price_tag", - "sale_price_tag", + "price_regular", + "price_sale", "color", "title" ] } } }); + my $cfg_file = "/etc/pricechart.cfg"; - return $parser->parse($cfg_file) or die "error: $parser->{err}\n"; + my $cfg = $parser->parse($cfg_file) or die "error: $parser->{err}\n"; + + return $cfg; } sub get_dbh @@ -66,7 +69,7 @@ sub get_dom return HTML::Grabber->new(html => $resp->decoded_content); } - print "GET $url " . $resp->status_line . "\n"; + print "error: GET $url " . $resp->status_line . "\n"; return undef; } diff --git a/price_scraper.pl b/price_scraper.pl @@ -20,7 +20,7 @@ $| = 1 if ($args{v}); my $log = get_log("scrapes", $args{v}); my $cfg = get_config(); -my $ua = get_ua($cfg); +my $ua = get_ua($cfg->{"general"}); my $dbh = get_dbh(); # allow products to go out of stock. if we haven't seen them for > 30 days @@ -35,38 +35,38 @@ if ($args{p} && $args{m}) { } exit unless (defined $part_num); -# keep track of when we last tried to scrape this product -$dbh->do("update products set last_scraped = ? where part_num = ?", - undef, time, $part_num); - $dbh->do("create table if not exists prices(" . "date int not null, " . "part_num text not null, " . "vendor text not null, " . "price int not null, " . + "color text not null, " . "duration int, " . + "title text, " . "primary key(date, part_num, vendor, price))" ) or die $DBI::errstr; - print "info: $manufacturer $part_num\n" if ($args{v}); -$sql = "insert into prices(date, part_num, vendor, price, duration) " . - "values (?, ?, ?, ?, ?)"; +$sql = "insert into prices(date, part_num, vendor, color, price, duration) " . + "values (?, ?, ?, ?, ?, ?)"; my $prices_sth = $dbh->prepare($sql); $sql = "update products set last_seen = ? 
where part_num = ?"; my $products_sth = $dbh->prepare($sql); -$sql = "select * from vendors order by name"; -my $vendor_sth = $dbh->prepare($sql); - my ($start, @status, $i) = (time, "", -1); -$vendor_sth->execute(); -while (my ($vendor, $url, $price_tag, $sale_tag) = $vendor_sth->fetchrow_array) { +while (my ($vendor, $props) = each $cfg->{"vendors"}) { + my $url = $props->{"search_url"}; + my $color = $props->{"color"}; + my $price_tag = $props->{"price_regular"}; + my $sale_tag = $props->{"price_sale"}; + my $vendor_start = time; $status[++$i] = " "; + print "info: $vendor\n" if ($args{v}); + # for products with short part numbers, also search manufacturer my $search; if (length($part_num) < 6) { @@ -77,10 +77,7 @@ while (my ($vendor, $url, $price_tag, $sale_tag) = $vendor_sth->fetchrow_array) # get a page of search results from a vendor my $search_results = get_dom($url . $search, $ua, $args{v}); - if (!defined $search_results) { - print $log "error: $vendor: couldn't GET search results\n"; - next; - } + next unless defined $search_results; # search search_results for particular html tags that should be prices my $price_r = get_valid_price($price_tag, $search_results, $vendor); @@ -93,12 +90,15 @@ while (my ($vendor, $url, $price_tag, $sale_tag) = $vendor_sth->fetchrow_array) $price = $price_s if ($price_s); $price = min($price_r, $price_s) if ($price_r && $price_s); + # XXX: also think about scraping title here + # everything looks good $status[$i] = substr($vendor, 0, 1); print "info: $vendor: final = \$$price\n" if ($args{v}); next if ($args{n}); - $prices_sth->execute($start, $part_num, $vendor, $price, time - $vendor_start); + $prices_sth->execute($start, $part_num, $vendor, $color, + $price, time - $vendor_start); $products_sth->execute($start, $part_num); print "info: $vendor: db updated\n" if ($args{v}); @@ -108,6 +108,11 @@ printf $log "%s %-10s %-15s [%s] (%i s)\n", strftime("%F %T", localtime), $manufacturer, $part_num, join("", @status), time - 
$start; close $log; + +# record that we finished scraping this product, successful or not +$dbh->do("update products set last_scraped = ? where part_num = ?", + undef, time, $part_num); + $dbh->disconnect(); exit 0; @@ -127,7 +132,7 @@ sub get_valid_price print "info: $vendor: $dom_tag ($num_prices)\n" if ($args{v}); # do a fuzzy search for digit combinations that look like a price - # XXX: use the first found price in the page + # XXX: uses the first found price in the page my ($price, @others) = ($search_prices[0] =~ m/(\d[\d,]+)/); return undef unless defined $price; diff --git a/update_vendors.pl b/update_vendors.pl @@ -1,43 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -use Getopt::Std; -use PriceChart; - - -my %args; -getopt("v:u:r:s:c:", \%args); - -if (!$args{v}) { - print "Argument -v must be present\n"; - exit -} - -my $dbh = get_dbh(); - -$dbh->do("create table if not exists vendors(" . - "name text not null primary key, " . - "search_url not null, " . - "price_tag not null, " . - "sale_tag, " . - "color text not null)") or die $DBI::errstr; - -my $sql = "update vendors set search_url = ?, price_tag = ?, sale_tag = ?, " . - "color = ? where name = ?"; -my $update_sth = $dbh->prepare($sql); - -$sql = "insert into vendors(name, search_url, price_tag, sale_tag, color) " . - "values (?, ?, ?, ?, ?)"; -my $insert_sth = $dbh->prepare($sql); - -$sql = "select * from vendors where name = ?"; -if ($dbh->selectrow_arrayref($sql, undef, $args{v})) { - $update_sth->execute($args{u}, $args{r}, $args{s}, $args{c}, $args{v}); -} -else { - $insert_sth->execute($args{v}, $args{u}, $args{r}, $args{s}, $args{c}); -} - -$dbh->disconnect();