pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit f8a16f2f99fb89386e8c78fa3ca99006dbf769f2
parent 4f6a895117ede80527331e35edbffdff3ee24390
Author: kyle <kyle@getaddrinfo.net>
Date:   Wed,  2 Mar 2016 20:15:18 -0700

start transitioning to a module based layout

- make database, config and user agent their own classes
- start making each vendor their own class too
- add memory express price scrape test

Diffstat:
APS/Config.pm | 47+++++++++++++++++++++++++++++++++++++++++++++++
APS/Database.pm | 90+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
APS/MemoryExpress.pm | 115+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
APS/UserAgent.pm | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
DPriceSloth.pm | 233-------------------------------------------------------------------------------
Metc/pricesloth.cfg | 105-------------------------------------------------------------------------------
At/memory_express.t | 97+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 404 insertions(+), 338 deletions(-)

diff --git a/PS/Config.pm b/PS/Config.pm @@ -0,0 +1,47 @@ +package PS::Config; + +use Config::Grammar; + +sub new { + my $class = shift; + my $self = {}; + bless ($self, $class); + + my $parser = Config::Grammar->new({ + _sections => [ "general", "http" ], + general => { + _vars => [ + "agent", + "email", + "smtp", + # XXX: add simple regex validation here + "addrs", + "db_dir", + "log_dir", + ], + }, + http => { + _vars => [ + "socket", + "uid", + "gid", + "chroot", + "htdocs", + ], + }, + }); + + my $cfg_file = "/etc/pricesloth.cfg"; + if (-e "etc/pricesloth.cfg") { + $cfg_file = "etc/pricesloth.cfg"; + } + $self->{cfg} = $parser->parse($cfg_file) or die "error: $parser->{err}\n"; + + return $self; +} + +sub get_cfg { + return $self->{cfg}; +} + +1; diff --git a/PS/Database.pm b/PS/Database.pm @@ -0,0 +1,90 @@ +package PS::Database; + +use DBI; +use Log::Log4perl qw(:easy); +use File::Path qw(make_path); +use POSIX; + +use PS::Config; + +my $logger = get_logger('pricesloth.database'); + +sub new { + my $class = shift; + + my $cfg = PS::Config->new(); + my $db_dir = $cfg->{cfg}->{general}{db_dir}; + + my $self = {}; + bless ($self, $class); + + make_path($db_dir); + my $dbh = DBI->connect( + "dbi:SQLite:dbname=$db_dir/db", + "", + "", + { RaiseError => 1 } + ) or die $DBI::errstr; + + $self->{dbh} = $dbh; + $dbh->do("PRAGMA foreign_keys = ON"); + create_tables($dbh); + + $dbh->{AutoCommit} = 1; + + $logger->debug("opened $db_dir/db\n"); + return $self; +} + +sub create_tables { + my ($dbh) = @_; + + $dbh->do(qq{ + create table if not exists products( + manufacturer text not null, + part_num text not null, + retailer text not null, + type text, + first_seen int, + last_seen int, + last_scraped int, + svg_stale int default 1, + primary key(manufacturer, part_num)) + }) or die $DBI::errstr; + + $dbh->do(qq{ + create table if not exists descriptions( + manufacturer text not null, + part_num text not null, + retailer text not null, + description text not null, + date int not null, + primary key(manufacturer, part_num, retailer, description), + foreign key(manufacturer, part_num) references + products(manufacturer, part_num)) + }) or die $DBI::errstr; + + $dbh->do(qq{ + create table if not exists retailers( + name text not null primary key, + color text not null, + url text not null) + }) or die $DBI::errstr; + + $dbh->do(qq{ + create table if not exists prices( + date int not null, + manufacturer text not null, + part_num text not null, + retailer text not null, + price int not null, + duration int, + primary key(date, part_num, retailer, price), + foreign key(manufacturer, part_num) references products(manufacturer, part_num), + foreign key(retailer) references retailers(name)) + }) or die $DBI::errstr; + + # $dbh->do("create table if not exists scrapes"); +} + +1; diff --git a/PS/MemoryExpress.pm b/PS/MemoryExpress.pm @@ -0,0 +1,115 @@ +package PS::MemoryExpress; +use strict; + +use HTML::Grabber; +use Log::Log4perl qw(:easy); +use URI::Escape; + +use PS::Database; +use PS::UserAgent; + +my $logger = Log::Log4perl::get_logger('pricesloth.memory_express'); + + +# On sale: +# <div class="PIV_BotPrices"> +# <div class="PIV_PriceRegular">Reg: <span>$359.99</span></div> +# <div class="PIV_PriceSale"> +# $279.99 +# </div> +# </div> +# +# Regular price: +# <div class="PIV_BotPrices"> +# <div class="PIV_Price"> +# <span>$359.99</span> +# </div> +# </div> + +sub new { + my ($class) = @_; + + my $self = { + color => "#56B849", + url => "http://www.memoryexpress.com/Search/Products?Search=", + title => ".ProductTitle", + reg_tag => ".PIV_Price", + sale_tag => ".PIV_PriceSale", + ua => PS::UserAgent->new(), + db => PS::Database->new() + }; + + bless ($self, $class); + $logger->debug("new(): success"); + + # XXX: make sure row in retailer table is created + + return $self; +} + +# Creates the URL search string. +sub create_search { + my ($self, $part_num) = @_; + + # As learned in the Seagate ST8000AS0002 case searching for manufacturer + # concatenated to part num will hide valid search results. + # Instead search only for part number. We'll have to deal with thumbnail + # view return vs a full page product. + + return $self->{url} . uri_escape($part_num); +} + +sub find_price { + my ($self, $srch_results) = @_; + + my @prices = $srch_results->find($self->{reg_tag})->text_array(); + if (@prices == 0) { + $logger->debug("get_price(): no prices found"); + return; + } + + my ($price, @others) = ($prices[0] =~ m/(\d[\d,]+)/); + if (! defined $price) { + $logger->warn("get_price(): found price containers but they contained no numeric price"); + return; + } + if (@others) { + $logger->warn("get_price(): price container had more than 1 price"); + return; + } + + $price =~ s/,//; + + if ($price <= 0 || $price > 10000) { + $logger->warn("get_price(): price '$price' out of range"); + return; + } + + return $price; +} + +sub scrape_price { + my ($self, $manufacturer, $part_num) = @_; + my $ua = $self->{ua}; + + my $search = $self->create_search($part_num); + return unless ($search); + + my $srch_results = $ua->get_dom($search); + return unless ($srch_results); + + my $price = $self->find_price($srch_results); + return unless ($price); + + my $sql = qq{insert into prices(date, manufacturer, part_num, retailer, + price, duration) values (?, ?, ?, ?, ?, ?)}; + my $dbh = $self->{db}->{dbh}; + my $prices_sth = $dbh->prepare($sql); + + $dbh->begin_work; + $prices_sth->execute(time, $manufacturer, $part_num, "Memory Express", $price, 99); + $dbh->commit; + + $logger->debug("scrape_price(): added price \$$price\n"); + return $price; +} diff --git a/PS/UserAgent.pm b/PS/UserAgent.pm @@ -0,0 +1,55 @@ +package PS::UserAgent; + +use LWP::UserAgent; +use Log::Log4perl qw(:easy); +use File::Path qw(make_path); + +use PS::Config; + +my $logger = Log::Log4perl->get_logger('pricesloth.useragent'); + +sub new { + my $class = shift; + my $self = {}; + bless ($self, $class); + + my $config = PS::Config->new(); + my $cfg = $config->{cfg}->{general}; + + # it's optional to list ip addresses to scrape on + my $ua; + if ($cfg->{addrs}) { + my @addresses = split(" ", $cfg->{addrs}); + my $addr = $addresses[rand @addresses]; + $logger->info("new_ua: using ip $addr\n"); + $ua = LWP::UserAgent->new(local_address => $addr); + } + else { + $ua = LWP::UserAgent->new(); + } + + $ua->default_header("Accept" => "*/*"); + $ua->default_header("Accept-Encoding" => scalar HTTP::Message::decodable()); + $ua->default_header("Accept-Charset" => "utf-8"); + $ua->default_header("Accept-Language" => "en-US"); + $ua->default_header("User-Agent" => $cfg->{agent}); + + $self->{ua} = $ua; + return $self; +} + +sub get_dom { + my ($self, $url) = @_; + my $ua = $self->{ua}; + + my $resp = $ua->get($url); + if ($resp->is_success) { + $logger->debug("get_dom: " . $resp->status_line . " $url\n"); + return $resp; + } + + $logger->error("get_dom: " . $resp->status_line . " $url\n"); + return; +} + +1; diff --git a/PriceSloth.pm b/PriceSloth.pm @@ -1,233 +0,0 @@ -package PriceSloth; - -use DBI; -use File::Path qw(make_path); -use Exporter; - -@ISA = ("Exporter"); -@EXPORT = qw(get_config get_dom get_log get_dbh trunc_line new_ua make_path spin); - - -sub get_config -{ - my $parser = Config::Grammar->new({ - _sections => ["general", "http", "retailers"], - general => { - _vars => [ - "agent", - "email", - "smtp", - # XXX: add simple regex validation here - "addrs", - "db_dir", - "log_dir", - ], - }, - http => { - _vars => [ - "socket", - "uid", - "gid", - "chroot", - "htdocs", - ], - }, - retailers => { - _sections => ["/[A-Za-z ]+/"], - "/[A-Za-z ]+/" => { - _vars => [ - "url", - "reg_tag", - "sale_tag", - "color", - "title" - ] - } - } - }); - - my $cfg_file = "/etc/pricesloth.cfg"; - if (-e "etc/pricesloth.cfg") { - $cfg_file = "etc/pricesloth.cfg"; - } - my $cfg = $parser->parse($cfg_file) or die "error: $parser->{err}\n"; - - return $cfg; -} - -sub get_dbh -{ - my $db_dir = shift; - my $verbose = shift || undef; - - make_path($db_dir, { verbose => $verbose }); - my $dbh = DBI->connect( - "dbi:SQLite:dbname=$db_dir/db", - "", - "", - { RaiseError => 1 } - ) or die $DBI::errstr; - - $dbh->do("PRAGMA foreign_keys = ON"); - create_tables($dbh); - - $dbh->{AutoCommit} = 1; - - print "info: opened $db_dir/db\n" if ($verbose); - return $dbh; -} - -sub get_dom -{ - my $url = shift; - my $ua = shift; - my $verbose = shift; - my $log = shift; - - my $resp = $ua->get($url); - if ($resp->is_success) { - my $short_url = trunc_line($url, length($resp->status_line) + 16); - print "info: get_dom: " . $resp->status_line . " $short_url\n" if ($verbose); - return HTML::Grabber->new(html => $resp->decoded_content); - } - - print $log "error: get_dom: " . $resp->status_line . " $url\n"; - return undef; -} - -sub new_ua -{ - my $cfg = shift; - my $verbose = shift || 0; - my $ua; - - # it's optional to list ip addresses to scrape on - if ($cfg->{addrs}) { - my @addresses = split(" ", $cfg->{addrs}); - my $addr = $addresses[rand @addresses]; - print "info: new_ua: using ip $addr\n" if ($verbose); - $ua = LWP::UserAgent->new(local_address => $addr); - } - else { - $ua = LWP::UserAgent->new(); - } - - $ua->default_header("Accept" => "*/*"); - $ua->default_header("Accept-Encoding" => scalar HTTP::Message::decodable()); - $ua->default_header("Accept-Charset" => "utf-8"); - $ua->default_header("Accept-Language" => "en-US"); - $ua->default_header("User-Agent" => $cfg->{agent}); - - my $headers = $ua->default_headers; - for (sort keys %$headers) { - my $header = trunc_line($headers->{$_}, length($_) + 18); - print "info: new_ua: $_ => $header\n" if ($verbose); - } - - return $ua; -} - -sub get_log -{ - my $log_path = shift || return undef; - my $verbose = shift || 0; - - # if $log_path has a / in it, make sure the path to it is made - make_path(substr($log_path, 0, rindex($log_path, '/')), { verbose => $verbose }); - - print "info: get_log: opening $log_path, append mode\n" if ($verbose); - open my $log, ">>", $log_path or die "can't open $log_path: $!"; - - if ($verbose) { - print "info: get_log: outputting to tee\n"; - open my $std_out, '>&', STDOUT or die "$!"; - - return IO::Tee->new($log, $std_out); - } - - return IO::Tee->new($log); -} - -# -# make a possibly long line fit on a single line, with ellipses -# -sub trunc_line -{ - my $line = shift || return undef; - my $prefix = shift || 0; - - # if stdout is not a tty, it's likely a log file, output everything - return $line unless (POSIX::isatty(STDOUT)); - - my ($term_width) = Term::ReadKey::GetTerminalSize(); - my $len = $term_width - $prefix - 3; - if (length($line) < $len) { - return $line; - } - - my $chopped = substr($line, 0, $len); - return $chopped . "..."; -} - -my $state = 0; -sub spin -{ - my @spin_states = ("-", "\\", "|", "/"); - - print "\b"; - print $spin_states[++$state % 4]; -} - -sub create_tables -{ - my $dbh = shift; - - $dbh->do(qq{ - create table if not exists products( - manufacturer text not null, - part_num text not null, - retailer text not null, - type text, - first_seen int, - last_seen int, - last_scraped int, - svg_stale int default 1, - primary key(manufacturer, part_num)) - }) or die $DBI::errstr; - - $dbh->do(qq{ - create table if not exists descriptions( - manufacturer text not null, - part_num text not null, - retailer text not null, - description text not null, - date int not null, - primary key(manufacturer, part_num, retailer, description), - foreign key(manufacturer, part_num) references - products(manufacturer, part_num)) - }) or die $DBI::errstr; - - $dbh->do(qq{ - create table if not exists retailers( - name text not null primary key, - color text not null, - url text not null) - }) or die $DBI::errstr; - - $dbh->do(qq{ - create table if not exists prices( - date int not null, - manufacturer text not null, - part_num text not null, - retailer text not null, - price int not null, - duration int, - primary key(date, part_num, retailer, price), - foreign key(manufacturer, part_num) references products(manufacturer, part_num), - foreign key(retailer) references retailers(name)) - }) or die $DBI::errstr; - - # $dbh->do("create table if not exists scrapes"); -} - -1; diff --git a/etc/pricesloth.cfg b/etc/pricesloth.cfg @@ -15,108 +15,3 @@ gid = daemon chroot = /var/www htdocs = /var/www/htdocs/pricesloth socket = /var/www/run/search.sock - - -*** retailers *** - -+ Memory Express -# -# On sale: -# <div class="PIV_BotPrices"> -# <div class="PIV_PriceRegular">Reg: <span>$359.99</span></div> -# <div class="PIV_PriceSale"> -# $279.99 -# </div> -# </div> -# -# Regular price: -# <div class="PIV_BotPrices"> -# <div class="PIV_Price"> -# <span>$359.99</span> -# </div> -# </div> -# -color = 56B849 -url = http://www.memoryexpress.com/Search/Products?Search= -title = .ProductTitle -reg_tag = .PIV_Price -sale_tag = .PIV_PriceSale - - -+ Visions Electronics -# -# <td class="price"> -# <span id="ctl00_..." class="regPrice">Price: <span>$509.99</span></span> -# <span id="ctl00_..." class="salePrice">Sale Price: $336.00</span> -# </td> -# -# price is a unique class when only a single product is returned and -# can be used to make sure only a single product has been returned. -# Products that are on sale return both regPrice and salePrice classes -# while regularly priced productes only return the regPrice class. -# -color = 000 -url = http://www.visions.ca/catalogue/category/ProductResults.aspx?searchText= -title = .plProductName -reg_tag = .price -sale_tag = .salePrice -# type_includes = televisions - - -+ London Drugs -color = 005DAB -url = http://www.londondrugs.com/on/demandware.store/Sites-LondonDrugs-Site/default/Search-Show?q= -title = .productname -reg_tag = .pricing -# not sure about the below -# reg_price = .standardprice -# sale_price = .salesprice - - -# + Amazon -# color = FFA51D -# url = http://www.amazon.ca/s/keywords= -# title = .newaps -# reg_tag = .price - - -+ Best Buy -color = 003B64 -url = http://www.bestbuy.ca/Search/SearchResults.aspx?query= -reg_tag = .prodprice -# title = .product-title, .prod-title -# sale_tag = .price-onsale - - -# + RadioShack -# color = E76453 -# search_uri = http://www.radioshack.com/search/controller.jsp?kw= -# title = .title -# price_context = .product-price-tag -# reg_price = .price - -# + Walmart -# color = 0000FF -# url = http://www.walmart.ca/search/ -# title = .title -# reg_tag = .price-current - -# this one has a table layout with no id= tags, making scraping impossible with -# the current technique -# + NCIX -# color = -# url = http://search.ncix.com/search/?q= -# reg_tag = - -+ Newegg -color = F8A42A -url = http://www.newegg.ca/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description= -title = .itemDescription -reg_tag = .price-current - - -+ Tiger Direct -color = FED443 -url = http://www.tigerdirect.ca/applications/SearchTools/search.asp?keywords= -title = .itemName -reg_tag = .salePrice diff --git a/t/memory_express.t b/t/memory_express.t @@ -0,0 +1,97 @@ +use strict; +use PS::UserAgent; +use PS::MemoryExpress; +use Log::Log4perl qw(:easy); +use Test; + +BEGIN { plan tests => 20 } + +Log::Log4perl->easy_init($INFO); + +my $ua = PS::UserAgent->new(); +my $me = PS::MemoryExpress->new(); + +# Search for a Seagate hard drive I know about +# +my $search_url = $me->create_search("ST8000AS0002"); +my $resp = $ua->get_dom($search_url); +ok($resp->is_success); + +# Check the returned URI is the product page directly +my $uri = $resp->base; +ok($uri =~ /.*\/Products\/.*/); + +my $dom = HTML::Grabber->new( html => $resp->decoded_content ); +ok($dom); + +# Product part number is inside of this div id +my $product_add = $dom->find("#ProductAdd")->text(); +my ($part_num) = ($product_add =~ m/Part #:\s*(.*)\r/); +ok($part_num, "ST8000AS0002"); + +# We know we're on the product page +my $grand_total_tag = $dom->find(".GrandTotal")->text(); +# ->text() doesn't trim all the garbage whitespace +$grand_total_tag =~ s/^\s+//; +$grand_total_tag =~ s/\s+$//; + +# Final massaging, remove "Only" text that's right beside the price +my ($price, @others) = ($grand_total_tag =~ m/(\d[\d,]+.\d\d)/); +ok($price); +ok(@others == 0); + +# Remove any commas we may have matched earlier +$price =~ s/,//; + +ok($price > 0.0); +ok($price < 10000.0); + + +# Search for something I know has multiple results +# +my $search_url = $me->create_search("ST8000"); +$resp = $ua->get_dom($search_url); +ok($resp->is_success); + +# The returned URI should have been the search results page +$uri = $resp->base; +ok($uri =~ /.*\/Search\/.*/); + +my $dom = HTML::Grabber->new( html => $resp->decoded_content ); +ok($dom); + +# There's two of these tags, one at the top of the page and one at the bottom +my ($ajax_list_pager) = $dom->find('.AJAX_List_Pager')->text_array(); +ok($ajax_list_pager); + +# Match multiple lines and replace multiple times +$ajax_list_pager =~ s/\r\n//mg; +ok($ajax_list_pager, "/1/"); + +# Searching for the above product yields two results +my ($first_result, @other) = $dom->find('.PIV_Regular')->html_array(); +ok(@other, 1); + +my $thumb = HTML::Grabber->new( html => $first_result ); +my $product_id = $thumb->find(".ProductId")->text(); +ok($product_id); + +my $product_url = "http://www.memoryexpress.com/Products/" . $product_id; +$resp = $ua->get_dom($product_url); +ok($resp->is_success); + + +# Search for something that returns 0 results +# +my $search_url = $me->create_search("some nonexistent product here"); +$resp = $ua->get_dom($search_url); +ok($resp->is_success); + +$uri = $resp->base; +ok($uri =~ /.*\/Search\/.*/); + +my $dom = HTML::Grabber->new( html => $resp->decoded_content ); +ok($dom); + +# Check we get the no results found error +ok($dom->text, "/We're sorry, but there are no products with the specified search parameters./");