commit f8a16f2f99fb89386e8c78fa3ca99006dbf769f2
parent 4f6a895117ede80527331e35edbffdff3ee24390
Author: kyle <kyle@getaddrinfo.net>
Date: Wed, 2 Mar 2016 20:15:18 -0700
start transitioning to a module based layout
- make database, config and user agent their own classes
- start making each vendor their own class too
- add memory express price scrape test
Diffstat:
A | PS/Config.pm | | | 47 | +++++++++++++++++++++++++++++++++++++++++++++++ |
A | PS/Database.pm | | | 90 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | PS/MemoryExpress.pm | | | 115 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | PS/UserAgent.pm | | | 55 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
D | PriceSloth.pm | | | 233 | ------------------------------------------------------------------------------- |
M | etc/pricesloth.cfg | | | 105 | ------------------------------------------------------------------------------- |
A | t/memory_express.t | | | 97 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
7 files changed, 404 insertions(+), 338 deletions(-)
diff --git a/PS/Config.pm b/PS/Config.pm
@@ -0,0 +1,47 @@
+package PS::Config;
+
+use Config::Grammar;
+
+sub new {
+ my $class = shift;
+ my $self = {};
+ bless ($self, $class);
+
+ my $parser = Config::Grammar->new({
+ _sections => [ "general", "http" ],
+ general => {
+ _vars => [
+ "agent",
+ "email",
+ "smtp",
+ # XXX: add simple regex validation here
+ "addrs",
+ "db_dir",
+ "log_dir",
+ ],
+ },
+ http => {
+ _vars => [
+ "socket",
+ "uid",
+ "gid",
+ "chroot",
+ "htdocs",
+ ],
+ },
+ });
+
+ my $cfg_file = "/etc/pricesloth.cfg";
+ if (-e "etc/pricesloth.cfg") {
+ $cfg_file = "etc/pricesloth.cfg";
+ }
+ $self->{cfg} = $parser->parse($cfg_file) or die "error: $parser->{err}\n";
+
+ return $self;
+}
+
+sub get_cfg {
+ return $self->{cfg};
+}
+
+1;
diff --git a/PS/Database.pm b/PS/Database.pm
@@ -0,0 +1,90 @@
+package PS::Database;
+
+use DBI;
+use Log::Log4perl qw(:easy);
+use File::Path qw(make_path);
+use POSIX;
+
+use PS::Config;
+
+my $logger = get_logger('pricesloth.database');
+
+sub new {
+ my $class = shift;
+
+ my $cfg = PS::Config->new();
+ my $db_dir = $cfg->{cfg}->{general}{db_dir};
+
+ my $self = {};
+ bless ($self, $class);
+
+ make_path($db_dir);
+ my $dbh = DBI->connect(
+ "dbi:SQLite:dbname=$db_dir/db",
+ "",
+ "",
+ { RaiseError => 1 }
+ ) or die $DBI::errstr;
+
+ $self->{dbh} = $dbh;
+ $dbh->do("PRAGMA foreign_keys = ON");
+ create_tables($dbh);
+
+ $dbh->{AutoCommit} = 1;
+
+ $logger->debug("opened $db_dir/db\n");
+ return $self;
+}
+
+sub create_tables {
+ my ($dbh) = @_;
+
+ $dbh->do(qq{
+ create table if not exists products(
+ manufacturer text not null,
+ part_num text not null,
+ retailer text not null,
+ type text,
+ first_seen int,
+ last_seen int,
+ last_scraped int,
+ svg_stale int default 1,
+ primary key(manufacturer, part_num))
+ }) or die $DBI::errstr;
+
+ $dbh->do(qq{
+ create table if not exists descriptions(
+ manufacturer text not null,
+ part_num text not null,
+ retailer text not null,
+ description text not null,
+ date int not null,
+ primary key(manufacturer, part_num, retailer, description),
+ foreign key(manufacturer, part_num) references
+ products(manufacturer, part_num))
+ }) or die $DBI::errstr;
+
+ $dbh->do(qq{
+ create table if not exists retailers(
+ name text not null primary key,
+ color text not null,
+ url text not null)
+ }) or die $DBI::errstr;
+
+ $dbh->do(qq{
+ create table if not exists prices(
+ date int not null,
+ manufacturer text not null,
+ part_num text not null,
+ retailer text not null,
+ price int not null,
+ duration int,
+ primary key(date, part_num, retailer, price),
+ foreign key(manufacturer, part_num) references products(manufacturer, part_num),
+ foreign key(retailer) references retailers(name))
+ }) or die $DBI::errstr;
+
+ # $dbh->do("create table if not exists scrapes");
+}
+
+1;
diff --git a/PS/MemoryExpress.pm b/PS/MemoryExpress.pm
@@ -0,0 +1,115 @@
+package PS::MemoryExpress;
+use strict;
+
+use HTML::Grabber;
+use Log::Log4perl qw(:easy);
+use URI::Escape;
+
+use PS::Database;
+use PS::UserAgent;
+
+my $logger = Log::Log4perl::get_logger('pricesloth.memory_express');
+
+
+# On sale:
+# <div class="PIV_BotPrices">
+# <div class="PIV_PriceRegular">Reg: <span>$359.99</span></div>
+# <div class="PIV_PriceSale">
+# $279.99
+# </div>
+# </div>
+#
+# Regular price:
+# <div class="PIV_BotPrices">
+# <div class="PIV_Price">
+# <span>$359.99</span>
+# </div>
+# </div>
+
+sub new {
+ my ($class) = @_;
+
+ my $self = {
+ color => "#56B849",
+ url => "http://www.memoryexpress.com/Search/Products?Search=",
+ title => ".ProductTitle",
+ reg_tag => ".PIV_Price",
+ sale_tag => ".PIV_PriceSale",
+ ua => PS::UserAgent->new(),
+ db => PS::Database->new()
+ };
+
+ bless ($self, $class);
+ $logger->debug("new(): success");
+
+ # XXX: make sure row in retailer table is created
+
+ return $self;
+}
+
+# Creates the URL search string.
+sub create_search {
+ my ($self, $part_num) = @_;
+
+ # As learned in the Seagate ST8000AS0002 case searching for manufacturer
+ # concatenated to part num will hide valid search results.
+ # Instead search only for part number. We'll have to deal with thumbnail
+ # view return vs a full page product.
+
+ return $self->{url} . uri_escape($part_num);
+}
+
+sub find_price {
+ my ($self, $srch_results) = @_;
+
+ my @prices = $srch_results->find($self->{reg_tag})->text_array();
+ if (@prices == 0) {
+ $logger->debug("get_price(): no prices found");
+ return;
+ }
+
+ my ($price, @others) = ($prices[0] =~ m/(\d[\d,]+)/);
+ if (! defined $price) {
+ $logger->warn("get_price(): found price containers but they contained no numeric price");
+ return;
+ }
+ if (@others) {
+ $logger->warn("get_price(): price container had more than 1 price");
+ return;
+ }
+
+ $price =~ s/,//;
+
+ if ($price <= 0 || $price > 10000) {
+ $logger->warn("get_price(): price '$price' out of range");
+ return;
+ }
+
+ return $price;
+}
+
+sub scrape_price {
+ my ($self, $manufacturer, $part_num) = @_;
+ my $ua = $self->{ua};
+
+ my $search = $self->create_search($part_num);
+ return unless ($search);
+
+ my $srch_results = $ua->get_dom($search);
+ return unless ($srch_results);
+
+ my $price = $self->find_price($srch_results);
+ return unless ($price);
+
+ my $sql = qq{insert into prices(date, manufacturer, part_num, retailer,
+ price, duration) values (?, ?, ?, ?, ?, ?)};
+ my $dbh = $self->{db}->{dbh};
+ my $prices_sth = $dbh->prepare($sql);
+
+ $dbh->begin_work;
+ $prices_sth->execute(time, $manufacturer, $part_num, "Memory Express", $price, 99);
+ $dbh->commit;
+
+ $logger->debug("scrape_price(): added price \$$price\n");
+ return $price;
+}
diff --git a/PS/UserAgent.pm b/PS/UserAgent.pm
@@ -0,0 +1,55 @@
+package PS::UserAgent;
+
+use LWP::UserAgent;
+use Log::Log4perl qw(:easy);
+use File::Path qw(make_path);
+
+use PS::Config;
+
+my $logger = Log::Log4perl->get_logger('pricesloth.useragent');
+
+sub new {
+ my $class = shift;
+ my $self = {};
+ bless ($self, $class);
+
+ my $config = PS::Config->new();
+ my $cfg = $config->{cfg}->{general};
+
+ # it's optional to list ip addresses to scrape on
+ my $ua;
+ if ($cfg->{addrs}) {
+ my @addresses = split(" ", $cfg->{addrs});
+ my $addr = $addresses[rand @addresses];
+ $logger->info("new_ua: using ip $addr\n");
+ $ua = LWP::UserAgent->new(local_address => $addr);
+ }
+ else {
+ $ua = LWP::UserAgent->new();
+ }
+
+ $ua->default_header("Accept" => "*/*");
+ $ua->default_header("Accept-Encoding" => scalar HTTP::Message::decodable());
+ $ua->default_header("Accept-Charset" => "utf-8");
+ $ua->default_header("Accept-Language" => "en-US");
+ $ua->default_header("User-Agent" => $cfg->{agent});
+
+ $self->{ua} = $ua;
+ return $self;
+}
+
+sub get_dom {
+ my ($self, $url) = @_;
+ my $ua = $self->{ua};
+
+ my $resp = $ua->get($url);
+ if ($resp->is_success) {
+ $logger->debug("get_dom: " . $resp->status_line . " $url\n");
+ return $resp;
+ }
+
+ $logger->error("get_dom: " . $resp->status_line . " $url\n");
+ return;
+}
+
+1;
diff --git a/PriceSloth.pm b/PriceSloth.pm
@@ -1,233 +0,0 @@
-package PriceSloth;
-
-use DBI;
-use File::Path qw(make_path);
-use Exporter;
-
-@ISA = ("Exporter");
-@EXPORT = qw(get_config get_dom get_log get_dbh trunc_line new_ua make_path spin);
-
-
-sub get_config
-{
- my $parser = Config::Grammar->new({
- _sections => ["general", "http", "retailers"],
- general => {
- _vars => [
- "agent",
- "email",
- "smtp",
- # XXX: add simple regex validation here
- "addrs",
- "db_dir",
- "log_dir",
- ],
- },
- http => {
- _vars => [
- "socket",
- "uid",
- "gid",
- "chroot",
- "htdocs",
- ],
- },
- retailers => {
- _sections => ["/[A-Za-z ]+/"],
- "/[A-Za-z ]+/" => {
- _vars => [
- "url",
- "reg_tag",
- "sale_tag",
- "color",
- "title"
- ]
- }
- }
- });
-
- my $cfg_file = "/etc/pricesloth.cfg";
- if (-e "etc/pricesloth.cfg") {
- $cfg_file = "etc/pricesloth.cfg";
- }
- my $cfg = $parser->parse($cfg_file) or die "error: $parser->{err}\n";
-
- return $cfg;
-}
-
-sub get_dbh
-{
- my $db_dir = shift;
- my $verbose = shift || undef;
-
- make_path($db_dir, { verbose => $verbose });
- my $dbh = DBI->connect(
- "dbi:SQLite:dbname=$db_dir/db",
- "",
- "",
- { RaiseError => 1 }
- ) or die $DBI::errstr;
-
- $dbh->do("PRAGMA foreign_keys = ON");
- create_tables($dbh);
-
- $dbh->{AutoCommit} = 1;
-
- print "info: opened $db_dir/db\n" if ($verbose);
- return $dbh;
-}
-
-sub get_dom
-{
- my $url = shift;
- my $ua = shift;
- my $verbose = shift;
- my $log = shift;
-
- my $resp = $ua->get($url);
- if ($resp->is_success) {
- my $short_url = trunc_line($url, length($resp->status_line) + 16);
- print "info: get_dom: " . $resp->status_line . " $short_url\n" if ($verbose);
- return HTML::Grabber->new(html => $resp->decoded_content);
- }
-
- print $log "error: get_dom: " . $resp->status_line . " $url\n";
- return undef;
-}
-
-sub new_ua
-{
- my $cfg = shift;
- my $verbose = shift || 0;
- my $ua;
-
- # it's optional to list ip addresses to scrape on
- if ($cfg->{addrs}) {
- my @addresses = split(" ", $cfg->{addrs});
- my $addr = $addresses[rand @addresses];
- print "info: new_ua: using ip $addr\n" if ($verbose);
- $ua = LWP::UserAgent->new(local_address => $addr);
- }
- else {
- $ua = LWP::UserAgent->new();
- }
-
- $ua->default_header("Accept" => "*/*");
- $ua->default_header("Accept-Encoding" => scalar HTTP::Message::decodable());
- $ua->default_header("Accept-Charset" => "utf-8");
- $ua->default_header("Accept-Language" => "en-US");
- $ua->default_header("User-Agent" => $cfg->{agent});
-
- my $headers = $ua->default_headers;
- for (sort keys %$headers) {
- my $header = trunc_line($headers->{$_}, length($_) + 18);
- print "info: new_ua: $_ => $header\n" if ($verbose);
- }
-
- return $ua;
-}
-
-sub get_log
-{
- my $log_path = shift || return undef;
- my $verbose = shift || 0;
-
- # if $log_path has a / in it, make sure the path to it is made
- make_path(substr($log_path, 0, rindex($log_path, '/')), { verbose => $verbose });
-
- print "info: get_log: opening $log_path, append mode\n" if ($verbose);
- open my $log, ">>", $log_path or die "can't open $log_path: $!";
-
- if ($verbose) {
- print "info: get_log: outputting to tee\n";
- open my $std_out, '>&', STDOUT or die "$!";
-
- return IO::Tee->new($log, $std_out);
- }
-
- return IO::Tee->new($log);
-}
-
-#
-# make a possibly long line fit on a single line, with ellipses
-#
-sub trunc_line
-{
- my $line = shift || return undef;
- my $prefix = shift || 0;
-
- # if stdout is not a tty, it's likely a log file, output everything
- return $line unless (POSIX::isatty(STDOUT));
-
- my ($term_width) = Term::ReadKey::GetTerminalSize();
- my $len = $term_width - $prefix - 3;
- if (length($line) < $len) {
- return $line;
- }
-
- my $chopped = substr($line, 0, $len);
- return $chopped . "...";
-}
-
-my $state = 0;
-sub spin
-{
- my @spin_states = ("-", "\\", "|", "/");
-
- print "\b";
- print $spin_states[++$state % 4];
-}
-
-sub create_tables
-{
- my $dbh = shift;
-
- $dbh->do(qq{
- create table if not exists products(
- manufacturer text not null,
- part_num text not null,
- retailer text not null,
- type text,
- first_seen int,
- last_seen int,
- last_scraped int,
- svg_stale int default 1,
- primary key(manufacturer, part_num))
- }) or die $DBI::errstr;
-
- $dbh->do(qq{
- create table if not exists descriptions(
- manufacturer text not null,
- part_num text not null,
- retailer text not null,
- description text not null,
- date int not null,
- primary key(manufacturer, part_num, retailer, description),
- foreign key(manufacturer, part_num) references
- products(manufacturer, part_num))
- }) or die $DBI::errstr;
-
- $dbh->do(qq{
- create table if not exists retailers(
- name text not null primary key,
- color text not null,
- url text not null)
- }) or die $DBI::errstr;
-
- $dbh->do(qq{
- create table if not exists prices(
- date int not null,
- manufacturer text not null,
- part_num text not null,
- retailer text not null,
- price int not null,
- duration int,
- primary key(date, part_num, retailer, price),
- foreign key(manufacturer, part_num) references products(manufacturer, part_num),
- foreign key(retailer) references retailers(name))
- }) or die $DBI::errstr;
-
- # $dbh->do("create table if not exists scrapes");
-}
-
-1;
diff --git a/etc/pricesloth.cfg b/etc/pricesloth.cfg
@@ -15,108 +15,3 @@ gid = daemon
chroot = /var/www
htdocs = /var/www/htdocs/pricesloth
socket = /var/www/run/search.sock
-
-
-*** retailers ***
-
-+ Memory Express
-#
-# On sale:
-# <div class="PIV_BotPrices">
-# <div class="PIV_PriceRegular">Reg: <span>$359.99</span></div>
-# <div class="PIV_PriceSale">
-# $279.99
-# </div>
-# </div>
-#
-# Regular price:
-# <div class="PIV_BotPrices">
-# <div class="PIV_Price">
-# <span>$359.99</span>
-# </div>
-# </div>
-#
-color = 56B849
-url = http://www.memoryexpress.com/Search/Products?Search=
-title = .ProductTitle
-reg_tag = .PIV_Price
-sale_tag = .PIV_PriceSale
-
-
-+ Visions Electronics
-#
-# <td class="price">
-# <span id="ctl00_..." class="regPrice">Price: <span>$509.99</span></span>
-# <span id="ctl00_..." class="salePrice">Sale Price: $336.00</span>
-# </td>
-#
-# price is a unique class when only a single product is returned and
-# can be used to make sure only a single product has been returned.
-# Products that are on sale return both regPrice and salePrice classes
-# while regularly priced productes only return the regPrice class.
-#
-color = 000
-url = http://www.visions.ca/catalogue/category/ProductResults.aspx?searchText=
-title = .plProductName
-reg_tag = .price
-sale_tag = .salePrice
-# type_includes = televisions
-
-
-+ London Drugs
-color = 005DAB
-url = http://www.londondrugs.com/on/demandware.store/Sites-LondonDrugs-Site/default/Search-Show?q=
-title = .productname
-reg_tag = .pricing
-# not sure about the below
-# reg_price = .standardprice
-# sale_price = .salesprice
-
-
-# + Amazon
-# color = FFA51D
-# url = http://www.amazon.ca/s/keywords=
-# title = .newaps
-# reg_tag = .price
-
-
-+ Best Buy
-color = 003B64
-url = http://www.bestbuy.ca/Search/SearchResults.aspx?query=
-reg_tag = .prodprice
-# title = .product-title, .prod-title
-# sale_tag = .price-onsale
-
-
-# + RadioShack
-# color = E76453
-# search_uri = http://www.radioshack.com/search/controller.jsp?kw=
-# title = .title
-# price_context = .product-price-tag
-# reg_price = .price
-
-# + Walmart
-# color = 0000FF
-# url = http://www.walmart.ca/search/
-# title = .title
-# reg_tag = .price-current
-
-# this one has a table layout with no id= tags, making scraping impossible with
-# the current technique
-# + NCIX
-# color =
-# url = http://search.ncix.com/search/?q=
-# reg_tag =
-
-+ Newegg
-color = F8A42A
-url = http://www.newegg.ca/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=
-title = .itemDescription
-reg_tag = .price-current
-
-
-+ Tiger Direct
-color = FED443
-url = http://www.tigerdirect.ca/applications/SearchTools/search.asp?keywords=
-title = .itemName
-reg_tag = .salePrice
diff --git a/t/memory_express.t b/t/memory_express.t
@@ -0,0 +1,97 @@
+use strict;
+use PS::UserAgent;
+use PS::MemoryExpress;
+use Log::Log4perl qw(:easy);
+use Test;
+
+BEGIN { plan tests => 20 }
+
+Log::Log4perl->easy_init($INFO);
+
+my $ua = PS::UserAgent->new();
+my $me = PS::MemoryExpress->new();
+
+# Search for a Seagate hard drive I know about
+#
+my $search_url = $me->create_search("ST8000AS0002");
+my $resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# Check the returned URI is the product page directly
+my $uri = $resp->base;
+ok($uri =~ /.*\/Products\/.*/);
+
+my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+ok($dom);
+
+# Product part number is inside of this div id
+my $product_add = $dom->find("#ProductAdd")->text();
+my ($part_num) = ($product_add =~ m/Part #:\s*(.*)\r/);
+ok($part_num, "ST8000AS0002");
+
+# We know we're on the product page
+my $grand_total_tag = $dom->find(".GrandTotal")->text();
+# ->text() doesn't trim all the garbage whitespace
+$grand_total_tag =~ s/^\s+//;
+$grand_total_tag =~ s/\s+$//;
+
+# Final massaging, remove "Only" text that's right beside the price
+my ($price, @others) = ($grand_total_tag =~ m/(\d[\d,]+.\d\d)/);
+ok($price);
+ok(@others == 0);
+
+# Remove any commas we may have matched earlier
+$price =~ s/,//;
+
+ok($price > 0.0);
+ok($price < 10000.0);
+
+
+# Search for something I know has multiple results
+#
+my $search_url = $me->create_search("ST8000");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# The returned URI should have been the search results page
+$uri = $resp->base;
+ok($uri =~ /.*\/Search\/.*/);
+
+my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+ok($dom);
+
+# There's two of these tags, one at the top of the page and one at the bottom
+my ($ajax_list_pager) = $dom->find('.AJAX_List_Pager')->text_array();
+ok($ajax_list_pager);
+
+# Match multiple lines and replace multiple times
+$ajax_list_pager =~ s/\r\n//mg;
+ok($ajax_list_pager, "/1/");
+
+# Searching for the above product yields two results
+my ($first_result, @other) = $dom->find('.PIV_Regular')->html_array();
+ok(@other, 1);
+
+my $thumb = HTML::Grabber->new( html => $first_result );
+my $product_id = $thumb->find(".ProductId")->text();
+ok($product_id);
+
+my $product_url = "http://www.memoryexpress.com/Products/" . $product_id;
+$resp = $ua->get_dom($product_url);
+ok($resp->is_success);
+
+
+# Search for something that returns 0 results
+#
+my $search_url = $me->create_search("some nonexistent product here");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+$uri = $resp->base;
+ok($uri =~ /.*\/Search\/.*/);
+
+my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+ok($dom);
+
+# Check we get the no results found error
+ok($dom->text, "/We're sorry, but there are no products with the specified search parameters./");