commit 5bc28dcc231fd48c15ccd5c320262f6316eb41c7
parent 38b01c17db70af16c70b9b75000b302494d3c18f
Author: kyle <kyle@getaddrinfo.net>
Date: Sun, 6 Mar 2016 15:06:15 -0700
add london drugs test and scraper
Diffstat:
A | PS/LondonDrugs.pm | | | 147 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | t/london_drugs.t | | | 62 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 209 insertions(+), 0 deletions(-)
diff --git a/PS/LondonDrugs.pm b/PS/LondonDrugs.pm
@@ -0,0 +1,147 @@
+package PS::LondonDrugs;
+use strict;
+
+use HTML::Grabber;
+use Log::Log4perl qw(:easy);
+use URI::Escape;
+
+use PS::Database;
+use PS::UserAgent;
+
+my $logger = get_logger('pricesloth.london_drugs');
+
+sub new {
+ my ($class) = @_;
+
+ my $self = {
+ color => "#005DAB",
+ url => "http://www.londondrugs.com/on/demandware.store/Sites-LondonDrugs-Site/default/Search-Show?q=",
+ title => ".productname",
+ reg_tag => ".pricing",
+ ua => PS::UserAgent->new(),
+ db => PS::Database->new()
+ };
+
+ bless ($self, $class);
+ $logger->debug("new(): success");
+
+ # XXX: make sure row in retailer table is created
+
+ return $self;
+}
+
+sub create_search {
+ my ($self, $manufacturer, $part_num) = @_;
+
+ # London drugs search looks like it work well when both manufacturer and
+ # part number are given.
+ return $self->{url} . uri_escape("$manufacturer $part_num");
+}
+
+sub scrape_part_num {
+ my ($self, $resp) = @_;
+ my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+ my ($title) = $dom->find(".productname")->text_array();
+ my ($part_num) = ($title =~ m/.* - (.*)\r/);
+ return $part_num;
+}
+
+sub scrape_description {
+ my ($self, $resp) = @_;
+ my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+ my ($title) = $dom->find(".productname")->text_array();
+ my ($descr) = ($title =~ m/^\s+(.*) - .*\r/);
+ return $descr;
+}
+
+sub scrape_price {
+ my ($self, $resp) = @_;
+ my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+ # There are many .salesprice tags on the page but only one is inside of
+ # .productpricing which is the main product on the page.
+ my $price_container = $dom->find(".productpricing .salesprice")->text();
+ $price_container =~ s/^\s+//;
+ $price_container =~ s/\s+$//;
+
+ # Try and match a dollars dot cents format with leeway for comma
+ # separated digits.
+ my ($price, @others) = ($price_container =~ m/(\d[\d,]+.\d\d)/);
+ $logger->warn("memexp: found more than 1 price") if (@others);
+
+ # Remove any commas we may have matched earlier
+ $price =~ s/,//;
+
+ return ($price, @others);
+}
+
+sub find_product_page {
+ my ($self, $resp) = @_;
+ my $ua = $self->{ua};
+
+ my $search_url = $self->{url};
+ # The search url has "//" characters that need to be escaped before
+ # being used in regular expressions
+ $search_url = quotemeta $search_url;
+
+ my $uri = $resp->base;
+ if ($uri =~ /http:\/\/www.londondrugs.com\/.*\.html/) {
+ # We landed on the product page directly, great.
+ return ($resp);
+ }
+ elsif ($uri =~ /$search_url/) {
+ # We landed on the search page.
+ my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+ my ($first_result, @others) = $dom->find(".productlisting .product")->html_array();
+ return unless ($first_result);
+
+ my $num_total = scalar (@others) + 1;
+ $logger->debug("find_product_page(): found $num_total thumbnails");
+
+ # For every thumbnail there is a div with class="name" with a
+ # link to the product page inside
+ my $thumb_dom = HTML::Grabber->new( html => $first_result );
+ my $product_url = $thumb_dom->find(".name a")->attr('href');
+
+ $resp = $ua->get_dom($product_url);
+ return unless $resp->is_success;
+
+ return ($resp, @others);
+ }
+ else {
+ $logger->error("find_product_page(): unexpected search URI '$uri'");
+ return;
+ }
+}
+
+sub scrape {
+ my ($self, $manufacturer, $part_num) = @_;
+ my $ua = $self->{ua};
+
+ my $search = $self->create_search($manufacturer, $part_num);
+ my $resp = $ua->get_dom($search);
+ return unless ($resp->is_success);
+
+ # Searching can sometimes take you to different places
+ ($resp) = $self->find_product_page($resp);
+ return unless ($resp);
+
+ # my $part_num = $self->scrape_part_num($resp);
+ my ($price) = $self->scrape_price($resp);
+ my $desc = $self->scrape_description($resp);
+
+ my $sql = qq{insert into prices(date, manufacturer, part_num, retailer,
+ price, duration) values (?, ?, ?, ?, ?, ?)};
+ my $dbh = $self->{db}->{dbh};
+ my $prices_sth = $dbh->prepare($sql);
+
+ $dbh->begin_work;
+ $prices_sth->execute(time, $manufacturer, $part_num, "London Drugs", $price, 100);
+ $dbh->commit;
+
+ $logger->debug("scrape_price(): added price \$$price\n");
+ return $price;
+}
diff --git a/t/london_drugs.t b/t/london_drugs.t
@@ -0,0 +1,62 @@
+use strict;
+use PS::UserAgent;
+use PS::LondonDrugs;
+use Log::Log4perl qw(:easy);
+use Test;
+
+BEGIN { plan tests => 14 }
+
+Log::Log4perl->easy_init($INFO);
+
+my $ua = PS::UserAgent->new();
+my $ld = PS::LondonDrugs->new();
+
+#
+# Search for a Samsung television I know they have
+my $search_url = $ld->create_search("Samsung", "UN55JS9000");
+my $resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# Check that the object is working
+my ($obj_resp) = $ld->find_product_page($resp);
+ok($obj_resp->base, $resp->base);
+
+# Make sure the part number we scrape is correct
+my $part_num = $ld->scrape_part_num($resp);
+ok($part_num, "UN55JS9000");
+
+# Make sure the price we scrape is at least close to correct
+my ($price, @others) = $ld->scrape_price($resp);
+ok($price);
+ok(@others == 0);
+ok($price > 2000.0);
+ok($price < 3000.0);
+
+my $descr = $ld->scrape_description($resp);
+ok($descr, "Samsung 55\" JS9000 Series SUHD 4K Curved Smart TV");
+
+#
+# Search for something that returns multiple results
+my $search_url = $ld->create_search("Samsung", "UN55");
+my $resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# Searching for the above product yields multiple results.
+my ($obj_resp, @others) = $ld->find_product_page($resp);
+ok(@others, 6);
+ok($obj_resp->is_success);
+
+#
+# Search for something non existent
+my $search_url = $ld->create_search("", "some product that for sure doesnot exist");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+#ok($resp->base =~ /.*\/Search\/.*/);
+
+my ($obj_resp) = $ld->find_product_page($resp);
+ok( !defined $obj_resp );
+
+# Check we get the no results found error
+ok( $resp->decoded_content,
+ "/We're sorry, no products were found for your search/");