pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 5bc28dcc231fd48c15ccd5c320262f6316eb41c7
parent 38b01c17db70af16c70b9b75000b302494d3c18f
Author: kyle <kyle@getaddrinfo.net>
Date:   Sun,  6 Mar 2016 15:06:15 -0700

add london drugs test and scraper

Diffstat:
APS/LondonDrugs.pm | 147+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
At/london_drugs.t | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 209 insertions(+), 0 deletions(-)

diff --git a/PS/LondonDrugs.pm b/PS/LondonDrugs.pm @@ -0,0 +1,147 @@ +package PS::LondonDrugs; +use strict; + +use HTML::Grabber; +use Log::Log4perl qw(:easy); +use URI::Escape; + +use PS::Database; +use PS::UserAgent; + +my $logger = get_logger('pricesloth.london_drugs'); + +sub new { + my ($class) = @_; + + my $self = { + color => "#005DAB", + url => "http://www.londondrugs.com/on/demandware.store/Sites-LondonDrugs-Site/default/Search-Show?q=", + title => ".productname", + reg_tag => ".pricing", + ua => PS::UserAgent->new(), + db => PS::Database->new() + }; + + bless ($self, $class); + $logger->debug("new(): success"); + + # XXX: make sure row in retailer table is created + + return $self; +} + +sub create_search { + my ($self, $manufacturer, $part_num) = @_; + + # London drugs search looks like it work well when both manufacturer and + # part number are given. + return $self->{url} . uri_escape("$manufacturer $part_num"); +} + +sub scrape_part_num { + my ($self, $resp) = @_; + my $dom = HTML::Grabber->new( html => $resp->decoded_content ); + + my ($title) = $dom->find(".productname")->text_array(); + my ($part_num) = ($title =~ m/.* - (.*)\r/); + return $part_num; +} + +sub scrape_description { + my ($self, $resp) = @_; + my $dom = HTML::Grabber->new( html => $resp->decoded_content ); + + my ($title) = $dom->find(".productname")->text_array(); + my ($descr) = ($title =~ m/^\s+(.*) - .*\r/); + return $descr; +} + +sub scrape_price { + my ($self, $resp) = @_; + my $dom = HTML::Grabber->new( html => $resp->decoded_content ); + + # There are many .salesprice tags on the page but only one is inside of + # .productpricing which is the main product on the page. + my $price_container = $dom->find(".productpricing .salesprice")->text(); + $price_container =~ s/^\s+//; + $price_container =~ s/\s+$//; + + # Try and match a dollars dot cents format with leeway for comma + # separated digits. + my ($price, @others) = ($price_container =~ m/(\d[\d,]+.\d\d)/); + $logger->warn("memexp: found more than 1 price") if (@others); + + # Remove any commas we may have matched earlier + $price =~ s/,//; + + return ($price, @others); +} + +sub find_product_page { + my ($self, $resp) = @_; + my $ua = $self->{ua}; + + my $search_url = $self->{url}; + # The search url has "//" characters that need to be escaped before + # being used in regular expressions + $search_url = quotemeta $search_url; + + my $uri = $resp->base; + if ($uri =~ /http:\/\/www.londondrugs.com\/.*\.html/) { + # We landed on the product page directly, great. + return ($resp); + } + elsif ($uri =~ /$search_url/) { + # We landed on the search page. + my $dom = HTML::Grabber->new( html => $resp->decoded_content ); + + my ($first_result, @others) = $dom->find(".productlisting .product")->html_array(); + return unless ($first_result); + + my $num_total = scalar (@others) + 1; + $logger->debug("find_product_page(): found $num_total thumbnails"); + + # For every thumbnail there is a div with class="name" with a + # link to the product page inside + my $thumb_dom = HTML::Grabber->new( html => $first_result ); + my $product_url = $thumb_dom->find(".name a")->attr('href'); + + $resp = $ua->get_dom($product_url); + return unless $resp->is_success; + + return ($resp, @others); + } + else { + $logger->error("find_product_page(): unexpected search URI '$uri'"); + return; + } +} + +sub scrape { + my ($self, $manufacturer, $part_num) = @_; + my $ua = $self->{ua}; + + my $search = $self->create_search($manufacturer, $part_num); + my $resp = $ua->get_dom($search); + return unless ($resp->is_success); + + # Searching can sometimes take you to different places + ($resp) = $self->find_product_page($resp); + return unless ($resp); + + # my $part_num = $self->scrape_part_num($resp); + my ($price) = $self->scrape_price($resp); + my $desc = $self->scrape_description($resp); + + my $sql = qq{insert into prices(date, manufacturer, part_num, retailer, + price, duration) values (?, ?, ?, ?, ?, ?)}; + my $dbh = $self->{db}->{dbh}; + my $prices_sth = $dbh->prepare($sql); + + $dbh->begin_work; + $prices_sth->execute(time, $manufacturer, $part_num, "London Drugs", $price, 100); + $dbh->commit; + + $logger->debug("scrape_price(): added price \$$price\n"); + return $price; +} diff --git a/t/london_drugs.t b/t/london_drugs.t @@ -0,0 +1,62 @@ +use strict; +use PS::UserAgent; +use PS::LondonDrugs; +use Log::Log4perl qw(:easy); +use Test; + +BEGIN { plan tests => 14 } + +Log::Log4perl->easy_init($INFO); + +my $ua = PS::UserAgent->new(); +my $ld = PS::LondonDrugs->new(); + +# +# Search for a Samsung television I know they have +my $search_url = $ld->create_search("Samsung", "UN55JS9000"); +my $resp = $ua->get_dom($search_url); +ok($resp->is_success); + +# Check that the object is working +my ($obj_resp) = $ld->find_product_page($resp); +ok($obj_resp->base, $resp->base); + +# Make sure the part number we scrape is correct +my $part_num = $ld->scrape_part_num($resp); +ok($part_num, "UN55JS9000"); + +# Make sure the price we scrape is at least close to correct +my ($price, @others) = $ld->scrape_price($resp); +ok($price); +ok(@others == 0); +ok($price > 2000.0); +ok($price < 3000.0); + +my $descr = $ld->scrape_description($resp); +ok($descr, "Samsung 55\" JS9000 Series SUHD 4K Curved Smart TV"); + +# +# Search for something that returns multiple results +my $search_url = $ld->create_search("Samsung", "UN55"); +my $resp = $ua->get_dom($search_url); +ok($resp->is_success); + +# Searching for the above product yields multiple results. +my ($obj_resp, @others) = $ld->find_product_page($resp); +ok(@others, 6); +ok($obj_resp->is_success); + +# +# Search for something non existent +my $search_url = $ld->create_search("", "some product that for sure doesnot exist"); +$resp = $ua->get_dom($search_url); +ok($resp->is_success); + +#ok($resp->base =~ /.*\/Search\/.*/); + +my ($obj_resp) = $ld->find_product_page($resp); +ok( !defined $obj_resp ); + +# Check we get the no results found error +ok( $resp->decoded_content, + "/We're sorry, no products were found for your search/");