pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit d510acae6f7e33b8331115408dca696f100d7984
parent bf59160a974495ea6052259507b6451ad7e3d315
Author: kyle <kyle@getaddrinfo.net>
Date:   Sun,  6 Mar 2016 17:03:51 -0700

add best buy price scraper and test

Diffstat:
A PS/BestBuy.pm | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A t/best_buy.t | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 188 insertions(+), 0 deletions(-)

diff --git a/PS/BestBuy.pm b/PS/BestBuy.pm
@@ -0,0 +1,167 @@
+package PS::BestBuy;
+use strict;
+use warnings;
+
+use HTML::Grabber;
+use Log::Log4perl qw(:easy);
+use URI::Escape;
+
+use PS::Database;
+use PS::UserAgent;
+
+my $logger = get_logger('pricesloth.best_buy');
+
+# Construct a Best Buy scraper holding the search URL prefix, a
+# PS::UserAgent for fetching pages and a PS::Database for storing results.
+sub new {
+    my ($class) = @_;
+
+    my $self = {
+        color => "#003B64",
+        url   => "http://www.bestbuy.ca/Search/SearchResults.aspx?query=",
+        ua    => PS::UserAgent->new(),
+        db    => PS::Database->new()
+    };
+
+    bless ($self, $class);
+    $logger->debug("new(): success");
+
+    # XXX: make sure row in retailer table is created
+
+    return $self;
+}
+
+# Build the search URL for a manufacturer and part number; the two are
+# joined with a space and URI-escaped before being appended to the prefix.
+sub create_search {
+    my ($self, $manufacturer, $part_num) = @_;
+
+    return $self->{url} . uri_escape("$manufacturer $part_num");
+}
+
+# Return the text of the model number element of a product page response.
+sub scrape_part_num {
+    my ($self, $resp) = @_;
+    my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+    # Part number is inside this ridiculous tag. Seems to be page unique
+    # too.
+    my $part_num = $dom->find("#ctl00_CP_ctl00_PD_lblModelNumber")->text();
+    return $part_num;
+}
+
+# Return the product title with its trailing " (...)" group removed.
+# The result is undef when the title contains no " (...)" group.
+sub scrape_description {
+    my ($self, $resp) = @_;
+    my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+    my $title = $dom->find("#ctl00_CP_ctl00_PD_lblProductTitle")->text();
+    # Part number is at the end, regex that out
+    my ($descr) = ($title =~ /(.*) \(.+\)/);
+    return $descr;
+}
+
+# Return the price text of a product page response with surrounding
+# whitespace, the leading dollar sign and all commas stripped.
+sub scrape_price {
+    my ($self, $resp) = @_;
+    my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+    my $price = $dom->find(".price-wrapper .prodprice")->text();
+    $price =~ s/^\s+//;
+    $price =~ s/\s+$//;
+    # Remove dollar sign and any commas between digits
+    $price =~ s/^\$//;
+    # /g is required: without it only the first comma is removed, so
+    # "1,234,567.00" would become "1234,567.00".
+    $price =~ s/,//g;
+
+    return $price;
+}
+
+# Resolve a search response to a product page response.
+#
+# Returns ($resp) when the search landed straight on a product page,
+# ($product_resp, @other_results) when it landed on a results listing (the
+# first result is fetched; the remaining listing items are passed back as
+# returned by html_array()), and nothing when no usable page was found.
+sub find_product_page {
+    my ($self, $resp) = @_;
+    my $ua = $self->{ua};
+
+    my $product_url = "http://www.bestbuy.ca/en-CA/product/";
+    my $search_url = "http://www.bestbuy.ca/Search/SearchResults.aspx?";
+    # The URLs contain regex metacharacters ("." and "?") that need to be
+    # escaped before being used in regular expressions
+    $search_url = quotemeta $search_url;
+    $product_url = quotemeta $product_url;
+
+    my $uri = $resp->base;
+    if ($uri =~ /$product_url/) {
+        # We landed on the product page directly, great.
+        return ($resp);
+    }
+    elsif ($uri =~ m/$search_url/) {
+        # We landed on the search page.
+        my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+        my ($first_result, @others) = $dom->find(".listing-items .listing-item")->html_array();
+        return unless $first_result;
+
+        my $first_dom = HTML::Grabber->new( html => $first_result );
+        # Distinct names here so the outer regex string and the search
+        # response are not shadowed.
+        my $product_path = $first_dom->find(".prod-title a")->attr("href");
+        unless ($product_path) {
+            $logger->error("find_product_page(): first result has no product link");
+            return;
+        }
+
+        my $base_url = "http://www.bestbuy.ca";
+        my $product_resp = $ua->get_dom($base_url . $product_path);
+        return unless $product_resp->is_success;
+
+        return ($product_resp, @others);
+    }
+    else {
+        $logger->error("find_product_page(): unexpected search URI '$uri'");
+        return;
+    }
+}
+
+# Search Best Buy for a part, scrape the price and description from the
+# resulting product page and record them in the database.
+# Returns the scraped price, or nothing when the search or fetch fails.
+sub scrape {
+    my ($self, $manufacturer, $part_num) = @_;
+    my $ua = $self->{ua};
+    my $db = $self->{db};
+    my $start = time;
+
+    my $search = $self->create_search($manufacturer, $part_num);
+    my $resp = $ua->get_dom($search);
+    return unless ($resp->is_success);
+
+    # Searching can sometimes take you to different places
+    ($resp) = $self->find_product_page($resp);
+    return unless ($resp);
+
+    # my $part_num = $self->scrape_part_num($resp);
+    my ($price) = $self->scrape_price($resp);
+    my $desc = $self->scrape_description($resp);
+
+    # Do not record a price row when no price text was found on the page.
+    if (defined $price && length $price) {
+        $db->insert_price($manufacturer, $part_num, "Best Buy", $price, time - $start);
+        $logger->debug("scrape(): added price \$$price\n");
+    }
+    else {
+        $logger->error("scrape(): no price found for '$manufacturer $part_num'");
+    }
+    $db->insert_descr($manufacturer, $part_num, "Best Buy", $desc) if ($desc);
+
+    return $price;
+}
+
+1;
diff --git a/t/best_buy.t b/t/best_buy.t
@@ -0,0 +1,62 @@
+# Exercises PS::BestBuy through PS::UserAgent using bestbuy.ca search URLs;
+# the expected values below are tied to that site's listing for this TV.
+
+use strict;
+use warnings;
+use PS::BestBuy;
+use PS::UserAgent;
+use Log::Log4perl qw(:easy);
+use Test;
+
+BEGIN { plan tests => 13 }
+
+Log::Log4perl->easy_init($INFO);
+
+my $ua = PS::UserAgent->new();
+my $bb = PS::BestBuy->new();
+
+#
+# Search for a Samsung television I know they have
+my $search_url = $bb->create_search("Samsung", "UN55JS8500FXZC");
+my $resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# Check that the object is working
+my ($obj_resp) = $bb->find_product_page($resp);
+ok($obj_resp->base, $resp->base);
+
+# Make sure the part number we scrape is correct
+my $part_num = $bb->scrape_part_num($resp);
+ok($part_num, "UN55JS8500FXZC");
+
+# Make sure the price we scrape is at least close to correct
+my $price = $bb->scrape_price($resp);
+ok($price);
+ok($price > 2000.0);
+ok($price < 2400.0);
+
+my $descr = $bb->scrape_description($resp);
+ok($descr, "Samsung 55\" 4K Ultra HD 3D LED Tizen Smart OS TV");
+
+#
+# Search for something that returns multiple results
+$search_url = $bb->create_search("Samsung", "UN55");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+my @others;
+($obj_resp, @others) = $bb->find_product_page($resp);
+ok(scalar @others, 10);
+ok($obj_resp->is_success);
+
+#
+# Search for something non existent
+$search_url = $bb->create_search("", "some non-existent product name");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+($obj_resp) = $bb->find_product_page($resp);
+ok( !defined $obj_resp );
+
+# Check we get the no results found error
+ok( $resp->decoded_content, "/Sorry, we couldn.t find any results./");