commit d510acae6f7e33b8331115408dca696f100d7984
parent bf59160a974495ea6052259507b6451ad7e3d315
Author: kyle <kyle@getaddrinfo.net>
Date: Sun, 6 Mar 2016 17:03:51 -0700
add best buy price scraper and test
Diffstat:
A | PS/BestBuy.pm | | | 132 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | t/best_buy.t | | | 56 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 188 insertions(+), 0 deletions(-)
diff --git a/PS/BestBuy.pm b/PS/BestBuy.pm
@@ -0,0 +1,132 @@
+package PS::BestBuy;
+use strict;
+
+use HTML::Grabber;
+use Log::Log4perl qw(:easy);
+use URI::Escape;
+
+use PS::Database;
+use PS::UserAgent;
+
+my $logger = get_logger('pricesloth.best_buy');
+
+# Constructor for the Best Buy scraper.
+#
+# The object carries:
+#   color - a hex colour string associated with this retailer
+#   url   - the search URL prefix that create_search() appends a query to
+#   ua    - a PS::UserAgent instance used for all HTTP requests
+#   db    - a PS::Database instance used to record scraped data
+#
+# Returns the blessed object.
+sub new {
+    my ($class) = @_;
+
+    my %fields = (
+        color => "#003B64",
+        url   => "http://www.bestbuy.ca/Search/SearchResults.aspx?query=",
+        ua    => PS::UserAgent->new(),
+        db    => PS::Database->new(),
+    );
+
+    my $self = bless \%fields, $class;
+    $logger->debug("new(): success");
+
+    # XXX: make sure row in retailer table is created
+
+    return $self;
+}
+
+# Build the search URL for a product.
+#
+# The query is "<manufacturer> <part_num>" (joined by a single space),
+# URI-escaped and appended to the object's search URL prefix.
+#
+# Returns the complete URL as a string.
+sub create_search {
+    my ($self, $manufacturer, $part_num) = @_;
+
+    my $query = uri_escape("$manufacturer $part_num");
+
+    return $self->{url} . $query;
+}
+
+# Extract the model/part number from a product page response.
+#
+# $resp must provide decoded_content(); its HTML is parsed and the text of
+# the model-number element is returned.
+sub scrape_part_num {
+    my ($self, $resp) = @_;
+
+    # Part number is inside this ridiculous tag. Seems to be page unique
+    # too.
+    my $selector = "#ctl00_CP_ctl00_PD_lblModelNumber";
+
+    my $page = HTML::Grabber->new( html => $resp->decoded_content );
+    my $model = $page->find($selector)->text();
+
+    return $model;
+}
+
+# Extract the product description from a product page response.
+#
+# The page title has the form "<description> (<part number>)"; only the
+# text before the parenthesised suffix is returned. When the title does not
+# have that shape the result is undef.
+sub scrape_description {
+    my ($self, $resp) = @_;
+
+    my $page = HTML::Grabber->new( html => $resp->decoded_content );
+    my $heading = $page->find("#ctl00_CP_ctl00_PD_lblProductTitle")->text();
+
+    # Part number is at the end, regex that out
+    my $descr;
+    if ($heading =~ /(.*) \(.+\)/) {
+        $descr = $1;
+    }
+
+    return $descr;
+}
+
+# Extract the price from a product page response.
+#
+# $resp must provide decoded_content(). The text of the price element is
+# trimmed of surrounding whitespace, the leading dollar sign is dropped and
+# all thousands separators are removed, e.g. "$1,234,567.99" becomes
+# "1234567.99".
+#
+# Returns the cleaned-up price string.
+sub scrape_price {
+    my ($self, $resp) = @_;
+    my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+    my $price = $dom->find(".price-wrapper .prodprice")->text();
+
+    # Trim leading and trailing whitespace
+    $price =~ s/^\s+//;
+    $price =~ s/\s+$//;
+
+    # Remove dollar sign
+    $price =~ s/^\$//;
+
+    # Remove every comma; without /g only the first one would be stripped
+    # and prices of a million or more would keep a stray separator.
+    $price =~ s/,//g;
+
+    return $price;
+}
+
+# Resolve a search response to the product page it refers to.
+#
+# $resp is the response of a search request. Depending on where the search
+# ended up (judged by $resp->base):
+#
+#   * product page - the search went straight to a product; returns ($resp).
+#   * search page  - the first listing's link is fetched; returns
+#                    ($product_resp, @others) where @others holds the HTML
+#                    of the remaining listings.
+#   * anything else - an error is logged.
+#
+# Returns an empty list when there are no listings, the first listing has
+# no link, the product page request fails, or the URI is unexpected.
+sub find_product_page {
+    my ($self, $resp) = @_;
+    my $ua = $self->{ua};
+
+    my $base_url = "http://www.bestbuy.ca";
+    my $product_url = "http://www.bestbuy.ca/en-CA/product/";
+    my $search_url = "http://www.bestbuy.ca/Search/SearchResults.aspx?";
+
+    # The URLs contain regex metacharacters ("." and "?"), so they are
+    # matched literally via \Q...\E.
+    my $uri = $resp->base;
+    if ($uri =~ /\Q$product_url\E/) {
+        # We landed on the product page directly, great.
+        return ($resp);
+    }
+    elsif ($uri =~ /\Q$search_url\E/) {
+        # We landed on the search page.
+        my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+        my ($first_result, @others) = $dom->find(".listing-items .listing-item")->html_array();
+        return unless $first_result;
+
+        my $first_dom = HTML::Grabber->new( html => $first_result );
+        my $href = $first_dom->find(".prod-title a")->attr("href");
+
+        # Without a link we would end up fetching the site root and
+        # treating it as a product page, so bail out instead.
+        unless (defined $href && length $href) {
+            $logger->error("find_product_page(): first search result has no product link");
+            return;
+        }
+
+        # Listing links are normally site-relative; leave absolute ones alone.
+        my $target = ($href =~ m{^https?://}i) ? $href : $base_url . $href;
+
+        my $product_resp = $ua->get_dom($target);
+        return unless $product_resp->is_success;
+
+        return ($product_resp, @others);
+    }
+    else {
+        $logger->error("find_product_page(): unexpected search URI '$uri'");
+        return;
+    }
+}
+
+# Look up a product on Best Buy and record its price and description.
+#
+# Searches for "$manufacturer $part_num", resolves the result to a product
+# page, scrapes price and description from it and stores both through the
+# database handle. The price row also receives the number of seconds the
+# whole lookup took. The description is only stored when one was found.
+#
+# Returns the scraped price, or nothing when the search request fails or no
+# product page could be found.
+sub scrape {
+    my ($self, $manufacturer, $part_num) = @_;
+    my $ua = $self->{ua};
+    my $db = $self->{db};
+    my $start = time;
+
+    # Use one retailer name for every row so that price and description
+    # are always filed under the same retailer.
+    my $retailer = "Best Buy";
+
+    my $search = $self->create_search($manufacturer, $part_num);
+    my $resp = $ua->get_dom($search);
+    return unless ($resp->is_success);
+
+    # Searching can sometimes take you to different places
+    ($resp) = $self->find_product_page($resp);
+    return unless ($resp);
+
+    # The part number on the page is currently not cross-checked:
+    # my $part_num = $self->scrape_part_num($resp);
+    my ($price) = $self->scrape_price($resp);
+    my $desc = $self->scrape_description($resp);
+
+    $db->insert_price($manufacturer, $part_num, $retailer, $price, time - $start);
+    $db->insert_descr($manufacturer, $part_num, $retailer, $desc) if ($desc);
+
+    $logger->debug("scrape(): added price \$$price\n");
+    return $price;
+}
diff --git a/t/best_buy.t b/t/best_buy.t
@@ -0,0 +1,56 @@
+use strict;
+use PS::BestBuy;
+use Log::Log4perl qw(:easy);
+use Test;
+
+# Live test of the Best Buy scraper: every scenario below performs real
+# HTTP requests against bestbuy.ca, so results depend on the site's current
+# catalogue and markup.
+BEGIN { plan tests => 13 }
+
+Log::Log4perl->easy_init($INFO);
+
+my $ua = PS::UserAgent->new();
+my $bb = PS::BestBuy->new();
+
+# These are declared once and reused by every scenario; redeclaring them
+# with "my" in the same scope would mask the earlier declarations.
+my ($search_url, $resp, $obj_resp, @others);
+
+#
+# Search for a Samsung television I know they have
+$search_url = $bb->create_search("Samsung", "UN55JS8500FXZC");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# An exact part number is expected to land directly on the product page,
+# in which case the response is handed back unchanged
+($obj_resp) = $bb->find_product_page($resp);
+ok($obj_resp->base, $resp->base);
+
+# Make sure the part number we scrape is correct
+my $part_num = $bb->scrape_part_num($resp);
+ok($part_num, "UN55JS8500FXZC");
+
+# Make sure the price we scrape is at least close to correct
+my $price = $bb->scrape_price($resp);
+ok($price);
+ok($price > 2000.0);
+ok($price < 2400.0);
+
+my $descr = $bb->scrape_description($resp);
+ok($descr, "Samsung 55\" 4K Ultra HD 3D LED Tizen Smart OS TV");
+
+#
+# Search for something that returns multiple results
+$search_url = $bb->create_search("Samsung", "UN55");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# ok() evaluates @others in scalar context, i.e. compares the number of
+# remaining listings
+($obj_resp, @others) = $bb->find_product_page($resp);
+ok(@others, 10);
+ok($obj_resp->is_success);
+
+#
+# Search for something non existent
+$search_url = $bb->create_search("", "some non-existent product name");
+$resp = $ua->get_dom($search_url);
+ok($resp->is_success);
+
+# An empty result list leaves $obj_resp undefined
+($obj_resp) = $bb->find_product_page($resp);
+ok( !defined $obj_resp );
+
+# Check we get the no results found error
+ok( $resp->decoded_content, "/Sorry, we couldn.t find any results./");