pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 60346754980b30e22c556d2b580674f3e29e2b5d
parent 2d4de52a337de1de9901a65ac1083b7fbce45ecc
Author: kyle <kyle@getaddrinfo.net>
Date:   Sun,  6 Mar 2016 10:40:34 -0700

memexp: put scraping smarts in real module

- move scraping stuff into real module such that it can start being used
- modify the test to test the module instead of doing all the work itself

Diffstat:
M PS/MemoryExpress.pm | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
M t/memory_express.t | 62 ++++++++++++++++++--------------------------------------------
2 files changed, 86 insertions(+), 69 deletions(-)

diff --git a/PS/MemoryExpress.pm b/PS/MemoryExpress.pm @@ -59,47 +59,90 @@ sub create_search { return $self->{url} . uri_escape($part_num); } -sub find_price { - my ($self, $srch_results) = @_; +sub scrape_part_num { + my ($self, $resp) = @_; + my $dom = HTML::Grabber->new( html => $resp->decoded_content ); + + # Product part number is inside of this div id + my $product_add = $dom->find("#ProductAdd")->text(); + my ($part_num) = ($product_add =~ m/Part #:\s*(.*)\r/); + return $part_num; +} - my @prices = $srch_results->find($self->{reg_tag})->text_array(); - if (@prices == 0) { - $logger->debug("get_price(): no prices found"); - return; - } +sub scrape_price { + my ($self, $resp) = @_; + my $dom = HTML::Grabber->new( html => $resp->decoded_content ); - my ($price, @others) = ($prices[0] =~ m/(\d[\d,]+)/); - if (! defined $price) { - $logger->warn("get_price(): found price containers but they contained no numeric price"); - return; - } - if (@others) { - $logger->warn("get_price(): price container had more than 1 price"); - return; - } + my $grand_total_tag = $dom->find(".GrandTotal")->text(); + # -> text() does not trim all whitespace + $grand_total_tag =~ s/^\s+//; + $grand_total_tag =~ s/\s+$//; + + # Try and match a dollars dot cents format with leeway for comma + # separated digits. + # This also remove the "Only" text right beside the price. + my ($price, @others) = ($grand_total_tag =~ m/(\d[\d,]+.\d\d)/); + $logger->warn("memexp: found more than 1 price") if (@others); + # Remove any commas we may have matched earlier $price =~ s/,//; - if ($price <= 0 || $price > 10000) { - $logger->warn("get_price(): price '$price' out of range"); - return; + return ($price, @others); +} + +sub scrape_description { +} + +sub find_product_page { + my ($self, $resp) = @_; + my $ua = $self->{ua}; + + my $uri = $resp->base; + if ($uri =~ /.*\/Products\/.*/) { + # We landed on the product page directly, great. 
+ return $resp; } + elsif ($uri =~ /.*\/Search\/.*/) { + # We landed on the search page. + my $dom = HTML::Grabber->new( html => $resp->decoded_content ); - return $price; + # We're only going to search the first page of results + my ($first_result, @others) = $dom->find('.PIV_Regular')->html_array(); + return unless ($first_result); + + my $thumb_dom = HTML::Grabber->new( html => $first_result ); + my $product_id = $thumb_dom->find(".ProductId")->text(); + return unless ($product_id); + + my $product_url = "http://www.memoryexpress.com/Products/" . $product_id; + + $resp = $ua->get_dom($product_url); + return unless $resp->is_success; + + return ($resp, @others); + } + else { + $logger->error("find_product_page(): unexpected search URI '$uri'"); + return; + } } -sub scrape_price { +sub scrape_all { my ($self, $manufacturer, $part_num) = @_; my $ua = $self->{ua}; my $search = $self->create_search($part_num); return unless ($search); - my $srch_results = $ua->get_dom($search); - return unless ($srch_results); + my $resp = $ua->get_dom($search); + return unless ($resp->is_success); + + # Searching can sometimes take you to different places + my $resp = $self->find_product_page($resp); + return unless ($resp); - my $price = $self->find_price($srch_results); - return unless ($price); + my $part_num = $self->scrape_part_num($resp); + my $price = $self->scrape_price($resp); my $sql = qq{insert into prices(date, manufacturer, part_num, retailer, price, duration) values (?, ?, ?, ?, ?, ?)}; diff --git a/t/memory_express.t b/t/memory_express.t @@ -4,7 +4,7 @@ use PS::MemoryExpress; use Log::Log4perl qw(:easy); use Test; -BEGIN { plan tests => 20 } +BEGIN { plan tests => 17 } Log::Log4perl->easy_init($INFO); @@ -21,30 +21,20 @@ ok($resp->is_success); my $uri = $resp->base; ok($uri =~ /.*\/Products\/.*/); -my $dom = HTML::Grabber->new( html => $resp->decoded_content ); -ok($dom); +# Check that the object is working +my $obj_resp = $me->find_product_page($resp); 
+ok($obj_resp->base, $resp->base); -# Product part number is inside of this div id -my $product_add = $dom->find("#ProductAdd")->text(); -my ($part_num) = ($product_add =~ m/Part #:\s*(.*)\r/); +# Make sure the part number we scrape is correct +my $part_num = $me->scrape_part_num($resp); ok($part_num, "ST8000AS0002"); -# We know we're on the product page -my $grand_total_tag = $dom->find(".GrandTotal")->text(); -# ->text() doesn't trim all the garbage whitespace -$grand_total_tag =~ s/^\s+//; -$grand_total_tag =~ s/\s+$//; - -# Final massaging, remove "Only" text that's right beside the price -my ($price, @others) = ($grand_total_tag =~ m/(\d[\d,]+.\d\d)/); +# Make sure the price we scrape is at least close to correct +my ($price, @others) = $me->scrape_price($resp); ok($price); ok(@others == 0); - -# Remove any commas we may have matched earlier -$price =~ s/,//; - -ok($price > 0.0); -ok($price < 10000.0); +ok($price > 200.0); +ok($price < 400.0); # Search for something I know has multiple results @@ -57,28 +47,11 @@ ok($resp->is_success); $uri = $resp->base; ok($uri =~ /.*\/Search\/.*/); -my $dom = HTML::Grabber->new( html => $resp->decoded_content ); -ok($dom); - -# There's two of these tags, one at the top of the page and one at the bottom -my ($ajax_list_pager) = $dom->find('.AJAX_List_Pager')->text_array(); -ok($ajax_list_pager); - -# Match multiple lines and replace multiple times -$ajax_list_pager =~ s/\r\n//mg; -ok($ajax_list_pager, "/1/"); - # Searching for the above product yields two results -my ($first_result, @other) = $dom->find('.PIV_Regular')->html_array(); -ok(@other, 1); - -my $thumb = HTML::Grabber->new( html => $first_result ); -my $product_id = $thumb->find(".ProductId")->text(); -ok($product_id); - -my $product_url = "http://www.memoryexpress.com/Products/" . 
$product_id; -$resp = $ua->get_dom($product_url); -ok($resp->is_success); +my ($obj_resp, @others) = $me->find_product_page($resp); +ok($obj_resp->base =~ /.*\/Products\/.*/); +ok(@others, 1); +ok($obj_resp->is_success); # Search for something that returns 0 results @@ -90,8 +63,9 @@ ok($resp->is_success); $uri = $resp->base; ok($uri =~ /.*\/Search\/.*/); -my $dom = HTML::Grabber->new( html => $resp->decoded_content ); -ok($dom); +my ($obj_resp) = $me->find_product_page($resp); +ok( !defined $obj_resp ); # Check we get the no results found error -ok($dom->text, "/We're sorry, but there are no products with the specified search parameters./"); +ok( $resp->decoded_content, + "/We're sorry, but there are no products with the specified search parameters./");