commit 60346754980b30e22c556d2b580674f3e29e2b5d
parent 2d4de52a337de1de9901a65ac1083b7fbce45ecc
Author: kyle <kyle@getaddrinfo.net>
Date: Sun, 6 Mar 2016 10:40:34 -0700
memexp: put scraping smarts in real module
- move the scraping logic into a real module so that it can start being used
- modify the test to exercise the module instead of doing all the work itself
Diffstat:
M | PS/MemoryExpress.pm | | | 93 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
M | t/memory_express.t | | | 62 | ++++++++++++++++++-------------------------------------------- |
2 files changed, 86 insertions(+), 69 deletions(-)
diff --git a/PS/MemoryExpress.pm b/PS/MemoryExpress.pm
@@ -59,47 +59,90 @@ sub create_search {
return $self->{url} . uri_escape($part_num);
}
-sub find_price {
- my ($self, $srch_results) = @_;
+sub scrape_part_num {
+ my ($self, $resp) = @_;
+ my $dom = HTML::Grabber->new( html => $resp->decoded_content );
+
+ # Product part number is inside of this div id
+ my $product_add = $dom->find("#ProductAdd")->text();
+ my ($part_num) = ($product_add =~ m/Part #:\s*(.*)\r/);
+ return $part_num;
+}
- my @prices = $srch_results->find($self->{reg_tag})->text_array();
- if (@prices == 0) {
- $logger->debug("get_price(): no prices found");
- return;
- }
+sub scrape_price {
+ my ($self, $resp) = @_;
+ my $dom = HTML::Grabber->new( html => $resp->decoded_content );
- my ($price, @others) = ($prices[0] =~ m/(\d[\d,]+)/);
- if (! defined $price) {
- $logger->warn("get_price(): found price containers but they contained no numeric price");
- return;
- }
- if (@others) {
- $logger->warn("get_price(): price container had more than 1 price");
- return;
- }
+ my $grand_total_tag = $dom->find(".GrandTotal")->text();
+ # -> text() does not trim all whitespace
+ $grand_total_tag =~ s/^\s+//;
+ $grand_total_tag =~ s/\s+$//;
+
+ # Try to match a dollars dot cents format with leeway for comma
+ # separated digits.
+ # This also removes the "Only" text right beside the price.
+ my ($price, @others) = ($grand_total_tag =~ m/(\d[\d,]+.\d\d)/);
+ $logger->warn("memexp: found more than 1 price") if (@others);
+ # Remove any commas we may have matched earlier
$price =~ s/,//;
- if ($price <= 0 || $price > 10000) {
- $logger->warn("get_price(): price '$price' out of range");
- return;
+ return ($price, @others);
+}
+
+sub scrape_description {
+}
+
+sub find_product_page {
+ my ($self, $resp) = @_;
+ my $ua = $self->{ua};
+
+ my $uri = $resp->base;
+ if ($uri =~ /.*\/Products\/.*/) {
+ # We landed on the product page directly, great.
+ return $resp;
}
+ elsif ($uri =~ /.*\/Search\/.*/) {
+ # We landed on the search page.
+ my $dom = HTML::Grabber->new( html => $resp->decoded_content );
- return $price;
+ # We're only going to search the first page of results
+ my ($first_result, @others) = $dom->find('.PIV_Regular')->html_array();
+ return unless ($first_result);
+
+ my $thumb_dom = HTML::Grabber->new( html => $first_result );
+ my $product_id = $thumb_dom->find(".ProductId")->text();
+ return unless ($product_id);
+
+ my $product_url = "http://www.memoryexpress.com/Products/" . $product_id;
+
+ $resp = $ua->get_dom($product_url);
+ return unless $resp->is_success;
+
+ return ($resp, @others);
+ }
+ else {
+ $logger->error("find_product_page(): unexpected search URI '$uri'");
+ return;
+ }
}
-sub scrape_price {
+sub scrape_all {
my ($self, $manufacturer, $part_num) = @_;
my $ua = $self->{ua};
my $search = $self->create_search($part_num);
return unless ($search);
- my $srch_results = $ua->get_dom($search);
- return unless ($srch_results);
+ my $resp = $ua->get_dom($search);
+ return unless ($resp->is_success);
+
+ # Searching can sometimes take you to different places
+ my $resp = $self->find_product_page($resp);
+ return unless ($resp);
- my $price = $self->find_price($srch_results);
- return unless ($price);
+ my $part_num = $self->scrape_part_num($resp);
+ my $price = $self->scrape_price($resp);
my $sql = qq{insert into prices(date, manufacturer, part_num, retailer,
price, duration) values (?, ?, ?, ?, ?, ?)};
diff --git a/t/memory_express.t b/t/memory_express.t
@@ -4,7 +4,7 @@ use PS::MemoryExpress;
use Log::Log4perl qw(:easy);
use Test;
-BEGIN { plan tests => 20 }
+BEGIN { plan tests => 17 }
Log::Log4perl->easy_init($INFO);
@@ -21,30 +21,20 @@ ok($resp->is_success);
my $uri = $resp->base;
ok($uri =~ /.*\/Products\/.*/);
-my $dom = HTML::Grabber->new( html => $resp->decoded_content );
-ok($dom);
+# Check that the object is working
+my $obj_resp = $me->find_product_page($resp);
+ok($obj_resp->base, $resp->base);
-# Product part number is inside of this div id
-my $product_add = $dom->find("#ProductAdd")->text();
-my ($part_num) = ($product_add =~ m/Part #:\s*(.*)\r/);
+# Make sure the part number we scrape is correct
+my $part_num = $me->scrape_part_num($resp);
ok($part_num, "ST8000AS0002");
-# We know we're on the product page
-my $grand_total_tag = $dom->find(".GrandTotal")->text();
-# ->text() doesn't trim all the garbage whitespace
-$grand_total_tag =~ s/^\s+//;
-$grand_total_tag =~ s/\s+$//;
-
-# Final massaging, remove "Only" text that's right beside the price
-my ($price, @others) = ($grand_total_tag =~ m/(\d[\d,]+.\d\d)/);
+# Make sure the price we scrape is at least close to correct
+my ($price, @others) = $me->scrape_price($resp);
ok($price);
ok(@others == 0);
-
-# Remove any commas we may have matched earlier
-$price =~ s/,//;
-
-ok($price > 0.0);
-ok($price < 10000.0);
+ok($price > 200.0);
+ok($price < 400.0);
# Search for something I know has multiple results
@@ -57,28 +47,11 @@ ok($resp->is_success);
$uri = $resp->base;
ok($uri =~ /.*\/Search\/.*/);
-my $dom = HTML::Grabber->new( html => $resp->decoded_content );
-ok($dom);
-
-# There's two of these tags, one at the top of the page and one at the bottom
-my ($ajax_list_pager) = $dom->find('.AJAX_List_Pager')->text_array();
-ok($ajax_list_pager);
-
-# Match multiple lines and replace multiple times
-$ajax_list_pager =~ s/\r\n//mg;
-ok($ajax_list_pager, "/1/");
-
# Searching for the above product yields two results
-my ($first_result, @other) = $dom->find('.PIV_Regular')->html_array();
-ok(@other, 1);
-
-my $thumb = HTML::Grabber->new( html => $first_result );
-my $product_id = $thumb->find(".ProductId")->text();
-ok($product_id);
-
-my $product_url = "http://www.memoryexpress.com/Products/" . $product_id;
-$resp = $ua->get_dom($product_url);
-ok($resp->is_success);
+my ($obj_resp, @others) = $me->find_product_page($resp);
+ok($obj_resp->base =~ /.*\/Products\/.*/);
+ok(@others, 1);
+ok($obj_resp->is_success);
# Search for something that returns 0 results
@@ -90,8 +63,9 @@ ok($resp->is_success);
$uri = $resp->base;
ok($uri =~ /.*\/Search\/.*/);
-my $dom = HTML::Grabber->new( html => $resp->decoded_content );
-ok($dom);
+my ($obj_resp) = $me->find_product_page($resp);
+ok( !defined $obj_resp );
# Check we get the no results found error
-ok($dom->text, "/We're sorry, but there are no products with the specified search parameters./");
+ok( $resp->decoded_content,
+ "/We're sorry, but there are no products with the specified search parameters./");