pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 4f39a6e0c37d96a253563c4cf3463f5c515c9147
parent 28ec55ef9260b1d073cc51920848f4c076e75adb
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Mon,  3 Nov 2014 21:04:15 -0700

product_scraper: add new dom text getter

Diffstat:
M product_scraper.pl | 28 +++++++++++++++++++++-------
1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/product_scraper.pl b/product_scraper.pl
@@ -87,9 +87,12 @@ for (keys %product_map) {
 	sleep int(rand(10));
 	my $product = HTML::Grabber->new(html => $node);
 
-	# title is easier to parse from general results page
-	my $title = $product->find(".ProductTitle")->text();
-	next if (not_defined($title, "title", $node));
+	# used to visit the actual product page
+	my $product_id = get_tag_text($product, ".ProductId");
+	next unless (defined $product_id);
+
+	my $title = get_tag_text($product, ".ProductTitle");
+	next unless (defined $title);
 
 	# brand is easier to parse from general results page, sometimes
 	# shows up as text
@@ -100,10 +103,6 @@ for (keys %product_map) {
 	}
 	next if (not_defined($brand, "brand", $node));
 
-	# used to visit the actual product page
-	my $product_id = $product->find(".ProductId")->text();
-	next if (not_defined($product_id, "product ID", $node));
-
 	my $product_url = "http://www.memoryexpress.com/Products/";
 	my $product_dom = get_dom("$product_url$product_id", $ua);
 
@@ -191,3 +190,18 @@ sub not_defined
 	}
 	return 0;
 }
+
+sub get_tag_text
+{
+	my $dom = shift;
+	my $tag = shift;
+
+	my $field = $dom->find($tag)->text();
+	if (!defined $field || $field eq "" ) {
+		vprint("could not find $tag, html was:\n");
+		vprint($dom->html());
+		vprint("\n");
+		return undef;
+	}
+	return $field;
+}