commit b7fb366a469387a1dd8dc3c0eb9bb7c79894832b
parent 589c8eab646ebb6122a0753e64b03cbbb8892731
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Thu, 29 Jan 2015 01:08:25 -0700
product_scraper: add comments and debugging
Diffstat:
| M | PriceChart.pm |  |  | 35 | ++++++++++++++++++++++++++--------- | 
| M | product_scraper.pl |  |  | 122 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ | 
2 files changed, 112 insertions(+), 45 deletions(-)
diff --git a/PriceChart.pm b/PriceChart.pm
@@ -10,11 +10,26 @@ use Exporter;
 sub get_config
 {
 	my $parser = Config::Grammar->new({
-		_vars => [
-			'user_agent',
-			'email',
-			'smtp',
-		],
+		_sections => ["general", "vendors"],
+		general => {
+			_vars => [
+				'user_agent',
+				'email',
+				'smtp'
+			],
+		},
+		vendors => {
+			_sections => ["/[A-Za-z ]+/"],
+			"/[A-Za-z ]+/" => {
+				_vars => [
+					"search_url",
+					"regular_price_tag",
+					"sale_price_tag",
+					"color",
+					"title"
+				]
+			}
+		}
 	});
 	my $cfg_file = "/etc/pricechart.cfg";
 	return $parser->parse($cfg_file) or die "error: $parser->{err}\n";
@@ -22,7 +37,9 @@ sub get_config
 
 sub get_dbh
 {
-	my $db_dir = "/var/www/db";
+	# XXX: needs to be changed in production!
+	# my $db_dir = "/var/www/db";
+	my $db_dir = "./";
 	mkdir $db_dir;
 
 	my $dbh = DBI->connect(
@@ -42,10 +59,10 @@ sub get_dom
 
 	my $resp = $ua->get($url);
 	if ($resp->is_success) {
-		if (length($url) > 60) {
-			$url = "..." . substr($url, length($url) - 60);
+		if (length($url) > 55) {
+			$url = "..." . substr($url, length($url) - 55);
 		}
-		print "GET $url " . $resp->status_line . "\n" if ($verbose);
+		print "info: GET $url " . $resp->status_line . "\n" if ($verbose);
 		return HTML::Grabber->new(html => $resp->decoded_content);
 	}
 
diff --git a/product_scraper.pl b/product_scraper.pl
@@ -13,12 +13,12 @@ use PriceChart;
 
 
 my %args;
-getopts("v", \%args);
+getopts("tv", \%args);
 
 $| = 1 if ($args{v});
 
 my $cfg = get_config();
-my $ua  = get_ua($cfg);
+my $ua  = get_ua($cfg->{"general"});
 my $dbh = get_dbh();
 srand;
 
@@ -36,9 +36,8 @@ $dbh->do("create table if not exists products(" .
 #
 
 my $vendor = "Memory Express";
-# use this to look up individual products
-my $product_url = "http://www.memoryexpress.com/Products/";
-my %product_map = ("televisions" => "Televisions",
+my %product_map = (
+	"televisions" => "Televisions",
 	"laptops" => "LaptopsNotebooks",
 	"hard drives" => "HardDrives");
 
@@ -58,88 +57,138 @@ my ($new_products, $errors);
 while (my ($type, $name) = each %product_map) {
 	print "Enumerating $type\n";
 
+	# this returns a search results page, link found through trial and error
 	my $class_url = "http://www.memoryexpress.com/Category/" .
 		"$name?PageSize=40&Page=";
 
-	# Get first page of results
+	# get first page of results
 	my $dom = get_dom($class_url . "1", $ua, $args{v});
 	next if (!defined $dom);
 
-	# Extract the first of two pager widgets on the page
+	# extract the first of two pager widgets on the page
 	my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array();
 	next if (!defined $pager_html);
-	print "Found .AJAX_List_Pager\n" if ($args{v});
+	print "info: .AJAX_List_Pager found\n" if ($args{v});
 
-	# Find how many pages of results we have
+	# find how many pages of results we have, each page is one <li> element
 	my $pager = HTML::Grabber->new(html => $pager_html);
 	my $pages = $pager->find("li")->html_array();
 	next unless ($pages);
 
-	# If more than 1 page of results are found, the pager contains a "next"
-	# arrow that needs to be accounted for
+	# if more than 1 <li> is found, one <li> is always a "next" arrow
 	$pages-- if ($pages > 1);
-	print "Found $pages pages\n" if ($args{v});
+	print "info: .AJAX_List_Pager: $pages pages\n" if ($args{v});
 
-	# Loop over all results pages and append all products
+	# loop over results pages and append product thumbnails
 	my @thumbnails;
 	for (1..$pages) {
+		# slow this down a bit
+		sleep int(rand(5));
+
 		$dom = get_dom($class_url . "$_", $ua, $args{v});
 		next if (!defined $dom);
 
-		# Each product is contained inside of their own PIV_Regular
+		# each product thumbnail has class=PIV_Regular
 		push @thumbnails, $dom->find(".PIV_Regular")->html_array();
+
+		next if ($args{t});
 	}
 
 	my $total = scalar @thumbnails;
-	print "Found $total $type\n" if ($args{v});
+	print "info: found $total $type, scraping individually\n" if ($args{v});
 
-	my ($new, $old, $start) = (0, 0, time);
+	# extract part number, brand, and description
+	my ($new, $old, $start, $i) = (0, 0, time, 0);
 	for my $thumbnail_html (@thumbnails) {
-		sleep int(rand(10));
+		$i++;
+		my $hdr = "$type: $i/$total";
+
+		my $sleep = int(rand(20));
+		print "info: $hdr ($sleep s wait)\n" if ($args{v});
+		sleep $sleep;
 
+		# make new html grabber instance with the thumbnail html
 		my $thumbnail_dom = HTML::Grabber->new(html => $thumbnail_html);
 
-		# used to visit the actual product page
+		# has to be found otherwise we can't do anything
 		my $product_id = get_tag_text($thumbnail_dom, ".ProductId");
-		next unless (defined $product_id);
+		if (!defined $product_id) {
+			print "error: $hdr: .ProductId not found\n";
+			next;
+		}
+		else {
+			print "info: $hdr: .ProductId = $product_id\n" if ($args{v});
+		}
 
-		# get the part number from the product page as early as possible
+		# visit the extended description page
+		my $product_url = "http://www.memoryexpress.com/Products/";
 		my $product_dom = get_dom("$product_url$product_id", $ua, $args{v});
+
+		# the part number is inside of id=ProductAdd always
 		my $part_num = get_tag_text($product_dom, "#ProductAdd");
-		next unless (defined $part_num);
+		if (!defined $part_num) {
+			print "error: $hdr: #ProductAdd not found\n";
+			next;
+		}
 
+		# extract the part number, which is always text inside of the tag
 		($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/);
-		next unless (defined $part_num && $part_num ne "");
+		if (!defined $part_num || $part_num eq "") {
+			print "error: $hdr: part num regex failed\n";
+			next;
+		}
+		else {
+			print "info: $hdr: part_num = $part_num\n" if ($args{v});
+		}
 
-		my $description = get_tag_text($thumbnail_dom, ".ProductTitle");
-		next unless (defined $description);
+		# extract the product title
+		my $desc = get_tag_text($thumbnail_dom, ".ProductTitle");
+		if (!defined $desc) {
+			print "error: $hdr: .ProductTitle was not found.\n";
+			next;
+		}
+		else {
+			my $tmp_desc = $desc;
+			if (length($tmp_desc) > 35) {
+				$tmp_desc = substr($tmp_desc, 0, 40) . "...";
+			}
+			print "info: $hdr: .ProductTitle = $tmp_desc\n" if ($args{v});
+		}
 
-		# brand sometimes shows up as text
+		# extract the brand, sometimes shows up as text
 		my $brand = $thumbnail_dom->find(".ProductBrand")->text();
 		if ($brand eq "") {
+			print "info: $hdr: .ProductBrand not text\n" if ($args{v});
+			# and sometimes shows up inside the tag attributes
 			$brand = $thumbnail_dom->find(".ProductBrand")->html();
 			($brand) = ($brand =~ m/Brand: ([A-Za-z]+)/);
 		}
 		if (!defined $brand || $brand eq "") {
-			$errors .= "could not find .ProductBrand, html was:\n";
-			$errors .= "$thumbnail_html\n\n";
-			print $errors if ($args{v});
+			print "error: $hdr: .ProductBrand not found, html:\n";
+			print "$thumbnail_html\n";
 			next;
 		}
+		else {
+			print "info: $hdr: .ProductBrand = $brand\n" if ($args{v});
+		}
 
+		# use existence of part_num to decide on update or insert new
 		my $sql = "select * from products where part_num = ?";
 		if ($dbh->selectrow_arrayref($sql, undef, $part_num)) {
+			# update
 			$update_sth->execute(time, $part_num);
-			print "updated $part_num\n" if ($args{v});
+			print "info: $hdr: db updated\n" if ($args{v});
 			$old++;
 		}
 		else {
-			$insert_sth->execute($part_num, $brand, $description,
+			# insert new
+			$insert_sth->execute($part_num, $brand, $desc,
 				$type, time, time, 0);
-			print "inserted $part_num\n" if ($args{v});
-			$new_products .= "$brand $description ($part_num)\n";
+			print "info: $hdr: db inserted\n" if ($args{v});
+			$new_products .= "$brand $desc ($part_num)\n";
 			$new++;
 		}
+		last if ($args{t});
 	}
 
 	$summary .= sprintf("%-11s %7s %5s %3s %6s %8s\n", $type, $new + $old,
@@ -161,17 +210,18 @@ $mail .= $errors           if ($errors);
 my $email = Email::Simple->create(
 	header => [
 		From	=> "Santa Claus <sc\@np.com>",
-		To	=> $cfg->{email},
+		To	=> $cfg->{"general"}{"email"},
 		Subject	=> "PriceChart product scrape",
 	],
-	body => $mail);
+	body => $mail
+);
 
 if ($args{v}) {
 	print $email->as_string();
 }
 else {
 	my $sender = Email::Send->new({mailer => 'SMTP'});
-	$sender->mailer_args([Host => $cfg->{smtp}]);
+	$sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]);
 	$sender->send($email->as_string()) || print "Couldn't send email\n";
 }
 
@@ -182,7 +232,7 @@ sub get_tag_text
 
 	my $field = $dom->find($tag)->text();
 	if (!defined $field || $field eq "") {
-		$errors .= "could not find $tag, html was:\n";
+		$errors .= "error: could not find $tag, html was:\n";
 		$errors .= $dom->html();
 		$errors .= "\n\n";
 		print $errors if ($args{v});