commit 6012f05c4d5c900f0d9c84c35c71da1a6dc23b13
parent 7c66858a5de2f8f277a5c66aa5321ddce0210f3c
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Thu, 12 Mar 2015 01:27:44 -0600
product_scraper: lift db insert/delete into its own function
Add sleep_rand helper, and fix annoying SIGBUS error by setting the statement
handles to undefined before disconnecting from the db.
Diffstat:
| M | product_scraper.pl |  |  | 121 | ++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- | 
1 file changed, 70 insertions(+), 51 deletions(-)
diff --git a/product_scraper.pl b/product_scraper.pl
@@ -4,7 +4,6 @@ use strict;
 use warnings;
 
 use Config::Grammar;
-use DBI;
 use Email::Simple;
 use Email::Send;
 use Getopt::Std;
@@ -60,6 +59,8 @@ while (my ($type, $name) = each %product_map) {
 	mem_exp_scrape_class($type, $name);
 }
 
+$update_sth = undef;
+$insert_sth = undef;
 $dbh->disconnect();
 send_email($mail, $args{v});
 
@@ -72,9 +73,59 @@ sub mem_exp_scrape_class
 {
 	my $type = shift;
 	my $name = shift;
-
 	my $info_hdr = "info: $type";
-	print "$info_hdr\n" if ($args{v});
+
+	my $thumbnails = mem_exp_get_thumbnails($name, $info_hdr);
+	return undef unless defined $thumbnails;
+
+	my $total = scalar @$thumbnails;
+	print "$info_hdr: $total total\n" if ($args{v});
+
+	# extract and store part number, brand, and description
+	my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0);
+	for my $thumbnail_html (@$thumbnails) {
+		$i++;
+		my $thumb_hdr = "$info_hdr: $i/$total";
+
+		# look less suspicious
+		sleep_rand($thumb_hdr, 20);
+
+		# attempt to extract information from thumbnail html
+		my ($brand, $part_num, $desc) =
+			mem_exp_scrape_thumbnail("$type: $i/$total", $thumbnail_html);
+		unless (defined $brand && defined $part_num && defined $desc) {
+			$err++;
+			next;
+		}
+
+		# extraction looks good, insert or update the database
+		$sql = "select * from products where part_num = ?";
+		if ($dbh->selectrow_arrayref($sql, undef, $part_num)) {
+			# also check description and manufacturer are consistent?
+			$update_sth->execute(time, $part_num) or die $dbh->errstr();
+			print "$thumb_hdr: updated db\n" if ($args{v});
+			$old++;
+		}
+		else {
+			$insert_sth->execute($part_num, $brand, $desc, $type,
+				time, time, 0) or die $dbh->errstr();
+			print "$thumb_hdr: inserted into db\n" if ($args{v});
+			$new++;
+		}
+	}
+
+	my $ok = $new + $old;
+	$mail .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type,
+		"$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start);
+}
+
+#
+# get all thumbnails from generic unfiltered search page
+#
+sub mem_exp_get_thumbnails
+{
+	my $name = shift;
+	my $info_hdr = shift;
 
 	# this returns a search results page, link found through trial and error
 	my $class_url = "http://www.memoryexpress.com/Category/" .
@@ -104,17 +155,17 @@ sub mem_exp_scrape_class
 	my @thumbnails;
 	for (1..$pages) {
 		my $page_hdr = "$pager_hdr: $_/$pages";
+		sleep_rand($page_hdr, 5);
 
-		# slow this down a bit
-		my $sleep = int(rand(5));
-		printf "$page_hdr: (%is wait)\n", $sleep if ($args{v});
-		sleep $sleep unless ($args{t});
-
+		# get a search pages dom
 		$dom = get_dom($class_url . "$_", $ua, $args{v});
 		next if (!defined $dom);
 
 		# each product thumbnail has class=PIV_Regular
 		my @temp_thumbs = $dom->find(".PIV_Regular")->html_array();
+		if ($args{t}) {
+			@temp_thumbs = ($temp_thumbs[0]);
+		}
 		my $num_thumbs = scalar @temp_thumbs;
 		print "$page_hdr: $num_thumbs thumbs found\n" if ($args{v});
 		push @thumbnails, @temp_thumbs;
@@ -122,49 +173,7 @@ sub mem_exp_scrape_class
 		last if ($args{t});
 	}
 
-	my $total = scalar @thumbnails;
-	print "$info_hdr: $total total\n" if ($args{v});
-
-	# extract and store part number, brand, and description
-	my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0);
-	for my $thumbnail_html (@thumbnails) {
-		$i++;
-		my $thumb_hdr = "$info_hdr: $i/$total";
-
-		# look less suspicious
-		my $sleep = int(rand(20));
-		printf "$thumb_hdr (%ss wait)\n", $sleep if ($args{v});
-		sleep $sleep unless ($args{t});
-
-		# attempt to extract information from thumbnail_html
-		my ($brand, $part_num, $desc) =
-			mem_exp_scrape_thumbnail("$type: $i/$total", $thumbnail_html);
-		if (!defined $brand) {
-			$err++;
-			next;
-		}
-
-		# extraction looks good, insert or update the database
-		my $sql = "select * from products where part_num = ?";
-		if ($dbh->selectrow_arrayref($sql, undef, $part_num)) {
-			# also check description and manufacturer are consistent?
-			$update_sth->execute(time, $part_num);
-			print "$thumb_hdr: updated db\n" if ($args{v});
-			$old++;
-		}
-		else {
-			$insert_sth->execute($part_num, $brand, $desc, $type,
-				time, time, 0);
-			print "$thumb_hdr:  inserted into db\n" if ($args{v});
-			$new++;
-		}
-
-		last if ($args{t});
-	}
-
-	my $ok = $new + $old;
-	$mail .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type,
-		"$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start);
+	return \@thumbnails;
 }
 
 #
@@ -270,3 +279,13 @@ sub send_email
 	$sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]);
 	$sender->send($email->as_string()) || print "Couldn't send email\n";
 }
+
+sub sleep_rand
+{
+	# sleep a random whole number of seconds below $upper_limit; log if $args{v}
+	my $header = shift;		# passed to printf as %s, so a '%' in it is inert
+	my $upper_limit = shift || 0;	# exclusive bound in seconds, 0 if omitted
+	my $sleep = int(rand($upper_limit));
+	printf "%s: (%ss wait)\n", $header, $sleep if ($args{v});
+	sleep $sleep unless ($args{t});
+}