commit 82c9894c54890e944601ec056f880dd7a6ecc1ce
parent e5c8e104dad45a3decb96243fa7abae51abc312c
Author: Kyle Milz <kyle@getaddrinfo.net>
Date: Sat, 28 Feb 2015 17:20:10 -0700
product_scraper: change the code flow a bit
Move the email code into a function, move the product class scrape code into a
function, don't keep track of new products.
Diffstat:
M | product_scraper.pl | | | 129 | +++++++++++++++++++++++++++++++++++++++++++++---------------------------------- |
1 file changed, 73 insertions(+), 56 deletions(-)
diff --git a/product_scraper.pl b/product_scraper.pl
@@ -4,6 +4,7 @@ use strict;
use warnings;
use Config::Grammar;
+use DBI;
use Email::Simple;
use Email::Send;
use Getopt::Std;
@@ -18,8 +19,9 @@ getopts("tv", \%args);
$| = 1 if ($args{v});
my $cfg = get_config();
-my $ua = get_ua($cfg->{"general"});
+my $ua = new_ua($cfg->{"general"}, $args{v});
my $dbh = get_dbh($cfg->{"general"});
+# my $log = get_log("products.txt", $args{v});
srand;
$dbh->do("create table if not exists products(" .
@@ -33,16 +35,6 @@ $dbh->do("create table if not exists products(" .
# $dbh->do("create table if not exists scrapes");
-#
-# Memory Express
-#
-
-my $vendor = "Memory Express";
-my %product_map = (
- "televisions" => "Televisions",
- "laptops" => "LaptopsNotebooks",
- "hard drives" => "HardDrives");
-
my $sql = "insert into products(part_num, manufacturer, description, type, " .
"first_seen, last_seen, last_scraped) values (?, ?, ?, ?, ?, ?, ?)";
my $insert_sth = $dbh->prepare($sql);
@@ -51,13 +43,37 @@ my $insert_sth = $dbh->prepare($sql);
$sql = "update products set last_seen = ? where part_num = ?";
my $update_sth = $dbh->prepare($sql);
-my $summary .= "type ok percent errors new duration\n";
-$summary .= "--------------- ------- ------- ------ --- --------\n";
+#
+# Memory Express
+#
+my $mail = "Memory Express\n==============\n\n";
+$mail .= "type ok percent errors new duration\n";
+$mail .= "--------------- ------- ------- ------ --- --------\n";
-my $new_products;
+my %product_map = (
+ "televisions" => "Televisions",
+ "laptops" => "LaptopsNotebooks",
+ "hard drives" => "HardDrives"
+);
while (my ($type, $name) = each %product_map) {
+ mem_exp_scrape_class($type, $name);
+}
+
+$dbh->disconnect();
+send_email($mail, $args{v});
+
+#
+# scrape an entire class of products, inserting or updating the db as needed.
+# general flow: collect the thumbnails from each page of the unfiltered search
+# results, then for each thumbnail get the part number, brand, and description.
+#
+sub mem_exp_scrape_class
+{
+ my $type = shift;
+ my $name = shift;
+
my $info_hdr = "info: $type";
- print "$info_hdr\n";
+ print "$info_hdr\n" if ($args{v});
# this returns a search results page, link found through trial and error
my $class_url = "http://www.memoryexpress.com/Category/" .
@@ -65,19 +81,19 @@ while (my ($type, $name) = each %product_map) {
# get first page of results
my $dom = get_dom($class_url . "1", $ua, $args{v});
- next if (!defined $dom);
+ return undef if (!defined $dom);
my $pager_hdr = "$info_hdr: .AJAX_List_Pager";
# extract the first of two pager widgets on the page
my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array();
- next if (!defined $pager_html);
+ return undef if (!defined $pager_html);
print "$pager_hdr found\n" if ($args{v});
# find how many pages of results we have, each page is one <li> element
my $pager = HTML::Grabber->new(html => $pager_html);
my $pages = $pager->find("li")->html_array();
- next unless ($pages);
+ return undef unless ($pages);
# if more than 1 <li> is found, one <li> is always a "next" arrow
$pages-- if ($pages > 1);
@@ -86,9 +102,11 @@ while (my ($type, $name) = each %product_map) {
# loop over results pages and append product thumbnails
my @thumbnails;
for (1..$pages) {
+ my $page_hdr = "$pager_hdr: $_/$pages";
+
# slow this down a bit
my $sleep = int(rand(5));
- print "$pager_hdr: $_/$pages: $sleep s wait\n" if ($args{v});
+ printf "$page_hdr: (%is wait)\n", $sleep if ($args{v});
sleep $sleep unless ($args{t});
$dom = get_dom($class_url . "$_", $ua, $args{v});
@@ -96,7 +114,8 @@ while (my ($type, $name) = each %product_map) {
# each product thumbnail has class=PIV_Regular
my @temp_thumbs = $dom->find(".PIV_Regular")->html_array();
- printf "$pager_hdr: $_/$pages: %i thumbs found\n", scalar @temp_thumbs if ($args{v});
+ my $num_thumbs = scalar @temp_thumbs;
+ print "$page_hdr: $num_thumbs thumbs found\n" if ($args{v});
push @thumbnails, @temp_thumbs;
last if ($args{t});
@@ -117,8 +136,8 @@ while (my ($type, $name) = each %product_map) {
sleep $sleep unless ($args{t});
# attempt to extract information from thumbnail_html
- my ($brand, $part_num, $desc, $tmp_desc) =
- scrape_thumbnail("$type: $i/$total", $thumbnail_html);
+ my ($brand, $part_num, $desc) =
+ mem_exp_scrape_thumbnail("$type: $i/$total", $thumbnail_html);
if (!defined $brand) {
$err++;
next;
@@ -129,14 +148,13 @@ while (my ($type, $name) = each %product_map) {
if ($dbh->selectrow_arrayref($sql, undef, $part_num)) {
# also check description and manufacturer are consistent?
$update_sth->execute(time, $part_num);
- print "$thumb_hdr: db updated\n" if ($args{v});
+ print "$thumb_hdr: updated db\n" if ($args{v});
$old++;
}
else {
$insert_sth->execute($part_num, $brand, $desc, $type,
time, time, 0);
- print "$thumb_hdr db inserted\n" if ($args{v});
- $new_products .= "$brand $part_num: $tmp_desc\n";
+ print "$thumb_hdr: inserted into db\n" if ($args{v});
$new++;
}
@@ -144,43 +162,15 @@ while (my ($type, $name) = each %product_map) {
}
my $ok = $new + $old;
- $summary .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type,
+ $mail .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type,
"$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start);
}
-$dbh->disconnect();
-
-my $mail = "$vendor\n";
-$mail .= "=" for (1..length $vendor);
-$mail .= "\n\n";
-
-$mail .= "$summary\n" if ($summary);
-$mail .= "$new_products\n" if ($new_products);
-
-my $email = Email::Simple->create(
- header => [
- From => "Santa Claus <sc\@np.com>",
- To => $cfg->{"general"}{"email"},
- Subject => "PriceChart product scrape",
- ],
- body => $mail
-);
-
-if ($args{v}) {
- print $email->as_string();
- exit 0;
-}
-
-my $sender = Email::Send->new({mailer => "SMTP"});
-$sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]);
-$sender->send($email->as_string()) || print "Couldn't send email\n";
-
-
#
# this checks the input html for 3 things, part num, manufacturer, and
# description. if any of these aren't found, fail.
#
-sub scrape_thumbnail
+sub mem_exp_scrape_thumbnail
{
my $thumb_hdr = shift;
my $html = shift;
@@ -234,7 +224,7 @@ sub scrape_thumbnail
print "$info_hdr: $brand $part_num\n" if ($args{v});
print "$info_hdr: $tmp_desc\n" if ($args{v});
- return ($brand, $part_num, $desc, $tmp_desc);
+ return ($brand, $part_num, $desc);
}
#
@@ -255,3 +245,30 @@ sub get_tag_text
return $field;
}
+
+#
+# send an email with the scrape summary; if verbose, just print it instead
+#
+sub send_email
+{
+ my $mail = shift;
+ my $verbose = shift || 0;
+
+ if ($verbose) {
+ print $mail;
+ return;
+ }
+
+ my $email = Email::Simple->create(
+ header => [
+ From => "Santa Claus <sc\@np.com>",
+ To => $cfg->{"general"}{"email"},
+ Subject => "pricechart product scrape",
+ ],
+ body => $mail
+ );
+
+ my $sender = Email::Send->new({mailer => "SMTP"});
+ $sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]);
+ $sender->send($email->as_string()) || print "Couldn't send email\n";
+}