commit b7fb366a469387a1dd8dc3c0eb9bb7c79894832b
parent 589c8eab646ebb6122a0753e64b03cbbb8892731
Author: Kyle Milz <kyle@getaddrinfo.net>
Date: Thu, 29 Jan 2015 01:08:25 -0700
product_scraper: add comments and debugging
Diffstat:
M | PriceChart.pm | | | 35 | ++++++++++++++++++++++++++--------- |
M | product_scraper.pl | | | 122 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ |
2 files changed, 112 insertions(+), 45 deletions(-)
diff --git a/PriceChart.pm b/PriceChart.pm
@@ -10,11 +10,26 @@ use Exporter;
sub get_config
{
my $parser = Config::Grammar->new({
- _vars => [
- 'user_agent',
- 'email',
- 'smtp',
- ],
+ _sections => ["general", "vendors"],
+ general => {
+ _vars => [
+ 'user_agent',
+ 'email',
+ 'smtp'
+ ],
+ },
+ vendors => {
+ _sections => ["/[A-Za-z ]+/"],
+ "/[A-Za-z ]+/" => {
+ _vars => [
+ "search_url",
+ "regular_price_tag",
+ "sale_price_tag",
+ "color",
+ "title"
+ ]
+ }
+ }
});
my $cfg_file = "/etc/pricechart.cfg";
return $parser->parse($cfg_file) or die "error: $parser->{err}\n";
@@ -22,7 +37,9 @@ sub get_config
sub get_dbh
{
- my $db_dir = "/var/www/db";
+ # XXX: needs to be changed in production!
+ # my $db_dir = "/var/www/db";
+ my $db_dir = "./";
mkdir $db_dir;
my $dbh = DBI->connect(
@@ -42,10 +59,10 @@ sub get_dom
my $resp = $ua->get($url);
if ($resp->is_success) {
- if (length($url) > 60) {
- $url = "..." . substr($url, length($url) - 60);
+ if (length($url) > 55) {
+ $url = "..." . substr($url, length($url) - 55);
}
- print "GET $url " . $resp->status_line . "\n" if ($verbose);
+ print "info: GET $url " . $resp->status_line . "\n" if ($verbose);
return HTML::Grabber->new(html => $resp->decoded_content);
}
diff --git a/product_scraper.pl b/product_scraper.pl
@@ -13,12 +13,12 @@ use PriceChart;
my %args;
-getopts("v", \%args);
+getopts("tv", \%args);
$| = 1 if ($args{v});
my $cfg = get_config();
-my $ua = get_ua($cfg);
+my $ua = get_ua($cfg->{"general"});
my $dbh = get_dbh();
srand;
@@ -36,9 +36,8 @@ $dbh->do("create table if not exists products(" .
#
my $vendor = "Memory Express";
-# use this to look up individual products
-my $product_url = "http://www.memoryexpress.com/Products/";
-my %product_map = ("televisions" => "Televisions",
+my %product_map = (
+ "televisions" => "Televisions",
"laptops" => "LaptopsNotebooks",
"hard drives" => "HardDrives");
@@ -58,88 +57,138 @@ my ($new_products, $errors);
while (my ($type, $name) = each %product_map) {
print "Enumerating $type\n";
+ # this returns a search results page, link found through trial and error
my $class_url = "http://www.memoryexpress.com/Category/" .
"$name?PageSize=40&Page=";
- # Get first page of results
+ # get first page of results
my $dom = get_dom($class_url . "1", $ua, $args{v});
next if (!defined $dom);
- # Extract the first of two pager widgets on the page
+ # extract the first of two pager widgets on the page
my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array();
next if (!defined $pager_html);
- print "Found .AJAX_List_Pager\n" if ($args{v});
+ print "info: .AJAX_List_Pager found\n" if ($args{v});
- # Find how many pages of results we have
+ # find how many pages of results we have, each page is one <li> element
my $pager = HTML::Grabber->new(html => $pager_html);
my $pages = $pager->find("li")->html_array();
next unless ($pages);
- # If more than 1 page of results are found, the pager contains a "next"
- # arrow that needs to be accounted for
+ # if more than 1 <li> is found, one <li> is always a "next" arrow
$pages-- if ($pages > 1);
- print "Found $pages pages\n" if ($args{v});
+ print "info: .AJAX_List_Pager: $pages pages\n" if ($args{v});
- # Loop over all results pages and append all products
+ # loop over results pages and append product thumbnails
my @thumbnails;
for (1..$pages) {
+ # slow this down a bit
+ sleep int(rand(5));
+
$dom = get_dom($class_url . "$_", $ua, $args{v});
next if (!defined $dom);
- # Each product is contained inside of their own PIV_Regular
+ # each product thumbnail has class=PIV_Regular
push @thumbnails, $dom->find(".PIV_Regular")->html_array();
+
+ next if ($args{t});
}
my $total = scalar @thumbnails;
- print "Found $total $type\n" if ($args{v});
+ print "info: found $total $type, scraping individually\n" if ($args{v});
- my ($new, $old, $start) = (0, 0, time);
+ # extract part number, brand, and description
+ my ($new, $old, $start, $i) = (0, 0, time, 0);
for my $thumbnail_html (@thumbnails) {
- sleep int(rand(10));
+ $i++;
+ my $hdr = "$type: $i/$total";
+
+ my $sleep = int(rand(20));
+ print "info: $hdr ($sleep s wait)\n" if ($args{v});
+ sleep $sleep;
+ # make new html grabber instance with the thumbnail html
my $thumbnail_dom = HTML::Grabber->new(html => $thumbnail_html);
- # used to visit the actual product page
+ # has to be found otherwise we can't do anything
my $product_id = get_tag_text($thumbnail_dom, ".ProductId");
- next unless (defined $product_id);
+ if (!defined $product_id) {
+ print "error: $hdr: .ProductId not found\n";
+ next;
+ }
+ else {
+ print "info: $hdr: .ProductId = $product_id\n" if ($args{v});
+ }
- # get the part number from the product page as early as possible
+ # visit the extended description page
+ my $product_url = "http://www.memoryexpress.com/Products/";
my $product_dom = get_dom("$product_url$product_id", $ua, $args{v});
+
+ # the part number is inside of id=ProductAdd always
my $part_num = get_tag_text($product_dom, "#ProductAdd");
- next unless (defined $part_num);
+ if (!defined $part_num) {
+ print "error: $hdr: #ProductAdd not found\n";
+ next;
+ }
+ # extract the part number, which is always text inside of the tag
($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/);
- next unless (defined $part_num && $part_num ne "");
+ if (!defined $part_num || $part_num eq "") {
+ print "error: $hdr: part num regex failed\n";
+ next;
+ }
+ else {
+ print "info: $hdr: part_num = $part_num\n" if ($args{v});
+ }
- my $description = get_tag_text($thumbnail_dom, ".ProductTitle");
- next unless (defined $description);
+ # extract the product title
+ my $desc = get_tag_text($thumbnail_dom, ".ProductTitle");
+ if (!defined $desc) {
+ print "error: $hdr: .ProductTitle was not found.\n";
+ next;
+ }
+ else {
+ my $tmp_desc = $desc;
+ if (length($tmp_desc) > 35) {
+ $tmp_desc = substr($tmp_desc, 0, 40) . "...";
+ }
+ print "info: $hdr: .ProductTitle = $tmp_desc\n" if ($args{v});
+ }
- # brand sometimes shows up as text
+ # extract the brand, sometimes shows up as text
my $brand = $thumbnail_dom->find(".ProductBrand")->text();
if ($brand eq "") {
+ print "info: $hdr: .ProductBrand not text\n" if ($args{v});
+ # and sometimes shows up inside the tag attributes
$brand = $thumbnail_dom->find(".ProductBrand")->html();
($brand) = ($brand =~ m/Brand: ([A-Za-z]+)/);
}
if (!defined $brand || $brand eq "") {
- $errors .= "could not find .ProductBrand, html was:\n";
- $errors .= "$thumbnail_html\n\n";
- print $errors if ($args{v});
+ print "error: $hdr: .ProductBrand not found, html:\n";
+ print "$thumbnail_html\n";
next;
}
+ else {
+ print "info: $hdr: .ProductBrand = $brand\n" if ($args{v});
+ }
+ # use existence of part_num to decide on update or insert new
my $sql = "select * from products where part_num = ?";
if ($dbh->selectrow_arrayref($sql, undef, $part_num)) {
+ # update
$update_sth->execute(time, $part_num);
- print "updated $part_num\n" if ($args{v});
+ print "info: $hdr: db updated\n" if ($args{v});
$old++;
}
else {
- $insert_sth->execute($part_num, $brand, $description,
+ # insert new
+ $insert_sth->execute($part_num, $brand, $desc,
$type, time, time, 0);
- print "inserted $part_num\n" if ($args{v});
- $new_products .= "$brand $description ($part_num)\n";
+ print "info: $hdr: db inserted\n" if ($args{v});
+ $new_products .= "$brand $desc ($part_num)\n";
$new++;
}
+ last if ($args{t});
}
$summary .= sprintf("%-11s %7s %5s %3s %6s %8s\n", $type, $new + $old,
@@ -161,17 +210,18 @@ $mail .= $errors if ($errors);
my $email = Email::Simple->create(
header => [
From => "Santa Claus <sc\@np.com>",
- To => $cfg->{email},
+ To => $cfg->{"general"}{"email"},
Subject => "PriceChart product scrape",
],
- body => $mail);
+ body => $mail
+);
if ($args{v}) {
print $email->as_string();
}
else {
my $sender = Email::Send->new({mailer => 'SMTP'});
- $sender->mailer_args([Host => $cfg->{smtp}]);
+ $sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]);
$sender->send($email->as_string()) || print "Couldn't send email\n";
}
@@ -182,7 +232,7 @@ sub get_tag_text
my $field = $dom->find($tag)->text();
if (!defined $field || $field eq "") {
- $errors .= "could not find $tag, html was:\n";
+ $errors .= "error: could not find $tag, html was:\n";
$errors .= $dom->html();
$errors .= "\n\n";
print $errors if ($args{v});