commit b142f5f806fa976b8852d3ba23b8a042dbca9771
parent 459ca65b91eb3fcad6aeabe0c0c3c8199eccfb07
Author: Kyle Milz <kyle@getaddrinfo.net>
Date: Sun, 4 Jan 2015 02:49:17 -0700
product_scraper: document scraping technique
Clean up and document the scraping technique for Memory Express. While here,
change the shared module name so it no longer uses a subdirectory.
Diffstat:
7 files changed, 117 insertions(+), 112 deletions(-)
diff --git a/PriceChart.pm b/PriceChart.pm
@@ -0,0 +1,76 @@
+package PriceChart; # shared helpers for the pricechart scrapers and generators
+
+use DBI;
+use Exporter;
+use POSIX; # strftime, used by get_log
+
+@ISA = ("Exporter"); # NOTE(review): no "use strict" here; @ISA/@EXPORT are package globals
+@EXPORT = qw(get_config get_dom get_ua get_log get_dbh);
+
+
+sub get_config
+{ # Parse /etc/pricechart.cfg; returns the config hash or dies with the parser error.
+ my $parser = Config::Grammar->new({
+ _vars => [
+ 'user_agent',
+ 'email',
+ 'smtp',
+ ],
+ });
+ my $cfg_file = "/etc/pricechart.cfg";
+ return $parser->parse($cfg_file) || die "error: $parser->{err}\n"; # high-precedence ||: with "or", the die was unreachable after return
+}
+
+sub get_dbh
+{ # Open (creating the directory if needed) the shared SQLite handle; dies on connect failure.
+ my $db_dir = "/var/www/db"; # SQLite database lives under the web root
+ mkdir $db_dir; # best-effort; fails harmlessly when the directory already exists
+
+ my $dbh = DBI->connect(
+ "dbi:SQLite:dbname=$db_dir/pricechart.db",
+ "", # no username for SQLite
+ "", # no password for SQLite
+ { RaiseError => 1 } # make DBI die on SQL errors instead of returning undef
+ ) or die $DBI::errstr;
+ return $dbh;
+}
+
+sub get_dom
+{ # Fetch $url with $ua and return an HTML::Grabber DOM, or undef on HTTP failure.
+ my $url = shift; # URL to fetch
+ my $ua = shift; # LWP::UserAgent, e.g. from get_ua
+ my $verbose = shift; # truthy => also log successful GETs
+
+ my $resp = $ua->get($url);
+ if ($resp->is_success) {
+ print "GET $url " . $resp->status_line . "\n" if ($verbose);
+ return HTML::Grabber->new(html => $resp->decoded_content);
+ }
+
+ print "GET $url " . $resp->status_line . "\n"; # failures are always logged
+ return undef;
+}
+
+sub get_ua
+{ # Build an LWP::UserAgent using the configured user_agent string.
+ my $cfg = shift; # config hash from get_config; reads $cfg->{user_agent}
+
+ my $ua = LWP::UserAgent->new(agent => $cfg->{user_agent});
+ $ua->default_header("Accept" => "*/*"); # accept any content type
+
+ return $ua;
+}
+
+sub get_log
+{ # Open (append) the named log file, stamp it with the current time, return the handle.
+ my $file = shift; # basename; log is written to /var/www/logs/pricechart/<file>.log
+ my $log_dir = "/var/www/logs/pricechart";
+
+ mkdir $log_dir; # best-effort; fails harmlessly when the directory already exists
+ open my $log, ">>", "$log_dir/$file.log" or die "$!"; # low-precedence "or": with "||" the check bound to the filename string and open errors were silently ignored
+
+ print $log strftime "%b %e %Y %H:%M ", localtime;
+ return $log;
+}
+
+1;
diff --git a/Shared.pm b/Shared.pm
@@ -1,73 +0,0 @@
-package PriceChart::Shared;
-
-use DBI;
-use Exporter;
-use POSIX;
-
-@ISA = ("Exporter");
-@EXPORT = qw(get_config get_dom get_ua get_log get_dbh);
-
-
-sub get_config
-{
- my $parser = Config::Grammar->new({
- _vars => [
- 'user_agent',
- 'email',
- 'smtp',
- ],
- });
- my $cfg_file = "/etc/pricechart.cfg";
- return $parser->parse($cfg_file) or die "error: $parser->{err}\n";
-}
-
-sub get_dbh
-{
- my $db_dir = "/var/www/db";
- mkdir $db_dir;
-
- my $dbh = DBI->connect(
- "dbi:SQLite:dbname=$db_dir/pricechart.db",
- "",
- "",
- { RaiseError => 1 }
- ) or die $DBI::errstr;
- return $dbh;
-}
-
-sub get_dom
-{
- my $url = shift;
- my $ua = shift;
-
- my $resp = $ua->get($url);
- if (! $resp->is_success) {
- print "getting $url failed: " . $resp->status_line . "\n";
- return undef;
- }
- return HTML::Grabber->new(html => $resp->decoded_content);
-}
-
-sub get_ua
-{
- my $cfg = shift;
-
- my $ua = LWP::UserAgent->new(agent => $cfg->{user_agent});
- $ua->default_header("Accept" => "*/*");
-
- return $ua;
-}
-
-sub get_log
-{
- my $file = shift;
- my $log_dir = "/var/www/logs/pricechart";
-
- mkdir $log_dir;
- open my $log, ">>", "$log_dir/$file.log" || die "$!";
-
- print $log strftime "%b %e %Y %H:%M ", localtime;
- return $log;
-}
-
-1;
diff --git a/gen_index.pl b/gen_index.pl
@@ -4,7 +4,7 @@ use strict;
use warnings;
use File::Copy;
-use PriceChart::Shared;
+use PriceChart;
use Template;
diff --git a/gen_svg.pl b/gen_svg.pl
@@ -6,7 +6,7 @@ use warnings;
use Getopt::Std;
use SVG;
use POSIX;
-use PriceChart::Shared;
+use PriceChart;
my %args;
diff --git a/price_scraper.pl b/price_scraper.pl
@@ -7,7 +7,7 @@ use Config::Grammar;
use Getopt::Std;
use HTML::Grabber;
use LWP::Simple;
-use PriceChart::Shared;
+use PriceChart;
my %args;
@@ -59,12 +59,11 @@ while (my ($vendor, $url, $price_tag, $sale_tag) = $vendor_sth->fetchrow_array)
my $start = time;
print "$vendor:\n" if ($args{v});
- my $dom = get_dom($url . $part_num, $ua);
+ my $dom = get_dom($url . $part_num, $ua, $args{v});
if (!defined $dom) {
msg("e", "error: dom");
next;
}
- print "\turl GET ok\n" if ($args{v});
my $price = get_price($price_tag, $dom);
if ($sale_tag) {
diff --git a/product_scraper.pl b/product_scraper.pl
@@ -9,7 +9,7 @@ use Email::Send;
use Getopt::Std;
use HTML::Grabber;
use LWP::Simple;
-use PriceChart::Shared;
+use PriceChart;
my %args;
@@ -40,57 +40,60 @@ my $vendor = "Memory Express";
my $product_url = "http://www.memoryexpress.com/Products/";
my %product_map = ("televisions" => "Televisions",
"laptops" => "LaptopsNotebooks",
- "hard_drives" => "HardDrives");
+ "hard drives" => "HardDrives");
-my $qry = "insert into products(part_num, manufacturer, description, type, " .
+my $sql = "insert into products(part_num, manufacturer, description, type, " .
"first_seen, last_seen, last_scraped) values (?, ?, ?, ?, ?, ?, ?)";
-my $insert_sth = $dbh->prepare($qry);
+my $insert_sth = $dbh->prepare($sql);
# also update description, manufacturer?
-$qry = "update products set last_seen = ? where part_num = ?";
-my $update_sth = $dbh->prepare($qry);
+$sql = "update products set last_seen = ? where part_num = ?";
+my $update_sth = $dbh->prepare($sql);
my $summary .= "type scraped total new errors time (s)\n";
$summary .= "----------- ------- ----- --- ------ --------\n";
-my $new_products;
-my $errors;
+my ($new_products, $errors);
+
+while (my ($type, $name) = each %product_map) {
+ print "Enumerating $type\n";
-for my $type (keys %product_map) {
my $class_url = "http://www.memoryexpress.com/Category/" .
- "$product_map{$type}?PageSize=120&Page=";
- my $dom = get_dom($class_url . "1", $ua);
- next if (! defined $dom);
-
- print "GET " . $class_url . "1 OK\n" if ($args{v});
-
- $dom = $dom->find(".AJAX_List_Pager");
- my @elements = $dom->find("li")->html_array();
- my $pages;
- if (@elements == 2) {
- $pages = 1;
- } else {
- $pages = (@elements / 2) - 1;
- }
+ "$name?PageSize=40&Page=";
+
+ # Get first page of results
+ my $dom = get_dom($class_url . "1", $ua, $args{v});
+ next if (!defined $dom);
- print "$pages pages of products found\n" if ($args{v});
+ # Extract the first of two pager widgets on the page
+ my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array();
+ next if (!defined $pager_html);
+ print "Found .AJAX_List_Pager\n" if ($args{v});
+ # Find how many pages of results we have
+ my $pager = HTML::Grabber->new(html => $pager_html);
+ my $pages = $pager->find("li")->html_array();
+ next unless ($pages);
+
+ # If more than 1 page of results are found, the pager contains a "next"
+ # arrow that needs to be accounted for
+ $pages-- if ($pages > 1);
+ print "Found $pages pages\n" if ($args{v});
+
+ # Loop over all results pages and append all products
my @thumbnails;
for (1..$pages) {
- $dom = get_dom($class_url . "$_", $ua);
- return if (! defined $dom);
-
- print "GET " . $class_url . "$_ OK\n" if ($args{v});
+ $dom = get_dom($class_url . "$_", $ua, $args{v});
+ next if (!defined $dom);
- # $dom->filter(".AJAX_List_Body");
+ # Each product is contained inside of their own PIV_Regular
push @thumbnails, $dom->find(".PIV_Regular")->html_array();
}
my $total = scalar @thumbnails;
- print "\nprocessing $type: ($total)\n" if ($args{v});
+ print "Found $total $type\n" if ($args{v});
- my ($new, $old) = (0, 0);
- my $start = time;
+ my ($new, $old, $start) = (0, 0, time);
for my $thumbnail_html (@thumbnails) {
sleep int(rand(10));
@@ -101,7 +104,7 @@ for my $type (keys %product_map) {
next unless (defined $product_id);
# get the part number from the product page as early as possible
- my $product_dom = get_dom("$product_url$product_id", $ua);
+ my $product_dom = get_dom("$product_url$product_id", $ua, $args{v});
my $part_num = get_tag_text($product_dom, "#ProductAdd");
next unless (defined $part_num);
diff --git a/search.pl b/search.pl
@@ -6,7 +6,7 @@ use warnings;
use FCGI;
use Getopt::Std;
use Template;
-use PriceChart::Shared;
+use PriceChart;
use Unix::Syslog qw(:macros :subs);
use URI::Escape;