commit 94b8a5ba8c4e9fc96dcb04f911e4b04fdf16777f
parent 37205039e32bba72b4bb4accd25e63bad7a3b8f5
Author: Kyle R W Milz <kyle@getaddrinfo.net>
Date: Sun, 10 Aug 2014 21:13:27 -0600
product_scraper: add initial implementation
Diffstat:
A | product_scraper.pl | | | 166 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 166 insertions(+), 0 deletions(-)
diff --git a/product_scraper.pl b/product_scraper.pl
@@ -0,0 +1,166 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use Config::Grammar;
+use Data::Dumper;
+use DBI;
+use File::Basename;
+use Getopt::Std;
+use JSON;
+use HTML::Grabber;
+use LWP::Simple;
+use POSIX;
+
+
+my %args;
+getopts("v", \%args);
+
+if ($args{v}) {
+ # Disable buffering on STDOUT
+ $| = 1;
+ select STDOUT;
+}
+
+my $dbh = DBI->connect(
+ "dbi:SQLite:dbname=pricechart.db",
+ "",
+ "",
+ { RaiseError => 1 },) or die $DBI::errstr;
+
+$dbh->do("create table if not exists products(" .
+ "part_num text not null primary key, " .
+ "brand text, " .
+ "title text, " .
+ "first_seen int, " .
+ "last_seen int)") or die $DBI::errstr;
+
+# Chrome 36 Win7 64bit
+my $user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";
+my $ua = LWP::UserAgent->new(agent => $user_agent);
+$ua->default_header("Accept" => "*/*");
+
+#
+# Memory Express
+#
+my %product_map = {televisions => "Televisions",
+ laptops => "LaptopsNotebooks",
+ hard_drives => "HardDrives"};
+for (keys %product_map) {
+
+ print "*** $_ ***\n";
+
+ my $class_url = "http://www.memoryexpress.com/Category/" .
+ "$product_map{$_}?PageSize=120&Page=";
+ my $dom = get_dom($class_url . "1");
+ return if (! defined $dom);
+
+ $dom = $dom->find(".AJAX_List_Pager");
+ my @elements = $dom->find("li")->html_array();
+ my $pages;
+ if (@elements == 2) {
+ $pages = 1;
+ } else {
+ $pages = (@elements / 2) - 1;
+ }
+
+ my @results;
+ for (1..$pages) {
+ $dom = get_dom($class_url . "$_");
+ return if (! defined $dom);
+
+ # $dom->filter(".AJAX_List_Body");
+ push @results, $dom->find(".PIV_Regular")->html_array();
+ }
+
+ my $scraped = 0;
+ my @new_products = ();
+ for my $node (@results) {
+ my $product = HTML::Grabber->new(html => $node);
+
+ # title is easier to parse from general results page
+ my $title = $product->find(".ProductTitle")->text();
+ next if (not_defined($title, "title", $node));
+
+ # brand is easier to parse from general results page
+ my $brand = $product->find(".ProductBrand")->html();
+ ($brand) = ($brand =~ m/Brand: ([A-Za-z]+)/);
+ next if (not_defined($brand, "brand", $node));
+
+ # used to visit the actual product page
+ my $product_id = $product->find(".ProductId")->text();
+ next if (not_defined($product_id, "product ID", $node));
+
+ my $product_url = "http://www.memoryexpress.com/Products/";
+ my $product_dom = get_dom("$product_url$product_id");
+
+ # part number only found on product page
+ my $part_num = $product_dom->find("#ProductAdd")->text;
+ ($part_num) = ($part_num =~ m/Part #: (.*)/);
+ next if (not_defined($part_num, "part number", $product_dom));
+
+ my $query = "select * from products where part_num = ?";
+ my $sth = $dbh->prepare($query);
+ $sth->execute($part_num);
+ if ($sth->fetchrow_array()) {
+ $dbh->do("update products set last_seen = ? where part_num = ?",
+ undef, time, $part_num);
+ }
+ else {
+ $dbh->do("insert into products(" .
+ "part_num, brand, title, first_seen, last_seen)" .
+ " values (?, ?, ?, ?, ?)",
+ undef, $part_num, $brand, $title, time, time);
+ #$dbh->do("create table [$part_num]" .
+ # "(unix_time int not null primary key)");
+ push @new_products, ([$brand, $title, $part_num]);
+ }
+
+ $scraped++;
+ last;
+ }
+
+ print "scraped/total: $scraped/" . @results . "\n";
+ print "new: " . scalar @new_products . "\n";
+ print " - $_->[0] $_->[1] $_->[2]\n" for (@new_products);
+ print "\n";
+}
+
+#
+# Best Buy
+#
+# my %product_map = {televisions => "led-tvs/25993.aspx"};
+
+#
+# Visions
+#
+# televisions = http://www.visions.ca/Catalogue/Category/ProductResults.aspx?categoryId=5&menu=9&pz=30
+# televisions_page = &px=<PAGE>
+# product_list = .centerPanel
+
+sub not_defined
+{
+ my $var = shift;
+ my $var_name = shift;
+ my $dom = shift;
+
+ if (!defined $var) {
+ print "could not find $var_name, DOM was:\n";
+ print "$dom\n";
+ return 1;
+ }
+ return 0;
+}
+
+sub get_dom
+{
+ my $url = shift;
+
+ my $resp = $ua->get($url);
+ if (! $resp->is_success) {
+ print STDERR "getting $url failed: " . $resp->status_line . "\n";
+ return undef;
+ }
+ return HTML::Grabber->new(html => $resp->decoded_content);
+}