pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 94b8a5ba8c4e9fc96dcb04f911e4b04fdf16777f
parent 37205039e32bba72b4bb4accd25e63bad7a3b8f5
Author: Kyle R W Milz <kyle@getaddrinfo.net>
Date:   Sun, 10 Aug 2014 21:13:27 -0600

product_scraper: add initial implementation

Diffstat:
A product_scraper.pl | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 166 insertions(+), 0 deletions(-)

#!/usr/bin/env perl

# product_scraper.pl - scrape retailer category listings and record every
# product found (part number, brand, title, first/last seen timestamps) in
# the "products" table of the SQLite database pricechart.db.
#
# Usage: product_scraper.pl [-v]
#   -v  turn off STDOUT buffering so progress output appears immediately

use strict;
use warnings;

use Config::Grammar;
use Data::Dumper;
use DBI;
use File::Basename;
use Getopt::Std;
use JSON;
use HTML::Grabber;
use LWP::Simple;
# LWP::UserAgent is used directly below; load it explicitly instead of
# relying on LWP::Simple pulling it in.
use LWP::UserAgent;
use POSIX;


my %args;
getopts("v", \%args);

if ($args{v}) {
    # Disable buffering on STDOUT
    $| = 1;
    select STDOUT;
}

my $dbh = DBI->connect(
    "dbi:SQLite:dbname=pricechart.db",
    "",
    "",
    { RaiseError => 1 },) or die $DBI::errstr;

$dbh->do("create table if not exists products(" .
    "part_num text not null primary key, " .
    "brand text, " .
    "title text, " .
    "first_seen int, " .
    "last_seen int)") or die $DBI::errstr;

# Chrome 36 Win7 64bit
my $user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";
my $ua = LWP::UserAgent->new(agent => $user_agent);
$ua->default_header("Accept" => "*/*");

#
# Memory Express
#
# Maps our product class name to the category path component used in the
# site's URLs. This must be a list in parentheses: assigning a {...} hash
# reference to a hash yields a single bogus key.
my %product_map = (televisions => "Televisions",
                   laptops     => "LaptopsNotebooks",
                   hard_drives => "HardDrives");

my $product_url = "http://www.memoryexpress.com/Products/";

CATEGORY:
for my $class (sort keys %product_map) {

    print "*** $class ***\n";

    my $class_url = "http://www.memoryexpress.com/Category/" .
        "$product_map{$class}?PageSize=120&Page=";

    # The first page is fetched only to learn how many pages there are.
    # A failed fetch skips this category; "return" is illegal at file scope.
    my $dom = get_dom($class_url . "1");
    next CATEGORY if (! defined $dom);

    my $pager = $dom->find(".AJAX_List_Pager");
    my @elements = $pager->find("li")->html_array();

    # NOTE(review): the page-count arithmetic presumably reflects the pager
    # markup appearing twice per page plus extra navigation items - confirm
    # against the live site.
    my $pages;
    if (@elements == 2) {
        $pages = 1;
    }
    else {
        $pages = (@elements / 2) - 1;
    }

    # Collect the HTML of every product tile across all result pages.
    my @results;
    for my $page (1..$pages) {
        my $page_dom = get_dom($class_url . "$page");
        next CATEGORY if (! defined $page_dom);

        # $dom->filter(".AJAX_List_Body");
        push @results, $page_dom->find(".PIV_Regular")->html_array();
    }

    my $scraped = 0;
    my @new_products = ();

    PRODUCT:
    for my $node (@results) {
        my $product = HTML::Grabber->new(html => $node);

        # title is easier to parse from general results page
        my $title = $product->find(".ProductTitle")->text();
        next PRODUCT if (not_defined($title, "title", $node));

        # brand is easier to parse from general results page
        my $brand_html = $product->find(".ProductBrand")->html();
        my ($brand) = defined $brand_html
            ? ($brand_html =~ m/Brand: ([A-Za-z]+)/)
            : ();
        next PRODUCT if (not_defined($brand, "brand", $node));

        # used to visit the actual product page
        my $product_id = $product->find(".ProductId")->text();
        next PRODUCT if (not_defined($product_id, "product ID", $node));

        # get_dom has already reported any failure on STDERR
        my $product_dom = get_dom("$product_url$product_id");
        next PRODUCT if (! defined $product_dom);

        # part number only found on product page
        my $part_text = $product_dom->find("#ProductAdd")->text;
        my ($part_num) = defined $part_text
            ? ($part_text =~ m/Part #: (.*)/)
            : ();
        next PRODUCT if (not_defined($part_num, "part number", $product_dom));

        # One timestamp per product so first_seen == last_seen on insert.
        my $now = time;

        # count(*) gives an unambiguous boolean; testing a fetched row of
        # "select *" in scalar context depends on a column's value.
        my ($known) = $dbh->selectrow_array(
            "select count(*) from products where part_num = ?",
            undef, $part_num);

        if ($known) {
            $dbh->do("update products set last_seen = ? where part_num = ?",
                undef, $now, $part_num);
        }
        else {
            $dbh->do("insert into products(" .
                "part_num, brand, title, first_seen, last_seen)" .
                " values (?, ?, ?, ?, ?)",
                undef, $part_num, $brand, $title, $now, $now);
            #$dbh->do("create table [$part_num]" .
            #    "(unix_time int not null primary key)");
            push @new_products, ([$brand, $title, $part_num]);
        }

        $scraped++;

        # NOTE(review): stops after the first successfully scraped product
        # in each category. Looks like a development throttle - confirm
        # before removing.
        last PRODUCT;
    }

    print "scraped/total: $scraped/" . @results . "\n";
    print "new: " . scalar @new_products . "\n";
    print "  - $_->[0] $_->[1] $_->[2]\n" for (@new_products);
    print "\n";
}

#
# Best Buy
#
# my %product_map = (televisions => "led-tvs/25993.aspx");

#
# Visions
#
# televisions = http://www.visions.ca/Catalogue/Category/ProductResults.aspx?categoryId=5&menu=9&pz=30
# televisions_page = &px=<PAGE>
# product_list = .centerPanel

# Report a missing scraped field.
#
# Arguments:
#   $var      - the scraped value to check
#   $var_name - human readable field name used in the message
#   $dom      - the markup (or DOM object) that was searched, printed to
#               help diagnose why the field was not found
#
# Returns 1 (after printing a diagnostic to STDOUT) when $var is undefined,
# 0 otherwise.
sub not_defined
{
    my ($var, $var_name, $dom) = @_;

    return 0 if (defined $var);

    print "could not find $var_name, DOM was:\n";
    print "$dom\n";
    return 1;
}

# Fetch a URL with the shared user agent and parse the response body.
#
# Arguments:
#   $url - address to fetch
#
# Returns an HTML::Grabber object built from the decoded response content,
# or nothing (undef in scalar context) after logging the HTTP status line
# to STDERR when the request was not successful.
sub get_dom
{
    my ($url) = @_;

    my $resp = $ua->get($url);
    if (! $resp->is_success) {
        print STDERR "getting $url failed: " . $resp->status_line . "\n";
        return;
    }
    return HTML::Grabber->new(html => $resp->decoded_content);
}