pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 9043c6646c544b5f0427a21adac7e85ad145b087
Author: Kyle R W Milz <kyle@getaddrinfo.net>
Date:   Wed,  6 Aug 2014 00:01:25 -0600

pricegraph: initial commit

Diffstat:
Apricegraph | 262+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apricegraph.1 | 88+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apricegraph.cfg | 224+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 574 insertions(+), 0 deletions(-)

diff --git a/pricegraph b/pricegraph @@ -0,0 +1,262 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use Config::Grammar; +use Data::Dumper; +use File::Basename; +use Getopt::Std; +use JSON; +use HTML::Grabber; +use LWP::Simple; +use POSIX; + + +my %args; +getopts('adf:i:np:rv', \%args); + +my $parser = Config::Grammar->new({ + _sections => ['products', 'vendors', 'paths'], + products => { + # manufacturer regular expression + _sections => ['/[A-Za-z]+/'], + '/[A-Za-z]+/' => { + # part number regular expression + _sections => ['/[A-Za-z0-9]+/'], + '/[A-Za-z0-9]+/' => { + }, + }, + }, + vendors => { + # vendor regular expression + _sections => ['/[A-Za-z ]+/'], + '/[A-Za-z ]+/' => { + _vars => ['search_uri', 'title', 'reg_price', 'sale_price', 'color'], + }, + }, + paths => { + _vars => ['http', 'data', 'log'], + }, +}); + +my $cfg_file; +if ($args{f}) { + $cfg_file = $args{f}; +} +elsif (-e "/etc/pricegraph.cfg") { + $cfg_file = "/etc/pricegraph.cfg"; +} +elsif (-e "pricegraph.cfg") { + $cfg_file = "pricegraph.cfg"; +} + +my $cfg = $parser->parse($cfg_file) or die "ERROR: $parser->{err}\n"; + +if ($args{v}) { + # Disable buffering on STDOUT + $| = 1; + select STDOUT; +} +else { + open my $logfile, ">>", "$cfg->{paths}{log}" or die $!; + select $logfile; +} + +if ($args{a}) { + scrape_vendors($_) for (make_parts_list()); + regenerate_json(); +} +elsif ($args{d}) { + print Dumper($cfg); +} +elsif ($args{p}) { + scrape_vendors($args{p}); +} +elsif ($args{r}) { + regenerate_json(); +} +else { + srand; + my @parts = make_parts_list(); + scrape_vendors($parts[rand @parts]); + regenerate_json(); +} + +sub make_parts_list +{ + my @parts; + for (sort keys $cfg->{products}) { + push @parts, sort keys $cfg->{products}{$_}; + } + return @parts; +} + +sub scrape_vendors +{ + my $part_no = shift; + my $time_start = time; + my @prices; + my @titles; + + print strftime '%b %e %Y %H:%M ', localtime; + printf '%-10s [', $part_no; + + my $ua = LWP::UserAgent->new(agent => 
'Mozilla/5.0'); + # some sites need this (amazon I think?) + $ua->default_header('Accept' => '*/*'); + + while (my ($name, $vendor) = each ($cfg->{vendors})) { + + my $resp = $ua->get("$vendor->{search_uri}$part_no"); + if (! $resp->is_success) { + print STDERR "$name: " . $resp->status_line . "\n"; + print ' '; + next; + } + + my $dom = HTML::Grabber->new(html => $resp->decoded_content); + + #if (substr($vendor->{context}, 0, 1) eq '@') { + # $vendor->{context} =~ s/@/#/; + #} + + #my $context = $dom->find($vendor->{context})->html(); + #if ($context) { + # $dom = HTML::Grabber->new(html => $context); + #} + #else { + # print ' '; + # next; + #} + + my @title = $dom->find($vendor->{title})->text_array; + if (@title) { + $title[0] =~ s/^\s+//; + $title[0] =~ s/\s+$//; + + if ($part_no =~ m/(.*$title[0].*)/) { + print "part number in title\n"; + } + } + + my $price = $dom->find($vendor->{reg_price})->text; + if ($vendor->{sale_price}) { + my $sale = $dom->find($vendor->{sale_price})->text; + $price = $sale if ($sale ne ''); + } + + if (! $price) { + print ' '; + next; + } + + ($price) = ($price =~ m/(\d[\d,]+)/); + $price =~ s/,//; + + print substr($name, 0, 1); + push @prices, "$name=$price"; + push @titles, "$name=$title[0]"; + } + + print '] (' . (time - $time_start) . " s)\n"; + if ($args{v}) { + print "$_\n" for @prices; + print "$_\n" for @titles; + } + + #for (keys %title_dict) { + # print "$_ " if ($title_dict{$_} / $total_titles >= 0.5); + #} + # for (split(" ", $title)) { + # if (! 
$title_dict{$_}) { + # $title_dict{$_} = 0; + # } + # $title_dict{$_}++; + # } + # $total_titles++; + #print "\n"; + + return if ($args{n} || (scalar @prices) == 0); + + mkdir $cfg->{paths}{data}; + open FILE, ">>", "$cfg->{paths}{data}/$part_no.txt" or die $!; + print FILE time * 1000; + print FILE "\t$_" for @prices; + print FILE "\n"; + close FILE; +} + +sub regenerate_json +{ + my $pretty = 0; + $pretty = 1 if $args{v}; + + mkdir "$cfg->{paths}{http}/json"; + + my @manufacturers = sort keys $cfg->{products}; + open my $fh, '>', "$cfg->{paths}{http}/json/manufacturers.json" or die $!; + print $fh to_json(\@manufacturers, {pretty => $pretty}); + close $fh; + + open $fh, '>', "$cfg->{paths}{http}/json/vendors.json" or die $!; + print $fh to_json($cfg->{vendors}, {pretty => $pretty}); + close $fh; + + print "Regenerating... " if $args{v}; + + my %parts; + opendir(DIR, $cfg->{paths}{data}); + while (my $file = readdir(DIR)) { + next if ($file =~ m/^\./); + + my %part; + my $part_num = basename($file, '.txt'); + print $part_num if ($args{v}); + + my %tmp; + open FILE, "<", "$cfg->{paths}{data}/$file" or die $!; + while (<FILE>) { + chomp; + my @fields = split("\t", $_); + + my $date = $fields[0]; + splice(@fields, 0, 1); + foreach (@fields) { + my ($l, $r) = split("=", $_); + if (! 
defined $tmp{$l}) { + $tmp{$l}{data} = []; + $tmp{$l}{name} = $l; + if ($cfg->{vendors}{$l}) { + $tmp{$l}{color} = "#$cfg->{vendors}{$l}{color}"; + } + } + push @{$tmp{$l}{data}}, [int($date), int($r)]; + } + } + close FILE; + + @{$part{vendors}} = keys %tmp; + @{$part{series}} = values %tmp; + $part{part_num} = $part_num; + + for my $manuf (keys $cfg->{products}) { + for (keys $cfg->{products}{$manuf}) { + $part{manuf} = $manuf if ($_ eq $part_num); + } + } + + if ($args{v}) { + print chr(0x08) for split("", $part_num); + } + + $parts{$part_num} = \%part; + } + closedir(DIR); + + open $fh, ">$cfg->{paths}{http}/json/products.json" or die $!; + print $fh to_json(\%parts, {pretty => $pretty}); + close $fh; + + print "done. \n" if $args{v}; +} diff --git a/pricegraph.1 b/pricegraph.1 @@ -0,0 +1,88 @@ +.Dd $Mdocdate$ +.Dt pricegraph 1 +.Os +.Sh NAME +.Nm pricegraph +.Nd scrape price information and generate static web charts +.\" .Sh LIBRARY +.\" For sections 2, 3, & 9 only. +.\" Not used in OpenBSD. +.Sh SYNOPSIS +.Nm pricegraph +.Op Fl nv +.Op Fl f Ar file +.Op Fl a | Fl p Ar product | Fl r +.Sh DESCRIPTION +The +.Nm +utility scrapes price information from websites and stores it into a +database. It then generates a complete static website that contains +charts showing the current and historical prices of each tracked item. + +The arguments are as follows: +.Bl -tag -width Ds +.It Fl a +Scrape all products in the configuration file at once. Not recommended. +.El +.Bl -tag -width Ds +.It Fl f Ar file +Use config from specified +.Ar file +instead of the /etc/pricegraph.cfg default. +.El +.Bl -tag -width Ds +.It Fl n +Do not save scraped information into database. +.El +.Bl -tag -width Ds +.It Fl p Ar product +Scrape prices for the specified +.Ar product +instead of a random one chosen from the config file. +.El +.Bl -tag -width Ds +.It Fl v +Print verbose information. 
+.El + +When +.Nm +is run with no arguments a product is chosen at random from the +configuration file and the price information is scraped, logged and +saved. +.Sh LOG FORMAT +The default logging format will be of the form + +Dec 24 2013 [MV BF ] (10s) Samdung UN32EH3200 32" + +And the verbose logging format of the form + +Dec 24 2013 UN32EH3200 +[M] [CSP] (2s) +.Sh CONFIGURATION FILE +.Sh FILES +.Pa /etc/pricegraph.cfg + +.\" .Sh IMPLEMENTATION NOTES +.\" Not used in OpenBSD. +.\" .Sh RETURN VALUES +.\" For sections 2, 3, & 9 only. +.\" .Sh ENVIRONMENT +.\" For sections 1, 6, 7, & 8 only. +.\" .Sh FILES +.\" .Sh EXIT STATUS +.\" For sections 1, 6, & 8 only. +.\" .Sh EXAMPLES +.\" .Sh DIAGNOSTICS +.\" For sections 1, 4, 6, 7, & 8 only. +.\" .Sh ERRORS +.\" For sections 2, 3, & 9 only. +.\" .Sh SEE ALSO +.\" .Xr foobar 1 +.\" .Sh STANDARDS +.\" .Sh HISTORY +.\" .Sh AUTHORS +.\" .Sh CAVEATS +.\" .Sh BUGS +.\" .Sh SECURITY CONSIDERATIONS +.\" Not used in OpenBSD. diff --git a/pricegraph.cfg b/pricegraph.cfg @@ -0,0 +1,224 @@ +*** paths *** + +http = /var/www/htdocs/pricegraph +data = data # must be a directory +log = pricegraph.txt + +*** products *** + ++ Samsung + +++ UN32F5500 +++ UN32EH4003 +++ UN32EH5300 +++ UN39EH5003 +++ HG40NA570L +++ UN40EH5300 +++ UN40FH6030 +++ UN40F5500 +++ UN40F6300 +++ UN46EH5300 +++ UN46F5500 +++ UN46F6300 +++ UN46F6800 +++ UN46FH6030 +++ UN50EH5300 +++ UN50F5500 +++ UN50F6300 +++ UN50F6800 +++ UN55F6300 +++ UN55F6800 +++ UN55F7050 +++ UN55F7100 +++ UN55F8000 +++ UN55FH6030 +++ UN55FH6200 +++ UN55F7500 +++ UN55F9000 +# ++ KN55S9 +++ UN60ES6500 +++ UN60F6300 +++ UN60F6400 +++ UN60F7050 +++ UN60F7100 +++ UN60F8000 +++ UN60FH6200 +++ UN65EH6000 +++ UN65F6300 +++ UN65FH6001 +++ UN65F6400 +++ UN65F7100 +++ UN65F8000 +++ UN65F9000 +++ UN75F6300 +++ UN75F7100 +++ UN75F8000 +++ UN85S9 + ++ Toshiba + +++ 32L1300UC +++ 39L1350UC +++ 39L4300UC +++ 50L1350UC +++ 50L4300UC +++ 50L5300 +++ 50L7300 +++ 58L1350 +++ 58L7350UC +++ 58L9300 +++ 65L7350UC 
+++ 65L9300 +++ 84L9300 + ++ Sharp + +++ LC60C8470U +++ LC60LE450U +++ LC60LE550U +++ LC60LE650U +++ LC60LE757U +++ LC70C8470U +++ LC70LE550U +++ LC70LE650U +++ LC70LE757U +++ LC80LE642U +++ LC80LE650U +++ LC80LE757U +++ LC80LE857U +++ LC90LE657U + ++ Sony + +++ KDL32R400A +++ KDL40R450A +++ KDL46R450A +# ++ KDL47W802A +# ++ KDL50R550A +++ KDL55W802A +++ KDL55W900A +# ++ KDL60R550A +++ KDL70R550A +++ XBR55X900A +++ XBR65X900A + ++ Panasonic + +++ TCL32B6 +++ TCL42E60 +++ TCL47ET60 +++ TCL55ET60 +++ TCL55WT50 + ++ LG + +++ 32LN530B +++ 32LN5700 +++ 42LA6205 +++ 42LN5300 +++ 42LN5400 +++ 42LN5700 +++ 47LA6205 +++ 47LN5400 +++ 47LN5750 +++ 50LA6205 +++ 50LN5310 +++ 50LN5750 +++ 55LA8600 +++ 55LN5310 +++ 55LN5400 +++ 55LN5750 +++ 60LA7400 +++ 60LA8600 +++ 65LA9700 + + +*** vendors *** + ++ Memory Express +# +# On sale: +# <div class="PIV_BotPrices"> +# <div class="PIV_PriceRegular">Reg: <span>$359.99</span></div> +# <div class="PIV_PriceSale"> +# $279.99 +# </div> +# </div> +# +# Regular price: +# <div class="PIV_BotPrices"> +# <div class="PIV_Price"> +# <span>$359.99</span> +# </div> +# </div> +# +color = 56B849 +search_uri = http://www.memoryexpress.com/Search/Products?Search= +title = .ProductTitle +reg_price = .PIV_Price +sale_price = .PIV_PriceSale + ++ Future Shop +color = BA0024 +search_uri = http://www.futureshop.ca/Search/SearchResults.aspx?query= +title = .prod-title +reg_price = .dollars + ++ Visions Electronics +# +# <td class="price"> +# <span id="ctl00_..." class="regPrice">Price: <span>$509.99</span></span> +# <span id="ctl00_..." class="salePrice">Sale Price: $336.00</span> +# </td> +# +# price is a unique class when only a single product is returned and +# can be used to make sure only a single product has been returned. +# Products that are on sale return both regPrice and salePrice classes +# while regularly priced products only return the regPrice class. 
+# +color = 000 +search_uri = http://www.visions.ca/catalogue/category/ProductResults.aspx?searchText= +title = .plProductName +reg_price = .regPrice +sale_price = .salePrice + ++ London Drugs +color = 005DAB +search_uri = http://www.londondrugs.com/on/demandware.store/Sites-LondonDrugs-Site/default/Search-Show?q= +title = .productname +reg_price = .pricing +#reg_price = .standardprice +#sale_price = .salesprice + ++ Amazon +color = FFA51D +search_uri = http://www.amazon.ca/s/keywords= +title = .newaps +reg_price = .price + +# + Tiger Direct +# color = 660 +# search_uri = http://www.tigerdirect.ca/applications/SearchTools/search.asp?keywords= +# price_context = +# reg_price = .salePrice +# sale_price = + ++ Best Buy +color = 003B64 +search_uri = http://www.bestbuy.ca/Search/SearchResults.aspx?query= +title = .product-title, .prod-title +#sale_price = .price-onsale +reg_price = .prodprice + +# + RadioShack +# color = E76453 +# search_uri = http://www.radioshack.com/search/controller.jsp?kw= +# title = .title +# price_context = .product-price-tag +# reg_price = .price + +# + Walmart +# color = 0000FF +# search_uri = http://www.walmart.ca/search/ +# title = .title +# reg_price = .price-current