commit 9043c6646c544b5f0427a21adac7e85ad145b087
Author: Kyle R W Milz <kyle@getaddrinfo.net>
Date: Wed, 6 Aug 2014 00:01:25 -0600
pricegraph: initial commit
Diffstat:
A | pricegraph | | | 262 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | pricegraph.1 | | | 88 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | pricegraph.cfg | | | 224 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 574 insertions(+), 0 deletions(-)
diff --git a/pricegraph b/pricegraph
@@ -0,0 +1,262 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use Config::Grammar;
+use Data::Dumper;
+use File::Basename;
+use Getopt::Std;
+use JSON;
+use HTML::Grabber;
+use LWP::Simple;
+use POSIX;
+
+
+# Command line flags, as used further down in this script:
+#   -a        scrape every configured product, then regenerate the JSON
+#   -d        dump the parsed configuration
+#   -f file   configuration file to read
+#   -n        do not save scraped prices
+#   -p part   scrape a single part number
+#   -r        regenerate the JSON files only
+#   -v        verbose: write to STDOUT instead of the log file
+# -i takes an argument and is accepted here, but nothing visible in this
+# script reads it.
+my %args;
+getopts('adf:i:np:rv', \%args);
+
+# Section names written as /.../ are regular expressions; they double as
+# the hash keys under which the matching sub-grammar is stored.
+my $manufacturer_re = '/[A-Za-z]+/';
+my $part_number_re  = '/[A-Za-z0-9]+/';
+my $vendor_re       = '/[A-Za-z ]+/';
+
+# products: one section per manufacturer, each holding one (empty)
+# section per part number.
+my %products_grammar = (
+    _sections        => [$manufacturer_re],
+    $manufacturer_re => {
+        _sections       => [$part_number_re],
+        $part_number_re => {},
+    },
+);
+
+# vendors: one section per vendor with its search URI, selectors and color.
+my %vendors_grammar = (
+    _sections  => [$vendor_re],
+    $vendor_re => {
+        _vars => ['search_uri', 'title', 'reg_price', 'sale_price', 'color'],
+    },
+);
+
+# paths: plain variables, no sub-sections.
+my %paths_grammar = (
+    _vars => ['http', 'data', 'log'],
+);
+
+my $parser = Config::Grammar->new({
+    _sections => ['products', 'vendors', 'paths'],
+    products  => \%products_grammar,
+    vendors   => \%vendors_grammar,
+    paths     => \%paths_grammar,
+});
+
+# Choose the configuration file: an explicit -f wins, otherwise the first
+# of the default locations that exists is used.
+my $cfg_file;
+if ($args{f}) {
+    $cfg_file = $args{f};
+}
+else {
+    ($cfg_file) = grep { -e $_ } ("/etc/pricegraph.cfg", "pricegraph.cfg");
+}
+
+# Without this check an undefined file name would be handed to the parser.
+die "ERROR: no configuration file found (use -f file)\n"
+    unless defined $cfg_file;
+
+my $cfg = $parser->parse($cfg_file) or die "ERROR: $parser->{err}\n";
+
+# Decide where every print() without an explicit handle goes.
+if ($args{v}) {
+    # Verbose: stay on STDOUT and disable its buffering.
+    $| = 1;
+    select STDOUT;
+}
+else {
+    # Quiet: append to the log file named in the "paths" section.
+    my $log_path = $cfg->{paths}{log};
+    die "ERROR: no log path configured\n"
+        unless defined $log_path && $log_path ne '';
+
+    open my $logfile, ">>", $log_path
+        or die "ERROR: cannot open log file '$log_path': $!\n";
+    select $logfile;
+}
+
+# Run the mode selected on the command line; the first matching flag wins.
+if ($args{a}) {
+    # Scrape every configured part, then rebuild the JSON files.
+    scrape_vendors($_) for (make_parts_list());
+    regenerate_json();
+}
+elsif ($args{d}) {
+    # Dump the parsed configuration.
+    print Dumper($cfg);
+}
+elsif ($args{p}) {
+    # Scrape only the given part; the JSON files are not regenerated here.
+    scrape_vendors($args{p});
+}
+elsif ($args{r}) {
+    regenerate_json();
+}
+else {
+    # Default: scrape one randomly chosen part.  rand seeds itself on first
+    # use, so no explicit srand call is needed.
+    my @parts = make_parts_list();
+
+    # An empty list would otherwise pass undef to scrape_vendors.
+    die "ERROR: no products configured\n" unless @parts;
+
+    scrape_vendors($parts[rand @parts]);
+    regenerate_json();
+}
+
+# Return the list of all configured part numbers.  Manufacturers are
+# visited in sorted order and each manufacturer's parts are sorted as well.
+sub make_parts_list
+{
+    # Explicit %{ } dereference: calling keys directly on a hash reference
+    # was experimental and is a fatal error from Perl 5.24 on.
+    my $products = $cfg->{products} || {};
+
+    my @parts;
+    for my $manuf (sort keys %{$products}) {
+        push @parts, sort keys %{ $products->{$manuf} || {} };
+    }
+    return @parts;
+}
+
+# Scrape every configured vendor for one part number.
+#
+# Takes the part number to search for.  A log line is printed consisting of
+# a timestamp, the part number and, per vendor, one progress character (the
+# first letter of the vendor name when a price was found, a blank
+# otherwise), followed by the elapsed seconds.  Unless -n was given or no
+# price was found at all, a line of the form
+#   <epoch milliseconds>\t<vendor>=<price>\t<vendor>=<price>...
+# is appended to <data dir>/<part number>.txt.
+sub scrape_vendors
+{
+    my $part_no = shift;
+    my $time_start = time;
+    my @prices;
+    my @titles;
+
+    print strftime '%b %e %Y %H:%M ', localtime;
+    printf '%-10s [', $part_no;
+
+    my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0');
+    # some sites need this (amazon I think?)
+    $ua->default_header('Accept' => '*/*');
+
+    # Iterate over sorted keys rather than each() on the hash reference:
+    # each/keys on a reference is a fatal error from Perl 5.24 on, and a
+    # fixed order keeps the progress characters in the same column on
+    # every run.
+    for my $name (sort keys %{ $cfg->{vendors} }) {
+        my $vendor = $cfg->{vendors}{$name};
+
+        my $resp = $ua->get("$vendor->{search_uri}$part_no");
+        if (! $resp->is_success) {
+            print STDERR "$name: " . $resp->status_line . "\n";
+            print ' ';
+            next;
+        }
+
+        my $dom = HTML::Grabber->new(html => $resp->decoded_content);
+
+        #if (substr($vendor->{context}, 0, 1) eq '@') {
+        #    $vendor->{context} =~ s/@/#/;
+        #}
+
+        #my $context = $dom->find($vendor->{context})->html();
+        #if ($context) {
+        #    $dom = HTML::Grabber->new(html => $context);
+        #}
+        #else {
+        #    print ' ';
+        #    next;
+        #}
+
+        # Empty string when the selector matched nothing, so that the
+        # verbose title listing below never interpolates undef.
+        my $title = '';
+        my @title = $dom->find($vendor->{title})->text_array;
+        if (@title && defined $title[0]) {
+            $title = $title[0];
+            $title =~ s/^\s+//;
+            $title =~ s/\s+$//;
+
+            # Literal substring test.  The scraped title used to be
+            # interpolated into a regex, where metacharacters in it could
+            # abort the run.
+            # NOTE(review): this checks whether the part number contains
+            # the title, which looks inverted relative to the message --
+            # confirm the intended direction.
+            if (index($part_no, $title) != -1) {
+                print "part number in title\n";
+            }
+        }
+
+        my $price = $dom->find($vendor->{reg_price})->text;
+        if ($vendor->{sale_price}) {
+            my $sale = $dom->find($vendor->{sale_price})->text;
+            $price = $sale if (defined $sale && $sale ne '');
+        }
+        $price = '' unless defined $price;
+
+        # Keep the first run of digits and thousands separators, i.e. the
+        # whole-number part of the first price in the text.  A single digit
+        # is enough, so "$5.99" yields 5 instead of skipping ahead to 99.
+        my ($amount) = $price =~ m/(\d[\d,]*)/;
+        if (! defined $amount) {
+            print ' ';
+            next;
+        }
+        # Remove every comma, not just the first one.
+        $amount =~ tr/,//d;
+
+        print substr($name, 0, 1);
+        push @prices, "$name=$amount";
+        push @titles, "$name=$title";
+    }
+
+    print '] (' . (time - $time_start) . " s)\n";
+    if ($args{v}) {
+        print "$_\n" for @prices;
+        print "$_\n" for @titles;
+    }
+
+    #for (keys %title_dict) {
+    #    print "$_ " if ($title_dict{$_} / $total_titles >= 0.5);
+    #}
+    #    for (split(" ", $title)) {
+    #        if (! $title_dict{$_}) {
+    #            $title_dict{$_} = 0;
+    #        }
+    #        $title_dict{$_}++;
+    #    }
+    #    $total_titles++;
+    #print "\n";
+
+    return if ($args{n} || (scalar @prices) == 0);
+
+    my $data_dir = $cfg->{paths}{data};
+    if (! -d $data_dir) {
+        mkdir $data_dir
+            or die "ERROR: cannot create '$data_dir': $!\n";
+    }
+
+    # One record per scrape: timestamp in milliseconds, then the prices.
+    my $data_file = "$data_dir/$part_no.txt";
+    open my $out, ">>", $data_file
+        or die "ERROR: cannot open '$data_file': $!\n";
+    print $out join("\t", time * 1000, @prices), "\n";
+    close $out
+        or die "ERROR: cannot write '$data_file': $!\n";
+}
+
+# Serialize $data as JSON into $path, replacing any previous contents.
+# Dies when the file cannot be opened or written.
+sub _write_json_file
+{
+    my ($path, $data, $pretty) = @_;
+
+    open my $fh, '>', $path
+        or die "ERROR: cannot open '$path': $!\n";
+    print $fh to_json($data, {pretty => $pretty});
+    # Buffered write errors only surface at close.
+    close $fh
+        or die "ERROR: cannot write '$path': $!\n";
+}
+
+# Rebuild the JSON files under <http dir>/json from the configuration and
+# from the per-part data files in the data directory:
+#   manufacturers.json  sorted list of manufacturer names
+#   vendors.json        the vendors section of the configuration
+#   products.json       per part number: vendors seen, one price series
+#                       per vendor, the part number and its manufacturer
+# With -v the JSON is pretty-printed and progress is shown.
+sub regenerate_json
+{
+    my $pretty = $args{v} ? 1 : 0;
+    my $json_dir = "$cfg->{paths}{http}/json";
+    my $data_dir = $cfg->{paths}{data};
+
+    if (! -d $json_dir) {
+        mkdir $json_dir
+            or die "ERROR: cannot create '$json_dir': $!\n";
+    }
+
+    # Explicit %{ } dereference: keys on a hash reference is a fatal error
+    # from Perl 5.24 on.
+    my @manufacturers = sort keys %{ $cfg->{products} || {} };
+    _write_json_file("$json_dir/manufacturers.json", \@manufacturers, $pretty);
+    _write_json_file("$json_dir/vendors.json", $cfg->{vendors}, $pretty);
+
+    print "Regenerating... " if $args{v};
+
+    # A data directory that cannot be read is treated as "no data yet", so
+    # that products.json is still written (as an empty object).
+    my @files;
+    if (opendir(my $dh, $data_dir)) {
+        @files = sort grep { !m/^\./ } readdir($dh);
+        closedir($dh);
+    }
+    else {
+        print STDERR "$data_dir: $!\n";
+    }
+
+    my %parts;
+    for my $file (@files) {
+        my $part_num = basename($file, '.txt');
+        print $part_num if ($args{v});
+
+        # Vendor name => { name, data => [[date, price], ...], color }.
+        my %series;
+        my $path = "$data_dir/$file";
+        open my $in, "<", $path
+            or die "ERROR: cannot open '$path': $!\n";
+        while (my $line = <$in>) {
+            chomp $line;
+
+            # First field is the timestamp, the rest are vendor=price.
+            my ($date, @fields) = split("\t", $line);
+            next unless defined $date;
+
+            for my $field (@fields) {
+                my ($vendor, $price) = split("=", $field);
+                # Skip malformed fields rather than recording a 0 price.
+                next unless defined $vendor && defined $price;
+
+                if (! exists $series{$vendor}) {
+                    my %entry = (data => [], name => $vendor);
+                    if ($cfg->{vendors}{$vendor}) {
+                        $entry{color} = "#$cfg->{vendors}{$vendor}{color}";
+                    }
+                    $series{$vendor} = \%entry;
+                }
+                push @{ $series{$vendor}{data} }, [int($date), int($price)];
+            }
+        }
+        close $in;
+
+        my %part = (
+            vendors  => [keys %series],
+            series   => [values %series],
+            part_num => $part_num,
+        );
+
+        # Direct hash lookup per manufacturer instead of scanning every
+        # part number.
+        for my $manuf (@manufacturers) {
+            $part{manuf} = $manuf
+                if exists $cfg->{products}{$manuf}{$part_num};
+        }
+
+        # Back up over the part number so the next one overwrites it.
+        if ($args{v}) {
+            print chr(0x08) x length($part_num);
+        }
+
+        $parts{$part_num} = \%part;
+    }
+
+    _write_json_file("$json_dir/products.json", \%parts, $pretty);
+
+    print "done. \n" if $args{v};
+}
diff --git a/pricegraph.1 b/pricegraph.1
@@ -0,0 +1,88 @@
+.Dd $Mdocdate$
+.Dt pricegraph 1
+.Os
+.Sh NAME
+.Nm pricegraph
+.Nd scrape price information and generate static web charts
+.\" .Sh LIBRARY
+.\" For sections 2, 3, & 9 only.
+.\" Not used in OpenBSD.
+.Sh SYNOPSIS
+.Nm pricegraph
+.Op Fl nv
+.Op Fl f Ar file
+.Op Fl a | Fl p Ar product | Fl r
+.Sh DESCRIPTION
+The
+.Nm
+utility scrapes price information from websites and stores it into a
+database. It then generates a complete static website that contains
+charts showing the current and historical prices of each tracked item.
+
+The arguments are as follows:
+.Bl -tag -width Ds
+.It Fl a
+Scrape all products in the configuration file at once. Not recommended.
+.El
+.Bl -tag -width Ds
+.It Fl f Ar file
+Use config from specified
+.Ar file
+instead of the default /etc/pricegraph.cfg, or pricegraph.cfg in the
+current directory when the former does not exist.
+.El
+.Bl -tag -width Ds
+.It Fl n
+Do not save scraped information into database.
+.El
+.Bl -tag -width Ds
+.It Fl p Ar product
+Scrape prices for the specified
+.Ar product
+instead of a random one chosen from the config file.
+.El
+.Bl -tag -width Ds
+.It Fl v
+Print verbose information.
+.El
+
+When
+.Nm
+is run with no arguments, a product is chosen at random from the
+configuration file and the price information is scraped, logged and
+saved.
+.Sh LOG FORMAT
+The default logging format will be of the form
+
+Dec 24 2013 [MV BF ] (10s) Samsung UN32EH3200 32"
+
+And the verbose logging format of the form
+
+Dec 24 2013 UN32EH3200
+[M] [CSP] (2s)
+.Sh CONFIGURATION FILE
+.Sh FILES
+.Pa /etc/pricegraph.cfg
+
+.\" .Sh IMPLEMENTATION NOTES
+.\" Not used in OpenBSD.
+.\" .Sh RETURN VALUES
+.\" For sections 2, 3, & 9 only.
+.\" .Sh ENVIRONMENT
+.\" For sections 1, 6, 7, & 8 only.
+.\" .Sh FILES
+.\" .Sh EXIT STATUS
+.\" For sections 1, 6, & 8 only.
+.\" .Sh EXAMPLES
+.\" .Sh DIAGNOSTICS
+.\" For sections 1, 4, 6, 7, & 8 only.
+.\" .Sh ERRORS
+.\" For sections 2, 3, & 9 only.
+.\" .Sh SEE ALSO
+.\" .Xr foobar 1
+.\" .Sh STANDARDS
+.\" .Sh HISTORY
+.\" .Sh AUTHORS
+.\" .Sh CAVEATS
+.\" .Sh BUGS
+.\" .Sh SECURITY CONSIDERATIONS
+.\" Not used in OpenBSD.
diff --git a/pricegraph.cfg b/pricegraph.cfg
@@ -0,0 +1,224 @@
+*** paths ***
+
+http = /var/www/htdocs/pricegraph
+data = data # must be a directory
+log = pricegraph.txt
+
+*** products ***
+
++ Samsung
+
+++ UN32F5500
+++ UN32EH4003
+++ UN32EH5300
+++ UN39EH5003
+++ HG40NA570L
+++ UN40EH5300
+++ UN40FH6030
+++ UN40F5500
+++ UN40F6300
+++ UN46EH5300
+++ UN46F5500
+++ UN46F6300
+++ UN46F6800
+++ UN46FH6030
+++ UN50EH5300
+++ UN50F5500
+++ UN50F6300
+++ UN50F6800
+++ UN55F6300
+++ UN55F6800
+++ UN55F7050
+++ UN55F7100
+++ UN55F8000
+++ UN55FH6030
+++ UN55FH6200
+++ UN55F7500
+++ UN55F9000
+# ++ KN55S9
+++ UN60ES6500
+++ UN60F6300
+++ UN60F6400
+++ UN60F7050
+++ UN60F7100
+++ UN60F8000
+++ UN60FH6200
+++ UN65EH6000
+++ UN65F6300
+++ UN65FH6001
+++ UN65F6400
+++ UN65F7100
+++ UN65F8000
+++ UN65F9000
+++ UN75F6300
+++ UN75F7100
+++ UN75F8000
+++ UN85S9
+
++ Toshiba
+
+++ 32L1300UC
+++ 39L1350UC
+++ 39L4300UC
+++ 50L1350UC
+++ 50L4300UC
+++ 50L5300
+++ 50L7300
+++ 58L1350
+++ 58L7350UC
+++ 58L9300
+++ 65L7350UC
+++ 65L9300
+++ 84L9300
+
++ Sharp
+
+++ LC60C8470U
+++ LC60LE450U
+++ LC60LE550U
+++ LC60LE650U
+++ LC60LE757U
+++ LC70C8470U
+++ LC70LE550U
+++ LC70LE650U
+++ LC70LE757U
+++ LC80LE642U
+++ LC80LE650U
+++ LC80LE757U
+++ LC80LE857U
+++ LC90LE657U
+
++ Sony
+
+++ KDL32R400A
+++ KDL40R450A
+++ KDL46R450A
+# ++ KDL47W802A
+# ++ KDL50R550A
+++ KDL55W802A
+++ KDL55W900A
+# ++ KDL60R550A
+++ KDL70R550A
+++ XBR55X900A
+++ XBR65X900A
+
++ Panasonic
+
+++ TCL32B6
+++ TCL42E60
+++ TCL47ET60
+++ TCL55ET60
+++ TCL55WT50
+
++ LG
+
+++ 32LN530B
+++ 32LN5700
+++ 42LA6205
+++ 42LN5300
+++ 42LN5400
+++ 42LN5700
+++ 47LA6205
+++ 47LN5400
+++ 47LN5750
+++ 50LA6205
+++ 50LN5310
+++ 50LN5750
+++ 55LA8600
+++ 55LN5310
+++ 55LN5400
+++ 55LN5750
+++ 60LA7400
+++ 60LA8600
+++ 65LA9700
+
+
+*** vendors ***
+
++ Memory Express
+#
+# On sale:
+# <div class="PIV_BotPrices">
+# <div class="PIV_PriceRegular">Reg: <span>$359.99</span></div>
+# <div class="PIV_PriceSale">
+# $279.99
+# </div>
+# </div>
+#
+# Regular price:
+# <div class="PIV_BotPrices">
+# <div class="PIV_Price">
+# <span>$359.99</span>
+# </div>
+# </div>
+#
+color = 56B849
+search_uri = http://www.memoryexpress.com/Search/Products?Search=
+title = .ProductTitle
+reg_price = .PIV_Price
+sale_price = .PIV_PriceSale
+
++ Future Shop
+color = BA0024
+search_uri = http://www.futureshop.ca/Search/SearchResults.aspx?query=
+title = .prod-title
+reg_price = .dollars
+
++ Visions Electronics
+#
+# <td class="price">
+# <span id="ctl00_..." class="regPrice">Price: <span>$509.99</span></span>
+# <span id="ctl00_..." class="salePrice">Sale Price: $336.00</span>
+# </td>
+#
+# price is a unique class when only a single product is returned and
+# can be used to make sure only a single product has been returned.
+# Products that are on sale return both regPrice and salePrice classes
+# while regularly priced products only return the regPrice class.
+#
+color = 000
+search_uri = http://www.visions.ca/catalogue/category/ProductResults.aspx?searchText=
+title = .plProductName
+reg_price = .regPrice
+sale_price = .salePrice
+
++ London Drugs
+color = 005DAB
+search_uri = http://www.londondrugs.com/on/demandware.store/Sites-LondonDrugs-Site/default/Search-Show?q=
+title = .productname
+reg_price = .pricing
+#reg_price = .standardprice
+#sale_price = .salesprice
+
++ Amazon
+color = FFA51D
+search_uri = http://www.amazon.ca/s/keywords=
+title = .newaps
+reg_price = .price
+
+# + Tiger Direct
+# color = 660
+# search_uri = http://www.tigerdirect.ca/applications/SearchTools/search.asp?keywords=
+# price_context =
+# reg_price = .salePrice
+# sale_price =
+
++ Best Buy
+color = 003B64
+search_uri = http://www.bestbuy.ca/Search/SearchResults.aspx?query=
+title = .product-title, .prod-title
+#sale_price = .price-onsale
+reg_price = .prodprice
+
+# + RadioShack
+# color = E76453
+# search_uri = http://www.radioshack.com/search/controller.jsp?kw=
+# title = .title
+# price_context = .product-price-tag
+# reg_price = .price
+
+# + Walmart
+# color = 0000FF
+# search_uri = http://www.walmart.ca/search/
+# title = .title
+# reg_price = .price-current