pricecharts

track prices of consumer electronics
Log | Files | Refs | README

commit 8728fc8c891b4907919c1b53bac1615a27ef1569
parent eaee0ebafa8943232ef3d6e1a029f5e1e431f8d8
Author: Kyle Milz <kyle@getaddrinfo.net>
Date:   Sun, 15 Mar 2015 21:39:00 -0600

PriceChart: take an IO Tee in get_log

Now we use get_log in product_scraper to write the email we send at the end to
a temp file, instead of building it up in a memory buffer. Pass the IO Tee to
get_dom so that errors in get_dom can be logged and emailed too.

Diffstat:
M PriceChart.pm | 15 ++++++++-------
M price_scraper | 6 ++++--
M product_scraper | 42 +++++++++++++++++++++++-------------------
3 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/PriceChart.pm b/PriceChart.pm @@ -73,6 +73,7 @@ sub get_dom my $url = shift; my $ua = shift; my $verbose = shift; + my $log = shift; my $resp = $ua->get($url); if ($resp->is_success) { @@ -81,8 +82,7 @@ sub get_dom return HTML::Grabber->new(html => $resp->decoded_content); } - print "error: get_dom: $url failed\n"; - print "error: " . $resp->status_line . "\n"; + print $log "error: get_dom: " . $resp->status_line . " $url\n"; return undef; } @@ -109,13 +109,14 @@ sub new_ua sub get_log { - my $cfg = shift || return undef; - my $file = shift || return undef; + my $log_path = shift || return undef; my $verbose = shift || 0; - my $path = $cfg->{"chroot"} . $cfg->{"logs"} . "/" . $file; - mkdir $log_dir; - open my $log, ">>", $path or die "can't open $path: $!"; + unless (-d substr($log_path, 0, rindex($log_path, '/'))) { + mkdir $log_path or die "couldn't mkdir $log_path: $!" ; + } + print "info: get_log: opening $log_path\n"; + open my $log, ">>", $log_path or die "can't open $log_path: $!"; if ($verbose) { print "info: get_log: outputting to tee\n"; diff --git a/price_scraper b/price_scraper @@ -21,10 +21,12 @@ getopts("m:np:v", \%args); $| = 1 if ($args{v}); my $cfg = get_config(); -my $log = get_log($cfg->{"http"}, "price_scrapes.txt", $args{v}); my $ua = new_ua($cfg->{"general"}, $args{v}); my $dbh = get_dbh($cfg->{"general"}, undef, $args{v}); +my $log_path = $cfg->{"http"}{"chroot"} . $cfg->{"http"}{"logs"} . "/price_scrapes.txt"; +my $log = get_log($log_path, $args{v}); + # allow products to go out of stock. if we haven't seen them for > 30 days # chances are retailers aren't carrying them anymore my $cutoff = time - (30 * 24 * 60 * 60); @@ -85,7 +87,7 @@ for my $vendor (sort keys %{$cfg->{"vendors"}}) { } # get a page of search results from a vendor - my $search_results = get_dom($url . $search, $ua, $args{v}); + my $search_results = get_dom($url . 
$search, $ua, $args{v}, $log); next unless defined $search_results; # search search_results for particular html tags that should be prices diff --git a/product_scraper b/product_scraper @@ -8,6 +8,7 @@ use Email::Simple; use Email::Send; use Getopt::Std; use HTML::Grabber; +use IO::Tee; use LWP::Simple; use PriceChart; use Term::ReadKey; @@ -22,7 +23,8 @@ $| = 1 if ($args{v}); my $cfg = get_config(); my $ua = new_ua($cfg->{"general"}, $args{v}); my $dbh = get_dbh($cfg->{"general"}, undef, $args{v}); -# my $log = get_log($cfg->{"http"}, "products.txt", $args{v}); +my $tmp_file = "/tmp/product_scraper.txt"; +my $log = get_log($tmp_file, $args{v}); srand; $dbh->do("create table if not exists products(" . @@ -47,9 +49,9 @@ my $update_sth = $dbh->prepare($sql); # # Memory Express # -my $mail = "Memory Express\n==============\n\n"; -$mail .= "type ok percent errors new duration\n"; -$mail .= "--------------- ------- ------- ------ --- --------\n"; +print $log "Memory Express\n==============\n\n"; +print $log "type ok percent errors new duration\n"; +print $log "--------------- ------- ------- ------ --- --------\n"; my %product_map = ( "televisions" => "Televisions", @@ -63,7 +65,8 @@ while (my ($type, $name) = each %product_map) { $update_sth = undef; $insert_sth = undef; $dbh->disconnect(); -send_email($mail, $args{v}); +$log->close(); +send_email($args{v}); # # scrape an entire class of products, inserting or updating the db as needed. @@ -116,7 +119,7 @@ sub mem_exp_scrape_class } my $ok = $new + $old; - $mail .= sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type, + print $log sprintf("%-15s %7s %6.1f%% %6i %3i %7is\n", $type, "$ok/$total", $ok * 100.0 / $total, $err, $new, time - $start); } @@ -133,7 +136,7 @@ sub mem_exp_get_thumbnails "$name?PageSize=40&Page="; # get first page of results - my $dom = get_dom($class_url . "1", $ua, $args{v}); + my $dom = get_dom($class_url . 
"1", $ua, $args{v}, $log); return undef if (!defined $dom); my $pager_hdr = "$info_hdr: .AJAX_List_Pager"; @@ -159,7 +162,7 @@ sub mem_exp_get_thumbnails sleep_rand($page_hdr, 5); # get a search pages dom - $dom = get_dom($class_url . "$_", $ua, $args{v}); + $dom = get_dom($class_url . "$_", $ua, $args{v}, $log); next if (!defined $dom); # each product thumbnail has class=PIV_Regular @@ -198,7 +201,7 @@ sub mem_exp_scrape_thumbnail # visit the extended description page my $product_url = "http://www.memoryexpress.com/Products/"; - my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}); + my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}, $log); # the part number is inside of id=ProductAdd always my $part_num = get_tag_text($product_dom, "#ProductAdd", $error_hdr); @@ -207,7 +210,7 @@ sub mem_exp_scrape_thumbnail # extract the part number, always is text inside of the tag ($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/); if (!defined $part_num) { - print "$error_hdr: part num regex failed\n"; + print $log "$error_hdr: part num regex failed\n"; return undef; } @@ -223,8 +226,8 @@ sub mem_exp_scrape_thumbnail ($brand) = ($brand =~ m/Brand: ([A-Za-z]+)/); } if (!defined $brand || $brand eq "") { - print "$error_hdr: .ProductBrand not found, html was:\n"; - print "$html\n"; + print $log "$error_hdr: .ProductBrand not found, html was:\n"; + print $log "$html\n"; return undef; } @@ -246,8 +249,8 @@ sub get_tag_text my $field = $dom->find($tag)->text(); if (!defined $field || $field eq "") { - print "$error_hdr: $tag not found or empty, html was:\n"; - print $dom->html() . "\n"; + print $log "$error_hdr: $tag not found or empty, html was:\n"; + print $log $dom->html() . 
"\n"; return undef; } @@ -259,14 +262,15 @@ sub get_tag_text # sub send_email { - my $mail = shift; my $verbose = shift || 0; - if ($verbose) { - print $mail; - return; - } + open my $fh, "<", $tmp_file or die "couldn't open $tmp_file: $!"; + my $mail; + $mail .= $_ for (<$fh>); + close $fh; + unlink($tmp_file) or warn "couldn't unlink $tmp_file: $!"; + return if ($verbose); my $email = Email::Simple->create( header => [ From => "Santa Claus <sc\@np.com>",