pricecharts

track prices of consumer electronics
Log | Files | Refs | README

ps_scrape.pl (14928B)


      1 #!/usr/bin/env perl
      2 use strict;
      3 use warnings;
      4 
      5 use BSD::arc4random qw(arc4random_uniform);
      6 use Config::Grammar;
      7 use Email::Simple;
      8 use Email::Send;
      9 use Getopt::Std;
     10 use HTML::Grabber;
     11 use IO::Tee;
     12 use List::Util qw(min);
     13 use LWP::Simple;
     14 use PriceSloth;
     15 use POSIX;
     16 use Term::ReadKey;
     17 use URI::Escape;
     18 
     19 
     20 my %args;
     21 getopts("nptv", \%args);
     22 
     23 $| = 1 if ($args{v});
     24 
     25 sleep_if_cron();
     26 my $cfg = get_config();
     27 my $ua  = new_ua($cfg->{general}, $args{v});
     28 my $dbh = get_dbh($cfg->{general}{db_dir}, $args{v});
     29 my $tmp_file = "/tmp/product_scraper.txt";
     30 my $tmp_log = get_log($tmp_file, $args{v});
     31 srand;
     32 
     33 if ($args{p}) {
     34 	mem_exp_scrape_products();
     35 }
     36 else {
     37 	scrape_prices();
     38 }
     39 
     40 sub scrape_prices
     41 {
     42 	my $log_path = $cfg->{general}{log_dir} . "/pricesloth";
     43 	my $log = get_log($log_path, $args{v});
     44 
     45 	# allow products to go out of stock. if we haven't seen them for > 30 days
     46 	# chances are retailers aren't carrying them anymore
     47 	my $cutoff = time - (30 * 24 * 60 * 60);
     48 	my $sql = "select part_num, manufacturer, type from products " .
     49 	"where last_seen > $cutoff order by last_scraped asc";
     50 	my ($part_num, $manufacturer, $type) = $dbh->selectrow_array($sql);
     51 
     52 	unless (defined $part_num && defined $manufacturer) {
     53 		print "error: no parts seen in the last 30 days\n";
     54 		print "       run a product scrape to freshen the part numbers\n";
     55 		exit 1;
     56 	}
     57 
     58 	# prevent races with other scrapers, claim ownership as soon as possible
     59 	$dbh->do("update products set last_scraped = ? where part_num = ? and manufacturer = ?",
     60 		undef, time, $part_num, $manufacturer);
     61 
     62 	print "info: scraping $manufacturer $part_num\n" if ($args{v});
     63 
     64 	$sql = qq{insert into prices(date, manufacturer, part_num, retailer,
     65 		price, duration) values (?, ?, ?, ?, ?, ?)};
     66 	my $prices_sth = $dbh->prepare($sql);
     67 
     68 	$sql = qq{update products set last_seen = ?, svg_stale = 1
     69 		where part_num = ? and manufacturer = ?};
     70 	my $products_sth = $dbh->prepare($sql);
     71 
     72 	$sql = "insert or replace into retailers(name, color, url) values (?, ?, ?)";
     73 	my $retailer_sth = $dbh->prepare($sql);
     74 
     75 	$sql = qq{insert or replace into descriptions(manufacturer, part_num,
     76 		retailer, description, date) values (?, ?, ?, ?, ?)};
     77 	my $descriptions_sth = $dbh->prepare($sql);
     78 
     79 	my $timestamp = strftime("%F %T> ", localtime);
     80 	my ($start, @status, $i) = (time, "", -1);
     81 	for my $retailer (sort keys %{$cfg->{retailers}}) {
     82 		my %props =	%{$cfg->{retailers}{$retailer}};
     83 		# this could probably be done smarter
     84 		my $url =	$props{url};
     85 		my $color =	$props{color};
     86 		my $price_tag =	$props{reg_tag};
     87 		my $sale_tag =	$props{sale_tag};
     88 		my $desc_tag =  $props{title};
     89 
     90 		my $retailer_start = time;
     91 		$status[++$i] = " ";
     92 
     93 		# for products with short part numbers, also search manufacturer
     94 		my $search;
     95 		if (length($part_num) < 6) {
     96 			$search = uri_escape("$manufacturer $part_num");
     97 		} else {
     98 			$search = uri_escape($part_num);
     99 		}
    100 
    101 		# get a page of search results from a retailer
    102 		my $search_results = get_dom($url . $search, $ua, $args{v}, $log);
    103 		next unless defined $search_results;
    104 
    105 		# search search_results for particular html tags that should be prices
    106 		my $price_r = get_valid_price($price_tag, $search_results, $retailer, $log);
    107 		my $price_s = get_valid_price($sale_tag,  $search_results, $retailer, $log);
    108 		next unless ($price_r || $price_s);
    109 
    110 		# choose the lowest that exists
    111 		my $price;
    112 		$price = $price_r if ($price_r);
    113 		$price = $price_s if ($price_s);
    114 		$price = min($price_r, $price_s) if ($price_r && $price_s);
    115 
    116 		# opportunistically scrape descriptions
    117 		my ($found_descr, $descr);
    118 		if ($desc_tag) {
    119 			# scrape description, use first one found on page
    120 			($descr) = $search_results->find($desc_tag)->text_array();
    121 			if (defined $descr && $descr ne "") {
    122 				$descr =~ s/^\s+//;
    123 				$descr =~ s/\s+$//;
    124 				$descr =~ s/$manufacturer//;
    125 				$descr =~ s/$part_num//;
    126 
    127 				my $descr_s = trunc_line($descr, length($retailer) + 8);
    128 				print "info: $retailer: $descr_s\n" if ($args{v});
    129 				$found_descr = 1;
    130 			}
    131 		}
    132 
    133 		# everything looks good
    134 		$status[$i] = substr($retailer, 0, 1);
    135 
    136 		next if ($args{n});
    137 		$dbh->begin_work;
    138 		$retailer_sth->execute($retailer, $color, $url);
    139 		$prices_sth->execute($start, $manufacturer, $part_num, $retailer, $price,
    140 			time - $retailer_start);
    141 		$products_sth->execute($start, $part_num, $manufacturer);
    142 		$descriptions_sth->execute($manufacturer, $part_num, $retailer,
    143 			$descr, time) if (defined $found_descr);
    144 		$dbh->commit;
    145 
    146 		print "info: $retailer: db: inserted \$$price\n" if ($args{v});
    147 	}
    148 
    149 	printf $log "%s %-12s %-10s %-20s [%s] (%i s)\n", $timestamp, $type,
    150 	$manufacturer, $part_num, join("", @status), time - $start;
    151 
    152 	$log->close();
    153 	$retailer_sth = undef;
    154 	$prices_sth = undef;
    155 	$products_sth = undef;
    156 	$descriptions_sth = undef;
    157 	$dbh->disconnect();
    158 
    159 	exit 0;
    160 }
    161 
    162 sub get_valid_price
    163 {
    164 	my ($dom_tag, $search_results, $retailer, $log) = @_;
    165 	return undef unless defined $dom_tag;
    166 
    167 	# break the search_results page down into individual results
    168 	my @search_prices = $search_results->find($dom_tag)->text_array();
    169 	my $num_prices = @search_prices;
    170 	return undef if ($num_prices == 0);
    171 
    172 	print "info: $retailer: $dom_tag: $num_prices elements\n" if ($args{v});
    173 	my $hdr = "$retailer: $dom_tag" . "[0]";
    174 
    175 	# do a fuzzy search for digit combinations that look like a price
    176 	# XXX: uses the first found price in the page
    177 	# XXX: this does not work on single digit prices, ie $7.00
    178 	my ($price, @others) = ($search_prices[0] =~ m/(\d[\d,]+)/);
    179 	if (!defined $price || @others) {
    180 		print $log "error: $hdr: wrong number of regexs\n";
    181 		return undef;
    182 	}
    183 
    184 	# sanity check the numerical price value
    185 	$price =~ s/,//;
    186 	if ($price <= 0 || $price > 10000) {
    187 		print $log "error: $hdr: price $price out of range\n";
    188 		return undef;
    189 	}
    190 
    191 	print "info: $hdr: \$$price\n" if ($args{v});
    192 	return $price;
    193 }
    194 
    195 
    196 # --- PRODUCT SCRAPE ---
    197 
    198 sub mem_exp_scrape_products
    199 {
    200 	my $sql = qq{insert into products(part_num, manufacturer, retailer, type,
    201 	first_seen, last_seen, last_scraped) values (?, ?, ?, ?, ?, ?, ?)};
    202 	my $insert_sth = $dbh->prepare($sql);
    203 
    204 	$sql = "insert or replace into descriptions(manufacturer, part_num, retailer, ".
    205 	"description, date) values (?, ?, ?, ?, ?)";
    206 	my $descriptions_sth = $dbh->prepare($sql);
    207 
    208 	# also update description, manufacturer?
    209 	$sql = "update products set last_seen = ? where part_num = ?";
    210 	my $update_sth = $dbh->prepare($sql);
    211 
    212 	#
    213 	# Memory Express
    214 	#
    215 	print $tmp_log "Memory Express\n==============\n\n";
    216 	print $tmp_log "type                 ok percent errors new duration\n";
    217 	print $tmp_log "--------------- ------- ------- ------ --- --------\n";
    218 
    219 	my %product_map = (
    220 		"Television" => "Televisions",
    221 		"Laptop" => "LaptopsNotebooks",
    222 		"Hard Drive" => "HardDrives",
    223 		"Memory" => "Memory",
    224 		"Video Card" => "VideoCards",
    225 		"Processor" => "Processors"
    226 	);
    227 	while (my ($type, $name) = each %product_map) {
    228 		mem_exp_scrape_class($type, $name, $insert_sth, $descriptions_sth,
    229 			$update_sth);
    230 	}
    231 
    232 	$update_sth = undef;
    233 	$insert_sth = undef;
    234 	$dbh->disconnect();
    235 	$tmp_log->close();
    236 	send_email($args{v});
    237 
    238 	exit 0;
    239 }
    240 
    241 #
    242 # scrape an entire class of products, inserting or updating the db as needed.
    243 # general flow is get all thumbnails on the unfiltered search results page, then
    244 # for each of these get the part number, brand, and description.
    245 #
    246 sub mem_exp_scrape_class
    247 {
    248 	my ($type, $name, $insert_sth, $descriptions_sth, $update_sth) = @_;
    249 
    250 	my $info_hdr = "info: " . lc($type);
    251 
    252 	my $thumbnails = mem_exp_get_thumbnails($name, $info_hdr);
    253 	return undef unless defined $thumbnails;
    254 
    255 	my $total = scalar @$thumbnails;
    256 	print "$info_hdr: $total total\n" if ($args{v});
    257 
    258 	# randomize the combined results so we don't linearly visit them
    259 	my @rand_thumbnails = sort { rand > .5 } @$thumbnails;
    260 
    261 	# extract and store part number, brand, and description
    262 	my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0);
    263 	for my $thumbnail_html (@rand_thumbnails) {
    264 		$i++;
    265 		my $thumb_hdr = "$info_hdr: $i/$total";
    266 
    267 		# look less suspicious
    268 		sleep_rand($thumb_hdr, 20);
    269 
    270 		# attempt to extract information from thumbnail html
    271 		my ($brand, $part_num, $desc) =
    272 			mem_exp_scrape_thumbnail("$type: $i/$total", $thumbnail_html);
    273 		unless (defined $brand && defined $part_num && defined $desc) {
    274 			$err++;
    275 			next;
    276 		}
    277 
    278 		# memory express has bundles, we're not really interested in
    279 		# those
    280 		next if ($part_num =~ /^BDL_/);
    281 
    282 		$dbh->begin_work;
    283 
    284 		# sanitize $brand against known good manufacturer names
    285 		my $sql = qq{select manufacturer from products where
    286 			lower(manufacturer) = ?};
    287 		my $manufs = $dbh->selectcol_arrayref($sql, undef, lc($brand));
    288 		if (@$manufs) {
    289 			# take a risk that the first one is spelled right
    290 			if ($manufs->[0] ne $brand) {
    291 				print "warn: forcing misspelled $brand to ";
    292 				print $manufs->[0] . "\n";
    293 				$brand = $manufs->[0];
    294 			}
    295 		}
    296 
    297 		# extraction looks good, insert or update the database
    298 		$sql = "select * from products where manufacturer = ? and
    299 			part_num = ?";
    300 		if ($dbh->selectrow_arrayref($sql, undef, $brand, $part_num)) {
    301 			# also check description and manufacturer are consistent?
    302 			$update_sth->execute(time, $part_num) or die $dbh->errstr();
    303 			$old++;
    304 		}
    305 		else {
    306 			$insert_sth->execute($part_num, $brand, "Memory Express", $type,
    307 				time, time, 0) or die $dbh->errstr();
    308 			print "$thumb_hdr: inserted into db\n" if ($args{v});
    309 			$new++;
    310 		}
    311 
    312 		# this has a foreign key constraint on the product table
    313 		$descriptions_sth->execute($brand, $part_num, "Memory Express",
    314 			$desc, time);
    315 
    316 		$dbh->commit;
    317 	}
    318 
    319 	my $ok = $new + $old;
    320 	my $time_str = sprintf("%dh %dm %ds", (gmtime(time - $start))[2, 1, 0]);
    321 	print $tmp_log sprintf("%-15s %7s %6.1f%% %6i %3i %s\n", lc($type),
    322 		"$ok/$total", $ok * 100.0 / $total, $err, $new, $time_str);
    323 }
    324 
    325 #
    326 # get all thumbnails from generic unfiltered search page
    327 #
    328 sub mem_exp_get_thumbnails
    329 {
    330 	my ($name, $info_hdr) = @_;
    331 
    332 	# this returns a search results page, link found through trial and error
    333 	my $class_url = "http://www.memoryexpress.com/Category/" .
    334 		"$name?PageSize=40&Page=";
    335 
    336 	# get first page of results
    337 	my $dom = get_dom($class_url . "1", $ua, $args{v}, $tmp_log);
    338 	return undef if (!defined $dom);
    339 
    340 	my $pager_hdr = "$info_hdr: .AJAX_List_Pager";
    341 
    342 	# extract the first of two pager widgets on the page
    343 	my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array();
    344 	return undef if (!defined $pager_html);
    345 	print "$pager_hdr found\n" if ($args{v});
    346 
    347 	# find how many pages of results we have, each page is one <li> element
    348 	my $pager = HTML::Grabber->new(html => $pager_html);
    349 	my $pages = $pager->find("li")->html_array();
    350 	return undef unless ($pages);
    351 
    352 	# if more than 1 <li> is found, one <li> is always a "next" arrow
    353 	$pages-- if ($pages > 1);
    354 	print "$pager_hdr: $pages pages\n" if ($args{v});
    355 
    356 	# loop over results pages and append product thumbnails
    357 	my @thumbnails;
    358 	for (1..$pages) {
    359 		my $page_hdr = "$pager_hdr: $_/$pages";
    360 		sleep_rand($page_hdr, 5);
    361 
    362 		# get a search pages dom
    363 		$dom = get_dom($class_url . "$_", $ua, $args{v}, $tmp_log);
    364 		next if (!defined $dom);
    365 
    366 		# each product thumbnail has class=PIV_Regular
    367 		my @temp_thumbs = $dom->find(".PIV_Regular")->html_array();
    368 		if ($args{t}) {
    369 			@temp_thumbs = ($temp_thumbs[0]);
    370 		}
    371 		my $num_thumbs = scalar @temp_thumbs;
    372 		print "$page_hdr: $num_thumbs thumbs found\n" if ($args{v});
    373 		push @thumbnails, @temp_thumbs;
    374 
    375 		last if ($args{t});
    376 	}
    377 
    378 	return \@thumbnails;
    379 }
    380 
    381 #
    382 # this checks the input html for 3 things, part num, manufacturer, and
    383 # description. if any of these aren't found, fail.
    384 #
    385 sub mem_exp_scrape_thumbnail
    386 {
    387 	my ($thumb_hdr, $html) = @_;
    388 
    389 	my $error_hdr = "error: $thumb_hdr";
    390 	my $info_hdr = "info: $thumb_hdr";
    391 
    392 	# make new html grabber instance with the thumbnail html
    393 	my $dom = HTML::Grabber->new(html => $html);
    394 
    395 	# has to be found otherwise we can't do anything
    396 	my $product_id = get_tag_text($dom, ".ProductId", $error_hdr);
    397 	return undef unless defined $product_id;
    398 
    399 	# visit the extended description page
    400 	my $product_url = "http://www.memoryexpress.com/Products/";
    401 	my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}, $tmp_log);
    402 
    403 	# the part number is inside of id=ProductAdd always
    404 	my $part_num = get_tag_text($product_dom, "#ProductAdd", $error_hdr);
    405 	return undef unless defined $part_num;
    406 
    407 	# extract the part number, always is text inside of the tag
    408 	($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/);
    409 	if (!defined $part_num) {
    410 		print $tmp_log "$error_hdr: part num regex failed\n";
    411 		return undef;
    412 	}
    413 
    414 	# extract the product description
    415 	my $desc = get_tag_text($dom, ".ProductTitle", $error_hdr);
    416 	return undef unless defined $desc;
    417 
    418 	# extract the brand, sometimes shows up as text
    419 	my $brand = $dom->find(".ProductBrand")->text();
    420 	if ($brand eq "") {
    421 		# and sometimes shows up inside the tag attributes
    422 		$brand = $dom->find(".ProductBrand")->html();
    423 		# XXX: revisit this regex, it should be less strict
    424 		($brand) = ($brand =~ m/Brand: ([0-9A-Za-z\. ]+)/);
    425 	}
    426 	if (!defined $brand || $brand eq "") {
    427 		print $tmp_log "$error_hdr: .ProductBrand not found, html was:\n";
    428 		print $tmp_log "$html\n";
    429 		return undef;
    430 	}
    431 
    432 	my $shortened_desc = trunc_line($desc, length($info_hdr) + 2);
    433 	print "$info_hdr: $brand $part_num\n" if ($args{v});
    434 	print "$info_hdr: $shortened_desc\n" if ($args{v});
    435 
    436 	return ($brand, $part_num, $desc);
    437 }
    438 
    439 #
    440 # unwrap the plain text inside of an html tag
    441 #
    442 sub get_tag_text
    443 {
    444 	my ($dom, $tag, $error_hdr) = @_;
    445 
    446 	my $field = $dom->find($tag)->text();
    447 	if (!defined $field || $field eq "") {
    448 		print $tmp_log "$error_hdr: $tag not found or empty, html was:\n";
    449 		print $tmp_log $dom->html() . "\n";
    450 		return undef;
    451 	}
    452 
    453 	return $field;
    454 }
    455 
    456 #
    457 # send an email with the summary of the scrape
    458 #
    459 sub send_email
    460 {
    461 	my $verbose = shift || 0;
    462 
    463 	open my $fh, "<", $tmp_file or die "couldn't open $tmp_file: $!";
    464 	my $mail;
    465 	$mail .= $_ for (<$fh>);
    466 	close $fh;
    467 	unlink($tmp_file) or warn "couldn't unlink $tmp_file: $!";
    468 
    469 	return if ($verbose);
    470 	my $email = Email::Simple->create(
    471 		header => [
    472 			From	=> "Price Sloth <www\@pricesloth.com>",
    473 			To	=> $cfg->{general}{email},
    474 			Subject	=> "weekly product scrape",
    475 		],
    476 		body => $mail
    477 	);
    478 
    479 	my $sender = Email::Send->new({mailer => "SMTP"});
    480 	$sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]);
    481 	$sender->send($email->as_string()) || print "Couldn't send email\n";
    482 }
    483 
    484 sub sleep_rand
    485 {
    486 	my $header = shift;
    487 	my $upper_limit = shift || 0;
    488 
    489 	my $sleep = int(rand($upper_limit));
    490 	printf "$header: (%ss wait)\n", $sleep if ($args{v});
    491 	sleep $sleep unless ($args{t});
    492 }
    493 
    494 sub can_open_tty
    495 {
    496 	no autodie;
    497 	return open(my $tty, '+<', '/dev/tty');
    498 }
    499 
    500 sub sleep_if_cron
    501 {
    502 	if (can_open_tty()) {
    503 		return;
    504 	}
    505 
    506 	# 577s is appx 9.5 min
    507 	my $sleep = arc4random_uniform(577);
    508 	print "info: script run from cron, sleeping $sleep s\n" if ($args{v});
    509 
    510 	# modify ps output to show what state the program is in
    511 	my $old_argv0 = $0;
    512 	$0 .= " (sleeping)";
    513 	sleep $sleep;
    514 	$0 = $old_argv0;
    515 }