ps_scrape.pl (14928B)
1 #!/usr/bin/env perl 2 use strict; 3 use warnings; 4 5 use BSD::arc4random qw(arc4random_uniform); 6 use Config::Grammar; 7 use Email::Simple; 8 use Email::Send; 9 use Getopt::Std; 10 use HTML::Grabber; 11 use IO::Tee; 12 use List::Util qw(min); 13 use LWP::Simple; 14 use PriceSloth; 15 use POSIX; 16 use Term::ReadKey; 17 use URI::Escape; 18 19 20 my %args; 21 getopts("nptv", \%args); 22 23 $| = 1 if ($args{v}); 24 25 sleep_if_cron(); 26 my $cfg = get_config(); 27 my $ua = new_ua($cfg->{general}, $args{v}); 28 my $dbh = get_dbh($cfg->{general}{db_dir}, $args{v}); 29 my $tmp_file = "/tmp/product_scraper.txt"; 30 my $tmp_log = get_log($tmp_file, $args{v}); 31 srand; 32 33 if ($args{p}) { 34 mem_exp_scrape_products(); 35 } 36 else { 37 scrape_prices(); 38 } 39 40 sub scrape_prices 41 { 42 my $log_path = $cfg->{general}{log_dir} . "/pricesloth"; 43 my $log = get_log($log_path, $args{v}); 44 45 # allow products to go out of stock. if we haven't seen them for > 30 days 46 # chances are retailers aren't carrying them anymore 47 my $cutoff = time - (30 * 24 * 60 * 60); 48 my $sql = "select part_num, manufacturer, type from products " . 49 "where last_seen > $cutoff order by last_scraped asc"; 50 my ($part_num, $manufacturer, $type) = $dbh->selectrow_array($sql); 51 52 unless (defined $part_num && defined $manufacturer) { 53 print "error: no parts seen in the last 30 days\n"; 54 print " run a product scrape to freshen the part numbers\n"; 55 exit 1; 56 } 57 58 # prevent races with other scrapers, claim ownership as soon as possible 59 $dbh->do("update products set last_scraped = ? where part_num = ? and manufacturer = ?", 60 undef, time, $part_num, $manufacturer); 61 62 print "info: scraping $manufacturer $part_num\n" if ($args{v}); 63 64 $sql = qq{insert into prices(date, manufacturer, part_num, retailer, 65 price, duration) values (?, ?, ?, ?, ?, ?)}; 66 my $prices_sth = $dbh->prepare($sql); 67 68 $sql = qq{update products set last_seen = ?, svg_stale = 1 69 where part_num = ? and manufacturer = ?}; 70 my $products_sth = $dbh->prepare($sql); 71 72 $sql = "insert or replace into retailers(name, color, url) values (?, ?, ?)"; 73 my $retailer_sth = $dbh->prepare($sql); 74 75 $sql = qq{insert or replace into descriptions(manufacturer, part_num, 76 retailer, description, date) values (?, ?, ?, ?, ?)}; 77 my $descriptions_sth = $dbh->prepare($sql); 78 79 my $timestamp = strftime("%F %T> ", localtime); 80 my ($start, @status, $i) = (time, "", -1); 81 for my $retailer (sort keys %{$cfg->{retailers}}) { 82 my %props = %{$cfg->{retailers}{$retailer}}; 83 # this could probably be done smarter 84 my $url = $props{url}; 85 my $color = $props{color}; 86 my $price_tag = $props{reg_tag}; 87 my $sale_tag = $props{sale_tag}; 88 my $desc_tag = $props{title}; 89 90 my $retailer_start = time; 91 $status[++$i] = " "; 92 93 # for products with short part numbers, also search manufacturer 94 my $search; 95 if (length($part_num) < 6) { 96 $search = uri_escape("$manufacturer $part_num"); 97 } else { 98 $search = uri_escape($part_num); 99 } 100 101 # get a page of search results from a retailer 102 my $search_results = get_dom($url . $search, $ua, $args{v}, $log); 103 next unless defined $search_results; 104 105 # search search_results for particular html tags that should be prices 106 my $price_r = get_valid_price($price_tag, $search_results, $retailer, $log); 107 my $price_s = get_valid_price($sale_tag, $search_results, $retailer, $log); 108 next unless ($price_r || $price_s); 109 110 # choose the lowest that exists 111 my $price; 112 $price = $price_r if ($price_r); 113 $price = $price_s if ($price_s); 114 $price = min($price_r, $price_s) if ($price_r && $price_s); 115 116 # opportunistically scrape descriptions 117 my ($found_descr, $descr); 118 if ($desc_tag) { 119 # scrape description, use first one found on page 120 ($descr) = $search_results->find($desc_tag)->text_array(); 121 if (defined $descr && $descr ne "") { 122 $descr =~ s/^\s+//; 123 $descr =~ s/\s+$//; 124 $descr =~ s/$manufacturer//; 125 $descr =~ s/$part_num//; 126 127 my $descr_s = trunc_line($descr, length($retailer) + 8); 128 print "info: $retailer: $descr_s\n" if ($args{v}); 129 $found_descr = 1; 130 } 131 } 132 133 # everything looks good 134 $status[$i] = substr($retailer, 0, 1); 135 136 next if ($args{n}); 137 $dbh->begin_work; 138 $retailer_sth->execute($retailer, $color, $url); 139 $prices_sth->execute($start, $manufacturer, $part_num, $retailer, $price, 140 time - $retailer_start); 141 $products_sth->execute($start, $part_num, $manufacturer); 142 $descriptions_sth->execute($manufacturer, $part_num, $retailer, 143 $descr, time) if (defined $found_descr); 144 $dbh->commit; 145 146 print "info: $retailer: db: inserted \$$price\n" if ($args{v}); 147 } 148 149 printf $log "%s %-12s %-10s %-20s [%s] (%i s)\n", $timestamp, $type, 150 $manufacturer, $part_num, join("", @status), time - $start; 151 152 $log->close(); 153 $retailer_sth = undef; 154 $prices_sth = undef; 155 $products_sth = undef; 156 $descriptions_sth = undef; 157 $dbh->disconnect(); 158 159 exit 0; 160 } 161 162 sub get_valid_price 163 { 164 my ($dom_tag, $search_results, $retailer, $log) = @_; 165 return undef unless defined $dom_tag; 166 167 # break the search_results page down into individual results 168 my @search_prices = $search_results->find($dom_tag)->text_array(); 169 my $num_prices = @search_prices; 170 return undef if ($num_prices == 0); 171 172 print "info: $retailer: $dom_tag: $num_prices elements\n" if ($args{v}); 173 my $hdr = "$retailer: $dom_tag" . "[0]"; 174 175 # do a fuzzy search for digit combinations that look like a price 176 # XXX: uses the first found price in the page 177 # XXX: this does not work on single digit prices, ie $7.00 178 my ($price, @others) = ($search_prices[0] =~ m/(\d[\d,]+)/); 179 if (!defined $price || @others) { 180 print $log "error: $hdr: wrong number of regexs\n"; 181 return undef; 182 } 183 184 # sanity check the numerical price value 185 $price =~ s/,//; 186 if ($price <= 0 || $price > 10000) { 187 print $log "error: $hdr: price $price out of range\n"; 188 return undef; 189 } 190 191 print "info: $hdr: \$$price\n" if ($args{v}); 192 return $price; 193 } 194 195 196 # --- PRODUCT SCRAPE --- 197 198 sub mem_exp_scrape_products 199 { 200 my $sql = qq{insert into products(part_num, manufacturer, retailer, type, 201 first_seen, last_seen, last_scraped) values (?, ?, ?, ?, ?, ?, ?)}; 202 my $insert_sth = $dbh->prepare($sql); 203 204 $sql = "insert or replace into descriptions(manufacturer, part_num, retailer, ". 205 "description, date) values (?, ?, ?, ?, ?)"; 206 my $descriptions_sth = $dbh->prepare($sql); 207 208 # also update description, manufacturer? 209 $sql = "update products set last_seen = ? where part_num = ?"; 210 my $update_sth = $dbh->prepare($sql); 211 212 # 213 # Memory Express 214 # 215 print $tmp_log "Memory Express\n==============\n\n"; 216 print $tmp_log "type ok percent errors new duration\n"; 217 print $tmp_log "--------------- ------- ------- ------ --- --------\n"; 218 219 my %product_map = ( 220 "Television" => "Televisions", 221 "Laptop" => "LaptopsNotebooks", 222 "Hard Drive" => "HardDrives", 223 "Memory" => "Memory", 224 "Video Card" => "VideoCards", 225 "Processor" => "Processors" 226 ); 227 while (my ($type, $name) = each %product_map) { 228 mem_exp_scrape_class($type, $name, $insert_sth, $descriptions_sth, 229 $update_sth); 230 } 231 232 $update_sth = undef; 233 $insert_sth = undef; 234 $dbh->disconnect(); 235 $tmp_log->close(); 236 send_email($args{v}); 237 238 exit 0; 239 } 240 241 # 242 # scrape an entire class of products, inserting or updating the db as needed. 243 # general flow is get all thumbnails on the unfiltered search results page, then 244 # for each of these get the part number, brand, and description. 245 # 246 sub mem_exp_scrape_class 247 { 248 my ($type, $name, $insert_sth, $descriptions_sth, $update_sth) = @_; 249 250 my $info_hdr = "info: " . lc($type); 251 252 my $thumbnails = mem_exp_get_thumbnails($name, $info_hdr); 253 return undef unless defined $thumbnails; 254 255 my $total = scalar @$thumbnails; 256 print "$info_hdr: $total total\n" if ($args{v}); 257 258 # randomize the combined results so we don't linearly visit them 259 my @rand_thumbnails = sort { rand > .5 } @$thumbnails; 260 261 # extract and store part number, brand, and description 262 my ($new, $old, $err, $start, $i) = (0, 0, 0, time, 0); 263 for my $thumbnail_html (@rand_thumbnails) { 264 $i++; 265 my $thumb_hdr = "$info_hdr: $i/$total"; 266 267 # look less suspicious 268 sleep_rand($thumb_hdr, 20); 269 270 # attempt to extract information from thumbnail html 271 my ($brand, $part_num, $desc) = 272 mem_exp_scrape_thumbnail("$type: $i/$total", $thumbnail_html); 273 unless (defined $brand && defined $part_num && defined $desc) { 274 $err++; 275 next; 276 } 277 278 # memory express has bundles, we're not really interested in 279 # those 280 next if ($part_num =~ /^BDL_/); 281 282 $dbh->begin_work; 283 284 # sanitize $brand against known good manufacturer names 285 my $sql = qq{select manufacturer from products where 286 lower(manufacturer) = ?}; 287 my $manufs = $dbh->selectcol_arrayref($sql, undef, lc($brand)); 288 if (@$manufs) { 289 # take a risk that the first one is spelled right 290 if ($manufs->[0] ne $brand) { 291 print "warn: forcing misspelled $brand to "; 292 print $manufs->[0] . "\n"; 293 $brand = $manufs->[0]; 294 } 295 } 296 297 # extraction looks good, insert or update the database 298 $sql = "select * from products where manufacturer = ? and 299 part_num = ?"; 300 if ($dbh->selectrow_arrayref($sql, undef, $brand, $part_num)) { 301 # also check description and manufacturer are consistent? 302 $update_sth->execute(time, $part_num) or die $dbh->errstr(); 303 $old++; 304 } 305 else { 306 $insert_sth->execute($part_num, $brand, "Memory Express", $type, 307 time, time, 0) or die $dbh->errstr(); 308 print "$thumb_hdr: inserted into db\n" if ($args{v}); 309 $new++; 310 } 311 312 # this has a foreign key constraint on the product table 313 $descriptions_sth->execute($brand, $part_num, "Memory Express", 314 $desc, time); 315 316 $dbh->commit; 317 } 318 319 my $ok = $new + $old; 320 my $time_str = sprintf("%dh %dm %ds", (gmtime(time - $start))[2, 1, 0]); 321 print $tmp_log sprintf("%-15s %7s %6.1f%% %6i %3i %s\n", lc($type), 322 "$ok/$total", $ok * 100.0 / $total, $err, $new, $time_str); 323 } 324 325 # 326 # get all thumbnails from generic unfiltered search page 327 # 328 sub mem_exp_get_thumbnails 329 { 330 my ($name, $info_hdr) = @_; 331 332 # this returns a search results page, link found through trial and error 333 my $class_url = "http://www.memoryexpress.com/Category/" . 334 "$name?PageSize=40&Page="; 335 336 # get first page of results 337 my $dom = get_dom($class_url . "1", $ua, $args{v}, $tmp_log); 338 return undef if (!defined $dom); 339 340 my $pager_hdr = "$info_hdr: .AJAX_List_Pager"; 341 342 # extract the first of two pager widgets on the page 343 my ($pager_html) = $dom->find(".AJAX_List_Pager")->html_array(); 344 return undef if (!defined $pager_html); 345 print "$pager_hdr found\n" if ($args{v}); 346 347 # find how many pages of results we have, each page is one <li> element 348 my $pager = HTML::Grabber->new(html => $pager_html); 349 my $pages = $pager->find("li")->html_array(); 350 return undef unless ($pages); 351 352 # if more than 1 <li> is found, one <li> is always a "next" arrow 353 $pages-- if ($pages > 1); 354 print "$pager_hdr: $pages pages\n" if ($args{v}); 355 356 # loop over results pages and append product thumbnails 357 my @thumbnails; 358 for (1..$pages) { 359 my $page_hdr = "$pager_hdr: $_/$pages"; 360 sleep_rand($page_hdr, 5); 361 362 # get a search pages dom 363 $dom = get_dom($class_url . "$_", $ua, $args{v}, $tmp_log); 364 next if (!defined $dom); 365 366 # each product thumbnail has class=PIV_Regular 367 my @temp_thumbs = $dom->find(".PIV_Regular")->html_array(); 368 if ($args{t}) { 369 @temp_thumbs = ($temp_thumbs[0]); 370 } 371 my $num_thumbs = scalar @temp_thumbs; 372 print "$page_hdr: $num_thumbs thumbs found\n" if ($args{v}); 373 push @thumbnails, @temp_thumbs; 374 375 last if ($args{t}); 376 } 377 378 return \@thumbnails; 379 } 380 381 # 382 # this checks the input html for 3 things, part num, manufacturer, and 383 # description. if any of these aren't found, fail. 384 # 385 sub mem_exp_scrape_thumbnail 386 { 387 my ($thumb_hdr, $html) = @_; 388 389 my $error_hdr = "error: $thumb_hdr"; 390 my $info_hdr = "info: $thumb_hdr"; 391 392 # make new html grabber instance with the thumbnail html 393 my $dom = HTML::Grabber->new(html => $html); 394 395 # has to be found otherwise we can't do anything 396 my $product_id = get_tag_text($dom, ".ProductId", $error_hdr); 397 return undef unless defined $product_id; 398 399 # visit the extended description page 400 my $product_url = "http://www.memoryexpress.com/Products/"; 401 my $product_dom = get_dom("$product_url$product_id", $ua, $args{v}, $tmp_log); 402 403 # the part number is inside of id=ProductAdd always 404 my $part_num = get_tag_text($product_dom, "#ProductAdd", $error_hdr); 405 return undef unless defined $part_num; 406 407 # extract the part number, always is text inside of the tag 408 ($part_num) = ($part_num =~ m/Part #:\s*(.*)\r/); 409 if (!defined $part_num) { 410 print $tmp_log "$error_hdr: part num regex failed\n"; 411 return undef; 412 } 413 414 # extract the product description 415 my $desc = get_tag_text($dom, ".ProductTitle", $error_hdr); 416 return undef unless defined $desc; 417 418 # extract the brand, sometimes shows up as text 419 my $brand = $dom->find(".ProductBrand")->text(); 420 if ($brand eq "") { 421 # and sometimes shows up inside the tag attributes 422 $brand = $dom->find(".ProductBrand")->html(); 423 # XXX: revisit this regex, it should be less strict 424 ($brand) = ($brand =~ m/Brand: ([0-9A-Za-z\. ]+)/); 425 } 426 if (!defined $brand || $brand eq "") { 427 print $tmp_log "$error_hdr: .ProductBrand not found, html was:\n"; 428 print $tmp_log "$html\n"; 429 return undef; 430 } 431 432 my $shortened_desc = trunc_line($desc, length($info_hdr) + 2); 433 print "$info_hdr: $brand $part_num\n" if ($args{v}); 434 print "$info_hdr: $shortened_desc\n" if ($args{v}); 435 436 return ($brand, $part_num, $desc); 437 } 438 439 # 440 # unwrap the plain text inside of an html tag 441 # 442 sub get_tag_text 443 { 444 my ($dom, $tag, $error_hdr) = @_; 445 446 my $field = $dom->find($tag)->text(); 447 if (!defined $field || $field eq "") { 448 print $tmp_log "$error_hdr: $tag not found or empty, html was:\n"; 449 print $tmp_log $dom->html() . "\n"; 450 return undef; 451 } 452 453 return $field; 454 } 455 456 # 457 # send an email with the summary of the scrape 458 # 459 sub send_email 460 { 461 my $verbose = shift || 0; 462 463 open my $fh, "<", $tmp_file or die "couldn't open $tmp_file: $!"; 464 my $mail; 465 $mail .= $_ for (<$fh>); 466 close $fh; 467 unlink($tmp_file) or warn "couldn't unlink $tmp_file: $!"; 468 469 return if ($verbose); 470 my $email = Email::Simple->create( 471 header => [ 472 From => "Price Sloth <www\@pricesloth.com>", 473 To => $cfg->{general}{email}, 474 Subject => "weekly product scrape", 475 ], 476 body => $mail 477 ); 478 479 my $sender = Email::Send->new({mailer => "SMTP"}); 480 $sender->mailer_args([Host => $cfg->{"general"}{"smtp"}]); 481 $sender->send($email->as_string()) || print "Couldn't send email\n"; 482 } 483 484 sub sleep_rand 485 { 486 my $header = shift; 487 my $upper_limit = shift || 0; 488 489 my $sleep = int(rand($upper_limit)); 490 printf "$header: (%ss wait)\n", $sleep if ($args{v}); 491 sleep $sleep unless ($args{t}); 492 } 493 494 sub can_open_tty 495 { 496 no autodie; 497 return open(my $tty, '+<', '/dev/tty'); 498 } 499 500 sub sleep_if_cron 501 { 502 if (can_open_tty()) { 503 return; 504 } 505 506 # 577s is appx 9.5 min 507 my $sleep = arc4random_uniform(577); 508 print "info: script run from cron, sleeping $sleep s\n" if ($args{v}); 509 510 # modify ps output to show what state the program is in 511 my $old_argv0 = $0; 512 $0 .= " (sleeping)"; 513 sleep $sleep; 514 $0 = $old_argv0; 515 }