LondonDrugs.pm (3918B)
package PS::LondonDrugs;
use strict;
use warnings;

use HTML::Grabber;
use Log::Log4perl qw(:easy);
use URI::Escape;

use PS::Database;
use PS::UserAgent;

my $logger = get_logger('pricesloth.london_drugs');

# Construct a London Drugs scraper.
#
# The object is a blessed hash holding:
#   color - a colour string associated with this retailer (not used in
#           this module; presumably consumed elsewhere -- TODO confirm)
#   url   - search URL prefix; the escaped query is appended to it
#   ua    - a PS::UserAgent used for every HTTP request
#   db    - a PS::Database used to store prices and descriptions
#
# Returns the new object.
sub new {
    my ($class) = @_;

    my $self = {
        color => "#005DAB",
        url   => "http://www.londondrugs.com/on/demandware.store/Sites-LondonDrugs-Site/default/Search-Show?q=",
        ua    => PS::UserAgent->new(),
        db    => PS::Database->new()
    };

    bless ($self, $class);
    $logger->debug("new(): success");

    # XXX: make sure row in retailer table is created

    return $self;
}

# Build the search URL for a product.
#
#   $manufacturer - manufacturer name
#   $part_num     - manufacturer part number
#
# Returns the search URL prefix followed by the URI-escaped string
# "$manufacturer $part_num".
sub create_search {
    my ($self, $manufacturer, $part_num) = @_;

    # The London Drugs search appears to work well when both the
    # manufacturer and the part number are given.
    return $self->{url} . uri_escape("$manufacturer $part_num");
}

# Private helper: return the text of the first ".productname" element of
# the page held in $resp, or undef when the page has no such element.
sub _product_title {
    my ($resp) = @_;
    my $dom = HTML::Grabber->new( html => $resp->decoded_content );

    my ($title) = $dom->find(".productname")->text_array();
    return $title;
}

# Extract the part number from a product page.
#
# The product title is expected to look like "<description> - <part>"
# followed by a carriage return; the part number is whatever follows the
# last " - " up to that carriage return.
#
#   $resp - response object for the product page (must provide
#           decoded_content)
#
# Returns the part number, or undef when the title is missing or does
# not have the expected shape.
sub scrape_part_num {
    my ($self, $resp) = @_;

    my $title = _product_title($resp);

    # Only match when a title was found; matching against undef would
    # just produce an "uninitialized" warning.
    my ($part_num) = defined $title ? ($title =~ m/.* - (.*)\r/) : ();
    return $part_num;
}

# Extract the product description from a product page.
#
# Uses the same title as scrape_part_num(): the description is the text
# between the leading whitespace and the last " - " of the title line.
#
#   $resp - response object for the product page (must provide
#           decoded_content)
#
# Returns the description, or undef when the title is missing or does
# not have the expected shape.
sub scrape_description {
    my ($self, $resp) = @_;

    my $title = _product_title($resp);

    my ($descr) = defined $title ? ($title =~ m/^\s+(.*) - .*\r/) : ();
    return $descr;
}

# Extract the price of the main product from a product page.
#
#   $resp - response object for the product page (must provide
#           decoded_content)
#
# Returns a list: the first price found, followed by any further prices
# found in the same container. All prices have their thousands
# separators removed (e.g. "1,234.56" becomes "1234.56"). The first
# element is undef when no price could be found.
sub scrape_price {
    my ($self, $resp) = @_;
    my $dom = HTML::Grabber->new( html => $resp->decoded_content );

    # There are many .salesprice tags on the page but only one is inside of
    # .productpricing which is the main product on the page.
    my $price_container = $dom->find(".productpricing .salesprice")->text();

    # Match a dollars-dot-cents format with leeway for comma separated
    # digits. The decimal point is escaped so that it only matches a
    # literal ".", and a single leading digit is enough so that prices
    # below $10 are recognised. /g collects every price in the container
    # so that an unexpected second price can be reported.
    my ($price, @others);
    if (defined $price_container) {
        ($price, @others) = ($price_container =~ m/(\d[\d,]*\.\d\d)/g);
    }
    $logger->warn("scrape_price(): found more than 1 price") if (@others);

    # Remove every comma we may have matched earlier, not just the first.
    if (defined $price) {
        tr/,//d for ($price, @others);
    }

    return ($price, @others);
}

# Given the response to a search request, find the product page.
#
# A search may land directly on a product page, or on a page of search
# results. In the second case the first result's link is followed.
#
#   $resp - response object for the search request (must provide base,
#           decoded_content and is_success)
#
# Returns a list whose first element is the response object for the
# product page; when a results page was followed, the HTML of the
# remaining results comes after it. Returns nothing when there are no
# results, the first result has no link, the product page could not be
# fetched, or the landing URI is not recognised.
sub find_product_page {
    my ($self, $resp) = @_;
    my $ua = $self->{ua};

    # The search url contains characters ("/", ".", "?") that need to be
    # escaped before being used in a regular expression.
    my $search_url = quotemeta $self->{url};

    my $uri = $resp->base;
    if ($uri =~ m{http://www\.londondrugs\.com/.*\.html}) {
        # We landed on the product page directly, great.
        return ($resp);
    }
    elsif ($uri =~ /$search_url/) {
        # We landed on the search page.
        my $dom = HTML::Grabber->new( html => $resp->decoded_content );

        my ($first_result, @others) = $dom->find(".productlisting .product")->html_array();
        return unless ($first_result);

        my $num_total = scalar (@others) + 1;
        $logger->debug("find_product_page(): found $num_total thumbnails");

        # For every thumbnail there is a div with class="name" with a
        # link to the product page inside
        my $thumb_dom = HTML::Grabber->new( html => $first_result );
        my $product_url = $thumb_dom->find(".name a")->attr('href');

        # Without a link there is nothing to fetch.
        unless (defined $product_url && length $product_url) {
            $logger->error("find_product_page(): first search result has no product link");
            return;
        }

        $resp = $ua->get_dom($product_url);
        return unless ($resp && $resp->is_success);

        return ($resp, @others);
    }
    else {
        $logger->error("find_product_page(): unexpected search URI '$uri'");
        return;
    }
}

# Look up a product on London Drugs and record what was found.
#
# Searches for the product, locates its product page, scrapes the price
# and description, and stores them through the database object. The
# price row also records the number of seconds the scrape took.
#
#   $manufacturer - manufacturer name
#   $part_num     - manufacturer part number
#
# Returns the scraped price, or nothing when the search request failed
# or no product page could be found.
sub scrape {
    my ($self, $manufacturer, $part_num) = @_;
    my $ua = $self->{ua};
    my $db = $self->{db};
    my $start = time;

    my $search = $self->create_search($manufacturer, $part_num);
    my $resp = $ua->get_dom($search);
    return unless ($resp && $resp->is_success);

    # Searching can sometimes take you to different places
    ($resp) = $self->find_product_page($resp);
    return unless ($resp);

    # my $part_num = $self->scrape_part_num($resp);
    my ($price) = $self->scrape_price($resp);
    my $desc = $self->scrape_description($resp);

    $db->insert_price($manufacturer, $part_num, "London Drugs", $price, time - $start);
    $db->insert_descr($manufacturer, $part_num, "London Drugs", $desc) if ($desc);

    # The price may be undef when nothing was found on the page; log an
    # empty value in that case instead of interpolating undef.
    my $logged_price = defined $price ? $price : '';
    $logger->debug("scrape(): added price \$$logged_price\n");
    return $price;
}

1;