# Rev 16217 | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scraper for ShopClues mobile product pages.

Fetches a product page and extracts the listing id, price, stock status,
cash-on-delivery availability, coupon text and (optionally) the thumbnail URL.

All ``print`` calls take a single parenthesised argument so that they produce
the same output under the Python 2 print statement and the Python 3 print
function.
"""
from dtr.utils.utils import (fetchResponseUsingProxy, removePriceFormatting,
                             transformUrl)
import cssutils
from pyquery import PyQuery

# Request headers sent with every page fetch (mobile Chrome user agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}


class ShopCluesScraper:
    """Fetch a ShopClues product page and pull out its pricing details."""

    # Maximum number of fetch attempts made by a single read() call.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None, findThumbnail=None):
        """Store the scraper options.

        livePricing   -- stored on the instance; not used by this class itself.
        findThumbnail -- when truthy, scrape() also extracts the thumbnail URL.
        """
        self.count_trials = 0
        self.livePricing = livePricing
        self.findThumbnail = findThumbnail

    def read(self, url):
        """Fetch ``url`` and return the dict produced by scrape().

        The URL is passed through transformUrl exactly once, then fetched up
        to MAX_TRIALS times. ``count_trials`` is reset on every call so that
        failures while reading one URL do not use up the retries of the next.
        """
        # NOTE(review): 5 is presumably the source id expected by
        # transformUrl -- confirm against dtr.utils.utils.
        url = transformUrl(url, 5)
        self.count_trials = 0
        response_data = ""
        while True:
            print(url)
            try:
                response_data = fetchResponseUsingProxy(url, headers=headers, proxy=False)
                break
            except Exception as e:
                # Two spaces: matches the output of `print 'ERROR: ', e`.
                print('ERROR:  %s' % (e,))
                self.count_trials += 1
                if self.count_trials >= self.MAX_TRIALS:
                    # Out of attempts: fall through with the empty response,
                    # as the original code did.
                    # NOTE(review): scrape() will most likely fail on an empty
                    # page -- consider raising here instead.
                    break
                print('Retrying')
        self.response_data = response_data
        return self.scrape()

    def _thumbnailFromStyle(self, styleText):
        """Return the text between the parentheses of the CSS ``background``.

        Returns "" when there is no style text or the background value
        contains no parenthesised part.
        """
        if not styleText:
            return ""
        background = cssutils.parseStyle(styleText)['background']
        start = background.find('(')
        end = background.rfind(')')
        if start == -1 or end == -1:
            return ""
        return background[start + 1:end]

    def scrape(self):
        """Parse ``self.response_data`` and return the extracted fields.

        Returns a dict with the keys 'scin', 'price', 'inStock', 'isCod',
        'coupon' and 'thumbnail'.
        """
        pq = PyQuery(self.response_data)

        thumbnailUrl = ""
        if self.findThumbnail:
            thumbnailUrl = self._thumbnailFromStyle(pq('div.pd-image').attr['style'])

        headerDiv = pq('div.pd_name.clearfix')
        scin = str(headerDiv.attr['data-id'])

        infoDiv = pq('div.pd-price-cont.clearfix')
        # Prefer the "third" price and fall back to the selling price when it
        # is absent or unparsable. removePriceFormatting is opaque from here,
        # so any ordinary exception triggers the fallback, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            price = float(removePriceFormatting(infoDiv('span#thirdPrice').text()))
        except Exception:
            price = float(removePriceFormatting(infoDiv('span#sellingPrice').text()))

        outOfStock = infoDiv('div.stock').text().strip().upper() == 'OUT OF STOCK'
        inStock = 0 if outOfStock else 1

        # A li#notcod element marks the product as not eligible for COD.
        isCod = 0 if len(infoDiv('li#notcod')) > 0 else 1

        coupon = ""
        couponDiv = infoDiv('div.info.clearfix')
        if len(couponDiv) > 0:
            spans = couponDiv('span')
            # The coupon text is taken from the second span; skip it rather
            # than raise IndexError when the block has fewer spans.
            if len(spans) > 1:
                coupon = spans[1].text

        return {
            'scin': scin,
            'price': price,
            'inStock': inStock,
            'isCod': isCod,
            'coupon': coupon,
            "thumbnail": thumbnailUrl,
        }


if __name__ == '__main__':
    # Manual smoke test: scrape one product page and print timestamps
    # before and after.
    import datetime
    print(datetime.datetime.now())
    scraper = ShopCluesScraper(findThumbnail=True)
    print(scraper.read('http://m.shopclues.com/apple-iphone-6s-16gb-9.html'))
    print(datetime.datetime.now())