# Rev 15906 | Rev 16100 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scraper for a ShopClues product page.

Fetches a product URL, strips inline formatting tags from the markup and
extracts the product id, price, stock status, cash-on-delivery flag and
coupon text into a plain dict.
"""
import urllib2
from BeautifulSoup import BeautifulSoup, NavigableString
from dtr.utils.utils import (
    fetchResponseUsingProxy,
    removePriceFormatting,
    transformUrl,
)
import re

# Inline formatting tags that strip_tags() unwraps (their content is kept).
invalid_tags = ['b', 'i', 'u']
# Not referenced anywhere in this module; kept because other modules may
# import it.
bestSellers = []
# Request headers passed to fetchResponseUsingProxy() on every fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}


def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag whose name is in *invalid_tags*.

    Each matching tag is replaced by the concatenated text of its children;
    non-string children are processed recursively first, so nested
    formatting tags are unwrapped as well.

    :param html: markup to parse.
    :param invalid_tags: collection of tag names to unwrap.
    :returns: the resulting ``BeautifulSoup`` object.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    for tag in soup.findAll(True):
        if tag.name not in invalid_tags:
            continue
        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                # Child is itself a tag: clean it before flattening to text.
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))
        tag.replaceWith("".join(pieces))
    return soup


class ShopCluesScraper:
    """Fetches one ShopClues product page and extracts its pricing data."""

    def __init__(self, livePricing=None):
        """Store *livePricing* and start the failure counter at zero.

        :param livePricing: stored on the instance; not used by the code in
            this class.
        """
        # Number of failed fetches seen by this instance. read() never resets
        # it, so the retry budget is shared by all read() calls.
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Fetch *url* and return the scraped product data.

        The URL is passed through ``transformUrl(url, 5)`` once and then
        fetched. A failed fetch is retried until ``self.count_trials``
        reaches 5; after that the method carries on with an empty response,
        which makes the parsing step fail on the missing elements.

        :param url: product page URL.
        :returns: the dict produced by :meth:`scrape`.
        """
        # Transform once, outside the retry loop, so a retry does not feed an
        # already-transformed URL back through transformUrl().
        url = transformUrl(url, 5)
        print(url)
        response_data = ""
        while True:
            try:
                response_data = fetchResponseUsingProxy(url, headers=headers, proxy=False)
                break
            except Exception as e:
                # Same text the statement form `print 'ERROR: ', e` produced.
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                if self.count_trials >= 5:
                    # Retry budget exhausted: continue with the empty body.
                    break
        self.response_data = response_data
        return self.createData()

    def createData(self):
        """Parse ``self.response_data`` and return the scraped product data.

        The parsed tree is kept on ``self.soup``; the raw response is
        dropped once it has been parsed.

        :returns: the dict produced by :meth:`scrape`.
        """
        self.soup = strip_tags(self.response_data, invalid_tags)
        self.response_data = None
        return self.scrape(self.soup)

    def scrape(self, soup):
        """Extract product fields from a parsed product page.

        :param soup: parsed page, as returned by :func:`strip_tags`.
        :returns: dict with keys ``scin`` (the ``data-id`` of the name
            container), ``price`` (float), ``inStock`` (0 or 1), ``isCod``
            (0 or 1) and ``coupon`` (text, ``""`` when none was found).
        :raises Exception: when the name container, the price container, the
            price or the stock element is missing; the error comes from
            using the ``None`` that ``find`` returns.
        """
        name_div = soup.find('div', {'class': 'pd_name clearfix'})
        scin = name_div['data-id']
        price_div = soup.find('div', {'class': 'pd-price-cont clearfix'})

        # Prefer the "thirdPrice" span; fall back to "sellingPrice" when it
        # is absent or cannot be parsed.
        try:
            price = float(removePriceFormatting(
                price_div.find('span', {'id': 'thirdPrice'}).find('span').string))
        except Exception:
            price = float(removePriceFormatting(
                price_div.find('span', {'id': 'sellingPrice'}).find('span').string))

        stock_text = price_div.find('div', {'class': re.compile('stock.*')}).string
        inStock = 0 if stock_text.strip().upper() == 'OUT OF STOCK' else 1

        isCod = 0 if price_div.find('li', {'id': 'iscod'}) is None else 1

        # Coupon extraction is best-effort: any failure leaves it empty.
        coupon = ""
        try:
            info_div = price_div.find('div', {'class': 'info clearfix'})
            if info_div:
                coupon = info_div.findAll('span')[1].text
        except Exception:
            print("Unable to parse coupon code")

        return {'scin': scin, 'price': price, 'inStock': inStock, 'isCod': isCod, 'coupon': coupon}


if __name__ == '__main__':
    import datetime
    # Manual smoke run: print a timestamp before and after one scrape.
    print(datetime.datetime.now())
    scraper = ShopCluesScraper()
    print(scraper.read('http://shopclues.com/samsung-galaxy-note-4-13.html'))
    print(datetime.datetime.now())