# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 15154 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

import urllib2
from BeautifulSoup import BeautifulSoup, NavigableString
import re
import sys

# Names of the inline formatting tags that strip_tags() replaces with their
# text content; AmazonScraper.createData() passes this list to strip_tags().
invalid_tags = ['b', 'i', 'u']
# NOTE(review): not referenced anywhere in the visible part of this file;
# possibly a leftover from a related scraper -- confirm before removing.
bestSellers = []

def strip_tags(html, invalid_tags):
    """Parse *html* and flatten every tag whose name is in *invalid_tags*.

    Each matching tag is replaced in the parsed tree by a single string
    built from its children: plain text children are used as-is, while
    child tags are first run through strip_tags() themselves and then
    converted back to text.

    Args:
        html: Markup to parse (HTML entities are converted by the parser).
        invalid_tags: Collection of tag names to flatten.

    Returns:
        The BeautifulSoup object for the parsed, modified document.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for tag in soup.findAll(True):
        # Guard clause: leave every tag that is not blacklisted untouched.
        if tag.name not in invalid_tags:
            continue

        pieces = []
        for child in tag.contents:
            if isinstance(child, NavigableString):
                pieces.append(unicode(child))
            else:
                # Nested tags are cleaned recursively before being
                # serialised into the replacement text.
                cleaned = strip_tags(unicode(child), invalid_tags)
                pieces.append(unicode(cleaned))

        tag.replaceWith("".join(pieces))

    return soup

class AmazonScraper:
    """Fetch an Amazon product page and extract its deal price.

    Typical use is ``AmazonScraper().read(url)``, which downloads the page,
    cleans the markup with strip_tags() and returns the value computed by
    scrape().
    """

    # Maximum number of fetch attempts made by a single read() call.
    MAX_TRIALS = 3

    def __init__(self):
        # Number of failed fetch attempts seen by the most recent read().
        self.count_trials = 0

    def read(self, url):
        """Download *url* and return the deal price found on the page.

        The request is attempted up to MAX_TRIALS times when the server
        answers with an HTTP error. If every attempt fails, the page is
        treated as empty, which makes the result 0.0.

        Args:
            url: Address of the product page to fetch.

        Returns:
            The float returned by scrape() for the downloaded page.

        Raises:
            urllib2.URLError: for failures other than HTTP error responses
                (only urllib2.HTTPError is handled here).
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')

        # Start every call with a fresh retry budget; otherwise failures from
        # earlier URLs would permanently disable retries on this instance.
        self.count_trials = 0
        response_data = ""

        while self.count_trials < self.MAX_TRIALS:
            try:
                response = urllib2.urlopen(request)
            except urllib2.HTTPError as e:
                self.count_trials += 1
                # Single-argument parenthesised prints emit exactly what the
                # statement form ``print 'ERROR: ', e`` emits.
                print(' '.join(['ERROR: ', str(e)]))
                if self.count_trials < self.MAX_TRIALS:
                    print('Retrying')
                continue

            try:
                response_data = response.read()
            finally:
                # Release the connection even if reading the body fails.
                response.close()
            break

        self.response_data = response_data
        return self.createData()

    def createData(self):
        """Parse the downloaded markup and return the scraped deal price.

        Stores the cleaned document in ``self.soup`` and drops the raw
        response text once it has been parsed.
        """
        self.soup = strip_tags(self.response_data, invalid_tags)
        self.response_data = None
        return self.scrape(self.soup)

    def scrape(self, soup):
        """Return the deal price from *soup*, or 0.0 when there is none.

        Args:
            soup: Parsed document exposing a BeautifulSoup-style ``find``.

        Returns:
            The deal price as a float when a positive price was parsed and
            the deal status value is below 100; otherwise 0.0.
        """
        # A missing element shows up as AttributeError (``None.text`` or
        # ``None.find``) and unparsable text as ValueError/TypeError. Only
        # those are treated as "not found", so interrupts still propagate.
        try:
            price_tag = soup.find("span", {"id": "priceblock_dealprice"})
            dealPrice = float(price_tag.text.replace("Rs.", "").replace(",", ""))
        except (AttributeError, TypeError, ValueError):
            dealPrice = 0.0

        try:
            availability = soup.find('div', {'id': 'deal_availability'})
            # The trailing ``_*`` matches zero or more underscores, so the
            # pattern does not constrain what follows the id prefix.
            status_tag = availability.find('span', {'id': re.compile('dealStatusAvailability_*')})
            # presumably the percentage of the deal already claimed -- confirm
            dealStatus = float(status_tag.text.replace("%", "").replace(",", ""))
        except (AttributeError, TypeError, ValueError):
            dealStatus = 100

        if dealStatus < 100 and dealPrice > 0:
            return dealPrice
        return 0.0

if __name__ == '__main__':
    # Script entry point: fetch one sample product page and print the deal
    # price that AmazonScraper.read() returns for it.
    product_url = 'http://www.amazon.in/gp/product/B00FXLCLTO'
    print(AmazonScraper().read(product_url))