Subversion Repositories SmartDukaan

Rev

Rev 17246 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
14307 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
3
import re
14759 kshitij.so 4
from dtr.utils.utils import fetchResponseUsingProxy
14307 kshitij.so 5
 
6
# Tag names handed to strip_tags() by AmazonScraper.createData(): these tags
# are removed from the parsed page while their contents are kept.
invalid_tags = ['b', 'i', 'u']
7
# Module-level list; nothing in the visible part of this file reads or
# writes it. NOTE(review): possibly used by importers - confirm before removing.
bestSellers = []
8
 
9
def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag whose name is in *invalid_tags*.

    The markup is parsed with BeautifulSoup (passing
    ``convertEntities=BeautifulSoup.HTML_ENTITIES``).  Each matching tag is
    replaced by the concatenated text of its children; children that are
    themselves tags are first run through ``strip_tags`` recursively so
    nested unwanted tags are unwrapped as well.

    Returns the resulting BeautifulSoup object (not a string).
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in soup.findAll(True):
        if node.name not in invalid_tags:
            continue

        pieces = []
        for child in node.contents:
            if isinstance(child, NavigableString):
                pieces.append(unicode(child))
            else:
                # Nested tag: clean it recursively, then keep its rendering.
                cleaned = strip_tags(unicode(child), invalid_tags)
                pieces.append(unicode(cleaned))

        # Swap the tag for the flattened text of its contents.
        node.replaceWith("".join(pieces))

    return soup
24
 
25
class AmazonScraper:
    """Fetch an Amazon product page and extract its deal price.

    Typical use is ``AmazonScraper(livePricing).read(url)``, which returns
    the deal price as a float, or ``0.0`` when no usable deal price is found.
    """

    # Maximum number of fetch attempts made by a single read() call, counting
    # both fetch errors and "Server Busy" responses.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None):
        """Store the *livePricing* flag that is forwarded to the fetch helper."""
        # Number of failed attempts made by the most recent read() call.
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Fetch *url* (with bounded retries) and return the scraped deal price.

        A fetch that raises, or a response containing "Server Busy", counts
        as a failed attempt.  After ``MAX_TRIALS`` failed attempts the last
        response obtained (an empty string after a fetch error) is parsed
        anyway, which normally yields ``0.0``.
        """
        # Give every call its own retry budget; a counter shared across calls
        # would leave no retries once an instance had failed MAX_TRIALS times.
        self.count_trials = 0
        response_data = ""

        # Iterative retry: both failure modes share one bound, so a server
        # that is permanently busy can no longer cause unbounded recursion.
        while self.count_trials < self.MAX_TRIALS:
            try:
                response_data = fetchResponseUsingProxy(url, livePricing=self.livePricing)
            except Exception as e:
                # Two spaces keep the output identical to the former
                # two-argument print statement.
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                response_data = ""
                continue

            if "Server Busy" not in response_data:
                break

            print("Server busy...Ahhhhh")
            self.count_trials += 1

        self.response_data = response_data
        return self.createData()

    def createData(self):
        """Parse the stored response, release the raw text, and scrape it."""
        self.soup = strip_tags(self.response_data, invalid_tags)
        # Drop the reference to the raw page text; only the parsed tree is
        # used from here on.
        self.response_data = None
        return self.scrape(self.soup)

    def scrape(self, soup):
        """Return the deal price found in *soup*, or ``0.0``.

        The price is read from the ``span`` with id ``priceblock_dealprice``
        and the deal status from the ``span`` whose id matches
        ``dealStatusAvailability_*``.  The price is returned only when it is
        positive and the status value is below 100; a missing or unparsable
        status is treated as 100.
        """
        try:
            sellerData = soup.find("span", {"id": "priceblock_dealprice"})
            dealPrice = float(sellerData.text.replace("Rs.", "").replace(",", ""))
            print(dealPrice)
        except (AttributeError, TypeError, ValueError):
            # Element missing (find returned None) or text not a number.
            dealPrice = 0.0

        try:
            # NOTE(review): "_*" means "zero or more underscores", so this
            # matches any id containing "dealStatusAvailability" when the
            # parser uses a search-style match - confirm that is intended.
            dealStatus = soup.find('span', {'id': re.compile('dealStatusAvailability_*')})
            # Presumably a percentage such as "45%" - TODO confirm its meaning.
            dealStatus = float(dealStatus.text.replace("%", "").replace(",", ""))
        except (AttributeError, TypeError, ValueError):
            dealStatus = 100

        if dealStatus < 100 and dealPrice > 0:
            return dealPrice
        return 0.0
73
 
74
if __name__ == '__main__':
    # Manual smoke run: fetch one product page with livePricing enabled and
    # print the deal price that read() returns.
    product_url = 'http://www.amazon.in/dp/B015CSIA38/ref=gbdp_vlo_61287013_B015CSIA38?_encoding=UTF8&smid=A2WBY8FP973J47'
    scraper = AmazonScraper(True)
    print(scraper.read(product_url))
14307 kshitij.so 77