Subversion Repositories SmartDukaan

Rev

Rev 12275 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
11934 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
15483 kshitij.so 3
from dtr.utils.utils import fetchResponseUsingProxy
11934 kshitij.so 4
import re
5
import sys
6
 
7
# Names of the inline formatting tags that AmazonScraper passes to strip_tags(),
# which replaces each such tag with the text of its contents before scraping.
invalid_tags = ['b', 'i', 'u']
8
# NOTE(review): not referenced anywhere in the visible code; presumably a
# module-level accumulator used by importers -- confirm before removing.
bestSellers = []
9
 
10
def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag whose name is in *invalid_tags*.

    Each matching tag is replaced in the tree by the concatenated text of
    its children; child tags are first re-parsed and unwrapped recursively
    so nested formatting tags disappear as well.

    Returns the resulting BeautifulSoup document (not a string).
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in soup.findAll(True):
        # Guard clause: leave every tag we were not asked to unwrap alone.
        if node.name not in invalid_tags:
            continue

        pieces = []
        for child in node.contents:
            if isinstance(child, NavigableString):
                pieces.append(unicode(child))
            else:
                # Nested markup: clean it on its own, then keep its text form.
                cleaned = strip_tags(unicode(child), invalid_tags)
                pieces.append(unicode(cleaned))

        node.replaceWith("".join(pieces))

    return soup
25
 
26
class AmazonScraper:
    """Scrape price and seller of the first offer on an offer-listing page.

    Typical use is ``AmazonScraper().read(url, findStore)``, which fetches
    the page through ``fetchResponseUsingProxy``, cleans it with
    ``strip_tags`` and returns ``(unitCost + shippingCost, store)``.

    Debug output uses single-argument, parenthesised ``print`` calls; these
    emit the same text under the Python 2 print statement this file targets.
    """

    # read() stops retrying once count_trials reaches this value.
    MAX_TRIALS = 5

    # Class-attribute matchers for the pieces of one offer row.
    _PRICE_CLASS = re.compile('.*olpOfferPrice*')
    _SHIPPING_CLASS = re.compile('.*olpShippingInfo*')
    _SELLER_CLASS = re.compile('.*olpSellerName*')

    def __init__(self, livePricing=None):
        """Remember *livePricing*, which is forwarded to the page fetcher."""
        # Incremented on every failed fetch or "Server Busy" page and never
        # reset, so the retry budget is shared by all read() calls made on
        # this instance.
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url, findStore):
        """Fetch *url* and return the scraped offer.

        url       -- offer-listing page to fetch.
        findStore -- when true, scrape() also resolves the seller name.

        A fetch that raises, or a page containing "Server Busy", is retried
        while ``count_trials`` is below ``MAX_TRIALS``.  Once the budget is
        spent the last response (an empty string after a failed fetch) is
        parsed anyway.

        Returns the value of createData(): a ``(total, store)`` tuple, or
        None when the page contains no offer row.
        """
        self.findStore = findStore

        # A loop replaces the old recursive retry, which called
        # self.read(url) without findStore (TypeError on every retry) and
        # retried "Server Busy" pages without any upper bound.
        while True:
            response_data = ""
            try:
                response_data = fetchResponseUsingProxy(url, livePricing=self.livePricing)
            except Exception as e:
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    continue

            self.response_data = response_data

            if "Server Busy" in self.response_data:
                print("Captcha page, lets try again.")
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    continue

            return self.createData()

    def createData(self):
        """Parse the stored response into ``self.soup`` and scrape it."""
        self.soup = strip_tags(self.response_data, invalid_tags)
        # Drop the raw page text; only the parsed tree is needed from here on.
        self.response_data = None
        return self.scrape(self.soup)

    def scrape(self, soup):
        """Return ``(unitCost + shippingCost, store)`` for the first offer.

        Only the first offer row is used.  *store* is "" unless
        ``self.findStore`` is true.  Returns None when the page has no
        offer rows.

        Raises AttributeError / ValueError when the first offer row lacks
        the expected price or shipping markup.
        """
        sellerData = soup.findAll("div", {"class": "a-row a-spacing-mini olpOffer"})
        if not sellerData:
            return None
        data = sellerData[0]

        print("sellerData****")
        price = data.find('span', attrs={'class': self._PRICE_CLASS}).find('span').text
        unitCost = self._parseAmount(price)
        print("Unit cost=  %s" % unitCost)

        shippingText = data.find('p', attrs={'class': self._SHIPPING_CLASS}).find('span').text
        if "FREE" in shippingText:
            print("shippingCost=0")
            shippingCost = 0
        else:
            shippingCost = self._parseAmount(shippingText)
            print("shippingCost=  %s" % shippingCost)

        store = ""
        if self.findStore:
            sellerColumn = data.find('p', attrs={'class': self._SELLER_CLASS})
            store = self._resolveStore(sellerColumn)

        # The rating is only echoed for debugging, so a row without one must
        # not abort the scrape.
        try:
            ratingColumn = data.find('p', attrs={'class': 'a-spacing-small'}).find('a').contents[0]
        except (AttributeError, IndexError):
            ratingColumn = None
        print("Rating info  %s" % (ratingColumn,))
        print("***********************")

        return unitCost + shippingCost, store

    def _resolveStore(self, sellerColumn):
        """Return the seller name for one offer row's seller column."""
        print("Seller info  %s" % (sellerColumn,))
        link = sellerColumn.find('a')
        href = link['href']
        print(href)

        store = link.text
        if store:
            return store

        # The link carries no text, so fall back to the title of the
        # seller's storefront page.
        storeUrl = href
        if "www.amazon.in" not in storeUrl:
            storeUrl = "http://amazon.in" + storeUrl
        return self._storeFromTitle(self.findStoreFront(storeUrl))

    @staticmethod
    def _parseAmount(text):
        """Convert text such as "Rs. 1,299.00" or "+Rs. 50.00 Delivery" to float."""
        for token in ("+Rs.", "Rs.", "Delivery", ","):
            text = text.replace(token, "")
        return float(text)

    @staticmethod
    def _storeFromTitle(title):
        """Extract the seller name from a storefront page title.

        Uses the text before "@ Amazon.in" when that marker is present,
        otherwise the text after the first ":"; returns "" when neither
        applies or *title* is empty/None.
        """
        if not title:
            return ""
        marker = title.find("@ Amazon.in")
        if marker != -1:
            return title[:marker].strip()
        parts = title.split(":")
        if len(parts) > 1:
            return parts[1].strip()
        return ""

    def findStoreFront(self, storeUrl):
        """Fetch *storeUrl* and return its <title> text.

        Returns "" when the fetch fails or the page has no title.
        """
        try:
            response_data = fetchResponseUsingProxy(storeUrl, livePricing=None)
        except Exception:
            return ""
        soup = strip_tags(response_data, invalid_tags)
        title = soup.title
        if title is None:
            return ""
        return title.string or ""
109
 
11934 kshitij.so 110
 
111
if __name__ == '__main__':
    # Manual smoke run: scrape one offer-listing page, resolving the seller
    # name as well, and echo whatever read() returns.
    offerListingUrl = 'http://www.amazon.in/gp/offer-listing/B006PB44NM/ref=olp_sort_ps'
    result = AmazonScraper().read(offerListingUrl, True)
    print(result)
11934 kshitij.so 114