Subversion Repositories SmartDukaan

Rev

Rev 12198 | Rev 12275 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
11934 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
3
import re
4
import sys
5
 
6
invalid_tags = ['b', 'i', 'u']
7
bestSellers = []
8
 
9
def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag named in *invalid_tags*.

    Each matching tag is replaced by the concatenation of its children
    (children that are themselves tags are recursively stripped first).
    HTML entities are converted while parsing. Returns the resulting
    BeautifulSoup object.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for tag in soup.findAll(True):
        if tag.name not in invalid_tags:
            continue

        # Rebuild the tag's content as plain markup, recursing into
        # nested tags so e.g. <b><i>x</i></b> collapses fully.
        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))

        tag.replaceWith(u"".join(pieces))

    return soup
24
 
25
class AmazonScraper:
26
    def __init__(self):
27
        self.count_trials = 0
28
 
12256 kshitij.so 29
    def read(self, url, findStore):
11934 kshitij.so 30
        request = urllib2.Request(url)
31
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
32
        opener = urllib2.build_opener()
33
        response_data = ""
12256 kshitij.so 34
        self.findStore = findStore
11934 kshitij.so 35
        try:
36
            response_data = opener.open(request).read()
37
 
38
        except urllib2.HTTPError as e:
39
            print 'ERROR: ', e
40
            print 'Retrying'
41
            self.count_trials += 1
42
 
43
            if self.count_trials < 3:
44
                return self.read(url)
45
 
46
        self.response_data=response_data
47
 
48
    def createData(self):
49
        self.soup = strip_tags(self.response_data,invalid_tags)
12197 kshitij.so 50
        self.response_data =None
11934 kshitij.so 51
        return self.scrape(self.soup)
52
 
53
 
54
    def scrape(self,soup):
55
        sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})
56
        for data in sellerData:
57
            print "sellerData****"
58
            price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').text
59
            print "Unit cost= ",float(price.replace("Rs.","").replace(",",""))
60
            unitCost = float(price.replace("Rs.","").replace(",",""))
61
            shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').text
62
            if "FREE" in shippingCost:
63
                print "shippingCost=0"
64
                shippingCost = 0
65
            else:
66
                print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
67
                shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
68
 
69
            sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
12256 kshitij.so 70
            store=""
71
            if self.findStore:
72
                print "Seller info ",sellerColumn
73
                x = sellerColumn.find('a')['href']
74
                print "&&&&"
75
                storeUrl = x
76
                store = self.findStoreFront(storeUrl)
77
                try:
78
                    ind = store.index("@ Amazon.in")
79
                    store = store[0:ind].strip()
80
                except:
81
                    try:
82
                        ind = store.split(":")
83
                        store = ind[1].strip()
84
                    except:
85
                        store =""
11934 kshitij.so 86
            ratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]
87
            print "Rating info ",ratingColumn
88
            print "***********************"
12256 kshitij.so 89
            return unitCost+shippingCost,store
11934 kshitij.so 90
 
12256 kshitij.so 91
    def findStoreFront(self,storeUrl):
92
        request = urllib2.Request(storeUrl)
93
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
94
        opener = urllib2.build_opener()
95
        response_data = ""
96
        try:
97
            response_data = opener.open(request).read()
11934 kshitij.so 98
 
12256 kshitij.so 99
        except urllib2.HTTPError as e:
100
            print 'ERROR: ', e
101
            print 'Retrying'
102
            self.count_trials += 1
103
 
104
            if self.count_trials < 3:
105
                return ""
106
        soup = strip_tags(response_data,invalid_tags)
107
        response_data =None
108
        return soup.title.string
109
 
11934 kshitij.so 110
 
111
if __name__ == '__main__':
112
    scraper = AmazonScraper()
12256 kshitij.so 113
    scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps',True)
11934 kshitij.so 114
    print scraper.createData()
115