Subversion Repositories SmartDukaan

Rev

Rev 12197 | Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
11934 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
3
import re
4
import sys
5
 
6
invalid_tags = ['b', 'i', 'u']
7
bestSellers = []
8
 
9
def strip_tags(html, invalid_tags):
    """Parse `html` into a BeautifulSoup tree, replacing every tag whose
    name appears in `invalid_tags` with its flattened text content.

    Nested tags inside an invalid tag are flattened recursively, so e.g.
    <b><i>x</i></b> collapses to just "x".  HTML entities are converted
    during parsing.  Returns the resulting BeautifulSoup object.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in soup.findAll(True):
        if node.name not in invalid_tags:
            continue

        # Collect the tag's children as text: plain strings pass through,
        # nested tags are stripped recursively before being stringified.
        pieces = []
        for child in node.contents:
            if isinstance(child, NavigableString):
                pieces.append(unicode(child))
            else:
                cleaned = strip_tags(unicode(child), invalid_tags)
                pieces.append(unicode(cleaned))

        node.replaceWith(u"".join(pieces))

    return soup
24
 
25
class AmazonScraper:
26
    def __init__(self):
27
        self.count_trials = 0
28
 
29
    def read(self, url):
30
        request = urllib2.Request(url)
31
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
32
        opener = urllib2.build_opener()
33
        response_data = ""
34
        try:
35
            response_data = opener.open(request).read()
36
 
37
        except urllib2.HTTPError as e:
38
            print 'ERROR: ', e
39
            print 'Retrying'
40
            self.count_trials += 1
41
 
42
            if self.count_trials < 3:
43
                return self.read(url)
44
 
45
        self.response_data=response_data
46
 
47
    def createData(self):
48
        self.soup = strip_tags(self.response_data,invalid_tags)
49
        return self.scrape(self.soup)
50
 
51
 
52
    def scrape(self,soup):
53
        sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})
54
        for data in sellerData:
55
            print "sellerData****"
56
            price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').text
57
            print "Unit cost= ",float(price.replace("Rs.","").replace(",",""))
58
            unitCost = float(price.replace("Rs.","").replace(",",""))
59
            shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').text
60
            if "FREE" in shippingCost:
61
                print "shippingCost=0"
62
                shippingCost = 0
63
            else:
64
                print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
65
                shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
66
 
67
            sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
68
            print "Seller info ",sellerColumn
69
            ratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]
70
            print "Rating info ",ratingColumn
71
            print "***********************"
72
            return unitCost+shippingCost
73
 
74
    def getBestSellers(self,soup):
75
        global bestSellers
76
        bestSellerData = soup.findAll("div" , {"class" : "zg_itemImmersion"})
77
        for data in bestSellerData:
78
            temp = {}
79
            rankVal = data.find('span', attrs={'class' : 'zg_rankNumber'}).text
80
            print "Rank = ",rankVal.lstrip()
81
            productUrl = data.find('a')['href']
82
            print "Product URL = ",productUrl.lstrip().replace("\n","")
83
            productUrl = productUrl.replace("http://www.amazon.in/","").lstrip()
84
            ind = productUrl.rindex("/dp/")
85
            productName = productUrl[0:productUrl.rindex("/dp/")]
86
            print "Product Name = ",productName
87
            asin = productUrl[ind+4: productUrl.rindex("/ref=")]
88
            print "Asin = ",asin
89
            print "**********************"
90
            temp['Rank'] = rankVal.lstrip().replace(".","")
91
            temp['Url'] = productUrl.lstrip().replace("\n","")
92
            temp['Product Name'] = productUrl[0:productUrl.rindex("/dp/")]
93
            temp['Asin'] = productUrl[ind+4: productUrl.rindex("/ref=")]
94
            bestSellers.append(temp)
95
 
96
 
97
if __name__ == '__main__':
98
    scraper = AmazonScraper()
99
    scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps')
100
    print scraper.createData()
101