Subversion Repositories SmartDukaan

Rev

Rev 15155 | Rev 15215 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
13829 kshitij.so 1
from BeautifulSoup import BeautifulSoup, NavigableString
14744 kshitij.so 2
from dtr.utils.utils import fetchResponseUsingProxy
13829 kshitij.so 3
import re
4
import sys
5
 
6
# Names of the inline formatting tags that strip_tags() unwraps: the tag
# itself is dropped while its inner content is kept in the document.
invalid_tags = ['b', 'i', 'u']
7
# Module-level list; nothing in the visible part of this file reads or
# appends to it. NOTE(review): possibly a leftover -- confirm before removing.
bestSellers = []
8
 
9
def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag whose name is in *invalid_tags*.

    Each matching tag is replaced by the text of its children, so the
    markup disappears while the enclosed content stays in place. Children
    that are themselves tags are re-parsed and cleaned recursively before
    being folded into the replacement text.

    html         -- markup to parse (the parser is created with HTML entity
                    conversion switched on).
    invalid_tags -- collection of tag names to unwrap.

    Returns the parsed BeautifulSoup document with those tags unwrapped.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for tag in soup.findAll(True):
        # Guard clause: only tags on the unwrap list need any work.
        if tag.name not in invalid_tags:
            continue

        pieces = []
        for child in tag.contents:
            if isinstance(child, NavigableString):
                # Plain text node: keep it verbatim.
                pieces.append(unicode(child))
            else:
                # Nested element: clean its own markup first, then
                # serialise the cleaned fragment back to text.
                cleaned = strip_tags(unicode(child), invalid_tags)
                pieces.append(unicode(cleaned))

        tag.replaceWith("".join(pieces))

    return soup
24
 
25
class AmazonScraper:
    """Fetch an Amazon offer-listing page and extract the first offer's price.

    Typical use is ``AmazonScraper(livePricing).read(url)``, which downloads
    the page through ``fetchResponseUsingProxy``, strips inline formatting
    tags and returns unit price plus shipping for the first listed offer.
    """

    # Total number of failed attempts (fetch exceptions and "Server Busy"
    # pages combined) tolerated before read() stops retrying.
    MAX_TRIALS = 5

    # CSS class string identifying one seller offer row on the page.
    _OFFER_CLASS = "a-row a-spacing-mini olpOffer"
    # Patterns for the class attribute of the price / shipping containers.
    # Compiled once here instead of on every scrape() call.
    _PRICE_CLASS_RE = re.compile('.*olpOfferPrice*')
    _SHIPPING_CLASS_RE = re.compile('.*olpShippingInfo*')

    def __init__(self, livePricing=None):
        """Create a scraper.

        livePricing -- passed through unchanged to fetchResponseUsingProxy
                       on every fetch.
        """
        # Failures seen so far by this instance. It is never reset, so the
        # retry budget is shared by all read() calls on the same instance.
        # NOTE(review): confirm whether a per-call budget is wanted instead.
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Download *url*, retrying on failure, and return the scraped price.

        A fetch that raises, or a page containing "Server Busy", counts as
        one failed trial. Retries continue while fewer than MAX_TRIALS
        failures have been recorded; after that whatever was fetched last
        (an empty string if the fetch raised) is parsed anyway.

        Returns the value of createData(): a number, or None when the page
        contains no offer rows.
        """
        # Iterative retry loop. The previous recursive version retried
        # "Server Busy" responses without ever checking count_trials, so a
        # persistently busy server caused unbounded recursion.
        while True:
            response_data = ""
            print(self.livePricing)
            try:
                response_data = fetchResponseUsingProxy(url, livePricing=self.livePricing)
            except Exception as e:
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    continue
                # Budget exhausted: fall through with the empty response.

            self.response_data = response_data
            print(response_data)

            if "Server Busy" in self.response_data:
                print("Server busy...Ahhhhh")
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    continue
                # Budget exhausted: parse the busy page as-is.

            return self.createData()

    def createData(self):
        """Parse the stored response into self.soup and scrape it.

        The raw response is dropped afterwards so the instance does not
        keep the page text alive alongside the parsed tree.
        """
        self.soup = strip_tags(self.response_data, invalid_tags)
        self.response_data = None
        return self.scrape(self.soup)

    def scrape(self, soup):
        """Return unit price plus shipping for the first offer in *soup*.

        Only the first offer row is considered.
        NOTE(review): presumably the listing is sorted cheapest-first --
        verify against the page being scraped.

        Shipping counts as 0 when its text contains "FREE" and as 0.0 when
        the text cannot be parsed as a number. Returns None when the page
        has no offer rows. A missing price/shipping element or an
        unparseable price is not handled and raises (AttributeError /
        ValueError), as before.
        """
        offers = soup.findAll("div", {"class": self._OFFER_CLASS})
        if not offers:
            return None

        offer = offers[0]
        print("sellerData****")

        price = offer.find('span', attrs={'class': self._PRICE_CLASS_RE}).find('span').text
        unitCost = float(price.replace("Rs.", "").replace(",", ""))
        print("Unit cost=  %s" % unitCost)

        shippingText = offer.find('p', attrs={'class': self._SHIPPING_CLASS_RE}).find('span').text
        if "FREE" in shippingText:
            print("shippingCost=0")
            shippingCost = 0
        else:
            try:
                shippingCost = float(
                    shippingText.replace("+Rs.", "").replace("Delivery", "").replace(",", ""))
            except ValueError:
                # Best effort: unrecognised shipping text counts as no charge.
                shippingCost = 0.0
            else:
                print("shippingCost=  %s" % shippingCost)

        return unitCost + shippingCost
78
 
79
 
80
if __name__ == '__main__':
    # Manual smoke test: fetch one offer-listing page with live pricing
    # enabled and print the price the scraper extracts from it.
    scraper = AmazonScraper(livePricing=True)
    print(scraper.read('http://www.amazon.in/gp/offer-listing/B00UFPHX8M'))
13829 kshitij.so 83