Subversion Repositories SmartDukaan

Rev

Rev 15265 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 15265 Rev 15950
Line 1... Line -...
1
from BeautifulSoup import BeautifulSoup, NavigableString
-
 
2
from dtr.utils.utils import fetchResponseUsingProxy
1
from dtr.utils.utils import fetchResponseUsingProxy
3
import re
2
import re
4
import sys
3
import datetime
-
 
4
from pyquery import PyQuery
-
 
5
import traceback
5
 
6
 
6
# Inline formatting tags that are flattened out of scraped HTML before
# text extraction (see strip_tags).
invalid_tags = ['b', 'i', 'u']
# Accumulator for best-seller entries; not populated in this file --
# presumably appended to by external callers (TODO confirm).
bestSellers = []
9
# Request headers sent with every scrape; mimics a Chrome-on-Android
# browser so Amazon serves the normal (non-bot) offer-listing markup.
headers = {
        'User-Agent':'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
        'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language' : 'en-US,en;q=0.8',
        'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Connection':'keep-alive',
        'Accept-Encoding' : 'gzip,deflate,sdch'
        }


def strip_tags(html, invalid_tags):
    """Parse ``html`` with BeautifulSoup and replace every tag whose name is
    in ``invalid_tags`` with its flattened text content.

    Returns the BeautifulSoup object (callers typically str()/navigate it).
    Recurses so that invalid tags nested inside other invalid tags are also
    removed.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    for tag in soup.findAll(True):
        if tag.name in invalid_tags:
            s = ""
            for c in tag.contents:
                # Non-text children may themselves contain invalid tags;
                # strip them recursively before flattening to text.
                if not isinstance(c, NavigableString):
                    c = strip_tags(unicode(c), invalid_tags)
                s += unicode(c)
            tag.replaceWith(s)
    return soup
24
 
18
 
25
class AmazonScraper:
    # Fetches an Amazon.in offer-listing page and extracts the first
    # offer's total price (see read()/createData() below).

    def __init__(self, livePricing=None):
        """Create a scraper.

        livePricing -- opaque flag forwarded to fetchResponseUsingProxy;
        its exact semantics live in dtr.utils.utils (TODO confirm).
        """
        self.count_trials = 0          # retry counter used by read()
        self.livePricing = livePricing
30
    def read(self, url):
24
    def read(self, url):
31
        response_data = ""
25
        response_data = ""
32
        try:
26
        try:
33
            response_data = fetchResponseUsingProxy(url,livePricing=self.livePricing)
27
            response_data = fetchResponseUsingProxy(url,headers=headers,livePricing=self.livePricing,proxy=True)
34
        except Exception as e:
28
        except Exception as e:
35
            print 'ERROR: ', e
29
            print 'ERROR: ', e
36
            print 'Retrying'
30
            print 'Retrying'
37
            self.count_trials += 1
31
            self.count_trials += 1
38
            
32
            
Line 40... Line 34...
40
                return self.read(url)
34
                return self.read(url)
41
        
35
        
42
        self.response_data=response_data
36
        self.response_data=response_data
43
        
37
        
44
        if "Server Busy" in self.response_data:
38
        if "Server Busy" in self.response_data:
45
            print "Captcha page, lets try again."
-
 
46
            self.count_trials += 1
39
            self.count_trials += 1
47
            return self.read(url)
40
            return self.read(url)
48
        return self.createData()
41
        return self.createData()
49
    
42
    
50
    def createData(self):
        """Parse self.response_data and return the first offer's total cost.

        Uses PyQuery to select the first 'div.olpOffer' block, reads the
        unit price from 'span.olpOfferPrice' and the shipping charge from
        'span.olpShippingPrice' (treated as 0 when absent/unparseable,
        e.g. free delivery), both stripped of the "Rs." prefix and comma
        thousand-separators.

        Returns unitCost + shippingCost as a float, or 0.0 when the page
        cannot be parsed at all (no offers, captcha page, etc.).
        """
        try:
            pq = PyQuery(self.response_data)
            tag = pq('div.olpOffer')
            infoDiv = pq(tag[0])
            price = infoDiv('span.olpOfferPrice')
            unitCost = float(price.text().replace("Rs.", "").replace(",", ""))
            shipping = infoDiv('span.olpShippingPrice')
            try:
                shippingCost = float(shipping.text().replace("Rs.", "").replace(",", ""))
            except:
                # No shipping element or non-numeric text => free delivery.
                shippingCost = 0
            return unitCost + shippingCost
        except:
            # Best-effort scraper: any parse failure yields 0.0 rather
            # than propagating (callers treat 0.0 as "no price found").
            return 0.0
77
            
58
        
78
 
59
    
79
if __name__ == '__main__':
60
if __name__ == '__main__':
-
 
61
    print datetime.datetime.now()
80
    scraper = AmazonScraper(True)
62
    scraper = AmazonScraper(True)
81
    print scraper.read('http://www.amazon.in/gp/offer-listing/B00R659KZ8')
-
 
82
    
-
 
83
63
    print scraper.read('http://www.amazon.in/gp/aw/ol/B00UTKPKHY')
-
 
64
    print datetime.datetime.now()
-
 
65