Subversion Repositories SmartDukaan

Rev

Rev 12256 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
12256 kshitij.so 1
from BeautifulSoup import BeautifulSoup
2
import mechanize
3
import re
4
 
5
class SellerCentralScraper:
    """Scrape SKU details from Amazon Seller Central with a mechanize browser.

    Typical use: call login() to obtain an authenticated browser, then pass
    that browser to requestSku() together with a product-summary search URL.
    """

    def getBrowserObject(self):
        """Build and return a mechanize.Browser configured like a desktop Chrome.

        The browser gets its own LWP cookie jar, follows redirects and
        http-equiv headers, sends a referer, ignores robots.txt and has all
        debug output switched off.
        """
        import cookielib

        br = mechanize.Browser(factory=mechanize.RobustFactory())
        br.set_cookiejar(cookielib.LWPCookieJar())

        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)

        br.set_debug_http(False)
        br.set_debug_redirects(False)
        br.set_debug_responses(False)

        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        # NOTE(review): 'deflate' and 'sdch' are advertised here, but
        # ungzipResponse() only decodes gzip bodies - confirm the server never
        # answers with the other encodings.
        br.addheaders = [
            ('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Encoding', 'gzip,deflate,sdch'),
            ('Accept-Language', 'en-US,en;q=0.8'),
            ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
        ]
        return br

    # NOTE(security): real credentials are embedded as defaults so existing
    # callers keep working; they should be moved to configuration and rotated.
    def login(self, url, username="kshitij.sood@saholic.com", password="pioneer"):
        """Open *url*, submit the "signinWidget" form and return the browser.

        url      -- page that serves the sign-in form.
        username -- value for the form's 'username' field.
        password -- value for the form's 'password' field.

        The result of the form submission is not inspected, so the returned
        browser is not guaranteed to be authenticated.
        """
        br = self.getBrowserObject()
        # NOTE(review): the page is requested twice; presumably the first
        # request only primes the session cookies - confirm before removing.
        br.open(url)
        response = br.open(url)
        self.ungzipResponse(response, br)

        br.select_form(name="signinWidget")
        br.form['username'] = username
        br.form['password'] = password
        br.submit()

        print("********************")
        print("Attempting to Login")
        print("********************")
        return br

    def requestSku(self, br, skuUrl):
        """Fetch *skuUrl* with *br* and return (asin, inventory, price).

        Returns ('', 0, 0) when the page cannot be parsed. Errors raised while
        fetching the page itself are not caught here.
        """
        print("********************")
        print("Requesting SKU Details")
        print("********************")
        response = br.open(skuUrl)
        self.ungzipResponse(response, br)
        page = response.read()
        try:
            return self.getSkuDetails(br, page)
        except Exception:
            # Best effort: an unexpected page layout yields the empty result.
            # 'Exception' (not a bare except) lets Ctrl-C and SystemExit through.
            return '', 0, 0

    def getSkuDetails(self, br, page):
        """Parse the product table in *page* and return (asin, inventory, price).

        Only the first row whose id matches 'sku-.*' is read. *br* is unused
        and kept for interface compatibility.

        Raises AttributeError or IndexError when the expected table, row or
        cells are missing.
        """
        soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
        table = soup.find("table", {"class": "data-display manageTable"})
        row = table.findAll("tr", {"id": re.compile('sku-.*')})[0]
        cells = row.findAll("td")

        asin = cells[5].text
        inventory = self._inputValueOrCellText(row, cells, "inv", 8)
        ourPrice = self._inputValueOrCellText(row, cells, "price", 10)
        return asin, inventory, ourPrice

    def _inputValueOrCellText(self, row, cells, inputName, cellIndex):
        """Return the value of the input named *inputName*, else the cell text."""
        inputTag = row.find(attrs={"name": inputName})
        try:
            return inputTag['value']
        except (TypeError, KeyError):
            # No such input in the row (find() returned None) or the input has
            # no 'value' attribute: fall back to the plain text of the cell.
            return cells[cellIndex].text

    def ungzipResponse(self, r, b):
        """Decompress a gzip-encoded response in place.

        r -- response object providing info(), read() and set_data().
        b -- browser whose current response is replaced via set_response().

        Responses whose Content-Encoding is not exactly 'gzip', or that carry
        no Content-Encoding header at all, are left untouched.
        """
        headers = r.info()
        # .get() so that a response without a Content-Encoding header is
        # treated as uncompressed instead of raising KeyError.
        if headers.get('Content-Encoding') != 'gzip':
            return

        import gzip
        print("********************")
        print("Deflating gzip response")
        print("********************")
        gz = gzip.GzipFile(fileobj=r, mode='rb')
        try:
            html = gz.read()
        finally:
            gz.close()
        headers["Content-type"] = "text/html; charset=utf-8"
        r.set_data(html)
        b.set_response(r)
96
 
97
 
98
 
99
 
100
def main():
    """Log in to Seller Central and print the details of one hard-coded SKU search."""
    homepage = "https://sellercentral.amazon.in/gp/homepage.html"
    product_summary = "https://sellercentral.amazon.in/myi/search/ProductSummary?keyword=2287"

    print("Opening Seller Central login page")
    scraper = SellerCentralScraper()
    session = scraper.login(homepage)
    details = scraper.requestSku(session, product_summary)
    print(details)


# Run the scrape only when this file is executed as a script.
if __name__ == "__main__":
    main()