Subversion Repositories SmartDukaan

Rev

Rev 12412 | Rev 12434 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 12412 Rev 12430
Line 1... Line 1...
1
from BeautifulSoup import BeautifulSoup, NavigableString
1
from xml.dom.minidom import parseString
2
import re
2
import re
3
import sys
3
import urllib
4
import  datetime
4
import hashlib
5
import grequests
5
import hmac
6
import re
6
import base64
7
 
-
 
8
invalid_tags = ['b', 'i', 'u']
7
from time import strftime, gmtime
9
bestSellers = []
8
from requests import request
10
 
-
 
11
def strip_tags(html, invalid_tags):
9
from operator import itemgetter
12
    soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)
-
 
13
 
10
 
14
    for tag in soup.findAll(True):
-
 
15
        if tag.name in invalid_tags:
-
 
16
            s = ""
-
 
17
 
11
 
18
            for c in tag.contents:
12
class MWS(object):
19
                if not isinstance(c, NavigableString):
-
 
20
                    c = strip_tags(unicode(c), invalid_tags)
-
 
21
                s += unicode(c)
13
    URI = "/"
22
 
-
 
23
            tag.replaceWith(s)
14
    VERSION = "2009-01-01"
24
 
-
 
25
    return soup
15
    NS = ''
26
 
16
 
27
class AmazonAsyncScraper:
17
    def __init__(self, access_key, secret_key, merchant_id,
28
    def __init__(self):
18
                 domain='https://mws-eu.amazonservices.com', uri="", version=""):
29
        self.count_trials = 0
19
        self.access_key = access_key
30
    
-
 
31
    def read(self, urls, findStore):
20
        self.secret_key = secret_key
32
        returnMap = {}
21
        self.merchant_id = merchant_id
33
        print datetime.datetime.now()
22
        self.domain = domain
34
        header = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1',
-
 
35
                  'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
23
        self.uri = uri or self.URI
36
                  'Accept-Encoding':'gzip,deflate,sdch'
24
        self.version = version or self.VERSION
37
                  }
-
 
38
 
25
 
39
        rs = (grequests.get(u, headers=header) for u in urls)
26
    def make_request(self, extra_data, action,method="GET", **kwargs):
-
 
27
 
40
        for x in grequests.map(rs):
28
        params = {
41
            soup = strip_tags(x.text,invalid_tags)
29
            'AWSAccessKeyId': self.access_key,
42
            for tag in soup.findAll(True):
30
            'SignatureVersion': '2',
43
                if tag.name in invalid_tags:
31
            'Timestamp': self.get_timestamp(),
44
                    s = ""
32
            'Version': self.version,
45
        
-
 
46
                    for c in tag.contents:
33
            'SignatureMethod': 'HmacSHA256',
47
                        if not isinstance(c, NavigableString):
-
 
48
                            c = strip_tags(unicode(c), invalid_tags)
-
 
49
                        s += unicode(c)
34
            'Action': action
50
        
35
        }
51
                    tag.replaceWith(s)
36
        if action=='GetLowestOfferListingsForSKU':
52
            x.close()
-
 
53
            sellerCount=0
-
 
54
            info = []
-
 
55
            sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})
-
 
56
            dataLength = len(sellerData)
37
            params['ExcludeMe']='true'
57
            for data in sellerData:
38
        params.update(extra_data)
58
                tempMap={}
-
 
59
                price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').text
39
        request_description = '&'.join(['%s=%s' % (k, urllib.quote(params[k], safe='-_.~').encode('utf-8')) for k in sorted(params)])
60
                unitCost = float(price.replace("Rs.","").replace(",",""))
40
        signature = self.calc_signature(method, request_description)
61
                shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').text
41
        url = '%s%s?%s&Signature=%s' % (self.domain, self.uri, request_description, urllib.quote(signature))
62
                if "FREE" in shippingCost:
-
 
63
                    shippingCost = 0
-
 
64
                else:
-
 
65
                    #print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
-
 
66
                    shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
-
 
67
                
-
 
68
                sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
42
        headers = {'User-Agent': 'AmazonJavascriptScratchpad/1.0 (Language=Python)'}
69
                store=""
-
 
70
                storeUrl=""
-
 
71
                if findStore:
-
 
72
                    storeUrl = sellerColumn.find('a')['href']
-
 
73
                    temp =  sellerColumn.find('a')
43
        headers.update(kwargs.get('extra_headers', {}))
74
                    store = temp.text
-
 
75
                    if len(store)==0:
-
 
76
                        dom_in = storeUrl.find("www.amazon.in")
-
 
77
                        if dom_in ==-1:
-
 
78
                            storeUrl="http://amazon.in"+storeUrl
-
 
79
                        if storeUrl[storeUrl.rfind('/')+1:]=='AF6E3O0VE0X4D':
-
 
80
                            store = 'Saholic'
-
 
81
                    if len(store)!=0:
44
        response = request(method, url)
82
                        tempMap['isStoreFront']='True'
-
 
83
                    else:
-
 
84
                        tempMap['isStoreFront']='False'
-
 
85
                    tempMap['storeUrl'] =storeUrl
45
        if action=='GetLowestOfferListingsForSKU':
86
                asinind = x.url.index("offer-listing")
-
 
87
                refind = x.url.index("/ref=olp_sort_ps")
-
 
88
                asin = x.url[asinind+14:refind].strip()
-
 
89
                sellerCount+=1
-
 
90
                if sellerCount==1:
-
 
91
                    tempMap['sellerName'] = store.strip()
-
 
92
                    tempMap['sellerPrice'] = unitCost+shippingCost
-
 
93
                if sellerCount==2:
-
 
94
                    tempMap['sellerName'] = store.strip()
-
 
95
                    tempMap['sellerPrice'] = unitCost+shippingCost
-
 
96
                if sellerCount==3:
-
 
97
                    tempMap['sellerName'] = store.strip()
-
 
98
                    tempMap['sellerPrice'] = unitCost+shippingCost
46
            return self.parse_competitor_pricing_response(response)
99
                info.append(tempMap) 
47
        elif action=='GetMyPriceForSKU':
100
                if sellerCount==3 or sellerCount==dataLength:
-
 
101
                    returnMap[asin] = info 
-
 
102
                    break
-
 
103
        if findStore:
-
 
104
            return self.findStoreFront(returnMap)
48
            return self.parse_my_pricing_response(response)
105
        else:
49
        else:
106
            return returnMap
50
            raise
107
    
51
    
108
    def findStoreFront(self,returnMap):
52
    def parse_competitor_pricing_response(self,response):
-
 
53
        spString = re.sub('<\?.*\?>','',response.text)
109
        storeFront={}
54
        spString = "<dom>" + spString + "</dom>"
110
        for arr in returnMap.itervalues():
55
        dom = parseString(spString)
-
 
56
        skuOffers = dom.getElementsByTagName('GetLowestOfferListingsForSKUResult')
111
            for dic in arr:
57
        offerMap = {}
112
                if dic['isStoreFront']!='True':
58
        for skuOffer in skuOffers:
113
                    storeFront[dic.get('storeUrl')] =''
59
            status = skuOffer.attributes.items()[0][1]
114
        rs = (grequests.get(u,stream=False) for u in storeFront.keys())
60
            sku = skuOffer.attributes.items()[1][1]
115
        for x in grequests.map(rs):
61
            info = []
116
            soup = strip_tags(x.text,invalid_tags)
62
            for offer in skuOffer.getElementsByTagName('LowestOfferListing'):
117
            x.close
63
                if len(info)==3:
118
            #print x.url.rfind('&me=')
64
                    break
119
            #print x.url[x.url.rfind('&me='):].rfind('&')
65
                temp = {}
120
            mId= x.url[x.url.rfind('&me=')+4:x.url[x.url.rfind('&me='):].rfind('&')+x.url.rfind('&me=')]
66
                amount = offer.getElementsByTagName('Amount')[0].firstChild.nodeValue
121
            sellerName = soup.title.string
67
                temp['sellingPrice'] = amount
122
            #print mId
68
                temp['promoPrice'] = amount
-
 
69
                temp['fulfillmentChannel'] = offer.getElementsByTagName('FulfillmentChannel')[0].firstChild.nodeValue
123
            try:
70
                try:
124
                ind = sellerName.index("@ Amazon.in")
-
 
125
                sellerName = sellerName[0:ind].strip()
71
                    temp['shippingTime'] = (offer.getElementsByTagName('Max')[0].firstChild.nodeValue).replace('days','')
126
            except:
72
                except:
-
 
73
                    temp['shippingTime'] = '0-0'
127
                try:
74
                try:
-
 
75
                    temp['rating'] = (offer.getElementsByTagName('SellerPositiveFeedbackRating')[0].firstChild.nodeValue)
-
 
76
                    if temp['rating'] == 'Just Launched':
-
 
77
                        temp['rating'] = '0'
-
 
78
                    else:
-
 
79
                        str_rating = temp['rating'].replace('-',' ').replace('%','')
128
                    ind = sellerName.split(":")
80
                        a =  str_rating.split()
-
 
81
                        l = []
-
 
82
                        for x in a:
129
                    sellerName = ind[1].strip()
83
                            if x.isdigit():
-
 
84
                                l.append(x)
-
 
85
                        temp['rating'] = l[0]
130
                except:
86
                except:
131
                    sellerName =""
87
                    temp['rating'] = '0'
132
            #storeFront[re.compile('*'+mId+'.*')] = sellerName
88
                temp['notOurSku'] = True
133
            #print mId
89
                info.append(temp)
-
 
90
            offerMap[sku]=info        
134
            #print sellerName
91
        return offerMap
-
 
92
    
-
 
93
    def parse_my_pricing_response(self,response):
135
            myRe = re.compile('.*'+mId+'.*')
94
        spString = re.sub('<\?.*\?>','',response.text)
-
 
95
        spString = "<dom>" + spString + "</dom>"
136
            for key in storeFront:
96
        dom = parseString(spString)
-
 
97
        skuOffers = dom.getElementsByTagName('GetMyPriceForSKUResult')
-
 
98
        skuMap = {}
137
                if myRe.match(key):
99
        for skuOffer in skuOffers:
-
 
100
            status = skuOffer.attributes.items()[0][1]
138
                    #print "Match found ",key
101
            sku = skuOffer.attributes.items()[1][1]
-
 
102
            asin = skuOffer.getElementsByTagName('ASIN')[0].firstChild.nodeValue
-
 
103
            for offer in skuOffer.getElementsByTagName('Offers'):
-
 
104
                temp = {}
-
 
105
                promoPrice = offer.getElementsByTagName('LandedPrice')[0].getElementsByTagName('Amount')[0].firstChild.nodeValue
-
 
106
                regularPrice = offer.getElementsByTagName('RegularPrice')[0].getElementsByTagName('Amount')[0].firstChild.nodeValue
139
                    storeFront[key] = sellerName.strip()
107
                temp['sellingPrice'] = regularPrice
140
            #storeFront.get(re.compile('.*'+mId+'.*'))
108
                temp['promoPrice'] = promoPrice
141
        for arr in returnMap.itervalues():
109
                if promoPrice == regularPrice:
142
            #print "arr is ",arr
110
                    temp['promotion'] = False
143
            for dic in arr:
111
                else:
-
 
112
                    temp['promotion'] = True
-
 
113
                temp['status'] = status
144
                #print "dic ",dic
114
                temp['asin'] = asin
145
                if dic['isStoreFront']!='True':
115
                temp['notOurSku'] = False
146
                    dic['sellerName'] =storeFront.get(dic.get('storeUrl'))
116
                temp['fulfillmentChannel'] ='AMAZON'
-
 
117
                temp['shippingTime'] =  '0-0'
147
                    dic['isStoreFront']='True'
118
                temp['rating'] = '0'
-
 
119
            skuMap[sku]=temp     
-
 
120
        return skuMap    
148
        
121
        
149
        print "********"
-
 
150
        return returnMap
-
 
151
         
-
 
152
                    
-
 
153
#        rs = (grequests.get(u,stream=False) for u in urls)
-
 
154
#        for x in grequests.map(rs):
-
 
155
        #return soup.title.string
-
 
156
            
-
 
157
 
-
 
158
if __name__ == '__main__':
    # Manual smoke run: scrape the offer-listing page of one sample ASIN,
    # resolve store-front seller names (second argument True) and print
    # what came back, followed by a timestamp.
    urls = ["http://amazon.in/gp/offer-listing/B007VZFZO8/ref=olp_sort_ps"]
    print(urls)
    scraper = AmazonAsyncScraper()
    print(len(urls))
    result = scraper.read(urls, True)
    print(result)
    print("##################")
    print(datetime.datetime.now())
-
 
183
122
        
-
 
123
    def calc_signature(self, method, request_description):
        """Compute the base64-encoded HMAC-SHA256 signature for a request.

        The signed string is the HTTP method, the lower-cased host taken
        from ``self.domain``, ``self.uri`` and the query string, joined by
        newlines.

        :param method: HTTP verb used for the request, e.g. ``"GET"``.
        :param request_description: the already URL-encoded query string.
        :returns: the signature exactly as produced by ``base64.b64encode``.
        """
        # Strip whatever scheme prefix is present (not only a literal
        # 'https://') so that just the host ends up in the signed string.
        host = self.domain.split('://', 1)[-1].lower()
        sig_data = '\n'.join([method, host, self.uri, request_description])

        # hmac works on bytes: encode explicitly instead of relying on an
        # implicit str()/ASCII coercion of the key and the message.
        key = self.secret_key
        if not isinstance(key, bytes):
            key = u'{0}'.format(key).encode('utf-8')
        if not isinstance(sig_data, bytes):
            sig_data = sig_data.encode('utf-8')

        digest = hmac.new(key, sig_data, hashlib.sha256).digest()
        return base64.b64encode(digest)
-
 
126
 
-
 
127
    def get_timestamp(self):
        """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
        utc_now = gmtime()
        layout = "%Y-%m-%dT%H:%M:%SZ"
        return strftime(layout, utc_now)
-
 
129
 
-
 
130
 
-
 
131
class Products(MWS):
    """MWS client bound to the Products API section, version 2011-10-01."""

    URI = '/Products/2011-10-01'
    VERSION = '2011-10-01'
    NS = '{https://mws-eu.amazonservices.com/Products/2011-10-01}'

    def _sku_request_data(self, marketplaceid, skus):
        """Build the request parameters shared by the SKU-based calls.

        Each SKU is stored under ``SellerSKUList.SellerSKU.<n>`` with ``n``
        counting from 1, next to the seller id and the marketplace id.
        """
        request_data = {
            'SellerId': self.merchant_id,
            'MarketplaceId': marketplaceid,
        }
        for position, sku in enumerate(skus, 1):
            request_data['SellerSKUList.SellerSKU.%d' % position] = sku
        return request_data

    def get_my_pricing_for_sku(self, marketplaceid, skus):
        """Issue ``GetMyPriceForSKU`` for *skus* and return the result of
        ``make_request``."""
        request_data = self._sku_request_data(marketplaceid, skus)
        return self.make_request(request_data, 'GetMyPriceForSKU')

    def get_competitive_pricing_for_sku(self, marketplaceid, skus):
        """Issue ``GetLowestOfferListingsForSKU`` for *skus* and return the
        result of ``make_request``."""
        request_data = self._sku_request_data(marketplaceid, skus)
        return self.make_request(request_data, 'GetLowestOfferListingsForSKU')
-
 
155
    
-
 
156
def main():
    """Print competitor and own offers for one sample SKU, cheapest first.

    Credentials are read from the environment variables ``MWS_ACCESS_KEY``,
    ``MWS_SECRET_KEY`` and ``MWS_MERCHANT_ID``; the process exits with a
    message if any of them is missing.
    """
    # Function-scope import so this block stands on its own.
    import os

    # SECURITY: credentials must never live in source control. Any key that
    # was previously committed in this file should be treated as leaked and
    # rotated.
    try:
        access_key = os.environ['MWS_ACCESS_KEY']
        secret_key = os.environ['MWS_SECRET_KEY']
        merchant_id = os.environ['MWS_MERCHANT_ID']
    except KeyError as missing:
        raise SystemExit('Missing required environment variable: %s' % missing)

    marketplace_id = 'A21TJRUUN4KGV'
    skus = ['FBA12248', 'FBB12248']
    sku = 'FBA12248'

    p = Products(access_key, secret_key, merchant_id)
    comp = p.get_competitive_pricing_for_sku(marketplace_id, skus)
    our = p.get_my_pricing_for_sku(marketplace_id, skus)

    competitor_offers = comp.get(sku)
    own_offer = our.get(sku)
    print(competitor_offers)
    print(own_offer)

    # Either lookup may come back empty; do not index into or append to None.
    offers = list(competitor_offers or [])
    if own_offer is not None:
        # NOTE(review): hard-coded override kept from the original; it looks
        # like debugging scaffolding -- confirm whether it is still wanted.
        own_offer['sellingPrice'] = 41089
        offers.append(own_offer)

    # The parsers store prices as text taken from the XML, so compare them
    # numerically; a plain string sort would rank '9999' above '41089'.
    # On a price tie our own offer (notOurSku == False) sorts first.
    print(sorted(offers,
                 key=lambda offer: (float(offer['promoPrice']),
                                    offer['notOurSku'])))


if __name__ == '__main__':
    main()
-
 
175
184
176