Subversion Repositories SmartDukaan

Rev

Rev 12410 | Rev 12412 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
12363 kshitij.so 1
from BeautifulSoup import BeautifulSoup, NavigableString
2
import re
3
import sys
4
import  datetime
5
import grequests
6
import re
7
 
8
invalid_tags = ['b', 'i', 'u']
9
bestSellers = []
10
 
11
def strip_tags(html, invalid_tags):
    """Parse *html* and replace every tag whose name is in *invalid_tags*
    with its (recursively stripped) text content.

    html         -- markup string to clean
    invalid_tags -- list of tag names (e.g. ['b', 'i', 'u']) to flatten
    Returns the BeautifulSoup parse tree with the offending tags flattened.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    # Search directly for the tags we intend to remove instead of walking
    # every node with findAll(True) and filtering by name in Python.
    for tag in soup.findAll(invalid_tags):
        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                # Nested markup inside the invalid tag: strip it recursively
                # before flattening to text.
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))
        # join() avoids the quadratic cost of repeated += concatenation.
        tag.replaceWith(u"".join(pieces))

    return soup
26
 
27
class AmazonAsyncScraper:
28
    def __init__(self):
29
        self.count_trials = 0
30
 
31
    def read(self, urls, findStore):
32
        returnMap = {}
33
        print datetime.datetime.now()
12411 kshitij.so 34
        header = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1',
35
                  'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
36
                  'Accept-Encoding':'gzip,deflate,sdch'
37
                  }
38
 
12410 kshitij.so 39
        rs = (grequests.get(u, headers=header) for u in urls)
12363 kshitij.so 40
        for x in grequests.map(rs):
12410 kshitij.so 41
            soup = strip_tags(x.text,invalid_tags)
12363 kshitij.so 42
            for tag in soup.findAll(True):
43
                if tag.name in invalid_tags:
44
                    s = ""
45
 
46
                    for c in tag.contents:
47
                        if not isinstance(c, NavigableString):
48
                            c = strip_tags(unicode(c), invalid_tags)
49
                        s += unicode(c)
50
 
51
                    tag.replaceWith(s)
52
            x.close()
53
            sellerCount=0
54
            info = []
55
            sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})
12396 kshitij.so 56
            dataLength = len(sellerData)
12410 kshitij.so 57
            print dataLength
12363 kshitij.so 58
            for data in sellerData:
59
                tempMap={}
60
                price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').text
61
                unitCost = float(price.replace("Rs.","").replace(",",""))
62
                shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').text
63
                if "FREE" in shippingCost:
64
                    shippingCost = 0
65
                else:
12402 kshitij.so 66
                    #print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
12363 kshitij.so 67
                    shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
68
 
69
                sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
70
                store=""
71
                storeUrl=""
72
                if findStore:
73
                    storeUrl = sellerColumn.find('a')['href']
74
                    temp =  sellerColumn.find('a')
75
                    store = temp.text
76
                    if len(store)==0:
77
                        print storeUrl
78
                        dom_in = storeUrl.find("www.amazon.in")
79
                        print dom_in
80
                        if dom_in ==-1:
81
                            storeUrl="http://amazon.in"+storeUrl
82
                        if storeUrl[storeUrl.rfind('/')+1:]=='AF6E3O0VE0X4D':
83
                            store = 'Saholic'
84
                    if len(store)!=0:
85
                        tempMap['isStoreFront']='True'
86
                    else:
87
                        tempMap['isStoreFront']='False'
88
                    tempMap['storeUrl'] =storeUrl
89
                asinind = x.url.index("offer-listing")
90
                refind = x.url.index("/ref=olp_sort_ps")
91
                asin = x.url[asinind+14:refind].strip()
92
                sellerCount+=1
93
                if sellerCount==1:
94
                    tempMap['sellerName'] = store.strip()
95
                    tempMap['sellerPrice'] = unitCost+shippingCost
96
                if sellerCount==2:
97
                    tempMap['sellerName'] = store.strip()
98
                    tempMap['sellerPrice'] = unitCost+shippingCost
99
                if sellerCount==3:
100
                    tempMap['sellerName'] = store.strip()
101
                    tempMap['sellerPrice'] = unitCost+shippingCost
102
                info.append(tempMap) 
12396 kshitij.so 103
                if sellerCount==3 or sellerCount==dataLength:
12363 kshitij.so 104
                    returnMap[asin] = info 
105
                    break
106
        if findStore:
107
            return self.findStoreFront(returnMap)
108
        else:
109
            return returnMap
110
 
111
    def findStoreFront(self,returnMap):
112
        storeFront={}
113
        for arr in returnMap.itervalues():
12410 kshitij.so 114
            print "arr is ",arr
12363 kshitij.so 115
            for dic in arr:
12410 kshitij.so 116
                print "dic ",dic
12363 kshitij.so 117
                if dic['isStoreFront']!='True':
118
                    storeFront[dic.get('storeUrl')] =''
119
        rs = (grequests.get(u,stream=False) for u in storeFront.keys())
120
        for x in grequests.map(rs):
121
            soup = strip_tags(x.text,invalid_tags)
122
            x.close
123
            #print x.url.rfind('&me=')
124
            #print x.url[x.url.rfind('&me='):].rfind('&')
125
            mId= x.url[x.url.rfind('&me=')+4:x.url[x.url.rfind('&me='):].rfind('&')+x.url.rfind('&me=')]
126
            sellerName = soup.title.string
127
            #print mId
128
            try:
129
                ind = sellerName.index("@ Amazon.in")
130
                sellerName = sellerName[0:ind].strip()
131
            except:
132
                try:
133
                    ind = sellerName.split(":")
134
                    sellerName = ind[1].strip()
135
                except:
136
                    sellerName =""
137
            #storeFront[re.compile('*'+mId+'.*')] = sellerName
138
            #print mId
139
            #print sellerName
140
            myRe = re.compile('.*'+mId+'.*')
141
            for key in storeFront:
142
                if myRe.match(key):
143
                    #print "Match found ",key
144
                    storeFront[key] = sellerName.strip()
145
            #storeFront.get(re.compile('.*'+mId+'.*'))
146
        for arr in returnMap.itervalues():
147
            #print "arr is ",arr
148
            for dic in arr:
149
                #print "dic ",dic
150
                if dic['isStoreFront']!='True':
151
                    dic['sellerName'] =storeFront.get(dic.get('storeUrl'))
152
                    dic['isStoreFront']='True'
153
 
154
        print "********"
155
        return returnMap
156
 
157
 
158
#        rs = (grequests.get(u,stream=False) for u in urls)
159
#        for x in grequests.map(rs):
160
        #return soup.title.string
161
 
162
 
163
if __name__ == '__main__':
164
    urls=[]
12410 kshitij.so 165
    urls.append("http://amazon.in/gp/offer-listing/B007VZFZO8/ref=olp_sort_ps")
12363 kshitij.so 166
#    asin = []
167
#    for a in amazonlisted:
168
#        asin.append(a.asin)
169
#        urls.append('http://www.amazon.in/gp/offer-listing/'+str(a.asin)+'/ref=olp_sort_ps')
170
#        if len(urls)==50:
171
#            break
172
    print urls
173
    scraper = AmazonAsyncScraper()
12396 kshitij.so 174
    'http://www.amazon.in/gp/offer-listing/B003SNIN9Q/ref=olp_sort_ps'
12363 kshitij.so 175
    print len(urls)
176
    x = scraper.read(urls,True)
177
    print x
178
    print "##################"
179
#    fetched = x.items()
180
#    print list(set(asin) - set(fetched))
181
#    for a,i in x.iteritems():
182
#        print a
183
#        for data in i:
184
#            print data
185
#        print "*********"
186
    #print scraper.createData()
187
    print datetime.datetime.now()