# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 12396 | Go to most recent revision | Details | Last modification | View Log | RSS feed
#
# Rev Author Line No. Line
# 12363 kshitij.so 1
from BeautifulSoup import BeautifulSoup, NavigableString
2
import re
3
import sys
4
import  datetime
5
import grequests
6
import re
7
 
8
# Names of the formatting tags whose markup is removed (contents kept) by
# strip_tags() and by AmazonAsyncScraper before pages are queried.
invalid_tags = ['b', 'i', 'u']
# Module-level list; not referenced by any other code in this module as
# reviewed. Kept because importers may still read it.
bestSellers = []
10
 
11
def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag whose name is in *invalid_tags*.

    Each matching tag is replaced by the concatenated text of its contents.
    Child tags are first serialised and run through strip_tags() again, so
    nested matching tags are unwrapped as well.

    :param html: markup to parse (HTML entities are converted).
    :param invalid_tags: collection of tag names to unwrap.
    :returns: the BeautifulSoup tree, modified in place.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in soup.findAll(True):
        if tag_is_kept(node, invalid_tags):
            continue

        pieces = []
        for child in node.contents:
            if isinstance(child, NavigableString):
                pieces.append(unicode(child))
            else:
                # Non-text children are re-parsed from their serialised form.
                cleaned = strip_tags(unicode(child), invalid_tags)
                pieces.append(unicode(cleaned))

        node.replaceWith("".join(pieces))

    return soup


def tag_is_kept(node, invalid_tags):
    """Return True when *node*'s tag name is not listed in *invalid_tags*."""
    return node.name not in invalid_tags
26
 
27
class AmazonAsyncScraper:
    """Fetch Amazon.in offer-listing pages concurrently and extract offers.

    read() downloads the given offer-listing URLs with grequests and returns
    a mapping ``{asin: [offer, offer, offer]}``. When seller names have to be
    resolved, findStoreFront() downloads the seller pages of offers whose
    listing showed no seller text and fills the names in from the page title.

    All print() calls take a single argument so the output is the same
    whether the module runs with Python 2's print statement semantics or not.
    """

    # Number of offers collected per product page.
    _MAX_OFFERS = 3
    # Seller id that is mapped to the store name 'Saholic' without a lookup.
    _SAHOLIC_SELLER_ID = 'AF6E3O0VE0X4D'
    # CSS-class matchers, compiled once instead of once per offer.
    _OFFER_PRICE_CLASS = re.compile('.*olpOfferPrice*')
    _SHIPPING_INFO_CLASS = re.compile('.*olpShippingInfo*')
    _SELLER_NAME_CLASS = re.compile('.*olpSellerName*')

    def __init__(self):
        # Not read anywhere else in this class.
        self.count_trials = 0

    def read(self, urls, findStore):
        """Download offer-listing pages and collect the first three offers.

        :param urls: iterable of offer-listing URLs; each must contain
            ``offer-listing/<ASIN>/ref=olp_sort_ps``.
        :param findStore: when true, seller links are inspected and the
            result is passed through findStoreFront() before returning.
        :returns: dict mapping ASIN to a list of three offer dicts with the
            keys ``sellerName`` and ``sellerPrice`` (plus ``isStoreFront``
            and ``storeUrl`` when *findStore* is true). Pages that list
            fewer than three offers are not included.
        :raises ValueError: if a page has offers but its URL lacks the
            expected ``offer-listing`` / ``/ref=olp_sort_ps`` markers.
        """
        returnMap = {}
        print(datetime.datetime.now())

        urls = list(urls)
        responses = []
        if urls:
            # Skip the network layer entirely when there is nothing to fetch.
            rs = (grequests.get(u, stream=False) for u in urls)
            responses = grequests.map(rs)

        for x in responses:
            if x is None:
                # grequests.map() yields None for requests that failed.
                continue
            try:
                soup = strip_tags(x.text, invalid_tags)
            finally:
                x.close()

            sellerData = soup.findAll(
                "div", {"class": "a-row a-spacing-mini olpOffer"})
            if not sellerData:
                continue

            asin = self._asin_from_url(x.url)
            info = []
            for data in sellerData:
                info.append(self._parse_offer(data, findStore))
                if len(info) == self._MAX_OFFERS:
                    # Only complete sets of three offers are reported.
                    returnMap[asin] = info
                    break

        if findStore:
            return self.findStoreFront(returnMap)
        return returnMap

    @staticmethod
    def _asin_from_url(url):
        """Return the ASIN between 'offer-listing/' and '/ref=olp_sort_ps'."""
        # 14 == len("offer-listing/")
        start = url.index("offer-listing") + 14
        end = url.index("/ref=olp_sort_ps")
        return url[start:end].strip()

    def _parse_offer(self, data, findStore):
        """Build the offer dict for one 'olpOffer' row of a listing page."""
        price = data.find(
            'span', attrs={'class': self._OFFER_PRICE_CLASS}).find('span').text
        unitCost = float(price.replace("Rs.", "").replace(",", ""))

        shippingText = data.find(
            'p', attrs={'class': self._SHIPPING_INFO_CLASS}).find('span').text
        if "FREE" in shippingText:
            shippingCost = 0
        else:
            # Thousands separators are removed just as for the unit price.
            shippingCost = float(
                shippingText.replace("+Rs.", "")
                .replace("Delivery", "")
                .replace(",", ""))
            print("shippingCost=  %s" % shippingCost)

        offer = {}
        store = ""
        if findStore:
            sellerColumn = data.find(
                'p', attrs={'class': self._SELLER_NAME_CLASS})
            link = sellerColumn.find('a')
            storeUrl = link['href']
            store = link.text
            if len(store) == 0:
                # The link carries no seller text; keep an absolute URL so
                # the seller page can be fetched later by findStoreFront().
                print(storeUrl)
                dom_in = storeUrl.find("www.amazon.in")
                print(dom_in)
                if dom_in == -1:
                    storeUrl = "http://amazon.in" + storeUrl
                sellerId = storeUrl[storeUrl.rfind('/') + 1:]
                if sellerId == self._SAHOLIC_SELLER_ID:
                    store = 'Saholic'
            # 'True' means the seller name is already known.
            offer['isStoreFront'] = 'True' if len(store) != 0 else 'False'
            offer['storeUrl'] = storeUrl

        offer['sellerName'] = store.strip()
        offer['sellerPrice'] = unitCost + shippingCost
        return offer

    @staticmethod
    def _seller_name_from_title(title):
        """Extract the seller name from a seller page title.

        Uses the text before '@ Amazon.in' when present, otherwise the text
        after the first ':'; returns '' when neither applies or the title is
        missing.
        """
        if not title:
            return ""
        marker = title.find("@ Amazon.in")
        if marker != -1:
            return title[:marker].strip()
        parts = title.split(":")
        if len(parts) > 1:
            return parts[1].strip()
        return ""

    def findStoreFront(self, returnMap):
        """Fill in seller names for offers whose name is still unknown.

        Every offer whose ``isStoreFront`` is not ``'True'`` has its
        ``storeUrl`` fetched; the seller name is taken from the page title,
        written to ``sellerName`` and the offer is marked ``'True'``.

        :param returnMap: dict as built by read() with findStore enabled.
        :returns: the same *returnMap* object, updated in place.
        """
        storeFront = {}
        for arr in returnMap.values():
            print("arr is  %s" % (arr,))
            for dic in arr:
                print("dic  %s" % (dic,))
                if dic['isStoreFront'] != 'True':
                    storeFront[dic.get('storeUrl')] = ''

        if storeFront:
            storeUrls = list(storeFront)
            rs = (grequests.get(u, stream=False) for u in storeUrls)
            # grequests.map() returns responses in request order, so each
            # response is paired with the URL that was requested for it.
            for storeUrl, x in zip(storeUrls, grequests.map(rs)):
                if x is None:
                    # Failed request: leave the placeholder name in place.
                    continue
                try:
                    soup = strip_tags(x.text, invalid_tags)
                finally:
                    x.close()
                title = soup.title.string if soup.title is not None else None
                storeFront[storeUrl] = self._seller_name_from_title(title)

        for arr in returnMap.values():
            for dic in arr:
                if dic['isStoreFront'] != 'True':
                    dic['sellerName'] = storeFront.get(dic.get('storeUrl'))
                    dic['isStoreFront'] = 'True'

        print("********")
        return returnMap
149
 
150
 
151
#        rs = (grequests.get(u,stream=False) for u in urls)
152
#        for x in grequests.map(rs):
153
        #return soup.title.string
154
 
155
 
156
if __name__ == '__main__':
    # Manual smoke run against live pages. Listing URLs have the form
    # 'http://www.amazon.in/gp/offer-listing/<ASIN>/ref=olp_sort_ps'.
    urls = [
        "http://amazon.in/gp/offer-listing/B00CE2LQSW/ref=olp_sort_ps",
        "http://amazon.in/gp/offer-listing",
    ]
    print(urls)
    scraper = AmazonAsyncScraper()
    print(len(urls))
    # Second argument enables seller-name resolution.
    x = scraper.read(urls, True)
    print(x)
    print("##################")
    print(datetime.datetime.now())