# Rev 12402 | Rev 12411 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
from BeautifulSoup import BeautifulSoup, NavigableStringimport reimport sysimport datetimeimport grequestsimport reinvalid_tags = ['b', 'i', 'u']bestSellers = []def strip_tags(html, invalid_tags):soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)for tag in soup.findAll(True):if tag.name in invalid_tags:s = ""for c in tag.contents:if not isinstance(c, NavigableString):c = strip_tags(unicode(c), invalid_tags)s += unicode(c)tag.replaceWith(s)return soupclass AmazonAsyncScraper:def __init__(self):self.count_trials = 0def read(self, urls, findStore):returnMap = {}print datetime.datetime.now()header = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1'}rs = (grequests.get(u, headers=header) for u in urls)for x in grequests.map(rs):soup = strip_tags(x.text,invalid_tags)print soupfor tag in soup.findAll(True):if tag.name in invalid_tags:s = ""for c in tag.contents:if not isinstance(c, NavigableString):c = strip_tags(unicode(c), invalid_tags)s += unicode(c)tag.replaceWith(s)x.close()sellerCount=0info = []sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})dataLength = len(sellerData)print dataLengthfor data in sellerData:tempMap={}price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').textunitCost = float(price.replace("Rs.","").replace(",",""))shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').textif "FREE" in shippingCost:shippingCost = 0else:#print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))sellerColumn = data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})store=""storeUrl=""if findStore:storeUrl = sellerColumn.find('a')['href']temp = sellerColumn.find('a')store = temp.textif len(store)==0:print storeUrldom_in = storeUrl.find("www.amazon.in")print dom_inif dom_in 
==-1:storeUrl="http://amazon.in"+storeUrlif storeUrl[storeUrl.rfind('/')+1:]=='AF6E3O0VE0X4D':store = 'Saholic'if len(store)!=0:tempMap['isStoreFront']='True'else:tempMap['isStoreFront']='False'tempMap['storeUrl'] =storeUrlasinind = x.url.index("offer-listing")refind = x.url.index("/ref=olp_sort_ps")asin = x.url[asinind+14:refind].strip()sellerCount+=1if sellerCount==1:tempMap['sellerName'] = store.strip()tempMap['sellerPrice'] = unitCost+shippingCostif sellerCount==2:tempMap['sellerName'] = store.strip()tempMap['sellerPrice'] = unitCost+shippingCostif sellerCount==3:tempMap['sellerName'] = store.strip()tempMap['sellerPrice'] = unitCost+shippingCostinfo.append(tempMap)if sellerCount==3 or sellerCount==dataLength:returnMap[asin] = infobreakif findStore:return self.findStoreFront(returnMap)else:return returnMapdef findStoreFront(self,returnMap):storeFront={}for arr in returnMap.itervalues():print "arr is ",arrfor dic in arr:print "dic ",dicif dic['isStoreFront']!='True':storeFront[dic.get('storeUrl')] =''rs = (grequests.get(u,stream=False) for u in storeFront.keys())for x in grequests.map(rs):soup = strip_tags(x.text,invalid_tags)x.close#print x.url.rfind('&me=')#print x.url[x.url.rfind('&me='):].rfind('&')mId= x.url[x.url.rfind('&me=')+4:x.url[x.url.rfind('&me='):].rfind('&')+x.url.rfind('&me=')]sellerName = soup.title.string#print mIdtry:ind = sellerName.index("@ Amazon.in")sellerName = sellerName[0:ind].strip()except:try:ind = sellerName.split(":")sellerName = ind[1].strip()except:sellerName =""#storeFront[re.compile('*'+mId+'.*')] = sellerName#print mId#print sellerNamemyRe = re.compile('.*'+mId+'.*')for key in storeFront:if myRe.match(key):#print "Match found ",keystoreFront[key] = sellerName.strip()#storeFront.get(re.compile('.*'+mId+'.*'))for arr in returnMap.itervalues():#print "arr is ",arrfor dic in arr:#print "dic ",dicif dic['isStoreFront']!='True':dic['sellerName'] =storeFront.get(dic.get('storeUrl'))dic['isStoreFront']='True'print "********"return 
returnMap# rs = (grequests.get(u,stream=False) for u in urls)# for x in grequests.map(rs):#return soup.title.stringif __name__ == '__main__':urls=[]urls.append("http://amazon.in/gp/offer-listing/B007VZFZO8/ref=olp_sort_ps")# asin = []# for a in amazonlisted:# asin.append(a.asin)# urls.append('http://www.amazon.in/gp/offer-listing/'+str(a.asin)+'/ref=olp_sort_ps')# if len(urls)==50:# breakprint urlsscraper = AmazonAsyncScraper()'http://www.amazon.in/gp/offer-listing/B003SNIN9Q/ref=olp_sort_ps'print len(urls)x = scraper.read(urls,True)print xprint "##################"# fetched = x.items()# print list(set(asin) - set(fetched))# for a,i in x.iteritems():# print a# for data in i:# print data# print "*********"#print scraper.createData()print datetime.datetime.now()