# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 13754 | Rev 14379 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

import urllib2
from BeautifulSoup import BeautifulSoup
import pymongo
import re
from dtr.utils.utils import to_java_date
from datetime import datetime

# Cached MongoDB client, created lazily by get_mongo_connection().
con = None
# Accumulates __RankInfo entries for the category currently being scraped.
bestSellers = []
# Single timestamp stamped onto every document touched in this run.
now = datetime.now()

class __RankInfo:
    """Simple value object pairing a product identifier with its best-seller rank."""

    def __init__(self, identifier, rank):
        # identifier: Flipkart product id (the page's data-pid attribute)
        # rank: 1-based position in the best-seller listing
        self.rank = rank
        self.identifier = identifier

def get_mongo_connection(host='localhost', port=27017):
    global con
    if con is None:
        print "Establishing connection %s host and port %d" %(host,port)
        try:
            con = pymongo.MongoClient(host, port)
        except Exception, e:
            print e
            return None
    return con

def getSoupObject(url):
    print "Getting soup object for"
    print url
    global RETRY_COUNT
    RETRY_COUNT = 1 
    while RETRY_COUNT < 10:
        try:
            soup = None
            request = urllib2.Request(url)
            request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
            request.add_header('Connection','keep-alive')
            request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')
        
            response = urllib2.urlopen(request)   
            response_data = response.read()
            response.close()
            try:
                page=response_data.decode("utf-8")
                soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
            except:
                soup = BeautifulSoup(response_data,convertEntities=BeautifulSoup.HTML_ENTITIES)
            if soup is None:
                raise
            return soup
        except Exception as e:
            print e
            print "Retrying"
            RETRY_COUNT = RETRY_COUNT + 1


def scrapeBestSellerMobiles():
    global bestSellers
    rank = 0
    for i in [1, 21, 41, 61, 81]:
        url = "http://www.flipkart.com/mobiles/~bestsellers/pr?p[]=sort=popularity&sid=tyy,4io&start=%d&ajax=true" %(i)
        soup = getSoupObject(url)
        product_divs = soup.findAll('div',{'class':re.compile('.*browse-product')})
        for x in product_divs:
            rank  = rank +1
            print rank,
            print '\t',
            print x['data-pid']
            r_info = __RankInfo(x['data-pid'].strip(),rank)
            bestSellers.append(r_info)

def scrapeBestSellerTablets():
    global bestSellers
    bestSellers = []
    rank = 0
    for i in [1, 21, 41, 61, 81]:
        url = "http://www.flipkart.com/tablets/~bestsellers/pr?p[]=sort=popularity&sid=tyy,hry&start=%d&ajax=true" %(i)
        soup = getSoupObject(url)
        product_divs = soup.findAll('div',{'class':re.compile('.*browse-product')})
        for x in product_divs:
            rank  = rank +1
            print rank,
            print '\t',
            print x['data-pid']
            r_info = __RankInfo(x['data-pid'].strip(),rank)
            bestSellers.append(r_info)

def commitBestSellers():
    for x in bestSellers:
        print x.rank,
        print '\t',
        print x.identifier,
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
        print "count sku",
        print '\t',
        print len(list(col))
        get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip()}, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)

def resetRanks(category):
    """Zero the rank of every previously-ranked Flipkart item in *category*.

    source_id 2 identifies Flipkart-sourced documents in Catalog.MasterData.
    """
    # One multi-document update replaces the original's find + per-_id update
    # loop (N+1 round trips); the filter selects exactly the same documents.
    get_mongo_connection().Catalog.MasterData.update(
        {'rank': {'$gt': 0}, 'source_id': 2, 'category': category},
        {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}},
        multi=True)
        
def main():
    """Scrape Flipkart best-seller ranks and persist them: mobiles, then tablets.

    For each category, old ranks are cleared and new ones committed only when
    the scrape actually produced results.
    """
    scrapeBestSellerMobiles()
    if bestSellers:
        resetRanks('Mobiles')
        commitBestSellers()
    scrapeBestSellerTablets()
    if bestSellers:
        resetRanks('Tablets')
        commitBestSellers()
    
# Script entry point: run the full scrape-and-commit pipeline.
if __name__=='__main__':
    main()