# Rev 14157 | Rev 14178 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
import pymongo
from dtr.utils.utils import to_java_date
from datetime import datetime, timedelta
from operator import itemgetter
from dtr.utils import FlipkartScraper, NewFlipkartScraper
from multiprocessing.dummy import Pool as ThreadPool

# NOTE: every print below is written as print(<single expression>) so that it
# behaves identically as a Python 2 print statement and still parses as a
# Python 3 function call.

# Shared MongoClient, created lazily by get_mongo_connection().
con = None

# Seller-listing URL pattern; '%s' is filled with the item's identifier.
_FLIPKART_PS_URL = "http://www.flipkart.com/ps/%s"

# Number of attempts made against the seller-listing page before giving up.
_MAX_FETCH_ATTEMPTS = 3


def get_mongo_connection(host='localhost', port=27017):
    """Return the module-wide MongoClient, creating it on first use.

    ``host`` and ``port`` are only used by the call that actually creates the
    client; later calls return the cached client whatever is passed.

    Returns None (after printing the error) when the client cannot be
    created.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def populate():
    """Collect the items to scrape and process them on a thread pool.

    For every MasterData document with ``rank`` > 0, all MasterData documents
    with the same ``skuBundleId`` and ``source_id`` 2 are gathered,
    de-duplicated by ``_id``, and handed to scrapeFlipkart on 50 threads.
    """
    masterData = get_mongo_connection().Catalog.MasterData
    toScrapMap = {}
    bestSellers = list(masterData.find({'rank': {'$gt': 0}}))
    for bestSeller in bestSellers:
        candidates = masterData.find(
            {'skuBundleId': bestSeller['skuBundleId'], 'source_id': 2})
        for data in candidates:
            # The first document seen for an _id wins.
            toScrapMap.setdefault(data['_id'], data)

    for k, y in toScrapMap.items():
        print("%s \t%s" % (k, y))

    pool = ThreadPool(50)
    pool.map(scrapeFlipkart, list(toScrapMap.values()))
    pool.close()
    pool.join()
    print("joining threads")
    print(datetime.now())


def _scrape_product_page(scraper, data):
    """Return (lowestSp, inStock) read from the item's own product page.

    Returns (0, 0) when the item has no dedicated product URL, when the page
    yields no price, or when scraping fails; failures are printed, not raised.
    """
    try:
        url = data['marketPlaceUrl']
        # Only a non-empty URL that differs from the generic seller-listing
        # URL is worth a product-page request.  The original joined these two
        # tests with 'or', which is always true.
        if url != "" and url != _FLIPKART_PS_URL % (data['identifier']):
            result = scraper.read(url)
            price = result.get('lowestSp')
            # A missing (None) price is treated like 0 so that the caller
            # falls back to the seller listing.
            if price:
                return price, result.get('inStock')
    except Exception:
        print("Unable to scrape product page  %s" % (data['identifier'],))
    return 0, 0


def _fetch_vendors(scraper, url):
    """Read the seller listing at ``url``, trying up to three times.

    Returns whatever ``scraper.read`` returns, or None when every attempt
    raised.  Each failure is printed.
    """
    for attempt in range(_MAX_FETCH_ATTEMPTS):
        try:
            return scraper.read(url)
        except Exception as e:
            print("***Retry count  %s" % (attempt,))
            print(e)
    return None


def _store_price(data, lowestSp, inStock):
    """Write the scraped price and stock flag to MasterData and Deals.

    ``available_price`` is only written when ``lowestSp`` is positive; the
    stock flag (and, on MasterData, the timestamps) is written either way.
    """
    catalog = get_mongo_connection().Catalog
    now = to_java_date(datetime.now())
    masterFields = {'updatedOn': now, 'priceUpdatedOn': now, 'in_stock': inStock}
    dealFields = {'in_stock': inStock}
    if lowestSp > 0:
        masterFields['available_price'] = lowestSp
        dealFields['available_price'] = lowestSp
    catalog.MasterData.update({'_id': data['_id']}, {'$set': masterFields}, multi=True)
    catalog.Deals.update({'_id': data['_id']}, {'$set': dealFields}, multi=True)


def scrapeFlipkart(data):
    """Refresh price and stock for one MasterData document, then its deal.

    ``data`` is a MasterData document (dict).  Documents whose ``source_id``
    is not 2, whose ``identifier`` is None or blank, or whose
    ``priceUpdatedOn`` is newer than five minutes ago are skipped.

    The product page (``marketPlaceUrl``) is tried first.  If it yields no
    price, the seller listing for the identifier is read and its cheapest
    ``sellingPrice`` is used.  The result is stored in MasterData and Deals,
    and the deal for the document's ``skuBundleId`` is recomputed.
    """
    if data['source_id'] != 2:
        return
    print(str(data['identifier']))
    if data['identifier'] is None or len(data['identifier'].strip()) == 0:
        print("returning in valid identifier")
        return

    try:
        if data['priceUpdatedOn'] > to_java_date(datetime.now() - timedelta(minutes=5)):
            print("sku id is already updated %s" % (data['_id'],))
            return
    except Exception:
        # Best effort: a document without a usable priceUpdatedOn is simply
        # treated as stale and scraped.
        pass

    scraperFk = FlipkartScraper.FlipkartScraper()
    scraperProductPage = NewFlipkartScraper.FlipkartProductPageScraper()

    lowestSp, inStock = _scrape_product_page(scraperProductPage, data)

    if lowestSp == 0:
        url = _FLIPKART_PS_URL % (data['identifier'].strip())
        vendorsData = _fetch_vendors(scraperFk, url)
        if vendorsData is None:
            print("Unable to fetch data after multiple tries.Continue for  %s"
                  % (data['identifier'],))
            return
        sortedVendorsData = sorted(vendorsData, key=itemgetter('sellingPrice'))
        print("data %s" % (sortedVendorsData,))
        # The cheapest seller is the first entry after sorting.
        lowestSp = sortedVendorsData[0]['sellingPrice'] if sortedVendorsData else 0
        # NOTE(review): indentation was lost in the recovered source; this
        # assumes "inStock = 1" belonged to the seller-listing fallback and
        # does not override the product page's own inStock value - confirm.
        if lowestSp > 0:
            inStock = 1

    print(lowestSp)
    print(inStock)

    _store_price(data, lowestSp, inStock)

    try:
        recomputeDeal(data['skuBundleId'])
    except Exception:
        print("Unable to compute deal for  %s" % (data['skuBundleId'],))


def _pick_best_deal(candidates):
    """Return the candidate with the lowest ``available_price``, or None.

    Ties on price go to the candidate with strictly more
    ``bestSellerPoints``; otherwise the earlier candidate is kept.
    """
    bestOne = None
    bestPrice = float("inf")
    bestSellerPoints = 0
    for item in candidates:
        price = item['available_price']
        if price < bestPrice or (
                price == bestPrice and bestSellerPoints < item['bestSellerPoints']):
            bestOne = item
            bestPrice = price
            bestSellerPoints = item['bestSellerPoints']
    return bestOne


def recomputeDeal(skuBundleId):
    """Choose which Deals document of a bundle is shown as the deal.

    Documents that are out of stock, have no ``maxprice``, or whose
    ``maxprice`` is below ``available_price`` get ``showDeal`` 0.  Among the
    rest, the cheapest one (ties broken by higher ``bestSellerPoints``) gets
    ``showDeal`` 1 and every other document of the bundle gets ``showDeal`` 0.
    When no document is eligible, nothing further is written.
    """
    print("Recomputing for bundleId %s" % (skuBundleId,))
    deals = get_mongo_connection().Catalog.Deals
    similarItems = list(
        deals.find({'skuBundleId': skuBundleId})
             .sort([('available_price', pymongo.ASCENDING)]))

    eligible = []
    for similarItem in similarItems:
        if (similarItem['in_stock'] == 0
                or similarItem['maxprice'] is None
                or similarItem['maxprice'] < similarItem['available_price']):
            deals.update({'_id': similarItem['_id']}, {'$set': {'showDeal': 0}})
            continue
        eligible.append(similarItem)

    bestOne = _pick_best_deal(eligible)
    if bestOne is None:
        return

    toUpdate = [item['_id'] for item in similarItems if item['_id'] != bestOne['_id']]
    deals.update({'_id': bestOne['_id']}, {'$set': {'showDeal': 1}})
    if toUpdate:
        deals.update({'_id': {"$in": toUpdate}}, {'$set': {'showDeal': 0}},
                     upsert=False, multi=True)


def main():
    """Script entry point: run one full scrape pass."""
    populate()


if __name__ == '__main__':
    main()