# Rev 15895 | Rev 16021 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scrape the ShopClues mobile best-seller listing and sync it into MongoDB.

For every product on the listing pages the script reads live price/stock
details through ``ShopCluesScraper``, updates the matching documents in
``Catalog.MasterData`` and ``Catalog.Deals``, and recomputes which offer of
the product's bundle is flagged as the deal to show.  Listing entries that
are unknown to ``MasterData`` are bundled with an existing product whose
``marketPlaceUrl`` matches, or collected in ``exceptionList``.

This is Python 2 code (``urllib2``, BeautifulSoup 3).
"""
import optparse
import re
import smtplib
import traceback
import urllib2
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import pymongo
from BeautifulSoup import BeautifulSoup

from dtr.storage.MemCache import MemCache
from dtr.utils import ShopCluesScraper
from dtr.utils.utils import to_java_date, getNlcPoints

# Lazily created, process-wide MongoDB client (see get_mongo_connection).
con = None

parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                  default="localhost",
                  type="string", help="The HOST where the mongo server is running",
                  metavar="mongo_host")

(options, args) = parser.parse_args()

SOURCE_MAP = {'AMAZON':1,'FLIPKART':2,'SNAPDEAL':3,'SAHOLIC':4, 'SHOPCLUES.COM':5}

# Listing entries that could not be matched to any MasterData document.
exceptionList = []
bestSellers = []

# Filled with (page, page - 1) by scrapeBestSellers.
baseUrl = "http://m.shopclues.com/products/getProductList/mobiles:top-selling-mobiles-and-tablets.html/%s/page=%s"
headers = {
    'User-Agent':'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language' : 'en-US,en;q=0.8',
    'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection':'keep-alive'
}

now = datetime.now()
mc = MemCache(options.mongoHost)
sc = ShopCluesScraper.ShopCluesScraper()


class __ProductInfo:
    """Plain record describing one scraped listing entry."""

    def __init__(self, identifier, rank, url, available_price, in_stock,
                 codAvailable, source_product_name, thumbnail, coupon):
        self.identifier = identifier
        self.rank = rank
        self.url = url
        self.available_price = available_price
        self.in_stock = in_stock
        self.codAvailable = codAvailable
        self.source_product_name = source_product_name
        self.thumbnail = thumbnail
        self.coupon = coupon


def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared ``pymongo.MongoClient``, creating it on first use.

    Returns ``None`` (after printing the error) when the client cannot be
    created; callers that dereference the result will then fail.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def getSoupObject(url):
    """Fetch ``url`` and parse it with BeautifulSoup, retrying on failure.

    Up to nine attempts are made.  The body is decoded as UTF-8 first; if
    that (or parsing the decoded text) fails, the raw bytes are parsed
    instead.

    Returns the parsed soup, or ``None`` when every attempt failed.
    """
    print("Getting soup object for")
    print(url)
    # Kept as a module-level global because the original exposed it that way.
    global RETRY_COUNT
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                # Release the connection even when read() raises.
                response.close()
            try:
                page = response_data.decode("utf-8")
                soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                traceback.print_exc()
                soup = BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
            return soup
        except Exception:
            traceback.print_exc()
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    return None


def scrapeBestSellers():
    """Walk the best-seller listing page by page and sync every product.

    ``rank`` is the 1-based position of a product across all pages.  Paging
    stops at the first page without ``pd-list-cont`` divs, or when a page
    cannot be fetched at all.
    """
    global bestSellers
    bestSellers = []
    rank = 0
    page = 1
    while True:
        url = baseUrl % (page, page - 1)
        soup = getSoupObject(url)
        if soup is None:
            # Every retry failed; without a page there is nothing to rank.
            print("Unable to fetch %s" % (url))
            return
        productDivs = soup.findAll('div', {'class': 'pd-list-cont'})
        if not productDivs:
            return
        for productDiv in productDivs:
            rank = rank + 1
            info_tag = productDiv.find('a')
            link = info_tag['href']
            scin = info_tag['data-id'].strip()
            print(link)
            print(scin)
            productName = productDiv.find('div', {'class': 'pdt-name'}).string
            try:
                productInfo = sc.read(link)
            except Exception:
                # Skip this product; its rank stays consumed.
                traceback.print_exc()
                continue
            catalog = get_mongo_connection().Catalog
            product = list(catalog.MasterData.find({'source_id': 5, 'identifier': scin}))
            if len(product) > 0:
                productId = product[0]['_id']
                timestamp = to_java_date(datetime.now())
                if productInfo['inStock'] == 1:
                    # All fields must live inside $set: an update document
                    # may not mix operators with plain field names.
                    catalog.MasterData.update(
                        {'_id': productId},
                        {"$set": {'rank': rank,
                                  'available_price': productInfo['price'],
                                  'in_stock': productInfo['inStock'],
                                  'codAvailable': productInfo['isCod'],
                                  'coupon': productInfo['coupon'],
                                  'updatedOn': timestamp,
                                  'priceUpdatedOn': timestamp}})
                    catalog.Deals.update(
                        {'_id': productId},
                        {'$set': {'available_price': productInfo['price'],
                                  'in_stock': productInfo['inStock'],
                                  'codAvailable': productInfo['isCod']}})
                else:
                    catalog.MasterData.update(
                        {'_id': productId},
                        {'$set': {'updatedOn': timestamp,
                                  'in_stock': 0,
                                  'priceUpdatedOn': timestamp}})
                    catalog.Deals.update({'_id': productId}, {'$set': {'in_stock': 0}})
                try:
                    recomputeDeal(product[0])
                except Exception:
                    print("Unable to compute deal for %s" % (product[0]['skuBundleId']))
            else:
                # Lets bundle product by finding similar url pattern
                uri = link.replace('http://m.shopclues.com', '').replace(".html", "")
                try:
                    # Drop a trailing "-<number>" segment when there is one.
                    int(uri[uri.rfind('-') + 1:])
                    uri = uri[:uri.rfind('-')]
                except ValueError:
                    pass
                # NOTE(review): uri is used as an unescaped regular
                # expression; confirm whether re.escape() is wanted here.
                product = list(catalog.MasterData.find(
                    {'source_id': 5, 'marketPlaceUrl': {'$regex': uri}}))
                toBundle = __ProductInfo(scin, rank, link, productInfo['price'],
                                         productInfo['inStock'], productInfo['isCod'],
                                         productName, "", productInfo['coupon'])
                if len(product) > 0:
                    bundleNewProduct(product[0], toBundle)
                else:
                    exceptionList.append(toBundle)
        page = page + 1


def populateNegativeDeals():
    """Cache the distinct ``sku`` values of ``Catalog.NegativeDeals``.

    The list is stored in ``mc`` under ``"negative_deals"``; the third
    argument (600) is presumably the expiry in seconds -- confirm against
    ``MemCache.set``.
    """
    negativeDeals = get_mongo_connection().Catalog.NegativeDeals.find().distinct('sku')
    mc.set("negative_deals", negativeDeals, 600)


def recomputePoints(item, deal):
    """Refresh ``nlcPoints``, ``dealPoints`` and ``totalPoints`` of ``deal``.

    ``item`` is the MasterData document and ``deal`` the matching Deals
    document.  When ``getNlcPoints`` fails, the deal's stored ``nlcPoints``
    are kept.  ``dealPoints`` are granted only while the available price is
    at or below the item's ``manualDealThresholdPrice``.
    """
    try:
        nlcPoints = getNlcPoints(item, deal['minNlc'], deal['maxNlc'], deal['available_price'])
    except Exception:
        traceback.print_exc()
        nlcPoints = deal['nlcPoints']
    if item['manualDealThresholdPrice'] >= deal['available_price']:
        dealPoints = item['dealPoints']
    else:
        dealPoints = 0
    # Swap the old nlc/deal contributions for the new ones in the total.
    totalPoints = deal['totalPoints'] - deal['nlcPoints'] + nlcPoints - deal['dealPoints'] + dealPoints
    get_mongo_connection().Catalog.Deals.update(
        {'_id': deal['_id']},
        {"$set": {'totalPoints': totalPoints,
                  'nlcPoints': nlcPoints,
                  'dealPoints': dealPoints,
                  'manualDealThresholdPrice': item['manualDealThresholdPrice']}})


def _hideDeal(dealId):
    """Clear both the ``showDeal`` and ``prepaidDeal`` flags of one deal."""
    get_mongo_connection().Catalog.Deals.update(
        {'_id': dealId}, {'$set': {'showDeal': 0, 'prepaidDeal': 0}})


def _isBetterOffer(candidate, bestPrice, bestSellerPoints):
    """Return True when ``candidate`` beats the current best offer.

    A lower price wins; on equal price the higher ``bestSellerPoints`` wins.
    """
    price = candidate['available_price']
    if price < bestPrice:
        return True
    return price == bestPrice and bestSellerPoints < candidate['bestSellerPoints']


def recomputeDeal(item):
    """Lets recompute deal for this bundle

    Among all Deals documents sharing ``item``'s ``skuBundleId``, pick the
    best offer with ``codAvailable == 1`` and the best offer without it.
    Offers that are out of stock, have no ``maxprice``, are priced above
    ``maxprice``, are listed in the negative-deals cache, or are ShopClues
    offers with rank 0 get both flags cleared and are not considered.
    """
    print("Recomputing for bundleId %d" % (item.get('skuBundleId')))
    skuBundleId = item['skuBundleId']
    similarItems = list(get_mongo_connection().Catalog.Deals.find(
        {'skuBundleId': skuBundleId}).sort([('available_price', pymongo.ASCENDING)]))
    bestPrice = float("inf")
    bestOne = None
    bestSellerPoints = 0
    toUpdate = []
    prepaidBestPrice = float("inf")
    prepaidBestOne = None
    prepaidBestSellerPoints = 0
    for similarItem in similarItems:
        if similarItem['_id'] == item['_id']:
            try:
                recomputePoints(item, similarItem)
            except Exception:
                traceback.print_exc()

        # The cache entry may be missing or expired; repopulate before use.
        negativeDeals = mc.get("negative_deals")
        if negativeDeals is None:
            populateNegativeDeals()
            negativeDeals = mc.get("negative_deals")

        if (similarItem['in_stock'] == 0
                or similarItem['maxprice'] is None
                or similarItem['maxprice'] < similarItem['available_price']
                or similarItem['_id'] in negativeDeals):
            _hideDeal(similarItem['_id'])
            continue
        if similarItem['source_id'] == SOURCE_MAP.get('SHOPCLUES.COM') and similarItem['rank'] == 0:
            _hideDeal(similarItem['_id'])
            continue

        if similarItem['codAvailable'] == 1:
            if _isBetterOffer(similarItem, bestPrice, bestSellerPoints):
                bestOne = similarItem
                bestPrice = similarItem['available_price']
                bestSellerPoints = similarItem['bestSellerPoints']
        else:
            if _isBetterOffer(similarItem, prepaidBestPrice, prepaidBestSellerPoints):
                prepaidBestOne = similarItem
                prepaidBestPrice = similarItem['available_price']
                prepaidBestSellerPoints = similarItem['bestSellerPoints']

    # NOTE(review): flags are only rewritten when BOTH a COD and a prepaid
    # winner exist; with just one of them nothing is flagged or reset.
    # Behaviour kept as found -- confirm whether "or" was intended.
    if bestOne is not None and prepaidBestOne is not None:
        toUpdate = [similarItem['_id'] for similarItem in similarItems]
        toUpdate.remove(bestOne['_id'])
        toUpdate.remove(prepaidBestOne['_id'])
        deals = get_mongo_connection().Catalog.Deals
        deals.update({'_id': bestOne['_id']}, {'$set': {'showDeal': 1, 'prepaidDeal': 0}})
        deals.update({'_id': prepaidBestOne['_id']}, {'$set': {'showDeal': 0, 'prepaidDeal': 1}})
    if len(toUpdate) > 0:
        get_mongo_connection().Catalog.Deals.update(
            {'_id': {"$in": toUpdate}},
            {'$set': {'showDeal': 0, 'prepaidDeal': 0}},
            upsert=False, multi=True)


def bundleNewProduct(existingProduct, toBundle):
    """Insert ``toBundle`` into MasterData as a copy of ``existingProduct``.

    ``existingProduct`` is modified in place: it receives the next ``_id``
    (current maximum + 1) and the scraped fields of ``toBundle``, and is
    then inserted as a new document.

    Returns ``{1: 'Data added successfully.'}`` on success and
    ``{0: 'Unable to add data.'}`` when anything fails.
    """
    print("Adding new product")
    try:
        masterData = get_mongo_connection().Catalog.MasterData
        max_id = list(masterData.find().sort([('_id', pymongo.DESCENDING)]).limit(1))
        timestamp = to_java_date(datetime.now())
        existingProduct['_id'] = max_id[0]['_id'] + 1
        existingProduct['addedOn'] = timestamp
        existingProduct['available_price'] = toBundle.available_price
        existingProduct['updatedOn'] = timestamp
        existingProduct['codAvailable'] = toBundle.codAvailable
        existingProduct['coupon'] = toBundle.coupon
        existingProduct['identifier'] = toBundle.identifier
        existingProduct['in_stock'] = toBundle.in_stock
        existingProduct['marketPlaceUrl'] = toBundle.url
        existingProduct['rank'] = toBundle.rank
        existingProduct['source_product_name'] = toBundle.source_product_name
        existingProduct['url'] = toBundle.url
        masterData.insert(existingProduct)
        return {1: 'Data added successfully.'}
    except Exception as e:
        print(e)
        return {0: 'Unable to add data.'}


def exceptionItems():
    """Print the attributes of every listing entry that could not be matched."""
    for item in exceptionList:
        print(vars(item))


def main():
    """Scrape the listing, then report the unmatched entries."""
    scrapeBestSellers()
    exceptionItems()


if __name__ == '__main__':
    main()