# Repository viewer header (not part of the script): Rev 20347 | Rev 21135 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scrape the ShopClues top-selling mobiles/tablets listing and sync it to Mongo.

For every product tile on the listing pages the script either refreshes the
matching ``Catalog.MasterData`` / ``Catalog.Deals`` documents (rank, price,
stock, COD) or tries to bundle the product with an existing catalog entry that
shares the same URL stem.  A summary mail is sent when anything was bundled or
could not be bundled.

This is a Python 2 script (urllib2, BeautifulSoup 3, legacy pymongo calls).
"""
import optparse
import re
import smtplib
import time
import traceback
import urllib2
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import chardet
import pymongo
from BeautifulSoup import BeautifulSoup

from dtr.storage.MemCache import MemCache
from dtr.utils import ShopCluesScraper
from dtr.utils.utils import to_java_date, getNlcPoints, DEAL_PRIORITY, getCashBack

# Lazily created pymongo client shared by the whole script.
con = None

parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                  default="localhost",
                  type="string", help="The HOST where the mongo server is running",
                  metavar="mongo_host")
parser.add_option("-r", "--reset", dest="reset",
                  default="False", type="string",
                  help="Reset Ranks?")

(options, args) = parser.parse_args()

SOURCE_MAP = {'AMAZON': 1, 'FLIPKART': 2, 'SNAPDEAL': 3, 'SAHOLIC': 4, 'SHOPCLUES.COM': 5, 'PAYTM.COM': 6}
bestSellers = []
# Both placeholders are page numbers: scrapeBestSellers fills them with
# (page, page - 1).
baseUrl = "http://m.shopclues.com/products/getProductList/mobiles:top-selling-mobiles-and-tablets.html/%s/page=%s"
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive'
}
now = datetime.now()
mc = MemCache(options.mongoHost)
sc = ShopCluesScraper.ShopCluesScraper(findThumbnail=True)

# Products bundled during this run / products that could not be bundled.
# Both lists feed the summary mail built by sendMail().
bundledProducts = []
exceptionList = []


class __ProductInfo:
    """Plain record describing one scraped product tile."""

    def __init__(self, identifier, rank, url, available_price, in_stock, codAvailable, source_product_name, thumbnail, coupon):
        self.identifier = identifier
        self.rank = rank
        self.url = url
        self.available_price = available_price
        self.in_stock = in_stock
        self.codAvailable = codAvailable
        self.source_product_name = source_product_name
        self.thumbnail = thumbnail
        self.coupon = coupon


class __NewBundled:
    """Pairs a scraped product with the catalog document it was bundled into."""

    def __init__(self, newProduct, oldProduct):
        self.newProduct = newProduct
        self.oldProduct = oldProduct


def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared MongoClient, creating it on first use.

    Returns None when the client cannot be created; the error is printed.
    Callers in this file dereference the result directly, so a failed
    connection surfaces there as an AttributeError.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def getNetPriceForItem(itemId, source_id, category_id, price):
    """Return ``price`` after subtracting the applicable cash back.

    The cash-back record comes from ``getCashBack``.  Type 1 is treated as a
    percentage of ``price`` and type 2 as a flat amount; when the record has a
    ``maxCashBack`` the cash back is capped at that value.  Any failure while
    reading the record is logged and leaves the price unchanged.
    """
    cash_back_type = 0
    cash_back = 0
    try:
        cashBack = getCashBack(itemId, source_id, category_id, mc, options.mongoHost)
        if cashBack and cashBack.get('cash_back_status') == 1:
            # Work on local copies: the record may be a shared/cached object
            # (mc is passed to getCashBack), so it is not modified here.
            cbType = cashBack['cash_back_type']
            cbValue = cashBack['cash_back']
            maxCashBack = cashBack.get('maxCashBack')
            if cbType in (1, 2) and maxCashBack is not None:
                if cbType == 1 and (float(cbValue) * price) / 100 > maxCashBack:
                    # Percentage exceeds the cap: fall back to the flat cap.
                    cbType = 2
                    cbValue = maxCashBack
                elif cbType == 2 and cbValue > maxCashBack:
                    cbValue = maxCashBack
            cash_back = float(cbValue)
            cash_back_type = cbType
    except Exception:
        # Best effort: a broken cash-back record must not stop the scrape.
        traceback.print_exc()
    if cash_back_type == 1:
        return (price - float(cash_back) * price / 100)
    elif cash_back_type == 2:
        return (price - cash_back)
    else:
        return price


def getSoupObject(url):
    """Fetch ``url`` and return it parsed as a BeautifulSoup document.

    The request is attempted up to 9 times (RETRY_COUNT runs from 1 to 9).
    Returns None when every attempt failed.
    """
    global RETRY_COUNT
    print("Getting soup object for")
    print(url)
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                response.close()
            try:
                page = response_data.decode("utf-8")
                soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Not valid UTF-8 (or parse failure): let BeautifulSoup work
                # on the raw bytes instead.
                traceback.print_exc()
                soup = BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
            if soup is None:
                raise ValueError("No document parsed for %s" % url)
            return soup
        except Exception:
            traceback.print_exc()
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    return None


def _recomputeDealSafely(product):
    """Run recomputeDeal for ``product`` and log, rather than raise, on failure."""
    try:
        recomputeDeal(product)
    except Exception:
        traceback.print_exc()
        print("Unable to compute deal for %s" % (product.get('skuBundleId')))


def _updateKnownProduct(product, productInfo, rank):
    """Write the freshly scraped price/stock data for an already catalogued product."""
    catalog = get_mongo_connection().Catalog
    shopcluesId = SOURCE_MAP.get('SHOPCLUES.COM')
    productKey = {'_id': product['_id']}
    if productInfo['inStock'] == 1:
        netPriceAfterCashBack = getNetPriceForItem(product['_id'], shopcluesId, product['category_id'], productInfo['price'])
        timestamp = to_java_date(datetime.now())
        catalog.MasterData.update(productKey, {"$set": {
            'rank': rank,
            'available_price': productInfo['price'],
            'in_stock': productInfo['inStock'],
            'codAvailable': productInfo['isCod'],
            'coupon': productInfo['coupon'],
            'updatedOn': timestamp,
            'priceUpdatedOn': timestamp}})
        catalog.Deals.update(productKey, {'$set': {
            'rank': rank,
            'available_price': productInfo['price'],
            'in_stock': productInfo['inStock'],
            'codAvailable': productInfo['isCod'],
            'netPriceAfterCashBack': netPriceAfterCashBack}})
    else:
        # Out of stock: keep the stored price and only flag the stock state.
        netPriceAfterCashBack = getNetPriceForItem(product['_id'], shopcluesId, product['category_id'], product['available_price'])
        timestamp = to_java_date(datetime.now())
        catalog.MasterData.update(productKey, {'$set': {
            'updatedOn': timestamp,
            'in_stock': 0,
            'priceUpdatedOn': timestamp}})
        catalog.Deals.update(productKey, {'$set': {
            'in_stock': 0,
            'netPriceAfterCashBack': netPriceAfterCashBack}})


def _bundleUnknownProduct(link, scin, rank, productName, productInfo):
    """Try to bundle an uncatalogued product with an entry sharing its URL stem."""
    # Lets bundle product by finding similar url pattern
    uri = link.replace('http://m.shopclues.com', '').replace(".html", "")
    suffixStart = uri.rfind('-')
    try:
        # Drop a trailing "-<number>" so sibling listings of one product match.
        int(uri[suffixStart + 1:])
        uri = uri[:suffixStart]
    except ValueError:
        pass
    toBundle = __ProductInfo(scin, rank, link, productInfo['price'], productInfo['inStock'],
                             productInfo['isCod'], productName, productInfo['thumbnail'], productInfo['coupon'])
    if uri:
        # The stem is scraped text: escape it so it is matched literally.
        matches = list(get_mongo_connection().Catalog.MasterData.find(
            {'source_id': 5, 'marketPlaceUrl': {'$regex': re.escape(uri)}}))
    else:
        # An empty pattern would match every ShopClues document.
        matches = []
    if len(matches) > 0:
        bundleNewProduct(matches[0], toBundle)
        _recomputeDealSafely(matches[0])
    else:
        exceptionList.append(toBundle)


def scrapeBestSellers():
    """Walk the listing pages and sync every product tile with the catalog.

    Pages are requested until one yields no ``pd-list-cont`` tiles or cannot
    be fetched.  The rank is the running position of the tile across all
    pages, counted even when the product itself is skipped.
    """
    global bestSellers
    global exceptionList
    bestSellers = []
    rank = 0
    page = 1
    while True:
        url = baseUrl % (page, page - 1)
        soup = getSoupObject(url)
        if soup is None:
            print("Unable to fetch %s, stopping" % url)
            return
        productDivs = soup.findAll('div', {'class': 'pd-list-cont'})
        if productDivs is None or len(productDivs) == 0:
            return
        for productDiv in productDivs:
            rank = rank + 1
            info_tag = productDiv.find('a')
            if info_tag is None:
                print("Skipping tile without link at rank %d" % rank)
                continue
            link = info_tag['href']
            scin = info_tag['data-id'].strip()
            print(link)
            print(scin)
            nameTag = productDiv.find('div', {'class': 'pdt-name'})
            productName = nameTag.string if nameTag is not None else None
            try:
                productInfo = sc.read(link)
            except Exception:
                traceback.print_exc()
                continue
            matches = list(get_mongo_connection().Catalog.MasterData.find({'source_id': 5, 'identifier': scin}))
            if len(matches) > 0:
                product = matches[0]
                if product.get('ignorePricing') == 1:
                    continue
                _updateKnownProduct(product, productInfo, rank)
                _recomputeDealSafely(product)
            else:
                _bundleUnknownProduct(link, scin, rank, productName, productInfo)
        page = page + 1


def populateNegativeDeals():
    """Cache the distinct ``sku`` values of Catalog.NegativeDeals for 600 seconds."""
    negativeDeals = get_mongo_connection().Catalog.NegativeDeals.find().distinct('sku')
    mc.set("negative_deals", negativeDeals, 600)


# Disabled points recomputation, kept for reference.
#def recomputePoints(item, deal):
#    try:
#        if item.get('available_price') == deal['available_price']:
#            print "No need to compute points for %d , as price is still same" %(item['_id'])
#            raise
#        nlcPoints = getNlcPoints(item, deal['minNlc'], deal['maxNlc'], deal['available_price'])
#    except:
#        print traceback.print_exc()
#        nlcPoints = deal['nlcPoints']
#
#
#    bundleDealPoints = list(get_mongo_connection().Catalog.DealPoints.find({'skuBundleId':item['skuBundleId'],'startDate':{'$lte':to_java_date(datetime.now())},'endDate':{'$gte':to_java_date(datetime.now())}}))
#    if len(bundleDealPoints) > 0:
#        item['manualDealThresholdPrice'] = bundleDealPoints[0]['dealThresholdPrice']
#        dealPoints = bundleDealPoints[0]['dealPoints']
#    else:
#        dealPoints = 0
#        item['manualDealThresholdPrice'] = None
#
#    get_mongo_connection().Catalog.Deals.update({'_id':deal['_id']},{"$set":{'totalPoints':deal['totalPoints'] - deal['nlcPoints'] + nlcPoints - deal['dealPoints'] +dealPoints , 'nlcPoints': nlcPoints, 'dealPoints': dealPoints, 'manualDealThresholdPrice': item['manualDealThresholdPrice']}})


def _getNegativeDeals():
    """Return the cached negative-deal ids, populating the cache when empty."""
    negativeDeals = mc.get("negative_deals")
    if negativeDeals is None:
        populateNegativeDeals()
        negativeDeals = mc.get("negative_deals")
    # Still nothing cached (e.g. the cache write failed): treat as empty.
    return negativeDeals if negativeDeals is not None else []


def _isBetterDeal(candidate, incumbent):
    """Tell whether ``candidate`` should replace ``incumbent`` as best deal.

    A lower ``netPriceAfterCashBack`` wins.  On equal price the candidate wins
    unless its source sits later in DEAL_PRIORITY than the incumbent's.
    """
    incumbentPrice = float("inf") if incumbent is None else incumbent.get('netPriceAfterCashBack')
    candidatePrice = candidate.get('netPriceAfterCashBack')
    if candidatePrice < incumbentPrice:
        return True
    if candidatePrice == incumbentPrice:
        try:
            if DEAL_PRIORITY.index(int(candidate['source_id'])) > DEAL_PRIORITY.index(int(incumbent['source_id'])):
                return False
        except Exception:
            # Unknown source or missing incumbent: log it and let the
            # candidate win.
            traceback.print_exc()
        return True
    return False


def recomputeDeal(item):
    """Recompute which deals of ``item``'s bundle are shown.

    Among the bundle's Deals documents the best COD offer gets ``showDeal=1``.
    The best non-COD offer gets ``prepaidDeal=1`` only when there is no COD
    winner or it is strictly cheaper than the COD winner.  Every other deal
    in the bundle has both flags cleared.  Out-of-stock deals, negative deals
    and ShopClues deals with rank 0 are never eligible.
    """
    print("Recomputing for bundleId %d" % (item.get('skuBundleId')))
    skuBundleId = item['skuBundleId']
    deals = get_mongo_connection().Catalog.Deals
    similarItems = list(deals.find({'skuBundleId': skuBundleId}).sort([('netPriceAfterCashBack', pymongo.ASCENDING)]))
    shopcluesId = SOURCE_MAP.get('SHOPCLUES.COM')
    bestOne = None
    prepaidBestOne = None
    negativeDeals = None
    for similarItem in similarItems:
        isCod = similarItem['codAvailable'] == 1
        if negativeDeals is None:
            negativeDeals = _getNegativeDeals()
        ineligible = similarItem['in_stock'] == 0 or similarItem['_id'] in negativeDeals
        if not ineligible:
            ineligible = similarItem['source_id'] == shopcluesId and similarItem['rank'] == 0
        if ineligible:
            deals.update({'_id': similarItem['_id']}, {'$set': {'showDeal': 0, 'prepaidDeal': 0}})
            continue
        if isCod:
            if _isBetterDeal(similarItem, bestOne):
                bestOne = similarItem
        else:
            if _isBetterDeal(similarItem, prepaidBestOne):
                prepaidBestOne = similarItem

    toUpdate = []
    if bestOne is not None or prepaidBestOne is not None:
        toUpdate = [similarItem['_id'] for similarItem in similarItems]
        if bestOne is not None:
            toUpdate.remove(bestOne['_id'])
            deals.update({'_id': bestOne['_id']}, {'$set': {'showDeal': 1, 'prepaidDeal': 0}})
        if prepaidBestOne is not None:
            # The prepaid offer is only promoted when it beats the COD winner.
            if bestOne is None or prepaidBestOne.get('netPriceAfterCashBack') < bestOne.get('netPriceAfterCashBack'):
                toUpdate.remove(prepaidBestOne['_id'])
                deals.update({'_id': prepaidBestOne['_id']}, {'$set': {'showDeal': 0, 'prepaidDeal': 1}})
    if len(toUpdate) > 0:
        deals.update({'_id': {"$in": toUpdate}}, {'$set': {'showDeal': 0, 'prepaidDeal': 0}}, upsert=False, multi=True)


def bundleNewProduct(existingProduct, toBundle):
    """Insert ``toBundle`` into MasterData as a new member of an existing bundle.

    ``existingProduct`` is modified in place: it receives a new ``_id`` (the
    current maximum + 1) and the scraped fields of ``toBundle``, and is then
    inserted as a new document.  Returns ``{1: ...}`` on success and
    ``{0: ...}`` on failure; failures are also recorded in ``exceptionList``.
    """
    global bundledProducts
    global exceptionList
    print("Adding new product")
    try:
        masterData = get_mongo_connection().Catalog.MasterData
        max_id = list(masterData.find().sort([('_id', pymongo.DESCENDING)]).limit(1))
        timestamp = to_java_date(datetime.now())
        existingProduct['_id'] = max_id[0]['_id'] + 1
        existingProduct['addedOn'] = timestamp
        existingProduct['available_price'] = toBundle.available_price
        existingProduct['updatedOn'] = timestamp
        existingProduct['codAvailable'] = toBundle.codAvailable
        existingProduct['coupon'] = str(toBundle.coupon)
        existingProduct['identifier'] = str(toBundle.identifier)
        existingProduct['in_stock'] = toBundle.in_stock
        existingProduct['marketPlaceUrl'] = toBundle.url
        existingProduct['rank'] = toBundle.rank
        existingProduct['source_product_name'] = toBundle.source_product_name
        existingProduct['url'] = toBundle.url
        existingProduct['showVideo'] = 0
        existingProduct['shippingCost'] = 0
        existingProduct['quantity'] = 1
        existingProduct['videoLink'] = ""
        existingProduct['showNetPrice'] = 0
        masterData.insert(existingProduct)
        bundledProducts.append(__NewBundled(toBundle, existingProduct))
        return {1: 'Data added successfully.'}
    except Exception as e:
        print(e)
        exceptionList.append(toBundle)
        return {0: 'Unable to add data.'}


def _textOrEmpty(value):
    """Return ``value`` unchanged, or an empty string when it is None."""
    return "" if value is None else value


def _htmlRow(cells):
    """Build one ``<tr>`` from ``(value, alignment)`` pairs."""
    # NOTE(review): values are inserted without HTML escaping, as before;
    # confirm whether scraped names/thumbnails should be escaped.
    row = ['<tr>']
    for value, align in cells:
        row.append('<td style="text-align:%s">%s</td>' % (align, value))
    row.append('</tr>')
    return "".join(row)


def sendMail():
    """Mail an HTML summary of bundled and non-bundled products.

    The body lists ``bundledProducts`` and ``exceptionList`` in two tables.
    Failures while connecting or sending are logged, not raised.
    """
    parts = ["""<html><body><h3>ShopClues Best Sellers Auto Bundled</h3><table border="1" style="width:100%;"><thead><tr><th>Item Id</th><th>Identifier</th><th>Rank</th><th>Product Name</th><th>Bundle Id</th><th>Bundled with Brand</th><th>Bundled with Product Name</th><th>Available_price</th><th>In Stock</th><th>Coupon</th><th>COD Available</th></tr></thead><tbody>"""]
    for bundledProduct in bundledProducts:
        oldProduct = bundledProduct.oldProduct
        parts.append(_htmlRow([
            (oldProduct.get('_id'), 'center'),
            (_textOrEmpty(oldProduct.get('identifier')), 'center'),
            (oldProduct.get('rank'), 'center'),
            (_textOrEmpty(oldProduct.get('source_product_name')), 'center'),
            (oldProduct.get('skuBundleId'), 'center'),
            (_textOrEmpty(oldProduct.get('brand')), 'center'),
            (_textOrEmpty(oldProduct.get('product_name')), 'center'),
            (oldProduct.get('available_price'), 'center'),
            (oldProduct.get('in_stock'), 'center'),
            (oldProduct.get('coupon'), 'center'),
            (oldProduct.get('codAvailable'), 'center'),
        ]))
    parts.append("""</tbody></table><h3>Items not bundled</h3><table border="1" style="width:100%;"><thead><tr><th>Identifier</th><th>Rank</th><th>Product Name</th><th>Url</th><th>Available Price</th><th>In Stock</th><th>COD Available</th><th>Coupon</th><th>Thumbnail</th></tr></thead><tbody>""")
    for exceptionItem in exceptionList:
        parts.append(_htmlRow([
            (exceptionItem.identifier, 'center'),
            (exceptionItem.rank, 'center'),
            (_textOrEmpty(exceptionItem.source_product_name), 'center'),
            (_textOrEmpty(exceptionItem.url), 'center'),
            (exceptionItem.available_price, 'center'),
            (exceptionItem.in_stock, 'center'),
            (exceptionItem.codAvailable, 'center'),
            (exceptionItem.coupon, 'center'),
            (_textOrEmpty(exceptionItem.thumbnail), 'left'),
        ]))
    parts.append("""</tbody></table></body></html>""")
    message = "".join(parts)
    print(message)

    # Charset detection only makes sense for a byte string; scraped names
    # usually make the message unicode already.
    if isinstance(message, bytes):
        try:
            message = message.decode(chardet.detect(message).get('encoding'))
        except Exception:
            pass
    # Hand MIMEText UTF-8 bytes with an explicit charset so non-ASCII product
    # names do not fail against the default us-ascii charset.
    if not isinstance(message, bytes):
        message = message.encode('utf-8')

    #recipients = ['kshitij.sood@saholic.com']
    recipients = ['rajneesh.arora@saholic.com', 'kshitij.sood@saholic.com', 'chaitnaya.vats@saholic.com', 'ritesh.chauhan@saholic.com', 'khushal.bhatia@saholic.com']
    subject = "Shopclues Best Sellers" + ' - ' + str(datetime.now())
    msg = MIMEMultipart()
    msg['Subject'] = subject
    msg['From'] = ""
    msg['To'] = ",".join(recipients)
    msg.preamble = subject
    msg.attach(MIMEText(message, 'html', 'utf-8'))
    sender = 'dtr@shop2020.in'
    smtpServer = None
    try:
        smtpServer = smtplib.SMTP('localhost')
        smtpServer.set_debuglevel(1)
        smtpServer.sendmail(sender, recipients, msg.as_string())
        print("Successfully sent email")
    except Exception:
        traceback.print_exc()
        print("Error: unable to send email.")
    finally:
        if smtpServer is not None:
            try:
                smtpServer.quit()
            except Exception:
                # The connection may already be gone; nothing left to clean up.
                pass


def resetRanks():
    """Set ``rank`` back to 0 on every ranked ShopClues (source_id 5) product."""
    get_mongo_connection().Catalog.MasterData.update(
        {'rank': {'$gt': 0}, 'source_id': 5},
        {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}},
        multi=True)


def main():
    """Optionally reset ranks, scrape the best sellers and mail the summary."""
    if options.reset == 'True':
        resetRanks()
    scrapeBestSellers()
    if len(bundledProducts) > 0 or len(exceptionList) > 0:
        sendMail()
    else:
        print("nothing to send")


if __name__ == '__main__':
    main()