# Rev 20372 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
import urllib2from BeautifulSoup import BeautifulSoupimport refrom dtr.utils.utils import to_java_dateimport optparsefrom datetime import datetimeimport smtplibfrom email.mime.text import MIMETextfrom email.mime.multipart import MIMEMultipartfrom dtr.utils.utils import fetchResponseUsingProxy, get_mongo_connection, ungzipResponseimport jsonimport urllibimport chardetfrom shop2020.utils.EmailAttachmentSender import get_attachment_partfrom shop2020.utils import EmailAttachmentSendercon = Noneparser = optparse.OptionParser()parser.add_option("-m", "--m", dest="mongoHost",default="localhost",type="string", help="The HOST where the mongo server is running",metavar="mongo_host")(options, args) = parser.parse_args()headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36','Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8','Accept-Language' : 'en-US,en;q=0.8','Accept-Encoding' : 'gzip,deflate,sdch','Host' : 'www.homeshop18.com','Referer': 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:0/'}exceptionList = []bestSellers = []now = datetime.now()csrfValue = Noneclass __RankInfo:def __init__(self, identifier, rank, category, available_price, in_stock, thumbnail, source_product_name, marketPlaceUrl):self.identifier = identifierself.rank = rankself.available_price = available_priceself.in_stock = in_stockself.category = categoryself.thumbnail = thumbnailself.source_product_name = source_product_nameself.marketPlaceUrl = marketPlaceUrldef commitBestSellers(category):global exceptionListprint "Rank",print '\t',print 'Identifier'for x in bestSellers:print x.rank,print '\t',print x.identifier,print '\t',col = list(get_mongo_connection(host=options.mongoHost).Catalog.MasterData.find({'identifier':x.identifier, 'source_id':7}))print "count sku",len(col)print '\n'if len(col) == 0:x.category = 
categoryexceptionList.append(x)else:get_mongo_connection(host=options.mongoHost).Catalog.MasterData.update({'identifier':x.identifier, 'source_id':7 }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}})def scrapeBestSellerMobiles():global bestSellersrank = 0bestSellers = []print "Homeshop18 Best Sellers Mobiles..."for i in range(0,5):mobileCategoryUrl = 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'#mobileCategoryUrl = "http://m.homeshop18.com/search.mobi?categoryId=14569&isAjax=true&page="+str(i)+"&csrf="+csrfValuedata = fetchResponseUsingProxy(mobileCategoryUrl, proxy=False )soup = BeautifulSoup(data)tags = soup.findAll("div", {'class':'inside'})for tag in tags:if not tag.has_key('id'):continuerank = rank +1if rank >100:breaktitleTag = tag.find('p', {'class' : 'product_title'})source_product_name = titleTag.text'''encoding = chardet.detect(source_product_name)try:source_product_name = source_product_name.decode(encoding.get('encoding'))except:source_product_name = source_product_name.decode('latin-1')'''source_product_name = source_product_name.encode('utf8')productUrl = titleTag.find('a').get('href')productUrl = 'http://www.homeshop18.com'+str(productUrl)inStock = 1identifier = tag['id'].split('_')[1]available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStockr_info = __RankInfo(identifier, rank, None, available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))bestSellers.append(r_info)def scrapeBestSellerTablets():global bestSellersrank = 0bestSellers = []print "Homeshop18 Best Sellers Tablets..."for i in range(0,5):tabletCategoryUrl = 
'http://www.homeshop18.com/tablets/categoryid:8937/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'data = fetchResponseUsingProxy(tabletCategoryUrl, proxy=False)soup = BeautifulSoup(data)tags = soup.findAll("div", {'class':'inside'})for tag in tags:if not tag.has_key('id'):continuerank = rank +1if rank >100:breaktitleTag = tag.find('p', {'class' : 'product_title'})source_product_name = titleTag.text'''encoding = chardet.detect(source_product_name)try:source_product_name = source_product_name.decode(encoding.get('encoding'))except:source_product_name = source_product_name.decode('latin-1')'''source_product_name = source_product_name.encode('utf8')productUrl = titleTag.find('a').get('href')productUrl = 'http://www.homeshop18.com'+str(productUrl)inStock = 1identifier = tag['id'].split('_')[1]available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStockr_info = __RankInfo(identifier, rank, None , available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))bestSellers.append(r_info)def resetRanks(category_id):get_mongo_connection(host=options.mongoHost).Catalog.MasterData.update({'rank':{'$gt':0},'source_id':7,'category_id':category_id}, {'$set':{'rank':0}}, upsert=False, multi=True)def sendEmail():message="""<html><body><h3>HomeShop18 Best Sellers not in master</h3><table border="1" style="width:100%;"><thead><tr><th>Identifier</th><th>Category</th><th>Rank</th><th>Available_price</th><th>In_stock</th><th>Thumbnail</th><th>Source_product_name</th><th>MarketPlaceUrl</th></tr></thead><tbody>"""for item in exceptionList:try:message+="""<tr><td style="text-align:center">"""+str(item.identifier)+"""</td><td style="text-align:center">"""+str(item.category)+"""</td><td 
style="text-align:center">"""+str(item.rank)+"""</td><td style="text-align:center">"""+str(item.available_price)+"""</td><td style="text-align:center">"""+str(item.in_stock)+"""</td><td style="text-align:center">"""+str(item.thumbnail)+"""</td><td style="text-align:center">"""+str(item.source_product_name)+"""</td><td style="text-align:center">"""+str(item.marketPlaceUrl)+"""</td></tr>"""except:continuemessage+="""</tbody></table></body></html>"""print message#recipients = ['amit.gupta@saholic.com']recipients = ['kshitij.sood@saholic.com','ritesh.chauhan@saholic.com','aishwarya.singh@saholic.com']EmailAttachmentSender.mail_send_grid("dtr@profitmandi.com","apikey", "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", recipients, "Homeshop18 Best Sellers",message ,[],[],[])def main():#getCsrfValue()scrapeBestSellerMobiles()if len(bestSellers) > 0:resetRanks(3)commitBestSellers("MOBILE")scrapeBestSellerTablets()if len(bestSellers) > 0:resetRanks(5)commitBestSellers("TABLET")sendEmail()if __name__=='__main__':main()