# Rev 17038 (header text captured from the repository web viewer; not part of the script)
import urllib2from BeautifulSoup import BeautifulSoupimport pymongoimport refrom dtr.utils.utils import to_java_dateimport optparsefrom datetime import datetimeimport smtplibfrom email.mime.text import MIMETextfrom email.mime.multipart import MIMEMultipartfrom dtr.utils.utils import fetchResponseUsingProxy, get_mongo_connection, ungzipResponsefrom pyquery import PyQueryimport jsonimport mechanizeimport urllibcon = Noneparser = optparse.OptionParser()parser.add_option("-m", "--m", dest="mongoHost",default="localhost",type="string", help="The HOST where the mongo server is running",metavar="mongo_host")(options, args) = parser.parse_args()headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36','Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8','Accept-Language' : 'en-US,en;q=0.8','Accept-Encoding' : 'gzip,deflate,sdch','Host' : 'www.homeshop18.com','Referer': 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:0/'}exceptionList = []bestSellers = []now = datetime.now()csrfValue = Noneclass __RankInfo:def __init__(self, identifier, rank, category, available_price, in_stock, thumbnail, source_product_name, marketPlaceUrl):self.identifier = identifierself.rank = rankself.available_price = available_priceself.in_stock = in_stockself.category = categoryself.thumbnail = thumbnailself.source_product_name = source_product_nameself.marketPlaceUrl = marketPlaceUrldef getBrowserObject():import cookielibbr = mechanize.Browser(factory=mechanize.RobustFactory())cj = cookielib.LWPCookieJar()br.set_cookiejar(cj)br.set_handle_equiv(True)br.set_handle_redirect(True)br.set_handle_referer(True)br.set_handle_robots(False)br.set_debug_http(False)br.set_debug_redirects(False)br.set_debug_responses(False)br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)br.addheaders = [('User-agent','Mozilla/5.0 
(X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),('Accept', 'text/html,application/xhtml+xml,application/json,application/xml;q=0.9,*/*;q=0.8'),('Accept-Encoding', 'gzip,deflate,sdch'),('Accept-Language', 'en-US,en;q=0.8'),('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),('Host' , 'm.homeshop18.com'),('Referer', 'http://m.homeshop18.com/search.mobi?keyword=&categoryId=3027&sortConfig=BEST_SELLER')]return brdef ungzipResponseBr(r,b):headers = r.info()if headers['Content-Encoding']=='gzip':import gzipprint "********************"print "Deflating gzip response"print "********************"gz = gzip.GzipFile(fileobj=r, mode='rb')html = gz.read()gz.close()headers["Content-type"] = "text/html; charset=utf-8"r.set_data( html )b.set_response(r)def getCsrfValue():global csrfValuecsrfValUrl = 'http://m.homeshop18.com/search.mobi?keyword=&categoryId=14569&sortConfig=BEST_SELLER'data = fetchResponseUsingProxy(csrfValUrl, proxy=False )pq = PyQuery(data)csrf = pq("input[name=csrf]")csrfValue = csrf[0].valueprint csrfValuedef commitBestSellers(category):global exceptionListprint "Rank",print '\t',print 'Identifier'for x in bestSellers:print x.rank,print '\t',print x.identifier,print '\t',col = list(get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier, 'source_id':7}))print "count sku",len(col)print '\n'if len(col) == 0:x.category = categoryexceptionList.append(x)else:get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier, 'source_id':7 }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}})def scrapeBestSellerMobiles():global bestSellersrank = 0print "Homeshop18 Best Sellers Mobiles..."for i in range(0,5):mobileCategoryUrl = 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'#mobileCategoryUrl = "http://m.homeshop18.com/search.mobi?categoryId=14569&isAjax=true&page="+str(i)+"&csrf="+csrfValuedata = 
fetchResponseUsingProxy(mobileCategoryUrl, proxy=False )soup = BeautifulSoup(data)tags = soup.findAll("div", {'class':'inside'})for tag in tags:if not tag.has_key('id'):continuerank = rank +1if rank >100:breaktitleTag = tag.find('p', {'class' : 'product_title'})source_product_name = titleTag.textproductUrl = titleTag.find('a').get('href')productUrl = 'http://www.homeshop18.com'+str(productUrl)inStock = 1identfier = tag['id'].split('_')[1]available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')print productUrl, source_product_name, thumbnail, rank, available_price, identfier, inStockr_info = __RankInfo(identfier, rank, None , available_price, inStock, thumbnail, source_product_name, productUrl)bestSellers.append(r_info)def scrapeBestSellerTablets():global bestSellersrank = 0print "Homeshop18 Best Sellers Tablets..."for i in range(0,5):tabletCategoryUrl = 'http://www.homeshop18.com/tablets/categoryid:8937/search:Tablets/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'data = fetchResponseUsingProxy(tabletCategoryUrl)soup = BeautifulSoup(data)tags = soup.findAll("div", {'class':'inside'})for tag in tags:if not tag.has_key('id'):continuerank = rank +1if rank >100:breaktitleTag = tag.find('p', {'class' : 'product_title'})source_product_name = titleTag.textproductUrl = titleTag.find('a').get('href')productUrl = 'http://www.homeshop18.com'+str(productUrl)inStock = 1identfier = tag['id'].split('_')[1]available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')print productUrl, source_product_name, thumbnail, rank, available_price, identfier, inStockr_info = __RankInfo(identfier, rank, None , available_price, inStock, thumbnail, source_product_name, productUrl)bestSellers.append(r_info)def 
resetRanks(category_id):get_mongo_connection().Catalog.MasterData.update({'rank':{'$gt':0},'source_id':6,'category_id':category_id}, {'$set':{'rank':0}}, upsert=False, multi=True)def sendEmail():message="""<html><body><h3>HomeShop18 Best Sellers not in master</h3><table border="1" style="width:100%;"><thead><tr><th>Identifier</th><th>Category</th><th>Rank</th><th>Available_price</th><th>Gross_price</th><th>In_stock</th><th>Coupon</th><th>Thumbnail</th><th>Source_product_name</th><th>MarketPlaceUrl</th><th>Cod</th></tr></thead><tbody>"""for item in exceptionList:try:message+="""<tr><td style="text-align:center">"""+(item.identifier)+"""</td><td style="text-align:center">"""+(item.category)+"""</td><td style="text-align:center">"""+str(item.rank)+"""</td><td style="text-align:center">"""+str(item.available_price)+"""</td><td style="text-align:center">"""+str(item.gross_price)+"""</td><td style="text-align:center">"""+str(item.in_stock)+"""</td><td style="text-align:center">"""+str(item.coupon)+"""</td><td style="text-align:center">"""+str(item.thumbnail)+"""</td><td style="text-align:center">"""+str(item.source_product_name)+"""</td><td style="text-align:center">"""+str(item.marketPlaceUrl)+"""</td><td style="text-align:center">"""+str(item.cod)+"""</td></tr>"""except:continuemessage+="""</tbody></table></body></html>"""print message#recipients = ['amit.gupta@saholic.com']recipients = ['rajneesh.arora@saholic.com','kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','manoj.kumar@saholic.com','amit.gupta@saholic.com']msg = MIMEMultipart()msg['Subject'] = "HomeShop18 Best Sellers" + ' - ' + str(datetime.now())msg['From'] = ""msg['To'] = ",".join(recipients)msg.preamble = "HomeShop18 Best Sellers" + ' - ' + str(datetime.now())html_msg = MIMEText(message, 'html')msg.attach(html_msg)smtpServer = smtplib.SMTP('localhost')smtpServer.set_debuglevel(1)sender = 'dtr@shop2020.in'try:smtpServer.sendmail(sender, recipients, msg.as_string())print "Successfully sent 
email"except:print "Error: unable to send email."def main():#getCsrfValue()scrapeBestSellerMobiles()if len(bestSellers) > 0:resetRanks(3)commitBestSellers("MOBILE")scrapeBestSellerTablets()if len(bestSellers) > 0:resetRanks(5)commitBestSellers("TABLET")sendEmail()if __name__=='__main__':main()