# Rev 21135 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scrape the Saholic best-seller listings and store their ranks in MongoDB.

The script fetches the first five "In Stock" pages of the mobile and tablet
listings, assigns every listed product a rank in listing order, writes the
ranks to ``Catalog.MasterData`` and finally mails a report of the ranked
identifiers for which no ``MasterData`` document was found.
"""
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import pymongo
import re
from dtr.utils.utils import to_java_date
from datetime import datetime
import optparse
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from shop2020.utils.EmailAttachmentSender import get_attachment_part
from shop2020.utils import EmailAttachmentSender

# Lazily created MongoDB client, shared by every get_mongo_connection() call.
con = None

parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost", default="localhost",
                  type="string",
                  help="The HOST where the mongo server is running",
                  metavar="mongo_host")
(options, args) = parser.parse_args()

# __RankInfo entries produced by the most recent scrape.
bestSellers = []
# Single timestamp used for every 'updatedOn' written during this run.
now = datetime.now()
mobUrl = "http://www.saholic.com/mobile-phone/10006"
tabUrl = "http://www.saholic.com/all-tablets/10010"
# __RankInfo entries whose identifier was not found in MasterData.
exceptionList = []

# Headers sent with every listing request.
_REQUEST_HEADERS = (
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Language', 'en-US,en;q=0.8,hi;q=0.6'),
    ('Connection', 'keep-alive'),
    ('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'),
)


class __RankInfo(object):
    """Rank of one scraped product.

    identifier -- product identifier taken from the listing link
    rank       -- 1-based position in the scraped listing
    category   -- category label; only filled in for entries that end up
                  in exceptionList
    """

    def __init__(self, identifier, rank, category):
        self.identifier = identifier
        self.rank = rank
        self.category = category


def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared MongoClient, creating it on first use.

    Returns None (after printing the error) when the client cannot be
    created; callers in this file do not check for that, so a failure here
    surfaces as an AttributeError at the call site.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def getSoupObject(url, data):
    """Request *url* with the urlencoded body *data* and parse the reply.

    The request is attempted while RETRY_COUNT < 10 (i.e. up to nine times).
    The body is parsed as UTF-8 text when it decodes cleanly, otherwise the
    raw bytes are handed to BeautifulSoup.

    Returns the BeautifulSoup object, or None when every attempt failed.
    """
    # Kept as a module-level name because the original exposed it that way.
    global RETRY_COUNT
    print("Getting soup object for %s" % url)
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url, data)
            for header_name, header_value in _REQUEST_HEADERS:
                request.add_header(header_name, header_value)
            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                # Release the socket even when read() fails.
                response.close()
            try:
                page = response_data.decode("utf-8")
                return BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Decoding or parsing the decoded text failed; let
                # BeautifulSoup work on the raw bytes instead.
                return BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    return None


def _scrapeBestSellers(listingUrl):
    """Return the ranked __RankInfo list for pages 1-5 of *listingUrl*.

    Ranks start at 1 and follow listing order across pages. When a page
    cannot be fetched the partial result is discarded and an empty list is
    returned, so that incomplete ranks are never committed.
    """
    ranked = []
    rank = 1
    for page in range(1, 6):
        query_args = {'fq': 'F_50028:In Stock', 'page': page}
        data = urllib.urlencode(query_args)
        soup = getSoupObject(listingUrl, data)
        if soup is None:
            print("Could not fetch page %d of %s; discarding this listing" % (page, listingUrl))
            return []
        for titleTag in soup.findAll('div', {'class': 'title'}):
            link = titleTag.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # A title block without a usable link cannot be identified.
                continue
            print(link.text)
            identifier = getSaholicIdentifier(href)
            ranked.append(__RankInfo(str(identifier), rank, None))
            rank = rank + 1
    return ranked


def scrapeBestSellerMobiles():
    """Replace the global bestSellers with the ranked mobile listing."""
    global bestSellers
    bestSellers = _scrapeBestSellers(mobUrl)


def scrapeBestSellerTablets():
    """Replace the global bestSellers with the ranked tablet listing."""
    global bestSellers
    bestSellers = _scrapeBestSellers(tabUrl)


def getSaholicIdentifier(url):
    """Return the part of *url* after its last '-'.

    When *url* contains no '-', the whole string is returned.
    """
    return url[url.rfind('-') + 1:]


def resetRanks(category):
    """Set rank to 0 on every source_id 4 document of *category* with rank > 0."""
    collection = get_mongo_connection().Catalog.MasterData
    oldRankedItems = collection.find(
        {'rank': {'$gt': 0}, 'source_id': 4, 'category': category})
    for item in oldRankedItems:
        print(item['_id'])
        collection.update(
            {'_id': item['_id']},
            {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}},
            multi=True)


def commitBestSellers(category):
    """Write the ranks held in bestSellers to MasterData.

    Entries whose identifier matches no MasterData document are tagged with
    *category* and appended to exceptionList instead of being written.
    """
    global exceptionList
    collection = get_mongo_connection().Catalog.MasterData
    print("Rank \tIdentifier")
    for x in bestSellers:
        print("%s \t%s \tcount sku" % (x.rank, x.identifier))
        # NOTE(review): this existence check does not filter on source_id,
        # while the update below does - confirm that is intended.
        if collection.find_one({'identifier': x.identifier}) is None:
            x.category = category
            exceptionList.append(x)
        else:
            collection.update(
                {'identifier': x.identifier, 'source_id': 4},
                {'$set': {'rank': x.rank, 'updatedOn': to_java_date(now)}},
                multi=True)


def sendEmail():
    """Mail an HTML table of the exceptionList entries to the fixed recipients."""
    cell = """<td style="text-align:center">"""
    parts = ["""<html><body><h3>Saholic Best Sellers not in master</h3><table border="1" style="width:100%;"><thead><tr><th>Identifier</th><th>Category</th><th>Rank</th></tr></thead><tbody>"""]
    for item in exceptionList:
        parts.append(
            """<tr>""" + cell + (item.identifier) + """</td>"""
            + cell + (item.category) + """</td>"""
            + cell + str(item.rank) + """</td></tr>""")
    parts.append("""</tbody></table></body></html>""")
    message = "".join(parts)
    print(message)
    recipients = ['kshitij.sood@saholic.com', 'ritesh.chauhan@saholic.com', 'aishwarya.singh@saholic.com']
    # SECURITY: this API credential is committed to source control. It is
    # kept here only to preserve behaviour; it should be rotated and loaded
    # from configuration or the environment instead.
    EmailAttachmentSender.mail_send_grid(
        "dtr@profitmandi.com", "apikey",
        "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",
        recipients, "Saholic Best Sellers", message, [], [], [])


def main():
    """Scrape, reset and commit ranks for mobiles then tablets, then mail the report."""
    # NOTE(review): resetRanks gets 'Mobiles'/'Tablets' while
    # commitBestSellers gets 'MOBILES'/'TABLETS'; the latter is only used as
    # the category label in the report - confirm the casing is intended.
    scrapeBestSellerMobiles()
    if bestSellers:
        resetRanks('Mobiles')
        commitBestSellers('MOBILES')
    scrapeBestSellerTablets()
    if bestSellers:
        resetRanks('Tablets')
        commitBestSellers('TABLETS')
    sendEmail()


if __name__ == '__main__':
    main()