# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 14257 | Rev 20319 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

import urllib2
from BeautifulSoup import BeautifulSoup
import pymongo
import re
from dtr.utils.utils import to_java_date
import optparse
from datetime import datetime
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Cached MongoClient shared by the whole script; created on first use by get_mongo_connection().
con = None
# Command-line interface: a single option, -m/--m, naming the MongoDB host (default "localhost").
parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                      default="localhost",
                      type="string", help="The HOST where the mongo server is running",
                      metavar="mongo_host")

# NOTE: arguments are parsed at module load time, so `options` already exists when
# get_mongo_connection's default argument is evaluated further down.
(options, args) = parser.parse_args()


# Scraped entries with no matching MasterData document; filled by commitBestSellers(),
# reported by sendEmail().
exceptionList = []
# A "/" followed by ten uppercase letters or digits; group 1 is used as the product identifier.
asin_regex = r'/([A-Z0-9]{10})'
# Filled by the scrape* functions with __RankInfo entries, then consumed by commitBestSellers().
bestSellers = []
# Timestamp taken once at start-up; passed through to_java_date() for every 'updatedOn' written.
now = datetime.now()


class __RankInfo:
    """One scraped best-seller entry.

    Attributes are stored exactly as passed in:
      identifier -- product identifier extracted from the listing URL
      rank       -- position assigned by the scraper
      category   -- category label, or None until one is assigned
    """

    def __init__(self, identifier, rank, category):
        # Plain value holder: no validation or conversion is performed.
        self.identifier, self.rank, self.category = identifier, rank, category

def get_mongo_connection(host=options.mongoHost, port=27017):
    global con
    if con is None:
        print "Establishing connection %s host and port %d" %(host,port)
        try:
            con = pymongo.MongoClient(host, port)
        except Exception, e:
            print e
            return None
    return con

def getSoupObject(url):
    print "Getting soup object for"
    print url
    global RETRY_COUNT
    RETRY_COUNT = 1 
    while RETRY_COUNT < 10:
        try:
            soup = None
            request = urllib2.Request(url)
            request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
            request.add_header('Connection','keep-alive')
            request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')
        
            response = urllib2.urlopen(request)   
            response_data = response.read()
            response.close()
            try:
                page=response_data.decode("utf-8")
                soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
            except:
                soup = BeautifulSoup(response_data,convertEntities=BeautifulSoup.HTML_ENTITIES)
            if soup is None:
                raise
            return soup
        except Exception as e:
            print e
            print "Retrying"
            RETRY_COUNT = RETRY_COUNT + 1


def scrapeBestSellerMobiles():
    """Scrape five pages of the mobile best-seller listing into ``bestSellers``.

    For each page the "above the fold" and "below the fold" fragments are
    fetched, and every listed item is appended to the module-level
    ``bestSellers`` as a __RankInfo whose rank is its running position
    (starting at 1) and whose category is None.

    Raises:
        AttributeError: if a fragment could not be fetched (getSoupObject
            returned None), or an item has no title link or no identifier
            matching ``asin_regex`` in its URL.
    """
    global bestSellers
    # Start from an empty list, as scrapeBestSellerTablets does, so a repeated
    # call cannot append a second copy of the same entries.
    bestSellers = []
    rank = 0
    for i in range(1,6):
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=1" %(i,i)
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" %(i,i)
        # Fetch both fragments before parsing either one (same order as before).
        above_soup = getSoupObject(aboveFoldUrl)
        below_soup = getSoupObject(belowFoldUrl)
        for soup in (above_soup, below_soup):
            for x in soup.findAll('div',{'class':'zg_itemImmersion'}):
                am_url = x.find('div',{'class':'zg_title'}).find('a')['href']
                identifier = re.search(asin_regex, am_url).group(1)
                rank = rank + 1
                bestSellers.append(__RankInfo(identifier, rank, None))

def commitBestSellers(category):
    global exceptionList
    print "Rank",
    print '\t',
    print 'Identifier'
    for x in bestSellers:
        print x.rank,
        print '\t',
        print x.identifier,
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
        print "count sku",
        print '\t',
        if len(list(col)) == 0:
            x.category = category
            exceptionList.append(x)
        else:
            get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip() }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)
        
def scrapeBestSellerTablets():
    global bestSellers
    bestSellers = []
    rank = 0
    for i in range(1,6):
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1" %(i,i) 
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" %(i,i)
        above_soup = getSoupObject(aboveFoldUrl)
        below_soup = getSoupObject(belowFoldUrl)
        for x in above_soup.findAll('div',{'class':'zg_itemImmersion'}):
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
            identifier =  (re.search(asin_regex, am_url)).group(1)
            rank = rank + 1
            print identifier,
            print '\t',
            print rank
            r_info = __RankInfo(identifier,rank, None)
            bestSellers.append(r_info)
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
            identifier =  (re.search(asin_regex, am_url)).group(1)
            rank = rank + 1
            print identifier,
            print '\t',
            print rank
            r_info = __RankInfo(identifier,rank, None)
            bestSellers.append(r_info)

def resetRanks(category_id):
    """Set 'rank' back to 0 for every ranked source-1 item of a category.

    Documents in Catalog.MasterData with rank > 0, source_id 1 and the given
    category_id get rank 0 and a fresh 'updatedOn'.

    Args:
        category_id: Value matched against the 'category_id' field.
    """
    # A single multi-document update replaces the former find + one update
    # per _id: same filter, same $set, one round trip instead of N+1.
    get_mongo_connection().Catalog.MasterData.update(
        {'rank':{'$gt':0},'source_id':1,'category_id':category_id},
        {'$set' : {'rank':0,'updatedOn':to_java_date(now)}},
        multi=True)

def sendEmail():
    message="""<html>
            <body>
            <h3>Amazon Best Sellers not in master</h3>
            <table border="1" style="width:100%;">
            <thead>
            <tr><th>Identifier</th>
            <th>Category</th>
            <th>Rank</th>
            </tr></thead>
            <tbody>"""
    for item in exceptionList:
        message+="""<tr>
        <td style="text-align:center">"""+(item.identifier)+"""</td>
        <td style="text-align:center">"""+(item.category)+"""</td>
        <td style="text-align:center">"""+str(item.rank)+"""</td>
        </tr>"""
    message+="""</tbody></table></body></html>"""
    print message
    #recipients = ['kshitij.sood@saholic.com']
    recipients = ['rajneesh.arora@saholic.com','kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','manoj.kumar@saholic.com']
    msg = MIMEMultipart()
    msg['Subject'] = "Amazon Best Sellers" + ' - ' + str(datetime.now())
    msg['From'] = ""
    msg['To'] = ",".join(recipients)
    msg.preamble = "Amazon Best Sellers" + ' - ' + str(datetime.now())
    html_msg = MIMEText(message, 'html')
    msg.attach(html_msg)
    
    smtpServer = smtplib.SMTP('localhost')
    smtpServer.set_debuglevel(1)
    sender = 'dtr@shop2020.in'
    try:
        smtpServer.sendmail(sender, recipients, msg.as_string())
        print "Successfully sent email"
    except:
        print "Error: unable to send email."
            
            
            
def main():
    """Scrape, store and report best-seller ranks for mobiles, then tablets.

    For each category: scrape the listing; if anything was scraped, zero the
    old ranks for that category id and commit the new ones. Finally send the
    e-mail listing entries that were not found in MasterData.
    """
    # (scraper, category_id used by resetRanks, label used by commitBestSellers)
    jobs = (
        (scrapeBestSellerMobiles, 3, "MOBILE"),
        (scrapeBestSellerTablets, 5, "TABLET"),
    )
    for scrape, category_id, label in jobs:
        scrape()
        if not bestSellers:
            # Nothing scraped: leave the existing ranks for this category alone.
            continue
        resetRanks(category_id)
        commitBestSellers(label)
    sendEmail()
        
# Run the scrape only when this file is executed as a script, not when it is imported.
if __name__=='__main__':
    main()