# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 15088 | Rev 18202 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

import urllib2
import simplejson as json
import pymongo
from dtr.utils.utils import to_java_date
from datetime import datetime
import optparse
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from BeautifulSoup import BeautifulSoup
from dtr.utils.utils import ungzipResponse
import json

# Cached MongoClient; created lazily by get_mongo_connection().
con = None

# Command-line handling: a single option selecting the Mongo host.
# NOTE: parse_args() runs at import time and reads sys.argv.
parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                      default="localhost",
                      type="string", help="The HOST where the mongo server is running",
                      metavar="mongo_host")

(options, args) = parser.parse_args()

# Best sellers that could not be matched in MasterData; filled by
# commitBestSellers() and rendered by sendEmail().
exceptionList = []
# Not referenced anywhere in the visible source.
noAttributesSupc = []

# Category id -> display name used in the report email.
categoryMap = {3:'Mobiles',5:'Tablets'}

# Browser-like request headers sent with every page fetch in getSoupObject().
# 'Accept-Encoding' advertises gzip, and getSoupObject() passes the response
# through ungzipResponse().
headers = { 
           'User-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',      
            'Accept-Language' : 'en-US,en;q=0.8',                     
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding' : 'gzip,deflate,sdch'
        }

# Result of the most recent scrape: a list of {ppid: [rank info, ...]} dicts.
# Rebound to a fresh list by each scrapeBestSeller*() call.
bestSellers = []
# Timestamp captured once at start-up; used for every 'updatedOn' write.
now = datetime.now()
# Not referenced anywhere in the visible source.
BASE_URL = "snapdeal.com/"

class __RankInfo:
    """Plain record describing one ranked product variant.

    Holds the variant identifier, its best-seller rank and category, the
    product name and page URL, the base colour, the parent product id
    (ppid), a list of sub-attribute records and the 'live' flag.
    """

    def __init__(self, identifier, rank, category, product_name, page_url, baseColor, ppid, subAttributes, live):
        # Ranking information
        self.identifier, self.rank, self.category = identifier, rank, category
        # Where the product came from
        self.product_name, self.page_url = product_name, page_url
        # Variant details
        self.baseColor, self.ppid = baseColor, ppid
        self.subAttributes, self.live = subAttributes, live
        
class __SubAttributes:
    """Plain record for one sub-attribute of a product variant.

    Stores the sub-attribute's own identifier, the colour of the variant it
    belongs to, and the attribute's name and value.
    """

    def __init__(self, identifier, color, name, value):
        self.identifier, self.color = identifier, color
        self.name, self.value = name, value

class __Exception:
    """Plain record for a best seller that was not found in MasterData.

    Not a Python exception type: it only carries the fields that the report
    email prints (identifier, colour, description, page URL, rank, category
    and product name).
    """

    def __init__(self, identifier, color, desc, pageurl, rank, category, product_name):
        self.identifier, self.color, self.desc = identifier, color, desc
        self.pageurl, self.rank = pageurl, rank
        self.category, self.product_name = category, product_name
    
     

def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the module-wide MongoClient, creating it on first use.

    The client is cached in the global ``con``; ``host`` and ``port`` are
    only consulted when no client has been created yet. If creating the
    client raises, the error is printed and None is returned.
    """
    global con
    if con is not None:
        return con
    print("Establishing connection %s host and port %d" % (host, port))
    try:
        con = pymongo.MongoClient(host, port)
    except Exception as e:
        print(e)
        return None
    return con

def getSoupObject(url):
    """Fetch ``url`` and return it parsed as a BeautifulSoup document.

    The request is sent with the module-level ``headers``. The body is read
    through ``ungzipResponse`` (presumably decompressing gzip responses;
    defined in dtr.utils.utils) and parsed as UTF-8 text when it decodes,
    otherwise from the raw bytes.

    Any failure is printed and the fetch is retried, up to 9 attempts in
    total. The attempt counter is kept in the global ``RETRY_COUNT``.

    Returns:
        The parsed document, or None when every attempt failed.
    """
    global RETRY_COUNT
    print("Getting soup object for")
    print(url)
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                response_data = ungzipResponse(response)
            finally:
                # Release the connection even when reading the body fails
                response.close()
            try:
                page = response_data.decode("utf-8")
                return BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Not valid UTF-8 (or parsing the decoded text failed):
                # let BeautifulSoup work from the raw bytes instead
                return BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    # All attempts failed; make the None result explicit for callers
    print("Giving up on %s" % url)
    return None

def fetchSupcDetails(p_url, rank, category):
    """Scrape one product page and describe all of its variants.

    Args:
        p_url: URL of the product page.
        rank: Best-seller rank to record on every variant of the product.
        category: Category id to record on every variant.

    Returns:
        A dict with a single key, the product's ppid, mapped to a list of
        __RankInfo records (one per variant listed in the page's
        ``attributesJson`` div, or one built from ``defaultSupc`` when that
        JSON is an empty list).

    Raises:
        AttributeError: if the page could not be fetched (getSoupObject
            returned None) or lacks the ``attributesJson`` div.
    """
    soup = getSoupObject(p_url)
    attributes = soup.find('div', {'id': 'attributesJson'}).string
    try:
        ppid = soup.find('input', {'id': 'pppid'})['value']
    except (TypeError, KeyError):
        # No 'pppid' input on the page (find() gave None) or it carries no
        # value: fall back to the last path segment of the URL
        ppid = p_url[p_url.rfind('/') + 1:]
    productName = soup.find('input', {'id': 'productNamePDP'})['value']

    variants = []
    supcMap = {ppid: variants}
    if attributes == "[]":
        # Product without variants: its single SUPC is in 'defaultSupc'
        supc = soup.find('div', {'id': 'defaultSupc'}).string
        variants.append(__RankInfo(supc, rank, category, productName, p_url, "", ppid, [], True))

    for product in json.loads(attributes):
        color = product['value']
        sub_attributes = [
            __SubAttributes(sub['supc'], color, sub['name'], sub['value'])
            for sub in product['subAttributes']
        ]
        variants.append(__RankInfo(product['supc'], rank, category, productName, p_url,
                                   color, ppid, sub_attributes, product['live']))
    return supcMap
            
    

def scrapeBestSellerMobiles():
    """Rebuild the global ``bestSellers`` list from the mobiles listing.

    Walks five listing pages (offsets 0 to 80 in steps of 20), and for every
    ``outerImg`` tile fetches the linked product page with category id 3.
    Ranks are assigned sequentially from 1 in listing order.
    """
    global bestSellers
    bestSellers = []
    url_template = "http://www.snapdeal.com/acors/json/product/get/search/175/%d/20?q=&sort=bstslr&keyword=&clickSrc=&viewType=List&lang=en&snr=false"
    rank = 1
    for offset in (0, 20, 40, 60, 80):
        listing = getSoupObject(url_template % offset)
        for tile in listing.findAll('div', {'class': 'outerImg'}):
            link = tile.find('a')
            print(link['pogid'])
            print(link['href'])
            bestSellers.append(fetchSupcDetails(link['href'], rank, 3))
            rank += 1

def scrapeBestSellerTablets():
    """Rebuild the global ``bestSellers`` list from the tablets listing.

    Walks five listing pages (offsets 0 to 80 in steps of 20), and for every
    ``outerImg`` tile fetches the linked product page with category id 5.
    Ranks are assigned sequentially from 1 in listing order.
    """
    global bestSellers
    bestSellers = []
    url_template = "http://www.snapdeal.com/acors/json/product/get/search/133/%d/20?sort=bstslr&keyword=&clickSrc=&viewType=List&lang=en&snr=false"
    rank = 1
    for offset in (0, 20, 40, 60, 80):
        listing = getSoupObject(url_template % offset)
        for tile in listing.findAll('div', {'class': 'outerImg'}):
            link = tile.find('a')
            print(link['pogid'])
            print(link['href'])
            bestSellers.append(fetchSupcDetails(link['href'], rank, 5))
            rank += 1


def resetRanks(category):
    """Set rank back to 0 on every ranked source-3 record of ``category``.

    Matches MasterData documents with rank > 0, source_id 3 and the given
    category_id, and stamps them with the start-up time as 'updatedOn'.
    """
    master_data = get_mongo_connection().Catalog.MasterData
    currently_ranked = {'rank': {'$gt': 0}, 'source_id': 3, 'category_id': category}
    clear_rank = {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}}
    master_data.update(currently_ranked, clear_rank, multi=True)

def commitBestSellers(category):
    """Write the scraped ranks in ``bestSellers`` to MasterData.

    For every scraped variant, and every sub-attribute of it, the matching
    source-3 MasterData records get the variant's rank and the start-up
    time as 'updatedOn'. Identifiers with no MasterData record are appended
    to the global ``exceptionList`` for the report email; a variant that is
    missing but has sub-attributes is not reported itself, only its missing
    sub-attributes are.

    Args:
        category: Category id recorded on each exception entry.
    """
    master_data = get_mongo_connection().Catalog.MasterData
    for bestSeller in bestSellers:
        for variants in bestSeller.itervalues():
            for v in variants:
                # Use the stripped identifier for the lookup AND the update,
                # so a padded identifier that is found is also updated
                identifier = v.identifier.strip()
                missing = master_data.find_one({'identifier': identifier, 'source_id': 3}) is None
                if missing and len(v.subAttributes) == 0:
                    exceptionList.append(__Exception(identifier, v.baseColor, "", v.page_url,
                                                     v.rank, category, v.product_name))
                    continue
                print(identifier)
                print(v.rank)
                master_data.update({'identifier': identifier, 'source_id': 3},
                                   {'$set': {'rank': v.rank, 'updatedOn': to_java_date(now)}},
                                   multi=True)
                for subAttr in v.subAttributes:
                    print("Inside subattr")
                    print(vars(subAttr))
                    sub_identifier = subAttr.identifier.strip()
                    if master_data.find_one({'identifier': sub_identifier, 'source_id': 3}) is None:
                        exceptionList.append(__Exception(sub_identifier, subAttr.color,
                                                         subAttr.name + " " + subAttr.value,
                                                         v.page_url, v.rank, category, v.product_name))
                    else:
                        # Log the record actually being updated
                        print(sub_identifier)
                        print(v.rank)
                        master_data.update({'identifier': sub_identifier, 'source_id': 3},
                                           {'$set': {'rank': v.rank, 'updatedOn': to_java_date(now)}},
                                           multi=True)
    print(exceptionList)

def sendEmail():
    """Email an HTML table of the best sellers missing from MasterData.

    Renders one table row per entry of the global ``exceptionList`` and
    sends it through the SMTP server on localhost to a fixed recipient list.
    A failure while sending is printed, not raised; a failure to connect to
    the SMTP server still propagates.
    """
    from xml.sax.saxutils import escape

    def cell(value):
        # u"%s" (rather than str()) keeps non-ASCII unicode text intact under
        # Python 2; escape() stops scraped text from being read as markup
        return u'<td style="text-align:center">' + escape(u"%s" % (value,)) + u'</td>'

    parts = [u"""<html>
            <body>
            <h3>Snapdeal Best Sellers not in master</h3>
            <table border="1" style="width:100%;">
            <thead>
            <tr><th>Identifier</th>
            <th>Category</th>
            <th>Rank</th>
            <th>Product Name</th>
            <th>Color</th>
            <th>Description</th>
            <th>Page Url</th>
            </tr></thead>
            <tbody>"""]
    for item in exceptionList:
        # Fall back to the raw id when the category has no display name
        category_label = categoryMap.get(item.category, item.category)
        cells = [cell(value) for value in (item.identifier, category_label, item.rank,
                                           item.product_name, item.color, item.desc,
                                           item.pageurl)]
        parts.append(u"<tr>\n        " + u"\n        ".join(cells) + u"\n        </tr>")
    parts.append(u"""</tbody></table></body></html>""")
    # Turn non-ASCII characters into HTML numeric references so the message
    # is plain ASCII for both print and the default us-ascii MIME charset
    message = u"".join(parts).encode('ascii', 'xmlcharrefreplace').decode('ascii')
    print(message)
    #recipients = ['kshitij.sood@saholic.com']
    recipients = ['rajneesh.arora@saholic.com','kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','manoj.kumar@saholic.com']
    msg = MIMEMultipart()
    msg['Subject'] = "Snapdeal Best Sellers" + ' - ' + str(datetime.now())
    # NOTE(review): the From header is empty while the envelope sender below
    # is set - confirm whether that is intentional
    msg['From'] = ""
    msg['To'] = ",".join(recipients)
    msg.preamble = "Snapdeal Best Sellers" + ' - ' + str(datetime.now())
    msg.attach(MIMEText(message, 'html'))

    smtpServer = smtplib.SMTP('localhost')
    smtpServer.set_debuglevel(1)
    sender = 'dtr@shop2020.in'
    try:
        smtpServer.sendmail(sender, recipients, msg.as_string())
        print("Successfully sent email")
    except Exception as e:
        print("Error: unable to send email.")
        print(e)
    finally:
        try:
            smtpServer.quit()
        except smtplib.SMTPException:
            # Server already dropped the connection; just release the socket
            smtpServer.close()
            
            

def main():
    """Scrape, commit and report best sellers for mobiles, then tablets.

    For each category the listing is scraped into the global ``bestSellers``;
    only when something was scraped are the old ranks reset and the new ones
    committed. The report email is sent once at the end.
    """
    scrapers = ((scrapeBestSellerMobiles, 3), (scrapeBestSellerTablets, 5))
    for scrape, category_id in scrapers:
        scrape()
        # Keep the existing ranks if the scrape produced nothing
        if len(bestSellers) > 0:
            resetRanks(category_id)
            commitBestSellers(category_id)

    sendEmail()

if __name__=='__main__':
    main()