Subversion Repositories SmartDukaan

Rev

Rev 21135 | Blame | Compare with Previous | Last modification | View Log | RSS feed

import urllib2
import simplejson as json
import pymongo
from dtr.utils.utils import to_java_date
from datetime import datetime
import optparse
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from BeautifulSoup import BeautifulSoup
from dtr.utils.utils import ungzipResponse
import json
import chardet
from shop2020.utils.EmailAttachmentSender import get_attachment_part
from shop2020.utils import EmailAttachmentSender

con = None
parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                      default="localhost",
                      type="string", help="The HOST where the mongo server is running",
                      metavar="mongo_host")

(options, args) = parser.parse_args()

exceptionList = []
noAttributesSupc = []

categoryMap = {3:'Mobiles',5:'Tablets'}

headers = { 
           'User-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',      
            'Accept-Language' : 'en-US,en;q=0.8',                     
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding' : 'gzip,deflate,sdch'
        }

bestSellers = []
now = datetime.now()
BASE_URL = "snapdeal.com/"

class __RankInfo(object):
    """One ranked SUPC taken from a best-seller product page.

    Every constructor argument is stored unchanged under the attribute of
    the same name.  ``subAttributes`` is kept by reference (not copied), so
    a caller may keep appending to the list it passed in.
    """

    def __init__(self, identifier, rank, category, product_name, page_url, baseColor, ppid, subAttributes, live):
        self.identifier = identifier
        self.rank = rank
        self.category = category
        self.product_name = product_name
        self.page_url = page_url
        self.baseColor = baseColor
        self.ppid = ppid
        self.subAttributes = subAttributes
        self.live = live

    def __repr__(self):
        # Instances are printed inside dicts/lists for debugging; without
        # this they show up only as "<... instance at 0x...>".  %r keeps the
        # result a plain ASCII str even when a field holds unicode text.
        return (
            "__RankInfo(identifier=%r, rank=%r, category=%r, product_name=%r, "
            "page_url=%r, baseColor=%r, ppid=%r, subAttributes=%r, live=%r)"
            % (self.identifier, self.rank, self.category, self.product_name,
               self.page_url, self.baseColor, self.ppid, self.subAttributes,
               self.live)
        )
        
class __SubAttributes:
    """A sub-variant of a product colour, stored as four plain attributes.

    identifier -- SUPC of the sub-variant
    color      -- colour of the variant this sub-variant belongs to
    name/value -- the sub-attribute's name and its value
    """

    def __init__(self, identifier, color, name, value):
        self.identifier, self.color = identifier, color
        self.name, self.value = name, value

class __Exception(object):
    """A best seller that was not found in the master data.

    This is a plain record, not a Python exception type.  Each constructor
    argument is stored unchanged under the attribute of the same name; the
    report email reads them back to build one table row per record.
    """

    def __init__(self, identifier, color, desc, pageurl, rank, category, product_name):
        self.identifier = identifier
        self.color = color
        self.desc = desc
        self.pageurl = pageurl
        self.rank = rank
        self.category = category
        self.product_name = product_name

    def __repr__(self):
        # The list of these records is printed for debugging; show their
        # content instead of "<... instance at 0x...>".  %r keeps the result
        # a plain ASCII str even when a field holds unicode text.
        return (
            "__Exception(identifier=%r, color=%r, desc=%r, pageurl=%r, "
            "rank=%r, category=%r, product_name=%r)"
            % (self.identifier, self.color, self.desc, self.pageurl,
               self.rank, self.category, self.product_name)
        )
    
     

def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared MongoClient, creating it on the first call.

    The client is cached in the module-level ``con``; later calls return
    that object and ignore ``host``/``port``.  If creating the client
    raises, the error is printed and ``None`` is returned (``con`` stays
    unset, so the next call tries again).
    """
    global con
    if con is not None:
        return con
    print("Establishing connection %s host and port %d" % (host, port))
    try:
        con = pymongo.MongoClient(host, port)
    except Exception as e:
        print(e)
        return None
    return con

def getSoupObject(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The request is sent with the module-level ``headers`` and the response
    body is read through ``ungzipResponse``.  Any failure is printed and
    the whole fetch is attempted again, for at most nine attempts in total.

    Returns the parsed soup, or ``None`` when every attempt failed, so
    callers must be prepared for ``None``.
    """
    # RETRY_COUNT remains a module global, as before, so the attempt number
    # of the latest fetch stays visible to any code that inspects it.
    global RETRY_COUNT
    print("Getting soup object for")
    print(url)
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                response_data = ungzipResponse(response)
            finally:
                # Release the connection even when reading the body fails.
                response.close()
            try:
                page = response_data.decode("utf-8")
                return BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Could not decode/parse as UTF-8 text: hand the data to
                # BeautifulSoup exactly as it was received.
                return BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    # Every attempt failed; make the "no document" result explicit.
    return None

def fetchSupcDetails(p_url, rank, category):
    """Scrape one product page and return ``{ppid: [__RankInfo, ...]}``.

    p_url    -- product page URL
    rank     -- best-seller rank stored on every record produced
    category -- category id stored on every record produced

    The page's ``attributesJson`` div holds a JSON list of variants.  Each
    variant yields one ``__RankInfo`` whose ``subAttributes`` list carries
    one ``__SubAttributes`` per sub-variant.  When the list is empty, a
    single record is built from the page's ``defaultSupc`` div instead.

    Raises ValueError when the page could not be fetched.  Missing page
    elements or malformed JSON surface as the underlying AttributeError,
    TypeError, KeyError or ValueError.
    """
    soup = getSoupObject(p_url)
    if soup is None:
        raise ValueError("Unable to fetch product page %s" % p_url)

    attributes = soup.find('div', {'id': 'attributesJson'}).string
    try:
        ppid = soup.find('input', {'id': 'pppid'})['value']
    except (TypeError, KeyError):
        # No usable pppid input: fall back to the last URL path segment.
        ppid = p_url[p_url.rfind('/') + 1:]
    productName = soup.find('input', {'id': 'productNamePDP'})['value']

    variants = json.loads(attributes)
    # Compare the parsed value rather than the raw text so that an empty
    # list written with whitespace ("[ ]") is recognised as well.
    if variants == []:
        supc = soup.find('div', {'id': 'defaultSupc'}).string
        return {ppid: [__RankInfo(supc, rank, category, productName, p_url, "", ppid, [], True)]}

    rankInfos = []
    for product in variants:
        color = product['value']
        supc = product['supc']
        live = product['live']
        # 'subAttributes' may be missing, null or empty; all mean "none".
        subAttributes = [
            __SubAttributes(sub['supc'], color, sub['name'], sub['value'])
            for sub in (product.get('subAttributes') or [])
        ]
        rankInfos.append(__RankInfo(supc, rank, category, productName, p_url, color, ppid, subAttributes, live))
    return {ppid: rankInfos}
            
    

def _scrapeBestSellers(urlTemplate, category):
    """Rebuild the global ``bestSellers`` from five result pages.

    urlTemplate -- listing URL with one ``%d`` slot for the result offset
    category    -- category id passed on to fetchSupcDetails

    Offsets 0, 20, 40, 60 and 80 are requested in turn.  Every product link
    found is scraped with fetchSupcDetails; the rank starts at 1 and only
    advances after a product was scraped successfully.

    Raises RuntimeError when a listing page could not be fetched.
    """
    global bestSellers
    bestSellers = []
    rank = 1
    for offset in [0, 20, 40, 60, 80]:
        url = urlTemplate % (offset)
        soup = getSoupObject(url)
        if soup is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError("Unable to fetch best seller listing %s" % url)
        for product in soup.findAll('a', {'class': 'dp-widget-link'}):
            print(product['href'])
            try:
                supcMap = fetchSupcDetails(product['href'], rank, category)
            except Exception as e:
                # Best effort: skip this product, but say why.  %r keeps the
                # message printable whatever text the URL or error contains.
                print("Skipping %r: %r" % (product['href'], e))
                continue
            print("supcMap  %s" % (supcMap,))
            bestSellers.append(supcMap)
            rank = rank + 1


def scrapeBestSellerMobiles():
    """Scrape the mobiles best-seller listing (category 3) into ``bestSellers``."""
    _scrapeBestSellers("http://www.snapdeal.com/acors/json/product/get/search/175/%d/20?q=&sort=bstslr&keyword=&clickSrc=&viewType=List&lang=en&snr=false", 3)


def scrapeBestSellerTablets():
    """Scrape the tablets best-seller listing (category 5) into ``bestSellers``."""
    _scrapeBestSellers("http://www.snapdeal.com/acors/json/product/get/search/133/%d/20?sort=bstslr&keyword=&clickSrc=&viewType=List&lang=en&snr=false", 5)


def resetRanks(category):
    """Set ``rank`` back to 0 on every ranked source-3 document of *category*.

    Matches MasterData documents with rank > 0, source_id 3 and the given
    category_id, and stamps ``updatedOn`` with the start-up time ``now``.
    """
    master = get_mongo_connection().Catalog.MasterData
    ranked = {'rank': {'$gt': 0}, 'source_id': 3, 'category_id': category}
    cleared = {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}}
    master.update(ranked, cleared, multi=True)

def _cleanIdentifier(identifier):
    """Return *identifier* without surrounding whitespace ('' for None)."""
    return (identifier or "").strip()


def _inMaster(master, identifier):
    """Tell whether a source-3 MasterData document has this identifier."""
    return master.find_one({'identifier': identifier, 'source_id': 3}) is not None


def _storeRank(master, identifier, rank):
    """Write *rank* and the run timestamp to all matching source-3 documents."""
    master.update({'identifier': identifier, 'source_id': 3},
                  {'$set': {'rank': rank, 'updatedOn': to_java_date(now)}},
                  multi=True)


def commitBestSellers(category):
    """Write the scraped ranks in ``bestSellers`` to Catalog.MasterData.

    category -- category id recorded on every entry added to ``exceptionList``

    For each scraped record the rank is stored on the documents matching
    its identifier and on those matching each of its sub-attribute
    identifiers.  An identifier with no master document is appended to
    ``exceptionList``; a record without sub-attributes that is missing from
    the master data is reported and skipped.

    Identifiers are stripped once and the stripped value is used for both
    the lookup and the update, so an identifier scraped with surrounding
    whitespace is ranked instead of being found but never updated.
    """
    for bestSeller in bestSellers:
        for rankInfos in bestSeller.values():
            for v in rankInfos:
                master = get_mongo_connection().Catalog.MasterData
                identifier = _cleanIdentifier(v.identifier)
                if not v.subAttributes and not _inMaster(master, identifier):
                    exceptionList.append(__Exception(identifier, v.baseColor, "", v.page_url, v.rank, category, v.product_name))
                    continue
                print(v.identifier)
                print(v.rank)
                _storeRank(master, identifier, v.rank)
                for subAttr in v.subAttributes:
                    print("Inside subattr")
                    print(vars(subAttr))
                    subIdentifier = _cleanIdentifier(subAttr.identifier)
                    if not _inMaster(master, subIdentifier):
                        exceptionList.append(__Exception(subIdentifier, subAttr.color, subAttr.name + " " + subAttr.value, v.page_url, v.rank, category, v.product_name))
                    else:
                        print(v.identifier)
                        print(v.rank)
                        _storeRank(master, subIdentifier, v.rank)
    print(exceptionList)

def _htmlCell(value):
    """Return *value* as HTML-escaped, ASCII-only text for a table cell.

    None becomes an empty string, byte strings are decoded as UTF-8 (bad
    bytes replaced) and anything else is converted to text.  Characters
    outside ASCII are written as numeric character references, so the
    result can be printed and mailed without encoding errors.
    """
    from xml.sax.saxutils import escape

    if value is None:
        text = u""
    elif isinstance(value, bytes):
        text = value.decode("utf-8", "replace")
    else:
        text = u"%s" % (value,)
    return escape(text).encode("ascii", "xmlcharrefreplace").decode("ascii")


def sendEmail():
    """Mail an HTML table of every entry in ``exceptionList``.

    One row is written per entry, with the columns identifier, category
    label (from ``categoryMap``, falling back to the raw category id),
    rank, product name, colour, description and page URL.  The scraped
    values are HTML-escaped, and no row is dropped because of the
    characters it contains.  The message is printed and then sent through
    ``EmailAttachmentSender.mail_send_grid``.
    """
    import os

    rows = []
    for item in exceptionList:
        cells = (
            item.identifier,
            categoryMap.get(item.category, item.category),
            item.rank,
            item.product_name,
            item.color,
            item.desc,
            item.pageurl,
        )
        rows.append(
            "<tr>\n"
            + "".join('            <td style="text-align:center">%s</td>\n' % _htmlCell(cell)
                      for cell in cells)
            + "            </tr>"
        )
    message = """<html>
            <body>
            <h3>Snapdeal Best Sellers not in master</h3>
            <table border="1" style="width:100%;">
            <thead>
            <tr><th>Identifier</th>
            <th>Category</th>
            <th>Rank</th>
            <th>Product Name</th>
            <th>Color</th>
            <th>Description</th>
            <th>Page Url</th>
            </tr></thead>
            <tbody>""" + "".join(rows) + """</tbody></table></body></html>"""
    print(message)
    recipients = ['kshitij.sood@saholic.com','ritesh.chauhan@saholic.com','aishwarya.singh@saholic.com']
    # SECURITY: a live credential is committed to source control here.  It
    # can now be supplied through the SENDGRID_API_KEY environment variable;
    # the literal is kept only as a fallback so existing deployments keep
    # working.  Rotate the key and delete the fallback.
    apiKey = os.environ.get("SENDGRID_API_KEY", "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")
    # NOTE(review): the positional arguments look like sender, user name,
    # password, recipients, subject, body and three empty lists; confirm
    # against EmailAttachmentSender.mail_send_grid.
    EmailAttachmentSender.mail_send_grid("dtr@smartdukaan.com", "apikey", apiKey, recipients, "Snapdeal Best Sellers", message, [], [], [])
            

def main():
    """Scrape, store and report best sellers for mobiles, then tablets.

    For each category the listing is scraped; if anything was collected,
    the existing ranks of that category are reset and the new ones are
    committed.  Afterwards the last scrape result is printed and the report
    email is sent.
    """
    # The scrapers rebind the global bestSellers, so it is re-read after
    # each scrape.  Category ids: 3 = mobiles, 5 = tablets.
    for scrape, category in ((scrapeBestSellerMobiles, 3), (scrapeBestSellerTablets, 5)):
        scrape()
        if len(bestSellers) > 0:
            resetRanks(category)
            commitBestSellers(category)
    print(bestSellers)
    sendEmail()


if __name__ == '__main__':
    main()