Subversion Repositories SmartDukaan

Rev

Rev 20372 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

import urllib2
from BeautifulSoup import BeautifulSoup
import re
from dtr.utils.utils import to_java_date
import optparse
from datetime import datetime
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from dtr.utils.utils import fetchResponseUsingProxy, get_mongo_connection, ungzipResponse
import json
import urllib
import chardet
from shop2020.utils.EmailAttachmentSender import get_attachment_part
from shop2020.utils import EmailAttachmentSender


# Placeholder connection handle; nothing in the visible code reads or assigns it.
con = None

# Command-line interface: the only option is the MongoDB host (-m / --m),
# stored as options.mongoHost and defaulting to "localhost".
parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                      default="localhost",
                      type="string", help="The HOST where the mongo server is running",
                      metavar="mongo_host")

# NOTE: arguments are parsed at import time, so importing this module
# consumes sys.argv.
(options, args) = parser.parse_args()

# Browser-like request headers for www.homeshop18.com.
# NOTE(review): no function visible in this file passes `headers` to a
# request -- confirm whether it is still needed.
headers = {
            'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36',
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',      
            'Accept-Language' : 'en-US,en;q=0.8',                     
            'Accept-Encoding' : 'gzip,deflate,sdch',
            'Host' : 'www.homeshop18.com',
            'Referer': 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:0/'     
}


# Shared module state used by the functions below.
# exceptionList: scraped items with no matching MasterData row; filled by
#                commitBestSellers and reported by sendEmail.
exceptionList = []
# bestSellers: result of the most recent scrape; rebound by each scrape function.
bestSellers = []
# now: timestamp taken once at start-up and written as 'updatedOn' on every
#      rank update of this run.
now = datetime.now()
# csrfValue: only referenced from commented-out code in the visible file.
csrfValue = None

class __RankInfo(object):
    """Plain record describing one scraped best-seller listing.

    Attributes mirror the constructor arguments one-to-one:
    identifier, rank, category, available_price, in_stock, thumbnail,
    source_product_name and marketPlaceUrl.

    Derives from ``object`` so that it is a new-style class under Python 2.
    """

    def __init__(self, identifier, rank, category, available_price, in_stock, thumbnail, source_product_name, marketPlaceUrl):
        self.identifier = identifier
        self.rank  = rank
        self.available_price = available_price
        self.in_stock = in_stock
        self.category = category
        self.thumbnail = thumbnail
        self.source_product_name = source_product_name
        self.marketPlaceUrl = marketPlaceUrl

    def __repr__(self):
        # Compact form for log/debug output; the key fields are enough to
        # identify a record.
        return "%s(identifier=%r, rank=%r, category=%r, available_price=%r)" % (
            type(self).__name__, self.identifier, self.rank,
            self.category, self.available_price)
  
def commitBestSellers(category):
    global exceptionList
    print "Rank",
    print '\t',
    print 'Identifier'
    for x in bestSellers:
        print x.rank,
        print '\t',
        print x.identifier,
        print '\t',
        col = list(get_mongo_connection(host=options.mongoHost).Catalog.MasterData.find({'identifier':x.identifier, 'source_id':7}))
        print "count sku",len(col)
        print '\n'
        if len(col) == 0:
            x.category = category
            exceptionList.append(x)
        else:
            get_mongo_connection(host=options.mongoHost).Catalog.MasterData.update({'identifier':x.identifier, 'source_id':7 }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}})

    
def scrapeBestSellerMobiles():
    global bestSellers
    rank = 0
    bestSellers = []
    print "Homeshop18 Best Sellers Mobiles..."
    for i in range(0,5):
        mobileCategoryUrl = 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'
        #mobileCategoryUrl = "http://m.homeshop18.com/search.mobi?categoryId=14569&isAjax=true&page="+str(i)+"&csrf="+csrfValue
        data = fetchResponseUsingProxy(mobileCategoryUrl, proxy=False )
        soup = BeautifulSoup(data)
        tags = soup.findAll("div", {'class':'inside'})
        for tag in tags:
            if not tag.has_key('id'):
                continue
            rank = rank +1
            if rank >100:
                break
            titleTag = tag.find('p', {'class' : 'product_title'})
            source_product_name = titleTag.text
            '''
            encoding =  chardet.detect(source_product_name)
            try:
                source_product_name = source_product_name.decode(encoding.get('encoding'))
            except:
                source_product_name = source_product_name.decode('latin-1')
            '''
            source_product_name = source_product_name.encode('utf8')
            productUrl = titleTag.find('a').get('href')
            productUrl = 'http://www.homeshop18.com'+str(productUrl)
            inStock = 1
            identifier = tag['id'].split('_')[1]
            available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])
            thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')
            print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStock
            r_info = __RankInfo(identifier, rank, None, available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))
            bestSellers.append(r_info)   
                
def scrapeBestSellerTablets():
    global bestSellers
    rank = 0
    bestSellers = []
    print "Homeshop18 Best Sellers Tablets..."
    for i in range(0,5):
        tabletCategoryUrl = 'http://www.homeshop18.com/tablets/categoryid:8937/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'
        data = fetchResponseUsingProxy(tabletCategoryUrl, proxy=False)
        soup = BeautifulSoup(data)
        tags = soup.findAll("div", {'class':'inside'})
        for tag in tags:
            if not tag.has_key('id'):
                continue
            rank = rank +1
            if rank >100:
                break
            titleTag = tag.find('p', {'class' : 'product_title'})
            source_product_name = titleTag.text
            '''
            encoding =  chardet.detect(source_product_name)
            try:
                source_product_name = source_product_name.decode(encoding.get('encoding'))
            except:
                source_product_name = source_product_name.decode('latin-1')
            '''
            source_product_name = source_product_name.encode('utf8')
            productUrl = titleTag.find('a').get('href')
            productUrl = 'http://www.homeshop18.com'+str(productUrl)
            inStock = 1
            identifier = tag['id'].split('_')[1]
            available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])
            thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')
            print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStock
            r_info = __RankInfo(identifier, rank, None , available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))
            bestSellers.append(r_info)
                
def resetRanks(category_id):
    """Set ``rank`` back to 0 on every ranked source-7 row of one category.

    :param category_id: value matched against the ``category_id`` field.
    """
    masterData = get_mongo_connection(host=options.mongoHost).Catalog.MasterData
    # Only rows that currently hold a positive rank are touched.
    rankedRows = {'rank':{'$gt':0},'source_id':7,'category_id':category_id}
    clearRank = {'$set':{'rank':0}}
    masterData.update(rankedRows, clearRank, upsert=False, multi=True)

def sendEmail():
    message="""<html>
            <body>
            <h3>HomeShop18 Best Sellers not in master</h3>
            <table border="1" style="width:100%;">
            <thead>
            <tr><th>Identifier</th>
            <th>Category</th>
            <th>Rank</th>
            <th>Available_price</th>
            <th>In_stock</th>
            <th>Thumbnail</th>
            <th>Source_product_name</th>
            <th>MarketPlaceUrl</th>
            </tr></thead>
            <tbody>"""
    for item in exceptionList:
        try:
            message+="""<tr>
            <td style="text-align:center">"""+str(item.identifier)+"""</td>
            <td style="text-align:center">"""+str(item.category)+"""</td>
            <td style="text-align:center">"""+str(item.rank)+"""</td>
            <td style="text-align:center">"""+str(item.available_price)+"""</td>
            <td style="text-align:center">"""+str(item.in_stock)+"""</td>
            <td style="text-align:center">"""+str(item.thumbnail)+"""</td>
            <td style="text-align:center">"""+str(item.source_product_name)+"""</td>
            <td style="text-align:center">"""+str(item.marketPlaceUrl)+"""</td>
            </tr>"""
        except:
            continue
    message+="""</tbody></table></body></html>"""
    print message
    #recipients = ['amit.gupta@saholic.com']
    recipients = ['kshitij.sood@saholic.com','ritesh.chauhan@saholic.com','aishwarya.singh@saholic.com']
    EmailAttachmentSender.mail_send_grid("dtr@profitmandi.com","apikey", "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", recipients, "Homeshop18 Best Sellers",message ,[],[],[])              
   
    
def main():
    """Scrape mobiles then tablets, store their ranks, and mail the misses.

    For each category the scrape runs first; only when it produced results
    are the old ranks cleared and the new ones committed. The exception
    report is sent once at the end.
    """
    # (scrape function, MasterData category_id, label used in the report)
    jobs = (
        (scrapeBestSellerMobiles, 3, "MOBILE"),
        (scrapeBestSellerTablets, 5, "TABLET"),
    )
    for scrape, categoryId, label in jobs:
        scrape()
        # An empty scrape must not wipe the ranks that are already stored.
        if not bestSellers:
            continue
        resetRanks(categoryId)
        commitBestSellers(label)
    sendEmail()

        
# Run the full scrape/commit/report cycle only when executed as a script.
if __name__=='__main__':
    main()