Subversion Repositories SmartDukaan

Rev

Rev 16019 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

import urllib2
from BeautifulSoup import BeautifulSoup
import pymongo
import re
from dtr.utils.utils import to_java_date
import optparse
from datetime import datetime
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Shared MongoDB client; stays None until get_mongo_connection() creates it.
con = None

# Command-line handling: the only option is the MongoDB host.
parser = optparse.OptionParser()
parser.add_option(
    "-m", "--m",
    dest="mongoHost",
    type="string",
    default="localhost",
    metavar="mongo_host",
    help="The HOST where the mongo server is running"
)

# NOTE: the command line is parsed at import time, not inside main().
options, args = parser.parse_args()


# Module-level scratch lists. exceptionList is not used by the code visible
# in this file; bestSellers is reset by scrapeBestSellers().
exceptionList = []
bestSellers = []
# Listing URL template; scrapeBestSellers fills both slots with (page, page - 1).
baseUrl = "http://m.shopclues.com/products/getProductList/mobiles:top-selling-mobiles-and-tablets.html/%s/page=%s"
# Timestamp captured once, when the module is loaded.
now = datetime.now()


class __ProductInfo:
    """Plain record describing one scraped product listing.

    Every constructor argument is stored unchanged on the instance under an
    attribute of the same name; no validation or conversion is performed.
    """

    def __init__(self, identifier, rank, url, available_price, in_stock, codAvailable, source_product_name, thumbnail, coupon):
        # Where the product lives and how it is presented.
        self.identifier, self.url = identifier, url
        self.source_product_name, self.thumbnail = source_product_name, thumbnail
        # Position in the listing.
        self.rank = rank
        # Commercial details.
        self.available_price, self.in_stock = available_price, in_stock
        self.codAvailable, self.coupon = codAvailable, coupon

def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared pymongo client, creating it on first use.

    The client is cached in the module-level ``con``; once it exists, later
    calls return it and ignore ``host``/``port``. The default ``host`` is
    read from the parsed command-line options when this function is defined.

    Returns None (after printing the error) if creating the client raises;
    ``con`` then stays None, so the next call tries again.
    """
    global con
    if con is not None:
        return con
    print("Establishing connection %s host and port %d" % (host, port))
    try:
        con = pymongo.MongoClient(host, port)
    except Exception as e:
        print(e)
        return None
    return con

def getSoupObject(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup document.

    The request is sent with browser-like headers. Any exception raised
    while fetching or parsing is printed and the whole fetch is retried;
    at most 9 attempts are made, with no delay between them.

    Returns the BeautifulSoup object on success, or None when every attempt
    failed. Callers must check for None before using the result.
    """
    print("Getting soup object for")
    print(url)
    # NOTE(review): the attempt counter is kept as a module-level global only
    # because the original code published it that way; nothing visible in
    # this file reads it.
    global RETRY_COUNT
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url)
            request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
            request.add_header('Connection','keep-alive')
            request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')

            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                # Release the connection even when read() fails part-way.
                response.close()

            try:
                page = response_data.decode("utf-8")
                return BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Body is not valid UTF-8 (or parsing the decoded text
                # failed): hand BeautifulSoup the raw bytes instead.
                # `except Exception` rather than a bare except so Ctrl-C
                # and SystemExit are not swallowed here.
                return BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    # All attempts failed.
    return None

        
def scrapeBestSellers():
    """Walk the best-seller listing and record each product's rank in Mongo.

    Pages are fetched one after another from ``baseUrl`` until a page cannot
    be fetched or contains no ``div`` elements with class ``pd-list-cont``.
    The rank is the 1-based position of an entry across all pages, in the
    order the entries appear.

    For every entry:
      * if ``Catalog.MasterData`` holds a document with ``source_id`` 5 and
        the scraped ``data-id`` as ``identifier``, its ``rank`` is updated;
      * otherwise a document whose ``marketPlaceUrl`` contains the entry's
        URL path is looked up and, when found, passed to
        ``bundleNewProduct`` together with the scraped details.

    Entries without a link or ``data-id`` are skipped but still consume a
    rank. Returns None; resets the module-level ``bestSellers`` list.
    """
    global bestSellers
    bestSellers = []
    rank = 0
    page = 1
    while True:
        url = baseUrl % (page, page - 1)
        soup = getSoupObject(url)
        if soup is None:
            # getSoupObject returns None once its retries are exhausted.
            print("Could not fetch %s, stopping" % url)
            return
        productDivs = soup.findAll('div', {'class': 'pd-list-cont'})
        if not productDivs:
            return
        # Resolve the collection once per page rather than once per query.
        masterData = get_mongo_connection().Catalog.MasterData
        for productDiv in productDivs:
            rank = rank + 1
            info_tag = productDiv.find('a')
            link = info_tag.get('href') if info_tag is not None else None
            data_id = info_tag.get('data-id') if info_tag is not None else None
            if link is None or data_id is None:
                # One malformed entry should not abort the whole scrape.
                print("Skipping entry at rank %d: missing link or data-id" % rank)
                continue
            scin = data_id.strip()
            print(link)
            print(scin)
            known = masterData.find_one({'source_id': 5, 'identifier': scin})
            if known is not None:
                masterData.update({'source_id': 5, 'identifier': scin}, {"$set": {'rank': rank}})
                continue
            # Lets bundle product by finding similar url pattern
            uri = link.replace('http://m.shopclues.com', '')
            # The path comes from scraped HTML, so escape it: characters such
            # as '.', '?' and '+' must match literally, not as regex syntax.
            similar = masterData.find_one({'marketPlaceUrl': {'$regex': re.escape(uri)}})
            if similar is not None:
                # bundleNewProduct takes two arguments; the original call
                # passed only one and raised TypeError.
                # NOTE(review): bundleNewProduct is still a stub, so the
                # expected payload is an assumption -- only the fields known
                # at this point are filled in; confirm when implementing it.
                toBundle = __ProductInfo(scin, rank, link, None, None, None, None, None, None)
                bundleNewProduct(similar, toBundle)
        page = page + 1
          
def bundleNewProduct(existingProduct, toBundle):
    """Placeholder for bundling a newly seen product with an existing one.

    Not implemented: both arguments are ignored and None is returned.
    """
    return None

def main():
    """Script entry point: run the best-seller scrape once."""
    scrapeBestSellers()
        
# Start the scrape only when this file is executed directly.
if __name__ == "__main__":
    main()