Subversion Repositories SmartDukaan

Rev

Rev 12268 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

from elixir import session
from sqlalchemy.sql import asc
from sqlalchemy.sql.expression import or_
from shop2020.utils.daemon import Daemon
import optparse
import sys
import threading
import urllib2
import mechanize
import cookielib
import time
import requests as httpRequest
import simplejson as json
from shop2020.model.v1.catalog.impl import DataService
from shop2020.model.v1.catalog.script import FlipkartScraper, AmazonScraper, SellerCentralScraper
from operator import itemgetter
from shop2020.model.v1.catalog.impl.DataService import CompetitorPricing, CompetitorPricingRequest, \
SnapdealItem, FlipkartItem 

# Module-level side effect: bind the catalog ORM to the local database before
# any of the functions below issue queries.
# NOTE(review): assumes a database is reachable on localhost at import time --
# confirm this is intended for every way this module gets imported.
DataService.initialize(db_hostname='localhost')

class CompetitorScraping(Daemon):
    """Daemonized wrapper around the competitor-price scraping loop."""

    def __init__(self, logfile='/var/log/services/competitorScraping.log', pidfile='/var/run/competitor-scraper.pid'):
        # Route both stdout and stderr of the detached process into the log file.
        Daemon.__init__(self, pidfile, stdout=logfile, stderr=logfile)

    def run(self):
        # Invoked by the Daemon base class once the process has daemonized.
        start()

def start():
    try:
        while True:
            requests = session.query(CompetitorPricingRequest).filter(or_(CompetitorPricingRequest.isProcessed==False,CompetitorPricingRequest.isProcessed==None)).order_by(asc(CompetitorPricingRequest.requestId)).all()
            if requests ==[] or requests is None:
                print "No new request to process, sleeeeeeping....."
                time.sleep(600)
            for request in requests:
                fetchDetails(request)
                request.isProcessed = True
                session.commit()
            close_session()
    except Exception as e:
        print e
        sys.exit(2)
    
def fetchDetails(request):
    login_url = "https://sellercentral.amazon.in/gp/homepage.html"
    br = login(login_url)
    items = session.query(CompetitorPricing).filter(CompetitorPricing.competitorPricing_requestId==request.requestId).all()
    print items
    snapdeal, flipkart, amazon =[],[],[]
    for item in items:
        if item.snapdealScraping:
            snapdeal.append(item)
        if item.flipkartScraping:
            flipkart.append(item)
        if item.amazonScraping:
            amazon.append(item)
    threads = []
    t1 = threading.Thread(target=scrapSnapdeal, args = (snapdeal,))
    t1.daemon = True
    t1.start()
    t2 = threading.Thread(target=scrapFlipkart, args = (flipkart,))
    t2.daemon = True
    t2.start()
    amazonLen = len(amazon)
    if amazonLen > 20:
        t3 = threading.Thread(target=scrapAmazon, args = (amazon[0:amazonLen/2],br))
        t3.daemon = True
        t3.start()
        t4 = threading.Thread(target=scrapAmazon, args = (amazon[amazonLen/2:],br))
        t4.daemon = True
        t4.start()
        threads.append(t4)
    else:
        t3 = threading.Thread(target=scrapAmazon, args = (amazon,br))
        t3.daemon = True
        t3.start()
    threads.append(t1)
    threads.append(t2)
    threads.append(t3)
    for th in threads:
        th.join()

def scrapSnapdeal(snapdealItems):
    for snapdealItem in snapdealItems:
        sdItem = SnapdealItem.get_by(item_id=snapdealItem.item_id)
        if sdItem is None:
            continue
        try:
            url="http://www.snapdeal.com/acors/json/gvbps?supc=%s&catId=91&sort=sellingPrice"%(sdItem.supc)
            print url
            time.sleep(1)
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            json_input = response.read()
            vendorInfo = json.loads(json_input)
            lowestSp, iterator, ourInventory, lowestSellerInventory,ourSp,ourOfferPrice,lowestSp,lowestOfferPrice   = (0,)*8
            lowestSellerName = ''
            for vendor in vendorInfo:
                if iterator == 0:
                    lowestSellerName = vendor['vendorDisplayName']
                    try:
                        lowestSp = vendor['sellingPriceBefIntCashBack']
                    except:
                        lowestSp = vendor['sellingPrice']
                    lowestOfferPrice = vendor['sellingPrice']
                    lowestSellerInventory = vendor['buyableInventory']
                    
                if vendor['vendorDisplayName'] == 'MobilesnMore':
                    ourInventory = vendor['buyableInventory']
                    try:
                        ourSp = vendor['sellingPriceBefIntCashBack']
                    except:
                        ourSp = vendor['sellingPrice']
                    ourOfferPrice = vendor['sellingPrice']
                iterator+=1
        except:
            continue
        snapdealItem.ourSnapdealPrice = ourSp
        snapdealItem.ourSnapdealOfferPrice = ourOfferPrice
        snapdealItem.ourSnapdealInventory = ourInventory
        snapdealItem.lowestSnapdealPrice = lowestSp
        snapdealItem.lowestSnapdealOfferPrice = lowestOfferPrice
        snapdealItem.lowestSnapdealSeller = lowestSellerName 
        snapdealItem.lowestSnapdealSellerInventory = lowestSellerInventory  

def scrapFlipkart(flipkartItems):
    for flipkartItem in flipkartItems:
        scraperFk = FlipkartScraper.FlipkartScraper()
        fkItem = FlipkartItem.get_by(item_id=flipkartItem.item_id)
        if fkItem is None:
            continue
        try:
            url = "http://www.flipkart.com/ps/%s"%(fkItem.flipkartSerialNumber)
            vendorsData = scraperFk.read(url)
            sortedVendorsData = []
            sortedVendorsData = sorted(vendorsData, key=itemgetter('sellingPrice'))
            lowestSellerSp, iterator, ourSp = (0,)*3
            lowestSellerName = ''
            for data in sortedVendorsData:
                if iterator == 0:
                    lowestSellerName = data['sellerName']
                    lowestSellerSp = data['sellingPrice']
                        
                if data['sellerName'] == 'Saholic':
                    ourSp = data['sellingPrice']

                iterator+=1
        except:
            continue
        
        try:
            request_url = "https://api.flipkart.net/sellers/skus/%s/listings"%(str(fkItem.flipkartSerialNumber))
            r = httpRequest.get(request_url, auth=('m2z93iskuj81qiid', '0c7ab6a5-98c0-4cdc-8be3-72c591e0add4'))
            print "Inventory info",r.json()
            stock_count = int((r.json()['attributeValues'])['stock_count'])
        except:
            stock_count = 0
        finally:
                r={}
        flipkartItem.ourFlipkartPrice = ourSp
        flipkartItem.ourFlipkartInventory = stock_count
        flipkartItem.lowestFlipkartPrice = lowestSellerSp
        flipkartItem.lowestFlipkartSeller =  lowestSellerName
        


def close_session():
    if session.is_active:
        print "session is active. closing it."
        session.close()
        
def scrapAmazon(amazonItems,br):
    for amazonItem in amazonItems:
        sc = SellerCentralScraper.SellerCentralScraper()
        skuUrlMfn = "https://sellercentral.amazon.in/myi/search/ProductSummary?keyword=%d" %(amazonItem.item_id)
        skuUrlFba =  "https://sellercentral.amazon.in/myi/search/ProductSummary?keyword=FBA%d" %(amazonItem.item_id)
        try:
            print type(sc.requestSku(br, skuUrlMfn))
            asin, mfnInventory, mfnPrice= sc.requestSku(br, skuUrlMfn)
            fbaAsin, fbaInventory, fbaPrice= sc.requestSku(br, skuUrlFba)
        except Exception as e:
            print e
            print "Unable to fetch details from Seller Central for ",amazonItem.item_id
            continue
        scraperAmazon = AmazonScraper.AmazonScraper()
        try:
            if len(asin)==0 and len(fbaAsin)==0:
                print "No asin found for ",amazonItem.item_id
                continue
            if len(asin)==0:
                asin=fbaAsin
            url = "http://www.amazon.in/gp/offer-listing/%s/ref=olp_sort_ps"%(asin)
            scraperAmazon.read(url,True)
            lowestSp,lowestSeller = scraperAmazon.createData()
            amazonItem.lowestAmazonPrice = lowestSp
            
            print "Details of Item id ",amazonItem.item_id
            print "ASIN ",asin
            print "MFN INVENTORY ",mfnInventory
            print "FBA INVENTORY ",fbaInventory
            print "MFN PRICE ",mfnPrice
            print "FBA PRICE ",fbaPrice
            print "Lowest Seller SP ",lowestSp
            print "LOWEST SELLER NAME ",lowestSeller
            
            amazonItem.ourMfnPrice = float(mfnPrice.replace("Rs.","").replace(",",""))
            amazonItem.ourFbaPrice = float(fbaPrice.replace("Rs.","").replace(",",""))
            amazonItem.ourMfnInventory = int(mfnInventory)
            amazonItem.ourFbaInventory = int(fbaInventory)
            amazonItem.lowestAmazonPrice = float(lowestSp)
            amazonItem.lowestAmazonSeller = lowestSeller

        except Exception as e:
            print e
            print "Unable to fetch details from Amazon Listing page for ",amazonItem.item_id
            continue
        
def getBrowserObject():
    """Build and return a mechanize browser configured to present itself
    as a desktop Chrome client: cookie jar attached, robots.txt ignored,
    redirects/refresh followed, HTTP debugging disabled.
    """
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_cookiejar(cookielib.LWPCookieJar())
    browser.set_handle_equiv(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)
    browser.set_debug_http(False)
    browser.set_debug_redirects(False)
    browser.set_debug_responses(False)
    # Follow meta-refresh, but give up after one second.
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    browser.addheaders = [
        ('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Encoding', 'gzip,deflate,sdch'),
        ('Accept-Language', 'en-US,en;q=0.8'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
    ]
    return browser

def ungzipResponse(r,b):
    headers = r.info()
    if headers['Content-Encoding']=='gzip':
        import gzip
        print "********************"
        print "Deflating gzip response"
        print "********************"
        gz = gzip.GzipFile(fileobj=r, mode='rb')
        html = gz.read()
        gz.close()
        headers["Content-type"] = "text/html; charset=utf-8"
        r.set_data( html )
        b.set_response(r)

    
def login(url):
    br = getBrowserObject()
    br.open(url)
    response = br.open(url)
    ungzipResponse(response, br)
    #html = response.read()
    #print html
    br.select_form(name="signinWidget")
    br.form['username'] = "kshitij.sood@saholic.com"
    br.form['password'] = "pioneer"
    response = br.submit()
    print "********************"
    print "Attempting to Login"
    print "********************"
    #ungzipResponse(response, br)
    return br

        

if __name__ == "__main__":
    parser = optparse.OptionParser()
    parser.add_option("-l", "--logfile", dest="logfile",
                      type="string",
                      help="Log all output to LOG_FILE",
                      )
    parser.add_option("-i", "--pidfile", dest="pidfile",
                      type="string",
                      help="Write the PID to pidfile")
    (options, args) = parser.parse_args()
    daemon = CompetitorScraping(options.logfile, options.pidfile)
    if len(args) == 0:
        daemon.run()
    elif len(args) == 1:
        if 'start' == args[0]:
            daemon.start()
        elif 'stop' == args[0]:
            daemon.stop()
        elif 'restart' == args[0]:
            daemon.restart()
        else:
            print "Unknown command"
            sys.exit(2)
        sys.exit(0)
    else:
        print "usage: %s start|stop|restart" % sys.argv[0]
        sys.exit(2)