# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 17013 | Blame | Compare with Previous | Last modification | View Log | RSS feed

from dtr.utils.utils import fetchResponseUsingProxy
import re
import datetime
from pyquery import PyQuery
import traceback
import json
import urllib2
import gzip
import StringIO



# HTTP request headers passed to fetchResponseUsingProxy for every page fetch.
# The User-Agent identifies the client as mobile Chrome on an Android Nexus 5
# and Host pins the request to the mobile site.
# NOTE(review): Accept-Encoding advertises gzip/deflate/sdch; presumably the
# fetch helper decompresses the body -- confirm against dtr.utils.utils.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) '
        'AppleWebKit/535.19 (KHTML, like Gecko) '
        'Chrome/18.0.1025.166 Mobile Safari/535.19'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Host': 'm.homeshop18.com',
}

class HomeShop18Scraper:
    """Fetch a HomeShop18 mobile product page and extract price/stock data.

    The page is downloaded with ``fetchResponseUsingProxy`` using the
    module-level ``headers``; the product details are read from the JSON
    argument of the ``hs18Cache.addProductItems(...)`` call embedded in one
    of the page's ``<script>`` tags.
    """

    # Maximum number of failed attempts (fetch errors and "Server Busy"
    # pages combined) tolerated by a single ``read`` call.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None):
        """Create a scraper.

        :param livePricing: opaque flag forwarded unchanged to
            ``fetchResponseUsingProxy``.
        """
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Download ``url`` and return the parsed product data.

        Failed fetches and "Server Busy" responses are retried until
        ``MAX_TRIALS`` failures have accumulated for this call.

        :param url: product page URL to fetch.
        :returns: the dict built by ``createData``.
        :raises Exception: the last fetch error, once the retry budget is
            exhausted by fetch failures.
        :raises RuntimeError: if the server still answers "Server Busy"
            after the retry budget is exhausted.
        :raises ValueError: from ``createData`` when the page carries no
            product JSON.
        """
        # Fresh budget per call so a reused scraper does not inherit the
        # failures of earlier URLs.
        self.count_trials = 0

        # Iterative retry: the earlier recursive form had no bound on the
        # "Server Busy" path.
        while True:
            try:
                response_data = fetchResponseUsingProxy(
                    url, headers=headers, livePricing=self.livePricing, proxy=True)
            except Exception as e:
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    continue
                self.response_data = ""
                # Surface the real fetch failure instead of letting an empty
                # page fail later with an unrelated parsing error.
                raise

            self.response_data = response_data

            if "Server Busy" in response_data:
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    continue
                raise RuntimeError(
                    'Server Busy after %d attempts: %s' % (self.count_trials, url))

            return self.createData()

    @staticmethod
    def _extractProductJson(scriptText):
        """Return the decoded first argument of ``hs18Cache.addProductItems(``.

        Scans ``scriptText`` line by line; when several lines contain the
        call, the last one wins. Returns ``None`` when no line contains it.
        """
        marker = 'hs18Cache.addProductItems('
        decoder = json.JSONDecoder()
        productJson = None
        for line in scriptText.split('\n'):
            if marker in line:
                payload = line.split(marker, 1)[1].lstrip()
                # raw_decode stops at the end of the first JSON value, so a
                # ");" inside a string value or trailing call arguments do
                # not truncate or corrupt the payload.
                productJson, _ = decoder.raw_decode(payload)
        return productJson

    def createData(self):
        """Parse ``self.response_data`` into a product summary dict.

        :returns: dict with keys ``productId`` (str), ``price`` (first
            non-zero ``sellingPrice`` among the items), ``inStock`` (1 when
            the summed ``stockQuantity`` is positive, else 0),
            ``shippingCharge`` (first non-zero, non-None ``shippingCharge``)
            and ``thumbnail`` (image URL).
        :raises ValueError: if no ``hs18Cache.addProductItems(`` call is
            found in the page's script tags, or its argument is not valid
            JSON.
        """
        pq = PyQuery(self.response_data)
        scriptTags = pq.find("script")
        requiredJson = self._extractProductJson(str(scriptTags))
        if requiredJson is None:
            raise ValueError(
                'hs18Cache.addProductItems(...) payload not found in response')

        thumbnail = 'http://stat.homeshop18.com/homeshop18' + str(requiredJson['imageUrl'])
        totalStock = 0
        sellingPrice = 0
        shippingCharge = 0
        for item in requiredJson.get('itemList'):
            totalStock += long(item['stockQuantity'])
            # Keep the first non-zero price / shipping charge encountered.
            if sellingPrice == 0:
                sellingPrice = long(item.get('sellingPrice'))
            if item.get('shippingCharge') is not None and shippingCharge == 0:
                shippingCharge = long(item.get('shippingCharge'))

        inStock = 1 if totalStock > 0 else 0

        print('%s %s %s %s' % (inStock, sellingPrice, shippingCharge, thumbnail))
        return {
            'productId': str(requiredJson['productId']),
            'price': sellingPrice,
            'inStock': inStock,
            'shippingCharge': shippingCharge,
            'thumbnail': thumbnail,
        }
    
if __name__ == '__main__':
    # Manual smoke test: scrape one product page and print the result,
    # bracketed by timestamps so the elapsed time can be read off the output.
    #
    # Other URLs previously tried by hand:
    #   http://www.homeshop18.com/spice-full-touch-dual-sim-phone-m6112/mobiles/mobile-phones/product:32866119/cid:3027/
    #   http://m.homeshop18.com/product/stock.mobi?zipCode=110001&productId=32866119
    sampleUrl = 'http://m.homeshop18.com/product.mobi?productId=32998885'

    print(datetime.datetime.now())
    productScraper = HomeShop18Scraper()
    productInfo = productScraper.read(sampleUrl)
    print(productInfo)
    print(datetime.datetime.now())