Subversion Repositories SmartDukaan

Rev

Rev 17013 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
17013 manish.sha 1
from dtr.utils.utils import fetchResponseUsingProxy
2
import re
3
import datetime
4
from pyquery import PyQuery
5
import traceback
6
import json
7
import urllib2
8
import gzip
9
import StringIO
10
 
11
 
12
 
13
# HTTP request headers passed to fetchResponseUsingProxy() for every page
# fetch made by this module.
headers = {
    # Browser identity string of a mobile Chrome build on Android.
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19',
    # Content negotiation preferences.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    # NOTE(review): compressed responses are requested here; presumably
    # fetchResponseUsingProxy() decompresses them -- confirm.
    'Accept-Encoding': 'gzip,deflate,sdch',
    # Fixed to the mobile site host.
    'Host': 'm.homeshop18.com',
}
22
 
23
class HomeShop18Scraper:
    """Fetch a HomeShop18 mobile product page and summarise price and stock.

    The product details are read from the JSON argument of the
    ``hs18Cache.addProductItems(...)`` call embedded in one of the page's
    ``<script>`` tags.
    """

    # Maximum number of fetch attempts made by a single read() call.
    MAX_TRIALS = 5
    # Prefix prepended to the product's relative ``imageUrl``.
    IMAGE_BASE_URL = 'http://stat.homeshop18.com/homeshop18'
    # Text that introduces the product JSON inside the page's scripts.
    PRODUCT_JSON_MARKER = 'hs18Cache.addProductItems('

    def __init__(self, livePricing=None):
        """Create a scraper.

        livePricing is stored and forwarded unchanged to
        fetchResponseUsingProxy() on every fetch.
        """
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Fetch ``url`` and return the product summary built by createData().

        A fetch that raises, or a response containing "Server Busy", counts
        as a failed attempt. At most MAX_TRIALS attempts are made per call;
        ``count_trials`` is reset on entry so that one bad URL does not
        disable retries for later calls on the same instance.

        Raises:
            The last fetch exception when every attempt raised, or IOError
            when the final attempt still answered "Server Busy".
            Anything createData() raises for an unusable page.
        """
        self.count_trials = 0
        last_error = None

        while self.count_trials < self.MAX_TRIALS:
            try:
                response_data = fetchResponseUsingProxy(url, headers=headers, livePricing=self.livePricing, proxy=True)
            except Exception as e:
                # Same console output as the former "print 'ERROR: ', e".
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                last_error = e
                continue

            self.response_data = response_data

            if "Server Busy" in self.response_data:
                # Previously retried without any limit (unbounded recursion).
                self.count_trials += 1
                last_error = None
                continue

            return self.createData()

        if last_error is not None:
            raise last_error
        raise IOError('Server Busy after %d attempts: %s' % (self.MAX_TRIALS, url))

    def createData(self):
        """Parse ``self.response_data`` into the product summary dict.

        Returns a dict with keys 'productId', 'price', 'inStock',
        'shippingCharge' and 'thumbnail', and prints the last four values.

        Raises:
            ValueError: when no ``hs18Cache.addProductItems(`` line is
                present in the page's script tags, or its JSON is invalid.
        """
        scripts = PyQuery(self.response_data).find("script")
        product = HomeShop18Scraper._extractProductJson(str(scripts))
        if product is None:
            # Fail with a clear message instead of a TypeError on None.
            raise ValueError('Product JSON (hs18Cache.addProductItems) not found in response')

        data = HomeShop18Scraper._buildProductData(product)
        print('%s %s %s %s' % (data['inStock'], data['price'], data['shippingCharge'], data['thumbnail']))
        return data

    @staticmethod
    def _toLong(value):
        """Convert ``value`` with long() where it exists, otherwise int()."""
        try:
            return long(value)
        except NameError:
            # Interpreters without a separate long type.
            return int(value)

    @staticmethod
    def _extractProductJson(scriptText):
        """Return the decoded JSON argument of addProductItems(), or None.

        ``scriptText`` is scanned line by line; when several lines contain
        the marker, the last one wins. The JSON is taken as the text between
        the marker and the first ');' that follows it on that line.
        """
        marker = HomeShop18Scraper.PRODUCT_JSON_MARKER
        product = None
        for line in scriptText.split('\n'):
            if marker in line:
                product = json.loads(line.strip().split(marker)[1].split(');')[0])
        return product

    @staticmethod
    def _buildProductData(product):
        """Summarise the decoded product JSON into the result dict.

        Stock is the sum of every item's 'stockQuantity'. Price and shipping
        charge are taken from the first item that supplies a non-zero value.
        A missing 'itemList', 'sellingPrice' or 'shippingCharge' is treated
        as absent rather than raising.
        """
        toLong = HomeShop18Scraper._toLong
        thumbnail = HomeShop18Scraper.IMAGE_BASE_URL + str(product['imageUrl'])
        totalStock = 0
        sellingPrice = 0
        shippingCharge = 0

        for item in product.get('itemList') or []:
            totalStock = totalStock + toLong(item['stockQuantity'])
            if sellingPrice == 0 and item.get('sellingPrice') is not None:
                sellingPrice = toLong(item.get('sellingPrice'))
            if shippingCharge == 0 and item.get('shippingCharge') is not None:
                shippingCharge = toLong(item.get('shippingCharge'))

        inStock = 1 if totalStock > 0 else 0

        return {'productId': str(product['productId']), 'price': sellingPrice, 'inStock': inStock, 'shippingCharge': shippingCharge, 'thumbnail': thumbnail}
74
 
75
if __name__ == '__main__':
    # Manual smoke run: scrape one product page and print the result,
    # bracketed by start and end timestamps.
    # Other URLs previously tried here:
    #   http://www.homeshop18.com/spice-full-touch-dual-sim-phone-m6112/mobiles/mobile-phones/product:32866119/cid:3027/
    #   http://m.homeshop18.com/product/stock.mobi?zipCode=110001&productId=32866119
    print(datetime.datetime.now())
    product_scraper = HomeShop18Scraper()
    product_info = product_scraper.read('http://m.homeshop18.com/product.mobi?productId=32998885')
    print(product_info)
    print(datetime.datetime.now())