Subversion Repositories SmartDukaan

Rev

Rev 191 | Rev 263 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 191 Rev 237
Line 13... Line 13...
13
from scrapy.http.headers import Headers
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
15
from scrapy.log import msg
16
from scrapy.http.response import Response
16
from scrapy.http.response import Response
17
from time import *
17
from time import *
-
 
18
from datastore.DataCodeAccessor import *
-
 
19
from datastore.DataAccessor import *
18
 
20
 
19
from datastore import DataAccessor
-
 
20
from datastore.DataAccessor import DataHelper
-
 
21
import urllib
21
import urllib
-
 
22
from html2text.unescaping import *
22
 
23
 
23
class naaptol_price2(BaseSpider):
24
class naaptol_price2(BaseSpider):
24
    
25
    
25
    def __init__(self):
26
    def __init__(self):
26
       self.domain_name = "naaptolphones2"
-
 
27
       #self.start_urls.append("http://www.naaptol.com/price/10415-Fly-Hummer-HT1.html")
-
 
28
       
27
       
-
 
28
       initialize_table()
-
 
29
       #NAAPTOL_DOMAINNAME2 = "naaptol2"   
-
 
30
       NAAPTOL_DOMAINNAME2 = get_code_word("NAAPTOL_DOMAINNAME2")
-
 
31
       self.domain_name = NAAPTOL_DOMAINNAME2 
-
 
32
       
-
 
33
       # get urls from the database and append them in the list for crawling
29
       da = DataHelper()
34
       da = DataHelper()
30
       for pitem in da.get_allmorenaaptolurls():
35
       for pitem in da.get_allmorenaaptolurls():
31
            self.start_urls.append(pitem.url.strip())
36
            self.start_urls.append(pitem.url.strip())
32
    
37
    
33
    def start_requests(self):
38
    def start_requests(self):
-
 
39
        #for each request a referer has to be set
34
        listreq = []
40
        listreq = []
-
 
41
        #NAAPTOL_REFERER = "http://www.google.com"
-
 
42
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
35
        for url1 in self.start_urls:
43
        for url1 in self.start_urls:
36
            request = Request(url = url1, callback=self.parse)
44
            request = Request(url = str(url1), callback=self.parse)
37
            request.headers.setdefault("Referer", "www.naaptol.com")
45
            request.headers.setdefault("Referer", NAAPTOL_REFERER)
38
            listreq.append(request)
46
            listreq.append(request)
39
        return listreq
47
        return listreq
-
 
48
    
40
       
49
       
41
    def parse(self, response): 
50
    def parse(self, response): 
-
 
51
        da = DataHelper()
42
        f = open('/home/gaurav/Desktop/response.txt', 'w')
52
        #NAAPTOL_REMOVELIST = ["Rs.",","]
43
        f.write(response.body)
53
        #list separated by ';'
-
 
54
        NAAPTOL_REMOVELIST = get_code_word("NAAPTOL_REMOVELIST")
-
 
55
        NAAPTOL_REMOVELIST = NAAPTOL_REMOVELIST.split(';')
44
        print "  url  " + response.url
56
        #retrieving name from the url
45
        name = str(response.url)
57
        name = str(response.url)
-
 
58
        name = unescape(name)
46
        name_pos = name.rfind("/")
59
        name_pos = name.rfind("/")
47
        name = name[name_pos+1:len(name)-5]
60
        name = name[name_pos+1:len(name)-5]
-
 
61
        name_pos = name.find("-")
-
 
62
        name = name[name_pos+1:len(name)]
-
 
63
            
48
        hxs = HtmlXPathSelector(response)
64
        hxs = HtmlXPathSelector(response)
-
 
65
       
49
        da = DataHelper()
66
        #price1 and price2 determine range
50
       #prices = hxs.select('//table[@class ="ProductDetails"]/tbody/tr[6]/td/span/text()')
67
        #NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
-
 
68
        NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
51
        prices = hxs.select('//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()')
69
        prices = hxs.select(NAAPTOL_XPATH2)
52
        try:
70
        try:
53
            price1 = prices.extract()[0]
71
            price1 = prices.extract()[0]
54
            price1 = price1.decode("utf-8")
72
            #price1 = price1.decode("utf-8")
55
            price1 = price1.strip()
73
            price1 = price1.strip()
56
        except:
74
        except:
57
            price1 = ""
75
            price1 = ""
58
        
76
        
59
        try:
77
        try:
60
            price2 = prices.extract()[1]
78
            price2 = prices.extract()[1]
61
            price2 = price2.decode("utf-8")
79
            #price2 = price2.decode("utf-8")
62
            price2 = price2.strip()
80
            price2 = price2.strip()
63
        except:
81
        except:
64
            price2 = ""
82
            price2 = ""
65
        
83
        
66
        try:
84
        try:
67
            if price1 == "" and price2 == "":
85
            if price1 == "" and price2 == "":
68
                prices = hxs.select('//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()')
86
                #NAAPTOL_XPATH3 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()'
-
 
87
                NAAPTOL_XPATH3 = get_code_word("NAAPTOL_XPATH3")
-
 
88
                prices = hxs.select(NAAPTOL_XPATH3)
69
                price = str(prices.extract()[0])
89
                price = str(prices.extract()[0])
70
                pos1 = price.find("'")
90
                pos1 = price.find("'")
71
                pos2 = price.find("'",pos1+1,len(price))
91
                pos2 = price.find("'",pos1+1,len(price))
72
                price1 = price[pos1+1:pos2] + "(approx)" 
92
                price1 = price[pos1+1:pos2] + "(approx)" 
73
                price2 = ""
93
                price2 = ""
74
        except:
94
        except:
75
            price1 = price2 = ""
95
            price1 = price2 = ""
-
 
96
        #removelist is used for converting price to decimal format containing only numbers and '.'    
76
            
97
            
77
        if price1 != '':
98
        if price1 != '':
-
 
99
            for r in NAAPTOL_REMOVELIST: 
78
            price1 = price1.replace("Rs.", "")
100
                while price1.find(r) != -1:
79
            price1 = price1.replace(",", "")
101
                    price1 = price1.replace(r, "")
80
            price1 = price1.strip()
102
            price1 = price1.strip()
81
        if price2 != '':        
103
        if price2 != '':        
-
 
104
            for r in NAAPTOL_REMOVELIST: 
82
            price2 = price2.replace("Rs.", "")
105
                while price2.find(r) != -1:
83
            price2 = price2.replace(",", "")
106
                    price2 = price2.replace(r, "")
84
            price2 = price2.strip()
107
            price2 = price2.strip()
85
        
108
        
86
        if price1 == "Rates Not Available":
109
        if price1 == "Rates Not Available":
87
            price1 = price2 = ""
110
            price1 = price2 = ""
88
        
111
        
89
        print name
-
 
90
        print price1
-
 
91
        print price2
112
        #range = price1 to price2
92
        print "\n"
-
 
93
        range = price1
113
        range = price1
94
        if price2 != "":
114
        if price2 != "":
95
            range = str(range) + " to " 
115
            range = str(range) + " to " 
96
            range = range + str(price2) 
116
            range = range + str(price2) 
97
            da.add_new_naaptolphone(name, range)
117
        da.add_new_naaptolphone(name, range)
98
            
118
        
99
        
119
        
100
        OnlineSellers_pricelist = []
120
        OnlineSellers_pricelist = []
101
        OnlineSellers_namelist = []
121
        OnlineSellers_namelist = []
102
        try:
122
        try:
-
 
123
            #ct1 holds the count of online sellers
103
            ct1 = hxs.select('//div[@id="OnlineSellers"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()')
124
            #NAAPTOL_XPATH4 = '//div[@id="OnlineSellers"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
-
 
125
            NAAPTOL_XPATH4 = get_code_word("NAAPTOL_XPATH4")
-
 
126
            ct1 = hxs.select(NAAPTOL_XPATH4)
104
            ct1 = str(ct1.extract()[0])
127
            ct1 = str(ct1.extract()[0])
105
            ct1 = ct1.decode("utf-8")
128
            ct1 = ct1.decode("utf-8")
106
            ct1 = ct1.strip()
129
            ct1 = ct1.strip()
107
            ps1 = ct1.find(" ")
130
            ps1 = ct1.find(" ")
108
            ct1 = ct1[0:ps1]
131
            ct1 = ct1[0:ps1]
109
            ct1 = int(ct1)
132
            ct1 = int(ct1)
110
        except:
133
        except:
111
            ct1 = 0
134
            ct1 = 0
112
        ct = ct1
135
        ct = ct1
113
        i = 0
136
        i = 0
114
        os_info = hxs.select('//div[@id="onSellerContents"]//td[@class="price"]')
137
        #NAAPTOL_XPATH5 = '//div[@id="onSellerContents"]//td[@class="price"]'
-
 
138
        NAAPTOL_XPATH5 = get_code_word("NAAPTOL_XPATH5")
115
        #print len(os_info)
139
        os_info = hxs.select(NAAPTOL_XPATH5)
116
        while ct > 0:
140
        while ct > 0:
117
            os = os_info[i].extract()
141
            os = os_info[i].extract()
118
            ps1 = os.find(">")
142
            ps1 = os.find(">")
119
            ps2 = os.find("<",ps1)
143
            ps2 = os.find("<",ps1)
120
            os = os[ps1+1:ps2]
144
            os = os[ps1+1:ps2]
-
 
145
            
121
            os = os.replace("Rs.", "")
146
            if os != '':        
-
 
147
                for r in NAAPTOL_REMOVELIST: 
-
 
148
                    while os.find(r) != -1:
122
            os = os.replace(",", "")
149
                        os = os.replace(r, "")
123
            os = urllib.unquote(os)
150
            os = urllib.unquote(os)
124
            try:
151
            try:
125
                os = int(os)
152
                os = int(os)
126
            except:
153
            except:
-
 
154
                #stored in format different than previous one
127
                os = os_info[i].extract()
155
                os = os_info[i].extract()
128
                ps1 = os.find(">",ps2)
156
                ps1 = os.find(">",ps2)
129
                ps2 = os.find("<",ps1)
157
                ps2 = os.find("<",ps1)
130
                os = os[ps1+1:ps2]
158
                os = os[ps1+1:ps2]
131
                os = os.replace("Rs.", "")
159
                if os != '':        
-
 
160
                    for r in NAAPTOL_REMOVELIST: 
-
 
161
                        while os.find(r) != -1:
132
                os = os.replace(",", "")
162
                            os = os.replace(r, "")
133
                os = urllib.unquote(os)
163
                os = urllib.unquote(os)
134
                os = int(os) 
164
                os = int(os) 
135
            print os
165
            
136
            OnlineSellers_pricelist.append(os)
166
            OnlineSellers_pricelist.append(os)
137
            
167
            
138
            path = '//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop'
168
            #NAAPTOL_XPATH6 = '//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop'  
-
 
169
            NAAPTOL_XPATH6 = get_code_word("NAAPTOL_XPATH6")
-
 
170
            #NAAPTOL_XPATH7 = '"]/span/text()'
-
 
171
            NAAPTOL_XPATH7 = get_code_word("NAAPTOL_XPATH7")
139
            path = path + str(i) 
172
            NAAPTOL_XPATH6 = NAAPTOL_XPATH6 + str(i) 
-
 
173
            NAAPTOL_XPATH6 = NAAPTOL_XPATH6 + NAAPTOL_XPATH7 
140
            path = path + '"]/span/text()'
174
            path = NAAPTOL_XPATH6
141
            osname = hxs.select(path)
175
            osname = hxs.select(path)
142
            #print len(osname)
-
 
143
            osname = osname.extract()[0]
176
            osname = osname.extract()[0]
-
 
177
            osname = unescape(osname)
144
            osname = urllib.unquote(osname)
178
            osname = urllib.unquote(osname)
145
            OnlineSellers_namelist.append(osname)
179
            OnlineSellers_namelist.append(osname)
146
            print osname
-
 
147
            i = i+1
180
            i = i+1
148
            ct = ct-1
181
            ct = ct-1
149
            
182
            
150
        l = len(OnlineSellers_pricelist)
183
        l = len(OnlineSellers_pricelist)
151
        i = 0
184
        i = 0 
152
        
-
 
153
        nid = da.get_naaptolphone(name,range).id
185
        nid = da.get_naaptolphone(name,range).id
154
        while l > 0:
186
        while l > 0:
155
            da.add_new_ntonlinesp(nid, OnlineSellers_namelist[i], OnlineSellers_pricelist[i])
187
            da.add_new_ntonlinesp(nid, OnlineSellers_namelist[i], OnlineSellers_pricelist[i])
156
            i = i+1
188
            i = i+1
157
            l = l-1
189
            l = l-1
158
        
190
        
159
        LocalSellers_pricelist = []   
191
        LocalSellers_pricelist = []   
160
        LocalSellers_namelist = []
192
        LocalSellers_namelist = []
161
        try:
193
        try:
-
 
194
            #ct1 holds the count of local sellers
162
            ct1 = hxs.select('//div[@id="LocalStores"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()')
195
            #NAAPTOL_XPATH8 = '//div[@id="LocalStores"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
-
 
196
            NAAPTOL_XPATH8 = get_code_word("NAAPTOL_XPATH8")
-
 
197
            ct1 = hxs.select(NAAPTOL_XPATH8)
163
            ct1 = str(ct1.extract()[0])
198
            ct1 = str(ct1.extract()[0])
164
            ct1 = ct1.decode("utf-8")
199
            ct1 = ct1.decode("utf-8")
165
            ct1 = ct1.strip()
200
            ct1 = ct1.strip()
166
            ps1 = ct1.find(" ")
201
            ps1 = ct1.find(" ")
167
            ct1 = ct1[0:ps1]
202
            ct1 = ct1[0:ps1]
168
            ct1 = int(ct1)
203
            ct1 = int(ct1)
169
        except:
204
        except:
170
            ct1 = 0
205
            ct1 = 0
171
        ct = ct1
206
        ct = ct1
172
        i = 0
207
        i = 0
173
        os_info = hxs.select('//div[@id="offSellerContents"]//td[@class="price"]')
208
        #NAAPTOL_XPATH9 = '//div[@id="offSellerContents"]//td[@class="price"]'
-
 
209
        NAAPTOL_XPATH9 = get_code_word("NAAPTOL_XPATH9")
174
        os_names = hxs.select('//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()')
210
        #NAAPTOL_XPATH10 = '//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()'
-
 
211
        NAAPTOL_XPATH10 = get_code_word("NAAPTOL_XPATH10")
175
        #print len(os_info)
212
        os_info = hxs.select(NAAPTOL_XPATH9)
-
 
213
        os_names = hxs.select(NAAPTOL_XPATH10)
-
 
214
        
176
        while ct > 0:
215
        while ct > 0:
177
            os = os_info[i].extract()
216
            os = os_info[i].extract()
178
            osname = os_names[i].extract() 
217
            osname = os_names[i].extract() 
179
            #os = os.encode("utf-8")
-
 
180
            ps1 = os.find(">")
218
            ps1 = os.find(">")
181
            ps2 = os.find("<",ps1)
219
            ps2 = os.find("<",ps1)
182
            os = os[ps1+1:ps2]
220
            os = os[ps1+1:ps2]
183
            os = os.replace("Rs.", "")
221
            if os != '':        
-
 
222
                for r in NAAPTOL_REMOVELIST: 
-
 
223
                    while os.find(r) != -1:
184
            os = os.replace(",", "")
224
                        os = os.replace(r, "")
185
            os = urllib.unquote(os)
225
            os = urllib.unquote(os)
186
            osname = urllib.unquote(osname)
226
            osname = urllib.unquote(osname)
-
 
227
            osname = unescape(osname)
187
            try:
228
            try:
188
                os = int(os)
229
                os = int(os)
189
            except:
230
            except:
-
 
231
                #stored in format different than previous one
190
                os = os_info[i].extract()
232
                os = os_info[i].extract()
191
                ps1 = os.find(">",ps2)
233
                ps1 = os.find(">",ps2)
192
                ps2 = os.find("<",ps1)
234
                ps2 = os.find("<",ps1)
193
                os = os[ps1+1:ps2]
235
                os = os[ps1+1:ps2]
194
                os = os.replace("Rs.", "")
236
                if os != '':        
-
 
237
                    for r in NAAPTOL_REMOVELIST: 
-
 
238
                        while os.find(r) != -1:
195
                os = os.replace(",", "")
239
                            os = os.replace(r, "")
196
                os = urllib.unquote(os)
240
                os = urllib.unquote(os)
197
                os = int(os)        
241
                os = int(os)        
198
            print os
-
 
199
            print osname
-
 
200
            LocalSellers_pricelist.append(os)
242
            LocalSellers_pricelist.append(os)
201
            LocalSellers_namelist.append(osname)
243
            LocalSellers_namelist.append(osname)
202
            i = i+1
244
            i = i+1
203
            ct = ct-1
245
            ct = ct-1
204
            
246
            
205
        l = len(LocalSellers_pricelist)
247
        l = len(LocalSellers_pricelist)
206
        i = 0
248
        i = 0
207
            
-
 
208
        nid = da.get_naaptolphone(name,range).id
249
        nid = da.get_naaptolphone(name,range).id
209
        while l > 0:
250
        while l > 0:
210
            da.add_new_ntofflinesp(nid, LocalSellers_namelist[i], LocalSellers_pricelist[i])
251
            da.add_new_ntofflinesp(nid, LocalSellers_namelist[i], LocalSellers_pricelist[i])
211
            i = i+1
252
            i = i+1
212
            l = l-1   
-
 
213
        '''
-
 
214
        l = len(OnlineSellers_list)
-
 
215
        i = 0
-
 
216
        while l > 0:
-
 
217
            #print OnlineSellers_list[i]
-
 
218
            i = i+1
-
 
219
            l = l-1 
253
            l = l-1
220
        '''    
-
 
221
        
-
 
222
        f.close()
-
 
223
               
-
 
224
    '''    
-
 
225
        site = response.url
-
 
226
        vatplustax = 0
-
 
227
        pos1 = pos2 = 0
-
 
228
        temp = ""
-
 
229
        pos1 = site.rfind('/')
-
 
230
        if pos1 != -1:
-
 
231
            temp = site[pos1+1:len(site)]
-
 
232
            #pos2 = site.rfind('/',0,pos1-1)
-
 
233
        #if pos2 > 0:
-
 
234
            #temp = site[pos2+1:len(site)]
-
 
235
        pos3 = temp.find('.')
-
 
236
        temp1 = temp[pos3:len(temp)]
-
 
237
        name = temp.replace(temp1,"")         
-
 
238
        hxs = HtmlXPathSelector(response)
-
 
239
        prices = hxs.select('//div[@id ="priceComp"]//tr[2]/td[3]/span/text()')
-
 
240
        
254
 
241
        da = DataHelper()
-
 
242
        for price in prices:
-
 
243
             name = str(name).strip()
-
 
244
             price = price.extract()
-
 
245
             price = str(price).strip()
-
 
246
             price = price.replace("Rs", "")
-
 
247
             price = price.replace("/", "")
-
 
248
             price = price.replace("-", "")
-
 
249
             price = price.replace(".", "")
-
 
250
             shown_pr = int(price)
-
 
251
             final_pr = shown_pr + vatplustax
-
 
252
             da.add_new_mobstorephone(name,shown_pr,final_pr)
-
 
253
             print name
-
 
254
             print final_pr
-
 
255
             print "\n"
-
 
256
          
-
 
257
        for i in items:
-
 
258
            str1 = str(i['title']).strip() 
-
 
259
            print str1
-
 
260
            amnt = i['price'].replace(",","")
-
 
261
            amnt = amnt.replace("Rs", "")
-
 
262
            amnt = amnt.replace("/", "")
-
 
263
            amnt = amnt.replace("-", "")
-
 
264
            amnt = amnt.strip()
-
 
265
            vatplustax = 4*int(amnt)/100
-
 
266
            pr = int(amnt) + vatplustax 
-
 
267
            #print pr
-
 
268
            da.add_new_univerphone(str1,amnt,pr)
-
 
269
        '''        
-
 
270
SPIDER = naaptol_price2()
255
SPIDER = naaptol_price2()
271
 
256