Subversion Repositories SmartDukaan

Rev

Rev 170 | Rev 266 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 170 Rev 240
Line 13... Line 13...
13
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
16
from scrapy.log import msg
17
from scrapy.http.response import Response
17
from scrapy.http.response import Response
18
#from datastore.DataAccessor import add_new_phone
-
 
19
 
18
 
-
 
19
 
20
from datastore import DataAccessor
20
from datastore.DataCodeAccessor import *
21
from datastore.DataAccessor import DataHelper
21
from datastore.DataAccessor import *
-
 
22
from html2text.unescaping import *
22
 
23
 
23
class univercell_price(BaseSpider):
24
class univercell_price(BaseSpider):
24
    
25
    
25
    def __init__(self):
26
    def __init__(self):
-
 
27
       initialize_table()
-
 
28
       #UNIVERCELL_DOMAINNAME1 = "univercell1"   
-
 
29
       UNIVERCELL_DOMAINNAME1 = get_code_word("UNIVERCELL_DOMAINNAME1")
26
       self.domain_name = "univercellphones"
30
       self.domain_name = UNIVERCELL_DOMAINNAME1 
-
 
31
       
-
 
32
       # Get the URLs from the database and append them to the list of start URLs for crawling
27
       da = DataHelper()
33
       da = DataHelper()
28
       for pitem in da.get_all_univervendors():
34
       for pitem in da.get_all_univervendors():
29
            self.start_urls.append(pitem.v_site.strip())
35
            self.start_urls.append(pitem.v_site.strip())
30
    
36
    
31
    def start_requests(self):
37
    def start_requests(self):
-
 
38
        
-
 
39
        # A Referer header has to be set on each request
32
        listreq = []
40
        listreq = []
-
 
41
        #UNIVERCELL_REFERER = "www.google.com/search"
-
 
42
        UNIVERCELL_REFERER = get_code_word("UNIVERCELL_REFERER")
33
        for url1 in self.start_urls:
43
        for url1 in self.start_urls:
34
            request = Request(url = url1, callback=self.parse)
44
            request = Request(url = str(url1), callback=self.parse)
35
            request.headers.setdefault("Referer", "www.google.com/search")
45
            request.headers.setdefault("Referer", UNIVERCELL_REFERER)
36
            listreq.append(request)
46
            listreq.append(request)
37
        return listreq
47
        return listreq
38
       
48
        
39
    def parse(self, response):
49
    def parse(self, response):
40
        da = DataHelper()
50
        da = DataHelper()
41
        vatplustax = 0
51
        #VATPLUSTAX = 0
-
 
52
        #removelist is used for converting price to decimal format containing only numbers and '.'
-
 
53
        #UNIVERCELL_REMOVELIST = ["Rs",",","-","/"]
-
 
54
        # The remove-list is stored as a single string with entries separated by ';'
-
 
55
        UNIVERCELL_REMOVELIST = get_code_word("UNIVERCELL_REMOVELIST")
-
 
56
        UNIVERCELL_REMOVELIST = UNIVERCELL_REMOVELIST.split(';')
42
        hxs = HtmlXPathSelector(response)
57
        hxs = HtmlXPathSelector(response)
43
        #sites = hxs.select('//div[@id="productsDiv"]/table/tbody/tr[2]/td/div/table/tbody/tr/td/table/tbody')
58
        #UNIVERCELL_XPATH4 = '//td[@class="gray-border"]' 
44
        #sites = hxs.select('//div[@id="productsDiv"]/table/tr[2]//tr')
59
        UNIVERCELL_XPATH4 = get_code_word("UNIVERCELL_XPATH4")
45
        sites = hxs.select('//td[@class="gray-border"]')
60
        sites = hxs.select(UNIVERCELL_XPATH4)
46
        items = []
61
        items = []
47
        for site in sites:
62
        for site in sites:
48
            item = {}
63
            item = {}
49
            #tmp = site.select('.//tr[2]/td/a/text()')
64
            #UNIVERCELL_XPATH5 = './/tr[2]/td/a/text()'
-
 
65
            UNIVERCELL_XPATH5 = get_code_word("UNIVERCELL_XPATH5")
50
            item['title'] = site.select('.//tr[2]/td/a/text()')[0].extract()
66
            item['title'] = site.select(UNIVERCELL_XPATH5)[0].extract()
51
            #psite = site.select(".//a[3][@href]/@href")[0].extract()
67
            #UNIVERCELL_XPATH6 = './/tr[3]/th/label/text()'
-
 
68
            UNIVERCELL_XPATH6 = get_code_word("UNIVERCELL_XPATH6")
52
            item['price'] =site.select('.//tr[3]/th/label/text()')[0].extract()
69
            item['price'] =site.select(UNIVERCELL_XPATH6)[0].extract()
53
            items.append(item)
70
            items.append(item)
54
        da = DataHelper()
-
 
55
              
71
              
56
        for i in items:
72
        for i in items:
57
            str1 = str(i['title']).strip() 
73
            str1 = str(i['title']).strip()
58
            print str1
74
            amnt = i['price']
59
            amnt = i['price'].replace(",","")
75
            if amnt != '':        
60
            amnt = amnt.replace("Rs", "")
76
                for r in UNIVERCELL_REMOVELIST: 
61
            amnt = amnt.replace("/", "")
77
                    while amnt.find(r) != -1:
62
            amnt = amnt.replace("-", "")
78
                        amnt = amnt.replace(r, "")
63
            amnt = amnt.strip() 
79
            amnt = amnt.strip() 
-
 
80
            # An additional 4% VAT is added on top of the quoted price
64
            vatplustax = 4*int(amnt)/100
81
            UNIVERCELL_VATPLUSTAX = 4*int(amnt)/100
65
            pr = int(amnt) + vatplustax 
82
            pr = int(amnt) + UNIVERCELL_VATPLUSTAX 
66
            #print pr
83
            # Store the model name, the quoted price and the final price
67
            da.add_new_univerphone(str1,amnt,pr) 
84
            da.add_new_univerphone(unescape(str1),amnt,pr) 
68
            
-
 
69
        #lt = len(da.get_all_phones())  
-
 
70
        #print "length" + str(lt)
-
 
71
        #for ph in da.get_all_phones():
-
 
72
         #   print ph
85
                    
73
            
-
 
74
        #f = open('/home/gaurav/twopassscrapy/pricelinks.txt', 'w')
-
 
75
        #for i in items:
-
 
76
            #f.write(i['title'])
-
 
77
            #f.write("\n")
-
 
78
            #f.write(i['link'])
-
 
79
            #f.write("\n")
-
 
80
        #f.close()    
-
 
81
        
-
 
82
SPIDER = univercell_price()
86
SPIDER = univercell_price()
83
 
87