Subversion Repositories SmartDukaan

Rev

Rev 169 | Rev 265 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 169 Rev 239
Line 14... Line 14...
14
from scrapy.http.headers import Headers
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
16
from scrapy.log import msg
17
from scrapy.http.response import Response
17
from scrapy.http.response import Response
18
 
18
 
19
from datastore import DataAccessor
19
from datastore.DataCodeAccessor import *
20
from datastore.DataAccessor import DataHelper
20
from datastore.DataAccessor import *
-
 
21
from html2text.unescaping import *
-
 
22
 
21
 
23
 
22
 
24
 
23
class vendor_links(BaseSpider):
    """Scrapy spider that harvests phone-vendor links from univercell.in.

    All site-specific configuration (domain name, start URL, homepage,
    referer, XPath expressions and URL-rewriting tokens) is looked up at
    runtime through get_code_word() so it can be changed in the datastore
    without redeploying the spider. The historical hard-coded values are
    kept alongside each lookup as comments.
    """

    def __init__(self):
        # Load the code-word table before any get_code_word() lookups.
        initialize_table()
        #UNIVERCELL_DOMAINNAME = "univercell"
        UNIVERCELL_DOMAINNAME = get_code_word("UNIVERCELL_DOMAINNAME")
        self.domain_name = UNIVERCELL_DOMAINNAME
        #UNIVERCELL_URL = "http://www.univercell.in/mobiles/populateStore.action"
        UNIVERCELL_URL = get_code_word("UNIVERCELL_URL")
        # NOTE(review): start_urls here is presumably the list inherited
        # from BaseSpider; if that is a shared class-level list, append()
        # mutates it for every instance -- confirm, and assign
        # self.start_urls = [UNIVERCELL_URL] if isolation is needed.
        self.start_urls.append(UNIVERCELL_URL)

    def start_requests(self):
        """Register the supplier in the datastore, then return one
        Request per start URL, each with a Referer header set
        (the site apparently expects one -- TODO confirm)."""
        # Record this supplier (name + homepage) before crawling.
        #UNIVERCELL_HOMEPAGE = "http://www.univercell.in"
        UNIVERCELL_HOMEPAGE = get_code_word("UNIVERCELL_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, UNIVERCELL_HOMEPAGE)

        #UNIVERCELL_REFERER = "www.google.com/search"
        UNIVERCELL_REFERER = get_code_word("UNIVERCELL_REFERER")
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            request.headers.setdefault("Referer", UNIVERCELL_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
        """Extract (vendor name, vendor URL) pairs from the store page
        and persist them via DataHelper.add_univervendor()."""
        # Base URL used to absolutize the relative hrefs found below.
        #UNIVERCELL_URL1 = "http://www.univercell.in"
        UNIVERCELL_URL1 = get_code_word("UNIVERCELL_URL1")
        hxs = HtmlXPathSelector(response)
        #UNIVERCELL_XPATH1 = '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'
        UNIVERCELL_XPATH1 = get_code_word("UNIVERCELL_XPATH1")
        vendor_info = hxs.select(UNIVERCELL_XPATH1)

        # Loop-invariant lookups hoisted out of the row loop: they were
        # previously fetched once per row with identical arguments and
        # therefore identical results.
        #UNIVERCELL_XPATH2 = './/a/text()'
        UNIVERCELL_XPATH2 = get_code_word("UNIVERCELL_XPATH2")
        #UNIVERCELL_XPATH3 = './/a/@href'
        UNIVERCELL_XPATH3 = get_code_word("UNIVERCELL_XPATH3")
        #UNIVERCELL_VAR1 = ";"
        UNIVERCELL_VAR1 = get_code_word("UNIVERCELL_VAR1")
        #UNIVERCELL_VAR2 = "?"
        UNIVERCELL_VAR2 = get_code_word("UNIVERCELL_VAR2")
        #UNIVERCELL_VAR3 = "populate"
        UNIVERCELL_VAR3 = get_code_word("UNIVERCELL_VAR3")
        #UNIVERCELL_VAR4 = "rePopulate"
        UNIVERCELL_VAR4 = get_code_word("UNIVERCELL_VAR4")

        items = []
        for i in vendor_info:
            item = {}
            item['name'] = i.select(UNIVERCELL_XPATH2)[0].extract()
            temp = i.select(UNIVERCELL_XPATH3)[0].extract()

            # The data-bearing page uses "rePopulate" instead of "populate"
            # in its URL: drop the href segment between ";" and "?"
            # (presumably a ";jsessionid=..." token -- confirm) and swap
            # the action name.
            # NOTE(review): str.find() returns -1 when a token is absent,
            # which would slice/replace the wrong span -- confirm the
            # scraped hrefs always contain both tokens.
            a = str(temp).find(UNIVERCELL_VAR1)
            b = str(temp).find(UNIVERCELL_VAR2)
            temp1 = str(temp)[a:b]
            temp2 = str(temp).replace(temp1, "")
            item['site'] = str(temp2).replace(UNIVERCELL_VAR3, UNIVERCELL_VAR4)
            items.append(item)

        # Persist each vendor as (HTML-unescaped name, absolute URL).
        da = DataHelper()
        for item in items:
            str2 = UNIVERCELL_URL1 + str(item['site'])
            da.add_univervendor(unescape(item['name'].strip()), unescape(str2))


SPIDER = vendor_links()
61
95