Subversion Repositories SmartDukaan

Rev

Rev 169 | Rev 265 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
138 ashish 1
'''
2
Created on 14-May-2010
3
 
4
@author: gaurav
5
'''
152 ashish 6
 
7
 
8
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
11
 
12
from demo.items import DemoItem
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
17
from scrapy.http.response import Response
18
 
239 ashish 19
from datastore.DataCodeAccessor import *
20
from datastore.DataAccessor import *
21
from html2text.unescaping import *
152 ashish 22
 
23
 
239 ashish 24
 
152 ashish 25
class vendor_links(BaseSpider):
    """Spider that collects the vendor names and listing links of Univercell.

    Every site-specific value (URLs, XPath expressions, string markers) is
    looked up by key through ``get_code_word``.  The vendor names and cleaned
    listing URLs found on the start page are stored through
    ``DataHelper.add_univervendor``.
    """

    def __init__(self):
        """Initialise the code-word table and register the start URL."""
        initialize_table()
        # Example value noted in an earlier revision: "univercell"
        self.domain_name = get_code_word("UNIVERCELL_DOMAINNAME")
        # Example value noted in an earlier revision:
        # "http://www.univercell.in/mobiles/populateStore.action"
        start_url = get_code_word("UNIVERCELL_URL")
        # Build a fresh per-instance list instead of appending in place.  No
        # instance-level list is created here, so ``self.start_urls`` may
        # resolve to a list shared at class level (BaseSpider is not visible
        # from this file); appending to it would leak this URL into every
        # other spider that shares the same list.
        self.start_urls = list(self.start_urls) + [start_url]

    def start_requests(self):
        """Register the supplier and build the initial requests.

        Returns:
            A list with one ``Request`` per entry of ``self.start_urls``.
            Each request uses ``self.parse`` as callback and carries a
            ``Referer`` header (set only if not already present).
        """
        # Record the supplier, i.e. its name and its site.
        # Example value noted in an earlier revision: "http://www.univercell.in"
        homepage = get_code_word("UNIVERCELL_HOMEPAGE")
        DataHelper().add_supplier(self.domain_name, homepage)

        # A referer has to be set for each request.
        # Example value noted in an earlier revision: "www.google.com/search"
        referer_value = get_code_word("UNIVERCELL_REFERER")

        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    @staticmethod
    def _strip_segment(url, start_marker, end_marker):
        """Return ``url`` without the part from ``start_marker`` up to ``end_marker``.

        The text starting at the first ``start_marker`` and ending just before
        the first ``end_marker`` is removed.  If ``start_marker`` is missing,
        or ``end_marker`` occurs before it, ``url`` is returned unchanged.  If
        ``end_marker`` is missing, everything from ``start_marker`` to the end
        of the string is removed.
        """
        start = url.find(start_marker)
        end = url.find(end_marker)
        if start == -1 or (end != -1 and end < start):
            return url
        if end == -1:
            # Slicing with the raw -1 would keep a stray last character.
            return url[:start]
        return url[:start] + url[end:]

    def parse(self, response):
        """Extract the vendor names and links from ``response`` and store them.

        Rows that do not contain both a link text and a link target are
        skipped.  Each stored link is the base URL followed by the cleaned
        relative link.  Nothing is returned.
        """
        # Base URL needed to build complete URLs for the phone listings.
        # Example value noted in an earlier revision: "http://www.univercell.in"
        base_url = get_code_word("UNIVERCELL_URL1")
        # Example: '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'
        rows_xpath = get_code_word("UNIVERCELL_XPATH1")
        # The remaining lookups do not depend on the row, so they are done
        # once here rather than once per row.
        # Examples: './/a/text()' and './/a/@href'
        name_xpath = get_code_word("UNIVERCELL_XPATH2")
        href_xpath = get_code_word("UNIVERCELL_XPATH3")
        # Examples: ";" and "?"
        segment_start = get_code_word("UNIVERCELL_VAR1")
        segment_end = get_code_word("UNIVERCELL_VAR2")
        # The page holding the data has a URL containing the second word
        # instead of the first.  Examples: "populate" and "rePopulate"
        old_action = get_code_word("UNIVERCELL_VAR3")
        new_action = get_code_word("UNIVERCELL_VAR4")

        hxs = HtmlXPathSelector(response)
        vendors = []
        for row in hxs.select(rows_xpath):
            names = row.select(name_xpath)
            hrefs = row.select(href_xpath)
            if not names or not hrefs:
                # A row without a link must not abort the whole page.
                continue
            href = str(hrefs[0].extract())
            site = self._strip_segment(href, segment_start, segment_end)
            site = site.replace(old_action, new_action)
            vendors.append((names[0].extract(), site))

        da = DataHelper()
        for name, site in vendors:
            da.add_univervendor(unescape(name.strip()), unescape(base_url + site))
152 ashish 93
 
94
# Module-level spider instance, created at import time; constructing it runs
# vendor_links.__init__, which calls initialize_table() and get_code_word().
# NOTE(review): presumably the name older Scrapy releases look up to discover
# the spider -- confirm against the Scrapy version in use.
SPIDER = vendor_links()