Subversion Repositories SmartDukaan

Rev

Rev 265 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
'''
Created on 14-May-2010

@author: gaurav
'''
152 ashish 6
 
7
 
8
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
11
 
12
from demo.items import DemoItem
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
17
from scrapy.http.response import Response
18
 
239 ashish 19
from datastore.DataCodeAccessor import *
20
from datastore.DataAccessor import *
21
from html2text.unescaping import *
152 ashish 22
 
23
 
239 ashish 24
 
152 ashish 25
class vendor_links(BaseSpider):
    """Spider that collects the URL for each individual vendor.

    The discovered (vendor name, vendor URL) pairs are stored in the
    table datastore_datadefinition_univercell_data via DataHelper.
    """

    def __init__(self):
        """Initialise datastore tables, the spider's name and start URL.

        initialize_table() makes all the datastore tables known in the
        scope of this class.  domain_name is the name by which this
        spider is known outside, so it is the argument used when
        launching the spider.
        """
        initialize_table()
        # e.g. "univercell"
        self.domain_name = get_code_word("UNIVERCELL_DOMAINNAME")
        # Use an instance-level list: appending to the class-level
        # start_urls inherited from BaseSpider would mutate state that
        # is shared by every spider sharing that class attribute.
        # e.g. "http://www.univercell.in/mobiles/populateStore.action"
        self.start_urls = [get_code_word("UNIVERCELL_URL")]

    def start_requests(self):
        """Build the initial requests, each with a Referer header set.

        Also records this supplier (name and homepage) in the table
        datastore_datadefinition_suppliers.

        @return: a list of well formed Request objects which will be
        crawled by the spider.
        """
        # Register the supplier, i.e. its name and site,
        # e.g. homepage "http://www.univercell.in".
        homepage = get_code_word("UNIVERCELL_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # Every request needs a referer, e.g. "www.google.com/search".
        referer_url = get_code_word("UNIVERCELL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """Extract vendor names and URLs from the response and store them.

        @param response: the HTTP response for one start URL.

        XPATH1 selects the section for each individual vendor, XPATH2
        the vendor's name and XPATH3 the vendor's (relative) link.
        VAR1..VAR4 turn the raw link into a usable URL: the segment
        between VAR1 (";") and VAR2 ("?") is stripped out, and VAR3
        ("populate") is replaced by VAR4 ("rePopulate") because only
        the rePopulate form of the URL actually serves data.
        """
        # Prefix needed to build complete vendor URLs,
        # e.g. "http://www.univercell.in".
        base_url = get_code_word("UNIVERCELL_URL1")
        hxs = HtmlXPathSelector(response)
        # e.g. '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'
        vendor_rows = hxs.select(get_code_word("UNIVERCELL_XPATH1"))

        # Hoisted out of the loop: these code words are loop-invariant,
        # and get_code_word presumably queries the datastore on every
        # call — TODO confirm against DataCodeAccessor.
        name_xpath = get_code_word("UNIVERCELL_XPATH2")   # './/a/text()'
        href_xpath = get_code_word("UNIVERCELL_XPATH3")   # './/a/@href'
        session_start = get_code_word("UNIVERCELL_VAR1")  # ";"
        session_end = get_code_word("UNIVERCELL_VAR2")    # "?"
        populate = get_code_word("UNIVERCELL_VAR3")       # "populate"
        repopulate = get_code_word("UNIVERCELL_VAR4")     # "rePopulate"

        items = []
        for row in vendor_rows:
            item = {}
            item['name'] = row.select(name_xpath)[0].extract()
            href = str(row.select(href_xpath)[0].extract())

            # Strip the session segment lying between ";" and "?".
            start = href.find(session_start)
            end = href.find(session_end)
            href = href.replace(href[start:end], "")
            # The site having data has a URL containing "rePopulate"
            # instead of "populate".
            item['site'] = href.replace(populate, repopulate)
            items.append(item)

        da = DataHelper()
        for item in items:
            full_url = base_url + str(item['site'])
            da.add_univervendor(unescape(item['name'].strip()), unescape(full_url))
152 ashish 124
 
290 gaurav 125
# Module-level instance picked up by Scrapy's legacy spider manager,
# which discovers spiders through the SPIDER attribute of the module.
SPIDER = vendor_links()