Subversion Repositories SmartDukaan

Rev

Rev 239 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 239 Rev 265
Line 21... Line 21...
21
from html2text.unescaping import *
21
from html2text.unescaping import *
22
 
22
 
23
 
23
 
24
 
24
 
25
class vendor_links(BaseSpider):
25
class vendor_links(BaseSpider):
-
 
26
    """
-
 
27
    Documentation for class vendor_links
-
 
28
    This spider collects the URLs of the individual vendors
-
 
29
    and stores them in the table datastore_datadefinition_univercell_data.
26
    
30
    """
27
    def __init__(self):
31
    def __init__(self):
-
 
32
        """
-
 
33
        Documentation for constructor
-
 
34
        initialize_table is called to make all the tables known in
-
 
35
        the scope of this class.
-
 
36
        Also, the start URL needs to be fed to the spider through start_urls.append.
-
 
37
        Domainname is the name by which this spider is known outside.
-
 
38
        So this will be used as an argument for calling this spider 
-
 
39
        """
28
        initialize_table()
40
        initialize_table()
29
        #UNIVERCELL_DOMAINNAME = "univercell"   
41
        #UNIVERCELL_DOMAINNAME = "univercell"   
30
        UNIVERCELL_DOMAINNAME = get_code_word("UNIVERCELL_DOMAINNAME")
42
        UNIVERCELL_DOMAINNAME = get_code_word("UNIVERCELL_DOMAINNAME")
31
        self.domain_name = UNIVERCELL_DOMAINNAME 
43
        self.domain_name = UNIVERCELL_DOMAINNAME 
32
        #UNIVERCELL_URL = "http://www.univercell.in/mobiles/populateStore.action"
44
        #UNIVERCELL_URL = "http://www.univercell.in/mobiles/populateStore.action"
33
        UNIVERCELL_URL = get_code_word("UNIVERCELL_URL")
45
        UNIVERCELL_URL = get_code_word("UNIVERCELL_URL")
34
        self.start_urls.append(UNIVERCELL_URL)
46
        self.start_urls.append(UNIVERCELL_URL)
35
    
47
    
36
 
48
 
37
    def start_requests(self):
49
    def start_requests(self):
-
 
50
        """
-
 
51
        Documentation for method start_requests
-
 
52
        To set various properties of the request to be made
-
 
53
        like referer, headers and all.
-
 
54
        Also, the supplier's entry needs to be made in the table
-
 
55
        datastore_datadefinition_suppliers.
-
 
56
        @return a list of well formed requests which will be 
-
 
57
        crawled by the spider; the spider will return the response
-
 
58
        """
38
        #adding entry for the supplier i.e its name and site
59
        #adding entry for the supplier i.e its name and site
39
        #UNIVERCELL_HOMEPAGE = "http://www.univercell.in"
60
        #UNIVERCELL_HOMEPAGE = "http://www.univercell.in"
40
        UNIVERCELL_HOMEPAGE = get_code_word("UNIVERCELL_HOMEPAGE")
61
        UNIVERCELL_HOMEPAGE = get_code_word("UNIVERCELL_HOMEPAGE")
41
        da = DataHelper()
62
        da = DataHelper()
42
        da.add_supplier(self.domain_name, UNIVERCELL_HOMEPAGE)
63
        da.add_supplier(self.domain_name, UNIVERCELL_HOMEPAGE)
Line 50... Line 71...
50
            request.headers.setdefault("Referer", UNIVERCELL_REFERER)
71
            request.headers.setdefault("Referer", UNIVERCELL_REFERER)
51
            listreq.append(request)
72
            listreq.append(request)
52
        return listreq
73
        return listreq
53
        
74
        
54
    def parse(self, response):
75
    def parse(self, response):
-
 
76
        """
-
 
77
        Documentation for method parse
-
 
78
        @param response of individual requests
-
 
79
        Using XPaths, the needed information is extracted from the response
-
 
80
        and added to the database
-
 
81
        Xpath1 = Gives us the section for individual vendors
-
 
82
        Xpath2 = Gives us the name of individual vendors
-
 
83
        Xpath3 = Gives us the URL of individual vendors
-
 
84
        Url1 = Used to get the full URL for individual vendors
-
 
85
        var1, var2, var3 and var4 are used to get the proper URL
-
 
86
        """
55
        #url1 needed to get complete urls for phones
87
        #url1 needed to get complete urls for phones
56
        #UNIVERCELL_URL1 = "http://www.univercell.in"
88
        #UNIVERCELL_URL1 = "http://www.univercell.in"
57
        UNIVERCELL_URL1 = get_code_word("UNIVERCELL_URL1")
89
        UNIVERCELL_URL1 = get_code_word("UNIVERCELL_URL1")
58
        hxs = HtmlXPathSelector(response)
90
        hxs = HtmlXPathSelector(response)
59
        #UNIVERCELL_XPATH1 = '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'
91
        #UNIVERCELL_XPATH1 = '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'