Subversion Repositories SmartDukaan

Rev

Rev 219 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 219 Rev 251
Line 16... Line 16...
16
from scrapy.http.response import Response
16
from scrapy.http.response import Response
17
from datastore.DataAccessor import *
17
from datastore.DataAccessor import *
18
from datastore.DataCodeAccessor import *
18
from datastore.DataCodeAccessor import *
19
 
19
 
20
from html2text import *
20
from html2text import *
21
from babel.messages.pofile import unescape
-
 
22
import urllib
21
import urllib
23
 
22
 
24
class babuchak1(BaseSpider):
23
class babuchak1(BaseSpider):
-
 
24
    """
-
 
25
    Documentation for class babuchak1
-
 
26
    This spider collects the url for the individual vendors 
-
 
27
    and store them in table datastore_datadefinition_babuchak_urls.
25
    
28
    """
26
    def __init__(self):
        """Initialize the babuchak1 spider.

        Calls initialize_table() so all datastore tables are known in
        the scope of this class, resolves the public domain name this
        spider is known by outside (used as the argument when invoking
        the spider), and seeds the start URL. Both values are resolved
        through get_code_word() instead of being hard-coded (the old
        literals are kept as comments for reference).
        """
        initialize_table()
        #BABUCHAK_DOMAINNAME = "babuchak"
        self.domain_name = get_code_word("BABUCHAK_DOMAINNAME")
        #BABUCHAK_URL = "http://www.shopping.babuchak.com/visitourstores.php?view=productListPage&category=108"
        # Rebind start_urls to an instance-level list before adding the
        # URL: appending directly to the inherited list would mutate the
        # class-level attribute and leak this URL into every other
        # spider instance sharing it.
        # NOTE(review): assumes BaseSpider declares start_urls as a
        # class-level list (the usual legacy-Scrapy pattern) — confirm
        # against the Scrapy version in use.
        self.start_urls = list(self.start_urls) + [get_code_word("BABUCHAK_URL")]
34
    
45
    
35
 
46
 
36
    def start_requests(self):
47
    def start_requests(self):
-
 
48
        """
-
 
49
        Documentation for method start_requests
-
 
50
        To set various properties of the request to be made
-
 
51
        like referer, headers and all.
-
 
52
        Also suppliers entry need to be done in the table
-
 
53
        datastore_datadefinition_suppliers.
-
 
54
        @return a list of well formed requests which will be 
-
 
55
        crawled by spider and spider will return the response
-
 
56
        """
-
 
57
        
37
        #adding entry for the supplier i.e its name and site
58
        #adding entry for the supplier i.e its name and site
38
        #BABUCHAK_HOMEPAGE = "http://www.shopping.babuchak.com"
59
        #BABUCHAK_HOMEPAGE = "http://www.shopping.babuchak.com"
39
        BABUCHAK_HOMEPAGE = get_code_word("BABUCHAK_HOMEPAGE")
60
        BABUCHAK_HOMEPAGE = get_code_word("BABUCHAK_HOMEPAGE")
40
        da = DataHelper()
61
        da = DataHelper()
41
        da.add_supplier(self.domain_name, BABUCHAK_HOMEPAGE)
62
        da.add_supplier(self.domain_name, BABUCHAK_HOMEPAGE)
Line 49... Line 70...
49
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
70
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
50
            listreq.append(request)
71
            listreq.append(request)
51
        return listreq
72
        return listreq
52
        
73
        
53
    def parse(self, response):
74
    def parse(self, response):
-
 
75
        """
-
 
76
        Documentation for method parse
-
 
77
        @param response of individual requests
-
 
78
        Using Xpaths needed information is extracted out of the response
54
        #url1 needed to get complete urls
79
        and added to the database
-
 
80
        Xpath1 = Give us section for individual vendors
-
 
81
        Xpath2 = Give us no of pages for individual vendors
-
 
82
        Xpath3 = Give us url for individual vendors
-
 
83
        Url1 = To get full url for individual vendors
-
 
84
        """
55
        da = DataHelper()
85
        da = DataHelper()
56
        #BABUCHAK_URL1 = "http://www.shopping.babuchak.com/visitourstores.php"
86
        #BABUCHAK_URL1 = "http://www.shopping.babuchak.com/visitourstores.php"
57
        BABUCHAK_URL1 = get_code_word("BABUCHAK_URL1")
87
        BABUCHAK_URL1 = get_code_word("BABUCHAK_URL1")
58
        hxs = HtmlXPathSelector(response)
88
        hxs = HtmlXPathSelector(response)
59
        #BABUCHAK_XPATH1 = '//td[@class="mod-category-header"]'
89
        #BABUCHAK_XPATH1 = '//td[@class="mod-category-header"]'
Line 63... Line 93...
63
            #BABUCHAK_XPATH2 = './/text()'
93
            #BABUCHAK_XPATH2 = './/text()'
64
            BABUCHAK_XPATH2 = get_code_word("BABUCHAK_XPATH2")
94
            BABUCHAK_XPATH2 = get_code_word("BABUCHAK_XPATH2")
65
            #BABUCHAK_XPATH3 = './/a/@href' 
95
            #BABUCHAK_XPATH3 = './/a/@href' 
66
            BABUCHAK_XPATH3 = get_code_word("BABUCHAK_XPATH3")
96
            BABUCHAK_XPATH3 = get_code_word("BABUCHAK_XPATH3")
67
            no_pages = i.select(BABUCHAK_XPATH2)[2].extract()
97
            no_pages = i.select(BABUCHAK_XPATH2)[2].extract()
68
            #print i.select(BABUCHAK_XPATH2)[1].extract() + "  "
-
 
69
            url = i.select(BABUCHAK_XPATH3)[0].extract()
98
            url = i.select(BABUCHAK_XPATH3)[0].extract()
70
            url = BABUCHAK_URL1 + url
99
            url = BABUCHAK_URL1 + url
71
            no_pages = urllib.unquote(no_pages)
100
            no_pages = urllib.unquote(no_pages)
72
            no_pages = no_pages.strip()
101
            no_pages = no_pages.strip()
73
            no_pages = no_pages[1:len(no_pages)-1]
102
            no_pages = no_pages[1:len(no_pages)-1]
74
            no_pages = int(no_pages)
103
            no_pages = int(no_pages)
75
            #print url + " "
-
 
76
            #print no_pages
-
 
77
            da.add_babuchakurl(url, no_pages)
104
            da.add_babuchakurl(url, no_pages)
78
           
105
           
79
# Module-level spider instance: legacy Scrapy discovers spiders through
# this SPIDER binding when the module is imported.
SPIDER = babuchak1()
80
107