# Recovered from SVN web view (revisions 219/251, committer: ashish);
# per-line revision numbers from the repository browser have been removed.
'''
Created on 06-Jun-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *

from html2text import *
import urllib

class babuchak1(BaseSpider):
    """
    Spider that collects the URL of each individual Babuchak vendor
    store (plus its page count) and stores them in the table
    datastore_datadefinition_babuchak_urls.
    """

    def __init__(self):
        """
        Initialise the datastore and seed the spider.

        initialize_table() makes all the datastore tables known in the
        scope of this class.  Both the spider's public name
        (domain_name, used to invoke this spider from outside) and its
        start URL are read from the code-word table so they can change
        without a code change.
        """
        initialize_table()
        # e.g. "babuchak"
        self.domain_name = get_code_word("BABUCHAK_DOMAINNAME")
        # e.g. "http://www.shopping.babuchak.com/visitourstores.php?view=productListPage&category=108"
        # BUGFIX: assign a fresh list instead of appending to
        # self.start_urls -- start_urls is a *class* attribute inherited
        # from BaseSpider, so append() mutated state shared by every
        # instance and duplicated the URL on re-instantiation.
        self.start_urls = [get_code_word("BABUCHAK_URL")]

    def start_requests(self):
        """
        Build the initial, fully configured requests.

        Registers this supplier (name and home page) in the table
        datastore_datadefinition_suppliers, then creates one Request
        per start URL with an explicit Referer header.

        @return: list of well-formed Requests the spider will crawl
        """
        # Register the supplier (its name and site) before crawling.
        # e.g. "http://www.shopping.babuchak.com"
        homepage = get_code_word("BABUCHAK_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # Every request carries the same configurable Referer,
        # e.g. "www.google.com/search".
        referer_value = get_code_word("BABUCHAK_REFERER")
        listreq = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            listreq.append(request)
        return listreq

    def parse(self, response):
        """
        Extract each vendor's store URL and page count from the
        listing page and persist them via DataHelper.add_babuchakurl.

        @param response: response of an individual start request

        XPaths (all read from the code-word table):
          BABUCHAK_XPATH1 -- section for one individual vendor
          BABUCHAK_XPATH2 -- text nodes; the third holds "(<pages>)"
          BABUCHAK_XPATH3 -- relative href of the vendor's store
        """
        da = DataHelper()
        # e.g. "http://www.shopping.babuchak.com/visitourstores.php"
        base_url = get_code_word("BABUCHAK_URL1")
        hxs = HtmlXPathSelector(response)
        # e.g. '//td[@class="mod-category-header"]'
        vendor_xpath = get_code_word("BABUCHAK_XPATH1")
        # Hoisted out of the loop -- these code words are loop-invariant.
        # e.g. './/text()' and './/a/@href'
        text_xpath = get_code_word("BABUCHAK_XPATH2")
        href_xpath = get_code_word("BABUCHAK_XPATH3")
        for vendor in hxs.select(vendor_xpath):
            texts = vendor.select(text_xpath)
            hrefs = vendor.select(href_xpath)
            # Robustness: a malformed vendor section used to raise
            # IndexError and abort the whole parse; skip it instead.
            if len(texts) < 3 or not hrefs:
                continue
            url = base_url + hrefs[0].extract()
            # Page count arrives URL-quoted as "(<n>)": unquote, trim
            # whitespace, then drop the surrounding parentheses.
            no_pages = urllib.unquote(texts[2].extract()).strip()
            no_pages = int(no_pages[1:-1])
            da.add_babuchakurl(url, no_pages)
# Module-level spider instance -- presumably the legacy scrapy (~0.x)
# convention where the framework discovers spiders via a SPIDER global;
# instantiation here also runs initialize_table() as a side effect.
SPIDER = babuchak1()