Subversion Repositories SmartDukaan

Rev

Rev 235 | Rev 271 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
187 ashish 1
'''
2
Created on 27-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
 
8
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
11
 
12
from demo.items import DemoItem
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
17
from scrapy.http.response import Response
235 ashish 18
from datastore.DataCodeAccessor import *
19
from datastore.DataAccessor import *
187 ashish 20
 
235 ashish 21
from html2text.unescaping import *
187 ashish 22
 
23
class naaptol_spider(BaseSpider):
    """
    Spider that collects the URL of each individual phone on naaptol.com
    and stores it in the table datastore_datadefinition_naaptol_urls.
    """

    def __init__(self):
        """
        Configure the spider from the code-word store.

        initialize_table() makes all the datastore tables known in the
        scope of this class.  The spider's public name (domain_name,
        used as the argument when invoking this spider) and its start
        URL are both fetched from the code-word table so they can be
        changed without touching the code.
        """
        initialize_table()
        # e.g. "naaptol"
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME")
        # e.g. "http://www.naaptol.com/sitemap.xml"
        # BUGFIX: the original did self.start_urls.append(...), which
        # mutates the class-level list inherited from BaseSpider and is
        # therefore shared by every spider; give this instance its own
        # list instead.
        self.start_urls = [get_code_word("NAAPTOL_URL")]

    def start_requests(self):
        """
        Build the initial list of requests.

        Registers this supplier (name + home page) in the table
        datastore_datadefinition_suppliers, then creates one Request
        per start URL with an explicit Referer header.

        @return: list of well-formed Request objects which the spider
                 will crawl; their responses go to self.parse
        """
        # register the supplier, i.e. its name and site
        # e.g. "http://www.naaptol.com"
        NAAPTOL_HOMEPAGE = get_code_word("NAAPTOL_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, NAAPTOL_HOMEPAGE)

        # every request carries an explicit referer,
        # e.g. "http://www.google.com"
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", NAAPTOL_REFERER)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Extract the per-phone URLs from the sitemap response.

        NAAPTOL_XPATH1 (e.g. '//url/loc/text()') selects every URL in
        the sitemap.  NAAPTOL_CHKLIST1 is a ';'-separated list of
        category path fragments specific to this site (e.g.
        "mobile_phones/gsm_handsets") that identify a URL as an
        individual-phone page; matching URLs are stored via
        DataHelper.add_naaptolurl().

        @param response: response of one start-URL request
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        # e.g. '//url/loc/text()'
        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
        phone_urls = hxs.select(NAAPTOL_XPATH1)

        # category fragments that mark valid phone pages, stored as one
        # ';'-separated code word, e.g.
        # "mobile_phones/pdas_and_smartphones;mobile_phones/gsm_handsets;mobile_phones/cdma_handsets"
        chk_list = get_code_word("NAAPTOL_CHKLIST1").split(';')

        for sel in phone_urls:
            site = unescape(sel.extract())
            # membership test instead of the original equality loop
            if self._category_fragment(site) in chk_list:
                da.add_naaptolurl(site)

    @staticmethod
    def _category_fragment(site):
        """
        Return the substring of *site* between its third-last and last
        '/' (the two path segments preceding the final one, e.g.
        "mobile_phones/gsm_handsets"), or "" when the URL does not have
        three slashes after position 0.

        BUGFIX: the original used site.rfind('/', 0, pos - 1); rfind's
        end argument is already exclusive, so the extra -1 skipped one
        character and could miss a slash directly adjacent to the
        previous one.  It also left pos2 at its initial value 0 when
        pos1 was -1, so the subsequent "pos2 != -1" test could never
        detect a missing slash.  Early returns make each step explicit.
        """
        pos1 = site.rfind('/')
        if pos1 == -1:
            return ""
        pos2 = site.rfind('/', 0, pos1)
        if pos2 == -1:
            return ""
        pos3 = site.rfind('/', 0, pos2)
        # original rejected pos3 <= 0 as well (a leading-slash match)
        if pos3 <= 0:
            return ""
        return site[pos3 + 1:pos1]
187 ashish 110
# Module-level instance required by the old (pre-0.9) Scrapy spider-manager
# convention: the framework discovers spiders via the SPIDER symbol.
SPIDER = naaptol_spider()