Subversion Repositories SmartDukaan

Rev

Rev 226 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
158 ashish 1
'''
2
Created on 16-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
 
8
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
11
 
12
from demo.items import DemoItem
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
17
from scrapy.http.response import Response
18
 
226 ashish 19
from html2text.unescaping import *
20
from datastore.DataAccessor import *
21
from datastore.DataCodeAccessor import *
158 ashish 22
 
23
 
226 ashish 24
 
158 ashish 25
class indiaplaza_spider(BaseSpider):
    """
    Spider that collects the url of every individual phone listed on
    indiaplaza and hands (title, url) pairs to DataHelper.add_ipbasic
    (the original author names the target table
    datastore_datadefinition_indiaplaza_data).

    Every site specific value (domain name, urls, referer, xpaths) is looked
    up with get_code_word; the page counter and the failure bookkeeping are
    read and written through DataHelper.get_extra_vars / set_extra_vars.
    """

    # Page-count threshold used by both __init__ and the empty-page handling.
    # NOTE(review): 18 is the value INDIAPLAZA_CT used to be hard-coded to;
    # presumably the number of listing pages known to exist -- confirm.
    PAGE_COUNT_THRESHOLD = 18

    def __init__(self):
        """
        Make the tables known, name the spider and queue the listing pages.

        initialize_table is called to make all the tables known in the scope
        of this class.  domain_name is the name by which this spider is
        known outside, so it is used as the argument for calling the spider.
        Since the number of pages is not fixed, the stored counter
        'indiaplaza_count' decides which pages are queued:
          * counter <= PAGE_COUNT_THRESHOLD: pages 1..counter
          * counter >  PAGE_COUNT_THRESHOLD: only the page numbered counter
        """
        initialize_table()
        da = DataHelper()
        # formerly hard-coded as "indiaplaza"
        self.domain_name = get_code_word("INDIAPLAZA_DOMAINNAME")

        page_count = int(da.get_extra_vars('indiaplaza_count'))
        if page_count > self.PAGE_COUNT_THRESHOLD:
            first_page = page_count
        else:
            first_page = 1

        # formerly hard-coded as
        # "http://www.indiaplaza.in/mobile-phones-Mobiles-1.htm?PageNo="
        listing_url = get_code_word("INDIAPLAZA_URL")

        # This class never assigns start_urls, so the attribute found here is
        # inherited; copy it before extending so the inherited (shared) list
        # is not mutated on behalf of every other spider.
        self.start_urls = list(self.start_urls)
        for page_no in range(first_page, page_count + 1):
            self.start_urls.append(listing_url + str(page_no))

    def start_requests(self):
        """
        Build the initial requests and register the supplier.

        The supplier (spider name and home page) is added through
        DataHelper.add_supplier (the original author names the table
        datastore_datadefinition_suppliers).  Every start url becomes a
        Request handled by parse, with duplicate filtering disabled and a
        Referer header set unless one is already present.

        @return a list of well formed requests which will be crawled by
        the spider
        """
        # formerly hard-coded as "www.indiaplaza.com"
        homepage = get_code_word("INDIAPLAZA_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # formerly hard-coded as "www.google.com/search"
        referer_url = get_code_word("INDIAPLAZA_REFERER")

        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse,
                              dont_filter=True)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Extract the phones of one listing page and store them.

        @param response of an individual listing-page request

        Code words used:
          INDIAPLAZA_XPATH1 = section of an individual phone
                              (formerly '//tr/td/table[@id="browsesku"]')
          INDIAPLAZA_XPATH2 = name of the phone, relative to its section
                              (formerly './/div[@class="skuimg"]/a/@title')
          INDIAPLAZA_XPATH3 = url of the phone, relative to its section
                              (formerly './/div[@class="skuimg"]/a/@href')
          INDIAPLAZA_URL1   = prefix that turns the extracted url into a
                              full url (formerly "http://www.indiaplaza.in")

        A page without any phone section is recorded as a failure (see
        _record_empty_page).  A section that lacks a title or a url is
        skipped and logged instead of aborting the whole page.
        """
        da = DataHelper()
        url_prefix = get_code_word("INDIAPLAZA_URL1")
        hxs = HtmlXPathSelector(response)
        phone_sections = hxs.select(get_code_word("INDIAPLAZA_XPATH1"))
        title_xpath = get_code_word("INDIAPLAZA_XPATH2")
        href_xpath = get_code_word("INDIAPLAZA_XPATH3")

        if not phone_sections:
            self._record_empty_page(da)
            return

        # Extract everything first, store afterwards, as the original did.
        phones = []
        for section in phone_sections:
            titles = section.select(title_xpath)
            hrefs = section.select(href_xpath)
            if not titles or not hrefs:
                # Indexing [0] on an empty selection would raise IndexError
                # and lose every other phone on this page.
                msg("indiaplaza: skipping listing without title/url on %s"
                    % response.url)
                continue
            phones.append((titles[0].extract(), hrefs[0].extract()))

        for title, href in phones:
            da.add_ipbasic(title, unescape(url_prefix + str(href)))

    def _record_empty_page(self, da):
        """
        Update the failure bookkeeping for a page that yielded no phones.

        Nothing is recorded while 'indiaplaza_count' is at or below
        PAGE_COUNT_THRESHOLD.  Above it, 'indiaplaza_fails' is incremented
        and 'indiaplaza_flag' is set to 'FALSE'.

        @param da DataHelper used to read and write the extra vars
        """
        page_count = int(da.get_extra_vars('indiaplaza_count'))
        if page_count <= self.PAGE_COUNT_THRESHOLD:
            return

        fails = int(da.get_extra_vars('indiaplaza_fails')) + 1
        da.set_extra_vars('indiaplaza_fails', str(fails), '')
        # NOTE(review): after the increment this is true unless the stored
        # counter was negative -- confirm whether a higher limit was meant.
        if fails > 0:
            da.set_extra_vars('indiaplaza_flag', 'FALSE', '')
128
 
158 ashish 129
# Module-level spider instance.  Constructing it runs __init__, so importing
# this module calls initialize_table(), reads the 'indiaplaza_count' extra
# var and fills start_urls.
# NOTE(review): presumably SPIDER is the name the Scrapy spider loader looks
# for in this Scrapy version -- confirm.
SPIDER = indiaplaza_spider()