Subversion Repositories SmartDukaan

Rev

Rev 258 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
180 ashish 1
'''
2
Created on 17-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
 
227 ashish 18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
180 ashish 20
import urllib
21
from xml.dom import INDEX_SIZE_ERR
227 ashish 22
from html2text.unescaping import *
180 ashish 23
 
24
 
25
class indiaplaza_extra(BaseSpider):
    """
    Spider that collects the detail information for individual phones.

    For every phone URL returned by DataHelper.get_all_ipbasic() the
    detail page is fetched, the name, prices, guarantee text and shipping
    text are extracted with configurable XPaths, and the result is stored
    through DataHelper.add_ipextra() (table
    datastore_datadefinition_indiaplaza_items, per the original notes).
    """

    def __init__(self):
        """
        Prepare the spider for crawling.

        initialize_table() is called to make all the tables known in the
        scope of this class.  The domain name, i.e. the name by which this
        spider is known outside (and so the argument used to call it), is
        read from the INDIAPLAZA_DOMAINNAME1 code word, e.g. "indiaplaza1".
        The start URLs are loaded from the database.
        """
        initialize_table()
        self.domain_name = get_code_word("INDIAPLAZA_DOMAINNAME1")

        # This class never assigns start_urls itself, so the attribute is
        # inherited; appending to it in place would leak this spider's URLs
        # into a list shared with other spiders.  Copy it onto the instance
        # first, then add the URLs stored in the database.
        self.start_urls = list(self.start_urls)
        da = DataHelper()
        for pitem in da.get_all_ipbasic():
            self.start_urls.append(pitem.v_site.strip())

    def start_requests(self):
        """
        Build the initial requests for the crawl.

        Every start URL becomes a Request whose callback is self.parse and
        whose Referer header defaults to the INDIAPLAZA_REFERER code word
        (e.g. "www.google.com/search").

        @return a list of requests, one per entry in self.start_urls
        """
        referer_value = get_code_word("INDIAPLAZA_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    @staticmethod
    def _first_text(hxs, xpath, fallback_xpath=None):
        """
        Return the extracted first match of xpath, trying fallback_xpath
        when xpath matches nothing.  IndexError propagates when no usable
        match exists.
        """
        try:
            return hxs.select(xpath)[0].extract()
        except IndexError:
            if fallback_xpath is None:
                raise
            return hxs.select(fallback_xpath)[0].extract()

    @staticmethod
    def _strip_tokens(text, tokens):
        """
        Return text with every occurrence of each token removed.

        Removal of a token is repeated until it no longer occurs, so
        occurrences created by an earlier removal are stripped as well.
        """
        for token in tokens:
            if not token:
                # An empty token is "found" at index 0 of every string and
                # replacing it changes nothing, so the loop below would
                # never finish (e.g. a trailing ';' in the configured list).
                continue
            while token in text:
                text = text.replace(token, "")
        return text

    def parse(self, response):
        """
        Extract the details of one phone from its page and store them.

        @param response of an individual request

        The XPaths and helper values are read from code words:
        INDIAPLAZA_XPATH4  = name of the phone
        INDIAPLAZA_XPATH5  = quoted price
        INDIAPLAZA_XPATH6  = shipping price
        INDIAPLAZA_XPATH7  = shipping price, if not obtainable from XPATH6
        INDIAPLAZA_XPATH8  = guarantee info
        INDIAPLAZA_XPATH9  = guarantee info, if not obtainable from XPATH8
        INDIAPLAZA_XPATH10 = shipping info
        INDIAPLAZA_REMOVELIST = ';'-separated tokens stripped from the
            prices so they can be converted to integers, e.g. ',' or 'Rs'
        INDIAPLAZA_VAR1 = shipping-price text meaning no charge,
            e.g. "Free shipping"

        Raises IndexError when a required XPath (or a fallback) matches
        nothing, and ValueError when a cleaned price is not an integer.
        """
        hxs = HtmlXPathSelector(response)

        remove_spec = str(get_code_word("INDIAPLAZA_REMOVELIST"))
        if remove_spec:
            remove_tokens = remove_spec.split(';')
        else:
            remove_tokens = []

        name_xpath = get_code_word("INDIAPLAZA_XPATH4")
        price_xpath = get_code_word("INDIAPLAZA_XPATH5")
        ship_price_xpath = get_code_word("INDIAPLAZA_XPATH6")
        ship_price_fallback_xpath = get_code_word("INDIAPLAZA_XPATH7")
        guarantee_xpath = get_code_word("INDIAPLAZA_XPATH8")
        guarantee_fallback_xpath = get_code_word("INDIAPLAZA_XPATH9")
        ship_info_xpath = get_code_word("INDIAPLAZA_XPATH10")

        name = unescape(self._first_text(hxs, name_xpath))
        price = self._first_text(hxs, price_xpath)
        ship_price = self._first_text(
            hxs, ship_price_xpath, ship_price_fallback_xpath)
        guarantee_info = self._first_text(
            hxs, guarantee_xpath, guarantee_fallback_xpath)
        ship_info = self._first_text(hxs, ship_info_xpath)

        # NOTE(review): earlier revisions called urllib.unquote() on each of
        # the five values here but discarded the results, so nothing was ever
        # unquoted.  Those no-op calls are omitted; confirm whether
        # unquoting was actually intended before adding it for real.

        free_shipping_text = get_code_word("INDIAPLAZA_VAR1")
        if ship_price == free_shipping_text:
            ship_price = "0"
        else:
            ship_price = self._strip_tokens(ship_price, remove_tokens)
        price = self._strip_tokens(price, remove_tokens)

        name = name.strip()
        price = price.strip()
        ship_price = ship_price.strip()
        guarantee_info = guarantee_info.strip()
        ship_info = ship_info.strip()

        # The final price is the quoted price plus the shipping charge.
        shown_pr = int(price)
        final_pr = shown_pr + int(ship_price)

        da = DataHelper()
        da.add_ipextra(name, shown_pr, final_pr, guarantee_info, ship_info)
180 ashish 148
# Module-level spider instance.  Creating it runs indiaplaza_extra.__init__,
# which calls initialize_table() and loads the start URLs through DataHelper,
# so importing this module touches the datastore.
# NOTE(review): presumably the Scrapy version in use discovers spiders through
# this SPIDER name -- confirm before renaming or removing it.
SPIDER = indiaplaza_extra()
149