Subversion Repositories SmartDukaan

Rev

Rev 253 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
223 ashish 1
'''
2
Created on 06-Jun-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
 
18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
20
from html2text import *
21
import urllib
22
 
23
class babuchak3(BaseSpider):
    """
    Documentation for class babuchak3
    This spider collects the name and prices of the individual phones
    and hands them to DataHelper.add_babuchakphone (per the original note
    they end up in table datastore_datadefinition_babuchak_phones).
    """

    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in
        the scope of this class.
        The start urls are read from the datastore and fed to the spider
        through start_urls.
        BABUCHAK_DOMAINNAME2 is the name by which this spider is known
        outside, so it is used as the argument for calling this spider.
        """
        initialize_table()
        # Previously hard-coded as "babuchak2"; now read from the code-word store.
        self.domain_name = get_code_word("BABUCHAK_DOMAINNAME2")
        da = DataHelper()
        # Give the instance its own list instead of appending to the inherited
        # start_urls: the base constructor is not called here, so the inherited
        # list may be shared with other spiders and would collect their urls too.
        self.start_urls = [item.url for item in da.get_allbabuchakphoneurls()]

    def start_requests(self):
        """
        Documentation for method start_requests
        Builds one request per start url and sets the Referer header on it
        (only if the request does not already carry one).
        @return a list of requests, each with self.parse as its callback
        """
        # Previously hard-coded as "www.google.com/search".
        referer_value = get_code_word("BABUCHAK_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Documentation for method parse
        @param response of individual requests
        Using Xpaths the needed information is extracted out of the response
        and added to the database through DataHelper.add_babuchakphone.
        Xpath5 = gives the name of the individual phone
        Xpath6 = gives the price cells of the individual phone
        Xpath7 = gives the single price used when Xpath6 has too few matches
        Removelist = ';'-separated substrings (for eg ',' or 'Rs') stripped
                     from the prices so that they can be converted to integers
        Raises IndexError when a required Xpath matches nothing and
        ValueError when a cleaned price is not an integer.
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        # Example values from the original hard-coded configuration:
        #   BABUCHAK_XPATH5 = '//td[@class="text-header"]/text()'
        #   BABUCHAK_XPATH6 = '//td[@class="xl63"]//strong/span/text()'
        #   BABUCHAK_XPATH7 = '//td[@class="mod-item-body-title"]/b/text()'
        #   BABUCHAK_REMOVELIST = "Rs.;Rs;,;-;/"
        name_xpath = get_code_word("BABUCHAK_XPATH5")
        price_xpath = get_code_word("BABUCHAK_XPATH6")
        fallback_xpath = get_code_word("BABUCHAK_XPATH7")
        remove_tokens = self._split_removelist(
            str(get_code_word("BABUCHAK_REMOVELIST")))

        name = hxs.select(name_xpath)[0].extract().strip()

        # First attempt: first price cell is the shown price, third the final one.
        shown_price, final_price = self._read_prices(
            hxs, price_xpath, fallback_xpath, 0, 2, remove_tokens)

        # There were some phones on which discount was there so it had
        # marked price, quoted price and final price; for those the quoted
        # price (second cell) is the one to store as shown price.
        if shown_price > final_price:
            shown_price, final_price = self._read_prices(
                hxs, price_xpath, fallback_xpath, 1, 2, remove_tokens)

        da.add_babuchakphone(name, shown_price, final_price)

    def _read_prices(self, hxs, price_xpath, fallback_xpath,
                     shown_index, final_index, remove_tokens):
        """
        Extract the shown and final price as integers.
        The prices are taken from the matches of price_xpath at the given
        indexes; when there are not enough matches both prices are taken
        from the first match of fallback_xpath.
        @return tuple (shown_price, final_price)
        """
        nodes = hxs.select(price_xpath)
        try:
            shown_raw = nodes[shown_index].extract()
            final_raw = nodes[final_index].extract()
        except IndexError:
            # Only "too few matches" selects the fallback; anything else
            # (including KeyboardInterrupt) is no longer swallowed.
            shown_raw = final_raw = hxs.select(fallback_xpath)[0].extract()
        return (self._to_int_price(shown_raw, remove_tokens),
                self._to_int_price(final_raw, remove_tokens))

    @staticmethod
    def _split_removelist(raw_removelist):
        """
        Split the ';'-separated remove list into its non-empty substrings.
        Empty entries (empty setting, trailing ';', ';;') are dropped because
        removing an empty substring can never make progress.
        """
        return [token for token in raw_removelist.split(';') if token]

    @staticmethod
    def _to_int_price(raw_price, remove_tokens):
        """
        Convert a scraped price text to an integer.
        Every substring in remove_tokens is removed, then anything from the
        first '.' onwards is dropped. Each price is converted on its own, so
        a decimal point in one price no longer overwrites the other price.
        Raises ValueError when what is left is not an integer.
        """
        price = raw_price.strip()
        for token in remove_tokens:
            # Repeat until stable: a removal can join the surrounding text
            # into a new occurrence of the same token.
            while token in price:
                price = price.replace(token, "")
        price = price.strip()
        whole_part = price.split('.', 1)[0]
        return int(whole_part)
150
# Module-level spider instance, created as soon as this module is imported.
# NOTE(review): presumably the Scrapy version in use discovers the spider
# through this SPIDER name -- confirm against the project's Scrapy release.
SPIDER = babuchak3()