Subversion Repositories SmartDukaan

Rev

Rev 223 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
223 ashish 1
'''
2
Created on 06-Jun-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
 
18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
20
from html2text import *
21
import urllib
22
 
23
class babuchak3(BaseSpider):
    """
    Documentation for class babuchak3
    This spider collects the information for the individual phones
    and store them in table datastore_datadefinition_babuchak_phones.
    """

    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in
        the scope of this class.
        The start urls are read from the datastore
        (DataHelper.get_allbabuchakphoneurls) and stored in start_urls.
        Domainname2 is name by which this spider is known outside
        So this will be used as an argument for calling this spider
        """
        initialize_table()
        #BABUCHAK_DOMAINNAME2 = "babuchak2"
        self.domain_name = get_code_word("BABUCHAK_DOMAINNAME2")
        da = DataHelper()
        # Build a per-instance list instead of appending to the inherited
        # start_urls: appending would mutate a list shared with every other
        # spider if start_urls is a class attribute on the base class.
        # NOTE(review): BaseSpider is not visible from this file - confirm.
        self.start_urls = [item.url for item in da.get_allbabuchakphoneurls()]

    def start_requests(self):
        """
        Documentation for method start_requests
        To set various properties of the request to be made
        like referer, headers and all.
        @return a list of well formed requests which will be
        crawled by spider and spider will return the response
        """
        #for each request a referer has to be set
        #BABUCHAK_REFERER = "www.google.com/search"
        referer_value = get_code_word("BABUCHAK_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            # setdefault keeps any Referer header that is already present
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Documentation for method parse
        @param response of individual requests
        Using Xpaths needed information is extracted out of the response
        and added to the database
        Xpath5 = Give us name for individual phone
        Xpath6 = Give us quoted-price for individual phone
        Xpath7 = Give us final_price for individual phone
        Removelist = To filter the prices so as to make them integer for eg remove ',' or 'Rs'

        Raises IndexError when the name xpath (or the fallback price xpath)
        matches nothing, and ValueError when a cleaned price is not an integer.
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        #BABUCHAK_XPATH5 = '//td[@class="text-header"]/text()'
        name_xpath = get_code_word("BABUCHAK_XPATH5")
        #BABUCHAK_XPATH6 = '//td[@class="xl63"]//strong/span/text()'
        price_xpath = get_code_word("BABUCHAK_XPATH6")
        #BABUCHAK_XPATH7 = '//td[@class="mod-item-body-title"]/b/text()'
        fallback_xpath = get_code_word("BABUCHAK_XPATH7")
        #BABUCHAK_REMOVELIST = ["Rs.","Rs",",","-","/"]
        #list separated by ';'
        removelist = get_code_word("BABUCHAK_REMOVELIST").split(';')

        name = hxs.select(name_xpath)[0].extract().strip()

        # First attempt: shown price is the first match of the price xpath.
        shown_price, final_price = self._read_prices(
            hxs, price_xpath, fallback_xpath, 0, removelist)

        #There were some phones on which discount was there so it had
        #marked price, quoted price and final price
        if shown_price > final_price:
            # Re-read with the second match as the shown (quoted) price.
            shown_price, final_price = self._read_prices(
                hxs, price_xpath, fallback_xpath, 1, removelist)

        da.add_babuchakphone(name, shown_price, final_price)

    def _read_prices(self, hxs, price_xpath, fallback_xpath, shown_index,
                     removelist):
        """
        Extract the shown and final price from the page as integers.
        @param hxs selector built from the response
        @param price_xpath xpath whose match number shown_index is the shown
        price and whose third match is the final price
        @param fallback_xpath xpath whose first match is used for both
        prices when price_xpath does not have enough matches
        @param shown_index index of the shown price among the matches
        @param removelist tokens to strip out of the raw price text
        @return tuple (shown_price, final_price) of ints
        """
        try:
            matches = hxs.select(price_xpath)
            shown = matches[shown_index].extract()
            final = matches[2].extract()
        except IndexError:
            # Too few matches: this page uses the single-price layout.
            final = shown = hxs.select(fallback_xpath)[0].extract()

        shown = self._strip_tokens(shown, removelist)
        final = self._strip_tokens(final, removelist)

        # Drop the fractional part of the shown price.
        # NOTE(review): the original code also overwrites the final price
        # with the truncated shown price here, discarding the separately
        # extracted final price. Preserved as-is - confirm it is intended.
        dot = shown.find('.')
        if dot != -1:
            shown = shown[0:dot]
            final = shown

        return int(shown), int(final)

    @staticmethod
    def _strip_tokens(text, removelist):
        """
        Return text with surrounding whitespace and every removelist token
        removed.
        @param text raw price text
        @param removelist tokens to delete, e.g. 'Rs' or ','
        @return the cleaned text
        """
        text = text.strip()
        for token in removelist:
            if not token:
                # ''.split(';') and a trailing ';' both yield an empty
                # token; searching for '' always succeeds, so the loop
                # below would never terminate.
                continue
            # Repeat until stable: removing a token can join the pieces on
            # either side of it into a fresh occurrence of the same token.
            while token in text:
                text = text.replace(token, "")
        return text.strip()
149
# Module-level spider instance, created when this module is imported; the
# constructor runs initialize_table() and reads the start urls from the
# datastore at that point.
# NOTE(review): presumably this is the name the crawler framework looks up
# to discover the spider - confirm against the Scrapy version in use.
SPIDER = babuchak3()