Subversion Repositories SmartDukaan

Rev

Rev 281 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
149 ashish 1
'''
2
Created on 13-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
 
228 ashish 18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
20
from html2text.unescaping import *
21
from elixir import *
149 ashish 22
 
23
class infi_spider(BaseSpider):
    """
    Documentation for class infi_spider

    This spider collects the name and quoted price of the individual phones
    listed on the Infibeam search pages and hands them to
    DataHelper.add_infiphone for storage (per the original notes, in table
    datastore_datadefinition_infibeam_data).

    All site specific settings (domain name, urls, xpaths, price clean-up
    tokens) are looked up at run time through get_code_word.
    """

    def __init__(self):
        """
        Documentation for constructor

        initialize_table is called to make all the tables known in the scope
        of this class before any DataHelper call is made.

        domain_name is the name by which this spider is known outside, so it
        is what is used as the argument for calling this spider.

        The number of pages to crawl is not fixed: it is read from the
        'infibeam_count' extra var.  For a count up to 14 the pages 1..count
        are queued; for a count above 14 only the page `count` itself is
        queued.
        """
        initialize_table()
        da = DataHelper()
        INFIBEAM_DOMAINNAME = get_code_word("INFIBEAM_DOMAINNAME")
        print(INFIBEAM_DOMAINNAME)
        self.domain_name = INFIBEAM_DOMAINNAME

        ct = int(da.get_extra_vars('infibeam_count'))
        no = 1
        if ct > 14:
            no = ct
        INFIBEAM_URL = get_code_word("INFIBEAM_URL")

        # Work on a copy and bind it on the instance: appending in place
        # would mutate whatever list object is inherited from the base class.
        urls = list(self.start_urls)
        for page in range(no, ct + 1):
            urls.append(INFIBEAM_URL + str(page))
        self.start_urls = urls

    def start_requests(self):
        """
        Documentation for method start_requests

        Adds the supplier entry (domain name and homepage) through
        DataHelper.add_supplier and then builds one request per start url.
        Every request uses parse as its callback, is created with
        dont_filter=True and gets the configured Referer header unless one is
        already present.

        @return a list of well formed requests which will be crawled by the
        spider
        """
        # adding entry for the supplier i.e. its name and site
        INFIBEAM_HOMEPAGE = get_code_word("INFIBEAM_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, INFIBEAM_HOMEPAGE)

        # for each request a referer has to be set
        INFIBEAM_REFERER = get_code_word("INFIBEAM_REFERER")
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse,
                              dont_filter=True)
            request.headers.setdefault("Referer", INFIBEAM_REFERER)
            listreq.append(request)
        return listreq

    def _strip_tokens(self, text, tokens):
        """
        Remove every occurrence of each token from text.

        A token is removed repeatedly until it no longer occurs (removing one
        occurrence can create a new one), and the text is stripped of
        surrounding whitespace after each token.  Empty tokens are skipped:
        the empty string is "found" in every string and removing it changes
        nothing, so it would never terminate.

        @param text the raw price text
        @param tokens iterable of substrings to remove
        @return the cleaned text
        """
        for token in tokens:
            if not token:
                continue
            while token in text:
                text = text.replace(token, "")
            text = text.strip()
        return text

    def parse(self, response):
        """
        Documentation for method parse

        Using xpaths the needed information is extracted out of the response
        and added to the database.
        Xpath1 = gives the section for an individual phone
        Xpath2 = gives the name of an individual phone
        Xpath3 = gives the quoted price of an individual phone
        vatplustax = added to the quoted price to get the final price
        Removelist = ';' separated tokens to filter out of the price so that
        it becomes an integer, e.g. ',' or 'Rs'

        When the page has no phone sections and 'infibeam_count' is above 14,
        the 'infibeam_fails' counter is incremented and 'infibeam_flag' is
        set to 'FALSE'.

        A listing without a name or price node, or whose cleaned price is not
        an integer, is logged and skipped so that the remaining listings on
        the page are still stored.

        @param response response of an individual request
        @return None
        """
        da = DataHelper()
        INFIBEAM_VATPLUSTAX = int(get_code_word("INFIBEAM_VATPLUSTAX"))

        # list elements are separated by ';'
        raw_removelist = str(get_code_word("INFIBEAM_REMOVELIST"))
        if raw_removelist:
            INFIBEAM_REMOVELIST = raw_removelist.split(';')
        else:
            INFIBEAM_REMOVELIST = []

        for x in INFIBEAM_REMOVELIST:
            print(x)

        hxs = HtmlXPathSelector(response)
        INFIBEAM_XPATH1 = get_code_word("INFIBEAM_XPATH1")
        phone_info = hxs.select(INFIBEAM_XPATH1)
        INFIBEAM_XPATH2 = get_code_word("INFIBEAM_XPATH2")
        INFIBEAM_XPATH3 = get_code_word("INFIBEAM_XPATH3")

        if not phone_info:
            ct = int(da.get_extra_vars('infibeam_count'))
            if ct > 14:
                fails = int(da.get_extra_vars('infibeam_fails')) + 1
                # NOTE(review): after the increment this test only fails for
                # a negative stored counter -- confirm the intended threshold.
                if fails > 0:
                    da.set_extra_vars('infibeam_flag', 'FALSE', '')
                da.set_extra_vars('infibeam_fails', str(fails), '')
            return

        for phone in phone_info:
            names = phone.select(INFIBEAM_XPATH2)
            prices = phone.select(INFIBEAM_XPATH3)
            if not names or not prices:
                msg("infibeam: skipping listing without name or price")
                continue

            name = names[0].extract()
            amnt = self._strip_tokens(prices[0].extract(),
                                      INFIBEAM_REMOVELIST)
            try:
                pr = int(amnt) + INFIBEAM_VATPLUSTAX
            except ValueError:
                # Empty or non-numeric price: do not abort the whole page.
                msg("infibeam: skipping %r, unparsable price %r"
                    % (name, amnt))
                continue
            da.add_infiphone(name, amnt, pr)
144
 
149 ashish 145
# Module-level spider instance, created when this module is imported; the
# constructor above therefore runs (initialize_table, DataHelper and
# get_code_word lookups) at import time.
# NOTE(review): presumably the name SPIDER is what this Scrapy version's
# spider loader looks up in the module -- confirm before renaming.
SPIDER = infi_spider()