Subversion Repositories SmartDukaan

Rev

Rev 270 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
149 ashish 1
'''
2
Created on 13-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
 
228 ashish 18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
20
from html2text.unescaping import *
21
from elixir import *
149 ashish 22
 
23
class infi_spider(BaseSpider):
    """
    Spider that collects name and price for individual phones listed on
    the Infibeam search pages.

    Every setting (domain name, base URL, referer, XPaths, price clean-up
    tokens, tax offset) is read at run time through get_code_word();
    crawl counters are read and written through DataHelper's extra-vars
    accessors. Each scraped phone is handed to DataHelper.add_infiphone().
    """

    def __init__(self):
        """
        Build the list of start URLs for this crawl.

        initialize_table() is called first so the datastore tables are
        known before any DataHelper call is made.

        The page range depends on the 'infibeam_count' extra var (ct):
          - ct <= 14: pages 1..ct are queued;
          - ct >  14: only page ct is queued.
        Each URL is INFIBEAM_URL with the page number appended.

        domain_name is set from the INFIBEAM_DOMAINNAME code word; it is
        the name under which this spider is addressed from outside.
        """
        initialize_table()
        da = DataHelper()

        INFIBEAM_DOMAINNAME = get_code_word("INFIBEAM_DOMAINNAME")
        # Single-argument call form prints the same text on Python 2 and 3.
        print(INFIBEAM_DOMAINNAME)
        self.domain_name = INFIBEAM_DOMAINNAME

        ct = int(da.get_extra_vars('infibeam_count'))
        first_page = ct if ct > 14 else 1

        INFIBEAM_URL = get_code_word("INFIBEAM_URL")

        # Work on an instance-level copy: appending to self.start_urls
        # directly would mutate the list it resolves to, which may be a
        # class-level list shared with other spiders (TODO confirm against
        # the BaseSpider version in use).
        self.start_urls = list(self.start_urls)
        for page in range(first_page, ct + 1):
            self.start_urls.append(INFIBEAM_URL + str(page))

    def start_requests(self):
        """
        Create the initial requests for the crawl.

        Registers the supplier (domain name and INFIBEAM_HOMEPAGE) through
        DataHelper.add_supplier(), then builds one Request per start URL
        with self.parse as callback, duplicate filtering disabled, and a
        Referer header defaulting to the INFIBEAM_REFERER code word.

        @return list of Request objects, one per entry in start_urls
        """
        # Add the supplier entry, i.e. its name and site.
        INFIBEAM_HOMEPAGE = get_code_word("INFIBEAM_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, INFIBEAM_HOMEPAGE)

        # Every request carries the configured referer.
        INFIBEAM_REFERER = get_code_word("INFIBEAM_REFERER")
        listreq = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse,
                              dont_filter=True)
            request.headers.setdefault("Referer", INFIBEAM_REFERER)
            listreq.append(request)
        return listreq

    @staticmethod
    def _clean_price(raw_price, remove_tokens):
        """
        Strip every token in remove_tokens from raw_price.

        Mirrors the original clean-up: each token is removed repeatedly
        until it no longer occurs, and the text is stripped of surrounding
        whitespace after each token. Empty tokens are ignored, because an
        empty string is "found" in every string and would loop forever.

        @param raw_price     price text as extracted from the page
        @param remove_tokens iterable of substrings to delete
        @return the cleaned price text
        """
        cleaned = raw_price
        if cleaned != '':
            for token in remove_tokens:
                if not token:
                    continue
                while token in cleaned:
                    cleaned = cleaned.replace(token, "")
                cleaned = cleaned.strip()
        return cleaned

    def parse(self, response):
        """
        Extract phone names and prices from one result page and store them.

        Code words used:
          INFIBEAM_XPATH1     - selects the section for each phone
          INFIBEAM_XPATH2     - selects the phone name inside a section
          INFIBEAM_XPATH3     - selects the quoted price inside a section
          INFIBEAM_VATPLUSTAX - integer added to the quoted price to get
                                the final price
          INFIBEAM_REMOVELIST - ';'-separated substrings removed from the
                                price text (e.g. ',' or 'Rs') so that it
                                can be parsed as an integer

        If the page yields no phone sections and 'infibeam_count' is above
        14, the 'infibeam_fails' counter is incremented and 'infibeam_flag'
        is set to 'FALSE'.

        Listings with a missing name/price node or a price that is not an
        integer after clean-up are logged and skipped, so one bad listing
        no longer aborts the whole page.

        @param response response of an individual request
        """
        da = DataHelper()
        vat_plus_tax = int(get_code_word("INFIBEAM_VATPLUSTAX"))

        # List elements are separated by ';'. Empty elements (from an empty
        # setting, or a trailing/doubled ';') are dropped here.
        raw_removelist = str(get_code_word("INFIBEAM_REMOVELIST"))
        remove_tokens = [tok for tok in raw_removelist.split(';') if tok]

        hxs = HtmlXPathSelector(response)
        phone_info = hxs.select(get_code_word("INFIBEAM_XPATH1"))
        name_xpath = get_code_word("INFIBEAM_XPATH2")
        price_xpath = get_code_word("INFIBEAM_XPATH3")

        if not phone_info:
            ct = int(da.get_extra_vars('infibeam_count'))
            if ct > 14:
                fails = int(da.get_extra_vars('infibeam_fails')) + 1
                if fails > 0:
                    da.set_extra_vars('infibeam_flag', 'FALSE', '')
                da.set_extra_vars('infibeam_fails', str(fails), '')
            return

        for section in phone_info:
            name_nodes = section.select(name_xpath)
            price_nodes = section.select(price_xpath)
            if not name_nodes or not price_nodes:
                msg("infibeam: listing without name or price skipped on %s"
                    % response.url)
                continue

            name = name_nodes[0].extract()
            amnt = self._clean_price(price_nodes[0].extract(), remove_tokens)

            try:
                pr = int(amnt) + vat_plus_tax
            except ValueError:
                msg("infibeam: unparsable price %r for %r skipped"
                    % (amnt, name))
                continue

            da.add_infiphone(name, amnt, pr)
140
 
149 ashish 141
# Module-level spider instance, created at import time (so the constructor's
# datastore calls run on import). Presumably this is the name Scrapy's
# spider loader looks for in this module -- TODO confirm for the Scrapy
# version in use.
SPIDER = infi_spider()