Subversion Repositories SmartDukaan

Rev

Rev 266 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
154 ashish 1
'''
2
Created on 14-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
 
8
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
11
 
12
from demo.items import DemoItem
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
17
from scrapy.http.response import Response
18
 
19
 
240 ashish 20
from datastore.DataCodeAccessor import *
21
from datastore.DataAccessor import *
22
from html2text.unescaping import *
23
 
154 ashish 24
class univercell_price(BaseSpider):
    """
    Documentation for class univercell_price
    This spider collects the name and price of the individual phones
    and stores them through DataHelper.add_new_univerphone (per the
    original notes, in table datastore_datadefinition_univercell_items).
    """

    # Additional VAT, in percent, added to the quoted price to get the
    # final price.
    VAT_PERCENT = 4

    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in
        the scope of this class.
        The domain name is the name by which this spider is known outside,
        so it is used as the argument for calling this spider. It is read
        from the code word UNIVERCELL_DOMAINNAME1 (e.g. "univercell1").
        The start urls are the vendor sites returned by
        DataHelper.get_all_univervendors.
        """
        initialize_table()
        self.domain_name = get_code_word("UNIVERCELL_DOMAINNAME1")

        # get urls from the database and use them as the list for crawling.
        # A new per-instance list is built instead of appending in place:
        # if start_urls is the list inherited from the class, an in-place
        # append would leak these urls into every spider sharing that list.
        da = DataHelper()
        vendor_urls = [pitem.v_site.strip()
                       for pitem in da.get_all_univervendors()]
        self.start_urls = list(self.start_urls) + vendor_urls

    def start_requests(self):
        """
        Documentation for method start_requests
        Builds one request per start url, routed to parse, and sets the
        Referer header (code word UNIVERCELL_REFERER, e.g.
        "www.google.com/search") unless the request already has one.
        @return a list of well formed requests which will be
        crawled by the spider
        """
        referer_value = get_code_word("UNIVERCELL_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            # for each request a referer has to be set
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    @staticmethod
    def _clean_amount(raw_amount, remove_tokens):
        """
        Remove every occurrence of each token in remove_tokens from
        raw_amount and strip surrounding whitespace.
        @param raw_amount price text as extracted from the page
        @param remove_tokens strings to delete, e.g. ["Rs", ",", "-", "/"]
        @return the cleaned price text
        """
        amount = raw_amount
        for token in remove_tokens:
            if not token:
                # An empty token is "found" in every string and replacing
                # it changes nothing, so it would never leave the loop.
                continue
            # Repeat because removing a token can join its neighbours into
            # a new occurrence of the same token.
            while token in amount:
                amount = amount.replace(token, "")
        return amount.strip()

    def parse(self, response):
        """
        Documentation for method parse
        @param response of individual requests
        Using Xpaths needed information is extracted out of the response
        and added to the database
        UNIVERCELL_XPATH4 = section for an individual phone,
                            e.g. '//td[@class="gray-border"]'
        UNIVERCELL_XPATH5 = name of the phone, e.g. './/tr[2]/td/a/text()'
        UNIVERCELL_XPATH6 = quoted price of the phone,
                            e.g. './/tr[3]/th/label/text()'
        UNIVERCELL_REMOVELIST = ';' separated strings (e.g. "Rs;,;-;/")
                            removed from the price so that it can be
                            converted to an integer
        The final price is the quoted price plus VAT_PERCENT percent of it
        (integer arithmetic). Sections without a name or a price, and
        prices that are not integers after cleaning, are logged and
        skipped so that one bad entry does not abort the whole page.
        """
        da = DataHelper()
        # list separated by ';'; empty entries (e.g. from a trailing ';')
        # are dropped because they can never be removed from a string
        remove_tokens = [token for token
                         in get_code_word("UNIVERCELL_REMOVELIST").split(';')
                         if token]
        # The code words do not depend on the section, so look them up once.
        section_xpath = get_code_word("UNIVERCELL_XPATH4")
        name_xpath = get_code_word("UNIVERCELL_XPATH5")
        price_xpath = get_code_word("UNIVERCELL_XPATH6")

        hxs = HtmlXPathSelector(response)
        for site in hxs.select(section_xpath):
            try:
                raw_title = site.select(name_xpath)[0].extract()
                raw_price = site.select(price_xpath)[0].extract()
            except IndexError:
                msg("univercell_price: section without name or price "
                    "skipped on %s" % response.url)
                continue

            # NOTE(review): under Python 2 str() raises UnicodeEncodeError
            # for a non-ASCII title - kept as in the original, confirm
            # what unescape expects before changing it.
            title = str(raw_title).strip()
            amnt = self._clean_amount(raw_price, remove_tokens)
            try:
                quoted_price = int(amnt)
            except ValueError:
                msg("univercell_price: unparseable price %r for %r skipped"
                    % (amnt, title))
                continue

            # additional vat on the price; '//' keeps the integer
            # (truncating) result on every Python version
            vat = self.VAT_PERCENT * quoted_price // 100
            # adding model-name, quotedprice and finalprice
            da.add_new_univerphone(unescape(title), amnt, quoted_price + vat)
114
 
154 ashish 115
# Module-level instance of the spider. Constructing it runs __init__, so the
# tables are initialised and the start urls are read from the database as
# soon as this module is imported.
# NOTE(review): presumably Scrapy's spider loader picks the spider up through
# the SPIDER name - confirm against the Scrapy version in use.
SPIDER = univercell_price()
116