Subversion Repositories SmartDukaan

Rev

Rev 180 | Rev 258 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
180 ashish 1
'''
2
Created on 17-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
 
227 ashish 18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
180 ashish 20
import urllib
21
from xml.dom import INDEX_SIZE_ERR
227 ashish 22
from html2text.unescaping import *
180 ashish 23
 
24
 
25
class indiaplaza_extra(BaseSpider):
    '''
    Spider that visits previously collected Indiaplaza product URLs and
    records the name, price, shipping and guarantee details of each product.

    All site-specific settings (domain name, referer, XPath expressions,
    strings to strip from prices) are fetched with get_code_word() instead
    of being hard-coded; the comments next to each lookup show the value
    that used to be hard-coded in its place.
    '''

    def __init__(self):
        '''
        Prepare the datastore and load the list of product URLs to crawl.

        The URLs come from DataHelper.get_all_ipbasic(); the `v_site`
        attribute of every returned item is stripped and queued.
        '''
        # NOTE(review): BaseSpider.__init__ is not invoked here, exactly as
        # in the original code -- confirm whether the Scrapy version in use
        # needs it.
        initialize_table()

        # Formerly hard-coded as "indiaplaza1".
        self.domain_name = get_code_word("INDIAPLAZA_DOMAINNAME1")

        # Build the URL list on the instance.  The original appended to
        # self.start_urls without ever assigning it, which mutates the list
        # inherited through the class and therefore leaks these URLs into
        # every other spider sharing that list.
        da = DataHelper()
        urls = list(getattr(self, "start_urls", ()))
        urls.extend(pitem.v_site.strip() for pitem in da.get_all_ipbasic())
        self.start_urls = urls

    def start_requests(self):
        '''
        Build one Request per start URL, each carrying a Referer header.

        Returns:
            list of Request objects whose callback is self.parse.
        '''
        # Formerly hard-coded as "www.google.com/search".
        referer_value = get_code_word("INDIAPLAZA_REFERER")

        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            # setdefault keeps any Referer that is already present.
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    @staticmethod
    def _first_text(hxs, xpath):
        '''Return the extracted first match of `xpath`; IndexError if none.'''
        return hxs.select(xpath)[0].extract()

    @staticmethod
    def _first_text_or(hxs, xpath, fallback_xpath):
        '''Like _first_text, but retry with `fallback_xpath` on no match.'''
        try:
            return hxs.select(xpath)[0].extract()
        except IndexError:
            return hxs.select(fallback_xpath)[0].extract()

    @staticmethod
    def _strip_tokens(text, tokens):
        '''
        Remove every occurrence of each token from `text`.

        Removal is repeated per token until none is left, because deleting
        one occurrence can splice a new one together.  Empty tokens are
        skipped: str.find('') is always 0 and replace('', '') changes
        nothing, so an empty token would loop forever.
        '''
        for token in tokens:
            if not token:
                continue
            while token in text:
                text = text.replace(token, "")
        return text

    def parse(self, response):
        '''
        Extract product details from a product page and store them.

        Reads name, price, shipping price, guarantee text and shipping text
        via the configured XPaths, reduces the two prices to integers and
        passes (name, shown price, shown price + shipping, guarantee text,
        shipping text) to DataHelper.add_ipextra().

        Raises:
            IndexError: a required XPath (or both XPaths of a
                primary/fallback pair) matched nothing.
            ValueError: a price is not an integer once the configured
                tokens have been removed.
        '''
        hxs = HtmlXPathSelector(response)

        # ';'-separated list, formerly ["Rs.", "Rs", ",", "-", "/"].
        remove_tokens = get_code_word("INDIAPLAZA_REMOVELIST").split(';')

        # Formerly hard-coded XPaths:
        #   XPATH4  './/div[@class="finDetHdr"]/h1/text()'
        #   XPATH5  './/div[@class="priceArea"]/span[1]/text()'
        #   XPATH6  './/div[@class="priceArea"]/div[@class="row"][2]/text()'
        #   XPATH7  './/div[@class="priceArea"]/div[@class="row"][2]/span/text()'
        #   XPATH8  './/div[@class="priceArea"]/div[@class="row"][3]/text()'
        #   XPATH9  './/div[@class="priceArea"]/div[@class="row"][4]/text()'
        #   XPATH10 './/div[@class="priceArea"]/div[@class="row"][1]/text()'
        # All are looked up before any selection, as in the original.
        name_xpath = get_code_word("INDIAPLAZA_XPATH4")
        price_xpath = get_code_word("INDIAPLAZA_XPATH5")
        ship_price_xpath = get_code_word("INDIAPLAZA_XPATH6")
        ship_price_alt_xpath = get_code_word("INDIAPLAZA_XPATH7")
        guarantee_xpath = get_code_word("INDIAPLAZA_XPATH8")
        guarantee_alt_xpath = get_code_word("INDIAPLAZA_XPATH9")
        ship_info_xpath = get_code_word("INDIAPLAZA_XPATH10")

        name = unescape(self._first_text(hxs, name_xpath))
        price = self._first_text(hxs, price_xpath)
        ship_price = self._first_text_or(hxs, ship_price_xpath,
                                         ship_price_alt_xpath)
        guarantee_info = self._first_text_or(hxs, guarantee_xpath,
                                             guarantee_alt_xpath)
        ship_info = self._first_text(hxs, ship_info_xpath)

        # The original called urllib.unquote() on each value but threw the
        # results away; keep them so the decoding actually takes effect.
        name = urllib.unquote(name)
        price = urllib.unquote(price)
        ship_price = urllib.unquote(ship_price)
        guarantee_info = urllib.unquote(guarantee_info)
        ship_info = urllib.unquote(ship_info)

        # Formerly hard-coded as "Free shipping".  Compare on stripped text:
        # the raw node text is only stripped further down, so surrounding
        # whitespace used to defeat this check and the label then reached
        # int() below.
        free_shipping_label = get_code_word("INDIAPLAZA_VAR1")
        if ship_price.strip() == free_shipping_label.strip():
            ship_price = "0"
        else:
            ship_price = self._strip_tokens(ship_price, remove_tokens)
        price = self._strip_tokens(price, remove_tokens)

        name = name.strip()
        price = price.strip()
        ship_price = ship_price.strip()
        guarantee_info = guarantee_info.strip()
        ship_info = ship_info.strip()

        shown_pr = int(price)
        final_pr = shown_pr + int(ship_price)

        da = DataHelper()
        da.add_ipextra(name, shown_pr, final_pr, guarantee_info, ship_info)
180 ashish 114
# Module-level spider instance.  Creating it runs indiaplaza_extra.__init__
# at import time, which calls initialize_table() and queries DataHelper for
# the URLs to crawl.
# NOTE(review): presumably the name SPIDER is what the Scrapy version in use
# looks for when loading spider modules -- confirm.
SPIDER = indiaplaza_extra()
115