Subversion Repositories SmartDukaan

Rev

Rev 180 | Rev 258 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 17-May-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response

from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
import urllib
from xml.dom import INDEX_SIZE_ERR
from html2text.unescaping import *


class indiaplaza_extra(BaseSpider):
    '''
    Spider that visits previously stored product URLs and records, for each
    page, the product name, shown price, price including shipping, guarantee
    text and shipping text through DataHelper.add_ipextra.

    All site-specific values (domain name, referer, XPath expressions and
    the list of substrings stripped from prices) are looked up at runtime
    with get_code_word.
    '''

    def __init__(self):
        '''
        Prepare the data store, set the spider's domain name and load the
        URLs to crawl from the database.
        '''
        initialize_table()
        # e.g. INDIAPLAZA_DOMAINNAME1 = "indiaplaza1"
        self.domain_name = get_code_word("INDIAPLAZA_DOMAINNAME1")

        # Get the urls from the database. A fresh list is bound to the
        # instance on purpose: this class never assigns start_urls itself, so
        # appending to self.start_urls would mutate whatever list is
        # inherited through the class hierarchy and leak these urls into
        # every other spider sharing it.
        da = DataHelper()
        self.start_urls = [pitem.v_site.strip()
                           for pitem in da.get_all_ipbasic()]

    def start_requests(self):
        '''
        Build one Request per start url, each routed to parse().

        Returns:
            list of Request objects whose "Referer" header is set to the
            configured INDIAPLAZA_REFERER value (unless already present).
        '''
        # For each request a referer has to be set,
        # e.g. INDIAPLAZA_REFERER = "www.google.com/search"
        referer_value = get_code_word("INDIAPLAZA_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    @staticmethod
    def _first_text(hxs, xpath, fallback_xpath=None):
        '''
        Return the extracted first match of xpath, trying fallback_xpath
        when xpath matches nothing.

        Raises:
            IndexError: when neither expression yields a match.
        '''
        try:
            return hxs.select(xpath)[0].extract()
        except IndexError:
            if fallback_xpath is None:
                raise
            return hxs.select(fallback_xpath)[0].extract()

    @staticmethod
    def _remove_tokens(text, tokens):
        '''
        Return text with every occurrence of each token removed.

        Removal is repeated per token until it no longer occurs, because
        deleting one occurrence can join the surrounding characters into a
        new one. Empty tokens are skipped: "" is found in every string and
        would otherwise make the loop spin forever (this happens when the
        ';'-separated configuration value has a trailing or doubled ';').
        '''
        for token in tokens:
            if not token:
                continue
            while token in text:
                text = text.replace(token, "")
        return text

    def parse(self, response):
        '''
        Extract name, price, shipping price, guarantee and shipping details
        from a product page and store them with DataHelper.add_ipextra.

        Args:
            response: the downloaded page, wrapped in an HtmlXPathSelector.

        Raises:
            IndexError: when a required XPath (and its fallback, if any)
                matches nothing on the page.
            ValueError: when the cleaned price or shipping price is not an
                integer literal.
        '''
        hxs = HtmlXPathSelector(response)

        # Substrings to strip from prices; the configured value is a list
        # separated by ';', e.g. "Rs.;Rs;,;-;/"
        remove_tokens = get_code_word("INDIAPLAZA_REMOVELIST").split(';')

        # e.g. './/div[@class="finDetHdr"]/h1/text()'
        name_xpath = get_code_word("INDIAPLAZA_XPATH4")
        # e.g. './/div[@class="priceArea"]/span[1]/text()'
        price_xpath = get_code_word("INDIAPLAZA_XPATH5")
        # e.g. './/div[@class="priceArea"]/div[@class="row"][2]/text()'
        ship_price_xpath = get_code_word("INDIAPLAZA_XPATH6")
        # e.g. './/div[@class="priceArea"]/div[@class="row"][2]/span/text()'
        ship_price_alt_xpath = get_code_word("INDIAPLAZA_XPATH7")
        # e.g. './/div[@class="priceArea"]/div[@class="row"][3]/text()'
        guarantee_xpath = get_code_word("INDIAPLAZA_XPATH8")
        # e.g. './/div[@class="priceArea"]/div[@class="row"][4]/text()'
        guarantee_alt_xpath = get_code_word("INDIAPLAZA_XPATH9")
        # e.g. './/div[@class="priceArea"]/div[@class="row"][1]/text()'
        ship_info_xpath = get_code_word("INDIAPLAZA_XPATH10")

        name = unescape(self._first_text(hxs, name_xpath))
        price = self._first_text(hxs, price_xpath)
        ship_price = self._first_text(hxs, ship_price_xpath,
                                      ship_price_alt_xpath)
        guarantee_info = self._first_text(hxs, guarantee_xpath,
                                          guarantee_alt_xpath)
        ship_info = self._first_text(hxs, ship_info_xpath)

        # NOTE: earlier revisions called urllib.unquote() on each value here
        # but discarded the results, so the calls had no effect; they have
        # been dropped without changing what gets stored.

        # e.g. INDIAPLAZA_VAR1 = "Free shipping"
        free_shipping_label = get_code_word("INDIAPLAZA_VAR1")
        # Compare stripped values so surrounding whitespace in the page text
        # does not hide the free-shipping label and break int() below.
        if ship_price.strip() == free_shipping_label.strip():
            ship_price = "0"
        else:
            ship_price = self._remove_tokens(ship_price, remove_tokens)
        price = self._remove_tokens(price, remove_tokens)

        name = name.strip()
        price = price.strip()
        ship_price = ship_price.strip()
        guarantee_info = guarantee_info.strip()
        ship_info = ship_info.strip()

        shown_pr = int(price)
        final_pr = shown_pr + int(ship_price)

        da = DataHelper()
        da.add_ipextra(name, shown_pr, final_pr, guarantee_info, ship_info)
# Module-level spider instance, created at import time. Instantiation runs
# __init__, which calls initialize_table() and loads the start urls through
# DataHelper. NOTE(review): presumably the Scrapy version in use discovers
# spiders through this SPIDER name -- confirm.
SPIDER = indiaplaza_extra()