'''
Created on 17-May-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
import urllib
from html2text.unescaping import *
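# NOTE: the wildcard imports above are assumed to provide initialize_table(),
# get_code_word() and DataHelper() (datastore) and unescape() (html2text),
# the only names this module uses from them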


class indiaplaza_extra(BaseSpider):
    """
    Documentation for class indiaplaza_extra
    This spider collects all the information for the individual phones
    and store them in table datastore_datadefinition_indiaplaza_items.
    """
    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in
        the scope of this class.
        The start urls are fed to the spider through start_urls.append.
        DOMAINNAME1 is the name by which this spider is known outside,
        so it is used as the argument when invoking this spider.
        """
        initialize_table()
        #INDIAPLAZA_DOMAINNAME1 = "indiaplaza1"
        INDIAPLAZA_DOMAINNAME1 = get_code_word("INDIAPLAZA_DOMAINNAME1")
        self.domain_name = INDIAPLAZA_DOMAINNAME1
        # use an instance-level list so the start_urls class attribute
        # inherited from BaseSpider is not mutated
        self.start_urls = []

        # get urls from the database and append them in the list for crawling
        da = DataHelper()
        for pitem in da.get_all_ipbasic():
            self.start_urls.append(pitem.v_site.strip())
    
    def start_requests(self):
        """
        Documentation for method start_requests
        To set various properties of the request to be made
        like referer, headers and all.
        @return a list of well formed requests which will be 
        crawled by spider and spider will return the response
        """
        listreq = []
        #for each request a referer has to be set
        #INDIAPLAZA_REFERER = "www.google.com/search"
        INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
        for url1 in self.start_urls:
            request = Request(url = str(url1), callback=self.parse)
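            # setdefault sets the Referer header only if the request does
            # not already carry one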
            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
            listreq.append(request)
        return listreq
        
    def parse(self, response):
        """
        Documentation for method parse
        @param response of individual requests
        Using Xpaths needed information is extracted out of the response
        and added to the database
        Xpath4 = Give us name for individual phone
        Xpath5 = Give us quoted-price for individual phone
        Xpath6 = Give us ship-price for individual phone
        Xpath7 = Give us ship_price for individual phone, if not gettable form xpath6
        Xpath8 = Give us guarantee-info for individual phone
        Xpath9 = Give us guarantee-info for individual phone, if not gettable form xpath8
        Xpath10 = Give us ship-info for individual phone
        Removelist = To filer the prices so as to make them integer for eg remove ',' or 'Rs'
        """
        hxs = HtmlXPathSelector(response)
        #INDIAPLAZA_REMOVELIST = ["Rs.","Rs",",","-","/"]
        #List separated by ';'
        INDIAPLAZA_REMOVELIST = str(get_code_word("INDIAPLAZA_REMOVELIST"))
        if len(INDIAPLAZA_REMOVELIST)>0:
            INDIAPLAZA_REMOVELIST = INDIAPLAZA_REMOVELIST.split(';') 
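        # e.g. the default "Rs.;Rs;,;-;/" splits into ["Rs.", "Rs", ",", "-", "/"]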
        #INDIAPLAZA_XPATH4 = './/div[@class="finDetHdr"]/h1/text()' 
        INDIAPLAZA_XPATH4 = get_code_word("INDIAPLAZA_XPATH4")
        #INDIAPLAZA_XPATH5 = './/div[@class="priceArea"]/span[1]/text()'
        INDIAPLAZA_XPATH5 = get_code_word("INDIAPLAZA_XPATH5")
        #INDIAPLAZA_XPATH6 = './/div[@class="priceArea"]/div[@class="row"][2]/text()'
        INDIAPLAZA_XPATH6 = get_code_word("INDIAPLAZA_XPATH6")
        #INDIAPLAZA_XPATH7 = './/div[@class="priceArea"]/div[@class="row"][2]/span/text()'
        INDIAPLAZA_XPATH7 = get_code_word("INDIAPLAZA_XPATH7")
        #INDIAPLAZA_XPATH8 = './/div[@class="priceArea"]/div[@class="row"][3]/text()'
        INDIAPLAZA_XPATH8 = get_code_word("INDIAPLAZA_XPATH8")
        #INDIAPLAZA_XPATH9 = './/div[@class="priceArea"]/div[@class="row"][4]/text()'
        INDIAPLAZA_XPATH9 = get_code_word("INDIAPLAZA_XPATH9")
        #INDIAPLAZA_XPATH10 = './/div[@class="priceArea"]/div[@class="row"][1]/text()'
        INDIAPLAZA_XPATH10 = get_code_word("INDIAPLAZA_XPATH10")
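        # XPATH6 and XPATH8 are tried first; XPATH7 and XPATH9 act as
        # fall-backs when the primary selector matches nothing (IndexError)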
        name = hxs.select(INDIAPLAZA_XPATH4)[0].extract()
        name = unescape(name)
        price = hxs.select(INDIAPLAZA_XPATH5)[0].extract()
        try:
            ship_price = hxs.select(INDIAPLAZA_XPATH6)[0].extract()
        except IndexError:
            ship_price = hxs.select(INDIAPLAZA_XPATH7)[0].extract()
        try:
            guarantee_info = hxs.select(INDIAPLAZA_XPATH8)[0].extract()
        except IndexError:
            guarantee_info = hxs.select(INDIAPLAZA_XPATH9)[0].extract()
        ship_info = hxs.select(INDIAPLAZA_XPATH10)[0].extract() 
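        # urllib.unquote returns the decoded string (strings are immutable),
        # so each result below must be assigned back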
        
        name = urllib.unquote(name)
        price = urllib.unquote(price)
        ship_price = urllib.unquote(ship_price)
        guarantee_info = urllib.unquote(guarantee_info)
        ship_info = urllib.unquote(ship_info)
        
        #INDIAPLAZA_VAR1 = "Free shipping"
        INDIAPLAZA_VAR1 = get_code_word("INDIAPLAZA_VAR1")
        # compare against the stripped text, since extracted nodes usually
        # carry surrounding whitespace
        if ship_price.strip() == INDIAPLAZA_VAR1:
            ship_price = "0"
        else:
            if ship_price != '':
                for r in INDIAPLAZA_REMOVELIST:
                    # str.replace removes every occurrence of r
                    ship_price = ship_price.replace(r, "")
        if price != '':
            for r in INDIAPLAZA_REMOVELIST:
                price = price.replace(r, "")
        
        name = name.strip()
        price = price.strip()
        ship_price = ship_price.strip()
        guarantee_info = guarantee_info.strip()
        ship_info = ship_info.strip()
        
        shown_pr = int(price)
        final_pr = shown_pr + int(ship_price)
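        # note: int() assumes the remove-list reduced both price strings to
        # bare digits; a ValueError here would signal an unexpected format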
         
        da = DataHelper()
        da.add_ipextra(name,shown_pr,final_pr,guarantee_info,ship_info)        
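
# pre-0.10 Scrapy convention: register the spider as a module-level singleton
# so the framework can discover it under its domain_name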
SPIDER = indiaplaza_extra()