Subversion Repositories SmartDukaan

Rev

Rev 263 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 28-May-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from time import *
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *

import urllib
from html2text.unescaping import *

class naaptol_price2(BaseSpider):
    """
    Documentation for class naaptol_price2
    Spider that visits the individual phone pages listed by
    DataHelper.get_allmorenaaptolurls() and, for every page, stores through
    DataHelper:
      - the phone name and its price range (add_new_naaptolphone)
      - every online seller with its price (add_new_ntonlinesp)
      - every local (offline) seller with its price (add_new_ntofflinesp)
    All XPaths and other site specific strings are looked up by key through
    get_code_word(); the example values quoted in the comments below are the
    ones that were recorded in the original source.
    """

    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in
        the scope of this class.
        The start urls are read from the database and fed to the spider
        through start_urls.append.
        NAAPTOL_DOMAINNAME2 (e.g. "naaptol2") becomes domain_name, the name
        by which this spider is known outside, so it is presumably the
        argument used for calling this spider.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME2")

        # get urls from the database and append them in the list for crawling
        da = DataHelper()
        for pitem in da.get_allmorenaaptolurls():
            self.start_urls.append(pitem.url.strip())

    def start_requests(self):
        """
        Documentation for method start_requests
        Builds one request per start url, with self.parse as callback and
        the Referer header defaulted to the NAAPTOL_REFERER code word
        (e.g. "http://www.google.com").
        @return a list of well formed requests which will be
        crawled by the spider
        """
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            # setdefault keeps a Referer that is already present on the request
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Documentation for method parse
        @param response of an individual phone page request
        Using XPaths the needed information is extracted out of the response
        and added to the database.
        Xpath2 = price-range for the individual phone
        Xpath3 = price-range for the individual phone, if unable to retrieve from xpath2
        Xpath4 = number of online sellers for a particular phone
        Xpath5 = price cells of the online sellers
        Xpath6 and Xpath7 = prefix and suffix of the xpath giving the name of
                 the i-th online seller
        Xpath8 = number of offline sellers for a particular phone
        Xpath9 = price cells of the offline sellers
        Xpath10 = names of the offline sellers
        Removelist = tokens filtered out of the prices so that they can be
                 converted to integers, e.g. ',' or 'Rs.'
        An exception raised while reading a seller row is not caught here
        and aborts the rest of the page, exactly as before.
        """
        da = DataHelper()
        remove_list = self._load_remove_list()
        name = self._phone_name_from_url(response.url)
        hxs = HtmlXPathSelector(response)

        price_range = self._extract_price_range(hxs, remove_list)
        da.add_new_naaptolphone(name, price_range)

        # Online sellers are stored before the local ones are even parsed,
        # so a failure in the local section does not lose them.
        online_names, online_prices = self._collect_online_sellers(hxs, remove_list)
        nid = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in zip(online_names, online_prices):
            da.add_new_ntonlinesp(nid, seller_name, seller_price)

        local_names, local_prices = self._collect_local_sellers(hxs, remove_list)
        nid = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in zip(local_names, local_prices):
            da.add_new_ntofflinesp(nid, seller_name, seller_price)

    def _load_remove_list(self):
        """
        Return the NAAPTOL_REMOVELIST code word (e.g. "Rs.;,") split on ';'.
        Empty tokens are dropped: an empty string is found at index 0 of
        every string, so it would keep the removal loop running forever.
        """
        raw = str(get_code_word("NAAPTOL_REMOVELIST"))
        return [token for token in raw.split(';') if token]

    def _strip_tokens(self, text, tokens):
        """
        Return text with every occurrence of every token removed.
        The inner loop repeats because removing one occurrence can join the
        surrounding characters into a new occurrence.
        """
        for token in tokens:
            if not token:
                continue
            while text.find(token) != -1:
                text = text.replace(token, "")
        return text

    def _phone_name_from_url(self, url):
        """
        Derive the phone name from the page url: take the last path segment,
        drop its last five characters (presumably the ".html" suffix) and
        everything up to and including the first '-'.
        """
        name = unescape(str(url))
        name = name[name.rfind("/") + 1:len(name) - 5]
        return name[name.find("-") + 1:]

    def _extract_price_range(self, hxs, remove_list):
        """
        Return the price range of the phone as "<low> to <high>", as a single
        price when only one is available, or as "" when none is available.
        """
        # e.g. '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
        prices = hxs.select(get_code_word("NAAPTOL_XPATH2"))
        try:
            price1 = prices.extract()[0].strip()
        except Exception:
            price1 = ""
        try:
            price2 = prices.extract()[1].strip()
        except Exception:
            price2 = ""

        if price1 == "" and price2 == "":
            # Fall back to the script node and take the text between its
            # first pair of single quotes.
            # e.g. '//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()'
            try:
                script = str(hxs.select(get_code_word("NAAPTOL_XPATH3")).extract()[0])
                open_quote = script.find("'")
                close_quote = script.find("'", open_quote + 1, len(script))
                price1 = script[open_quote + 1:close_quote] + "(approx)"
            except Exception:
                price1 = ""

        # the remove list reduces the prices to digits and '.'
        price1 = self._strip_tokens(price1, remove_list).strip()
        price2 = self._strip_tokens(price2, remove_list).strip()

        if price1 == "Rates Not Available":
            price1 = price2 = ""

        if price2 != "":
            return str(price1) + " to " + str(price2)
        return price1

    def _seller_count(self, hxs, xpath_code):
        """
        Return the number that starts the heading selected by the xpath
        stored under xpath_code, or 0 when it cannot be read.
        """
        try:
            heading = str(hxs.select(get_code_word(xpath_code)).extract()[0])
            heading = heading.decode("utf-8").strip()
            return int(heading[0:heading.find(" ")])
        except Exception:
            return 0

    def _parse_price_cell(self, cell_html, remove_list):
        """
        Return the integer price found in the markup of one price cell.
        The text after the first '>' is tried first; when it is not a
        number the text after the next '>' is used instead (the price is
        sometimes wrapped in one more tag).
        Raises ValueError when the second candidate is not a number either.
        """
        start = cell_html.find(">")
        end = cell_html.find("<", start)
        text = self._strip_tokens(cell_html[start + 1:end], remove_list)
        text = urllib.unquote(text)
        try:
            return int(text)
        except ValueError:
            # stored in format different than previous one
            start = cell_html.find(">", end)
            end = cell_html.find("<", start)
            text = self._strip_tokens(cell_html[start + 1:end], remove_list)
            text = urllib.unquote(text)
            return int(text)

    def _collect_online_sellers(self, hxs, remove_list):
        """
        Return (names, prices) of the online sellers, two lists of equal
        length holding one entry per seller announced by the heading.
        """
        # e.g. '//div[@id="OnlineSellers"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
        count = self._seller_count(hxs, "NAAPTOL_XPATH4")
        # e.g. '//div[@id="onSellerContents"]//td[@class="price"]'
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))

        names = []
        prices = []
        if count > 0:
            # The name xpath is assembled as prefix + seller index + suffix.
            # e.g. '//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop'
            name_prefix = get_code_word("NAAPTOL_XPATH6")
            # e.g. '"]/span/text()'
            name_suffix = get_code_word("NAAPTOL_XPATH7")
        for index in range(count):
            price = self._parse_price_cell(price_cells[index].extract(), remove_list)
            prices.append(price)

            seller = hxs.select(name_prefix + str(index) + name_suffix).extract()[0]
            seller = urllib.unquote(unescape(seller))
            names.append(seller)
        return names, prices

    def _collect_local_sellers(self, hxs, remove_list):
        """
        Return (names, prices) of the local (offline) sellers, two lists of
        equal length holding one entry per seller announced by the heading.
        """
        # e.g. '//div[@id="LocalStores"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
        count = self._seller_count(hxs, "NAAPTOL_XPATH8")
        # e.g. '//div[@id="offSellerContents"]//td[@class="price"]'
        xpath_prices = get_code_word("NAAPTOL_XPATH9")
        # e.g. '//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()'
        xpath_names = get_code_word("NAAPTOL_XPATH10")
        price_cells = hxs.select(xpath_prices)
        name_nodes = hxs.select(xpath_names)

        names = []
        prices = []
        for index in range(count):
            cell_html = price_cells[index].extract()
            seller = name_nodes[index].extract()
            # here the name is unquoted first and unescaped afterwards
            seller = unescape(urllib.unquote(seller))
            price = self._parse_price_cell(cell_html, remove_list)
            prices.append(price)
            names.append(seller)
        return names, prices

# Module-level spider instance. Creating it runs naaptol_price2.__init__, so
# importing this module calls initialize_table() and reads the start urls
# through DataHelper.
# NOTE(review): presumably Scrapy picks the spider up through this SPIDER
# name - confirm against the Scrapy version in use.
SPIDER = naaptol_price2()