WebSVN – SmartDukaan – /prototype/naaptolpass2/src/demo/spiders/spider2.py

'''
Created on 27-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from time import *
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *

import urllib
from html2text.unescaping import *

class naaptol_price(BaseSpider):
    """
    Second-pass spider for individual phone pages on naaptol.com.

    The urls collected by the previous naaptol spider come in two forms,
    e.g. "http://www.naaptol.com/features/10417-Fly-E300.html" and
    "http://www.naaptol.com/price/10417-Fly-E300.html".  To make data
    extraction symmetric this spider accomplishes two tasks:

    * for urls whose category segment is 'features' it collects the price
      range and the online/local seller prices of the phone and stores
      them through DataHelper (the phone rows belong to the table
      datastore_datadefinition_naaptol_phones);
    * for urls whose category segment is 'price' a new url with that
      segment replaced by 'features' is framed and stored in the table
      datastore_datadefinition_morenaaptol_urls.

    All xpaths and keywords are read at run time with get_code_word.
    """

    def __init__(self):
        """
        Set up the tables, the spider name and the start urls.

        initialize_table is called to make all the tables known in the
        scope of this class.  domain_name, the name by which this spider
        is known (and called) from outside, is read from the code word
        NAAPTOL_DOMAINNAME1.  Every url returned by
        DataHelper.get_allnaaptolurls is fed to the spider as a start url.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME1")

        # start_urls is never assigned on the instance elsewhere in this
        # class, so appending to it directly would mutate whatever list is
        # inherited (presumably a class-level list on BaseSpider shared by
        # every spider -- confirm).  Work on a per-instance copy instead.
        # NOTE(review): BaseSpider.__init__ is not called here, exactly as
        # in the original; confirm whether the Scrapy version in use
        # expects it.
        self.start_urls = list(self.start_urls)

        # get urls from the database and append them in the list for crawling
        da = DataHelper()
        for pitem in da.get_allnaaptolurls():
            self.start_urls.append(pitem.url.strip())

    def start_requests(self):
        """
        Build the initial requests, one per start url.

        Each request gets self.parse as callback and a "Referer" header
        (unless one is already set) taken from the code word
        NAAPTOL_REFERER.

        @return a list of well formed requests which will be crawled by
        the spider
        """
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Handle the response of one phone url.

        The category is the path segment just before the last "/" of the
        (unescaped) response url.  If it equals one of the keywords of
        NAAPTOL_CHKLIST2, the same url with that segment replaced by
        NAAPTOL_PART is stored with add_morenaaptolurl.  If it equals
        NAAPTOL_PART, the phone name, price range and seller prices are
        extracted from the page and added to the database.

        Code words used (example values are the ones recorded in the
        original inline comments):
        NAAPTOL_CHKLIST2   = ';' separated keywords to be replaced, e.g. 'price'
        NAAPTOL_PART       = replacement segment, e.g. 'features'
        NAAPTOL_REMOVELIST = ';' separated tokens filtered out of prices so
                             that they become integers, e.g. 'Rs.' and ','
        NAAPTOL_XPATH2     = price-range for individual phone
        NAAPTOL_XPATH3     = price-range, if unable to retrieve from xpath2
        NAAPTOL_XPATH4     = number of onlinesellers for a particular phone
        NAAPTOL_XPATH5     = prices offered by onlinesellers
        NAAPTOL_XPATH6/7   = prefix/suffix of the xpath giving the name of
                             the i-th onlineseller
        NAAPTOL_XPATH8     = number of offlinesellers for a particular phone
        NAAPTOL_XPATH9     = prices offered by offlinesellers
        NAAPTOL_XPATH10    = names of offlinesellers

        @param response of an individual request
        @return None; all results are written through DataHelper
        """
        msg(response.url)
        site = unescape(response.url)
        last_slash = site.rfind("/")
        prev_slash = site.rfind("/", 0, last_slash - 1)
        catg = site[prev_slash + 1:last_slash]
        da = DataHelper()

        check_list = self._split_code_list(get_code_word("NAAPTOL_CHKLIST2"))
        features_part = get_code_word("NAAPTOL_PART")
        remove_list = self._split_code_list(
            get_code_word("NAAPTOL_REMOVELIST"))

        # 'price' urls provide the same data as 'features' urls but in a
        # different format, so only remember the 'features' variant of them.
        for keyword in check_list:
            if keyword == catg:
                # Swap the category segment only.  A plain
                # site.replace(keyword, ...) would also rewrite the keyword
                # wherever else it occurs, e.g. inside the product name.
                features_url = (site[:prev_slash + 1] + features_part +
                                site[last_slash:])
                da.add_morenaaptolurl(features_url)

        if catg != features_part:
            return

        name = self._phone_name(unescape(str(response.url)))
        hxs = HtmlXPathSelector(response)

        price1, price2 = self._price_range(hxs, remove_list)
        # price_range = "price1" or "price1 to price2"
        price_range = price1
        if price2 != "":
            price_range = str(price_range) + " to " + str(price2)
        da.add_new_naaptolphone(name, str(price_range))

        online_sellers = self._online_sellers(hxs, remove_list)
        nid = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in online_sellers:
            da.add_new_ntonlinesp(nid, seller_name, seller_price)

        local_sellers = self._local_sellers(hxs, remove_list)
        nid = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in local_sellers:
            da.add_new_ntofflinesp(nid, seller_name, seller_price)

    @staticmethod
    def _split_code_list(value):
        """Split a ';' separated code word into its non-empty tokens."""
        # Empty tokens (from an empty value or a stray ';') are dropped:
        # they can never match anything useful and would make the token
        # removal loop spin forever.
        return [token for token in str(value).split(';') if token]

    @staticmethod
    def _strip_tokens(text, tokens):
        """Return text with every occurrence of each token removed."""
        for token in tokens:
            if not token:
                # "".find("") is always 0, so an empty token would never
                # let the loop below terminate.
                continue
            # Repeat until stable: a removal can join the two sides into a
            # new occurrence of the token.
            while text.find(token) != -1:
                text = text.replace(token, "")
        return text

    @staticmethod
    def _phone_name(url):
        """Derive the phone name from a url like .../10417-Fly-E300.html."""
        # Drop everything up to the last "/" and the 5 trailing characters
        # (".html" in the sample urls), then the id before the first "-".
        leaf = url[url.rfind("/") + 1:len(url) - 5]
        return leaf[leaf.find("-") + 1:]

    @staticmethod
    def _seller_count(hxs, xpath):
        """Return the leading integer of the heading at xpath, or 0."""
        try:
            heading = str(hxs.select(xpath).extract()[0])
            heading = heading.decode("utf-8").strip()
            # Take the text up to the first space; unlike slicing with
            # find(" ") this keeps the last character when the heading
            # contains no space at all.
            return int(heading.split(" ", 1)[0])
        except Exception:
            # Best effort: a missing or unparsable heading means no sellers.
            return 0

    def _parse_price_cell(self, cell_html, remove_list):
        """
        Convert the markup of one price cell to an integer price.

        The text right after the first tag is tried first; if it is not a
        number the price is stored in a different format and the text
        after the following tag is used.  Raises ValueError when that is
        not a number either.
        """
        tag_end = cell_html.find(">")
        next_tag = cell_html.find("<", tag_end)
        text = self._strip_tokens(cell_html[tag_end + 1:next_tag],
                                  remove_list)
        text = urllib.unquote(text)
        try:
            return int(text)
        except ValueError:
            # stored in format different than previous one
            tag_end = cell_html.find(">", next_tag)
            next_tag = cell_html.find("<", tag_end)
            text = self._strip_tokens(cell_html[tag_end + 1:next_tag],
                                      remove_list)
            text = urllib.unquote(text)
            return int(text)

    def _price_range(self, hxs, remove_list):
        """Return the cleaned (price1, price2) pair; "" marks a missing one."""
        prices = hxs.select(get_code_word("NAAPTOL_XPATH2"))
        try:
            price1 = prices.extract()[0].strip()
        except Exception:
            price1 = ""
        try:
            price2 = prices.extract()[1].strip()
        except Exception:
            price2 = ""

        if price1 == "" and price2 == "":
            # Fall back to the script based markup, where the price is the
            # first single-quoted string of the script text.
            try:
                script = str(
                    hxs.select(get_code_word("NAAPTOL_XPATH3")).extract()[0])
                quote_start = script.find("'")
                quote_end = script.find("'", quote_start + 1, len(script))
                price1 = script[quote_start + 1:quote_end] + "(approx)"
            except Exception:
                price1 = price2 = ""

        # remove_list converts the prices to a format containing only
        # numbers and '.'
        price1 = self._strip_tokens(price1, remove_list).strip()
        price2 = self._strip_tokens(price2, remove_list).strip()

        if price1 == "Rates Not Available":
            price1 = price2 = ""
        return price1, price2

    def _online_sellers(self, hxs, remove_list):
        """Return a list of (name, price) pairs for the online sellers."""
        count = self._seller_count(hxs, get_code_word("NAAPTOL_XPATH4"))
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))
        sellers = []
        if count <= 0:
            return sellers

        # The name xpath is prefix + seller index + suffix; the two code
        # words do not change between sellers, so fetch them only once.
        name_prefix = get_code_word("NAAPTOL_XPATH6")
        name_suffix = get_code_word("NAAPTOL_XPATH7")
        for index in range(count):
            price = self._parse_price_cell(price_cells[index].extract(),
                                           remove_list)
            name_xpath = name_prefix + str(index) + name_suffix
            seller_name = hxs.select(name_xpath).extract()[0]
            seller_name = urllib.unquote(unescape(seller_name))
            sellers.append((seller_name, price))
        return sellers

    def _local_sellers(self, hxs, remove_list):
        """Return a list of (name, price) pairs for the local sellers."""
        count = self._seller_count(hxs, get_code_word("NAAPTOL_XPATH8"))
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH9"))
        name_nodes = hxs.select(get_code_word("NAAPTOL_XPATH10"))
        sellers = []
        for index in range(count):
            cell_html = price_cells[index].extract()
            seller_name = name_nodes[index].extract()
            seller_name = unescape(urllib.unquote(seller_name))
            price = self._parse_price_cell(cell_html, remove_list)
            sellers.append((seller_name, price))
        return sellers
            
# Module-level instance of the spider.  Because this is a top-level
# statement, naaptol_price.__init__ runs when the module is imported.
# NOTE(review): presumably SPIDER is the name the Scrapy version in use
# looks up when loading spider modules -- confirm.
SPIDER = naaptol_price()
Subversion Repositories SmartDukaan

(root)/prototype/naaptolpass2/src/demo/spiders/spider2.py – Rev 272