Rev 169 | Rev 265 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''Created on 14-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *
from html2text.unescaping import *


class vendor_links(BaseSpider):
    """Spider that scrapes the Univercell store listing for phone links.

    Every site-specific constant (domain name, URLs, XPaths, replacement
    tokens) is read at runtime from the code-word table via get_code_word();
    the commented examples next to each lookup show the expected values.
    Extracted (name, url) pairs are persisted through DataHelper.
    """

    def __init__(self):
        # BUG FIX: the base-class initializer was never invoked, and the
        # start URL was append()ed to the *class-level* BaseSpider.start_urls
        # list, which is shared across all spider instances/subclasses.
        # Initialize the base class and use a fresh instance list instead.
        BaseSpider.__init__(self)
        initialize_table()
        # e.g. "univercell"
        self.domain_name = get_code_word("UNIVERCELL_DOMAINNAME")
        # e.g. "http://www.univercell.in/mobiles/populateStore.action"
        self.start_urls = [get_code_word("UNIVERCELL_URL")]

    def start_requests(self):
        """Register the supplier, then build one Request per start URL.

        Each request gets a Referer header (per the original note, the site
        expects one on every request — presumably it rejects referer-less
        hits; TODO confirm). Returns the list of Request objects for the
        framework to schedule.
        """
        da = DataHelper()
        # e.g. "http://www.univercell.in"
        homepage = get_code_word("UNIVERCELL_HOMEPAGE")
        da.add_supplier(self.domain_name, homepage)

        # e.g. "www.google.com/search"
        referer_value = get_code_word("UNIVERCELL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """Extract a (name, site-url) pair from each listing row and persist
        it via DataHelper.add_univervendor.

        The listing hrefs point at the 'populate' action, but the page that
        actually carries the data uses 'rePopulate'; any ';jsessionid=...'
        style segment (between ';' and '?') must be stripped first.
        """
        # Loop-invariant lookups hoisted out of the per-row loop (the
        # original re-read every code word for each row).
        base_url = get_code_word("UNIVERCELL_URL1")      # e.g. "http://www.univercell.in"
        hxs = HtmlXPathSelector(response)
        rows = hxs.select(get_code_word("UNIVERCELL_XPATH1"))  # e.g. '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'
        name_xpath = get_code_word("UNIVERCELL_XPATH2")  # e.g. './/a/text()'
        href_xpath = get_code_word("UNIVERCELL_XPATH3")  # e.g. './/a/@href'
        semicolon = get_code_word("UNIVERCELL_VAR1")     # e.g. ";"
        qmark = get_code_word("UNIVERCELL_VAR2")         # e.g. "?"
        old_token = get_code_word("UNIVERCELL_VAR3")     # e.g. "populate"
        new_token = get_code_word("UNIVERCELL_VAR4")     # e.g. "rePopulate"

        da = DataHelper()
        for row in rows:
            name = row.select(name_xpath)[0].extract()
            href = str(row.select(href_xpath)[0].extract())

            # Strip the session segment between ';' and '?'.
            # BUG FIX: the original sliced href[a:b] even when find()
            # returned -1 for either marker, removing the wrong substring
            # and corrupting the URL. Only strip when ';' is present, and
            # run to end-of-string when '?' is absent.
            start = href.find(semicolon)
            if start != -1:
                end = href.find(qmark, start)
                if end == -1:
                    end = len(href)
                href = href.replace(href[start:end], "")

            site = href.replace(old_token, new_token)
            da.add_univervendor(unescape(name.strip()),
                                unescape(base_url + site))


SPIDER = vendor_links()