Rev 169 | Rev 265 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''Created on 14-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *
from html2text.unescaping import *


class vendor_links(BaseSpider):
    """Spider that scrapes the Univercell store listing for phone links.

    Every site-specific constant (domain name, URLs, XPaths, replacement
    tokens) is read at runtime from the code-word table via get_code_word();
    the commented examples next to each lookup show the expected values.
    Extracted (name, url) pairs are persisted through DataHelper.
    """

    def __init__(self):
        # BUG FIX: the base-class initializer was never invoked, and the
        # start URL was append()ed to the *class-level* BaseSpider.start_urls
        # list, which is shared across all spider instances/subclasses.
        # Initialize the base class and use a fresh instance list instead.
        BaseSpider.__init__(self)
        initialize_table()
        # e.g. "univercell"
        self.domain_name = get_code_word("UNIVERCELL_DOMAINNAME")
        # e.g. "http://www.univercell.in/mobiles/populateStore.action"
        self.start_urls = [get_code_word("UNIVERCELL_URL")]

    def start_requests(self):
        """Register the supplier, then build one Request per start URL.

        Each request gets a Referer header (per the original note, the site
        expects one on every request — presumably it rejects referer-less
        hits; TODO confirm). Returns the list of Request objects for the
        framework to schedule.
        """
        da = DataHelper()
        # e.g. "http://www.univercell.in"
        homepage = get_code_word("UNIVERCELL_HOMEPAGE")
        da.add_supplier(self.domain_name, homepage)

        # e.g. "www.google.com/search"
        referer_value = get_code_word("UNIVERCELL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """Extract a (name, site-url) pair from each listing row and persist
        it via DataHelper.add_univervendor.

        The listing hrefs point at the 'populate' action, but the page that
        actually carries the data uses 'rePopulate'; any ';jsessionid=...'
        style segment (between ';' and '?') must be stripped first.
        """
        # Loop-invariant lookups hoisted out of the per-row loop (the
        # original re-read every code word for each row).
        base_url = get_code_word("UNIVERCELL_URL1")      # e.g. "http://www.univercell.in"
        hxs = HtmlXPathSelector(response)
        rows = hxs.select(get_code_word("UNIVERCELL_XPATH1"))  # e.g. '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'
        name_xpath = get_code_word("UNIVERCELL_XPATH2")  # e.g. './/a/text()'
        href_xpath = get_code_word("UNIVERCELL_XPATH3")  # e.g. './/a/@href'
        semicolon = get_code_word("UNIVERCELL_VAR1")     # e.g. ";"
        qmark = get_code_word("UNIVERCELL_VAR2")         # e.g. "?"
        old_token = get_code_word("UNIVERCELL_VAR3")     # e.g. "populate"
        new_token = get_code_word("UNIVERCELL_VAR4")     # e.g. "rePopulate"

        da = DataHelper()
        for row in rows:
            name = row.select(name_xpath)[0].extract()
            href = str(row.select(href_xpath)[0].extract())

            # Strip the session segment between ';' and '?'.
            # BUG FIX: the original sliced href[a:b] even when find()
            # returned -1 for either marker, removing the wrong substring
            # and corrupting the URL. Only strip when ';' is present, and
            # run to end-of-string when '?' is absent.
            start = href.find(semicolon)
            if start != -1:
                end = href.find(qmark, start)
                if end == -1:
                    end = len(href)
                href = href.replace(href[start:end], "")

            site = href.replace(old_token, new_token)
            da.add_univervendor(unescape(name.strip()),
                                unescape(base_url + site))


SPIDER = vendor_links()