# Rev 265 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''Created on 14-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *
from html2text.unescaping import *


class vendor_links(BaseSpider):
    """Spider that collects the URL of each individual vendor on the
    UNIVERCELL site and stores them in the table
    datastore_datadefinition_univercell_data.
    """

    def __init__(self):
        """Initialize the spider.

        initialize_table() makes all the tables known in the scope of
        this class.  The start URL is fed to the spider through
        start_urls, and domain_name is the name by which this spider is
        known outside (it is used as the argument when invoking it).
        """
        initialize_table()
        # e.g. "univercell"
        self.domain_name = get_code_word("UNIVERCELL_DOMAINNAME")
        # e.g. "http://www.univercell.in/mobiles/populateStore.action"
        # BUGFIX: assign a fresh per-instance list instead of appending to
        # the class-level BaseSpider.start_urls -- appending mutated shared
        # mutable class state, growing it on every instantiation.
        self.start_urls = [get_code_word("UNIVERCELL_URL")]

    def start_requests(self):
        """Build the initial, fully configured requests.

        Registers the supplier (name + homepage) in the table
        datastore_datadefinition_suppliers, and sets the Referer header
        on every request.

        @return: a list of well-formed Requests that the spider will
                 crawl; the responses come back through self.parse.
        """
        # Add the supplier entry, i.e. its name and site.
        # e.g. "http://www.univercell.in"
        homepage = get_code_word("UNIVERCELL_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # Every request carries a fixed referer
        # (e.g. "www.google.com/search").
        referer_value = get_code_word("UNIVERCELL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """Extract vendor names and URLs from the response and persist them.

        @param response: the response of an individual request.

        XPaths (all looked up through get_code_word):
          UNIVERCELL_XPATH1 - the sections (rows) for individual vendors
          UNIVERCELL_XPATH2 - the name of an individual vendor
          UNIVERCELL_XPATH3 - the url of an individual vendor
        VAR1..VAR4 normalize each vendor URL: the segment between VAR1
        (";") and VAR2 ("?") is stripped, and VAR3 ("populate") is
        replaced with VAR4 ("rePopulate") because the pages that carry
        the data use the rePopulate action.
        """
        # Base URL needed to build complete vendor URLs,
        # e.g. "http://www.univercell.in".
        base_url = get_code_word("UNIVERCELL_URL1")
        hxs = HtmlXPathSelector(response)
        # e.g. '//div[@id="mobilesTab"]/table/tr[1]/td/table/tr'
        vendor_rows = hxs.select(get_code_word("UNIVERCELL_XPATH1"))

        # PERF: these lookups are loop-invariant; the original re-fetched
        # each of them once per vendor row.  Assumes get_code_word returns
        # a stable value for a given key -- it is a keyed constant lookup.
        xpath_name = get_code_word("UNIVERCELL_XPATH2")  # './/a/text()'
        xpath_href = get_code_word("UNIVERCELL_XPATH3")  # './/a/@href'
        var1 = get_code_word("UNIVERCELL_VAR1")          # ";"
        var2 = get_code_word("UNIVERCELL_VAR2")          # "?"
        var3 = get_code_word("UNIVERCELL_VAR3")          # "populate"
        var4 = get_code_word("UNIVERCELL_VAR4")          # "rePopulate"

        items = []
        for row in vendor_rows:
            item = {}
            item['name'] = row.select(xpath_name)[0].extract()
            href = str(row.select(xpath_href)[0].extract())
            # Drop the ";...?"-delimited segment (presumably a jsessionid
            # -- TODO confirm), then switch populate -> rePopulate.
            # NOTE(review): when ";" or "?" is absent, find() returns -1
            # and the slice/replace quietly degrade; kept byte-identical
            # to the original behavior.
            a = href.find(var1)
            b = href.find(var2)
            href = href.replace(href[a:b], "")
            item['site'] = href.replace(var3, var4)
            items.append(item)

        da = DataHelper()
        for item in items:
            full_url = base_url + str(item['site'])
            da.add_univervendor(unescape(item['name'].strip()),
                                unescape(full_url))


SPIDER = vendor_links()