Rev 233 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 09-Jun-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text.unescaping import *
import urllib


class mobilestore_spider0(BaseSpider):
    """
    Documentation for class mobilestore_spider0

    Spider that visits individual product pages and stores the name,
    price and availability of every mobile phone it finds through
    DataHelper.add_new_mobstorephone_new (per the original notes this
    ends up in table datastore_datadefinition_themobilestorephones_new).

    All site-specific strings (domain name, URLs, XPaths) are looked up
    with get_code_word instead of being hard-coded here.
    """

    # When the stored 'mobilestore_count' exceeds this value only the last
    # _RECRAWL_WINDOW product ids are crawled, and failed pages are counted.
    _COUNT_THRESHOLD = 800
    _RECRAWL_WINDOW = 50
    # Once more than this many failures have been recorded, the
    # 'mobilestore_flag' extra var is set to 'FALSE'.
    _MAX_FAILS = 40

    def __init__(self):
        """
        Documentation for constructor

        initialize_table is called to make all the tables known in the
        scope of this class. The spider's domain_name is read from the
        code word MOBILESTORE_DOMAINNAME0 and the start urls are built
        from the product-id range [first_id, mobilestore_count):
        first_id is 1, or mobilestore_count - 50 when mobilestore_count
        is greater than 800.
        """
        initialize_table()
        # formerly hard-coded as "mobilestore0"
        self.domain_name = get_code_word("MOBILESTORE_DOMAINNAME0")

        da = DataHelper()
        last_id = int(da.get_extra_vars('mobilestore_count'))
        first_id = 1
        if last_id > self._COUNT_THRESHOLD:
            first_id = last_id - self._RECRAWL_WINDOW

        # Looked up once instead of on every iteration.
        # NOTE(review): assumes get_code_word is a plain lookup whose result
        # does not change between calls -- confirm.
        # formerly hard-coded as
        # "http://www.themobilestore.in/mobilestore/faces/tiles/product.jsp?productID="
        url_prefix = get_code_word("MOBILESTORE_URL1")
        # formerly hard-coded as "&catalogueID=3"
        url_suffix = get_code_word("MOBILESTORE_URL2")

        # Build a list owned by this instance. Appending to the inherited
        # start_urls would mutate the list object defined on the class and
        # shared with every other spider that inherits it.
        self.start_urls = [url_prefix + str(product_id) + url_suffix
                           for product_id in range(first_id, last_id)]

    def start_requests(self):
        """
        Documentation for method start_requests

        Registers this supplier (domain name and home page) through
        DataHelper.add_supplier and builds one request per start url,
        each with a Referer header and self.parse as callback.

        @return a list of Request objects, one for every entry of
                self.start_urls
        """
        # formerly hard-coded as "www.themobilestore.in"
        homepage = get_code_word("MOBILESTORE_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # formerly hard-coded as "www.google.com/search"
        referer_url = get_code_word("MOBILESTORE_REFERER")

        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            # setdefault keeps a Referer that is already present
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Documentation for method parse

        Extracts the category of the page; when it is "Mobile Phones>"
        the phone's name, price and availability are extracted and
        stored through DataHelper.add_new_mobstorephone_new.

        Code words used (former hard-coded values are noted below):
        MOBILESTORE_XPATH3 = name of the phone
        MOBILESTORE_XPATH5 = name of the phone, tried if XPATH3 fails
        MOBILESTORE_XPATH6 = name of the phone, tried if XPATH5 fails too
        MOBILESTORE_XPATH4 = price of the phone
        MOBILESTORE_XPATH7 = a match means the phone can be bought
        MOBILESTORE_XPATH8 = category of the item

        If the category cannot be extracted at all the page is counted
        as a failure (see _record_failure).

        @param response response of an individual request
        @return None
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)

        # formerly '//span[@id="productLayoutForm:categoryNavigation:navigationList_2:navigationList3"]/text()'
        name_xpath_first = get_code_word("MOBILESTORE_XPATH3")
        # formerly '//div[@id="priceComp"]//tr[2]/td[3]/span/text()'
        price_xpath = get_code_word("MOBILESTORE_XPATH4")
        # formerly '//span[@id="productLayoutForm:categoryNavigation:navigationList_1:navigationList3"]/text()'
        name_xpath_second = get_code_word("MOBILESTORE_XPATH5")
        # formerly '//span[@id="productLayoutForm:categoryNavigation:navigationList_0:navigationList3"]/text()'
        name_xpath_third = get_code_word("MOBILESTORE_XPATH6")
        # formerly '//div[@id="priceComp"]/b/text()'
        buyable_xpath = get_code_word("MOBILESTORE_XPATH7")
        # formerly '//span[@id="productLayoutForm:categoryNavigation:navigationList_0:navigationList1"]/text()'
        category_xpath = get_code_word("MOBILESTORE_XPATH8")

        try:
            category = hxs.select(category_xpath)[0].extract()
            category = unescape(category.strip())
            print(category)
        except Exception:
            # No category on the page: treat the page as a failed product id.
            self._record_failure(da)
            return

        if category != "Mobile Phones>":
            return

        # Only the presence of a match matters, not the extracted text.
        try:
            hxs.select(buyable_xpath)[0].extract()
            availability = "can buy"
        except Exception:
            availability = "can not buy"

        # Every name variant is stored with the same price, so without a
        # usable price nothing can be stored for this page.
        try:
            price = int(hxs.select(price_xpath)[0].extract().strip())
        except Exception:
            msg("mobilestore: no usable price on %s" % response.url)
            return

        # Try the name XPaths in order; the first one that yields a name
        # which can be stored wins. A failure (missing node or storage
        # error) falls through to the next XPath, as a best effort.
        for name_xpath in (name_xpath_first, name_xpath_second,
                           name_xpath_third):
            try:
                name = hxs.select(name_xpath)[0].extract().strip()
                da.add_new_mobstorephone_new(name, price, price, availability)
            except Exception:
                continue
            return

        msg("mobilestore: could not store phone from %s" % response.url)

    def _record_failure(self, da):
        """
        Count a page whose category could not be extracted.

        Failures are only counted when 'mobilestore_count' is greater
        than 800. The counter lives in the extra var 'mobilestore_fails';
        once it exceeds 40 the extra var 'mobilestore_flag' is set to
        'FALSE'.

        @param da DataHelper used to read and write the extra vars
        """
        count = int(da.get_extra_vars('mobilestore_count'))
        if count <= self._COUNT_THRESHOLD:
            return
        fails = int(da.get_extra_vars('mobilestore_fails')) + 1
        da.set_extra_vars('mobilestore_fails', str(fails), '')
        if fails > self._MAX_FAILS:
            da.set_extra_vars('mobilestore_flag', 'FALSE', '')


SPIDER = mobilestore_spider0()