Rev 183 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Spider that reads the themobilestore.in sitemap and stores the phone URLs.

Created on 20-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore import DataAccessor
from datastore.DataAccessor import DataHelper
from html2text.unescaping import *


def _category_segment(site):
    '''
    Return the third-from-last '/'-delimited piece of ``site``, or "".

    The three right-to-left searches (including the ``- 1`` on each search
    bound, which leaves the character directly before the previously found
    slash out of the next search) are kept exactly as the spider has always
    performed them, so the set of URLs accepted by ``parse`` is unchanged.

    @param site: URL string taken from the sitemap
    @return: the text between the third-last and second-last slash that the
             searches find, or "" when fewer than three slashes are found
             or the third one is at index 0
    '''
    pos1 = site.rfind('/')
    if pos1 == -1:
        return ""
    pos2 = site.rfind('/', 0, pos1 - 1)
    if pos2 == -1:
        return ""
    pos3 = site.rfind('/', 0, pos2 - 1)
    if pos3 > 0:
        return site[pos3 + 1:pos2]
    return ""


class mobilestore_spider(BaseSpider):
    '''
    Crawls the sitemap of www.themobilestore.in and hands every URL whose
    category segment is listed in the check-list to the data store.
    '''

    def __init__(self):
        '''
        Set the spider's domain name and its single start URL (the sitemap).
        '''
        MOBILESTORE_DOMAINNAME = "mobilestore"
        MOBILESTORE_URL = "http://www.themobilestore.in/sitemap.xml"
        self.domain_name = MOBILESTORE_DOMAINNAME
        # Bind a fresh list on the instance instead of appending to the
        # inherited ``start_urls``. BaseSpider is not visible in this file;
        # if it declares ``start_urls`` as a class-level list, appending
        # would add this URL to every other spider sharing that list and
        # would add it again each time this class is instantiated.
        self.start_urls = [MOBILESTORE_URL]

    def start_requests(self):
        '''
        Register the supplier and build the initial requests.

        @return: list with one Request per entry of ``self.start_urls``,
                 each calling back into ``parse`` and carrying a Referer
                 header
        '''
        # adding entry for the supplier i.e. its name and site
        MOBILESTORE_HOMEPAGE = "www.themobilestore.in"
        da = DataHelper()
        da.add_supplier(self.domain_name, MOBILESTORE_HOMEPAGE)

        # for each request a referer has to be set
        MOBILESTORE_REFERER = "www.google.com/search"
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            # setdefault leaves an already present Referer header alone
            request.headers.setdefault("Referer", MOBILESTORE_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
        '''
        Extract every <loc> entry of the sitemap and store the phone URLs.

        A URL is stored through ``DataHelper.add_mobstoreurl`` when the
        segment returned by ``_category_segment`` is one of the check-list
        entries.

        @param response: response holding the downloaded sitemap
        '''
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        MOBILESTORE_XPATH1 = '//url/loc/text()'
        phone_urls = hxs.select(MOBILESTORE_XPATH1)
        # elements in the check-list are specific to this site and
        # determine which sitemap URLs are considered valid
        MOBILESTORE_CHKLIST1 = ["Mobile_Phones"]
        for i in phone_urls:
            site = unescape(i.extract())
            # adding valid urls to the DB from the site-map
            if _category_segment(site) in MOBILESTORE_CHKLIST1:
                da.add_mobstoreurl(site)


SPIDER = mobilestore_spider()