# Rev 227 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''Created on 17-May-2010@author: gaurav'''from scrapy.spider import BaseSpiderfrom scrapy.selector import HtmlXPathSelectorfrom scrapy.http import Requestfrom demo.items import DemoItemfrom scrapy.contrib.spidermiddleware import refererfrom scrapy.http.headers import Headersfrom scrapy.http.request.form import FormRequestfrom scrapy.log import msgfrom scrapy.http.response import Responsefrom datastore import DataAccessorfrom datastore.DataAccessor import DataHelperimport urllibfrom xml.dom import INDEX_SIZE_ERRclass indiaplaza_extra(BaseSpider):def __init__(self):self.domain_name = "indiaplazaextrainfo"da = DataHelper()for pitem in da.get_all_ipbasic():self.start_urls.append(pitem.v_site.strip())def start_requests(self):listreq = []for url1 in self.start_urls:request = Request(url = url1, callback=self.parse)request.headers.setdefault("Referer", "www.google.com/search")listreq.append(request)return listreqdef parse(self, response):hxs = HtmlXPathSelector(response)#sites = hxs.select('//td[@class="gray-border"]')#msg(response.url)#print(len(sites))name = hxs.select('.//div[@class="finDetHdr"]/h1/text()')[0].extract()price = hxs.select('.//div[@class="priceArea"]/span[1]/text()')[0].extract()try:ship_price = hxs.select('.//div[@class="priceArea"]/div[@class="row"][2]/text()')[0].extract()except IndexError:ship_price = hxs.select('.//div[@class="priceArea"]/div[@class="row"][2]/span/text()')[0].extract()try:guarantee_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][3]/text()')[0].extract()except IndexError:guarantee_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][4]/text()')[0].extract()ship_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][1]/text()')[0].extract()urllib.unquote(name)urllib.unquote(price)urllib.unquote(ship_price)urllib.unquote(guarantee_info)urllib.unquote(ship_info)if ship_price == "Free shipping" :ship_price = "0"else :ship_price = ship_price.replace("Rs.","")price = price.replace("Rs.","")name = 
name.strip()price = price.strip()ship_price = ship_price.strip()guarantee_info = guarantee_info.strip()ship_info = ship_info.strip()shown_pr = int(price)final_pr = shown_pr + int(ship_price)print nameprint shown_prprint final_prprint guarantee_infoprint ship_infoda = DataHelper()da.add_ipextra(name,shown_pr,final_pr,guarantee_info,ship_info)'''for site in sites:item = {}#tmp = site.select('.//tr[2]/td/a/text()')item['name'] = response.select('.//div[@class="finDetHdr"]/h1/text()')[0].extract()#psite = site.select(".//a[3][@href]/@href")[0].extract()item['price'] =site.select('.//tr[3]/th/label/text()')[0].extract()items.append(item)for i in items:str1 = str(i['title']).strip()print str1amnt = i['price'].replace(",","")amnt = amnt.replace("Rs", "")amnt = amnt.replace("/", "")amnt = amnt.replace("-", "")amnt = amnt.strip()pr = int(amnt) + vatplustax#print prda.add_new_univerphone(str1,amnt,pr)'''#lt = len(da.get_all_phones())#print "length" + str(lt)#for ph in da.get_all_phones():# print ph#f = open('/home/gaurav/twopassscrapy/pricelinks.txt', 'w')#for i in items:#f.write(i['title'])#f.write("\n")#f.write(i['link'])#f.write("\n")#f.close()SPIDER = indiaplaza_extra()