# Repository-viewer header (not program text): Rev 262 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 27-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from time import *
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *
import urllib
from html2text.unescaping import *


def _split_code_list(raw):
    """Split a ';'-separated code-word value into its non-empty tokens.

    Empty tokens (an empty value, a trailing ';', or ';;') are dropped so
    that callers never have to match or remove an empty string.
    """
    return [token for token in str(raw).split(';') if token]


def _strip_tokens(text, tokens):
    """Return text with every occurrence of each token removed.

    Removal is repeated until the token no longer occurs, because deleting
    one occurrence can join its neighbours into a new occurrence.
    Empty tokens are skipped: "" is found in every string and removing it
    changes nothing, so it would loop forever.
    """
    for token in tokens:
        if not token:
            continue
        while token in text:
            text = text.replace(token, "")
    return text


def _clean_price_text(text, remove_tokens):
    """Drop the configured noise tokens from text and URL-unquote the rest."""
    return urllib.unquote(_strip_tokens(text, remove_tokens))


def _cell_price(cell_html, remove_tokens):
    """Parse the integer price out of the markup of one price cell.

    The text between the first '>' and the following '<' is tried first.
    If that is not an integer, the price is assumed to sit one tag deeper
    and the next '>' ... '<' span is used instead.

    Raises ValueError when neither span holds an integer.
    """
    start = cell_html.find(">")
    end = cell_html.find("<", start)
    try:
        return int(_clean_price_text(cell_html[start + 1:end], remove_tokens))
    except ValueError:
        # stored in a format different from the previous one
        start = cell_html.find(">", end)
        end = cell_html.find("<", start)
        return int(_clean_price_text(cell_html[start + 1:end], remove_tokens))


def _seller_count(hxs, xpath_code):
    """Return the seller count announced in a section heading.

    The XPath stored under the code word xpath_code selects the heading
    text; the count is the part of that text before the first space.
    Any failure (missing code word, no heading, non-numeric text) yields 0.
    """
    try:
        heading = str(hxs.select(get_code_word(xpath_code)).extract()[0])
        heading = heading.decode("utf-8").strip()
        # partition keeps the whole text when there is no space at all
        return int(heading.partition(" ")[0])
    except Exception:
        return 0


def _online_sellers(hxs, remove_tokens):
    """Return a list of (seller name, price) pairs for the online sellers."""
    # e.g. NAAPTOL_XPATH4 = '//div[@id="OnlineSellers"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
    count = _seller_count(hxs, "NAAPTOL_XPATH4")
    # e.g. NAAPTOL_XPATH5 = '//div[@id="onSellerContents"]//td[@class="price"]'
    cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))
    # e.g. NAAPTOL_XPATH6 = '//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop'
    name_prefix = get_code_word("NAAPTOL_XPATH6")
    # e.g. NAAPTOL_XPATH7 = '"]/span/text()'
    name_suffix = get_code_word("NAAPTOL_XPATH7")

    sellers = []
    # The heading count is only a hint; never index past the cells found.
    for index in range(min(count, len(cells))):
        price = _cell_price(cells[index].extract(), remove_tokens)
        # the seller name lives in an element whose id ends in the row index
        path = name_prefix + str(index) + name_suffix
        seller = hxs.select(path).extract()[0]
        seller = unescape(seller)
        seller = urllib.unquote(seller)
        sellers.append((seller, price))
    return sellers


def _local_sellers(hxs, remove_tokens):
    """Return a list of (seller name, price) pairs for the offline sellers."""
    # e.g. NAAPTOL_XPATH8 = '//div[@id="LocalStores"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
    count = _seller_count(hxs, "NAAPTOL_XPATH8")
    # e.g. NAAPTOL_XPATH9 = '//div[@id="offSellerContents"]//td[@class="price"]'
    cells = hxs.select(get_code_word("NAAPTOL_XPATH9"))
    # e.g. NAAPTOL_XPATH10 = '//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()'
    names = hxs.select(get_code_word("NAAPTOL_XPATH10"))

    sellers = []
    # The heading count is only a hint; never index past the nodes found.
    for index in range(min(count, len(cells), len(names))):
        seller = names[index].extract()
        seller = urllib.unquote(seller)
        seller = unescape(seller)
        price = _cell_price(cells[index].extract(), remove_tokens)
        sellers.append((seller, price))
    return sellers


def _price_bounds(hxs, remove_tokens):
    """Return the (low, high) price strings shown for a phone.

    Either value may be "" when the page does not provide it.  When the
    primary XPath yields nothing, the quoted value inside the fallback
    script node is used as the low price and marked "(approx)".
    """
    # e.g. NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
    prices = hxs.select(get_code_word("NAAPTOL_XPATH2"))
    try:
        low = prices.extract()[0].strip()
    except Exception:
        low = ""
    try:
        high = prices.extract()[1].strip()
    except Exception:
        high = ""

    try:
        if low == "" and high == "":
            # e.g. NAAPTOL_XPATH3 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()'
            script = hxs.select(get_code_word("NAAPTOL_XPATH3"))
            script_text = str(script.extract()[0])
            quote_open = script_text.find("'")
            quote_close = script_text.find("'", quote_open + 1, len(script_text))
            low = script_text[quote_open + 1:quote_close] + "(approx)"
            high = ""
    except Exception:
        low = high = ""

    # reduce the prices to digits and '.' by dropping the configured tokens
    if low != '':
        low = _strip_tokens(low, remove_tokens).strip()
    if high != '':
        high = _strip_tokens(high, remove_tokens).strip()
    if low == "Rates Not Available":
        low = high = ""
    return low, high


class naaptol_price(BaseSpider):
    """
    Documentation for class naaptol_price

    The urls collected by the previous spider for naaptol.com are
    redirected to get the data for individual phones.  Some are of the form
    "http://www.naaptol.com/features/10417-Fly-E300.html" while others are
    of the form "http://www.naaptol.com/price/10417-Fly-E300.html".
    To make data extraction symmetric, this spider accomplishes 2 tasks.
    First, for the urls containing 'features' it collects the information
    for the individual phones and stores it in the table
    datastore_datadefinition_naaptol_phones.
    For the ones containing 'price' in the url, a new url having 'price'
    replaced with 'features' is framed and stored in the table
    datastore_datadefinition_morenaaptol_urls.
    """

    def __init__(self):
        """
        Documentation for constructor

        initialize_table is called to make all the tables known in the
        scope of this class.
        The start urls are read from the database and stored in start_urls.
        The domain name is the name by which this spider is known outside,
        so it is used as the argument for calling this spider.
        """
        initialize_table()
        # e.g. NAAPTOL_DOMAINNAME1 = "naaptol1"
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME1")

        # Get urls from the database for crawling.  A fresh per-instance
        # list is assigned instead of appending to the inherited one, which
        # this class never creates and is presumably a class-level list
        # shared with every other spider.
        da = DataHelper()
        self.start_urls = [pitem.url.strip()
                           for pitem in da.get_allnaaptolurls()]

    def start_requests(self):
        """
        Documentation for method start_requests

        Sets the properties of the requests to be made, i.e. the referer
        header.

        @return a list of well formed requests which will be crawled by
        the spider; the responses are handed to parse
        """
        # for each request a referer has to be set
        # e.g. NAAPTOL_REFERER = "http://www.google.com"
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Documentation for method parse

        @param response of individual requests

        Using XPaths the needed information is extracted out of the
        response and added to the database.
        Xpath2 = gives the price-range for an individual phone
        Xpath3 = gives the price-range for an individual phone, if unable
                 to retrieve it from Xpath2
        Xpath4 = gives the number of online sellers for a particular phone
        Xpath5 = gives the price for a particular phone offered by online
                 sellers
        Xpath6 and Xpath7 = give the name of the online sellers for a
                 particular phone
        Xpath8 = gives the number of offline sellers for a particular phone
        Xpath9 = gives the price for a particular phone offered by offline
                 sellers
        Xpath10 = gives the name of the offline sellers for a particular
                 phone
        Removelist = tokens filtered out of the prices so that they become
                 integers, e.g. ',' or 'Rs.'
        chklist2 = contains what needs to be replaced, presently 'price'
        part = contains 'features'
        """
        # There are two different types of urls: one contains 'features'
        # and the other one contains 'price'.  Both have to be processed
        # differently.
        msg(response.url)
        site = unescape(response.url)
        last_slash = site.rfind("/")
        prev_slash = site.rfind("/", 0, last_slash - 1)
        catg = site[prev_slash + 1:last_slash]
        da = DataHelper()

        # e.g. NAAPTOL_CHKLIST2 = ['price'], stored as a list separated by ';'
        replaceable = _split_code_list(get_code_word("NAAPTOL_CHKLIST2"))
        # e.g. NAAPTOL_PART = "features"
        features_part = get_code_word("NAAPTOL_PART")
        # e.g. NAAPTOL_REMOVELIST = ["Rs.", ","], stored as a list separated by ';'
        remove_tokens = _split_code_list(get_code_word("NAAPTOL_REMOVELIST"))

        # Change 'price' to 'features' and queue that url, as both provide
        # the same data but in different formats.  Only the category path
        # segment is rewritten, so the same word occurring in the host or
        # in the product slug is left untouched.
        if catg in replaceable:
            site = site[:prev_slash + 1] + features_part + site[last_slash:]
            da.add_morenaaptolurl(site)

        # otherwise crawl the url containing 'features'
        if catg != features_part:
            return

        # retrieve the phone name from the url: drop the directory, the
        # 5-character extension and the leading numeric id before the '-'
        name = unescape(str(response.url))
        name = name[name.rfind("/") + 1:len(name) - 5]
        name = name[name.find("-") + 1:len(name)]

        hxs = HtmlXPathSelector(response)

        # low and high determine the range: "low" or "low to high"
        low, high = _price_bounds(hxs, remove_tokens)
        price_range = low
        if high != "":
            price_range = str(price_range) + " to "
            price_range = price_range + str(high)
        da.add_new_naaptolphone(name, str(price_range))

        online = _online_sellers(hxs, remove_tokens)
        # the id of the phone row stored above links the seller rows to it
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller, price in online:
            da.add_new_ntonlinesp(phone_id, seller, price)

        for seller, price in _local_sellers(hxs, remove_tokens):
            da.add_new_ntofflinesp(phone_id, seller, price)


SPIDER = naaptol_price()