WebSVN – SmartDukaan – Diff – //prototype/indiaplazascapypass2/src/demo/spiders/indiaplazaspider2.py

 from scrapy.http.headers import Headers
 from scrapy.http.request.form import FormRequest
 from scrapy.log import msg
 from scrapy.http.response import Response
-from datastore import DataAccessor
+from datastore.DataAccessor import *
-from datastore.DataAccessor import DataHelper
+from datastore.DataCodeAccessor import *
 import urllib
 from xml.dom import INDEX_SIZE_ERR
+from html2text.unescaping import *
 class indiaplaza_extra(BaseSpider):
     def __init__(self):
+       initialize_table()
+       #INDIAPLAZA_DOMAINNAME1 = "indiaplaza1"
+       INDIAPLAZA_DOMAINNAME1 = get_code_word("INDIAPLAZA_DOMAINNAME1")
-       self.domain_name = "indiaplazaextrainfo"
+       self.domain_name = INDIAPLAZA_DOMAINNAME1
+        # get URLs from the database and append them to the list for crawling
        da = DataHelper()
        for pitem in da.get_all_ipbasic():
             self.start_urls.append(pitem.v_site.strip())
     def start_requests(self):
         listreq = []
+        # a Referer header has to be set on each request
+        #INDIAPLAZA_REFERER = "www.google.com/search"
+        INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
         for url1 in self.start_urls:
-            request = Request(url = url1, callback=self.parse)
+            request = Request(url = str(url1), callback=self.parse)
-            request.headers.setdefault("Referer", "www.google.com/search")
+            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
             listreq.append(request)
         return listreq
     def parse(self, response):
         hxs = HtmlXPathSelector(response)
-        #sites = hxs.select('//td[@class="gray-border"]')
+        #INDIAPLAZA_REMOVELIST = ["Rs.","Rs",",","-","/"]
-        #msg(response.url)
+        # INDIAPLAZA_REMOVELIST is fetched as a single string with entries separated by ';'
+        INDIAPLAZA_REMOVELIST = get_code_word("INDIAPLAZA_REMOVELIST")
-        #print(len(sites))
+        INDIAPLAZA_REMOVELIST = INDIAPLAZA_REMOVELIST.split(';')
-        name = hxs.select('.//div[@class="finDetHdr"]/h1/text()')[0].extract()
+        #INDIAPLAZA_XPATH4 = './/div[@class="finDetHdr"]/h1/text()'
+        INDIAPLAZA_XPATH4 = get_code_word("INDIAPLAZA_XPATH4")
-        price = hxs.select('.//div[@class="priceArea"]/span[1]/text()')[0].extract()
+        #INDIAPLAZA_XPATH5 = './/div[@class="priceArea"]/span[1]/text()'
+        INDIAPLAZA_XPATH5 = get_code_word("INDIAPLAZA_XPATH5")
+        #INDIAPLAZA_XPATH6 = './/div[@class="priceArea"]/div[@class="row"][2]/text()'
+        INDIAPLAZA_XPATH6 = get_code_word("INDIAPLAZA_XPATH6")
+        #INDIAPLAZA_XPATH7 = './/div[@class="priceArea"]/div[@class="row"][2]/span/text()'
+        INDIAPLAZA_XPATH7 = get_code_word("INDIAPLAZA_XPATH7")
+        #INDIAPLAZA_XPATH8 = './/div[@class="priceArea"]/div[@class="row"][3]/text()'
+        INDIAPLAZA_XPATH8 = get_code_word("INDIAPLAZA_XPATH8")
+        #INDIAPLAZA_XPATH9 = './/div[@class="priceArea"]/div[@class="row"][4]/text()'
+        INDIAPLAZA_XPATH9 = get_code_word("INDIAPLAZA_XPATH9")
+        #INDIAPLAZA_XPATH10 = './/div[@class="priceArea"]/div[@class="row"][1]/text()'
+        INDIAPLAZA_XPATH10 = get_code_word("INDIAPLAZA_XPATH10")
+        name = hxs.select(INDIAPLAZA_XPATH4)[0].extract()
+        name = unescape(name)
+        price = hxs.select(INDIAPLAZA_XPATH5)[0].extract()
         try:
-            ship_price = hxs.select('.//div[@class="priceArea"]/div[@class="row"][2]/text()')[0].extract()
+            ship_price = hxs.select(INDIAPLAZA_XPATH6)[0].extract()
         except IndexError:
-            ship_price = hxs.select('.//div[@class="priceArea"]/div[@class="row"][2]/span/text()')[0].extract()
+            ship_price = hxs.select(INDIAPLAZA_XPATH7)[0].extract()
         try:
-            guarantee_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][3]/text()')[0].extract()
+            guarantee_info = hxs.select(INDIAPLAZA_XPATH8)[0].extract()
         except IndexError:
-            guarantee_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][4]/text()')[0].extract()
+            guarantee_info = hxs.select(INDIAPLAZA_XPATH9)[0].extract()
-        ship_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][1]/text()')[0].extract()
+        ship_info = hxs.select(INDIAPLAZA_XPATH10)[0].extract()
         urllib.unquote(name)
         urllib.unquote(price)
         urllib.unquote(ship_price)
         urllib.unquote(guarantee_info)
         urllib.unquote(ship_info)
+        #INDIAPLAZA_VAR1 = "Free shipping"
+        INDIAPLAZA_VAR1 = get_code_word("INDIAPLAZA_VAR1")
-        if ship_price == "Free shipping" :
+        if ship_price == INDIAPLAZA_VAR1:
             ship_price = "0"
         else :
+            if ship_price != '':
+                for r in INDIAPLAZA_REMOVELIST:
+                    while ship_price.find(r) != -1:
-            ship_price = ship_price.replace("Rs.","")
+                        ship_price = ship_price.replace(r, "")
+        if price != '':
+                for r in INDIAPLAZA_REMOVELIST:
+                    while price.find(r) != -1:
+                        price = price.replace(r, "")
-        price = price.replace("Rs.","")
         name = name.strip()
         price = price.strip()
         ship_price = ship_price.strip()
         guarantee_info = guarantee_info.strip()
         ship_info = ship_info.strip()
         shown_pr = int(price)
         final_pr = shown_pr + int(ship_price)
-        print name
-        print shown_pr
-        print final_pr
-        print guarantee_info
-        print ship_info
         da = DataHelper()
         da.add_ipextra(name,shown_pr,final_pr,guarantee_info,ship_info)
-    '''
-        for site in sites:
-            item = {}
-            #tmp = site.select('.//tr[2]/td/a/text()')
-            item['name'] = response.select('.//div[@class="finDetHdr"]/h1/text()')[0].extract()
-            #psite = site.select(".//a[3][@href]/@href")[0].extract()
-            item['price'] =site.select('.//tr[3]/th/label/text()')[0].extract()
-            items.append(item)
-        for i in items:
-            str1 = str(i['title']).strip()
-            print str1
-            amnt = i['price'].replace(",","")
-            amnt = amnt.replace("Rs", "")
-            amnt = amnt.replace("/", "")
-            amnt = amnt.replace("-", "")
-            amnt = amnt.strip()
-            pr = int(amnt) + vatplustax
-            #print pr
-            da.add_new_univerphone(str1,amnt,pr)
-        '''
-        #lt = len(da.get_all_phones())
-        #print "length" + str(lt)
-        #for ph in da.get_all_phones():
-         #   print ph
-        #f = open('/home/gaurav/twopassscrapy/pricelinks.txt', 'w')
-        #for i in items:
-            #f.write(i['title'])
-            #f.write("\n")
-            #f.write(i['link'])
-            #f.write("\n")
-        #f.close()
 SPIDER = indiaplaza_extra()

Subversion Repositories SmartDukaan

(root)//prototype/indiaplazascapypass2/src/demo/spiders/indiaplazaspider2.py – Rev 180 → 227