WebSVN – SmartDukaan – Diff – /prototype/naaptolpass1/src/demo/spiders/spider1.py

 from scrapy.contrib.spidermiddleware import referer
 from scrapy.http.headers import Headers
 from scrapy.http.request.form import FormRequest
 from scrapy.log import msg
 from scrapy.http.response import Response
+from datastore.DataCodeAccessor import *
+from datastore.DataAccessor import *
-from datastore import DataAccessor
-from datastore.DataAccessor import DataHelper
+from html2text.unescaping import *
 class naaptol_spider(BaseSpider):
     def __init__(self):
+       initialize_table()
+       #NAAPTOL_DOMAINNAME = "naaptol"
+       NAAPTOL_DOMAINNAME = get_code_word("NAAPTOL_DOMAINNAME")
-       self.domain_name = "naaptolsites"
+       self.domain_name = NAAPTOL_DOMAINNAME
-       str1 = "http://www.naaptol.com/sitemap.xml"
+       #NAAPTOL_URL = "http://www.naaptol.com/sitemap.xml"
+       NAAPTOL_URL = get_code_word("NAAPTOL_URL")
-       self.start_urls.append(str1)
+       self.start_urls.append(NAAPTOL_URL)
     def start_requests(self):
+        #add an entry for the supplier, i.e. its name and home page
+        #NAAPTOL_HOMEPAGE = "http://www.naaptol.com"
+        NAAPTOL_HOMEPAGE = get_code_word("NAAPTOL_HOMEPAGE")
         da = DataHelper()
-        da.add_supplier(self.domain_name, "www.naaptol.com")
+        da.add_supplier(self.domain_name, NAAPTOL_HOMEPAGE)
         listreq = []
+        #for each request a referer has to be set
+        #NAAPTOL_REFERER = "http://www.google.com"
+        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
         for url1 in self.start_urls:
             request = Request(url = str(url1), callback=self.parse)
-            request.headers.setdefault("Referer", "http://www.naaptol.com/sitemap.xml")
+            request.headers.setdefault("Referer", NAAPTOL_REFERER)
             listreq.append(request)
         return listreq
     def parse(self, response):
         da = DataHelper()
         hxs = HtmlXPathSelector(response)
+        #NAAPTOL_XPATH1 = '//url/loc/text()'
+        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
-        phone_urls = hxs.select('//url/loc/text()')
+        phone_urls = hxs.select(NAAPTOL_XPATH1)
+        #elements of NAAPTOL_CHKLIST1 are specific to this site; a URL is stored only if its category path matches one of them
+        #NAAPTOL_CHKLIST1 = ["mobile_phones/pdas_and_smartphones" ,"mobile_phones/gsm_handsets" ,"mobile_phones/cdma_handsets"]
+        #the list is stored as a single string separated by ';'
+        NAAPTOL_CHKLIST1 = get_code_word("NAAPTOL_CHKLIST1")
+        NAAPTOL_CHKLIST1 = NAAPTOL_CHKLIST1.split(';')
         for i in phone_urls:
             site = i.extract()
+            site = unescape(site)
             pos1 = pos2 = pos3 = 0
             temp =""
+            # temp contains the string between the 3rd-last and the last slash (/), e.g. "mobile_phones/gsm_handsets"
             pos1 = site.rfind('/')
             if pos1 != -1:
                 pos2 = site.rfind('/',0,pos1-1)
             if pos2 != -1:
                 pos3 = site.rfind('/',0,pos2-1)
             if pos3 > 0:
                 temp = site[pos3+1:pos1]
-            if temp == "mobile_phones/pdas_and_smartphones" or temp == "mobile_phones/gsm_handsets" or temp == "mobile_phones/cdma_handsets":
-                da.add_naaptolurl(site)
-                #print str(ct) + " " + site
+            for c in NAAPTOL_CHKLIST1:
-                #print "\n"
-                #ct = ct +1
+                if temp == c:
+                    da.add_naaptolurl(site)
 SPIDER = naaptol_spider()

Subversion Repositories SmartDukaan

(root)/prototype/naaptolpass1/src/demo/spiders/spider1.py – Rev 187 → 235