@@ -16,26 +16,47 @@
 from scrapy.http.response import Response
 from datastore.DataAccessor import *
 from datastore.DataCodeAccessor import *
 
 from html2text import *
-from babel.messages.pofile import unescape
 import urllib
 
 class babuchak1(BaseSpider):
-
+    """
+    Documentation for class babuchak1
+    This spider collects the urls for the individual vendors
+    and stores them in the table datastore_datadefinition_babuchak_urls.
+    """
     def __init__(self):
+        """
+        Documentation for constructor
+        initialize_table is called to make all the tables known in
+        the scope of this class.
+        The start url also needs to be fed to the spider through start_urls.append.
+        domain_name is the name by which this spider is known outside,
+        so it is used as an argument when calling this spider.
+        """
         initialize_table()
         #BABUCHAK_DOMAINNAME = "babuchak"
         BABUCHAK_DOMAINNAME = get_code_word("BABUCHAK_DOMAINNAME")
         self.domain_name = BABUCHAK_DOMAINNAME
         #BABUCHAK_URL = "http://www.shopping.babuchak.com/visitourstores.php?view=productListPage&category=108"
         BABUCHAK_URL = get_code_word("BABUCHAK_URL")
         self.start_urls.append(BABUCHAK_URL)
 
 
     def start_requests(self):
+        """
+        Documentation for method start_requests
+        Sets various properties of the requests to be made,
+        such as the Referer header.
+        The supplier's entry also needs to be made in the table
+        datastore_datadefinition_suppliers.
+        @return a list of well-formed requests which the spider
+        will crawl, each yielding a response
+        """
+
         #adding entry for the supplier, i.e. its name and site
         #BABUCHAK_HOMEPAGE = "http://www.shopping.babuchak.com"
         BABUCHAK_HOMEPAGE = get_code_word("BABUCHAK_HOMEPAGE")
         da = DataHelper()
         da.add_supplier(self.domain_name, BABUCHAK_HOMEPAGE)
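Throughout the diff, hard-coded literals (domain name, start url, homepage, XPaths) are moved out of the code and fetched through get_code_word, with the old values kept as comments. A minimal sketch of that lookup pattern, assuming a plain key-to-value mapping; the real get_code_word comes from datastore.DataCodeAccessor and presumably reads the datastore, which this diff does not show:

# Hypothetical stand-in for datastore.DataCodeAccessor.get_code_word.
# The values are copied from the commented-out literals in the diff above.
CODE_WORDS = {
    "BABUCHAK_DOMAINNAME": "babuchak",
    "BABUCHAK_URL": "http://www.shopping.babuchak.com/visitourstores.php?view=productListPage&category=108",
    "BABUCHAK_HOMEPAGE": "http://www.shopping.babuchak.com",
}

def get_code_word(key):
    # Resolve a symbolic name to its configured value, so the spider
    # can be re-pointed (new host, new category id) without code changes.
    return CODE_WORDS[key]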
@@ -49,11 +70,20 @@
             request.headers.setdefault("Referer", BABUCHAK_REFERER)
             listreq.append(request)
         return listreq
 
     def parse(self, response):
-        #url1 needed to get complete urls
+        """
+        Documentation for method parse
+        @param response the response to an individual request
+        Using XPaths, the needed information is extracted from the response
+        and added to the database.
+        Xpath1 = gives us the section for each individual vendor
+        Xpath2 = gives us the number of pages for each individual vendor
+        Xpath3 = gives us the url for each individual vendor
+        Url1 = used to build the full url for each individual vendor
+        """
         da = DataHelper()
         #BABUCHAK_URL1 = "http://www.shopping.babuchak.com/visitourstores.php"
         BABUCHAK_URL1 = get_code_word("BABUCHAK_URL1")
         hxs = HtmlXPathSelector(response)
         #BABUCHAK_XPATH1 = '//td[@class="mod-category-header"]'
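The folded region between the hunks (old lines 42-48, new lines 63-69) hides how the request list and BABUCHAK_REFERER are set up before the two visible lines at the top of this hunk. A plausible sketch of that loop in pre-1.0 Scrapy style; only request.headers.setdefault and listreq.append are confirmed by the diff, the rest is assumption:

from scrapy.http import Request

def build_requests(start_urls, referer):
    # One Request per start url, each tagged with a Referer header.
    listreq = []
    for url in start_urls:
        request = Request(url)
        # setdefault leaves any Referer already present untouched
        request.headers.setdefault("Referer", referer)
        listreq.append(request)
    return listreq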
@@ -63,18 +93,15 @@
             #BABUCHAK_XPATH2 = './/text()'
             BABUCHAK_XPATH2 = get_code_word("BABUCHAK_XPATH2")
             #BABUCHAK_XPATH3 = './/a/@href'
             BABUCHAK_XPATH3 = get_code_word("BABUCHAK_XPATH3")
             no_pages = i.select(BABUCHAK_XPATH2)[2].extract()
-            #print i.select(BABUCHAK_XPATH2)[1].extract() + " "
             url = i.select(BABUCHAK_XPATH3)[0].extract()
             url = BABUCHAK_URL1 + url
             no_pages = urllib.unquote(no_pages)
             no_pages = no_pages.strip()
             no_pages = no_pages[1:len(no_pages)-1]
             no_pages = int(no_pages)
-            #print url + " "
-            #print no_pages
             da.add_babuchakurl(url, no_pages)
 
 SPIDER = babuchak1()
 
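The no_pages chain in parse turns the third text node of a vendor section into an integer page count. A worked sketch of that normalization (Python 2, matching the urllib.unquote call); the raw string " (12) " is an assumed example inferred from the strip-then-slice logic:

import urllib

raw = " (12) "                           # assumed shape of the extracted text node
no_pages = urllib.unquote(raw)           # undo any %xx escaping
no_pages = no_pages.strip()              # -> "(12)"
no_pages = no_pages[1:len(no_pages)-1]   # drop the parentheses -> "12"
no_pages = int(no_pages)                 # -> 12

The slice no_pages[1:len(no_pages)-1] is equivalent to the more idiomatic no_pages[1:-1], and int() will raise ValueError if the remainder is not purely numeric, so the parenthesized format is load-bearing.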