Subversion Repositories SmartDukaan

Diff of Rev 236 against Rev 262 (naaptol_price spider)
Line 19 (Rev 236)... Line 19 (Rev 262)...

import urllib
from html2text.unescaping import *

class naaptol_price(BaseSpider):
-
+    """
+    Documentation for class naaptol_price.
+    The urls collected by the previous spider for naaptol.com are redirected
+    to get the data for individual phones.
+    Some are of the form "http://www.naaptol.com/features/10417-Fly-E300.html"
+    while others are of the form "http://www.naaptol.com/price/10417-Fly-E300.html".
+    So, to make data extraction symmetric, this spider accomplishes two tasks.
+    First, for the urls containing 'features' it collects the information for the
+    individual phones and stores it in the table datastore_datadefinition_naaptol_phones.
+    Second, for the urls containing 'price', a new url with 'price' replaced by
+    'features' is framed and stored in the table datastore_datadefinition_morenaaptol_urls.
+    """
    def __init__(self):
+       """
+        Documentation for the constructor.
+        initialize_table is called to make all the tables known in
+        the scope of this class.
+        The start urls also need to be fed to the spider through start_urls.append.
+        NAAPTOL_DOMAINNAME1 is the name by which this spider is known outside,
+        so it is used as the argument when this spider is invoked.
+       """
       initialize_table()
       #NAAPTOL_DOMAINNAME1 = "naaptol1"
       NAAPTOL_DOMAINNAME1 = get_code_word("NAAPTOL_DOMAINNAME1")
       self.domain_name = NAAPTOL_DOMAINNAME1
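The class docstring above ends with the 'price' to 'features' rewrite that feeds datastore_datadefinition_morenaaptol_urls. As a rough sketch of that framing step, assuming the two url forms quoted in the docstring (the helper name rewrite_price_url is illustrative and not part of this revision):

def rewrite_price_url(url):
    """Frame the 'features' url that corresponds to a 'price' url (illustrative helper)."""
    # e.g. "http://www.naaptol.com/price/10417-Fly-E300.html"
    #   -> "http://www.naaptol.com/features/10417-Fly-E300.html"
    if "/price/" in url:
        return url.replace("/price/", "/features/", 1)
    return url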
       
Line 34 (Rev 236)... Line 53 (Rev 262)...

       #self.start_urls.append(url)
       for pitem in da.get_allnaaptolurls():
            self.start_urls.append(pitem.url.strip())

    def start_requests(self):
+        """
+        Documentation for method start_requests.
+        Sets various properties of the requests to be made,
+        such as the referer and other headers.
+        @return a list of well formed requests which the spider will crawl,
+        returning a response for each.
+        """
        #for each request a referer has to be set
        listreq = []
        #NAAPTOL_REFERER = "http://www.google.com"
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
        for url1 in self.start_urls:
Line 45 (Rev 236)... Line 71 (Rev 262)...

            request.headers.setdefault("Referer", NAAPTOL_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
+        """
+        Documentation for method parse.
+        @param response the response to an individual request
+        The needed information is extracted out of the response using XPaths
+        and added to the database.
+        Xpath2 = gives the price-range for an individual phone
+        Xpath3 = gives the price-range for an individual phone, if it could not be retrieved with Xpath2
+        Xpath4 = gives the number of online sellers for a particular phone
+        Xpath5 = gives the price offered by the online sellers for a particular phone
+        Xpath6 and Xpath7 = give the names of the online sellers for a particular phone
+        Xpath8 = gives the number of offline sellers for a particular phone
+        Xpath9 = gives the price offered by the offline sellers for a particular phone
+        Xpath10 = gives the names of the offline sellers for a particular phone
+        Removelist = used to filter the prices so they can be treated as integers, e.g. by removing ',' or 'Rs'
+        chklist2 = contains what needs to be replaced; presently it contains 'price'
+        part = contains 'features'
+        """
        # there are two different types of urls: one contains 'features' and the other contains 'price';
        # both have to be processed differently
        msg(response.url)
        site = response.url
        site = unescape(site)
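The Removelist mentioned in the parse docstring is meant to strip markers such as ',' and 'Rs' so a scraped price string can be treated as an integer. A small sketch of that cleanup under those assumptions (the helper name and the exact removelist contents are illustrative):

def clean_price(text, removelist=(",", "Rs")):
    """Strip the removelist tokens and convert the remaining digits to an int (illustrative)."""
    for token in removelist:
        text = text.replace(token, "")
    text = text.strip()
    return int(text) if text.isdigit() else None

# e.g. clean_price("Rs 4,299") -> 4299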
Line 87 (Rev 236)... Line 130 (Rev 262)...

            #NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
            NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
            prices = hxs.select(NAAPTOL_XPATH2)
            try:
                price1 = prices.extract()[0]
-                #price1 = price1.decode("utf-8")
                price1 = price1.strip()
            except:
                price1 = ""

            try:
                price2 = prices.extract()[1]
-                #price2 = price2.decode("utf-8")
                price2 = price2.strip()
            except:
                price2 = ""
            try:
                if price1 == "" and price2 == "":