Subversion Repositories SmartDukaan

Diff of Rev 236 against Rev 262 (naaptol_price spider)
Line 19 (Rev 236)... Line 19 (Rev 262)...

import urllib
from html2text.unescaping import *

class naaptol_price(BaseSpider):
-
+    """
+    Documentation for class naaptol_price.
+    The urls collected by the previous spider for naaptol.com are redirected
+    to get the data for individual phones.
+    Some are of the form "http://www.naaptol.com/features/10417-Fly-E300.html"
+    while others are of the form "http://www.naaptol.com/price/10417-Fly-E300.html".
+    So, to make data extraction symmetric, this spider accomplishes two tasks.
+    First, for the urls containing 'features' it collects the information for the
+    individual phones and stores it in the table datastore_datadefinition_naaptol_phones.
+    Second, for the urls containing 'price', a new url with 'price' replaced by
+    'features' is framed and stored in the table datastore_datadefinition_morenaaptol_urls.
+    """
    def __init__(self):
+       """
+        Documentation for the constructor.
+        initialize_table is called to make all the tables known in
+        the scope of this class.
+        The start urls also need to be fed to the spider through start_urls.append.
+        NAAPTOL_DOMAINNAME1 is the name by which this spider is known outside,
+        so it is used as the argument when this spider is invoked.
+       """
       initialize_table()
       #NAAPTOL_DOMAINNAME1 = "naaptol1"
       NAAPTOL_DOMAINNAME1 = get_code_word("NAAPTOL_DOMAINNAME1")
       self.domain_name = NAAPTOL_DOMAINNAME1
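The class docstring above ends with the 'price' to 'features' rewrite that feeds datastore_datadefinition_morenaaptol_urls. As a rough sketch of that framing step, assuming the two url forms quoted in the docstring (the helper name rewrite_price_url is illustrative and not part of this revision):

def rewrite_price_url(url):
    """Frame the 'features' url that corresponds to a 'price' url (illustrative helper)."""
    # e.g. "http://www.naaptol.com/price/10417-Fly-E300.html"
    #   -> "http://www.naaptol.com/features/10417-Fly-E300.html"
    if "/price/" in url:
        return url.replace("/price/", "/features/", 1)
    return url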
       
Line 34 (Rev 236)... Line 53 (Rev 262)...

       #self.start_urls.append(url)
       for pitem in da.get_allnaaptolurls():
            self.start_urls.append(pitem.url.strip())

    def start_requests(self):
+        """
+        Documentation for method start_requests.
+        Sets various properties of the requests to be made,
+        such as the referer and other headers.
+        @return a list of well formed requests which the spider will crawl,
+        returning a response for each.
+        """
        #for each request a referer has to be set
        listreq = []
        #NAAPTOL_REFERER = "http://www.google.com"
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
        for url1 in self.start_urls:
Line 45 (Rev 236)... Line 71 (Rev 262)...

            request.headers.setdefault("Referer", NAAPTOL_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
+        """
+        Documentation for method parse.
+        @param response the response to an individual request
+        The needed information is extracted out of the response using XPaths
+        and added to the database.
+        Xpath2 = gives the price-range for an individual phone
+        Xpath3 = gives the price-range for an individual phone, if it could not be retrieved with Xpath2
+        Xpath4 = gives the number of online sellers for a particular phone
+        Xpath5 = gives the price offered by the online sellers for a particular phone
+        Xpath6 and Xpath7 = give the names of the online sellers for a particular phone
+        Xpath8 = gives the number of offline sellers for a particular phone
+        Xpath9 = gives the price offered by the offline sellers for a particular phone
+        Xpath10 = gives the names of the offline sellers for a particular phone
+        Removelist = used to filter the prices so they can be treated as integers, e.g. by removing ',' or 'Rs'
+        chklist2 = contains what needs to be replaced; presently it contains 'price'
+        part = contains 'features'
+        """
        # there are two different types of urls: one contains 'features' and the other contains 'price';
        # both have to be processed differently
        msg(response.url)
        site = response.url
        site = unescape(site)
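The Removelist mentioned in the parse docstring is meant to strip markers such as ',' and 'Rs' so a scraped price string can be treated as an integer. A small sketch of that cleanup under those assumptions (the helper name and the exact removelist contents are illustrative):

def clean_price(text, removelist=(",", "Rs")):
    """Strip the removelist tokens and convert the remaining digits to an int (illustrative)."""
    for token in removelist:
        text = text.replace(token, "")
    text = text.strip()
    return int(text) if text.isdigit() else None

# e.g. clean_price("Rs 4,299") -> 4299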
Line 87 (Rev 236)... Line 130 (Rev 262)...

            #NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
            NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
            prices = hxs.select(NAAPTOL_XPATH2)
            try:
                price1 = prices.extract()[0]
-                #price1 = price1.decode("utf-8")
                price1 = price1.strip()
            except:
                price1 = ""

            try:
                price2 = prices.extract()[1]
-                #price2 = price2.decode("utf-8")
                price2 = price2.strip()
            except:
                price2 = ""
            try:
                if price1 == "" and price2 == "":