Subversion Repositories SmartDukaan

Rev

Rev 14157 | Rev 15153 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 14157 Rev 14744
Line 1... Line -...
1
import urllib2
-
 
2
from BeautifulSoup import BeautifulSoup, NavigableString
1
from BeautifulSoup import BeautifulSoup, NavigableString
-
 
2
from dtr.utils.utils import fetchResponseUsingProxy
3
import re
3
import re
4
import sys
4
import sys
5
 
5
 
6
invalid_tags = ['b', 'i', 'u']
6
invalid_tags = ['b', 'i', 'u']
7
bestSellers = []
7
bestSellers = []
Line 25... Line 25...
25
class AmazonScraper:
25
class AmazonScraper:
26
    def __init__(self):
26
    def __init__(self):
27
        self.count_trials = 0
27
        self.count_trials = 0
28
    
28
    
29
    def read(self, url):
29
    def read(self, url):
30
        request = urllib2.Request(url)
-
 
31
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
-
 
32
        response_data = ""
30
        response_data = ""
33
        try:
31
        try:
34
            response = urllib2.urlopen(request)
-
 
35
            response_data = response.read()
32
            response_data = fetchResponseUsingProxy(url)
36
            response.close()
-
 
37
            
-
 
38
        except urllib2.HTTPError as e:
33
        except Exception as e:
39
            print 'ERROR: ', e
34
            print 'ERROR: ', e
40
            print 'Retrying'
35
            print 'Retrying'
41
            self.count_trials += 1
36
            self.count_trials += 1
42
            
37
            
43
            if self.count_trials < 3:
38
            if self.count_trials < 3:
Line 75... Line 70...
75
                return  unitCost
70
                return  unitCost
76
            
71
            
77
 
72
 
78
if __name__ == '__main__':
73
if __name__ == '__main__':
79
    scraper = AmazonScraper()
74
    scraper = AmazonScraper()
80
    print scraper.read('http://www.amazon.in/gp/offer-listing/B00J60MFTO/ref=olp_sort_ps')
75
    print scraper.read('http://www.amazon.in/gp/offer-listing/B00LSPHFCC/ref=olp_sort_ps')
81
    
76
    
82
77