Subversion Repositories SmartDukaan

Rev

Rev 14307 | Rev 15154 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 14307 Rev 14759
Line 1... Line 1...
1
import urllib2
1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
2
from BeautifulSoup import BeautifulSoup, NavigableString
3
import re
3
import re
4
import sys
4
from dtr.utils.utils import fetchResponseUsingProxy
5
 
5
 
6
invalid_tags = ['b', 'i', 'u']
6
invalid_tags = ['b', 'i', 'u']
7
bestSellers = []
7
bestSellers = []
8
 
8
 
9
def strip_tags(html, invalid_tags):
9
def strip_tags(html, invalid_tags):
Line 25... Line 25...
25
class AmazonScraper:
25
class AmazonScraper:
26
    def __init__(self):
26
    def __init__(self):
27
        self.count_trials = 0
27
        self.count_trials = 0
28
    
28
    
29
    def read(self, url):
29
    def read(self, url):
30
        request = urllib2.Request(url)
-
 
31
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
-
 
32
        response_data = ""
30
        response_data = ""
33
        try:
31
        try:
34
            response = urllib2.urlopen(request)
-
 
35
            response_data = response.read()
32
            response_data = fetchResponseUsingProxy(url)
36
            response.close()
-
 
37
            
-
 
38
        except urllib2.HTTPError as e:
33
        except Exception as e:
39
            print 'ERROR: ', e
34
            print 'ERROR: ', e
40
            print 'Retrying'
35
            print 'Retrying'
41
            self.count_trials += 1
36
            self.count_trials += 1
42
            
37
            
43
            if self.count_trials < 3:
38
            if self.count_trials < 3: