Subversion Repositories SmartDukaan

Diff between Rev 4039 and Rev 4198
--- Rev 4039
+++ Rev 4198
@@ -3,10 +3,11 @@
 
 @author: Varun Gupta
 '''
 from BeautifulSoup import BeautifulSoup
 from BaseScraper import BaseScraper
+from Utils import removePriceFormatting
 
 class HS18Scraper(BaseScraper):
 
     def __init__(self):
         self.url = None
@@ -27,33 +28,48 @@
             name = str(anchor['title'].strip())
 
             if name.endswith(' Mobile Phone'):  name = name.replace(' Mobile Phone', '')
 
             url = str(anchor['href'].strip())
-            price = str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip())
+            price = removePriceFormatting(str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip()))
 
             try:
-                product_prices.append({'name': name, 'price': price, 'in_stock': 1, 'product_url': url})
+                product_prices.append({
+                        'name': name,
+                        'price': price,
+                        'source': 'homeshop18',
+                        'in_stock': 1,
+                        'product_url': url
+                    })
 
             except UnicodeEncodeError as e:
                 print 'Unicode Error', e, name
                 name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
                 print name_ascii
-                product_prices.append({"name": str(name_ascii), "price": str(price), "in_stock": 1, "product_url": str(url)})
+                product_prices.append({
+                        "name": str(name_ascii),
+                        "price": str(price),
+                        'source': 'homeshop18',
+                        "in_stock": 1,
+                        "product_url": str(url)
+                    })
 
         return product_prices
 
     def getNextUrl(self):
-        pagination_links = self.soup.findAll('div', {'class': 'pagination'})[0]('span')
+        pagination_links = self.soup.find('div', {'class': 'pagination'}).findAll('span')
 
-        if pagination_links[-1]['class'].strip() == 'disabled_pagination':
-            return None
-        else:
-            return pagination_links[-1]('a')[0]['href'].strip()
+        try:
+            if pagination_links[-1]['class'].strip() == 'disabled_pagination':
+                return None
+            else:
+                return pagination_links[-1]('a')[0]['href'].strip()
+        except KeyError:
+            print pagination_links
 
 if __name__ == '__main__':
     scraper = HS18Scraper()
-    scraper.setUrl('http://www.homeshop18.com/ipads-2f-tablets/category:8937/')
+    scraper.setUrl('http://www.homeshop18.com//ipads-2f-tablets/categoryid:8937/search:*/start:32/')
     scraper.scrape()
     products = scraper.getPhones()
     print scraper.getNextUrl()
     print products
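
Rev 4198 routes the scraped price through removePriceFormatting from Utils, whose implementation is outside this diff. A minimal sketch of what such a helper might do, assuming HomeShop18 renders prices like 'Rs. 12,499' (the currency handling and regex below are assumptions, not the repository's actual code):

import re

def removePriceFormatting(price):
    # Hypothetical sketch: the real Utils.removePriceFormatting is not shown
    # in this diff. Strip the currency marker first so its dot is not mistaken
    # for a decimal point, then keep only digits and '.'.
    price = price.replace('Rs.', '').replace('Rs', '')
    return re.sub(r'[^0-9.]', '', price)

print removePriceFormatting('Rs. 12,499')  # prints: 12499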
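
The try/except added to getNextUrl presumably guards against a pagination span that carries no class attribute: in BeautifulSoup 3 (the pre-bs4 module imported here), indexing a tag for a missing attribute raises KeyError. A small standalone demonstration of that behavior:

from BeautifulSoup import BeautifulSoup

# A trailing <span> without class= would have crashed the Rev 4039 version,
# which indexed pagination_links[-1]['class'] unconditionally.
soup = BeautifulSoup('<div class="pagination"><span>1</span></div>')
last_span = soup.find('div', {'class': 'pagination'}).findAll('span')[-1]
try:
    print last_span['class']
except KeyError:
    print 'pagination span has no class attribute'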