Subversion Repositories SmartDukaan

Rev

Rev 4198 | Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4039 varun.gupt 1
'''
2
Created on 07-Sep-2011
3
 
4
@author: Varun Gupta
5
'''
6
from BeautifulSoup import BeautifulSoup
7
from BaseScraper import BaseScraper
8
 
9
class HS18Scraper(BaseScraper):
    '''
    Scraper for product listing pages (the markup it expects is that of
    homeshop18.com category pages -- presumably what "HS18" stands for).

    Typical use: setUrl(url), then scrape(), then getPhones() and
    getNextUrl() to walk through the paginated listing.
    '''

    def __init__(self):
        self.url = None     # Listing page to fetch; set through setUrl()
        self.id = None      # Not used anywhere inside this class
        self.soup = None    # Parsed page; populated by scrape()

    def setUrl(self, url):
        '''Set the URL of the listing page that scrape() will fetch.'''
        self.url = url

    def scrape(self):
        '''
        Fetch self.url through BaseScraper.read() and parse the result
        with BeautifulSoup. Must be called before getPhones()/getNextUrl().
        '''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    @staticmethod
    def _toAsciiStr(text):
        '''
        Return text as a plain str.

        If the conversion raises UnicodeEncodeError (non-ASCII characters
        under Python 2), every character with a code point >= 128 is
        replaced by a space, the problem is reported on stdout and the
        sanitised string is returned instead.
        '''
        try:
            return str(text)
        except UnicodeEncodeError as e:
            ascii_text = "".join([char if ord(char) < 128 else " " for char in text])
            # Report the sanitised text only: printing the raw value could
            # itself fail on an ASCII-only stdout.
            print('Unicode Error %s %s' % (e, ascii_text))
            return str(ascii_text)

    def getPhones(self):
        '''
        Collect the products listed on the page parsed by scrape().

        Returns a list of dicts with the keys 'name', 'price', 'in_stock'
        (always 1) and 'product_url'. All text values are plain str; any
        non-ASCII characters are replaced by spaces (see _toAsciiStr).

        Raises IndexError if a product box lacks the expected title
        anchor or price span.
        '''
        suffix = ' Mobile Phone'
        product_prices = []

        for div in self.soup.findAll('div', {'class': 'product_div book_info_box'}):
            anchor = div.findAll('p', {'class': 'product_title'})[0]('a')[0]
            price_span = div.findAll('span', {'class': 'product_new_price'})[0]

            # The str() conversion is what can raise UnicodeEncodeError, so
            # it has to happen inside the guarded helper, not outside it.
            name = self._toAsciiStr(anchor['title'].strip())

            # Drop only the trailing suffix, never occurrences in the middle.
            if name.endswith(suffix):
                name = name[:-len(suffix)]

            url = self._toAsciiStr(anchor['href'].strip())
            price = self._toAsciiStr(price_span.string.strip())

            product_prices.append({'name': name, 'price': price, 'in_stock': 1, 'product_url': url})

        return product_prices

    def getNextUrl(self):
        '''
        Return the href of the last pagination link, or None when there
        is no further page: the last pagination entry is marked
        'disabled_pagination', or the page has no usable pagination
        markup at all.
        '''
        pagination_blocks = self.soup.findAll('div', {'class': 'pagination'})
        if not pagination_blocks:
            return None

        pagination_links = pagination_blocks[0]('span')
        if not pagination_links:
            return None

        last_link = pagination_links[-1]

        # A span without a class attribute is treated as "not disabled".
        if (last_link.get('class') or '').strip() == 'disabled_pagination':
            return None

        anchors = last_link('a')
        if not anchors:
            return None

        return anchors[0]['href'].strip()
52
 
53
if __name__ == '__main__':
    # Manual smoke test: scrape one category page, then show the link to
    # the following page and the products that were extracted.
    listing_url = 'http://www.homeshop18.com/ipads-2f-tablets/category:8937/'

    hs18 = HS18Scraper()
    hs18.setUrl(listing_url)
    hs18.scrape()

    found_products = hs18.getPhones()
    next_page_url = hs18.getNextUrl()

    print(next_page_url)
    print(found_products)