'''
Created on 07-Sep-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting
from SoupSelect import select

class HS18Scraper(BaseScraper):
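    '''Scraper for mobile phone listings and product pages on homeshop18.com.'''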
    def __init__(self):
        self.url = None
        self.id = None

    def setUrl(self, url):
        self.url = url

    def scrape(self):
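        '''Download the current listing page and parse it into self.soup.'''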
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def getPhones(self):
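        '''Extract a dict of name, price, source, stock flag and product URL
        for every product div on the parsed listing page.'''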
        product_prices = []

        for div in select(self.soup, "div.product_div"):  # was: self.soup.findAll('div', {'class': 'product_div'})
            try:
                anchor = div.find('p', {'class': 'product_title'})('a')[0]
                name = str(anchor['title'].strip())

                if name.endswith(' Mobile Phone'):
                    name = name.replace(' Mobile Phone', '')

                url = str(anchor['href'].strip())
                price = removePriceFormatting(str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip()))

                try:
                    product_prices.append({
                            'name': name,
                            'price': price,
                            'source': 'homeshop18',
                            'in_stock': 1,
                            'product_url': url
                        })

                except UnicodeEncodeError as e:
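                    # The name contains non-ASCII characters; replace them with
                    # spaces and store an ASCII-only record instead.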
                    print 'Unicode Error', e, name
                    name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
                    print name_ascii
                    product_prices.append({
                            'name': str(name_ascii),
                            'price': str(price),
                            'source': 'homeshop18',
                            'in_stock': 1,
                            'product_url': str(url)
                        })
            except Exception as e:
                print e
        return product_prices
    def getNextUrl(self):
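        '''Return the URL of the next listing page, or None on the last page.'''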
        pagination_links = self.soup.find('div', {'class': 'pagination'}).findAll('span')

        try:
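            # On the last page the final <span> carries the 'disabled_pagination' class.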
            if pagination_links[-1]['class'].strip() == 'disabled_pagination':
                return None
            else:
                return pagination_links[-1]('a')[0]['href'].strip()
        except KeyError:
            print pagination_links

    def getDataFromProductPage(self, url):
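        '''Scrape a single product page and return its data as a dict.'''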
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'id': 'productLayoutForm:pbiName'}).string.replace('Mobile Phone', '').strip()
        price = removePriceFormatting(soup.find('span', {'id': 'productLayoutForm:OurPrice'}).string)

        data = {
            'product_url': str(url),
            'source': 'homeshop18',
            'price': price,
            'in_stock': 1,
            'name': name
        }
        return data


if __name__ == '__main__':
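    # Ad-hoc smoke test: scrape the first mobiles listing page and print results.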
    scraper = HS18Scraper()
#    print scraper.getDataFromProductPage('http://www.homeshop18.com/samsung-galaxy-note-n7000-mobile-phone/mobiles-accessories/gsm-handsets/product:16601211/cid:3027/')
    scraper.setUrl('http://www.homeshop18.com/mobiles/category:14569/')
    scraper.scrape()
    products = scraper.getPhones()
    print products
    print scraper.getNextUrl()