'''
Created on 07-Sep-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting
from SoupSelect import select
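
# removePriceFormatting lives in the project's Utils module, outside this
# file. It is assumed to strip currency markup from a scraped price string
# (e.g. 'Rs. 12,999' -> '12999'); a minimal sketch under that assumption,
# not the actual implementation:
#
#   def removePriceFormatting(price):
#       return ''.join(char for char in price if char.isdigit())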

class HS18Scraper(BaseScraper):

    def __init__(self):
        self.url = None
        self.id = None

    def setUrl(self, url):
        self.url = url

    def scrape(self):
        # Fetch the listing page and keep the parsed tree around for
        # getPhones() and getNextUrl().
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
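
    # BaseScraper.read(), used by scrape() above and by
    # getDataFromProductPage() below, is inherited from shared scraper
    # infrastructure outside this file. It is assumed to return the raw
    # page body fetched over HTTP, roughly along the lines of:
    #
    #   def read(self, url):
    #       return urllib2.urlopen(url).read()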

    def getPhones(self):
        # Pull (name, price, url) for every product tile on the current
        # listing page.
        product_prices = []

        # previously: self.soup.findAll('div', {'class': 'product_div'})
        for div in select(self.soup, "div.product_div"):
            anchor = div.find('p', {'class': 'product_title'})('a')[0]
            name = anchor['title'].strip()

            if name.endswith(' Mobile Phone'):
                name = name.replace(' Mobile Phone', '')

            url = str(anchor['href'].strip())
            price = removePriceFormatting(str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip()))

            try:
                # str() raises UnicodeEncodeError when the product name
                # contains non-ASCII characters, so the conversion has to
                # happen inside this block for the handler to fire.
                product_prices.append({
                        'name': str(name),
                        'price': price,
                        'source': 'homeshop18',
                        'in_stock': 1,
                        'product_url': url
                    })

            except UnicodeEncodeError as e:
                # Fall back to an ASCII-only name with the offending
                # characters blanked out.
                print 'Unicode Error', e, repr(name)
                name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
                print name_ascii
                product_prices.append({
                        'name': str(name_ascii),
                        'price': str(price),
                        'source': 'homeshop18',
                        'in_stock': 1,
                        'product_url': str(url)
                    })

        return product_prices

    def getNextUrl(self):
        # The last <span> in the pagination block is the 'next' control;
        # on the final page it carries the class 'disabled_pagination'
        # instead of a link.
        pagination_links = self.soup.find('div', {'class': 'pagination'}).findAll('span')

        try:
            if pagination_links[-1]['class'].strip() == 'disabled_pagination':
                return None
            else:
                return pagination_links[-1]('a')[0]['href'].strip()
        except KeyError:
            # The span carried no class attribute; dump the links for debugging.
            print pagination_links

    def getDataFromProductPage(self, url):
        # Scrape name and price off a single product page rather than a
        # category listing.
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'id': 'productLayoutForm:pbiName'}).string.replace('Mobile Phone', '').strip()
        price = removePriceFormatting(soup.find('span', {'id': 'productLayoutForm:OurPrice'}).string)

        data = {
            'product_url': str(url),
            'source': 'homeshop18',
            'price': price,
            'in_stock': 1,
            'name': name
        }
        return data


if __name__ == '__main__':
    scraper = HS18Scraper()
#    print scraper.getDataFromProductPage('http://www.homeshop18.com/samsung-galaxy-note-n7000-mobile-phone/mobiles-accessories/gsm-handsets/product:16601211/cid:3027/')
    scraper.setUrl('http://www.homeshop18.com/mobiles/category:14569/')
    scraper.scrape()
    products = scraper.getPhones()
    print products
    print scraper.getNextUrl()
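
# Illustrative shape of one record returned by getPhones(), assuming the
# listing markup parses as expected. The values below are made-up examples
# (the name is taken from the commented-out product URL above), not real
# scraped data:
#
#   {'name': 'Samsung Galaxy Note N7000', 'price': '12999',
#    'source': 'homeshop18', 'in_stock': 1,
#    'product_url': 'http://www.homeshop18.com/samsung-galaxy-note-n7000-mobile-phone/...'}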