Subversion Repositories SmartDukaan

Rev

Rev 4203 | Rev 5401 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4039 varun.gupt 1
'''
2
Created on 07-Sep-2011
3
 
4
@author: Varun Gupta
5
'''
6
from BeautifulSoup import BeautifulSoup
7
from BaseScraper import BaseScraper
4198 varun.gupt 8
from Utils import removePriceFormatting
4039 varun.gupt 9
 
10
class HS18Scraper(BaseScraper):
    '''
    Scraper for HomeShop18 phone listing pages and single product pages.

    Listing pages: setUrl() -> scrape() -> getPhones() / getNextUrl().
    Product pages: getDataFromProductPage(url), which needs no prior scrape().
    '''

    def __init__(self):
        # NOTE(review): BaseScraper.__init__ is not invoked here; confirm the
        # base class needs no initialisation of its own.
        self.url = None
        self.id = None
        # Parsed listing page; populated by scrape().
        self.soup = None

    def setUrl(self, url):
        '''Set the listing page URL that the next scrape() call will read.'''
        self.url = url

    def scrape(self):
        '''Read self.url through BaseScraper.read and parse it into self.soup.'''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def _toAsciiName(self, name):
        '''
        Return name as a plain str.

        When str() cannot encode the text (Python 2 raises UnicodeEncodeError
        for characters outside ASCII), every character with ord() >= 128 is
        replaced by a space, so one odd title does not abort the whole page.
        '''
        try:
            return str(name)
        except UnicodeEncodeError as e:
            print('Unicode Error %s' % e)
            name_ascii = str("".join([char if ord(char) < 128 else " " for char in name]))
            print(name_ascii)
            return name_ascii

    def getPhones(self):
        '''
        Collect the products listed on the page parsed by scrape().

        Returns a list with one dict per 'product_div' element, with keys
        'name', 'price', 'source' (always 'homeshop18'), 'in_stock'
        (always 1) and 'product_url'.
        '''
        product_prices = []

        for div in self.soup.findAll('div', {'class': 'product_div'}):
            # The first anchor inside the title paragraph carries both the
            # product title and the product link.
            anchor = div.find('p', {'class': 'product_title'})('a')[0]

            # The conversion to str is the step that can raise
            # UnicodeEncodeError, so the fallback has to wrap this call
            # rather than the list append.
            name = self._toAsciiName(anchor['title'].strip())

            if name.endswith(' Mobile Phone'):
                name = name.replace(' Mobile Phone', '')

            url = str(anchor['href'].strip())
            price_span = div.findAll('span', {'class': 'product_new_price'})[0]
            price = removePriceFormatting(str(price_span.string.strip()))

            product_prices.append({
                'name': name,
                'price': price,
                'source': 'homeshop18',
                'in_stock': 1,
                'product_url': url
            })

        return product_prices

    def getNextUrl(self):
        '''
        Return the href of the next listing page, or None if there is none.

        None is returned when the page has no 'pagination' div, when that div
        has no <span> children, when the last span is marked
        'disabled_pagination', or when the last span lacks the expected
        attribute or anchor.
        '''
        pagination = self.soup.find('div', {'class': 'pagination'})

        if pagination is None:
            return None

        pagination_links = pagination.findAll('span')

        if not pagination_links:
            return None

        last_link = pagination_links[-1]

        try:
            if last_link['class'].strip() == 'disabled_pagination':
                return None
            return last_link('a')[0]['href'].strip()
        except (KeyError, IndexError):
            # Unexpected markup: dump the spans for debugging and report
            # that there is no next page.
            print(pagination_links)
            return None

    def getDataFromProductPage(self, url):
        '''
        Read a single product page and extract its name and price.

        Returns a dict with keys 'product_url', 'source' (always
        'homeshop18'), 'price', 'in_stock' (always 1) and 'name'.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)

        name_tag = soup.find('h1', {'id': 'productLayoutForm:pbiName'})
        name = name_tag.string.replace('Mobile Phone', '').strip()

        price_tag = soup.find('span', {'id': 'productLayoutForm:OurPrice'})
        price = removePriceFormatting(price_tag.string)

        data = {
            "product_url": str(url),
            "source": "homeshop18",
            "price": price,
            "in_stock": 1,
            "name": name
        }
        return data
85
 
86
 
4039 varun.gupt 87
if __name__ == '__main__':
    # Manual smoke test: parse one listing page, then print the products
    # found on it and the URL of the following page.
    hs18 = HS18Scraper()
#    print scraper.getDataFromProductPage('http://www.homeshop18.com/samsung-galaxy-note-n7000-mobile-phone/mobiles-accessories/gsm-handsets/product:16601211/cid:3027/')
    listing_url = 'http://www.homeshop18.com/gsm-mobiles/categoryid:3027/search:*/start:112/'
    hs18.setUrl(listing_url)
    hs18.scrape()
    print(hs18.getPhones())
    print(hs18.getNextUrl())