SmartDukaan Subversion repository — annotated file view at rev 4198 (previous revision: 4039; next: 4203). Columns: revision, author, line number, line content.
4039 varun.gupt 1
'''
2
Created on 07-Sep-2011
3
 
4
@author: Varun Gupta
5
'''
6
from BeautifulSoup import BeautifulSoup
7
from BaseScraper import BaseScraper
4198 varun.gupt 8
from Utils import removePriceFormatting
4039 varun.gupt 9
 
10
class HS18Scraper(BaseScraper):
11
 
12
    def __init__(self):
13
        self.url = None
14
        self.id = None
15
 
16
    def setUrl(self, url):
17
        self.url = url
18
 
19
    def scrape(self):
20
        html = BaseScraper.read(self, self.url)
21
        self.soup = BeautifulSoup(html)
22
 
23
    def getPhones(self):
24
        product_prices = []
25
 
26
        for div in self.soup.findAll('div', {'class': 'product_div book_info_box'}):
27
            anchor = div.findAll('p', {'class': 'product_title'})[0]('a')[0]
28
            name = str(anchor['title'].strip())
29
 
30
            if name.endswith(' Mobile Phone'):  name = name.replace(' Mobile Phone', '')
31
 
32
            url = str(anchor['href'].strip())
4198 varun.gupt 33
            price = removePriceFormatting(str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip()))
4039 varun.gupt 34
 
35
            try:
4198 varun.gupt 36
                product_prices.append({
37
                        'name': name, 
38
                        'price': price,
39
                        'source': 'homeshop18', 
40
                        'in_stock': 1, 
41
                        'product_url': url
42
                    })
4039 varun.gupt 43
 
44
            except UnicodeEncodeError as e:
45
                print 'Unicode Error', e, name
46
                name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
47
                print name_ascii
4198 varun.gupt 48
                product_prices.append({
49
                        "name": str(name_ascii), 
50
                        "price": str(price),
51
                        'source': 'homeshop18',
52
                        "in_stock": 1, 
53
                        "product_url": str(url)
54
                    })
4039 varun.gupt 55
 
56
        return product_prices
57
 
58
    def getNextUrl(self):
4198 varun.gupt 59
        pagination_links = self.soup.find('div', {'class': 'pagination'}).findAll('span')
4039 varun.gupt 60
 
4198 varun.gupt 61
        try:
62
            if pagination_links[-1]['class'].strip() == 'disabled_pagination':
63
                return None
64
            else:
65
                return pagination_links[-1]('a')[0]['href'].strip()
66
        except KeyError:
67
            print pagination_links
4039 varun.gupt 68
 
4199 varun.gupt 69
    def getDataFromProductPage(self, url):
70
        html = BaseScraper.read(self, url)
71
        soup = BeautifulSoup(html)
72
        name = soup.find('h1', {'itemprop': 'name'}).string.strip()
73
        price = soup.find('span',{'id': 'fk-mprod-our-id'}).contents[2]
74
        in_stock = soup.find('div', {'id': 'fk-stock-info-id'}).string.strip()
75
 
76
        data = {
77
            "product_url": str(url), 
78
            "source": "flipkart", 
79
            "price": price, 
80
            "in_stock": 1 if in_stock == 'In Stock.' else 0, 
81
            "name": name
82
        }
83
        return data
84
 
85
 
4039 varun.gupt 86
if __name__ == '__main__':
87
    scraper = HS18Scraper()
4198 varun.gupt 88
    scraper.setUrl('http://www.homeshop18.com//ipads-2f-tablets/categoryid:8937/search:*/start:32/')
4039 varun.gupt 89
    scraper.scrape()
90
    products = scraper.getPhones()
91
    print scraper.getNextUrl()
92
    print products