| 3232 |
varun.gupt |
1 |
'''
Created on 24-Aug-2011

@author: Varun Gupta
'''
|
|
|
6 |
from BeautifulSoup import BeautifulSoup
|
| 4039 |
varun.gupt |
7 |
from BaseScraper import BaseScraper
|
| 3232 |
varun.gupt |
8 |
|
| 4039 |
varun.gupt |
9 |
class InfibeamScraper(BaseScraper):
    """Scraper for Infibeam mobile-phone search result pages.

    Typical use: call setUrl() with a search page URL, call scrape() to
    download and parse it, then read the results with getPhones() and
    getNextUrl().
    """

    # Number of products assumed per search page; used to turn the upper
    # bound of the displayed result range into a page number.
    _PAGE_SIZE = 20
    _SEARCH_URL_TEMPLATE = 'http://www.infibeam.com/Mobiles/search?page=%d'

    def __init__(self):
        # NOTE(review): BaseScraper.__init__ is not invoked here (it was not
        # in the original either) -- confirm the base class needs no setup.
        self.url = None
        self.id = None
        # Parsed document; populated by scrape().
        self.soup = None

    def setUrl(self, url):
        """Set the URL of the page that the next scrape() call will fetch."""
        self.url = url

    def scrape(self):
        """Fetch self.url through BaseScraper.read and parse it into self.soup."""
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def _to_ascii(self, text):
        """Return text as a byte string, replacing non-ASCII characters with spaces."""
        try:
            return str(text)
        except UnicodeEncodeError as e:
            cleaned = str("".join([ch if ord(ch) < 128 else " " for ch in text]))
            # Report only the sanitised text: printing the raw value could
            # itself raise UnicodeEncodeError on an ASCII-only stdout.
            print('Unicode Error %s %s' % (e, cleaned))
            return cleaned

    def _extract_price(self, li):
        """Return the price text of one result item, trying both known markups."""
        try:
            price_div = li.findAll('div', {'class': 'price'})[0]
            return price_div.findAll('span', {'class': 'normal'})[0].string
        except IndexError:
            # Fallback markup: the price is the last child of <span class="price">.
            return li.findAll('span', {'class': 'price'})[0].contents[-1].strip()

    def getPhones(self):
        """Return one dict per product listed on the scraped page.

        Each dict has the keys 'name', 'price', 'in_stock' (always 1) and
        'product_url'. Every text field is sanitised independently, so a
        non-ASCII character in the price or URL no longer aborts the scrape
        (previously only the name was sanitised).

        Raises IndexError if the expected result list or a per-item element
        is missing from the page.
        """
        phone_prices = []
        result_list = self.soup.findAll('ul', {'class': 'srch_result portrait'})[0]

        for li in result_list('li'):
            name = li.findAll('span', {'class': 'title'})[0].string
            price = self._extract_price(li)
            url = li.findAll('a')[0]['href']

            phone_prices.append({
                'name': self._to_ascii(name),
                'price': self._to_ascii(price),
                'in_stock': 1,
                'product_url': self._to_ascii(url),
            })

        return phone_prices

    def getNextUrl(self):
        """Return the URL of the next search page, or None on the last page.

        Reads the <b> elements of the 'resultsSummary' div: the first is
        split on '-' and its second part taken as the highest product index
        shown; the second is taken as the total product count.

        Raises IndexError if the summary block is missing from the page.
        """
        b = self.soup.findAll('div', {'class': 'resultsSummary'})[0].findAll('b')
        current_max = int(b[0].string.split('-')[1])
        total_products = int(b[1].string)

        if current_max >= total_products:
            return None
        # Floor division keeps the page number an integer regardless of the
        # interpreter's default division semantics.
        return self._SEARCH_URL_TEMPLATE % (1 + current_max // self._PAGE_SIZE)
|
| 3232 |
varun.gupt |
51 |
|
|
|
52 |
if __name__ == '__main__':
    # Manual smoke test: fetch one fixed search page and dump the parsed products.
    scraper = InfibeamScraper()
    scraper.setUrl('http://www.infibeam.com/Mobiles/search?page=17')
    scraper.scrape()
    print(scraper.getPhones())
|