Rev 4106 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 19-Nov-2011

@author: Varun Gupta

Scraper for the mobile price lists published on mysmartprice.com.

Run as a script, it walks a fixed list of brand pages, collects every phone
page linked from them, extracts the per-store prices of each phone and, for
phones that have a usable Saholic URL, dumps the price map as JSON into
OUTPUT_DIR, in a file named after the Saholic entity id.
'''
import json
import os

from BaseScraper import BaseScraper
from BeautifulSoup import BeautifulSoup

# Placeholder stored by getPhonePrices() for the price and URL of an offer
# that is listed but not available.
NOT_FOUND = 'Not Found'

# Directory that receives one JSON file per phone when run as a script.
OUTPUT_DIR = '/tmp/msp_dir'

# Brand price-list pages (paths relative to http://www.mysmartprice.com)
# crawled when run as a script. MySmartPrice.getBrandURLs() scrapes a list of
# such paths from the site instead of relying on this fixed one.
BRAND_URLS = [
    '/mobile/pricelist/nokia-mobile-price-list-in-india.html',
    '/mobile/pricelist/samsung-mobile-price-list-in-india.html',
    '/mobile/pricelist/blackberry-mobile-price-list-in-india.html',
    '/mobile/pricelist/lg-mobile-price-list-in-india.html',
    '/mobile/pricelist/sony-ericsson-mobile-price-list-in-india.html',
    '/mobile/pricelist/micromax-mobile-price-list-in-india.html',
    '/mobile/pricelist/motorola-mobile-price-list-in-india.html',
    '/mobile/pricelist/htc-mobile-price-list-in-india.html',
    '/mobile/pricelist/apple-mobile-price-list-in-india.html',
    '/mobile/pricelist/spice-mobile-price-list-in-india.html',
    '/mobile/pricelist/karbonn-mobile-price-list-in-india.html',
    '/mobile/pricelist/lava-mobile-price-list-in-india.html',
]


class MySmartPrice(BaseScraper):
    '''Scrapes brand lists, phone lists and store prices from mysmartprice.com.

    Pages are fetched through BaseScraper.read().
    '''

    def __init__(self):
        # Page listing every brand's price-list page.
        self.url_brand_list = 'http://www.mysmartprice.com/mobile/pricelist/'
        # Only offers whose URL contains one of these names are recorded.
        self.source_whitelist = ['adexmart', 'flipkart', 'homeshop18',
                                 'infibeam', 'letsbuy', 'saholic']

    def getSourceName(self, url):
        '''Return the first whitelisted store name contained in url.

        Returns None when no whitelisted name occurs in url.
        '''
        for source in self.source_whitelist:
            if source in url:
                return source
        return None

    def getSaholicEntityId(self, map):
        '''Return the Saholic entity id taken from a price map.

        map is a dict shaped like the result of getPhonePrices(); the
        parameter name shadows the builtin but is kept so that existing
        keyword callers keep working. The id is whatever follows the last
        '-' of map['saholic']['url'].

        Returns None when there is no 'saholic' entry, when the entry has no
        'url' key, or when the URL is None or the NOT_FOUND placeholder.
        '''
        try:
            saholic_url = map['saholic']['url']
        except KeyError:
            return None
        # getPhonePrices() stores NOT_FOUND rather than None for unavailable
        # offers; without this check the placeholder itself would be returned
        # as if it were an entity id.
        if saholic_url is None or saholic_url == NOT_FOUND:
            return None
        return saholic_url.split('-')[-1]

    def getBrandURLs(self):
        '''Return the brand page links found on the brand-list page.

        The links are the hrefs of the anchors inside the 300px-wide cells of
        the table nested in the 'msp_left' div, returned as a list of str.
        '''
        html = BaseScraper.read(self, self.url_brand_list)
        soup = BeautifulSoup(html)
        brand_table = soup.find('div', {'class': 'msp_left'}).find('table').find('table')
        return [str(td.find('a')['href'])
                for td in brand_table.findAll('td', {'width': "300px"})]

    def getPhoneURLsForBrand(self, url_brand):
        '''Return the phone page links found on one brand page.

        url_brand is a path relative to http://www.mysmartprice.com. Each
        'item' div contributes the href of its first anchor; divs without an
        anchor are skipped.
        '''
        html = BaseScraper.read(self, 'http://www.mysmartprice.com' + url_brand)
        soup = BeautifulSoup(html)
        urls = []
        for div in soup.findAll('div', {'class': 'item'}):
            anchor = div.find('a')
            if anchor is not None:
                urls.append(str(anchor['href']))
        return urls

    def getPhonePrices(self, url):
        '''Return the whitelisted store offers listed on a phone page.

        The result maps a store name to a dict with the keys 'is_available',
        'price' and 'url'. For unavailable offers both 'price' and 'url' hold
        the NOT_FOUND placeholder. If several rows resolve to the same store,
        the last one wins.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        prices = {}
        for div in soup.findAll('div', {'class': 'pt_row'}):
            # The store link wraps the real target in a '?url=' query
            # parameter; keep only the target.
            href = div.find('td', {'width': '140px'}).find('a')['href']
            source_url = href.split('?url=')[-1].strip()

            td_price = div.find('td', {'width': '135px'})
            # A price cell without a direct string is treated as an available
            # offer whose price sits in a <b> tag.
            # NOTE(review): presumably a cell holding plain text is an
            # "unavailable" notice - confirm against the live markup.
            is_available = td_price.string is None
            if is_available:
                price = td_price.find('b').string.strip()
            else:
                price = NOT_FOUND

            source = self.getSourceName(source_url)
            if source is not None:
                prices[source] = {
                    'is_available': is_available,
                    'price': price,
                    'url': source_url if is_available else NOT_FOUND,
                }
        return prices


def main():
    '''Crawl BRAND_URLS and write one JSON price file per Saholic phone.'''
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement this script was written with.
    scraper = MySmartPrice()

    phone_urls = []
    for brand_url in BRAND_URLS:
        try:
            print(brand_url)
            phone_urls.extend(scraper.getPhoneURLsForBrand(brand_url))
        except Exception as e:
            # Best effort: a brand page that fails is reported and skipped.
            print(e)
            continue
    print(len(phone_urls))

    # Create the output directory up front so the first write cannot fail
    # merely because it is missing.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for url in phone_urls:
        print(url)
        try:
            prices = scraper.getPhonePrices(url)
        except Exception as e:
            # Same best-effort policy as the brand loop: one bad phone page
            # must not abort the whole run.
            print(e)
            continue
        saholic_id = scraper.getSaholicEntityId(prices)
        print(prices)
        print(saholic_id)
        if saholic_id is not None:
            file_path = '%s/%s' % (OUTPUT_DIR, saholic_id)
            # open() raises on failure instead of returning None; the with
            # block guarantees the file is flushed and closed.
            with open(file_path, 'w') as file_to_write:
                json.dump(prices, file_to_write, indent=4)


if __name__ == '__main__':
    main()