Subversion Repositories SmartDukaan

Rev

Rev 4110 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 19-Nov-2011

@author: Varun Gupta
'''
from BaseScraper import BaseScraper
from BeautifulSoup import BeautifulSoup

import json

class MySmartPrice(BaseScraper):
    '''
    Scraper for the mobile phone price lists on mysmartprice.com.

    Pages are fetched through BaseScraper.read and parsed with BeautifulSoup.
    Only stores whose name appears in source_whitelist are reported.
    '''

    # Placeholder stored for 'price' and 'url' when a store row is unavailable.
    _NOT_FOUND = 'Not Found'

    def __init__(self):
        self.url_brand_list = 'http://www.mysmartprice.com/mobile/pricelist/'
        self.source_whitelist = ['adexmart', 'flipkart', 'homeshop18', 'infibeam', 'letsbuy', 'saholic']

    def getSourceName(self, url):
        '''
        Return the first whitelisted store name contained in url, or None
        when the url matches no whitelisted store.
        '''
        for source in self.source_whitelist:
            if source in url:
                return source

        return None

    def getSaholicEntityId(self, map):
        '''
        Return the Saholic entity id for a price map built by getPhonePrices.

        The id is the text after the last '-' in the Saholic url. None is
        returned when the map has no Saholic entry, or when that entry
        carries no real url (None or the 'Not Found' placeholder stored for
        unavailable rows), so callers never get the placeholder as an id.
        '''
        try:
            saholic_url = map['saholic']['url']
        except KeyError:
            return None

        if saholic_url is None or saholic_url == self._NOT_FOUND:
            return None

        return saholic_url.split('-')[-1]

    def getBrandURLs(self):
        '''
        Return the href of every brand link found on the brand list page.
        '''
        html = BaseScraper.read(self, self.url_brand_list)
        soup = BeautifulSoup(html)
        # The brand links sit in 300px-wide cells of a table nested inside
        # the first table of the 'msp_left' div.
        listing = soup.find('div', {'class': 'msp_left'}).find('table').find('table')
        cells = listing.findAll('td', {'width': "300px"})
        return [str(td.find('a')['href']) for td in cells]

    def getPhoneURLsForBrand(self, url_brand):
        '''
        Return the phone page hrefs listed on a brand page.

        url_brand is a site-relative path; the site root is prepended
        before fetching.
        '''
        html = BaseScraper.read(self, 'http://www.mysmartprice.com' + url_brand)
        soup = BeautifulSoup(html)
        urls = []
        for div in soup.findAll('div', {'class': 'item'}):
            a = div.find('a')

            # Item divs without a link are skipped.
            if a is not None:
                urls.append(str(a['href']))
        return urls

    def getPhonePrices(self, url):
        '''
        Return the prices offered by whitelisted stores for one phone page.

        The result maps store name to a dict with keys 'is_available',
        'price' and 'url'. For unavailable rows 'price' and 'url' both hold
        the 'Not Found' placeholder. If a store appears in several rows the
        last row wins.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        prices = {}
        for row in soup.findAll('div', {'class': 'pt_row'}):
            # The store link is a redirect; the real target follows '?url='.
            link = row.find('td', {'width': '140px'}).find('a')['href']
            store_url = link.split('?url=')[-1].strip()

            # Resolve the store first so rows for stores we do not track are
            # dropped before their price cell is parsed.
            source = self.getSourceName(store_url)
            if source is None:
                continue

            td_price = row.find('td', {'width': '135px'})

            # A cell whose .string is None has no single text child; the
            # price is then read from its <b> tag. A plain-text cell is
            # treated as "not available" (presumably a status message --
            # confirm against the live page markup).
            if td_price.string is None:
                prices[source] = {
                        'is_available': True,
                        'price': td_price.find('b').string.strip(),
                        'url': store_url
                }
            else:
                prices[source] = {
                        'is_available': False,
                        'price': self._NOT_FOUND,
                        'url': self._NOT_FOUND
                }
        return prices
                
if __name__ == '__main__':
    # Crawl a fixed set of brand pages, collect every phone page they list,
    # and dump the price map of each phone that has a Saholic entity id to
    # /usr/msp_dir/<entity id> as JSON.
    scraper = MySmartPrice()
    # Hard-coded brand pages. scraper.getBrandURLs() could discover the list
    # from the site instead (NOTE(review): confirm it yields site-relative
    # paths like the ones below before switching).
    brand_urls = [
                  '/mobile/pricelist/nokia-mobile-price-list-in-india.html',
                  '/mobile/pricelist/samsung-mobile-price-list-in-india.html',
                  '/mobile/pricelist/blackberry-mobile-price-list-in-india.html',
                  '/mobile/pricelist/lg-mobile-price-list-in-india.html',
                  '/mobile/pricelist/sony-ericsson-mobile-price-list-in-india.html',
                  '/mobile/pricelist/micromax-mobile-price-list-in-india.html',
                  '/mobile/pricelist/motorola-mobile-price-list-in-india.html',
                  '/mobile/pricelist/htc-mobile-price-list-in-india.html',
                  '/mobile/pricelist/apple-mobile-price-list-in-india.html',
                  '/mobile/pricelist/spice-mobile-price-list-in-india.html',
                  '/mobile/pricelist/karbonn-mobile-price-list-in-india.html',
                  '/mobile/pricelist/lava-mobile-price-list-in-india.html']
    phone_urls = []

    for brand_url in brand_urls:
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2, so output is unchanged.
        print(brand_url)
        try:
            phone_urls.extend(scraper.getPhoneURLsForBrand(brand_url))
        except Exception as e:
            # Best effort: a brand page that fails to load or parse is
            # reported and skipped so the remaining brands are still crawled.
            print(e)

    print(len(phone_urls))

    for url in phone_urls:
        print(url)
        prices = scraper.getPhonePrices(url)
        saholic_id = scraper.getSaholicEntityId(prices)
        print(prices)
        print(saholic_id)

        if saholic_id is not None:
            file_path = str("/usr/msp_dir/%s" % saholic_id)

            # open() raises instead of returning None, so failure is handled
            # as an exception; the with-block guarantees the file is flushed
            # and closed after every dump.
            try:
                with open(file_path, "w") as file_to_write:
                    json.dump(prices, file_to_write, indent = 4)
            except IOError as e:
                print('Could not write %s: %s' % (file_path, e))