# WebSVN - SmartDukaan - /trunk/PriceComparisonFramework/src/Scrapers/SulekhaScraper.py

from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting

class SulekhaScraper(BaseScraper):
    """Scraper for the mobile-phone offer listings on sulekha.com.

    Typical use: call setUrl() with a listing page, scrape() to download
    and parse it, getPhones() to extract the deals, then getNextUrl() to
    obtain the following listing page (None once the last page has been
    handed out).
    """

    # getNextUrl() hands out page numbers 2..pageCount and then returns None.
    pageCount = 11
    # Not referenced anywhere in this class.
    productCountPerScraping = 24
    # Paging cursor. getNextUrl() reads and writes it on the class itself,
    # so it is shared by every SulekhaScraper instance in the process.
    currentPage = 1

    def __init__(self):
        BaseScraper.__init__(self)
        self.url = None
        self.id = None

    def setUrl(self, url):
        """Set the listing-page URL that the next scrape() call downloads."""
        self.url = url

    def scrape(self):
        """Download self.url and parse it into self.soup.

        Also resets self.phones to None, so results of an earlier
        getPhones() call are not mistaken for this page's results.
        """
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
        self.phones = None

    def getPhones(self):
        """Extract every deal from the page parsed by scrape().

        Returns a list of dicts with the keys 'name', 'price', 'source',
        'product_url' and 'in_stock'; the same list is stored on
        self.phones. An entry that cannot be parsed is reported on stdout
        and skipped, so one malformed <li> does not discard the remaining
        deals on the page. If the listing container is missing, an empty
        list is returned.
        """
        phones = []
        allPhoneUl = self.soup.find('ul', id="MMobBrandOffersListCont")

        if allPhoneUl is None:
            print("Deal list 'MMobBrandOffersListCont' not found in page")
        else:
            for li in allPhoneUl.findAll('li'):
                try:
                    phones.append(self._parsePhone(li))
                except Exception as e:
                    # Deliberately broad: this method is best-effort and has
                    # never propagated parsing errors to its callers. The
                    # handler sits inside the loop so that only the offending
                    # entry is dropped.
                    print(e)
                    print(li)

        self.phones = phones
        return phones

    def _parsePhone(self, li):
        """Build the record dict for a single deal <li>.

        Raises whatever the lookups raise (AttributeError, IndexError,
        KeyError, ...) when the expected markup is absent; getPhones()
        catches that and skips the entry.
        """
        anchor = li.find('div', {'class': 'dealtit'}).find('a')
        priceSpan = li.find('span', {'class': 'deals-our-price'})
        # The price text is the second child node of the span (index 1).
        price = priceSpan.contents[1].strip()
        return {
            'name': str(anchor.string.strip()),
            'price': removePriceFormatting(price),
            'source': 'sulekha',
            'product_url': str(anchor['href'].strip()),
            'in_stock': 1,
        }

    def setPageCount(self):
        """Advance the page counter as seen through this instance.

        NOTE(review): the assignment creates an *instance* attribute, while
        getNextUrl() works on the class attribute SulekhaScraper.currentPage,
        so calling this method does not influence getNextUrl(). Nothing in
        this file calls it; confirm the intent before relying on it.
        """
        self.currentPage = self.currentPage + 1

    def getNextUrl(self):
        """Return the URL of the next listing page, or None when exhausted.

        Increments the class-level SulekhaScraper.currentPage as a side
        effect, so successive calls (from any instance) walk through the
        pages up to pageCount.
        """
        if SulekhaScraper.currentPage >= SulekhaScraper.pageCount:
            return None

        SulekhaScraper.currentPage += 1
        return 'http://mobiles.sulekha.com/common/common.aspx?type=mobileofferslist&makeId=0&modelId=0&pageNo=%s' % SulekhaScraper.currentPage

    def getDataFromProductPage(self, url):
        """Download a single product page and return its record dict.

        The dict has the same keys as the entries produced by getPhones().
        Unlike getPhones(), 'price' is the stripped text of the
        itemprop="price" span and is not passed through
        removePriceFormatting(); 'name' is likewise left as returned by
        the parser rather than converted with str().
        Raises if the expected elements are missing from the page.
        """
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        # Calling a tag is BeautifulSoup shorthand for findAll(): take the
        # first <a> inside the product title heading.
        titleAnchor = soup.find('h1', {'class': 'product-title'})('a')[0]
        name = titleAnchor.contents[0].string.strip()
        price = soup.find('span', {'itemprop': 'price'}).string.strip()
        in_stock = 1

        data = {
            "product_url": str(url),
            "source": "sulekha",
            "price": price,
            "in_stock": in_stock,
            "name": name
        }
        return data

if __name__ == '__main__':
    # Manual smoke test: fetch one known product page and dump the record.
    scraper = SulekhaScraper()
    print(scraper.getDataFromProductPage(
        'http://deals.sulekha.com/blackberry-curve-9360-white-17561'))
# Subversion Repositories SmartDukaan

# (root)/trunk/PriceComparisonFramework/src/Scrapers/SulekhaScraper.py - Rev 5639