Subversion Repositories SmartDukaan

Rev

Rev 15265 | Rev 17182 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

from dtr.utils.utils import fetchResponseUsingProxy, transformUrl
from sys import exit
import json
import re
import traceback
import datetime
from pyquery import PyQuery



# Request headers sent along with the product-page fetch; the User-agent
# string imitates a desktop Chrome 37 browser on Linux.
headers = {
    'User-agent': (
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}


class FlipkartProductPageScraper:
    """Fetch a Flipkart product page and extract price and stock data.

    ``read`` downloads the page through ``fetchResponseUsingProxy`` and
    hands the markup to ``createData``, which returns a dict with the keys
    ``lowestSp``, ``inStock`` and ``buyBoxPrice``.
    """

    # Maximum number of fetch attempts made by a single ``read`` call.
    MAX_TRIALS = 3

    def __init__(self):
        # Number of failed fetch attempts during the most recent ``read``.
        self.count_trials = 0
        # Not used anywhere in this class; kept for existing callers.
        self.redirectCount = 0

    def read(self, url):
        """Fetch ``url`` and return the parsed price/stock dict.

        The URL is passed through ``transformUrl(url, 2)`` exactly once and
        the fetch is attempted up to ``MAX_TRIALS`` times.  The failure
        counter is reset on every call so that earlier failures do not eat
        into the retry budget of later calls.

        If every attempt fails, parsing is still attempted on the empty
        response, which will most likely make ``createData`` raise.

        :param url: product page URL.
        :returns: the dict produced by ``createData``.
        """
        # Transform once, outside the retry loop: retrying must not re-apply
        # the transformation to an already transformed URL.
        url = transformUrl(url, 2)
        response_data = ""
        redirect_url = ""  # never populated; forwarded to createData as-is
        self.count_trials = 0

        while self.count_trials < self.MAX_TRIALS:
            try:
                # Quick fix: proxy usage is hard-coded here and should be
                # moved to configuration.
                response_data = fetchResponseUsingProxy(url, headers, proxy=True)
                print("Fetched response from flipkart for %s" % (url,))
                break
            except Exception as e:
                traceback.print_exc()
                print('ERROR:  %s' % (e,))
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    print('Retrying')

        self.response_data = response_data
        print(datetime.datetime.now())
        return self.createData(url, redirect_url)

    def createData(self, url, redirect_url):
        """Parse ``self.response_data`` into a price/stock dict.

        ``buyBoxPrice`` is read from the ``data-evar48`` attribute of
        ``span.selling-price.omniture-field``; a missing attribute makes
        ``float()`` raise, and that error is not handled here.

        ``lowestSp`` is the smallest ``sellingPrice`` among the entries of
        ``dataModel`` in the ``data-config`` JSON of
        ``div.seller-table-wrap``.  When that section cannot be parsed
        (probably because there is only a single seller), ``lowestSp``
        falls back to ``buyBoxPrice`` and ``inStock`` becomes 0 if a
        ``div.out-of-stock`` element is present.  On the multi-seller path
        ``inStock`` is always 1.

        :param url: unused.
        :param redirect_url: unused.
        :returns: ``{'lowestSp': float, 'inStock': 0 or 1,
            'buyBoxPrice': float}``.
        """
        pq = PyQuery(self.response_data)
        buyBoxPrice = float(pq('span.selling-price.omniture-field').attr['data-evar48'])
        inStock = 1
        try:
            sellerJson = pq('div.seller-table-wrap').attr['data-config']
            listings = json.loads(sellerJson)['dataModel']
            # Listings without a sellingPrice rank as 0 and are picked
            # first; the lookup below then raises KeyError and triggers the
            # fallback, as the sorted()[0] form did.
            cheapest = min(listings, key=lambda k: k['priceInfo'].get('sellingPrice', 0))
            sellingPrice = float(cheapest['priceInfo']['sellingPrice'])
        except Exception:
            # Not able to parse the seller wrap section, probably due to
            # only a single seller option.  ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit still propagate.
            sellingPrice = buyBoxPrice
            if len(pq('div.out-of-stock')) > 0:
                inStock = 0
        return {'lowestSp': sellingPrice, 'inStock': inStock, 'buyBoxPrice': buyBoxPrice}
            
if __name__ == '__main__':
    # Manual smoke run: scrape one product page and print the result,
    # bracketed by timestamps so the elapsed time can be read off.
    print(datetime.datetime.now())
    scraper = FlipkartProductPageScraper()

    product_url = 'http://www.flipkart.com/redmi-2/p/itme8ygtcfax6w39'
    scraped = scraper.read(product_url)
    print(scraped)
    print(datetime.datetime.now())