# Rev 15869 | Rev 16021 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
# (Navigation header left over from the repository web viewer; not part of the script.)
"""Scrape the ShopClues "top selling mobiles and tablets" listing into MongoDB.

For every product found on the mobile listing pages the script either
refreshes rank/price/stock details of the matching ``Catalog.MasterData``
document, clones a similar existing document for the new identifier
("bundling"), or records the product in ``exceptionList`` when no similar
document is found.

This is Python 2 code (``urllib2``, BeautifulSoup 3).
"""
import optparse
import re
import smtplib
import traceback
import urllib2
import zlib
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import pymongo
from BeautifulSoup import BeautifulSoup

from dtr.utils import ShopCluesScraper
from dtr.utils.utils import to_java_date

# Lazily created MongoDB client shared by the whole script.
con = None

parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost", default="localhost", type="string",
                  help="The HOST where the mongo server is running",
                  metavar="mongo_host")
(options, args) = parser.parse_args()

# Products that could neither be updated nor bundled with an existing one.
exceptionList = []
bestSellers = []
# Filled with (page, page - 1) by scrapeBestSellers().
baseUrl = "http://m.shopclues.com/products/getProductList/mobiles:top-selling-mobiles-and-tablets.html/%s/page=%s"
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}
now = datetime.now()
sc = ShopCluesScraper.ShopCluesScraper()

# Number of download attempts getSoupObject() makes before giving up.
MAX_FETCH_ATTEMPTS = 9


class __ProductInfo(object):
    """Plain record describing one scraped product that is not yet in MongoDB."""

    def __init__(self, identifier, rank, url, available_price, in_stock,
                 codAvailable, source_product_name, thumbnail, coupon):
        self.identifier = identifier
        self.rank = rank
        self.url = url
        self.available_price = available_price
        self.in_stock = in_stock
        self.codAvailable = codAvailable
        self.source_product_name = source_product_name
        self.thumbnail = thumbnail
        self.coupon = coupon


def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared ``pymongo.MongoClient``, creating it on first use.

    Returns None (after printing the error) when the client cannot be created.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def _master_data():
    """Return the ``Catalog.MasterData`` collection of the shared connection."""
    return get_mongo_connection().Catalog.MasterData


def _decompress(raw, encoding):
    """Undo the HTTP ``Content-Encoding`` that was applied to ``raw``.

    Needed because the request advertises gzip/deflate support and urllib2
    does not transparently decompress response bodies.

    Raises ValueError for an encoding this script cannot decode.
    """
    encoding = (encoding or '').strip().lower()
    if encoding in ('', 'identity'):
        return raw
    if encoding in ('gzip', 'x-gzip'):
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        return zlib.decompress(raw, 16 + zlib.MAX_WBITS)
    if encoding == 'deflate':
        try:
            return zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            return zlib.decompress(raw, -zlib.MAX_WBITS)
    raise ValueError("Unsupported Content-Encoding: %s" % encoding)


def getSoupObject(url):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The download is attempted up to MAX_FETCH_ATTEMPTS times; every failure is
    printed. Returns None when all attempts fail.
    """
    print("Getting soup object for")
    print(url)
    for _attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            # ``headers`` must be passed by keyword: the second positional
            # parameter of urllib2.Request is the POST body, not the headers.
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                raw = response.read()
                encoding = response.info().get('Content-Encoding')
            finally:
                response.close()
            body = _decompress(raw, encoding)
            try:
                return BeautifulSoup(body.decode("utf-8"),
                                     convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Not valid UTF-8 (or unparsable once decoded): let
                # BeautifulSoup work on the raw bytes instead.
                return BeautifulSoup(body,
                                     convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
    return None


def _bundle_uri(link):
    """Return the URL fragment used to look for similar products of ``link``.

    The mobile-site host and the ``.html`` extension are removed, and a
    trailing ``-<digits>`` suffix is dropped when present.
    """
    uri = link.replace('http://m.shopclues.com', '').replace(".html", "")
    head, dash, tail = uri.rpartition('-')
    if dash and tail.isdigit():
        return head
    return uri


def scrapeBestSellers():
    """Walk the listing pages and sync every listed product into MongoDB.

    Products are ranked in the order they appear, starting at 1. Scraping
    stops at the first page that cannot be downloaded or that contains no
    product entries. Entries whose markup or detail page cannot be read are
    logged and skipped (their rank is still consumed).
    """
    global bestSellers
    bestSellers = []
    rank = 0
    page = 1
    while True:
        url = baseUrl % (page, page - 1)
        soup = getSoupObject(url)
        if soup is None:
            # Every download attempt failed; nothing more can be scraped.
            print("Unable to fetch %s" % url)
            return
        productDivs = soup.findAll('div', {'class': 'pd-list-cont'})
        if not productDivs:
            return
        for productDiv in productDivs:
            rank += 1
            try:
                info_tag = productDiv.find('a')
                link = info_tag['href']
                scin = info_tag['data-id'].strip()
                productName = productDiv.find('div', {'class': 'pdt-name'}).string
            except (AttributeError, KeyError, TypeError):
                # Entry without the expected link/name markup: skip it rather
                # than abort the whole run.
                traceback.print_exc()
                continue
            print(link)
            print(scin)
            try:
                productInfo = sc.read(link)
            except Exception:
                traceback.print_exc()
                continue

            collection = _master_data()
            key = {'source_id': 5, 'identifier': scin}
            if collection.find_one(key) is not None:
                collection.update(key, {"$set": {
                    'rank': rank,
                    'available_price': productInfo['price'],
                    'in_stock': productInfo['inStock'],
                    'codAvailable': productInfo['isCod'],
                    'coupon': productInfo['coupon'],
                }})
                continue

            # Lets bundle product by finding similar url pattern.
            # The fragment is escaped so it is matched literally, not
            # interpreted as a regular expression.
            uri = _bundle_uri(link)
            similar = collection.find_one(
                {'source_id': 5, 'marketPlaceUrl': {'$regex': re.escape(uri)}})
            toBundle = __ProductInfo(scin, rank, link, productInfo['price'],
                                     productInfo['inStock'], productInfo['isCod'],
                                     productName, "", productInfo['coupon'])
            if similar is not None:
                bundleNewProduct(similar, toBundle)
            else:
                exceptionList.append(toBundle)
        page += 1


def bundleNewProduct(existingProduct, toBundle):
    """Insert a copy of ``existingProduct`` carrying the details of ``toBundle``.

    ``existingProduct`` is modified in place: it receives a new ``_id`` (the
    current highest ``_id`` plus one) and the scraped fields, and is then
    inserted as a new document.

    Returns ``{1: 'Data added successfully.'}`` on success and
    ``{0: 'Unable to add data.'}`` (after printing the error) on failure.
    """
    print("Adding new product")
    try:
        collection = _master_data()
        max_id = list(collection.find().sort([('_id', pymongo.DESCENDING)]).limit(1))
        timestamp = to_java_date(datetime.now())
        existingProduct['_id'] = max_id[0]['_id'] + 1
        existingProduct['addedOn'] = timestamp
        existingProduct['available_price'] = toBundle.available_price
        existingProduct['updatedOn'] = timestamp
        existingProduct['codAvailable'] = toBundle.codAvailable
        existingProduct['coupon'] = toBundle.coupon
        existingProduct['identifier'] = toBundle.identifier
        existingProduct['in_stock'] = toBundle.in_stock
        existingProduct['marketPlaceUrl'] = toBundle.url
        existingProduct['rank'] = toBundle.rank
        existingProduct['source_product_name'] = toBundle.source_product_name
        existingProduct['url'] = toBundle.url
        collection.insert(existingProduct)
        return {1: 'Data added successfully.'}
    except Exception as e:
        print(e)
        return {0: 'Unable to add data.'}


def exceptionItems():
    """Print the attributes of every product that could not be bundled."""
    for item in exceptionList:
        print(vars(item))


def main():
    """Scrape the best sellers, then report the products left unmatched."""
    scrapeBestSellers()
    exceptionItems()


if __name__ == '__main__':
    main()