# Rev 13913 | Rev 20172 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scrape Amazon India best-seller pages and store the ranks in MongoDB.

The script downloads the mobile and tablet best-seller listings, extracts the
ASIN of every listed item in page order, and writes the resulting rank onto the
matching ``Catalog.MasterData`` documents.  Written for Python 2 (``urllib2``
and BeautifulSoup 3).
"""
import optparse
import re
import urllib2
from datetime import datetime

import pymongo
from BeautifulSoup import BeautifulSoup

from dtr.utils.utils import to_java_date

# Cached MongoDB client, created lazily by get_mongo_connection().
con = None

# NOTE: options are parsed at import time because get_mongo_connection() uses
# options.mongoHost as the default value of its ``host`` parameter.
parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost", default="localhost",
                  type="string",
                  help="The HOST where the mongo server is running",
                  metavar="mongo_host")
(options, args) = parser.parse_args()

# An ASIN is the 10-character upper-case alphanumeric path segment of the URL.
asin_regex = r'/([A-Z0-9]{10})'
_ASIN_PATTERN = re.compile(asin_regex)

# Rank entries produced by the most recent scrape*() call.
bestSellers = []

# Single timestamp used for every 'updatedOn' written during this run.
now = datetime.now()

# The original retry loop ran with a counter going from 1 up to (not
# including) 10, i.e. nine download attempts per URL.
MAX_FETCH_ATTEMPTS = 9

# The listing is requested for pages 1..5.
_PAGE_COUNT = 5

# category_id values passed to resetRanks() after each scrape.
_MOBILE_CATEGORY_ID = 3
_TABLET_CATEGORY_ID = 5

_BEST_SELLER_URL = (
    "http://www.amazon.in/gp/bestsellers/electronics/%(node)s/"
    "ref=zg_bs_%(node)s_pg_%(page)d?ie=UTF8&pg=%(page)d&ajax=1%(suffix)s"
)

_REQUEST_HEADERS = (
    ('Accept',
     'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Language', 'en-US,en;q=0.8,hi;q=0.6'),
    ('Connection', 'keep-alive'),
    ('User-Agent',
     'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/37.0.2062.120 Safari/537.36'),
)


class __RankInfo(object):
    """Pairs a product identifier (ASIN) with its best-seller rank."""

    def __init__(self, identifier, rank):
        self.identifier = identifier
        self.rank = rank


def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared MongoClient, creating it on first use.

    Returns None (after printing the error) when the client cannot be
    created.  Once a client exists, ``host`` and ``port`` are ignored.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def _master_data():
    """Return the Catalog.MasterData collection or raise if Mongo is down."""
    connection = get_mongo_connection()
    if connection is None:
        # Fail with a clear message instead of AttributeError on None.
        raise RuntimeError(
            "Could not connect to MongoDB at %s" % options.mongoHost)
    return connection.Catalog.MasterData


def getSoupObject(url):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The download is attempted up to MAX_FETCH_ATTEMPTS times; every failure
    is printed followed by "Retrying".  Returns None when all attempts fail.
    """
    print("Getting soup object for")
    print(url)
    for _attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            request = urllib2.Request(url)
            for header_name, header_value in _REQUEST_HEADERS:
                request.add_header(header_name, header_value)
            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                # Release the socket even when the read fails.
                response.close()
            try:
                markup = response_data.decode("utf-8")
            except UnicodeDecodeError:
                # Not valid UTF-8: hand the raw bytes to BeautifulSoup.
                markup = response_data
            return BeautifulSoup(
                markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
    return None


def _extract_asin(item):
    """Return the ASIN linked from a listing tile, or None if there is none."""
    title = item.find('div', {'class': 'zg_title'})
    link = title.find('a') if title is not None else None
    if link is None:
        return None
    href = link.get('href')
    if not href:
        return None
    match = _ASIN_PATTERN.search(href)
    if match is None:
        return None
    return match.group(1)


def _scrape_best_sellers(node_id, above_fold_suffix, below_fold_suffix,
                         echo=False):
    """Return the ranked items of one best-seller listing, in page order.

    For each page the "above the fold" and "below the fold" fragments are
    downloaded, then parsed in that order.  ``echo`` prints every identifier
    and rank as it is found.

    Raises RuntimeError when a fragment cannot be downloaded, so a partial
    ranking is never produced.
    """
    ranked = []
    rank = 0
    for page in range(1, _PAGE_COUNT + 1):
        soups = []
        for suffix in (above_fold_suffix, below_fold_suffix):
            url = _BEST_SELLER_URL % {
                'node': node_id, 'page': page, 'suffix': suffix}
            soup = getSoupObject(url)
            if soup is None:
                raise RuntimeError(
                    "Unable to fetch best seller page: %s" % url)
            soups.append(soup)
        for soup in soups:
            for item in soup.findAll('div', {'class': 'zg_itemImmersion'}):
                rank = rank + 1
                identifier = _extract_asin(item)
                if identifier is None:
                    # The tile still occupies a position in the listing, so
                    # the rank counter advances even though it is skipped.
                    print("Skipping item without ASIN at rank %s" % rank)
                    continue
                if echo:
                    print("%s \t%s" % (identifier, rank))
                ranked.append(__RankInfo(identifier, rank))
    return ranked


def scrapeBestSellerMobiles():
    """Replace ``bestSellers`` with the current mobile best-seller ranking."""
    global bestSellers
    bestSellers = _scrape_best_sellers(
        "1389432031", "&isAboveTheFold=1", "&isAboveTheFold=0")


def commitBestSellers():
    """Write the rank of every entry in ``bestSellers`` to MasterData.

    Prints one row per entry with its rank, identifier and the number of
    documents matching that identifier, then sets 'rank' and 'updatedOn' on
    all of those documents.
    """
    print("Rank \tIdentifier")
    if not bestSellers:
        return
    collection = _master_data()
    updated_on = to_java_date(now)
    for info in bestSellers:
        query = {'identifier': info.identifier.strip()}
        # Ask the server for the count instead of downloading every document.
        sku_count = collection.find(query).count()
        print("%s \t%s count sku \t%s"
              % (info.rank, info.identifier, sku_count))
        collection.update(
            query,
            {'$set': {'rank': info.rank, 'updatedOn': updated_on}},
            multi=True)


def scrapeBestSellerTablets():
    """Replace ``bestSellers`` with the current tablet best-seller ranking.

    Each identifier and rank is printed as it is found.
    """
    global bestSellers
    # The tablet "above the fold" URL carries no isAboveTheFold parameter.
    bestSellers = _scrape_best_sellers(
        "1375458031", "", "&isAboveTheFold=0", echo=True)


def resetRanks(category_id):
    """Set rank to 0 on every ranked source_id 1 item of ``category_id``."""
    # One multi-document update replaces the find-then-update-each loop.
    _master_data().update(
        {'rank': {'$gt': 0}, 'source_id': 1, 'category_id': category_id},
        {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}},
        multi=True)


def main():
    """Refresh mobile ranks, then tablet ranks.

    Existing ranks of a category are only cleared when the scrape for that
    category produced at least one item.
    """
    scrapeBestSellerMobiles()
    if bestSellers:
        resetRanks(_MOBILE_CATEGORY_ID)
        commitBestSellers()
    scrapeBestSellerTablets()
    if bestSellers:
        resetRanks(_TABLET_CATEGORY_ID)
        commitBestSellers()


if __name__ == '__main__':
    main()