# Rev 16019 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scrape the ShopClues mobile "top selling mobiles and tablets" listing.

For every product found on the paginated listing the script records its
running position (rank) on the matching ``Catalog.MasterData`` document in
MongoDB.  Products that are not matched by identifier are looked up by URL
path and handed to ``bundleNewProduct`` (currently a stub).

This is Python 2 code (``urllib2``, BeautifulSoup 3).  ``print`` is always
called with a single parenthesised argument so the output is unchanged.
"""
import urllib2
from BeautifulSoup import BeautifulSoup
import pymongo
import re
from dtr.utils.utils import to_java_date
import optparse
from datetime import datetime
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Lazily created, process-wide MongoClient (see get_mongo_connection).
con = None

parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                  default="localhost",
                  type="string",
                  help="The HOST where the mongo server is running",
                  metavar="mongo_host")

(options, args) = parser.parse_args()

exceptionList = []
bestSellers = []
# Filled with (page, page - 1) by scrapeBestSellers.
baseUrl = "http://m.shopclues.com/products/getProductList/mobiles:top-selling-mobiles-and-tablets.html/%s/page=%s"
now = datetime.now()


class __ProductInfo:
    """Plain record describing one scraped product."""

    def __init__(self, identifier, rank, url, available_price, in_stock,
                 codAvailable, source_product_name, thumbnail, coupon):
        self.identifier = identifier
        self.rank = rank
        self.url = url
        self.available_price = available_price
        self.in_stock = in_stock
        self.codAvailable = codAvailable
        self.source_product_name = source_product_name
        self.thumbnail = thumbnail
        self.coupon = coupon


def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared MongoClient, creating it on first use.

    Returns None (after printing the error) when the client cannot be
    created; a later call will try again because ``con`` stays None.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def getSoupObject(url, max_attempts=9):
    """Fetch ``url`` and return it parsed as a BeautifulSoup document.

    The request is sent with browser-like headers.  Any failure is printed
    and the fetch is retried, up to ``max_attempts`` times in total (the
    default of 9 matches the historical retry budget).

    Returns None when every attempt failed, so callers must check for it.
    """
    print("Getting soup object for")
    print(url)
    for _ in range(max_attempts):
        try:
            request = urllib2.Request(url)
            request.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            request.add_header('Accept-Language', 'en-US,en;q=0.8,hi;q=0.6')
            request.add_header('Connection', 'keep-alive')
            request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')
            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                # Release the socket even when read() fails part-way.
                response.close()
            try:
                page = response_data.decode("utf-8")
                return BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Body is not valid UTF-8 (or parsing the decoded text
                # failed): let BeautifulSoup work on the raw bytes instead.
                return BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
    return None


def scrapeBestSellers():
    """Walk the paginated listing and store each product's rank in MongoDB.

    Rank is the 1-based running position of a product across all pages.
    Pagination stops at the first page without ``div.pd-list-cont``
    entries, or when a page cannot be fetched at all.

    Raises:
        RuntimeError: if no MongoDB connection could be established.
    """
    global bestSellers
    bestSellers = []

    connection = get_mongo_connection()
    if connection is None:
        raise RuntimeError("Could not connect to MongoDB; cannot update ranks")
    master_data = connection.Catalog.MasterData

    rank = 0
    page = 1
    while True:
        url = baseUrl % (page, page - 1)
        soup = getSoupObject(url)
        if soup is None:
            # All retries failed; ranks stored so far are kept.
            print("Unable to fetch %s, stopping" % url)
            return
        productDivs = soup.findAll('div', {'class': 'pd-list-cont'})
        if not productDivs:
            return
        for productDiv in productDivs:
            # The position is consumed even when the entry is skipped below,
            # so later products keep the rank shown on the site.
            rank = rank + 1
            info_tag = productDiv.find('a')
            if info_tag is None:
                continue
            link = info_tag.get('href')
            scin = info_tag.get('data-id')
            if link is None or scin is None:
                continue
            scin = scin.strip()
            print(link)
            print(scin)
            # NOTE(review): source_id 5 is presumably ShopClues - confirm.
            selector = {'source_id': 5, 'identifier': scin}
            if master_data.find_one(selector) is not None:
                master_data.update(selector, {"$set": {'rank': rank}})
                continue
            # Lets bundle product by finding similar url pattern
            uri = link.replace('http://m.shopclues.com', '')
            if not uri:
                # An empty pattern would match every document.
                continue
            # Escape the path so '.', '?', '+' etc. are matched literally.
            similar = master_data.find_one({'marketPlaceUrl': {'$regex': re.escape(uri)}})
            if similar is not None:
                # NOTE(review): the original call omitted the second argument
                # (a guaranteed TypeError).  Passing the scraped fields is the
                # assumed intent; values not available on the listing page
                # are left as None - confirm once bundleNewProduct is written.
                toBundle = __ProductInfo(scin, rank, link, None, None, None, None, None, None)
                bundleNewProduct(similar, toBundle)
        page = page + 1


def bundleNewProduct(existingProduct, toBundle):
    """Placeholder: bundle ``toBundle`` with ``existingProduct``.

    Not implemented yet; currently does nothing.
    """
    pass


def main():
    """Script entry point."""
    scrapeBestSellers()


if __name__ == '__main__':
    main()