# Rev 21135 | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scrape Snapdeal best-seller listings and store their ranks in MongoDB.

For each category in ``categoryMap`` the script walks the best-seller
listing pages, opens every product page to collect its SUPC identifiers
(one per colour variant and per sub-attribute), resets the previously
stored ranks for that category in ``Catalog.MasterData`` and writes the
new ones.  Identifiers that are not present in the master collection are
collected and mailed out as an HTML report.

NOTE(review): the block nesting in this file was reconstructed from a
copy whose line breaks and indentation had been stripped; confirm it
against version control before deploying.
"""
import urllib2
import simplejson as json
import pymongo
from dtr.utils.utils import to_java_date
from datetime import datetime
import optparse
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from BeautifulSoup import BeautifulSoup
from dtr.utils.utils import ungzipResponse
# Import order matters: this rebinds ``json`` from simplejson to the stdlib
# module, which is what the rest of the file ends up using.
import json
import chardet
from shop2020.utils.EmailAttachmentSender import get_attachment_part
from shop2020.utils import EmailAttachmentSender

# Lazily created MongoDB client, see get_mongo_connection().
con = None

parser = optparse.OptionParser()
parser.add_option("-m", "--m", dest="mongoHost",
                  default="localhost",
                  type="string", help="The HOST where the mongo server is running",
                  metavar="mongo_host")

(options, args) = parser.parse_args()

# __Exception records for identifiers that were scraped but are missing
# from Catalog.MasterData; reported by sendEmail().
exceptionList = []
noAttributesSupc = []

# category_id -> label used in the e-mail report.
categoryMap = {3: 'Mobiles', 5: 'Tablets'}

# Browser-like request headers sent with every page fetch.
headers = {
    'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
}

# One {ppid: [__RankInfo, ...]} dict per product scraped in the current run.
bestSellers = []
now = datetime.now()
BASE_URL = "snapdeal.com/"

# source_id used in every MasterData query of this script
# (presumably the id assigned to Snapdeal -- TODO confirm).
_SOURCE_ID = 3

# Offsets requested from the listing endpoint, 20 products per request.
_PAGE_OFFSETS = (0, 20, 40, 60, 80)

# Listing URL templates; %d is the page offset.
_MOBILES_URL = "http://www.snapdeal.com/acors/json/product/get/search/175/%d/20?q=&sort=bstslr&keyword=&clickSrc=&viewType=List&lang=en&snr=false"
_TABLETS_URL = "http://www.snapdeal.com/acors/json/product/get/search/133/%d/20?sort=bstslr&keyword=&clickSrc=&viewType=List&lang=en&snr=false"


class __RankInfo:
    """Rank information for one colour variant of a product page."""

    def __init__(self, identifier, rank, category, product_name, page_url, baseColor, ppid, subAttributes, live):
        self.identifier = identifier
        self.rank = rank
        self.category = category
        self.product_name = product_name
        self.page_url = page_url
        self.baseColor = baseColor
        self.ppid = ppid
        self.subAttributes = subAttributes
        self.live = live


class __SubAttributes:
    """One sub-attribute entry (its own SUPC) below a colour variant."""

    def __init__(self, identifier, color, name, value):
        self.identifier = identifier
        self.color = color
        self.name = name
        self.value = value


class __Exception:
    """A scraped identifier that could not be found in MasterData."""

    def __init__(self, identifier, color, desc, pageurl, rank, category, product_name):
        self.identifier = identifier
        self.color = color
        self.desc = desc
        self.pageurl = pageurl
        self.rank = rank
        self.category = category
        self.product_name = product_name


def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared MongoClient, creating it on first use.

    Returns None when the client cannot be created; the error is printed.
    """
    global con
    if con is None:
        print("Establishing connection %s host and port %d" % (host, port))
        try:
            con = pymongo.MongoClient(host, port)
        except Exception as e:
            print(e)
            return None
    return con


def getSoupObject(url):
    """Fetch ``url`` and return it parsed as a BeautifulSoup document.

    The request is attempted up to nine times.  The body is decoded as
    UTF-8 when possible and parsed as raw bytes otherwise.

    Returns None when every attempt failed, so callers must check.
    """
    global RETRY_COUNT
    print("Getting soup object for")
    print(url)
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                response_data = ungzipResponse(response)
            finally:
                # Release the connection even when decompression fails.
                response.close()
            try:
                page = response_data.decode("utf-8")
                return BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Not valid UTF-8 (or parsing the decoded text failed):
                # let BeautifulSoup work on the raw bytes instead.
                return BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    return None


def fetchSupcDetails(p_url, rank, category):
    """Scrape one product page and return ``{ppid: [__RankInfo, ...]}``.

    ``ppid`` is read from the ``pppid`` input on the page, falling back to
    the last path segment of ``p_url``.  The list holds one entry per
    element of the page's ``attributesJson`` array, or a single entry
    built from ``defaultSupc`` when that array is the literal ``[]``.

    Raises whatever the lookups raise when the page could not be fetched
    or lacks the expected elements; callers treat any exception as
    "skip this product".
    """
    soup = getSoupObject(p_url)
    attributes = soup.find('div', {'id': 'attributesJson'}).string
    try:
        ppid = soup.find('input', {'id': 'pppid'})['value']
    except Exception:
        ppid = p_url[p_url.rfind('/') + 1:]
    productName = soup.find('input', {'id': 'productNamePDP'})['value']

    variants = []
    supcMap = {ppid: variants}
    if attributes == "[]":
        # No variants listed: the page carries a single default SUPC.
        supc = soup.find('div', {'id': 'defaultSupc'}).string
        variants.append(__RankInfo(supc, rank, category, productName, p_url, "", ppid, [], True))

    for product in json.loads(attributes):
        color = product['value']
        r_info = __RankInfo(product['supc'], rank, category, productName, p_url,
                            color, ppid, [], product['live'])
        variants.append(r_info)
        if product['subAttributes'] is not None:
            for subAttribute in product['subAttributes']:
                r_info.subAttributes.append(
                    __SubAttributes(subAttribute['supc'], color,
                                    subAttribute['name'], subAttribute['value']))
    return supcMap


def _scrapeBestSellers(url_template, category):
    """Rebuild the global ``bestSellers`` list from one category listing.

    ``url_template`` takes the page offset as ``%d``.  The rank starts at 1
    and advances only for products whose page was scraped successfully.
    """
    global bestSellers
    bestSellers = []
    rank = 1
    for offset in _PAGE_OFFSETS:
        soup = getSoupObject(url_template % (offset))
        if soup is None:
            # Every retry failed; skip this page instead of crashing on None.
            print("Unable to fetch listing page, skipping offset %d" % (offset))
            continue
        for product in soup.findAll('a', {'class': 'dp-widget-link'}):
            print(product['href'])
            try:
                supcMap = fetchSupcDetails(product['href'], rank, category)
            except Exception as e:
                # Best effort: one broken product page must not stop the run.
                print(e)
                continue
            print("supcMap  %s" % (supcMap,))
            bestSellers.append(supcMap)
            rank = rank + 1


def scrapeBestSellerMobiles():
    """Scrape the mobiles best-seller listing into ``bestSellers``."""
    _scrapeBestSellers(_MOBILES_URL, 3)


def scrapeBestSellerTablets():
    """Scrape the tablets best-seller listing into ``bestSellers``."""
    _scrapeBestSellers(_TABLETS_URL, 5)


def resetRanks(category):
    """Set the rank back to 0 for every ranked MasterData row of ``category``."""
    get_mongo_connection().Catalog.MasterData.update(
        {'rank': {'$gt': 0}, 'source_id': _SOURCE_ID, 'category_id': category},
        {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}},
        multi=True)


def _isInMaster(identifier):
    """Return True when MasterData has a row for ``identifier``."""
    match = get_mongo_connection().Catalog.MasterData.find_one(
        {'identifier': identifier, 'source_id': _SOURCE_ID})
    return match is not None


def _storeRank(identifier, rank):
    """Write ``rank`` to every MasterData row carrying ``identifier``."""
    get_mongo_connection().Catalog.MasterData.update(
        {'identifier': identifier, 'source_id': _SOURCE_ID},
        {'$set': {'rank': rank, 'updatedOn': to_java_date(now)}},
        multi=True)


def commitBestSellers(category):
    """Store the scraped ranks and record identifiers missing from master.

    A variant without sub-attributes that is not in MasterData is added to
    ``exceptionList`` and skipped.  Otherwise its rank is written, and each
    sub-attribute is either ranked with the parent's rank or added to
    ``exceptionList`` when it is missing from MasterData.
    """
    for bestSeller in bestSellers:
        for mapVal in bestSeller.itervalues():
            for v in mapVal:
                # Use the same stripped identifier for lookup and update so a
                # padded identifier is not found but then left unranked.
                identifier = v.identifier.strip()
                if not v.subAttributes and not _isInMaster(identifier):
                    exceptionList.append(
                        __Exception(identifier, v.baseColor, "", v.page_url,
                                    v.rank, category, v.product_name))
                    continue
                print(v.identifier)
                print(v.rank)
                _storeRank(identifier, v.rank)
                for subAttr in v.subAttributes:
                    print("Inside subattr")
                    print(vars(subAttr))
                    sub_identifier = subAttr.identifier.strip()
                    if _isInMaster(sub_identifier):
                        print(v.identifier)
                        print(v.rank)
                        _storeRank(sub_identifier, v.rank)
                    else:
                        exceptionList.append(
                            __Exception(sub_identifier, subAttr.color,
                                        subAttr.name + " " + subAttr.value,
                                        v.page_url, v.rank, category, v.product_name))
    print(exceptionList)


def sendEmail():
    """Mail an HTML table of every entry in ``exceptionList``.

    A row that cannot be rendered (for example because of an unknown
    category or a text encoding problem) is reported on stdout and left
    out; the remaining rows are still sent.
    """
    message = ('<html><body><h3>Snapdeal Best Sellers not in master</h3>'
               '<table border="1" style="width:100%;"><thead><tr>'
               '<th>Identifier</th><th>Category</th><th>Rank</th>'
               '<th>Product Name</th><th>Color</th><th>Description</th>'
               '<th>Page Url</th></tr></thead><tbody>')
    cell_open = '<td style="text-align:center">'
    for item in exceptionList:
        try:
            cells = [item.identifier, categoryMap.get(item.category),
                     str(item.rank), str(item.product_name),
                     item.color, item.desc, item.pageurl]
            row = "<tr>" + "".join(cell_open + cell + "</td>" for cell in cells) + "</tr>"
        except Exception as e:
            print("Skipping report row: %s" % (e,))
            continue
        message += row
    message += "</tbody></table></body></html>"
    print(message)
    #recipients = ['kshitij.sood@saholic.com']
    recipients = ['kshitij.sood@saholic.com', 'ritesh.chauhan@saholic.com', 'aishwarya.singh@saholic.com']
    # NOTE(review): the API key is committed in source; it should be rotated
    # and loaded from configuration instead.
    EmailAttachmentSender.mail_send_grid("dtr@smartdukaan.com", "apikey", "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", recipients, "Snapdeal Best Sellers", message, [], [], [])


def main():
    """Refresh the ranks for mobiles, then tablets, and mail the report."""
    scrapeBestSellerMobiles()
    # Only wipe the stored ranks when the scrape produced replacements.
    if len(bestSellers) > 0:
        resetRanks(3)
        commitBestSellers(3)
    scrapeBestSellerTablets()
    if len(bestSellers) > 0:
        resetRanks(5)
        commitBestSellers(5)
    print(bestSellers)
    sendEmail()


if __name__ == '__main__':
    main()