Subversion Repositories SmartDukaan

Rev

Rev 20172 | Rev 20320 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 20172 Rev 20319
Line 1... Line -...
1
import urllib2
-
 
2
from BeautifulSoup import BeautifulSoup
-
 
3
import pymongo
1
import pymongo
4
import re
2
import re
5
from dtr.utils.utils import to_java_date
3
from dtr.utils.utils import to_java_date
6
import optparse
4
import optparse
7
from datetime import datetime
5
from datetime import datetime
8
import smtplib
6
import smtplib
9
from email.mime.text import MIMEText
7
from email.mime.text import MIMEText
10
from email.mime.multipart import MIMEMultipart
8
from email.mime.multipart import MIMEMultipart
-
 
9
from shop2020.model.v1.catalog.script import AmazonAdvertisingApi
11
 
10
 
12
con = None
11
con = None
13
parser = optparse.OptionParser()
12
parser = optparse.OptionParser()
14
parser.add_option("-m", "--m", dest="mongoHost",
13
parser.add_option("-m", "--m", dest="mongoHost",
15
                      default="localhost",
14
                      default="localhost",
Line 25... Line 24...
25
now = datetime.now()
24
now = datetime.now()
26
 
25
 
27
 
26
 
28
class __RankInfo:
27
class __RankInfo:
29
    
28
    
30
    def __init__(self, identifier, rank, category):
29
    def __init__(self, identifier, rank, title ,category):
31
        self.identifier = identifier
30
        self.identifier = identifier
32
        self.rank  = rank
31
        self.rank  = rank
-
 
32
        self.title = title
33
        self.category = category
33
        self.category = category
34
 
34
 
35
def get_mongo_connection(host=options.mongoHost, port=27017):
35
def get_mongo_connection(host=options.mongoHost, port=27017):
36
    global con
36
    global con
37
    if con is None:
37
    if con is None:
Line 41... Line 41...
41
        except Exception, e:
41
        except Exception, e:
42
            print e
42
            print e
43
            return None
43
            return None
44
    return con
44
    return con
45
 
45
 
46
def getSoupObject(url):
-
 
47
    print "Getting soup object for"
-
 
48
    print url
-
 
49
    global RETRY_COUNT
-
 
50
    RETRY_COUNT = 1 
-
 
51
    while RETRY_COUNT < 10:
-
 
52
        try:
-
 
53
            soup = None
-
 
54
            request = urllib2.Request(url)
-
 
55
            request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
-
 
56
            request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
-
 
57
            request.add_header('Connection','keep-alive')
-
 
58
            request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')
-
 
59
        
-
 
60
            response = urllib2.urlopen(request)   
-
 
61
            response_data = response.read()
-
 
62
            response.close()
-
 
63
            try:
-
 
64
                page=response_data.decode("utf-8")
-
 
65
                soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
-
 
66
            except:
-
 
67
                soup = BeautifulSoup(response_data,convertEntities=BeautifulSoup.HTML_ENTITIES)
-
 
68
            if soup is None:
-
 
69
                raise
-
 
70
            return soup
-
 
71
        except Exception as e:
-
 
72
            print e
-
 
73
            print "Retrying"
-
 
74
            RETRY_COUNT = RETRY_COUNT + 1
-
 
75
 
46
 
76
 
47
 
77
def scrapeBestSellerMobiles():
48
def getBestSellers(browseNode, category):
78
    global bestSellers
49
    global bestSellers
79
    rank = 0
50
    rank = 1
80
    for i in range(1,6):
51
    for i in range(1,11):
81
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=1" %(i,i) 
-
 
82
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" %(i,i)
-
 
83
        above_soup = getSoupObject(aboveFoldUrl)
-
 
84
        below_soup = getSoupObject(belowFoldUrl)
-
 
85
        for x in above_soup.findAll('div',{'class':'zg_itemImmersion'}):
-
 
86
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
52
        result = AmazonAdvertisingApi.get_best_seller_rank(browseNode, i)
87
            identifier =  (re.search(asin_regex, am_url)).group(1)
-
 
88
            rank = rank + 1
53
        for x in result:
89
            r_info = __RankInfo(identifier,rank, None)
54
            r_info = __RankInfo(x['asin'], rank, x['product_name'], category)
90
            bestSellers.append(r_info)
55
            bestSellers.append(r_info)
91
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
-
 
92
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
-
 
93
            identifier =  (re.search(asin_regex, am_url)).group(1)
-
 
94
            rank = rank + 1
56
            rank = rank + 1
95
            r_info = __RankInfo(identifier,rank, None)
-
 
96
            bestSellers.append(r_info)
-
 
97
 
57
 
98
def commitBestSellers(category):
58
def commitBestSellers():
99
    global exceptionList
59
    global exceptionList
100
    print "Rank",
60
    print "Rank",
101
    print '\t',
61
    print '\t',
102
    print 'Identifier'
62
    print 'Identifier',
-
 
63
    print '\t',
-
 
64
    print 'title'
103
    for x in bestSellers:
65
    for x in bestSellers:
104
        print x.rank,
66
        print x.rank,
105
        print '\t',
67
        print '\t',
106
        print x.identifier,
68
        print x.identifier,
107
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
-
 
108
        print "count sku",
-
 
109
        print '\t',
69
        print '\t',
-
 
70
        print x.title
-
 
71
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
110
        if len(list(col)) == 0:
72
        if len(list(col)) == 0:
111
            x.category = category
-
 
112
            exceptionList.append(x)
73
            exceptionList.append(x)
113
        else:
74
        else:
114
            get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip() }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)
75
            get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip() }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)
115
        
76
        
116
def scrapeBestSellerTablets():
-
 
117
    global bestSellers
-
 
118
    bestSellers = []
-
 
119
    rank = 0
-
 
120
    for i in range(1,6):
-
 
121
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1" %(i,i) 
-
 
122
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" %(i,i)
-
 
123
        above_soup = getSoupObject(aboveFoldUrl)
-
 
124
        below_soup = getSoupObject(belowFoldUrl)
-
 
125
        for x in above_soup.findAll('div',{'class':'zg_itemImmersion'}):
-
 
126
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
-
 
127
            identifier =  (re.search(asin_regex, am_url)).group(1)
-
 
128
            rank = rank + 1
-
 
129
            print identifier,
-
 
130
            print '\t',
-
 
131
            print rank
-
 
132
            r_info = __RankInfo(identifier,rank, None)
-
 
133
            bestSellers.append(r_info)
-
 
134
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
-
 
135
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
-
 
136
            identifier =  (re.search(asin_regex, am_url)).group(1)
-
 
137
            rank = rank + 1
-
 
138
            print identifier,
-
 
139
            print '\t',
-
 
140
            print rank
-
 
141
            r_info = __RankInfo(identifier,rank, None)
-
 
142
            bestSellers.append(r_info)
-
 
143
 
-
 
144
def resetRanks(category_id):
77
def resetRanks():
145
    oldRankedItems = get_mongo_connection().Catalog.MasterData.find({'rank':{'$gt':0},'source_id':1,'category_id':category_id})
-
 
146
    for item in oldRankedItems:
-
 
147
        get_mongo_connection().Catalog.MasterData.update({'_id':item['_id']}, {'$set' : {'rank':0,'updatedOn':to_java_date(now)}}, multi=True)
78
    get_mongo_connection().Catalog.MasterData.update({'rank':{'$gt':0},'source_id':1},{'$set' : {'rank':0,'updatedOn':to_java_date(now)}}, multi=True)
148
 
79
 
149
def sendEmail():
80
def sendEmail():
150
    message="""<html>
81
    message="""<html>
151
            <body>
82
            <body>
152
            <h3>Amazon Best Sellers not in master</h3>
83
            <h3>Amazon Best Sellers not in master</h3>
153
            <table border="1" style="width:100%;">
84
            <table border="1" style="width:100%;">
154
            <thead>
85
            <thead>
155
            <tr><th>Identifier</th>
86
            <tr><th>Identifier</th>
156
            <th>Category</th>
87
            <th>Category</th>
157
            <th>Rank</th>
88
            <th>Rank</th>
-
 
89
            <th>Title</th>
158
            </tr></thead>
90
            </tr></thead>
159
            <tbody>"""
91
            <tbody>"""
160
    for item in exceptionList:
92
    for item in exceptionList:
161
        message+="""<tr>
93
        message+="""<tr>
162
        <td style="text-align:center">"""+(item.identifier)+"""</td>
94
        <td style="text-align:center">"""+(item.identifier)+"""</td>
163
        <td style="text-align:center">"""+(item.category)+"""</td>
95
        <td style="text-align:center">"""+(item.category)+"""</td>
164
        <td style="text-align:center">"""+str(item.rank)+"""</td>
96
        <td style="text-align:center">"""+str(item.rank)+"""</td>
-
 
97
        <td style="text-align:center">"""+str(item.title)+"""</td>
165
        </tr>"""
98
        </tr>"""
166
    message+="""</tbody></table></body></html>"""
99
    message+="""</tbody></table></body></html>"""
167
    print message
100
    print message
168
    #recipients = ['kshitij.sood@saholic.com']
-
 
169
    recipients = ['rajneesh.arora@saholic.com','kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','ritesh.chauhan@saholic.com','khushal.bhatia@saholic.com']
101
    recipients = ['rajneesh.arora@saholic.com','kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','ritesh.chauhan@saholic.com','khushal.bhatia@saholic.com']
170
    msg = MIMEMultipart()
102
    msg = MIMEMultipart()
171
    msg['Subject'] = "Amazon Best Sellers" + ' - ' + str(datetime.now())
103
    msg['Subject'] = "Amazon Best Sellers" + ' - ' + str(datetime.now())
172
    msg['From'] = ""
104
    msg['From'] = ""
173
    msg['To'] = ",".join(recipients)
105
    msg['To'] = ",".join(recipients)
Line 185... Line 117...
185
        print "Error: unable to send email."
117
        print "Error: unable to send email."
186
            
118
            
187
            
119
            
188
            
120
            
189
def main():
121
def main():
190
    scrapeBestSellerMobiles()
122
    getBestSellers("1389432031","Mobiles")
191
    if len(bestSellers) > 0:
-
 
192
        resetRanks(3)
-
 
193
        commitBestSellers("MOBILE")
-
 
194
    scrapeBestSellerTablets()
123
    getBestSellers("1375458031","Tablets")
195
    if len(bestSellers) > 0:
-
 
196
        resetRanks(5)
124
    resetRanks()
197
        commitBestSellers("TABLET")
125
    commitBestSellers()
198
    sendEmail()
126
    sendEmail()
199
        
127
        
200
if __name__=='__main__':
128
if __name__=='__main__':
201
    main()
129
    main()
202
130