Subversion Repositories SmartDukaan

Rev

Rev 14257 | Rev 20172 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 14257 Rev 14379
Line 1... Line 1...
1
import urllib2
1
import urllib2
2
from BeautifulSoup import BeautifulSoup
2
from BeautifulSoup import BeautifulSoup
3
import pymongo
3
import pymongo
4
import re
4
import re
5
from dtr.utils.utils import to_java_date
5
from dtr.utils.utils import to_java_date
6
from datetime import datetime
-
 
7
import optparse
6
import optparse
-
 
7
from datetime import datetime
-
 
8
import smtplib
-
 
9
from email.mime.text import MIMEText
-
 
10
from email.mime.multipart import MIMEMultipart
8
 
11
 
9
con = None
12
con = None
10
parser = optparse.OptionParser()
13
parser = optparse.OptionParser()
11
parser.add_option("-m", "--m", dest="mongoHost",
14
parser.add_option("-m", "--m", dest="mongoHost",
12
                      default="localhost",
15
                      default="localhost",
Line 14... Line 17...
14
                      metavar="mongo_host")
17
                      metavar="mongo_host")
15
 
18
 
16
(options, args) = parser.parse_args()
19
(options, args) = parser.parse_args()
17
 
20
 
18
 
21
 
-
 
22
exceptionList = []
19
asin_regex = r'/([A-Z0-9]{10})'
23
asin_regex = r'/([A-Z0-9]{10})'
20
bestSellers = []
24
bestSellers = []
21
now = datetime.now()
25
now = datetime.now()
22
 
26
 
-
 
27
 
23
class __RankInfo:
28
class __RankInfo:
24
    
29
    
25
    def __init__(self, identifier, rank):
30
    def __init__(self, identifier, rank, category):
26
        self.identifier = identifier
31
        self.identifier = identifier
27
        self.rank  = rank
32
        self.rank  = rank
-
 
33
        self.category = category
28
 
34
 
29
def get_mongo_connection(host=options.mongoHost, port=27017):
35
def get_mongo_connection(host=options.mongoHost, port=27017):
30
    global con
36
    global con
31
    if con is None:
37
    if con is None:
32
        print "Establishing connection %s host and port %d" %(host,port)
38
        print "Establishing connection %s host and port %d" %(host,port)
Line 78... Line 84...
78
        below_soup = getSoupObject(belowFoldUrl)
84
        below_soup = getSoupObject(belowFoldUrl)
79
        for x in above_soup.findAll('div',{'class':'zg_itemImmersion'}):
85
        for x in above_soup.findAll('div',{'class':'zg_itemImmersion'}):
80
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
86
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
81
            identifier =  (re.search(asin_regex, am_url)).group(1)
87
            identifier =  (re.search(asin_regex, am_url)).group(1)
82
            rank = rank + 1
88
            rank = rank + 1
83
            r_info = __RankInfo(identifier,rank)
89
            r_info = __RankInfo(identifier,rank, None)
84
            bestSellers.append(r_info)
90
            bestSellers.append(r_info)
85
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
91
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
86
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
92
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
87
            identifier =  (re.search(asin_regex, am_url)).group(1)
93
            identifier =  (re.search(asin_regex, am_url)).group(1)
88
            rank = rank + 1
94
            rank = rank + 1
89
            r_info = __RankInfo(identifier,rank)
95
            r_info = __RankInfo(identifier,rank, None)
90
            bestSellers.append(r_info)
96
            bestSellers.append(r_info)
91
 
97
 
92
def commitBestSellers():
98
def commitBestSellers(category):
-
 
99
    global exceptionList
93
    print "Rank",
100
    print "Rank",
94
    print '\t',
101
    print '\t',
95
    print 'Identifier'
102
    print 'Identifier'
96
    for x in bestSellers:
103
    for x in bestSellers:
97
        print x.rank,
104
        print x.rank,
98
        print '\t',
105
        print '\t',
99
        print x.identifier,
106
        print x.identifier,
100
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
107
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
101
        print "count sku",
108
        print "count sku",
102
        print '\t',
109
        print '\t',
103
        print len(list(col))
110
        if len(list(col)) == 0:
-
 
111
            x.category = category
-
 
112
            exceptionList.append(x)
-
 
113
        else:
104
        get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip() }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)
114
            get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip() }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)
105
        
115
        
106
def scrapeBestSellerTablets():
116
def scrapeBestSellerTablets():
107
    global bestSellers
117
    global bestSellers
108
    bestSellers = []
118
    bestSellers = []
109
    rank = 0
119
    rank = 0
Line 117... Line 127...
117
            identifier =  (re.search(asin_regex, am_url)).group(1)
127
            identifier =  (re.search(asin_regex, am_url)).group(1)
118
            rank = rank + 1
128
            rank = rank + 1
119
            print identifier,
129
            print identifier,
120
            print '\t',
130
            print '\t',
121
            print rank
131
            print rank
122
            r_info = __RankInfo(identifier,rank)
132
            r_info = __RankInfo(identifier,rank, None)
123
            bestSellers.append(r_info)
133
            bestSellers.append(r_info)
124
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
134
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
125
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
135
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
126
            identifier =  (re.search(asin_regex, am_url)).group(1)
136
            identifier =  (re.search(asin_regex, am_url)).group(1)
127
            rank = rank + 1
137
            rank = rank + 1
128
            print identifier,
138
            print identifier,
129
            print '\t',
139
            print '\t',
130
            print rank
140
            print rank
131
            r_info = __RankInfo(identifier,rank)
141
            r_info = __RankInfo(identifier,rank, None)
132
            bestSellers.append(r_info)
142
            bestSellers.append(r_info)
133
 
143
 
134
def resetRanks(category_id):
144
def resetRanks(category_id):
135
    oldRankedItems = get_mongo_connection().Catalog.MasterData.find({'rank':{'$gt':0},'source_id':1,'category_id':category_id})
145
    oldRankedItems = get_mongo_connection().Catalog.MasterData.find({'rank':{'$gt':0},'source_id':1,'category_id':category_id})
136
    for item in oldRankedItems:
146
    for item in oldRankedItems:
137
        get_mongo_connection().Catalog.MasterData.update({'_id':item['_id']}, {'$set' : {'rank':0,'updatedOn':to_java_date(now)}}, multi=True)
147
        get_mongo_connection().Catalog.MasterData.update({'_id':item['_id']}, {'$set' : {'rank':0,'updatedOn':to_java_date(now)}}, multi=True)
-
 
148
 
-
 
149
def sendEmail():
-
 
150
    message="""<html>
-
 
151
            <body>
-
 
152
            <h3>Amazon Best Sellers not in master</h3>
-
 
153
            <table border="1" style="width:100%;">
-
 
154
            <thead>
-
 
155
            <tr><th>Identifier</th>
-
 
156
            <th>Category</th>
-
 
157
            <th>Rank</th>
-
 
158
            </tr></thead>
-
 
159
            <tbody>"""
-
 
160
    for item in exceptionList:
-
 
161
        message+="""<tr>
-
 
162
        <td style="text-align:center">"""+(item.identifier)+"""</td>
-
 
163
        <td style="text-align:center">"""+(item.category)+"""</td>
-
 
164
        <td style="text-align:center">"""+str(item.rank)+"""</td>
-
 
165
        </tr>"""
-
 
166
    message+="""</tbody></table></body></html>"""
-
 
167
    print message
-
 
168
    #recipients = ['kshitij.sood@saholic.com']
-
 
169
    recipients = ['rajneesh.arora@saholic.com','kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','manoj.kumar@saholic.com']
-
 
170
    msg = MIMEMultipart()
-
 
171
    msg['Subject'] = "Amazon Best Sellers" + ' - ' + str(datetime.now())
-
 
172
    msg['From'] = ""
-
 
173
    msg['To'] = ",".join(recipients)
-
 
174
    msg.preamble = "Amazon Best Sellers" + ' - ' + str(datetime.now())
-
 
175
    html_msg = MIMEText(message, 'html')
-
 
176
    msg.attach(html_msg)
-
 
177
    
-
 
178
    smtpServer = smtplib.SMTP('localhost')
-
 
179
    smtpServer.set_debuglevel(1)
-
 
180
    sender = 'dtr@shop2020.in'
-
 
181
    try:
-
 
182
        smtpServer.sendmail(sender, recipients, msg.as_string())
-
 
183
        print "Successfully sent email"
-
 
184
    except:
-
 
185
        print "Error: unable to send email."
-
 
186
            
-
 
187
            
138
            
188
            
139
def main():
189
def main():
140
    scrapeBestSellerMobiles()
190
    scrapeBestSellerMobiles()
141
    if len(bestSellers) > 0:
191
    if len(bestSellers) > 0:
142
        resetRanks(3)
192
        resetRanks(3)
143
        commitBestSellers()
193
        commitBestSellers("MOBILE")
144
    scrapeBestSellerTablets()
194
    scrapeBestSellerTablets()
145
    if len(bestSellers) > 0:
195
    if len(bestSellers) > 0:
146
        resetRanks(5)
196
        resetRanks(5)
147
        commitBestSellers()
197
        commitBestSellers("TABLET")
-
 
198
    sendEmail()
148
        
199
        
149
if __name__=='__main__':
200
if __name__=='__main__':
150
    main()
201
    main()
151
202