Subversion Repositories SmartDukaan

Rev

Rev 14379 | Rev 20319 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
13754 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import pymongo
4
import re
13828 kshitij.so 5
from dtr.utils.utils import to_java_date
14379 kshitij.so 6
import optparse
13828 kshitij.so 7
from datetime import datetime
14379 kshitij.so 8
import smtplib
9
from email.mime.text import MIMEText
10
from email.mime.multipart import MIMEMultipart
13754 kshitij.so 11
 
14257 kshitij.so 12
# Shared pymongo client; stays None until get_mongo_connection() creates it.
con = None
13
# Command-line handling: the only option is -m/--m, which names the MongoDB
# host and defaults to "localhost".
parser = optparse.OptionParser()
14
parser.add_option("-m", "--m", dest="mongoHost",
15
                      default="localhost",
16
                      type="string", help="The HOST where the mongo server is running",
17
                      metavar="mongo_host")
18
 
19
# NOTE: sys.argv is parsed at import time, so importing this module with
# unrelated command-line flags makes optparse exit.
(options, args) = parser.parse_args()
20
 
21
 
14379 kshitij.so 22
# Entries that were scraped but have no matching MasterData document;
# filled by commitBestSellers() and reported by sendEmail().
exceptionList = []
13754 kshitij.so 23
# Captures a 10-character upper-case alphanumeric token that follows a "/"
# in a product URL; the scrapers use it as the product identifier.
asin_regex = r'/([A-Z0-9]{10})'
24
# Result of the most recent scrape, as a list of __RankInfo objects.
bestSellers = []
13828 kshitij.so 25
# Single timestamp taken at start-up; used for every 'updatedOn' written
# during this run.
now = datetime.now()
13754 kshitij.so 26
 
14379 kshitij.so 27
 
13754 kshitij.so 28
class __RankInfo(object):
    """One scraped best-seller entry.

    Derives from ``object`` so it is a new-style class on Python 2 as well.

    Attributes:
        identifier: Product identifier captured from the listing URL.
        rank: Position in the scraped listing; the scrapers start at 1.
        category: Category label. The scrapers create entries with None;
            commitBestSellers() fills it in for entries it reports.
    """

    def __init__(self, identifier, rank, category):
        self.identifier = identifier
        self.rank = rank
        self.category = category

    def __repr__(self):
        # Debug aid only; nothing in the script depends on this format.
        return "%s(identifier=%r, rank=%r, category=%r)" % (
            self.__class__.__name__, self.identifier, self.rank, self.category)
13754 kshitij.so 34
 
14257 kshitij.so 35
def get_mongo_connection(host=options.mongoHost, port=27017):
    """Return the shared MongoDB client, creating it on first use.

    The client is cached in the module-level ``con``; ``host`` and ``port``
    are only used by the call that actually creates it.

    Returns:
        The cached ``pymongo.MongoClient``, or None when creating it failed
        (the exception is printed). Nothing is cached on failure, so the
        next call tries again.
    """
    global con
    if con is not None:
        return con

    print("Establishing connection %s host and port %d" % (host, port))
    try:
        con = pymongo.MongoClient(host, port)
    except Exception as e:
        # Best effort: report the problem and let the caller see None.
        print(e)
        return None
    return con
45
 
46
def getSoupObject(url, max_attempts=9):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The request carries browser-like Accept, Accept-Language, Connection and
    User-Agent headers. The body is decoded as UTF-8 before parsing; if
    decoding or parsing the decoded text fails, the raw bytes are parsed
    instead.

    Args:
        url: Address to fetch.
        max_attempts: How many times to try before giving up. The default
            of 9 matches the previously hard-coded retry loop.

    Returns:
        The parsed document, or None when every attempt failed. Each
        failure is printed, followed by "Retrying".
    """
    print("Getting soup object for")
    print(url)
    for _ in range(max_attempts):
        try:
            request = urllib2.Request(url)
            request.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            request.add_header('Accept-Language', 'en-US,en;q=0.8,hi;q=0.6')
            request.add_header('Connection', 'keep-alive')
            request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')

            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                # Release the connection even when the read fails.
                response.close()

            try:
                page = response_data.decode("utf-8")
                return BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Not valid UTF-8 (or the decoded text did not parse):
                # let BeautifulSoup work from the raw bytes.
                return BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        except Exception as e:
            print(e)
            print("Retrying")
    # All attempts failed; callers must be prepared for None.
    return None
75
 
76
 
77
def scrapeBestSellerMobiles():
    """Scrape the amazon.in best-seller listing 1389432031 into ``bestSellers``.

    Pages 1-5 are fetched, each as an above-the-fold and a below-the-fold
    request. Every ``zg_itemImmersion`` entry is ranked in the order it is
    encountered, starting at 1.

    ``bestSellers`` is emptied first, as scrapeBestSellerTablets() does. If
    any request yields no document the scrape is abandoned and
    ``bestSellers`` stays empty, because carrying on would shift the rank of
    every later entry. An entry whose product link or identifier cannot be
    extracted is left out but still consumes its rank position.
    """
    global bestSellers
    bestSellers = []
    scraped = []
    rank = 0
    for i in range(1, 6):
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=1" % (i, i)
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" % (i, i)
        for fold_url in (aboveFoldUrl, belowFoldUrl):
            soup = getSoupObject(fold_url)
            if soup is None:
                print("Could not fetch %s; abandoning best seller scrape" % fold_url)
                return
            for entry in soup.findAll('div', {'class': 'zg_itemImmersion'}):
                rank = rank + 1
                title = entry.find('div', {'class': 'zg_title'})
                link = title.find('a') if title is not None else None
                href = link.get('href') if link is not None else None
                match = re.search(asin_regex, href) if href else None
                if match is None:
                    print("No product identifier for entry at rank %d; skipping" % rank)
                    continue
                scraped.append(__RankInfo(match.group(1), rank, None))
    bestSellers = scraped
97
 
14379 kshitij.so 98
def commitBestSellers(category):
    """Write the scraped ranks in ``bestSellers`` to Catalog.MasterData.

    For each entry, the identifier (stripped of surrounding whitespace) is
    looked up in MasterData:

    * no document found: the entry is tagged with ``category`` and appended
      to ``exceptionList``;
    * otherwise: ``rank`` and ``updatedOn`` are set on every document with
      that identifier.

    One line per entry is printed with the rank, the identifier and what was
    done with it.

    Args:
        category: Label stored on entries that are missing from MasterData.
    """
    print("Rank \tIdentifier")
    if not bestSellers:
        return

    # Resolve the collection once instead of once per query.
    master_data = get_mongo_connection().Catalog.MasterData
    for entry in bestSellers:
        identifier = entry.identifier.strip()
        # find_one() answers "does any document match?" without pulling
        # the whole result set into memory.
        if master_data.find_one({'identifier': identifier}) is None:
            entry.category = category
            exceptionList.append(entry)
            outcome = "not in master"
        else:
            master_data.update(
                {'identifier': identifier},
                {'$set': {'rank': entry.rank, 'updatedOn': to_java_date(now)}},
                multi=True)
            outcome = "rank updated"
        print("%s \t%s \t%s" % (entry.rank, entry.identifier, outcome))
13754 kshitij.so 115
 
116
def scrapeBestSellerTablets():
    """Scrape the amazon.in best-seller listing 1375458031 into ``bestSellers``.

    Pages 1-5 are fetched, each with two requests. Every
    ``zg_itemImmersion`` entry is ranked in the order it is encountered,
    starting at 1, and its identifier and rank are printed.

    ``bestSellers`` is emptied first. If any request yields no document the
    scrape is abandoned and ``bestSellers`` stays empty, because carrying on
    would shift the rank of every later entry. An entry whose product link
    or identifier cannot be extracted is left out but still consumes its
    rank position.
    """
    global bestSellers
    bestSellers = []
    scraped = []
    rank = 0
    for i in range(1, 6):
        # NOTE(review): unlike scrapeBestSellerMobiles(), the first URL does
        # not send isAboveTheFold=1. Kept as-is; confirm it is intentional.
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1" % (i, i)
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" % (i, i)
        for fold_url in (aboveFoldUrl, belowFoldUrl):
            soup = getSoupObject(fold_url)
            if soup is None:
                print("Could not fetch %s; abandoning best seller scrape" % fold_url)
                return
            for entry in soup.findAll('div', {'class': 'zg_itemImmersion'}):
                rank = rank + 1
                title = entry.find('div', {'class': 'zg_title'})
                link = title.find('a') if title is not None else None
                href = link.get('href') if link is not None else None
                match = re.search(asin_regex, href) if href else None
                if match is None:
                    print("No product identifier for entry at rank %d; skipping" % rank)
                    continue
                identifier = match.group(1)
                print("%s \t%s" % (identifier, rank))
                scraped.append(__RankInfo(identifier, rank, None))
    bestSellers = scraped
143
 
13913 kshitij.so 144
def resetRanks(category_id):
    """Clear the rank of every currently ranked item in one category.

    Sets ``rank`` to 0 and refreshes ``updatedOn`` on all Catalog.MasterData
    documents that have ``rank`` > 0, ``source_id`` 1 and the given
    ``category_id``.

    Args:
        category_id: Value matched against the ``category_id`` field.
    """
    # One multi-document update with the selection filter replaces the
    # earlier find-then-update-each-_id loop; the affected set is the same.
    get_mongo_connection().Catalog.MasterData.update(
        {'rank': {'$gt': 0}, 'source_id': 1, 'category_id': category_id},
        {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}},
        multi=True)
14379 kshitij.so 148
 
149
def sendEmail():
    """Mail an HTML table of the entries in ``exceptionList``.

    The report (identifier, category, rank per row) is printed and then sent
    through the SMTP server on localhost to a fixed recipient list. The mail
    is sent even when ``exceptionList`` is empty.

    Failures while connecting or sending are reported on stdout and do not
    propagate; the SMTP connection is always closed.
    """
    row_template = """<tr>
        <td style="text-align:center">%s</td>
        <td style="text-align:center">%s</td>
        <td style="text-align:center">%s</td>
        </tr>"""
    parts = ["""<html>
            <body>
            <h3>Amazon Best Sellers not in master</h3>
            <table border="1" style="width:100%;">
            <thead>
            <tr><th>Identifier</th>
            <th>Category</th>
            <th>Rank</th>
            </tr></thead>
            <tbody>"""]
    for item in exceptionList:
        # %s formatting copes with a category that was never filled in,
        # which plain string concatenation would reject.
        parts.append(row_template % (item.identifier, item.category, item.rank))
    parts.append("""</tbody></table></body></html>""")
    message = "".join(parts)
    print(message)

    sender = 'dtr@shop2020.in'
    recipients = ['rajneesh.arora@saholic.com','kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','ritesh.chauhan@saholic.com','khushal.bhatia@saholic.com']
    # Taken once so the subject and the preamble carry the same timestamp.
    subject = "Amazon Best Sellers" + ' - ' + str(datetime.now())

    msg = MIMEMultipart()
    msg['Subject'] = subject
    # The header previously went out empty; use the envelope sender.
    msg['From'] = sender
    msg['To'] = ",".join(recipients)
    msg.preamble = subject
    msg.attach(MIMEText(message, 'html'))

    smtpServer = None
    try:
        smtpServer = smtplib.SMTP('localhost')
        smtpServer.set_debuglevel(1)
        smtpServer.sendmail(sender, recipients, msg.as_string())
        print("Successfully sent email")
    except Exception as e:
        # Reporting is best effort: say what went wrong and carry on.
        print("Error: unable to send email.")
        print(e)
    finally:
        if smtpServer is not None:
            try:
                smtpServer.quit()
            except smtplib.SMTPException:
                # The server already dropped the connection; just release
                # the socket.
                smtpServer.close()
13754 kshitij.so 186
 
14379 kshitij.so 187
 
188
 
13754 kshitij.so 189
def main():
    """Refresh mobile and tablet ranks, then mail the exception report.

    For each listing: scrape it and, only if the scrape produced entries,
    reset the existing ranks of that category and commit the new ones.
    The report is sent once at the end, whatever the scrapes returned.
    """
    # (scraper, category_id passed to resetRanks, label passed to commitBestSellers)
    jobs = (
        (scrapeBestSellerMobiles, 3, "MOBILE"),
        (scrapeBestSellerTablets, 5, "TABLET"),
    )
    for scrape, category_id, label in jobs:
        scrape()
        if bestSellers:
            resetRanks(category_id)
            commitBestSellers(label)
    sendEmail()


if __name__ == '__main__':
    main()