Subversion Repositories SmartDukaan

Rev

Rev 20371 | Rev 21135 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
17013 manish.sha 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import re
4
from dtr.utils.utils import to_java_date
5
import optparse
6
from datetime import datetime
7
import smtplib
8
from email.mime.text import MIMEText
9
from email.mime.multipart import MIMEMultipart
10
from dtr.utils.utils import fetchResponseUsingProxy, get_mongo_connection, ungzipResponse
11
import json
12
import urllib
17105 manish.sha 13
import chardet
17013 manish.sha 14
 
15
 
16
# Module-level handle that is initialised to None; nothing visible in this
# file assigns it afterwards.
con = None

# Command-line interface: -m / --m selects the MongoDB host and falls back to
# "localhost" when the flag is omitted.
parser = optparse.OptionParser()
parser.add_option(
    "-m", "--m",
    dest="mongoHost",
    type="string",
    default="localhost",
    metavar="mongo_host",
    help="The HOST where the mongo server is running",
)

# Parsed once at import time; the functions below read options.mongoHost.
options, args = parser.parse_args()
24
 
25
# Browser-like request headers targeting www.homeshop18.com.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Host': 'www.homeshop18.com',
    'Referer': 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:0/',
}

# Shared mutable state used by the scraping / commit / e-mail functions.
exceptionList = []      # best sellers that were not found in the master catalog
bestSellers = []        # results of the most recent scrape
now = datetime.now()    # single timestamp taken when the module is loaded
csrfValue = None
39
 
40
class __RankInfo:
    """Plain record holding what was scraped for one best-seller listing.

    The constructor only stores its arguments as attributes of the same name.
    The scrapers in this file pass None for ``category``; commitBestSellers
    fills it in for entries that are missing from the master catalog.
    """

    def __init__(self, identifier, rank, category, available_price, in_stock, thumbnail, source_product_name, marketPlaceUrl):
        # Identity and position of the product in the listing.
        self.identifier = identifier
        self.rank = rank
        self.category = category
        # Commercial details taken from the product card.
        self.available_price = available_price
        self.in_stock = in_stock
        # Presentation details used in the report e-mail.
        self.thumbnail = thumbnail
        self.source_product_name = source_product_name
        self.marketPlaceUrl = marketPlaceUrl
51
 
52
def commitBestSellers(category):
53
    global exceptionList
54
    print "Rank",
55
    print '\t',
56
    print 'Identifier'
57
    for x in bestSellers:
58
        print x.rank,
59
        print '\t',
60
        print x.identifier,
61
        print '\t',
17068 kshitij.so 62
        col = list(get_mongo_connection(host=options.mongoHost).Catalog.MasterData.find({'identifier':x.identifier, 'source_id':7}))
17013 manish.sha 63
        print "count sku",len(col)
64
        print '\n'
65
        if len(col) == 0:
66
            x.category = category
67
            exceptionList.append(x)
68
        else:
17068 kshitij.so 69
            get_mongo_connection(host=options.mongoHost).Catalog.MasterData.update({'identifier':x.identifier, 'source_id':7 }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}})
17013 manish.sha 70
 
71
 
72
def scrapeBestSellerMobiles():
73
    global bestSellers
74
    rank = 0
17040 manish.sha 75
    bestSellers = []
17013 manish.sha 76
    print "Homeshop18 Best Sellers Mobiles..."
77
    for i in range(0,5):
78
        mobileCategoryUrl = 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'
79
        #mobileCategoryUrl = "http://m.homeshop18.com/search.mobi?categoryId=14569&isAjax=true&page="+str(i)+"&csrf="+csrfValue
80
        data = fetchResponseUsingProxy(mobileCategoryUrl, proxy=False )
81
        soup = BeautifulSoup(data)
82
        tags = soup.findAll("div", {'class':'inside'})
83
        for tag in tags:
84
            if not tag.has_key('id'):
85
                continue
86
            rank = rank +1
87
            if rank >100:
88
                break
89
            titleTag = tag.find('p', {'class' : 'product_title'})
90
            source_product_name = titleTag.text
17106 manish.sha 91
            '''
17105 manish.sha 92
            encoding =  chardet.detect(source_product_name)
93
            try:
94
                source_product_name = source_product_name.decode(encoding.get('encoding'))
95
            except:
96
                source_product_name = source_product_name.decode('latin-1')
17106 manish.sha 97
            '''
98
            source_product_name = source_product_name.encode('utf8')
17013 manish.sha 99
            productUrl = titleTag.find('a').get('href')
100
            productUrl = 'http://www.homeshop18.com'+str(productUrl)
101
            inStock = 1
17069 kshitij.so 102
            identifier = tag['id'].split('_')[1]
17013 manish.sha 103
            available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])
104
            thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')
17069 kshitij.so 105
            print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStock
106
            r_info = __RankInfo(identifier, rank, None, available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))
17013 manish.sha 107
            bestSellers.append(r_info)   
108
 
109
def scrapeBestSellerTablets():
110
    global bestSellers
111
    rank = 0
17040 manish.sha 112
    bestSellers = []
17013 manish.sha 113
    print "Homeshop18 Best Sellers Tablets..."
114
    for i in range(0,5):
20371 kshitij.so 115
        tabletCategoryUrl = 'http://www.homeshop18.com/tablets/categoryid:8937/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'
20372 kshitij.so 116
        data = fetchResponseUsingProxy(tabletCategoryUrl, proxy=False)
17013 manish.sha 117
        soup = BeautifulSoup(data)
118
        tags = soup.findAll("div", {'class':'inside'})
119
        for tag in tags:
120
            if not tag.has_key('id'):
121
                continue
122
            rank = rank +1
123
            if rank >100:
124
                break
125
            titleTag = tag.find('p', {'class' : 'product_title'})
126
            source_product_name = titleTag.text
17106 manish.sha 127
            '''
17105 manish.sha 128
            encoding =  chardet.detect(source_product_name)
129
            try:
130
                source_product_name = source_product_name.decode(encoding.get('encoding'))
131
            except:
132
                source_product_name = source_product_name.decode('latin-1')
17106 manish.sha 133
            '''
134
            source_product_name = source_product_name.encode('utf8')
17013 manish.sha 135
            productUrl = titleTag.find('a').get('href')
136
            productUrl = 'http://www.homeshop18.com'+str(productUrl)
137
            inStock = 1
17069 kshitij.so 138
            identifier = tag['id'].split('_')[1]
17013 manish.sha 139
            available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])
140
            thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')
17069 kshitij.so 141
            print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStock
142
            r_info = __RankInfo(identifier, rank, None , available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))
17013 manish.sha 143
            bestSellers.append(r_info)
144
 
145
def resetRanks(category_id):
    """Set ``rank`` back to 0 on every ranked source-7 document of a category.

    category_id -- value matched against the ``category_id`` field.
    """
    masterData = get_mongo_connection(host=options.mongoHost).Catalog.MasterData
    # Only documents that currently hold a positive rank are touched.
    rankedInCategory = {'rank': {'$gt': 0}, 'source_id': 7, 'category_id': category_id}
    clearRank = {'$set': {'rank': 0}}
    masterData.update(rankedInCategory, clearRank, upsert=False, multi=True)
17013 manish.sha 147
 
148
def sendEmail():
149
    message="""<html>
150
            <body>
151
            <h3>HomeShop18 Best Sellers not in master</h3>
152
            <table border="1" style="width:100%;">
153
            <thead>
154
            <tr><th>Identifier</th>
155
            <th>Category</th>
156
            <th>Rank</th>
157
            <th>Available_price</th>
158
            <th>In_stock</th>
159
            <th>Thumbnail</th>
160
            <th>Source_product_name</th>
161
            <th>MarketPlaceUrl</th>
162
            </tr></thead>
163
            <tbody>"""
164
    for item in exceptionList:
165
        try:
166
            message+="""<tr>
17039 manish.sha 167
            <td style="text-align:center">"""+str(item.identifier)+"""</td>
17038 manish.sha 168
            <td style="text-align:center">"""+str(item.category)+"""</td>
17013 manish.sha 169
            <td style="text-align:center">"""+str(item.rank)+"""</td>
170
            <td style="text-align:center">"""+str(item.available_price)+"""</td>
171
            <td style="text-align:center">"""+str(item.in_stock)+"""</td>
172
            <td style="text-align:center">"""+str(item.thumbnail)+"""</td>
173
            <td style="text-align:center">"""+str(item.source_product_name)+"""</td>
174
            <td style="text-align:center">"""+str(item.marketPlaceUrl)+"""</td>
175
            </tr>"""
176
        except:
177
            continue
178
    message+="""</tbody></table></body></html>"""
179
    print message
180
    #recipients = ['amit.gupta@saholic.com']
20172 aman.kumar 181
    recipients = ['rajneesh.arora@saholic.com', 'khushal.bhatia@saholic.com', 'kshitij.sood@saholic.com','chaitnaya.vats@saholic.com','ritesh.chauhan@saholic.com','amit.gupta@saholic.com']
17013 manish.sha 182
    msg = MIMEMultipart()
183
    msg['Subject'] = "HomeShop18 Best Sellers" + ' - ' + str(datetime.now())
184
    msg['From'] = ""
185
    msg['To'] = ",".join(recipients)
186
    msg.preamble = "HomeShop18 Best Sellers" + ' - ' + str(datetime.now())
187
    html_msg = MIMEText(message, 'html')
188
    msg.attach(html_msg)
189
 
190
    smtpServer = smtplib.SMTP('localhost')
191
    smtpServer.set_debuglevel(1)
192
    sender = 'dtr@shop2020.in'
193
    try:
194
        smtpServer.sendmail(sender, recipients, msg.as_string())
195
        print "Successfully sent email"
196
    except:
197
        print "Error: unable to send email."
198
 
199
def main():
    """Scrape, commit and report best sellers: mobiles first, then tablets.

    For each category the listing is scraped; only when the scrape produced
    results are the stored ranks of that category reset and the new ranks
    committed.  The exception report is mailed once at the end.
    """
    # (scraper, category_id passed to resetRanks, label passed to commitBestSellers)
    jobs = (
        (scrapeBestSellerMobiles, 3, "MOBILE"),
        (scrapeBestSellerTablets, 5, "TABLET"),
    )
    for scrape, category_id, label in jobs:
        scrape()
        # An empty scrape must not wipe the ranks already stored.
        if bestSellers:
            resetRanks(category_id)
            commitBestSellers(label)
    sendEmail()


if __name__ == '__main__':
    main()
214
 
215
 
216
 
217