Subversion Repositories SmartDukaan

Rev

Rev 21135 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
17013 manish.sha 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import re
4
from dtr.utils.utils import to_java_date
5
import optparse
6
from datetime import datetime
7
import smtplib
8
from email.mime.text import MIMEText
9
from email.mime.multipart import MIMEMultipart
10
from dtr.utils.utils import fetchResponseUsingProxy, get_mongo_connection, ungzipResponse
11
import json
12
import urllib
17105 manish.sha 13
import chardet
21135 kshitij.so 14
from shop2020.utils.EmailAttachmentSender import get_attachment_part
15
from shop2020.utils import EmailAttachmentSender
17013 manish.sha 16
 
17
 
18
# Module-level placeholder; nothing in the visible code assigns or reads it again.
con = None

# Command-line interface: the only switch selects the Mongo host.
parser = optparse.OptionParser()
parser.add_option(
    "-m", "--m",
    dest="mongoHost",
    type="string",
    default="localhost",
    metavar="mongo_host",
    help="The HOST where the mongo server is running",
)

# Parsed at import time; the functions below read options.mongoHost.
options, args = parser.parse_args()
26
 
27
# Browser-like request headers for www.homeshop18.com.
# NOTE(review): no function visible in this file passes `headers` anywhere.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('Accept-Language', 'en-US,en;q=0.8'),
    ('Accept-Encoding', 'gzip,deflate,sdch'),
    ('Host', 'www.homeshop18.com'),
    ('Referer', 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:0/'),
])
35
 
36
 
37
# Shared module state.
#   exceptionList: scraped items with no matching master document (reported by e-mail).
#   bestSellers:   results of the most recent scrape; rebound by each scrape function.
exceptionList, bestSellers = [], []

# Timestamp captured once at import; written as 'updatedOn' for every rank update.
now = datetime.now()

# Only referenced by commented-out code in this file.
csrfValue = None
41
 
42
class __RankInfo:
    """Plain record for one scraped best-seller listing.

    Every constructor argument is stored unchanged on an attribute of the
    same name; the class performs no validation or conversion.
    """

    def __init__(self, identifier, rank, category, available_price, in_stock, thumbnail, source_product_name, marketPlaceUrl):
        # Identity and position of the listing.
        self.identifier, self.rank, self.category = identifier, rank, category
        # Commercial details.
        self.available_price, self.in_stock = available_price, in_stock
        # Presentation details.
        self.thumbnail, self.source_product_name = thumbnail, source_product_name
        self.marketPlaceUrl = marketPlaceUrl
53
 
54
def commitBestSellers(category):
55
    global exceptionList
56
    print "Rank",
57
    print '\t',
58
    print 'Identifier'
59
    for x in bestSellers:
60
        print x.rank,
61
        print '\t',
62
        print x.identifier,
63
        print '\t',
17068 kshitij.so 64
        col = list(get_mongo_connection(host=options.mongoHost).Catalog.MasterData.find({'identifier':x.identifier, 'source_id':7}))
17013 manish.sha 65
        print "count sku",len(col)
66
        print '\n'
67
        if len(col) == 0:
68
            x.category = category
69
            exceptionList.append(x)
70
        else:
17068 kshitij.so 71
            get_mongo_connection(host=options.mongoHost).Catalog.MasterData.update({'identifier':x.identifier, 'source_id':7 }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}})
17013 manish.sha 72
 
73
 
74
def scrapeBestSellerMobiles():
75
    global bestSellers
76
    rank = 0
17040 manish.sha 77
    bestSellers = []
17013 manish.sha 78
    print "Homeshop18 Best Sellers Mobiles..."
79
    for i in range(0,5):
80
        mobileCategoryUrl = 'http://www.homeshop18.com/mobile-phones/categoryid:14569/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'
81
        #mobileCategoryUrl = "http://m.homeshop18.com/search.mobi?categoryId=14569&isAjax=true&page="+str(i)+"&csrf="+csrfValue
82
        data = fetchResponseUsingProxy(mobileCategoryUrl, proxy=False )
83
        soup = BeautifulSoup(data)
84
        tags = soup.findAll("div", {'class':'inside'})
85
        for tag in tags:
86
            if not tag.has_key('id'):
87
                continue
88
            rank = rank +1
89
            if rank >100:
90
                break
91
            titleTag = tag.find('p', {'class' : 'product_title'})
92
            source_product_name = titleTag.text
17106 manish.sha 93
            '''
17105 manish.sha 94
            encoding =  chardet.detect(source_product_name)
95
            try:
96
                source_product_name = source_product_name.decode(encoding.get('encoding'))
97
            except:
98
                source_product_name = source_product_name.decode('latin-1')
17106 manish.sha 99
            '''
100
            source_product_name = source_product_name.encode('utf8')
17013 manish.sha 101
            productUrl = titleTag.find('a').get('href')
102
            productUrl = 'http://www.homeshop18.com'+str(productUrl)
103
            inStock = 1
17069 kshitij.so 104
            identifier = tag['id'].split('_')[1]
17013 manish.sha 105
            available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])
106
            thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')
17069 kshitij.so 107
            print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStock
108
            r_info = __RankInfo(identifier, rank, None, available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))
17013 manish.sha 109
            bestSellers.append(r_info)   
110
 
111
def scrapeBestSellerTablets():
112
    global bestSellers
113
    rank = 0
17040 manish.sha 114
    bestSellers = []
17013 manish.sha 115
    print "Homeshop18 Best Sellers Tablets..."
116
    for i in range(0,5):
20371 kshitij.so 117
        tabletCategoryUrl = 'http://www.homeshop18.com/tablets/categoryid:8937/search:/sort:Popularity/inStock:%28%22true%22%29/start:'+str(i*24)+'/?lazy=true'
20372 kshitij.so 118
        data = fetchResponseUsingProxy(tabletCategoryUrl, proxy=False)
17013 manish.sha 119
        soup = BeautifulSoup(data)
120
        tags = soup.findAll("div", {'class':'inside'})
121
        for tag in tags:
122
            if not tag.has_key('id'):
123
                continue
124
            rank = rank +1
125
            if rank >100:
126
                break
127
            titleTag = tag.find('p', {'class' : 'product_title'})
128
            source_product_name = titleTag.text
17106 manish.sha 129
            '''
17105 manish.sha 130
            encoding =  chardet.detect(source_product_name)
131
            try:
132
                source_product_name = source_product_name.decode(encoding.get('encoding'))
133
            except:
134
                source_product_name = source_product_name.decode('latin-1')
17106 manish.sha 135
            '''
136
            source_product_name = source_product_name.encode('utf8')
17013 manish.sha 137
            productUrl = titleTag.find('a').get('href')
138
            productUrl = 'http://www.homeshop18.com'+str(productUrl)
139
            inStock = 1
17069 kshitij.so 140
            identifier = tag['id'].split('_')[1]
17013 manish.sha 141
            available_price = long(tag.find('p',{'class':'price clearfix'}).find('b').text.split(' ')[1])
142
            thumbnail = tag.find('p',{'class':'product_image'}).find('img').get('data-original')
17069 kshitij.so 143
            print productUrl, source_product_name, thumbnail, rank, available_price, identifier, inStock
144
            r_info = __RankInfo(identifier, rank, None , available_price, inStock, thumbnail, source_product_name, "http://m.homeshop18.com/product.mobi?productId=%s"%str(identifier))
17013 manish.sha 145
            bestSellers.append(r_info)
146
 
147
def resetRanks(category_id):
    """Set ``rank`` back to 0 on every ranked source-7 document of a category.

    category_id -- value matched against the ``category_id`` field
                   (``main`` passes 3 before committing mobiles and 5 before tablets).
    """
    masterData = get_mongo_connection(host=options.mongoHost).Catalog.MasterData
    # Only documents that currently hold a positive rank are touched.
    currentlyRanked = {'rank':{'$gt':0},'source_id':7,'category_id':category_id}
    clearRank = {'$set':{'rank':0}}
    masterData.update(currentlyRanked, clearRank, upsert=False, multi=True)
17013 manish.sha 149
 
150
def sendEmail():
151
    message="""<html>
152
            <body>
153
            <h3>HomeShop18 Best Sellers not in master</h3>
154
            <table border="1" style="width:100%;">
155
            <thead>
156
            <tr><th>Identifier</th>
157
            <th>Category</th>
158
            <th>Rank</th>
159
            <th>Available_price</th>
160
            <th>In_stock</th>
161
            <th>Thumbnail</th>
162
            <th>Source_product_name</th>
163
            <th>MarketPlaceUrl</th>
164
            </tr></thead>
165
            <tbody>"""
166
    for item in exceptionList:
167
        try:
168
            message+="""<tr>
17039 manish.sha 169
            <td style="text-align:center">"""+str(item.identifier)+"""</td>
17038 manish.sha 170
            <td style="text-align:center">"""+str(item.category)+"""</td>
17013 manish.sha 171
            <td style="text-align:center">"""+str(item.rank)+"""</td>
172
            <td style="text-align:center">"""+str(item.available_price)+"""</td>
173
            <td style="text-align:center">"""+str(item.in_stock)+"""</td>
174
            <td style="text-align:center">"""+str(item.thumbnail)+"""</td>
175
            <td style="text-align:center">"""+str(item.source_product_name)+"""</td>
176
            <td style="text-align:center">"""+str(item.marketPlaceUrl)+"""</td>
177
            </tr>"""
178
        except:
179
            continue
180
    message+="""</tbody></table></body></html>"""
181
    print message
182
    #recipients = ['amit.gupta@saholic.com']
21135 kshitij.so 183
    recipients = ['kshitij.sood@saholic.com','ritesh.chauhan@saholic.com','aishwarya.singh@saholic.com']
23839 amit.gupta 184
    EmailAttachmentSender.mail_send_grid("dtr@smartdukaan.com","apikey", "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", recipients, "Homeshop18 Best Sellers",message ,[],[],[])              
21135 kshitij.so 185
 
17013 manish.sha 186
 
187
def main():
    """Scrape both categories, refresh their ranks, then mail the exceptions.

    For each category the stored ranks are reset and rewritten only when the
    scrape produced at least one item; the exception mail is sent regardless.
    """
    #getCsrfValue()
    # (scraper, category_id passed to resetRanks, label passed to commitBestSellers)
    steps = (
        (scrapeBestSellerMobiles, 3, "MOBILE"),
        (scrapeBestSellerTablets, 5, "TABLET"),
    )
    for scrape, categoryId, label in steps:
        scrape()
        # Each scraper rebinds the module-level bestSellers; skip on an empty result.
        if not bestSellers:
            continue
        resetRanks(categoryId)
        commitBestSellers(label)
    sendEmail()


if __name__ == '__main__':
    main()
202
 
203
 
204
 
205