Subversion Repositories SmartDukaan

Rev

Rev 13828 | Rev 14257 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
13754 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import pymongo
4
import re
13828 kshitij.so 5
from dtr.utils.utils import to_java_date
6
from datetime import datetime
13754 kshitij.so 7
 
8
# Pattern used by the scrapers on product URLs: a '/' followed by exactly ten
# characters from A-Z / 0-9, captured as group 1 (presumably the Amazon ASIN,
# going by the variable name).
asin_regex = r'/([A-Z0-9]{10})'
# Cached MongoDB client; populated on first use by get_mongo_connection().
con = None
# Rank entries produced by the most recent scrape function that ran.
bestSellers = []
# Captured once at import time so every update in one run gets the same
# 'updatedOn' value.
now = datetime.now()
13754 kshitij.so 12
 
13
class __RankInfo(object):
    """Pairs a product identifier with its position in a best-seller list."""

    def __init__(self, identifier, rank):
        # identifier: the string the scrapers capture from the product URL.
        self.identifier = identifier
        # rank: running position assigned by the scraper (starts at 1).
        self.rank = rank

    def __repr__(self):
        # Readable form for debugging and log output.
        return "%s(identifier=%r, rank=%r)" % (
            type(self).__name__, self.identifier, self.rank)
18
 
19
def get_mongo_connection(host='localhost', port=27017):
    """Return the shared MongoDB client, connecting on first use.

    The client is cached in the module-level ``con``. Once it is set, the
    ``host`` and ``port`` arguments are ignored and the cached client is
    returned as-is.

    Returns None (after printing the error) when creating the client raises.
    """
    global con
    if con is not None:
        return con
    print("Establishing connection %s host and port %d" % (host, port))
    try:
        con = pymongo.MongoClient(host, port)
    except Exception as e:
        # Best effort: report the failure and hand None back to the caller.
        print(e)
        return None
    return con
29
 
30
def getSoupObject(url):
    """Fetch ``url`` and return the page parsed as a BeautifulSoup document.

    The request carries browser-like headers. At most 9 attempts are made
    (the module-level RETRY_COUNT runs from 1 up to 9); any exception raised
    during an attempt is printed and the fetch is retried immediately.

    Returns the parsed soup, or None when every attempt failed.
    """
    print("Getting soup object for")
    print(url)
    global RETRY_COUNT
    RETRY_COUNT = 1
    while RETRY_COUNT < 10:
        try:
            request = urllib2.Request(url)
            request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
            request.add_header('Connection','keep-alive')
            request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')

            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
            finally:
                # Release the connection even if the read fails part-way.
                response.close()
            try:
                page = response_data.decode("utf-8")
                soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
            except Exception:
                # Decoding (or parsing the decoded text) failed: fall back to
                # giving BeautifulSoup the raw bytes.
                soup = BeautifulSoup(response_data, convertEntities=BeautifulSoup.HTML_ENTITIES)
            return soup
        except Exception as e:
            print(e)
            print("Retrying")
            RETRY_COUNT = RETRY_COUNT + 1
    # Every attempt failed; callers receive None.
    return None
59
 
60
 
61
def scrapeBestSellerMobiles():
    """Scrape the amazon.in best-seller listing 1389432031 into ``bestSellers``.

    Pages 1-5 are fetched, each as an above-the-fold and a below-the-fold
    request. Every ``zg_itemImmersion`` tile yields one ``__RankInfo`` whose
    identifier is the ``asin_regex`` capture from the tile's title link and
    whose rank is a running counter starting at 1.

    No guard is applied to the markup: a failed fetch (getSoupObject returned
    None) or a tile without the expected link/identifier raises
    (AttributeError or TypeError) and aborts the scrape.
    """
    global bestSellers
    # Start from an empty list so a repeated call, or a call made after the
    # tablet scrape, does not append to stale entries (scrapeBestSellerTablets
    # resets the list the same way).
    bestSellers = []
    rank = 0
    for i in range(1, 6):
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=1" % (i, i)
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" % (i, i)
        # Both halves are fetched before either is parsed, above-the-fold first.
        above_soup = getSoupObject(aboveFoldUrl)
        below_soup = getSoupObject(belowFoldUrl)
        for soup in (above_soup, below_soup):
            for x in soup.findAll('div', {'class': 'zg_itemImmersion'}):
                am_url = x.find('div', {'class': 'zg_title'}).find('a')['href']
                identifier = re.search(asin_regex, am_url).group(1)
                rank = rank + 1
                bestSellers.append(__RankInfo(identifier, rank))
81
 
82
def commitBestSellers():
83
    print "Rank",
84
    print '\t',
85
    print 'Identifier'
86
    for x in bestSellers:
87
        print x.rank,
88
        print '\t',
89
        print x.identifier,
90
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
91
        print "count sku",
92
        print '\t',
93
        print len(list(col))
13828 kshitij.so 94
        get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip() }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)
13754 kshitij.so 95
 
96
def scrapeBestSellerTablets():
97
    global bestSellers
98
    bestSellers = []
99
    rank = 0
100
    for i in range(1,6):
101
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1" %(i,i) 
102
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" %(i,i)
103
        above_soup = getSoupObject(aboveFoldUrl)
104
        below_soup = getSoupObject(belowFoldUrl)
105
        for x in above_soup.findAll('div',{'class':'zg_itemImmersion'}):
106
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
107
            identifier =  (re.search(asin_regex, am_url)).group(1)
108
            rank = rank + 1
13913 kshitij.so 109
            print identifier,
110
            print '\t',
111
            print rank
13754 kshitij.so 112
            r_info = __RankInfo(identifier,rank)
113
            bestSellers.append(r_info)
114
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
115
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
116
            identifier =  (re.search(asin_regex, am_url)).group(1)
117
            rank = rank + 1
13913 kshitij.so 118
            print identifier,
119
            print '\t',
120
            print rank
13754 kshitij.so 121
            r_info = __RankInfo(identifier,rank)
122
            bestSellers.append(r_info)
123
 
13913 kshitij.so 124
def resetRanks(category_id):
    """Set 'rank' back to 0 on every ranked item of one category.

    Affects the Catalog.MasterData documents with rank > 0, source_id 1 and
    the given ``category_id``; each also gets 'updatedOn' refreshed.

    The reset is a single multi-document update using the selection filter
    directly, rather than a query followed by one update per document.
    """
    ranked_in_category = {
        'rank': {'$gt': 0},
        'source_id': 1,
        'category_id': category_id,
    }
    get_mongo_connection().Catalog.MasterData.update(
        ranked_in_category,
        {'$set': {'rank': 0, 'updatedOn': to_java_date(now)}},
        multi=True)
13754 kshitij.so 128
 
129
def main():
    """Refresh stored ranks for each scraped best-seller list.

    Runs the mobile scrape (category 3) and then the tablet scrape
    (category 5). After each scrape, if ``bestSellers`` is non-empty, the
    category's existing ranks are reset and the new ones are committed.
    """
    jobs = (
        (scrapeBestSellerMobiles, 3),
        (scrapeBestSellerTablets, 5),
    )
    for scrape, category_id in jobs:
        scrape()
        # An empty result leaves the previously stored ranks untouched.
        if bestSellers:
            resetRanks(category_id)
            commitBestSellers()


if __name__ == '__main__':
    main()