Subversion Repositories SmartDukaan

Rev

Rev 13913 | Rev 14379 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
13754 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import pymongo
4
import re
13828 kshitij.so 5
from dtr.utils.utils import to_java_date
6
from datetime import datetime
14257 kshitij.so 7
import optparse
13754 kshitij.so 8
 
14257 kshitij.so 9
# Cached MongoClient shared by every caller of get_mongo_connection();
# stays None until a connection has been created.
con = None

# Command-line options. Parsing happens while the module is loaded, so
# `options` can be used as a default argument by get_mongo_connection().
parser = optparse.OptionParser()
parser.add_option(
    "-m", "--m",
    dest="mongoHost",
    type="string",
    default="localhost",
    metavar="mongo_host",
    help="The HOST where the mongo server is running",
)
options, args = parser.parse_args()

# Captures the first "/"-prefixed run of ten uppercase letters/digits in a URL.
asin_regex = r'/([A-Z0-9]{10})'

# Ranked items collected by the scrape functions.
bestSellers = []

# Taken once at start-up; written as 'updatedOn' by every update in this run.
now = datetime.now()
13754 kshitij.so 22
 
23
class __RankInfo:
    """Pairs a scraped product identifier with its best-seller position."""

    def __init__(self, identifier, rank):
        # identifier: the code the scrapers capture from a product URL
        # rank: running position the scrapers assign, starting at 1
        self.identifier, self.rank = identifier, rank
28
 
14257 kshitij.so 29
def get_mongo_connection(host=options.mongoHost, port=27017):
13754 kshitij.so 30
    global con
31
    if con is None:
32
        print "Establishing connection %s host and port %d" %(host,port)
33
        try:
34
            con = pymongo.MongoClient(host, port)
35
        except Exception, e:
36
            print e
37
            return None
38
    return con
39
 
40
def getSoupObject(url):
41
    print "Getting soup object for"
42
    print url
43
    global RETRY_COUNT
44
    RETRY_COUNT = 1 
45
    while RETRY_COUNT < 10:
46
        try:
47
            soup = None
48
            request = urllib2.Request(url)
49
            request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
50
            request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
51
            request.add_header('Connection','keep-alive')
52
            request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')
53
 
54
            response = urllib2.urlopen(request)   
55
            response_data = response.read()
56
            response.close()
57
            try:
58
                page=response_data.decode("utf-8")
59
                soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
60
            except:
61
                soup = BeautifulSoup(response_data,convertEntities=BeautifulSoup.HTML_ENTITIES)
62
            if soup is None:
63
                raise
64
            return soup
65
        except Exception as e:
66
            print e
67
            print "Retrying"
68
            RETRY_COUNT = RETRY_COUNT + 1
69
 
70
 
71
def scrapeBestSellerMobiles():
    """Rebuild `bestSellers` from best-seller listing 1389432031 on amazon.in.

    Pages 1 to 5 are fetched, each as an "above the fold" and a "below the
    fold" fragment. Every 'zg_itemImmersion' entry contributes one
    __RankInfo whose identifier is captured from the title link by
    asin_regex and whose rank is a running counter starting at 1.

    The global list is emptied first, so repeated calls do not accumulate.

    Raises RuntimeError when a fragment could not be downloaded, so a
    partial listing is never handed on to the database update.
    """
    global bestSellers
    bestSellers = []
    rank = 0
    for i in range(1,6):
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=1" %(i,i)
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1389432031/ref=zg_bs_1389432031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" %(i,i)
        above_soup = getSoupObject(aboveFoldUrl)
        below_soup = getSoupObject(belowFoldUrl)
        # getSoupObject returns None once its retries are exhausted.
        if above_soup is None or below_soup is None:
            raise RuntimeError("Could not fetch best seller page %d of listing 1389432031" % i)
        # Above-the-fold entries come first so the counter follows page order.
        for soup in (above_soup, below_soup):
            for x in soup.findAll('div',{'class':'zg_itemImmersion'}):
                am_url = x.find('div',{'class':'zg_title'}).find('a')['href']
                identifier = re.search(asin_regex, am_url).group(1)
                rank = rank + 1
                bestSellers.append(__RankInfo(identifier,rank))
91
 
92
def commitBestSellers():
93
    print "Rank",
94
    print '\t',
95
    print 'Identifier'
96
    for x in bestSellers:
97
        print x.rank,
98
        print '\t',
99
        print x.identifier,
100
        col = get_mongo_connection().Catalog.MasterData.find({'identifier':x.identifier.strip()})
101
        print "count sku",
102
        print '\t',
103
        print len(list(col))
13828 kshitij.so 104
        get_mongo_connection().Catalog.MasterData.update({'identifier':x.identifier.strip() }, {'$set' : {'rank':x.rank,'updatedOn':to_java_date(now)}}, multi=True)
13754 kshitij.so 105
 
106
def scrapeBestSellerTablets():
107
    global bestSellers
108
    bestSellers = []
109
    rank = 0
110
    for i in range(1,6):
111
        aboveFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1" %(i,i) 
112
        belowFoldUrl = "http://www.amazon.in/gp/bestsellers/electronics/1375458031/ref=zg_bs_1375458031_pg_%d?ie=UTF8&pg=%d&ajax=1&isAboveTheFold=0" %(i,i)
113
        above_soup = getSoupObject(aboveFoldUrl)
114
        below_soup = getSoupObject(belowFoldUrl)
115
        for x in above_soup.findAll('div',{'class':'zg_itemImmersion'}):
116
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
117
            identifier =  (re.search(asin_regex, am_url)).group(1)
118
            rank = rank + 1
13913 kshitij.so 119
            print identifier,
120
            print '\t',
121
            print rank
13754 kshitij.so 122
            r_info = __RankInfo(identifier,rank)
123
            bestSellers.append(r_info)
124
        for x in below_soup.findAll('div',{'class':'zg_itemImmersion'}):
125
            am_url =  x.find('div',{'class':'zg_title'}).find('a')['href']
126
            identifier =  (re.search(asin_regex, am_url)).group(1)
127
            rank = rank + 1
13913 kshitij.so 128
            print identifier,
129
            print '\t',
130
            print rank
13754 kshitij.so 131
            r_info = __RankInfo(identifier,rank)
132
            bestSellers.append(r_info)
133
 
13913 kshitij.so 134
def resetRanks(category_id):
    """Set 'rank' back to 0 on every ranked item of one category.

    Affects Catalog.MasterData documents with rank > 0, source_id 1 and the
    given category_id; 'updatedOn' is set to the run timestamp as well.

    Done as a single multi-document update rather than fetching each
    document and updating it by _id, which gives the same end state with
    one round trip to the server.
    """
    get_mongo_connection().Catalog.MasterData.update(
        {'rank':{'$gt':0},'source_id':1,'category_id':category_id},
        {'$set' : {'rank':0,'updatedOn':to_java_date(now)}},
        multi=True)
13754 kshitij.so 138
 
139
def main():
    """Scrape each listing in turn and push its ranks to the database.

    For each scraper, if it leaves anything in `bestSellers`, the old ranks
    of the paired category id are reset and the new ranks are committed.
    An empty scrape leaves that category untouched.
    """
    # (scraper, category_id passed to resetRanks)
    jobs = (
        (scrapeBestSellerMobiles, 3),
        (scrapeBestSellerTablets, 5),
    )
    for scrape, category_id in jobs:
        scrape()
        if bestSellers:
            resetRanks(category_id)
            commitBestSellers()


if __name__ == '__main__':
    main()