#!/usr/bin/python

"""
This script prints the date of the run, the search string, and the position
and page number at which a Saholic result first appears in Google search
results.
"""

import sys, pycurl, re
import datetime, time
import random
import MySQLdb


# some initial setup:
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'
# USER_AGENT = 'Mozilla/5.0'
FIND_DOMAIN = 'www.saholic.com'
LOCALE = '.co.in'
MAX_PAGE = 10
NUM_PER_PAGE = 100
SEARCH_STRING_MAX_SLEEP = 120  # seconds
PAGE_MAX_SLEEP = 20  # seconds
DB_HOST = "localhost"
DB_USER = "root"
DB_PASSWORD = "shop2020"
DB_NAME = "serp"
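
# Assumed MySQL schema, inferred from the SQL in loadSearchStringsFromDb and
# pushResultsToDb below (the actual DDL is not part of this file):
#   CREATE TABLE query (id INT PRIMARY KEY, query VARCHAR(255), is_active TINYINT);
#   CREATE TABLE rank (query_id INT, date DATETIME, position INT, page INT, url VARCHAR(512));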

# define class to store the HTTP response body:
class ResponseStorage:
  def __init__(self):
    self.contents = ''

  def body_callback(self, buf):
    # pycurl calls this once per received chunk of the response body.
    self.contents = self.contents + buf
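
# Minimal usage sketch for the callback pattern above (the URL is only an
# illustration):
#   storage = ResponseStorage()
#   c = pycurl.Curl()
#   c.setopt(pycurl.URL, 'http://www.example.com/')
#   c.setopt(pycurl.WRITEFUNCTION, storage.body_callback)
#   c.perform()
#   c.close()
#   html = storage.contents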

class RankCheck:
  def __init__(self, searchStringMaxSleep=SEARCH_STRING_MAX_SLEEP, pageSleep=PAGE_MAX_SLEEP):
    self.searchStringMaxSleep = searchStringMaxSleep
    self.pageSleep = pageSleep
    self.userAgent = USER_AGENT
    self.findDomain = FIND_DOMAIN
    self.locale = LOCALE
    self.maxPage = MAX_PAGE
    self.numPerPage = NUM_PER_PAGE
    self.searchStrings = []
    self.results = []
    self.db_mode = False
    self.breakAfterNextItr = False

  def addSearchStrings(self, searchStrings):
    self.searchStrings.extend(searchStrings)

  def init_curl(self):
    # set up curl (use the instance attribute so per-instance overrides take effect):
    rankRequest = pycurl.Curl()
    rankRequest.setopt(pycurl.USERAGENT, self.userAgent)
    rankRequest.setopt(pycurl.FOLLOWLOCATION, 1)
    rankRequest.setopt(pycurl.AUTOREFERER, 1)
    rankRequest.setopt(pycurl.COOKIEFILE, '')  # enable in-memory cookie handling
    rankRequest.setopt(pycurl.HTTPGET, 1)
    rankRequest.setopt(pycurl.REFERER, '')
    return rankRequest

  def start(self):
    for search_string in self.searchStrings:
      if len(self.results) >= 30:
        time.sleep(random.randint(60*5, 60*15))  # sleep for 5 to 15 min after 30 results
        if self.db_mode:
          self.pushResultsToDb()
      self.find_google_position(search_string)
      if self.db_mode:
        time.sleep(random.randint(0, self.searchStringMaxSleep))

      if self.breakAfterNextItr:
        break

    if len(self.results) > 0 and self.db_mode:
      self.pushResultsToDb()

  def search_page(self, page, page_url):
    # instantiate curl and result objects:
    rankCheck = ResponseStorage()
    rankRequest = self.init_curl()
    rankRequest.setopt(pycurl.WRITEFUNCTION, rankCheck.body_callback)
    rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(page * self.numPerPage))
    rankRequest.perform()
    # close curl:
    rankRequest.close()

    # collect the search results
    html = rankCheck.contents
    counter = page * self.numPerPage
    result = 0
    this_url = ""

    if html.count("Our systems have detected unusual traffic from your computer network.") > 0:
      print "Blocked by Google"
      self.breakAfterNextItr = True
      return -1, ""
    url = unicode(r'(<h3 class="r"><a href=")((https?):((//))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
    for google_result in re.finditer(url, html):
      # keep only the URL by stripping the 23-character '<h3 class="r"><a href="' prefix:
      this_url = google_result.group()
      this_url = this_url[23:]
      counter += 1

      google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + self.findDomain + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")
      google_url_regex_result = google_url_regex.match(this_url)
      if google_url_regex_result:
        result = counter
        break

    if result == 0:
      # no match on this page; dump the HTML for later inspection
      f = open('/var/log/rankdumper/rankdumper-' + datetime.datetime.now().strftime("%Y-%m-%d") + '.log', 'a')
      f.write(html + "\n")
      f.close()

    return result, this_url

  def find_google_position(self, search_string):
    ENGINE_URL = 'http://www.google' + self.locale + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(self.numPerPage)
    # print ENGINE_URL

    # run curl:
    for i in range(0, self.maxPage):
      (position, url) = self.search_page(i, ENGINE_URL)
      time.sleep(random.randint(0, self.pageSleep))

      if position != 0:
        break

    if position == -1:
      # blocked by Google; record nothing for this query
      return
    if position == 0:
      # not found within maxPage pages; record the worst possible position
      position = self.numPerPage * self.maxPage
      url = ""

    # the page number assumes Google's default of 10 results per page:
    self.results.append((position, ((position-1)/10 + 1), url, search_string))
    print "{0:s}, {1:s}, {2:d}, {3:d}, {4:s}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, position, (position-1)/10 + 1, url)

  def getDbConnection(self):
    return MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)

  def closeConnection(self, conn):
    conn.close()

  def loadSearchStringsFromDb(self):
    conn = self.getDbConnection()

    # Select the active queries scheduled for today; the id/day-of-week parity
    # match runs each half of the query set on alternate days.
    # sql = "SELECT query FROM query where is_active = 1"
    sql = "SELECT query FROM query where is_active = 1 and id%2 = dayofweek(now())%2"
    try:
      # prepare a cursor object using cursor() method
      cursor = conn.cursor()
      # Execute the SQL command
      cursor.execute(sql)
      # Fetch all the rows as a list of tuples.
      results = cursor.fetchall()
      for row in results:
        self.searchStrings.append(row[0])
      cursor.close()
    except Exception as e:
      print "Error: unable to fetch data"
      print e

    self.closeConnection(conn)

  def pushResultsToDb(self):
    conn = self.getDbConnection()

    try:
      # prepare a cursor object using cursor() method
      cursor = conn.cursor()

      # each result tuple is (position, page, url, search_string); the query id
      # is resolved by matching on the query text:
      cursor.executemany(
          """
              insert into rank(query_id, date, position, page, url)
              select id, now(), %s, %s, %s from query where query = %s
          """, self.results)
      conn.commit()
      self.results = []
      cursor.close()
    except Exception as e:
      print "Error: unable to insert rows"
      print e

    self.closeConnection(conn)


def main():
  if len(sys.argv) > 1:
    # command-line mode: no sleeps, search strings taken from the arguments
    rank_checker = RankCheck(0, 0)
    rank_checker.addSearchStrings(sys.argv[1:])
  else:
    # db mode: load queries from the database and push results back to it
    rank_checker = RankCheck()
    rank_checker.loadSearchStringsFromDb()
    rank_checker.db_mode = True

  rank_checker.start()


if __name__ == '__main__':
  main()
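
# Usage sketch (the script filename here is hypothetical):
#   python rankcheck.py "nokia lumia 800 price"   # ad-hoc check, prints to stdout
#   python rankcheck.py                           # no args: db mode against the serp database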