#!/usr/bin/python
"""
This script prints the date of the run, the search string, and the position and
page number at which a saholic result first appears in Google search results.
"""

import sys, pycurl, re
import datetime, time
import random
import MySQLdb


# some initial setup:
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'
# USER_AGENT = 'Mozilla/5.0'
FIND_DOMAIN = 'www.saholic.com'
LOCALE = '.co.in'
MAX_PAGE = 10
NUM_PER_PAGE = 100
SEARCH_STRING_MAX_SLEEP = 120  # seconds
PAGE_MAX_SLEEP = 20  # seconds
DB_HOST = "localhost"
DB_USER = "root"
DB_PASSWORD = "shop2020"
DB_NAME = "serp"

# define class to store the response body:
class ResponseStorage:
    def __init__(self):
        self.contents = ''

    def body_callback(self, buf):
        # pycurl calls this with each chunk of the response body:
        self.contents = self.contents + buf

class RankCheck:
    def __init__(self, searchStringMaxSleep=SEARCH_STRING_MAX_SLEEP, pageSleep=PAGE_MAX_SLEEP):
        self.searchStringMaxSleep = searchStringMaxSleep
        self.pageSleep = pageSleep
        self.userAgent = USER_AGENT
        self.findDomain = FIND_DOMAIN
        self.locale = LOCALE
        self.maxPage = MAX_PAGE
        self.numPerPage = NUM_PER_PAGE
        self.searchStrings = []
        self.results = []
        self.db_mode = False
        self.breakAfterNextItr = False

    def addSearchStrings(self, searchStrings):
        self.searchStrings.extend(searchStrings)

    def init_curl(self):
        # set up curl:
        rankRequest = pycurl.Curl()
        rankRequest.setopt(pycurl.USERAGENT, USER_AGENT)
        rankRequest.setopt(pycurl.FOLLOWLOCATION, 1)
        rankRequest.setopt(pycurl.AUTOREFERER, 1)
        rankRequest.setopt(pycurl.COOKIEFILE, '')  # empty string enables in-memory cookies
        rankRequest.setopt(pycurl.HTTPGET, 1)
        rankRequest.setopt(pycurl.REFERER, '')
        return rankRequest

    def start(self):
        for search_string in self.searchStrings:
            if len(self.results) >= 30:
                # sleep for 5 to 15 min after 30 queries:
                time.sleep(random.randint(60 * 5, 60 * 15))
                if self.db_mode:
                    self.pushResultsToDb()
            self.find_google_position(search_string)
            if self.db_mode:
                time.sleep(random.randint(0, self.searchStringMaxSleep))

            if self.breakAfterNextItr:
                break

        if len(self.results) > 0 and self.db_mode:
            self.pushResultsToDb()

    def search_page(self, page, page_url):
        # instantiate curl and result objects:
        rankCheck = ResponseStorage()
        rankRequest = self.init_curl()
        rankRequest.setopt(pycurl.WRITEFUNCTION, rankCheck.body_callback)
        rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(page * NUM_PER_PAGE))
        rankRequest.perform()
        # close curl:
        rankRequest.close()

        # collect the search results
        html = rankCheck.contents
        counter = page * NUM_PER_PAGE
        result = 0
        this_url = ""

        if html.count("Our systems have detected unusual traffic from your computer network.") > 0:
            print "Blocked by Google"
            self.breakAfterNextItr = True
            return -1, ""
        url = unicode(r'(<h3 class="r"><a href=")((https?):((//))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
        for google_result in re.finditer(url, html):
            # print google_result.group()
            this_url = google_result.group()
            this_url = this_url[23:]  # strip the 23-character '<h3 class="r"><a href="' prefix
            counter += 1

            google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + FIND_DOMAIN + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")
            google_url_regex_result = google_url_regex.match(this_url)
            if google_url_regex_result:
                result = counter
                break

        if result == 0:
            # dump the page for later inspection when the domain was not found:
            f = open('/var/log/rankdumper/rankdumper-' + datetime.datetime.now().strftime("%Y-%m-%d") + '.log', 'a')
            f.write(html + "\n")
            f.close()

        return result, this_url
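
    # A hypothetical sketch of the result markup the regex above expects
    # (Google's live SERP HTML changes over time and may no longer match):
    #   <h3 class="r"><a href="http://www.saholic.com/mobile-phones">Saholic</a></h3>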

    def find_google_position(self, search_string):
        ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)
        # print ENGINE_URL
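        # e.g. for the hypothetical query "samsung galaxy" the first request is:
        #   http://www.google.co.in/search?q=samsung+galaxy&num=100&start=0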

        # run curl:
        for i in range(0, MAX_PAGE):
            (position, url) = self.search_page(i, ENGINE_URL)
            time.sleep(random.randint(0, self.pageSleep))

            if position != 0:
                break

        if position == -1:
            return
        if position == 0:
            position = NUM_PER_PAGE * MAX_PAGE
            url = ""

        # page number assumes Google's default 10 results per page,
        # e.g. position 23 -> page (23 - 1) / 10 + 1 = 3:
        self.results.append((position, (position - 1) / 10 + 1, url, search_string))
        print "{0:s}, {1:s}, {2:d}, {3:d}, {4:s}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, position, (position - 1) / 10 + 1, url)

    def getDbConnection(self):
        return MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)

    def closeConnection(self, conn):
        conn.close()

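    # A minimal sketch of the tables the two methods below assume; table and
    # column names are taken from the SQL, the column types are an assumption:
    #   CREATE TABLE query (id INT PRIMARY KEY, query VARCHAR(255), is_active TINYINT(1));
    #   CREATE TABLE rank (query_id INT, date DATETIME, position INT, page INT, url VARCHAR(1024));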
    def loadSearchStringsFromDb(self):
        conn = self.getDbConnection()

        # Prepare the SQL query to SELECT the active search strings; the
        # id/dayofweek parity check runs each half of the queries on alternate days.
        # sql = "SELECT query FROM query where is_active = 1"
        sql = "SELECT query FROM query where is_active = 1 and id%2 = dayofweek(now())%2"
        try:
            # prepare a cursor object using cursor() method
            cursor = conn.cursor()
            # Execute the SQL command
            cursor.execute(sql)
            # Fetch all the rows in a list of lists.
            results = cursor.fetchall()
            for row in results:
                self.searchStrings.append(row[0])
            cursor.close()
        except Exception as e:
            print "Error: unable to fetch data"
            print e

        self.closeConnection(conn)

    def pushResultsToDb(self):
        conn = self.getDbConnection()

        try:
            # prepare a cursor object using cursor() method
            cursor = conn.cursor()

            # each result tuple is (position, page, url, search_string):
            cursor.executemany(
                """
                insert into rank(query_id, date, position, page, url)
                select id, now(), %s, %s, %s from query where query = %s;
                """, self.results)
            conn.commit()
            self.results = []
            cursor.close()
        except Exception as e:
            print "Error: unable to insert row"
            print e

        self.closeConnection(conn)


def main():
    if len(sys.argv) > 1:
        rank_checker = RankCheck(0, 0)
        rank_checker.addSearchStrings(sys.argv[1:])
    else:
        rank_checker = RankCheck()
        rank_checker.loadSearchStringsFromDb()
        rank_checker.db_mode = True

    rank_checker.start()

if __name__ == '__main__':
    main()