| 1746 |
vikas |
1 |
#!/usr/bin/python
|
|
|
2 |
|
|
|
3 |
"""
|
|
|
4 |
|
|
|
5 |
This script accepts Domain, Search String and Google Locale arguments, then reports
the position and results page at which the Domain appears for that Search String on the given Google Locale.
|
|
|
7 |
|
|
|
8 |
|
|
|
9 |
Usage example:
|
|
|
10 |
|
|
|
11 |
rankcheck {domain} {searchstring} {locale}
|
|
|
12 |
|
|
|
13 |
|
|
|
14 |
Output example:
|
|
|
15 |
|
|
|
16 |
rankcheck geekology.co.za 'bash scripting' .co.za
|
|
|
17 |
- The domain 'geekology.co.za' is listed in position 2 (page 1) for the search 'bash+scripting' on google.co.za
|
|
|
18 |
|
|
|
19 |
"""
|
|
|
20 |
|
|
|
21 |
__author__ = "Willem van Zyl (willem@geekology.co.za)"
|
|
|
22 |
__version__ = "$Revision: 1.5 $"
|
|
|
23 |
__date__ = "$Date: 2009/02/10 12:10:24 $"
|
|
|
24 |
__license__ = "GPLv3"
|
|
|
25 |
|
|
|
26 |
import sys, pycurl, re
|
|
|
27 |
import datetime
|
|
|
28 |
|
|
|
29 |
# --- Configuration ---------------------------------------------------------

# Search phrases whose ranking is monitored.
SEARCH_STRINGS = ['spice qt 68']

# User agent string sent with every request (alternative: 'Mozilla/5.0').
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'

# Domain to look for among the result URLs.
FIND_DOMAIN = 'www.saholic.com'

# Google country suffix, appended to 'http://www.google'.
LOCALE = '.co.in'

# Number of result pages requested per search string.
MAX_PAGE = 1

# Number of results requested per page (also the step of the 'start' offset).
NUM_PER_PAGE = 100
|
|
|
38 |
|
|
|
39 |
# define class to store result:
|
|
|
40 |
class RankCheck:
    """Accumulates the body chunks handed to ``body_callback``."""

    def __init__(self):
        # Everything received so far; empty until the first chunk arrives.
        self.contents = ''

    def body_callback(self, buf):
        """Append the chunk *buf* to ``self.contents``."""
        self.contents += buf
|
|
|
46 |
|
|
|
47 |
|
|
|
48 |
def main():
    """Report the Google position for every entry in SEARCH_STRINGS."""
    for phrase in SEARCH_STRINGS:
        find_google_position(phrase)
|
|
|
51 |
|
|
|
52 |
def init_curl(rankRequest, rankCheck):
    """Apply the standard request options to the curl handle *rankRequest*.

    The response body is routed to ``rankCheck.body_callback``. Options are
    applied in the order listed below.
    """
    options = (
        (pycurl.USERAGENT, USER_AGENT),
        (pycurl.FOLLOWLOCATION, 1),
        (pycurl.AUTOREFERER, 1),
        (pycurl.WRITEFUNCTION, rankCheck.body_callback),
        # Empty file name: per the libcurl CURLOPT_COOKIEFILE documentation
        # this turns on cookie handling without loading any cookie file.
        (pycurl.COOKIEFILE, ''),
        (pycurl.HTTPGET, 1),
        (pycurl.REFERER, ''),
    )
    for option, value in options:
        rankRequest.setopt(option, value)
|
|
|
61 |
|
|
|
62 |
def search_page(page, page_url):
    """Fetch one Google results page and locate FIND_DOMAIN in it.

    Args:
        page: zero-based index of the results page; the request's ``start``
            offset is ``page * NUM_PER_PAGE``.
        page_url: search URL to which the ``&start=`` parameter is appended.

    Returns:
        The 1-based overall position of the first result whose URL contains
        FIND_DOMAIN, or 0 when no result on this page contains it.
    """
    first_offset = page * NUM_PER_PAGE

    # instantiate curl and result objects, then run the request:
    rankRequest = pycurl.Curl()
    rankCheck = RankCheck()
    try:
        init_curl(rankRequest, rankCheck)
        rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(first_offset))
        rankRequest.perform()
    finally:
        # release the handle even when the transfer raises
        rankRequest.close()

    html = rankCheck.contents

    # Characters accepted inside a URL. Inside the class, "\+-=" is a range
    # running from '+' to '=' (it covers , - . / digits : ; < as well).
    url_chars = r'[\w\d:#@%/;$()~_?\+-=\\\.&]'

    # A result link: the heading markup followed by the captured target URL.
    result_regex = re.compile(
        r'<h3 class="r"><a href="(https?:(?://)+' + url_chars + r'*)')

    # The domain is matched literally (re.escape), so '.' in FIND_DOMAIN only
    # matches a dot. It may appear anywhere after the scheme, which also
    # accepts sub-domains and URLs that merely embed the domain.
    domain_regex = re.compile(
        r'https?:(?://)+' + url_chars + '*' + re.escape(FIND_DOMAIN))

    # Positions are numbered continuously across pages, starting at 1.
    for position, google_result in enumerate(result_regex.finditer(html),
                                             first_offset + 1):
        if domain_regex.match(google_result.group(1)):
            return position

    return 0
|
|
|
91 |
|
|
|
92 |
def find_google_position(search_string):
    """Look up and print where FIND_DOMAIN ranks for *search_string*.

    Requests up to MAX_PAGE result pages through search_page(), stopping at
    the first page that contains the domain, then prints one line of the form
    ``timestamp, search string, position, page``.

    When the domain is not found, the position printed is
    ``NUM_PER_PAGE * MAX_PAGE`` and the page is that value divided by 10.

    Args:
        search_string: the phrase to search for; spaces become '+'.
    """
    # NOTE(review): only spaces are encoded; other reserved characters in the
    # search string are sent as-is.
    engine_url = ('http://www.google' + LOCALE + '/search?q='
                  + search_string.replace(' ', '+')
                  + '&num=' + str(NUM_PER_PAGE))

    # run curl, one results page at a time:
    result = 0  # stays 0 if MAX_PAGE is 0 or the domain is never found
    for page in range(MAX_PAGE):
        result = search_page(page, engine_url)
        if result != 0:
            break

    if result == 0:
        position = NUM_PER_PAGE * MAX_PAGE
        page_number = position // 10
    else:
        position = result
        # Ten results per displayed page: positions 1-10 are page 1,
        # 11-20 are page 2, and so on.
        page_number = (result - 1) // 10 + 1

    # show results
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    print("%s, %s, %d, %d" % (timestamp, search_string, position, page_number))
|
|
|
107 |
|
|
|
108 |
|
|
|
109 |
# Run main only when executed as a script, so that importing this module
# does not trigger a search request.
if __name__ == '__main__':
    main()
|