# Rev 1785 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
#!/usr/bin/python"""This script accepts Domain, Search String and Google Locale arguments, then returnswhich Search String results page for the Google Locale the Domain appears on.Usage example:rankcheck {domain} {searchstring} {locale}Output example:rankcheck geekology.co.za 'bash scripting' .co.za- The domain 'geekology.co.za' is listed in position 2 (page 1) for the search 'bash+scripting' on google.co.za"""__author__ = "Willem van Zyl (willem@geekology.co.za)"__version__ = "$Revision: 1.5 $"__date__ = "$Date: 2009/02/10 12:10:24 $"__license__ = "GPLv3"import sys, pycurl, reimport datetime# Search Strings to be monitoredSEARCH_STRINGS = ['spice qt 68']# some initial setup:USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'# USER_AGENT = 'Mozilla/5.0'FIND_DOMAIN = 'www.saholic.com'LOCALE = '.co.in'MAX_PAGE = 1NUM_PER_PAGE = 100# define class to store result:class RankCheck:def __init__(self):self.contents = ''def body_callback(self, buf):self.contents = self.contents + bufdef main():for search_string in SEARCH_STRINGS:find_google_position(search_string)def init_curl(rankRequest, rankCheck):# set up curl:rankRequest.setopt(pycurl.USERAGENT, USER_AGENT)rankRequest.setopt(pycurl.FOLLOWLOCATION, 1)rankRequest.setopt(pycurl.AUTOREFERER, 1)rankRequest.setopt(pycurl.WRITEFUNCTION, rankCheck.body_callback)rankRequest.setopt(pycurl.COOKIEFILE, '')rankRequest.setopt(pycurl.HTTPGET, 1)rankRequest.setopt(pycurl.REFERER, '')def search_page(page, page_url):# instantiate curl and result objects:rankRequest = pycurl.Curl()rankCheck = RankCheck();init_curl(rankRequest, rankCheck)rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(page * NUM_PER_PAGE))rankRequest.perform()# close curl:rankRequest.close()# collect the search resultshtml = rankCheck.contentscounter = page*NUM_PER_PAGEresult = 0url=unicode(r'(<h3 class="r"><a href=")((https?):((//))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')for google_result in re.finditer(url, html):# print m.group()this_url = google_result.group()this_url 
= this_url[23:]counter += 1google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + FIND_DOMAIN + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")google_url_regex_result = google_url_regex.match(this_url)if google_url_regex_result:result = counterbreakreturn resultdef find_google_position(search_string):ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)# print ENGINE_URL# run curl:for i in range(0, MAX_PAGE):result = search_page(i, ENGINE_URL)if result != 0:break# show resultsif result == 0:print "%s, %s, %d, %d" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, NUM_PER_PAGE*MAX_PAGE, MAX_PAGE*NUM_PER_PAGE/10)else:print "%s, %s, %d, %d" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, result, result/10 + 1)# Run Mainmain()