# NOTE: the lines below are Subversion web-viewer chrome that was captured
# along with the file; commented out so the script remains valid Python.
# Subversion Repositories SmartDukaan
# Rev 1785 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

#!/usr/bin/python

"""

 This script checks which Google results page a domain appears on for a set of
 search strings.  (The domain, search strings and locale are currently set via
 the constants below; the command-line interface shown in the examples is
 historical.)


 Usage example:

  rankcheck {domain} {searchstring} {locale}


 Output example:

  rankcheck geekology.co.za 'bash scripting' .co.za
   - The domain 'geekology.co.za' is listed in position 2 (page 1) for the search 'bash+scripting' on google.co.za

"""

__author__    = "Willem van Zyl (willem@geekology.co.za)"
__version__   = "$Revision: 1.5 $"
__date__      = "$Date: 2009/02/10 12:10:24 $"
__license__   = "GPLv3"

import sys, pycurl, re
import datetime

# Search Strings to be monitored
SEARCH_STRINGS = ['spice qt 68']
# some initial setup:
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'
# USER_AGENT = 'Mozilla/5.0'
FIND_DOMAIN = 'www.saholic.com'
LOCALE = '.co.in'
MAX_PAGE = 1
NUM_PER_PAGE = 100

# define class to store result:
# Accumulator for the HTTP response body delivered chunk-by-chunk by pycurl.
class RankCheck:
  def __init__(self):
    # Everything received so far, as one string.
    self.contents = ''

  def body_callback(self, buf):
    # pycurl WRITEFUNCTION hook: append each received chunk to the buffer.
    self.contents += buf


def main():
  """Run a Google rank check for every configured search string."""
  for query in SEARCH_STRINGS:
    find_google_position(query)

def init_curl(rankRequest, rankCheck):
  """Apply the standard transfer options to a pycurl handle.

  rankRequest -- pycurl.Curl instance to configure
  rankCheck   -- RankCheck whose body_callback collects the response body
  """
  settings = (
    (pycurl.USERAGENT, USER_AGENT),
    (pycurl.FOLLOWLOCATION, 1),          # follow HTTP redirects
    (pycurl.AUTOREFERER, 1),             # set Referer automatically on redirect
    (pycurl.WRITEFUNCTION, rankCheck.body_callback),
    (pycurl.COOKIEFILE, ''),             # empty string enables in-memory cookies
    (pycurl.HTTPGET, 1),
    (pycurl.REFERER, ''),
  )
  for option, value in settings:
    rankRequest.setopt(option, value)

def search_page(page, page_url):
  """Fetch one Google results page and look for FIND_DOMAIN in the results.

  page     -- zero-based results-page index; offsets the query by
              page * NUM_PER_PAGE via Google's '&start=' parameter.
  page_url -- base search URL (already carries the q= and num= parameters).

  Returns the 1-based position of the first result whose URL contains
  FIND_DOMAIN, or 0 when the domain does not appear on this page.
  """
  # Fetch the page with curl, collecting the body into rankCheck.
  rankRequest = pycurl.Curl()
  rankCheck = RankCheck()
  init_curl(rankRequest, rankCheck)
  rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(page * NUM_PER_PAGE))
  rankRequest.perform()
  rankRequest.close()

  html = rankCheck.contents
  counter = page * NUM_PER_PAGE
  result = 0

  # Group 1 is Google's anchor markup prefix, group 2 the result URL itself.
  result_regex = re.compile(r'(<h3 class="r"><a href=")((https?):((//))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
  # Compiled once, outside the loop (the original recompiled per result).
  # re.escape() makes the dots in FIND_DOMAIN match literally; the original
  # pattern also appended a stray '+' that merely quantified the domain's
  # last character.
  domain_regex = re.compile(
      r"((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*"
      + re.escape(FIND_DOMAIN)
      + r"([\w\d:#@%/;$()~_?\+-=\\\.&])*)")

  for google_result in result_regex.finditer(html):
    # group(2) is the landing URL without the markup prefix (the original
    # used the magic slice this_url[23:], which hard-coded the prefix length).
    this_url = google_result.group(2)
    counter += 1
    if domain_regex.match(this_url):
      result = counter
      break

  return result

def find_google_position(search_string):
  ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)
  # print ENGINE_URL
  
  # run curl:
  for i in range(0, MAX_PAGE):
    result = search_page(i, ENGINE_URL)
    if result != 0:
      break

  # show results
  if result == 0:
    print "%s, %s, %d, %d" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, NUM_PER_PAGE*MAX_PAGE, MAX_PAGE*NUM_PER_PAGE/10)
  else:
    print "%s, %s, %d, %d" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, result, result/10 + 1)


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
  main()