#!/usr/bin/python

"""

 This script prints the run date, the search string, and the position and page number
 at which the first saholic.com result appears in Google search results.

"""

import sys, pycurl, re
import datetime, time
import random
import MySQLdb



# some initial setup:
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'
# USER_AGENT = 'Mozilla/5.0'
FIND_DOMAIN = 'www.saholic.com'
LOCALE = '.co.in'
MAX_PAGE = 10
NUM_PER_PAGE = 100
SEARCH_STRING_MAX_SLEEP = 120  # seconds
PAGE_MAX_SLEEP = 20  # seconds
DB_HOST = "localhost"
DB_USER = "root"
DB_PASSWORD = "shop2020"
DB_NAME = "serp"

# define class to accumulate the HTTP response body:
class ResponseStorage:
  def __init__(self):
    self.contents = ''

  def body_callback(self, buf):
    self.contents = self.contents + buf

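# RankCheck drives the whole run: it holds the search strings, queries Google one
# results page at a time, and (in db_mode) reads queries from and writes results to MySQL.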
class RankCheck:
  def __init__(self, searchStringMaxSleep=SEARCH_STRING_MAX_SLEEP, pageSleep=PAGE_MAX_SLEEP):
    self.searchStringMaxSleep = searchStringMaxSleep
    self.pageSleep = pageSleep
    self.userAgent = USER_AGENT
    self.findDomain = FIND_DOMAIN
    self.locale = LOCALE
    self.maxPage = MAX_PAGE
    self.numPerPage = NUM_PER_PAGE
    self.searchStrings = []
    self.results = []
    self.db_mode = False
    self.breakAfterNextItr = False

  def addSearchStrings(self, searchStrings):
    self.searchStrings.extend(searchStrings)  

  def init_curl(self):
    # set up curl:
    rankRequest = pycurl.Curl()
    rankRequest.setopt(pycurl.USERAGENT, USER_AGENT)
    rankRequest.setopt(pycurl.FOLLOWLOCATION, 1)
    rankRequest.setopt(pycurl.AUTOREFERER, 1)
    rankRequest.setopt(pycurl.COOKIEFILE, '')
    rankRequest.setopt(pycurl.HTTPGET, 1)
    rankRequest.setopt(pycurl.REFERER, '')
    return rankRequest

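  # Main driver loop: check each search string in turn, throttling between queries
  # and periodically flushing accumulated results to the database in db_mode.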
  def start(self):
    for search_string in self.searchStrings:
      if len(self.results) >= 30:
        time.sleep(random.randint(60*5, 60*15)) # sleep for 5 to 15 min once 30 results have accumulated
        if self.db_mode:
          self.pushResultsToDb()
      self.find_google_position(search_string)
      if self.db_mode:
        time.sleep(random.randint(0, self.searchStringMaxSleep))
        
      if self.breakAfterNextItr:
        break
    
    if len(self.results) > 0 and self.db_mode:
      self.pushResultsToDb()
  
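  # Fetch one Google results page and scan it for FIND_DOMAIN.
  # Returns (absolute position, matching URL), (0, "") if the domain is not on this
  # page, or (-1, "") if Google served its unusual-traffic block page.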
  def search_page(self, page, page_url):
    # instantiate curl and result objects:
    rankCheck = ResponseStorage()
    rankRequest = self.init_curl()
    rankRequest.setopt(pycurl.WRITEFUNCTION, rankCheck.body_callback)
    rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(page * NUM_PER_PAGE))
    rankRequest.perform()
    # close curl:
    rankRequest.close()
    
    # collect the search results
    html = rankCheck.contents
    counter = page * NUM_PER_PAGE
    result = 0
    this_url = ""
    
    if html.count("Our systems have detected unusual traffic from your computer network.") > 0:
      print "Blocked by Google"
      self.breakAfterNextItr = True
      return -1, ""
    # each organic result link looks like '<h3 class="r"><a href="http...">'
    url = unicode(r'(<h3 class="r"><a href=")((https?):((//))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
    for google_result in re.finditer(url, html):
      this_url = google_result.group()
      # strip the leading '<h3 class="r"><a href="' (23 characters) to leave just the URL
      this_url = this_url[23:]
      counter += 1
      
      google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + FIND_DOMAIN + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")
      google_url_regex_result = google_url_regex.match(this_url)
      if google_url_regex_result:
        result = counter
        break
      
    return result, this_url

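  # Query Google for one search string, walking up to MAX_PAGE result pages until
  # FIND_DOMAIN is found or the run is blocked; record and print the result.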
  def find_google_position(self, search_string):
    ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)
    # print ENGINE_URL
    
    # run curl:
    for i in range(0, MAX_PAGE):
      (position, url) = self.search_page(i, ENGINE_URL)
      time.sleep(random.randint(0, self.pageSleep))
      
      if position != 0:
        break
  
    if position == -1:
      # blocked by Google; skip recording this search string
      return
    if position == 0:
      # not found within MAX_PAGE pages; record the worst-case position
      position = NUM_PER_PAGE * MAX_PAGE
      url = ""
      
    self.results.append((position, ((position-1)/10 + 1), url, search_string))
    print "{0:s}, {1:s}, {2:d}, {3:d}, {4:s}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, position, (position-1)/10 + 1, url)
    
  def getDbConnection(self):
    return MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
  
  def closeConnection(self, conn):
    conn.close()

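  # Load the active search strings from the `query` table into self.searchStrings.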
  def loadSearchStringsFromDb(self):
    conn = self.getDbConnection()
    
    # SQL query to fetch all active search strings.
    sql = "SELECT query FROM query WHERE is_active = 1"
    try:
      # prepare a cursor object using cursor() method
      cursor = conn.cursor()
      # Execute the SQL command
      cursor.execute(sql)
      # Fetch all matching rows.
      results = cursor.fetchall()
      for row in results:
        self.searchStrings.append(row[0])
      cursor.close()
    except Exception as e:
      print "Error: unable to fetch data"
      print e
      
    self.closeConnection(conn)

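  # Bulk-insert the accumulated (position, page, url, search_string) tuples into the
  # `rank` table, using an INSERT ... SELECT to map each search string to its query id.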
  def pushResultsToDb(self):
    conn = self.getDbConnection()
    
    try:
      # prepare a cursor object using cursor() method
      cursor = conn.cursor()
       
      cursor.executemany(
          """
              insert into rank(query_id, date, position, page, url)
              select id, now(), %s, %s, %s from query where query = %s;
          """, self.results)
      conn.commit()
      self.results = []
      cursor.close()
    except Exception as e:
      print "Error: unable to insert row"
      print e
    
    self.closeConnection(conn)


  
def main():
  if len(sys.argv) > 1:
    # command-line mode: search strings come from the arguments, no sleeps, no DB writes
    rank_checker = RankCheck(0, 0)
    rank_checker.addSearchStrings(sys.argv[1:])
  else:
    # db mode: load active queries from MySQL and push results back when done
    rank_checker = RankCheck()
    rank_checker.loadSearchStringsFromDb()
    rank_checker.db_mode = True

  rank_checker.start()

if __name__ == '__main__':
  main()