#!/usr/bin/python
"""This script prints the date of run, the search string, and the position and
page number at which the first www.saholic.com result appears in Google search
results."""
import sys, pycurl, re
import datetime, time
import random
import MySQLdb

# some initial setup:
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'
# USER_AGENT = 'Mozilla/5.0'
FIND_DOMAIN = 'www.saholic.com'
LOCALE = '.co.in'
MAX_PAGE = 10
NUM_PER_PAGE = 100
SEARCH_STRING_MAX_SLEEP = 120  # seconds
PAGE_MAX_SLEEP = 20  # seconds
DB_HOST = "localhost"
DB_USER = "root"
DB_PASSWORD = "shop2020"
DB_NAME = "serp"


# class to accumulate the HTTP response body delivered by pycurl:
class ResponseStorage:
    def __init__(self):
        self.contents = ''

    def body_callback(self, buf):
        self.contents = self.contents + buf


class RankCheck:
    def __init__(self, searchStringMaxSleep=SEARCH_STRING_MAX_SLEEP,
                 pageSleep=PAGE_MAX_SLEEP):
        self.searchStringMaxSleep = searchStringMaxSleep
        self.pageSleep = pageSleep
        self.userAgent = USER_AGENT
        self.findDomain = FIND_DOMAIN
        self.locale = LOCALE
        self.maxPage = MAX_PAGE
        self.numPerPage = NUM_PER_PAGE
        self.searchStrings = []
        self.results = []
        self.db_mode = False
        self.breakAfterNextItr = False

    def addSearchStrings(self, searchStrings):
        self.searchStrings.extend(searchStrings)

    def init_curl(self):
        # set up curl:
        rankRequest = pycurl.Curl()
        rankRequest.setopt(pycurl.USERAGENT, USER_AGENT)
        rankRequest.setopt(pycurl.FOLLOWLOCATION, 1)
        rankRequest.setopt(pycurl.AUTOREFERER, 1)
        rankRequest.setopt(pycurl.COOKIEFILE, '')
        rankRequest.setopt(pycurl.HTTPGET, 1)
        rankRequest.setopt(pycurl.REFERER, '')
        return rankRequest

    def start(self):
        for search_string in self.searchStrings:
            if len(self.results) >= 30:
                time.sleep(random.randint(60 * 5, 60 * 15))  # sleep for 5 to 15 min after 30 queries
                if self.db_mode:
                    self.pushResultsToDb()
            self.find_google_position(search_string)
            if self.db_mode:
                time.sleep(random.randint(0, self.searchStringMaxSleep))
            if self.breakAfterNextItr:
                break
        if len(self.results) > 0 and self.db_mode:
            self.pushResultsToDb()

    def search_page(self, page, page_url):
        # instantiate curl and result objects:
        rankCheck = ResponseStorage()
        rankRequest = self.init_curl()
        rankRequest.setopt(pycurl.WRITEFUNCTION, rankCheck.body_callback)
        rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(page * NUM_PER_PAGE))
        rankRequest.perform()
        # close curl:
        rankRequest.close()

        # collect the search results
        html = rankCheck.contents
        counter = page * NUM_PER_PAGE  # absolute index of the result in the full listing
        result = 0  # 0 means "not found on this page"
        this_url = ""
        if html.count("Our systems have detected unusual traffic from your computer network.") > 0:
            print "Blocked by Google"
            self.breakAfterNextItr = True
            return -1, ""
        url = unicode(r'(<h3 class="r"><a href=")((https?):((//))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
        for google_result in re.finditer(url, html):
            # print google_result.group()
            this_url = google_result.group()
            this_url = this_url[23:]  # strip the leading '<h3 class="r"><a href="'
            counter += 1
            google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + FIND_DOMAIN + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")
            google_url_regex_result = google_url_regex.match(this_url)
            if google_url_regex_result:
                result = counter
                break
        return result, this_url

    def find_google_position(self, search_string):
        ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)
        # print ENGINE_URL
        # run curl, one results page at a time:
        for i in range(0, MAX_PAGE):
            (position, url) = self.search_page(i, ENGINE_URL)
            time.sleep(random.randint(0, self.pageSleep))
            if position != 0:
                break
        if position == -1:  # blocked by Google; record nothing
            return
        if position == 0:  # not found anywhere; record the worst possible rank
            position = NUM_PER_PAGE * MAX_PAGE
            url = ""
        self.results.append((position, (position - 1) / 10 + 1, url, search_string))
        print "{0:s}, {1:s}, {2:d}, {3:d}, {4:s}".format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
            search_string, position, (position - 1) / 10 + 1, url)

    def getDbConnection(self):
        return MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)

    def closeConnection(self, conn):
        conn.close()

    def loadSearchStringsFromDb(self):
        conn = self.getDbConnection()
        # SQL query to SELECT today's batch of active search queries; the
        # id%2 = dayofweek(now())%2 clause runs half the queries on alternate days.
        # sql = "SELECT query FROM query where is_active = 1"
        sql = "SELECT query FROM query where is_active = 1 and id%2 = dayofweek(now())%2"
        try:
            # prepare a cursor object using cursor() method
            cursor = conn.cursor()
            # Execute the SQL command
            cursor.execute(sql)
            # Fetch all the rows in a list of lists.
            results = cursor.fetchall()
            for row in results:
                self.searchStrings.append(row[0])
            cursor.close()
        except Exception as e:
            print "Error: unable to fetch data"
            print e
        self.closeConnection(conn)

    def pushResultsToDb(self):
        conn = self.getDbConnection()
        try:
            # prepare a cursor object using cursor() method
            cursor = conn.cursor()
            cursor.executemany("""insert into rank(query_id, date, position, page, url)
                select id, now(), %s, %s, %s from query where query = %s;""", self.results)
            conn.commit()
            self.results = []
            cursor.close()
        except Exception as e:
            print "Error: unable to insert row"
            print e
        self.closeConnection(conn)


def main():
    if len(sys.argv) > 1:
        # ad-hoc run: check the queries given on the command line, no sleeps, no DB
        rank_checker = RankCheck(0, 0)
        rank_checker.addSearchStrings(sys.argv[1:])
    else:
        # scheduled run: load queries from the database and write results back
        rank_checker = RankCheck()
        rank_checker.loadSearchStringsFromDb()
        rank_checker.db_mode = True
    rank_checker.start()


if __name__ == '__main__':
    main()
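# Usage (a sketch inferred from main() above; the script filename and example
# query strings are assumptions, not part of the original file):
#
#   python rankcheck.py "buy samsung galaxy" "android phones"  # ad-hoc: rank the given queries, print only
#   python rankcheck.py                                        # scheduled: load queries from the `serp` DB, store results
#
# DB mode assumes MySQL tables shaped roughly like the following; only the
# columns referenced by the SQL above are known, the rest of the schema is
# not in this file:
#
#   CREATE TABLE query (id INT PRIMARY KEY, query VARCHAR(255), is_active TINYINT);
#   CREATE TABLE rank (query_id INT, date DATETIME, position INT, page INT, url VARCHAR(512));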