Subversion Repositories SmartDukaan

Rev

Rev 1782 | Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
1746 vikas 1
#!/usr/bin/python
2
 
3
"""
4
 
5
 This script accepts Domain, Search String and Google Locale arguments, then returns
6
 which Search String results page for the Google Locale the Domain appears on.
7
 
8
 
9
 Usage example:
10
 
11
  rankcheck {domain} {searchstring} {locale}
12
 
13
 
14
 Output example:
15
 
16
  rankcheck geekology.co.za 'bash scripting' .co.za
17
   - The domain 'geekology.co.za' is listed in position 2 (page 1) for the search 'bash+scripting' on google.co.za
18
 
19
"""
20
 
21
__author__    = "Willem van Zyl (willem@geekology.co.za)"
22
__version__   = "$Revision: 1.5 $"
23
__date__      = "$Date: 2009/02/10 12:10:24 $"
24
__license__   = "GPLv3"
25
 
26
import sys, pycurl, re
27
import datetime
28
 
29
# Search Strings to be monitored
30
SEARCH_STRINGS = ['spice qt 68']
31
# some initial setup:
32
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0)'
33
# USER_AGENT = 'Mozilla/5.0'
34
FIND_DOMAIN = 'www.saholic.com'
35
LOCALE = '.co.in'
36
MAX_PAGE = 1
37
NUM_PER_PAGE = 100
38
 
39
# define class to store result:
40
class RankCheck:
  """Accumulates the HTTP response body delivered by pycurl.

  An instance's body_callback is registered as the curl WRITEFUNCTION;
  after perform() completes, `contents` holds the full response body.
  """

  def __init__(self):
    # Start with an empty buffer; chunks are appended as they arrive.
    self.contents = ''

  def body_callback(self, chunk):
    # pycurl WRITEFUNCTION hook: called once per received chunk.
    self.contents += chunk
47
 
48
def main():
  """Run a Google rank lookup for every configured search string."""
  for query in SEARCH_STRINGS:
    find_google_position(query)
51
 
52
def init_curl(rankRequest, rankCheck):
  """Apply the standard curl options used for every Google request.

  rankRequest -- pycurl.Curl handle to configure
  rankCheck   -- RankCheck instance whose body_callback collects the body
  """
  settings = (
      (pycurl.USERAGENT, USER_AGENT),
      (pycurl.FOLLOWLOCATION, 1),      # follow HTTP redirects
      (pycurl.AUTOREFERER, 1),
      (pycurl.WRITEFUNCTION, rankCheck.body_callback),
      (pycurl.COOKIEFILE, ''),         # empty string enables in-memory cookies
      (pycurl.HTTPGET, 1),
      (pycurl.REFERER, ''),
  )
  for option, value in settings:
    rankRequest.setopt(option, value)
61
 
62
def search_page(page, page_url):
  """Fetch one Google results page and scan it for FIND_DOMAIN.

  page     -- zero-based page index; offsets the query by page * NUM_PER_PAGE
  page_url -- base search URL (without the '&start=' offset)

  Returns the 1-based absolute position of the first result whose URL
  matches FIND_DOMAIN, or 0 if the domain is not on this page.
  """
  # instantiate curl and result objects:
  rankRequest = pycurl.Curl()
  rankCheck = RankCheck();
  init_curl(rankRequest, rankCheck)
  # '&start=N' asks Google for results beginning at absolute offset N.
  rankRequest.setopt(pycurl.URL, page_url + '&start=' + str(page * NUM_PER_PAGE))
  rankRequest.perform()
  # close curl:
  rankRequest.close()

  # collect the search results
  html = rankCheck.contents
  # counter tracks the absolute result position across pages.
  counter = page*NUM_PER_PAGE
  result = 0

  # Matches Google's result anchors: '<h3 class="r"><a href="' plus the URL.
  # NOTE(review): inside the character class, '\+-=' forms a range; presumably
  # the author meant the literal characters '+', '-', '=' -- confirm intent.
  url=unicode(r'(<h3 class="r"><a href=")((https?):((//))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)')
  for google_result in re.finditer(url, html):
    # print m.group()
    this_url = google_result.group()
    # Drop the 23-character '<h3 class="r"><a href="' prefix, leaving the URL.
    this_url = this_url[23:]
    counter += 1

    # Test whether this result URL contains FIND_DOMAIN.
    # NOTE(review): FIND_DOMAIN is concatenated unescaped, so its dots match
    # any character, and the trailing '+' quantifies only the domain's last
    # character rather than the whole domain -- looks unintended; verify.
    google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + FIND_DOMAIN + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")
    google_url_regex_result = google_url_regex.match(this_url)
    if google_url_regex_result:
      result = counter
      break

  return result
91
 
92
def find_google_position(search_string):
93
  ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)
94
  # print ENGINE_URL
95
 
96
  # run curl:
97
  for i in range(0, MAX_PAGE):
98
    result = search_page(i, ENGINE_URL)
99
    if result != 0:
100
      break
101
 
102
  # show results
103
  if result == 0:
104
    print "%s, %s, %d, %d" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, NUM_PER_PAGE*MAX_PAGE, MAX_PAGE*NUM_PER_PAGE/10)
105
  else:
106
    print "%s, %s, %d, %d" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, result, result/10 + 1)
107
 
108
 
109
# Run the rank check only when executed as a script, not on import.
if __name__ == '__main__':
  main()