Subversion Repositories SmartDukaan

Rev

Rev 1958 | Rev 2082 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 1958 Rev 1963
Line 35... Line 35...
35
 
35
 
36
  def body_callback(self, buf):
36
  def body_callback(self, buf):
37
    self.contents = self.contents + buf
37
    self.contents = self.contents + buf
38
 
38
 
39
class RankCheck:
39
class RankCheck:
40
  def __init__(self, searchStrinfMaxSleep=SEARCH_STRING_MAX_SLEEP, pageSleep=PAGE_MAX_SLEEP):
40
  def __init__(self, searchStringMaxSleep=SEARCH_STRING_MAX_SLEEP, pageSleep=PAGE_MAX_SLEEP):
41
    self.searchStrinfMaxSleep = searchStrinfMaxSleep
41
    self.searchStringMaxSleep = searchStringMaxSleep
42
    self.pageSleep = pageSleep
42
    self.pageSleep = pageSleep
43
    self.userAgent = USER_AGENT
43
    self.userAgent = USER_AGENT
44
    self.findDomain = FIND_DOMAIN
44
    self.findDomain = FIND_DOMAIN
45
    self.locale = LOCALE
45
    self.locale = LOCALE
46
    self.maxPage = MAX_PAGE
46
    self.maxPage = MAX_PAGE
Line 63... Line 63...
63
    rankRequest.setopt(pycurl.HTTPGET, 1)
63
    rankRequest.setopt(pycurl.HTTPGET, 1)
64
    rankRequest.setopt(pycurl.REFERER, '')
64
    rankRequest.setopt(pycurl.REFERER, '')
65
    return rankRequest
65
    return rankRequest
66
 
66
 
67
  def start(self):
67
  def start(self):
68
    i = 0
-
 
69
    for search_string in self.searchStrings:
68
    for search_string in self.searchStrings:
70
      i += 1
-
 
71
      if i%30 == 0:
69
      if len(self.results) >= 30:
72
        time.sleep(random.randint(60*5, 60*15)) # sleep for 5 to 15 min after 30 queries
70
        time.sleep(random.randint(60*5, 60*15)) # sleep for 5 to 15 min after 30 queries
-
 
71
        if self.db_mode:
-
 
72
          self.pushResultsToDb()
73
      self.find_google_position(search_string)
73
      self.find_google_position(search_string)
74
      if self.db_mode:
74
      if self.db_mode:
75
        time.sleep(random.randint(0, self.searchStrinfMaxSleep))
75
        time.sleep(random.randint(0, self.searchStringMaxSleep))
76
        
76
        
77
      if(len(self.results) >= 100 and self.db_mode):
-
 
78
        self.pushResultsToDb()
-
 
79
      
-
 
80
      if self.breakAfterNextItr:
77
      if self.breakAfterNextItr:
81
        break
78
        break
82
    
79
    
83
    if(len(self.results) > 0 and self.db_mode):
80
    if(len(self.results) > 0 and self.db_mode):
84
      self.pushResultsToDb()
81
      self.pushResultsToDb()
Line 95... Line 92...
95
    
92
    
96
    # collect the search results
93
    # collect the search results
97
    html = rankCheck.contents
94
    html = rankCheck.contents
98
    counter = page * NUM_PER_PAGE
95
    counter = page * NUM_PER_PAGE
99
    result = 0
96
    result = 0
-
 
97
    this_url = ""
100
    
98
    
101
    if html.count("Our systems have detected unusual traffic from your computer network.") > 0:
99
    if html.count("Our systems have detected unusual traffic from your computer network.") > 0:
102
      print "Blocked by Google"
100
      print "Blocked by Google"
103
      self.breakAfterNextItr = True
101
      self.breakAfterNextItr = True
104
      return -1
102
      return -1
Line 106... Line 104...
106
    for google_result in re.finditer(url, html):
104
    for google_result in re.finditer(url, html):
107
      # print m.group()
105
      # print m.group()
108
      this_url = google_result.group()
106
      this_url = google_result.group()
109
      this_url = this_url[23:]
107
      this_url = this_url[23:]
110
      counter += 1
108
      counter += 1
111
  
109
      
112
      google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + FIND_DOMAIN + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")
110
      google_url_regex = re.compile("((https?):((//))+([\w\d:#@%/;$()~_?\+-=\\\.&])*" + FIND_DOMAIN + "+([\w\d:#@%/;$()~_?\+-=\\\.&])*)")
113
      google_url_regex_result = google_url_regex.match(this_url)
111
      google_url_regex_result = google_url_regex.match(this_url)
114
      if google_url_regex_result:
112
      if google_url_regex_result:
115
        result = counter
113
        result = counter
116
        break
114
        break
117
      
115
      
118
    return result
116
    return result, this_url
119
 
117
 
120
  def find_google_position(self, search_string):
118
  def find_google_position(self, search_string):
121
    ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)
119
    ENGINE_URL = 'http://www.google' + LOCALE + '/search?q=' + search_string.replace(' ', '+') + '&num=' + str(NUM_PER_PAGE)
122
    # print ENGINE_URL
120
    # print ENGINE_URL
123
    
121
    
124
    # run curl:
122
    # run curl:
125
    for i in range(0, MAX_PAGE):
123
    for i in range(0, MAX_PAGE):
126
      position = self.search_page(i, ENGINE_URL)
124
      (position, url) = self.search_page(i, ENGINE_URL)
127
      time.sleep(random.randint(0, self.pageSleep))
125
      time.sleep(random.randint(0, self.pageSleep))
128
      
126
      
129
      if position != 0:
127
      if position != 0:
130
        break
128
        break
131
  
129
  
132
    if position ==-1:
130
    if position ==-1:
133
      return
131
      return
134
    if position == 0:
132
    if position == 0:
135
      position = NUM_PER_PAGE * MAX_PAGE
133
      position = NUM_PER_PAGE * MAX_PAGE
-
 
134
      url = ""
136
      
135
      
137
    self.results.append((position, ((position-1)/10 + 1), search_string))
136
    self.results.append((position, ((position-1)/10 + 1), url, search_string))
138
    print "{0:s}, {1:s}, {2:d}, {3:d}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, position, (position-1)/10 + 1)
137
    print "{0:s}, {1:s}, {2:d}, {3:d}, {4:s}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), search_string, position, (position-1)/10 + 1, url)
139
    
138
    
140
  def getDbConnection(self):
139
  def getDbConnection(self):
141
    return MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
140
    return MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
142
  
141
  
143
  def closeConnection(self, conn):
142
  def closeConnection(self, conn):
Line 171... Line 170...
171
      # prepare a cursor object using cursor() method
170
      # prepare a cursor object using cursor() method
172
      cursor = conn.cursor()
171
      cursor = conn.cursor()
173
       
172
       
174
      cursor.executemany (
173
      cursor.executemany (
175
          """
174
          """
176
              insert into rank(query_id, date, position, page) 
175
              insert into rank(query_id, date, position, page, url) 
177
              select id , now(), %s, %s from query where query = %s;
176
              select id , now(), %s, %s, %s from query where query = %s;
178
          """, self.results)
177
          """, self.results)
179
      conn.commit()
178
      conn.commit()
180
      self.results = []
179
      self.results = []
181
      cursor.close()
180
      cursor.close()
182
    except Exception as e:
181
    except Exception as e: