Commit c611c48f authored by Julien Muchembled

admin: fix monitoring timer after 2 identical consecutive checks

This fixes a bug where, with only email notification, monitoring
stopped checking whether backup clusters are lagging once the status
was unchanged since the last check (for lagging, what is compared is
the set of lagging backups). Checks only resumed when another event
woke up monitoring.

The code is also simplified: for the moment, there is no need for
different timeouts in the normal case and after an SMTP failure.
parent 2f782572
...@@ -30,7 +30,6 @@ from neo.lib.exception import PrimaryFailure ...@@ -30,7 +30,6 @@ from neo.lib.exception import PrimaryFailure
from .handler import AdminEventHandler, BackupHandler, MasterEventHandler, \ from .handler import AdminEventHandler, BackupHandler, MasterEventHandler, \
UpstreamAdminHandler, NOT_CONNECTED_MESSAGE UpstreamAdminHandler, NOT_CONNECTED_MESSAGE
from neo.lib.bootstrap import BootstrapManager from neo.lib.bootstrap import BootstrapManager
from neo.lib.logger import INF
from neo.lib.protocol import \ from neo.lib.protocol import \
CellStates, ClusterStates, Errors, NodeTypes, Packets CellStates, ClusterStates, Errors, NodeTypes, Packets
from neo.lib.debug import register as registerLiveDebugger from neo.lib.debug import register as registerLiveDebugger
...@@ -167,7 +166,6 @@ class Application(BaseApplication, Monitor): ...@@ -167,7 +166,6 @@ class Application(BaseApplication, Monitor):
email_from = None email_from = None
self.email_from = formataddr(("NEO " + self.name, email_from)) self.email_from = formataddr(("NEO " + self.name, email_from))
self.smtp_exc = None self.smtp_exc = None
self.smtp_retry = INF
self.notifying = set() self.notifying = set()
logging.debug('IP address is %s, port is %d', *self.server) logging.debug('IP address is %s, port is %d', *self.server)
...@@ -321,10 +319,9 @@ class Application(BaseApplication, Monitor): ...@@ -321,10 +319,9 @@ class Application(BaseApplication, Monitor):
backup.monitor_changed = False backup.monitor_changed = False
changed.add(name) changed.add(name)
body = '\n'.join(body) body = '\n'.join(body)
if changed or self.smtp_retry < time(): email_list = self.email_list
logging.debug('monitor notification') while email_list: # not a loop
email_list = self.email_list if changed or self.smtp_exc:
while email_list: # not a loop
msg = MIMEText(body + (self.smtp_exc or '')) msg = MIMEText(body + (self.smtp_exc or ''))
msg['Date'] = formatdate() msg['Date'] = formatdate()
clusters, x = severity[1:] clusters, x = severity[1:]
...@@ -359,18 +356,18 @@ class Application(BaseApplication, Monitor): ...@@ -359,18 +356,18 @@ class Application(BaseApplication, Monitor):
self.smtp_exc = ( self.smtp_exc = (
"\n\nA notification could not be sent at %s:\n\n%s" "\n\nA notification could not be sent at %s:\n\n%s"
% (msg['Date'], x)) % (msg['Date'], x))
retry = self.smtp_retry = time() + 600
else: else:
self.smtp_exc = None self.smtp_exc = None
self.smtp_retry = INF # The timeout is only to check whether a backup cluster is
# lagging and for that, the main cluster and at least one
# backup cluster must be operational. Else, remain passive.
if not (self.operational and any(monitor.operational if not (self.operational and any(monitor.operational
for monitor in self.backup_dict.itervalues())): for monitor in self.backup_dict.itervalues())):
break break
retry = time() + 600
finally: finally:
s.close() s.close()
self.em.setTimeout(retry, self._notify) self.em.setTimeout(time() + 600, self._notify)
break break
neoctl = self.asking_monitor_information neoctl = self.asking_monitor_information
if neoctl: if neoctl:
del severity[my_severity][0] del severity[my_severity][0]
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment