Commit 9465e76b authored by Rafael Monnerat

Update stalled alarm reliability

See merge request nexedi/slapos.core!481
parents 3ae06e91 e1a3d2ad
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
</item> </item>
<item> <item>
<key> <string>cache_duration</string> </key> <key> <string>cache_duration</string> </key>
<value> <int>86400</int> </value> <value> <int>172800</int> </value>
</item> </item>
<item> <item>
<key> <string>description</string> </key> <key> <string>description</string> </key>
......
...@@ -16,7 +16,6 @@ compute_node_title = context.getTitle() ...@@ -16,7 +16,6 @@ compute_node_title = context.getTitle()
node_ticket_title = "[MONITORING] Lost contact with compute_node %s" % reference node_ticket_title = "[MONITORING] Lost contact with compute_node %s" % reference
instance_ticket_title = "[MONITORING] Compute Node %s has a stalled instance process" % reference instance_ticket_title = "[MONITORING] Compute Node %s has a stalled instance process" % reference
software_ticket_title = "[MONITORING] Compute Node %s has a stalled software process" % reference
ticket_title = node_ticket_title ticket_title = node_ticket_title
description = "" description = ""
...@@ -62,8 +61,6 @@ if not should_notify: ...@@ -62,8 +61,6 @@ if not should_notify:
if instance_list: if instance_list:
should_notify = True should_notify = True
description = "The Compute Node %s (%s) didnt process its instances for more them 24 hours" % (
compute_node_title, reference)
for instance in instance_list: for instance in instance_list:
instance_access_status = instance.getAccessStatus() instance_access_status = instance.getAccessStatus()
...@@ -73,39 +70,13 @@ if not should_notify: ...@@ -73,39 +70,13 @@ if not should_notify:
# At lest one partition contacted in the last 24h30min. # At lest one partition contacted in the last 24h30min.
last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact) last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
if (now - DateTime(instance_access_status.get('created_at'))) < 1.01: if (now - DateTime(instance_access_status.get('created_at'))) < 1.05:
should_notify = False should_notify = False
description = ""
break break
if not should_notify: if should_notify:
ticket_title = software_ticket_title description = "The Compute Node %s (%s) didnt process its instances for more them 24 hours, last contact: %s" % (
notification_message_reference = 'slapos-crm-compute_node_check_stalled_software_state.notification' context.getTitle(), context.getReference(), last_contact)
last_contact = "No Contact Information"
# Since server is contacting, check for stalled software releases processes
software_installation_list = portal.portal_catalog(
portal_type='Software Installation',
default_aggregate_uid=context.getUid(),
validation_state='validated')
if software_installation_list:
should_notify = True
description = "The Compute Node %s (%s) didnt process its software releases for more them 24 hours" % (
compute_node_title, reference)
# Test if server didnt process the internal softwares releases for more them 24h
for installation in software_installation_list:
installation_access_status = installation.getAccessStatus()
if installation_access_status.get('no_data', None):
# Ignore if there isnt any data on it
continue
last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
if (now - DateTime(installation_access_status.get('created_at'))) < 1.01:
should_notify = False
description = ""
break
if should_notify: if should_notify:
support_request = person.Base_getSupportRequestInProgress( support_request = person.Base_getSupportRequestInProgress(
......
...@@ -13,6 +13,9 @@ if context.getSimulationState() == "invalidated": ...@@ -13,6 +13,9 @@ if context.getSimulationState() == "invalidated":
if context.getPortalType() != "Support Request": if context.getPortalType() != "Support Request":
return "Not a Support Request" return "Not a Support Request"
now = DateTime()
portal = context.getPortalObject()
document = context.getAggregateValue() document = context.getAggregateValue()
if document is None: if document is None:
return True return True
...@@ -27,7 +30,34 @@ if aggregate_portal_type == "Compute Node": ...@@ -27,7 +30,34 @@ if aggregate_portal_type == "Compute Node":
return "No Contact Information" return "No Contact Information"
last_contact = DateTime(d.get('created_at')) last_contact = DateTime(d.get('created_at'))
if (DateTime() - last_contact) < 0.01: if (now - last_contact) < 0.01:
# If server has no partitions skip
compute_partition_uid_list = [
x.getUid() for x in document.contentValues(portal_type="Compute Partition")
if x.getSlapState() == 'busy']
if compute_partition_uid_list:
is_instance_stalled = True
last_contact = None
instance_list = portal.portal_catalog(
portal_type='Software Instance',
default_aggregate_uid=compute_partition_uid_list)
for instance in instance_list:
instance_access_status = instance.getAccessStatus()
if instance_access_status.get('no_data', None):
# Ignore if there isnt any data
continue
# At lest one partition contacted in the last 24h30min.
last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
if (now - DateTime(instance_access_status.get('created_at'))) < 1.05:
is_instance_stalled = False
break
if is_instance_stalled and len(instance_list):
return "Process instance stalled, last contact was %s" % last_contact
return "All OK, latest contact: %s " % last_contact return "All OK, latest contact: %s " % last_contact
else: else:
return "Problem, latest contact: %s" % last_contact return "Problem, latest contact: %s" % last_contact
...@@ -40,7 +70,7 @@ if aggregate_portal_type == "Software Installation": ...@@ -40,7 +70,7 @@ if aggregate_portal_type == "Software Installation":
if document.getSlapState() not in ["start_requested", "stop_requested"]: if document.getSlapState() not in ["start_requested", "stop_requested"]:
return "Software Installation is Destroyed." return "Software Installation is Destroyed."
d = context.getAccessStatus() d = document.getAccessStatus()
if d.get("no_data", None) == 1: if d.get("no_data", None) == 1:
return "The software release %s did not started to build on %s since %s" % \ return "The software release %s did not started to build on %s since %s" % \
(document.getUrlString(), compute_node_title, document.getCreationDate()) (document.getUrlString(), compute_node_title, document.getCreationDate())
......
...@@ -968,48 +968,6 @@ class TestSlapOSComputeNode_CheckState(TestCRMSkinsMixin): ...@@ -968,48 +968,6 @@ class TestSlapOSComputeNode_CheckState(TestCRMSkinsMixin):
self.assertEqual(event.getSource(), person.getRelativeUrl()) self.assertEqual(event.getSource(), person.getRelativeUrl())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs','return 0')
@simulate('NotificationTool_getDocumentValue',
'reference=None',
'assert reference == "slapos-crm-compute_node_check_stalled_software_state.notification", reference\n' \
'return context.restrictedTraverse(' \
'context.REQUEST["test_ComputeNode_checkState_stalled_software"])')
def test_ComputeNode_checkState_stalled_software(self):
compute_node = self._makeComputeNode(owner=self.makePerson(user=0))[0]
self._makeComplexComputeNode()
person = compute_node.getSourceAdministrationValue()
self.portal.REQUEST['test_ComputeNode_checkState_stalled_software'] = \
self._makeNotificationMessage(compute_node.getReference())
# Computer is getting access, also internal instance
compute_node.setAccessStatus("")
self.start_requested_software_instance.setAccessStatus("")
try:
self.pinDateTime(DateTime()-1.1)
self.start_requested_software_installation.setAccessStatus("")
finally:
self.unpinDateTime()
compute_node.ComputeNode_checkState()
self.tic()
ticket_title = "[MONITORING] Compute Node %s has a stalled software process" % compute_node.getReference()
ticket = self._getGeneratedSupportRequest(compute_node.getUid(), ticket_title)
self.assertNotEqual(ticket, None)
event_list = ticket.getFollowUpRelatedValueList()
self.assertEqual(len(event_list), 1)
event = event_list[0]
self.assertEqual(event.getTitle(), ticket.getTitle())
self.assertIn(compute_node.getReference(), event.getTextContent())
self.assertEqual(event.getDestination(), ticket.getSourceSection())
self.assertEqual(event.getSource(), person.getRelativeUrl())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs','return 0') @simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs','return 0')
@simulate('NotificationTool_getDocumentValue', @simulate('NotificationTool_getDocumentValue',
'reference=None', 'reference=None',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment