Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
S
slapos.core
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Léo-Paul Géneau
slapos.core
Commits
9465e76b
Commit
9465e76b
authored
Jan 31, 2023
by
Rafael Monnerat
Browse files
Options
Browse Files
Download
Plain Diff
Update stalled alarm reliability
See merge request
nexedi/slapos.core!481
parents
3ae06e91
e1a3d2ad
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
53 additions
and
94 deletions
+53
-94
master/bt5/slapos_cloud/PathTemplateItem/portal_caches/access_status_data_cache_factory.xml
...teItem/portal_caches/access_status_data_cache_factory.xml
+1
-1
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
...tal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
+16
-45
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/SupportRequest_recheckMonitoring.py
...slapos_crm_monitoring/SupportRequest_recheckMonitoring.py
+36
-6
master/bt5/slapos_crm/TestTemplateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
...ateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
+0
-42
No files found.
master/bt5/slapos_cloud/PathTemplateItem/portal_caches/access_status_data_cache_factory.xml
View file @
9465e76b
...
@@ -26,7 +26,7 @@
...
@@ -26,7 +26,7 @@
</item>
</item>
<item>
<item>
<key>
<string>
cache_duration
</string>
</key>
<key>
<string>
cache_duration
</string>
</key>
<value>
<int>
864
00
</int>
</value>
<value>
<int>
1728
00
</int>
</value>
</item>
</item>
<item>
<item>
<key>
<string>
description
</string>
</key>
<key>
<string>
description
</string>
</key>
...
...
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
View file @
9465e76b
...
@@ -5,7 +5,7 @@ person = context.getSourceAdministrationValue(portal_type="Person")
...
@@ -5,7 +5,7 @@ person = context.getSourceAdministrationValue(portal_type="Person")
if
not
person
or
\
if
not
person
or
\
context
.
getMonitorScope
()
==
"disabled"
or
\
context
.
getMonitorScope
()
==
"disabled"
or
\
portal
.
ERP5Site_isSupportRequestCreationClosed
():
portal
.
ERP5Site_isSupportRequestCreationClosed
():
return
return
if
context
.
getAllocationScope
(
"open"
).
startswith
(
"close"
):
if
context
.
getAllocationScope
(
"open"
).
startswith
(
"close"
):
context
.
setMonitorScope
(
"disabled"
)
context
.
setMonitorScope
(
"disabled"
)
...
@@ -16,7 +16,6 @@ compute_node_title = context.getTitle()
...
@@ -16,7 +16,6 @@ compute_node_title = context.getTitle()
node_ticket_title
=
"[MONITORING] Lost contact with compute_node %s"
%
reference
node_ticket_title
=
"[MONITORING] Lost contact with compute_node %s"
%
reference
instance_ticket_title
=
"[MONITORING] Compute Node %s has a stalled instance process"
%
reference
instance_ticket_title
=
"[MONITORING] Compute Node %s has a stalled instance process"
%
reference
software_ticket_title
=
"[MONITORING] Compute Node %s has a stalled software process"
%
reference
ticket_title
=
node_ticket_title
ticket_title
=
node_ticket_title
description
=
""
description
=
""
...
@@ -54,58 +53,30 @@ if not should_notify:
...
@@ -54,58 +53,30 @@ if not should_notify:
compute_partition_uid_list
=
[
compute_partition_uid_list
=
[
x
.
getUid
()
for
x
in
context
.
contentValues
(
portal_type
=
"Compute Partition"
)
x
.
getUid
()
for
x
in
context
.
contentValues
(
portal_type
=
"Compute Partition"
)
if
x
.
getSlapState
()
==
'busy'
]
if
x
.
getSlapState
()
==
'busy'
]
if
compute_partition_uid_list
:
if
compute_partition_uid_list
:
instance_list
=
portal
.
portal_catalog
(
instance_list
=
portal
.
portal_catalog
(
portal_type
=
'Software Instance'
,
portal_type
=
'Software Instance'
,
default_aggregate_uid
=
compute_partition_uid_list
)
default_aggregate_uid
=
compute_partition_uid_list
)
if
instance_list
:
if
instance_list
:
should_notify
=
True
should_notify
=
True
description
=
"The Compute Node %s (%s) didnt process its instances for more them 24 hours"
%
(
compute_node_title
,
reference
)
for
instance
in
instance_list
:
for
instance
in
instance_list
:
instance_access_status
=
instance
.
getAccessStatus
()
instance_access_status
=
instance
.
getAccessStatus
()
if
instance_access_status
.
get
(
'no_data'
,
None
):
if
instance_access_status
.
get
(
'no_data'
,
None
):
# Ignore if there isn't any data
# Ignore if there isn't any data
continue
continue
# At least one partition contacted in the last 24h30min.
# At least one partition contacted in the last 24h30min.
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
if
(
now
-
DateTime
(
instance_access_status
.
get
(
'created_at'
)))
<
1.0
1
:
if
(
now
-
DateTime
(
instance_access_status
.
get
(
'created_at'
)))
<
1.0
5
:
should_notify
=
False
should_notify
=
False
description
=
""
break
break
if
not
should_notify
:
if
should_notify
:
ticket_title
=
software_ticket_title
description
=
"The Compute Node %s (%s) didnt process its instances for more them 24 hours, last contact: %s"
%
(
notification_message_reference
=
'slapos-crm-compute_node_check_stalled_software_state.notification'
context
.
getTitle
(),
context
.
getReference
(),
last_contact
)
last_contact
=
"No Contact Information"
# Since server is contacting, check for stalled software releases processes
software_installation_list
=
portal
.
portal_catalog
(
portal_type
=
'Software Installation'
,
default_aggregate_uid
=
context
.
getUid
(),
validation_state
=
'validated'
)
if
software_installation_list
:
should_notify
=
True
description
=
"The Compute Node %s (%s) didnt process its software releases for more them 24 hours"
%
(
compute_node_title
,
reference
)
# Test if server didn't process the internal software releases for more than 24h
for
installation
in
software_installation_list
:
installation_access_status
=
installation
.
getAccessStatus
()
if
installation_access_status
.
get
(
'no_data'
,
None
):
# Ignore if there isn't any data on it
continue
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
if
(
now
-
DateTime
(
installation_access_status
.
get
(
'created_at'
)))
<
1.01
:
should_notify
=
False
description
=
""
break
if
should_notify
:
if
should_notify
:
support_request
=
person
.
Base_getSupportRequestInProgress
(
support_request
=
person
.
Base_getSupportRequestInProgress
(
...
@@ -119,22 +90,22 @@ if should_notify:
...
@@ -119,22 +90,22 @@ if should_notify:
if
support_request
is
None
:
if
support_request
is
None
:
person
.
notify
(
support_request_title
=
ticket_title
,
person
.
notify
(
support_request_title
=
ticket_title
,
support_request_description
=
description
,
support_request_description
=
description
,
aggregate
=
context
.
getRelativeUrl
())
aggregate
=
context
.
getRelativeUrl
())
support_request_relative_url
=
context
.
REQUEST
.
get
(
"support_request_relative_url"
)
support_request_relative_url
=
context
.
REQUEST
.
get
(
"support_request_relative_url"
)
if
support_request_relative_url
is
None
:
if
support_request_relative_url
is
None
:
return
return
support_request
=
portal
.
restrictedTraverse
(
support_request_relative_url
)
support_request
=
portal
.
restrictedTraverse
(
support_request_relative_url
)
if
support_request
is
None
:
if
support_request
is
None
:
return
return
# Send Notification message
# Send Notification message
notification_message
=
portal
.
portal_notifications
.
getDocumentValue
(
notification_message
=
portal
.
portal_notifications
.
getDocumentValue
(
reference
=
notification_message_reference
)
reference
=
notification_message_reference
)
if
notification_message
is
None
:
if
notification_message
is
None
:
message
=
"""%s"""
%
description
message
=
"""%s"""
%
description
else
:
else
:
...
@@ -143,9 +114,9 @@ if should_notify:
...
@@ -143,9 +114,9 @@ if should_notify:
'last_contact'
:
last_contact
}
'last_contact'
:
last_contact
}
message
=
notification_message
.
asText
(
message
=
notification_message
.
asText
(
substitution_method_parameter_dict
=
{
'mapping_dict'
:
mapping_dict
})
substitution_method_parameter_dict
=
{
'mapping_dict'
:
mapping_dict
})
event
=
support_request
.
SupportRequest_getLastEvent
(
ticket_title
)
event
=
support_request
.
SupportRequest_getLastEvent
(
ticket_title
)
if
event
is
None
:
if
event
is
None
:
support_request
.
notify
(
message_title
=
ticket_title
,
message
=
message
)
support_request
.
notify
(
message_title
=
ticket_title
,
message
=
message
)
return
support_request
return
support_request
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/SupportRequest_recheckMonitoring.py
View file @
9465e76b
#
#
# XXX This ticket contains duplicated code found around SlapOS
# XXX This ticket contains duplicated code found around SlapOS
# It is required to rewrite this in a generic way.
# It is required to rewrite this in a generic way.
# See also: InstanceTree_checkSoftwareInstanceState
# See also: InstanceTree_checkSoftwareInstanceState
# See also: ComputeNode_checkState
# See also: ComputeNode_checkState
#
#
...
@@ -13,6 +13,9 @@ if context.getSimulationState() == "invalidated":
...
@@ -13,6 +13,9 @@ if context.getSimulationState() == "invalidated":
if
context
.
getPortalType
()
!=
"Support Request"
:
if
context
.
getPortalType
()
!=
"Support Request"
:
return
"Not a Support Request"
return
"Not a Support Request"
now
=
DateTime
()
portal
=
context
.
getPortalObject
()
document
=
context
.
getAggregateValue
()
document
=
context
.
getAggregateValue
()
if
document
is
None
:
if
document
is
None
:
return
True
return
True
...
@@ -25,13 +28,40 @@ if aggregate_portal_type == "Compute Node":
...
@@ -25,13 +28,40 @@ if aggregate_portal_type == "Compute Node":
d
=
document
.
getAccessStatus
()
d
=
document
.
getAccessStatus
()
if
d
.
get
(
"no_data"
,
None
)
==
1
:
if
d
.
get
(
"no_data"
,
None
)
==
1
:
return
"No Contact Information"
return
"No Contact Information"
last_contact
=
DateTime
(
d
.
get
(
'created_at'
))
last_contact
=
DateTime
(
d
.
get
(
'created_at'
))
if
(
DateTime
()
-
last_contact
)
<
0.01
:
if
(
now
-
last_contact
)
<
0.01
:
# If server has no partitions skip
compute_partition_uid_list
=
[
x
.
getUid
()
for
x
in
document
.
contentValues
(
portal_type
=
"Compute Partition"
)
if
x
.
getSlapState
()
==
'busy'
]
if
compute_partition_uid_list
:
is_instance_stalled
=
True
last_contact
=
None
instance_list
=
portal
.
portal_catalog
(
portal_type
=
'Software Instance'
,
default_aggregate_uid
=
compute_partition_uid_list
)
for
instance
in
instance_list
:
instance_access_status
=
instance
.
getAccessStatus
()
if
instance_access_status
.
get
(
'no_data'
,
None
):
# Ignore if there isn't any data
continue
# At least one partition contacted in the last 24h30min.
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
if
(
now
-
DateTime
(
instance_access_status
.
get
(
'created_at'
)))
<
1.05
:
is_instance_stalled
=
False
break
if
is_instance_stalled
and
len
(
instance_list
):
return
"Process instance stalled, last contact was %s"
%
last_contact
return
"All OK, latest contact: %s "
%
last_contact
return
"All OK, latest contact: %s "
%
last_contact
else
:
else
:
return
"Problem, latest contact: %s"
%
last_contact
return
"Problem, latest contact: %s"
%
last_contact
if
aggregate_portal_type
==
"Software Installation"
:
if
aggregate_portal_type
==
"Software Installation"
:
compute_node_title
=
document
.
getAggregateTitle
()
compute_node_title
=
document
.
getAggregateTitle
()
if
document
.
getAggregateValue
().
getMonitorScope
()
==
"disabled"
:
if
document
.
getAggregateValue
().
getMonitorScope
()
==
"disabled"
:
...
@@ -40,11 +70,11 @@ if aggregate_portal_type == "Software Installation":
...
@@ -40,11 +70,11 @@ if aggregate_portal_type == "Software Installation":
if
document
.
getSlapState
()
not
in
[
"start_requested"
,
"stop_requested"
]:
if
document
.
getSlapState
()
not
in
[
"start_requested"
,
"stop_requested"
]:
return
"Software Installation is Destroyed."
return
"Software Installation is Destroyed."
d
=
contex
t
.
getAccessStatus
()
d
=
documen
t
.
getAccessStatus
()
if
d
.
get
(
"no_data"
,
None
)
==
1
:
if
d
.
get
(
"no_data"
,
None
)
==
1
:
return
"The software release %s did not started to build on %s since %s"
%
\
return
"The software release %s did not started to build on %s since %s"
%
\
(
document
.
getUrlString
(),
compute_node_title
,
document
.
getCreationDate
())
(
document
.
getUrlString
(),
compute_node_title
,
document
.
getCreationDate
())
last_contact
=
DateTime
(
d
.
get
(
'created_at'
))
last_contact
=
DateTime
(
d
.
get
(
'created_at'
))
if
d
.
get
(
"text"
).
startswith
(
"building"
):
if
d
.
get
(
"text"
).
startswith
(
"building"
):
return
"The software release %s is building for mode them 12 hours on %s, started on %s"
%
\
return
"The software release %s is building for mode them 12 hours on %s, started on %s"
%
\
...
...
master/bt5/slapos_crm/TestTemplateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
View file @
9465e76b
...
@@ -968,48 +968,6 @@ class TestSlapOSComputeNode_CheckState(TestCRMSkinsMixin):
...
@@ -968,48 +968,6 @@ class TestSlapOSComputeNode_CheckState(TestCRMSkinsMixin):
self
.
assertEqual
(
event
.
getSource
(),
person
.
getRelativeUrl
())
self
.
assertEqual
(
event
.
getSource
(),
person
.
getRelativeUrl
())
@
simulate
(
'ERP5Site_isSupportRequestCreationClosed'
,
'*args, **kwargs'
,
'return 0'
)
@
simulate
(
'NotificationTool_getDocumentValue'
,
'reference=None'
,
'assert reference == "slapos-crm-compute_node_check_stalled_software_state.notification", reference
\
n
'
\
'return context.restrictedTraverse('
\
'context.REQUEST["test_ComputeNode_checkState_stalled_software"])'
)
def
test_ComputeNode_checkState_stalled_software
(
self
):
compute_node
=
self
.
_makeComputeNode
(
owner
=
self
.
makePerson
(
user
=
0
))[
0
]
self
.
_makeComplexComputeNode
()
person
=
compute_node
.
getSourceAdministrationValue
()
self
.
portal
.
REQUEST
[
'test_ComputeNode_checkState_stalled_software'
]
=
\
self
.
_makeNotificationMessage
(
compute_node
.
getReference
())
# Computer is getting access, also internal instance
compute_node
.
setAccessStatus
(
""
)
self
.
start_requested_software_instance
.
setAccessStatus
(
""
)
try
:
self
.
pinDateTime
(
DateTime
()
-
1.1
)
self
.
start_requested_software_installation
.
setAccessStatus
(
""
)
finally
:
self
.
unpinDateTime
()
compute_node
.
ComputeNode_checkState
()
self
.
tic
()
ticket_title
=
"[MONITORING] Compute Node %s has a stalled software process"
%
compute_node
.
getReference
()
ticket
=
self
.
_getGeneratedSupportRequest
(
compute_node
.
getUid
(),
ticket_title
)
self
.
assertNotEqual
(
ticket
,
None
)
event_list
=
ticket
.
getFollowUpRelatedValueList
()
self
.
assertEqual
(
len
(
event_list
),
1
)
event
=
event_list
[
0
]
self
.
assertEqual
(
event
.
getTitle
(),
ticket
.
getTitle
())
self
.
assertIn
(
compute_node
.
getReference
(),
event
.
getTextContent
())
self
.
assertEqual
(
event
.
getDestination
(),
ticket
.
getSourceSection
())
self
.
assertEqual
(
event
.
getSource
(),
person
.
getRelativeUrl
())
@
simulate
(
'ERP5Site_isSupportRequestCreationClosed'
,
'*args, **kwargs'
,
'return 0'
)
@
simulate
(
'ERP5Site_isSupportRequestCreationClosed'
,
'*args, **kwargs'
,
'return 0'
)
@
simulate
(
'NotificationTool_getDocumentValue'
,
@
simulate
(
'NotificationTool_getDocumentValue'
,
'reference=None'
,
'reference=None'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment