Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
S
slapos.core
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Léo-Paul Géneau
slapos.core
Commits
9465e76b
Commit
9465e76b
authored
Jan 31, 2023
by
Rafael Monnerat
Browse files
Options
Browse Files
Download
Plain Diff
Update stalled alarm reliability
See merge request
nexedi/slapos.core!481
parents
3ae06e91
e1a3d2ad
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
53 additions
and
94 deletions
+53
-94
master/bt5/slapos_cloud/PathTemplateItem/portal_caches/access_status_data_cache_factory.xml
...teItem/portal_caches/access_status_data_cache_factory.xml
+1
-1
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
...tal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
+16
-45
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/SupportRequest_recheckMonitoring.py
...slapos_crm_monitoring/SupportRequest_recheckMonitoring.py
+36
-6
master/bt5/slapos_crm/TestTemplateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
...ateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
+0
-42
No files found.
master/bt5/slapos_cloud/PathTemplateItem/portal_caches/access_status_data_cache_factory.xml
View file @
9465e76b
...
...
@@ -26,7 +26,7 @@
</item>
<item>
<key>
<string>
cache_duration
</string>
</key>
<value>
<int>
864
00
</int>
</value>
<value>
<int>
1728
00
</int>
</value>
</item>
<item>
<key>
<string>
description
</string>
</key>
...
...
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/ComputeNode_checkState.py
View file @
9465e76b
...
...
@@ -16,7 +16,6 @@ compute_node_title = context.getTitle()
node_ticket_title
=
"[MONITORING] Lost contact with compute_node %s"
%
reference
instance_ticket_title
=
"[MONITORING] Compute Node %s has a stalled instance process"
%
reference
software_ticket_title
=
"[MONITORING] Compute Node %s has a stalled software process"
%
reference
ticket_title
=
node_ticket_title
description
=
""
...
...
@@ -62,8 +61,6 @@ if not should_notify:
if
instance_list
:
should_notify
=
True
description
=
"The Compute Node %s (%s) didnt process its instances for more them 24 hours"
%
(
compute_node_title
,
reference
)
for
instance
in
instance_list
:
instance_access_status
=
instance
.
getAccessStatus
()
...
...
@@ -73,39 +70,13 @@ if not should_notify:
# At lest one partition contacted in the last 24h30min.
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
if
(
now
-
DateTime
(
instance_access_status
.
get
(
'created_at'
)))
<
1.0
1
:
if
(
now
-
DateTime
(
instance_access_status
.
get
(
'created_at'
)))
<
1.0
5
:
should_notify
=
False
description
=
""
break
if
not
should_notify
:
ticket_title
=
software_ticket_title
notification_message_reference
=
'slapos-crm-compute_node_check_stalled_software_state.notification'
last_contact
=
"No Contact Information"
# Since server is contacting, check for stalled software releases processes
software_installation_list
=
portal
.
portal_catalog
(
portal_type
=
'Software Installation'
,
default_aggregate_uid
=
context
.
getUid
(),
validation_state
=
'validated'
)
if
software_installation_list
:
should_notify
=
True
description
=
"The Compute Node %s (%s) didnt process its software releases for more them 24 hours"
%
(
compute_node_title
,
reference
)
# Test if server didnt process the internal softwares releases for more them 24h
for
installation
in
software_installation_list
:
installation_access_status
=
installation
.
getAccessStatus
()
if
installation_access_status
.
get
(
'no_data'
,
None
):
# Ignore if there isnt any data on it
continue
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
if
(
now
-
DateTime
(
installation_access_status
.
get
(
'created_at'
)))
<
1.01
:
should_notify
=
False
description
=
""
break
if
should_notify
:
description
=
"The Compute Node %s (%s) didnt process its instances for more them 24 hours, last contact: %s"
%
(
context
.
getTitle
(),
context
.
getReference
(),
last_contact
)
if
should_notify
:
support_request
=
person
.
Base_getSupportRequestInProgress
(
...
...
master/bt5/slapos_crm/SkinTemplateItem/portal_skins/slapos_crm_monitoring/SupportRequest_recheckMonitoring.py
View file @
9465e76b
...
...
@@ -13,6 +13,9 @@ if context.getSimulationState() == "invalidated":
if
context
.
getPortalType
()
!=
"Support Request"
:
return
"Not a Support Request"
now
=
DateTime
()
portal
=
context
.
getPortalObject
()
document
=
context
.
getAggregateValue
()
if
document
is
None
:
return
True
...
...
@@ -27,7 +30,34 @@ if aggregate_portal_type == "Compute Node":
return
"No Contact Information"
last_contact
=
DateTime
(
d
.
get
(
'created_at'
))
if
(
DateTime
()
-
last_contact
)
<
0.01
:
if
(
now
-
last_contact
)
<
0.01
:
# If server has no partitions skip
compute_partition_uid_list
=
[
x
.
getUid
()
for
x
in
document
.
contentValues
(
portal_type
=
"Compute Partition"
)
if
x
.
getSlapState
()
==
'busy'
]
if
compute_partition_uid_list
:
is_instance_stalled
=
True
last_contact
=
None
instance_list
=
portal
.
portal_catalog
(
portal_type
=
'Software Instance'
,
default_aggregate_uid
=
compute_partition_uid_list
)
for
instance
in
instance_list
:
instance_access_status
=
instance
.
getAccessStatus
()
if
instance_access_status
.
get
(
'no_data'
,
None
):
# Ignore if there isnt any data
continue
# At lest one partition contacted in the last 24h30min.
last_contact
=
max
(
DateTime
(
instance_access_status
.
get
(
'created_at'
)),
last_contact
)
if
(
now
-
DateTime
(
instance_access_status
.
get
(
'created_at'
)))
<
1.05
:
is_instance_stalled
=
False
break
if
is_instance_stalled
and
len
(
instance_list
):
return
"Process instance stalled, last contact was %s"
%
last_contact
return
"All OK, latest contact: %s "
%
last_contact
else
:
return
"Problem, latest contact: %s"
%
last_contact
...
...
@@ -40,7 +70,7 @@ if aggregate_portal_type == "Software Installation":
if
document
.
getSlapState
()
not
in
[
"start_requested"
,
"stop_requested"
]:
return
"Software Installation is Destroyed."
d
=
contex
t
.
getAccessStatus
()
d
=
documen
t
.
getAccessStatus
()
if
d
.
get
(
"no_data"
,
None
)
==
1
:
return
"The software release %s did not started to build on %s since %s"
%
\
(
document
.
getUrlString
(),
compute_node_title
,
document
.
getCreationDate
())
...
...
master/bt5/slapos_crm/TestTemplateItem/portal_components/test.erp5.testSlapOSCRMSkins.py
View file @
9465e76b
...
...
@@ -968,48 +968,6 @@ class TestSlapOSComputeNode_CheckState(TestCRMSkinsMixin):
self
.
assertEqual
(
event
.
getSource
(),
person
.
getRelativeUrl
())
@
simulate
(
'ERP5Site_isSupportRequestCreationClosed'
,
'*args, **kwargs'
,
'return 0'
)
@
simulate
(
'NotificationTool_getDocumentValue'
,
'reference=None'
,
'assert reference == "slapos-crm-compute_node_check_stalled_software_state.notification", reference
\
n
'
\
'return context.restrictedTraverse('
\
'context.REQUEST["test_ComputeNode_checkState_stalled_software"])'
)
def
test_ComputeNode_checkState_stalled_software
(
self
):
compute_node
=
self
.
_makeComputeNode
(
owner
=
self
.
makePerson
(
user
=
0
))[
0
]
self
.
_makeComplexComputeNode
()
person
=
compute_node
.
getSourceAdministrationValue
()
self
.
portal
.
REQUEST
[
'test_ComputeNode_checkState_stalled_software'
]
=
\
self
.
_makeNotificationMessage
(
compute_node
.
getReference
())
# Computer is getting access, also internal instance
compute_node
.
setAccessStatus
(
""
)
self
.
start_requested_software_instance
.
setAccessStatus
(
""
)
try
:
self
.
pinDateTime
(
DateTime
()
-
1.1
)
self
.
start_requested_software_installation
.
setAccessStatus
(
""
)
finally
:
self
.
unpinDateTime
()
compute_node
.
ComputeNode_checkState
()
self
.
tic
()
ticket_title
=
"[MONITORING] Compute Node %s has a stalled software process"
%
compute_node
.
getReference
()
ticket
=
self
.
_getGeneratedSupportRequest
(
compute_node
.
getUid
(),
ticket_title
)
self
.
assertNotEqual
(
ticket
,
None
)
event_list
=
ticket
.
getFollowUpRelatedValueList
()
self
.
assertEqual
(
len
(
event_list
),
1
)
event
=
event_list
[
0
]
self
.
assertEqual
(
event
.
getTitle
(),
ticket
.
getTitle
())
self
.
assertIn
(
compute_node
.
getReference
(),
event
.
getTextContent
())
self
.
assertEqual
(
event
.
getDestination
(),
ticket
.
getSourceSection
())
self
.
assertEqual
(
event
.
getSource
(),
person
.
getRelativeUrl
())
@
simulate
(
'ERP5Site_isSupportRequestCreationClosed'
,
'*args, **kwargs'
,
'return 0'
)
@
simulate
(
'NotificationTool_getDocumentValue'
,
'reference=None'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment