Commit cada4581 authored by Alain Takoudjou's avatar Alain Takoudjou

node boot: Start partition with requested_started state on boot

If the requested state of a computer partition is `started`, slapos node boot will
start the partition services on reboot, even if the node is not able to connect to
the master or has no internet access.
parent c27cce14
...@@ -31,6 +31,7 @@ from __future__ import print_function ...@@ -31,6 +31,7 @@ from __future__ import print_function
import subprocess import subprocess
from six.moves.urllib.parse import urlparse from six.moves.urllib.parse import urlparse
from six.moves import xmlrpc_client as xmlrpclib
from time import sleep from time import sleep
import glob import glob
import os import os
...@@ -41,6 +42,11 @@ from slapos.cli.command import check_root_user ...@@ -41,6 +42,11 @@ from slapos.cli.command import check_root_user
from slapos.cli.entry import SlapOSApp from slapos.cli.entry import SlapOSApp
from slapos.cli.config import ConfigCommand from slapos.cli.config import ConfigCommand
from slapos.format import isGlobalScopeAddress from slapos.format import isGlobalScopeAddress
from slapos.grid.slapgrid import (COMPUTER_PARTITION_REQUESTED_STATE_FILENAME,
COMPUTER_PARTITION_STARTED_STATE)
from slapos.grid.svcbackend import (_getSupervisordSocketPath,
getSupervisorRPC,
launchSupervisord)
from slapos.util import string_to_boolean from slapos.util import string_to_boolean
import argparse import argparse
import logging import logging
...@@ -59,6 +65,48 @@ def _removeTimestamp(instancehome, partition_base_name): ...@@ -59,6 +65,48 @@ def _removeTimestamp(instancehome, partition_base_name):
logger.info("Removing %s", timestamp_path) logger.info("Removing %s", timestamp_path)
os.remove(timestamp_path) os.remove(timestamp_path)
def _startComputerPartition(partition_id, supervisord_socket):
    """
    Ask supervisord to start the process group of a deployed partition.

    partition_id is passed to supervisord as the process group name and
    supervisord_socket is handed to getSupervisorRPC to reach supervisord.
    A fault whose message starts with 'BAD_NAME:' is only logged; any
    other xmlrpclib.Fault is propagated to the caller.
    """
    try:
        with getSupervisorRPC(supervisord_socket) as rpc:
            # NOTE(review): the second argument is presumably supervisor's
            # `wait` flag (do not block until started) -- confirm.
            rpc.startProcessGroup(partition_id, False)
    except xmlrpclib.Fault as fault:
        if not fault.faultString.startswith('BAD_NAME:'):
            raise
        logger.info("Nothing to start on %s...", partition_id)
        return
    logger.info("Requested start of %s...", partition_id)
def _startComputerPartitionList(instance_root, partition_base_name):
    """
    Start the services of every partition whose requested state is 'started'.

    Supervisord is launched first. Then every path matching
    <instance_root>/<partition_base_name>* is inspected: when it contains a
    COMPUTER_PARTITION_REQUESTED_STATE_FILENAME file whose content equals
    COMPUTER_PARTITION_STARTED_STATE, the start of that partition is
    requested through _startComputerPartition, using the basename of the
    partition path as partition id.
    """
    launchSupervisord(instance_root=instance_root, logger=logger)
    # Both arguments are loop-invariant, so resolve the socket path once
    # instead of once per partition.
    supervisord_socket_path = _getSupervisordSocketPath(
        instance_root,
        logger
    )
    partition_glob_path = os.path.join(
        instance_root,
        "%s*" % partition_base_name)
    for partition_path in glob.glob(partition_glob_path):
        partition_state_path = os.path.join(
            partition_path,
            COMPUTER_PARTITION_REQUESTED_STATE_FILENAME
        )
        if not os.path.exists(partition_state_path):
            # No requested state recorded for this path: nothing to start.
            continue
        with open(partition_state_path) as f:
            # Tolerate surrounding whitespace (e.g. a trailing newline) so
            # that it does not silently prevent the partition from starting.
            partition_state = f.read().strip()
        if partition_state == COMPUTER_PARTITION_STARTED_STATE:
            # Call start for this computer partition
            _startComputerPartition(
                os.path.basename(partition_path.rstrip('/')),
                supervisord_socket_path
            )
def _runBang(app): def _runBang(app):
""" """
...@@ -76,7 +124,9 @@ def _runFormat(app): ...@@ -76,7 +124,9 @@ def _runFormat(app):
Launch slapos node format. Launch slapos node format.
""" """
logger.info("[BOOT] Invoking slapos node format...") logger.info("[BOOT] Invoking slapos node format...")
result = app.run(['node', 'format', '--now', '--verbose']) # '--local' parameter is to prevent node format command to post data to
# master, so this command can work without internet and setup partitions IP.
result = app.run(['node', 'format', '--now', '--local', '--verbose'])
if result == 1: if result == 1:
return 0 return 0
return 1 return 1
...@@ -196,6 +246,15 @@ class BootCommand(ConfigCommand): ...@@ -196,6 +246,15 @@ class BootCommand(ConfigCommand):
if ipv6_interface is not None: if ipv6_interface is not None:
_waitIpv6Ready(ipv6_interface) _waitIpv6Ready(ipv6_interface)
app = SlapOSApp()
# Make sure slapos node format returns ok
while not _runFormat(app):
logger.error("[BOOT] Fail to format, try again in 15 seconds...")
sleep(15)
# Start computer partition services
_startComputerPartitionList(instance_root, partition_base_name)
# Check that node can ping master # Check that node can ping master
if valid_ipv4(master_hostname): if valid_ipv4(master_hostname):
_test_ping(master_hostname) _test_ping(master_hostname)
...@@ -205,12 +264,6 @@ class BootCommand(ConfigCommand): ...@@ -205,12 +264,6 @@ class BootCommand(ConfigCommand):
# hostname # hostname
_ping_hostname(master_hostname) _ping_hostname(master_hostname)
app = SlapOSApp()
# Make sure slapos node format returns ok
while not _runFormat(app):
logger.error("[BOOT] Fail to format, try again in 15 seconds...")
sleep(15)
# Make sure slapos node bang returns ok # Make sure slapos node bang returns ok
while not _runBang(app): while not _runBang(app):
logger.error("[BOOT] Fail to bang, try again in 15 seconds...") logger.error("[BOOT] Fail to bang, try again in 15 seconds...")
......
...@@ -83,6 +83,12 @@ class FormatCommand(ConfigCommand): ...@@ -83,6 +83,12 @@ class FormatCommand(ConfigCommand):
help='Launch slapformat without delay' help='Launch slapformat without delay'
' (default: %(default)s)') ' (default: %(default)s)')
ap.add_argument('--local',
default=False, # can have a default as it is not in .cfg
action="store_true",
help='Keep format data locally, do not post xml to master'
' (default: %(default)s)')
ap.add_argument('-n', '--dry_run', ap.add_argument('-n', '--dry_run',
default=False, # can have a default as it is not in .cfg default=False, # can have a default as it is not in .cfg
action="store_true", action="store_true",
......
...@@ -1408,8 +1408,9 @@ def do_format(conf): ...@@ -1408,8 +1408,9 @@ def do_format(conf):
computer.dump(path_to_xml=conf.computer_xml, computer.dump(path_to_xml=conf.computer_xml,
path_to_json=conf.computer_json, path_to_json=conf.computer_json,
logger=conf.logger) logger=conf.logger)
conf.logger.info('Posting information to %r' % conf.master_url) if not conf.local:
computer.send(conf) conf.logger.info('Posting information to %r' % conf.master_url)
computer.send(conf)
conf.logger.info('slapos successfully prepared the computer.') conf.logger.info('slapos successfully prepared the computer.')
......
...@@ -90,6 +90,7 @@ SLAPGRID_PROMISE_FAIL = 2 ...@@ -90,6 +90,7 @@ SLAPGRID_PROMISE_FAIL = 2
PROMISE_TIMEOUT = 20 PROMISE_TIMEOUT = 20
COMPUTER_PARTITION_TIMESTAMP_FILENAME = '.timestamp' COMPUTER_PARTITION_TIMESTAMP_FILENAME = '.timestamp'
COMPUTER_PARTITION_REQUESTED_STATE_FILENAME = '.requested_state'
COMPUTER_PARTITION_LATEST_BANG_TIMESTAMP_FILENAME = '.slapos_latest_bang_timestamp' COMPUTER_PARTITION_LATEST_BANG_TIMESTAMP_FILENAME = '.slapos_latest_bang_timestamp'
COMPUTER_PARTITION_INSTALL_ERROR_FILENAME = '.slapgrid-%s-error.log' COMPUTER_PARTITION_INSTALL_ERROR_FILENAME = '.slapgrid-%s-error.log'
COMPUTER_PARTITION_WAIT_LIST_FILENAME = '.slapos-report-wait-service-list' COMPUTER_PARTITION_WAIT_LIST_FILENAME = '.slapos-report-wait-service-list'
...@@ -1125,6 +1126,10 @@ stderr_logfile_backups=1 ...@@ -1125,6 +1126,10 @@ stderr_logfile_backups=1
instance_path, instance_path,
COMPUTER_PARTITION_TIMESTAMP_FILENAME COMPUTER_PARTITION_TIMESTAMP_FILENAME
) )
partition_state_path = os.path.join(
instance_path,
COMPUTER_PARTITION_REQUESTED_STATE_FILENAME
)
parameter_dict = computer_partition.getInstanceParameterDict() parameter_dict = computer_partition.getInstanceParameterDict()
timestamp = parameter_dict.get('timestamp') timestamp = parameter_dict.get('timestamp')
...@@ -1225,6 +1230,7 @@ stderr_logfile_backups=1 ...@@ -1225,6 +1230,7 @@ stderr_logfile_backups=1
return return
os.remove(timestamp_path) os.remove(timestamp_path)
os.remove(partition_state_path)
# Include Partition Logging # Include Partition Logging
log_folder_path = "%s/.slapgrid/log" % instance_path log_folder_path = "%s/.slapgrid/log" % instance_path
...@@ -1339,6 +1345,8 @@ stderr_logfile_backups=1 ...@@ -1339,6 +1345,8 @@ stderr_logfile_backups=1
if timestamp: if timestamp:
with open(timestamp_path, 'w') as f: with open(timestamp_path, 'w') as f:
f.write(str(timestamp)) f.write(str(timestamp))
with open(partition_state_path, 'w') as f:
f.write(str(computer_partition_state))
def FilterComputerPartitionList(self, computer_partition_list): def FilterComputerPartitionList(self, computer_partition_list):
""" """
......
...@@ -415,8 +415,13 @@ class TestCliBoot(CliMixin): ...@@ -415,8 +415,13 @@ class TestCliBoot(CliMixin):
os.mkdir(os.path.join(instance_root, partition_base_name + '1')) os.mkdir(os.path.join(instance_root, partition_base_name + '1'))
timestamp = os.path.join( timestamp = os.path.join(
instance_root, partition_base_name + '1', '.timestamp') instance_root, partition_base_name + '1', '.timestamp')
requested_state_path = os.path.join(instance_root,
partition_base_name + '1',
'.requested_state')
with open(timestamp, 'w') as f: with open(timestamp, 'w') as f:
f.write("1578552471") f.write("1578552471")
with open(requested_state_path, 'w') as f:
f.write("started")
# make a config file using this instance root # make a config file using this instance root
with tempfile.NamedTemporaryFile(mode='w') as slapos_conf: with tempfile.NamedTemporaryFile(mode='w') as slapos_conf:
...@@ -441,17 +446,21 @@ class TestCliBoot(CliMixin): ...@@ -441,17 +446,21 @@ class TestCliBoot(CliMixin):
patch( patch(
'slapos.cli.boot.netifaces.ifaddresses', 'slapos.cli.boot.netifaces.ifaddresses',
return_value={socket.AF_INET6: ({'addr': '2000::1'},),},) as ifaddresses,\ return_value={socket.AF_INET6: ({'addr': '2000::1'},),},) as ifaddresses,\
patch('slapos.cli.boot._startComputerPartition', return_value=None) as start_partition,\
patch('slapos.cli.boot.launchSupervisord', return_value=None),\
patch('slapos.cli.boot._ping_hostname', return_value=1) as _ping_hostname: patch('slapos.cli.boot._ping_hostname', return_value=1) as _ping_hostname:
app.run(('node', 'boot')) app.run(('node', 'boot'))
# boot command runs as root # boot command runs as root
check_root_user.assert_called_once() check_root_user.assert_called_once()
# Computer partition was started during boot
start_partition.assert_called_once()
# it waits for interface to have an IPv6 address # it waits for interface to have an IPv6 address
ifaddresses.assert_called_once_with('interface_name_from_config') ifaddresses.assert_called_once_with('interface_name_from_config')
# then ping master hostname to wait for connectivity # then ping master hostname to wait for connectivity
_ping_hostname.assert_called_once_with('slap.vifib.com') _ping_hostname.assert_called_once_with('slap.vifib.com')
# then format and bang # then format and bang
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--verbose']) SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--local', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'bang', '-m', 'Reboot']) SlapOSApp().run.assert_any_call(['node', 'bang', '-m', 'Reboot'])
# timestamp files have been removed # timestamp files have been removed
...@@ -473,6 +482,7 @@ class TestCliBoot(CliMixin): ...@@ -473,6 +482,7 @@ class TestCliBoot(CliMixin):
patch('slapos.cli.boot.netifaces.ifaddresses', patch('slapos.cli.boot.netifaces.ifaddresses',
side_effect=[net1, net2, net3]),\ side_effect=[net1, net2, net3]),\
patch('slapos.cli.boot._ping_hostname', return_value=0),\ patch('slapos.cli.boot._ping_hostname', return_value=0),\
patch('slapos.cli.boot._startComputerPartitionList', return_value=None) as start_partition,\
patch('slapos.cli.format.check_root_user', return_value=True),\ patch('slapos.cli.format.check_root_user', return_value=True),\
patch('slapos.cli.format.logging.FileHandler', return_value=logging.NullHandler()),\ patch('slapos.cli.format.logging.FileHandler', return_value=logging.NullHandler()),\
patch('slapos.cli.bang.check_root_user', return_value=True),\ patch('slapos.cli.bang.check_root_user', return_value=True),\
...@@ -482,6 +492,7 @@ class TestCliBoot(CliMixin): ...@@ -482,6 +492,7 @@ class TestCliBoot(CliMixin):
app.run(('node', 'boot')) app.run(('node', 'boot'))
check_root_user.assert_called_once() check_root_user.assert_called_once()
start_partition.assert_called_once()
self.assertEqual(do_format.call_count, 3) self.assertEqual(do_format.call_count, 3)
self.assertEqual(do_bang.call_count, 3) self.assertEqual(do_bang.call_count, 3)
......
...@@ -1401,7 +1401,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1401,7 +1401,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0']) self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0') partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition), six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay']) ['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash]) six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
timestamp_path = os.path.join(instance.partition_path, '.timestamp') timestamp_path = os.path.join(instance.partition_path, '.timestamp')
self.setSlapgrid() self.setSlapgrid()
...@@ -1422,7 +1422,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1422,7 +1422,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0']) self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0') partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition), six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', ['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
'software_release', 'worked', '.slapos-retention-lock-delay']) 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash]) six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
...@@ -1445,7 +1445,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1445,7 +1445,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0']) self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0') partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition), six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay']) ['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash]) six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
instance.timestamp = str(int(timestamp) - 1) instance.timestamp = str(int(timestamp) - 1)
self.assertEqual(self.launchSlapgrid(), slapgrid.SLAPGRID_SUCCESS) self.assertEqual(self.launchSlapgrid(), slapgrid.SLAPGRID_SUCCESS)
...@@ -1463,7 +1463,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1463,7 +1463,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0']) self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0') partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition), six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay']) ['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash]) six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])
instance.timestamp = str(int(timestamp) + 1) instance.timestamp = str(int(timestamp) + 1)
self.assertEqual(self.launchSlapgrid(), slapgrid.SLAPGRID_SUCCESS) self.assertEqual(self.launchSlapgrid(), slapgrid.SLAPGRID_SUCCESS)
...@@ -1491,7 +1491,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1491,7 +1491,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.assertInstanceDirectoryListEqual(['0']) self.assertInstanceDirectoryListEqual(['0'])
partition = os.path.join(self.instance_root, '0') partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition), six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay']) ['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg', 'software_release', 'worked', '.slapos-retention-lock-delay'])
six.assertCountEqual(self, os.listdir(self.software_root), six.assertCountEqual(self, os.listdir(self.software_root),
[instance.software.software_hash]) [instance.software.software_hash])
instance.timestamp = None instance.timestamp = None
...@@ -1523,7 +1523,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1523,7 +1523,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.launchSlapgrid() self.launchSlapgrid()
partition = os.path.join(self.instance_root, '0') partition = os.path.join(self.instance_root, '0')
six.assertCountEqual(self, os.listdir(partition), six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', ['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
'software_release', 'worked', '.slapos-retention-lock-delay']) 'software_release', 'worked', '.slapos-retention-lock-delay'])
time.sleep(2) time.sleep(2)
...@@ -1533,7 +1533,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1533,7 +1533,7 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.launchSlapgrid() self.launchSlapgrid()
six.assertCountEqual(self, os.listdir(partition), six.assertCountEqual(self, os.listdir(partition),
['.slapgrid', '.timestamp', 'buildout.cfg', ['.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
'software_release', 'worked', '.slapos-retention-lock-delay']) 'software_release', 'worked', '.slapos-retention-lock-delay'])
def test_one_partition_periodicity_from_file_does_not_disturb_others(self): def test_one_partition_periodicity_from_file_does_not_disturb_others(self):
...@@ -1710,6 +1710,43 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase): ...@@ -1710,6 +1710,43 @@ class TestSlapgridCPPartitionProcessing(MasterMixin, unittest.TestCase):
self.launchSlapgrid() self.launchSlapgrid()
self.assertEqual(mock_method.call_count, 2) self.assertEqual(mock_method.call_count, 2)
def test_partition_requested_state_created(self):
    """
    Processing a partition successfully must leave a '.requested_state'
    file in the partition directory holding the requested state.
    """
    computer = self.getTestComputerClass()(self.software_root, self.instance_root)
    with httmock.HTTMock(computer.request_handler):
        instance = computer.instance_list[0]
        instance.timestamp = str(int(time.time()))

        self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_SUCCESS)
        self.assertInstanceDirectoryListEqual(['0'])

        # The state file must sit next to the other files of the partition.
        expected_partition_content = [
            '.slapgrid', '.timestamp', '.requested_state', 'buildout.cfg',
            'software_release', 'worked', '.slapos-retention-lock-delay']
        partition = os.path.join(self.instance_root, '0')
        six.assertCountEqual(self, os.listdir(partition), expected_partition_content)
        six.assertCountEqual(self, os.listdir(self.software_root), [instance.software.software_hash])

        # ... and it must contain the state reported for the partition.
        requested_state_path = os.path.join(instance.partition_path, '.requested_state')
        with open(requested_state_path) as state_file:
            recorded_state = state_file.read()
        self.assertEqual(recorded_state, slapgrid.COMPUTER_PARTITION_STOPPED_STATE)
        self.assertEqual(instance.sequence,
                         ['/stoppedComputerPartition'])
def test_partition_requested_state_not_created_if_failed(self):
    """
    When processing the partition fails (the buildout script exits with a
    non-zero status), no '.requested_state' file must be left behind.
    """
    computer = self.getTestComputerClass()(self.software_root, self.instance_root)
    with httmock.HTTMock(computer.request_handler):
        instance = computer.instance_list[0]
        instance.timestamp = str(int(time.time()))
        # Replace buildout by a script that always fails.
        instance.software.setBuildout("""#!/bin/sh
exit 3""")

        self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_FAIL)
        self.assertInstanceDirectoryListEqual(['0'])
        self.assertEqual(instance.sequence,
                         ['/softwareInstanceError'])

        state_file_exists = os.path.exists(
            os.path.join(instance.partition_path, '.requested_state'))
        self.assertFalse(state_file_exists)
def test_one_partition_buildout_fail_does_not_disturb_others(self): def test_one_partition_buildout_fail_does_not_disturb_others(self):
""" """
1. We set up two instance one using a corrupted buildout 1. We set up two instance one using a corrupted buildout
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment