Commit d3170bb5 authored by Thomas Gambier 🚴🏼

slapgrid: Start services even without connection to master

See merge request !515
parents 1b9f353c 4fac0158
......@@ -31,7 +31,6 @@ from __future__ import print_function
import subprocess
from six.moves.urllib.parse import urlparse
from six.moves import xmlrpc_client as xmlrpclib
from time import sleep
import glob
import os
......@@ -41,12 +40,7 @@ from netaddr import valid_ipv4, valid_ipv6
from slapos.cli.command import check_root_user
from slapos.cli.entry import SlapOSApp
from slapos.cli.config import ConfigCommand
from slapos.format import isGlobalScopeAddress
from slapos.grid.slapgrid import (COMPUTER_PARTITION_REQUESTED_STATE_FILENAME,
COMPUTER_PARTITION_STARTED_STATE)
from slapos.grid.svcbackend import (_getSupervisordSocketPath,
getSupervisorRPC,
launchSupervisord)
from slapos.format import isGlobalScopeAddress, FormatReturn
from slapos.util import string_to_boolean
import argparse
import logging
......@@ -65,58 +59,13 @@ def _removeTimestamp(instancehome, partition_base_name):
logger.info("Removing %s", timestamp_path)
os.remove(timestamp_path)
def _startComputerPartition(partition_id, supervisord_socket):
    """
    Ask supervisord to start the process group of a deployed partition.

    partition_id is used as the supervisord process group name and
    supervisord_socket is handed to getSupervisorRPC to reach supervisord.

    An xmlrpclib.Fault whose faultString begins with 'BAD_NAME:' is only
    logged as "nothing to start"; every other fault is re-raised.
    """
    try:
        with getSupervisorRPC(supervisord_socket) as rpc:
            # NOTE(review): the second argument is presumably supervisor's
            # "wait" flag (do not block until started) -- confirm.
            rpc.startProcessGroup(partition_id, False)
    except xmlrpclib.Fault as fault:
        if not fault.faultString.startswith('BAD_NAME:'):
            raise
        logger.info("Nothing to start on %s...", partition_id)
        return
    # Only reached when the RPC call did not raise.
    logger.info("Requested start of %s...", partition_id)
def _startComputerPartitionList(instance_root, partition_base_name):
    """
    Start the services of each partition whose requested state is "started".

    Every path matching "<instance_root>/<partition_base_name>*" is treated
    as a partition directory. Supervisord is launched first; then, for each
    partition holding a COMPUTER_PARTITION_REQUESTED_STATE_FILENAME file whose
    content equals COMPUTER_PARTITION_STARTED_STATE, its process group is
    started through _startComputerPartition.
    """
    partition_pattern = os.path.join(
        instance_root, "%s*" % partition_base_name)

    launchSupervisord(instance_root=instance_root, logger=logger)

    for partition_path in glob.glob(partition_pattern):
        state_file = os.path.join(
            partition_path, COMPUTER_PARTITION_REQUESTED_STATE_FILENAME)
        socket_path = _getSupervisordSocketPath(instance_root, logger)

        # Partitions without a recorded requested state are left alone.
        if not os.path.exists(state_file):
            continue

        with open(state_file) as state_fd:
            requested_state = state_fd.read()
        if requested_state != COMPUTER_PARTITION_STARTED_STATE:
            continue

        # The directory name (without trailing slash) is the partition id.
        _startComputerPartition(
            os.path.basename(partition_path.rstrip('/')),
            socket_path)
def _runBang(app):
"""
Launch slapos node bang.
"""
logger.info("[BOOT] Invoking slapos node bang...")
result = app.run(['node', 'bang', '-m', 'Reboot'])
if result == 1:
return 0
return 1
return app.run(['node', 'bang', '-m', 'Reboot'])
def _runFormat(app):
......@@ -124,12 +73,7 @@ def _runFormat(app):
Launch slapos node format.
"""
logger.info("[BOOT] Invoking slapos node format...")
# '--local' parameter is to prevent node format command to post data to
# master, so this command can work without internet and setup partitions IP.
result = app.run(['node', 'format', '--now', '--local', '--verbose'])
if result == 1:
return 0
return 1
return app.run(['node', 'format', '--now', '--verbose'])
def _ping(hostname):
......@@ -189,6 +133,16 @@ def _ping_hostname(hostname):
is_ready = _ping6(hostname)
def _ping_master(master_hostname):
    """
    Check that the master can be reached, picking the probe by address kind.

    An IPv4 literal is handed to _test_ping, an IPv6 literal to _test_ping6,
    and anything else is treated as a hostname and handed to _ping_hostname.
    NOTE(review): the probes presumably block until the master answers --
    confirm against their definitions.
    """
    if valid_ipv4(master_hostname):
        probe = _test_ping
    elif valid_ipv6(master_hostname):
        probe = _test_ping6
    else:
        # Neither an IPv4 nor an IPv6 literal: assume a hostname.
        probe = _ping_hostname
    probe(master_hostname)
def _waitIpv6Ready(ipv6_interface):
"""
test if ipv6 is ready on ipv6_interface
......@@ -204,6 +158,7 @@ def _waitIpv6Ready(ipv6_interface):
"try again in 5 seconds...", ipv6_interface)
sleep(5)
class BootCommand(ConfigCommand):
"""
Test network and invoke simple format and bang (Use on Linux startup)
......@@ -247,25 +202,28 @@ class BootCommand(ConfigCommand):
_waitIpv6Ready(ipv6_interface)
app = SlapOSApp()
while True:
# Make sure slapos node format returns ok
while not _runFormat(app):
result = _runFormat(app)
if result == FormatReturn.FAILURE:
logger.error("[BOOT] Fail to format, try again in 15 seconds...")
sleep(15)
continue
# Start computer partition services
_startComputerPartitionList(instance_root, partition_base_name)
if result == FormatReturn.OFFLINE_SUCCESS:
logger.error(
"[BOOT] Fail to post format information"
", try again when connection to master is up..."
)
sleep(15)
_ping_master(master_hostname)
continue
# Check that node can ping master
if valid_ipv4(master_hostname):
_test_ping(master_hostname)
elif valid_ipv6(master_hostname):
_test_ping6(master_hostname)
else:
# hostname
_ping_hostname(master_hostname)
break
# Make sure slapos node bang returns ok
while not _runBang(app):
while _runBang(app):
logger.error("[BOOT] Fail to bang, try again in 15 seconds...")
sleep(15)
......
......@@ -83,12 +83,6 @@ class FormatCommand(ConfigCommand):
help='Launch slapformat without delay'
' (default: %(default)s)')
ap.add_argument('--local',
default=False, # can have a default as it is not in .cfg
action="store_true",
help='Keep format data locally, do not post xml to master'
' (default: %(default)s)')
ap.add_argument('-n', '--dry_run',
default=False, # can have a default as it is not in .cfg
action="store_true",
......@@ -131,4 +125,4 @@ class FormatCommand(ConfigCommand):
tracing_monkeypatch(conf)
do_format(conf=conf)
return do_format(conf=conf)
......@@ -30,6 +30,7 @@
from six.moves import configparser
import distro
import enum
import errno
import fcntl
import grp
......@@ -68,6 +69,12 @@ from slapos import version
from slapos import manager as slapmanager
class FormatReturn(enum.IntEnum):
    """Status codes returned by do_format().

    Being an IntEnum, members compare equal to their plain integer value.
    """
    # The computer was prepared and its description was sent to the master.
    SUCCESS = 0
    # Preparing the computer raised an exception.
    FAILURE = 1
    # The computer was prepared, but sending the information to the master
    # failed.
    OFFLINE_SUCCESS = 2
logger = logging.getLogger("slapos.format")
......@@ -1578,6 +1585,7 @@ def random_delay(conf):
def do_format(conf):
try:
random_delay(conf)
if conf.input_definition_file:
......@@ -1608,10 +1616,19 @@ def do_format(conf):
computer.dump(path_to_xml=conf.computer_xml,
path_to_json=conf.computer_json,
logger=conf.logger)
if not conf.local:
conf.logger.info('Posting information to %r' % conf.master_url)
try:
computer.send(conf)
return FormatReturn.SUCCESS
except Exception:
conf.logger.exception('failed to transfer information to %r' % conf.master_url)
return FormatReturn.OFFLINE_SUCCESS
finally:
conf.logger.info('slapos successfully prepared the computer.')
except Exception:
conf.logger.exception('slapos failed to prepare the computer.')
return FormatReturn.FAILURE
class FormatConfig(object):
......
This diff is collapsed.
......@@ -54,6 +54,8 @@ if sys.version_info < (2, 6):
warnings.warn('Used python version (%s) is old and has problems with'
' IPv6 connections' % sys.version.split('\n')[0])
from requests.exceptions import RequestException
from lxml import etree
from slapos import manager as slapmanager
......@@ -70,7 +72,8 @@ from slapos.grid.SlapObject import Software, Partition
from slapos.grid.svcbackend import (launchSupervisord,
createSupervisordConfiguration,
_getSupervisordConfigurationDirectory,
_getSupervisordSocketPath)
_getSupervisordSocketPath,
getSupervisorRPC)
from slapos.grid.utils import (md5digest,
dropPrivileges,
SlapPopen,
......@@ -92,10 +95,10 @@ COMPUTER_PARTITION_STOPPED_STATE = 'stopped'
SLAPGRID_SUCCESS = 0
SLAPGRID_FAIL = 1
SLAPGRID_PROMISE_FAIL = 2
SLAPGRID_OFFLINE_SUCCESS = 3
PROMISE_TIMEOUT = 20
COMPUTER_PARTITION_TIMESTAMP_FILENAME = '.timestamp'
COMPUTER_PARTITION_REQUESTED_STATE_FILENAME = '.requested_state'
COMPUTER_PARTITION_LATEST_BANG_TIMESTAMP_FILENAME = '.slapos_latest_bang_timestamp'
COMPUTER_PARTITION_INSTALL_ERROR_FILENAME = '.slapgrid-%s-error.log'
COMPUTER_PARTITION_WAIT_LIST_FILENAME = '.slapos-report-wait-service-list'
......@@ -1081,9 +1084,8 @@ stderr_logfile_backups=1
software_path=software_path,
instance_path=instance_path,
shared_part_list=self.shared_part_list,
supervisord_partition_configuration_path=os.path.join(
_getSupervisordConfigurationDirectory(self.instance_root),
computer_partition_id + '.conf'),
supervisord_partition_configuration_dir=(
_getSupervisordConfigurationDirectory(self.instance_root)),
supervisord_socket=self.supervisord_socket,
computer_partition=computer_partition,
computer_id=self.computer_id,
......@@ -1137,10 +1139,6 @@ stderr_logfile_backups=1
instance_path,
COMPUTER_PARTITION_TIMESTAMP_FILENAME
)
partition_state_path = os.path.join(
instance_path,
COMPUTER_PARTITION_REQUESTED_STATE_FILENAME
)
parameter_dict = computer_partition.getInstanceParameterDict()
timestamp = parameter_dict.get('timestamp')
......@@ -1178,9 +1176,8 @@ stderr_logfile_backups=1
software_path=software_path,
instance_path=instance_path,
shared_part_list=self.shared_part_list,
supervisord_partition_configuration_path=os.path.join(
_getSupervisordConfigurationDirectory(self.instance_root), '%s.conf' %
computer_partition_id),
supervisord_partition_configuration_dir=(
_getSupervisordConfigurationDirectory(self.instance_root)),
supervisord_socket=self.supervisord_socket,
computer_partition=computer_partition,
computer_id=self.computer_id,
......@@ -1243,7 +1240,6 @@ stderr_logfile_backups=1
return
os.remove(timestamp_path)
os.remove(partition_state_path)
# Include Partition Logging
log_folder_path = "%s/.slapgrid/log" % instance_path
......@@ -1358,8 +1354,6 @@ stderr_logfile_backups=1
if timestamp:
with open(timestamp_path, 'w') as f:
f.write(str(timestamp))
with open(partition_state_path, 'w') as f:
f.write(str(computer_partition_state))
def FilterComputerPartitionList(self, computer_partition_list):
"""
......@@ -1429,6 +1423,12 @@ stderr_logfile_backups=1
return filtered_computer_partition_list
def processComputerPartitionList(self):
    """
    Process the computer partitions, falling back to offline mode.

    The online processing is attempted first. If it raises a
    RequestException (connection loss), the offline processing is run
    instead. The status code of whichever variant completed is returned.
    """
    try:
        status = self.processComputerPartitionListOnline()
    except RequestException:
        status = self.processComputerPartitionListOffline()
    return status
def processComputerPartitionListOnline(self):
"""
Will start supervisord and process each Computer Partition.
"""
......@@ -1455,6 +1455,10 @@ stderr_logfile_backups=1
# Process the partition itself
self.processComputerPartition(computer_partition)
# Handle connection loss at the next level
except RequestException:
raise
# Send log before exiting
except (SystemExit, KeyboardInterrupt):
computer_partition.error(traceback.format_exc(), logger=self.logger)
......@@ -1511,6 +1515,20 @@ stderr_logfile_backups=1
return SLAPGRID_PROMISE_FAIL
return SLAPGRID_SUCCESS
def processComputerPartitionListOffline(self):
    """
    Start all processes known to supervisord, without using the master.

    Returns SLAPGRID_OFFLINE_SUCCESS when the supervisord call went through.
    Any exception is logged with its traceback and turned into SLAPGRID_FAIL.
    """
    self.logger.info('Processing computer partitions offline...')
    try:
        socket_path = _getSupervisordSocketPath(self.instance_root, self.logger)
        with getSupervisorRPC(socket_path) as rpc:
            # NOTE(review): False is presumably supervisor's "wait" flag
            # (do not block until every process is started) -- confirm.
            rpc.startAllProcesses(False)
    except Exception:
        self.logger.exception('Error in offline mode while starting partitions:')
        return SLAPGRID_FAIL
    else:
        return SLAPGRID_OFFLINE_SUCCESS
def processPromiseList(self):
"""
Will check and process promises for each Computer Partition.
......@@ -1841,9 +1859,8 @@ stderr_logfile_backups=1
instance_path=os.path.join(self.instance_root,
computer_partition.getId()),
shared_part_list=self.shared_part_list,
supervisord_partition_configuration_path=os.path.join(
_getSupervisordConfigurationDirectory(self.instance_root), '%s.conf' %
computer_partition_id),
supervisord_partition_configuration_dir=(
_getSupervisordConfigurationDirectory(self.instance_root)),
supervisord_socket=self.supervisord_socket,
computer_partition=computer_partition,
computer_id=self.computer_id,
......
......@@ -156,17 +156,15 @@ class Manager(object):
# Generate supervisord configuration with socat processes added
partition.generateSupervisorConfiguration()
group_id = partition.addCustomGroup('socat', partition.partition_id,
[program['name']
for program in socat_programs])
for program in socat_programs:
partition.addProgramToGroup(group_id, program['name'], program['name'],
partition.addProgramToGroup('socat', program['name'], program['name'],
program['command'],
as_user=program['as_user'])
partition.writeSupervisorConfigurationFile()
partition.writeSupervisorConfigurationFiles()
# Start processes
group_id = partition.getGroupIdFromSuffix('socat')
with partition.getSupervisorRPC() as supervisor:
for program in socat_programs:
process_name = '{}:{}'.format(group_id, program['name'])
......
......@@ -73,11 +73,9 @@ class Manager(object):
group_suffix = "prerm"
logger.info("Adding pre-delete scripts to supervisord...")
partition.generateSupervisorConfiguration()
partition.addServiceToCustomGroup(group_suffix,
partition_id,
wrapper_list,
partition.prerm_path)
partition.writeSupervisorConfigurationFile()
partition.addServicesToCustomGroup(
group_suffix, wrapper_list, partition.prerm_path)
partition.writeSupervisorConfigurationFiles()
# check the state of all processes; if a process is not started yet, start it
with partition.getSupervisorRPC() as supervisor:
......
......@@ -737,11 +737,7 @@ class StandaloneSlapOS(object):
state=state)
def start(self):
"""Start the system.
If system was stopped, it will start partitions.
If system was already running, this does not restart partitions.
"""
"""Start the system."""
self._logger.debug("Starting StandaloneSlapOS in %s", self._base_directory)
self._ensureSupervisordStarted()
self._ensureSlapOSAvailable()
......
......@@ -416,13 +416,8 @@ class TestCliBoot(CliMixin):
os.mkdir(os.path.join(instance_root, partition_base_name + '1'))
timestamp = os.path.join(
instance_root, partition_base_name + '1', '.timestamp')
requested_state_path = os.path.join(instance_root,
partition_base_name + '1',
'.requested_state')
with open(timestamp, 'w') as f:
f.write("1578552471")
with open(requested_state_path, 'w') as f:
f.write("started")
# make a config file using this instance root
with tempfile.NamedTemporaryFile(mode='w') as slapos_conf:
......@@ -441,27 +436,24 @@ class TestCliBoot(CliMixin):
# run slapos node boot
app = slapos.cli.entry.SlapOSApp()
fake = mock.Mock(return_value=mock.Mock(**{'run.return_value': 0}))
with patch('slapos.cli.boot.check_root_user', return_value=True) as check_root_user,\
patch('slapos.cli.boot.SlapOSApp') as SlapOSApp,\
patch('slapos.cli.boot.SlapOSApp', new=fake) as SlapOSApp,\
patch('slapos.cli.boot.ConfigCommand.config_path', return_value=slapos_conf.name), \
patch(
'slapos.cli.boot.netifaces.ifaddresses',
return_value={socket.AF_INET6: ({'addr': '2000::1'},),},) as ifaddresses,\
patch('slapos.cli.boot._startComputerPartition', return_value=None) as start_partition,\
patch('slapos.cli.boot.launchSupervisord', return_value=None),\
patch('slapos.cli.boot._ping_hostname', return_value=1) as _ping_hostname:
app.run(('node', 'boot'))
# boot command runs as root
check_root_user.assert_called_once()
# Computer partition was started during boot
start_partition.assert_called_once()
# it waits for interface to have an IPv6 address
ifaddresses.assert_called_once_with('interface_name_from_config')
# then ping master hostname to wait for connectivity
_ping_hostname.assert_called_once_with('slap.vifib.com')
# then format and bang
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--local', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'format', '--now', '--verbose'])
SlapOSApp().run.assert_any_call(['node', 'bang', '-m', 'Reboot'])
# timestamp files have been removed
......@@ -483,17 +475,15 @@ class TestCliBoot(CliMixin):
patch('slapos.cli.boot.netifaces.ifaddresses',
side_effect=[net1, net2, net3]),\
patch('slapos.cli.boot._ping_hostname', return_value=0),\
patch('slapos.cli.boot._startComputerPartitionList', return_value=None) as start_partition,\
patch('slapos.cli.format.check_root_user', return_value=True),\
patch('slapos.cli.format.logging.FileHandler', return_value=logging.NullHandler()),\
patch('slapos.cli.bang.check_root_user', return_value=True),\
patch('slapos.cli.format.do_format', side_effect=[Exception, Exception, None]) as do_format,\
patch('slapos.cli.bang.do_bang', side_effect=[Exception, Exception, None]) as do_bang:
patch('slapos.cli.format.do_format', side_effect=[Exception, Exception, 0]) as do_format,\
patch('slapos.cli.bang.do_bang', side_effect=[Exception, Exception, 0]) as do_bang:
app.run(('node', 'boot'))
check_root_user.assert_called_once()
start_partition.assert_called_once()
self.assertEqual(do_format.call_count, 3)
self.assertEqual(do_bang.call_count, 3)
......
......@@ -87,7 +87,7 @@ original_upload_network_cached = networkcache.upload_network_cached
originalBootstrapBuildout = utils.bootstrapBuildout
originalLaunchBuildout = utils.launchBuildout
originalUploadSoftwareRelease = Software.uploadSoftwareRelease
originalPartitionGenerateSupervisorConfigurationFile = Partition.generateSupervisorConfigurationFile
originalPartitionUpdateSupervisorConfiguration = Partition.updateSupervisorConfiguration
class MasterMixin(BasicMixin, unittest.TestCase):
"""
......@@ -170,8 +170,7 @@ class MasterMixin(BasicMixin, unittest.TestCase):
software_path=software_path,
instance_path=instance_path,
shared_part_list=shared_part_list,
supervisord_partition_configuration_path=os.path.join(
supervisor_configuration_path, partition_id),
supervisord_partition_configuration_dir=supervisor_configuration_path,
supervisord_socket=os.path.join(
supervisor_configuration_path, 'supervisor.sock'),
computer_partition=slap_computer_partition,
......@@ -378,14 +377,14 @@ class TestPartitionSlapObject(MasterMixin, unittest.TestCase):
def setUp(self):
MasterMixin.setUp(self)
Partition.generateSupervisorConfigurationFile = FakeCallAndNoop()
Partition.updateSupervisorConfiguration = FakeCallAndNoop()
utils.bootstrapBuildout = FakeCallAndNoop()
utils.launchBuildout = FakeCallAndStore()
def tearDown(self):
MasterMixin.tearDown(self)
Partition.generateSupervisorConfigurationFile = originalPartitionGenerateSupervisorConfigurationFile
Partition.updateSupervisorConfiguration = originalPartitionUpdateSupervisorConfiguration
def test_partition_timeout_default(self):
software = self.createSoftware()
......@@ -417,18 +416,6 @@ class TestPartitionSlapObject(MasterMixin, unittest.TestCase):
self.assertTrue(utils.launchBuildout.called)
def test_instance_is_deploying_if_software_release_exists(self):
    """
    A partition whose Software Release exists (including its instance.cfg)
    gets deployed: install() must end up invoking utils.launchBuildout.
    """
    partition = self.createPartition(self.createSoftware().url)

    partition.install()

    self.assertTrue(utils.launchBuildout.called)
def test_backward_compatibility_instance_is_deploying_if_template_cfg_is_used(self):
"""
Backward compatibility test, for old software releases.
......@@ -507,50 +494,52 @@ class TestPartitionSupervisorConfig(MasterMixin, unittest.TestCase):
utils.launchBuildout = FakeCallAndNoop()
def test_grouped_program(self):
self.assertEqual(self.partition.supervisor_configuration_group, '')
self.assertEqual(self.partition.partition_supervisor_configuration, '')
self.partition.addProgramToGroup('test', 'sample-1', 'sample-1', '/bin/ls')
self.partition.writeSupervisorConfigurationFiles()
partition_id = self.partition.partition_id
group_id = self.partition.getGroupIdFromSuffix('test')
group_id = self.partition.addCustomGroup('test', partition_id,
['sample-1'])
self.assertIn('group:{}-test'.format(partition_id),
self.partition.supervisor_configuration_group)
filepath = os.path.join(
self.partition.supervisord_partition_configuration_dir,
group_id + '.conf'
)
self.partition.addProgramToGroup(group_id, 'sample-1', 'sample-1',
'/bin/ls')
with open(filepath) as f:
supervisor_conf = f.read()
self.assertIn('program:{}-test_sample-1'.format(partition_id),
self.partition.partition_supervisor_configuration)
self.assertIn('group:' + group_id, supervisor_conf)
self.assertIn('program:%s_sample-1' % group_id, supervisor_conf)
def test_simple_service(self):
self.assertEqual(self.partition.supervisor_configuration_group, '')
self.assertEqual(self.partition.partition_supervisor_configuration, '')
runners = ['runner-' + str(i) for i in range(3)]
path = os.path.join(self.partition.instance_path, 'etc/run')
self.partition.addServicesToGroup(runners, path)
self.partition.writeSupervisorConfigurationFiles()
partition_id = self.partition.partition_id
group_id = self.partition.getGroupIdFromSuffix()
runners = ['runner-{}'.format(i) for i in range(3)]
path = os.path.join(self.partition.instance_path, 'etc/run')
self.partition.addServiceToGroup(partition_id, runners, path)
filepath = os.path.join(
self.partition.supervisord_partition_configuration_dir,
group_id + '.conf'
)
with open(filepath) as f:
supervisor_conf = f.read()
for i in range(3):
self.assertIn('program:{}_runner-{}'.format(partition_id, i),
self.partition.partition_supervisor_configuration)
self.assertIn('program:%s_runner-%s' % (group_id, i), supervisor_conf)
runner_path = os.path.join(self.partition.instance_path, 'etc/run',
'runner-{}'.format(i))
class TestPartitionDestructionLock(MasterMixin, unittest.TestCase):
def setUp(self):
MasterMixin.setUp(self)
Partition.generateSupervisorConfigurationFile = FakeCallAndNoop()
Partition.updateSupervisorConfiguration = FakeCallAndNoop()
utils.bootstrapBuildout = FakeCallAndNoop()
utils.launchBuildout = FakeCallAndStore()
def tearDown(self):
MasterMixin.tearDown(self)
Partition.generateSupervisorConfigurationFile = originalPartitionGenerateSupervisorConfigurationFile
Partition.updateSupervisorConfiguration = originalPartitionUpdateSupervisorConfiguration
def test_retention_lock_delay_creation(self):
delay = 42
......@@ -640,13 +629,13 @@ class TestPartitionDestructionLock(MasterMixin, unittest.TestCase):
class TestPartitionDestructionUnwritable(MasterMixin, unittest.TestCase):
def setUp(self):
MasterMixin.setUp(self)
Partition.generateSupervisorConfigurationFile = FakeCallAndNoop()
Partition.updateSupervisorConfiguration = FakeCallAndNoop()
utils.bootstrapBuildout = FakeCallAndNoop()
utils.launchBuildout = FakeCallAndStore()
def tearDown(self):
MasterMixin.tearDown(self)
Partition.generateSupervisorConfigurationFile = originalPartitionGenerateSupervisorConfigurationFile
Partition.updateSupervisorConfiguration = originalPartitionUpdateSupervisorConfiguration
def test(self):
software = self.createSoftware()
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment