Improved handling of marking processes as dead

Run ndb_mgmd as daemon
Make extra attempt to check if processes are still alive
parent 39766f0c
...@@ -272,40 +272,17 @@ sub spawn_parent_impl { ...@@ -272,40 +272,17 @@ sub spawn_parent_impl {
last; last;
} }
# If one of the mysqld processes died, we want to # If one of the processes died, we want to
# mark this, and kill the mysqltest process. # mark this, and kill the mysqltest process.
foreach my $idx (0..1) mark_process_dead($ret_pid);
{
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was master[$idx], " .
"exit during mysqltest run");
$::master->[$idx]->{'pid'}= 0;
last;
}
}
foreach my $idx (0..2)
{
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was slave[$idx], " .
"exit during mysqltest run");
$::slave->[$idx]->{'pid'}= 0;
last;
}
}
mtr_debug("waitpid() caught exit of unknown child $ret_pid, " .
"exit during mysqltest run");
} }
if ( $ret_pid != $pid ) if ( $ret_pid != $pid )
{ {
# We terminated the waiting because a "mysqld" process died. # We terminated the waiting because a "mysqld" process died.
# Kill the mysqltest process. # Kill the mysqltest process.
mtr_verbose("Kill mysqltest because another process died");
kill(9,$pid); kill(9,$pid);
$ret_pid= waitpid($pid,0); $ret_pid= waitpid($pid,0);
...@@ -639,13 +616,19 @@ sub mtr_check_stop_servers ($) { ...@@ -639,13 +616,19 @@ sub mtr_check_stop_servers ($) {
mtr_warning("couldn't delete $file"); mtr_warning("couldn't delete $file");
} }
} }
$srv->{'pid'}= 0;
} }
} }
} }
if ( $errors ) if ( $errors )
{ {
# We are in trouble, just die.... # There were errors killing processes
mtr_error("we could not kill or clean up all processes"); # do one last attempt to ping the servers
# and if they can't be pinged, assume they are dead
if ( ! mtr_ping_with_timeout( \@$spec ) )
{
mtr_error("we could not kill or clean up all processes");
}
} }
} }
...@@ -773,6 +756,49 @@ sub mtr_ping_with_timeout($) { ...@@ -773,6 +756,49 @@ sub mtr_ping_with_timeout($) {
return $res; return $res;
} }
#
# Search the process records we keep track of for an entry whose
# pid equals the one passed in. The mysqld entries in $::master and
# $::slave are checked first, then for each entry in $::clusters the
# cluster's own pid (reported as ndb_mgmd) followed by its ndbds.
#
# The first matching entry gets its pid set to 0 and the search stops.
# If nothing matches, a warning is printed.
#
sub mark_process_dead($)
{
  my ($dead_pid)= @_;

  # mysqld servers: masters first, then slaves
  foreach my $server (@{$::master}, @{$::slave})
  {
    next unless $server->{'pid'} eq $dead_pid;

    mtr_verbose("$server->{'type'} $server->{'idx'} exited, pid: $dead_pid");
    $server->{'pid'}= 0;
    return;
  }

  foreach my $cl (@{$::clusters})
  {
    # The pid stored directly on the cluster entry is reported as ndb_mgmd
    if ( $cl->{'pid'} eq $dead_pid )
    {
      mtr_verbose("$cl->{'name'} cluster ndb_mgmd exited, pid: $dead_pid");
      $cl->{'pid'}= 0;
      return;
    }

    # Otherwise look among the ndbd entries belonging to this cluster
    foreach my $node (@{$cl->{'ndbds'}})
    {
      next unless $node->{'pid'} eq $dead_pid;

      mtr_verbose("$cl->{'name'} cluster ndbd exited, pid: $dead_pid");
      $node->{'pid'}= 0;
      return;
    }
  }

  # No record matched the pid
  mtr_warning("mark_process_dead couldn't find an entry for pid: $dead_pid");
}
############################################################################## ##############################################################################
# #
# The operating system will keep information about dead children, # The operating system will keep information about dead children,
...@@ -789,45 +815,8 @@ sub mtr_record_dead_children () { ...@@ -789,45 +815,8 @@ sub mtr_record_dead_children () {
# -1 or 0 means there are no more processes to wait for # -1 or 0 means there are no more processes to wait for
while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1) while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1)
{ {
mtr_warning("waitpid() caught exit of child $ret_pid"); mtr_warning("mtr_record_dead_children: $ret_pid");
foreach my $idx (0..1) mark_process_dead($ret_pid);
{
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was master[$idx]");
$::master->[$idx]->{'pid'}= 0;
}
}
foreach my $idx (0..2)
{
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was slave[$idx]");
$::slave->[$idx]->{'pid'}= 0;
last;
}
}
foreach my $cluster (@{$::clusters})
{
if ( $cluster->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndb_mgmd");
$cluster->{'pid'}= 0;
last;
}
foreach my $ndbd (@{$cluster->{'ndbds'}})
{
if ( $ndbd->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndbd");
$ndbd->{'pid'}= 0;
last;
}
}
}
} }
} }
...@@ -843,7 +832,8 @@ sub start_reap_all { ...@@ -843,7 +832,8 @@ sub start_reap_all {
my $pid; my $pid;
while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1) while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1)
{ {
print "start_reap_all: pid: $pid.\n"; mtr_warning("start_reap_all pid: $pid");
mark_process_dead($pid);
}; };
} }
...@@ -903,6 +893,7 @@ sub sleep_until_file_created ($$$) { ...@@ -903,6 +893,7 @@ sub sleep_until_file_created ($$$) {
# Check if it died after the fork() was successful # Check if it died after the fork() was successful
if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid ) if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid )
{ {
mtr_warning("Process $pid died");
return 0; return 0;
} }
......
...@@ -1687,6 +1687,7 @@ sub ndbcluster_wait_started($){ ...@@ -1687,6 +1687,7 @@ sub ndbcluster_wait_started($){
} }
sub mysqld_wait_started($){ sub mysqld_wait_started($){
my $mysqld= shift; my $mysqld= shift;
...@@ -1706,6 +1707,7 @@ sub ndb_mgmd_start ($) { ...@@ -1706,6 +1707,7 @@ sub ndb_mgmd_start ($) {
mtr_init_args(\$args); mtr_init_args(\$args);
mtr_add_arg($args, "--no-defaults"); mtr_add_arg($args, "--no-defaults");
mtr_add_arg($args, "--core"); mtr_add_arg($args, "--core");
mtr_add_arg($args, "--nodaemon");
mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini"); mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini");
...@@ -1716,9 +1718,23 @@ sub ndb_mgmd_start ($) { ...@@ -1716,9 +1718,23 @@ sub ndb_mgmd_start ($) {
"", "",
{ append_log_file => 1 }); { append_log_file => 1 });
# FIXME Should not be needed
# Unfortunately the cluster nodes will fail to start
# if ndb_mgmd has not started properly
sleep(1);
# if (!sleep_until_file_created($cluster->{'path_pid'},
# 30, # Seconds
# $pid))
# {
# mtr_warning("Failed to start ndb_mgd for $cluster->{'name'} cluster");
# return 1;
# }
# Remember pid of ndb_mgmd # Remember pid of ndb_mgmd
$cluster->{'pid'}= $pid; $cluster->{'pid'}= $pid;
mtr_verbose("ndb_mgmd_start, pid: $pid");
return $pid; return $pid;
} }
...@@ -1774,19 +1790,6 @@ sub ndbcluster_start ($$) { ...@@ -1774,19 +1790,6 @@ sub ndbcluster_start ($$) {
my $pid= ndb_mgmd_start($cluster); my $pid= ndb_mgmd_start($cluster);
# FIXME Should not be needed
# Unfortunately cluster will fail
# if ndb_mgmd has not started properly
# Wait for the ndb_mgmd pid file to be created
if (!sleep_until_file_created($cluster->{'path_pid'},
60,
$pid))
{
mtr_warning("Failed to start ndb_mgmd for $cluster->{'name'} cluster");
return 1;
}
for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ ) for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ )
{ {
ndbd_start($cluster, $idx, $extra_args); ndbd_start($cluster, $idx, $extra_args);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment