Commit 0224c9de authored by unknown's avatar unknown

Improved handling of marking processes as dead

Run ndb_mgmd as daemon
Make extra attempt to check if processes are still alive


mysql-test/lib/mtr_process.pl:
  Add common function to mark processes as dead
  When all attempts to kill processes have failed, make an extra attempt with ping to check if they really are still alive
mysql-test/mysql-test-run.pl:
  Run ndb_mgmd with --nodaemon
parent f31afb02
...@@ -272,40 +272,17 @@ sub spawn_parent_impl { ...@@ -272,40 +272,17 @@ sub spawn_parent_impl {
last; last;
} }
# If one of the mysqld processes died, we want to # If one of the processes died, we want to
# mark this, and kill the mysqltest process. # mark this, and kill the mysqltest process.
foreach my $idx (0..1) mark_process_dead($ret_pid);
{
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was master[$idx], " .
"exit during mysqltest run");
$::master->[$idx]->{'pid'}= 0;
last;
}
}
foreach my $idx (0..2)
{
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was slave[$idx], " .
"exit during mysqltest run");
$::slave->[$idx]->{'pid'}= 0;
last;
}
}
mtr_debug("waitpid() caught exit of unknown child $ret_pid, " .
"exit during mysqltest run");
} }
if ( $ret_pid != $pid ) if ( $ret_pid != $pid )
{ {
# We terminated the waiting because a "mysqld" process died. # We terminated the waiting because a "mysqld" process died.
# Kill the mysqltest process. # Kill the mysqltest process.
mtr_verbose("Kill mysqltest because another process died");
kill(9,$pid); kill(9,$pid);
$ret_pid= waitpid($pid,0); $ret_pid= waitpid($pid,0);
...@@ -639,13 +616,19 @@ sub mtr_check_stop_servers ($) { ...@@ -639,13 +616,19 @@ sub mtr_check_stop_servers ($) {
mtr_warning("couldn't delete $file"); mtr_warning("couldn't delete $file");
} }
} }
$srv->{'pid'}= 0;
} }
} }
} }
if ( $errors ) if ( $errors )
{ {
# We are in trouble, just die.... # There where errors killing processes
mtr_error("we could not kill or clean up all processes"); # do one last attempt to ping the servers
# and if they can't be pinged, assume they are dead
if ( ! mtr_ping_with_timeout( \@$spec ) )
{
mtr_error("we could not kill or clean up all processes");
}
} }
} }
...@@ -773,6 +756,49 @@ sub mtr_ping_with_timeout($) { ...@@ -773,6 +756,49 @@ sub mtr_ping_with_timeout($) {
return $res; return $res;
} }
#
# Loop through our list of processes and look for an entry
# with the provided pid
# Set the pid of that process to 0 if found
#
sub mark_process_dead($)
{
  # Takes the pid of a process that has exited. The first tracked
  # entry whose 'pid' field equals it gets that field reset to 0 and
  # the exit is reported with mtr_verbose(). Search order: masters,
  # slaves, then for each cluster its ndb_mgmd followed by its ndbds.
  # If nothing matches, a warning is issued with mtr_warning().
  my ($dead_pid)= @_;

  # mysqld servers: masters first, then slaves
  foreach my $server (@{$::master}, @{$::slave})
  {
    next if $server->{'pid'} ne $dead_pid;

    mtr_verbose("$server->{'type'} $server->{'idx'} exited, pid: $dead_pid");
    $server->{'pid'}= 0;
    return;
  }

  # Cluster processes: the management server of a cluster is checked
  # before that cluster's data nodes
  foreach my $cl (@{$::clusters})
  {
    if ( $cl->{'pid'} eq $dead_pid )
    {
      mtr_verbose("$cl->{'name'} cluster ndb_mgmd exited, pid: $dead_pid");
      $cl->{'pid'}= 0;
      return;
    }

    foreach my $node (@{$cl->{'ndbds'}})
    {
      next unless $node->{'pid'} eq $dead_pid;

      mtr_verbose("$cl->{'name'} cluster ndbd exited, pid: $dead_pid");
      $node->{'pid'}= 0;
      return;
    }
  }

  # No tracked process had this pid
  mtr_warning("mark_process_dead couldn't find an entry for pid: $dead_pid");
}
############################################################################## ##############################################################################
# #
# The operating system will keep information about dead children, # The operating system will keep information about dead children,
...@@ -789,45 +815,8 @@ sub mtr_record_dead_children () { ...@@ -789,45 +815,8 @@ sub mtr_record_dead_children () {
# -1 or 0 means there are no more procesess to wait for # -1 or 0 means there are no more procesess to wait for
while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1) while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1)
{ {
mtr_warning("waitpid() caught exit of child $ret_pid"); mtr_warning("mtr_record_dead_children: $ret_pid");
foreach my $idx (0..1) mark_process_dead($ret_pid);
{
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was master[$idx]");
$::master->[$idx]->{'pid'}= 0;
}
}
foreach my $idx (0..2)
{
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was slave[$idx]");
$::slave->[$idx]->{'pid'}= 0;
last;
}
}
foreach my $cluster (@{$::clusters})
{
if ( $cluster->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndb_mgmd");
$cluster->{'pid'}= 0;
last;
}
foreach my $ndbd (@{$cluster->{'ndbds'}})
{
if ( $ndbd->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndbd");
$ndbd->{'pid'}= 0;
last;
}
}
}
} }
} }
...@@ -843,7 +832,8 @@ sub start_reap_all { ...@@ -843,7 +832,8 @@ sub start_reap_all {
my $pid; my $pid;
while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1) while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1)
{ {
print "start_reap_all: pid: $pid.\n"; mtr_warning("start_reap_all pid: $pid");
mark_process_dead($pid);
}; };
} }
...@@ -903,6 +893,7 @@ sub sleep_until_file_created ($$$) { ...@@ -903,6 +893,7 @@ sub sleep_until_file_created ($$$) {
# Check if it died after the fork() was successful # Check if it died after the fork() was successful
if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid ) if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid )
{ {
mtr_warning("Process $pid died");
return 0; return 0;
} }
......
...@@ -1687,6 +1687,7 @@ sub ndbcluster_wait_started($){ ...@@ -1687,6 +1687,7 @@ sub ndbcluster_wait_started($){
} }
sub mysqld_wait_started($){ sub mysqld_wait_started($){
my $mysqld= shift; my $mysqld= shift;
...@@ -1706,6 +1707,7 @@ sub ndb_mgmd_start ($) { ...@@ -1706,6 +1707,7 @@ sub ndb_mgmd_start ($) {
mtr_init_args(\$args); mtr_init_args(\$args);
mtr_add_arg($args, "--no-defaults"); mtr_add_arg($args, "--no-defaults");
mtr_add_arg($args, "--core"); mtr_add_arg($args, "--core");
mtr_add_arg($args, "--nodaemon");
mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini"); mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini");
...@@ -1716,9 +1718,23 @@ sub ndb_mgmd_start ($) { ...@@ -1716,9 +1718,23 @@ sub ndb_mgmd_start ($) {
"", "",
{ append_log_file => 1 }); { append_log_file => 1 });
# FIXME Should not be needed
# Unfortunately the cluster nodes will fail to start
# if ndb_mgmd has not started properly
sleep(1);
# if (!sleep_until_file_created($cluster->{'path_pid'},
# 30, # Seconds
# $pid))
# {
# mtr_warning("Failed to start ndb_mgd for $cluster->{'name'} cluster");
# return 1;
# }
# Remember pid of ndb_mgmd # Remember pid of ndb_mgmd
$cluster->{'pid'}= $pid; $cluster->{'pid'}= $pid;
mtr_verbose("ndb_mgmd_start, pid: $pid");
return $pid; return $pid;
} }
...@@ -1774,19 +1790,6 @@ sub ndbcluster_start ($$) { ...@@ -1774,19 +1790,6 @@ sub ndbcluster_start ($$) {
my $pid= ndb_mgmd_start($cluster); my $pid= ndb_mgmd_start($cluster);
# FIXME Should not be needed
# Unfortunately cluster will fail
# if ndb_mgmd has not started properly
# Wait for the ndb_mgmd pid file to be created
if (!sleep_until_file_created($cluster->{'path_pid'},
60,
$pid))
{
mtr_warning("Failed to start ndb_mgmd for $cluster->{'name'} cluster");
return 1;
}
for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ ) for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ )
{ {
ndbd_start($cluster, $idx, $extra_args); ndbd_start($cluster, $idx, $extra_args);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment