Commit b42334eb authored by Matthew Holt's avatar Matthew Holt

Several improvements and bug fixes related to graceful reloads

Added a -grace flag to customize graceful shutdown period, fixed bugs related to closing file descriptors (and dup'ed fds), improved healthcheck signaling to parent, fixed a race condition with the graceful listener, etc. These improvements mainly provide better support for frequent reloading or unusual use cases of Start and Stop after a Restart (POSIX systems). This forum thread was valuable help in debugging: https://forum.golangbridge.org/t/bind-address-already-in-use-even-after-listener-closed/1510?u=matt
parent 94c746c4
......@@ -29,6 +29,7 @@ import (
"path"
"strings"
"sync"
"time"
"github.com/mholt/caddy/caddy/letsencrypt"
"github.com/mholt/caddy/server"
......@@ -43,11 +44,14 @@ var (
// If true, initialization will not show any informative output.
Quiet bool
// HTTP2 indicates whether HTTP2 is enabled or not
// HTTP2 indicates whether HTTP2 is enabled or not.
HTTP2 bool // TODO: temporary flag until http2 is standard
// PidFile is the path to the pidfile to create
// PidFile is the path to the pidfile to create.
PidFile string
// GracefulTimeout is the maximum duration of a graceful shutdown.
GracefulTimeout time.Duration
)
var (
......@@ -91,17 +95,20 @@ const (
)
// Start starts Caddy with the given Caddyfile. If cdyfile
// is nil or the process is forked from a parent as part of
// a graceful restart, Caddy will check to see if Caddyfile
// was piped from stdin and use that. It blocks until all the
// servers are listening.
// is nil, the LoadCaddyfile function will be called to get
// one.
//
// This function blocks until all the servers are listening.
//
// If this process is a fork and no Caddyfile was piped in,
// an error will be returned (the Restart() function does this
// for you automatically). If this process is NOT a fork and
// cdyfile is nil, a default configuration will be assumed.
// In any case, an error is returned if Caddy could not be
// started.
// Note (POSIX): If Start is called in the child process of a
// restart more than once within the duration of the graceful
// cutoff (i.e. the child process called Start a first time,
// then called Stop, then Start again within the first 5 seconds
// or however long GracefulTimeout is) and the Caddyfiles have
// at least one listener address in common, the second Start
// may fail with "address already in use" as there's no
// guarantee that the parent process has relinquished the
// address before the grace period ends.
func Start(cdyfile Input) (err error) {
// If we return with no errors, we must do two things: tell the
// parent that we succeeded and write to the pidfile.
......@@ -148,17 +155,6 @@ func Start(cdyfile Input) (err error) {
}
startedBefore = true
// Close remaining file descriptors we may have inherited that we don't need
if IsRestart() {
for _, fdIndex := range loadedGob.ListenerFds {
file := os.NewFile(fdIndex, "")
fln, err := net.FileListener(file)
if err == nil {
fln.Close()
}
}
}
// Show initialization output
if !Quiet && !IsRestart() {
var checkedFdLimit bool
......@@ -192,7 +188,7 @@ func startServers(groupings bindingGroup) error {
errChan := make(chan error, len(groupings)) // must be buffered to allow Serve functions below to return if stopped later
for _, group := range groupings {
s, err := server.New(group.BindAddr.String(), group.Configs)
s, err := server.New(group.BindAddr.String(), group.Configs, GracefulTimeout)
if err != nil {
return err
}
......@@ -215,7 +211,8 @@ func startServers(groupings bindingGroup) error {
return errors.New("listener for " + s.Addr + " was not a ListenerFile")
}
delete(loadedGob.ListenerFds, s.Addr) // mark it as used
file.Close()
delete(loadedGob.ListenerFds, s.Addr)
}
}
......@@ -252,6 +249,14 @@ func startServers(groupings bindingGroup) error {
serversMu.Unlock()
}
// Close the remaining (unused) file descriptors to free up resources
if IsRestart() {
for key, fdIndex := range loadedGob.ListenerFds {
os.NewFile(fdIndex, "").Close()
delete(loadedGob.ListenerFds, key)
}
}
// Wait for all servers to finish starting
startupWg.Wait()
......@@ -276,7 +281,9 @@ func Stop() error {
serversMu.Lock()
for _, s := range servers {
s.Stop() // TODO: error checking/reporting?
if err := s.Stop(); err != nil {
log.Printf("[ERROR] Stopping %s: %v", s.Addr, err)
}
}
servers = []*server.Server{} // don't reuse servers
serversMu.Unlock()
......@@ -289,12 +296,13 @@ func Wait() {
wg.Wait()
}
// LoadCaddyfile loads a Caddyfile in a way that prioritizes
// reading from stdin pipe; otherwise it calls loader to load
// the Caddyfile. If loader does not return a Caddyfile, the
// default one will be returned. Thus, if there are no other
// errors, this function always returns at least the default
// Caddyfile (not the previously-used Caddyfile).
// LoadCaddyfile loads a Caddyfile, prioritizing a Caddyfile
// piped from stdin as part of a restart (only happens on first call
// to LoadCaddyfile). If it is not a restart, this function tries
// calling the user's loader function, and if that returns nil, then
// this function resorts to the default configuration. Thus, if there
// are no other errors, this function always returns at least the
// default Caddyfile.
func LoadCaddyfile(loader func() (Input, error)) (cdyfile Input, err error) {
// If we are a fork, finishing the restart is highest priority;
// piped input is required in this case.
......
......@@ -10,6 +10,7 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"github.com/mholt/caddy/caddy/letsencrypt"
)
......@@ -44,7 +45,9 @@ func checkFdlimit() {
// If this process is not a restart, this function does nothing.
// Calling this function once this process has successfully initialized
// is vital so that the parent process can unblock and kill itself.
// This function is idempotent; it executes at most once per process.
func signalSuccessToParent() {
signalParentOnce.Do(func() {
if IsRestart() {
ppipe := os.NewFile(3, "") // parent is reading from pipe at index 3
_, err := ppipe.Write([]byte("success")) // we must send some bytes to the parent
......@@ -53,8 +56,17 @@ func signalSuccessToParent() {
}
ppipe.Close()
}
})
}
// signalParentOnce is used to make sure that the parent is only
// signaled once; doing so more than once breaks whatever socket is
// at fd 4 (the reason for this is still unclear - to reproduce,
// call Stop() and Start() in succession at least once after a
// restart, then try loading first host of Caddyfile in the browser).
// Do not use this directly - call signalSuccessToParent instead.
var signalParentOnce sync.Once
// caddyfileGob maps bind address to index of the file descriptor
// in the Files array passed to the child process. It also contains
// the caddyfile contents. Used only during graceful restarts.
......
......@@ -84,6 +84,11 @@ func Restart(newCaddyfile Input) error {
return err
}
// Immediately close our dup'ed fds and the write end of our signal pipe
for _, f := range extraFiles {
f.Close()
}
// Feed Caddyfile to the child
err = gob.NewEncoder(wpipe).Encode(cdyfileGob)
if err != nil {
......@@ -92,7 +97,6 @@ func Restart(newCaddyfile Input) error {
wpipe.Close()
// Determine whether child startup succeeded
sigwpipe.Close() // close our copy of the write end of the pipe -- TODO: why?
answer, readErr := ioutil.ReadAll(sigrpipe)
if answer == nil || len(answer) == 0 {
cmdErr := cmd.Wait() // get exit status
......@@ -103,6 +107,6 @@ func Restart(newCaddyfile Input) error {
return errIncompleteRestart
}
// Looks like child was successful; we can exit gracefully.
// Looks like child is successful; we can exit gracefully.
return Stop()
}
......@@ -10,8 +10,11 @@ CHANGES
- New -ca flag to customize ACME CA server URL
- New -revoke flag to revoke a certificate
- New -log flag to enable process log
- New -pidfile flag to enable writing pidfile
- New -grace flag to customize the graceful shutdown timeout
- browse: Render filenames with multiple whitespace properly
- markdown: Include Last-Modified header in response
- markdown: Render tables, strikethrough, and fenced code blocks
- startup, shutdown: Better Windows support
- templates: Bug fix for .Host when port is absent
- templates: Include Last-Modified header in response
......
......@@ -10,6 +10,7 @@ import (
"runtime"
"strconv"
"strings"
"time"
"github.com/mholt/caddy/caddy"
"github.com/mholt/caddy/caddy/letsencrypt"
......@@ -34,6 +35,7 @@ func init() {
flag.StringVar(&conf, "conf", "", "Configuration file to use (default="+caddy.DefaultConfigFile+")")
flag.StringVar(&cpu, "cpu", "100%", "CPU cap")
flag.StringVar(&letsencrypt.DefaultEmail, "email", "", "Default Let's Encrypt account email address")
flag.DurationVar(&caddy.GracefulTimeout, "grace", 5*time.Second, "Maximum duration of graceful shutdown")
flag.StringVar(&caddy.Host, "host", caddy.DefaultHost, "Default host")
flag.BoolVar(&caddy.HTTP2, "http2", true, "HTTP/2 support") // TODO: temporary flag until http2 merged into std lib
flag.StringVar(&logfile, "log", "", "Process log file")
......
......@@ -2,7 +2,6 @@ package server
import (
"net"
"os"
"sync"
"syscall"
)
......@@ -13,7 +12,9 @@ func newGracefulListener(l ListenerFile, wg *sync.WaitGroup) *gracefulListener {
gl := &gracefulListener{ListenerFile: l, stop: make(chan error), httpWg: wg}
go func() {
<-gl.stop
gl.Lock()
gl.stopped = true
gl.Unlock()
gl.stop <- gl.ListenerFile.Close()
}()
return gl
......@@ -26,10 +27,11 @@ type gracefulListener struct {
ListenerFile
stop chan error
stopped bool
sync.Mutex // protects the stopped flag
httpWg *sync.WaitGroup // pointer to the host's wg used for counting connections
}
// Accept accepts a connection. This type wraps
// Accept accepts a connection.
func (gl *gracefulListener) Accept() (c net.Conn, err error) {
c, err = gl.ListenerFile.Accept()
if err != nil {
......@@ -42,18 +44,16 @@ func (gl *gracefulListener) Accept() (c net.Conn, err error) {
// Close immediately closes the listener.
func (gl *gracefulListener) Close() error {
gl.Lock()
if gl.stopped {
gl.Unlock()
return syscall.EINVAL
}
gl.Unlock()
gl.stop <- nil
return <-gl.stop
}
// File implements ListenerFile; it gets the file of the listening socket.
func (gl *gracefulListener) File() (*os.File, error) {
return gl.ListenerFile.File()
}
// gracefulConn represents a connection on a
// gracefulListener so that we can keep track
// of the number of connections, thus facilitating
......
......@@ -8,7 +8,6 @@ import (
"crypto/x509"
"fmt"
"io/ioutil"
"log"
"net"
"net/http"
"os"
......@@ -33,6 +32,7 @@ type Server struct {
listenerMu sync.Mutex // protects listener
httpWg sync.WaitGroup // used to wait on outstanding connections
startChan chan struct{} // used to block until server is finished starting
connTimeout time.Duration // the maximum duration of a graceful shutdown
}
// ListenerFile represents a listener.
......@@ -42,14 +42,17 @@ type ListenerFile interface {
}
// New creates a new Server which will bind to addr and serve
// the sites/hosts configured in configs. This function does
// not start serving.
// the sites/hosts configured in configs. Its listener will
// gracefully close when the server is stopped which will take
// no longer than gracefulTimeout.
//
// This function does not start serving.
//
// Do not re-use a server (start, stop, then start again). We
// could probably add more locking to make this possible, but
// as it stands, you should dispose of a server after stopping it.
// The behavior of serving with a spent server is undefined.
func New(addr string, configs []Config) (*Server, error) {
func New(addr string, configs []Config, gracefulTimeout time.Duration) (*Server, error) {
var tls bool
if len(configs) > 0 {
tls = configs[0].TLS.Enabled
......@@ -66,6 +69,7 @@ func New(addr string, configs []Config) (*Server, error) {
tls: tls,
vhosts: make(map[string]virtualHost),
startChan: make(chan struct{}),
connTimeout: gracefulTimeout,
}
s.Handler = s // this is weird, but whatever
......@@ -241,7 +245,7 @@ func serveTLSWithSNI(s *Server, ln net.Listener, tlsConfigs []TLSConfig) error {
// connections to close (up to a max timeout of a few
// seconds); on Windows it will close the listener
// immediately.
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
s.Server.SetKeepAlivesEnabled(false)
if runtime.GOOS != "windows" {
......@@ -256,20 +260,19 @@ func (s *Server) Stop() error {
// Wait for remaining connections to finish or
// force them all to close after timeout
select {
case <-time.After(5 * time.Second): // TODO: make configurable
case <-time.After(s.connTimeout):
case <-done:
}
}
// Close the listener now; this stops the server without delay
s.listenerMu.Lock()
err := s.listener.Close()
s.listenerMu.Unlock()
if err != nil {
log.Printf("[ERROR] Closing listener for %s: %v", s.Addr, err)
if s.listener != nil {
err = s.listener.Close()
}
s.listenerMu.Unlock()
return err
return
}
// WaitUntilStarted blocks until the server s is started, meaning
......@@ -279,12 +282,17 @@ func (s *Server) WaitUntilStarted() {
<-s.startChan
}
// ListenerFd gets the file descriptor of the listener.
// ListenerFd gets a dup'ed file of the listener. If there
// is no underlying file, the return value will be nil. It
// is the caller's responsibility to close the file.
func (s *Server) ListenerFd() *os.File {
s.listenerMu.Lock()
defer s.listenerMu.Unlock()
if s.listener != nil {
file, _ := s.listener.File()
return file
}
return nil
}
// ServeHTTP is the entry point for every request to the address that s
......@@ -370,7 +378,7 @@ func setupClientAuth(tlsConfigs []TLSConfig, config *tls.Config) error {
// RunFirstStartupFuncs runs all of the server's FirstStartup
// callback functions unless one of them returns an error first.
// It is up the caller's responsibility to call this only once and
// It is the caller's responsibility to call this only once and
// at the correct time. The functions here should not be executed
// at restarts or where the user does not explicitly start a new
// instance of the server.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment