Skip to content

Commit 2dce79e

Browse files
committed
Wait for discovery on container start error
This gives discovery a chance to initialize, particularly if the K/V store being used is in a container. Signed-off-by: Brian Goff <cpuguy83@gmail.com>
1 parent 16d0a89 commit 2dce79e

File tree

3 files changed

+103
-14
lines changed

3 files changed

+103
-14
lines changed

daemon/daemon.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,9 @@ func (daemon *Daemon) restore() error {
380380
}
381381
}
382382
}
383+
384+
// Make sure networks are available before starting
385+
daemon.waitForNetworks(c)
383386
if err := daemon.containerStart(c); err != nil {
384387
logrus.Errorf("Failed to start container %s: %s", c.ID, err)
385388
}
@@ -423,6 +426,33 @@ func (daemon *Daemon) restore() error {
423426
return nil
424427
}
425428

429+
// waitForNetworks is used during daemon initialization when starting up containers
430+
// It ensures that all of a container's networks are available before the daemon tries to start the container.
431+
// In practice it just makes sure the discovery service is available for containers which use a network that require discovery.
432+
func (daemon *Daemon) waitForNetworks(c *container.Container) {
433+
if daemon.discoveryWatcher == nil {
434+
return
435+
}
436+
// Make sure if the container has a network that requires discovery that the discovery service is available before starting
437+
for netName := range c.NetworkSettings.Networks {
438+
// If we get `ErrNoSuchNetwork` here, it can assumed that it is due to discovery not being ready
439+
// Most likely this is because the K/V store used for discovery is in a container and needs to be started
440+
if _, err := daemon.netController.NetworkByName(netName); err != nil {
441+
if _, ok := err.(libnetwork.ErrNoSuchNetwork); !ok {
442+
continue
443+
}
444+
// use a longish timeout here due to some slowdowns in libnetwork if the k/v store is on anything other than --net=host
445+
// FIXME: why is this slow???
446+
logrus.Debugf("Container %s waiting for network to be ready", c.Name)
447+
select {
448+
case <-daemon.discoveryWatcher.ReadyCh():
449+
case <-time.After(60 * time.Second):
450+
}
451+
return
452+
}
453+
}
454+
}
455+
426456
func (daemon *Daemon) mergeAndVerifyConfig(config *containertypes.Config, img *image.Image) error {
427457
if img != nil && img.Config != nil {
428458
if err := merge(config, img.Config); err != nil {

daemon/daemon_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,12 @@ func TestDaemonDiscoveryReload(t *testing.T) {
381381
&discovery.Entry{Host: "127.0.0.1", Port: "3333"},
382382
}
383383

384+
select {
385+
case <-time.After(10 * time.Second):
386+
t.Fatal("timeout waiting for discovery")
387+
case <-daemon.discoveryWatcher.ReadyCh():
388+
}
389+
384390
stopCh := make(chan struct{})
385391
defer close(stopCh)
386392
ch, errCh := daemon.discoveryWatcher.Watch(stopCh)
@@ -414,6 +420,13 @@ func TestDaemonDiscoveryReload(t *testing.T) {
414420
if err := daemon.Reload(newConfig); err != nil {
415421
t.Fatal(err)
416422
}
423+
424+
select {
425+
case <-time.After(10 * time.Second):
426+
t.Fatal("timeout waiting for discovery")
427+
case <-daemon.discoveryWatcher.ReadyCh():
428+
}
429+
417430
ch, errCh = daemon.discoveryWatcher.Watch(stopCh)
418431

419432
select {
@@ -450,6 +463,13 @@ func TestDaemonDiscoveryReloadFromEmptyDiscovery(t *testing.T) {
450463
if err := daemon.Reload(newConfig); err != nil {
451464
t.Fatal(err)
452465
}
466+
467+
select {
468+
case <-time.After(10 * time.Second):
469+
t.Fatal("timeout waiting for discovery")
470+
case <-daemon.discoveryWatcher.ReadyCh():
471+
}
472+
453473
stopCh := make(chan struct{})
454474
defer close(stopCh)
455475
ch, errCh := daemon.discoveryWatcher.Watch(stopCh)
@@ -488,6 +508,12 @@ func TestDaemonDiscoveryReloadOnlyClusterAdvertise(t *testing.T) {
488508
if err := daemon.Reload(newConfig); err != nil {
489509
t.Fatal(err)
490510
}
511+
512+
select {
513+
case <-daemon.discoveryWatcher.ReadyCh():
514+
case <-time.After(10 * time.Second):
515+
t.Fatal("Timeout waiting for discovery")
516+
}
491517
stopCh := make(chan struct{})
492518
defer close(stopCh)
493519
ch, errCh := daemon.discoveryWatcher.Watch(stopCh)

daemon/discovery.go

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,24 @@ type discoveryReloader interface {
2727
discovery.Watcher
2828
Stop()
2929
Reload(backend, address string, clusterOpts map[string]string) error
30+
ReadyCh() <-chan struct{}
3031
}
3132

3233
// daemonDiscoveryReloader advertises an address to a discovery backend through
// periodic registration heartbeats and forwards Watch calls to that backend.
type daemonDiscoveryReloader struct {
3334
backend discovery.Backend // backend that Watch delegates to and that heartbeats register against
3435
ticker *time.Ticker // paces the periodic registration heartbeat
3536
term chan bool // a receive on this channel makes the heartbeat goroutine return
37+
readyCh chan struct{} // closed after the first successful registration
3638
}
3739

3840
// Watch delegates to the Watch method of the underlying discovery backend,
// passing stopCh through unchanged.
func (d *daemonDiscoveryReloader) Watch(stopCh <-chan struct{}) (<-chan discovery.Entries, <-chan error) {
4041
return d.backend.Watch(stopCh)
4142
}
4143

44+
// ReadyCh returns the channel that is closed once a registration against the
// discovery backend has succeeded. Reload installs a fresh channel, so callers
// should fetch it again after a reload.
func (d *daemonDiscoveryReloader) ReadyCh() <-chan struct{} {
45+
return d.readyCh
46+
}
47+
4248
func discoveryOpts(clusterOpts map[string]string) (time.Duration, time.Duration, error) {
4349
var (
4450
heartbeat = defaultDiscoveryHeartbeat
@@ -87,38 +93,64 @@ func initDiscovery(backendAddress, advertiseAddress string, clusterOpts map[stri
8793
backend: backend,
8894
ticker: time.NewTicker(heartbeat),
8995
term: make(chan bool),
96+
readyCh: make(chan struct{}),
9097
}
9198
// We call Register() on the discovery backend in a loop for the whole lifetime of the daemon,
9299
// but we never actually Watch() for nodes appearing and disappearing for the moment.
93-
reloader.advertise(advertiseAddress)
100+
go reloader.advertiseHeartbeat(advertiseAddress)
94101
return reloader, nil
95102
}
96103

97-
func (d *daemonDiscoveryReloader) advertise(address string) {
98-
d.registerAddr(address)
99-
go d.advertiseHeartbeat(address)
100-
}
101-
102-
func (d *daemonDiscoveryReloader) registerAddr(addr string) {
103-
if err := d.backend.Register(addr); err != nil {
104-
log.Warnf("Registering as %q in discovery failed: %v", addr, err)
105-
}
106-
}
107-
108104
// advertiseHeartbeat registers the current node against the discovery backend using the specified
109105
// address. The function never returns, as registration against the backend comes with a TTL and
110106
// requires regular heartbeats.
111107
func (d *daemonDiscoveryReloader) advertiseHeartbeat(address string) {
108+
var ready bool
109+
if err := d.initHeartbeat(address); err == nil {
110+
ready = true
111+
close(d.readyCh)
112+
}
113+
112114
for {
113115
select {
114116
case <-d.ticker.C:
115-
d.registerAddr(address)
117+
if err := d.backend.Register(address); err != nil {
118+
log.Warnf("Registering as %q in discovery failed: %v", address, err)
119+
} else {
120+
if !ready {
121+
close(d.readyCh)
122+
ready = true
123+
}
124+
}
116125
case <-d.term:
117126
return
118127
}
119128
}
120129
}
121130

131+
// initHeartbeat is used to do the first heartbeat. It uses a tight loop until
132+
// either the timeout period is reached or the heartbeat is successful and returns.
133+
func (d *daemonDiscoveryReloader) initHeartbeat(address string) error {
134+
// Setup a short ticker until the first heartbeat has succeeded
135+
t := time.NewTicker(500 * time.Millisecond)
136+
defer t.Stop()
137+
// timeout makes sure that after a period of time we stop being so aggressive trying to reach the discovery service
138+
timeout := time.After(60 * time.Second)
139+
140+
for {
141+
select {
142+
case <-timeout:
143+
return errors.New("timeout waiting for initial discovery")
144+
case <-d.term:
145+
return errors.New("terminated")
146+
case <-t.C:
147+
if err := d.backend.Register(address); err == nil {
148+
return nil
149+
}
150+
}
151+
}
152+
}
153+
122154
// Reload makes the watcher stop advertising and reconfigures it to advertise a new address.
123155
func (d *daemonDiscoveryReloader) Reload(backendAddress, advertiseAddress string, clusterOpts map[string]string) error {
124156
d.Stop()
@@ -130,8 +162,9 @@ func (d *daemonDiscoveryReloader) Reload(backendAddress, advertiseAddress string
130162

131163
d.backend = backend
132164
d.ticker = time.NewTicker(heartbeat)
165+
d.readyCh = make(chan struct{})
133166

134-
d.advertise(advertiseAddress)
167+
go d.advertiseHeartbeat(advertiseAddress)
135168
return nil
136169
}
137170

0 commit comments

Comments
 (0)