Skip to content

Commit 4eb3147

Browse files
Merge pull request containerd#317 from mlaventure/handle-exec-clingy-children
Handle exec clingy children
2 parents 578bf0c + 600b4d1 commit 4eb3147

File tree

6 files changed

+75
-54
lines changed

6 files changed

+75
-54
lines changed

containerd-shim/main.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ func start(log *os.File) error {
106106
case s := <-signals:
107107
switch s {
108108
case syscall.SIGCHLD:
109-
exits, _ := osutils.Reap()
109+
exits, _ := osutils.Reap(false)
110110
for _, e := range exits {
111111
// check to see if runtime is one of the processes that has exited
112112
if e.Pid == p.pid() {
@@ -117,6 +117,9 @@ func start(log *os.File) error {
117117
}
118118
// runtime has exited so the shim can also exit
119119
if exitShim {
120+
// Wait for all the childs this process may have created
121+
// (only needed for exec, but it won't hurt when done on init)
122+
osutils.Reap(true)
120123
// Let containerd take care of calling the runtime delete
121124
f.Close()
122125
p.Wait()

containerd/main.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ import (
2323
"github.com/docker/containerd/api/grpc/server"
2424
"github.com/docker/containerd/api/grpc/types"
2525
"github.com/docker/containerd/api/http/pprof"
26-
"github.com/docker/containerd/osutils"
2726
"github.com/docker/containerd/supervisor"
2827
"github.com/docker/docker/pkg/listeners"
2928
"github.com/rcrowley/go-metrics"
@@ -160,7 +159,6 @@ func main() {
160159
func daemon(context *cli.Context) error {
161160
s := make(chan os.Signal, 2048)
162161
signal.Notify(s, syscall.SIGTERM, syscall.SIGINT)
163-
osutils.SetSubreaper(1)
164162
sv, err := supervisor.New(
165163
context.String("state-dir"),
166164
context.String("runtime"),

osutils/reaper.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@ type Exit struct {
1212

1313
// Reap reaps all child processes for the calling process and returns their
1414
// exit information
15-
func Reap() (exits []Exit, err error) {
15+
func Reap(wait bool) (exits []Exit, err error) {
1616
var (
1717
ws syscall.WaitStatus
1818
rus syscall.Rusage
1919
)
20+
flag := syscall.WNOHANG
21+
if wait {
22+
flag = 0
23+
}
2024
for {
21-
pid, err := syscall.Wait4(-1, &ws, syscall.WNOHANG, &rus)
25+
pid, err := syscall.Wait4(-1, &ws, flag, &rus)
2226
if err != nil {
2327
if err == syscall.ECHILD {
2428
return exits, nil

runtime/container.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/Sirupsen/logrus"
1515
"github.com/docker/containerd/specs"
1616
ocs "github.com/opencontainers/runtime-spec/specs-go"
17+
"golang.org/x/sys/unix"
1718
)
1819

1920
// Container defines the operations allowed on a container
@@ -480,12 +481,33 @@ func (c *container) createCmd(pid string, cmd *exec.Cmd, p *process) error {
480481
}
481482
return err
482483
}
483-
go func() {
484-
err := p.cmd.Wait()
485-
if err == nil {
486-
p.cmdSuccess = true
487-
}
488-
close(p.cmdDoneCh)
484+
// We need the pid file to have been written to run
485+
defer func() {
486+
go func() {
487+
err := p.cmd.Wait()
488+
if err == nil {
489+
p.cmdSuccess = true
490+
}
491+
492+
if same, err := p.isSameProcess(); same && p.pid > 0 {
493+
// The process changed its PR_SET_PDEATHSIG, so force
494+
// kill it
495+
logrus.Infof("containerd: %s:%s (pid %v) has become an orphan, killing it", p.container.id, p.id, p.pid)
496+
err = unix.Kill(p.pid, syscall.SIGKILL)
497+
if err != nil && err != syscall.ESRCH {
498+
logrus.Errorf("containerd: unable to SIGKILL %s:%s (pid %v): %v", p.container.id, p.id, p.pid, err)
499+
} else {
500+
for {
501+
err = unix.Kill(p.pid, 0)
502+
if err != nil {
503+
break
504+
}
505+
time.Sleep(5 * time.Millisecond)
506+
}
507+
}
508+
}
509+
close(p.cmdDoneCh)
510+
}()
489511
}()
490512
if err := c.waitForCreate(p, cmd); err != nil {
491513
return err

runtime/process.go

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -228,24 +228,31 @@ func (p *process) Resize(w, h int) error {
228228
return err
229229
}
230230

231+
func (p *process) updateExitStatusFile(status int) (int, error) {
232+
p.stateLock.Lock()
233+
p.state = Stopped
234+
p.stateLock.Unlock()
235+
err := ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte(fmt.Sprintf("%d", status)), 0644)
236+
return status, err
237+
}
238+
231239
func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
232240
if p.cmd == nil || p.cmd.Process == nil {
233241
e := unix.Kill(p.pid, 0)
234242
if e == syscall.ESRCH {
235-
return rst, rerr
243+
logrus.Warnf("containerd: %s:%s (pid %d) does not exist", p.container.id, p.id, p.pid)
244+
// The process died while containerd was down (probably of
245+
// SIGKILL, but no way to be sure)
246+
return p.updateExitStatusFile(255)
236247
}
237248

238249
// If it's not the same process, just mark it stopped and set
239250
// the status to 255
240251
if same, err := p.isSameProcess(); !same {
241252
logrus.Warnf("containerd: %s:%s (pid %d) is not the same process anymore (%v)", p.container.id, p.id, p.pid, err)
242-
p.stateLock.Lock()
243-
p.state = Stopped
244-
p.stateLock.Unlock()
245253
// Create the file so we get the exit event generated once monitor kicks in
246-
// without going to this all process again
247-
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte("255"), 0644)
248-
return 255, nil
254+
// without having to go through all this process again
255+
return p.updateExitStatusFile(255)
249256
}
250257

251258
ppid, err := readProcStatField(p.pid, 4)
@@ -255,19 +262,21 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
255262
if ppid == "1" {
256263
logrus.Warnf("containerd: %s:%s shim died, killing associated process", p.container.id, p.id)
257264
unix.Kill(p.pid, syscall.SIGKILL)
265+
if err != nil && err != syscall.ESRCH {
266+
return 255, fmt.Errorf("containerd: unable to SIGKILL %s:%s (pid %v): %v", p.container.id, p.id, p.pid, err)
267+
}
268+
258269
// wait for the process to die
259270
for {
260271
e := unix.Kill(p.pid, 0)
261272
if e == syscall.ESRCH {
262273
break
263274
}
264-
time.Sleep(10 * time.Millisecond)
275+
time.Sleep(5 * time.Millisecond)
265276
}
266-
267-
rst = 128 + int(syscall.SIGKILL)
268277
// Create the file so we get the exit event generated once monitor kicks in
269-
// without going to this all process again
270-
rerr = ioutil.WriteFile(filepath.Join(p.root, ExitStatusFile), []byte(fmt.Sprintf("%d", rst)), 0644)
278+
// without having to go through all this process again
279+
return p.updateExitStatusFile(128 + int(syscall.SIGKILL))
271280
}
272281

273282
return rst, rerr
@@ -286,29 +295,8 @@ func (p *process) handleSigkilledShim(rst int, rerr error) (int, error) {
286295
if shimStatus.Signaled() && shimStatus.Signal() == syscall.SIGKILL {
287296
logrus.Debugf("containerd: ExitStatus(container: %s, process: %s): shim was SIGKILL'ed reaping its child with pid %d", p.container.id, p.id, p.pid)
288297

289-
var (
290-
status unix.WaitStatus
291-
rusage unix.Rusage
292-
wpid int
293-
)
294-
295-
// Some processes change their PR_SET_PDEATHSIG, so force kill them
296-
unix.Kill(p.pid, syscall.SIGKILL)
297-
298-
for wpid == 0 {
299-
wpid, e = unix.Wait4(p.pid, &status, unix.WNOHANG, &rusage)
300-
if e != nil {
301-
logrus.Debugf("containerd: ExitStatus(container: %s, process: %s): Wait4(%d): %v", p.container.id, p.id, p.pid, rerr)
302-
return rst, rerr
303-
}
304-
}
305-
306-
if wpid == p.pid {
307-
rerr = nil
308-
rst = 128 + int(shimStatus.Signal())
309-
} else {
310-
logrus.Errorf("containerd: ExitStatus(container: %s, process: %s): unexpected returned pid from wait4 %v (expected %v)", p.container.id, p.id, wpid, p.pid)
311-
}
298+
rerr = nil
299+
rst = 128 + int(shimStatus.Signal())
312300

313301
p.stateLock.Lock()
314302
p.state = Stopped

supervisor/exit.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,19 @@ func (s *Supervisor) execExit(t *ExecExitTask) error {
7373
if err := container.RemoveProcess(t.PID); err != nil {
7474
logrus.WithField("error", err).Error("containerd: find container for pid")
7575
}
76-
t.Process.Wait()
77-
s.notifySubscribers(Event{
78-
Timestamp: time.Now(),
79-
ID: t.ID,
80-
Type: StateExit,
81-
PID: t.PID,
82-
Status: t.Status,
83-
})
76+
// If the exec spawned children which are still using its IO
77+
// waiting here will block until they die or close their IO
78+
// descriptors.
79+
// Hence, we use a go routine to avoid block all other operations
80+
go func() {
81+
t.Process.Wait()
82+
s.notifySubscribers(Event{
83+
Timestamp: time.Now(),
84+
ID: t.ID,
85+
Type: StateExit,
86+
PID: t.PID,
87+
Status: t.Status,
88+
})
89+
}()
8490
return nil
8591
}

0 commit comments

Comments
 (0)