Skip to content

Commit 8351cea

Browse files
committed
execute: support syscall filtering using seccomp filters
1 parent cd96b3b commit 8351cea

19 files changed

+517
-14
lines changed

Makefile.am

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -961,11 +961,15 @@ libsystemd_core_la_SOURCES = \
961961
src/core/switch-root.h \
962962
src/core/switch-root.c \
963963
src/core/killall.h \
964-
src/core/killall.c
964+
src/core/killall.c \
965+
src/core/syscall-list.c \
966+
src/core/syscall-list.h
965967

966968
nodist_libsystemd_core_la_SOURCES = \
967969
src/core/load-fragment-gperf.c \
968-
src/core/load-fragment-gperf-nulstr.c
970+
src/core/load-fragment-gperf-nulstr.c \
971+
src/core/syscall-from-name.h \
972+
src/core/syscall-to-name.h
969973

970974
libsystemd_core_la_CFLAGS = \
971975
$(AM_CFLAGS) \
@@ -998,7 +1002,23 @@ EXTRA_DIST += \
9981002
CLEANFILES += \
9991003
src/core/load-fragment-gperf.gperf \
10001004
src/core/load-fragment-gperf.c \
1001-
src/core/load-fragment-gperf-nulstr.c
1005+
src/core/load-fragment-gperf-nulstr.c \
1006+
src/core/syscall-list.txt \
1007+
src/core/syscall-from-name.gperf \
1008+
src/core/syscall-from-name.h \
1009+
src/core/syscall-to-name.h
1010+
1011+
src/core/syscall-list.txt: Makefile
1012+
# If generation fails, remove the partial output AND fail the rule; a bare "|| rm" would exit 0 and hide the error.
$(AM_V_GEN)cpp -dM -include sys/syscall.h < /dev/null | $(AWK) '/^#define[ \t]+__NR_[^ ]+[ \t]+[0-9]/ { sub(/__NR_/, "", $$2); print $$2; }' > $@ || { rm -f $@; exit 1; }
1013+
1014+
src/core/syscall-from-name.gperf: src/core/syscall-list.txt Makefile
1015+
$(AM_V_GEN)$(AWK) 'BEGIN{ print "struct syscall_name { const char* name; int id; };"; print "%null-strings"; print "%%";} { printf "%s, __NR_%s\n", $$1, $$1 }' < $< > $@
1016+
1017+
src/core/syscall-from-name.h: src/core/syscall-from-name.gperf Makefile
1018+
$(AM_V_GEN)$(GPERF) -L ANSI-C -t --ignore-case -N lookup_syscall -H hash_syscall_name -p -C < $< > $@
1019+
1020+
src/core/syscall-to-name.h: src/core/syscall-list.txt Makefile
1021+
$(AM_V_GEN)$(AWK) 'BEGIN{ print "const char* const syscall_names[] = { "} { printf "[__NR_%s] = \"%s\",\n", $$1, $$1 } END{print "};"}' < $< > $@
10021022

10031023
# ------------------------------------------------------------------------------
10041024
systemd_SOURCES = \

TODO

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,13 @@ Bugfixes:
3333
Jul 09 18:22:37 mop [21866]: Process 21865 (systemd) dumped core.
3434

3535
Features:
36+
37+
* use cpp -dM for key mapping too?
38+
3639
* change mount access mode of 0700 or so for debugfs?
3740

3841
* logind: wakelock/opportunistic suspend support
3942

40-
* seccomp filters for services
41-
4243
* switch-root: sockets need relabelling
4344

4445
* segfault in journalctl during /var migration
@@ -60,8 +61,6 @@ Features:
6061
* load-fragment: when loading a unit file via a chain of symlinks
6162
verify that it isn't masked via any of the names traversed.
6263

63-
* journald: _BOOT_ID triggers too many collisions.
64-
6564
* journald: we currently rotate only after MaxUse+MaxFilesize has been reached.
6665

6766
* nspawn: bind mount /var/log/journal from the host
@@ -236,8 +235,6 @@ Features:
236235

237236
* write RPM spec macros for presets
238237

239-
* journal: extend hash tables as we go
240-
241238
* journal: API for looking for retrieving "all values of this field"
242239

243240
* journal: deal nicely with byte-by-byte copied files, especially regards header

man/systemd.exec.xml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1091,6 +1091,54 @@
10911091
shell pipelines.</para></listitem>
10921092
</varlistentry>
10931093

1094+
<varlistentry>
1095+
<term><varname>NoNewPrivileges=</varname></term>
1096+
1097+
<listitem><para>Takes a boolean
1098+
argument. If true, ensures that the
1099+
service process and all its children
1100+
can never gain new privileges. This
1101+
option is more powerful than the respective
1102+
secure bits flags (see above), as it
1103+
also prohibits UID changes of any
1104+
kind. This is the simplest, most
1105+
effective way to ensure that a process
1106+
and its children can never elevate
1107+
privileges again.</para></listitem>
1108+
</varlistentry>
1109+
1110+
<varlistentry>
1111+
<term><varname>SystemCallFilter=</varname></term>
1112+
1113+
<listitem><para>Takes a space
1114+
separated list of system call
1115+
names. If this setting is used, all
1116+
system calls executed by the unit
1117+
process except for the listed ones
1118+
will result in immediate process
1119+
termination with the SIGSYS signal
1120+
(whitelisting). If the first character
1121+
of the list is <literal>~</literal>
1122+
the effect is inverted: only the
1123+
listed system calls will result in
1124+
immediate process termination
1125+
(blacklisting). If this option is used,
1126+
<varname>NoNewPrivileges=yes</varname>
1127+
is implied. This feature makes use of
1128+
the Secure Computing Mode 2 interfaces
1129+
of the kernel ('seccomp filtering')
1130+
and is useful for enforcing a minimal
1131+
sandboxing environment. Note that the
1132+
<function>execve</function>,
1133+
<function>rt_sigreturn</function>,
1134+
<function>sigreturn</function>,
1135+
<function>exit_group</function>,
1136+
<function>exit</function> system calls
1137+
are implicitly whitelisted and don't
1138+
need to be listed
1139+
explicitly.</para></listitem>
1140+
</varlistentry>
1141+
10941142
</variablelist>
10951143
</refsect1>
10961144

src/core/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
1+
/syscall-from-name.gperf
2+
/syscall-from-name.h
3+
/syscall-list.txt
4+
/syscall-to-name.h
15
/macros.systemd
26
/systemd.pc

src/core/dbus-execute.c

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "ioprio.h"
2929
#include "strv.h"
3030
#include "dbus-common.h"
31+
#include "syscall-list.h"
3132

3233
DEFINE_BUS_PROPERTY_APPEND_ENUM(bus_execute_append_kill_mode, kill_mode, KillMode);
3334

@@ -348,6 +349,32 @@ int bus_execute_append_command(DBusMessageIter *i, const char *property, void *d
348349
return 0;
349350
}
350351

352+
/* D-Bus property getter for "SystemCallFilter": appends the ExecContext's
 * syscall bitmap to the message as an array of uint32 ("au").
 *
 * i        - iterator positioned where the property value is to be written
 * property - property name (only asserted to be non-NULL)
 * data     - the ExecContext whose syscall_filter is exported
 *
 * An unset filter (NULL) is exported as an empty array.
 *
 * Returns 0 on success, -ENOMEM if any of the libdbus append calls fails. */
int bus_execute_append_syscall_filter(DBusMessageIter *i, const char *property, void *data) {
        ExecContext *c = data;
        DBusMessageIter sub;
        int n_words;

        assert(i);
        assert(property);
        assert(c);

        if (!dbus_message_iter_open_container(i, DBUS_TYPE_ARRAY, "u", &sub))
                return -ENOMEM;

        /* NOTE(review): the word count uses ">> 4", which matches the
         * "i >> 4" word indexing apply_seccomp() uses on this bitmap.
         * Confirm against the code that allocates the bitmap before
         * changing it. */
        n_words = c->syscall_filter ? (syscall_max() + 31) >> 4 : 0;

        /* libdbus expects the address of the array pointer here, not the
         * array itself. */
        if (!dbus_message_iter_append_fixed_array(&sub, DBUS_TYPE_UINT32, &c->syscall_filter, n_words)) {
                /* Release what open_container set up; otherwise the
                 * half-built container is leaked on this error path. */
                dbus_message_iter_abandon_container(i, &sub);
                return -ENOMEM;
        }

        if (!dbus_message_iter_close_container(i, &sub))
                return -ENOMEM;

        return 0;
}
377+
351378
const BusProperty bus_exec_context_properties[] = {
352379
{ "Environment", bus_property_append_strv, "as", offsetof(ExecContext, environment), true },
353380
{ "EnvironmentFiles", bus_execute_append_env_files, "a(sb)", offsetof(ExecContext, environment_files), true },
@@ -409,6 +436,8 @@ const BusProperty bus_exec_context_properties[] = {
409436
{ "UtmpIdentifier", bus_property_append_string, "s", offsetof(ExecContext, utmp_id), true },
410437
{ "ControlGroupModify", bus_property_append_bool, "b", offsetof(ExecContext, control_group_modify) },
411438
{ "ControlGroupPersistent", bus_property_append_tristate_false, "b", offsetof(ExecContext, control_group_persistent) },
412-
{ "IgnoreSIGPIPE", bus_property_append_bool, "b", offsetof(ExecContext, ignore_sigpipe ) },
439+
{ "IgnoreSIGPIPE", bus_property_append_bool, "b", offsetof(ExecContext, ignore_sigpipe) },
440+
{ "NoNewPrivileges", bus_property_append_bool, "b", offsetof(ExecContext, no_new_privileges) },
441+
{ "SystemCallFilter", bus_execute_append_syscall_filter, "au", 0 },
413442
{ NULL, }
414443
};

src/core/dbus-execute.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,9 @@
9696
" <property name=\"ControlGroupModify\" type=\"b\" access=\"read\"/>\n" \
9797
" <property name=\"ControlGroupPersistent\" type=\"b\" access=\"read\"/>\n" \
9898
" <property name=\"PrivateNetwork\" type=\"b\" access=\"read\"/>\n" \
99-
" <property name=\"IgnoreSIGPIPE\" type=\"b\" access=\"read\"/>\n"
99+
" <property name=\"IgnoreSIGPIPE\" type=\"b\" access=\"read\"/>\n" \
100+
" <property name=\"NoNewPrivileges\" type=\"b\" access=\"read\"/>\n" \
101+
" <property name=\"SystemCallFilter\" type=\"au\" access=\"read\"/>\n"
100102

101103
#define BUS_EXEC_COMMAND_INTERFACE(name) \
102104
" <property name=\"" name "\" type=\"a(sasbttuii)\" access=\"read\"/>\n"
@@ -121,5 +123,6 @@ int bus_execute_append_rlimits(DBusMessageIter *i, const char *property, void *d
121123
int bus_execute_append_command(DBusMessageIter *u, const char *property, void *data);
122124
int bus_execute_append_kill_mode(DBusMessageIter *i, const char *property, void *data);
123125
int bus_execute_append_env_files(DBusMessageIter *i, const char *property, void *data);
126+
int bus_execute_append_syscall_filter(DBusMessageIter *i, const char *property, void *data);
124127

125128
#endif

src/core/execute.c

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <linux/fs.h>
3939
#include <linux/oom.h>
4040
#include <sys/poll.h>
41+
#include <linux/seccomp-bpf.h>
4142

4243
#ifdef HAVE_PAM
4344
#include <security/pam_appl.h>
@@ -60,6 +61,7 @@
6061
#include "def.h"
6162
#include "loopback-setup.h"
6263
#include "path-util.h"
64+
#include "syscall-list.h"
6365

6466
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
6567

@@ -924,6 +926,59 @@ static void rename_process_from_path(const char *path) {
924926
rename_process(process_name);
925927
}
926928

929+
static int apply_seccomp(uint32_t *syscall_filter) {
930+
static const struct sock_filter header[] = {
931+
VALIDATE_ARCHITECTURE,
932+
EXAMINE_SYSCALL
933+
};
934+
static const struct sock_filter footer[] = {
935+
_KILL_PROCESS
936+
};
937+
938+
int i;
939+
unsigned n;
940+
struct sock_filter *f;
941+
struct sock_fprog prog;
942+
943+
assert(syscall_filter);
944+
945+
/* First: count the syscalls to check for */
946+
for (i = 0, n = 0; i < syscall_max(); i++)
947+
if (syscall_filter[i >> 4] & (1 << (i & 31)))
948+
n++;
949+
950+
/* Second: build the filter program from a header the syscall
951+
* matches and the footer */
952+
f = alloca(sizeof(struct sock_filter) * (ELEMENTSOF(header) + 2*n + ELEMENTSOF(footer)));
953+
memcpy(f, header, sizeof(header));
954+
955+
for (i = 0, n = 0; i < syscall_max(); i++)
956+
if (syscall_filter[i >> 4] & (1 << (i & 31))) {
957+
struct sock_filter item[] = {
958+
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, i, 0, 1),
959+
BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
960+
};
961+
962+
assert_cc(ELEMENTSOF(item) == 2);
963+
964+
f[ELEMENTSOF(header) + 2*n] = item[0];
965+
f[ELEMENTSOF(header) + 2*n+1] = item[1];
966+
967+
n++;
968+
}
969+
970+
memcpy(f + (ELEMENTSOF(header) + 2*n), footer, sizeof(footer));
971+
972+
/* Third: install the filter */
973+
zero(prog);
974+
prog.len = ELEMENTSOF(header) + ELEMENTSOF(footer) + 2*n;
975+
prog.filter = f;
976+
if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
977+
return -errno;
978+
979+
return 0;
980+
}
981+
927982
int exec_spawn(ExecCommand *command,
928983
char **argv,
929984
const ExecContext *context,
@@ -1355,6 +1410,21 @@ int exec_spawn(ExecCommand *command,
13551410
r = EXIT_CAPABILITIES;
13561411
goto fail_child;
13571412
}
1413+
1414+
if (context->no_new_privileges)
1415+
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
1416+
err = -errno;
1417+
r = EXIT_NO_NEW_PRIVILEGES;
1418+
goto fail_child;
1419+
}
1420+
1421+
if (context->syscall_filter) {
1422+
err = apply_seccomp(context->syscall_filter);
1423+
if (err < 0) {
1424+
r = EXIT_SECCOMP;
1425+
goto fail_child;
1426+
}
1427+
}
13581428
}
13591429

13601430
if (!(our_env = new0(char*, 7))) {

src/core/execute.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ struct ExecContext {
164164
bool private_tmp;
165165
bool private_network;
166166

167+
bool no_new_privileges;
168+
167169
bool control_group_modify;
168170
int control_group_persistent;
169171

@@ -174,6 +176,8 @@ struct ExecContext {
174176
* don't enter a trigger loop. */
175177
bool same_pgrp;
176178

179+
uint32_t *syscall_filter;
180+
177181
bool oom_score_adjust_set:1;
178182
bool nice_set:1;
179183
bool ioprio_set:1;

src/core/load-fragment-gperf.gperf.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ $1.Capabilities, config_parse_exec_capabilities, 0,
4848
$1.SecureBits, config_parse_exec_secure_bits, 0, offsetof($1, exec_context)
4949
$1.CapabilityBoundingSet, config_parse_bounding_set, 0, offsetof($1, exec_context.capability_bounding_set_drop)
5050
$1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec)
51+
$1.NoNewPrivileges, config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges)
52+
$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context)
5153
$1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit)
5254
$1.LimitFSIZE, config_parse_limit, RLIMIT_FSIZE, offsetof($1, exec_context.rlimit)
5355
$1.LimitDATA, config_parse_limit, RLIMIT_DATA, offsetof($1, exec_context.rlimit)

0 commit comments

Comments
 (0)