Commit 7027ff61 authored by Lennart Poettering
Browse files

nspawn: introduce the new /machine/ tree in the cgroup tree and move containers there

Containers will now carry a label (normally derived from the root
directory name, but configurable by the user), and the container's root
cgroup is /machine/<label>. This label is called "machine name", and can
cover both containers and VMs (as soon as libvirt also makes use of
/machine/).

libsystemd-login can be used to query the machine name from a process.

This patch also includes numerous clean-ups for the cgroup code.
parent cec4ead9
......@@ -47,7 +47,8 @@
<refname>sd_pid_get_unit</refname>
<refname>sd_pid_get_user_unit</refname>
<refname>sd_pid_get_owner_uid</refname>
<refpurpose>Determine session, service or owner of a session of a specific PID</refpurpose>
<refname>sd_pid_get_machine_name</refname>
<refpurpose>Determine session, service, owner of a session or container/VM of a specific PID</refpurpose>
</refnamediv>
<refsynopsisdiv>
......@@ -77,6 +78,12 @@
<paramdef>pid_t <parameter>pid</parameter></paramdef>
<paramdef>uid_t* <parameter>uid</parameter></paramdef>
</funcprototype>
<funcprototype>
<funcdef>int <function>sd_pid_get_machine_name</function></funcdef>
<paramdef>pid_t <parameter>pid</parameter></paramdef>
<paramdef>char** <parameter>name</parameter></paramdef>
</funcprototype>
</funcsynopsis>
</refsynopsisdiv>
......@@ -131,6 +138,14 @@
and not being a shared process of a user this function
will fail.</para>
<para><function>sd_pid_get_machine_name()</function> may
be used to determine the name of the VM or container
the specified process is a member of. The machine name is a short string,
suitable for usage in file system paths. The returned
string needs to be freed with the libc
<citerefentry><refentrytitle>free</refentrytitle><manvolnum>3</manvolnum></citerefentry>
call after use.</para>
<para>If the <literal>pid</literal> parameter of any
of these functions is passed as 0 the operation is
executed for the calling process.</para>
......@@ -149,10 +164,11 @@
<para>The <function>sd_pid_get_session()</function>,
<function>sd_pid_get_unit()</function>,
<function>sd_pid_get_user_unit()</function>, and
<function>sd_pid_get_owner_uid()</function> interfaces
are available as shared library, which can be compiled
and linked to with the
<function>sd_pid_get_user_unit()</function>,
<function>sd_pid_get_owner_uid()</function> and
<function>sd_pid_get_machine_name()</function>
interfaces are available as shared library, which can
be compiled and linked to with the
<literal>libsystemd-login</literal>
<citerefentry><refentrytitle>pkg-config</refentrytitle><manvolnum>1</manvolnum></citerefentry>
file.</para>
......
......@@ -202,6 +202,21 @@
</para></listitem>
</varlistentry>
<varlistentry>
<term><option>-M</option></term>
<term><option>--machine=</option></term>
<listitem><para>Sets the machine name
for this container. This name may be
used to identify this container on the
host, and is used to initialize the
container's hostname (which the
container can choose to override,
however). If not specified, the last
component of the root directory of the
container is used.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--uuid=</option></term>
......
......@@ -171,19 +171,14 @@ int main(int argc, char *argv[]) {
arg_kernel_threads, output_flags);
} else {
char _cleanup_free_ *root = NULL;
const char *t = NULL;
r = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 1, &root);
if (r < 0)
t = "/";
else {
if (endswith(root, "/system"))
root[strlen(root)-7] = 0;
t = root[0] ? root : "/";
r = cg_get_root_path(&root);
if (r < 0) {
log_error("Failed to get root path: %s", strerror(-r));
goto finish;
}
r = show_cgroup(SYSTEMD_CGROUP_CONTROLLER, t, NULL, 0,
r = show_cgroup(SYSTEMD_CGROUP_CONTROLLER, root, NULL, 0,
arg_kernel_threads, output_flags);
}
}
......
......@@ -320,8 +320,9 @@ int cgroup_bonding_is_empty_list(CGroupBonding *first) {
int manager_setup_cgroup(Manager *m) {
_cleanup_free_ char *current = NULL, *path = NULL;
char suffix_buffer[sizeof("/systemd-") + DECIMAL_STR_MAX(pid_t)];
const char *suffix;
int r;
char suffix[sizeof("/systemd-") + DECIMAL_STR_MAX(pid_t)];
assert(m);
......@@ -332,17 +333,17 @@ int manager_setup_cgroup(Manager *m) {
}
/* 1. Determine hierarchy */
r = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &current);
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &current);
if (r < 0) {
log_error("Cannot determine cgroup we are running in: %s", strerror(-r));
return r;
}
if (m->running_as == SYSTEMD_SYSTEM)
strcpy(suffix, "/system");
suffix = "/system";
else {
snprintf(suffix, sizeof(suffix), "/systemd-%lu", (unsigned long) getpid());
char_array_0(suffix);
sprintf(suffix_buffer, "/systemd-%lu", (unsigned long) getpid());
suffix = suffix_buffer;
}
free(m->cgroup_hierarchy);
......@@ -350,11 +351,14 @@ int manager_setup_cgroup(Manager *m) {
/* We probably got reexecuted and can continue to use our root cgroup */
m->cgroup_hierarchy = current;
current = NULL;
} else {
/* We need a new root cgroup */
m->cgroup_hierarchy = NULL;
if (asprintf(&m->cgroup_hierarchy, "%s%s", streq(current, "/") ? "" : current, suffix) < 0)
if (streq(current, "/"))
m->cgroup_hierarchy = strdup(suffix);
else
m->cgroup_hierarchy = strappend(current, suffix);
if (!m->cgroup_hierarchy)
return log_oom();
}
......@@ -509,7 +513,7 @@ Unit* cgroup_unit_by_pid(Manager *m, pid_t pid) {
if (pid <= 1)
return NULL;
if (cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, pid, &group) < 0)
if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &group) < 0)
return NULL;
l = hashmap_get(m->cgroup_bondings, group);
......
......@@ -1034,7 +1034,7 @@ int bus_unit_cgroup_unset(Unit *u, DBusMessageIter *iter) {
unit_remove_drop_in(u, runtime, controller);
/* Try to migrate the old group away */
if (cg_get_by_pid(controller, 0, &target) >= 0)
if (cg_pid_get_path(controller, 0, &target) >= 0)
cgroup_bonding_migrate_to(u->cgroup_bondings, target, false);
cgroup_bonding_free(b, true);
......
......@@ -420,36 +420,6 @@ void server_vacuum(Server *s) {
s->cached_available_space_timestamp = 0;
}
/* Returns the cgroup path of 'pid' in SYSTEMD_CGROUP_CONTROLLER with the
 * cgroup prefix of PID 1 removed, when the process path begins with that
 * prefix.
 *
 * The result is a newly allocated string the caller must free(). NULL is
 * returned when either cgroup lookup fails or the copy cannot be
 * allocated. 'pid' must be positive. */
static char *shortened_cgroup_path(pid_t pid) {
int r;
char _cleanup_free_ *process_path = NULL, *init_path = NULL;
char *path;
assert(pid > 0);
/* Look up the cgroup of the process itself ... */
r = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, pid, &process_path);
if (r < 0)
return NULL;
/* ... and the cgroup of PID 1, which serves as the prefix to strip. */
r = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 1, &init_path);
if (r < 0)
return NULL;
/* Reduce the PID 1 path to the prefix: cut a trailing "/system" (7
 * characters), or use the empty string if PID 1 sits in "/" so that
 * nothing gets stripped below. */
if (endswith(init_path, "/system"))
init_path[strlen(init_path) - 7] = 0;
else if (streq(init_path, "/"))
init_path[0] = 0;
if (startswith(process_path, init_path)) {
/* Copy only the part after the prefix. */
path = strdup(process_path + strlen(init_path));
} else {
/* Not below the prefix: hand the full path to the caller and clear
 * the local pointer so that the _cleanup_free_ handler does not free
 * the string being returned. */
path = process_path;
process_path = NULL;
}
return path;
}
bool shall_try_append_again(JournalFile *f, int r) {
/* -E2BIG Hit configured limit
......@@ -620,8 +590,8 @@ static void dispatch_message_real(
IOVEC_SET_STRING(iovec[n++], audit_loginuid);
#endif
t = shortened_cgroup_path(ucred->pid);
if (t) {
r = cg_pid_get_path(NULL, ucred->pid, &t);
if (r >= 0) {
cgroup = strappend("_SYSTEMD_CGROUP=", t);
free(t);
......@@ -630,7 +600,8 @@ static void dispatch_message_real(
}
#ifdef HAVE_LOGIND
if (sd_pid_get_session(ucred->pid, &t) >= 0) {
r = cg_pid_get_session(ucred->pid, &t);
if (r >= 0) {
session = strappend("_SYSTEMD_SESSION=", t);
free(t);
......@@ -773,7 +744,7 @@ void server_dispatch_message(
const char *unit_id,
int priority) {
int rl;
int rl, r;
char _cleanup_free_ *path = NULL;
char *c;
......@@ -789,8 +760,8 @@ void server_dispatch_message(
if (!ucred)
goto finish;
path = shortened_cgroup_path(ucred->pid);
if (!path)
r = cg_pid_get_path_shifted(ucred->pid, NULL, &path);
if (r < 0)
goto finish;
/* example: /user/lennart/3/foobar
......
......@@ -68,4 +68,5 @@ global:
LIBSYSTEMD_LOGIN_202 {
global:
sd_pid_get_user_unit;
sd_pid_get_machine_name;
} LIBSYSTEMD_LOGIN_201;
......@@ -506,7 +506,7 @@ static int bus_manager_create_session(Manager *m, DBusMessage *message, DBusMess
dbus_message_iter_get_basic(&iter, &kill_processes);
r = cg_pid_get_cgroup(leader, NULL, &cgroup);
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, leader, &cgroup);
if (r < 0)
goto fail;
......
......@@ -436,7 +436,6 @@ static int session_create_one_group(Session *s, const char *controller, const ch
int r;
assert(s);
assert(controller);
assert(path);
if (s->leader > 0) {
......
......@@ -1100,21 +1100,18 @@ int manager_get_user_by_cgroup(Manager *m, const char *cgroup, User **user) {
}
int manager_get_session_by_pid(Manager *m, pid_t pid, Session **session) {
char *p;
_cleanup_free_ char *p = NULL;
int r;
assert(m);
assert(pid >= 1);
assert(session);
r = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, pid, &p);
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &p);
if (r < 0)
return r;
r = manager_get_session_by_cgroup(m, p, session);
free(p);
return r;
return manager_get_session_by_cgroup(m, p, session);
}
void manager_cgroup_notify_empty(Manager *m, const char *cgroup) {
......
......@@ -33,51 +33,19 @@
#include "fileio.h"
_public_ int sd_pid_get_session(pid_t pid, char **session) {
int r;
char *cgroup, *p;
if (pid < 0)
return -EINVAL;
if (!session)
return -EINVAL;
r = cg_pid_get_cgroup(pid, NULL, &cgroup);
if (r < 0)
return r;
if (!startswith(cgroup, "/user/")) {
free(cgroup);
return -ENOENT;
}
p = strchr(cgroup + 6, '/');
if (!p) {
free(cgroup);
return -ENOENT;
}
p++;
if (startswith(p, "shared/") || streq(p, "shared")) {
free(cgroup);
return -ENOENT;
}
p = strndup(p, strcspn(p, "/"));
free(cgroup);
if (!p)
return -ENOMEM;
*session = p;
return 0;
return cg_pid_get_session(pid, session);
}
_public_ int sd_pid_get_unit(pid_t pid, char **unit) {
if (pid < 0)
return -EINVAL;
if (!unit)
return -EINVAL;
......@@ -88,13 +56,22 @@ _public_ int sd_pid_get_user_unit(pid_t pid, char **unit) {
if (pid < 0)
return -EINVAL;
if (!unit)
return -EINVAL;
return cg_pid_get_user_unit(pid, unit);
}
/* Determines the machine (container/VM) name of the process 'pid' and
 * stores it in '*name'; the lookup itself is delegated to
 * cg_pid_get_machine_name().
 *
 * Returns -EINVAL if 'pid' is negative or 'name' is NULL, otherwise the
 * result of cg_pid_get_machine_name(). Per the accompanying man page text,
 * a pid of 0 refers to the calling process and the returned string has to
 * be released with free(). */
_public_ int sd_pid_get_machine_name(pid_t pid, char **name) {
        /* Validate both arguments up front; either problem is reported
         * with the same error code. */
        if (pid < 0 || !name)
                return -EINVAL;

        return cg_pid_get_machine_name(pid, name);
}
_public_ int sd_pid_get_owner_uid(pid_t pid, uid_t *uid) {
int r;
char *root, *cgroup, *p, *cc;
......@@ -106,7 +83,7 @@ _public_ int sd_pid_get_owner_uid(pid_t pid, uid_t *uid) {
if (!uid)
return -EINVAL;
r = cg_pid_get_cgroup(pid, &root, &cgroup);
r = cg_pid_get_path_shifted(pid, &root, &cgroup);
if (r < 0)
return r;
......
......@@ -75,6 +75,7 @@ static char *arg_directory = NULL;
static char *arg_user = NULL;
static char **arg_controllers = NULL;
static char *arg_uuid = NULL;
static char *arg_machine = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
......@@ -120,6 +121,7 @@ static int help(void) {
" -C --controllers=LIST Put the container in specified comma-separated\n"
" cgroup hierarchies\n"
" --uuid=UUID Set a specific machine UUID for the container\n"
" -M --machine=NAME Set the machine name for the container\n"
" --private-network Disable network in container\n"
" --read-only Mount the root directory read-only\n"
" --capability=CAP In addition to the default, retain specified\n"
......@@ -161,6 +163,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
{ "bind", required_argument, NULL, ARG_BIND },
{ "bind-ro", required_argument, NULL, ARG_BIND_RO },
{ "machine", required_argument, NULL, 'M' },
{ NULL, 0, NULL, 0 }
};
......@@ -194,22 +197,19 @@ static int parse_argv(int argc, char *argv[]) {
case 'u':
free(arg_user);
if (!(arg_user = strdup(optarg))) {
log_error("Failed to duplicate user name.");
return -ENOMEM;
}
arg_user = strdup(optarg);
if (!arg_user)
return log_oom();
break;
case 'C':
strv_free(arg_controllers);
arg_controllers = strv_split(optarg, ",");
if (!arg_controllers) {
log_error("Failed to split controllers list.");
return -ENOMEM;
}
strv_uniq(arg_controllers);
if (!arg_controllers)
return log_oom();
cg_shorten_controllers(arg_controllers);
break;
case ARG_PRIVATE_NETWORK:
......@@ -224,6 +224,19 @@ static int parse_argv(int argc, char *argv[]) {
arg_uuid = optarg;
break;
case 'M':
if (!hostname_is_valid(optarg)) {
log_error("Invalid machine name: %s", optarg);
return -EINVAL;
}
free(arg_machine);
arg_machine = strdup(optarg);
if (!arg_machine)
return log_oom();
break;
case ARG_READ_ONLY:
arg_read_only = true;
break;
......@@ -743,25 +756,11 @@ static int setup_kmsg(const char *dest, int kmsg_socket) {
}
static int setup_hostname(void) {
char *hn;
int r = 0;
hn = path_get_file_name(arg_directory);
if (hn) {
hn = strdup(hn);
if (!hn)
return -ENOMEM;
hostname_cleanup(hn);
if (!isempty(hn))
if (sethostname(hn, strlen(hn)) < 0)
r = -errno;
free(hn);
}
if (sethostname(arg_machine, strlen(arg_machine)) < 0)
return -errno;
return r;
return 0;
}
static int setup_journal(const char *directory) {
......@@ -896,6 +895,25 @@ static int setup_journal(const char *directory) {
return 0;
}
static int setup_cgroup(const char *path) {
char **c;
int r;
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
if (r < 0) {
log_error("Failed to create cgroup: %s", strerror(-r));
return r;
}
STRV_FOREACH(c, arg_controllers) {
r = cg_create_and_attach(*c, path, 1);
if (r < 0)
log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
}
return 0;
}
/* Passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and returns its result.
 *
 * NOTE(review): arg_retain appears to be the set of capabilities to keep
 * (see the --capability= help text), so the complement would be the set
 * to drop -- confirm against capability_bounding_set_drop(). */
static int drop_capabilities(void) {
return capability_bounding_set_drop(~arg_retain, false);
}
......@@ -1159,9 +1177,9 @@ finish:
int main(int argc, char *argv[]) {
pid_t pid = 0;
int r = EXIT_FAILURE, k;
char *oldcg = NULL, *newcg = NULL;
char **controller = NULL;
int master = -1, n_fd_passed;
_cleanup_free_ char *machine_root = NULL, *newcg = NULL;
_cleanup_close_ int master = -1;
int n_fd_passed;
const char *console = NULL;
struct termios saved_attr, raw_attr;
sigset_t mask;
......@@ -1193,6 +1211,20 @@ int main(int argc, char *argv[]) {
path_kill_slashes(arg_directory);
if (!arg_machine) {
arg_machine = strdup(path_get_file_name(arg_directory));
if (!arg_machine) {
log_oom();
goto finish;
}
hostname_cleanup(arg_machine);
if (isempty(arg_machine)) {
log_error("Failed to determine machine name automatically, please use -M.");
goto finish;
}
}
if (geteuid() != 0) {
log_error("Need to be root.");
goto finish;
......@@ -1225,27 +1257,26 @@ int main(int argc, char *argv[]) {
fdset_close_others(fds);
log_open();
k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
k = cg_get_machine_path(&machine_root);
if (k < 0) {
log_error("Failed to determine current cgroup: %s", strerror(-k));
log_error("Failed to determine machine cgroup path: %s", strerror(-k));
goto finish;
}
if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
newcg = strjoin(machine_root, "/", arg_machine, NULL);
if (!newcg) {
log_error("Failed to allocate cgroup path.");
goto finish;
}
k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
if (k < 0) {
log_error("Failed to create cgroup: %s", strerror(-k));
goto finish;
}
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
if (r <= 0 && r != -ENOENT) {
log_error("Container already running.");
STRV_FOREACH(controller, arg_controllers) {
k = cg_create_and_attach(*controller, newcg, 0);
if (k < 0)
log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
free(newcg);
newcg = NULL;
goto finish;
}
master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
......@@ -1279,7 +1310,7 @@ int main(int argc, char *argv[]) {
}
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
log_error("Failed to create kmsg socket pair");
log_error("Failed to create kmsg socket pair.");
goto finish;
}
......@@ -1382,6 +1413,9 @@ int main(int argc, char *argv[]) {
goto child_fail;
}
if (setup_cgroup(newcg) < 0)
goto child_fail;
/* Mark everything as slave, so that we still
* receive mounts from the real root, but don't
* propagate mounts to the real root. */
......@@ -1547,7 +1581,7 @@ int main(int argc, char *argv[]) {
}
if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
(asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
(asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
log_oom();
goto child_fail;
}
......@@ -1640,21 +1674,14 @@ finish:
if (saved_attr_valid)
tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
if (master >= 0)
close_nointr_nofail(master);
close_pipe(kmsg_socket_pair);
if (oldcg)
cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);