2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2013-12-30 01:27:10 +08:00
|
|
|
/*
|
|
|
|
* net/core/netclassid_cgroup.c Classid Cgroupfs Handling
|
|
|
|
*
|
|
|
|
* Authors: Thomas Graf <tgraf@suug.ch>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/cgroup.h>
|
|
|
|
#include <linux/fdtable.h>
|
2017-02-06 17:57:33 +08:00
|
|
|
#include <linux/sched/task.h>
|
|
|
|
|
2013-12-30 01:27:10 +08:00
|
|
|
#include <net/cls_cgroup.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
|
|
|
|
static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
|
|
|
|
{
|
2015-07-22 17:23:20 +08:00
|
|
|
return css_cls_state(task_css_check(p, net_cls_cgrp_id,
|
|
|
|
rcu_read_lock_bh_held()));
|
2013-12-30 01:27:10 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(task_cls_state);
|
|
|
|
|
|
|
|
static struct cgroup_subsys_state *
|
|
|
|
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
|
|
|
{
|
|
|
|
struct cgroup_cls_state *cs;
|
|
|
|
|
|
|
|
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
|
|
|
|
if (!cs)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
return &cs->css;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
struct cgroup_cls_state *cs = css_cls_state(css);
|
2014-05-17 01:22:48 +08:00
|
|
|
struct cgroup_cls_state *parent = css_cls_state(css->parent);
|
2013-12-30 01:27:10 +08:00
|
|
|
|
|
|
|
if (parent)
|
|
|
|
cs->classid = parent->classid;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cgrp_css_free(struct cgroup_subsys_state *css)
|
|
|
|
{
|
|
|
|
kfree(css_cls_state(css));
|
|
|
|
}
|
|
|
|
|
2020-03-05 22:45:57 +08:00
|
|
|
/*
|
|
|
|
* To avoid freezing of sockets creation for tasks with big number of threads
|
|
|
|
* and opened sockets lets release file_lock every 1000 iterated descriptors.
|
|
|
|
* New sockets will already have been created with new classid.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct update_classid_context {
|
|
|
|
u32 classid;
|
|
|
|
unsigned int batch;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define UPDATE_CLASSID_BATCH 1000
|
|
|
|
|
2022-08-17 09:35:29 +08:00
|
|
|
static int update_classid_sock(const void *v, struct file *file, unsigned int n)
|
2013-12-30 01:27:10 +08:00
|
|
|
{
|
2020-03-05 22:45:57 +08:00
|
|
|
struct update_classid_context *ctx = (void *)v;
|
2020-12-04 19:36:04 +08:00
|
|
|
struct socket *sock = sock_from_file(file);
|
2013-12-30 01:27:10 +08:00
|
|
|
|
bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.
In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.
Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.
[0] https://github.com/nestybox/sysbox/
[1] https://kind.sigs.k8s.io/
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
2021-09-14 07:07:57 +08:00
|
|
|
if (sock)
|
2020-03-05 22:45:57 +08:00
|
|
|
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
|
|
|
|
if (--ctx->batch == 0) {
|
|
|
|
ctx->batch = UPDATE_CLASSID_BATCH;
|
|
|
|
return n + 1;
|
|
|
|
}
|
2013-12-30 01:27:10 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-03-05 22:45:57 +08:00
|
|
|
static void update_classid_task(struct task_struct *p, u32 classid)
|
|
|
|
{
|
|
|
|
struct update_classid_context ctx = {
|
|
|
|
.classid = classid,
|
|
|
|
.batch = UPDATE_CLASSID_BATCH
|
|
|
|
};
|
|
|
|
unsigned int fd = 0;
|
|
|
|
|
|
|
|
do {
|
|
|
|
task_lock(p);
|
|
|
|
fd = iterate_fd(p->files, fd, update_classid_sock, &ctx);
|
|
|
|
task_unlock(p);
|
|
|
|
cond_resched();
|
|
|
|
} while (fd);
|
|
|
|
}
|
|
|
|
|
cgroup, net_cls: iterate the fds of only the tasks which are being migrated
The net_cls controller controls the classid field of each socket which
is associated with the cgroup. Because the classid is per-socket
attribute, when a task migrates to another cgroup or the configured
classid of the cgroup changes, the controller needs to walk all
sockets and update the classid value, which was implemented by
3b13758f51de ("cgroups: Allow dynamically changing net_classid").
While the approach is not scalable, migrating tasks which have a lot
of fds attached to them is rare and the cost is born by the ones
initiating the operations. However, for simplicity, both the
migration and classid config change paths call update_classid() which
scans all fds of all tasks in the target css. This is an overkill for
the migration path which only needs to cover a much smaller subset of
tasks which are actually getting migrated in.
On cgroup v1, this can lead to unexpected scalability issues when one
tries to migrate a task or process into a net_cls cgroup which already
contains a lot of fds. Even if the migration traget doesn't have many
to get scanned, update_classid() ends up scanning all fds in the
target cgroup which can be extremely numerous.
Unfortunately, on cgroup v2 which doesn't use net_cls, the problem is
even worse. Before bfc2cf6f61fc ("cgroup: call subsys->*attach() only
for subsystems which are actually affected by migration"), cgroup core
would call the ->css_attach callback even for controllers which don't
see actual migration to a different css.
As net_cls is always disabled but still mounted on cgroup v2, whenever
a process is migrated on the cgroup v2 hierarchy, net_cls sees
identity migration from root to root and cgroup core used to call
->css_attach callback for those. The net_cls ->css_attach ends up
calling update_classid() on the root net_cls css to which all
processes on the system belong to as the controller isn't used. This
makes any cgroup v2 migration O(total_number_of_fds_on_the_system)
which is horrible and easily leads to noticeable stalls triggering RCU
stall warnings and so on.
The worst symptom is already fixed in upstream by bfc2cf6f61fc
("cgroup: call subsys->*attach() only for subsystems which are
actually affected by migration"); however, backporting that commit is
too invasive and we want to avoid other cases too.
This patch updates net_cls's cgrp_attach() to iterate fds of only the
processes which are actually getting migrated. This removes the
surprising migration cost which is dependent on the total number of
fds in the target cgroup. As this leaves write_classid() the only
user of update_classid(), open-code the helper into write_classid().
Reported-by: David Goode <dgoode@fb.com>
Fixes: 3b13758f51de ("cgroups: Allow dynamically changing net_classid")
Cc: stable@vger.kernel.org # v4.4+
Cc: Nina Schiff <ninasc@fb.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-15 07:25:56 +08:00
|
|
|
static void cgrp_attach(struct cgroup_taskset *tset)
|
2013-12-30 01:27:10 +08:00
|
|
|
{
|
cgroup, net_cls: iterate the fds of only the tasks which are being migrated
The net_cls controller controls the classid field of each socket which
is associated with the cgroup. Because the classid is per-socket
attribute, when a task migrates to another cgroup or the configured
classid of the cgroup changes, the controller needs to walk all
sockets and update the classid value, which was implemented by
3b13758f51de ("cgroups: Allow dynamically changing net_classid").
While the approach is not scalable, migrating tasks which have a lot
of fds attached to them is rare and the cost is born by the ones
initiating the operations. However, for simplicity, both the
migration and classid config change paths call update_classid() which
scans all fds of all tasks in the target css. This is an overkill for
the migration path which only needs to cover a much smaller subset of
tasks which are actually getting migrated in.
On cgroup v1, this can lead to unexpected scalability issues when one
tries to migrate a task or process into a net_cls cgroup which already
contains a lot of fds. Even if the migration traget doesn't have many
to get scanned, update_classid() ends up scanning all fds in the
target cgroup which can be extremely numerous.
Unfortunately, on cgroup v2 which doesn't use net_cls, the problem is
even worse. Before bfc2cf6f61fc ("cgroup: call subsys->*attach() only
for subsystems which are actually affected by migration"), cgroup core
would call the ->css_attach callback even for controllers which don't
see actual migration to a different css.
As net_cls is always disabled but still mounted on cgroup v2, whenever
a process is migrated on the cgroup v2 hierarchy, net_cls sees
identity migration from root to root and cgroup core used to call
->css_attach callback for those. The net_cls ->css_attach ends up
calling update_classid() on the root net_cls css to which all
processes on the system belong to as the controller isn't used. This
makes any cgroup v2 migration O(total_number_of_fds_on_the_system)
which is horrible and easily leads to noticeable stalls triggering RCU
stall warnings and so on.
The worst symptom is already fixed in upstream by bfc2cf6f61fc
("cgroup: call subsys->*attach() only for subsystems which are
actually affected by migration"); however, backporting that commit is
too invasive and we want to avoid other cases too.
This patch updates net_cls's cgrp_attach() to iterate fds of only the
processes which are actually getting migrated. This removes the
surprising migration cost which is dependent on the total number of
fds in the target cgroup. As this leaves write_classid() the only
user of update_classid(), open-code the helper into write_classid().
Reported-by: David Goode <dgoode@fb.com>
Fixes: 3b13758f51de ("cgroups: Allow dynamically changing net_classid")
Cc: stable@vger.kernel.org # v4.4+
Cc: Nina Schiff <ninasc@fb.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-15 07:25:56 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
2013-12-30 01:27:10 +08:00
|
|
|
struct task_struct *p;
|
|
|
|
|
cgroup, net_cls: iterate the fds of only the tasks which are being migrated
The net_cls controller controls the classid field of each socket which
is associated with the cgroup. Because the classid is per-socket
attribute, when a task migrates to another cgroup or the configured
classid of the cgroup changes, the controller needs to walk all
sockets and update the classid value, which was implemented by
3b13758f51de ("cgroups: Allow dynamically changing net_classid").
While the approach is not scalable, migrating tasks which have a lot
of fds attached to them is rare and the cost is born by the ones
initiating the operations. However, for simplicity, both the
migration and classid config change paths call update_classid() which
scans all fds of all tasks in the target css. This is an overkill for
the migration path which only needs to cover a much smaller subset of
tasks which are actually getting migrated in.
On cgroup v1, this can lead to unexpected scalability issues when one
tries to migrate a task or process into a net_cls cgroup which already
contains a lot of fds. Even if the migration traget doesn't have many
to get scanned, update_classid() ends up scanning all fds in the
target cgroup which can be extremely numerous.
Unfortunately, on cgroup v2 which doesn't use net_cls, the problem is
even worse. Before bfc2cf6f61fc ("cgroup: call subsys->*attach() only
for subsystems which are actually affected by migration"), cgroup core
would call the ->css_attach callback even for controllers which don't
see actual migration to a different css.
As net_cls is always disabled but still mounted on cgroup v2, whenever
a process is migrated on the cgroup v2 hierarchy, net_cls sees
identity migration from root to root and cgroup core used to call
->css_attach callback for those. The net_cls ->css_attach ends up
calling update_classid() on the root net_cls css to which all
processes on the system belong to as the controller isn't used. This
makes any cgroup v2 migration O(total_number_of_fds_on_the_system)
which is horrible and easily leads to noticeable stalls triggering RCU
stall warnings and so on.
The worst symptom is already fixed in upstream by bfc2cf6f61fc
("cgroup: call subsys->*attach() only for subsystems which are
actually affected by migration"); however, backporting that commit is
too invasive and we want to avoid other cases too.
This patch updates net_cls's cgrp_attach() to iterate fds of only the
processes which are actually getting migrated. This removes the
surprising migration cost which is dependent on the total number of
fds in the target cgroup. As this leaves write_classid() the only
user of update_classid(), open-code the helper into write_classid().
Reported-by: David Goode <dgoode@fb.com>
Fixes: 3b13758f51de ("cgroups: Allow dynamically changing net_classid")
Cc: stable@vger.kernel.org # v4.4+
Cc: Nina Schiff <ninasc@fb.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-15 07:25:56 +08:00
|
|
|
cgroup_taskset_for_each(p, css, tset) {
|
2020-03-05 22:45:57 +08:00
|
|
|
update_classid_task(p, css_cls_state(css)->classid);
|
2013-12-30 01:27:10 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
|
|
|
|
{
|
|
|
|
return css_cls_state(css)->classid;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
|
|
|
|
u64 value)
|
|
|
|
{
|
2015-11-21 04:31:39 +08:00
|
|
|
struct cgroup_cls_state *cs = css_cls_state(css);
|
cgroup, net_cls: iterate the fds of only the tasks which are being migrated
The net_cls controller controls the classid field of each socket which
is associated with the cgroup. Because the classid is per-socket
attribute, when a task migrates to another cgroup or the configured
classid of the cgroup changes, the controller needs to walk all
sockets and update the classid value, which was implemented by
3b13758f51de ("cgroups: Allow dynamically changing net_classid").
While the approach is not scalable, migrating tasks which have a lot
of fds attached to them is rare and the cost is born by the ones
initiating the operations. However, for simplicity, both the
migration and classid config change paths call update_classid() which
scans all fds of all tasks in the target css. This is an overkill for
the migration path which only needs to cover a much smaller subset of
tasks which are actually getting migrated in.
On cgroup v1, this can lead to unexpected scalability issues when one
tries to migrate a task or process into a net_cls cgroup which already
contains a lot of fds. Even if the migration traget doesn't have many
to get scanned, update_classid() ends up scanning all fds in the
target cgroup which can be extremely numerous.
Unfortunately, on cgroup v2 which doesn't use net_cls, the problem is
even worse. Before bfc2cf6f61fc ("cgroup: call subsys->*attach() only
for subsystems which are actually affected by migration"), cgroup core
would call the ->css_attach callback even for controllers which don't
see actual migration to a different css.
As net_cls is always disabled but still mounted on cgroup v2, whenever
a process is migrated on the cgroup v2 hierarchy, net_cls sees
identity migration from root to root and cgroup core used to call
->css_attach callback for those. The net_cls ->css_attach ends up
calling update_classid() on the root net_cls css to which all
processes on the system belong to as the controller isn't used. This
makes any cgroup v2 migration O(total_number_of_fds_on_the_system)
which is horrible and easily leads to noticeable stalls triggering RCU
stall warnings and so on.
The worst symptom is already fixed in upstream by bfc2cf6f61fc
("cgroup: call subsys->*attach() only for subsystems which are
actually affected by migration"); however, backporting that commit is
too invasive and we want to avoid other cases too.
This patch updates net_cls's cgrp_attach() to iterate fds of only the
processes which are actually getting migrated. This removes the
surprising migration cost which is dependent on the total number of
fds in the target cgroup. As this leaves write_classid() the only
user of update_classid(), open-code the helper into write_classid().
Reported-by: David Goode <dgoode@fb.com>
Fixes: 3b13758f51de ("cgroups: Allow dynamically changing net_classid")
Cc: stable@vger.kernel.org # v4.4+
Cc: Nina Schiff <ninasc@fb.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-15 07:25:56 +08:00
|
|
|
struct css_task_iter it;
|
|
|
|
struct task_struct *p;
|
2015-11-21 04:31:39 +08:00
|
|
|
|
|
|
|
cs->classid = (u32)value;
|
2013-12-30 01:27:10 +08:00
|
|
|
|
2017-05-15 21:34:01 +08:00
|
|
|
css_task_iter_start(css, 0, &it);
|
2020-04-20 15:04:24 +08:00
|
|
|
while ((p = css_task_iter_next(&it)))
|
2020-03-05 22:45:57 +08:00
|
|
|
update_classid_task(p, cs->classid);
|
cgroup, net_cls: iterate the fds of only the tasks which are being migrated
The net_cls controller controls the classid field of each socket which
is associated with the cgroup. Because the classid is per-socket
attribute, when a task migrates to another cgroup or the configured
classid of the cgroup changes, the controller needs to walk all
sockets and update the classid value, which was implemented by
3b13758f51de ("cgroups: Allow dynamically changing net_classid").
While the approach is not scalable, migrating tasks which have a lot
of fds attached to them is rare and the cost is born by the ones
initiating the operations. However, for simplicity, both the
migration and classid config change paths call update_classid() which
scans all fds of all tasks in the target css. This is an overkill for
the migration path which only needs to cover a much smaller subset of
tasks which are actually getting migrated in.
On cgroup v1, this can lead to unexpected scalability issues when one
tries to migrate a task or process into a net_cls cgroup which already
contains a lot of fds. Even if the migration traget doesn't have many
to get scanned, update_classid() ends up scanning all fds in the
target cgroup which can be extremely numerous.
Unfortunately, on cgroup v2 which doesn't use net_cls, the problem is
even worse. Before bfc2cf6f61fc ("cgroup: call subsys->*attach() only
for subsystems which are actually affected by migration"), cgroup core
would call the ->css_attach callback even for controllers which don't
see actual migration to a different css.
As net_cls is always disabled but still mounted on cgroup v2, whenever
a process is migrated on the cgroup v2 hierarchy, net_cls sees
identity migration from root to root and cgroup core used to call
->css_attach callback for those. The net_cls ->css_attach ends up
calling update_classid() on the root net_cls css to which all
processes on the system belong to as the controller isn't used. This
makes any cgroup v2 migration O(total_number_of_fds_on_the_system)
which is horrible and easily leads to noticeable stalls triggering RCU
stall warnings and so on.
The worst symptom is already fixed in upstream by bfc2cf6f61fc
("cgroup: call subsys->*attach() only for subsystems which are
actually affected by migration"); however, backporting that commit is
too invasive and we want to avoid other cases too.
This patch updates net_cls's cgrp_attach() to iterate fds of only the
processes which are actually getting migrated. This removes the
surprising migration cost which is dependent on the total number of
fds in the target cgroup. As this leaves write_classid() the only
user of update_classid(), open-code the helper into write_classid().
Reported-by: David Goode <dgoode@fb.com>
Fixes: 3b13758f51de ("cgroups: Allow dynamically changing net_classid")
Cc: stable@vger.kernel.org # v4.4+
Cc: Nina Schiff <ninasc@fb.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-15 07:25:56 +08:00
|
|
|
css_task_iter_end(&it);
|
|
|
|
|
2013-12-30 01:27:10 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct cftype ss_files[] = {
|
|
|
|
{
|
|
|
|
.name = "classid",
|
|
|
|
.read_u64 = read_classid,
|
|
|
|
.write_u64 = write_classid,
|
|
|
|
},
|
|
|
|
{ } /* terminate */
|
|
|
|
};
|
|
|
|
|
2014-02-08 23:36:58 +08:00
|
|
|
struct cgroup_subsys net_cls_cgrp_subsys = {
|
2013-12-30 01:27:10 +08:00
|
|
|
.css_alloc = cgrp_css_alloc,
|
|
|
|
.css_online = cgrp_css_online,
|
|
|
|
.css_free = cgrp_css_free,
|
|
|
|
.attach = cgrp_attach,
|
2014-07-15 23:05:09 +08:00
|
|
|
.legacy_cftypes = ss_files,
|
2013-12-30 01:27:10 +08:00
|
|
|
};
|