1
0
mirror of https://github.com/lxc/lxcfs.git synced 2026-02-05 09:46:18 +01:00

Merge pull request #691 from mihalicyn/psi_write_support

/proc/pressure/{cpu, io, memory} virtualization
This commit is contained in:
Stéphane Graber
2025-11-04 09:42:50 -05:00
committed by GitHub
14 changed files with 704 additions and 173 deletions

View File

@@ -48,6 +48,7 @@ runs:
meson setup build \
-Ddocs=false \
-Dtests=true \
-Dmocks=true \
-Dinit-script=systemd \
-Dprefix=/usr \
-Db_sanitize=address,undefined

View File

@@ -38,6 +38,7 @@ jobs:
- name: Test
env:
CC: ${{ matrix.compiler }}
LIBFUSE: ${{ matrix.fuse }}
run: |
echo "::group::Running the testsuite"

View File

@@ -69,6 +69,7 @@ conf.set_quoted('LXCFSTARGETDIR', join_paths(localstatedir, 'lib/lxcfs'))
# Custom configuration.
init_script = get_option('init-script')
want_tests = get_option('tests')
want_mocks = get_option('mocks')
want_docs = get_option('docs')
# Build flags.
@@ -232,12 +233,18 @@ liblxcfs_common_dependencies = declare_dependency(
libfuse,
])
liblxcfs_cargs = []
if want_mocks == true
liblxcfs_cargs = ['-DPSITRIGGERTEST', '-DDEBUG']
endif
liblxcfs = shared_module(
'lxcfs',
liblxcfs_sources,
dependencies: liblxcfs_common_dependencies,
install: true,
install_dir: lxcfsdir)
install_dir: lxcfsdir,
c_args: liblxcfs_cargs)
# Tests.
test_programs = []
@@ -248,7 +255,7 @@ if want_tests == true
dependencies: liblxcfs_common_dependencies,
install: false,
install_dir: lxcfsdir,
c_args: '-DRELOADTEST -DDEBUG')
c_args: ['-DRELOADTEST', '-DDEBUG'])
endif
# RPM spec.
@@ -306,6 +313,7 @@ status = [
'lxcfs source root directory: @0@'.format(project_source_root),
'init system(s): @0@'.format(init_script),
'tests: @0@'.format(want_tests),
'mocks: @0@'.format(want_mocks),
'documentation: @0@'.format(want_docs),
]
message('\n '.join(status))

View File

@@ -3,6 +3,9 @@
option('tests', type : 'boolean', value: 'false',
description : 'enable tests')
option('mocks', type : 'boolean', value: 'false',
description : 'enable CI-only features (mocks some things in LXCFS for CI/testing purposes)')
option('runtime-path', type : 'string', value : '/run',
description : 'the runtime directory')

View File

@@ -87,6 +87,16 @@ enum lxcfs_virt_t {
#define LXCFS_TYPE_SYS(type) (type >= LXC_TYPE_SYS && type <= LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE)
#define LXCFS_TYPE_OK(type) (type >= LXC_TYPE_CGDIR && type < LXC_TYPE_MAX)
/*
* This signal will be used in /proc/pressure/{cpu, memory, io} poll()
* virtualization to notify a background thread that it's
* a time to finish.
*
* We don't need a handler for this, we want this signal to be
* ignored, but interrupt any interruptible syscall like poll().
*/
#define SIG_NOTIFY_POLL_WAKEUP (SIGRTMIN + 0)
/*
* This signal will be used to signal fuse request processing thread that
* request was interrupted (FUSE_INTERRUPT came from the kernel).
@@ -106,13 +116,20 @@ extern int rwlock_rdlock_interruptible(pthread_rwlock_t *l);
extern int rwlock_wrlock_interruptible(pthread_rwlock_t *l);
struct file_info {
char *controller;
char *cgroup;
char *file;
union {
struct {
char *controller;
char *cgroup;
char *file;
};
struct {
void *private_data;
};
};
int type;
char *buf; /* unused */
int buflen;
int size; /*actual data size */
int size; /* actual data size */
int cached;
};
@@ -125,17 +142,19 @@ struct lxcfs_opts {
* and the use of bool instead of explicited __u32 and __u64 we can't.
*/
__u32 version;
// As of opts version 2.
char runtime_path[PATH_MAX];
// As of opts version 2.
char runtime_path[PATH_MAX];
bool zswap_off;
bool psi_poll_on;
};
typedef enum lxcfs_opt_t {
LXCFS_SWAP_ON = 0,
LXCFS_PIDFD_ON = 1,
LXCFS_CFS_ON = 2,
LXCFS_ZSWAP_ON = 3,
LXCFS_OPTS_MAX = LXCFS_ZSWAP_ON,
LXCFS_SWAP_ON = 0,
LXCFS_PIDFD_ON = 1,
LXCFS_CFS_ON = 2,
LXCFS_ZSWAP_ON = 3,
LXCFS_PSI_POLL_ON = 4,
LXCFS_OPTS_MAX = LXCFS_PSI_POLL_ON,
} lxcfs_opt_t;
@@ -172,6 +191,8 @@ static inline bool lxcfs_has_opt(struct lxcfs_opts *opts, lxcfs_opt_t opt)
if (opts->version >= 3 && !opts->zswap_off)
return liblxcfs_can_use_zswap();
return false;
case LXCFS_PSI_POLL_ON:
return (opts->version >= 4 && opts->psi_poll_on);
}
return false;

View File

@@ -242,6 +242,8 @@ static void sigusr1_reload(int signo, siginfo_t *info, void *extra)
need_reload = 1;
}
static void sig_noop_handler(int signo, siginfo_t *info, void *extra) {}
/* Functions to run the library methods */
#define DEF_LIB_FS_OP(type, fsop) \
@@ -276,8 +278,14 @@ DEF_LIB_FS_OP(sys , read)
off_t offset, struct fuse_file_info *fi
#define LIB_FS_write_OP_ARGS path, buf, size, offset, fi
DEF_LIB_FS_OP(cg , write)
DEF_LIB_FS_OP(proc , write)
DEF_LIB_FS_OP(sys , write)
#define LIB_FS_poll_OP_ARGS_TYPE const char *path, struct fuse_file_info *fi, \
struct fuse_pollhandle *ph, unsigned *reventsp
#define LIB_FS_poll_OP_ARGS path, fi, ph, reventsp
DEF_LIB_FS_OP(proc , poll)
#define LIB_FS_mkdir_OP_ARGS_TYPE const char *path, mode_t mode
#define LIB_FS_mkdir_OP_ARGS path, mode
DEF_LIB_FS_OP(cg, mkdir)
@@ -605,6 +613,13 @@ int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
return ret;
}
if (LXCFS_TYPE_PROC(type)) {
up_users();
ret = do_proc_write(path, buf, size, offset, fi);
down_users();
return ret;
}
if (LXCFS_TYPE_SYS(type)) {
up_users();
ret = do_sys_write(path, buf, size, offset, fi);
@@ -615,6 +630,27 @@ int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
return -EINVAL;
}
int lxcfs_poll(const char *path, struct fuse_file_info *fi,
struct fuse_pollhandle *ph, unsigned *reventsp)
{
int ret;
enum lxcfs_virt_t type;
type = file_info_type(fi);
if (LXCFS_TYPE_PROC(type)) {
up_users();
ret = do_proc_poll(path, fi, ph, reventsp);
down_users();
return ret;
}
/* default f_op->poll() behavior when not supported */
fuse_pollhandle_destroy(ph);
*reventsp = DEFAULT_POLLMASK;
return 0;
}
int lxcfs_readlink(const char *path, char *buf, size_t size)
{
int ret;
@@ -842,6 +878,9 @@ const struct fuse_operations lxcfs_ops = {
.truncate = lxcfs_truncate,
.write = lxcfs_write,
.readlink = lxcfs_readlink,
#if HAVE_FUSE3
.poll = lxcfs_poll,
#endif
.create = NULL,
.destroy = NULL,
@@ -937,6 +976,7 @@ static const struct option long_options[] = {
{"enable-cfs", no_argument, 0, 0 },
{"enable-pidfd", no_argument, 0, 0 },
{"enable-cgroup", no_argument, 0, 0 },
{"enable-psi-poll", no_argument, 0, 0 },
{"pidfile", required_argument, 0, 'p' },
{"runtime-dir", required_argument, 0, 0 },
@@ -1011,7 +1051,8 @@ int main(int argc, char *argv[])
opts->zswap_off = false;
opts->use_pidfd = false;
opts->use_cfs = false;
opts->version = 3;
opts->psi_poll_on = false;
opts->version = 4;
while ((c = getopt_long(argc, argv, "dulfhvso:p:", long_options, &idx)) != -1) {
switch (c) {
@@ -1022,6 +1063,8 @@ int main(int argc, char *argv[])
opts->use_cfs = true;
else if (strcmp(long_options[idx].name, "enable-cgroup") == 0)
cgroup_is_enabled = true;
else if (strcmp(long_options[idx].name, "enable-psi-poll") == 0)
opts->psi_poll_on = true;
else if (strcmp(long_options[idx].name, "runtime-dir") == 0)
runtime_path_arg = optarg;
else
@@ -1189,6 +1232,11 @@ int main(int argc, char *argv[])
}
#endif
if (install_signal_handler(SIG_NOTIFY_POLL_WAKEUP, sig_noop_handler)) {
lxcfs_error("%s - Failed to install SIG_NOTIFY_POLL_WAKEUP signal handler", strerror(errno));
goto out;
}
if (!pidfile) {
snprintf(pidfile_buf, sizeof(pidfile_buf), "%s%s", runtime_path, PID_FILE);
pidfile = pidfile_buf;

View File

@@ -20,6 +20,8 @@
#define CGROUP2_SUPER_MAGIC 0x63677270
#endif
#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
#define lxcfs_debug_stream(stream, format, ...) \
do { \
fprintf(stream, "%s: %d: %s: " format "\n", __FILE__, \

View File

@@ -7,6 +7,7 @@
#include <fcntl.h>
#include <inttypes.h>
#include <libgen.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
@@ -313,18 +314,6 @@ __lxcfs_fuse_ops int proc_access(const char *path, int mask)
return 0;
}
__lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
{
do_release_file_info(fi);
return 0;
}
__lxcfs_fuse_ops int proc_releasedir(const char *path, struct fuse_file_info *fi)
{
do_release_file_info(fi);
return 0;
}
/**
* Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit is
* in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise
@@ -1709,7 +1698,7 @@ static int proc_slabinfo_read(char *buf, size_t size, off_t offset,
return total_len;
}
static int proc_pressure_io_read(char *buf, size_t size, off_t offset,
static int proc_pressure_read(char *buf, size_t size, off_t offset,
struct fuse_file_info *fi)
{
__do_free char *cgroup = NULL, *line = NULL;
@@ -1721,6 +1710,9 @@ static int proc_pressure_io_read(char *buf, size_t size, off_t offset,
size_t linelen = 0, total_len = 0;
char *cache = d->buf;
size_t cache_size = d->buflen;
char *fallback_path;
char *controller;
int (*get_pressure_fd)(struct cgroup_ops *ops, const char *cgroup);
pid_t initpid;
if (offset) {
@@ -1739,161 +1731,43 @@ static int proc_pressure_io_read(char *buf, size_t size, off_t offset,
return total_len;
}
initpid = lookup_initpid_in_store(fc->pid);
if (initpid <= 1 || is_shared_pidns(initpid))
initpid = fc->pid;
cgroup = get_pid_cgroup(initpid, "blkio");
if (!cgroup)
return read_file_fuse("/proc/pressure/io", buf, size, d);
prune_init_slice(cgroup);
fd = cgroup_ops->get_pressure_io_fd(cgroup_ops, cgroup);
if (fd < 0)
return read_file_fuse("/proc/pressure/io", buf, size, d);
f = fdopen_cached(fd, "re", &fopen_cache);
if (!f)
return read_file_fuse("/proc/pressure/io", buf, size, d);
while (getline(&line, &linelen, f) != -1) {
ssize_t l = snprintf(cache, cache_size, "%s", line);
if (l < 0)
return log_error(0, "Failed to write cache");
if ((size_t)l >= cache_size)
return log_error(0, "Write to cache was truncated");
cache += l;
cache_size -= l;
total_len += l;
}
d->cached = 1;
d->size = total_len;
if (total_len > size)
total_len = size;
memcpy(buf, d->buf, total_len);
return total_len;
}
static int proc_pressure_cpu_read(char *buf, size_t size, off_t offset,
struct fuse_file_info *fi)
{
__do_free char *cgroup = NULL, *line = NULL;
__do_free void *fopen_cache = NULL;
__do_fclose FILE *f = NULL;
__do_close int fd = -EBADF;
struct fuse_context *fc = fuse_get_context();
struct file_info *d = INTTYPE_TO_PTR(fi->fh);
size_t linelen = 0, total_len = 0;
char *cache = d->buf;
size_t cache_size = d->buflen;
pid_t initpid;
if (offset) {
size_t left;
if (offset > d->size)
return -EINVAL;
if (!d->cached)
return 0;
left = d->size - offset;
total_len = left > size ? size : left;
memcpy(buf, cache + offset, total_len);
return total_len;
switch (d->type) {
case LXC_TYPE_PROC_PRESSURE_IO:
fallback_path = LXC_TYPE_PROC_PRESSURE_IO_PATH;
controller = "blkio";
get_pressure_fd = cgroup_ops->get_pressure_io_fd;
break;
case LXC_TYPE_PROC_PRESSURE_CPU:
fallback_path = LXC_TYPE_PROC_PRESSURE_CPU_PATH;
controller = "cpu";
get_pressure_fd = cgroup_ops->get_pressure_cpu_fd;
break;
case LXC_TYPE_PROC_PRESSURE_MEMORY:
fallback_path = LXC_TYPE_PROC_PRESSURE_MEMORY_PATH;
controller = "memory";
get_pressure_fd = cgroup_ops->get_pressure_memory_fd;
break;
default:
return -EINVAL;
}
initpid = lookup_initpid_in_store(fc->pid);
if (initpid <= 1 || is_shared_pidns(initpid))
initpid = fc->pid;
cgroup = get_pid_cgroup(initpid, "cpu");
cgroup = get_pid_cgroup(initpid, controller);
if (!cgroup)
return read_file_fuse("/proc/pressure/cpu", buf, size, d);
return read_file_fuse(fallback_path, buf, size, d);
prune_init_slice(cgroup);
fd = cgroup_ops->get_pressure_cpu_fd(cgroup_ops, cgroup);
fd = get_pressure_fd(cgroup_ops, cgroup);
if (fd < 0)
return read_file_fuse("/proc/pressure/cpu", buf, size, d);
return read_file_fuse(fallback_path, buf, size, d);
f = fdopen_cached(fd, "re", &fopen_cache);
if (!f)
return read_file_fuse("/proc/pressure/cpu", buf, size, d);
while (getline(&line, &linelen, f) != -1) {
ssize_t l = snprintf(cache, cache_size, "%s", line);
if (l < 0)
return log_error(0, "Failed to write cache");
if ((size_t)l >= cache_size)
return log_error(0, "Write to cache was truncated");
cache += l;
cache_size -= l;
total_len += l;
}
d->cached = 1;
d->size = total_len;
if (total_len > size)
total_len = size;
memcpy(buf, d->buf, total_len);
return total_len;
}
static int proc_pressure_memory_read(char *buf, size_t size, off_t offset,
struct fuse_file_info *fi)
{
__do_free char *cgroup = NULL, *line = NULL;
__do_free void *fopen_cache = NULL;
__do_fclose FILE *f = NULL;
__do_close int fd = -EBADF;
struct fuse_context *fc = fuse_get_context();
struct file_info *d = INTTYPE_TO_PTR(fi->fh);
size_t linelen = 0, total_len = 0;
char *cache = d->buf;
size_t cache_size = d->buflen;
pid_t initpid;
if (offset) {
size_t left;
if (offset > d->size)
return -EINVAL;
if (!d->cached)
return 0;
left = d->size - offset;
total_len = left > size ? size : left;
memcpy(buf, cache + offset, total_len);
return total_len;
}
initpid = lookup_initpid_in_store(fc->pid);
if (initpid <= 1 || is_shared_pidns(initpid))
initpid = fc->pid;
cgroup = get_pid_cgroup(initpid, "memory");
if (!cgroup)
return read_file_fuse("/proc/pressure/memory", buf, size, d);
prune_init_slice(cgroup);
fd = cgroup_ops->get_pressure_memory_fd(cgroup_ops, cgroup);
if (fd < 0)
return read_file_fuse("/proc/pressure/memory", buf, size, d);
f = fdopen_cached(fd, "re", &fopen_cache);
if (!f)
return read_file_fuse("/proc/pressure/memory", buf, size, d);
return read_file_fuse(fallback_path, buf, size, d);
while (getline(&line, &linelen, f) != -1) {
ssize_t l = snprintf(cache, cache_size, "%s", line);
@@ -2015,19 +1889,19 @@ __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
buf, size, offset, f);
case LXC_TYPE_PROC_PRESSURE_IO:
if (liblxcfs_functional())
return proc_pressure_io_read(buf, size, offset, fi);
return proc_pressure_read(buf, size, offset, fi);
return read_file_fuse_with_offset(LXC_TYPE_PROC_PRESSURE_IO_PATH,
buf, size, offset, f);
case LXC_TYPE_PROC_PRESSURE_CPU:
if (liblxcfs_functional())
return proc_pressure_cpu_read(buf, size, offset, fi);
return proc_pressure_read(buf, size, offset, fi);
return read_file_fuse_with_offset(LXC_TYPE_PROC_PRESSURE_CPU_PATH,
buf, size, offset, f);
case LXC_TYPE_PROC_PRESSURE_MEMORY:
if (liblxcfs_functional())
return proc_pressure_memory_read(buf, size, offset, fi);
return proc_pressure_read(buf, size, offset, fi);
return read_file_fuse_with_offset(LXC_TYPE_PROC_PRESSURE_MEMORY_PATH,
buf, size, offset, f);
@@ -2035,3 +1909,473 @@ __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size,
return -EINVAL;
}
typedef enum {
POLL_NOTIFY_THREAD_EXITED = 0,
POLL_NOTIFY_THREAD_SPAWNED = 1,
POLL_NOTIFY_THREAD_RUNNING = 2,
} notify_poll_thread_state_t;
typedef struct psi_trigger {
/* increase version if the structure was changed */
__u16 version;
pthread_mutex_t lock;
pthread_t tid;
notify_poll_thread_state_t thread_state;
/* libfuse's FUSE_NOTIFY_POLL handle */
struct fuse_pollhandle *ph;
struct pollfd pfd;
} psi_trigger_t;
/* PSI trigger definitions from the kernel */
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
static int proc_psi_trigger_write(const char *path, const char *buf, size_t size,
off_t offset, struct fuse_file_info *fi)
{
struct fuse_context *fc = fuse_get_context();
bool psi_virtualization_enabled = lxcfs_has_opt(fc->private_data, LXCFS_PSI_POLL_ON);
struct file_info *f = INTTYPE_TO_PTR(fi->fh);
__do_free psi_trigger_t *t = NULL;
__do_free char *cgroup = NULL;
__do_close int fd = -EBADF;
char *controller;
int (*get_pressure_fd)(struct cgroup_ops *ops, const char *cgroup);
pid_t initpid;
char tmpbuf[32];
size_t tmpbuf_size;
bool psi_full;
__u32 threshold_us;
__u32 window_us;
int ret;
if (!liblxcfs_functional())
return -EIO;
if (!psi_virtualization_enabled)
return -EINVAL;
/* mimic logic from kernel/sched/psi.c psi_write() */
if (f->private_data)
return -EBUSY;
if (offset || !size)
return -EINVAL;
/*
* We could just use buf as an input argument for write()
* syscall on a real fd, but it can be dangerous. It is safer
* to do all the parsing, parameters validation on LXCFS side
* and only then call write() on the underlying fd.
*/
tmpbuf_size = MIN(size, sizeof(tmpbuf));
memcpy(tmpbuf, buf, tmpbuf_size);
tmpbuf[tmpbuf_size - 1] = '\0';
if (sscanf(tmpbuf, "some %u %u", &threshold_us, &window_us) == 2)
psi_full = false;
else if (sscanf(tmpbuf, "full %u %u", &threshold_us, &window_us) == 2)
psi_full = true;
else
return -EINVAL;
if (window_us == 0 || window_us > WINDOW_MAX_US)
return -EINVAL;
/*
* Use limitation for unprivileged user (see kernels psi_trigger_create()).
* We can relax this later if needed.
*/
if (window_us % 2000000)
window_us = 2000000;
/* Check threshold */
if (threshold_us == 0 || threshold_us > window_us)
return -EINVAL;
/* rebuild everything back */
ret = strnprintf(tmpbuf, sizeof(tmpbuf), "%s %u %u",
psi_full ? "full" : "some", threshold_us, window_us);
if (ret < 0)
return -EIO;
switch (f->type) {
case LXC_TYPE_PROC_PRESSURE_IO:
controller = "blkio";
get_pressure_fd = cgroup_ops->get_pressure_io_fd;
break;
case LXC_TYPE_PROC_PRESSURE_CPU:
controller = "cpu";
get_pressure_fd = cgroup_ops->get_pressure_cpu_fd;
break;
case LXC_TYPE_PROC_PRESSURE_MEMORY:
controller = "memory";
get_pressure_fd = cgroup_ops->get_pressure_memory_fd;
break;
default:
return -EINVAL;
}
initpid = lookup_initpid_in_store(fc->pid);
if (initpid <= 1 || is_shared_pidns(initpid))
initpid = fc->pid;
cgroup = get_pid_cgroup(initpid, controller);
if (!cgroup)
return -EIO;
prune_init_slice(cgroup);
fd = get_pressure_fd(cgroup_ops, cgroup);
if (fd < 0)
return -EIO;
if (write(fd, tmpbuf, strlen(tmpbuf) + 1) < 0) {
lxcfs_error("/proc/pressure/{cpu, io, memory} write error: %s", strerror(errno));
return -EIO;
}
t = zalloc(sizeof(*t));
if (!t)
return -EIO;
t->version = 1;
if (pthread_mutex_init(&t->lock, NULL))
return -EIO;
t->pfd.fd = move_fd(fd);
t->pfd.events = POLLPRI;
/* will be released in proc_psi_trigger_release() */
f->private_data = move_ptr(t);
return size;
}
static void notify_poll(psi_trigger_t *t)
{
if (!t->ph) {
lxcfs_error("notify_poll called without pollhandle");
return;
}
lxcfs_debug("thread %d sends FUSE_NOTIFY_POLL", gettid());
/* send FUSE_NOTIFY_POLL to the kernel */
fuse_notify_poll(t->ph);
fuse_pollhandle_destroy(t->ph);
t->ph = NULL;
}
static void *poll_thread(void *arg)
{
psi_trigger_t *t = arg;
int n;
#ifdef PSITRIGGERTEST
struct timespec req = { 1, 0 };
#endif
pthread_mutex_lock(&t->lock);
t->thread_state = POLL_NOTIFY_THREAD_RUNNING;
/*
* If t->pfd.revents is not zero, then we already got
* a poll event before, so our job is only to notify
* kernel about it with FUSE_NOTIFY_POLL and clean t->pfd.revents.
*/
if (t->pfd.revents) {
notify_poll(t);
t->pfd.revents = 0;
goto exit;
}
lxcfs_debug("thread %d is going to poll", gettid());
again:
#ifndef PSITRIGGERTEST
n = poll(&t->pfd, 1, -1);
#else
/*
* It would be not practical and not stable to simulate pressure events
* on CI. So instead of real poll-ing on PSI files, lets just simulate "events"
* with nanosleep. This allows to test all the main logic.
*
* See also tests/test-psi-triggers-poll.c
*/
n = nanosleep(&req, NULL);
if (n == 0) {
/* simulate successful poll() return with event */
n = 1;
t->pfd.revents = DEFAULT_POLLMASK | POLLPRI;
}
#endif
if (n < 0) {
/* SIG_NOTIFY_POLL_WAKEUP signal? */
if (errno == EINTR) {
lxcfs_debug("poll() was interrupted by a signal");
/* see comment in proc_psi_trigger_release() */
if (t->pfd.fd > 0)
goto again;
} else {
lxcfs_error("poll() failed: %s", strerror(errno));
t->pfd.revents = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
}
goto exit;
}
lxcfs_debug("thread %d has done with poll-ing %d %x", gettid(), n, t->pfd.revents);
if (n > 0)
notify_poll(t);
exit:
t->thread_state = POLL_NOTIFY_THREAD_EXITED;
pthread_mutex_unlock(&t->lock);
return NULL;
}
static int schedule_notify_poll(psi_trigger_t *t)
{
int ret = 0;
ret = pthread_create(&t->tid, NULL, poll_thread, t);
if (ret) {
lxcfs_error("Failed to spawn a thread for FUSE_NOTIFY_POLL: %s", strerror(ret));
goto exit;
}
ret = pthread_detach(t->tid);
if (ret) {
lxcfs_error("Failed to detach the FUSE_NOTIFY_POLL thread: %s", strerror(ret));
goto exit;
}
lxcfs_debug("FUSE_NOTIFY_POLL thread spawned");
t->thread_state = POLL_NOTIFY_THREAD_SPAWNED;
exit:
return ret;
}
static int proc_psi_trigger_poll(const char *path, struct fuse_file_info *fi,
struct fuse_pollhandle *ph, unsigned *reventsp)
{
struct file_info *f = INTTYPE_TO_PTR(fi->fh);
psi_trigger_t *t = f->private_data;
int ret = 0;
if (!liblxcfs_functional()) {
fuse_pollhandle_destroy(ph);
return -EIO;
}
if (f->type != LXC_TYPE_PROC_PRESSURE_IO &&
f->type != LXC_TYPE_PROC_PRESSURE_CPU &&
f->type != LXC_TYPE_PROC_PRESSURE_MEMORY) {
fuse_pollhandle_destroy(ph);
*reventsp = DEFAULT_POLLMASK;
return 0;
}
if (!t) {
fuse_pollhandle_destroy(ph);
*reventsp = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
return 0;
}
/*
* We shouldn't block the kernel and bail out quickly,
* otherwise poll-s with timeout won't work.
*/
if (pthread_mutex_trylock(&t->lock)) {
lxcfs_debug("proc_psi_trigger_poll() can't take a mutex now");
fuse_pollhandle_destroy(ph);
*reventsp = DEFAULT_POLLMASK;
return 0;
}
lxcfs_debug("proc_psi_trigger_poll() has taken t->lock");
/*
* We want to ensure that notify thread was able to take t->lock mutex
* and make its job (or wasn't spawned yet) before we go again.
*/
if (t->thread_state != POLL_NOTIFY_THREAD_EXITED) {
lxcfs_debug("FUSE_NOTIFY_POLL thread hasn't finished before next FUSE_POLL arrive");
goto exit_retry;
}
if (!t->pfd.revents) {
int n = poll(&t->pfd, 1, 0);
if (n < 0) {
lxcfs_error("poll() failed: %s", strerror(errno));
t->pfd.revents = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
}
}
/* let know the kernel which events we've got */
*reventsp = t->pfd.revents;
/* cleanup old notify handle */
if (t->ph) {
fuse_pollhandle_destroy(t->ph);
t->ph = NULL;
}
/*
* When ph is not NULL it means that kernel asks us
* to do FUSE_NOTIFY_POLL once we have something.
*
* We must do this from a separate thread to prevent deadlocks.
*/
if (ph) {
t->ph = ph;
ret = schedule_notify_poll(t);
if (ret)
goto exit;
} else {
/*
* All my tests show that ph is *always* not NULL, basically,
* we always being asked by the kernel to send FUSE_NOTIFY_POLL
* even when *reventsp is non-zero. After sending a notification
* we clean t->pfd.revents to prepare it for the next poll() if needed.
*
* But if ph is NULL, we need to clean it up in here.
*/
t->pfd.revents = 0;
}
exit:
pthread_mutex_unlock(&t->lock);
lxcfs_debug("returning ret = %d *reventsp = %x", ret, *reventsp);
return ret;
exit_retry:
if (ph) {
if (t->ph) {
fuse_pollhandle_destroy(t->ph);
t->ph = NULL;
}
t->ph = ph;
}
/* ask the kernel to get back later */
*reventsp = 0;
pthread_mutex_unlock(&t->lock);
lxcfs_debug("returning ret = %d *reventsp = %x", ret, *reventsp);
return 0;
}
static void proc_psi_trigger_release(struct file_info *f)
{
psi_trigger_t *t = f->private_data;
if (!t)
return;
/*
* It can happen, that proc_psi_trigger_release() is called
* while poll notify thread is running. We can use t->pfd.fd
* as a signal for this thread that it's time to finish.
* Once we close fd and set t->pfd.fd to -EBADF, thread will
* gracefully exit on our signal.
*/
close_prot_errno_disarm(t->pfd.fd);
pthread_kill(t->tid, SIG_NOTIFY_POLL_WAKEUP);
lxcfs_debug("proc_psi_trigger_release is waiting on t->lock");
pthread_mutex_lock(&t->lock);
if (t->thread_state != POLL_NOTIFY_THREAD_EXITED) {
lxcfs_error("FUSE_NOTIFY_POLL thread hasn't finished (gracefully)");
pthread_kill(t->tid, SIGKILL);
}
pthread_mutex_unlock(&t->lock);
/*
* pthread_mutex_lock() works as a barrier in this case, we know
* that nobody can access (t) anymore, because poll notify thread
* is dead by now and we are in f_op->release() callback, there is
* no active read/write/poll syscall in there, because (struct file)
* is being destroyed.
*/
pthread_mutex_destroy(&t->lock);
if (t->ph) {
fuse_pollhandle_destroy(t->ph);
t->ph = NULL;
}
free_disarm(f->private_data);
lxcfs_debug("proc_psi_trigger_release has done");
}
__lxcfs_fuse_ops int proc_write(const char *path, const char *buf, size_t size,
off_t offset, struct fuse_file_info *fi)
{
struct file_info *f = INTTYPE_TO_PTR(fi->fh);
switch (f->type) {
case LXC_TYPE_PROC_PRESSURE_IO:
case LXC_TYPE_PROC_PRESSURE_CPU:
case LXC_TYPE_PROC_PRESSURE_MEMORY:
return proc_psi_trigger_write(path, buf, size, offset, fi);
}
return -EINVAL;
}
__lxcfs_fuse_ops int proc_poll(const char *path, struct fuse_file_info *fi,
struct fuse_pollhandle *ph, unsigned *reventsp)
{
struct file_info *f = INTTYPE_TO_PTR(fi->fh);
switch (f->type) {
case LXC_TYPE_PROC_PRESSURE_IO:
case LXC_TYPE_PROC_PRESSURE_CPU:
case LXC_TYPE_PROC_PRESSURE_MEMORY:
return proc_psi_trigger_poll(path, fi, ph, reventsp);
}
fuse_pollhandle_destroy(ph);
*reventsp = DEFAULT_POLLMASK;
return 0;
}
__lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi)
{
struct file_info *f;
f = INTTYPE_TO_PTR(fi->fh);
if (!f)
return 0;
switch (f->type) {
case LXC_TYPE_PROC_PRESSURE_IO:
case LXC_TYPE_PROC_PRESSURE_CPU:
case LXC_TYPE_PROC_PRESSURE_MEMORY:
proc_psi_trigger_release(f);
break;
}
do_release_file_info(fi);
return 0;
}
__lxcfs_fuse_ops int proc_releasedir(const char *path, struct fuse_file_info *fi)
{
do_release_file_info(fi);
return 0;
}

View File

@@ -21,6 +21,8 @@ __visible extern int proc_open(const char *path, struct fuse_file_info *fi);
__visible extern int proc_opendir(const char *path, struct fuse_file_info *fi);
__visible extern int proc_access(const char *path, int mask);
__visible extern int proc_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi);
__visible extern int proc_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi);
__visible extern int proc_poll(const char *path, struct fuse_file_info *fi, struct fuse_pollhandle *ph, unsigned *reventsp);
__visible extern int proc_release(const char *path, struct fuse_file_info *fi);
__visible extern int proc_releasedir(const char *path, struct fuse_file_info *fi);

View File

@@ -51,7 +51,7 @@ if [ -x ${lxcfs} ]; then
export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}"
fi
echo "=> Spawning ${lxcfs} ${LXCFSDIR}"
${lxcfs} --enable-cgroup -p ${pidfile} ${LXCFSDIR} &
${lxcfs} --enable-cgroup --enable-psi-poll -p ${pidfile} ${LXCFSDIR} &
LXCFSPID=$!
else
UNSHARE=0

View File

@@ -45,7 +45,7 @@ if [ -x ${lxcfs} ]; then
export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}"
fi
echo "=> Spawning ${lxcfs} ${LXCFSDIR}"
${lxcfs} --enable-cgroup -p ${pidfile} ${LXCFSDIR} &
${lxcfs} --enable-cgroup --enable-psi-poll -p ${pidfile} ${LXCFSDIR} &
LXCFSPID=$!
else
UNSHARE=0

View File

@@ -145,3 +145,10 @@ test_programs += executable(
include_directories: config_include,
install: false,
build_by_default: want_tests != false)
test_programs += executable(
'test-psi-triggers-poll',
'test-psi-triggers-poll.c',
include_directories: config_include,
install: false,
build_by_default: want_tests != false)

View File

@@ -0,0 +1,88 @@
/* Borrowed from https://www.kernel.org/doc/Documentation/accounting/psi.rst */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <poll.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#define WAIT_EVENTS_NUM 4
/*
* Monitor cpu partial stall with 2s tracking window size
* and 20ms threshold.
*/
int main(int argc, char **argv) {
const char trig[] = "some 20000 2000000";
struct pollfd fds;
int n;
char *path;
size_t len;
int events_cnt = 0;
if (geteuid() != 0) {
fprintf(stderr, "Run me as root\n");
exit(1);
}
if (argc != 2) {
fprintf(stderr, "Usage: %s [lxcfs_mount_path]\n", argv[0]);
exit(1);
}
len = strlen(argv[1]) + strlen("/proc/pressure/cpu") + 1;
path = alloca(len);
snprintf(path, len, "%s/proc/pressure/cpu", argv[1]);
fds.fd = open(path, O_RDWR | O_NONBLOCK);
if (fds.fd < 0) {
printf("%s open error: %s\n", path, strerror(errno));
return 1;
}
fds.events = POLLPRI;
if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
printf("/proc/pressure/cpu write error: %s\n",
strerror(errno));
return 1;
}
printf("waiting for events...\n");
time_t t1 = time(NULL);
while (1) {
/* test code in proc_fuse.c poll_thread() generates 1 event per second */
n = poll(&fds, 1, 3 * 1000);
if (n < 0) {
printf("poll error: %s\n", strerror(errno));
return 1;
}
if (n == 0) {
printf("timeout\n");
return 1;
}
if (fds.revents & POLLERR) {
printf("got POLLERR, event source is gone\n");
return 1;
}
if (fds.revents & POLLPRI) {
printf("event triggered!\n");
events_cnt++;
if (events_cnt == WAIT_EVENTS_NUM)
break;
} else {
printf("unknown event received: 0x%x\n", fds.revents);
return 1;
}
}
time_t t2 = time(NULL);
printf("events_cnt = %d time diff = %ld\n", events_cnt, (long)(t2 - t1));
/* events frequency is 1 HZ, so we can expect that difference <= 1 */
if (labs((long)(t2 - t1) - events_cnt) > 1) {
printf("| (t2 - t1) - events_cnt | > 1 while should be <= 1\n");
return 1;
}
return 0;
}

View File

@@ -92,4 +92,10 @@ echo "==> Testing /proc/stat"
echo "==> Testing /proc/meminfo"
grep -q "^MemTotal.*65536 kB$" ${LXCFSDIR}/proc/meminfo
# We don't support PSI triggers virtualization with old libfuse
if [ "$LIBFUSE" != "fuse" ]; then
echo "==> Testing /proc/pressure/{cpu, io, memory}"
{{LXCFS_BUILD_ROOT}}/tests/test-psi-triggers-poll ${LXCFSDIR}
fi
PASS=1