diff --git a/.github/actions/build/action.yml b/.github/actions/build/action.yml index 8b05c2c..80e8b68 100644 --- a/.github/actions/build/action.yml +++ b/.github/actions/build/action.yml @@ -48,6 +48,7 @@ runs: meson setup build \ -Ddocs=false \ -Dtests=true \ + -Dmocks=true \ -Dinit-script=systemd \ -Dprefix=/usr \ -Db_sanitize=address,undefined diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index edac402..251435c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,6 +38,7 @@ jobs: - name: Test env: CC: ${{ matrix.compiler }} + LIBFUSE: ${{ matrix.fuse }} run: | echo "::group::Running the testsuite" diff --git a/meson.build b/meson.build index ad26a4b..1073afb 100644 --- a/meson.build +++ b/meson.build @@ -69,6 +69,7 @@ conf.set_quoted('LXCFSTARGETDIR', join_paths(localstatedir, 'lib/lxcfs')) # Custom configuration. init_script = get_option('init-script') want_tests = get_option('tests') +want_mocks = get_option('mocks') want_docs = get_option('docs') # Build flags. @@ -232,12 +233,18 @@ liblxcfs_common_dependencies = declare_dependency( libfuse, ]) +liblxcfs_cargs = [] +if want_mocks == true + liblxcfs_cargs = ['-DPSITRIGGERTEST', '-DDEBUG'] +endif + liblxcfs = shared_module( 'lxcfs', liblxcfs_sources, dependencies: liblxcfs_common_dependencies, install: true, - install_dir: lxcfsdir) + install_dir: lxcfsdir, + c_args: liblxcfs_cargs) # Tests. test_programs = [] @@ -248,7 +255,7 @@ if want_tests == true dependencies: liblxcfs_common_dependencies, install: false, install_dir: lxcfsdir, - c_args: '-DRELOADTEST -DDEBUG') + c_args: ['-DRELOADTEST', '-DDEBUG']) endif # RPM spec. @@ -306,6 +313,7 @@ status = [ 'lxcfs source root directory: @0@'.format(project_source_root), 'init system(s): @0@'.format(init_script), 'tests: @0@'.format(want_tests), + 'mocks: @0@'.format(want_mocks), 'documentation: @0@'.format(want_docs), ] message('\n '.join(status)) diff --git a/meson_options.txt b/meson_options.txt index 5566f4c..9f29f63 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -3,6 +3,9 @@ option('tests', type : 'boolean', value: 'false', description : 'enable tests') +option('mocks', type : 'boolean', value: 'false', + description : 'enable CI-only features (mocks some things in LXCFS for CI/testing purposes)') + option('runtime-path', type : 'string', value : '/run', description : 'the runtime directory') diff --git a/src/bindings.h b/src/bindings.h index 3abd757..f65e42d 100644 --- a/src/bindings.h +++ b/src/bindings.h @@ -87,6 +87,16 @@ enum lxcfs_virt_t { #define LXCFS_TYPE_SYS(type) (type >= LXC_TYPE_SYS && type <= LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE) #define LXCFS_TYPE_OK(type) (type >= LXC_TYPE_CGDIR && type < LXC_TYPE_MAX) +/* + * This signal will be used in /proc/pressure/{cpu, memory, io} poll() + * virtualization to notify a background thread that it's + * a time to finish. + * + * We don't need a handler for this, we want this signal to be + * ignored, but interrupt any interruptible syscall like poll(). + */ +#define SIG_NOTIFY_POLL_WAKEUP (SIGRTMIN + 0) + /* * This signal will be used to signal fuse request processing thread that * request was interrupted (FUSE_INTERRUPT came from the kernel). @@ -106,13 +116,20 @@ extern int rwlock_rdlock_interruptible(pthread_rwlock_t *l); extern int rwlock_wrlock_interruptible(pthread_rwlock_t *l); struct file_info { - char *controller; - char *cgroup; - char *file; + union { + struct { + char *controller; + char *cgroup; + char *file; + }; + struct { + void *private_data; + }; + }; int type; char *buf; /* unused */ int buflen; - int size; /*actual data size */ + int size; /* actual data size */ int cached; }; @@ -125,17 +142,19 @@ struct lxcfs_opts { * and the use of bool instead of explicited __u32 and __u64 we can't. */ __u32 version; - // As of opts version 2. - char runtime_path[PATH_MAX]; + // As of opts version 2. + char runtime_path[PATH_MAX]; bool zswap_off; + bool psi_poll_on; }; typedef enum lxcfs_opt_t { - LXCFS_SWAP_ON = 0, - LXCFS_PIDFD_ON = 1, - LXCFS_CFS_ON = 2, - LXCFS_ZSWAP_ON = 3, - LXCFS_OPTS_MAX = LXCFS_ZSWAP_ON, + LXCFS_SWAP_ON = 0, + LXCFS_PIDFD_ON = 1, + LXCFS_CFS_ON = 2, + LXCFS_ZSWAP_ON = 3, + LXCFS_PSI_POLL_ON = 4, + LXCFS_OPTS_MAX = LXCFS_PSI_POLL_ON, } lxcfs_opt_t; @@ -172,6 +191,8 @@ static inline bool lxcfs_has_opt(struct lxcfs_opts *opts, lxcfs_opt_t opt) if (opts->version >= 3 && !opts->zswap_off) return liblxcfs_can_use_zswap(); return false; + case LXCFS_PSI_POLL_ON: + return (opts->version >= 4 && opts->psi_poll_on); } return false; diff --git a/src/lxcfs.c b/src/lxcfs.c index 246ff4b..af14e3b 100644 --- a/src/lxcfs.c +++ b/src/lxcfs.c @@ -242,6 +242,8 @@ static void sigusr1_reload(int signo, siginfo_t *info, void *extra) need_reload = 1; } +static void sig_noop_handler(int signo, siginfo_t *info, void *extra) {} + /* Functions to run the library methods */ #define DEF_LIB_FS_OP(type, fsop) \ @@ -276,8 +278,14 @@ DEF_LIB_FS_OP(sys , read) off_t offset, struct fuse_file_info *fi #define LIB_FS_write_OP_ARGS path, buf, size, offset, fi DEF_LIB_FS_OP(cg , write) +DEF_LIB_FS_OP(proc , write) DEF_LIB_FS_OP(sys , write) +#define LIB_FS_poll_OP_ARGS_TYPE const char *path, struct fuse_file_info *fi, \ + struct fuse_pollhandle *ph, unsigned *reventsp +#define LIB_FS_poll_OP_ARGS path, fi, ph, reventsp +DEF_LIB_FS_OP(proc , poll) + #define LIB_FS_mkdir_OP_ARGS_TYPE const char *path, mode_t mode #define LIB_FS_mkdir_OP_ARGS path, mode DEF_LIB_FS_OP(cg, mkdir) @@ -605,6 +613,13 @@ int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset, return ret; } + if (LXCFS_TYPE_PROC(type)) { + up_users(); + ret = do_proc_write(path, buf, size, offset, fi); + down_users(); + return ret; + } + if (LXCFS_TYPE_SYS(type)) { up_users(); ret = do_sys_write(path, buf, size, offset, fi); @@ -615,6 +630,27 @@ int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset, return -EINVAL; } +int lxcfs_poll(const char *path, struct fuse_file_info *fi, + struct fuse_pollhandle *ph, unsigned *reventsp) +{ + int ret; + enum lxcfs_virt_t type; + + type = file_info_type(fi); + + if (LXCFS_TYPE_PROC(type)) { + up_users(); + ret = do_proc_poll(path, fi, ph, reventsp); + down_users(); + return ret; + } + + /* default f_op->poll() behavior when not supported */ + fuse_pollhandle_destroy(ph); + *reventsp = DEFAULT_POLLMASK; + return 0; +} + int lxcfs_readlink(const char *path, char *buf, size_t size) { int ret; @@ -842,6 +878,9 @@ const struct fuse_operations lxcfs_ops = { .truncate = lxcfs_truncate, .write = lxcfs_write, .readlink = lxcfs_readlink, +#if HAVE_FUSE3 + .poll = lxcfs_poll, +#endif .create = NULL, .destroy = NULL, @@ -937,6 +976,7 @@ static const struct option long_options[] = { {"enable-cfs", no_argument, 0, 0 }, {"enable-pidfd", no_argument, 0, 0 }, {"enable-cgroup", no_argument, 0, 0 }, + {"enable-psi-poll", no_argument, 0, 0 }, {"pidfile", required_argument, 0, 'p' }, {"runtime-dir", required_argument, 0, 0 }, @@ -1011,7 +1051,8 @@ int main(int argc, char *argv[]) opts->zswap_off = false; opts->use_pidfd = false; opts->use_cfs = false; - opts->version = 3; + opts->psi_poll_on = false; + opts->version = 4; while ((c = getopt_long(argc, argv, "dulfhvso:p:", long_options, &idx)) != -1) { switch (c) { @@ -1022,6 +1063,8 @@ int main(int argc, char *argv[]) opts->use_cfs = true; else if (strcmp(long_options[idx].name, "enable-cgroup") == 0) cgroup_is_enabled = true; + else if (strcmp(long_options[idx].name, "enable-psi-poll") == 0) + opts->psi_poll_on = true; else if (strcmp(long_options[idx].name, "runtime-dir") == 0) runtime_path_arg = optarg; else @@ -1189,6 +1232,11 @@ int main(int argc, char *argv[]) } #endif + if (install_signal_handler(SIG_NOTIFY_POLL_WAKEUP, sig_noop_handler)) { + lxcfs_error("%s - Failed to install SIG_NOTIFY_POLL_WAKEUP signal handler", strerror(errno)); + goto out; + } + if (!pidfile) { snprintf(pidfile_buf, sizeof(pidfile_buf), "%s%s", runtime_path, PID_FILE); pidfile = pidfile_buf; diff --git a/src/macro.h b/src/macro.h index 9e2808b..4c939f8 100644 --- a/src/macro.h +++ b/src/macro.h @@ -20,6 +20,8 @@ #define CGROUP2_SUPER_MAGIC 0x63677270 #endif +#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM) + #define lxcfs_debug_stream(stream, format, ...) \ do { \ fprintf(stream, "%s: %d: %s: " format "\n", __FILE__, \ diff --git a/src/proc_fuse.c b/src/proc_fuse.c index 335ff40..8ed3be2 100644 --- a/src/proc_fuse.c +++ b/src/proc_fuse.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -313,18 +314,6 @@ __lxcfs_fuse_ops int proc_access(const char *path, int mask) return 0; } -__lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi) -{ - do_release_file_info(fi); - return 0; -} - -__lxcfs_fuse_ops int proc_releasedir(const char *path, struct fuse_file_info *fi) -{ - do_release_file_info(fi); - return 0; -} - /** * Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit is * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise @@ -1709,7 +1698,7 @@ static int proc_slabinfo_read(char *buf, size_t size, off_t offset, return total_len; } -static int proc_pressure_io_read(char *buf, size_t size, off_t offset, +static int proc_pressure_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_free char *cgroup = NULL, *line = NULL; @@ -1721,6 +1710,9 @@ static int proc_pressure_io_read(char *buf, size_t size, off_t offset, size_t linelen = 0, total_len = 0; char *cache = d->buf; size_t cache_size = d->buflen; + char *fallback_path; + char *controller; + int (*get_pressure_fd)(struct cgroup_ops *ops, const char *cgroup); pid_t initpid; if (offset) { @@ -1739,161 +1731,43 @@ static int proc_pressure_io_read(char *buf, size_t size, off_t offset, return total_len; } - initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - - cgroup = get_pid_cgroup(initpid, "blkio"); - if (!cgroup) - return read_file_fuse("/proc/pressure/io", buf, size, d); - - prune_init_slice(cgroup); - - fd = cgroup_ops->get_pressure_io_fd(cgroup_ops, cgroup); - if (fd < 0) - return read_file_fuse("/proc/pressure/io", buf, size, d); - - f = fdopen_cached(fd, "re", &fopen_cache); - if (!f) - return read_file_fuse("/proc/pressure/io", buf, size, d); - - while (getline(&line, &linelen, f) != -1) { - ssize_t l = snprintf(cache, cache_size, "%s", line); - if (l < 0) - return log_error(0, "Failed to write cache"); - if ((size_t)l >= cache_size) - return log_error(0, "Write to cache was truncated"); - - cache += l; - cache_size -= l; - total_len += l; - } - - d->cached = 1; - d->size = total_len; - if (total_len > size) - total_len = size; - memcpy(buf, d->buf, total_len); - - return total_len; -} - -static int proc_pressure_cpu_read(char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - __do_free char *cgroup = NULL, *line = NULL; - __do_free void *fopen_cache = NULL; - __do_fclose FILE *f = NULL; - __do_close int fd = -EBADF; - struct fuse_context *fc = fuse_get_context(); - struct file_info *d = INTTYPE_TO_PTR(fi->fh); - size_t linelen = 0, total_len = 0; - char *cache = d->buf; - size_t cache_size = d->buflen; - pid_t initpid; - - if (offset) { - size_t left; - - if (offset > d->size) - return -EINVAL; - - if (!d->cached) - return 0; - - left = d->size - offset; - total_len = left > size ? size : left; - memcpy(buf, cache + offset, total_len); - - return total_len; + switch (d->type) { + case LXC_TYPE_PROC_PRESSURE_IO: + fallback_path = LXC_TYPE_PROC_PRESSURE_IO_PATH; + controller = "blkio"; + get_pressure_fd = cgroup_ops->get_pressure_io_fd; + break; + case LXC_TYPE_PROC_PRESSURE_CPU: + fallback_path = LXC_TYPE_PROC_PRESSURE_CPU_PATH; + controller = "cpu"; + get_pressure_fd = cgroup_ops->get_pressure_cpu_fd; + break; + case LXC_TYPE_PROC_PRESSURE_MEMORY: + fallback_path = LXC_TYPE_PROC_PRESSURE_MEMORY_PATH; + controller = "memory"; + get_pressure_fd = cgroup_ops->get_pressure_memory_fd; + break; + default: + return -EINVAL; } initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; - cgroup = get_pid_cgroup(initpid, "cpu"); + cgroup = get_pid_cgroup(initpid, controller); if (!cgroup) - return read_file_fuse("/proc/pressure/cpu", buf, size, d); + return read_file_fuse(fallback_path, buf, size, d); prune_init_slice(cgroup); - fd = cgroup_ops->get_pressure_cpu_fd(cgroup_ops, cgroup); + fd = get_pressure_fd(cgroup_ops, cgroup); if (fd < 0) - return read_file_fuse("/proc/pressure/cpu", buf, size, d); + return read_file_fuse(fallback_path, buf, size, d); f = fdopen_cached(fd, "re", &fopen_cache); if (!f) - return read_file_fuse("/proc/pressure/cpu", buf, size, d); - - while (getline(&line, &linelen, f) != -1) { - ssize_t l = snprintf(cache, cache_size, "%s", line); - if (l < 0) - return log_error(0, "Failed to write cache"); - if ((size_t)l >= cache_size) - return log_error(0, "Write to cache was truncated"); - - cache += l; - cache_size -= l; - total_len += l; - } - - d->cached = 1; - d->size = total_len; - if (total_len > size) - total_len = size; - memcpy(buf, d->buf, total_len); - - return total_len; -} - -static int proc_pressure_memory_read(char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - __do_free char *cgroup = NULL, *line = NULL; - __do_free void *fopen_cache = NULL; - __do_fclose FILE *f = NULL; - __do_close int fd = -EBADF; - struct fuse_context *fc = fuse_get_context(); - struct file_info *d = INTTYPE_TO_PTR(fi->fh); - size_t linelen = 0, total_len = 0; - char *cache = d->buf; - size_t cache_size = d->buflen; - pid_t initpid; - - if (offset) { - size_t left; - - if (offset > d->size) - return -EINVAL; - - if (!d->cached) - return 0; - - left = d->size - offset; - total_len = left > size ? size : left; - memcpy(buf, cache + offset, total_len); - - return total_len; - } - - initpid = lookup_initpid_in_store(fc->pid); - if (initpid <= 1 || is_shared_pidns(initpid)) - initpid = fc->pid; - - cgroup = get_pid_cgroup(initpid, "memory"); - if (!cgroup) - return read_file_fuse("/proc/pressure/memory", buf, size, d); - - prune_init_slice(cgroup); - - fd = cgroup_ops->get_pressure_memory_fd(cgroup_ops, cgroup); - if (fd < 0) - return read_file_fuse("/proc/pressure/memory", buf, size, d); - - f = fdopen_cached(fd, "re", &fopen_cache); - if (!f) - return read_file_fuse("/proc/pressure/memory", buf, size, d); + return read_file_fuse(fallback_path, buf, size, d); while (getline(&line, &linelen, f) != -1) { ssize_t l = snprintf(cache, cache_size, "%s", line); @@ -2015,19 +1889,19 @@ __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size, buf, size, offset, f); case LXC_TYPE_PROC_PRESSURE_IO: if (liblxcfs_functional()) - return proc_pressure_io_read(buf, size, offset, fi); + return proc_pressure_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_PRESSURE_IO_PATH, buf, size, offset, f); case LXC_TYPE_PROC_PRESSURE_CPU: if (liblxcfs_functional()) - return proc_pressure_cpu_read(buf, size, offset, fi); + return proc_pressure_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_PRESSURE_CPU_PATH, buf, size, offset, f); case LXC_TYPE_PROC_PRESSURE_MEMORY: if (liblxcfs_functional()) - return proc_pressure_memory_read(buf, size, offset, fi); + return proc_pressure_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_PRESSURE_MEMORY_PATH, buf, size, offset, f); @@ -2035,3 +1909,473 @@ __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size, return -EINVAL; } + +typedef enum { + POLL_NOTIFY_THREAD_EXITED = 0, + POLL_NOTIFY_THREAD_SPAWNED = 1, + POLL_NOTIFY_THREAD_RUNNING = 2, +} notify_poll_thread_state_t; + +typedef struct psi_trigger { + /* increase version if the structure was changed */ + __u16 version; + + pthread_mutex_t lock; + + pthread_t tid; + notify_poll_thread_state_t thread_state; + + /* libfuse's FUSE_NOTIFY_POLL handle */ + struct fuse_pollhandle *ph; + + struct pollfd pfd; +} psi_trigger_t; + +/* PSI trigger definitions from the kernel */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ + +static int proc_psi_trigger_write(const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + struct fuse_context *fc = fuse_get_context(); + bool psi_virtualization_enabled = lxcfs_has_opt(fc->private_data, LXCFS_PSI_POLL_ON); + struct file_info *f = INTTYPE_TO_PTR(fi->fh); + __do_free psi_trigger_t *t = NULL; + __do_free char *cgroup = NULL; + __do_close int fd = -EBADF; + char *controller; + int (*get_pressure_fd)(struct cgroup_ops *ops, const char *cgroup); + pid_t initpid; + char tmpbuf[32]; + size_t tmpbuf_size; + bool psi_full; + __u32 threshold_us; + __u32 window_us; + int ret; + + if (!liblxcfs_functional()) + return -EIO; + + if (!psi_virtualization_enabled) + return -EINVAL; + + /* mimic logic from kernel/sched/psi.c psi_write() */ + if (f->private_data) + return -EBUSY; + + if (offset || !size) + return -EINVAL; + + /* + * We could just use buf as an input argument for write() + * syscall on a real fd, but it can be dangerous. It is safer + * to do all the parsing, parameters validation on LXCFS side + * and only then call write() on the underlying fd. + */ + + tmpbuf_size = MIN(size, sizeof(tmpbuf)); + memcpy(tmpbuf, buf, tmpbuf_size); + tmpbuf[tmpbuf_size - 1] = '\0'; + + if (sscanf(tmpbuf, "some %u %u", &threshold_us, &window_us) == 2) + psi_full = false; + else if (sscanf(tmpbuf, "full %u %u", &threshold_us, &window_us) == 2) + psi_full = true; + else + return -EINVAL; + + if (window_us == 0 || window_us > WINDOW_MAX_US) + return -EINVAL; + + /* + * Use limitation for unprivileged user (see kernels psi_trigger_create()). + * We can relax this later if needed. + */ + if (window_us % 2000000) + window_us = 2000000; + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return -EINVAL; + + /* rebuild everything back */ + ret = strnprintf(tmpbuf, sizeof(tmpbuf), "%s %u %u", + psi_full ? "full" : "some", threshold_us, window_us); + if (ret < 0) + return -EIO; + + switch (f->type) { + case LXC_TYPE_PROC_PRESSURE_IO: + controller = "blkio"; + get_pressure_fd = cgroup_ops->get_pressure_io_fd; + break; + case LXC_TYPE_PROC_PRESSURE_CPU: + controller = "cpu"; + get_pressure_fd = cgroup_ops->get_pressure_cpu_fd; + break; + case LXC_TYPE_PROC_PRESSURE_MEMORY: + controller = "memory"; + get_pressure_fd = cgroup_ops->get_pressure_memory_fd; + break; + default: + return -EINVAL; + } + + initpid = lookup_initpid_in_store(fc->pid); + if (initpid <= 1 || is_shared_pidns(initpid)) + initpid = fc->pid; + + cgroup = get_pid_cgroup(initpid, controller); + if (!cgroup) + return -EIO; + + prune_init_slice(cgroup); + + fd = get_pressure_fd(cgroup_ops, cgroup); + if (fd < 0) + return -EIO; + + if (write(fd, tmpbuf, strlen(tmpbuf) + 1) < 0) { + lxcfs_error("/proc/pressure/{cpu, io, memory} write error: %s", strerror(errno)); + return -EIO; + } + + t = zalloc(sizeof(*t)); + if (!t) + return -EIO; + + t->version = 1; + + if (pthread_mutex_init(&t->lock, NULL)) + return -EIO; + + t->pfd.fd = move_fd(fd); + t->pfd.events = POLLPRI; + + /* will be released in proc_psi_trigger_release() */ + f->private_data = move_ptr(t); + + return size; +} + +static void notify_poll(psi_trigger_t *t) +{ + if (!t->ph) { + lxcfs_error("notify_poll called without pollhandle"); + return; + } + + lxcfs_debug("thread %d sends FUSE_NOTIFY_POLL", gettid()); + + /* send FUSE_NOTIFY_POLL to the kernel */ + fuse_notify_poll(t->ph); + + fuse_pollhandle_destroy(t->ph); + t->ph = NULL; +} + +static void *poll_thread(void *arg) +{ + psi_trigger_t *t = arg; + int n; +#ifdef PSITRIGGERTEST + struct timespec req = { 1, 0 }; +#endif + + pthread_mutex_lock(&t->lock); + t->thread_state = POLL_NOTIFY_THREAD_RUNNING; + + /* + * If t->pfd.revents is not zero, then we already got + * a poll event before, so our job is only to notify + * kernel about it with FUSE_NOTIFY_POLL and clean t->pfd.revents. + */ + if (t->pfd.revents) { + notify_poll(t); + t->pfd.revents = 0; + goto exit; + } + + lxcfs_debug("thread %d is going to poll", gettid()); + +again: +#ifndef PSITRIGGERTEST + n = poll(&t->pfd, 1, -1); +#else + /* + * It would be not practical and not stable to simulate pressure events + * on CI. So instead of real poll-ing on PSI files, lets just simulate "events" + * with nanosleep. This allows to test all the main logic. + * + * See also tests/test-psi-triggers-poll.c + */ + n = nanosleep(&req, NULL); + if (n == 0) { + /* simulate successful poll() return with event */ + n = 1; + t->pfd.revents = DEFAULT_POLLMASK | POLLPRI; + } +#endif + + if (n < 0) { + /* SIG_NOTIFY_POLL_WAKEUP signal? */ + if (errno == EINTR) { + lxcfs_debug("poll() was interrupted by a signal"); + + /* see comment in proc_psi_trigger_release() */ + if (t->pfd.fd > 0) + goto again; + } else { + lxcfs_error("poll() failed: %s", strerror(errno)); + t->pfd.revents = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + + goto exit; + } + + lxcfs_debug("thread %d has done with poll-ing %d %x", gettid(), n, t->pfd.revents); + + if (n > 0) + notify_poll(t); + +exit: + t->thread_state = POLL_NOTIFY_THREAD_EXITED; + pthread_mutex_unlock(&t->lock); + return NULL; +} + +static int schedule_notify_poll(psi_trigger_t *t) +{ + int ret = 0; + + ret = pthread_create(&t->tid, NULL, poll_thread, t); + if (ret) { + lxcfs_error("Failed to spawn a thread for FUSE_NOTIFY_POLL: %s", strerror(ret)); + goto exit; + } + + ret = pthread_detach(t->tid); + if (ret) { + lxcfs_error("Failed to detach the FUSE_NOTIFY_POLL thread: %s", strerror(ret)); + goto exit; + } + + lxcfs_debug("FUSE_NOTIFY_POLL thread spawned"); + t->thread_state = POLL_NOTIFY_THREAD_SPAWNED; + +exit: + return ret; +} + +static int proc_psi_trigger_poll(const char *path, struct fuse_file_info *fi, + struct fuse_pollhandle *ph, unsigned *reventsp) +{ + struct file_info *f = INTTYPE_TO_PTR(fi->fh); + psi_trigger_t *t = f->private_data; + int ret = 0; + + if (!liblxcfs_functional()) { + fuse_pollhandle_destroy(ph); + return -EIO; + } + + if (f->type != LXC_TYPE_PROC_PRESSURE_IO && + f->type != LXC_TYPE_PROC_PRESSURE_CPU && + f->type != LXC_TYPE_PROC_PRESSURE_MEMORY) { + fuse_pollhandle_destroy(ph); + *reventsp = DEFAULT_POLLMASK; + return 0; + } + + if (!t) { + fuse_pollhandle_destroy(ph); + *reventsp = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + return 0; + } + + /* + * We shouldn't block the kernel and bail out quickly, + * otherwise poll-s with timeout won't work. + */ + if (pthread_mutex_trylock(&t->lock)) { + lxcfs_debug("proc_psi_trigger_poll() can't take a mutex now"); + fuse_pollhandle_destroy(ph); + *reventsp = DEFAULT_POLLMASK; + return 0; + } + + lxcfs_debug("proc_psi_trigger_poll() has taken t->lock"); + + /* + * We want to ensure that notify thread was able to take t->lock mutex + * and make its job (or wasn't spawned yet) before we go again. + */ + if (t->thread_state != POLL_NOTIFY_THREAD_EXITED) { + lxcfs_debug("FUSE_NOTIFY_POLL thread hasn't finished before next FUSE_POLL arrive"); + goto exit_retry; + } + + if (!t->pfd.revents) { + int n = poll(&t->pfd, 1, 0); + if (n < 0) { + lxcfs_error("poll() failed: %s", strerror(errno)); + t->pfd.revents = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + } + + /* let know the kernel which events we've got */ + *reventsp = t->pfd.revents; + + /* cleanup old notify handle */ + if (t->ph) { + fuse_pollhandle_destroy(t->ph); + t->ph = NULL; + } + + /* + * When ph is not NULL it means that kernel asks us + * to do FUSE_NOTIFY_POLL once we have something. + * + * We must do this from a separate thread to prevent deadlocks. + */ + if (ph) { + t->ph = ph; + + ret = schedule_notify_poll(t); + if (ret) + goto exit; + } else { + /* + * All my tests show that ph is *always* not NULL, basically, + * we always being asked by the kernel to send FUSE_NOTIFY_POLL + * even when *reventsp is non-zero. After sending a notification + * we clean t->pfd.revents to prepare it for the next poll() if needed. + * + * But if ph is NULL, we need to clean it up in here. + */ + t->pfd.revents = 0; + } + +exit: + pthread_mutex_unlock(&t->lock); + lxcfs_debug("returning ret = %d *reventsp = %x", ret, *reventsp); + return ret; + +exit_retry: + if (ph) { + if (t->ph) { + fuse_pollhandle_destroy(t->ph); + t->ph = NULL; + } + + t->ph = ph; + } + + /* ask the kernel to get back later */ + *reventsp = 0; + pthread_mutex_unlock(&t->lock); + lxcfs_debug("returning ret = %d *reventsp = %x", ret, *reventsp); + return 0; +} + +static void proc_psi_trigger_release(struct file_info *f) +{ + psi_trigger_t *t = f->private_data; + + if (!t) + return; + + /* + * It can happen, that proc_psi_trigger_release() is called + * while poll notify thread is running. We can use t->pfd.fd + * as a signal for this thread that it's time to finish. + * Once we close fd and set t->pfd.fd to -EBADF, thread will + * gracefully exit on our signal. + */ + close_prot_errno_disarm(t->pfd.fd); + pthread_kill(t->tid, SIG_NOTIFY_POLL_WAKEUP); + + lxcfs_debug("proc_psi_trigger_release is waiting on t->lock"); + pthread_mutex_lock(&t->lock); + if (t->thread_state != POLL_NOTIFY_THREAD_EXITED) { + lxcfs_error("FUSE_NOTIFY_POLL thread hasn't finished (gracefully)"); + pthread_kill(t->tid, SIGKILL); + } + pthread_mutex_unlock(&t->lock); + + /* + * pthread_mutex_lock() works as a barrier in this case, we know + * that nobody can access (t) anymore, because poll notify thread + * is dead by now and we are in f_op->release() callback, there is + * no active read/write/poll syscall in there, because (struct file) + * is being destroyed. + */ + + pthread_mutex_destroy(&t->lock); + + if (t->ph) { + fuse_pollhandle_destroy(t->ph); + t->ph = NULL; + } + + free_disarm(f->private_data); + lxcfs_debug("proc_psi_trigger_release has done"); +} + +__lxcfs_fuse_ops int proc_write(const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + struct file_info *f = INTTYPE_TO_PTR(fi->fh); + + switch (f->type) { + case LXC_TYPE_PROC_PRESSURE_IO: + case LXC_TYPE_PROC_PRESSURE_CPU: + case LXC_TYPE_PROC_PRESSURE_MEMORY: + return proc_psi_trigger_write(path, buf, size, offset, fi); + } + + return -EINVAL; +} + +__lxcfs_fuse_ops int proc_poll(const char *path, struct fuse_file_info *fi, + struct fuse_pollhandle *ph, unsigned *reventsp) +{ + struct file_info *f = INTTYPE_TO_PTR(fi->fh); + + switch (f->type) { + case LXC_TYPE_PROC_PRESSURE_IO: + case LXC_TYPE_PROC_PRESSURE_CPU: + case LXC_TYPE_PROC_PRESSURE_MEMORY: + return proc_psi_trigger_poll(path, fi, ph, reventsp); + } + + fuse_pollhandle_destroy(ph); + *reventsp = DEFAULT_POLLMASK; + return 0; +} + +__lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi) +{ + struct file_info *f; + + f = INTTYPE_TO_PTR(fi->fh); + if (!f) + return 0; + + switch (f->type) { + case LXC_TYPE_PROC_PRESSURE_IO: + case LXC_TYPE_PROC_PRESSURE_CPU: + case LXC_TYPE_PROC_PRESSURE_MEMORY: + proc_psi_trigger_release(f); + break; + } + + do_release_file_info(fi); + return 0; +} + +__lxcfs_fuse_ops int proc_releasedir(const char *path, struct fuse_file_info *fi) +{ + do_release_file_info(fi); + return 0; +} diff --git a/src/proc_fuse.h b/src/proc_fuse.h index ebf8e7a..239d738 100644 --- a/src/proc_fuse.h +++ b/src/proc_fuse.h @@ -21,6 +21,8 @@ __visible extern int proc_open(const char *path, struct fuse_file_info *fi); __visible extern int proc_opendir(const char *path, struct fuse_file_info *fi); __visible extern int proc_access(const char *path, int mask); __visible extern int proc_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi); +__visible extern int proc_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi); +__visible extern int proc_poll(const char *path, struct fuse_file_info *fi, struct fuse_pollhandle *ph, unsigned *reventsp); __visible extern int proc_release(const char *path, struct fuse_file_info *fi); __visible extern int proc_releasedir(const char *path, struct fuse_file_info *fi); diff --git a/tests/live-upgrade-test.sh.in b/tests/live-upgrade-test.sh.in index d0f829b..39c8e0b 100755 --- a/tests/live-upgrade-test.sh.in +++ b/tests/live-upgrade-test.sh.in @@ -51,7 +51,7 @@ if [ -x ${lxcfs} ]; then export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}" fi echo "=> Spawning ${lxcfs} ${LXCFSDIR}" - ${lxcfs} --enable-cgroup -p ${pidfile} ${LXCFSDIR} & + ${lxcfs} --enable-cgroup --enable-psi-poll -p ${pidfile} ${LXCFSDIR} & LXCFSPID=$! else UNSHARE=0 diff --git a/tests/main.sh.in b/tests/main.sh.in index 088286d..07fd25a 100755 --- a/tests/main.sh.in +++ b/tests/main.sh.in @@ -45,7 +45,7 @@ if [ -x ${lxcfs} ]; then export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}" fi echo "=> Spawning ${lxcfs} ${LXCFSDIR}" - ${lxcfs} --enable-cgroup -p ${pidfile} ${LXCFSDIR} & + ${lxcfs} --enable-cgroup --enable-psi-poll -p ${pidfile} ${LXCFSDIR} & LXCFSPID=$! else UNSHARE=0 diff --git a/tests/meson.build b/tests/meson.build index d82d7be..bca9e42 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -145,3 +145,10 @@ test_programs += executable( include_directories: config_include, install: false, build_by_default: want_tests != false) + +test_programs += executable( + 'test-psi-triggers-poll', + 'test-psi-triggers-poll.c', + include_directories: config_include, + install: false, + build_by_default: want_tests != false) diff --git a/tests/test-psi-triggers-poll.c b/tests/test-psi-triggers-poll.c new file mode 100644 index 0000000..52e0040 --- /dev/null +++ b/tests/test-psi-triggers-poll.c @@ -0,0 +1,88 @@ +/* Borrowed from https://www.kernel.org/doc/Documentation/accounting/psi.rst */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define WAIT_EVENTS_NUM 4 + +/* +* Monitor cpu partial stall with 2s tracking window size +* and 20ms threshold. +*/ +int main(int argc, char **argv) { + const char trig[] = "some 20000 2000000"; + struct pollfd fds; + int n; + char *path; + size_t len; + int events_cnt = 0; + + if (geteuid() != 0) { + fprintf(stderr, "Run me as root\n"); + exit(1); + } + + if (argc != 2) { + fprintf(stderr, "Usage: %s [lxcfs_mount_path]\n", argv[0]); + exit(1); + } + + len = strlen(argv[1]) + strlen("/proc/pressure/cpu") + 1; + path = alloca(len); + snprintf(path, len, "%s/proc/pressure/cpu", argv[1]); + fds.fd = open(path, O_RDWR | O_NONBLOCK); + if (fds.fd < 0) { + printf("%s open error: %s\n", path, strerror(errno)); + return 1; + } + fds.events = POLLPRI; + + if (write(fds.fd, trig, strlen(trig) + 1) < 0) { + printf("/proc/pressure/cpu write error: %s\n", + strerror(errno)); + return 1; + } + + printf("waiting for events...\n"); + time_t t1 = time(NULL); + while (1) { + /* test code in proc_fuse.c poll_thread() generates 1 event per second */ + n = poll(&fds, 1, 3 * 1000); + if (n < 0) { + printf("poll error: %s\n", strerror(errno)); + return 1; + } + if (n == 0) { + printf("timeout\n"); + return 1; + } + if (fds.revents & POLLERR) { + printf("got POLLERR, event source is gone\n"); + return 1; + } + if (fds.revents & POLLPRI) { + printf("event triggered!\n"); + events_cnt++; + if (events_cnt == WAIT_EVENTS_NUM) + break; + } else { + printf("unknown event received: 0x%x\n", fds.revents); + return 1; + } + } + time_t t2 = time(NULL); + + printf("events_cnt = %d time diff = %ld\n", events_cnt, (long)(t2 - t1)); + /* events frequency is 1 HZ, so we can expect that difference <= 1 */ + if (labs((long)(t2 - t1) - events_cnt) > 1) { + printf("| (t2 - t1) - events_cnt | > 1 while should be <= 1\n"); + return 1; + } + + return 0; +} diff --git a/tests/test_proc.in b/tests/test_proc.in index 9232631..887c212 100755 --- a/tests/test_proc.in +++ b/tests/test_proc.in @@ -92,4 +92,10 @@ echo "==> Testing /proc/stat" echo "==> Testing /proc/meminfo" grep -q "^MemTotal.*65536 kB$" ${LXCFSDIR}/proc/meminfo +# We don't support PSI triggers virtualization with old libfuse +if [ "$LIBFUSE" != "fuse" ]; then + echo "==> Testing /proc/pressure/{cpu, io, memory}" + {{LXCFS_BUILD_ROOT}}/tests/test-psi-triggers-poll ${LXCFSDIR} +fi + PASS=1