Commit aca0b81e authored by Alexei Starovoitov

Merge branch 'introduce bpf_iter for task_vma'

Song Liu says:

====================

This set introduces bpf_iter for task_vma, which can be used to generate
information similar to /proc/pid/maps. Patch 4/4 adds an example that
mimics /proc/pid/maps.

Current /proc/<pid>/maps and /proc/<pid>/smaps provide information about
the vmas of a process. However, this information is not flexible enough to
cover all use cases. For example, if a vma covers a mix of 2MB pages and 4kB
pages (x86_64), there is no easy way to tell which address ranges are
backed by 2MB pages. task_vma solves the problem by enabling the user to
generate customized information based on the vma (and vma->vm_mm,
vma->vm_file, etc.).

Changes v6 => v7:
  1. Let BPF iter program use bpf_d_path without specifying sleepable.
     (Alexei)

Changes v5 => v6:
  1. Add more comments for task_vma_seq_get_next() to explain the logic
     of find_vma() calls. (Alexei)
  2. Skip vma found by find_vma() when both vm_start and vm_end match
     prev_vm_[start|end]. Previous versions only compared vm_start.
     IOW, if vma of [4k, 8k] is replaced by [4k, 12k] after relocking
     mmap_lock, v5 will skip the new vma, while v6 will process it.

Changes v4 => v5:
  1. Fix a refcount leak on task_struct. (Yonghong)
  2. Fix the selftest. (Yonghong)

Changes v3 => v4:
  1. Avoid skipping vma by assigning invalid prev_vm_start in
     task_vma_seq_stop(). (Yonghong)
  2. Move "again" label in task_vma_seq_get_next() to save a check. (Yonghong)

Changes v2 => v3:
  1. Rewrite 1/4 so that we hold mmap_lock while calling BPF program. This
     enables the BPF program to access the real vma with BTF. (Alexei)
  2. Fix the logic when the control is returned to user space. (Yonghong)
  3. Revise commit log and cover letter. (Yonghong)

Changes v1 => v2:
  1. Small fixes in task_iter.c and the selftests. (Yonghong)
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents a79e88dd e8168840
......@@ -286,9 +286,248 @@ static const struct seq_operations task_file_seq_ops = {
.show = task_file_seq_show,
};
/* Private seq_file state of the task_vma iterator. */
struct bpf_iter_seq_task_vma_info {
	/* Must remain the first member: the {init, fini}_seq_pidns()
	 * callbacks assume the private area starts with
	 * struct bpf_iter_seq_task_common.
	 */
	struct bpf_iter_seq_task_common common;

	/* Task and vma most recently handed out by
	 * task_vma_seq_get_next(); task is reset to NULL whenever the
	 * iterator no longer holds it.
	 */
	struct task_struct *task;
	struct vm_area_struct *vma;

	/* tid the iteration is currently positioned at */
	u32 tid;

	/* Range of the last vma handled before mmap_lock was released.
	 * After relocking it is compared against the vma returned by
	 * find_vma() to decide whether that vma was already processed.
	 */
	unsigned long prev_vm_start;
	unsigned long prev_vm_end;
};
/* How task_vma_seq_get_next() locates the vma to hand out next. */
enum bpf_task_vma_iter_find_op {
	/* first vma of a task: use mm->mmap */
	task_vma_iter_first_vma,
	/* follow the current vma: use curr_vma->vm_next */
	task_vma_iter_next_vma,
	/* position must be looked up again: use find_vma() */
	task_vma_iter_find_vma,
};
/* Advance the task_vma iterator by one vma.
 *
 * When a task is still recorded in @info from the previous call, the walk
 * continues from info->vma. Otherwise task_seq_get_next() is asked for a
 * task starting at info->tid. Tasks without an mm, and tasks whose vmas
 * are exhausted, are skipped in favour of the next tid.
 *
 * On success info->task and info->vma are updated and the vma is
 * returned. NULL is returned when task_seq_get_next() finds no further
 * task or when mmap_read_lock_killable() fails; info->task and info->vma
 * are then cleared.
 */
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
struct pid_namespace *ns = info->common.ns;
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
u32 curr_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
 * the task_struct, and holds read lock on vma->mm->mmap_lock.
 * If this function returns NULL, it does not hold any reference or
 * lock.
 */
if (info->task) {
/* Continue with the task and vma left by the previous call. */
curr_task = info->task;
curr_vma = info->vma;
/* In case of lock contention, drop mmap_lock to unblock
 * the writer.
 *
 * After relock, call find(mm, prev_vm_end - 1) to find
 * new vma to process.
 *
 *   +------+------+-----------+
 *   | VMA1 | VMA2 | VMA3      |
 *   +------+------+-----------+
 *   |      |      |           |
 *  4k     8k     16k         400k
 *
 * For example, curr_vma == VMA2. Before unlock, we set
 *
 *    prev_vm_start = 8k
 *    prev_vm_end   = 16k
 *
 * There are a few cases:
 *
 * 1) VMA2 is freed, but VMA3 exists.
 *
 *    find_vma() will return VMA3, just process VMA3.
 *
 * 2) VMA2 still exists.
 *
 *    find_vma() will return VMA2, process VMA2->next.
 *
 * 3) no more vma in this mm.
 *
 *    Process the next task.
 *
 * 4) find_vma() returns a different vma, VMA2'.
 *
 *    4.1) If VMA2 covers same range as VMA2', skip VMA2',
 *         because we already covered the range;
 *    4.2) VMA2 and VMA2' covers different ranges, process
 *         VMA2'.
 */
if (mmap_lock_is_contended(curr_task->mm)) {
info->prev_vm_start = curr_vma->vm_start;
info->prev_vm_end = curr_vma->vm_end;
op = task_vma_iter_find_vma;
mmap_read_unlock(curr_task->mm);
if (mmap_read_lock_killable(curr_task->mm))
goto finish;
} else {
op = task_vma_iter_next_vma;
}
} else {
/* No task is recorded: look one up starting at curr_tid. The
 * next_task path below jumps back here after bumping curr_tid.
 */
again:
curr_task = task_seq_get_next(ns, &curr_tid, true);
if (!curr_task) {
info->tid = curr_tid + 1;
goto finish;
}
if (curr_tid != info->tid) {
info->tid = curr_tid;
/* new task, process the first vma */
op = task_vma_iter_first_vma;
} else {
/* Found the same tid, which means the user space
 * finished data in previous buffer and read more.
 * We dropped mmap_lock before returning to user
 * space, so it is necessary to use find_vma() to
 * find the next vma to process.
 */
op = task_vma_iter_find_vma;
}
/* a task without an mm has no vma to report */
if (!curr_task->mm)
goto next_task;
if (mmap_read_lock_killable(curr_task->mm))
goto finish;
}
switch (op) {
case task_vma_iter_first_vma:
curr_vma = curr_task->mm->mmap;
break;
case task_vma_iter_next_vma:
curr_vma = curr_vma->vm_next;
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
 * to find the next vma. This is similar to the mechanism
 * in show_smaps_rollup().
 */
curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
/* case 1) and 4.2) above just use curr_vma */
/* check for case 2) or case 4.1) above */
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
curr_vma = curr_vma->vm_next;
break;
}
if (!curr_vma) {
/* case 3) above, or case 2) 4.1) with vma->next == NULL */
mmap_read_unlock(curr_task->mm);
goto next_task;
}
/* Success: record the position; mmap_lock stays read-locked. */
info->task = curr_task;
info->vma = curr_vma;
return curr_vma;
/* Done with curr_task: drop it and try the following tid. */
next_task:
put_task_struct(curr_task);
info->task = NULL;
curr_tid++;
goto again;
/* curr_task is NULL here only when task_seq_get_next() found no task. */
finish:
if (curr_task)
put_task_struct(curr_task);
info->task = NULL;
info->vma = NULL;
return NULL;
}
/* seq_file ->start(): fetch the vma to show first in this read pass.
 *
 * *pos is moved from 0 to 1 when a vma is found at the very beginning of
 * the iteration; any other position is left untouched.
 */
static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct vm_area_struct *first;

	first = task_vma_seq_get_next(seq->private);
	if (!first)
		return NULL;

	if (*pos == 0)
		*pos = 1;

	return first;
}
/* seq_file ->next(): bump the position and fetch the following vma. */
static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *priv = seq->private;

	*pos += 1;

	return task_vma_seq_get_next(priv);
}
/* Context passed to the BPF program run by the task_vma iterator:
 * iterator metadata plus the task and vma being visited.
 */
struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};
/* NOTE(review): presumably declares the attach point for "task_vma"
 * iterator programs; the argument list mirrors the members of
 * struct bpf_iter__task_vma. Confirm against DEFINE_BPF_ITER_FUNC.
 */
DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
struct task_struct *task, struct vm_area_struct *vma)
/* Run the attached BPF program, if any, on the task/vma currently
 * recorded in the iterator state.
 *
 * @in_stop is forwarded to bpf_iter_get_info(). Returns 0 when
 * bpf_iter_get_info() yields no program, otherwise the result of
 * bpf_iter_run_prog().
 */
static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *priv = seq->private;
	struct bpf_iter_meta iter_meta;
	struct bpf_iter__task_vma iter_ctx;
	struct bpf_prog *bpf;

	iter_meta.seq = seq;
	bpf = bpf_iter_get_info(&iter_meta, in_stop);
	if (bpf) {
		iter_ctx.meta = &iter_meta;
		iter_ctx.task = priv->task;
		iter_ctx.vma = priv->vma;
		return bpf_iter_run_prog(bpf, &iter_ctx);
	}

	return 0;
}
static int task_vma_seq_show(struct seq_file *seq, void *v)
{
return __task_vma_seq_show(seq, false);
}
/* seq_file ->stop().
 *
 * With v == NULL the BPF program is invoked one last time with
 * in_stop == true. Otherwise the lock and task reference taken by
 * task_vma_seq_get_next() are released before returning.
 */
static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *priv = seq->private;

	if (v == NULL) {
		(void)__task_vma_seq_show(seq, true);
		return;
	}

	/* priv->vma has not been seen by the BPF program. If user space
	 * reads more, task_vma_seq_get_next() should return this vma
	 * again. An impossible prev_vm_start (~0UL) guarantees that the
	 * vma returned by the next find_vma() is not skipped (see case
	 * task_vma_iter_find_vma in task_vma_seq_get_next()).
	 */
	priv->prev_vm_start = ~0UL;
	priv->prev_vm_end = priv->vma->vm_end;

	mmap_read_unlock(priv->task->mm);
	put_task_struct(priv->task);
	priv->task = NULL;
}
/* seq_file callbacks implementing the task_vma iterator. */
static const struct seq_operations task_vma_seq_ops = {
	.start	= task_vma_seq_start,
	.next	= task_vma_seq_next,
	.stop	= task_vma_seq_stop,
	.show	= task_vma_seq_show,
};
/* BTF ids used for the iterators' context arguments. Entries are
 * referenced by position from task_iter_init(), so the order matters:
 * [0] task_struct, [1] file, [2] vm_area_struct.
 */
BTF_ID_LIST(btf_task_file_ids)
BTF_ID(struct, task_struct)
BTF_ID(struct, file)
BTF_ID(struct, vm_area_struct)
static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
......@@ -328,6 +567,26 @@ static struct bpf_iter_reg task_file_reg_info = {
.seq_info = &task_file_seq_info,
};
/* seq_file description of the task_vma iterator: its callbacks, the
 * pidns-based private-data init/fini hooks and the private area size.
 */
static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops		= &task_vma_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
};
/* Registration record for the "task_vma" iterator target. Both context
 * pointers (task and vma) are typed PTR_TO_BTF_ID_OR_NULL; their btf_id
 * fields are filled in by task_iter_init() before registration.
 */
static struct bpf_iter_reg task_vma_reg_info = {
	.target			= "task_vma",
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{
			offsetof(struct bpf_iter__task_vma, task),
			PTR_TO_BTF_ID_OR_NULL,
		},
		{
			offsetof(struct bpf_iter__task_vma, vma),
			PTR_TO_BTF_ID_OR_NULL,
		},
	},
	.seq_info		= &task_vma_seq_info,
};
static int __init task_iter_init(void)
{
int ret;
......@@ -339,6 +598,12 @@ static int __init task_iter_init(void)
task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
return bpf_iter_reg_target(&task_file_reg_info);
ret = bpf_iter_reg_target(&task_file_reg_info);
if (ret)
return ret;
task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2];
return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
......@@ -1191,6 +1191,10 @@ BTF_SET_END(btf_allowlist_d_path)
static bool bpf_d_path_allowed(const struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_ITER)
return true;
if (prog->type == BPF_PROG_TYPE_LSM)
return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
......
......@@ -7,6 +7,7 @@
#include "bpf_iter_task.skel.h"
#include "bpf_iter_task_stack.skel.h"
#include "bpf_iter_task_file.skel.h"
#include "bpf_iter_task_vma.skel.h"
#include "bpf_iter_task_btf.skel.h"
#include "bpf_iter_tcp4.skel.h"
#include "bpf_iter_tcp6.skel.h"
......@@ -64,6 +65,22 @@ static void do_dummy_read(struct bpf_program *prog)
bpf_link__destroy(link);
}
/* Read from @fd into @buf until @size bytes are stored, end of file is
 * reached, or read() fails.
 *
 * A read() interrupted by a signal (EINTR) is retried. Once the buffer is
 * full the function returns without issuing a further zero-length read().
 * A @size of zero or less reads nothing and returns 0.
 *
 * Returns the number of bytes stored in @buf, or the negative value
 * returned by read() on failure (errno is left as set by read()).
 */
static int read_fd_into_buffer(int fd, char *buf, int size)
{
	int bufleft = size;
	ssize_t len = 0;

	while (bufleft > 0) {
		len = read(fd, buf, bufleft);
		if (len < 0 && errno == EINTR)
			continue;	/* interrupted before any data: retry */
		if (len <= 0)
			break;		/* EOF or hard error */
		buf += len;
		bufleft -= len;
	}

	return len < 0 ? (int)len : size - bufleft;
}
static void test_ipv6_route(void)
{
struct bpf_iter_ipv6_route *skel;
......@@ -177,7 +194,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
{
struct bpf_program *prog = skel->progs.dump_task_struct;
struct bpf_iter_task_btf__bss *bss = skel->bss;
int iter_fd = -1, len = 0, bufleft = TASKBUFSZ;
int iter_fd = -1, err;
struct bpf_link *link;
char *buf = taskbuf;
int ret = 0;
......@@ -190,14 +207,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
goto free_link;
do {
len = read(iter_fd, buf, bufleft);
if (len > 0) {
buf += len;
bufleft -= len;
}
} while (len > 0);
err = read_fd_into_buffer(iter_fd, buf, TASKBUFSZ);
if (bss->skip) {
printf("%s:SKIP:no __builtin_btf_type_id\n", __func__);
ret = 1;
......@@ -205,7 +215,7 @@ static int do_btf_read(struct bpf_iter_task_btf *skel)
goto free_link;
}
if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
if (CHECK(err < 0, "read", "read failed: %s\n", strerror(errno)))
goto free_link;
CHECK(strstr(taskbuf, "(struct task_struct)") == NULL,
......@@ -1133,6 +1143,92 @@ static void test_buf_neg_offset(void)
bpf_iter_test_kern6__destroy(skel);
}
/* Number of bytes captured from each source for the comparison. */
#define CMP_BUFFER_SIZE 1024
/* Output captured from the task_vma iterator and from /proc/<pid>/maps
 * by test_task_vma().
 */
static char task_vma_output[CMP_BUFFER_SIZE];
static char proc_maps_output[CMP_BUFFER_SIZE];
/* Remove ' ' and '\t' from str, and only keep the first line.
 *
 * The string is rewritten in place: scanning stops at the first '\n' or
 * at the terminating NUL, and the result is NUL-terminated there. The
 * end-of-line test is made before a character is consumed, so an empty
 * string or a string starting with '\n' yields an empty result instead
 * of scanning past the first line (or past the terminator).
 */
static void str_strip_first_line(char *str)
{
	char *dst = str;
	const char *src;

	for (src = str; *src != '\0' && *src != '\n'; src++) {
		if (*src != ' ' && *src != '\t')
			*dst++ = *src;
	}

	*dst = '\0';
}
/* Smaller of a and b. NOTE: each argument is evaluated more than once,
 * so only pass expressions without side effects.
 */
#define min(a, b) ((a) < (b) ? (a) : (b))
/* Check that the first line emitted by the task_vma iterator program for
 * this process matches the first line of /proc/<pid>/maps once spaces and
 * tabs have been stripped from both.
 */
static void test_task_vma(void)
{
	int err, iter_fd = -1, proc_maps_fd = -1;
	struct bpf_iter_task_vma *skel;
	int len, read_size = 4;
	char maps_path[64];

	skel = bpf_iter_task_vma__open();
	if (CHECK(!skel, "bpf_iter_task_vma__open", "skeleton open failed\n"))
		return;

	/* the BPF program only reports vmas of tasks with this tgid */
	skel->bss->pid = getpid();

	err = bpf_iter_task_vma__load(skel);
	if (CHECK(err, "bpf_iter_task_vma__load", "skeleton load failed\n"))
		goto out;

	skel->links.proc_maps = bpf_program__attach_iter(
		skel->progs.proc_maps, NULL);
	if (CHECK(IS_ERR(skel->links.proc_maps), "bpf_program__attach_iter",
		  "attach iterator failed\n")) {
		skel->links.proc_maps = NULL;
		goto out;
	}

	iter_fd = bpf_iter_create(bpf_link__fd(skel->links.proc_maps));
	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
		goto out;

	/* Read up to CMP_BUFFER_SIZE (1kB) from bpf_iter. Read in small
	 * chunks to trigger seq_file corner cases. The expected output is
	 * much longer than 1kB, but stop at end of file anyway so that a
	 * short output cannot make this loop spin forever.
	 */
	len = 0;
	while (len < CMP_BUFFER_SIZE) {
		err = read_fd_into_buffer(iter_fd, task_vma_output + len,
					  min(read_size, CMP_BUFFER_SIZE - len));
		if (CHECK(err < 0, "read_iter_fd", "read_iter_fd failed\n"))
			goto out;
		if (!err)
			break;
		len += err;
	}

	/* read CMP_BUFFER_SIZE (1kB) from /proc/pid/maps */
	snprintf(maps_path, sizeof(maps_path), "/proc/%u/maps", skel->bss->pid);
	proc_maps_fd = open(maps_path, O_RDONLY);
	if (CHECK(proc_maps_fd < 0, "open_proc_maps", "open_proc_maps failed\n"))
		goto out;
	err = read_fd_into_buffer(proc_maps_fd, proc_maps_output, CMP_BUFFER_SIZE);
	if (CHECK(err < 0, "read_prog_maps_fd", "read_prog_maps_fd failed\n"))
		goto out;

	/* strip and compare the first line of the two files */
	str_strip_first_line(task_vma_output);
	str_strip_first_line(proc_maps_output);

	CHECK(strcmp(task_vma_output, proc_maps_output), "compare_output",
	      "found mismatch\n");
out:
	/* only close descriptors that were actually opened */
	if (proc_maps_fd >= 0)
		close(proc_maps_fd);
	if (iter_fd >= 0)
		close(iter_fd);
	bpf_iter_task_vma__destroy(skel);
}
void test_bpf_iter(void)
{
if (test__start_subtest("btf_id_or_null"))
......@@ -1149,6 +1245,8 @@ void test_bpf_iter(void)
test_task_stack();
if (test__start_subtest("task_file"))
test_task_file();
if (test__start_subtest("task_vma"))
test_task_vma();
if (test__start_subtest("task_btf"))
test_task_btf();
if (test__start_subtest("tcp4"))
......
......@@ -7,6 +7,7 @@
#define bpf_iter__netlink bpf_iter__netlink___not_used
#define bpf_iter__task bpf_iter__task___not_used
#define bpf_iter__task_file bpf_iter__task_file___not_used
#define bpf_iter__task_vma bpf_iter__task_vma___not_used
#define bpf_iter__tcp bpf_iter__tcp___not_used
#define tcp6_sock tcp6_sock___not_used
#define bpf_iter__udp bpf_iter__udp___not_used
......@@ -26,6 +27,7 @@
#undef bpf_iter__netlink
#undef bpf_iter__task
#undef bpf_iter__task_file
#undef bpf_iter__task_vma
#undef bpf_iter__tcp
#undef tcp6_sock
#undef bpf_iter__udp
......@@ -67,6 +69,12 @@ struct bpf_iter__task_file {
struct file *file;
} __attribute__((preserve_access_index));
/* BPF-program view of the task_vma iterator context. task and vma may be
 * NULL, so programs must check them before dereferencing.
 */
struct bpf_iter__task_vma {
	struct bpf_iter_meta *meta;	/* iterator metadata (seq file, ...) */
	struct task_struct *task;	/* task being visited */
	struct vm_area_struct *vma;	/* vma of that task being visited */
} __attribute__((preserve_access_index));
struct bpf_iter__bpf_map {
struct bpf_iter_meta *meta;
struct bpf_map *map;
......
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include "bpf_iter.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
/* Copied from mm.h */
/* vm_flags bits translated into the "rwxp"/"rwxs" permission column */
#define VM_READ 0x00000001
#define VM_WRITE 0x00000002
#define VM_EXEC 0x00000004
#define VM_MAYSHARE 0x00000080
/* Copied from kdev_t.h */
/* split a device number into its major and minor parts */
#define MINORBITS 20
#define MINORMASK ((1U << MINORBITS) - 1)
#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS))
#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
/* Scratch buffer that bpf_d_path() fills with the path of the file
 * backing the vma being printed.
 */
#define D_PATH_BUF_SIZE 1024
char d_path_buf[D_PATH_BUF_SIZE] = {};
/* Only vmas of tasks whose tgid equals pid are printed; user space sets
 * this before loading the program.
 */
__u32 pid = 0;
/* Emit one /proc/<pid>/maps-like line for each vma of the tasks whose
 * tgid equals the global pid.
 *
 * Line layout: "start-end perms offset major:minor inode<TAB>path" for
 * file-backed vmas, and "start-end perms 00000000 00:00 0" otherwise.
 * Always returns 0.
 */
SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
{
	struct vm_area_struct *vma = ctx->vma;
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	struct file *file;
	char perm_str[] = "----";

	/* both pointers may be NULL in the context */
	if (task == (void *)0 || vma == (void *)0)
		return 0;

	file = vma->vm_file;
	if (task->tgid != pid)
		return 0;
	perm_str[0] = (vma->vm_flags & VM_READ) ? 'r' : '-';
	perm_str[1] = (vma->vm_flags & VM_WRITE) ? 'w' : '-';
	perm_str[2] = (vma->vm_flags & VM_EXEC) ? 'x' : '-';
	perm_str[3] = (vma->vm_flags & VM_MAYSHARE) ? 's' : 'p';
	BPF_SEQ_PRINTF(seq, "%08llx-%08llx %s ", vma->vm_start, vma->vm_end, perm_str);

	if (file) {
		__u32 dev = file->f_inode->i_sb->s_dev;

		/* d_path_buf is shared between invocations: on failure
		 * make sure a path left by an earlier vma is not printed.
		 */
		if (bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE) < 0)
			d_path_buf[0] = '\0';

		/* NOTE(review): the shift by 12 assumes 4kB pages */
		BPF_SEQ_PRINTF(seq, "%08llx ", vma->vm_pgoff << 12);
		/* i_ino is an unsigned long: %lu avoids truncating inode
		 * numbers that do not fit in 32 bits.
		 */
		BPF_SEQ_PRINTF(seq, "%02x:%02x %lu", MAJOR(dev), MINOR(dev),
			       file->f_inode->i_ino);
		BPF_SEQ_PRINTF(seq, "\t%s\n", d_path_buf);
	} else {
		BPF_SEQ_PRINTF(seq, "%08llx 00:00 0\n", 0ULL);
	}
	return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment