GNU Linux-libre 5.17.9-gnu
kernel/bpf/task_iter.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"

struct bpf_iter_seq_task_common {
	struct pid_namespace *ns;
};

struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by the {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	u32 tid;
};

static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	rcu_read_lock();
retry:
	pid = find_ge_pid(*tid, ns);
	if (pid) {
		*tid = pid_nr_ns(pid, ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			++*tid;
			goto retry;
		} else if (skip_if_dup_files && !thread_group_leader(task) &&
			   task->files == task->group_leader->files) {
			put_task_struct(task);
			task = NULL;
			++*tid;
			goto retry;
		}
	}
	rcu_read_unlock();

	return task;
}

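/*
 * Illustration of the cursor semantics above: with *tid == 100, if pid 100
 * has already exited but pid 101 is alive, the retry loop advances the
 * cursor until find_ge_pid() lands on 101 and *tid is updated to 101.
 * The cursor only moves forward, so one walk never reports the same tid
 * twice.
 */
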
static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	task = task_seq_get_next(info->common.ns, &info->tid, false);
	if (!task)
		return NULL;

	if (*pos == 0)
		++*pos;
	return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	++*pos;
	++info->tid;
	put_task_struct((struct task_struct *)v);
	task = task_seq_get_next(info->common.ns, &info->tid, false);
	if (!task)
		return NULL;

	return task;
}

struct bpf_iter__task {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

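/*
 * A sketch (modeled on the kernel's bpf_iter selftests, not taken from this
 * file) of a BPF-side program that consumes the context above.  It assumes a
 * vmlinux.h generated from BTF plus libbpf's bpf_helpers.h, which provides
 * SEC() and (in recent libbpf versions) BPF_SEQ_PRINTF():
 *
 *	SEC("iter/task")
 *	int dump_task(struct bpf_iter__task *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct task_struct *task = ctx->task;
 *
 *		if (!task)	// final call, made from task_seq_stop()
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
 *		return 0;
 *	}
 */
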
static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
			   bool in_stop)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__task ctx;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	meta.seq = seq;
	ctx.meta = &meta;
	ctx.task = task;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
	return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__task_seq_show(seq, v, true);
	else
		put_task_struct((struct task_struct *)v);
}

static const struct seq_operations task_seq_ops = {
	.start	= task_seq_start,
	.next	= task_seq_next,
	.stop	= task_seq_stop,
	.show	= task_seq_show,
};

struct bpf_iter_seq_task_file_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by the {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	u32 tid;
	u32 fd;
};

static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	u32 curr_tid = info->tid;
	struct task_struct *curr_task;
	unsigned int curr_fd = info->fd;

	/* If this function returns a non-NULL file object,
	 * it holds references to both the task and the file.
	 * Otherwise, it does not hold any reference.
	 */
again:
	if (info->task) {
		curr_task = info->task;
		curr_fd = info->fd;
	} else {
		curr_task = task_seq_get_next(ns, &curr_tid, true);
		if (!curr_task) {
			info->task = NULL;
			info->tid = curr_tid;
			return NULL;
		}

		/* set info->task and info->tid */
		info->task = curr_task;
		if (curr_tid == info->tid) {
			curr_fd = info->fd;
		} else {
			info->tid = curr_tid;
			curr_fd = 0;
		}
	}

	rcu_read_lock();
	for (;; curr_fd++) {
		struct file *f;

		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
		if (!f)
			break;
		if (!get_file_rcu(f))
			continue;

		/* set info->fd */
		info->fd = curr_fd;
		rcu_read_unlock();
		return f;
	}

	/* the current task is done, go to the next task */
	rcu_read_unlock();
	put_task_struct(curr_task);
	info->task = NULL;
	info->fd = 0;
	curr_tid = ++(info->tid);
	goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct file *file;

	info->task = NULL;
	file = task_file_seq_get_next(info);
	if (file && *pos == 0)
		++*pos;

	return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	++*pos;
	++info->fd;
	fput((struct file *)v);
	return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __aligned(8);
	__bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
		     struct task_struct *task, u32 fd,
		     struct file *file)

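/*
 * As above, a sketch of a matching BPF-side program (modeled on the
 * bpf_iter selftests; names are illustrative, not from this file):
 *
 *	SEC("iter/task_file")
 *	int dump_task_file(struct bpf_iter__task_file *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct task_struct *task = ctx->task;
 *		struct file *file = ctx->file;
 *
 *		if (!task || !file)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "%8d %8d %lx\n", task->tgid, ctx->fd,
 *			       (long)file->f_op);
 *		return 0;
 *	}
 */
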
static int __task_file_seq_show(struct seq_file *seq, struct file *file,
				bool in_stop)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;
	struct bpf_iter__task_file ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.fd = info->fd;
	ctx.file = file;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
	return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_file_info *info = seq->private;

	if (!v) {
		(void)__task_file_seq_show(seq, v, true);
	} else {
		fput((struct file *)v);
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	return 0;
}

static void fini_seq_pidns(void *priv_data)
{
	struct bpf_iter_seq_task_common *common = priv_data;

	put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
	.start	= task_file_seq_start,
	.next	= task_file_seq_next,
	.stop	= task_file_seq_stop,
	.show	= task_file_seq_show,
};

struct bpf_iter_seq_task_vma_info {
	/* The first field must be struct bpf_iter_seq_task_common.
	 * This is assumed by the {init, fini}_seq_pidns() callback functions.
	 */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	struct vm_area_struct *vma;
	u32 tid;
	unsigned long prev_vm_start;
	unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
	task_vma_iter_first_vma,   /* use mm->mmap */
	task_vma_iter_next_vma,    /* use curr_vma->vm_next */
	task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	u32 curr_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
	 * If this function returns NULL, it does not hold any reference or
	 * lock.
	 */
	if (info->task) {
		curr_task = info->task;
		curr_vma = info->vma;
		/* In case of lock contention, drop mmap_lock to unblock
		 * the writer.
		 *
		 * After relock, call find_vma(mm, prev_vm_end - 1) to find
		 * the new vma to process.
		 *
		 *   +------+------+-----------+
		 *   | VMA1 | VMA2 | VMA3      |
		 *   +------+------+-----------+
		 *   |      |      |           |
		 *  4k     8k     16k         400k
		 *
		 * For example, curr_vma == VMA2. Before unlock, we set
		 *
		 *    prev_vm_start = 8k
		 *    prev_vm_end   = 16k
		 *
		 * There are a few cases:
		 *
		 * 1) VMA2 is freed, but VMA3 exists.
		 *
		 *    find_vma() will return VMA3, just process VMA3.
		 *
		 * 2) VMA2 still exists.
		 *
		 *    find_vma() will return VMA2, process VMA2->next.
		 *
		 * 3) no more vma in this mm.
		 *
		 *    Process the next task.
		 *
		 * 4) find_vma() returns a different vma, VMA2'.
		 *
		 *    4.1) If VMA2 covers the same range as VMA2', skip VMA2',
		 *         because we already covered the range;
		 *    4.2) VMA2 and VMA2' cover different ranges, process
		 *         VMA2'.
		 */
		if (mmap_lock_is_contended(curr_task->mm)) {
			info->prev_vm_start = curr_vma->vm_start;
			info->prev_vm_end = curr_vma->vm_end;
			op = task_vma_iter_find_vma;
			mmap_read_unlock(curr_task->mm);
			if (mmap_read_lock_killable(curr_task->mm))
				goto finish;
		} else {
			op = task_vma_iter_next_vma;
		}
	} else {
again:
		curr_task = task_seq_get_next(ns, &curr_tid, true);
		if (!curr_task) {
			info->tid = curr_tid + 1;
			goto finish;
		}

		if (curr_tid != info->tid) {
			info->tid = curr_tid;
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
			/* Found the same tid, which means the user space
			 * finished data in previous buffer and read more.
			 * We dropped mmap_lock before returning to user
			 * space, so it is necessary to use find_vma() to
			 * find the next vma to process.
			 */
			op = task_vma_iter_find_vma;
		}

		if (!curr_task->mm)
			goto next_task;

		if (mmap_read_lock_killable(curr_task->mm))
			goto finish;
	}

	switch (op) {
	case task_vma_iter_first_vma:
		curr_vma = curr_task->mm->mmap;
		break;
	case task_vma_iter_next_vma:
		curr_vma = curr_vma->vm_next;
		break;
	case task_vma_iter_find_vma:
		/* We dropped mmap_lock so it is necessary to use find_vma
		 * to find the next vma. This is similar to the mechanism
		 * in show_smaps_rollup().
		 */
		curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
		/* case 1) and 4.2) above just use curr_vma */

		/* check for case 2) or case 4.1) above */
		if (curr_vma &&
		    curr_vma->vm_start == info->prev_vm_start &&
		    curr_vma->vm_end == info->prev_vm_end)
			curr_vma = curr_vma->vm_next;
		break;
	}
	if (!curr_vma) {
		/* case 3) above, or case 2) 4.1) with vma->next == NULL */
		mmap_read_unlock(curr_task->mm);
		goto next_task;
	}
	info->task = curr_task;
	info->vma = curr_vma;
	return curr_vma;

next_task:
	put_task_struct(curr_task);
	info->task = NULL;
	curr_tid++;
	goto again;

finish:
	if (curr_task)
		put_task_struct(curr_task);
	info->task = NULL;
	info->vma = NULL;
	return NULL;
}

static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct vm_area_struct *vma;

	vma = task_vma_seq_get_next(info);
	if (vma && *pos == 0)
		++*pos;

	return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	++*pos;
	return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	__bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
		     struct task_struct *task, struct vm_area_struct *vma)

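/*
 * Again a sketch (based on the bpf_iter selftests; names are illustrative):
 * a program attached to "iter/task_vma" runs with the vma's mmap_lock held
 * for read by task_vma_seq_get_next() above.
 *
 *	SEC("iter/task_vma")
 *	int dump_task_vma(struct bpf_iter__task_vma *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct task_struct *task = ctx->task;
 *		struct vm_area_struct *vma = ctx->vma;
 *
 *		if (!task || !vma)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "%8d %08llx-%08llx\n", task->pid,
 *			       vma->vm_start, vma->vm_end);
 *		return 0;
 *	}
 */
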
static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;
	struct bpf_iter__task_vma ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.task = info->task;
	ctx.vma = info->vma;
	return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
	return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_task_vma_info *info = seq->private;

	if (!v) {
		(void)__task_vma_seq_show(seq, true);
	} else {
		/* info->vma has not been seen by the BPF program. If the
		 * user space reads more, task_vma_seq_get_next should
		 * return this vma again. Set prev_vm_start to ~0UL,
		 * so that we don't skip the vma returned by the next
		 * find_vma() (case task_vma_iter_find_vma in
		 * task_vma_seq_get_next()).
		 */
		info->prev_vm_start = ~0UL;
		info->prev_vm_end = info->vma->vm_end;
		mmap_read_unlock(info->task->mm);
		put_task_struct(info->task);
		info->task = NULL;
	}
}

static const struct seq_operations task_vma_seq_ops = {
	.start	= task_vma_seq_start,
	.next	= task_vma_seq_next,
	.stop	= task_vma_seq_stop,
	.show	= task_vma_seq_show,
};

static const struct bpf_iter_seq_info task_seq_info = {
	.seq_ops		= &task_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
};

static struct bpf_iter_reg task_reg_info = {
	.target			= "task",
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task, task),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_seq_info,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
	.seq_ops		= &task_file_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
	.target			= "task_file",
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_file, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_file, file),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_file_seq_info,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
	.seq_ops		= &task_vma_seq_ops,
	.init_seq_private	= init_seq_pidns,
	.fini_seq_private	= fini_seq_pidns,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
	.target			= "task_vma",
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__task_vma, task),
		  PTR_TO_BTF_ID_OR_NULL },
		{ offsetof(struct bpf_iter__task_vma, vma),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_vma_seq_info,
};

BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
	struct mmap_unlock_irq_work *work = NULL;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct mm_struct *mm;
	int ret = -ENOENT;

	if (flags)
		return -EINVAL;

	if (!task)
		return -ENOENT;

	mm = task->mm;
	if (!mm)
		return -ENOENT;

	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	if (irq_work_busy || !mmap_read_trylock(mm))
		return -EBUSY;

	vma = find_vma(mm, start);

	if (vma && vma->vm_start <= start && vma->vm_end > start) {
		callback_fn((u64)(long)task, (u64)(long)vma,
			    (u64)(long)callback_ctx, 0, 0);
		ret = 0;
	}
	bpf_mmap_unlock_mm(work, mm);
	return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
	.func		= bpf_find_vma,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_FUNC,
	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
	.arg5_type	= ARG_ANYTHING,
};

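/*
 * A rough sketch of BPF-side usage of bpf_find_vma() (modeled on the
 * find_vma selftest; program and variable names are illustrative).  The
 * callback runs with mm->mmap_lock held for read, as acquired above, so it
 * must stay short and must not sleep:
 *
 *	const volatile __u64 target_addr;	// set from user space
 *
 *	static long check_vma(struct task_struct *task,
 *			      struct vm_area_struct *vma, void *data)
 *	{
 *		// e.g. record vma->vm_start/vm_end or vma->vm_file
 *		return 0;
 *	}
 *
 *	SEC("raw_tp/sys_enter")
 *	int handle_sys_enter(void *ctx)
 *	{
 *		struct task_struct *task = bpf_get_current_task_btf();
 *
 *		bpf_find_vma(task, target_addr, check_vma, NULL, 0);
 *		return 0;
 *	}
 */
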
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

static void do_mmap_read_unlock(struct irq_work *entry)
{
	struct mmap_unlock_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

static int __init task_iter_init(void)
{
	struct mmap_unlock_irq_work *work;
	int ret, cpu;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&mmap_unlock_work, cpu);
		init_irq_work(&work->irq_work, do_mmap_read_unlock);
	}

	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	ret = bpf_iter_reg_target(&task_reg_info);
	if (ret)
		return ret;

	task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
	ret = bpf_iter_reg_target(&task_file_reg_info);
	if (ret)
		return ret;

	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
	return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
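
/*
 * On the user-space side, an iterator program is typically attached with
 * libbpf and read like a file.  Roughly (error handling omitted; the
 * skeleton and program name are illustrative):
 *
 *	struct bpf_link *link;
 *	char buf[4096];
 *	int iter_fd, n;
 *
 *	link = bpf_program__attach_iter(skel->progs.dump_task, NULL);
 *	iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
 *		write(STDOUT_FILENO, buf, n);
 *	close(iter_fd);
 *	bpf_link__destroy(link);
 *
 * The link can also be pinned in bpffs (e.g. with bpf_link__pin()) so the
 * output can be consumed with plain cat(1).
 */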