1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  */
4 #include <linux/bpf.h>
5 #include <linux/bpf-cgroup.h>
6 #include <linux/bpf_trace.h>
7 #include <linux/bpf_lirc.h>
8 #include <linux/bpf_verifier.h>
9 #include <linux/bsearch.h>
10 #include <linux/btf.h>
11 #include <linux/syscalls.h>
12 #include <linux/slab.h>
13 #include <linux/sched/signal.h>
14 #include <linux/vmalloc.h>
15 #include <linux/mmzone.h>
16 #include <linux/anon_inodes.h>
17 #include <linux/fdtable.h>
18 #include <linux/file.h>
19 #include <linux/fs.h>
20 #include <linux/license.h>
21 #include <linux/filter.h>
22 #include <linux/kernel.h>
23 #include <linux/idr.h>
24 #include <linux/cred.h>
25 #include <linux/timekeeping.h>
26 #include <linux/ctype.h>
27 #include <linux/nospec.h>
28 #include <linux/audit.h>
29 #include <uapi/linux/btf.h>
30 #include <linux/pgtable.h>
31 #include <linux/bpf_lsm.h>
32 #include <linux/poll.h>
33 #include <linux/sort.h>
34 #include <linux/bpf-netns.h>
35 #include <linux/rcupdate_trace.h>
36 #include <linux/memcontrol.h>
37 #include <linux/trace_events.h>
38
39 #include <net/netfilter/nf_bpf_link.h>
40 #include <net/netkit.h>
41 #include <net/tcx.h>
42
43 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
44                           (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
45                           (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
46 #define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
47 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
48 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
49                         IS_FD_HASH(map))
50
51 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
52
53 DEFINE_PER_CPU(int, bpf_prog_active);
54 static DEFINE_IDR(prog_idr);
55 static DEFINE_SPINLOCK(prog_idr_lock);
56 static DEFINE_IDR(map_idr);
57 static DEFINE_SPINLOCK(map_idr_lock);
58 static DEFINE_IDR(link_idr);
59 static DEFINE_SPINLOCK(link_idr_lock);
60
61 int sysctl_unprivileged_bpf_disabled __read_mostly =
62         IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
63
64 static const struct bpf_map_ops * const bpf_map_types[] = {
65 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
66 #define BPF_MAP_TYPE(_id, _ops) \
67         [_id] = &_ops,
68 #define BPF_LINK_TYPE(_id, _name)
69 #include <linux/bpf_types.h>
70 #undef BPF_PROG_TYPE
71 #undef BPF_MAP_TYPE
72 #undef BPF_LINK_TYPE
73 };
74
75 /*
76  * If we're handed a bigger struct than we know of, ensure all the unknown bits
77  * are 0 - i.e. new user-space does not rely on any kernel feature extensions
78  * we don't know about yet.
79  *
80  * There is a ToCToU between this function call and the following
81  * copy_from_user() call. However, this is not a concern since this function is
82  * meant to be a future-proofing of bits.
83  */
84 int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
85                              size_t expected_size,
86                              size_t actual_size)
87 {
88         int res;
89
90         if (unlikely(actual_size > PAGE_SIZE))  /* silly large */
91                 return -E2BIG;
92
93         if (actual_size <= expected_size)
94                 return 0;
95
96         if (uaddr.is_kernel)
97                 res = memchr_inv(uaddr.kernel + expected_size, 0,
98                                  actual_size - expected_size) == NULL;
99         else
100                 res = check_zeroed_user(uaddr.user + expected_size,
101                                         actual_size - expected_size);
102         if (res < 0)
103                 return res;
104         return res ? 0 : -E2BIG;
105 }
106
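/* Illustrative caller pattern for bpf_check_uarg_tail_zero() above; a
 * minimal sketch assuming hypothetical uattr, kattr and size variables,
 * not taken verbatim from any caller: accept a possibly larger user
 * struct, verify its unknown tail is zero, then copy only the bytes the
 * kernel understands.
 *
 *	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uattr), sizeof(kattr),
 *				       attr_size_from_user);
 *	if (err)
 *		return err;
 *	size = min_t(u32, attr_size_from_user, sizeof(kattr));
 *	if (copy_from_user(&kattr, uattr, size))
 *		return -EFAULT;
 */
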
107 const struct bpf_map_ops bpf_map_offload_ops = {
108         .map_meta_equal = bpf_map_meta_equal,
109         .map_alloc = bpf_map_offload_map_alloc,
110         .map_free = bpf_map_offload_map_free,
111         .map_check_btf = map_check_no_btf,
112         .map_mem_usage = bpf_map_offload_map_mem_usage,
113 };
114
115 static void bpf_map_write_active_inc(struct bpf_map *map)
116 {
117         atomic64_inc(&map->writecnt);
118 }
119
120 static void bpf_map_write_active_dec(struct bpf_map *map)
121 {
122         atomic64_dec(&map->writecnt);
123 }
124
125 bool bpf_map_write_active(const struct bpf_map *map)
126 {
127         return atomic64_read(&map->writecnt) != 0;
128 }
129
130 static u32 bpf_map_value_size(const struct bpf_map *map)
131 {
132         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
133             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
134             map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
135             map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
136                 return round_up(map->value_size, 8) * num_possible_cpus();
137         else if (IS_FD_MAP(map))
138                 return sizeof(u32);
139         else
140                 return  map->value_size;
141 }
142
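/* Worked example for bpf_map_value_size() above, assuming a
 * hypothetical machine with num_possible_cpus() == 4: a
 * BPF_MAP_TYPE_PERCPU_ARRAY with value_size == 12 is exposed to the
 * syscall interface as round_up(12, 8) * 4 == 64 bytes, one
 * 8-byte-aligned slot per possible CPU, while fd-based maps expose
 * just a single u32.
 */
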
143 static void maybe_wait_bpf_programs(struct bpf_map *map)
144 {
145         /* Wait for any running non-sleepable BPF programs to complete so that
146          * userspace, when we return to it, knows that all non-sleepable
147          * programs that could be running use the new map value. For sleepable
148          * BPF programs, synchronize_rcu_tasks_trace() would be needed to wait
149          * for their completion, but that wait can be very long and userspace
150          * may think the syscall hangs forever, so sleepable BPF programs are
151          * not handled here.
152          */
153         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
154             map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
155                 synchronize_rcu();
156 }
157
158 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
159                                 void *key, void *value, __u64 flags)
160 {
161         int err;
162
163         /* Need to create a kthread, thus must support schedule */
164         if (bpf_map_is_offloaded(map)) {
165                 return bpf_map_offload_update_elem(map, key, value, flags);
166         } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
167                    map->map_type == BPF_MAP_TYPE_ARENA ||
168                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
169                 return map->ops->map_update_elem(map, key, value, flags);
170         } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
171                    map->map_type == BPF_MAP_TYPE_SOCKMAP) {
172                 return sock_map_update_elem_sys(map, key, value, flags);
173         } else if (IS_FD_PROG_ARRAY(map)) {
174                 return bpf_fd_array_map_update_elem(map, map_file, key, value,
175                                                     flags);
176         }
177
178         bpf_disable_instrumentation();
179         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
180             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
181                 err = bpf_percpu_hash_update(map, key, value, flags);
182         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
183                 err = bpf_percpu_array_update(map, key, value, flags);
184         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
185                 err = bpf_percpu_cgroup_storage_update(map, key, value,
186                                                        flags);
187         } else if (IS_FD_ARRAY(map)) {
188                 err = bpf_fd_array_map_update_elem(map, map_file, key, value,
189                                                    flags);
190         } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
191                 err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
192                                                   flags);
193         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
194                 /* rcu_read_lock() is not needed */
195                 err = bpf_fd_reuseport_array_update_elem(map, key, value,
196                                                          flags);
197         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
198                    map->map_type == BPF_MAP_TYPE_STACK ||
199                    map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
200                 err = map->ops->map_push_elem(map, value, flags);
201         } else {
202                 rcu_read_lock();
203                 err = map->ops->map_update_elem(map, key, value, flags);
204                 rcu_read_unlock();
205         }
206         bpf_enable_instrumentation();
207
208         return err;
209 }
210
211 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
212                               __u64 flags)
213 {
214         void *ptr;
215         int err;
216
217         if (bpf_map_is_offloaded(map))
218                 return bpf_map_offload_lookup_elem(map, key, value);
219
220         bpf_disable_instrumentation();
221         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
222             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
223                 err = bpf_percpu_hash_copy(map, key, value);
224         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
225                 err = bpf_percpu_array_copy(map, key, value);
226         } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
227                 err = bpf_percpu_cgroup_storage_copy(map, key, value);
228         } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
229                 err = bpf_stackmap_copy(map, key, value);
230         } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
231                 err = bpf_fd_array_map_lookup_elem(map, key, value);
232         } else if (IS_FD_HASH(map)) {
233                 err = bpf_fd_htab_map_lookup_elem(map, key, value);
234         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
235                 err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
236         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
237                    map->map_type == BPF_MAP_TYPE_STACK ||
238                    map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
239                 err = map->ops->map_peek_elem(map, value);
240         } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
241                 /* struct_ops map requires directly updating "value" */
242                 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
243         } else {
244                 rcu_read_lock();
245                 if (map->ops->map_lookup_elem_sys_only)
246                         ptr = map->ops->map_lookup_elem_sys_only(map, key);
247                 else
248                         ptr = map->ops->map_lookup_elem(map, key);
249                 if (IS_ERR(ptr)) {
250                         err = PTR_ERR(ptr);
251                 } else if (!ptr) {
252                         err = -ENOENT;
253                 } else {
254                         err = 0;
255                         if (flags & BPF_F_LOCK)
256                                 /* lock 'ptr' and copy everything but lock */
257                                 copy_map_value_locked(map, value, ptr, true);
258                         else
259                                 copy_map_value(map, value, ptr);
260                         /* mask lock and timer, since value wasn't zero inited */
261                         check_and_init_map_value(map, value);
262                 }
263                 rcu_read_unlock();
264         }
265
266         bpf_enable_instrumentation();
267
268         return err;
269 }
270
271 /* Please do not use this function outside of the map creation path
272  * (e.g. in the map update path) without taking care to set the active
273  * memory cgroup (see bpf_map_kmalloc_node() for an example).
274  */
275 static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
276 {
277         /* We really just want to fail instead of triggering OOM killer
278          * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
279          * which is used for lower order allocation requests.
280          *
281          * It has been observed that higher order allocation requests done by
282          * vmalloc with __GFP_NORETRY being set might fail due to not trying
283          * to reclaim memory from the page cache, thus we set
284          * __GFP_RETRY_MAYFAIL to avoid such situations.
285          */
286
287         gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
288         unsigned int flags = 0;
289         unsigned long align = 1;
290         void *area;
291
292         if (size >= SIZE_MAX)
293                 return NULL;
294
295         /* kmalloc()'ed memory can't be mmap()'ed */
296         if (mmapable) {
297                 BUG_ON(!PAGE_ALIGNED(size));
298                 align = SHMLBA;
299                 flags = VM_USERMAP;
300         } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
301                 area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
302                                     numa_node);
303                 if (area != NULL)
304                         return area;
305         }
306
307         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
308                         gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
309                         flags, numa_node, __builtin_return_address(0));
310 }
311
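/* Illustrative sizing for __bpf_map_area_alloc() above, assuming
 * 4 KiB pages and the usual PAGE_ALLOC_COSTLY_ORDER of 3 (a 32 KiB
 * cut-off): a non-mmapable 16 KiB request first tries kmalloc_node()
 * with __GFP_NORETRY and only falls back to vmalloc if that fails,
 * whereas a 1 MiB request, or any mmapable one, goes straight to
 * __vmalloc_node_range() with __GFP_RETRY_MAYFAIL.
 */
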
312 void *bpf_map_area_alloc(u64 size, int numa_node)
313 {
314         return __bpf_map_area_alloc(size, numa_node, false);
315 }
316
317 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
318 {
319         return __bpf_map_area_alloc(size, numa_node, true);
320 }
321
322 void bpf_map_area_free(void *area)
323 {
324         kvfree(area);
325 }
326
327 static u32 bpf_map_flags_retain_permanent(u32 flags)
328 {
329         /* Some map creation flags are not tied to the map object but
330          * rather to the map fd, so they have no meaning upon
331          * map object inspection since multiple file descriptors with
332          * different (access) properties can exist. Thus, given
333          * these have zero meaning for the map itself, let's clear them
334          * from here.
335          */
336         return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
337 }
338
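/* Illustrative effect of bpf_map_flags_retain_permanent(), as used by
 * bpf_map_init_from_attr() below: a map created with map_flags ==
 * (BPF_F_RDONLY | BPF_F_NO_PREALLOC) is stored with map_flags ==
 * BPF_F_NO_PREALLOC only, since the RDONLY/WRONLY bits describe one
 * particular fd rather than the map object itself.
 */
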
339 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
340 {
341         map->map_type = attr->map_type;
342         map->key_size = attr->key_size;
343         map->value_size = attr->value_size;
344         map->max_entries = attr->max_entries;
345         map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
346         map->numa_node = bpf_map_attr_numa_node(attr);
347         map->map_extra = attr->map_extra;
348 }
349
350 static int bpf_map_alloc_id(struct bpf_map *map)
351 {
352         int id;
353
354         idr_preload(GFP_KERNEL);
355         spin_lock_bh(&map_idr_lock);
356         id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
357         if (id > 0)
358                 map->id = id;
359         spin_unlock_bh(&map_idr_lock);
360         idr_preload_end();
361
362         if (WARN_ON_ONCE(!id))
363                 return -ENOSPC;
364
365         return id > 0 ? 0 : id;
366 }
367
368 void bpf_map_free_id(struct bpf_map *map)
369 {
370         unsigned long flags;
371
372         /* Offloaded maps are removed from the IDR store when their device
373          * disappears - even if someone holds an fd to them they are unusable,
374          * the memory is gone, all ops will fail; they are simply waiting for
375          * refcnt to drop to be freed.
376          */
377         if (!map->id)
378                 return;
379
380         spin_lock_irqsave(&map_idr_lock, flags);
381
382         idr_remove(&map_idr, map->id);
383         map->id = 0;
384
385         spin_unlock_irqrestore(&map_idr_lock, flags);
386 }
387
388 #ifdef CONFIG_MEMCG_KMEM
389 static void bpf_map_save_memcg(struct bpf_map *map)
390 {
391         /* Currently if a map is created by a process belonging to the root
392          * memory cgroup, get_obj_cgroup_from_current() will return NULL.
393          * So we have to check map->objcg for being NULL each time it's
394          * being used.
395          */
396         if (memcg_bpf_enabled())
397                 map->objcg = get_obj_cgroup_from_current();
398 }
399
400 static void bpf_map_release_memcg(struct bpf_map *map)
401 {
402         if (map->objcg)
403                 obj_cgroup_put(map->objcg);
404 }
405
406 static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
407 {
408         if (map->objcg)
409                 return get_mem_cgroup_from_objcg(map->objcg);
410
411         return root_mem_cgroup;
412 }
413
414 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
415                            int node)
416 {
417         struct mem_cgroup *memcg, *old_memcg;
418         void *ptr;
419
420         memcg = bpf_map_get_memcg(map);
421         old_memcg = set_active_memcg(memcg);
422         ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
423         set_active_memcg(old_memcg);
424         mem_cgroup_put(memcg);
425
426         return ptr;
427 }
428
429 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
430 {
431         struct mem_cgroup *memcg, *old_memcg;
432         void *ptr;
433
434         memcg = bpf_map_get_memcg(map);
435         old_memcg = set_active_memcg(memcg);
436         ptr = kzalloc(size, flags | __GFP_ACCOUNT);
437         set_active_memcg(old_memcg);
438         mem_cgroup_put(memcg);
439
440         return ptr;
441 }
442
443 void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
444                        gfp_t flags)
445 {
446         struct mem_cgroup *memcg, *old_memcg;
447         void *ptr;
448
449         memcg = bpf_map_get_memcg(map);
450         old_memcg = set_active_memcg(memcg);
451         ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
452         set_active_memcg(old_memcg);
453         mem_cgroup_put(memcg);
454
455         return ptr;
456 }
457
458 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
459                                     size_t align, gfp_t flags)
460 {
461         struct mem_cgroup *memcg, *old_memcg;
462         void __percpu *ptr;
463
464         memcg = bpf_map_get_memcg(map);
465         old_memcg = set_active_memcg(memcg);
466         ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
467         set_active_memcg(old_memcg);
468         mem_cgroup_put(memcg);
469
470         return ptr;
471 }
472
473 #else
474 static void bpf_map_save_memcg(struct bpf_map *map)
475 {
476 }
477
478 static void bpf_map_release_memcg(struct bpf_map *map)
479 {
480 }
481 #endif
482
483 int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
484                         unsigned long nr_pages, struct page **pages)
485 {
486         unsigned long i, j;
487         struct page *pg;
488         int ret = 0;
489 #ifdef CONFIG_MEMCG_KMEM
490         struct mem_cgroup *memcg, *old_memcg;
491
492         memcg = bpf_map_get_memcg(map);
493         old_memcg = set_active_memcg(memcg);
494 #endif
495         for (i = 0; i < nr_pages; i++) {
496                 pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
497
498                 if (pg) {
499                         pages[i] = pg;
500                         continue;
501                 }
502                 for (j = 0; j < i; j++)
503                         __free_page(pages[j]);
504                 ret = -ENOMEM;
505                 break;
506         }
507
508 #ifdef CONFIG_MEMCG_KMEM
509         set_active_memcg(old_memcg);
510         mem_cgroup_put(memcg);
511 #endif
512         return ret;
513 }
514
515
516 static int btf_field_cmp(const void *a, const void *b)
517 {
518         const struct btf_field *f1 = a, *f2 = b;
519
520         if (f1->offset < f2->offset)
521                 return -1;
522         else if (f1->offset > f2->offset)
523                 return 1;
524         return 0;
525 }
526
527 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
528                                   u32 field_mask)
529 {
530         struct btf_field *field;
531
532         if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
533                 return NULL;
534         field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
535         if (!field || !(field->type & field_mask))
536                 return NULL;
537         return field;
538 }
539
540 void btf_record_free(struct btf_record *rec)
541 {
542         int i;
543
544         if (IS_ERR_OR_NULL(rec))
545                 return;
546         for (i = 0; i < rec->cnt; i++) {
547                 switch (rec->fields[i].type) {
548                 case BPF_KPTR_UNREF:
549                 case BPF_KPTR_REF:
550                 case BPF_KPTR_PERCPU:
551                         if (rec->fields[i].kptr.module)
552                                 module_put(rec->fields[i].kptr.module);
553                         btf_put(rec->fields[i].kptr.btf);
554                         break;
555                 case BPF_LIST_HEAD:
556                 case BPF_LIST_NODE:
557                 case BPF_RB_ROOT:
558                 case BPF_RB_NODE:
559                 case BPF_SPIN_LOCK:
560                 case BPF_TIMER:
561                 case BPF_REFCOUNT:
562                         /* Nothing to release */
563                         break;
564                 default:
565                         WARN_ON_ONCE(1);
566                         continue;
567                 }
568         }
569         kfree(rec);
570 }
571
572 void bpf_map_free_record(struct bpf_map *map)
573 {
574         btf_record_free(map->record);
575         map->record = NULL;
576 }
577
578 struct btf_record *btf_record_dup(const struct btf_record *rec)
579 {
580         const struct btf_field *fields;
581         struct btf_record *new_rec;
582         int ret, size, i;
583
584         if (IS_ERR_OR_NULL(rec))
585                 return NULL;
586         size = offsetof(struct btf_record, fields[rec->cnt]);
587         new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
588         if (!new_rec)
589                 return ERR_PTR(-ENOMEM);
590         /* Do a deep copy of the btf_record */
591         fields = rec->fields;
592         new_rec->cnt = 0;
593         for (i = 0; i < rec->cnt; i++) {
594                 switch (fields[i].type) {
595                 case BPF_KPTR_UNREF:
596                 case BPF_KPTR_REF:
597                 case BPF_KPTR_PERCPU:
598                         btf_get(fields[i].kptr.btf);
599                         if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
600                                 ret = -ENXIO;
601                                 goto free;
602                         }
603                         break;
604                 case BPF_LIST_HEAD:
605                 case BPF_LIST_NODE:
606                 case BPF_RB_ROOT:
607                 case BPF_RB_NODE:
608                 case BPF_SPIN_LOCK:
609                 case BPF_TIMER:
610                 case BPF_REFCOUNT:
611                         /* Nothing to acquire */
612                         break;
613                 default:
614                         ret = -EFAULT;
615                         WARN_ON_ONCE(1);
616                         goto free;
617                 }
618                 new_rec->cnt++;
619         }
620         return new_rec;
621 free:
622         btf_record_free(new_rec);
623         return ERR_PTR(ret);
624 }
625
626 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
627 {
628         bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
629         int size;
630
631         if (!a_has_fields && !b_has_fields)
632                 return true;
633         if (a_has_fields != b_has_fields)
634                 return false;
635         if (rec_a->cnt != rec_b->cnt)
636                 return false;
637         size = offsetof(struct btf_record, fields[rec_a->cnt]);
638         /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
639          * members are zeroed out. So memcmp is safe to do without worrying
640          * about padding/unused fields.
641          *
642          * While spin_lock, timer, and kptr have no relation to map BTF,
643          * list_head metadata is specific to map BTF, the btf and value_rec
644          * members in particular. btf is the map BTF, while value_rec points to
645          * btf_record in that map BTF.
646          *
647          * So while by default, we don't rely on the map BTF (which the records
648          * were parsed from) matching for both records, which is not backwards
649          * compatible, in case list_head is part of it, we implicitly rely on
650          * that by way of depending on memcmp succeeding for it.
651          */
652         return !memcmp(rec_a, rec_b, size);
653 }
654
655 void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
656 {
657         if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
658                 return;
659         bpf_timer_cancel_and_free(obj + rec->timer_off);
660 }
661
662 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
663 {
664         const struct btf_field *fields;
665         int i;
666
667         if (IS_ERR_OR_NULL(rec))
668                 return;
669         fields = rec->fields;
670         for (i = 0; i < rec->cnt; i++) {
671                 struct btf_struct_meta *pointee_struct_meta;
672                 const struct btf_field *field = &fields[i];
673                 void *field_ptr = obj + field->offset;
674                 void *xchgd_field;
675
676                 switch (fields[i].type) {
677                 case BPF_SPIN_LOCK:
678                         break;
679                 case BPF_TIMER:
680                         bpf_timer_cancel_and_free(field_ptr);
681                         break;
682                 case BPF_KPTR_UNREF:
683                         WRITE_ONCE(*(u64 *)field_ptr, 0);
684                         break;
685                 case BPF_KPTR_REF:
686                 case BPF_KPTR_PERCPU:
687                         xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
688                         if (!xchgd_field)
689                                 break;
690
691                         if (!btf_is_kernel(field->kptr.btf)) {
692                                 pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
693                                                                            field->kptr.btf_id);
694                                 migrate_disable();
695                                 __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
696                                                                  pointee_struct_meta->record : NULL,
697                                                                  fields[i].type == BPF_KPTR_PERCPU);
698                                 migrate_enable();
699                         } else {
700                                 field->kptr.dtor(xchgd_field);
701                         }
702                         break;
703                 case BPF_LIST_HEAD:
704                         if (WARN_ON_ONCE(rec->spin_lock_off < 0))
705                                 continue;
706                         bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
707                         break;
708                 case BPF_RB_ROOT:
709                         if (WARN_ON_ONCE(rec->spin_lock_off < 0))
710                                 continue;
711                         bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
712                         break;
713                 case BPF_LIST_NODE:
714                 case BPF_RB_NODE:
715                 case BPF_REFCOUNT:
716                         break;
717                 default:
718                         WARN_ON_ONCE(1);
719                         continue;
720                 }
721         }
722 }
723
724 /* called from workqueue */
725 static void bpf_map_free_deferred(struct work_struct *work)
726 {
727         struct bpf_map *map = container_of(work, struct bpf_map, work);
728         struct btf_record *rec = map->record;
729         struct btf *btf = map->btf;
730
731         security_bpf_map_free(map);
732         bpf_map_release_memcg(map);
733         /* implementation dependent freeing */
734         map->ops->map_free(map);
735         /* Delay freeing of btf_record for maps, as map_free
736          * callback usually needs access to them. It is better to do it here
737          * than require each callback to do the free itself manually.
738          *
739          * Note that the btf_record stashed in map->inner_map_meta->record was
740          * already freed using the map_free callback in the map-in-map case, which
741          * eventually calls bpf_map_free_meta, since inner_map_meta is only a
742          * template bpf_map struct used during verification.
743          */
744         btf_record_free(rec);
745         /* Delay freeing of btf for maps, as map_free callback may need
746          * struct_meta info which will be freed with btf_put().
747          */
748         btf_put(btf);
749 }
750
751 static void bpf_map_put_uref(struct bpf_map *map)
752 {
753         if (atomic64_dec_and_test(&map->usercnt)) {
754                 if (map->ops->map_release_uref)
755                         map->ops->map_release_uref(map);
756         }
757 }
758
759 static void bpf_map_free_in_work(struct bpf_map *map)
760 {
761         INIT_WORK(&map->work, bpf_map_free_deferred);
762         /* Avoid spawning kworkers, since they all might contend
763          * for the same mutex, such as slab_mutex.
764          */
765         queue_work(system_unbound_wq, &map->work);
766 }
767
768 static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
769 {
770         bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
771 }
772
773 static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
774 {
775         if (rcu_trace_implies_rcu_gp())
776                 bpf_map_free_rcu_gp(rcu);
777         else
778                 call_rcu(rcu, bpf_map_free_rcu_gp);
779 }
780
781 /* decrement map refcnt and schedule it for freeing via workqueue
782  * (underlying map implementation ops->map_free() might sleep)
783  */
784 void bpf_map_put(struct bpf_map *map)
785 {
786         if (atomic64_dec_and_test(&map->refcnt)) {
787                 /* bpf_map_free_id() must be called first */
788                 bpf_map_free_id(map);
789
790                 WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
791                 if (READ_ONCE(map->free_after_mult_rcu_gp))
792                         call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
793                 else if (READ_ONCE(map->free_after_rcu_gp))
794                         call_rcu(&map->rcu, bpf_map_free_rcu_gp);
795                 else
796                         bpf_map_free_in_work(map);
797         }
798 }
799 EXPORT_SYMBOL_GPL(bpf_map_put);
800
801 void bpf_map_put_with_uref(struct bpf_map *map)
802 {
803         bpf_map_put_uref(map);
804         bpf_map_put(map);
805 }
806
807 static int bpf_map_release(struct inode *inode, struct file *filp)
808 {
809         struct bpf_map *map = filp->private_data;
810
811         if (map->ops->map_release)
812                 map->ops->map_release(map, filp);
813
814         bpf_map_put_with_uref(map);
815         return 0;
816 }
817
818 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
819 {
820         fmode_t mode = f.file->f_mode;
821
822         /* Our file permissions may have been overridden by global
823          * map permissions on the syscall side.
824          */
825         if (READ_ONCE(map->frozen))
826                 mode &= ~FMODE_CAN_WRITE;
827         return mode;
828 }
829
830 #ifdef CONFIG_PROC_FS
831 /* Show the memory usage of a bpf map */
832 static u64 bpf_map_memory_usage(const struct bpf_map *map)
833 {
834         return map->ops->map_mem_usage(map);
835 }
836
837 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
838 {
839         struct bpf_map *map = filp->private_data;
840         u32 type = 0, jited = 0;
841
842         if (map_type_contains_progs(map)) {
843                 spin_lock(&map->owner.lock);
844                 type  = map->owner.type;
845                 jited = map->owner.jited;
846                 spin_unlock(&map->owner.lock);
847         }
848
849         seq_printf(m,
850                    "map_type:\t%u\n"
851                    "key_size:\t%u\n"
852                    "value_size:\t%u\n"
853                    "max_entries:\t%u\n"
854                    "map_flags:\t%#x\n"
855                    "map_extra:\t%#llx\n"
856                    "memlock:\t%llu\n"
857                    "map_id:\t%u\n"
858                    "frozen:\t%u\n",
859                    map->map_type,
860                    map->key_size,
861                    map->value_size,
862                    map->max_entries,
863                    map->map_flags,
864                    (unsigned long long)map->map_extra,
865                    bpf_map_memory_usage(map),
866                    map->id,
867                    READ_ONCE(map->frozen));
868         if (type) {
869                 seq_printf(m, "owner_prog_type:\t%u\n", type);
870                 seq_printf(m, "owner_jited:\t%u\n", jited);
871         }
872 }
873 #endif
874
875 static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
876                               loff_t *ppos)
877 {
878         /* We need this handler such that alloc_file() enables
879          * f_mode with FMODE_CAN_READ.
880          */
881         return -EINVAL;
882 }
883
884 static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
885                                size_t siz, loff_t *ppos)
886 {
887         /* We need this handler such that alloc_file() enables
888          * f_mode with FMODE_CAN_WRITE.
889          */
890         return -EINVAL;
891 }
892
893 /* called for any extra memory-mapped regions (except initial) */
894 static void bpf_map_mmap_open(struct vm_area_struct *vma)
895 {
896         struct bpf_map *map = vma->vm_file->private_data;
897
898         if (vma->vm_flags & VM_MAYWRITE)
899                 bpf_map_write_active_inc(map);
900 }
901
902 /* called for all unmapped memory regions (including initial) */
903 static void bpf_map_mmap_close(struct vm_area_struct *vma)
904 {
905         struct bpf_map *map = vma->vm_file->private_data;
906
907         if (vma->vm_flags & VM_MAYWRITE)
908                 bpf_map_write_active_dec(map);
909 }
910
911 static const struct vm_operations_struct bpf_map_default_vmops = {
912         .open           = bpf_map_mmap_open,
913         .close          = bpf_map_mmap_close,
914 };
915
916 static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
917 {
918         struct bpf_map *map = filp->private_data;
919         int err;
920
921         if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
922                 return -ENOTSUPP;
923
924         if (!(vma->vm_flags & VM_SHARED))
925                 return -EINVAL;
926
927         mutex_lock(&map->freeze_mutex);
928
929         if (vma->vm_flags & VM_WRITE) {
930                 if (map->frozen) {
931                         err = -EPERM;
932                         goto out;
933                 }
934                 /* map is meant to be read-only, so do not allow mapping as
935                  * writable, because it's possible to leak a writable page
936                  * reference and allow user-space to still modify it after
937                  * freezing, while the verifier assumes contents do not change
938                  */
939                 if (map->map_flags & BPF_F_RDONLY_PROG) {
940                         err = -EACCES;
941                         goto out;
942                 }
943         }
944
945         /* set default open/close callbacks */
946         vma->vm_ops = &bpf_map_default_vmops;
947         vma->vm_private_data = map;
948         vm_flags_clear(vma, VM_MAYEXEC);
949         if (!(vma->vm_flags & VM_WRITE))
950                 /* disallow re-mapping with PROT_WRITE */
951                 vm_flags_clear(vma, VM_MAYWRITE);
952
953         err = map->ops->map_mmap(map, vma);
954         if (err)
955                 goto out;
956
957         if (vma->vm_flags & VM_MAYWRITE)
958                 bpf_map_write_active_inc(map);
959 out:
960         mutex_unlock(&map->freeze_mutex);
961         return err;
962 }
963
964 static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
965 {
966         struct bpf_map *map = filp->private_data;
967
968         if (map->ops->map_poll)
969                 return map->ops->map_poll(map, filp, pts);
970
971         return EPOLLERR;
972 }
973
974 static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
975                                            unsigned long len, unsigned long pgoff,
976                                            unsigned long flags)
977 {
978         struct bpf_map *map = filp->private_data;
979
980         if (map->ops->map_get_unmapped_area)
981                 return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
982 #ifdef CONFIG_MMU
983         return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
984 #else
985         return addr;
986 #endif
987 }
988
989 const struct file_operations bpf_map_fops = {
990 #ifdef CONFIG_PROC_FS
991         .show_fdinfo    = bpf_map_show_fdinfo,
992 #endif
993         .release        = bpf_map_release,
994         .read           = bpf_dummy_read,
995         .write          = bpf_dummy_write,
996         .mmap           = bpf_map_mmap,
997         .poll           = bpf_map_poll,
998         .get_unmapped_area = bpf_get_unmapped_area,
999 };
1000
1001 int bpf_map_new_fd(struct bpf_map *map, int flags)
1002 {
1003         int ret;
1004
1005         ret = security_bpf_map(map, OPEN_FMODE(flags));
1006         if (ret < 0)
1007                 return ret;
1008
1009         return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
1010                                 flags | O_CLOEXEC);
1011 }
1012
1013 int bpf_get_file_flag(int flags)
1014 {
1015         if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
1016                 return -EINVAL;
1017         if (flags & BPF_F_RDONLY)
1018                 return O_RDONLY;
1019         if (flags & BPF_F_WRONLY)
1020                 return O_WRONLY;
1021         return O_RDWR;
1022 }
1023
1024 /* helper macro to check that unused fields of 'union bpf_attr' are zero */
1025 #define CHECK_ATTR(CMD) \
1026         memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
1027                    sizeof(attr->CMD##_LAST_FIELD), 0, \
1028                    sizeof(*attr) - \
1029                    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
1030                    sizeof(attr->CMD##_LAST_FIELD)) != NULL
1031
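/* Illustrative expansion of the macro above, using the
 * BPF_MAP_CREATE_LAST_FIELD (map_token_fd) defined further below:
 * CHECK_ATTR(BPF_MAP_CREATE) scans every byte of *attr that follows
 * attr->map_token_fd and evaluates to true if any of them is non-zero,
 * which the command handlers treat as -EINVAL.
 */
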
1032 /* dst and src must have at least "size" number of bytes.
1033  * Return strlen on success and < 0 on error.
1034  */
1035 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
1036 {
1037         const char *end = src + size;
1038         const char *orig_src = src;
1039
1040         memset(dst, 0, size);
1041         /* Copy all isalnum(), '_' and '.' chars. */
1042         while (src < end && *src) {
1043                 if (!isalnum(*src) &&
1044                     *src != '_' && *src != '.')
1045                         return -EINVAL;
1046                 *dst++ = *src++;
1047         }
1048
1049         /* No '\0' found in "size" number of bytes */
1050         if (src == end)
1051                 return -EINVAL;
1052
1053         return src - orig_src;
1054 }
1055
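/* Illustrative inputs for bpf_obj_name_cpy() above (hypothetical
 * names): "ringbuf.rx_v2" is accepted and its length returned, while a
 * name containing '-' or one that occupies all "size" bytes with no
 * terminating '\0' is rejected with -EINVAL.
 */
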
1056 int map_check_no_btf(const struct bpf_map *map,
1057                      const struct btf *btf,
1058                      const struct btf_type *key_type,
1059                      const struct btf_type *value_type)
1060 {
1061         return -ENOTSUPP;
1062 }
1063
1064 static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
1065                          const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
1066 {
1067         const struct btf_type *key_type, *value_type;
1068         u32 key_size, value_size;
1069         int ret = 0;
1070
1071         /* Some maps allow key to be unspecified. */
1072         if (btf_key_id) {
1073                 key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
1074                 if (!key_type || key_size != map->key_size)
1075                         return -EINVAL;
1076         } else {
1077                 key_type = btf_type_by_id(btf, 0);
1078                 if (!map->ops->map_check_btf)
1079                         return -EINVAL;
1080         }
1081
1082         value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
1083         if (!value_type || value_size != map->value_size)
1084                 return -EINVAL;
1085
1086         map->record = btf_parse_fields(btf, value_type,
1087                                        BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
1088                                        BPF_RB_ROOT | BPF_REFCOUNT,
1089                                        map->value_size);
1090         if (!IS_ERR_OR_NULL(map->record)) {
1091                 int i;
1092
1093                 if (!bpf_token_capable(token, CAP_BPF)) {
1094                         ret = -EPERM;
1095                         goto free_map_tab;
1096                 }
1097                 if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
1098                         ret = -EACCES;
1099                         goto free_map_tab;
1100                 }
1101                 for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
1102                         switch (map->record->field_mask & (1 << i)) {
1103                         case 0:
1104                                 continue;
1105                         case BPF_SPIN_LOCK:
1106                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1107                                     map->map_type != BPF_MAP_TYPE_ARRAY &&
1108                                     map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
1109                                     map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1110                                     map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1111                                     map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1112                                     map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1113                                         ret = -EOPNOTSUPP;
1114                                         goto free_map_tab;
1115                                 }
1116                                 break;
1117                         case BPF_TIMER:
1118                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1119                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1120                                     map->map_type != BPF_MAP_TYPE_ARRAY) {
1121                                         ret = -EOPNOTSUPP;
1122                                         goto free_map_tab;
1123                                 }
1124                                 break;
1125                         case BPF_KPTR_UNREF:
1126                         case BPF_KPTR_REF:
1127                         case BPF_KPTR_PERCPU:
1128                         case BPF_REFCOUNT:
1129                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1130                                     map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
1131                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1132                                     map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
1133                                     map->map_type != BPF_MAP_TYPE_ARRAY &&
1134                                     map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
1135                                     map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1136                                     map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1137                                     map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1138                                     map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1139                                         ret = -EOPNOTSUPP;
1140                                         goto free_map_tab;
1141                                 }
1142                                 break;
1143                         case BPF_LIST_HEAD:
1144                         case BPF_RB_ROOT:
1145                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
1146                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1147                                     map->map_type != BPF_MAP_TYPE_ARRAY) {
1148                                         ret = -EOPNOTSUPP;
1149                                         goto free_map_tab;
1150                                 }
1151                                 break;
1152                         default:
1153                                 /* Fail if map_type checks are missing for a field type */
1154                                 ret = -EOPNOTSUPP;
1155                                 goto free_map_tab;
1156                         }
1157                 }
1158         }
1159
1160         ret = btf_check_and_fixup_fields(btf, map->record);
1161         if (ret < 0)
1162                 goto free_map_tab;
1163
1164         if (map->ops->map_check_btf) {
1165                 ret = map->ops->map_check_btf(map, btf, key_type, value_type);
1166                 if (ret < 0)
1167                         goto free_map_tab;
1168         }
1169
1170         return ret;
1171 free_map_tab:
1172         bpf_map_free_record(map);
1173         return ret;
1174 }
1175
1176 static bool bpf_net_capable(void)
1177 {
1178         return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
1179 }
1180
1181 #define BPF_MAP_CREATE_LAST_FIELD map_token_fd
1182 /* called via syscall */
1183 static int map_create(union bpf_attr *attr)
1184 {
1185         const struct bpf_map_ops *ops;
1186         struct bpf_token *token = NULL;
1187         int numa_node = bpf_map_attr_numa_node(attr);
1188         u32 map_type = attr->map_type;
1189         struct bpf_map *map;
1190         bool token_flag;
1191         int f_flags;
1192         int err;
1193
1194         err = CHECK_ATTR(BPF_MAP_CREATE);
1195         if (err)
1196                 return -EINVAL;
1197
1198         /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
1199          * to avoid per-map type checks tripping on unknown flag
1200          */
1201         token_flag = attr->map_flags & BPF_F_TOKEN_FD;
1202         attr->map_flags &= ~BPF_F_TOKEN_FD;
1203
1204         if (attr->btf_vmlinux_value_type_id) {
1205                 if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
1206                     attr->btf_key_type_id || attr->btf_value_type_id)
1207                         return -EINVAL;
1208         } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
1209                 return -EINVAL;
1210         }
1211
1212         if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
1213             attr->map_type != BPF_MAP_TYPE_ARENA &&
1214             attr->map_extra != 0)
1215                 return -EINVAL;
1216
1217         f_flags = bpf_get_file_flag(attr->map_flags);
1218         if (f_flags < 0)
1219                 return f_flags;
1220
1221         if (numa_node != NUMA_NO_NODE &&
1222             ((unsigned int)numa_node >= nr_node_ids ||
1223              !node_online(numa_node)))
1224                 return -EINVAL;
1225
1226         /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
1227         map_type = attr->map_type;
1228         if (map_type >= ARRAY_SIZE(bpf_map_types))
1229                 return -EINVAL;
1230         map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
1231         ops = bpf_map_types[map_type];
1232         if (!ops)
1233                 return -EINVAL;
1234
1235         if (ops->map_alloc_check) {
1236                 err = ops->map_alloc_check(attr);
1237                 if (err)
1238                         return err;
1239         }
1240         if (attr->map_ifindex)
1241                 ops = &bpf_map_offload_ops;
1242         if (!ops->map_mem_usage)
1243                 return -EINVAL;
1244
1245         if (token_flag) {
1246                 token = bpf_token_get_from_fd(attr->map_token_fd);
1247                 if (IS_ERR(token))
1248                         return PTR_ERR(token);
1249
1250                 /* if current token doesn't grant map creation permissions,
1251                  * then we can't use this token, so ignore it and rely on
1252                  * system-wide capabilities checks
1253                  */
1254                 if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
1255                     !bpf_token_allow_map_type(token, attr->map_type)) {
1256                         bpf_token_put(token);
1257                         token = NULL;
1258                 }
1259         }
1260
1261         err = -EPERM;
1262
1263         /* Intent here is for unprivileged_bpf_disabled to block BPF map
1264          * creation for unprivileged users; other actions depend
1265          * on fd availability and access to bpffs, so are dependent on
1266          * object creation success. Even with unprivileged BPF disabled,
1267          * capability checks are still carried out.
1268          */
1269         if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
1270                 goto put_token;
1271
1272         /* check privileged map type permissions */
1273         switch (map_type) {
1274         case BPF_MAP_TYPE_ARRAY:
1275         case BPF_MAP_TYPE_PERCPU_ARRAY:
1276         case BPF_MAP_TYPE_PROG_ARRAY:
1277         case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
1278         case BPF_MAP_TYPE_CGROUP_ARRAY:
1279         case BPF_MAP_TYPE_ARRAY_OF_MAPS:
1280         case BPF_MAP_TYPE_HASH:
1281         case BPF_MAP_TYPE_PERCPU_HASH:
1282         case BPF_MAP_TYPE_HASH_OF_MAPS:
1283         case BPF_MAP_TYPE_RINGBUF:
1284         case BPF_MAP_TYPE_USER_RINGBUF:
1285         case BPF_MAP_TYPE_CGROUP_STORAGE:
1286         case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
1287                 /* unprivileged */
1288                 break;
1289         case BPF_MAP_TYPE_SK_STORAGE:
1290         case BPF_MAP_TYPE_INODE_STORAGE:
1291         case BPF_MAP_TYPE_TASK_STORAGE:
1292         case BPF_MAP_TYPE_CGRP_STORAGE:
1293         case BPF_MAP_TYPE_BLOOM_FILTER:
1294         case BPF_MAP_TYPE_LPM_TRIE:
1295         case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
1296         case BPF_MAP_TYPE_STACK_TRACE:
1297         case BPF_MAP_TYPE_QUEUE:
1298         case BPF_MAP_TYPE_STACK:
1299         case BPF_MAP_TYPE_LRU_HASH:
1300         case BPF_MAP_TYPE_LRU_PERCPU_HASH:
1301         case BPF_MAP_TYPE_STRUCT_OPS:
1302         case BPF_MAP_TYPE_CPUMAP:
1303         case BPF_MAP_TYPE_ARENA:
1304                 if (!bpf_token_capable(token, CAP_BPF))
1305                         goto put_token;
1306                 break;
1307         case BPF_MAP_TYPE_SOCKMAP:
1308         case BPF_MAP_TYPE_SOCKHASH:
1309         case BPF_MAP_TYPE_DEVMAP:
1310         case BPF_MAP_TYPE_DEVMAP_HASH:
1311         case BPF_MAP_TYPE_XSKMAP:
1312                 if (!bpf_token_capable(token, CAP_NET_ADMIN))
1313                         goto put_token;
1314                 break;
1315         default:
1316                 WARN(1, "unsupported map type %d", map_type);
1317                 goto put_token;
1318         }
1319
1320         map = ops->map_alloc(attr);
1321         if (IS_ERR(map)) {
1322                 err = PTR_ERR(map);
1323                 goto put_token;
1324         }
1325         map->ops = ops;
1326         map->map_type = map_type;
1327
1328         err = bpf_obj_name_cpy(map->name, attr->map_name,
1329                                sizeof(attr->map_name));
1330         if (err < 0)
1331                 goto free_map;
1332
1333         atomic64_set(&map->refcnt, 1);
1334         atomic64_set(&map->usercnt, 1);
1335         mutex_init(&map->freeze_mutex);
1336         spin_lock_init(&map->owner.lock);
1337
1338         if (attr->btf_key_type_id || attr->btf_value_type_id ||
1339             /* Even if the map's value is a kernel struct,
1340              * the bpf_prog.o must have BTF to begin with
1341              * to figure out the corresponding kernel
1342              * counterpart.  Thus, attr->btf_fd has
1343              * to be valid also.
1344              */
1345             attr->btf_vmlinux_value_type_id) {
1346                 struct btf *btf;
1347
1348                 btf = btf_get_by_fd(attr->btf_fd);
1349                 if (IS_ERR(btf)) {
1350                         err = PTR_ERR(btf);
1351                         goto free_map;
1352                 }
1353                 if (btf_is_kernel(btf)) {
1354                         btf_put(btf);
1355                         err = -EACCES;
1356                         goto free_map;
1357                 }
1358                 map->btf = btf;
1359
1360                 if (attr->btf_value_type_id) {
1361                         err = map_check_btf(map, token, btf, attr->btf_key_type_id,
1362                                             attr->btf_value_type_id);
1363                         if (err)
1364                                 goto free_map;
1365                 }
1366
1367                 map->btf_key_type_id = attr->btf_key_type_id;
1368                 map->btf_value_type_id = attr->btf_value_type_id;
1369                 map->btf_vmlinux_value_type_id =
1370                         attr->btf_vmlinux_value_type_id;
1371         }
1372
1373         err = security_bpf_map_create(map, attr, token);
1374         if (err)
1375                 goto free_map_sec;
1376
1377         err = bpf_map_alloc_id(map);
1378         if (err)
1379                 goto free_map_sec;
1380
1381         bpf_map_save_memcg(map);
1382         bpf_token_put(token);
1383
1384         err = bpf_map_new_fd(map, f_flags);
1385         if (err < 0) {
1386                 /* failed to allocate fd.
1387                  * bpf_map_put_with_uref() is needed because the above
1388                  * bpf_map_alloc_id() has published the map
1389                  * to the userspace and the userspace may
1390                  * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
1391                  */
1392                 bpf_map_put_with_uref(map);
1393                 return err;
1394         }
1395
1396         return err;
1397
1398 free_map_sec:
1399         security_bpf_map_free(map);
1400 free_map:
1401         btf_put(map->btf);
1402         map->ops->map_free(map);
1403 put_token:
1404         bpf_token_put(token);
1405         return err;
1406 }
1407
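/* Minimal user-space sketch exercising the map_create() path above via
 * the raw bpf(2) syscall; it assumes the uapi <linux/bpf.h> header and
 * no libbpf, and is not part of the original file:
 *
 *	#include <linux/bpf.h>
 *	#include <string.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	union bpf_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));     // keep unused tail zero for CHECK_ATTR()
 *	attr.map_type    = BPF_MAP_TYPE_ARRAY;
 *	attr.key_size    = 4;               // array maps require 4-byte keys
 *	attr.value_size  = 8;
 *	attr.max_entries = 16;
 *	strncpy(attr.map_name, "example_map", sizeof(attr.map_name) - 1);
 *
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *	// map_fd < 0 on failure, with errno set from map_create()'s error
 */
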
1408 /* if error is returned, fd is released.
1409  * On success caller should complete fd access with matching fdput()
1410  */
1411 struct bpf_map *__bpf_map_get(struct fd f)
1412 {
1413         if (!f.file)
1414                 return ERR_PTR(-EBADF);
1415         if (f.file->f_op != &bpf_map_fops) {
1416                 fdput(f);
1417                 return ERR_PTR(-EINVAL);
1418         }
1419
1420         return f.file->private_data;
1421 }
1422
1423 void bpf_map_inc(struct bpf_map *map)
1424 {
1425         atomic64_inc(&map->refcnt);
1426 }
1427 EXPORT_SYMBOL_GPL(bpf_map_inc);
1428
1429 void bpf_map_inc_with_uref(struct bpf_map *map)
1430 {
1431         atomic64_inc(&map->refcnt);
1432         atomic64_inc(&map->usercnt);
1433 }
1434 EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
1435
1436 struct bpf_map *bpf_map_get(u32 ufd)
1437 {
1438         struct fd f = fdget(ufd);
1439         struct bpf_map *map;
1440
1441         map = __bpf_map_get(f);
1442         if (IS_ERR(map))
1443                 return map;
1444
1445         bpf_map_inc(map);
1446         fdput(f);
1447
1448         return map;
1449 }
1450 EXPORT_SYMBOL(bpf_map_get);
1451
1452 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
1453 {
1454         struct fd f = fdget(ufd);
1455         struct bpf_map *map;
1456
1457         map = __bpf_map_get(f);
1458         if (IS_ERR(map))
1459                 return map;
1460
1461         bpf_map_inc_with_uref(map);
1462         fdput(f);
1463
1464         return map;
1465 }
1466
1467 /* map_idr_lock should have been held or the map should have been
1468  * protected by rcu read lock.
1469  */
1470 struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
1471 {
1472         int refold;
1473
1474         refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
1475         if (!refold)
1476                 return ERR_PTR(-ENOENT);
1477         if (uref)
1478                 atomic64_inc(&map->usercnt);
1479
1480         return map;
1481 }
1482
1483 struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
1484 {
1485         spin_lock_bh(&map_idr_lock);
1486         map = __bpf_map_inc_not_zero(map, false);
1487         spin_unlock_bh(&map_idr_lock);
1488
1489         return map;
1490 }
1491 EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
1492
1493 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
1494 {
1495         return -ENOTSUPP;
1496 }
1497
1498 static void *__bpf_copy_key(void __user *ukey, u64 key_size)
1499 {
1500         if (key_size)
1501                 return vmemdup_user(ukey, key_size);
1502
1503         if (ukey)
1504                 return ERR_PTR(-EINVAL);
1505
1506         return NULL;
1507 }
1508
1509 static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
1510 {
1511         if (key_size)
1512                 return kvmemdup_bpfptr(ukey, key_size);
1513
1514         if (!bpfptr_is_null(ukey))
1515                 return ERR_PTR(-EINVAL);
1516
1517         return NULL;
1518 }
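
/* Both key-copy helpers return a kvmalloc'ed copy of the caller-supplied key,
 * NULL when the map has a zero-sized key and no key pointer was passed, or
 * ERR_PTR(-EINVAL) when a key pointer was passed despite a zero-sized key.
 * __bpf_copy_key() takes a plain user pointer, while ___bpf_copy_key() takes a
 * bpfptr_t so it also works for kernel callers of the BPF syscall.
 */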
1519
1520 /* last field in 'union bpf_attr' used by this command */
1521 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
1522
1523 static int map_lookup_elem(union bpf_attr *attr)
1524 {
1525         void __user *ukey = u64_to_user_ptr(attr->key);
1526         void __user *uvalue = u64_to_user_ptr(attr->value);
1527         int ufd = attr->map_fd;
1528         struct bpf_map *map;
1529         void *key, *value;
1530         u32 value_size;
1531         struct fd f;
1532         int err;
1533
1534         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1535                 return -EINVAL;
1536
1537         if (attr->flags & ~BPF_F_LOCK)
1538                 return -EINVAL;
1539
1540         f = fdget(ufd);
1541         map = __bpf_map_get(f);
1542         if (IS_ERR(map))
1543                 return PTR_ERR(map);
1544         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1545                 err = -EPERM;
1546                 goto err_put;
1547         }
1548
1549         if ((attr->flags & BPF_F_LOCK) &&
1550             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1551                 err = -EINVAL;
1552                 goto err_put;
1553         }
1554
1555         key = __bpf_copy_key(ukey, map->key_size);
1556         if (IS_ERR(key)) {
1557                 err = PTR_ERR(key);
1558                 goto err_put;
1559         }
1560
1561         value_size = bpf_map_value_size(map);
1562
1563         err = -ENOMEM;
1564         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1565         if (!value)
1566                 goto free_key;
1567
1568         if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
1569                 if (copy_from_user(value, uvalue, value_size))
1570                         err = -EFAULT;
1571                 else
1572                         err = bpf_map_copy_value(map, key, value, attr->flags);
1573                 goto free_value;
1574         }
1575
1576         err = bpf_map_copy_value(map, key, value, attr->flags);
1577         if (err)
1578                 goto free_value;
1579
1580         err = -EFAULT;
1581         if (copy_to_user(uvalue, value, value_size) != 0)
1582                 goto free_value;
1583
1584         err = 0;
1585
1586 free_value:
1587         kvfree(value);
1588 free_key:
1589         kvfree(key);
1590 err_put:
1591         fdput(f);
1592         return err;
1593 }
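
/* Illustrative user-space counterpart of the command above (a sketch, not part
 * of this file): fill the element fields of union bpf_attr and invoke the bpf()
 * syscall, e.g.
 *
 *	union bpf_attr attr = {
 *		.map_fd	= fd,
 *		.key	= (__u64)(unsigned long)&key,
 *		.value	= (__u64)(unsigned long)&value,
 *	};
 *	err = syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 *
 * libbpf's bpf_map_lookup_elem() wraps this pattern.
 */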
1594
1595
1596 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1597
1598 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
1599 {
1600         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1601         bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
1602         int ufd = attr->map_fd;
1603         struct bpf_map *map;
1604         void *key, *value;
1605         u32 value_size;
1606         struct fd f;
1607         int err;
1608
1609         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1610                 return -EINVAL;
1611
1612         f = fdget(ufd);
1613         map = __bpf_map_get(f);
1614         if (IS_ERR(map))
1615                 return PTR_ERR(map);
1616         bpf_map_write_active_inc(map);
1617         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1618                 err = -EPERM;
1619                 goto err_put;
1620         }
1621
1622         if ((attr->flags & BPF_F_LOCK) &&
1623             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1624                 err = -EINVAL;
1625                 goto err_put;
1626         }
1627
1628         key = ___bpf_copy_key(ukey, map->key_size);
1629         if (IS_ERR(key)) {
1630                 err = PTR_ERR(key);
1631                 goto err_put;
1632         }
1633
1634         value_size = bpf_map_value_size(map);
1635         value = kvmemdup_bpfptr(uvalue, value_size);
1636         if (IS_ERR(value)) {
1637                 err = PTR_ERR(value);
1638                 goto free_key;
1639         }
1640
1641         err = bpf_map_update_value(map, f.file, key, value, attr->flags);
1642         if (!err)
1643                 maybe_wait_bpf_programs(map);
1644
1645         kvfree(value);
1646 free_key:
1647         kvfree(key);
1648 err_put:
1649         bpf_map_write_active_dec(map);
1650         fdput(f);
1651         return err;
1652 }
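
/* map_update_elem() and the other write commands below bracket the operation
 * with bpf_map_write_active_inc()/_dec(); map_freeze() uses this counter to
 * refuse freezing a map that still has a syscall-side write in flight.
 */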
1653
1654 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1655
1656 static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
1657 {
1658         bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
1659         int ufd = attr->map_fd;
1660         struct bpf_map *map;
1661         struct fd f;
1662         void *key;
1663         int err;
1664
1665         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1666                 return -EINVAL;
1667
1668         f = fdget(ufd);
1669         map = __bpf_map_get(f);
1670         if (IS_ERR(map))
1671                 return PTR_ERR(map);
1672         bpf_map_write_active_inc(map);
1673         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1674                 err = -EPERM;
1675                 goto err_put;
1676         }
1677
1678         key = ___bpf_copy_key(ukey, map->key_size);
1679         if (IS_ERR(key)) {
1680                 err = PTR_ERR(key);
1681                 goto err_put;
1682         }
1683
1684         if (bpf_map_is_offloaded(map)) {
1685                 err = bpf_map_offload_delete_elem(map, key);
1686                 goto out;
1687         } else if (IS_FD_PROG_ARRAY(map) ||
1688                    map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1689                 /* These maps require sleepable context */
1690                 err = map->ops->map_delete_elem(map, key);
1691                 goto out;
1692         }
1693
1694         bpf_disable_instrumentation();
1695         rcu_read_lock();
1696         err = map->ops->map_delete_elem(map, key);
1697         rcu_read_unlock();
1698         bpf_enable_instrumentation();
1699         if (!err)
1700                 maybe_wait_bpf_programs(map);
1701 out:
1702         kvfree(key);
1703 err_put:
1704         bpf_map_write_active_dec(map);
1705         fdput(f);
1706         return err;
1707 }
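
/* Deletion takes one of three paths: offloaded maps forward the request to the
 * device, fd/prog-array and struct_ops maps call ->map_delete_elem() directly
 * because they may need to sleep, and everything else runs the callback under
 * rcu_read_lock() with instrumentation disabled so tracing programs cannot
 * recurse into the map code on this CPU.
 */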
1708
1709 /* last field in 'union bpf_attr' used by this command */
1710 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1711
1712 static int map_get_next_key(union bpf_attr *attr)
1713 {
1714         void __user *ukey = u64_to_user_ptr(attr->key);
1715         void __user *unext_key = u64_to_user_ptr(attr->next_key);
1716         int ufd = attr->map_fd;
1717         struct bpf_map *map;
1718         void *key, *next_key;
1719         struct fd f;
1720         int err;
1721
1722         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1723                 return -EINVAL;
1724
1725         f = fdget(ufd);
1726         map = __bpf_map_get(f);
1727         if (IS_ERR(map))
1728                 return PTR_ERR(map);
1729         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1730                 err = -EPERM;
1731                 goto err_put;
1732         }
1733
1734         if (ukey) {
1735                 key = __bpf_copy_key(ukey, map->key_size);
1736                 if (IS_ERR(key)) {
1737                         err = PTR_ERR(key);
1738                         goto err_put;
1739                 }
1740         } else {
1741                 key = NULL;
1742         }
1743
1744         err = -ENOMEM;
1745         next_key = kvmalloc(map->key_size, GFP_USER);
1746         if (!next_key)
1747                 goto free_key;
1748
1749         if (bpf_map_is_offloaded(map)) {
1750                 err = bpf_map_offload_get_next_key(map, key, next_key);
1751                 goto out;
1752         }
1753
1754         rcu_read_lock();
1755         err = map->ops->map_get_next_key(map, key, next_key);
1756         rcu_read_unlock();
1757 out:
1758         if (err)
1759                 goto free_next_key;
1760
1761         err = -EFAULT;
1762         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
1763                 goto free_next_key;
1764
1765         err = 0;
1766
1767 free_next_key:
1768         kvfree(next_key);
1769 free_key:
1770         kvfree(key);
1771 err_put:
1772         fdput(f);
1773         return err;
1774 }
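
/* A NULL key (attr->key == 0) asks for the first key in the map, so user space
 * can walk a map with a loop like the following sketch (assuming a 4-byte key
 * and the libbpf wrapper bpf_map_get_next_key() for this command):
 *
 *	__u32 next_key, *cur = NULL;
 *
 *	while (!bpf_map_get_next_key(map_fd, cur, &next_key)) {
 *		... process next_key ...
 *		cur = &next_key;
 *	}
 *
 * The loop ends when there is no next key and the kernel reports -ENOENT.
 */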
1775
1776 int generic_map_delete_batch(struct bpf_map *map,
1777                              const union bpf_attr *attr,
1778                              union bpf_attr __user *uattr)
1779 {
1780         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1781         u32 cp, max_count;
1782         int err = 0;
1783         void *key;
1784
1785         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1786                 return -EINVAL;
1787
1788         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1789             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1790                 return -EINVAL;
1791         }
1792
1793         max_count = attr->batch.count;
1794         if (!max_count)
1795                 return 0;
1796
1797         if (put_user(0, &uattr->batch.count))
1798                 return -EFAULT;
1799
1800         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1801         if (!key)
1802                 return -ENOMEM;
1803
1804         for (cp = 0; cp < max_count; cp++) {
1805                 err = -EFAULT;
1806                 if (copy_from_user(key, keys + cp * map->key_size,
1807                                    map->key_size))
1808                         break;
1809
1810                 if (bpf_map_is_offloaded(map)) {
1811                         err = bpf_map_offload_delete_elem(map, key);
1812                         break;
1813                 }
1814
1815                 bpf_disable_instrumentation();
1816                 rcu_read_lock();
1817                 err = map->ops->map_delete_elem(map, key);
1818                 rcu_read_unlock();
1819                 bpf_enable_instrumentation();
1820                 if (err)
1821                         break;
1822                 cond_resched();
1823         }
1824         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1825                 err = -EFAULT;
1826
1827         kvfree(key);
1828
1829         return err;
1830 }
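
/* Whatever happens inside the loop, the number of elements actually deleted is
 * written back to uattr->batch.count, so on a partial failure user space can
 * skip the processed prefix and retry the remainder of the batch.
 */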
1831
1832 int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
1833                              const union bpf_attr *attr,
1834                              union bpf_attr __user *uattr)
1835 {
1836         void __user *values = u64_to_user_ptr(attr->batch.values);
1837         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1838         u32 value_size, cp, max_count;
1839         void *key, *value;
1840         int err = 0;
1841
1842         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1843                 return -EINVAL;
1844
1845         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1846             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
1847                 return -EINVAL;
1848         }
1849
1850         value_size = bpf_map_value_size(map);
1851
1852         max_count = attr->batch.count;
1853         if (!max_count)
1854                 return 0;
1855
1856         if (put_user(0, &uattr->batch.count))
1857                 return -EFAULT;
1858
1859         key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1860         if (!key)
1861                 return -ENOMEM;
1862
1863         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
1864         if (!value) {
1865                 kvfree(key);
1866                 return -ENOMEM;
1867         }
1868
1869         for (cp = 0; cp < max_count; cp++) {
1870                 err = -EFAULT;
1871                 if (copy_from_user(key, keys + cp * map->key_size,
1872                     map->key_size) ||
1873                     copy_from_user(value, values + cp * value_size, value_size))
1874                         break;
1875
1876                 err = bpf_map_update_value(map, map_file, key, value,
1877                                            attr->batch.elem_flags);
1878
1879                 if (err)
1880                         break;
1881                 cond_resched();
1882         }
1883
1884         if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1885                 err = -EFAULT;
1886
1887         kvfree(value);
1888         kvfree(key);
1889
1890         return err;
1891 }
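
/* As with the delete batch, the update stops at the first element that fails
 * and reports the number of successful updates through uattr->batch.count.
 */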
1892
1893 #define MAP_LOOKUP_RETRIES 3
1894
1895 int generic_map_lookup_batch(struct bpf_map *map,
1896                                     const union bpf_attr *attr,
1897                                     union bpf_attr __user *uattr)
1898 {
1899         void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1900         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1901         void __user *values = u64_to_user_ptr(attr->batch.values);
1902         void __user *keys = u64_to_user_ptr(attr->batch.keys);
1903         void *buf, *buf_prevkey, *prev_key, *key, *value;
1904         int err, retry = MAP_LOOKUP_RETRIES;
1905         u32 value_size, cp, max_count;
1906
1907         if (attr->batch.elem_flags & ~BPF_F_LOCK)
1908                 return -EINVAL;
1909
1910         if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1911             !btf_record_has_field(map->record, BPF_SPIN_LOCK))
1912                 return -EINVAL;
1913
1914         value_size = bpf_map_value_size(map);
1915
1916         max_count = attr->batch.count;
1917         if (!max_count)
1918                 return 0;
1919
1920         if (put_user(0, &uattr->batch.count))
1921                 return -EFAULT;
1922
1923         buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1924         if (!buf_prevkey)
1925                 return -ENOMEM;
1926
1927         buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1928         if (!buf) {
1929                 kvfree(buf_prevkey);
1930                 return -ENOMEM;
1931         }
1932
1933         err = -EFAULT;
1934         prev_key = NULL;
1935         if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1936                 goto free_buf;
1937         key = buf;
1938         value = key + map->key_size;
1939         if (ubatch)
1940                 prev_key = buf_prevkey;
1941
1942         for (cp = 0; cp < max_count;) {
1943                 rcu_read_lock();
1944                 err = map->ops->map_get_next_key(map, prev_key, key);
1945                 rcu_read_unlock();
1946                 if (err)
1947                         break;
1948                 err = bpf_map_copy_value(map, key, value,
1949                                          attr->batch.elem_flags);
1950
1951                 if (err == -ENOENT) {
1952                         if (retry) {
1953                                 retry--;
1954                                 continue;
1955                         }
1956                         err = -EINTR;
1957                         break;
1958                 }
1959
1960                 if (err)
1961                         goto free_buf;
1962
1963                 if (copy_to_user(keys + cp * map->key_size, key,
1964                                  map->key_size)) {
1965                         err = -EFAULT;
1966                         goto free_buf;
1967                 }
1968                 if (copy_to_user(values + cp * value_size, value, value_size)) {
1969                         err = -EFAULT;
1970                         goto free_buf;
1971                 }
1972
1973                 if (!prev_key)
1974                         prev_key = buf_prevkey;
1975
1976                 swap(prev_key, key);
1977                 retry = MAP_LOOKUP_RETRIES;
1978                 cp++;
1979                 cond_resched();
1980         }
1981
1982         if (err == -EFAULT)
1983                 goto free_buf;
1984
1985         if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1986                     (cp && copy_to_user(uobatch, prev_key, map->key_size))))
1987                 err = -EFAULT;
1988
1989 free_buf:
1990         kvfree(buf_prevkey);
1991         kvfree(buf);
1992         return err;
1993 }
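
/* The lookup batch uses in_batch/out_batch as an opaque cursor: a NULL
 * in_batch starts from the beginning, and out_batch receives the last key
 * copied so the next call can resume after it.  A lookup that races with a
 * concurrent delete returns -ENOENT and is retried up to MAP_LOOKUP_RETRIES
 * times before the whole batch gives up with -EINTR.
 */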
1994
1995 #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
1996
1997 static int map_lookup_and_delete_elem(union bpf_attr *attr)
1998 {
1999         void __user *ukey = u64_to_user_ptr(attr->key);
2000         void __user *uvalue = u64_to_user_ptr(attr->value);
2001         int ufd = attr->map_fd;
2002         struct bpf_map *map;
2003         void *key, *value;
2004         u32 value_size;
2005         struct fd f;
2006         int err;
2007
2008         if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
2009                 return -EINVAL;
2010
2011         if (attr->flags & ~BPF_F_LOCK)
2012                 return -EINVAL;
2013
2014         f = fdget(ufd);
2015         map = __bpf_map_get(f);
2016         if (IS_ERR(map))
2017                 return PTR_ERR(map);
2018         bpf_map_write_active_inc(map);
2019         if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
2020             !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
2021                 err = -EPERM;
2022                 goto err_put;
2023         }
2024
2025         if (attr->flags &&
2026             (map->map_type == BPF_MAP_TYPE_QUEUE ||
2027              map->map_type == BPF_MAP_TYPE_STACK)) {
2028                 err = -EINVAL;
2029                 goto err_put;
2030         }
2031
2032         if ((attr->flags & BPF_F_LOCK) &&
2033             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
2034                 err = -EINVAL;
2035                 goto err_put;
2036         }
2037
2038         key = __bpf_copy_key(ukey, map->key_size);
2039         if (IS_ERR(key)) {
2040                 err = PTR_ERR(key);
2041                 goto err_put;
2042         }
2043
2044         value_size = bpf_map_value_size(map);
2045
2046         err = -ENOMEM;
2047         value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
2048         if (!value)
2049                 goto free_key;
2050
2051         err = -ENOTSUPP;
2052         if (map->map_type == BPF_MAP_TYPE_QUEUE ||
2053             map->map_type == BPF_MAP_TYPE_STACK) {
2054                 err = map->ops->map_pop_elem(map, value);
2055         } else if (map->map_type == BPF_MAP_TYPE_HASH ||
2056                    map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2057                    map->map_type == BPF_MAP_TYPE_LRU_HASH ||
2058                    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2059                 if (!bpf_map_is_offloaded(map)) {
2060                         bpf_disable_instrumentation();
2061                         rcu_read_lock();
2062                         err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
2063                         rcu_read_unlock();
2064                         bpf_enable_instrumentation();
2065                 }
2066         }
2067
2068         if (err)
2069                 goto free_value;
2070
2071         if (copy_to_user(uvalue, value, value_size) != 0) {
2072                 err = -EFAULT;
2073                 goto free_value;
2074         }
2075
2076         err = 0;
2077
2078 free_value:
2079         kvfree(value);
2080 free_key:
2081         kvfree(key);
2082 err_put:
2083         bpf_map_write_active_dec(map);
2084         fdput(f);
2085         return err;
2086 }
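
/* BPF_MAP_LOOKUP_AND_DELETE_ELEM is only wired up for a few map types: queue
 * and stack maps service it through ->map_pop_elem() (and reject any flags),
 * while the hash-map variants use ->map_lookup_and_delete_elem(); everything
 * else falls through to -ENOTSUPP.
 */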
2087
2088 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
2089
2090 static int map_freeze(const union bpf_attr *attr)
2091 {
2092         int err = 0, ufd = attr->map_fd;
2093         struct bpf_map *map;
2094         struct fd f;
2095
2096         if (CHECK_ATTR(BPF_MAP_FREEZE))
2097                 return -EINVAL;
2098
2099         f = fdget(ufd);
2100         map = __bpf_map_get(f);
2101         if (IS_ERR(map))
2102                 return PTR_ERR(map);
2103
2104         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
2105                 fdput(f);
2106                 return -ENOTSUPP;
2107         }
2108
2109         if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
2110                 fdput(f);
2111                 return -EPERM;
2112         }
2113
2114         mutex_lock(&map->freeze_mutex);
2115         if (bpf_map_write_active(map)) {
2116                 err = -EBUSY;
2117                 goto err_put;
2118         }
2119         if (READ_ONCE(map->frozen)) {
2120                 err = -EBUSY;
2121                 goto err_put;
2122         }
2123
2124         WRITE_ONCE(map->frozen, true);
2125 err_put:
2126         mutex_unlock(&map->freeze_mutex);
2127         fdput(f);
2128         return err;
2129 }
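
/* Freezing is rejected for struct_ops maps and for maps with BTF-managed
 * fields (a non-NULL map->record), and fails with -EBUSY if a syscall-side
 * write is still in flight or the map is already frozen.  Once map->frozen is
 * set, map_get_sys_perms() elsewhere in this file masks out FMODE_CAN_WRITE,
 * so further write commands from user space are refused.
 */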
2130
2131 static const struct bpf_prog_ops * const bpf_prog_types[] = {
2132 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
2133         [_id] = & _name ## _prog_ops,
2134 #define BPF_MAP_TYPE(_id, _ops)
2135 #define BPF_LINK_TYPE(_id, _name)
2136 #include <linux/bpf_types.h>
2137 #undef BPF_PROG_TYPE
2138 #undef BPF_MAP_TYPE
2139 #undef BPF_LINK_TYPE
2140 };
2141
2142 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
2143 {
2144         const struct bpf_prog_ops *ops;
2145
2146         if (type >= ARRAY_SIZE(bpf_prog_types))
2147                 return -EINVAL;
2148         type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
2149         ops = bpf_prog_types[type];
2150         if (!ops)
2151                 return -EINVAL;
2152
2153         if (!bpf_prog_is_offloaded(prog->aux))
2154                 prog->aux->ops = ops;
2155         else
2156                 prog->aux->ops = &bpf_offload_prog_ops;
2157         prog->type = type;
2158         return 0;
2159 }
2160
2161 enum bpf_audit {
2162         BPF_AUDIT_LOAD,
2163         BPF_AUDIT_UNLOAD,
2164         BPF_AUDIT_MAX,
2165 };
2166
2167 static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
2168         [BPF_AUDIT_LOAD]   = "LOAD",
2169         [BPF_AUDIT_UNLOAD] = "UNLOAD",
2170 };
2171
2172 static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
2173 {
2174         struct audit_context *ctx = NULL;
2175         struct audit_buffer *ab;
2176
2177         if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
2178                 return;
2179         if (audit_enabled == AUDIT_OFF)
2180                 return;
2181         if (!in_irq() && !irqs_disabled())
2182                 ctx = audit_context();
2183         ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
2184         if (unlikely(!ab))
2185                 return;
2186         audit_log_format(ab, "prog-id=%u op=%s",
2187                          prog->aux->id, bpf_audit_str[op]);
2188         audit_log_end(ab);
2189 }
2190
2191 static int bpf_prog_alloc_id(struct bpf_prog *prog)
2192 {
2193         int id;
2194
2195         idr_preload(GFP_KERNEL);
2196         spin_lock_bh(&prog_idr_lock);
2197         id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
2198         if (id > 0)
2199                 prog->aux->id = id;
2200         spin_unlock_bh(&prog_idr_lock);
2201         idr_preload_end();
2202
2203         /* id is in [1, INT_MAX) */
2204         if (WARN_ON_ONCE(!id))
2205                 return -ENOSPC;
2206
2207         return id > 0 ? 0 : id;
2208 }
2209
2210 void bpf_prog_free_id(struct bpf_prog *prog)
2211 {
2212         unsigned long flags;
2213
2214         /* cBPF to eBPF migrations are currently not in the idr store.
2215          * Offloaded programs are removed from the store when their device
2216          * disappears - even if someone grabs an fd to them they are unusable,
2217          * simply waiting for refcnt to drop to be freed.
2218          */
2219         if (!prog->aux->id)
2220                 return;
2221
2222         spin_lock_irqsave(&prog_idr_lock, flags);
2223         idr_remove(&prog_idr, prog->aux->id);
2224         prog->aux->id = 0;
2225         spin_unlock_irqrestore(&prog_idr_lock, flags);
2226 }
2227
2228 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
2229 {
2230         struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
2231
2232         kvfree(aux->func_info);
2233         kfree(aux->func_info_aux);
2234         free_uid(aux->user);
2235         security_bpf_prog_free(aux->prog);
2236         bpf_prog_free(aux->prog);
2237 }
2238
2239 static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
2240 {
2241         bpf_prog_kallsyms_del_all(prog);
2242         btf_put(prog->aux->btf);
2243         module_put(prog->aux->mod);
2244         kvfree(prog->aux->jited_linfo);
2245         kvfree(prog->aux->linfo);
2246         kfree(prog->aux->kfunc_tab);
2247         if (prog->aux->attach_btf)
2248                 btf_put(prog->aux->attach_btf);
2249
2250         if (deferred) {
2251                 if (prog->sleepable)
2252                         call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
2253                 else
2254                         call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
2255         } else {
2256                 __bpf_prog_put_rcu(&prog->aux->rcu);
2257         }
2258 }
2259
2260 static void bpf_prog_put_deferred(struct work_struct *work)
2261 {
2262         struct bpf_prog_aux *aux;
2263         struct bpf_prog *prog;
2264
2265         aux = container_of(work, struct bpf_prog_aux, work);
2266         prog = aux->prog;
2267         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
2268         bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
2269         bpf_prog_free_id(prog);
2270         __bpf_prog_put_noref(prog, true);
2271 }
2272
2273 static void __bpf_prog_put(struct bpf_prog *prog)
2274 {
2275         struct bpf_prog_aux *aux = prog->aux;
2276
2277         if (atomic64_dec_and_test(&aux->refcnt)) {
2278                 if (in_irq() || irqs_disabled()) {
2279                         INIT_WORK(&aux->work, bpf_prog_put_deferred);
2280                         schedule_work(&aux->work);
2281                 } else {
2282                         bpf_prog_put_deferred(&aux->work);
2283                 }
2284         }
2285 }
2286
2287 void bpf_prog_put(struct bpf_prog *prog)
2288 {
2289         __bpf_prog_put(prog);
2290 }
2291 EXPORT_SYMBOL_GPL(bpf_prog_put);
2292
2293 static int bpf_prog_release(struct inode *inode, struct file *filp)
2294 {
2295         struct bpf_prog *prog = filp->private_data;
2296
2297         bpf_prog_put(prog);
2298         return 0;
2299 }
2300
2301 struct bpf_prog_kstats {
2302         u64 nsecs;
2303         u64 cnt;
2304         u64 misses;
2305 };
2306
2307 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
2308 {
2309         struct bpf_prog_stats *stats;
2310         unsigned int flags;
2311
2312         stats = this_cpu_ptr(prog->stats);
2313         flags = u64_stats_update_begin_irqsave(&stats->syncp);
2314         u64_stats_inc(&stats->misses);
2315         u64_stats_update_end_irqrestore(&stats->syncp, flags);
2316 }
2317
2318 static void bpf_prog_get_stats(const struct bpf_prog *prog,
2319                                struct bpf_prog_kstats *stats)
2320 {
2321         u64 nsecs = 0, cnt = 0, misses = 0;
2322         int cpu;
2323
2324         for_each_possible_cpu(cpu) {
2325                 const struct bpf_prog_stats *st;
2326                 unsigned int start;
2327                 u64 tnsecs, tcnt, tmisses;
2328
2329                 st = per_cpu_ptr(prog->stats, cpu);
2330                 do {
2331                         start = u64_stats_fetch_begin(&st->syncp);
2332                         tnsecs = u64_stats_read(&st->nsecs);
2333                         tcnt = u64_stats_read(&st->cnt);
2334                         tmisses = u64_stats_read(&st->misses);
2335                 } while (u64_stats_fetch_retry(&st->syncp, start));
2336                 nsecs += tnsecs;
2337                 cnt += tcnt;
2338                 misses += tmisses;
2339         }
2340         stats->nsecs = nsecs;
2341         stats->cnt = cnt;
2342         stats->misses = misses;
2343 }
2344
2345 #ifdef CONFIG_PROC_FS
2346 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
2347 {
2348         const struct bpf_prog *prog = filp->private_data;
2349         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2350         struct bpf_prog_kstats stats;
2351
2352         bpf_prog_get_stats(prog, &stats);
2353         bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2354         seq_printf(m,
2355                    "prog_type:\t%u\n"
2356                    "prog_jited:\t%u\n"
2357                    "prog_tag:\t%s\n"
2358                    "memlock:\t%llu\n"
2359                    "prog_id:\t%u\n"
2360                    "run_time_ns:\t%llu\n"
2361                    "run_cnt:\t%llu\n"
2362                    "recursion_misses:\t%llu\n"
2363                    "verified_insns:\t%u\n",
2364                    prog->type,
2365                    prog->jited,
2366                    prog_tag,
2367                    prog->pages * 1ULL << PAGE_SHIFT,
2368                    prog->aux->id,
2369                    stats.nsecs,
2370                    stats.cnt,
2371                    stats.misses,
2372                    prog->aux->verified_insns);
2373 }
2374 #endif
2375
2376 const struct file_operations bpf_prog_fops = {
2377 #ifdef CONFIG_PROC_FS
2378         .show_fdinfo    = bpf_prog_show_fdinfo,
2379 #endif
2380         .release        = bpf_prog_release,
2381         .read           = bpf_dummy_read,
2382         .write          = bpf_dummy_write,
2383 };
2384
2385 int bpf_prog_new_fd(struct bpf_prog *prog)
2386 {
2387         int ret;
2388
2389         ret = security_bpf_prog(prog);
2390         if (ret < 0)
2391                 return ret;
2392
2393         return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
2394                                 O_RDWR | O_CLOEXEC);
2395 }
2396
2397 static struct bpf_prog *____bpf_prog_get(struct fd f)
2398 {
2399         if (!f.file)
2400                 return ERR_PTR(-EBADF);
2401         if (f.file->f_op != &bpf_prog_fops) {
2402                 fdput(f);
2403                 return ERR_PTR(-EINVAL);
2404         }
2405
2406         return f.file->private_data;
2407 }
2408
2409 void bpf_prog_add(struct bpf_prog *prog, int i)
2410 {
2411         atomic64_add(i, &prog->aux->refcnt);
2412 }
2413 EXPORT_SYMBOL_GPL(bpf_prog_add);
2414
2415 void bpf_prog_sub(struct bpf_prog *prog, int i)
2416 {
2417         /* Only to be used for undoing previous bpf_prog_add() in some
2418          * error path. We still know that another entity in our call
2419          * path holds a reference to the program, thus atomic_sub() can
2420          * be safely used in such cases!
2421          */
2422         WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
2423 }
2424 EXPORT_SYMBOL_GPL(bpf_prog_sub);
2425
2426 void bpf_prog_inc(struct bpf_prog *prog)
2427 {
2428         atomic64_inc(&prog->aux->refcnt);
2429 }
2430 EXPORT_SYMBOL_GPL(bpf_prog_inc);
2431
2432 /* prog_idr_lock should have been held */
2433 struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
2434 {
2435         int refold;
2436
2437         refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
2438
2439         if (!refold)
2440                 return ERR_PTR(-ENOENT);
2441
2442         return prog;
2443 }
2444 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
2445
2446 bool bpf_prog_get_ok(struct bpf_prog *prog,
2447                             enum bpf_prog_type *attach_type, bool attach_drv)
2448 {
2449         /* not an attachment, just a refcount inc, always allow */
2450         if (!attach_type)
2451                 return true;
2452
2453         if (prog->type != *attach_type)
2454                 return false;
2455         if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
2456                 return false;
2457
2458         return true;
2459 }
2460
2461 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
2462                                        bool attach_drv)
2463 {
2464         struct fd f = fdget(ufd);
2465         struct bpf_prog *prog;
2466
2467         prog = ____bpf_prog_get(f);
2468         if (IS_ERR(prog))
2469                 return prog;
2470         if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
2471                 prog = ERR_PTR(-EINVAL);
2472                 goto out;
2473         }
2474
2475         bpf_prog_inc(prog);
2476 out:
2477         fdput(f);
2478         return prog;
2479 }
2480
2481 struct bpf_prog *bpf_prog_get(u32 ufd)
2482 {
2483         return __bpf_prog_get(ufd, NULL, false);
2484 }
2485
2486 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
2487                                        bool attach_drv)
2488 {
2489         return __bpf_prog_get(ufd, &type, attach_drv);
2490 }
2491 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
2492
2493 /* Initially all BPF programs could be loaded w/o specifying
2494  * expected_attach_type. Later for some of them specifying expected_attach_type
2495  * at load time became required so that program could be validated properly.
2496  * Programs of types that are allowed to be loaded both w/ and w/o (for
2497  * backward compatibility) expected_attach_type, should have the default attach
2498  * type assigned to expected_attach_type for the latter case, so that it can be
2499  * validated later at attach time.
2500  *
2501  * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
2502  * prog type requires it but has some attach types that have to be backward
2503  * compatible.
2504  */
2505 static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
2506 {
2507         switch (attr->prog_type) {
2508         case BPF_PROG_TYPE_CGROUP_SOCK:
2509                 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
2510                  * exist so checking for non-zero is the way to go here.
2511                  */
2512                 if (!attr->expected_attach_type)
2513                         attr->expected_attach_type =
2514                                 BPF_CGROUP_INET_SOCK_CREATE;
2515                 break;
2516         case BPF_PROG_TYPE_SK_REUSEPORT:
2517                 if (!attr->expected_attach_type)
2518                         attr->expected_attach_type =
2519                                 BPF_SK_REUSEPORT_SELECT;
2520                 break;
2521         }
2522 }
2523
2524 static int
2525 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
2526                            enum bpf_attach_type expected_attach_type,
2527                            struct btf *attach_btf, u32 btf_id,
2528                            struct bpf_prog *dst_prog)
2529 {
2530         if (btf_id) {
2531                 if (btf_id > BTF_MAX_TYPE)
2532                         return -EINVAL;
2533
2534                 if (!attach_btf && !dst_prog)
2535                         return -EINVAL;
2536
2537                 switch (prog_type) {
2538                 case BPF_PROG_TYPE_TRACING:
2539                 case BPF_PROG_TYPE_LSM:
2540                 case BPF_PROG_TYPE_STRUCT_OPS:
2541                 case BPF_PROG_TYPE_EXT:
2542                         break;
2543                 default:
2544                         return -EINVAL;
2545                 }
2546         }
2547
2548         if (attach_btf && (!btf_id || dst_prog))
2549                 return -EINVAL;
2550
2551         if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
2552             prog_type != BPF_PROG_TYPE_EXT)
2553                 return -EINVAL;
2554
2555         switch (prog_type) {
2556         case BPF_PROG_TYPE_CGROUP_SOCK:
2557                 switch (expected_attach_type) {
2558                 case BPF_CGROUP_INET_SOCK_CREATE:
2559                 case BPF_CGROUP_INET_SOCK_RELEASE:
2560                 case BPF_CGROUP_INET4_POST_BIND:
2561                 case BPF_CGROUP_INET6_POST_BIND:
2562                         return 0;
2563                 default:
2564                         return -EINVAL;
2565                 }
2566         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2567                 switch (expected_attach_type) {
2568                 case BPF_CGROUP_INET4_BIND:
2569                 case BPF_CGROUP_INET6_BIND:
2570                 case BPF_CGROUP_INET4_CONNECT:
2571                 case BPF_CGROUP_INET6_CONNECT:
2572                 case BPF_CGROUP_UNIX_CONNECT:
2573                 case BPF_CGROUP_INET4_GETPEERNAME:
2574                 case BPF_CGROUP_INET6_GETPEERNAME:
2575                 case BPF_CGROUP_UNIX_GETPEERNAME:
2576                 case BPF_CGROUP_INET4_GETSOCKNAME:
2577                 case BPF_CGROUP_INET6_GETSOCKNAME:
2578                 case BPF_CGROUP_UNIX_GETSOCKNAME:
2579                 case BPF_CGROUP_UDP4_SENDMSG:
2580                 case BPF_CGROUP_UDP6_SENDMSG:
2581                 case BPF_CGROUP_UNIX_SENDMSG:
2582                 case BPF_CGROUP_UDP4_RECVMSG:
2583                 case BPF_CGROUP_UDP6_RECVMSG:
2584                 case BPF_CGROUP_UNIX_RECVMSG:
2585                         return 0;
2586                 default:
2587                         return -EINVAL;
2588                 }
2589         case BPF_PROG_TYPE_CGROUP_SKB:
2590                 switch (expected_attach_type) {
2591                 case BPF_CGROUP_INET_INGRESS:
2592                 case BPF_CGROUP_INET_EGRESS:
2593                         return 0;
2594                 default:
2595                         return -EINVAL;
2596                 }
2597         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2598                 switch (expected_attach_type) {
2599                 case BPF_CGROUP_SETSOCKOPT:
2600                 case BPF_CGROUP_GETSOCKOPT:
2601                         return 0;
2602                 default:
2603                         return -EINVAL;
2604                 }
2605         case BPF_PROG_TYPE_SK_LOOKUP:
2606                 if (expected_attach_type == BPF_SK_LOOKUP)
2607                         return 0;
2608                 return -EINVAL;
2609         case BPF_PROG_TYPE_SK_REUSEPORT:
2610                 switch (expected_attach_type) {
2611                 case BPF_SK_REUSEPORT_SELECT:
2612                 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
2613                         return 0;
2614                 default:
2615                         return -EINVAL;
2616                 }
2617         case BPF_PROG_TYPE_NETFILTER:
2618                 if (expected_attach_type == BPF_NETFILTER)
2619                         return 0;
2620                 return -EINVAL;
2621         case BPF_PROG_TYPE_SYSCALL:
2622         case BPF_PROG_TYPE_EXT:
2623                 if (expected_attach_type)
2624                         return -EINVAL;
2625                 fallthrough;
2626         default:
2627                 return 0;
2628         }
2629 }
2630
2631 static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2632 {
2633         switch (prog_type) {
2634         case BPF_PROG_TYPE_SCHED_CLS:
2635         case BPF_PROG_TYPE_SCHED_ACT:
2636         case BPF_PROG_TYPE_XDP:
2637         case BPF_PROG_TYPE_LWT_IN:
2638         case BPF_PROG_TYPE_LWT_OUT:
2639         case BPF_PROG_TYPE_LWT_XMIT:
2640         case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2641         case BPF_PROG_TYPE_SK_SKB:
2642         case BPF_PROG_TYPE_SK_MSG:
2643         case BPF_PROG_TYPE_FLOW_DISSECTOR:
2644         case BPF_PROG_TYPE_CGROUP_DEVICE:
2645         case BPF_PROG_TYPE_CGROUP_SOCK:
2646         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2647         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2648         case BPF_PROG_TYPE_CGROUP_SYSCTL:
2649         case BPF_PROG_TYPE_SOCK_OPS:
2650         case BPF_PROG_TYPE_EXT: /* extends any prog */
2651         case BPF_PROG_TYPE_NETFILTER:
2652                 return true;
2653         case BPF_PROG_TYPE_CGROUP_SKB:
2654                 /* always unpriv */
2655         case BPF_PROG_TYPE_SK_REUSEPORT:
2656                 /* equivalent to SOCKET_FILTER. need CAP_BPF only */
2657         default:
2658                 return false;
2659         }
2660 }
2661
2662 static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2663 {
2664         switch (prog_type) {
2665         case BPF_PROG_TYPE_KPROBE:
2666         case BPF_PROG_TYPE_TRACEPOINT:
2667         case BPF_PROG_TYPE_PERF_EVENT:
2668         case BPF_PROG_TYPE_RAW_TRACEPOINT:
2669         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2670         case BPF_PROG_TYPE_TRACING:
2671         case BPF_PROG_TYPE_LSM:
2672         case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
2673         case BPF_PROG_TYPE_EXT: /* extends any prog */
2674                 return true;
2675         default:
2676                 return false;
2677         }
2678 }
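
/* Together with the checks in bpf_prog_load() below, these two helpers define
 * the capability model for program loading: CAP_BPF (or an equivalent BPF
 * token) is the baseline for all types except SOCKET_FILTER and CGROUP_SKB,
 * networking program types additionally require CAP_NET_ADMIN, and tracing
 * program types additionally require CAP_PERFMON.
 */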
2679
2680 /* last field in 'union bpf_attr' used by this command */
2681 #define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
2682
2683 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
2684 {
2685         enum bpf_prog_type type = attr->prog_type;
2686         struct bpf_prog *prog, *dst_prog = NULL;
2687         struct btf *attach_btf = NULL;
2688         struct bpf_token *token = NULL;
2689         bool bpf_cap;
2690         int err;
2691         char license[128];
2692
2693         if (CHECK_ATTR(BPF_PROG_LOAD))
2694                 return -EINVAL;
2695
2696         if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2697                                  BPF_F_ANY_ALIGNMENT |
2698                                  BPF_F_TEST_STATE_FREQ |
2699                                  BPF_F_SLEEPABLE |
2700                                  BPF_F_TEST_RND_HI32 |
2701                                  BPF_F_XDP_HAS_FRAGS |
2702                                  BPF_F_XDP_DEV_BOUND_ONLY |
2703                                  BPF_F_TEST_REG_INVARIANTS |
2704                                  BPF_F_TOKEN_FD))
2705                 return -EINVAL;
2706
2707         bpf_prog_load_fixup_attach_type(attr);
2708
2709         if (attr->prog_flags & BPF_F_TOKEN_FD) {
2710                 token = bpf_token_get_from_fd(attr->prog_token_fd);
2711                 if (IS_ERR(token))
2712                         return PTR_ERR(token);
2713                 /* if current token doesn't grant prog loading permissions,
2714                  * then we can't use this token, so ignore it and rely on
2715                  * system-wide capabilities checks
2716                  */
2717                 if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
2718                     !bpf_token_allow_prog_type(token, attr->prog_type,
2719                                                attr->expected_attach_type)) {
2720                         bpf_token_put(token);
2721                         token = NULL;
2722                 }
2723         }
2724
2725         bpf_cap = bpf_token_capable(token, CAP_BPF);
2726         err = -EPERM;
2727
2728         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2729             (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2730             !bpf_cap)
2731                 goto put_token;
2732
2733         /* Intent here is for unprivileged_bpf_disabled to block BPF program
2734          * creation for unprivileged users; other actions depend
2735          * on fd availability and access to bpffs, so are dependent on
2736          * object creation success. Even with unprivileged BPF disabled,
2737          * capability checks are still carried out for these
2738          * and other operations.
2739          */
2740         if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
2741                 goto put_token;
2742
2743         if (attr->insn_cnt == 0 ||
2744             attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
2745                 err = -E2BIG;
2746                 goto put_token;
2747         }
2748         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2749             type != BPF_PROG_TYPE_CGROUP_SKB &&
2750             !bpf_cap)
2751                 goto put_token;
2752
2753         if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
2754                 goto put_token;
2755         if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
2756                 goto put_token;
2757
2758         /* attach_prog_fd/attach_btf_obj_fd can specify the fd of either a
2759          * bpf_prog or a BTF object; we need to check which one it is.
2760          */
2761         if (attr->attach_prog_fd) {
2762                 dst_prog = bpf_prog_get(attr->attach_prog_fd);
2763                 if (IS_ERR(dst_prog)) {
2764                         dst_prog = NULL;
2765                         attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
2766                         if (IS_ERR(attach_btf)) {
2767                                 err = -EINVAL;
2768                                 goto put_token;
2769                         }
2770                         if (!btf_is_kernel(attach_btf)) {
2771                                 /* attaching through specifying bpf_prog's BTF
2772                                  * objects directly might be supported eventually
2773                                  */
2774                                 btf_put(attach_btf);
2775                                 err = -ENOTSUPP;
2776                                 goto put_token;
2777                         }
2778                 }
2779         } else if (attr->attach_btf_id) {
2780                 /* fall back to vmlinux BTF, if BTF type ID is specified */
2781                 attach_btf = bpf_get_btf_vmlinux();
2782                 if (IS_ERR(attach_btf)) {
2783                         err = PTR_ERR(attach_btf);
2784                         goto put_token;
2785                 }
2786                 if (!attach_btf) {
2787                         err = -EINVAL;
2788                         goto put_token;
2789                 }
2790                 btf_get(attach_btf);
2791         }
2792
2793         if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
2794                                        attach_btf, attr->attach_btf_id,
2795                                        dst_prog)) {
2796                 if (dst_prog)
2797                         bpf_prog_put(dst_prog);
2798                 if (attach_btf)
2799                         btf_put(attach_btf);
2800                 err = -EINVAL;
2801                 goto put_token;
2802         }
2803
2804         /* plain bpf_prog allocation */
2805         prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
2806         if (!prog) {
2807                 if (dst_prog)
2808                         bpf_prog_put(dst_prog);
2809                 if (attach_btf)
2810                         btf_put(attach_btf);
2811                 err = -EINVAL;
2812                 goto put_token;
2813         }
2814
2815         prog->expected_attach_type = attr->expected_attach_type;
2816         prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
2817         prog->aux->attach_btf = attach_btf;
2818         prog->aux->attach_btf_id = attr->attach_btf_id;
2819         prog->aux->dst_prog = dst_prog;
2820         prog->aux->dev_bound = !!attr->prog_ifindex;
2821         prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
2822
2823         /* move token into prog->aux, reuse taken refcnt */
2824         prog->aux->token = token;
2825         token = NULL;
2826
2827         prog->aux->user = get_current_user();
2828         prog->len = attr->insn_cnt;
2829
2830         err = -EFAULT;
2831         if (copy_from_bpfptr(prog->insns,
2832                              make_bpfptr(attr->insns, uattr.is_kernel),
2833                              bpf_prog_insn_size(prog)) != 0)
2834                 goto free_prog;
2835         /* copy eBPF program license from user space */
2836         if (strncpy_from_bpfptr(license,
2837                                 make_bpfptr(attr->license, uattr.is_kernel),
2838                                 sizeof(license) - 1) < 0)
2839                 goto free_prog;
2840         license[sizeof(license) - 1] = 0;
2841
2842         /* eBPF programs must be GPL compatible to use GPL-ed functions */
2843         prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
2844
2845         prog->orig_prog = NULL;
2846         prog->jited = 0;
2847
2848         atomic64_set(&prog->aux->refcnt, 1);
2849
2850         if (bpf_prog_is_dev_bound(prog->aux)) {
2851                 err = bpf_prog_dev_bound_init(prog, attr);
2852                 if (err)
2853                         goto free_prog;
2854         }
2855
2856         if (type == BPF_PROG_TYPE_EXT && dst_prog &&
2857             bpf_prog_is_dev_bound(dst_prog->aux)) {
2858                 err = bpf_prog_dev_bound_inherit(prog, dst_prog);
2859                 if (err)
2860                         goto free_prog;
2861         }
2862
2863         /*
2864          * Bookkeeping for managing the program attachment chain.
2865          *
2866          * It might be tempting to set the attach_tracing_prog flag at attachment
2867          * time, but that would not prevent loading a bunch of tracing programs
2868          * first and then attaching them to one another.
2869          *
2870          * The flag attach_tracing_prog is set for the whole program lifecycle, and
2871          * doesn't have to be cleared in bpf_tracing_link_release, since tracing
2872          * programs cannot change attachment target.
2873          */
2874         if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
2875             dst_prog->type == BPF_PROG_TYPE_TRACING) {
2876                 prog->aux->attach_tracing_prog = true;
2877         }
2878
2879         /* find program type: socket_filter vs tracing_filter */
2880         err = find_prog_type(type, prog);
2881         if (err < 0)
2882                 goto free_prog;
2883
2884         prog->aux->load_time = ktime_get_boottime_ns();
2885         err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
2886                                sizeof(attr->prog_name));
2887         if (err < 0)
2888                 goto free_prog;
2889
2890         err = security_bpf_prog_load(prog, attr, token);
2891         if (err)
2892                 goto free_prog_sec;
2893
2894         /* run eBPF verifier */
2895         err = bpf_check(&prog, attr, uattr, uattr_size);
2896         if (err < 0)
2897                 goto free_used_maps;
2898
2899         prog = bpf_prog_select_runtime(prog, &err);
2900         if (err < 0)
2901                 goto free_used_maps;
2902
2903         err = bpf_prog_alloc_id(prog);
2904         if (err)
2905                 goto free_used_maps;
2906
2907         /* Upon success of bpf_prog_alloc_id(), the BPF prog is
2908          * effectively publicly exposed. However, retrieving via
2909          * bpf_prog_get_fd_by_id() will take another reference,
2910          * therefore it cannot be gone underneath us.
2911          *
2912          * Only for the time /after/ successful bpf_prog_new_fd()
2913          * and before returning to userspace, we might just hold
2914          * one reference and any parallel close on that fd could
2915          * rip everything out. Hence, below notifications must
2916          * happen before bpf_prog_new_fd().
2917          *
2918          * Also, any failure handling from this point onwards must
2919          * be using bpf_prog_put() given the program is exposed.
2920          */
2921         bpf_prog_kallsyms_add(prog);
2922         perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
2923         bpf_audit_prog(prog, BPF_AUDIT_LOAD);
2924
2925         err = bpf_prog_new_fd(prog);
2926         if (err < 0)
2927                 bpf_prog_put(prog);
2928         return err;
2929
2930 free_used_maps:
2931         /* In case we have subprogs, we need to wait for a grace
2932          * period before we can tear down JIT memory since symbols
2933          * are already exposed under kallsyms.
2934          */
2935         __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
2936         return err;
2937
2938 free_prog_sec:
2939         security_bpf_prog_free(prog);
2940 free_prog:
2941         free_uid(prog->aux->user);
2942         if (prog->aux->attach_btf)
2943                 btf_put(prog->aux->attach_btf);
2944         bpf_prog_free(prog);
2945 put_token:
2946         bpf_token_put(token);
2947         return err;
2948 }
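
/* Summary of the load path above: fix up the attach type, resolve an optional
 * BPF token and apply the capability checks, copy the instructions and license
 * from user space, resolve the attach target (another program or kernel BTF),
 * run the verifier, pick the runtime, allocate an ID and announce the program
 * via kallsyms/perf/audit, and finally hand user space an anonymous-inode fd.
 * After bpf_prog_alloc_id() any failure must use bpf_prog_put(), since the
 * program is already visible through its ID.
 */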
2949
2950 #define BPF_OBJ_LAST_FIELD path_fd
2951
2952 static int bpf_obj_pin(const union bpf_attr *attr)
2953 {
2954         int path_fd;
2955
2956         if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
2957                 return -EINVAL;
2958
2959         /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2960         if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2961                 return -EINVAL;
2962
2963         path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2964         return bpf_obj_pin_user(attr->bpf_fd, path_fd,
2965                                 u64_to_user_ptr(attr->pathname));
2966 }
2967
2968 static int bpf_obj_get(const union bpf_attr *attr)
2969 {
2970         int path_fd;
2971
2972         if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2973             attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
2974                 return -EINVAL;
2975
2976         /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2977         if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2978                 return -EINVAL;
2979
2980         path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2981         return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
2982                                 attr->file_flags);
2983 }
2984
2985 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
2986                    const struct bpf_link_ops *ops, struct bpf_prog *prog)
2987 {
2988         atomic64_set(&link->refcnt, 1);
2989         link->type = type;
2990         link->id = 0;
2991         link->ops = ops;
2992         link->prog = prog;
2993 }
2994
2995 static void bpf_link_free_id(int id)
2996 {
2997         if (!id)
2998                 return;
2999
3000         spin_lock_bh(&link_idr_lock);
3001         idr_remove(&link_idr, id);
3002         spin_unlock_bh(&link_idr_lock);
3003 }
3004
3005 /* Clean up bpf_link and corresponding anon_inode file and FD. After
3006  * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
3007  * anon_inode's release() call. This helper marks bpf_link as
3008  * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
3009  * is not decremented; that is the responsibility of the calling code that failed
3010  * to complete bpf_link initialization.
3011  * This helper eventually calls link's dealloc callback, but does not call
3012  * link's release callback.
3013  */
3014 void bpf_link_cleanup(struct bpf_link_primer *primer)
3015 {
3016         primer->link->prog = NULL;
3017         bpf_link_free_id(primer->id);
3018         fput(primer->file);
3019         put_unused_fd(primer->fd);
3020 }
3021
3022 void bpf_link_inc(struct bpf_link *link)
3023 {
3024         atomic64_inc(&link->refcnt);
3025 }
3026
3027 static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
3028 {
3029         struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
3030
3031         /* free the bpf_link and the memory that contains it */
3032         link->ops->dealloc_deferred(link);
3033 }
3034
3035 static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
3036 {
3037         if (rcu_trace_implies_rcu_gp())
3038                 bpf_link_defer_dealloc_rcu_gp(rcu);
3039         else
3040                 call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
3041 }
3042
3043 /* bpf_link_free is guaranteed to be called from process context */
3044 static void bpf_link_free(struct bpf_link *link)
3045 {
3046         bool sleepable = false;
3047
3048         bpf_link_free_id(link->id);
3049         if (link->prog) {
3050                 sleepable = link->prog->sleepable;
3051                 /* detach BPF program, clean up used resources */
3052                 link->ops->release(link);
3053                 bpf_prog_put(link->prog);
3054         }
3055         if (link->ops->dealloc_deferred) {
3056                 /* schedule BPF link deallocation; if the underlying BPF program
3057                  * is sleepable, we need to first wait for an RCU tasks trace
3058                  * grace period, then go through a "classic" RCU grace period
3059                  */
3060                 if (sleepable)
3061                         call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
3062                 else
3063                         call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
3064         }
3065         if (link->ops->dealloc)
3066                 link->ops->dealloc(link);
3067 }
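
/* Illustrative sketch: a link type whose struct may still be reachable by RCU
 * (tasks trace) readers opts into the deferred path above by providing
 * ->dealloc_deferred instead of ->dealloc in its ops, as bpf_raw_tp_link_lops
 * does further below. The names here are hypothetical:
 *
 *	static const struct bpf_link_ops example_link_lops = {
 *		.release          = example_link_release,
 *		.dealloc_deferred = example_link_dealloc,	// typically just kfree()
 *	};
 *
 * bpf_link_free() then routes the dealloc through call_rcu() (and, for
 * sleepable programs, call_rcu_tasks_trace() first) instead of calling it
 * synchronously.
 */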
3068
3069 static void bpf_link_put_deferred(struct work_struct *work)
3070 {
3071         struct bpf_link *link = container_of(work, struct bpf_link, work);
3072
3073         bpf_link_free(link);
3074 }
3075
3076 /* bpf_link_put might be called from atomic context. Freeing the link may need
3077  * to acquire sleeping locks, so the actual free is deferred to a workqueue.
3078  */
3079 void bpf_link_put(struct bpf_link *link)
3080 {
3081         if (!atomic64_dec_and_test(&link->refcnt))
3082                 return;
3083
3084         INIT_WORK(&link->work, bpf_link_put_deferred);
3085         schedule_work(&link->work);
3086 }
3087 EXPORT_SYMBOL(bpf_link_put);
3088
3089 static void bpf_link_put_direct(struct bpf_link *link)
3090 {
3091         if (!atomic64_dec_and_test(&link->refcnt))
3092                 return;
3093         bpf_link_free(link);
3094 }
3095
3096 static int bpf_link_release(struct inode *inode, struct file *filp)
3097 {
3098         struct bpf_link *link = filp->private_data;
3099
3100         bpf_link_put_direct(link);
3101         return 0;
3102 }
3103
3104 #ifdef CONFIG_PROC_FS
3105 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
3106 #define BPF_MAP_TYPE(_id, _ops)
3107 #define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
3108 static const char *bpf_link_type_strs[] = {
3109         [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
3110 #include <linux/bpf_types.h>
3111 };
3112 #undef BPF_PROG_TYPE
3113 #undef BPF_MAP_TYPE
3114 #undef BPF_LINK_TYPE
3115
3116 static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
3117 {
3118         const struct bpf_link *link = filp->private_data;
3119         const struct bpf_prog *prog = link->prog;
3120         char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
3121
3122         seq_printf(m,
3123                    "link_type:\t%s\n"
3124                    "link_id:\t%u\n",
3125                    bpf_link_type_strs[link->type],
3126                    link->id);
3127         if (prog) {
3128                 bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
3129                 seq_printf(m,
3130                            "prog_tag:\t%s\n"
3131                            "prog_id:\t%u\n",
3132                            prog_tag,
3133                            prog->aux->id);
3134         }
3135         if (link->ops->show_fdinfo)
3136                 link->ops->show_fdinfo(link, m);
3137 }
3138 #endif
3139
3140 static const struct file_operations bpf_link_fops = {
3141 #ifdef CONFIG_PROC_FS
3142         .show_fdinfo    = bpf_link_show_fdinfo,
3143 #endif
3144         .release        = bpf_link_release,
3145         .read           = bpf_dummy_read,
3146         .write          = bpf_dummy_write,
3147 };
3148
3149 static int bpf_link_alloc_id(struct bpf_link *link)
3150 {
3151         int id;
3152
3153         idr_preload(GFP_KERNEL);
3154         spin_lock_bh(&link_idr_lock);
3155         id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
3156         spin_unlock_bh(&link_idr_lock);
3157         idr_preload_end();
3158
3159         return id;
3160 }
3161
3162 /* Prepare bpf_link to be exposed to user-space by allocating an anon_inode file,
3163  * reserving an unused FD and allocating an ID from link_idr. This is to be paired
3164  * with bpf_link_settle() to install the FD and ID and expose the bpf_link to
3165  * user-space, if the bpf_link is successfully attached. If not, the bpf_link and
3166  * pre-allocated resources are to be freed with a bpf_link_cleanup() call. All the
3167  * transient state is passed around in struct bpf_link_primer.
3168  * This is the preferred way to create and initialize a bpf_link, especially when
3169  * there are complicated and expensive operations in between creating the bpf_link
3170  * itself and attaching it to a BPF hook. By using bpf_link_prime() and
3171  * bpf_link_settle(), kernel code using bpf_link doesn't have to perform expensive
3172  * (and potentially failing) roll-back operations in the rare case that the file,
3173  * FD, or ID can't be allocated; see the usage sketch after bpf_link_settle().
3174  */
3175 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
3176 {
3177         struct file *file;
3178         int fd, id;
3179
3180         fd = get_unused_fd_flags(O_CLOEXEC);
3181         if (fd < 0)
3182                 return fd;
3183
3185         id = bpf_link_alloc_id(link);
3186         if (id < 0) {
3187                 put_unused_fd(fd);
3188                 return id;
3189         }
3190
3191         file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
3192         if (IS_ERR(file)) {
3193                 bpf_link_free_id(id);
3194                 put_unused_fd(fd);
3195                 return PTR_ERR(file);
3196         }
3197
3198         primer->link = link;
3199         primer->file = file;
3200         primer->fd = fd;
3201         primer->id = id;
3202         return 0;
3203 }
3204
3205 int bpf_link_settle(struct bpf_link_primer *primer)
3206 {
3207         /* make bpf_link fetchable by ID */
3208         spin_lock_bh(&link_idr_lock);
3209         primer->link->id = primer->id;
3210         spin_unlock_bh(&link_idr_lock);
3211         /* make bpf_link fetchable by FD */
3212         fd_install(primer->fd, primer->file);
3213         /* pass through installed FD */
3214         return primer->fd;
3215 }
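
/* Typical use of the primer API (sketch; this mirrors bpf_raw_tp_link_attach()
 * and the other attach paths in this file, with the error labels flattened):
 *
 *	bpf_link_init(&link->link, BPF_LINK_TYPE_..., &..._lops, prog);
 *	err = bpf_link_prime(&link->link, &link_primer);
 *	if (err) {
 *		kfree(link);
 *		return err;
 *	}
 *	err = <attach link->link to the actual hook>;
 *	if (err) {
 *		bpf_link_cleanup(&link_primer);	// frees link via dealloc callback
 *		return err;
 *	}
 *	return bpf_link_settle(&link_primer);	// install FD/ID, return the FD
 */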
3216
3217 int bpf_link_new_fd(struct bpf_link *link)
3218 {
3219         return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
3220 }
3221
3222 struct bpf_link *bpf_link_get_from_fd(u32 ufd)
3223 {
3224         struct fd f = fdget(ufd);
3225         struct bpf_link *link;
3226
3227         if (!f.file)
3228                 return ERR_PTR(-EBADF);
3229         if (f.file->f_op != &bpf_link_fops) {
3230                 fdput(f);
3231                 return ERR_PTR(-EINVAL);
3232         }
3233
3234         link = f.file->private_data;
3235         bpf_link_inc(link);
3236         fdput(f);
3237
3238         return link;
3239 }
3240 EXPORT_SYMBOL(bpf_link_get_from_fd);
3241
3242 static void bpf_tracing_link_release(struct bpf_link *link)
3243 {
3244         struct bpf_tracing_link *tr_link =
3245                 container_of(link, struct bpf_tracing_link, link.link);
3246
3247         WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
3248                                                 tr_link->trampoline));
3249
3250         bpf_trampoline_put(tr_link->trampoline);
3251
3252         /* tgt_prog is NULL if target is a kernel function */
3253         if (tr_link->tgt_prog)
3254                 bpf_prog_put(tr_link->tgt_prog);
3255 }
3256
3257 static void bpf_tracing_link_dealloc(struct bpf_link *link)
3258 {
3259         struct bpf_tracing_link *tr_link =
3260                 container_of(link, struct bpf_tracing_link, link.link);
3261
3262         kfree(tr_link);
3263 }
3264
3265 static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
3266                                          struct seq_file *seq)
3267 {
3268         struct bpf_tracing_link *tr_link =
3269                 container_of(link, struct bpf_tracing_link, link.link);
3270         u32 target_btf_id, target_obj_id;
3271
3272         bpf_trampoline_unpack_key(tr_link->trampoline->key,
3273                                   &target_obj_id, &target_btf_id);
3274         seq_printf(seq,
3275                    "attach_type:\t%d\n"
3276                    "target_obj_id:\t%u\n"
3277                    "target_btf_id:\t%u\n",
3278                    tr_link->attach_type,
3279                    target_obj_id,
3280                    target_btf_id);
3281 }
3282
3283 static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
3284                                            struct bpf_link_info *info)
3285 {
3286         struct bpf_tracing_link *tr_link =
3287                 container_of(link, struct bpf_tracing_link, link.link);
3288
3289         info->tracing.attach_type = tr_link->attach_type;
3290         bpf_trampoline_unpack_key(tr_link->trampoline->key,
3291                                   &info->tracing.target_obj_id,
3292                                   &info->tracing.target_btf_id);
3293
3294         return 0;
3295 }
3296
3297 static const struct bpf_link_ops bpf_tracing_link_lops = {
3298         .release = bpf_tracing_link_release,
3299         .dealloc = bpf_tracing_link_dealloc,
3300         .show_fdinfo = bpf_tracing_link_show_fdinfo,
3301         .fill_link_info = bpf_tracing_link_fill_link_info,
3302 };
3303
3304 static int bpf_tracing_prog_attach(struct bpf_prog *prog,
3305                                    int tgt_prog_fd,
3306                                    u32 btf_id,
3307                                    u64 bpf_cookie)
3308 {
3309         struct bpf_link_primer link_primer;
3310         struct bpf_prog *tgt_prog = NULL;
3311         struct bpf_trampoline *tr = NULL;
3312         struct bpf_tracing_link *link;
3313         u64 key = 0;
3314         int err;
3315
3316         switch (prog->type) {
3317         case BPF_PROG_TYPE_TRACING:
3318                 if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
3319                     prog->expected_attach_type != BPF_TRACE_FEXIT &&
3320                     prog->expected_attach_type != BPF_MODIFY_RETURN) {
3321                         err = -EINVAL;
3322                         goto out_put_prog;
3323                 }
3324                 break;
3325         case BPF_PROG_TYPE_EXT:
3326                 if (prog->expected_attach_type != 0) {
3327                         err = -EINVAL;
3328                         goto out_put_prog;
3329                 }
3330                 break;
3331         case BPF_PROG_TYPE_LSM:
3332                 if (prog->expected_attach_type != BPF_LSM_MAC) {
3333                         err = -EINVAL;
3334                         goto out_put_prog;
3335                 }
3336                 break;
3337         default:
3338                 err = -EINVAL;
3339                 goto out_put_prog;
3340         }
3341
3342         if (!!tgt_prog_fd != !!btf_id) {
3343                 err = -EINVAL;
3344                 goto out_put_prog;
3345         }
3346
3347         if (tgt_prog_fd) {
3348                 /*
3349                  * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this is
3350                  * ever changed to implement the same for
3351                  * BPF_PROG_TYPE_TRACING, do not forget to update the way the
3352                  * attach_tracing_prog flag is set.
3353                  */
3354                 if (prog->type != BPF_PROG_TYPE_EXT) {
3355                         err = -EINVAL;
3356                         goto out_put_prog;
3357                 }
3358
3359                 tgt_prog = bpf_prog_get(tgt_prog_fd);
3360                 if (IS_ERR(tgt_prog)) {
3361                         err = PTR_ERR(tgt_prog);
3362                         tgt_prog = NULL;
3363                         goto out_put_prog;
3364                 }
3365
3366                 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
3367         }
3368
3369         link = kzalloc(sizeof(*link), GFP_USER);
3370         if (!link) {
3371                 err = -ENOMEM;
3372                 goto out_put_prog;
3373         }
3374         bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
3375                       &bpf_tracing_link_lops, prog);
3376         link->attach_type = prog->expected_attach_type;
3377         link->link.cookie = bpf_cookie;
3378
3379         mutex_lock(&prog->aux->dst_mutex);
3380
3381         /* There are a few possible cases here:
3382          *
3383          * - if prog->aux->dst_trampoline is set, the program was just loaded
3384          *   and not yet attached to anything, so we can use the values stored
3385          *   in prog->aux
3386          *
3387          * - if prog->aux->dst_trampoline is NULL, the program has already been
3388          *   attached to a target and its initial target was cleared (below)
3389          *
3390          * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
3391          *   target_btf_id using the link_create API.
3392          *
3393          * - if tgt_prog == NULL, this function was called using the old
3394          *   raw_tracepoint_open API, and we need a target from prog->aux
3395          *
3396          * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
3397          *   was detached and is going for re-attachment.
3398          *
3399          * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
3400          *   are NULL, then the program was already attached and the user did not
3401          *   provide tgt_prog_fd, so we have no way to find or create a trampoline
3402          */
3403         if (!prog->aux->dst_trampoline && !tgt_prog) {
3404                 /*
3405                  * Allow re-attach for TRACING and LSM programs. If it's
3406                  * currently linked, bpf_trampoline_link_prog will fail.
3407                  * EXT programs need to specify tgt_prog_fd, so they
3408                  * re-attach in separate code path.
3409                  * re-attach in a separate code path.
3410                 if (prog->type != BPF_PROG_TYPE_TRACING &&
3411                     prog->type != BPF_PROG_TYPE_LSM) {
3412                         err = -EINVAL;
3413                         goto out_unlock;
3414                 }
3415                 /* We can allow re-attach only if we have valid attach_btf. */
3416                 if (!prog->aux->attach_btf) {
3417                         err = -EINVAL;
3418                         goto out_unlock;
3419                 }
3420                 btf_id = prog->aux->attach_btf_id;
3421                 key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
3422         }
3423
3424         if (!prog->aux->dst_trampoline ||
3425             (key && key != prog->aux->dst_trampoline->key)) {
3426                 /* If there is no saved target, or the specified target is
3427                  * different from the destination specified at load time, we
3428                  * need a new trampoline and a check for compatibility
3429                  */
3430                 struct bpf_attach_target_info tgt_info = {};
3431
3432                 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
3433                                               &tgt_info);
3434                 if (err)
3435                         goto out_unlock;
3436
3437                 if (tgt_info.tgt_mod) {
3438                         module_put(prog->aux->mod);
3439                         prog->aux->mod = tgt_info.tgt_mod;
3440                 }
3441
3442                 tr = bpf_trampoline_get(key, &tgt_info);
3443                 if (!tr) {
3444                         err = -ENOMEM;
3445                         goto out_unlock;
3446                 }
3447         } else {
3448                 /* The caller didn't specify a target, or the target was the
3449                  * same as the destination supplied during program load. This
3450                  * means we can reuse the trampoline and reference from program
3451                  * load time, and there is no need to allocate a new one. This
3452                  * can only happen once for any program, as the saved values in
3453                  * prog->aux are cleared below.
3454                  */
3455                 tr = prog->aux->dst_trampoline;
3456                 tgt_prog = prog->aux->dst_prog;
3457         }
3458
3459         err = bpf_link_prime(&link->link.link, &link_primer);
3460         if (err)
3461                 goto out_unlock;
3462
3463         err = bpf_trampoline_link_prog(&link->link, tr);
3464         if (err) {
3465                 bpf_link_cleanup(&link_primer);
3466                 link = NULL;
3467                 goto out_unlock;
3468         }
3469
3470         link->tgt_prog = tgt_prog;
3471         link->trampoline = tr;
3472
3473         /* Always clear the trampoline and target prog from prog->aux to make
3474          * sure the original attach destination is not kept alive after a
3475          * program is (re-)attached to another target.
3476          */
3477         if (prog->aux->dst_prog &&
3478             (tgt_prog_fd || tr != prog->aux->dst_trampoline))
3479                 /* got extra prog ref from syscall, or attaching to different prog */
3480                 bpf_prog_put(prog->aux->dst_prog);
3481         if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
3482                 /* we allocated a new trampoline, so free the old one */
3483                 bpf_trampoline_put(prog->aux->dst_trampoline);
3484
3485         prog->aux->dst_prog = NULL;
3486         prog->aux->dst_trampoline = NULL;
3487         mutex_unlock(&prog->aux->dst_mutex);
3488
3489         return bpf_link_settle(&link_primer);
3490 out_unlock:
3491         if (tr && tr != prog->aux->dst_trampoline)
3492                 bpf_trampoline_put(tr);
3493         mutex_unlock(&prog->aux->dst_mutex);
3494         kfree(link);
3495 out_put_prog:
3496         if (tgt_prog_fd && tgt_prog)
3497                 bpf_prog_put(tgt_prog);
3498         return err;
3499 }
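
/* User-space sketch (illustrative): attaching a BPF_PROG_TYPE_EXT ("freplace")
 * program to a function of another, already loaded program via the link_create
 * command, which (for these program types) reaches bpf_tracing_prog_attach()
 * above with tgt_prog_fd and btf_id taken from the attr. ext_prog_fd,
 * tgt_prog_fd and btf_id are hypothetical; error handling is omitted.
 *
 *	union bpf_attr attr = {};
 *
 *	attr.link_create.prog_fd       = ext_prog_fd;	// the EXT program
 *	attr.link_create.target_fd     = tgt_prog_fd;	// program being extended
 *	attr.link_create.target_btf_id = btf_id;	// function in target's BTF
 *	link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
 */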
3500
3501 struct bpf_raw_tp_link {
3502         struct bpf_link link;
3503         struct bpf_raw_event_map *btp;
3504 };
3505
3506 static void bpf_raw_tp_link_release(struct bpf_link *link)
3507 {
3508         struct bpf_raw_tp_link *raw_tp =
3509                 container_of(link, struct bpf_raw_tp_link, link);
3510
3511         bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
3512         bpf_put_raw_tracepoint(raw_tp->btp);
3513 }
3514
3515 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
3516 {
3517         struct bpf_raw_tp_link *raw_tp =
3518                 container_of(link, struct bpf_raw_tp_link, link);
3519
3520         kfree(raw_tp);
3521 }
3522
3523 static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
3524                                         struct seq_file *seq)
3525 {
3526         struct bpf_raw_tp_link *raw_tp_link =
3527                 container_of(link, struct bpf_raw_tp_link, link);
3528
3529         seq_printf(seq,
3530                    "tp_name:\t%s\n",
3531                    raw_tp_link->btp->tp->name);
3532 }
3533
3534 static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
3535                             u32 len)
3536 {
3537         if (ulen >= len + 1) {
3538                 if (copy_to_user(ubuf, buf, len + 1))
3539                         return -EFAULT;
3540         } else {
3541                 char zero = '\0';
3542
3543                 if (copy_to_user(ubuf, buf, ulen - 1))
3544                         return -EFAULT;
3545                 if (put_user(zero, ubuf + ulen - 1))
3546                         return -EFAULT;
3547                 return -ENOSPC;
3548         }
3549
3550         return 0;
3551 }
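
/* Worked example of the truncation semantics above: for buf = "sched_switch"
 * (len = 12) and a user buffer of ulen = 8, the first 7 characters plus a
 * terminating NUL are copied and -ENOSPC is returned. Callers such as
 * bpf_raw_tp_link_fill_link_info() below still report the full required size
 * (len + 1) in the corresponding *_len field, so user-space can retry with a
 * larger buffer.
 */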
3552
3553 static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
3554                                           struct bpf_link_info *info)
3555 {
3556         struct bpf_raw_tp_link *raw_tp_link =
3557                 container_of(link, struct bpf_raw_tp_link, link);
3558         char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
3559         const char *tp_name = raw_tp_link->btp->tp->name;
3560         u32 ulen = info->raw_tracepoint.tp_name_len;
3561         size_t tp_len = strlen(tp_name);
3562
3563         if (!ulen ^ !ubuf)
3564                 return -EINVAL;
3565
3566         info->raw_tracepoint.tp_name_len = tp_len + 1;
3567
3568         if (!ubuf)
3569                 return 0;
3570
3571         return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
3572 }
3573
3574 static const struct bpf_link_ops bpf_raw_tp_link_lops = {
3575         .release = bpf_raw_tp_link_release,
3576         .dealloc_deferred = bpf_raw_tp_link_dealloc,
3577         .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
3578         .fill_link_info = bpf_raw_tp_link_fill_link_info,
3579 };
3580
3581 #ifdef CONFIG_PERF_EVENTS
3582 struct bpf_perf_link {
3583         struct bpf_link link;
3584         struct file *perf_file;
3585 };
3586
3587 static void bpf_perf_link_release(struct bpf_link *link)
3588 {
3589         struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3590         struct perf_event *event = perf_link->perf_file->private_data;
3591
3592         perf_event_free_bpf_prog(event);
3593         fput(perf_link->perf_file);
3594 }
3595
3596 static void bpf_perf_link_dealloc(struct bpf_link *link)
3597 {
3598         struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3599
3600         kfree(perf_link);
3601 }
3602
3603 static int bpf_perf_link_fill_common(const struct perf_event *event,
3604                                      char __user *uname, u32 ulen,
3605                                      u64 *probe_offset, u64 *probe_addr,
3606                                      u32 *fd_type, unsigned long *missed)
3607 {
3608         const char *buf;
3609         u32 prog_id;
3610         size_t len;
3611         int err;
3612
3613         if (!ulen ^ !uname)
3614                 return -EINVAL;
3615
3616         err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
3617                                       probe_offset, probe_addr, missed);
3618         if (err)
3619                 return err;
3620         if (!uname)
3621                 return 0;
3622         if (buf) {
3623                 len = strlen(buf);
3624                 err = bpf_copy_to_user(uname, buf, ulen, len);
3625                 if (err)
3626                         return err;
3627         } else {
3628                 char zero = '\0';
3629
3630                 if (put_user(zero, uname))
3631                         return -EFAULT;
3632         }
3633         return 0;
3634 }
3635
3636 #ifdef CONFIG_KPROBE_EVENTS
3637 static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
3638                                      struct bpf_link_info *info)
3639 {
3640         unsigned long missed;
3641         char __user *uname;
3642         u64 addr, offset;
3643         u32 ulen, type;
3644         int err;
3645
3646         uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
3647         ulen = info->perf_event.kprobe.name_len;
3648         err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3649                                         &type, &missed);
3650         if (err)
3651                 return err;
3652         if (type == BPF_FD_TYPE_KRETPROBE)
3653                 info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
3654         else
3655                 info->perf_event.type = BPF_PERF_EVENT_KPROBE;
3656
3657         info->perf_event.kprobe.offset = offset;
3658         info->perf_event.kprobe.missed = missed;
3659         if (!kallsyms_show_value(current_cred()))
3660                 addr = 0;
3661         info->perf_event.kprobe.addr = addr;
3662         info->perf_event.kprobe.cookie = event->bpf_cookie;
3663         return 0;
3664 }
3665 #endif
3666
3667 #ifdef CONFIG_UPROBE_EVENTS
3668 static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
3669                                      struct bpf_link_info *info)
3670 {
3671         char __user *uname;
3672         u64 addr, offset;
3673         u32 ulen, type;
3674         int err;
3675
3676         uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
3677         ulen = info->perf_event.uprobe.name_len;
3678         err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
3679                                         &type, NULL);
3680         if (err)
3681                 return err;
3682
3683         if (type == BPF_FD_TYPE_URETPROBE)
3684                 info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
3685         else
3686                 info->perf_event.type = BPF_PERF_EVENT_UPROBE;
3687         info->perf_event.uprobe.offset = offset;
3688         info->perf_event.uprobe.cookie = event->bpf_cookie;
3689         return 0;
3690 }
3691 #endif
3692
3693 static int bpf_perf_link_fill_probe(const struct perf_event *event,
3694                                     struct bpf_link_info *info)
3695 {
3696 #ifdef CONFIG_KPROBE_EVENTS
3697         if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
3698                 return bpf_perf_link_fill_kprobe(event, info);
3699 #endif
3700 #ifdef CONFIG_UPROBE_EVENTS
3701         if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
3702                 return bpf_perf_link_fill_uprobe(event, info);
3703 #endif
3704         return -EOPNOTSUPP;
3705 }
3706
3707 static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
3708                                          struct bpf_link_info *info)
3709 {
3710         char __user *uname;
3711         u32 ulen;
3712
3713         uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
3714         ulen = info->perf_event.tracepoint.name_len;
3715         info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
3716         info->perf_event.tracepoint.cookie = event->bpf_cookie;
3717         return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
3718 }
3719
3720 static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
3721                                          struct bpf_link_info *info)
3722 {
3723         info->perf_event.event.type = event->attr.type;
3724         info->perf_event.event.config = event->attr.config;
3725         info->perf_event.event.cookie = event->bpf_cookie;
3726         info->perf_event.type = BPF_PERF_EVENT_EVENT;
3727         return 0;
3728 }
3729
3730 static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
3731                                         struct bpf_link_info *info)
3732 {
3733         struct bpf_perf_link *perf_link;
3734         const struct perf_event *event;
3735
3736         perf_link = container_of(link, struct bpf_perf_link, link);
3737         event = perf_get_event(perf_link->perf_file);
3738         if (IS_ERR(event))
3739                 return PTR_ERR(event);
3740
3741         switch (event->prog->type) {
3742         case BPF_PROG_TYPE_PERF_EVENT:
3743                 return bpf_perf_link_fill_perf_event(event, info);
3744         case BPF_PROG_TYPE_TRACEPOINT:
3745                 return bpf_perf_link_fill_tracepoint(event, info);
3746         case BPF_PROG_TYPE_KPROBE:
3747                 return bpf_perf_link_fill_probe(event, info);
3748         default:
3749                 return -EOPNOTSUPP;
3750         }
3751 }
3752
3753 static const struct bpf_link_ops bpf_perf_link_lops = {
3754         .release = bpf_perf_link_release,
3755         .dealloc = bpf_perf_link_dealloc,
3756         .fill_link_info = bpf_perf_link_fill_link_info,
3757 };
3758
3759 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3760 {
3761         struct bpf_link_primer link_primer;
3762         struct bpf_perf_link *link;
3763         struct perf_event *event;
3764         struct file *perf_file;
3765         int err;
3766
3767         if (attr->link_create.flags)
3768                 return -EINVAL;
3769
3770         perf_file = perf_event_get(attr->link_create.target_fd);
3771         if (IS_ERR(perf_file))
3772                 return PTR_ERR(perf_file);
3773
3774         link = kzalloc(sizeof(*link), GFP_USER);
3775         if (!link) {
3776                 err = -ENOMEM;
3777                 goto out_put_file;
3778         }
3779         bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
3780         link->perf_file = perf_file;
3781
3782         err = bpf_link_prime(&link->link, &link_primer);
3783         if (err) {
3784                 kfree(link);
3785                 goto out_put_file;
3786         }
3787
3788         event = perf_file->private_data;
3789         err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
3790         if (err) {
3791                 bpf_link_cleanup(&link_primer);
3792                 goto out_put_file;
3793         }
3794         /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
3795         bpf_prog_inc(prog);
3796
3797         return bpf_link_settle(&link_primer);
3798
3799 out_put_file:
3800         fput(perf_file);
3801         return err;
3802 }
3803 #else
3804 static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3805 {
3806         return -EOPNOTSUPP;
3807 }
3808 #endif /* CONFIG_PERF_EVENTS */
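
/* User-space sketch (illustrative): wrapping a perf event FD (e.g. returned by
 * perf_event_open() for a kprobe or tracepoint) and a loaded program into a
 * BPF link, as handled by bpf_perf_link_attach() above. prog_fd, perf_event_fd
 * and cookie are hypothetical; error handling is omitted.
 *
 *	union bpf_attr attr = {};
 *
 *	attr.link_create.prog_fd               = prog_fd;
 *	attr.link_create.target_fd             = perf_event_fd;
 *	attr.link_create.attach_type           = BPF_PERF_EVENT;
 *	attr.link_create.perf_event.bpf_cookie = cookie;	// optional u64
 *	link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
 */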
3809
3810 static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
3811                                   const char __user *user_tp_name)
3812 {
3813         struct bpf_link_primer link_primer;
3814         struct bpf_raw_tp_link *link;
3815         struct bpf_raw_event_map *btp;
3816         const char *tp_name;
3817         char buf[128];
3818         int err;
3819
3820         switch (prog->type) {
3821         case BPF_PROG_TYPE_TRACING:
3822         case BPF_PROG_TYPE_EXT:
3823         case BPF_PROG_TYPE_LSM:
3824                 if (user_tp_name)
3825                         /* The attach point for this category of programs
3826                          * should be specified via btf_id during program load.
3827                          */
3828                         return -EINVAL;
3829                 if (prog->type == BPF_PROG_TYPE_TRACING &&
3830                     prog->expected_attach_type == BPF_TRACE_RAW_TP) {
3831                         tp_name = prog->aux->attach_func_name;
3832                         break;
3833                 }
3834                 return bpf_tracing_prog_attach(prog, 0, 0, 0);
3835         case BPF_PROG_TYPE_RAW_TRACEPOINT:
3836         case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
3837                 if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
3838                         return -EFAULT;
3839                 buf[sizeof(buf) - 1] = 0;
3840                 tp_name = buf;
3841                 break;
3842         default:
3843                 return -EINVAL;
3844         }
3845
3846         btp = bpf_get_raw_tracepoint(tp_name);
3847         if (!btp)
3848                 return -ENOENT;
3849
3850         link = kzalloc(sizeof(*link), GFP_USER);
3851         if (!link) {
3852                 err = -ENOMEM;
3853                 goto out_put_btp;
3854         }
3855         bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
3856                       &bpf_raw_tp_link_lops, prog);
3857         link->btp = btp;
3858
3859         err = bpf_link_prime(&link->link, &link_primer);
3860         if (err) {
3861                 kfree(link);
3862                 goto out_put_btp;
3863         }
3864
3865         err = bpf_probe_register(link->btp, prog);
3866         if (err) {
3867                 bpf_link_cleanup(&link_primer);
3868                 goto out_put_btp;
3869         }
3870
3871         return bpf_link_settle(&link_primer);
3872
3873 out_put_btp:
3874         bpf_put_raw_tracepoint(btp);
3875         return err;
3876 }
3877
3878 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
3879
3880 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
3881 {
3882         struct bpf_prog *prog;
3883         int fd;
3884
3885         if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
3886                 return -EINVAL;
3887
3888         prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
3889         if (IS_ERR(prog))
3890                 return PTR_ERR(prog);
3891
3892         fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
3893         if (fd < 0)
3894                 bpf_prog_put(prog);
3895         return fd;
3896 }
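
/* User-space sketch (illustrative): the legacy raw_tracepoint_open command
 * handled above. "sched_switch" is just an example tracepoint name; the
 * returned FD is a bpf_link FD, and closing it detaches the program.
 *
 *	union bpf_attr attr = {};
 *
 *	attr.raw_tracepoint.name    = (__u64)(unsigned long)"sched_switch";
 *	attr.raw_tracepoint.prog_fd = prog_fd;
 *	link_fd = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
 */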
3897
3898 static enum bpf_prog_type
3899 attach_type_to_prog_type(enum bpf_attach_type attach_type)
3900 {
3901         switch (attach_type) {
3902         case BPF_CGROUP_INET_INGRESS:
3903         case BPF_CGROUP_INET_EGRESS:
3904                 return BPF_PROG_TYPE_CGROUP_SKB;
3905         case BPF_CGROUP_INET_SOCK_CREATE:
3906         case BPF_CGROUP_INET_SOCK_RELEASE:
3907         case BPF_CGROUP_INET4_POST_BIND:
3908         case BPF_CGROUP_INET6_POST_BIND:
3909                 return BPF_PROG_TYPE_CGROUP_SOCK;
3910         case BPF_CGROUP_INET4_BIND:
3911         case BPF_CGROUP_INET6_BIND:
3912         case BPF_CGROUP_INET4_CONNECT:
3913         case BPF_CGROUP_INET6_CONNECT:
3914         case BPF_CGROUP_UNIX_CONNECT:
3915         case BPF_CGROUP_INET4_GETPEERNAME:
3916         case BPF_CGROUP_INET6_GETPEERNAME:
3917         case BPF_CGROUP_UNIX_GETPEERNAME:
3918         case BPF_CGROUP_INET4_GETSOCKNAME:
3919         case BPF_CGROUP_INET6_GETSOCKNAME:
3920         case BPF_CGROUP_UNIX_GETSOCKNAME:
3921         case BPF_CGROUP_UDP4_SENDMSG:
3922         case BPF_CGROUP_UDP6_SENDMSG:
3923         case BPF_CGROUP_UNIX_SENDMSG:
3924         case BPF_CGROUP_UDP4_RECVMSG:
3925         case BPF_CGROUP_UDP6_RECVMSG:
3926         case BPF_CGROUP_UNIX_RECVMSG:
3927                 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
3928         case BPF_CGROUP_SOCK_OPS:
3929                 return BPF_PROG_TYPE_SOCK_OPS;
3930         case BPF_CGROUP_DEVICE:
3931                 return BPF_PROG_TYPE_CGROUP_DEVICE;
3932         case BPF_SK_MSG_VERDICT:
3933                 return BPF_PROG_TYPE_SK_MSG;
3934         case BPF_SK_SKB_STREAM_PARSER:
3935         case BPF_SK_SKB_STREAM_VERDICT:
3936         case BPF_SK_SKB_VERDICT:
3937                 return BPF_PROG_TYPE_SK_SKB;
3938         case BPF_LIRC_MODE2:
3939                 return BPF_PROG_TYPE_LIRC_MODE2;
3940         case BPF_FLOW_DISSECTOR:
3941                 return BPF_PROG_TYPE_FLOW_DISSECTOR;
3942         case BPF_CGROUP_SYSCTL:
3943                 return BPF_PROG_TYPE_CGROUP_SYSCTL;
3944         case BPF_CGROUP_GETSOCKOPT:
3945         case BPF_CGROUP_SETSOCKOPT:
3946                 return BPF_PROG_TYPE_CGROUP_SOCKOPT;
3947         case BPF_TRACE_ITER:
3948         case BPF_TRACE_RAW_TP:
3949         case BPF_TRACE_FENTRY:
3950         case BPF_TRACE_FEXIT:
3951         case BPF_MODIFY_RETURN:
3952                 return BPF_PROG_TYPE_TRACING;
3953         case BPF_LSM_MAC:
3954                 return BPF_PROG_TYPE_LSM;
3955         case BPF_SK_LOOKUP:
3956                 return BPF_PROG_TYPE_SK_LOOKUP;
3957         case BPF_XDP:
3958                 return BPF_PROG_TYPE_XDP;
3959         case BPF_LSM_CGROUP:
3960                 return BPF_PROG_TYPE_LSM;
3961         case BPF_TCX_INGRESS:
3962         case BPF_TCX_EGRESS:
3963         case BPF_NETKIT_PRIMARY:
3964         case BPF_NETKIT_PEER:
3965                 return BPF_PROG_TYPE_SCHED_CLS;
3966         default:
3967                 return BPF_PROG_TYPE_UNSPEC;
3968         }
3969 }
3970
3971 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
3972                                              enum bpf_attach_type attach_type)
3973 {
3974         enum bpf_prog_type ptype;
3975
3976         switch (prog->type) {
3977         case BPF_PROG_TYPE_CGROUP_SOCK:
3978         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3979         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3980         case BPF_PROG_TYPE_SK_LOOKUP:
3981                 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
3982         case BPF_PROG_TYPE_CGROUP_SKB:
3983                 if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
3984                         /* cg-skb progs can be loaded by unpriv user.
3985                          * check permissions at attach time.
3986                          */
3987                         return -EPERM;
3988                 return prog->enforce_expected_attach_type &&
3989                         prog->expected_attach_type != attach_type ?
3990                         -EINVAL : 0;
3991         case BPF_PROG_TYPE_EXT:
3992                 return 0;
3993         case BPF_PROG_TYPE_NETFILTER:
3994                 if (attach_type != BPF_NETFILTER)
3995                         return -EINVAL;
3996                 return 0;
3997         case BPF_PROG_TYPE_PERF_EVENT:
3998         case BPF_PROG_TYPE_TRACEPOINT:
3999                 if (attach_type != BPF_PERF_EVENT)
4000                         return -EINVAL;
4001                 return 0;
4002         case BPF_PROG_TYPE_KPROBE:
4003                 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
4004                     attach_type != BPF_TRACE_KPROBE_MULTI)
4005                         return -EINVAL;
4006                 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
4007                     attach_type != BPF_TRACE_UPROBE_MULTI)
4008                         return -EINVAL;
4009                 if (attach_type != BPF_PERF_EVENT &&
4010                     attach_type != BPF_TRACE_KPROBE_MULTI &&
4011                     attach_type != BPF_TRACE_UPROBE_MULTI)
4012                         return -EINVAL;
4013                 return 0;
4014         case BPF_PROG_TYPE_SCHED_CLS:
4015                 if (attach_type != BPF_TCX_INGRESS &&
4016                     attach_type != BPF_TCX_EGRESS &&
4017                     attach_type != BPF_NETKIT_PRIMARY &&
4018                     attach_type != BPF_NETKIT_PEER)
4019                         return -EINVAL;
4020                 return 0;
4021         default:
4022                 ptype = attach_type_to_prog_type(attach_type);
4023                 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
4024                         return -EINVAL;
4025                 return 0;
4026         }
4027 }
4028
4029 #define BPF_PROG_ATTACH_LAST_FIELD expected_revision
4030
4031 #define BPF_F_ATTACH_MASK_BASE  \
4032         (BPF_F_ALLOW_OVERRIDE | \
4033          BPF_F_ALLOW_MULTI |    \
4034          BPF_F_REPLACE)
4035
4036 #define BPF_F_ATTACH_MASK_MPROG \
4037         (BPF_F_REPLACE |        \
4038          BPF_F_BEFORE |         \
4039          BPF_F_AFTER |          \
4040          BPF_F_ID |             \
4041          BPF_F_LINK)
4042
4043 static int bpf_prog_attach(const union bpf_attr *attr)
4044 {
4045         enum bpf_prog_type ptype;
4046         struct bpf_prog *prog;
4047         int ret;
4048
4049         if (CHECK_ATTR(BPF_PROG_ATTACH))
4050                 return -EINVAL;
4051
4052         ptype = attach_type_to_prog_type(attr->attach_type);
4053         if (ptype == BPF_PROG_TYPE_UNSPEC)
4054                 return -EINVAL;
4055         if (bpf_mprog_supported(ptype)) {
4056                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
4057                         return -EINVAL;
4058         } else {
4059                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
4060                         return -EINVAL;
4061                 if (attr->relative_fd ||
4062                     attr->expected_revision)
4063                         return -EINVAL;
4064         }
4065
4066         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
4067         if (IS_ERR(prog))
4068                 return PTR_ERR(prog);
4069
4070         if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
4071                 bpf_prog_put(prog);
4072                 return -EINVAL;
4073         }
4074
4075         switch (ptype) {
4076         case BPF_PROG_TYPE_SK_SKB:
4077         case BPF_PROG_TYPE_SK_MSG:
4078                 ret = sock_map_get_from_fd(attr, prog);
4079                 break;
4080         case BPF_PROG_TYPE_LIRC_MODE2:
4081                 ret = lirc_prog_attach(attr, prog);
4082                 break;
4083         case BPF_PROG_TYPE_FLOW_DISSECTOR:
4084                 ret = netns_bpf_prog_attach(attr, prog);
4085                 break;
4086         case BPF_PROG_TYPE_CGROUP_DEVICE:
4087         case BPF_PROG_TYPE_CGROUP_SKB:
4088         case BPF_PROG_TYPE_CGROUP_SOCK:
4089         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4090         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4091         case BPF_PROG_TYPE_CGROUP_SYSCTL:
4092         case BPF_PROG_TYPE_SOCK_OPS:
4093         case BPF_PROG_TYPE_LSM:
4094                 if (ptype == BPF_PROG_TYPE_LSM &&
4095                     prog->expected_attach_type != BPF_LSM_CGROUP)
4096                         ret = -EINVAL;
4097                 else
4098                         ret = cgroup_bpf_prog_attach(attr, ptype, prog);
4099                 break;
4100         case BPF_PROG_TYPE_SCHED_CLS:
4101                 if (attr->attach_type == BPF_TCX_INGRESS ||
4102                     attr->attach_type == BPF_TCX_EGRESS)
4103                         ret = tcx_prog_attach(attr, prog);
4104                 else
4105                         ret = netkit_prog_attach(attr, prog);
4106                 break;
4107         default:
4108                 ret = -EINVAL;
4109         }
4110
4111         if (ret)
4112                 bpf_prog_put(prog);
4113         return ret;
4114 }
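
/* User-space sketch (illustrative): legacy BPF_PROG_ATTACH of a cgroup-skb
 * program to a cgroup directory FD, allowing multiple programs on the hook.
 * cgroup_fd and prog_fd are hypothetical; error handling is omitted.
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd     = cgroup_fd;		// open()ed cgroup v2 directory
 *	attr.attach_bpf_fd = prog_fd;
 *	attr.attach_type   = BPF_CGROUP_INET_INGRESS;
 *	attr.attach_flags  = BPF_F_ALLOW_MULTI;
 *	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 */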
4115
4116 #define BPF_PROG_DETACH_LAST_FIELD expected_revision
4117
4118 static int bpf_prog_detach(const union bpf_attr *attr)
4119 {
4120         struct bpf_prog *prog = NULL;
4121         enum bpf_prog_type ptype;
4122         int ret;
4123
4124         if (CHECK_ATTR(BPF_PROG_DETACH))
4125                 return -EINVAL;
4126
4127         ptype = attach_type_to_prog_type(attr->attach_type);
4128         if (bpf_mprog_supported(ptype)) {
4129                 if (ptype == BPF_PROG_TYPE_UNSPEC)
4130                         return -EINVAL;
4131                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
4132                         return -EINVAL;
4133                 if (attr->attach_bpf_fd) {
4134                         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
4135                         if (IS_ERR(prog))
4136                                 return PTR_ERR(prog);
4137                 }
4138         } else if (attr->attach_flags ||
4139                    attr->relative_fd ||
4140                    attr->expected_revision) {
4141                 return -EINVAL;
4142         }
4143
4144         switch (ptype) {
4145         case BPF_PROG_TYPE_SK_MSG:
4146         case BPF_PROG_TYPE_SK_SKB:
4147                 ret = sock_map_prog_detach(attr, ptype);
4148                 break;
4149         case BPF_PROG_TYPE_LIRC_MODE2:
4150                 ret = lirc_prog_detach(attr);
4151                 break;
4152         case BPF_PROG_TYPE_FLOW_DISSECTOR:
4153                 ret = netns_bpf_prog_detach(attr, ptype);
4154                 break;
4155         case BPF_PROG_TYPE_CGROUP_DEVICE:
4156         case BPF_PROG_TYPE_CGROUP_SKB:
4157         case BPF_PROG_TYPE_CGROUP_SOCK:
4158         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4159         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4160         case BPF_PROG_TYPE_CGROUP_SYSCTL:
4161         case BPF_PROG_TYPE_SOCK_OPS:
4162         case BPF_PROG_TYPE_LSM:
4163                 ret = cgroup_bpf_prog_detach(attr, ptype);
4164                 break;
4165         case BPF_PROG_TYPE_SCHED_CLS:
4166                 if (attr->attach_type == BPF_TCX_INGRESS ||
4167                     attr->attach_type == BPF_TCX_EGRESS)
4168                         ret = tcx_prog_detach(attr, prog);
4169                 else
4170                         ret = netkit_prog_detach(attr, prog);
4171                 break;
4172         default:
4173                 ret = -EINVAL;
4174         }
4175
4176         if (prog)
4177                 bpf_prog_put(prog);
4178         return ret;
4179 }
4180
4181 #define BPF_PROG_QUERY_LAST_FIELD query.revision
4182
4183 static int bpf_prog_query(const union bpf_attr *attr,
4184                           union bpf_attr __user *uattr)
4185 {
4186         if (!bpf_net_capable())
4187                 return -EPERM;
4188         if (CHECK_ATTR(BPF_PROG_QUERY))
4189                 return -EINVAL;
4190         if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
4191                 return -EINVAL;
4192
4193         switch (attr->query.attach_type) {
4194         case BPF_CGROUP_INET_INGRESS:
4195         case BPF_CGROUP_INET_EGRESS:
4196         case BPF_CGROUP_INET_SOCK_CREATE:
4197         case BPF_CGROUP_INET_SOCK_RELEASE:
4198         case BPF_CGROUP_INET4_BIND:
4199         case BPF_CGROUP_INET6_BIND:
4200         case BPF_CGROUP_INET4_POST_BIND:
4201         case BPF_CGROUP_INET6_POST_BIND:
4202         case BPF_CGROUP_INET4_CONNECT:
4203         case BPF_CGROUP_INET6_CONNECT:
4204         case BPF_CGROUP_UNIX_CONNECT:
4205         case BPF_CGROUP_INET4_GETPEERNAME:
4206         case BPF_CGROUP_INET6_GETPEERNAME:
4207         case BPF_CGROUP_UNIX_GETPEERNAME:
4208         case BPF_CGROUP_INET4_GETSOCKNAME:
4209         case BPF_CGROUP_INET6_GETSOCKNAME:
4210         case BPF_CGROUP_UNIX_GETSOCKNAME:
4211         case BPF_CGROUP_UDP4_SENDMSG:
4212         case BPF_CGROUP_UDP6_SENDMSG:
4213         case BPF_CGROUP_UNIX_SENDMSG:
4214         case BPF_CGROUP_UDP4_RECVMSG:
4215         case BPF_CGROUP_UDP6_RECVMSG:
4216         case BPF_CGROUP_UNIX_RECVMSG:
4217         case BPF_CGROUP_SOCK_OPS:
4218         case BPF_CGROUP_DEVICE:
4219         case BPF_CGROUP_SYSCTL:
4220         case BPF_CGROUP_GETSOCKOPT:
4221         case BPF_CGROUP_SETSOCKOPT:
4222         case BPF_LSM_CGROUP:
4223                 return cgroup_bpf_prog_query(attr, uattr);
4224         case BPF_LIRC_MODE2:
4225                 return lirc_prog_query(attr, uattr);
4226         case BPF_FLOW_DISSECTOR:
4227         case BPF_SK_LOOKUP:
4228                 return netns_bpf_prog_query(attr, uattr);
4229         case BPF_SK_SKB_STREAM_PARSER:
4230         case BPF_SK_SKB_STREAM_VERDICT:
4231         case BPF_SK_MSG_VERDICT:
4232         case BPF_SK_SKB_VERDICT:
4233                 return sock_map_bpf_prog_query(attr, uattr);
4234         case BPF_TCX_INGRESS:
4235         case BPF_TCX_EGRESS:
4236                 return tcx_prog_query(attr, uattr);
4237         case BPF_NETKIT_PRIMARY:
4238         case BPF_NETKIT_PEER:
4239                 return netkit_prog_query(attr, uattr);
4240         default:
4241                 return -EINVAL;
4242         }
4243 }
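
/* User-space sketch (illustrative): querying which program IDs are attached to
 * a cgroup hook. The kernel reports the number of attached programs back in
 * query.prog_cnt and fills prog_ids with as many IDs as fit. cgroup_fd is
 * hypothetical; error handling is omitted.
 *
 *	__u32 ids[16];
 *	union bpf_attr attr = {};
 *
 *	attr.query.target_fd   = cgroup_fd;
 *	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
 *	attr.query.prog_ids    = (__u64)(unsigned long)ids;
 *	attr.query.prog_cnt    = 16;
 *	syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
 */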
4244
4245 #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
4246
4247 static int bpf_prog_test_run(const union bpf_attr *attr,
4248                              union bpf_attr __user *uattr)
4249 {
4250         struct bpf_prog *prog;
4251         int ret = -ENOTSUPP;
4252
4253         if (CHECK_ATTR(BPF_PROG_TEST_RUN))
4254                 return -EINVAL;
4255
4256         if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
4257             (!attr->test.ctx_size_in && attr->test.ctx_in))
4258                 return -EINVAL;
4259
4260         if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
4261             (!attr->test.ctx_size_out && attr->test.ctx_out))
4262                 return -EINVAL;
4263
4264         prog = bpf_prog_get(attr->test.prog_fd);
4265         if (IS_ERR(prog))
4266                 return PTR_ERR(prog);
4267
4268         if (prog->aux->ops->test_run)
4269                 ret = prog->aux->ops->test_run(prog, attr, uattr);
4270
4271         bpf_prog_put(prog);
4272         return ret;
4273 }
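
/* User-space sketch (illustrative): running a loaded program once against an
 * input buffer via BPF_PROG_TEST_RUN; the program's return value comes back in
 * attr.test.retval. pkt_in and prog_fd are hypothetical; error handling is
 * omitted.
 *
 *	unsigned char pkt_in[64] = {};	// input data for the test run
 *	union bpf_attr attr = {};
 *
 *	attr.test.prog_fd      = prog_fd;
 *	attr.test.data_in      = (__u64)(unsigned long)pkt_in;
 *	attr.test.data_size_in = sizeof(pkt_in);
 *	attr.test.repeat       = 1;
 *	syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
 */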
4274
4275 #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
4276
4277 static int bpf_obj_get_next_id(const union bpf_attr *attr,
4278                                union bpf_attr __user *uattr,
4279                                struct idr *idr,
4280                                spinlock_t *lock)
4281 {
4282         u32 next_id = attr->start_id;
4283         int err = 0;
4284
4285         if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
4286                 return -EINVAL;
4287
4288         if (!capable(CAP_SYS_ADMIN))
4289                 return -EPERM;
4290
4291         next_id++;
4292         spin_lock_bh(lock);
4293         if (!idr_get_next(idr, &next_id))
4294                 err = -ENOENT;
4295         spin_unlock_bh(lock);
4296
4297         if (!err)
4298                 err = put_user(next_id, &uattr->next_id);
4299
4300         return err;
4301 }
4302
4303 struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
4304 {
4305         struct bpf_map *map;
4306
4307         spin_lock_bh(&map_idr_lock);
4308 again:
4309         map = idr_get_next(&map_idr, id);
4310         if (map) {
4311                 map = __bpf_map_inc_not_zero(map, false);
4312                 if (IS_ERR(map)) {
4313                         (*id)++;
4314                         goto again;
4315                 }
4316         }
4317         spin_unlock_bh(&map_idr_lock);
4318
4319         return map;
4320 }
4321
4322 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
4323 {
4324         struct bpf_prog *prog;
4325
4326         spin_lock_bh(&prog_idr_lock);
4327 again:
4328         prog = idr_get_next(&prog_idr, id);
4329         if (prog) {
4330                 prog = bpf_prog_inc_not_zero(prog);
4331                 if (IS_ERR(prog)) {
4332                         (*id)++;
4333                         goto again;
4334                 }
4335         }
4336         spin_unlock_bh(&prog_idr_lock);
4337
4338         return prog;
4339 }
4340
4341 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
4342
4343 struct bpf_prog *bpf_prog_by_id(u32 id)
4344 {
4345         struct bpf_prog *prog;
4346
4347         if (!id)
4348                 return ERR_PTR(-ENOENT);
4349
4350         spin_lock_bh(&prog_idr_lock);
4351         prog = idr_find(&prog_idr, id);
4352         if (prog)
4353                 prog = bpf_prog_inc_not_zero(prog);
4354         else
4355                 prog = ERR_PTR(-ENOENT);
4356         spin_unlock_bh(&prog_idr_lock);
4357         return prog;
4358 }
4359
4360 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
4361 {
4362         struct bpf_prog *prog;
4363         u32 id = attr->prog_id;
4364         int fd;
4365
4366         if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
4367                 return -EINVAL;
4368
4369         if (!capable(CAP_SYS_ADMIN))
4370                 return -EPERM;
4371
4372         prog = bpf_prog_by_id(id);
4373         if (IS_ERR(prog))
4374                 return PTR_ERR(prog);
4375
4376         fd = bpf_prog_new_fd(prog);
4377         if (fd < 0)
4378                 bpf_prog_put(prog);
4379
4380         return fd;
4381 }
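
/* User-space sketch (illustrative): walking all loaded programs by ID
 * (CAP_SYS_ADMIN is required, as checked above) and converting each ID into an
 * FD via the two commands handled above. The attr is re-zeroed between calls
 * because CHECK_ATTR() rejects stale fields past each command's last field.
 *
 *	union bpf_attr attr;
 *	__u32 id = 0;
 *	int fd;
 *
 *	for (;;) {
 *		memset(&attr, 0, sizeof(attr));
 *		attr.start_id = id;
 *		if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
 *			break;			// -ENOENT after the last ID
 *		id = attr.next_id;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.prog_id = id;
 *		fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
 *		// ... inspect the program via fd, then close(fd) ...
 *	}
 */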
4382
4383 #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
4384
4385 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
4386 {
4387         struct bpf_map *map;
4388         u32 id = attr->map_id;
4389         int f_flags;
4390         int fd;
4391
4392         if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
4393             attr->open_flags & ~BPF_OBJ_FLAG_MASK)
4394                 return -EINVAL;
4395
4396         if (!capable(CAP_SYS_ADMIN))
4397                 return -EPERM;
4398
4399         f_flags = bpf_get_file_flag(attr->open_flags);
4400         if (f_flags < 0)
4401                 return f_flags;
4402
4403         spin_lock_bh(&map_idr_lock);
4404         map = idr_find(&map_idr, id);
4405         if (map)
4406                 map = __bpf_map_inc_not_zero(map, true);
4407         else
4408                 map = ERR_PTR(-ENOENT);
4409         spin_unlock_bh(&map_idr_lock);
4410
4411         if (IS_ERR(map))
4412                 return PTR_ERR(map);
4413
4414         fd = bpf_map_new_fd(map, f_flags);
4415         if (fd < 0)
4416                 bpf_map_put_with_uref(map);
4417
4418         return fd;
4419 }
4420
4421 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
4422                                               unsigned long addr, u32 *off,
4423                                               u32 *type)
4424 {
4425         const struct bpf_map *map;
4426         int i;
4427
4428         mutex_lock(&prog->aux->used_maps_mutex);
4429         for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
4430                 map = prog->aux->used_maps[i];
4431                 if (map == (void *)addr) {
4432                         *type = BPF_PSEUDO_MAP_FD;
4433                         goto out;
4434                 }
4435                 if (!map->ops->map_direct_value_meta)
4436                         continue;
4437                 if (!map->ops->map_direct_value_meta(map, addr, off)) {
4438                         *type = BPF_PSEUDO_MAP_VALUE;
4439                         goto out;
4440                 }
4441         }
4442         map = NULL;
4443
4444 out:
4445         mutex_unlock(&prog->aux->used_maps_mutex);
4446         return map;
4447 }
4448
4449 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
4450                                               const struct cred *f_cred)
4451 {
4452         const struct bpf_map *map;
4453         struct bpf_insn *insns;
4454         u32 off, type;
4455         u64 imm;
4456         u8 code;
4457         int i;
4458
4459         insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
4460                         GFP_USER);
4461         if (!insns)
4462                 return insns;
4463
4464         for (i = 0; i < prog->len; i++) {
4465                 code = insns[i].code;
4466
4467                 if (code == (BPF_JMP | BPF_TAIL_CALL)) {
4468                         insns[i].code = BPF_JMP | BPF_CALL;
4469                         insns[i].imm = BPF_FUNC_tail_call;
4470                         /* fall-through */
4471                 }
4472                 if (code == (BPF_JMP | BPF_CALL) ||
4473                     code == (BPF_JMP | BPF_CALL_ARGS)) {
4474                         if (code == (BPF_JMP | BPF_CALL_ARGS))
4475                                 insns[i].code = BPF_JMP | BPF_CALL;
4476                         if (!bpf_dump_raw_ok(f_cred))
4477                                 insns[i].imm = 0;
4478                         continue;
4479                 }
4480                 if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
4481                         insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
4482                         continue;
4483                 }
4484
4485                 if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
4486                      BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
4487                         insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
4488                         continue;
4489                 }
4490
4491                 if (code != (BPF_LD | BPF_IMM | BPF_DW))
4492                         continue;
4493
4494                 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
4495                 map = bpf_map_from_imm(prog, imm, &off, &type);
4496                 if (map) {
4497                         insns[i].src_reg = type;
4498                         insns[i].imm = map->id;
4499                         insns[i + 1].imm = off;
4500                         continue;
4501                 }
4502         }
4503
4504         return insns;
4505 }
4506
4507 static int set_info_rec_size(struct bpf_prog_info *info)
4508 {
4509         /*
4510          * Ensure info.*_rec_size is the same as the size the kernel expects,
4511          *
4512          * or
4513          *
4514          * only allow a zero *_rec_size if both _rec_size and _cnt are
4515          * zero.  In this case, the kernel will write the expected
4516          * _rec_size back into the info.
4517          */
4518
4519         if ((info->nr_func_info || info->func_info_rec_size) &&
4520             info->func_info_rec_size != sizeof(struct bpf_func_info))
4521                 return -EINVAL;
4522
4523         if ((info->nr_line_info || info->line_info_rec_size) &&
4524             info->line_info_rec_size != sizeof(struct bpf_line_info))
4525                 return -EINVAL;
4526
4527         if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
4528             info->jited_line_info_rec_size != sizeof(__u64))
4529                 return -EINVAL;
4530
4531         info->func_info_rec_size = sizeof(struct bpf_func_info);
4532         info->line_info_rec_size = sizeof(struct bpf_line_info);
4533         info->jited_line_info_rec_size = sizeof(__u64);
4534
4535         return 0;
4536 }
4537
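/* Fill struct bpf_prog_info for BPF_OBJ_GET_INFO_BY_FD.  Only as many bytes
 * as the user-supplied info_len covers are copied back.  Instruction dumps,
 * JIT images, ksyms and line info are reported only to bpf_capable()
 * callers, and raw kernel addresses only when bpf_dump_raw_ok() allows it.
 */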
4538 static int bpf_prog_get_info_by_fd(struct file *file,
4539                                    struct bpf_prog *prog,
4540                                    const union bpf_attr *attr,
4541                                    union bpf_attr __user *uattr)
4542 {
4543         struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4544         struct btf *attach_btf = bpf_prog_get_target_btf(prog);
4545         struct bpf_prog_info info;
4546         u32 info_len = attr->info.info_len;
4547         struct bpf_prog_kstats stats;
4548         char __user *uinsns;
4549         u32 ulen;
4550         int err;
4551
4552         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4553         if (err)
4554                 return err;
4555         info_len = min_t(u32, sizeof(info), info_len);
4556
4557         memset(&info, 0, sizeof(info));
4558         if (copy_from_user(&info, uinfo, info_len))
4559                 return -EFAULT;
4560
4561         info.type = prog->type;
4562         info.id = prog->aux->id;
4563         info.load_time = prog->aux->load_time;
4564         info.created_by_uid = from_kuid_munged(current_user_ns(),
4565                                                prog->aux->user->uid);
4566         info.gpl_compatible = prog->gpl_compatible;
4567
4568         memcpy(info.tag, prog->tag, sizeof(prog->tag));
4569         memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
4570
4571         mutex_lock(&prog->aux->used_maps_mutex);
4572         ulen = info.nr_map_ids;
4573         info.nr_map_ids = prog->aux->used_map_cnt;
4574         ulen = min_t(u32, info.nr_map_ids, ulen);
4575         if (ulen) {
4576                 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
4577                 u32 i;
4578
4579                 for (i = 0; i < ulen; i++)
4580                         if (put_user(prog->aux->used_maps[i]->id,
4581                                      &user_map_ids[i])) {
4582                                 mutex_unlock(&prog->aux->used_maps_mutex);
4583                                 return -EFAULT;
4584                         }
4585         }
4586         mutex_unlock(&prog->aux->used_maps_mutex);
4587
4588         err = set_info_rec_size(&info);
4589         if (err)
4590                 return err;
4591
4592         bpf_prog_get_stats(prog, &stats);
4593         info.run_time_ns = stats.nsecs;
4594         info.run_cnt = stats.cnt;
4595         info.recursion_misses = stats.misses;
4596
4597         info.verified_insns = prog->aux->verified_insns;
4598
4599         if (!bpf_capable()) {
4600                 info.jited_prog_len = 0;
4601                 info.xlated_prog_len = 0;
4602                 info.nr_jited_ksyms = 0;
4603                 info.nr_jited_func_lens = 0;
4604                 info.nr_func_info = 0;
4605                 info.nr_line_info = 0;
4606                 info.nr_jited_line_info = 0;
4607                 goto done;
4608         }
4609
4610         ulen = info.xlated_prog_len;
4611         info.xlated_prog_len = bpf_prog_insn_size(prog);
4612         if (info.xlated_prog_len && ulen) {
4613                 struct bpf_insn *insns_sanitized;
4614                 bool fault;
4615
4616                 if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
4617                         info.xlated_prog_insns = 0;
4618                         goto done;
4619                 }
4620                 insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
4621                 if (!insns_sanitized)
4622                         return -ENOMEM;
4623                 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
4624                 ulen = min_t(u32, info.xlated_prog_len, ulen);
4625                 fault = copy_to_user(uinsns, insns_sanitized, ulen);
4626                 kfree(insns_sanitized);
4627                 if (fault)
4628                         return -EFAULT;
4629         }
4630
4631         if (bpf_prog_is_offloaded(prog->aux)) {
4632                 err = bpf_prog_offload_info_fill(&info, prog);
4633                 if (err)
4634                         return err;
4635                 goto done;
4636         }
4637
4638         /* NOTE: the following code is skipped for offloaded programs;
4639          * bpf_prog_offload_info_fill() fills the corresponding fields
4640          * for offload.
4641          */
4642         ulen = info.jited_prog_len;
4643         if (prog->aux->func_cnt) {
4644                 u32 i;
4645
4646                 info.jited_prog_len = 0;
4647                 for (i = 0; i < prog->aux->func_cnt; i++)
4648                         info.jited_prog_len += prog->aux->func[i]->jited_len;
4649         } else {
4650                 info.jited_prog_len = prog->jited_len;
4651         }
4652
4653         if (info.jited_prog_len && ulen) {
4654                 if (bpf_dump_raw_ok(file->f_cred)) {
4655                         uinsns = u64_to_user_ptr(info.jited_prog_insns);
4656                         ulen = min_t(u32, info.jited_prog_len, ulen);
4657
4658                         /* for multi-function programs, copy the JITed
4659                          * instructions for all the functions
4660                          */
4661                         if (prog->aux->func_cnt) {
4662                                 u32 len, free, i;
4663                                 u8 *img;
4664
4665                                 free = ulen;
4666                                 for (i = 0; i < prog->aux->func_cnt; i++) {
4667                                         len = prog->aux->func[i]->jited_len;
4668                                         len = min_t(u32, len, free);
4669                                         img = (u8 *) prog->aux->func[i]->bpf_func;
4670                                         if (copy_to_user(uinsns, img, len))
4671                                                 return -EFAULT;
4672                                         uinsns += len;
4673                                         free -= len;
4674                                         if (!free)
4675                                                 break;
4676                                 }
4677                         } else {
4678                                 if (copy_to_user(uinsns, prog->bpf_func, ulen))
4679                                         return -EFAULT;
4680                         }
4681                 } else {
4682                         info.jited_prog_insns = 0;
4683                 }
4684         }
4685
4686         ulen = info.nr_jited_ksyms;
4687         info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
4688         if (ulen) {
4689                 if (bpf_dump_raw_ok(file->f_cred)) {
4690                         unsigned long ksym_addr;
4691                         u64 __user *user_ksyms;
4692                         u32 i;
4693
4694                         /* copy the address of the kernel symbol
4695                          * corresponding to each function
4696                          */
4697                         ulen = min_t(u32, info.nr_jited_ksyms, ulen);
4698                         user_ksyms = u64_to_user_ptr(info.jited_ksyms);
4699                         if (prog->aux->func_cnt) {
4700                                 for (i = 0; i < ulen; i++) {
4701                                         ksym_addr = (unsigned long)
4702                                                 prog->aux->func[i]->bpf_func;
4703                                         if (put_user((u64) ksym_addr,
4704                                                      &user_ksyms[i]))
4705                                                 return -EFAULT;
4706                                 }
4707                         } else {
4708                                 ksym_addr = (unsigned long) prog->bpf_func;
4709                                 if (put_user((u64) ksym_addr, &user_ksyms[0]))
4710                                         return -EFAULT;
4711                         }
4712                 } else {
4713                         info.jited_ksyms = 0;
4714                 }
4715         }
4716
4717         ulen = info.nr_jited_func_lens;
4718         info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
4719         if (ulen) {
4720                 if (bpf_dump_raw_ok(file->f_cred)) {
4721                         u32 __user *user_lens;
4722                         u32 func_len, i;
4723
4724                         /* copy the JITed image lengths for each function */
4725                         ulen = min_t(u32, info.nr_jited_func_lens, ulen);
4726                         user_lens = u64_to_user_ptr(info.jited_func_lens);
4727                         if (prog->aux->func_cnt) {
4728                                 for (i = 0; i < ulen; i++) {
4729                                         func_len =
4730                                                 prog->aux->func[i]->jited_len;
4731                                         if (put_user(func_len, &user_lens[i]))
4732                                                 return -EFAULT;
4733                                 }
4734                         } else {
4735                                 func_len = prog->jited_len;
4736                                 if (put_user(func_len, &user_lens[0]))
4737                                         return -EFAULT;
4738                         }
4739                 } else {
4740                         info.jited_func_lens = 0;
4741                 }
4742         }
4743
4744         if (prog->aux->btf)
4745                 info.btf_id = btf_obj_id(prog->aux->btf);
4746         info.attach_btf_id = prog->aux->attach_btf_id;
4747         if (attach_btf)
4748                 info.attach_btf_obj_id = btf_obj_id(attach_btf);
4749
4750         ulen = info.nr_func_info;
4751         info.nr_func_info = prog->aux->func_info_cnt;
4752         if (info.nr_func_info && ulen) {
4753                 char __user *user_finfo;
4754
4755                 user_finfo = u64_to_user_ptr(info.func_info);
4756                 ulen = min_t(u32, info.nr_func_info, ulen);
4757                 if (copy_to_user(user_finfo, prog->aux->func_info,
4758                                  info.func_info_rec_size * ulen))
4759                         return -EFAULT;
4760         }
4761
4762         ulen = info.nr_line_info;
4763         info.nr_line_info = prog->aux->nr_linfo;
4764         if (info.nr_line_info && ulen) {
4765                 __u8 __user *user_linfo;
4766
4767                 user_linfo = u64_to_user_ptr(info.line_info);
4768                 ulen = min_t(u32, info.nr_line_info, ulen);
4769                 if (copy_to_user(user_linfo, prog->aux->linfo,
4770                                  info.line_info_rec_size * ulen))
4771                         return -EFAULT;
4772         }
4773
4774         ulen = info.nr_jited_line_info;
4775         if (prog->aux->jited_linfo)
4776                 info.nr_jited_line_info = prog->aux->nr_linfo;
4777         else
4778                 info.nr_jited_line_info = 0;
4779         if (info.nr_jited_line_info && ulen) {
4780                 if (bpf_dump_raw_ok(file->f_cred)) {
4781                         unsigned long line_addr;
4782                         __u64 __user *user_linfo;
4783                         u32 i;
4784
4785                         user_linfo = u64_to_user_ptr(info.jited_line_info);
4786                         ulen = min_t(u32, info.nr_jited_line_info, ulen);
4787                         for (i = 0; i < ulen; i++) {
4788                                 line_addr = (unsigned long)prog->aux->jited_linfo[i];
4789                                 if (put_user((__u64)line_addr, &user_linfo[i]))
4790                                         return -EFAULT;
4791                         }
4792                 } else {
4793                         info.jited_line_info = 0;
4794                 }
4795         }
4796
4797         ulen = info.nr_prog_tags;
4798         info.nr_prog_tags = prog->aux->func_cnt ? : 1;
4799         if (ulen) {
4800                 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
4801                 u32 i;
4802
4803                 user_prog_tags = u64_to_user_ptr(info.prog_tags);
4804                 ulen = min_t(u32, info.nr_prog_tags, ulen);
4805                 if (prog->aux->func_cnt) {
4806                         for (i = 0; i < ulen; i++) {
4807                                 if (copy_to_user(user_prog_tags[i],
4808                                                  prog->aux->func[i]->tag,
4809                                                  BPF_TAG_SIZE))
4810                                         return -EFAULT;
4811                         }
4812                 } else {
4813                         if (copy_to_user(user_prog_tags[0],
4814                                          prog->tag, BPF_TAG_SIZE))
4815                                 return -EFAULT;
4816                 }
4817         }
4818
4819 done:
4820         if (copy_to_user(uinfo, &info, info_len) ||
4821             put_user(info_len, &uattr->info.info_len))
4822                 return -EFAULT;
4823
4824         return 0;
4825 }
4826
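/* Fill struct bpf_map_info (type, key/value sizes, max_entries, flags, name,
 * BTF IDs) for BPF_OBJ_GET_INFO_BY_FD; offloaded maps get their remaining
 * fields filled by bpf_map_offload_info_fill().
 */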
4827 static int bpf_map_get_info_by_fd(struct file *file,
4828                                   struct bpf_map *map,
4829                                   const union bpf_attr *attr,
4830                                   union bpf_attr __user *uattr)
4831 {
4832         struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4833         struct bpf_map_info info;
4834         u32 info_len = attr->info.info_len;
4835         int err;
4836
4837         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4838         if (err)
4839                 return err;
4840         info_len = min_t(u32, sizeof(info), info_len);
4841
4842         memset(&info, 0, sizeof(info));
4843         info.type = map->map_type;
4844         info.id = map->id;
4845         info.key_size = map->key_size;
4846         info.value_size = map->value_size;
4847         info.max_entries = map->max_entries;
4848         info.map_flags = map->map_flags;
4849         info.map_extra = map->map_extra;
4850         memcpy(info.name, map->name, sizeof(map->name));
4851
4852         if (map->btf) {
4853                 info.btf_id = btf_obj_id(map->btf);
4854                 info.btf_key_type_id = map->btf_key_type_id;
4855                 info.btf_value_type_id = map->btf_value_type_id;
4856         }
4857         info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
4858         if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
4859                 bpf_map_struct_ops_info_fill(&info, map);
4860
4861         if (bpf_map_is_offloaded(map)) {
4862                 err = bpf_map_offload_info_fill(&info, map);
4863                 if (err)
4864                         return err;
4865         }
4866
4867         if (copy_to_user(uinfo, &info, info_len) ||
4868             put_user(info_len, &uattr->info.info_len))
4869                 return -EFAULT;
4870
4871         return 0;
4872 }
4873
4874 static int bpf_btf_get_info_by_fd(struct file *file,
4875                                   struct btf *btf,
4876                                   const union bpf_attr *attr,
4877                                   union bpf_attr __user *uattr)
4878 {
4879         struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4880         u32 info_len = attr->info.info_len;
4881         int err;
4882
4883         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
4884         if (err)
4885                 return err;
4886
4887         return btf_get_info_by_fd(btf, attr, uattr);
4888 }
4889
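/* Fill struct bpf_link_info (type, ID, attached prog ID) and let the link's
 * fill_link_info() callback add type-specific details.
 */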
4890 static int bpf_link_get_info_by_fd(struct file *file,
4891                                   struct bpf_link *link,
4892                                   const union bpf_attr *attr,
4893                                   union bpf_attr __user *uattr)
4894 {
4895         struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4896         struct bpf_link_info info;
4897         u32 info_len = attr->info.info_len;
4898         int err;
4899
4900         err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4901         if (err)
4902                 return err;
4903         info_len = min_t(u32, sizeof(info), info_len);
4904
4905         memset(&info, 0, sizeof(info));
4906         if (copy_from_user(&info, uinfo, info_len))
4907                 return -EFAULT;
4908
4909         info.type = link->type;
4910         info.id = link->id;
4911         if (link->prog)
4912                 info.prog_id = link->prog->aux->id;
4913
4914         if (link->ops->fill_link_info) {
4915                 err = link->ops->fill_link_info(link, &info);
4916                 if (err)
4917                         return err;
4918         }
4919
4920         if (copy_to_user(uinfo, &info, info_len) ||
4921             put_user(info_len, &uattr->info.info_len))
4922                 return -EFAULT;
4923
4924         return 0;
4925 }
4926
4927
4928 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
4929
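/* Dispatch BPF_OBJ_GET_INFO_BY_FD based on the kind of object behind the fd
 * (program, map, BTF or link).
 *
 * A minimal user-space sketch of driving this command via the raw syscall
 * (illustrative only, not part of this file; assumes prog_fd and a local
 * struct bpf_prog_info info are already set up):
 *
 *	union bpf_attr attr = {};
 *
 *	attr.info.bpf_fd   = prog_fd;
 *	attr.info.info_len = sizeof(info);
 *	attr.info.info     = (__u64)(unsigned long)&info;
 *	err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
 */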
4930 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
4931                                   union bpf_attr __user *uattr)
4932 {
4933         int ufd = attr->info.bpf_fd;
4934         struct fd f;
4935         int err;
4936
4937         if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
4938                 return -EINVAL;
4939
4940         f = fdget(ufd);
4941         if (!f.file)
4942                 return -EBADFD;
4943
4944         if (f.file->f_op == &bpf_prog_fops)
4945                 err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
4946                                               uattr);
4947         else if (f.file->f_op == &bpf_map_fops)
4948                 err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
4949                                              uattr);
4950         else if (f.file->f_op == &btf_fops)
4951                 err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
4952         else if (f.file->f_op == &bpf_link_fops)
4953                 err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
4954                                               attr, uattr);
4955         else
4956                 err = -EINVAL;
4957
4958         fdput(f);
4959         return err;
4960 }
4961
4962 #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
4963
4964 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
4965 {
4966         struct bpf_token *token = NULL;
4967
4968         if (CHECK_ATTR(BPF_BTF_LOAD))
4969                 return -EINVAL;
4970
4971         if (attr->btf_flags & ~BPF_F_TOKEN_FD)
4972                 return -EINVAL;
4973
4974         if (attr->btf_flags & BPF_F_TOKEN_FD) {
4975                 token = bpf_token_get_from_fd(attr->btf_token_fd);
4976                 if (IS_ERR(token))
4977                         return PTR_ERR(token);
4978                 if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
4979                         bpf_token_put(token);
4980                         token = NULL;
4981                 }
4982         }
4983
4984         if (!bpf_token_capable(token, CAP_BPF)) {
4985                 bpf_token_put(token);
4986                 return -EPERM;
4987         }
4988
4989         bpf_token_put(token);
4990
4991         return btf_new_fd(attr, uattr, uattr_size);
4992 }
4993
4994 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
4995
4996 static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
4997 {
4998         if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
4999                 return -EINVAL;
5000
5001         if (!capable(CAP_SYS_ADMIN))
5002                 return -EPERM;
5003
5004         return btf_get_fd_by_id(attr->btf_id);
5005 }
5006
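/* Copy the task_fd_query results back to user space.  The name buffer is
 * always NUL terminated; if the user buffer is too small, a truncated copy
 * is written and -ENOSPC is returned.
 */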
5007 static int bpf_task_fd_query_copy(const union bpf_attr *attr,
5008                                     union bpf_attr __user *uattr,
5009                                     u32 prog_id, u32 fd_type,
5010                                     const char *buf, u64 probe_offset,
5011                                     u64 probe_addr)
5012 {
5013         char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
5014         u32 len = buf ? strlen(buf) : 0, input_len;
5015         int err = 0;
5016
5017         if (put_user(len, &uattr->task_fd_query.buf_len))
5018                 return -EFAULT;
5019         input_len = attr->task_fd_query.buf_len;
5020         if (input_len && ubuf) {
5021                 if (!len) {
5022                         /* nothing to copy, just NUL-terminate ubuf */
5023                         char zero = '\0';
5024
5025                         if (put_user(zero, ubuf))
5026                                 return -EFAULT;
5027                 } else if (input_len >= len + 1) {
5028                         /* ubuf can hold the string with the NUL terminator */
5029                         if (copy_to_user(ubuf, buf, len + 1))
5030                                 return -EFAULT;
5031                 } else {
5032                         /* ubuf cannot hold the string with the NUL terminator,
5033                          * do a partial copy and NUL terminate it.
5034                          */
5035                         char zero = '\0';
5036
5037                         err = -ENOSPC;
5038                         if (copy_to_user(ubuf, buf, input_len - 1))
5039                                 return -EFAULT;
5040                         if (put_user(zero, ubuf + input_len - 1))
5041                                 return -EFAULT;
5042                 }
5043         }
5044
5045         if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
5046             put_user(fd_type, &uattr->task_fd_query.fd_type) ||
5047             put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
5048             put_user(probe_addr, &uattr->task_fd_query.probe_addr))
5049                 return -EFAULT;
5050
5051         return err;
5052 }
5053
5054 #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
5055
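/* Given a pid and an fd within that task, report which BPF program sits
 * behind it: either a raw tracepoint link, or a perf-event based
 * tracepoint/kprobe/uprobe, including the attach point name and probe
 * offset/address.
 */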
5056 static int bpf_task_fd_query(const union bpf_attr *attr,
5057                              union bpf_attr __user *uattr)
5058 {
5059         pid_t pid = attr->task_fd_query.pid;
5060         u32 fd = attr->task_fd_query.fd;
5061         const struct perf_event *event;
5062         struct task_struct *task;
5063         struct file *file;
5064         int err;
5065
5066         if (CHECK_ATTR(BPF_TASK_FD_QUERY))
5067                 return -EINVAL;
5068
5069         if (!capable(CAP_SYS_ADMIN))
5070                 return -EPERM;
5071
5072         if (attr->task_fd_query.flags != 0)
5073                 return -EINVAL;
5074
5075         rcu_read_lock();
5076         task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
5077         rcu_read_unlock();
5078         if (!task)
5079                 return -ENOENT;
5080
5081         err = 0;
5082         file = fget_task(task, fd);
5083         put_task_struct(task);
5084         if (!file)
5085                 return -EBADF;
5086
5087         if (file->f_op == &bpf_link_fops) {
5088                 struct bpf_link *link = file->private_data;
5089
5090                 if (link->ops == &bpf_raw_tp_link_lops) {
5091                         struct bpf_raw_tp_link *raw_tp =
5092                                 container_of(link, struct bpf_raw_tp_link, link);
5093                         struct bpf_raw_event_map *btp = raw_tp->btp;
5094
5095                         err = bpf_task_fd_query_copy(attr, uattr,
5096                                                      raw_tp->link.prog->aux->id,
5097                                                      BPF_FD_TYPE_RAW_TRACEPOINT,
5098                                                      btp->tp->name, 0, 0);
5099                         goto put_file;
5100                 }
5101                 goto out_not_supp;
5102         }
5103
5104         event = perf_get_event(file);
5105         if (!IS_ERR(event)) {
5106                 u64 probe_offset, probe_addr;
5107                 u32 prog_id, fd_type;
5108                 const char *buf;
5109
5110                 err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
5111                                               &buf, &probe_offset,
5112                                               &probe_addr, NULL);
5113                 if (!err)
5114                         err = bpf_task_fd_query_copy(attr, uattr, prog_id,
5115                                                      fd_type, buf,
5116                                                      probe_offset,
5117                                                      probe_addr);
5118                 goto put_file;
5119         }
5120
5121 out_not_supp:
5122         err = -ENOTSUPP;
5123 put_file:
5124         fput(file);
5125         return err;
5126 }
5127
5128 #define BPF_MAP_BATCH_LAST_FIELD batch.flags
5129
5130 #define BPF_DO_BATCH(fn, ...)                   \
5131         do {                                    \
5132                 if (!fn) {                      \
5133                         err = -ENOTSUPP;        \
5134                         goto err_put;           \
5135                 }                               \
5136                 err = fn(__VA_ARGS__);          \
5137         } while (0)
5138
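/* Common entry point for the map batch commands (lookup, lookup-and-delete,
 * update, delete).  Checks FMODE_CAN_READ/FMODE_CAN_WRITE on the map fd as
 * appropriate and dispatches to the map's batch callbacks; write commands
 * also bump the map's write-active count for the duration of the call.
 */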
5139 static int bpf_map_do_batch(const union bpf_attr *attr,
5140                             union bpf_attr __user *uattr,
5141                             int cmd)
5142 {
5143         bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
5144                          cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
5145         bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
5146         struct bpf_map *map;
5147         int err, ufd;
5148         struct fd f;
5149
5150         if (CHECK_ATTR(BPF_MAP_BATCH))
5151                 return -EINVAL;
5152
5153         ufd = attr->batch.map_fd;
5154         f = fdget(ufd);
5155         map = __bpf_map_get(f);
5156         if (IS_ERR(map))
5157                 return PTR_ERR(map);
5158         if (has_write)
5159                 bpf_map_write_active_inc(map);
5160         if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
5161                 err = -EPERM;
5162                 goto err_put;
5163         }
5164         if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
5165                 err = -EPERM;
5166                 goto err_put;
5167         }
5168
5169         if (cmd == BPF_MAP_LOOKUP_BATCH)
5170                 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
5171         else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
5172                 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
5173         else if (cmd == BPF_MAP_UPDATE_BATCH)
5174                 BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
5175         else
5176                 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
5177 err_put:
5178         if (has_write) {
5179                 maybe_wait_bpf_programs(map);
5180                 bpf_map_write_active_dec(map);
5181         }
5182         fdput(f);
5183         return err;
5184 }
5185
5186 #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
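/* BPF_LINK_CREATE: create a bpf_link attaching the given program to the
 * requested target, dispatching on the program type (cgroup, tracing, netns,
 * XDP, tcx/netkit, netfilter, perf event, kprobe/uprobe multi, ...).
 */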
5187 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
5188 {
5189         struct bpf_prog *prog;
5190         int ret;
5191
5192         if (CHECK_ATTR(BPF_LINK_CREATE))
5193                 return -EINVAL;
5194
5195         if (attr->link_create.attach_type == BPF_STRUCT_OPS)
5196                 return bpf_struct_ops_link_create(attr);
5197
5198         prog = bpf_prog_get(attr->link_create.prog_fd);
5199         if (IS_ERR(prog))
5200                 return PTR_ERR(prog);
5201
5202         ret = bpf_prog_attach_check_attach_type(prog,
5203                                                 attr->link_create.attach_type);
5204         if (ret)
5205                 goto out;
5206
5207         switch (prog->type) {
5208         case BPF_PROG_TYPE_CGROUP_SKB:
5209         case BPF_PROG_TYPE_CGROUP_SOCK:
5210         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
5211         case BPF_PROG_TYPE_SOCK_OPS:
5212         case BPF_PROG_TYPE_CGROUP_DEVICE:
5213         case BPF_PROG_TYPE_CGROUP_SYSCTL:
5214         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
5215                 ret = cgroup_bpf_link_attach(attr, prog);
5216                 break;
5217         case BPF_PROG_TYPE_EXT:
5218                 ret = bpf_tracing_prog_attach(prog,
5219                                               attr->link_create.target_fd,
5220                                               attr->link_create.target_btf_id,
5221                                               attr->link_create.tracing.cookie);
5222                 break;
5223         case BPF_PROG_TYPE_LSM:
5224         case BPF_PROG_TYPE_TRACING:
5225                 if (attr->link_create.attach_type != prog->expected_attach_type) {
5226                         ret = -EINVAL;
5227                         goto out;
5228                 }
5229                 if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
5230                         ret = bpf_raw_tp_link_attach(prog, NULL);
5231                 else if (prog->expected_attach_type == BPF_TRACE_ITER)
5232                         ret = bpf_iter_link_attach(attr, uattr, prog);
5233                 else if (prog->expected_attach_type == BPF_LSM_CGROUP)
5234                         ret = cgroup_bpf_link_attach(attr, prog);
5235                 else
5236                         ret = bpf_tracing_prog_attach(prog,
5237                                                       attr->link_create.target_fd,
5238                                                       attr->link_create.target_btf_id,
5239                                                       attr->link_create.tracing.cookie);
5240                 break;
5241         case BPF_PROG_TYPE_FLOW_DISSECTOR:
5242         case BPF_PROG_TYPE_SK_LOOKUP:
5243                 ret = netns_bpf_link_create(attr, prog);
5244                 break;
5245 #ifdef CONFIG_NET
5246         case BPF_PROG_TYPE_XDP:
5247                 ret = bpf_xdp_link_attach(attr, prog);
5248                 break;
5249         case BPF_PROG_TYPE_SCHED_CLS:
5250                 if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
5251                     attr->link_create.attach_type == BPF_TCX_EGRESS)
5252                         ret = tcx_link_attach(attr, prog);
5253                 else
5254                         ret = netkit_link_attach(attr, prog);
5255                 break;
5256         case BPF_PROG_TYPE_NETFILTER:
5257                 ret = bpf_nf_link_attach(attr, prog);
5258                 break;
5259 #endif
5260         case BPF_PROG_TYPE_PERF_EVENT:
5261         case BPF_PROG_TYPE_TRACEPOINT:
5262                 ret = bpf_perf_link_attach(attr, prog);
5263                 break;
5264         case BPF_PROG_TYPE_KPROBE:
5265                 if (attr->link_create.attach_type == BPF_PERF_EVENT)
5266                         ret = bpf_perf_link_attach(attr, prog);
5267                 else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
5268                         ret = bpf_kprobe_multi_link_attach(attr, prog);
5269                 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
5270                         ret = bpf_uprobe_multi_link_attach(attr, prog);
5271                 break;
5272         default:
5273                 ret = -EINVAL;
5274         }
5275
5276 out:
5277         if (ret < 0)
5278                 bpf_prog_put(prog);
5279         return ret;
5280 }
5281
5282 static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
5283 {
5284         struct bpf_map *new_map, *old_map = NULL;
5285         int ret;
5286
5287         new_map = bpf_map_get(attr->link_update.new_map_fd);
5288         if (IS_ERR(new_map))
5289                 return PTR_ERR(new_map);
5290
5291         if (attr->link_update.flags & BPF_F_REPLACE) {
5292                 old_map = bpf_map_get(attr->link_update.old_map_fd);
5293                 if (IS_ERR(old_map)) {
5294                         ret = PTR_ERR(old_map);
5295                         goto out_put;
5296                 }
5297         } else if (attr->link_update.old_map_fd) {
5298                 ret = -EINVAL;
5299                 goto out_put;
5300         }
5301
5302         ret = link->ops->update_map(link, new_map, old_map);
5303
5304         if (old_map)
5305                 bpf_map_put(old_map);
5306 out_put:
5307         bpf_map_put(new_map);
5308         return ret;
5309 }
5310
5311 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
5312
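/* BPF_LINK_UPDATE: swap the program (or, for struct_ops links, the map)
 * backing an existing link; with BPF_F_REPLACE the caller must also name the
 * old program/map it expects to be replaced.
 */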
5313 static int link_update(union bpf_attr *attr)
5314 {
5315         struct bpf_prog *old_prog = NULL, *new_prog;
5316         struct bpf_link *link;
5317         u32 flags;
5318         int ret;
5319
5320         if (CHECK_ATTR(BPF_LINK_UPDATE))
5321                 return -EINVAL;
5322
5323         flags = attr->link_update.flags;
5324         if (flags & ~BPF_F_REPLACE)
5325                 return -EINVAL;
5326
5327         link = bpf_link_get_from_fd(attr->link_update.link_fd);
5328         if (IS_ERR(link))
5329                 return PTR_ERR(link);
5330
5331         if (link->ops->update_map) {
5332                 ret = link_update_map(link, attr);
5333                 goto out_put_link;
5334         }
5335
5336         new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
5337         if (IS_ERR(new_prog)) {
5338                 ret = PTR_ERR(new_prog);
5339                 goto out_put_link;
5340         }
5341
5342         if (flags & BPF_F_REPLACE) {
5343                 old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
5344                 if (IS_ERR(old_prog)) {
5345                         ret = PTR_ERR(old_prog);
5346                         old_prog = NULL;
5347                         goto out_put_progs;
5348                 }
5349         } else if (attr->link_update.old_prog_fd) {
5350                 ret = -EINVAL;
5351                 goto out_put_progs;
5352         }
5353
5354         if (link->ops->update_prog)
5355                 ret = link->ops->update_prog(link, new_prog, old_prog);
5356         else
5357                 ret = -EINVAL;
5358
5359 out_put_progs:
5360         if (old_prog)
5361                 bpf_prog_put(old_prog);
5362         if (ret)
5363                 bpf_prog_put(new_prog);
5364 out_put_link:
5365         bpf_link_put_direct(link);
5366         return ret;
5367 }
5368
5369 #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
5370
5371 static int link_detach(union bpf_attr *attr)
5372 {
5373         struct bpf_link *link;
5374         int ret;
5375
5376         if (CHECK_ATTR(BPF_LINK_DETACH))
5377                 return -EINVAL;
5378
5379         link = bpf_link_get_from_fd(attr->link_detach.link_fd);
5380         if (IS_ERR(link))
5381                 return PTR_ERR(link);
5382
5383         if (link->ops->detach)
5384                 ret = link->ops->detach(link);
5385         else
5386                 ret = -EOPNOTSUPP;
5387
5388         bpf_link_put_direct(link);
5389         return ret;
5390 }
5391
5392 static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
5393 {
5394         return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
5395 }
5396
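/* Look up a link by ID under link_idr_lock and take a reference, unless its
 * refcount has already dropped to zero.
 */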
5397 struct bpf_link *bpf_link_by_id(u32 id)
5398 {
5399         struct bpf_link *link;
5400
5401         if (!id)
5402                 return ERR_PTR(-ENOENT);
5403
5404         spin_lock_bh(&link_idr_lock);
5405         /* before link is "settled", ID is 0, pretend it doesn't exist yet */
5406         link = idr_find(&link_idr, id);
5407         if (link) {
5408                 if (link->id)
5409                         link = bpf_link_inc_not_zero(link);
5410                 else
5411                         link = ERR_PTR(-EAGAIN);
5412         } else {
5413                 link = ERR_PTR(-ENOENT);
5414         }
5415         spin_unlock_bh(&link_idr_lock);
5416         return link;
5417 }
5418
5419 struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
5420 {
5421         struct bpf_link *link;
5422
5423         spin_lock_bh(&link_idr_lock);
5424 again:
5425         link = idr_get_next(&link_idr, id);
5426         if (link) {
5427                 link = bpf_link_inc_not_zero(link);
5428                 if (IS_ERR(link)) {
5429                         (*id)++;
5430                         goto again;
5431                 }
5432         }
5433         spin_unlock_bh(&link_idr_lock);
5434
5435         return link;
5436 }
5437
5438 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
5439
5440 static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
5441 {
5442         struct bpf_link *link;
5443         u32 id = attr->link_id;
5444         int fd;
5445
5446         if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
5447                 return -EINVAL;
5448
5449         if (!capable(CAP_SYS_ADMIN))
5450                 return -EPERM;
5451
5452         link = bpf_link_by_id(id);
5453         if (IS_ERR(link))
5454                 return PTR_ERR(link);
5455
5456         fd = bpf_link_new_fd(link);
5457         if (fd < 0)
5458                 bpf_link_put_direct(link);
5459
5460         return fd;
5461 }
5462
5463 DEFINE_MUTEX(bpf_stats_enabled_mutex);
5464
5465 static int bpf_stats_release(struct inode *inode, struct file *file)
5466 {
5467         mutex_lock(&bpf_stats_enabled_mutex);
5468         static_key_slow_dec(&bpf_stats_enabled_key.key);
5469         mutex_unlock(&bpf_stats_enabled_mutex);
5470         return 0;
5471 }
5472
5473 static const struct file_operations bpf_stats_fops = {
5474         .release = bpf_stats_release,
5475 };
5476
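/* Return an anonymous-inode fd whose lifetime keeps run-time stats
 * collection (bpf_stats_enabled_key) enabled; closing the fd decrements the
 * key again via bpf_stats_release().
 *
 * Illustrative user-space sketch (not part of this file):
 *
 *	union bpf_attr attr = {};
 *
 *	attr.enable_stats.type = BPF_STATS_RUN_TIME;
 *	fd = syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
 *	... run programs; run_time_ns/run_cnt are now being accumulated ...
 *	close(fd);	   stats stop once the last such fd is gone
 */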
5477 static int bpf_enable_runtime_stats(void)
5478 {
5479         int fd;
5480
5481         mutex_lock(&bpf_stats_enabled_mutex);
5482
5483         /* Set a very high limit to avoid overflow */
5484         if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
5485                 mutex_unlock(&bpf_stats_enabled_mutex);
5486                 return -EBUSY;
5487         }
5488
5489         fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
5490         if (fd >= 0)
5491                 static_key_slow_inc(&bpf_stats_enabled_key.key);
5492
5493         mutex_unlock(&bpf_stats_enabled_mutex);
5494         return fd;
5495 }
5496
5497 #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
5498
5499 static int bpf_enable_stats(union bpf_attr *attr)
5500 {
5501
5502         if (CHECK_ATTR(BPF_ENABLE_STATS))
5503                 return -EINVAL;
5504
5505         if (!capable(CAP_SYS_ADMIN))
5506                 return -EPERM;
5507
5508         switch (attr->enable_stats.type) {
5509         case BPF_STATS_RUN_TIME:
5510                 return bpf_enable_runtime_stats();
5511         default:
5512                 break;
5513         }
5514         return -EINVAL;
5515 }
5516
5517 #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
5518
5519 static int bpf_iter_create(union bpf_attr *attr)
5520 {
5521         struct bpf_link *link;
5522         int err;
5523
5524         if (CHECK_ATTR(BPF_ITER_CREATE))
5525                 return -EINVAL;
5526
5527         if (attr->iter_create.flags)
5528                 return -EINVAL;
5529
5530         link = bpf_link_get_from_fd(attr->iter_create.link_fd);
5531         if (IS_ERR(link))
5532                 return PTR_ERR(link);
5533
5534         err = bpf_iter_new_fd(link);
5535         bpf_link_put_direct(link);
5536
5537         return err;
5538 }
5539
5540 #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
5541
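/* BPF_PROG_BIND_MAP: explicitly add a map to a program's used_maps so the
 * map stays alive as long as the program does, even though the program's
 * instructions never reference it (e.g. metadata maps created by loaders).
 */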
5542 static int bpf_prog_bind_map(union bpf_attr *attr)
5543 {
5544         struct bpf_prog *prog;
5545         struct bpf_map *map;
5546         struct bpf_map **used_maps_old, **used_maps_new;
5547         int i, ret = 0;
5548
5549         if (CHECK_ATTR(BPF_PROG_BIND_MAP))
5550                 return -EINVAL;
5551
5552         if (attr->prog_bind_map.flags)
5553                 return -EINVAL;
5554
5555         prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
5556         if (IS_ERR(prog))
5557                 return PTR_ERR(prog);
5558
5559         map = bpf_map_get(attr->prog_bind_map.map_fd);
5560         if (IS_ERR(map)) {
5561                 ret = PTR_ERR(map);
5562                 goto out_prog_put;
5563         }
5564
5565         mutex_lock(&prog->aux->used_maps_mutex);
5566
5567         used_maps_old = prog->aux->used_maps;
5568
5569         for (i = 0; i < prog->aux->used_map_cnt; i++)
5570                 if (used_maps_old[i] == map) {
5571                         bpf_map_put(map);
5572                         goto out_unlock;
5573                 }
5574
5575         used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
5576                                       sizeof(used_maps_new[0]),
5577                                       GFP_KERNEL);
5578         if (!used_maps_new) {
5579                 ret = -ENOMEM;
5580                 goto out_unlock;
5581         }
5582
5583         /* The bpf program will not access the bpf map, but for the sake of
5584          * simplicity, increase sleepable_refcnt for sleepable programs as well.
5585          */
5586         if (prog->sleepable)
5587                 atomic64_inc(&map->sleepable_refcnt);
5588         memcpy(used_maps_new, used_maps_old,
5589                sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
5590         used_maps_new[prog->aux->used_map_cnt] = map;
5591
5592         prog->aux->used_map_cnt++;
5593         prog->aux->used_maps = used_maps_new;
5594
5595         kfree(used_maps_old);
5596
5597 out_unlock:
5598         mutex_unlock(&prog->aux->used_maps_mutex);
5599
5600         if (ret)
5601                 bpf_map_put(map);
5602 out_prog_put:
5603         bpf_prog_put(prog);
5604         return ret;
5605 }
5606
5607 #define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
5608
5609 static int token_create(union bpf_attr *attr)
5610 {
5611         if (CHECK_ATTR(BPF_TOKEN_CREATE))
5612                 return -EINVAL;
5613
5614         /* no flags are supported yet */
5615         if (attr->token_create.flags)
5616                 return -EINVAL;
5617
5618         return bpf_token_create(attr);
5619 }
5620
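/* Main bpf(2) dispatcher: validates and copies the (possibly shorter)
 * bpf_attr from the caller, consults the LSM via security_bpf(), and hands
 * the attributes to the handler for the requested command.
 */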
5621 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
5622 {
5623         union bpf_attr attr;
5624         int err;
5625
5626         err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
5627         if (err)
5628                 return err;
5629         size = min_t(u32, size, sizeof(attr));
5630
5631         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
5632         memset(&attr, 0, sizeof(attr));
5633         if (copy_from_bpfptr(&attr, uattr, size) != 0)
5634                 return -EFAULT;
5635
5636         err = security_bpf(cmd, &attr, size);
5637         if (err < 0)
5638                 return err;
5639
5640         switch (cmd) {
5641         case BPF_MAP_CREATE:
5642                 err = map_create(&attr);
5643                 break;
5644         case BPF_MAP_LOOKUP_ELEM:
5645                 err = map_lookup_elem(&attr);
5646                 break;
5647         case BPF_MAP_UPDATE_ELEM:
5648                 err = map_update_elem(&attr, uattr);
5649                 break;
5650         case BPF_MAP_DELETE_ELEM:
5651                 err = map_delete_elem(&attr, uattr);
5652                 break;
5653         case BPF_MAP_GET_NEXT_KEY:
5654                 err = map_get_next_key(&attr);
5655                 break;
5656         case BPF_MAP_FREEZE:
5657                 err = map_freeze(&attr);
5658                 break;
5659         case BPF_PROG_LOAD:
5660                 err = bpf_prog_load(&attr, uattr, size);
5661                 break;
5662         case BPF_OBJ_PIN:
5663                 err = bpf_obj_pin(&attr);
5664                 break;
5665         case BPF_OBJ_GET:
5666                 err = bpf_obj_get(&attr);
5667                 break;
5668         case BPF_PROG_ATTACH:
5669                 err = bpf_prog_attach(&attr);
5670                 break;
5671         case BPF_PROG_DETACH:
5672                 err = bpf_prog_detach(&attr);
5673                 break;
5674         case BPF_PROG_QUERY:
5675                 err = bpf_prog_query(&attr, uattr.user);
5676                 break;
5677         case BPF_PROG_TEST_RUN:
5678                 err = bpf_prog_test_run(&attr, uattr.user);
5679                 break;
5680         case BPF_PROG_GET_NEXT_ID:
5681                 err = bpf_obj_get_next_id(&attr, uattr.user,
5682                                           &prog_idr, &prog_idr_lock);
5683                 break;
5684         case BPF_MAP_GET_NEXT_ID:
5685                 err = bpf_obj_get_next_id(&attr, uattr.user,
5686                                           &map_idr, &map_idr_lock);
5687                 break;
5688         case BPF_BTF_GET_NEXT_ID:
5689                 err = bpf_obj_get_next_id(&attr, uattr.user,
5690                                           &btf_idr, &btf_idr_lock);
5691                 break;
5692         case BPF_PROG_GET_FD_BY_ID:
5693                 err = bpf_prog_get_fd_by_id(&attr);
5694                 break;
5695         case BPF_MAP_GET_FD_BY_ID:
5696                 err = bpf_map_get_fd_by_id(&attr);
5697                 break;
5698         case BPF_OBJ_GET_INFO_BY_FD:
5699                 err = bpf_obj_get_info_by_fd(&attr, uattr.user);
5700                 break;
5701         case BPF_RAW_TRACEPOINT_OPEN:
5702                 err = bpf_raw_tracepoint_open(&attr);
5703                 break;
5704         case BPF_BTF_LOAD:
5705                 err = bpf_btf_load(&attr, uattr, size);
5706                 break;
5707         case BPF_BTF_GET_FD_BY_ID:
5708                 err = bpf_btf_get_fd_by_id(&attr);
5709                 break;
5710         case BPF_TASK_FD_QUERY:
5711                 err = bpf_task_fd_query(&attr, uattr.user);
5712                 break;
5713         case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
5714                 err = map_lookup_and_delete_elem(&attr);
5715                 break;
5716         case BPF_MAP_LOOKUP_BATCH:
5717                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
5718                 break;
5719         case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
5720                 err = bpf_map_do_batch(&attr, uattr.user,
5721                                        BPF_MAP_LOOKUP_AND_DELETE_BATCH);
5722                 break;
5723         case BPF_MAP_UPDATE_BATCH:
5724                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
5725                 break;
5726         case BPF_MAP_DELETE_BATCH:
5727                 err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
5728                 break;
5729         case BPF_LINK_CREATE:
5730                 err = link_create(&attr, uattr);
5731                 break;
5732         case BPF_LINK_UPDATE:
5733                 err = link_update(&attr);
5734                 break;
5735         case BPF_LINK_GET_FD_BY_ID:
5736                 err = bpf_link_get_fd_by_id(&attr);
5737                 break;
5738         case BPF_LINK_GET_NEXT_ID:
5739                 err = bpf_obj_get_next_id(&attr, uattr.user,
5740                                           &link_idr, &link_idr_lock);
5741                 break;
5742         case BPF_ENABLE_STATS:
5743                 err = bpf_enable_stats(&attr);
5744                 break;
5745         case BPF_ITER_CREATE:
5746                 err = bpf_iter_create(&attr);
5747                 break;
5748         case BPF_LINK_DETACH:
5749                 err = link_detach(&attr);
5750                 break;
5751         case BPF_PROG_BIND_MAP:
5752                 err = bpf_prog_bind_map(&attr);
5753                 break;
5754         case BPF_TOKEN_CREATE:
5755                 err = token_create(&attr);
5756                 break;
5757         default:
5758                 err = -EINVAL;
5759                 break;
5760         }
5761
5762         return err;
5763 }
5764
5765 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
5766 {
5767         return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
5768 }
5769
5770 static bool syscall_prog_is_valid_access(int off, int size,
5771                                          enum bpf_access_type type,
5772                                          const struct bpf_prog *prog,
5773                                          struct bpf_insn_access_aux *info)
5774 {
5775         if (off < 0 || off >= U16_MAX)
5776                 return false;
5777         if (off % size != 0)
5778                 return false;
5779         return true;
5780 }
5781
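/* bpf_sys_bpf() helper: lets BPF_PROG_TYPE_SYSCALL programs re-enter
 * __sys_bpf() with a kernel-memory attr, restricted to the allowlisted
 * commands below (map create/update/delete/freeze, prog/BTF load, link
 * create, raw tracepoint open, ...).
 */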
5782 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
5783 {
5784         switch (cmd) {
5785         case BPF_MAP_CREATE:
5786         case BPF_MAP_DELETE_ELEM:
5787         case BPF_MAP_UPDATE_ELEM:
5788         case BPF_MAP_FREEZE:
5789         case BPF_MAP_GET_FD_BY_ID:
5790         case BPF_PROG_LOAD:
5791         case BPF_BTF_LOAD:
5792         case BPF_LINK_CREATE:
5793         case BPF_RAW_TRACEPOINT_OPEN:
5794                 break;
5795         default:
5796                 return -EINVAL;
5797         }
5798         return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
5799 }
5800
5801
5802 /* To shut up -Wmissing-prototypes.
5803  * This function is used by the kernel light skeleton
5804  * to load bpf programs when modules are loaded or during kernel boot.
5805  * See tools/lib/bpf/skel_internal.h
5806  */
5807 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
5808
5809 int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
5810 {
5811         struct bpf_prog * __maybe_unused prog;
5812         struct bpf_tramp_run_ctx __maybe_unused run_ctx;
5813
5814         switch (cmd) {
5815 #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
5816         case BPF_PROG_TEST_RUN:
5817                 if (attr->test.data_in || attr->test.data_out ||
5818                     attr->test.ctx_out || attr->test.duration ||
5819                     attr->test.repeat || attr->test.flags)
5820                         return -EINVAL;
5821
5822                 prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
5823                 if (IS_ERR(prog))
5824                         return PTR_ERR(prog);
5825
5826                 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
5827                     attr->test.ctx_size_in > U16_MAX) {
5828                         bpf_prog_put(prog);
5829                         return -EINVAL;
5830                 }
5831
5832                 run_ctx.bpf_cookie = 0;
5833                 if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
5834                         /* recursion detected */
5835                         __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
5836                         bpf_prog_put(prog);
5837                         return -EBUSY;
5838                 }
5839                 attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
5840                 __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
5841                                                 &run_ctx);
5842                 bpf_prog_put(prog);
5843                 return 0;
5844 #endif
5845         default:
5846                 return ____bpf_sys_bpf(cmd, attr, size);
5847         }
5848 }
5849 EXPORT_SYMBOL(kern_sys_bpf);
5850
5851 static const struct bpf_func_proto bpf_sys_bpf_proto = {
5852         .func           = bpf_sys_bpf,
5853         .gpl_only       = false,
5854         .ret_type       = RET_INTEGER,
5855         .arg1_type      = ARG_ANYTHING,
5856         .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
5857         .arg3_type      = ARG_CONST_SIZE,
5858 };
5859
5860 const struct bpf_func_proto * __weak
5861 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5862 {
5863         return bpf_base_func_proto(func_id, prog);
5864 }
5865
5866 BPF_CALL_1(bpf_sys_close, u32, fd)
5867 {
5868         /* When a bpf program calls this helper, there must not be
5869          * an outstanding fdget() without a matching completed fdput().
5870          * This helper is only allowed in the following callchain:
5871          * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
5872          */
5873         return close_fd(fd);
5874 }
5875
5876 static const struct bpf_func_proto bpf_sys_close_proto = {
5877         .func           = bpf_sys_close,
5878         .gpl_only       = false,
5879         .ret_type       = RET_INTEGER,
5880         .arg1_type      = ARG_ANYTHING,
5881 };
5882
5883 BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
5884 {
5885         if (flags)
5886                 return -EINVAL;
5887
5888         if (name_sz <= 1 || name[name_sz - 1])
5889                 return -EINVAL;
5890
5891         if (!bpf_dump_raw_ok(current_cred()))
5892                 return -EPERM;
5893
5894         *res = kallsyms_lookup_name(name);
5895         return *res ? 0 : -ENOENT;
5896 }
5897
5898 static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
5899         .func           = bpf_kallsyms_lookup_name,
5900         .gpl_only       = false,
5901         .ret_type       = RET_INTEGER,
5902         .arg1_type      = ARG_PTR_TO_MEM,
5903         .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
5904         .arg3_type      = ARG_ANYTHING,
5905         .arg4_type      = ARG_PTR_TO_LONG,
5906 };
5907
5908 static const struct bpf_func_proto *
5909 syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5910 {
5911         switch (func_id) {
5912         case BPF_FUNC_sys_bpf:
5913                 return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
5914                        ? NULL : &bpf_sys_bpf_proto;
5915         case BPF_FUNC_btf_find_by_name_kind:
5916                 return &bpf_btf_find_by_name_kind_proto;
5917         case BPF_FUNC_sys_close:
5918                 return &bpf_sys_close_proto;
5919         case BPF_FUNC_kallsyms_lookup_name:
5920                 return &bpf_kallsyms_lookup_name_proto;
5921         default:
5922                 return tracing_prog_func_proto(func_id, prog);
5923         }
5924 }
5925
5926 const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
5927         .get_func_proto  = syscall_prog_func_proto,
5928         .is_valid_access = syscall_prog_is_valid_access,
5929 };
5930
5931 const struct bpf_prog_ops bpf_syscall_prog_ops = {
5932         .test_run = bpf_prog_test_run_syscall,
5933 };
5934
5935 #ifdef CONFIG_SYSCTL
5936 static int bpf_stats_handler(struct ctl_table *table, int write,
5937                              void *buffer, size_t *lenp, loff_t *ppos)
5938 {
5939         struct static_key *key = (struct static_key *)table->data;
5940         static int saved_val;
5941         int val, ret;
5942         struct ctl_table tmp = {
5943                 .data   = &val,
5944                 .maxlen = sizeof(val),
5945                 .mode   = table->mode,
5946                 .extra1 = SYSCTL_ZERO,
5947                 .extra2 = SYSCTL_ONE,
5948         };
5949
5950         if (write && !capable(CAP_SYS_ADMIN))
5951                 return -EPERM;
5952
5953         mutex_lock(&bpf_stats_enabled_mutex);
5954         val = saved_val;
5955         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5956         if (write && !ret && val != saved_val) {
5957                 if (val)
5958                         static_key_slow_inc(key);
5959                 else
5960                         static_key_slow_dec(key);
5961                 saved_val = val;
5962         }
5963         mutex_unlock(&bpf_stats_enabled_mutex);
5964         return ret;
5965 }
5966
5967 void __weak unpriv_ebpf_notify(int new_state)
5968 {
5969 }
5970
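/* Sysctl handler for kernel.unprivileged_bpf_disabled.  Setting it to 1 is a
 * one-way switch: once unprivileged bpf() is permanently disabled the value
 * can no longer be changed, while 0 and 2 may still be rewritten.
 */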
5971 static int bpf_unpriv_handler(struct ctl_table *table, int write,
5972                               void *buffer, size_t *lenp, loff_t *ppos)
5973 {
5974         int ret, unpriv_enable = *(int *)table->data;
5975         bool locked_state = unpriv_enable == 1;
5976         struct ctl_table tmp = *table;
5977
5978         if (write && !capable(CAP_SYS_ADMIN))
5979                 return -EPERM;
5980
5981         tmp.data = &unpriv_enable;
5982         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5983         if (write && !ret) {
5984                 if (locked_state && unpriv_enable != 1)
5985                         return -EPERM;
5986                 *(int *)table->data = unpriv_enable;
5987         }
5988
5989         if (write)
5990                 unpriv_ebpf_notify(unpriv_enable);
5991
5992         return ret;
5993 }
5994
5995 static struct ctl_table bpf_syscall_table[] = {
5996         {
5997                 .procname       = "unprivileged_bpf_disabled",
5998                 .data           = &sysctl_unprivileged_bpf_disabled,
5999                 .maxlen         = sizeof(sysctl_unprivileged_bpf_disabled),
6000                 .mode           = 0644,
6001                 .proc_handler   = bpf_unpriv_handler,
6002                 .extra1         = SYSCTL_ZERO,
6003                 .extra2         = SYSCTL_TWO,
6004         },
6005         {
6006                 .procname       = "bpf_stats_enabled",
6007                 .data           = &bpf_stats_enabled_key.key,
6008                 .mode           = 0644,
6009                 .proc_handler   = bpf_stats_handler,
6010         },
6011         { }
6012 };
6013
6014 static int __init bpf_syscall_sysctl_init(void)
6015 {
6016         register_sysctl_init("kernel", bpf_syscall_table);
6017         return 0;
6018 }
6019 late_initcall(bpf_syscall_sysctl_init);
6020 #endif /* CONFIG_SYSCTL */