// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
static const char *__doc__ =
	" XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\"";
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <locale.h>
#include <sys/resource.h>
#include <sys/sysinfo.h>
#include <getopt.h>
#include <net/if.h>
#include <time.h>
#include <linux/limits.h>

#include <linux/err.h>
#include <arpa/inet.h>
#include <linux/if_link.h>
/* How many xdp_progs are defined in _kern.c */
#define MAX_PROG 6

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

#include "bpf_util.h"

static int ifindex = -1;
static char ifname_buf[IF_NAMESIZE];
static char *ifname;
static __u32 prog_id;

static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
static int n_cpus;
static int cpu_map_fd;
static int rx_cnt_map_fd;
static int redirect_err_cnt_map_fd;
static int cpumap_enqueue_cnt_map_fd;
static int cpumap_kthread_cnt_map_fd;
static int cpus_available_map_fd;
static int cpus_count_map_fd;
static int cpus_iterator_map_fd;
static int exception_cnt_map_fd;
#define NUM_TP 5
struct bpf_link *tp_links[NUM_TP] = { 0 };
static int tp_cnt = 0;

/* Exit return codes */
#define EXIT_OK			0
#define EXIT_FAIL		1
#define EXIT_FAIL_OPTION	2
#define EXIT_FAIL_XDP		3
#define EXIT_FAIL_BPF		4
#define EXIT_FAIL_MEM		5
static const struct option long_options[] = {
	{"help",	no_argument,		NULL, 'h' },
	{"dev",		required_argument,	NULL, 'd' },
	{"skb-mode",	no_argument,		NULL, 'S' },
	{"sec",		required_argument,	NULL, 's' },
	{"progname",	required_argument,	NULL, 'p' },
	{"qsize",	required_argument,	NULL, 'q' },
	{"cpu",		required_argument,	NULL, 'c' },
	{"stress-mode",	no_argument,		NULL, 'x' },
	{"no-separators", no_argument,		NULL, 'z' },
	{"force",	no_argument,		NULL, 'F' },
	{0, 0, NULL, 0 }
};
static void int_exit(int sig)
{
	__u32 curr_prog_id = 0;

	if (ifindex > -1) {
		if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
			printf("bpf_get_link_xdp_id failed\n");
			exit(EXIT_FAIL);
		}
		if (prog_id == curr_prog_id) {
			fprintf(stderr,
				"Interrupted: Removing XDP program on ifindex:%d device:%s\n",
				ifindex, ifname);
			bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
		} else if (!curr_prog_id) {
			printf("couldn't find a prog id on a given iface\n");
		} else {
			printf("program on interface changed, not removing\n");
		}
	}
	/* Detach tracepoints */
	while (tp_cnt)
		bpf_link__destroy(tp_links[--tp_cnt]);

	exit(EXIT_OK);
}
static void print_avail_progs(struct bpf_object *obj)
{
	struct bpf_program *pos;

	bpf_object__for_each_program(pos, obj) {
		if (bpf_program__is_xdp(pos))
			printf(" %s\n", bpf_program__title(pos, false));
	}
}
static void usage(char *argv[], struct bpf_object *obj)
{
	int i;

	printf("\nDOCUMENTATION:\n%s\n", __doc__);
	printf(" Usage: %s (options-see-below)\n", argv[0]);
	printf(" Listing options:\n");
	for (i = 0; long_options[i].name != 0; i++) {
		printf(" --%-12s", long_options[i].name);
		if (long_options[i].flag != NULL)
			printf(" flag (internal value:%d)",
			       *long_options[i].flag);
		else
			printf(" short-option: -%c",
			       long_options[i].val);
		printf("\n");
	}
	printf("\n Programs to be used for --progname:\n");
	print_avail_progs(obj);
	printf("\n");
}
/* gettime returns the current time of day in nanoseconds.
 * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
 *       clock_gettime (ns) =>  9ns (CLOCK_MONOTONIC_COARSE)
 */
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
static __u64 gettime(void)
{
	struct timespec t;
	int res;

	res = clock_gettime(CLOCK_MONOTONIC, &t);
	if (res < 0) {
		fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
		exit(EXIT_FAIL);
	}
	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
}
/* Common stats data record shared with _kern.c */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};
struct record {
	__u64 timestamp;
	struct datarec total;
	struct datarec *cpu;
};
struct stats_record {
	struct record rx_cnt;
	struct record redir_err;
	struct record kthread;
	struct record exception;
	struct record enq[];
};
static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
{
	/* For percpu maps, userspace gets a value per possible CPU */
	unsigned int nr_cpus = bpf_num_possible_cpus();
	struct datarec values[nr_cpus];
	__u64 sum_processed = 0;
	__u64 sum_dropped = 0;
	__u64 sum_issue = 0;
	int i;

	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
		fprintf(stderr,
			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
		return false;
	}
	/* Get time as close as possible to reading map contents */
	rec->timestamp = gettime();

	/* Record and sum values from each CPU */
	for (i = 0; i < nr_cpus; i++) {
		rec->cpu[i].processed = values[i].processed;
		sum_processed        += values[i].processed;
		rec->cpu[i].dropped   = values[i].dropped;
		sum_dropped          += values[i].dropped;
		rec->cpu[i].issue     = values[i].issue;
		sum_issue            += values[i].issue;
	}
	rec->total.processed = sum_processed;
	rec->total.dropped   = sum_dropped;
	rec->total.issue     = sum_issue;
	return true;
}
static struct datarec *alloc_record_per_cpu(void)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	struct datarec *array;

	array = calloc(nr_cpus, sizeof(struct datarec));
	if (!array) {
		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
		exit(EXIT_FAIL_MEM);
	}
	return array;
}
static struct stats_record *alloc_stats_record(void)
{
	struct stats_record *rec;
	int i, size;

	/* The flexible enq[] array holds one record per possible destination
	 * CPU, hence the n_cpus scaling of the allocation size.
	 */
	size = sizeof(*rec) + n_cpus * sizeof(struct record);
	rec = malloc(size);
	if (!rec) {
		fprintf(stderr, "Mem alloc error\n");
		exit(EXIT_FAIL_MEM);
	}
	memset(rec, 0, size);
	rec->rx_cnt.cpu    = alloc_record_per_cpu();
	rec->redir_err.cpu = alloc_record_per_cpu();
	rec->kthread.cpu   = alloc_record_per_cpu();
	rec->exception.cpu = alloc_record_per_cpu();
	for (i = 0; i < n_cpus; i++)
		rec->enq[i].cpu = alloc_record_per_cpu();

	return rec;
}
static void free_stats_record(struct stats_record *r)
{
	int i;

	for (i = 0; i < n_cpus; i++)
		free(r->enq[i].cpu);
	free(r->exception.cpu);
	free(r->kthread.cpu);
	free(r->redir_err.cpu);
	free(r->rx_cnt.cpu);
	free(r);
}
static double calc_period(struct record *r, struct record *p)
{
	double period_ = 0;
	__u64 period = 0;

	period = r->timestamp - p->timestamp;
	if (period > 0)
		period_ = ((double) period / NANOSEC_PER_SEC);

	return period_;
}
static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
{
	__u64 packets = 0;
	__u64 pps = 0;

	if (period_ > 0) {
		packets = r->processed - p->processed;
		pps = packets / period_;
	}
	return pps;
}

static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
{
	__u64 packets = 0;
	__u64 pps = 0;

	if (period_ > 0) {
		packets = r->dropped - p->dropped;
		pps = packets / period_;
	}
	return pps;
}

static __u64 calc_errs_pps(struct datarec *r,
			   struct datarec *p, double period_)
{
	__u64 packets = 0;
	__u64 pps = 0;

	if (period_ > 0) {
		packets = r->issue - p->issue;
		pps = packets / period_;
	}
	return pps;
}
static void stats_print(struct stats_record *stats_rec,
			struct stats_record *stats_prev,
			char *prog_name)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	double pps = 0, drop = 0, err = 0;
	struct record *rec, *prev;
	int to_cpu;
	double t;
	int i;

	/* Header */
	printf("Running XDP/eBPF prog_name:%s\n", prog_name);
	printf("%-15s %-7s %-14s %-11s %-9s\n",
	       "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
	/* XDP rx_cnt */
	{
		char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
		char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
		char *errstr = "";

		rec  = &stats_rec->rx_cnt;
		prev = &stats_prev->rx_cnt;
		t = calc_period(rec, prev);
		for (i = 0; i < nr_cpus; i++) {
			struct datarec *r = &rec->cpu[i];
			struct datarec *p = &prev->cpu[i];

			pps  = calc_pps(r, p, t);
			drop = calc_drop_pps(r, p, t);
			err  = calc_errs_pps(r, p, t);
			if (err > 0)
				errstr = "cpu-dest/err";
			if (pps > 0)
				printf(fmt_rx, "XDP-RX",
				       i, pps, drop, err, errstr);
		}
		pps  = calc_pps(&rec->total, &prev->total, t);
		drop = calc_drop_pps(&rec->total, &prev->total, t);
		err  = calc_errs_pps(&rec->total, &prev->total, t);
		printf(fm2_rx, "XDP-RX", "total", pps, drop);
	}
	/* cpumap enqueue stats */
	for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) {
		char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
		char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
		char *errstr = "";

		rec  = &stats_rec->enq[to_cpu];
		prev = &stats_prev->enq[to_cpu];
		t = calc_period(rec, prev);
		for (i = 0; i < nr_cpus; i++) {
			struct datarec *r = &rec->cpu[i];
			struct datarec *p = &prev->cpu[i];

			pps  = calc_pps(r, p, t);
			drop = calc_drop_pps(r, p, t);
			err  = calc_errs_pps(r, p, t);
			if (err > 0) {
				errstr = "bulk-average";
				err = pps / err; /* calc average bulk size */
			}
			if (pps > 0)
				printf(fmt, "cpumap-enqueue",
				       i, to_cpu, pps, drop, err, errstr);
		}
		pps = calc_pps(&rec->total, &prev->total, t);
		if (pps > 0) {
			drop = calc_drop_pps(&rec->total, &prev->total, t);
			err  = calc_errs_pps(&rec->total, &prev->total, t);
			if (err > 0) {
				errstr = "bulk-average";
				err = pps / err; /* calc average bulk size */
			}
			printf(fm2, "cpumap-enqueue",
			       "sum", to_cpu, pps, drop, err, errstr);
		}
	}
	/* cpumap kthread stats */
	{
		char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
		char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
		char *e_str = "";

		rec  = &stats_rec->kthread;
		prev = &stats_prev->kthread;
		t = calc_period(rec, prev);
		for (i = 0; i < nr_cpus; i++) {
			struct datarec *r = &rec->cpu[i];
			struct datarec *p = &prev->cpu[i];

			pps  = calc_pps(r, p, t);
			drop = calc_drop_pps(r, p, t);
			err  = calc_errs_pps(r, p, t);
			if (err > 0)
				e_str = "sched";
			if (pps > 0)
				printf(fmt_k, "cpumap_kthread",
				       i, pps, drop, err, e_str);
		}
		pps  = calc_pps(&rec->total, &prev->total, t);
		drop = calc_drop_pps(&rec->total, &prev->total, t);
		err  = calc_errs_pps(&rec->total, &prev->total, t);
		if (err > 0)
			e_str = "sched-sum";
		printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str);
	}
	/* XDP redirect err tracepoints (very unlikely) */
	{
		char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
		char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";

		rec  = &stats_rec->redir_err;
		prev = &stats_prev->redir_err;
		t = calc_period(rec, prev);
		for (i = 0; i < nr_cpus; i++) {
			struct datarec *r = &rec->cpu[i];
			struct datarec *p = &prev->cpu[i];

			pps  = calc_pps(r, p, t);
			drop = calc_drop_pps(r, p, t);
			if (pps > 0)
				printf(fmt_err, "redirect_err", i, pps, drop);
		}
		pps  = calc_pps(&rec->total, &prev->total, t);
		drop = calc_drop_pps(&rec->total, &prev->total, t);
		printf(fm2_err, "redirect_err", "total", pps, drop);
	}
	/* XDP general exception tracepoints */
	{
		char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
		char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";

		rec  = &stats_rec->exception;
		prev = &stats_prev->exception;
		t = calc_period(rec, prev);
		for (i = 0; i < nr_cpus; i++) {
			struct datarec *r = &rec->cpu[i];
			struct datarec *p = &prev->cpu[i];

			pps  = calc_pps(r, p, t);
			drop = calc_drop_pps(r, p, t);
			if (pps > 0)
				printf(fmt_err, "xdp_exception", i, pps, drop);
		}
		pps  = calc_pps(&rec->total, &prev->total, t);
		drop = calc_drop_pps(&rec->total, &prev->total, t);
		printf(fm2_err, "xdp_exception", "total", pps, drop);
	}

	printf("\n");
	fflush(stdout);
}
static void stats_collect(struct stats_record *rec)
{
	int fd, i;

	fd = rx_cnt_map_fd;
	map_collect_percpu(fd, 0, &rec->rx_cnt);

	fd = redirect_err_cnt_map_fd;
	map_collect_percpu(fd, 1, &rec->redir_err);

	fd = cpumap_enqueue_cnt_map_fd;
	for (i = 0; i < n_cpus; i++)
		map_collect_percpu(fd, i, &rec->enq[i]);

	fd = cpumap_kthread_cnt_map_fd;
	map_collect_percpu(fd, 0, &rec->kthread);

	fd = exception_cnt_map_fd;
	map_collect_percpu(fd, 0, &rec->exception);
}
/* Pointer swap trick */
static inline void swap(struct stats_record **a, struct stats_record **b)
{
	struct stats_record *tmp;

	tmp = *a;
	*a = *b;
	*b = tmp;
}
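/* Configure one CPU as a packet-processing target: write its queue size into
 * the cpumap itself, then publish the CPU number via the cpus_available and
 * cpus_count control maps so the XDP program can start selecting it.
 */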
static int create_cpu_entry(__u32 cpu, __u32 queue_size,
			    __u32 avail_idx, bool new)
{
	__u32 curr_cpus_count = 0;
	__u32 key = 0;
	int ret;

	/* Add a CPU entry to cpumap, as this allocates an entry in
	 * the kernel for the CPU.
	 */
	ret = bpf_map_update_elem(cpu_map_fd, &cpu, &queue_size, 0);
	if (ret) {
		fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
		exit(EXIT_FAIL_BPF);
	}

	/* Inform bpf_prog's that a new CPU is available to select
	 * from via some control maps.
	 */
	ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0);
	if (ret) {
		fprintf(stderr, "Add to avail CPUs failed\n");
		exit(EXIT_FAIL_BPF);
	}

	/* When not replacing/updating existing entry, bump the count */
	ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count);
	if (ret) {
		fprintf(stderr, "Failed reading curr cpus_count\n");
		exit(EXIT_FAIL_BPF);
	}
	if (new) {
		curr_cpus_count++;
		ret = bpf_map_update_elem(cpus_count_map_fd, &key,
					  &curr_cpus_count, 0);
		if (ret) {
			fprintf(stderr, "Failed write curr cpus_count\n");
			exit(EXIT_FAIL_BPF);
		}
	}
	/* map_fd[7] = cpus_iterator */
	printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n",
	       new ? "Add-new" : "Replace", cpu, avail_idx,
	       queue_size, curr_cpus_count);

	return 0;
}
/* CPUs are zero-indexed. Thus, add a special sentinel default value
 * in map cpus_available to mark CPU indexes that are not configured.
 */
static void mark_cpus_unavailable(void)
{
	__u32 invalid_cpu = n_cpus;
	int ret, i;

	for (i = 0; i < n_cpus; i++) {
		ret = bpf_map_update_elem(cpus_available_map_fd, &i,
					  &invalid_cpu, 0);
		if (ret) {
			fprintf(stderr, "Failed marking CPU unavailable\n");
			exit(EXIT_FAIL_BPF);
		}
	}
}
/* Stress cpumap management code by concurrently changing underlying cpumap */
static void stress_cpumap(void)
{
	/* Changing qsize will cause kernel to free and alloc a new
	 * bpf_cpu_map_entry, with an associated/complicated tear-down
	 * procedure.
	 */
	create_cpu_entry(1, 1024, 0, false);
	create_cpu_entry(1, 8,    0, false);
	create_cpu_entry(1, 16000, 0, false);
}
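/* Statistics loop: keep two snapshots, swap them every interval, re-read the
 * BPF maps into the current one, and print per-second rates derived from the
 * difference. Runs until the process is interrupted.
 */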
static void stats_poll(int interval, bool use_separators, char *prog_name,
		       bool stress_mode)
{
	struct stats_record *record, *prev;

	record = alloc_stats_record();
	prev   = alloc_stats_record();
	stats_collect(record);

	/* Trick to pretty-print with thousands separators: use %' */
	if (use_separators)
		setlocale(LC_NUMERIC, "en_US");

	while (1) {
		swap(&prev, &record);
		stats_collect(record);
		stats_print(record, prev, prog_name);
		sleep(interval);
		if (stress_mode)
			stress_cpumap();
	}

	free_stats_record(record);
	free_stats_record(prev);
}
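/* Attach one tracepoint program from the loaded object. The program is looked
 * up by its ELF section name, which follows the "tracepoint/<category>/<name>"
 * convention used by the matching _kern.c file.
 */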
static struct bpf_link *attach_tp(struct bpf_object *obj,
				  const char *tp_category,
				  const char *tp_name)
{
	struct bpf_program *prog;
	struct bpf_link *link;
	char sec_name[PATH_MAX];
	int len;

	len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s",
		       tp_category, tp_name);
	if (len < 0)
		exit(EXIT_FAIL);

	prog = bpf_object__find_program_by_title(obj, sec_name);
	if (!prog) {
		fprintf(stderr, "ERR: finding progsec: %s\n", sec_name);
		exit(EXIT_FAIL_BPF);
	}

	link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
	if (IS_ERR(link))
		exit(EXIT_FAIL_BPF);

	return link;
}
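/* The five tracepoints below populate the *_cnt maps that stats_collect()
 * reads; each one needs a matching SEC("tracepoint/xdp/...") program in the
 * _kern.o object, otherwise attach_tp() exits with EXIT_FAIL_BPF.
 */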
static void init_tracepoints(struct bpf_object *obj)
{
	tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err");
	tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err");
	tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception");
	tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue");
	tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread");
}
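/* Resolve every map FD by name. Any miss means the _kern.o object does not
 * match this user-space loader, and the caller treats it as fatal.
 */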
static int init_map_fds(struct bpf_object *obj)
{
	/* Maps updated by tracepoints */
	redirect_err_cnt_map_fd =
		bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt");
	exception_cnt_map_fd =
		bpf_object__find_map_fd_by_name(obj, "exception_cnt");
	cpumap_enqueue_cnt_map_fd =
		bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt");
	cpumap_kthread_cnt_map_fd =
		bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt");

	/* Maps used by XDP */
	rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt");
	cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map");
	cpus_available_map_fd =
		bpf_object__find_map_fd_by_name(obj, "cpus_available");
	cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count");
	cpus_iterator_map_fd =
		bpf_object__find_map_fd_by_name(obj, "cpus_iterator");

	if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 ||
	    redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 ||
	    cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 ||
	    cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 ||
	    exception_cnt_map_fd < 0)
		return -ENOENT;

	return 0;
}
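/* Overall flow: load <prog>_kern.o, attach the stats tracepoints, resolve the
 * map FDs, parse options (each --cpu populates the cpumap), attach the chosen
 * XDP program to the device, and then poll/print statistics until interrupted.
 */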
int main(int argc, char **argv)
{
	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
	char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs";
	struct bpf_prog_load_attr prog_load_attr = {
		.prog_type	= BPF_PROG_TYPE_UNSPEC,
	};
	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	bool use_separators = true;
	bool stress_mode = false;
	struct bpf_program *prog;
	struct bpf_object *obj;
	char filename[256];
	int added_cpus = 0;
	int longindex = 0;
	int interval = 2;
	int add_cpu = -1;
	int opt, err;
	int prog_fd;
	__u32 qsize;

	n_cpus = get_nprocs_conf();
	/* Notice: choosing the queue size is very important with the
	 * ixgbe driver, because its page recycling trick depends on
	 * pages being returned quickly. The number of outstanding
	 * packets in the system must be less than 2x the RX-ring size.
	 */
	qsize = 128+64;

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	prog_load_attr.file = filename;

	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
		perror("setrlimit(RLIMIT_MEMLOCK)");
		return 1;
	}

	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
		return EXIT_FAIL;

	if (prog_fd < 0) {
		fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
			strerror(errno));
		return EXIT_FAIL;
	}
	init_tracepoints(obj);
	if (init_map_fds(obj) < 0) {
		fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n");
		return EXIT_FAIL;
	}
	mark_cpus_unavailable();
	/* Parse command line args */
	while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzF",
				  long_options, &longindex)) != -1) {
		switch (opt) {
		case 'd':
			if (strlen(optarg) >= IF_NAMESIZE) {
				fprintf(stderr, "ERR: --dev name too long\n");
				goto error;
			}
			ifname = (char *)&ifname_buf;
			strncpy(ifname, optarg, IF_NAMESIZE);
			ifindex = if_nametoindex(ifname);
			if (ifindex == 0) {
				fprintf(stderr,
					"ERR: --dev name unknown err(%d):%s\n",
					errno, strerror(errno));
				goto error;
			}
			break;
		case 's':
			interval = atoi(optarg);
			break;
		case 'S':
			xdp_flags |= XDP_FLAGS_SKB_MODE;
			break;
		case 'x':
			stress_mode = true;
			break;
		case 'z':
			use_separators = false;
			break;
		case 'p':
			/* Selecting eBPF prog to load */
			prog_name = optarg;
			break;
		case 'c':
			/* Add multiple CPUs */
			add_cpu = strtoul(optarg, NULL, 0);
			if (add_cpu >= n_cpus) {
				fprintf(stderr,
				"--cpu nr too large for cpumap err(%d):%s\n",
					errno, strerror(errno));
				goto error;
			}
			create_cpu_entry(add_cpu, qsize, added_cpus, true);
			added_cpus++;
			break;
		case 'q':
			qsize = atoi(optarg);
			break;
		case 'F':
			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
			break;
		case 'h':
		error:
		default:
			usage(argv, obj);
			return EXIT_FAIL_OPTION;
		}
	}
	/* Required option */
	if (ifindex == -1) {
		fprintf(stderr, "ERR: required option --dev missing\n");
		usage(argv, obj);
		return EXIT_FAIL_OPTION;
	}
	/* Required option */
	if (add_cpu == -1) {
		fprintf(stderr, "ERR: required option --cpu missing\n");
		fprintf(stderr, " Specify multiple --cpu option to add more\n");
		usage(argv, obj);
		return EXIT_FAIL_OPTION;
	}

	/* Remove XDP program when program is interrupted or killed */
	signal(SIGINT, int_exit);
	signal(SIGTERM, int_exit);
	prog = bpf_object__find_program_by_title(obj, prog_name);
	if (!prog) {
		fprintf(stderr, "bpf_object__find_program_by_title failed\n");
		return EXIT_FAIL;
	}

	prog_fd = bpf_program__fd(prog);
	if (prog_fd < 0) {
		fprintf(stderr, "bpf_program__fd failed\n");
		return EXIT_FAIL;
	}

	if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
		fprintf(stderr, "link set xdp fd failed\n");
		return EXIT_FAIL_XDP;
	}

	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
	if (err) {
		printf("can't get prog info - %s\n", strerror(errno));
		return err;
	}
	prog_id = info.id;

	stats_poll(interval, use_separators, prog_name, stress_mode);
	return EXIT_OK;
}