/* SPDX-License-Identifier: GPL-2.0 */

#define _GNU_SOURCE /* Needed by sched_getcpu */

/* NOTE: The exact set of system headers is reconstructed here (the original
 * header names were lost); it covers what the code below uses, but may
 * differ slightly from the upstream file.
 */
#include <assert.h>
#include <errno.h>
#include <getopt.h>
#include <locale.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/resource.h>
#include <sys/socket.h>

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ether.h>
#include <linux/if_link.h>
#include <linux/if_ether.h>
#include <linux/if_xdp.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <bpf/xsk.h>

#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL	69
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET	70
#endif

#include <bpf/btf.h> /* provided by libbpf */

#include "common_params.h"
#include "common_user_bpf_xdp.h"
// #include "common_libbpf.h"

#include "af_xdp_kern_shared.h"

#include "lib_xsk_extend.h"
#include "ethtool_utils.h"
#include "lib_checksum.h"

#define NUM_FRAMES         4096 /* Frames per queue */
#define FRAME_SIZE         XSK_UMEM__DEFAULT_FRAME_SIZE /* 4096 */
#define FRAME_SIZE_MASK    (FRAME_SIZE - 1)
#define RX_BATCH_SIZE      64
#define FQ_REFILL_MAX      (RX_BATCH_SIZE * 2)
#define INVALID_UMEM_FRAME UINT64_MAX

struct mem_frame_allocator {
	uint32_t umem_frame_free;
	uint32_t umem_frame_max;
	uint64_t *umem_frame_addr; /* array */
};

struct xsk_umem_info {
	struct xsk_ring_prod init_fq;
	struct xsk_ring_cons init_cq;
	struct xsk_umem *umem;
	void *buffer;
	struct mem_frame_allocator mem;
};

struct stats_record {
	uint64_t timestamp;
	uint64_t rx_packets;
	uint64_t rx_bytes;
	uint64_t tx_packets;
	uint64_t tx_bytes;
};

struct xsk_socket_info {
	struct xsk_ring_cons rx;
	struct xsk_ring_prod tx;
	struct xsk_umem_info *umem;
	struct xsk_socket *xsk;

	struct xsk_ring_prod fq;
	struct xsk_ring_cons cq;

	uint32_t outstanding_tx;
	int queue_id;

	struct stats_record stats;
	struct stats_record prev_stats;
};

struct xsk_container {
	struct xsk_socket_info *sockets[MAX_AF_SOCKS];
	int num; /* Number of xsk_sockets configured */
};

static void __exit_with_error(int error, const char *file, const char *func,
			      int line)
{
	fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, line,
		error, strerror(error));
	exit(EXIT_FAILURE);
}

#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)

/**
 * BTF setup XDP-hints
 * -------------------
 * Setup the data structures for accessing the XDP-hints provided by the
 * kernel-side BPF-prog, via decoding the BTF-info provided in the BPF
 * ELF-object file.
 */
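/*
 * The BTF-ID based dispatch used further below (see print_meta_info_via_btf()
 * and xsk_umem__btf_id() from lib_xsk_extend.h) relies on a convention where
 * the kernel-side BPF-prog stores the BTF full-id of the metadata struct as
 * the *last* __u32 of the XDP metadata area, i.e. immediately in front of the
 * packet payload. A minimal sketch of reading that id, assuming this layout
 * (illustration only; the program itself uses xsk_umem__btf_id()):
 */
static inline __u32 __attribute__((unused))
example_peek_meta_btf_id(const uint8_t *pkt)
{
	/* Metadata grows "backwards" from the start of packet data, so the
	 * trailing btf_id member sits at (pkt - sizeof(__u32)). */
	return *(const __u32 *)(pkt - sizeof(__u32));
}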
/* This struct mirrors, via BTF, the kernel-side struct xdp_hints_rx_time */
struct xdp_hints_rx_time {
	__u32 btf_type_id; /* cached xsk_btf__btf_type_id(xbi) */
	struct xsk_btf_info *xbi;
	struct xsk_btf_member rx_ktime;
	struct xsk_btf_member xdp_rx_cpu;
} xdp_hints_rx_time = { 0 };

/* This struct mirrors, via BTF, the kernel-side struct xdp_hints_mark */
struct xdp_hints_mark {
	__u32 btf_type_id; /* cached xsk_btf__btf_type_id(xbi) */
	struct xsk_btf_info *xbi;
	struct xsk_btf_member mark;
} xdp_hints_mark = { 0 };

struct xsk_btf_info *setup_btf_info(struct btf *btf, const char *struct_name)
{
	struct xsk_btf_info *xbi = NULL;
	int err;

	err = xsk_btf__init_xdp_hint(btf, struct_name, &xbi);
	if (err) {
		fprintf(stderr, "WARN(%d): Cannot BTF locate valid struct:%s\n",
			err, struct_name);
		return NULL;
	}

	if (debug_meta)
		printf("Setup BTF based XDP hints for struct: %s\n",
		       struct_name);

	return xbi;
}

int init_btf_info_via_bpf_object(struct bpf_object *bpf_obj)
{
	struct btf *btf = bpf_object__btf(bpf_obj);
	struct xsk_btf_info *xbi;

	xbi = setup_btf_info(btf, "xdp_hints_rx_time");
	if (xbi) {
		/* Lookup info on required member "rx_ktime" */
		if (!xsk_btf__field_member("rx_ktime", xbi,
					   &xdp_hints_rx_time.rx_ktime))
			return -EBADSLT;
		if (!xsk_btf__field_member("xdp_rx_cpu", xbi,
					   &xdp_hints_rx_time.xdp_rx_cpu))
			return -EBADSLT;
		xdp_hints_rx_time.btf_type_id = xsk_btf__btf_type_id(xbi);
		xdp_hints_rx_time.xbi = xbi;
	}

	xbi = setup_btf_info(btf, "xdp_hints_mark");
	if (xbi) {
		if (!xsk_btf__field_member("mark", xbi, &xdp_hints_mark.mark))
			return -EBADSLT;
		xdp_hints_mark.btf_type_id = xsk_btf__btf_type_id(xbi);
		xdp_hints_mark.xbi = xbi;
	}

	return 0;
}

void pr_addr_info(const char *msg, uint64_t pkt_addr, struct xsk_umem_info *umem)
{
	uint64_t pkt_nr = pkt_addr / FRAME_SIZE; /* Integer div round off */
	uint32_t offset = pkt_addr - (pkt_nr * FRAME_SIZE); /* what got rounded off */
	uint8_t *pkt_ptr = NULL;

	if (!debug)
		return;

	if (umem)
		pkt_ptr = xsk_umem__get_data(umem->buffer, pkt_addr);

	printf(" - Addr-info: %s pkt_nr:%lu offset:%u (addr:0x%lX) ptr:%p\n",
	       msg, pkt_nr, offset, pkt_addr, pkt_ptr);
}

#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
static uint64_t gettime(void)
{
	struct timespec t;
	int res;

	res = clock_gettime(CLOCK_MONOTONIC, &t);
	if (res < 0) {
		fprintf(stderr, "Error with clock_gettime! (%i)\n", res);
		exit(EXIT_FAIL);
	}
	return (uint64_t) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
}

static inline __u32 xsk_ring_prod__free(struct xsk_ring_prod *r)
{
	r->cached_cons = *r->consumer + r->size;
	return r->cached_cons - r->cached_prod;
}

static const char *__doc__ = "AF_XDP kernel bypass example\n";

static const struct option_wrapper long_options[] = {
	{{"help",	 no_argument,		NULL, 'h' },
	 "Show help", false},
	{{"dev",	 required_argument,	NULL, 'd' },
	 "Operate on device <ifname>", "<ifname>", true},
	{{"skb-mode",	 no_argument,		NULL, 'S' },
	 "Install XDP program in SKB (AKA generic) mode"},
	{{"native-mode", no_argument,		NULL, 'N' },
	 "Install XDP program in native mode"},
	{{"auto-mode",	 no_argument,		NULL, 'A' },
	 "Auto-detect SKB or native mode"},
	{{"force",	 no_argument,		NULL, 'F' },
	 "Force install, replacing existing program on interface"},
	{{"copy",	 no_argument,		NULL, 'c' },
	 "Force copy mode"},
	{{"zero-copy",	 no_argument,		NULL, 'z' },
	 "Force zero-copy mode"},
	{{"queue",	 required_argument,	NULL, 'Q' },
	 "Configure single interface receive queue for AF_XDP"},
	{{"priority",	 required_argument,	NULL, 'p' },
	 "Setup real-time priority for process"},
	{{"wakeup-mode", no_argument,		NULL, 'w' },
	 "Use poll() API waiting for packets to arrive via wakeup from kernel"},
	{{"spin-mode",	 no_argument,		NULL, 's' },
	 "Let userspace process spin checking for packets (disable --wakeup-mode)"},
	{{"unload",	 no_argument,		NULL, 'U' },
	 "Unload XDP program instead of loading"},
	{{"quiet",	 no_argument,		NULL, 'q' },
	 "Quiet mode (no output)"},
	{{"pktinfo",	 no_argument,		NULL, 'P' },
	 "Print packet info output mode (debug)"},
	{{"metainfo",	 no_argument,		NULL, 'm' },
	 "Print XDP metadata info output mode (debug)"},
	{{"timedebug",	 no_argument,		NULL, 't' },
	 "Print timestamps info for wakeup accuracy (debug)"},
	{{"debug",	 no_argument,		NULL, 'D' },
	 "Debug info output mode (debug)"},
	{{"filename",	 required_argument,	NULL,  1  },
	 "Load program from <file>", "<file>"},
	{{"progsec",	 required_argument,	NULL,  2  },
	 "Load program in <section> of the ELF file", "<section>
"}, {{"src-ip", required_argument, NULL, 4 }, "Change IPv4 source address in generated packets", ""}, {{"dst-ip", required_argument, NULL, 5 }, "Change IPv4 destination address in generated packets", ""}, {{"busy-poll", no_argument, NULL, 'B' }, "Enable socket prefer NAPI busy-poll mode (remember adjust sysctl too)"}, {{"tx-dmac", required_argument, NULL, 'G' }, "Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format", "aa:bb:cc:dd:ee:ff"}, {{"tx-smac", required_argument, NULL, 'H' }, "Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format", "aa:bb:cc:dd:ee:ff"}, {{"interval", required_argument, NULL, 'i' }, "Periodic TX-cyclic interval wakeup period in usec", ""}, {{"batch-pkts", required_argument, NULL, 'b' }, "Periodic TX-cyclic batch send pkts", ""}, {{0, 0, NULL, 0 }, NULL, false} }; static bool global_exit; int print_libbpf_log(enum libbpf_print_level lvl, const char *fmt, va_list args) { if (!debug && lvl >= LIBBPF_DEBUG) return 0; return vfprintf(stderr, fmt, args); } /* Later set custom log handler via: libbpf_set_print(print_libbpf_log); */ /** * Simple memory allocator for umem frames */ static uint64_t mem_alloc_umem_frame(struct mem_frame_allocator *mem) { uint64_t frame; if (mem->umem_frame_free == 0) return INVALID_UMEM_FRAME; frame = mem->umem_frame_addr[--mem->umem_frame_free]; mem->umem_frame_addr[mem->umem_frame_free] = INVALID_UMEM_FRAME; return frame; } static void mem_free_umem_frame(struct mem_frame_allocator *mem, uint64_t frame) { assert(mem->umem_frame_free < mem->umem_frame_max); /* Remove any packet offset from the frame addr. The kernel RX process * will add some headroom. Our userspace TX process can also choose to * add headroom. Thus, frame addr can be returned to our mem allocator * including this offset. */ // frame = (frame / FRAME_SIZE) * FRAME_SIZE; frame = frame & ~FRAME_SIZE_MASK; mem->umem_frame_addr[mem->umem_frame_free++] = frame; } static uint64_t mem_avail_umem_frames(struct mem_frame_allocator *mem) { return mem->umem_frame_free; } static void mem_init_umem_frame_allocator(struct mem_frame_allocator *mem, uint32_t nr_frames) { /* Initialize umem frame allocator */ int i; mem->umem_frame_addr = calloc(nr_frames, sizeof(*mem->umem_frame_addr)); if (!mem->umem_frame_addr) { fprintf(stderr, "ERROR: Cannot allocate umem_frame_addr array sz:%u\n", nr_frames); exit(EXIT_FAILURE); } mem->umem_frame_max = nr_frames; /* The umem_frame_addr is basically index into umem->buffer memory area */ for (i = 0; i < nr_frames; i++) { uint64_t addr = i * FRAME_SIZE; mem->umem_frame_addr[i] = addr; } mem->umem_frame_free = nr_frames; } static void apply_setsockopt(struct xsk_socket_info *xsk, bool opt_busy_poll, int opt_batch_size) { int sock_opt; if (!opt_busy_poll) return; sock_opt = 1; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); sock_opt = 20; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); sock_opt = opt_batch_size; if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); } static struct xsk_umem_info *configure_xsk_umem(void *buffer, uint64_t size, uint32_t frame_size, uint32_t nr_frames) { struct xsk_umem_info *umem; int ret; struct xsk_umem_config xsk_umem_cfg = { /* We recommend that you set the fill ring size >= HW RX ring size + * AF_XDP RX ring size. 
Make sure you fill up the fill ring * with buffers at regular intervals, and you will with this setting * avoid allocation failures in the driver. These are usually quite * expensive since drivers have not been written to assume that * allocation failures are common. For regular sockets, kernel * allocated memory is used that only runs out in OOM situations * that should be rare. */ // .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, /* Fix later */ .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = frame_size, /* Notice XSK_UMEM__DEFAULT_FRAME_HEADROOM is zero */ .frame_headroom = 256, //.frame_headroom = 0, .flags = 0 }; umem = calloc(1, sizeof(*umem)); if (!umem) return NULL; ret = xsk_umem__create(&umem->umem, buffer, size, &umem->init_fq, &umem->init_cq, &xsk_umem_cfg); if (ret) { errno = -ret; return NULL; } umem->buffer = buffer; /* Setup our own umem frame allocator system */ mem_init_umem_frame_allocator(&umem->mem, nr_frames); return umem; } static int xsk_populate_fill_ring(struct xsk_ring_prod *fq, struct xsk_umem_info *umem, int nr_frames) { uint32_t idx; int ret, i; /* Stuff the receive path with buffers, we assume we have enough */ ret = xsk_ring_prod__reserve(fq, nr_frames, &idx); if (ret != nr_frames) goto error_exit; for (i = 0; i < nr_frames; i++) *xsk_ring_prod__fill_addr(fq, idx++) = mem_alloc_umem_frame(&umem->mem); xsk_ring_prod__submit(fq, nr_frames); return 0; error_exit: return -EINVAL; } static struct xsk_socket_info *xsk_configure_socket(struct config *cfg, struct xsk_umem_info *umem, int queue_id, int xsks_map_fd) { struct xsk_socket_config xsk_cfg; struct xsk_socket_info *xsk_info; int _queue_id = queue_id; uint32_t prog_id = 0; int ret; xsk_info = calloc(1, sizeof(*xsk_info)); if (!xsk_info) return NULL; /* If user specified explicit --queue number then use that */ if (cfg->xsk_if_queue >= 0) _queue_id = cfg->xsk_if_queue; xsk_info->queue_id = _queue_id; xsk_info->umem = umem; xsk_cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; xsk_cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; xsk_cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; xsk_cfg.xdp_flags = cfg->xdp_flags; xsk_cfg.bind_flags = cfg->xsk_bind_flags; // ret = xsk_socket__create(&xsk_info->xsk, cfg->ifname, // _queue_id, umem->umem, &xsk_info->rx, // &xsk_info->tx, &xsk_cfg); ret = xsk_socket__create_shared(&xsk_info->xsk, cfg->ifname, _queue_id, umem->umem, &xsk_info->rx, &xsk_info->tx, &xsk_info->fq, &xsk_info->cq, &xsk_cfg); if (ret) goto error_exit; ret = bpf_get_link_xdp_id(cfg->ifindex, &prog_id, cfg->xdp_flags); if (ret) goto error_exit; /* Due to XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD manually update map */ // xsk_socket__update_xskmap(xsk_info->xsk, xsks_map_fd); apply_setsockopt(xsk_info, cfg->opt_busy_poll, RX_BATCH_SIZE); return xsk_info; error_exit: errno = -ret; return NULL; } static int kick_tx(struct xsk_socket_info *xsk) { int err = 0; int ret; ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); if (ret < 0) { /* On error, -1 is returned, and errno is set */ fprintf(stderr, "WARN: %s() sendto() failed with errno:%d\n", __func__, errno); err = errno; } /* Kernel samples/bpf/ xdp_sock_user.c kick_tx variant doesn't * treat the following errno values as errors: * ENOBUFS , EAGAIN , EBUSY , ENETDOWN */ return err; } static int complete_tx(struct xsk_socket_info *xsk) { unsigned int completed; uint32_t idx_cq; int err; if (!xsk->outstanding_tx) return 0; /* Notify kernel via sendto syscall that TX packet are 
avail */ err = kick_tx(xsk); /* Collect/free completed TX buffers */ completed = xsk_ring_cons__peek(&xsk->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS, &idx_cq); if (completed > 0) { for (int i = 0; i < completed; i++) { uint64_t addr; addr = *xsk_ring_cons__comp_addr(&xsk->cq, idx_cq++); mem_free_umem_frame(&xsk->umem->mem, addr); //pr_addr_info(__func__, addr, xsk->umem); } xsk_ring_cons__release(&xsk->cq, completed); if (completed > xsk->outstanding_tx) { fprintf(stderr, "WARN: %s() " "reset outstanding_tx(%d) as completed(%d)" "more than outstanding TX pakcets\n", __func__, xsk->outstanding_tx, completed); } xsk->outstanding_tx -= completed < xsk->outstanding_tx ? completed : xsk->outstanding_tx; } return err; } static inline __sum16 csum16_add(__sum16 csum, __be16 addend) { uint16_t res = (uint16_t)csum; res += (__u16)addend; return (__sum16)(res + (res < (__u16)addend)); } static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } /** * Packet fill helpers */ static uint8_t base_pkt_data[FRAME_SIZE]; /* Can be changed via cmdline options (-G|--tx-dmac) or (-H|--tx-smac) */ static struct ether_addr default_tx_smac = {{ 0x24, 0x5e, 0xbe, 0x57, 0xf1, 0x64 }}; static struct ether_addr default_tx_dmac = {{ 0xbc, 0xee, 0x7b, 0xda, 0xc2, 0x62 }}; #define MIN_PKT_SIZE 64 static uint16_t opt_pkt_size = MIN_PKT_SIZE; #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define ETH_FCS_SIZE 4 #define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) #define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) static void gen_eth_hdr(struct config *cfg, struct ethhdr *eth_hdr) { /* Ethernet header: * Can be changed via cmdline options (-G|--tx-dmac) or (-H|--tx-smac) */ memcpy(eth_hdr->h_dest , &cfg->opt_tx_dmac, ETH_ALEN); memcpy(eth_hdr->h_source, &cfg->opt_tx_smac, ETH_ALEN); eth_hdr->h_proto = htons(ETH_P_IP); } static char *opt_ip_str_src = "192.168.44.1"; static char *opt_ip_str_dst = "192.168.44.3"; static void gen_ip_hdr(struct config *cfg, struct iphdr *ip_hdr) { if (cfg->opt_ip_src == 0) get_ipv4_u32(opt_ip_str_src, &cfg->opt_ip_src); if (cfg->opt_ip_dst == 0) get_ipv4_u32(opt_ip_str_dst, &cfg->opt_ip_dst); /* IP header */ ip_hdr->version = IPVERSION; ip_hdr->ihl = 0x5; /* 20 byte header */ ip_hdr->tos = 0x0; ip_hdr->tot_len = htons(IP_PKT_SIZE); ip_hdr->id = 0; ip_hdr->frag_off = 0; ip_hdr->ttl = IPDEFTTL; ip_hdr->protocol = IPPROTO_UDP; ip_hdr->saddr = cfg->opt_ip_src; ip_hdr->daddr = cfg->opt_ip_dst; /* IP header checksum */ ip_hdr->check = 0; ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); } static uint32_t opt_pkt_fill_pattern = 0x41424344; static void gen_udp_hdr(struct udphdr *udp_hdr, struct iphdr *ip_hdr) { /* UDP header */ udp_hdr->source = htons(0x1000); udp_hdr->dest = htons(0x1000); udp_hdr->len = htons(UDP_PKT_SIZE); /* UDP data */ memset32_htonl((void*)udp_hdr + sizeof(struct udphdr), opt_pkt_fill_pattern, UDP_PKT_DATA_SIZE); /* UDP header checksum */ udp_hdr->check = 0; udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (__u16 *)udp_hdr); } static void gen_base_pkt(struct config *cfg, uint8_t *pkt_ptr) { struct ethhdr *eth_hdr = (struct ethhdr *)pkt_ptr; struct iphdr *ip_hdr = (struct iphdr *)(pkt_ptr + sizeof(struct 
ethhdr)); struct udphdr *udp_hdr = (struct udphdr *)(pkt_ptr + sizeof(struct ethhdr) + sizeof(struct iphdr)); gen_eth_hdr(cfg, eth_hdr); gen_ip_hdr(cfg, ip_hdr); gen_udp_hdr(udp_hdr, ip_hdr); } /** * BTF accessing XDP-hints * ----------------------- * Accessing the XDP-hints via BTF requires setup done earlier. As our target * application have real-time requirements, it is preferred that the setup can * happen outside the packet processing path. E.g. avoid doing the setup first * time a packet with a new BTF-ID is seen. */ static int print_meta_info_time(uint8_t *pkt, struct xdp_hints_rx_time *meta, __u32 qid) { __u64 time_now; // = gettime(); __u32 xdp_rx_cpu = 0xffff; __u32 cpu_running; __u64 *rx_ktime_ptr; /* Points directly to member memory */ __u64 rx_ktime; __u64 diff; int err; /* Quick stats */ static bool first = true; static unsigned int max = 0; static unsigned int min = -1; static double tot = 0; static __u64 cnt = 0; /* API doesn't involve allocations to access BTF struct member */ err = xsk_btf__read((void **)&rx_ktime_ptr, sizeof(*rx_ktime_ptr), &meta->rx_ktime, meta->xbi, pkt); if (err) { fprintf(stderr, "ERROR(%d) no rx_ktime?!\n", err); return err; } /* Notice how rx_ktime_ptr becomes a pointer into struct memory */ rx_ktime = *rx_ktime_ptr; time_now = gettime(); diff = time_now - rx_ktime; /* Quick stats, exclude first measurement */ if (!first) { min = (min < diff) ? min : diff; max = (max > diff) ? max : diff; cnt++; tot += diff; } first = false; cpu_running = sched_getcpu(); XSK_BTF_READ_INTO(xdp_rx_cpu, &meta->xdp_rx_cpu, meta->xbi, pkt); if (debug_meta) printf("Q[%u] CPU[rx:%d/run:%d]:%s" " meta-time rx_ktime:%llu time_now:%llu diff:%llu ns" "(avg:%.0f min:%u max:%u )\n", qid, xdp_rx_cpu, cpu_running, (xdp_rx_cpu == cpu_running) ? "same" : "remote", rx_ktime, time_now, diff, tot / cnt, min , max); return 0; } /* Demo API xsk_btf__read_field() that use string for BTF lookup */ static int print_meta_info_time_api2(uint8_t *pkt, __u32 qid) { struct xsk_btf_info *xbi = xdp_hints_rx_time.xbi; __u64 time_now; // = gettime(); __u64 *rx_ktime_ptr; /* Points directly to member memory */ __u64 rx_ktime; __u64 diff; int err; /* This API cache string lookup (in hashmap), which cause an * allocation first time this is called. Something to consider * for real-time use-cases. 
*/ err = xsk_btf__read_field((void **)&rx_ktime_ptr, sizeof(*rx_ktime_ptr), "rx_ktime", xbi, pkt); if (err) { fprintf(stderr, "ERROR(%d) no rx_ktime?!\n", err); return err; } rx_ktime = *rx_ktime_ptr; /* same as XSK_BTF_READ_FIELD_INTO(rx_ktime, rx_ktime, xbi, pkt); */ time_now = gettime(); diff = time_now - rx_ktime; if (debug_meta) printf("Q[%u] meta-time rx_ktime:%llu time_now:%llu diff:%llu ns\n", qid, rx_ktime, time_now, diff); return 0; } static void print_meta_info_mark(uint8_t *pkt, struct xdp_hints_mark *meta, __u32 qid) { struct xsk_btf_info *xbi = meta->xbi; __u32 mark = 0; /* The 'mark' value is not updated in case of errors */ XSK_BTF_READ_INTO(mark, &meta->mark, xbi, pkt); if (debug_meta) printf("Q[%u] meta-mark mark:%u\n", qid, mark); } static void print_meta_info_via_btf(uint8_t *pkt, struct xsk_socket_info *xsk) { __u32 btf_id = xsk_umem__btf_id(pkt); __u32 qid = xsk->queue_id; if (btf_id == 0) { if (debug_meta) printf("No meta BTF info (btf_id zero)\n"); return; } if (btf_id == xdp_hints_rx_time.btf_type_id) { print_meta_info_time(pkt, &xdp_hints_rx_time, qid); } else if (btf_id == xdp_hints_mark.btf_type_id) { print_meta_info_mark(pkt, &xdp_hints_mark, qid); } } /* As debug tool print some info about packet */ static void print_pkt_info(uint8_t *pkt, uint32_t len) { struct ethhdr *eth = (struct ethhdr *) pkt; __u16 proto = ntohs(eth->h_proto); char *fmt = "DEBUG-pkt len=%04d Eth-proto:0x%X %s " "src:%s -> dst:%s\n"; char src_str[128] = { 0 }; char dst_str[128] = { 0 }; if (proto == ETH_P_IP) { struct iphdr *ipv4 = (struct iphdr *) (eth + 1); inet_ntop(AF_INET, &ipv4->saddr, src_str, sizeof(src_str)); inet_ntop(AF_INET, &ipv4->daddr, dst_str, sizeof(dst_str)); printf(fmt, len, proto, "IPv4", src_str, dst_str); } else if (proto == ETH_P_ARP) { printf(fmt, len, proto, "ARP", "", ""); } else if (proto == ETH_P_IPV6) { struct ipv6hdr *ipv6 = (struct ipv6hdr *) (eth + 1); inet_ntop(AF_INET6, &ipv6->saddr, src_str, sizeof(src_str)); inet_ntop(AF_INET6, &ipv6->daddr, dst_str, sizeof(dst_str)); printf(fmt, len, proto, "IPv6", src_str, dst_str); } else { printf(fmt, len, proto, "Unknown", "", ""); } } static int tx_pkt(struct config *cfg, struct xsk_socket_info *xsk) { struct xsk_umem_info *umem = xsk->umem; uint64_t pkt_addr = mem_alloc_umem_frame(&umem->mem); uint8_t *pkt = NULL; uint32_t offset = 0; // 256; pkt_addr += offset; pr_addr_info(__func__, pkt_addr, umem); pkt = xsk_umem__get_data(umem->buffer, pkt_addr); gen_base_pkt(cfg, pkt); { uint32_t tx_idx = 0; int ret; ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); if (ret != 1) { /* No more transmit slots, drop the packet */ mem_free_umem_frame(&umem->mem, pkt_addr); fprintf(stderr, "ERR: %s() failed transmit, no slots\n", __func__); return ENOSPC; } xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = pkt_addr; xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = 64; xsk_ring_prod__submit(&xsk->tx, 1); xsk->outstanding_tx++; } return complete_tx(xsk); } /* Generate some fake packets (in umem area). Real system will deliver TX * packets containing the needed control information. 
*/ static int invent_tx_pkts(struct config *cfg, struct xsk_umem_info *umem, const unsigned int n, struct xdp_desc pkts[n]) { uint32_t len = opt_pkt_size; uint32_t offset = 256; int i; for (i = 0; i < n; i++) { uint64_t pkt_addr = mem_alloc_umem_frame(&umem->mem); struct xdp_desc desc; uint8_t *pkt_data; if (pkt_addr == INVALID_UMEM_FRAME) return i; pkt_addr += offset; desc.addr = pkt_addr; desc.len = len; desc.options = 0; /* Write into packet memory area */ pkt_data = xsk_umem__get_data(umem->buffer, pkt_addr); gen_base_pkt(cfg, pkt_data); pkts[i] = desc; } return i; } static int tx_batch_pkts(struct xsk_socket_info *xsk, const unsigned int nr, struct xdp_desc pkts[nr]) { struct xsk_umem_info *umem = xsk->umem; uint32_t tx_res; uint32_t tx_idx = 0; int i; tx_res = xsk_ring_prod__reserve(&xsk->tx, nr, &tx_idx); if (tx_res != nr) { /* No more transmit slots, drop all packets. Normally AF_XDP * code would try to run TX-completion CQ step to free up slots, * but we don't want to introduce variability due to RT * requirements. Other code make sure CQ is processed. */ for (i = 0; i < nr; i++) { mem_free_umem_frame(&umem->mem, pkts[i].addr); } return 0; } for (i = 0; i < nr ; i++) { struct xdp_desc *tx_desc; tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, tx_idx + i); *tx_desc = pkts[i]; //xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = pkt_addr; //xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = 64; xsk->outstanding_tx++; } xsk_ring_prod__submit(&xsk->tx, nr); // Kick Tx // sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); complete_tx(xsk); // See if kicking Rx-side works // recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); return nr; } static bool process_packet(struct xsk_socket_info *xsk, uint64_t addr, uint32_t len) { uint8_t *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); print_meta_info_via_btf(pkt, xsk); //if (debug) // printf("XXX addr:0x%lX pkt_ptr:0x%p\n", addr, pkt); if (debug_pkt) print_pkt_info(pkt, len); /* Lesson#3: Write an IPv6 ICMP ECHO parser to send responses * * Some assumptions to make it easier: * - No VLAN handling * - Only if nexthdr is ICMP * - Just return all data with MAC/IP swapped, and type set to * ICMPV6_ECHO_REPLY * - Recalculate the icmp checksum */ if (true) { int ret; uint32_t tx_idx = 0; uint8_t tmp_mac[ETH_ALEN]; struct in6_addr tmp_ip; struct ethhdr *eth = (struct ethhdr *) pkt; struct ipv6hdr *ipv6 = (struct ipv6hdr *) (eth + 1); struct icmp6hdr *icmp = (struct icmp6hdr *) (ipv6 + 1); if (ntohs(eth->h_proto) != ETH_P_IPV6 || len < (sizeof(*eth) + sizeof(*ipv6) + sizeof(*icmp)) || ipv6->nexthdr != IPPROTO_ICMPV6 || icmp->icmp6_type != ICMPV6_ECHO_REQUEST) return false; memcpy(tmp_mac, eth->h_dest, ETH_ALEN); memcpy(eth->h_dest, eth->h_source, ETH_ALEN); memcpy(eth->h_source, tmp_mac, ETH_ALEN); memcpy(&tmp_ip, &ipv6->saddr, sizeof(tmp_ip)); memcpy(&ipv6->saddr, &ipv6->daddr, sizeof(tmp_ip)); memcpy(&ipv6->daddr, &tmp_ip, sizeof(tmp_ip)); icmp->icmp6_type = ICMPV6_ECHO_REPLY; csum_replace2(&icmp->icmp6_cksum, htons(ICMPV6_ECHO_REQUEST << 8), htons(ICMPV6_ECHO_REPLY << 8)); /* Here we sent the packet out of the receive port. Note that * we allocate one entry and schedule it. 
Your design would be * faster if you do batch processing/transmission */ ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); if (ret != 1) { /* No more transmit slots, drop the packet */ return false; } xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; xsk_ring_prod__submit(&xsk->tx, 1); xsk->outstanding_tx++; xsk->stats.tx_bytes += len; xsk->stats.tx_packets++; return true; } return false; } void restock_receive_fill_queue(struct xsk_socket_info *xsk) { unsigned int i, stock_frames; uint32_t idx_fq = 0; int ret; /* Limit refill size as it takes time */ int free_frames = mem_avail_umem_frames(&xsk->umem->mem); int refill = (free_frames > FQ_REFILL_MAX) ? FQ_REFILL_MAX : free_frames; __u64 start = gettime(); stock_frames = xsk_prod_nb_free(&xsk->fq, refill); if (stock_frames > 0) { ret = xsk_ring_prod__reserve(&xsk->fq, stock_frames, &idx_fq); /* This should not happen, but just in case */ if (ret != stock_frames) { printf("XXX %s() should not happen (%d vs %d)\n", __func__, stock_frames, ret); stock_frames = ret; } for (i = 0; i < stock_frames; i++) *xsk_ring_prod__fill_addr(&xsk->fq, idx_fq++) = mem_alloc_umem_frame(&xsk->umem->mem); xsk_ring_prod__submit(&xsk->fq, stock_frames); } __u64 now = gettime(); if (debug && stock_frames > 1) printf("XXX stock_frame:%d free_frames:%d cost of xsk_prod_nb_free() %llu ns\n", stock_frames, free_frames, now - start); } static void handle_receive_packets(struct xsk_socket_info *xsk) { unsigned int rcvd, i; uint32_t idx_rx = 0; // FIXME: Needed when in NAPI busy_poll mode? recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); rcvd = xsk_ring_cons__peek(&xsk->rx, RX_BATCH_SIZE, &idx_rx); if (!rcvd) return; /* Process received packets */ for (i = 0; i < rcvd; i++) { uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; pr_addr_info(__func__, addr, xsk->umem); if (!process_packet(xsk, addr, len)) mem_free_umem_frame(&xsk->umem->mem, addr); xsk->stats.rx_bytes += len; } xsk->stats.rx_packets += rcvd; restock_receive_fill_queue(xsk); xsk_ring_cons__release(&xsk->rx, rcvd); /* Do we need to wake up the kernel for transmission */ complete_tx(xsk); if (verbose && rcvd > 1) printf("%s(): RX batch %d packets (i:%d)\n", __func__, rcvd, i); } static void rx_and_process(struct config *cfg, struct xsk_container *xsks) { struct pollfd fds[MAX_AF_SOCKS] = { 0 }; int ret, n_fds, i; // struct xsk_socket_info *xsk_socket = xsks->sockets[0]; // FIXME n_fds = xsks->num; for (i = 0; i < n_fds; i++) { struct xsk_socket_info *xsk_info = xsks->sockets[i]; fds[i].fd = xsk_socket__fd(xsk_info->xsk); fds[i].events = POLLIN; } while(!global_exit) { if (cfg->xsk_wakeup_mode) { /* poll will wait for events on file descriptors */ ret = poll(fds, n_fds, -1); if (ret <= 0 || ret > 1) continue; } for (i = 0; i < n_fds; i++) { struct xsk_socket_info *xsk_info = xsks->sockets[i]; //printf("XXX i[%d] queue:%d xsk_info:%p \n", // i, xsk_info->queue_id, xsk_info); handle_receive_packets(xsk_info); } } } static void rx_avail_packets(struct xsk_container *xsks) { for (int i = 0; i < xsks->num; i++) { struct xsk_socket_info *xsk_info = xsks->sockets[i]; handle_receive_packets(xsk_info); } } /* Default interval in usec */ #define DEFAULT_INTERVAL 1000000 #define USEC_PER_SEC 1000000 #define NSEC_PER_SEC 1000000000 static inline void tsnorm(struct timespec *ts) { while (ts->tv_nsec >= NSEC_PER_SEC) { ts->tv_nsec -= NSEC_PER_SEC; ts->tv_sec++; } } 
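/*
 * Illustrative sketch (not called by the program): the absolute-deadline
 * wakeup pattern that tx_cyclic_and_rx_process() below is built around.
 * Sleeping until an absolute time with TIMER_ABSTIME avoids accumulating
 * drift from the work done between wakeups, and tsnorm() above keeps
 * tv_nsec in range after adding the interval. The 1 ms period is only an
 * example value.
 */
static void __attribute__((unused)) example_cyclic_wait(int iterations)
{
	struct timespec next;
	struct timespec interval = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

	clock_gettime(CLOCK_MONOTONIC, &next);
	for (int n = 0; n < iterations; n++) {
		next.tv_sec  += interval.tv_sec;
		next.tv_nsec += interval.tv_nsec;
		tsnorm(&next);
		/* Sleep until the absolute deadline "next" */
		clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
		/* ... do the periodic work here ... */
	}
}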
static inline uint64_t timespec2ns(struct timespec *ts) { return (uint64_t) ts->tv_sec * NANOSEC_PER_SEC + ts->tv_nsec; } static inline void ns2timespec(uint64_t ns, struct timespec *ts) { ts->tv_sec = ns / NANOSEC_PER_SEC; ts->tv_nsec = ns % NANOSEC_PER_SEC; } static inline int64_t calcdiff(struct timespec t1, struct timespec t2) { int64_t diff; diff = USEC_PER_SEC * (long long)((int) t1.tv_sec - (int) t2.tv_sec); diff += ((int) t1.tv_nsec - (int) t2.tv_nsec) / 1000; return diff; } static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) { int64_t diff; diff = NSEC_PER_SEC * (long long)((int) t1.tv_sec - (int) t2.tv_sec); diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); return diff; } static void print_timespec(struct timespec *ts, char *msg) { printf("Time: %lu.%lu - %s\n", ts->tv_sec, ts->tv_nsec, msg); } struct wakeup_stat { long min; long max; long curr; long prev; double avg; unsigned long events; }; /* Use-case: Accurate cyclic Tx and lazy RX-processing * * This processing loop is simulating a Time-Triggered schedule, where * transmitting packets within a small time-window is the most * important task. Picking up frames in RX-queue is less time * critical, as the PCF synchronization packets will have been * timestamped (rx_ktime) by XDP before they got enqueued. */ static void tx_cyclic_and_rx_process(struct config *cfg, struct xsk_container *xsks) { struct timespec now, next, next_adj, interval, now_prev; struct wakeup_stat stat = { .min = DEFAULT_INTERVAL, .max = -0xFFFF }; struct wakeup_stat stat_adj = { .min = DEFAULT_INTERVAL, .max = -0xFFFF }; struct xdp_desc tx_pkts[BATCH_PKTS_MAX]; int batch_nr = cfg->batch_pkts; int tx_nr; bool first = true; int period = cfg->interval; int timermode = TIMER_ABSTIME; int clock = CLOCK_MONOTONIC; // Choosing xsk id 0 struct xsk_socket_info *xsk = xsks->sockets[0]; /* Get packets for *first* iteration */ tx_nr = invent_tx_pkts(cfg, xsk->umem, batch_nr, tx_pkts); interval.tv_sec = period / USEC_PER_SEC; interval.tv_nsec = (period % USEC_PER_SEC) * 1000; clock_gettime(clock, &now); next = now; next.tv_sec += interval.tv_sec; next.tv_nsec += interval.tv_nsec; tsnorm(&next); next_adj = next; /* Not adjusted yet */ while (!global_exit) { int64_t diff, diff2adj, diff_interval; int64_t avg, avg2adj; int err, n; /* Wait for next period, but adjusted for measured inaccuracy */ err = clock_nanosleep(clock, timermode, &next_adj, NULL); /* Took case MODE_CLOCK_NANOSLEEP from cyclictest */ if (err) { if (err != EINTR) fprintf(stderr, "clock_nanosleep failed." " err:%d errno:%d\n", err, errno); goto out; } /* Expecting to wakeup at "next" get systime "now" to check */ now_prev = now; err = clock_gettime(clock, &now); if (err) { if (err != EINTR) fprintf(stderr, "clock_getttime() failed." 
" err:%d errno:%d\n", err, errno); goto out; } /* How close is wakeup time to our actual target */ diff = calcdiff_ns(now, next); /* Positive num = wokeup after */ /* Exclude first measurement as no next_adj happened */ if (!first) { if (diff < stat.min) stat.min = diff; if (diff > stat.max) stat.max = diff; } first = false; stat.avg += (double) diff; stat.prev = stat.curr; stat.curr = diff; stat.events++; avg = (stat.avg / stat.events); /* Measure inaccuracy of clock_nanosleep */ diff2adj = calcdiff_ns(now, next_adj); /* Positive num = wokeup after */ stat_adj.avg += (double) diff2adj; stat_adj.events++; avg2adj = (stat_adj.avg / stat_adj.events); // IDEA: Spin until exact time occurs (if diff negative) /* Send batch of packets */ n = tx_batch_pkts(xsk, tx_nr, tx_pkts); diff_interval = calcdiff_ns(now, now_prev); if (verbose >=1 ) printf("TX pkts:%d event:%lu" " inaccurate wakeup(nanosec) curr:%ld" "(min:%ld max:%ld avg:%ld avg2adj:%ld)" " variance(n-1):%ld interval-ns:%ld\n", n, stat.events, stat.curr, stat.min, stat.max, avg, avg2adj, stat.curr - stat.prev, diff_interval); if (debug_time) { print_timespec(&now, "now"); print_timespec(&next_adj, "next_adj"); print_timespec(&next, "next"); } /* Calculate next time to wakeup */ next.tv_sec += interval.tv_sec; next.tv_nsec += interval.tv_nsec; tsnorm(&next); /* Adjust for inaccuracy of clock_nanosleep wakeup */ uint64_t next_adj_ns = timespec2ns(&next); next_adj_ns = next_adj_ns - avg2adj; ns2timespec(next_adj_ns, &next_adj); tsnorm(&next_adj); /* Get packets for *next* iteration */ tx_nr = invent_tx_pkts(cfg, xsk->umem, batch_nr, tx_pkts); /* Empty RX queues */ rx_avail_packets(xsks); } out: /* Free umem frames */ for (int i = 0; i < tx_nr; i++) { mem_free_umem_frame(&xsk->umem->mem, tx_pkts[i].addr); } } static double calc_period(struct stats_record *r, struct stats_record *p) { double period_ = 0; __u64 period = 0; period = r->timestamp - p->timestamp; if (period > 0) period_ = ((double) period / NANOSEC_PER_SEC); return period_; } static void stats_print(struct stats_record *stats_rec, struct stats_record *stats_prev) { uint64_t packets, bytes; double period; double pps; /* packets per sec */ double bps; /* bits per sec */ char *fmt = "%-12s %'11lld pkts (%'10.0f pps)" " %'11lld Kbytes (%'6.0f Mbits/s)" " period:%f\n"; period = calc_period(stats_rec, stats_prev); if (period == 0) period = 1; packets = stats_rec->rx_packets - stats_prev->rx_packets; pps = packets / period; bytes = stats_rec->rx_bytes - stats_prev->rx_bytes; bps = (bytes * 8) / period / 1000000; printf(fmt, "AF_XDP RX:", stats_rec->rx_packets, pps, stats_rec->rx_bytes / 1000 , bps, period); packets = stats_rec->tx_packets - stats_prev->tx_packets; pps = packets / period; bytes = stats_rec->tx_bytes - stats_prev->tx_bytes; bps = (bytes * 8) / period / 1000000; printf(fmt, " TX:", stats_rec->tx_packets, pps, stats_rec->tx_bytes / 1000 , bps, period); printf("\n"); } static void *stats_poll(void *arg) { unsigned int interval = 2; struct xsk_container *xsks = arg; struct xsk_socket_info *xsk = xsks->sockets[0]; // FIXME static struct stats_record previous_stats = { 0 }; previous_stats.timestamp = gettime(); /* Trick to pretty printf with thousands separators use %' */ setlocale(LC_NUMERIC, "en_US"); while (!global_exit) { sleep(interval); xsk->stats.timestamp = gettime(); stats_print(&xsk->stats, &previous_stats); previous_stats = xsk->stats; } return NULL; } static void enter_xsks_into_map(int xsks_map, struct xsk_container *xsks) { int i; if (xsks_map < 0) { fprintf(stderr, 
"ERROR: no xsks map found: %s\n", strerror(xsks_map)); exit(EXIT_FAILURE); } for (i = 0; i < xsks->num; i++) { int fd = xsk_socket__fd(xsks->sockets[i]->xsk); int key, ret; key = i; /* When entering XSK socket into map redirect have effect */ ret = bpf_map_update_elem(xsks_map, &key, &fd, 0); if (ret) { fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); exit(EXIT_FAILURE); } if (debug) printf("%s() enable redir for xsks_map_fd:%d Key:%d fd:%d\n", __func__, xsks_map, key, fd); } } static void exit_application(int signal) { signal = signal; global_exit = true; } int main(int argc, char **argv) { int ret, err; int xsks_map_fd; void *packet_buffer; uint64_t packet_buffer_size; struct rlimit rlim = {RLIM_INFINITY, RLIM_INFINITY}; struct config cfg = { .ifindex = -1, .do_unload = false, .filename = "af_xdp_kern.o", .progsec = "xdp_sock", .xsk_wakeup_mode = true, /* Default, change via --spin */ .xsk_if_queue = -1, .opt_tx_dmac = default_tx_dmac, .opt_tx_smac = default_tx_smac, .interval = DEFAULT_INTERVAL, .batch_pkts = BATCH_PKTS_DEFAULT, }; pthread_t stats_poll_thread; struct xsk_umem_info *umem; struct xsk_container xsks; int queues_max, queues_set; int total_nr_frames, nr_frames; struct sched_param schedp; int i; /* Default to AF_XDP copy mode. * * It seems counter intuitive to not-use Zero-Copy mode, but there is an * explaination. Our application don't consume EVERY packet, e.g * letting netstack handle ARP/NDP packets via returning XDP_PASS in * bpf-prog. * * XDP_PASS in Zero-Copy mode results in the kernel allocating a new * memory page (and SKB) and copying over packet contents, before giving * packet to netstack. * * For our Real-Time use-case, we want to avoid allocations more than * cost of copying over packet data to our preallocated AF_XDP umem * area. 
*/ //cfg.xsk_bind_flags = XDP_COPY; cfg.xsk_bind_flags = XDP_COPY | XDP_USE_NEED_WAKEUP; struct bpf_object *bpf_obj = NULL; struct bpf_map *map; /* Global shutdown handler */ signal(SIGINT, exit_application); /* Cmdline options can change progsec */ parse_cmdline_args(argc, argv, long_options, &cfg, __doc__); /* Required option */ if (cfg.ifindex == -1) { fprintf(stderr, "ERROR: Required option --dev missing\n\n"); usage(argv[0], __doc__, long_options, (argc == 1)); return EXIT_FAIL_OPTION; } libbpf_set_print(print_libbpf_log); /* set custom log handler */ /* Unload XDP program if requested */ if (cfg.do_unload) return xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); /* Require loading custom BPF program */ if (cfg.filename[0] == 0) { fprintf(stderr, "ERROR: must load custom BPF-prog\n"); exit(EXIT_FAILURE); } else { bpf_obj = load_bpf_and_xdp_attach(&cfg); if (!bpf_obj) { /* Error handling done in load_bpf_and_xdp_attach() */ exit(EXIT_FAILURE); } /* We also need to load the xsks_map */ map = bpf_object__find_map_by_name(bpf_obj, "xsks_map"); xsks_map_fd = bpf_map__fd(map); if (xsks_map_fd < 0) { fprintf(stderr, "ERROR: no xsks map found: %s\n", strerror(xsks_map_fd)); exit(EXIT_FAILURE); } } queues_max = ethtool_get_max_channels(cfg.ifname); queues_set = ethtool_get_channels(cfg.ifname); if (verbose || debug_meta) printf("Interface: %s - queues max:%d set:%d\n", cfg.ifname, queues_max, queues_set); xsks.num = queues_set; /* Allocate frames according to how many queues are handled */ nr_frames = NUM_FRAMES; total_nr_frames = nr_frames * xsks.num; if (verbose || debug_meta) printf("For XSK queues:%d alloc total:%d frames (per-q:%d)\n", xsks.num, total_nr_frames, nr_frames); err = init_btf_info_via_bpf_object(bpf_obj); if (err) { fprintf(stderr, "ERROR(%d): Invalid BTF info: errno:%s\n", err, strerror(errno)); return EXIT_FAILURE; } /* Allow unlimited locking of memory, so all memory needed for packet * buffers can be locked. 
*/ if (setrlimit(RLIMIT_MEMLOCK, &rlim)) { fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } /* Allocate memory for total_nr_frames of the default XDP frame size */ packet_buffer_size = total_nr_frames * FRAME_SIZE; if (posix_memalign(&packet_buffer, getpagesize(), /* PAGE_SIZE aligned */ packet_buffer_size)) { fprintf(stderr, "ERROR: Can't allocate buffer memory \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } /* Initialize shared packet_buffer for umem usage */ umem = configure_xsk_umem(packet_buffer, packet_buffer_size, FRAME_SIZE, total_nr_frames); if (umem == NULL) { fprintf(stderr, "ERROR: Can't create umem \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } /* Generate packets to TX */ gen_base_pkt(&cfg, (uint8_t*)&base_pkt_data); /* Open and configure the AF_XDP (xsk) socket(s) */ for (i = 0; i < xsks.num; i++) { struct xsk_socket_info *xski; xski = xsk_configure_socket(&cfg, umem, i, xsks_map_fd); if (xski == NULL) { fprintf(stderr, "ERROR(%d): Can't setup AF_XDP socket " "\"%s\"\n", errno, strerror(errno)); exit(EXIT_FAILURE); } xsks.sockets[i] = xski; if (xsk_populate_fill_ring(&xski->fq, umem, nr_frames / 2)) { fprintf(stderr, "ERROR: Can't populate fill ring\n"); exit(EXIT_FAILURE); } } enter_xsks_into_map(xsks_map_fd, &xsks); /* Start thread to do statistics display */ if (0 && verbose) { // FIXME disabled as currently broken ret = pthread_create(&stats_poll_thread, NULL, stats_poll, &xsks); if (ret) { fprintf(stderr, "ERROR: Failed creating statistics thread " "\"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } } if (cfg.sched_prio) { /* Setup sched priority: Have impact on wakeup accuracy */ memset(&schedp, 0, sizeof(schedp)); schedp.sched_priority = cfg.sched_prio; err = sched_setscheduler(0, cfg.sched_policy, &schedp); if (err) { fprintf(stderr, "ERROR(%d): failed to set priority(%d): %s\n", errno, cfg.sched_prio, strerror(errno)); if (errno != EPERM) return EXIT_FAILURE; } if (debug) printf("Setup RT prio %d - policy SCHED_FIFO(%d)\n ", cfg.sched_prio, cfg.sched_policy); } /* Issue: At this point AF_XDP socket might not be ready e.g. for TX. * It seems related with XDP attachment causing link down/up event for * some drivers. Q: What is the right method/API that waits for link to * be initilized correctly? * * This workaround keeps trying to send a single packet, and * check return value seen from sendto() syscall, until it * doesn't return an error. */ while ((err = tx_pkt(&cfg, xsks.sockets[0]))) { fprintf(stderr, "WARN(%d): Failed to Tx pkt, will retry\n", err); sleep(1); } /* Receive and count packets than drop them */ // rx_and_process(&cfg, &xsks); /* Send packets cyclic */ tx_cyclic_and_rx_process(&cfg, &xsks); /* Cleanup */ for (i = 0; i < xsks.num; i++) xsk_socket__delete(xsks.sockets[i]->xsk); xsk_umem__delete(umem->umem); xdp_link_detach(cfg.ifindex, cfg.xdp_flags, 0); return EXIT_OK; }
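/*
 * Example invocation (illustrative only; the binary name and interface are
 * placeholders, the flags are the ones defined in long_options above):
 *
 *   sudo ./af_xdp_user --dev eth0 --interval 1000 --batch-pkts 4 \
 *        --priority 50 --busy-poll
 *
 * This attaches the default af_xdp_kern.o, opens one AF_XDP socket per
 * configured RX queue, and runs the cyclic-TX + lazy-RX loop with a
 * 1000 usec period, sending 4 packets per wakeup.
 */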