diff --git a/headers/bpf/compiler.h b/headers/bpf/compiler.h new file mode 100644 index 0000000..2588023 --- /dev/null +++ b/headers/bpf/compiler.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2016-2020 Authors of Cilium */ + +#ifndef __BPF_COMPILER_H_ +#define __BPF_COMPILER_H_ + +#ifndef __non_bpf_context +# include "stddef.h" +#endif + +#ifndef __section +# define __section(X) __attribute__((section(X), used)) +#endif + +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif + +#ifndef offsetof +# define offsetof(T, M) __builtin_offsetof(T, M) +#endif + +#ifndef field_sizeof +# define field_sizeof(T, M) sizeof((((T *)NULL)->M)) +#endif + +#ifndef __packed +# define __packed __attribute__((packed)) +#endif + +#ifndef __nobuiltin +# if __clang_major__ >= 10 +# define __nobuiltin(X) __attribute__((no_builtin(X))) +# else +# define __nobuiltin(X) +# endif +#endif + +#ifndef likely +# define likely(X) __builtin_expect(!!(X), 1) +#endif + +#ifndef unlikely +# define unlikely(X) __builtin_expect(!!(X), 0) +#endif + +#ifndef always_succeeds /* Mainly for documentation purpose. 
*/ +# define always_succeeds(X) likely(X) +#endif + +#undef __always_inline /* stddef.h defines its own */ +#define __always_inline inline __attribute__((always_inline)) + +#ifndef __stringify +# define __stringify(X) #X +#endif + +#ifndef __fetch +# define __fetch(X) (__u32)(__u64)(&(X)) +#endif + +#ifndef __aligned +# define __aligned(X) __attribute__((aligned(X))) +#endif + +#ifndef build_bug_on +# define build_bug_on(E) ((void)sizeof(char[1 - 2*!!(E)])) +#endif + +#ifndef __throw_build_bug +# define __throw_build_bug() __builtin_trap() +#endif + +#ifndef __printf +# define __printf(X, Y) __attribute__((__format__(printf, X, Y))) +#endif + +#ifndef barrier +# define barrier() asm volatile("": : :"memory") +#endif + +#ifndef barrier_data +# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") +#endif + +static __always_inline void bpf_barrier(void) +{ + /* Workaround to avoid verifier complaint: + * "dereference of modified ctx ptr R5 off=48+0, ctx+const is allowed, + * ctx+const+const is not" + */ + barrier(); +} + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0])) +#endif + +#ifndef __READ_ONCE +# define __READ_ONCE(X) (*(volatile typeof(X) *)&X) +#endif + +#ifndef __WRITE_ONCE +# define __WRITE_ONCE(X, V) (*(volatile typeof(X) *)&X) = (V) +#endif + +/* {READ,WRITE}_ONCE() with verifier workaround via bpf_barrier(). 
*/ + +#ifndef READ_ONCE +# define READ_ONCE(X) \ + ({ typeof(X) __val = __READ_ONCE(X); \ + bpf_barrier(); \ + __val; }) +#endif + +#ifndef WRITE_ONCE +# define WRITE_ONCE(X, V) \ + ({ typeof(X) __val = (V); \ + __WRITE_ONCE(X, __val); \ + bpf_barrier(); \ + __val; }) +#endif + +#endif /* __BPF_COMPILER_H_ */ diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h index c29f23b..de6705b 100644 --- a/headers/xdp/parsing_helpers.h +++ b/headers/xdp/parsing_helpers.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */ /* - * This file contains parsing functions that can be used in eXDP programs. The - * functions are marked as __always_inline, and fully defined in this header - * file to be included in the BPF program. + * This file contains parsing functions that are used in the packetXX XDP + * programs. The functions are marked as __always_inline, and fully defined in + * this header file to be included in the BPF program. * * Each helper parses a packet header, including doing bounds checking, and * returns the type of its contents if successful, and -1 otherwise. @@ -10,6 +10,10 @@ * For Ethernet and IP headers, the content type is the type of the payload * (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field. * All return values are in host byte order. + * + * The versions of the functions included here are slightly expanded versions of + * the functions in the packet01 lesson. For instance, the Ethernet header + * parsing has support for parsing VLAN tags. 
*/ #ifndef __PARSING_HELPERS_H @@ -54,7 +58,7 @@ struct icmphdr_common { /* Allow users of header file to redefine VLAN max depth */ #ifndef VLAN_MAX_DEPTH -#define VLAN_MAX_DEPTH 4 +#define VLAN_MAX_DEPTH 2 #endif /* Longest chain of IPv6 extension headers to resolve */ @@ -62,6 +66,11 @@ struct icmphdr_common { #define IPV6_EXT_MAX_CHAIN 6 #endif +#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ +/* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */ +struct collect_vlans { + __u16 id[VLAN_MAX_DEPTH]; +}; static __always_inline int proto_is_vlan(__u16 h_proto) { @@ -74,18 +83,24 @@ static __always_inline int proto_is_vlan(__u16 h_proto) * Ethernet header. Thus, caller can look at eth->h_proto to see if this was a * VLAN tagged packet. */ -static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, - struct ethhdr **ethhdr) +static __always_inline int parse_ethhdr_vlan(struct hdr_cursor *nh, + void *data_end, + struct ethhdr **ethhdr, + struct collect_vlans *vlans) { struct ethhdr *eth = nh->pos; + int hdrsize = sizeof(*eth); struct vlan_hdr *vlh; __u16 h_proto; int i; - if (eth + 1 > data_end) + /* Byte-count bounds check; check if current pointer + size of header + * is after data_end. 
+ */ + if (nh->pos + hdrsize > data_end) return -1; - nh->pos = eth + 1; + nh->pos += hdrsize; *ethhdr = eth; vlh = nh->pos; h_proto = eth->h_proto; @@ -102,6 +117,10 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, break; h_proto = vlh->h_vlan_encapsulated_proto; + if (vlans) /* collect VLAN ids */ + vlans->id[i] = + (bpf_ntohs(vlh->h_vlan_TCI) & VLAN_VID_MASK); + vlh++; } @@ -109,6 +128,14 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, return h_proto; /* network-byte-order */ } +static __always_inline int parse_ethhdr(struct hdr_cursor *nh, + void *data_end, + struct ethhdr **ethhdr) +{ + /* Expect compiler removes the code that collects VLAN ids */ + return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL); +} + static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh, void *data_end, __u8 next_hdr_type) @@ -174,6 +201,9 @@ static __always_inline int parse_iphdr(struct hdr_cursor *nh, return -1; hdrsize = iph->ihl * 4; + /* Sanity check packet field is valid */ + if(hdrsize < sizeof(*iph)) + return -1; /* Variable-length IPv4 header, need to use byte-based arithmetic */ if (nh->pos + hdrsize > data_end) @@ -267,10 +297,15 @@ static __always_inline int parse_tcphdr(struct hdr_cursor *nh, return -1; len = h->doff * 4; - if ((void *) h + len > data_end) + /* Sanity check packet field is valid */ + if(len < sizeof(*h)) return -1; - nh->pos = h + 1; + /* Variable-length TCP header, need to use byte-based arithmetic */ + if (nh->pos + len > data_end) + return -1; + + nh->pos += len; *tcphdr = h; return len; diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index cb3def9..aa6d3aa 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -1,14 +1,23 @@ # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) -USER_TARGETS := -BPF_TARGETS := edt_pacer01 -BPF_TARGETS += edt_pacer02 +USER_TARGETS := xdp_cpumap_loader +BPF_TARGETS := edt_pacer_vlan +BPF_TARGETS += 
xdp_cpumap_qinq + +EXTRA_DEPS += config.mk LIB_DIR = ../lib include $(LIB_DIR)/common.mk +include config.mk -# The iproute2 'tc' tool doesn't understand BTF debug info +all: config.mk + +config.mk: configure + @sh configure + +ifndef HAVE_TC_LIBBPF +# If the iproute2 'tc' tool doesn't understand BTF debug info # use llvm-strip to remove this debug info from object file # # *BUT* cannot strip everything as it removes ELF elems needed for @@ -16,6 +25,8 @@ include $(LIB_DIR)/common.mk # .PHONY: strip_tc_obj strip_tc_obj: ${BPF_TARGETS:=.o} + $(Q) echo "TC don't support libbpf - strip BTF info" $(Q) llvm-strip --no-strip-all --remove-section .BTF $? all: strip_tc_obj +endif diff --git a/traffic-pacing-edt/bpf_egress_loader.sh b/traffic-pacing-edt/bpf_egress_loader.sh index 316cddd..efaf597 100755 --- a/traffic-pacing-edt/bpf_egress_loader.sh +++ b/traffic-pacing-edt/bpf_egress_loader.sh @@ -11,12 +11,12 @@ root_check_run_with_sudo "$@" # Use common parameters source ${basedir}/parameters.sh -export TC=/sbin/tc +export TC=tc # This can be changed via --file or --obj if [[ -z ${BPF_OBJ} ]]; then # Fallback default - BPF_OBJ=edt_pacer02.o + BPF_OBJ=edt_pacer_vlan.o fi info "Applying TC-BPF egress setup on device: $DEV with object file: $BPF_OBJ" diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt new file mode 100755 index 0000000..15c3c4d --- /dev/null +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt @@ -0,0 +1,31 @@ +#!/usr/local/bin/bpftrace + +#include + +/* Measure time difference between EDT-time and real "NIC" TX-time. + * + * Assuming packets are EDT timestamped by the BPF-program, we can + * detect/measure how accuratly packets are actually transmitted + * towards the NIC driver, by comparing EDT-time against "now" + * timestamp in the function transmitting to the NIC driver. 
+ */ + +// tracepoint:net:net_dev_start_xmit +tracepoint:net:net_dev_xmit +{ + $skb = (struct sk_buff *)args->skbaddr; + //$tstamp = (uint64)$skb->tstamp; + $tstamp = $skb->skb_mstamp_ns; + $now = nsecs; + + // if ($skb->mark > 0) { + if ($tstamp > 0) { + if ($now >= $tstamp) { + $diff_late = $now - $tstamp; + } else { + $diff_ahead = $tstamp - $now; + } + @tstamp_diff_late = hist($diff_late / 1000); + @tstamp_diff_ahead = hist($diff_ahead / 1000); + } +} diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt new file mode 100755 index 0000000..add3270 --- /dev/null +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt @@ -0,0 +1,78 @@ +#!/usr/local/bin/bpftrace + +#include + +// tracepoint:net:net_dev_start_xmit +tracepoint:net:net_dev_xmit +{ + $skb = (struct sk_buff *)args->skbaddr; + //$tstamp = (uint64)$skb->tstamp; + $tstamp = $skb->skb_mstamp_ns; + $now = nsecs; + +// if ($skb->mark > 0) { + if ($tstamp > 0) { + if ($now >= $tstamp) { + $diff_late = $now - $tstamp; + } else { + $diff_ahead = $tstamp - $now; + } + @tstamp_usec_diff_late = hist($diff_late / 1000); + @tstamp_usec_diff_ahead = hist($diff_ahead / 1000); + } + + /* Capture burstiness over a time period, by dividing nanosec + * timestamp with wanted period, and keeping state byte counter as + * long as timestamp match. + * + * Practical usage shows that bpftrace uses a hash-map to implement + * this, which unfortunately cost too much (shows 5% jhash cpu + * usage), enough overhead to change behavior of prod system. 
+ */ + //$period = $now / 10000; /* 10000 = 10 usec */ + $period = $now / 30000; /* 30000 = 30 usec */ + if (@state[cpu] == $period) { + @state_bytes[cpu] += $skb->len; + } else { + @state[cpu] = $period; + if (@state_bytes[cpu] > 0) { + @byte_burst[cpu] = hist(@state_bytes[cpu]); + } + @state_bytes[cpu] = $skb->len; /* Reset counter */ + } +} + +/* +tracepoint:qdisc:qdisc_dequeue +{ + @qdisc_bulk_dequeue = lhist(args->packets, 0,64,1); +} +*/ + +/* +kretfunc:dev_hard_start_xmit +{ +// Wanted to know if ret == NETDEV_TX_BUSY +# ERROR: kfunc/kretfunc not available for your linked against bcc version. +} +*/ + + +/* How often does FQ-pacer find no-packets are qualified to be + * scheduled, which leads to scheduling an hrtimer event, that will + * start qdisc again at a later time. + * + * We cannot kprobe fq_dequeue as it is a module. + */ + +/* +kprobe:qdisc_watchdog_schedule_range_ns +{ + @qdisc_watchdog[cpu] = count(); +} + +kprobe:__netif_schedule +{ + @__netif_schedule[cpu] = count(); +} +*/ diff --git a/traffic-pacing-edt/codel_impl.h b/traffic-pacing-edt/codel_impl.h new file mode 100644 index 0000000..549dc61 --- /dev/null +++ b/traffic-pacing-edt/codel_impl.h @@ -0,0 +1,153 @@ +#ifndef __CODEL_IMPL_H +#define __CODEL_IMPL_H + +#ifndef CODEL_TARGET +#define CODEL_TARGET (10 * 1000 * 1000ULL) /* 10 ms in nanosec */ +#endif + +#ifndef CODEL_EXCEED_INTERVAL +#define CODEL_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ +#endif + +/* Codel like dropping scheme, inspired by: + * - RFC: https://queue.acm.org/detail.cfm?id=2209336 + * - Code: https://queue.acm.org/appendices/codel.html + * - Kernel: include/net/codel_impl.h + */ +struct codel_state { + /* codel like dropping scheme */ + __u64 first_above_time; /* Time when above target (0 if below)*/ + __u64 drop_next; /* Time to drop next packet */ + __u32 count; /* Packets dropped since going into drop state */ + __u32 dropping; /* Equal to 1 if in drop state */ +}; + +/* Table lookup for square-root 
shifted 16 bit */ +static __always_inline __u32 get_sqrt_sh16(__u64 cnt) +{ + switch (cnt) { + case 1: return 65536; /* 65536 * sqrt(1) */ + case 2: return 92682; /* 65536 * sqrt(2) */ + case 3: return 113512; /* 65536 * sqrt(3) */ + case 4: return 131072; /* 65536 * sqrt(4) */ + case 5: return 146543; /* 65536 * sqrt(5) */ + case 6: return 160530; /* 65536 * sqrt(6) */ + case 7: return 173392; + case 8: return 185364; + case 9: return 196608; + case 10: return 207243; + case 11: return 217358; + case 12: return 227023; + case 13: return 236293; + case 14: return 245213; + case 15: return 253820; + case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ + case 17: return 270212; + case 18: return 278046; + case 19: return 285664; + case 20: return 293086; + case 21: return 300324; + case 22: return 307391; + case 23: return 314300; + case 24: return 321060; + case 25: return 327680; /* 100 ms / sqrt(25) = 20 ms */ + case 26: return 334169; + case 27: return 340535; + case 28: return 346784; + case 29: return 352922; + case 30: return 358955; + case 31: return 364889; + case 32: return 370728; + case 33: return 376476; + case 34: return 382137; + case 35: return 387716; + case 36: return 393216; /* 100 / sqrt(36) = 16.66 ms */ + default: + return 463410; /* 65536*sqrt(50) => 100/sqrt(50) = 14.14 ms */ + } +} + +static __always_inline __u64 get_next_interval_sqrt(__u64 cnt) +{ + __u64 val = ((__u64)CODEL_EXCEED_INTERVAL << 16) / get_sqrt_sh16(cnt); + return val; +} + +static __always_inline __u64 +codel_control_law(__u64 t, __u64 cnt) +{ + return t + get_next_interval_sqrt(cnt); +} + +static __always_inline +bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = CODEL_EXCEED_INTERVAL; + + if (t_queue_sz < CODEL_TARGET) { + /* went below so we'll stay below for at least interval */ + codel->first_above_time = 0; + return false; + } + + if (codel->first_above_time == 0) { + /* just went above from below. 
If we stay above + * for at least interval we'll say it's ok to drop + */ + codel->first_above_time = now + interval; + return false; + } else if (now >= codel->first_above_time) { + return true; + } + return false; +} + +static __always_inline +bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = CODEL_EXCEED_INTERVAL; + + /* If horizon have been exceed for a while, inc drop intensity*/ + bool drop = codel_should_drop(codel, t_queue_sz, now); + + if (codel->dropping) { /* In dropping state */ + if (!drop) { + /* time below target - leave dropping state */ + codel->dropping = false; + return false; + } else if (now >= codel->drop_next) { + /* It's time for the next drop. Drop the current + * packet. Schedule the next drop + */ + codel->count += 1; + // schedule the next drop. + codel->drop_next = + codel_control_law(codel->drop_next, codel->count); + return true; + } + } else if (drop && + ((now - codel->drop_next < interval) || + (now - codel->first_above_time >= interval))) { + /* If we get here, then we're not in dropping state. + * Decide whether it's time to enter dropping state. + */ + __u32 count = codel->count; + + codel->dropping = true; + + /* If we're in a drop cycle, drop rate that controlled queue + * on the last cycle is a good starting point to control it now. + */ + if (now - codel->drop_next < interval) + count = count > 2 ? 
(count - 2) : 1; + else + count = 1; + + codel->count = count; + codel->drop_next = codel_control_law(now, count); + return true; + } + return false; +} + +#endif /* __CODEL_IMPL_H */ diff --git a/traffic-pacing-edt/configure b/traffic-pacing-edt/configure new file mode 100755 index 0000000..248c846 --- /dev/null +++ b/traffic-pacing-edt/configure @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# This is not an autoconf generated configure +# + +# Output file which is input to Makefile +CONFIG=config.mk + +# Assume tc is in $PATH +TC=tc + +check_tc_libbpf() +{ + tc_version=$($TC -V) + if echo $tc_version | grep -q libbpf; then + libbpf_version=${tc_version##*libbpf } + echo "HAVE_TC_LIBBPF:=y" >> $CONFIG + echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG + echo "yes ($libbpf_version)" + else + echo "no" + fi +} + +echo "# Generated config" > $CONFIG +echo "Detecting available features on system" + +echo -n " - libbpf support in tc tool: " +check_tc_libbpf diff --git a/traffic-pacing-edt/edt_pacer01.c b/traffic-pacing-edt/edt_pacer01.c deleted file mode 100644 index 044158f..0000000 --- a/traffic-pacing-edt/edt_pacer01.c +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include -#include "iproute2_compat.h" - -char _license[] SEC("license") = "GPL"; - -/* The tc tool (iproute2) use another ELF map layout than libbpf (struct - * bpf_map_def), see struct bpf_elf_map from iproute2. 
- */ -struct bpf_elf_map SEC("maps") cnt_map = { - .type = BPF_MAP_TYPE_ARRAY, - .size_key = sizeof(__u32), - .size_value = sizeof(__u64), - .max_elem = 1, - //.pinning = PIN_GLOBAL_NS, -}; - -SEC("classifier") int tc_dummy(struct __sk_buff *skb) -{ - volatile void *data, *data_end; - int ret = BPF_OK; - struct ethhdr *eth; - - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - eth = (struct ethhdr *)data; - - if (data + sizeof(*eth) > data_end) - return BPF_DROP; - - /* Keep ARP resolution working */ - if (eth->h_proto == bpf_htons(ETH_P_ARP)) { - ret = BPF_OK; - goto out; - } - - out: - return ret; -} diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c deleted file mode 100644 index 47eecc7..0000000 --- a/traffic-pacing-edt/edt_pacer02.c +++ /dev/null @@ -1,126 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -#include -#include -#include -#include "iproute2_compat.h" - -char _license[] SEC("license") = "GPL"; - -#define NS_PER_SEC 1000000000 - -/* skb->len in bytes, thus easier to keep rate in bytes */ -#define RATE_IN_BITS (1000 * 1000 * 1000) -#define RATE_IN_BYTES (RATE_IN_BITS / 8) - -#define T_HORIZON_DROP (2000 * 1000 * 1000) - -/* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */ -#define READ_ONCE(V) (V) -#define WRITE_ONCE(X,V) (X) = (V) - -struct edt_val { - __u64 rate; - __u64 t_last; - __u64 t_horizon_drop; - __u64 t_horizon_ecn; -}; - -/* The tc tool (iproute2) use another ELF map layout than libbpf (struct - * bpf_map_def), see struct bpf_elf_map from iproute2. - */ -struct bpf_elf_map SEC("maps") time_delay_map = { - .type = BPF_MAP_TYPE_ARRAY, - .size_key = sizeof(__u32), - .size_value = sizeof(struct edt_val), - .max_elem = 1, - //.pinning = PIN_GLOBAL_NS, -}; - -/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to - * be send in the future. 
- */ -static __always_inline int sched_departure(struct __sk_buff *skb) -{ - struct edt_val *edt; - __u64 t_queue_sz; - __u64 t_xmit_ns; - __u64 t_next; - __u64 t_curr; - int key = 0; - __u64 now; - - edt = bpf_map_lookup_elem(&time_delay_map, &key); - if (!edt) - return BPF_DROP; - - /* Calc transmission time it takes to send packet 'bytes' */ - t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / RATE_IN_BYTES; - // t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / edt->rate; - - now = bpf_ktime_get_ns(); - - /* Allow others to set skb tstamp prior to us */ - t_curr = skb->tstamp; - if (t_curr < now) - t_curr = now; - - /* The 't_last' timestamp can be in the future. Packets scheduled a head - * of his packet can be seen as the queue size measured in time, via - * correlating this to 'now' timestamp. - */ - t_next = READ_ONCE(edt->t_last) + t_xmit_ns; - - /* If packet doesn't get scheduled into the future, then there is - * no-queue and we are not above rate limit. Send packet immediately and - * move forward t_last timestamp to now. - */ - if (t_next <= t_curr) { - WRITE_ONCE(edt->t_last, t_curr); - return BPF_OK; - } - - /* Calc queue size measured in time */ - t_queue_sz = t_next - now; - - /* FQ-pacing qdisc also have horizon, but cannot use that, because this - * BPF-prog will have updated map (t_last) on packet and assumes it got - * its part of bandwidth. 
- */ - if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */) - return BPF_DROP; - - // TODO Add ECN marking horizon - - /* Advance "time queue" */ - WRITE_ONCE(edt->t_last, t_next); - - /* Schedule packet to be send at future timestamp */ - skb->tstamp = t_next; - return BPF_OK; -} - -SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) -{ - volatile void *data, *data_end; - int ret = BPF_OK; - struct ethhdr *eth; - - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - eth = (struct ethhdr *)data; - - if (data + sizeof(*eth) > data_end) - return BPF_DROP; - - /* Keep ARP resolution working */ - if (eth->h_proto == bpf_htons(ETH_P_ARP)) { - ret = BPF_OK; - goto out; - } - - // TODO: match on vlan16 and only apply EDT on that - return sched_departure(skb); - - out: - return ret; -} diff --git a/traffic-pacing-edt/edt_pacer_vlan.c b/traffic-pacing-edt/edt_pacer_vlan.c new file mode 100644 index 0000000..a361079 --- /dev/null +++ b/traffic-pacing-edt/edt_pacer_vlan.c @@ -0,0 +1,280 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#include +#include +#include + +#include + +#define VLAN_MAX_DEPTH 2 +#include + +char _license[] SEC("license") = "GPL"; + +#define NS_PER_SEC 1000000000 + +/* Strategy: Shape at MAC (Medium Access Control) layer with Ethernet + * + * Production use-case is pacing traffic at 1Gbit/s wirespeed, using a + * 10Gbit/s NIC, because 1G end-user switch cannot handle bursts. 
+ * + * (https://en.wikipedia.org/wiki/Interpacket_gap + * 12 bytes = interframe gap (IFG) 96 bit + + * (https://en.wikipedia.org/wiki/Ethernet_frame) + * 8 bytes = MAC preamble + * 4 bytes = Ethernet Frame Check Sequence (FCS) CRC + * 46 bytes = Minimum Payload size + * + * 14 bytes = Ethernet header + * 8 bytes = 2x VLAN headers + */ +//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */ +#define RATE_IN_BITS (990 * 1000 * 1000ULL) +//#define RATE_IN_BITS (950 * 1000 * 1000ULL) +#define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */ +//#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */ +#define ETH_MIN (84) + +/* skb->len in bytes, thus convert rate to bytes */ +#define RATE_IN_BYTES (RATE_IN_BITS / 8) + +/* Controlling how large queue (in time) is allow to grow */ +#define T_HORIZON_DROP (40 * 1000 * 1000ULL) +#define T_HORIZON_TARGET (5 * 1000 * 1000ULL) +#define T_HORIZON_ECN (1 * 1000 * 1000ULL) + +/* Codel: If queue exceed target for more than one interval, start dropping */ +#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ + +#define CODEL_TARGET T_HORIZON_TARGET +#define CODEL_EXCEED_INTERVAL T_EXCEED_INTERVAL +#include "codel_impl.h" + +struct edt_val { + __u64 rate; + __u64 t_last; + __u64 t_horizon_drop; + __u64 t_horizon_ecn; + struct codel_state codel; +} __aligned(64); /* Align struct to cache-size to avoid false-sharing */ + +#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */ +/* Use BTF format to create map */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4096); /* Max possible VLANs */ + __type(key, __u32); + __type(value, struct edt_val); +// __uint(pinning, LIBBPF_PIN_BY_NAME); +} time_delay_map SEC(".maps"); + +#else +/* The (iproute2) tc tool (without libbpf support) use another ELF map + * layout than libbpf (struct bpf_map_def), see struct bpf_elf_map + * from iproute2. 
+ */ +#include "iproute2_compat.h" +struct bpf_elf_map SEC("maps") time_delay_map = { + .type = BPF_MAP_TYPE_ARRAY, + .size_key = sizeof(__u32), + .size_value = sizeof(struct edt_val), + .max_elem = 4096, /* Max possible VLANs */ +// .pinning = PIN_GLOBAL_NS, +}; +#endif + + +/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to + * be send in the future. + */ +static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) +{ + struct edt_val *edt; + __u64 t_queue_sz; + __u64 t_xmit_ns; + __u64 wire_len; + __u64 t_next; + __u64 t_curr; + __u64 now; + + edt = bpf_map_lookup_elem(&time_delay_map, &key); + if (!edt) + return BPF_DROP; + + /* Calc transmission time it takes to send packet 'bytes'. + * + * Details on getting precise bytes on wire. The skb->len does include + * length of GRO/GSO segments, but not the segment headers that gets + * added on transmit. Fortunately skb->wire_len at TC-egress hook (not + * ingress) include these headers. (See: qdisc_pkt_len_init()) + */ + wire_len = skb->wire_len + OVERHEAD; + wire_len = wire_len > ETH_MIN ? wire_len : ETH_MIN; + + t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES; + +// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES; + // t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate; + + // now = bpf_ktime_get_ns(); + now = bpf_ktime_get_boot_ns(); /* Use same ktime as bpftrace */ + + /* Allow others to set skb tstamp prior to us */ + t_curr = skb->tstamp; + if (t_curr < now) + t_curr = now; + + /* The 't_last' timestamp can be in the future. Packets scheduled a head + * of his packet can be seen as the queue size measured in time, via + * correlating this to 'now' timestamp. + */ + t_next = READ_ONCE(edt->t_last) + t_xmit_ns; + + /* If packet doesn't get scheduled into the future, then there is + * no-queue and we are not above rate limit. Normally send packet + * immediately and move forward t_last timestamp to now. 
+ * + * But in our use-case the traffic need smoothing at a earlier + * stage, as bursts at lower rates can hurt the crapy switch. + * Thus, schedule SKB transmissing as new + t_xmit_ns. + */ + if (t_next <= t_curr) { +#if 1 + __u64 t_curr_next; + __u32 min_len = 1538; + + /* Minimum delay for all packet if no time-queue */ + wire_len = (wire_len > min_len) ? wire_len : min_len; + t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES; + t_curr_next = t_curr + t_xmit_ns; + + WRITE_ONCE(edt->t_last, t_curr_next); + skb->tstamp = t_curr_next; + skb->mark = 1; /* No queue - add minimum delay */ +#else + WRITE_ONCE(edt->t_last, t_curr); +#endif + return BPF_OK; + + } + + /* Calc queue size measured in time */ + t_queue_sz = t_next - now; + + /* FQ-pacing qdisc also have horizon, but cannot use that, because this + * BPF-prog will have updated map (t_last) on packet and assumes it got + * its part of bandwidth. + */ + if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */) + return BPF_DROP; + + /* If TCP didn't react to ECN marking, then start dropping some */ + // if (codel_drop(edt, t_queue_sz, now)) + if (codel_drop(&edt->codel, t_queue_sz, t_next)) + return BPF_DROP; + + skb->mark = 2; /* (time) queue exist - and small/below T_HORIZON_ECN */ + + /* ECN marking horizon */ + if (t_queue_sz >= T_HORIZON_ECN) { + skb->mark = 3; /* (time) queue exist - and is large */ + bpf_skb_ecn_set_ce(skb); + } + + /* Advance "time queue" */ + WRITE_ONCE(edt->t_last, t_next); + + /* Schedule packet to be send at future timestamp */ + skb->tstamp = t_next; + return BPF_OK; +} + +static __always_inline +__u16 get_inner_qinq_vlan(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + __u16 vlan_key; + + /* NIC can HW "offload" the outer VLAN, moving it to skb context */ + if (skb->vlan_present) + vlan_key = vlans->id[0]; /* Inner vlan placed as first inline */ + else + vlan_key = vlans->id[1]; /* All VLAN headers inline */ + + return vlan_key; +} + +static __always_inline +__u16 
get_vlan(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + __u16 vlan_key; + + /* Handle extracting VLAN if skb context have VLAN offloaded */ + if (skb->vlan_present) + vlan_key = skb->vlan_tci & VLAN_VID_MASK; + else + vlan_key = vlans->id[0]; + + return vlan_key; +} + +static __always_inline +__u16 extract_vlan_key(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + int QinQ = 0; + + /* The inner VLAN is the key to extract. But it is complicated + * due to NIC "offloaded" VLAN (skb->vlan_present). In case + * BPF-prog is loaded on outer VLAN net_device, the BPF-prog + * sees the inner-VLAN at the first and only VLAN. + */ + if (skb->vlan_present) { + if (vlans->id[0]) + QinQ = 1; + } else { + if (vlans->id[1]) + QinQ = 1; + } + + if (QinQ) + return get_inner_qinq_vlan(skb, vlans); + else + return get_vlan(skb, vlans); +} + +SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + struct collect_vlans vlans = { 0 }; + struct ethhdr *eth; + int ret = BPF_OK; + __u16 vlan_key; + + /* These keep track of the next header type and iterator pointer */ + struct hdr_cursor nh; + int eth_type; + nh.pos = data; + + eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); + if (eth_type < 0) + return BPF_DROP; + + /* Keep ARP resolution working */ + if (eth_type == bpf_htons(ETH_P_ARP)) { + ret = BPF_OK; + goto out; + } + + if (!proto_is_vlan(eth->h_proto) && !skb->vlan_present) { + /* Skip non-VLAN frames */ + return BPF_OK; + } + + vlan_key = extract_vlan_key(skb, &vlans); + + /* Each (inner) VLAN id gets it own EDT pacing */ + return sched_departure(skb, vlan_key); + + out: + return ret; +} diff --git a/traffic-pacing-edt/functions.sh b/traffic-pacing-edt/functions.sh index a92f482..32cbdde 100644 --- a/traffic-pacing-edt/functions.sh +++ b/traffic-pacing-edt/functions.sh @@ -62,3 +62,28 @@ function call_tc() { function call_tc_allow_fail() { _call_tc "allow_fail" "$@" 
} + +## -- Wrapper calls for IP -- +function _call_ip() { + local allow_fail="$1" + shift + if [[ -n "$VERBOSE" ]]; then + echo "ip $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $IP "$@" + local status=$? + if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 3 "Exec error($status) occurred cmd: \"$IP $@\"" + fi + fi +} +function call_ip() { + _call_ip "" "$@" +} +function call_ip_allow_fail() { + _call_ip "allow_fail" "$@" +} diff --git a/traffic-pacing-edt/hash_func01.h b/traffic-pacing-edt/hash_func01.h new file mode 100644 index 0000000..3825581 --- /dev/null +++ b/traffic-pacing-edt/hash_func01.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1 + * + * Based on Paul Hsieh's (LGPG 2.1) hash function + * From: http://www.azillionmonkeys.com/qed/hash.html + */ + +#define get16bits(d) (*((const __u16 *) (d))) + +static __always_inline +__u32 SuperFastHash (const char *data, int len, __u32 initval) { + __u32 hash = initval; + __u32 tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ +#pragma clang loop unroll(full) + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (__u16); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= ((signed char)data[sizeof (__u16)]) << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += (signed char)*data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} diff --git a/traffic-pacing-edt/iproute2_compat.h b/traffic-pacing-edt/iproute2_compat.h index a535f5f..3d72546 100644 --- 
a/traffic-pacing-edt/iproute2_compat.h +++ b/traffic-pacing-edt/iproute2_compat.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Taken from from #include */ #ifndef __IPROUTE2_COMPAT_H #define __IPROUTE2_COMPAT_H @@ -8,6 +9,11 @@ * binary layout until "flags". Thus, BPF-progs can use both if careful. */ +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + /* ELF map definition (copied from iproute2 source code) */ struct bpf_elf_map { __u32 type; diff --git a/traffic-pacing-edt/parameters.sh b/traffic-pacing-edt/parameters.sh index 6d0841d..d0077ab 100644 --- a/traffic-pacing-edt/parameters.sh +++ b/traffic-pacing-edt/parameters.sh @@ -10,10 +10,10 @@ function usage() { echo "Usage: $0 [-vh] --dev ethX" echo " -d | --dev : (\$DEV) Interface/device (required)" echo " -v | --verbose : (\$VERBOSE) verbose" - echo " --remove : (\$REMOVE) Remove the TC rules" + echo " --remove : (\$REMOVE) Remove the rules" echo " --dry-run : (\$DRYRUN) Dry-run only (echo tc commands)" - echo " -s | --stats : (\$STATS_ONLY) Call TC statistics command" - echo " -l | --list : (\$LIST) List TC filter setup after setup" + echo " -s | --stats : (\$STATS_ONLY) Call statistics command" + echo " -l | --list : (\$LIST) List setup after setup" echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load" echo "" } @@ -80,5 +80,5 @@ done if [ -z "$DEV" ]; then usage - err 2 "Please specify TC net_device" + err 2 "Please specify net_device (\$DEV)" fi diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh new file mode 100755 index 0000000..882b146 --- /dev/null +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Loading FQ pacing qdisc in multi-queue MQ setup to avoid root qdisc lock. +# +# The FQ pacing qdisc is doing all the work of pacing packet out according to +# the EDT (Earliest Departure Time) future timestamps set by our BPF-prog that +# runs a TC-egress hook. 
+# +# Author: Jesper Dangaaard Brouer +# License: GPLv2 +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +export TC=tc + +# Default verbose +VERBOSE=1 + +# Select between multiq or single root qdisc +if [[ -z $1 ]]; then + if [[ -z $REMOVE ]]; then + err 1 "Specify root qdisc system: single or mq (multi-queue)" + fi +fi +TYPE=$1 + +# Delete existing root qdisc +call_tc_allow_fail qdisc del dev "$DEV" root + +if [[ -n $REMOVE ]]; then + exit 0 +fi + +function use_multiq() +{ + # MQ (Multi-Queue) as root qdisc + call_tc qdisc replace dev $DEV root handle 7FFF: mq + + # Add FQ-pacer qdisc on each NIC avail TX-queue + i=0 + for dir in /sys/class/net/$DEV/queues/tx-*; do + # Details: cause-off-by-one, as tx-0 becomes handle 1: + ((i++)) || true + #call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + # + # The higher 'flow_limit' is needed for high-BW pacing + call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ + flow_limit 1000 + # + # quantum $((1514*4)) initial_quantum $((1514*20)) + # call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit + done +} + +function use_single_fq_pacer() +{ + call_tc qdisc replace dev $DEV root handle 7FFF: fq \ + flow_limit 1000 +} + +case "$TYPE" in + mq | multiq ) + use_multiq + ;; + single | fq ) + use_single_fq_pacer + ;; + * ) + err 1 "Unknown type: ${TYPE}" + ;; +esac diff --git a/traffic-pacing-edt/tc_htb_shaper.sh b/traffic-pacing-edt/tc_htb_shaper.sh new file mode 100755 index 0000000..08e0b06 --- /dev/null +++ b/traffic-pacing-edt/tc_htb_shaper.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# +# This HTB shaper setup script is available for easier comparing the +# accuracy against the EDT solution. 
+# +# Author: Jesper Dangaaard Brouer +# License: GPLv2 +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +export TC=/sbin/tc + +# It seems measured BW is TCP goodput, but configured BW is wirespeed. +# Measurements show around 930Mbit best-case. Q-in-Q result in MTU +# 1522 bytes. TCP goodput segments are 1448 bytes. +# +#RATE=$((930*1522/1448))Mbit +##RATE=$((933*1522/1448))Mbit +##CEIL=$((999*1522/1448)) +#CEIL=1Gbit +#CEIL=980mbit + +# EDT shaper show TCP goodput of 956 Mbit/s. +# echo $((956*1514/1448)) = 999 +RATE=999Mbit +CEIL=1000Mbit + +#RATE=500mbit +#CEIL=577mbit + +# Each of the HTB root-class(es) get these RATE+CEIL upper bandwidth bounds. +ROOT_RATE=9000Mbit +ROOT_CEIL=9500Mbit + +DEFAULT_RATE=6000Mbit +DEFAULT_CEIL=6000Mbit + +TC=/usr/sbin/tc +VERBOSE=1 + +function tc() { + _call_tc "" "$@" +} + +# Delete existing root qdisc +call_tc_allow_fail qdisc del dev "$DEV" root + +if [[ -n $REMOVE ]]; then + exit 0 +fi + +# HTB shaper +#tc qdisc add dev "$DEV" root handle 1: htb default 2 +tc qdisc add dev "$DEV" root handle 1: htb default 16 + +# The root-class set upper bandwidth usage +tc class add dev "$DEV" parent 1: classid 1:1 \ + htb rate $ROOT_RATE ceil $ROOT_CEIL + +# Default class 1:2 +tc class add dev "$DEV" parent 1: classid 1:2 htb \ + rate "$DEFAULT_RATE" ceil "$DEFAULT_CEIL" +# burst 100000 cburst 100000 +tc qdisc add dev $DEV parent 1:2 fq_codel + + +# Class for vlan 16 +tc class add dev "$DEV" parent 1: classid 1:16 htb rate "$RATE" ceil "$CEIL" \ + burst $((1522*2)) cburst $((1522*2)) \ + linklayer ethernet +# burst 1522 cburst 1522 + #burst 1 cburst 1 +# burst $((1522*2)) cburst $((1522*2)) +# overhead $((14+4+4)) linklayer ethernet +#tc qdisc add dev "$DEV" parent 1:16 fq_codel +tc qdisc add dev "$DEV" parent 1:16 fq_codel quantum $((1514+4+4)) +#tc qdisc add dev "$DEV" parent 1:16 pfifo + +# parent filter: +#tc filter add dev "$DEV" 
parent 1:0 prio 100 protocol 802.1q u32 +# +# vlan 16: +#tc filter add dev "$DEV" parent 1:0 prio 100 \ +# protocol 802.1q \ +# u32 match u16 0x0010 0x0fff at -4 \ +# flowid 1:16 + +tc filter add dev $DEV protocol all parent 1:0 prio 101 \ + basic match "meta(vlan mask 0xfff eq 16)" flowid 1:16 diff --git a/traffic-pacing-edt/testlab_vlan_setup.sh b/traffic-pacing-edt/testlab_vlan_setup.sh new file mode 100755 index 0000000..2228af5 --- /dev/null +++ b/traffic-pacing-edt/testlab_vlan_setup.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Testlab setup script for VLAN Q-in-Q (double tagged VLAN) config. +# +# Author: Jesper Dangaaard Brouer +# License: GPLv2 +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +export IP=/sbin/ip +function ip() { + call_ip "$@" +} + +function create_vlan_device() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} + ip link set ${device}.${vlan} up +} + +function create_vlan_device_802_1ad() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} \ + protocol 802.1ad + ip link set ${device}.${vlan} up +} + + +function delete_vlan_device() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link del ${device}.${vlan} +} + + +if [[ -z "$1" ]]; then + err 3 "Missing arg#1 for outer vlan" +fi +OUTER=$1 + +if [[ -z "$2" ]]; then + err 3 "Missing arg#2 for inner vlan" +fi +INNER=$2 + +if [[ -n $REMOVE ]]; then + delete_vlan_device $INNER ${DEV}.${OUTER} + delete_vlan_device $OUTER $DEV + exit 0 +fi + +create_vlan_device $OUTER $DEV +create_vlan_device $INNER ${DEV}.${OUTER} + +# Set 
MTU to handle extra VLAN headers, NICs usually allow one VLAN +# header even though they have configured MTU 1500. +ip link set $DEV mtu 1508 +ip link set ${DEV}.${OUTER} mtu 1504 diff --git a/traffic-pacing-edt/vlans_load_edt.sh b/traffic-pacing-edt/vlans_load_edt.sh new file mode 100755 index 0000000..3dc1fab --- /dev/null +++ b/traffic-pacing-edt/vlans_load_edt.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Script for loading EDT-pacer BPF-prog on all downstream VLANs +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +# Default verbose +VERBOSE=1 + +# Downstream dev: ens6f0 +VLAN_START=168 +VLAN_END=205 + +cmd=${basedir}/bpf_egress_loader.sh + +options="" + +if [[ -n $REMOVE ]]; then + options+=" --remove" +fi +if [[ -n $DRYRUN ]]; then + options+=" --dry-run" + #cmd="echo $cmd" +fi +if [[ -n $VERBOSE ]]; then + options+=" --verbose" +fi + +for (( vlan=${VLAN_START}; vlan<=${VLAN_END}; vlan++ )) +do + VLAN=${DEV}.$vlan + $cmd --dev $VLAN $options +done diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c new file mode 100644 index 0000000..0afd46a --- /dev/null +++ b/traffic-pacing-edt/xdp_cpumap_loader.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0+ +static const char *__doc__ = + " XDP load-balancing with CPU-map"; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include /* XDP defines */ + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 +#define EXIT_FAIL_FILE 6 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 
'h' }, + {"dev", required_argument, NULL, 'd' }, + {"qsize", required_argument, NULL, 'q' }, + {"force", no_argument, NULL, 'F' }, + {"remove", no_argument, NULL, 'r' }, + {"non-cpu", required_argument, NULL, 'x' }, + {"exclude-cpu", required_argument, NULL, 'x' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +struct cpumap_config { + int fd_cpumap; + int fd_cpus_enabled; + int fd_cpus_count; + int *cpu_exclude; + int max_cpus; + __u32 qsize; +}; + +static int cpumap_config_init(struct cpumap_config *cfg) +{ + int n_cpus = get_nprocs_conf(); + int *cpu_exclude; + + memset(cfg, 0, sizeof(*cfg)); + + cpu_exclude = malloc(n_cpus * sizeof(int)); + if (!cpu_exclude) { + fprintf(stderr, "failed to allocate array\n"); + return EXIT_FAIL_MEM; + } + memset(cpu_exclude, 0, n_cpus * sizeof(int)); + + cfg->cpu_exclude = cpu_exclude; + cfg->max_cpus = n_cpus; + return 0; +} + +int __find_map_fd_by_name(struct bpf_object *obj, char *name) +{ + int fd; + + fd = bpf_object__find_map_fd_by_name(obj, name); + if (fd < 0) { + printf("No map found! 
- named: %s\n", name); + exit(EXIT_FAIL_BPF); + } + return fd; +} + +/* Get file descriptors to BPF-maps */ +static int cpumap_config_find_maps(struct bpf_object *obj, + struct cpumap_config *cfg) +{ + cfg->fd_cpumap = __find_map_fd_by_name(obj, "cpumap"); + cfg->fd_cpus_enabled = __find_map_fd_by_name(obj, "cpus_enabled"); + cfg->fd_cpus_count = __find_map_fd_by_name(obj, "cpus_count"); + return 0; +} + +static int create_cpu_entry(struct cpumap_config *cfg, __u32 cpu, + struct bpf_cpumap_val *value, + __u32 enabled_idx, bool new) +{ + __u32 curr_cpus_count = 0; + __u32 key = 0; + int err, fd; + + /* Add a CPU entry to cpumap, as this allocates a cpu entry in + * the kernel for the cpu. + */ + fd = cfg->fd_cpumap; + err = bpf_map_update_elem(fd, &cpu, value, 0); + if (err) { + fprintf(stderr, "Create(fd:%d) CPU(%d) entry failed (err:%d)\n", + fd, cpu, err); + return EXIT_FAIL_BPF; + } + + /* Inform bpf_progs that a new CPU is enabled and available + * to be selected from the map, that maps index to actual CPU.
+ */ + fd = cfg->fd_cpus_enabled; + err = bpf_map_update_elem(fd, &enabled_idx, &cpu, 0); + if (err) { + fprintf(stderr, "Add to enabled avail CPUs failed\n"); + return EXIT_FAIL_BPF; + } + + /* When not replacing/updating existing entry, bump the count */ + fd = cfg->fd_cpus_count; + err = bpf_map_lookup_elem(fd, &key, &curr_cpus_count); + if (err) { + fprintf(stderr, "Failed reading curr cpus_count\n"); + return EXIT_FAIL_BPF; + } + if (new) { + curr_cpus_count++; + err = bpf_map_update_elem(fd, &key, &curr_cpus_count, 0); + if (err) { + fprintf(stderr, "Failed write curr cpus_count\n"); + return EXIT_FAIL_BPF; + } + } + + return 0; +} + +/* Userspace MUST create/populate CPUMAP entries for redirect to work + */ +static int configure_cpus(struct cpumap_config *cfg) +{ + struct bpf_cpumap_val value = { 0 }; + int n_cpus = cfg->max_cpus; + int *exclude = cfg->cpu_exclude; + int enabled_idx = 0; + bool new = true; + int cpu, err; + + value.qsize = cfg->qsize; + + for (cpu = 0; cpu < n_cpus; cpu++) { + + if (exclude[cpu] == -1) { + printf("Excluding CPU:%d\n", cpu); + continue; + } + printf("Enable CPU:%d\n", cpu); + err = create_cpu_entry(cfg, cpu, &value, enabled_idx, new); + if (err) + return err; + enabled_idx++; + } + return 0; +} + +struct bpf_object *do_load_bpf_obj(struct bpf_object *obj) +{ + char buf[200]; + int err; + + err = bpf_object__load(obj); + if (err) { + libbpf_strerror(err, buf, sizeof(buf)); + printf("Error loading: %s\n", buf); + return NULL; + } + return obj; +} + +int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags) +{ + int prog_fd = bpf_program__fd(prog); + int err; + + if (prog_fd < 0) { + fprintf(stderr, "bpf_program__fd failed\n"); + return EXIT_FAIL_BPF; + } + + err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags); + if (err) { + fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n", + __func__, err); + return EXIT_FAIL_XDP; + } + return EXIT_OK; +} + +int do_xdp_detach(int ifindex, __u32 xdp_flags) +{ + int 
err; + + err = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + if (err) { + fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n", + __func__, err); + return EXIT_FAIL_XDP; + } + return EXIT_OK; +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + bool do_detach = false; + int opt, longindex = 0; + char buf[100]; + int err; + + struct bpf_object *obj = NULL; + struct bpf_program *prog; + + /* System to setup and exclude some CPUs */ + struct cpumap_config cfg; + int n_cpus = get_nprocs_conf(); + int non_cpu = -1; + int *cpu_exclude; + + cpumap_config_init(&cfg); + cpu_exclude = cfg.cpu_exclude; + cfg.qsize = 512; /* Default queue size */ + + /* Always use XDP native driver mode */ + xdp_flags |= XDP_FLAGS_DRV_MODE; + + obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL); + err = libbpf_get_error(obj); + if (err) { + libbpf_strerror(err, buf, sizeof(buf)); + printf("Error opening file: %s\n", buf); + return EXIT_FAIL_FILE; + } + err = EXIT_OK; + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hd:q:Frx:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 'q': + cfg.qsize = strtol(optarg, NULL, 10); + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + case 'r': + do_detach = true; + break; + case 'x': /* --exclude-cpu or --non-cpu */ + /* Possible to exclude multiple CPUs on cmdline */ + non_cpu = strtoul(optarg, NULL, 0); + if (non_cpu >= n_cpus) { + fprintf(stderr, + "--cpu nr too large for cpumap err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + cpu_exclude[non_cpu] = -1; + break; + + case 
'h': + error: + default: + usage(argv); + free(cpu_exclude); + return EXIT_FAIL_OPTION; + } + } + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + err = EXIT_FAIL_OPTION; + goto out; + } + + if (do_detach) + return do_xdp_detach(ifindex, xdp_flags); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + err = EXIT_FAIL_MEM; + goto out; + } + + obj = do_load_bpf_obj(obj); + if (!obj) { + err = EXIT_FAIL_BPF; + goto out; + } + + /* Pickup first BPF-program */ + prog = bpf_program__next(NULL, obj); + if (!prog) { + printf("No program!\n"); + err = EXIT_FAIL_BPF; + goto out; + } + + /* Find maps maps */ + if (cpumap_config_find_maps(obj, &cfg)) { + err = EXIT_FAIL_BPF; + goto out; + } + + /* Configure cpumap */ + if (configure_cpus(&cfg)) { + err = EXIT_FAIL_BPF; + goto out; + } + + /* Attach XDP program */ + err = do_xdp_attach(ifindex, prog, xdp_flags); + if (err) + goto out; + + printf("Attached XDP program:\"%s\" on netdev:%s (ifindex:%d)\n", + bpf_program__name(prog), ifname, ifindex); + printf("CPUs: %d\n", n_cpus); + +out: + if (obj) + bpf_object__close(obj); + + free(cpu_exclude); + return err; +} diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c new file mode 100644 index 0000000..eada801 --- /dev/null +++ b/traffic-pacing-edt/xdp_cpumap_qinq.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#include +#include /* struct bpf_cpumap_val */ +#include +#include + +#define INITVAL 15485863 +//#define INITVAL 2654435761 + +#include "hash_func01.h" /* SuperFastHash */ + +#include + +#define VLAN_MAX_DEPTH 2 +#include + +#define MAX_CPUS 24 + +/* This global variable is used for limiting CPU that can be selected */ +__u32 global_max_cpus = 12; /* TODO: Allow userspace to adjust this */ + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct { + __uint(type, BPF_MAP_TYPE_CPUMAP); + __uint(key_size, 
sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_cpumap_val)); + __uint(max_entries, MAX_CPUS); +} cpumap SEC(".maps"); + +/* Mapping table with CPUs enabled, for hashing between */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, MAX_CPUS); +} cpus_enabled SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1); +} cpus_count SEC(".maps"); + +static __always_inline +__u32 extract_vlan_key(struct collect_vlans *vlans) +{ + /* Combine inner and outer VLAN as a key */ + __u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0]; + return vlan_key; +} + +SEC("xdp") +int xdp_cpumap_qinq(struct xdp_md *ctx) +{ + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; + struct collect_vlans vlans = { 0 }; + __u32 hash_key, vlan_key; + struct ethhdr *eth; + __u32 cpu_idx, cpu_dest = 0; + __u32 *cpu_lookup; + __u64 action; + __u32 *cpu_max; + + + /* These keep track of the next header type and iterator pointer */ + struct hdr_cursor nh; + int eth_type; + nh.pos = data; + + eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); + if (eth_type < 0) { + action = XDP_ABORTED; + goto out; + } + + /* Keep ARP resolution working */ + if (eth_type == bpf_htons(ETH_P_ARP)) { + action = XDP_PASS; + goto out; + } + + if (!proto_is_vlan(eth->h_proto)) { + /* Skip non-VLAN frames */ + action = XDP_PASS; + goto out; + } + + int key0 = 0; + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); + if (!cpu_max) + return XDP_ABORTED; + + /* Use inner+outer VLAN as key and hash based on max_cpus */ + vlan_key = extract_vlan_key(&vlans); + hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL); + cpu_idx = hash_key % *cpu_max; + + /* To allow excluding some CPUs, a mapping table cpus_enabled + * translates cpu_idx to real CPU-id + */ + cpu_lookup = bpf_map_lookup_elem(&cpus_enabled, &cpu_idx); + if (!cpu_lookup) + return 
XDP_ABORTED; + cpu_dest = *cpu_lookup; + + /* Notice: Userspace MUST insert entries into cpumap */ + action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS); +out: + return action; +}