From bcd60745674563717757ac26398d88ce9b86591f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 12:41:22 +0100
Subject: [PATCH 01/61] traffic-pacing-edt: Add HTB shaper script

This is primarily for comparing HTB shaper accuracy against EDT.

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_htb_shaper.sh | 84 +++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100755 traffic-pacing-edt/tc_htb_shaper.sh

diff --git a/traffic-pacing-edt/tc_htb_shaper.sh b/traffic-pacing-edt/tc_htb_shaper.sh
new file mode 100755
index 0000000..8652fb0
--- /dev/null
+++ b/traffic-pacing-edt/tc_htb_shaper.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#
+# This HTB shaper setup script makes it easier to compare HTB
+# accuracy against the EDT solution.
+#
+# Author: Jesper Dangaaard Brouer
+# License: GPLv2
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+
+root_check_run_with_sudo "$@"
+
+# Use common parameters
+source ${basedir}/parameters.sh
+
+export TC=/sbin/tc
+
+# It seems measured BW is TCP goodput, but configured BW is wirespeed.
+# Measurements show around 930Mbit best-case. Q-in-Q results in MTU
+# 1522 bytes. TCP goodput segments are 1448 bytes.
+#
+RATE=$((930*1522/1448))Mbit
+##RATE=$((933*1522/1448))Mbit
+##CEIL=$((999*1522/1448))
+#CEIL=1Gbit
+CEIL=980mbit
+
+#RATE=500mbit
+#CEIL=577mbit
+
+# Each of the HTB root-class(es) gets these RATE+CEIL upper bandwidth bounds.
+ROOT_RATE=9000Mbit
+ROOT_CEIL=9500Mbit
+
+DEFAULT_RATE=6000Mbit
+DEFAULT_CEIL=6000Mbit
+
+TC=/usr/sbin/tc
+VERBOSE=1
+
+function tc() {
+	_call_tc "" "$@"
+}
+
+# HTB shaper
+call_tc_allow_fail qdisc del dev "$DEV" root
+#tc qdisc add dev "$DEV" root handle 1: htb default 2
+tc qdisc add dev "$DEV" root handle 1: htb default 16
+
+# The root-class sets the upper bandwidth usage
+tc class add dev "$DEV" parent 1: classid 1:1 \
+   htb rate $ROOT_RATE ceil $ROOT_CEIL
+
+# Default class 1:2
+tc class add dev "$DEV" parent 1: classid 1:2 htb \
+   rate "$DEFAULT_RATE" ceil "$DEFAULT_CEIL"
+#   burst 100000 cburst 100000
+tc qdisc add dev $DEV parent 1:2 fq_codel
+
+
+# Class for vlan 16
+tc class add dev "$DEV" parent 1: classid 1:16 htb rate "$RATE" ceil "$CEIL" \
+   burst $((1522*2)) cburst $((1522*2)) \
+   linklayer ethernet
+#   burst 1522 cburst 1522
+   #burst 1 cburst 1
+#   burst $((1522*2)) cburst $((1522*2))
+#   overhead $((14+4+4)) linklayer ethernet
+#tc qdisc add dev "$DEV" parent 1:16 fq_codel
+tc qdisc add dev "$DEV" parent 1:16 fq_codel quantum $((1514+4+4))
+#tc qdisc add dev "$DEV" parent 1:16 pfifo
+
+# parent filter:
+#tc filter add dev "$DEV" parent 1:0 prio 100 protocol 802.1q u32
+#
+# vlan 16:
+#tc filter add dev "$DEV" parent 1:0 prio 100 \
+#   protocol 802.1q \
+#   u32 match u16 0x0010 0x0fff at -4 \
+#   flowid 1:16
+
+tc filter add dev $DEV protocol all parent 1:0 prio 101 \
+   basic match "meta(vlan mask 0xfff eq 16)" flowid 1:16

From aae2db44962041b6a28454d2bd7e255846dbdfaf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 13:09:39 +0100
Subject: [PATCH 02/61] traffic-pacing-edt/tc_htb_shaper.sh: Make it easy to
 remove HTB qdisc

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_htb_shaper.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/traffic-pacing-edt/tc_htb_shaper.sh b/traffic-pacing-edt/tc_htb_shaper.sh
index 8652fb0..69aedb9 100755
--- a/traffic-pacing-edt/tc_htb_shaper.sh
+++ b/traffic-pacing-edt/tc_htb_shaper.sh
@@ -43,8 +43,14 @@ function tc() {
 	_call_tc "" "$@"
 }
 
-# HTB shaper
+# Delete existing root qdisc
 call_tc_allow_fail qdisc del dev "$DEV" root
+
+if [[ -n $REMOVE ]]; then
+	exit 0
+fi
+
+# HTB shaper
 #tc qdisc add dev "$DEV" root handle 1: htb default 2
 tc qdisc add dev "$DEV" root handle 1: htb default 16
 

From a5ed0071f1368dd8b6e82a73786cf7b255b089de Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 13:43:42 +0100
Subject: [PATCH 03/61] traffic-pacing-edt: Add tc_fq_pacer.sh script for
 MQ-FQ setup

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_fq_pacer.sh | 35 +++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 traffic-pacing-edt/tc_fq_pacer.sh

diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh
new file mode 100755
index 0000000..dc32fdc
--- /dev/null
+++ b/traffic-pacing-edt/tc_fq_pacer.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Loading FQ pacing qdisc in multi-queue MQ setup to avoid root qdisc lock.
+#
+# Author: Jesper Dangaaard Brouer
+# License: GPLv2
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+
+root_check_run_with_sudo "$@"
+
+# Use common parameters
+source ${basedir}/parameters.sh
+
+export TC=/sbin/tc
+function tc() {
+	_call_tc "" "$@"
+}
+
+# Default verbose
+VERBOSE=1
+
+# Delete existing root qdisc
+call_tc_allow_fail qdisc del dev "$DEV" root
+
+# MQ (Multi-Queue) as root qdisc
+tc qdisc replace dev $DEV root handle 7FFF: mq
+
+# Add FQ-pacer qdisc on each available NIC TX-queue
+i=0
+for dir in /sys/class/net/$DEV/queues/tx-*; do
+	((i++)) || true
+	tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
+done

From 3969089c646e8f68bac7f99dddbeff8276907795 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 13:53:49 +0100
Subject: [PATCH 04/61] traffic-pacing-edt/tc_fq_pacer.sh: Add doc explaining

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_fq_pacer.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh
index dc32fdc..9682f12 100755
--- a/traffic-pacing-edt/tc_fq_pacer.sh
+++ b/traffic-pacing-edt/tc_fq_pacer.sh
@@ -2,6 +2,10 @@
 #
 # Loading FQ pacing qdisc in multi-queue MQ setup to avoid root qdisc lock.
 #
+# The FQ pacing qdisc is doing all the work of pacing packets out according to
+# the EDT (Earliest Departure Time) future timestamps set by our BPF-prog that
+# runs at the TC-egress hook.
+#
 # Author: Jesper Dangaaard Brouer
 # License: GPLv2
 #
@@ -30,6 +34,7 @@ tc qdisc replace dev $DEV root handle 7FFF: mq
 # Add FQ-pacer qdisc on each available NIC TX-queue
 i=0
 for dir in /sys/class/net/$DEV/queues/tx-*; do
+	# Deliberate off-by-one: tx-0 becomes handle 1:
 	((i++)) || true
 	tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
 done

From a6294dd946e2147edd3c3231f182e0d9027e6740 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 14:51:30 +0100
Subject: [PATCH 05/61] edt_pacer02: Use skb wire_len

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/edt_pacer02.c  | 12 +++++++++---
 traffic-pacing-edt/tc_fq_pacer.sh |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c
index 47eecc7..b90e780 100644
--- a/traffic-pacing-edt/edt_pacer02.c
+++ b/traffic-pacing-edt/edt_pacer02.c
@@ -53,9 +53,15 @@ static __always_inline int sched_departure(struct __sk_buff *skb)
 	if (!edt)
 		return BPF_DROP;
 
-	/* Calc transmission time it takes to send packet 'bytes' */
-	t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / RATE_IN_BYTES;
-	// t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / edt->rate;
+	/* Calc transmission time it takes to send packet 'bytes'.
+	 *
+	 * Details on getting precise bytes on wire. The skb->len does include
+	 * length of GRO/GSO segments, but not the segment headers that get
+	 * added on transmit. Fortunately skb->wire_len at the TC-egress hook
+	 * (not ingress) includes these headers. (See: qdisc_pkt_len_init())
+	 */
+	t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES;
+	// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate;
 
 	now = bpf_ktime_get_ns();
 

diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh
index 9682f12..dd1c8fa 100755
--- a/traffic-pacing-edt/tc_fq_pacer.sh
+++ b/traffic-pacing-edt/tc_fq_pacer.sh
@@ -37,4 +37,5 @@ for dir in /sys/class/net/$DEV/queues/tx-*; do
 	# Deliberate off-by-one: tx-0 becomes handle 1:
 	((i++)) || true
 	tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
+	# tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit
 done

From 252a40763abb5842555e496efb95de9a6ecd5492 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 14:58:47 +0100
Subject: [PATCH 06/61] traffic-pacing-edt: Adjustments to HTB script to get
 closer to EDT system

These adjustments don't help; EDT is still closer to 1Gbit/s at
wire-level.

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_htb_shaper.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/traffic-pacing-edt/tc_htb_shaper.sh b/traffic-pacing-edt/tc_htb_shaper.sh
index 69aedb9..08e0b06 100755
--- a/traffic-pacing-edt/tc_htb_shaper.sh
+++ b/traffic-pacing-edt/tc_htb_shaper.sh
@@ -20,11 +20,16 @@ export TC=/sbin/tc
 # Measurements show around 930Mbit best-case. Q-in-Q results in MTU
 # 1522 bytes. TCP goodput segments are 1448 bytes.
 #
-RATE=$((930*1522/1448))Mbit
+#RATE=$((930*1522/1448))Mbit
 ##RATE=$((933*1522/1448))Mbit
 ##CEIL=$((999*1522/1448))
 #CEIL=1Gbit
-CEIL=980mbit
+#CEIL=980mbit
+
+# EDT shaper shows TCP goodput of 956 Mbit/s.
+# echo $((956*1514/1448)) = 999
+RATE=999Mbit
+CEIL=1000Mbit
 
 #RATE=500mbit
 #CEIL=577mbit

From 55a8513e2c9cb946c9acc6c5717d2e880ca0b48c Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 16:51:11 +0100
Subject: [PATCH 07/61] traffic-pacing-edt: Play with edt_pacer02 drop horizon

This didn't help.
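For intuition: converting a drop horizon into bytes of standing queue at
the configured rate shows how far apart the tried values are (quick
user-space sketch, constants assumed from edt_pacer02.c):

	#include <stdio.h>

	/* Constants assumed from edt_pacer02.c */
	#define NS_PER_SEC    1000000000ULL
	#define RATE_IN_BITS  (1000ULL * 1000 * 1000)
	#define RATE_IN_BYTES (RATE_IN_BITS / 8)

	int main(void)
	{
		/* Two of the drop horizons tried above */
		unsigned long long horizon_ns[] = { 2000000000ULL, 20000000ULL };
		int i;

		for (i = 0; i < 2; i++) {
			/* Max standing time-queue the horizon allows, in bytes */
			unsigned long long bytes =
				horizon_ns[i] * RATE_IN_BYTES / NS_PER_SEC;

			printf("horizon %4llu ms => %9llu bytes of queue\n",
			       horizon_ns[i] / 1000000, bytes);
		}
		return 0;
	}

Even the 20 ms horizon still allows 2.5 MB of queue at 1 Gbit/s; the
2000 ms default allows 250 MB, which effectively never drops.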
Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index b90e780..532efa8 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -9,10 +9,12 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 /* skb->len in bytes, thus easier to keep rate in bytes */ -#define RATE_IN_BITS (1000 * 1000 * 1000) +#define RATE_IN_BITS (1000 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) -#define T_HORIZON_DROP (2000 * 1000 * 1000) +#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) +//#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) +//#define T_HORIZON_DROP (20 * 1000 * 1000ULL) /* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */ #define READ_ONCE(V) (V) From 5a3e52cf430c59f8ab5de3d42d7ab28bc5b20c37 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 16:51:56 +0100 Subject: [PATCH 08/61] traffic-pacing-edt: Make fq script respect --remove Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/tc_fq_pacer.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index dd1c8fa..92aa21d 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -28,6 +28,10 @@ VERBOSE=1 # Delete existing root qdisc call_tc_allow_fail qdisc del dev "$DEV" root +if [[ -n $REMOVE ]]; then + exit 0 +fi + # MQ (Multi-Queue) as root qdisc tc qdisc replace dev $DEV root handle 7FFF: mq From 9f97d984cbda08b17abfac69c2026ce76dcd6d54 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 17:16:03 +0100 Subject: [PATCH 09/61] traffic-pacing-edt: edt_pacer02.c add ECN marking horizon Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 532efa8..ed66ba1 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -10,12 +10,15 @@ char _license[] SEC("license") = "GPL"; /* skb->len in bytes, thus easier to keep rate in bytes */ #define RATE_IN_BITS (1000 * 1000 * 1000ULL) +//#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) #define T_HORIZON_DROP (2000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (20 * 1000 * 1000ULL) +#define T_HORIZON_ECN (5 * 1000 * 1000ULL) + /* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */ #define READ_ONCE(V) (V) #define WRITE_ONCE(X,V) (X) = (V) @@ -97,7 +100,9 @@ static __always_inline int sched_departure(struct __sk_buff *skb) if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */) return BPF_DROP; - // TODO Add ECN marking horizon + /* ECN marking horizon */ + if (t_queue_sz >= T_HORIZON_ECN) + bpf_skb_ecn_set_ce(skb); /* Advance "time queue" */ WRITE_ONCE(edt->t_last, t_next); From 1fb44832079319cd98647bcdc25f2d148431733a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 18:44:40 +0100 Subject: [PATCH 10/61] traffic-pacing-edt: tc_fq_pacer.sh adjust packet per flow_limit This was causing strange issues, where a TCP single flow could not achieve the correct bandwidth. 
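A plausible explanation (an assumption, not verified here): fq's default
flow_limit of 100 packets is smaller than what one EDT-paced flow keeps
scheduled. A back-of-envelope sketch, assuming 1514 byte frames and the
15 ms drop horizon the series settles on:

	#include <stdio.h>

	#define NS_PER_SEC   1000000000ULL
	#define RATE_BYTES   (1000ULL * 1000 * 1000 / 8)	/* 1 Gbit/s */
	#define HORIZON_NS   (15ULL * 1000 * 1000)	/* T_HORIZON_DROP */
	#define FRAME_BYTES  1514ULL

	int main(void)
	{
		/* Packets one flow can have scheduled inside the EDT horizon */
		unsigned long long frames =
			HORIZON_NS * RATE_BYTES / NS_PER_SEC / FRAME_BYTES;

		/* ~1238 frames: far beyond fq's flow_limit of 100 packets */
		printf("frames inside horizon: %llu\n", frames);
		return 0;
	}

Raising flow_limit to 1000, as this patch does, keeps fq from dropping
the pre-scheduled packets of a single flow.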
Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 1 + traffic-pacing-edt/tc_fq_pacer.sh | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index ed66ba1..83e2823 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -10,6 +10,7 @@ char _license[] SEC("license") = "GPL"; /* skb->len in bytes, thus easier to keep rate in bytes */ #define RATE_IN_BITS (1000 * 1000 * 1000ULL) +//#define RATE_IN_BITS (200 * 1000 * 1000ULL) //#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index 92aa21d..1ad9805 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -40,6 +40,12 @@ i=0 for dir in /sys/class/net/$DEV/queues/tx-*; do # Details: cause-off-by-one, as tx-0 becomes handle 1: ((i++)) || true - tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + #tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + # + # The higher 'flow_limit' is needed for high-BW pacing + tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ + flow_limit 1000 + # + # quantum $((1514*4)) initial_quantum $((1514*20)) # tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit done From 4ded8f7015f25d90cc2a7787620f6aa7f053a50e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 19:33:42 +0100 Subject: [PATCH 11/61] traffic-pacing-edt: control latency via horizon drop When number of parallel (iperf -P N) flows increase, then the latency increase as well (measured via simple ping through router). This can be controlled via a much tigher drop horizon. Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 83e2823..779c6eb 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -14,9 +14,9 @@ char _license[] SEC("license") = "GPL"; //#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) -#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) +//#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) -//#define T_HORIZON_DROP (20 * 1000 * 1000ULL) +#define T_HORIZON_DROP (15 * 1000 * 1000ULL) #define T_HORIZON_ECN (5 * 1000 * 1000ULL) From 6ee640393be650a8ad7e2228bd2343895c197254 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:12:20 +0100 Subject: [PATCH 12/61] Update parsing_helpers.h from xdp-tutorial Signed-off-by: Jesper Dangaard Brouer --- headers/xdp/parsing_helpers.h | 93 ++++++++++++++++------------------- 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h index c29f23b..f889fb3 100644 --- a/headers/xdp/parsing_helpers.h +++ b/headers/xdp/parsing_helpers.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */ /* - * This file contains parsing functions that can be used in eXDP programs. The - * functions are marked as __always_inline, and fully defined in this header - * file to be included in the BPF program. + * This file contains parsing functions that are used in the packetXX XDP + * programs. 
The functions are marked as __always_inline, and fully defined in + * this header file to be included in the BPF program. * * Each helper parses a packet header, including doing bounds checking, and * returns the type of its contents if successful, and -1 otherwise. @@ -10,6 +10,10 @@ * For Ethernet and IP headers, the content type is the type of the payload * (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field. * All return values are in host byte order. + * + * The versions of the functions included here are slightly expanded versions of + * the functions in the packet01 lesson. For instance, the Ethernet header + * parsing has support for parsing VLAN tags. */ #ifndef __PARSING_HELPERS_H @@ -24,8 +28,6 @@ #include #include #include -#include -#include /* Header cursor to keep track of current parsing position */ struct hdr_cursor { @@ -54,14 +56,14 @@ struct icmphdr_common { /* Allow users of header file to redefine VLAN max depth */ #ifndef VLAN_MAX_DEPTH -#define VLAN_MAX_DEPTH 4 -#endif - -/* Longest chain of IPv6 extension headers to resolve */ -#ifndef IPV6_EXT_MAX_CHAIN -#define IPV6_EXT_MAX_CHAIN 6 +#define VLAN_MAX_DEPTH 2 #endif +#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ +/* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */ +struct collect_vlans { + __u16 id[VLAN_MAX_DEPTH]; +}; static __always_inline int proto_is_vlan(__u16 h_proto) { @@ -74,18 +76,24 @@ static __always_inline int proto_is_vlan(__u16 h_proto) * Ethernet header. Thus, caller can look at eth->h_proto to see if this was a * VLAN tagged packet. */ -static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, - struct ethhdr **ethhdr) +static __always_inline int parse_ethhdr_vlan(struct hdr_cursor *nh, + void *data_end, + struct ethhdr **ethhdr, + struct collect_vlans *vlans) { struct ethhdr *eth = nh->pos; + int hdrsize = sizeof(*eth); struct vlan_hdr *vlh; __u16 h_proto; int i; - if (eth + 1 > data_end) + /* Byte-count bounds check; check if current pointer + size of header + * is after data_end. 
+ */ + if (nh->pos + hdrsize > data_end) return -1; - nh->pos = eth + 1; + nh->pos += hdrsize; *ethhdr = eth; vlh = nh->pos; h_proto = eth->h_proto; @@ -102,6 +110,10 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, break; h_proto = vlh->h_vlan_encapsulated_proto; + if (vlans) /* collect VLAN ids */ + vlans->id[i] = + (bpf_ntohs(vlh->h_vlan_TCI) & VLAN_VID_MASK); + vlh++; } @@ -109,39 +121,12 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, return h_proto; /* network-byte-order */ } -static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh, - void *data_end, - __u8 next_hdr_type) +static __always_inline int parse_ethhdr(struct hdr_cursor *nh, + void *data_end, + struct ethhdr **ethhdr) { - for (int i = 0; i < IPV6_EXT_MAX_CHAIN; ++i) { - struct ipv6_opt_hdr *hdr = nh->pos; - - if (hdr + 1 > data_end) - return -1; - - switch (next_hdr_type) { - case IPPROTO_HOPOPTS: - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_MH: - nh->pos = (char *)hdr + (hdr->hdrlen + 1) * 8; - next_hdr_type = hdr->nexthdr; - break; - case IPPROTO_AH: - nh->pos = (char *)hdr + (hdr->hdrlen + 2) * 4; - next_hdr_type = hdr->nexthdr; - break; - case IPPROTO_FRAGMENT: - nh->pos = (char *)hdr + 8; - next_hdr_type = hdr->nexthdr; - break; - default: - /* Found a header that is not an IPv6 extension header */ - return next_hdr_type; - } - } - - return -1; + /* Expect compiler removes the code that collects VLAN ids */ + return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL); } static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, @@ -160,7 +145,7 @@ static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, nh->pos = ip6h + 1; *ip6hdr = ip6h; - return skip_ip6hdrext(nh, data_end, ip6h->nexthdr); + return ip6h->nexthdr; } static __always_inline int parse_iphdr(struct hdr_cursor *nh, @@ -174,6 +159,9 @@ static __always_inline int parse_iphdr(struct hdr_cursor *nh, return -1; hdrsize = iph->ihl * 4; + /* Sanity check packet field is valid */ + if(hdrsize < sizeof(iph)) + return -1; /* Variable-length IPv4 header, need to use byte-based arithmetic */ if (nh->pos + hdrsize > data_end) @@ -267,10 +255,15 @@ static __always_inline int parse_tcphdr(struct hdr_cursor *nh, return -1; len = h->doff * 4; - if ((void *) h + len > data_end) + /* Sanity check packet field is valid */ + if(len < sizeof(h)) return -1; - nh->pos = h + 1; + /* Variable-length TCP header, need to use byte-based arithmetic */ + if (nh->pos + len > data_end) + return -1; + + nh->pos += len; *tcphdr = h; return len; From 097079cde19a5b7c45804b217e854c2fa632ee92 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:17:00 +0100 Subject: [PATCH 13/61] Fix includes in parsing_helpers.h Signed-off-by: Jesper Dangaard Brouer --- headers/xdp/parsing_helpers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h index f889fb3..c9d6363 100644 --- a/headers/xdp/parsing_helpers.h +++ b/headers/xdp/parsing_helpers.h @@ -28,6 +28,8 @@ #include #include #include +#include +#include /* Header cursor to keep track of current parsing position */ struct hdr_cursor { From c0cd6aedba40535857ae5b4c6043115671ed68be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:26:12 +0100 Subject: [PATCH 14/61] parsing_helpers.h re-add IPv6 skip of extension headers This code comes from xdp-tools repo. 
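A minimal caller sketch (hypothetical find_tcp6() helper, assuming the
usual data/data_end pointers from the BPF context and the includes of
parsing_helpers.h) showing what the re-added skipping buys:

	/* Locate TCP behind any IPv6 extension headers */
	static __always_inline int find_tcp6(void *data, void *data_end)
	{
		struct hdr_cursor nh = { .pos = data };
		struct ethhdr *eth;
		struct ipv6hdr *ip6h;
		struct tcphdr *tcph;

		if (parse_ethhdr(&nh, data_end, &eth) != bpf_htons(ETH_P_IPV6))
			return -1;

		/* parse_ip6hdr() ends with skip_ip6hdrext(), so HOPOPTS,
		 * FRAGMENT etc. are stepped over and the final upper-layer
		 * protocol comes back here.
		 */
		if (parse_ip6hdr(&nh, data_end, &ip6h) != IPPROTO_TCP)
			return -1;

		return parse_tcphdr(&nh, data_end, &tcph); /* TCP hdr len or -1 */
	}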
Signed-off-by: Jesper Dangaard Brouer --- headers/xdp/parsing_helpers.h | 42 ++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h index c9d6363..de6705b 100644 --- a/headers/xdp/parsing_helpers.h +++ b/headers/xdp/parsing_helpers.h @@ -61,6 +61,11 @@ struct icmphdr_common { #define VLAN_MAX_DEPTH 2 #endif +/* Longest chain of IPv6 extension headers to resolve */ +#ifndef IPV6_EXT_MAX_CHAIN +#define IPV6_EXT_MAX_CHAIN 6 +#endif + #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ /* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */ struct collect_vlans { @@ -131,6 +136,41 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL); } +static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh, + void *data_end, + __u8 next_hdr_type) +{ + for (int i = 0; i < IPV6_EXT_MAX_CHAIN; ++i) { + struct ipv6_opt_hdr *hdr = nh->pos; + + if (hdr + 1 > data_end) + return -1; + + switch (next_hdr_type) { + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_MH: + nh->pos = (char *)hdr + (hdr->hdrlen + 1) * 8; + next_hdr_type = hdr->nexthdr; + break; + case IPPROTO_AH: + nh->pos = (char *)hdr + (hdr->hdrlen + 2) * 4; + next_hdr_type = hdr->nexthdr; + break; + case IPPROTO_FRAGMENT: + nh->pos = (char *)hdr + 8; + next_hdr_type = hdr->nexthdr; + break; + default: + /* Found a header that is not an IPv6 extension header */ + return next_hdr_type; + } + } + + return -1; +} + static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, void *data_end, struct ipv6hdr **ip6hdr) @@ -147,7 +187,7 @@ static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, nh->pos = ip6h + 1; *ip6hdr = ip6h; - return ip6h->nexthdr; + return skip_ip6hdrext(nh, data_end, ip6h->nexthdr); } static __always_inline int parse_iphdr(struct hdr_cursor *nh, From 9ea235637ecab158a895bbae25bb096316fe9749 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:38:15 +0100 Subject: [PATCH 15/61] Add compiler.h to define some common compiler directives This is taken from the Cilium project: https://github.com/cilium/cilium/blob/master/bpf/include/bpf/compiler.h The use-case was adding READ_ONCE and WRITE_ONCE, but via re-using the Cilium version we get a lot more useful compiler annotations. 
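The immediate use-case in this series looks like the following sketch
(field names assumed from edt_pacer02.c):

	/* edt->t_last is shared across CPUs, so access it via the
	 * volatile helpers from compiler.h.
	 */
	static __always_inline void advance_t_last(struct edt_val *edt,
						   __u64 t_xmit_ns)
	{
		/* Forces a real load; clang cannot reuse a stale register copy */
		__u64 t_last = READ_ONCE(edt->t_last);

		/* Forces a real store. Note: load+store is still not atomic;
		 * the macros only pin down that the accesses happen.
		 */
		WRITE_ONCE(edt->t_last, t_last + t_xmit_ns);
	}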
Signed-off-by: Jesper Dangaard Brouer --- headers/bpf/compiler.h | 124 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 headers/bpf/compiler.h diff --git a/headers/bpf/compiler.h b/headers/bpf/compiler.h new file mode 100644 index 0000000..2588023 --- /dev/null +++ b/headers/bpf/compiler.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2016-2020 Authors of Cilium */ + +#ifndef __BPF_COMPILER_H_ +#define __BPF_COMPILER_H_ + +#ifndef __non_bpf_context +# include "stddef.h" +#endif + +#ifndef __section +# define __section(X) __attribute__((section(X), used)) +#endif + +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif + +#ifndef offsetof +# define offsetof(T, M) __builtin_offsetof(T, M) +#endif + +#ifndef field_sizeof +# define field_sizeof(T, M) sizeof((((T *)NULL)->M)) +#endif + +#ifndef __packed +# define __packed __attribute__((packed)) +#endif + +#ifndef __nobuiltin +# if __clang_major__ >= 10 +# define __nobuiltin(X) __attribute__((no_builtin(X))) +# else +# define __nobuiltin(X) +# endif +#endif + +#ifndef likely +# define likely(X) __builtin_expect(!!(X), 1) +#endif + +#ifndef unlikely +# define unlikely(X) __builtin_expect(!!(X), 0) +#endif + +#ifndef always_succeeds /* Mainly for documentation purpose. */ +# define always_succeeds(X) likely(X) +#endif + +#undef __always_inline /* stddef.h defines its own */ +#define __always_inline inline __attribute__((always_inline)) + +#ifndef __stringify +# define __stringify(X) #X +#endif + +#ifndef __fetch +# define __fetch(X) (__u32)(__u64)(&(X)) +#endif + +#ifndef __aligned +# define __aligned(X) __attribute__((aligned(X))) +#endif + +#ifndef build_bug_on +# define build_bug_on(E) ((void)sizeof(char[1 - 2*!!(E)])) +#endif + +#ifndef __throw_build_bug +# define __throw_build_bug() __builtin_trap() +#endif + +#ifndef __printf +# define __printf(X, Y) __attribute__((__format__(printf, X, Y))) +#endif + +#ifndef barrier +# define barrier() asm volatile("": : :"memory") +#endif + +#ifndef barrier_data +# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") +#endif + +static __always_inline void bpf_barrier(void) +{ + /* Workaround to avoid verifier complaint: + * "dereference of modified ctx ptr R5 off=48+0, ctx+const is allowed, + * ctx+const+const is not" + */ + barrier(); +} + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0])) +#endif + +#ifndef __READ_ONCE +# define __READ_ONCE(X) (*(volatile typeof(X) *)&X) +#endif + +#ifndef __WRITE_ONCE +# define __WRITE_ONCE(X, V) (*(volatile typeof(X) *)&X) = (V) +#endif + +/* {READ,WRITE}_ONCE() with verifier workaround via bpf_barrier(). 
*/ + +#ifndef READ_ONCE +# define READ_ONCE(X) \ + ({ typeof(X) __val = __READ_ONCE(X); \ + bpf_barrier(); \ + __val; }) +#endif + +#ifndef WRITE_ONCE +# define WRITE_ONCE(X, V) \ + ({ typeof(X) __val = (V); \ + __WRITE_ONCE(X, __val); \ + bpf_barrier(); \ + __val; }) +#endif + +#endif /* __BPF_COMPILER_H_ */ From 692202e60ed194a2df5df7d22f639b6ca1b8bbeb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:40:42 +0100 Subject: [PATCH 16/61] traffic-pacing-edt: use READ_ONCE and WRITE_ONCE via compiler.h Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 779c6eb..c742d27 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0+ */ #include #include +#include #include #include "iproute2_compat.h" @@ -20,10 +21,6 @@ char _license[] SEC("license") = "GPL"; #define T_HORIZON_ECN (5 * 1000 * 1000ULL) -/* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */ -#define READ_ONCE(V) (V) -#define WRITE_ONCE(X,V) (X) = (V) - struct edt_val { __u64 rate; __u64 t_last; From 21ebc4d8cd2aaa8a70d210f4c04c1027d9376e72 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:56:31 +0100 Subject: [PATCH 17/61] traffic-pacing-edt: Align map struct to cache-line size Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index c742d27..b8f22c9 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -26,7 +26,7 @@ struct edt_val { __u64 t_last; __u64 t_horizon_drop; __u64 t_horizon_ecn; -}; +} __aligned(64); /* Align struct to cache-size to avoid false-sharing */ /* The tc tool (iproute2) use another ELF map layout than libbpf (struct * bpf_map_def), see struct bpf_elf_map from iproute2. From 9c5ccaed9be3818f44b7c6aa1b0fce1bd5af1bc2 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 16:23:46 +0100 Subject: [PATCH 18/61] traffic-pacing-edt: Simple VLAN parsing via parse_ethhdr_vlan Using the XDP based ethhdr VLAN parser. This cannot handle if the SKB don't have the VLAN inlined. Static match on VLAN 16 as test case. 
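For reference, the 16-bit TCI behind the VLAN id being matched follows
the standard 802.1Q layout PCP(3) | DEI(1) | VID(12); the helpers below
are illustrative only (the real parser just applies VLAN_VID_MASK):

	static __always_inline __u16 tci_to_vid(__u16 tci)
	{
		return tci & 0x0fff;		/* same as VLAN_VID_MASK */
	}

	static __always_inline __u8 tci_to_pcp(__u16 tci)
	{
		return (tci >> 13) & 0x07;	/* priority code point */
	}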
Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 35 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index b8f22c9..52137f1 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -2,9 +2,11 @@ #include #include #include -#include #include "iproute2_compat.h" +#define VLAN_MAX_DEPTH 2 +#include + char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 @@ -112,25 +114,36 @@ static __always_inline int sched_departure(struct __sk_buff *skb) SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) { - volatile void *data, *data_end; - int ret = BPF_OK; + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + struct collect_vlans vlans = { 0 }; struct ethhdr *eth; + int ret = BPF_OK; - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - eth = (struct ethhdr *)data; + /* These keep track of the next header type and iterator pointer */ + struct hdr_cursor nh; + int eth_type; + nh.pos = data; - if (data + sizeof(*eth) > data_end) - return BPF_DROP; + eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); + if (eth_type < 0) + return XDP_ABORTED; /* Keep ARP resolution working */ - if (eth->h_proto == bpf_htons(ETH_P_ARP)) { + if (eth_type == bpf_htons(ETH_P_ARP)) { ret = BPF_OK; goto out; } - // TODO: match on vlan16 and only apply EDT on that - return sched_departure(skb); + if (!proto_is_vlan(eth->h_proto)) { + /* Skip non-VLAN frames */ + return BPF_OK; + } + + /* Match on vlan16 and only apply EDT on that */ + // FIXME: handle if VLAN is not inlined in packet + if (vlans.id[0] == 16) + return sched_departure(skb); out: return ret; From 82186cfe72fc8e51d099c1c072230fd00eba11eb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 17:03:04 +0100 Subject: [PATCH 19/61] traffic-pacing-edt: script for VLAN setup Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/testlab_vlan_setup.sh | 65 ++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100755 traffic-pacing-edt/testlab_vlan_setup.sh diff --git a/traffic-pacing-edt/testlab_vlan_setup.sh b/traffic-pacing-edt/testlab_vlan_setup.sh new file mode 100755 index 0000000..dfcfbfc --- /dev/null +++ b/traffic-pacing-edt/testlab_vlan_setup.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# +# Testlab setup script for VLAN Q-in-Q (double tagged VLAN) config. 
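+#
+# Example usage (hypothetical NIC name), outer VLAN 1 + inner VLAN 16:
+#   ./testlab_vlan_setup.sh --dev ixgbe1 1 16
+# Tear down again by adding --remove.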
+# +# Author: Jesper Dangaaard Brouer +# License: GPLv2 +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +export IP=/sbin/ip +function ip() { + echo $IP "$@" + $IP "$@" +} + +function create_vlan_device() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} + ip link set ${device}.${vlan} up +} + +function delete_vlan_device() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link del ${device}.${vlan} +} + + +if [[ -z "$1" ]]; then + err 3 "Missing arg#1 for outer vlan" +fi +OUTER=$1 + +if [[ -z "$2" ]]; then + err 3 "Missing arg#2 for inner vlan" +fi +INNER=$2 + +if [[ -n $REMOVE ]]; then + delete_vlan_device $INNER ${DEV}.${OUTER} + delete_vlan_device $OUTER $DEV + exit 0 +fi + +create_vlan_device $OUTER $DEV +create_vlan_device $INNER ${DEV}.${OUTER} From 1196c6cf14e2419dc451ed111be378119e2f9bfa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 17:23:41 +0100 Subject: [PATCH 20/61] traffic-pacing-edt: adjust parameters help txt to be more general Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/parameters.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/traffic-pacing-edt/parameters.sh b/traffic-pacing-edt/parameters.sh index 6d0841d..d0077ab 100644 --- a/traffic-pacing-edt/parameters.sh +++ b/traffic-pacing-edt/parameters.sh @@ -10,10 +10,10 @@ function usage() { echo "Usage: $0 [-vh] --dev ethX" echo " -d | --dev : (\$DEV) Interface/device (required)" echo " -v | --verbose : (\$VERBOSE) verbose" - echo " --remove : (\$REMOVE) Remove the TC rules" + echo " --remove : (\$REMOVE) Remove the rules" echo " --dry-run : (\$DRYRUN) Dry-run only (echo tc commands)" - echo " -s | --stats : (\$STATS_ONLY) Call TC statistics command" - echo " -l | --list : (\$LIST) List TC filter setup after setup" + echo " -s | --stats : (\$STATS_ONLY) Call statistics command" + echo " -l | --list : (\$LIST) List setup after setup" echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load" echo "" } @@ -80,5 +80,5 @@ done if [ -z "$DEV" ]; then usage - err 2 "Please specify TC net_device" + err 2 "Please specify net_device (\$DEV)" fi From d8a992aab48be36e0b75bfb06a18cdfbb9ca40f0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 17:27:09 +0100 Subject: [PATCH 21/61] traffic-pacing-edt: Add IP wrapper functions Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/functions.sh | 25 ++++++++++++++++++++++++ traffic-pacing-edt/testlab_vlan_setup.sh | 3 +-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/functions.sh b/traffic-pacing-edt/functions.sh index a92f482..32cbdde 100644 --- a/traffic-pacing-edt/functions.sh +++ b/traffic-pacing-edt/functions.sh @@ -62,3 +62,28 @@ function call_tc() { function call_tc_allow_fail() { _call_tc "allow_fail" "$@" } + +## -- Wrapper calls for IP -- +function _call_ip() { + local allow_fail="$1" + shift + if [[ -n "$VERBOSE" ]]; then + echo "ip $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $IP "$@" + local status=$? 
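+	# NB: $? must be captured on the line above; any other command
+	# in between would overwrite the exit code of $IP.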
+ if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 3 "Exec error($status) occurred cmd: \"$IP $@\"" + fi + fi +} +function call_ip() { + _call_ip "" "$@" +} +function call_ip_allow_fail() { + _call_ip "allow_fail" "$@" +} diff --git a/traffic-pacing-edt/testlab_vlan_setup.sh b/traffic-pacing-edt/testlab_vlan_setup.sh index dfcfbfc..8c1b33b 100755 --- a/traffic-pacing-edt/testlab_vlan_setup.sh +++ b/traffic-pacing-edt/testlab_vlan_setup.sh @@ -15,8 +15,7 @@ source ${basedir}/parameters.sh export IP=/sbin/ip function ip() { - echo $IP "$@" - $IP "$@" + call_ip "$@" } function create_vlan_device() { From 6a67b105ee3f9cc9743d80fc873f1d439279adb1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 20:53:22 +0100 Subject: [PATCH 22/61] traffic-pacing-edt: Remember MTU setting on netdevices Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/testlab_vlan_setup.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/traffic-pacing-edt/testlab_vlan_setup.sh b/traffic-pacing-edt/testlab_vlan_setup.sh index 8c1b33b..2228af5 100755 --- a/traffic-pacing-edt/testlab_vlan_setup.sh +++ b/traffic-pacing-edt/testlab_vlan_setup.sh @@ -31,6 +31,21 @@ function create_vlan_device() { ip link set ${device}.${vlan} up } +function create_vlan_device_802_1ad() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} \ + protocol 802.1ad + ip link set ${device}.${vlan} up +} + + function delete_vlan_device() { local vlan=${1} local device=${2:-$DEV} @@ -62,3 +77,8 @@ fi create_vlan_device $OUTER $DEV create_vlan_device $INNER ${DEV}.${OUTER} + +# Set MTU to handle extra VLAN headers, NICs usually allow one VLAN +# header even though they have configured MTU 1500. 
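+# Q-in-Q needs 2 x 4 bytes extra: 1500 + 8 = 1508 on the physical dev,
+# and 1500 + 4 = 1504 on the outer-VLAN dev (one tag still to be added).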
+ip link set $DEV mtu 1508 +ip link set ${DEV}.${OUTER} mtu 1504 From a0f3760d6c9f9d962f382f12df2ac84672f8313e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 21:19:10 +0100 Subject: [PATCH 23/61] traffic-pacing-edt: Handle if VLAN is offloaded to SKB metadata Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 52137f1..8943973 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -119,6 +119,7 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) struct collect_vlans vlans = { 0 }; struct ethhdr *eth; int ret = BPF_OK; + __u16 vlan_key; /* These keep track of the next header type and iterator pointer */ struct hdr_cursor nh; @@ -135,14 +136,19 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) goto out; } - if (!proto_is_vlan(eth->h_proto)) { + if (!proto_is_vlan(eth->h_proto) && !skb->vlan_present) { /* Skip non-VLAN frames */ return BPF_OK; } - /* Match on vlan16 and only apply EDT on that */ - // FIXME: handle if VLAN is not inlined in packet - if (vlans.id[0] == 16) + /* NIC can HW "offload" the outer VLAN, moving it to skb context */ + if (skb->vlan_present) + vlan_key = vlans.id[0]; /* Inner vlan placed as first inline */ + else + vlan_key = vlans.id[1]; /* All VLAN headers inline */ + + /* For-now: Match on vlan16 and only apply EDT on that */ + if (vlan_key == 16) return sched_departure(skb); out: From 71db45b28ecb78b1c0bc92b3ea27a218b4ca0feb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 21:54:39 +0100 Subject: [PATCH 24/61] traffic-pacing-edt: Handle if loaded on outer VLAN net_device Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 62 ++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 8943973..a3893b3 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -112,7 +112,59 @@ static __always_inline int sched_departure(struct __sk_buff *skb) return BPF_OK; } -SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) +static __always_inline +__u16 get_inner_qinq_vlan(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + __u16 vlan_key; + + /* NIC can HW "offload" the outer VLAN, moving it to skb context */ + if (skb->vlan_present) + vlan_key = vlans->id[0]; /* Inner vlan placed as first inline */ + else + vlan_key = vlans->id[1]; /* All VLAN headers inline */ + + return vlan_key; +} + +static __always_inline +__u16 get_vlan(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + __u16 vlan_key; + + /* Handle extracting VLAN if skb context have VLAN offloaded */ + if (skb->vlan_present) + vlan_key = skb->vlan_tci & VLAN_VID_MASK; + else + vlan_key = vlans->id[0]; + + return vlan_key; +} + +static __always_inline +__u16 extract_vlan_key(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + int QinQ = 0; + + /* The inner VLAN is the key to extract. But it is complicated + * due to NIC "offloaded" VLAN (skb->vlan_present). In case + * BPF-prog is loaded on outer VLAN net_device, the BPF-prog + * sees the inner-VLAN at the first and only VLAN. 
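+	 * (That single-tagged case then takes the get_vlan() path below.)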
+ */ + if (skb->vlan_present) { + if (vlans->id[0]) + QinQ = 1; + } else { + if (vlans->id[1]) + QinQ = 1; + } + + if (QinQ) + return get_inner_qinq_vlan(skb, vlans); + else + return get_vlan(skb, vlans); +} + +SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb) { void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; @@ -128,7 +180,7 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); if (eth_type < 0) - return XDP_ABORTED; + return BPF_DROP; /* Keep ARP resolution working */ if (eth_type == bpf_htons(ETH_P_ARP)) { @@ -141,11 +193,7 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) return BPF_OK; } - /* NIC can HW "offload" the outer VLAN, moving it to skb context */ - if (skb->vlan_present) - vlan_key = vlans.id[0]; /* Inner vlan placed as first inline */ - else - vlan_key = vlans.id[1]; /* All VLAN headers inline */ + vlan_key = extract_vlan_key(skb, &vlans); /* For-now: Match on vlan16 and only apply EDT on that */ if (vlan_key == 16) From 740416975ffb83cc5eb162397cf5a5e5d1069aae Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 13:05:47 +0100 Subject: [PATCH 25/61] traffic-pacing-edt: allow tc util to be install in other places Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/bpf_egress_loader.sh | 2 +- traffic-pacing-edt/tc_fq_pacer.sh | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/traffic-pacing-edt/bpf_egress_loader.sh b/traffic-pacing-edt/bpf_egress_loader.sh index 316cddd..934117d 100755 --- a/traffic-pacing-edt/bpf_egress_loader.sh +++ b/traffic-pacing-edt/bpf_egress_loader.sh @@ -11,7 +11,7 @@ root_check_run_with_sudo "$@" # Use common parameters source ${basedir}/parameters.sh -export TC=/sbin/tc +export TC=tc # This can be changed via --file or --obj if [[ -z ${BPF_OBJ} ]]; then diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index 1ad9805..e7a502f 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -17,10 +17,7 @@ root_check_run_with_sudo "$@" # Use common parameters source ${basedir}/parameters.sh -export TC=/sbin/tc -function tc() { - _call_tc "" "$@" -} +export TC=tc # Default verbose VERBOSE=1 @@ -33,19 +30,19 @@ if [[ -n $REMOVE ]]; then fi # MQ (Multi-Queue) as root qdisc -tc qdisc replace dev $DEV root handle 7FFF: mq +call_tc qdisc replace dev $DEV root handle 7FFF: mq # Add FQ-pacer qdisc on each NIC avail TX-queue i=0 for dir in /sys/class/net/$DEV/queues/tx-*; do # Details: cause-off-by-one, as tx-0 becomes handle 1: ((i++)) || true - #tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + #call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq # # The higher 'flow_limit' is needed for high-BW pacing - tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ + call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ flow_limit 1000 # # quantum $((1514*4)) initial_quantum $((1514*20)) - # tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit + # call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit done From 6b5648158bf4f90efe2bc6c16f64b66b44c68703 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 13:53:53 +0100 Subject: [PATCH 26/61] traffic-pacing-edt: Testing rates in production Test different rates in production machine, and measure iperf3 TCP-goodput Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index a3893b3..ebb502a 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -11,10 +11,20 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 +//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) +//#define RATE_IN_BITS (998 * 1000 * 1000ULL) + +/* Test different rates in production machine, and measure iperf3 TCP-goodput */ +//#define RATE_IN_BITS (800 * 1000 * 1000ULL)// prod: 765 Mbits/sec (stable) +//#define RATE_IN_BITS (900 * 1000 * 1000ULL)// prod: 861 Mbits/sec (stable) +//#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) +//#define RATE_IN_BITS (960 * 1000 * 1000ULL)// prod: 918 Mbits/sec +#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec +//#define RATE_IN_BITS (980 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) +//#define RATE_IN_BITS (990 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) +//#define RATE_IN_BITS (999 * 1000 * 1000ULL)// prod: (unstable) + /* skb->len in bytes, thus easier to keep rate in bytes */ -#define RATE_IN_BITS (1000 * 1000 * 1000ULL) -//#define RATE_IN_BITS (200 * 1000 * 1000ULL) -//#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) //#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) From 794c074d7d6db58837d70c96c9fb7499ab915af1 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 15:57:14 +0100 Subject: [PATCH 27/61] traffic-pacing-edt: New strategy: Shape at MAC layer with Ethernet Take into account MAC layer overhead per packet. Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 45 +++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index ebb502a..8a0b687 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -11,19 +11,50 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 -//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) //#define RATE_IN_BITS (998 * 1000 * 1000ULL) /* Test different rates in production machine, and measure iperf3 TCP-goodput */ //#define RATE_IN_BITS (800 * 1000 * 1000ULL)// prod: 765 Mbits/sec (stable) //#define RATE_IN_BITS (900 * 1000 * 1000ULL)// prod: 861 Mbits/sec (stable) -//#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) +///#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) //#define RATE_IN_BITS (960 * 1000 * 1000ULL)// prod: 918 Mbits/sec -#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec +//#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec //#define RATE_IN_BITS (980 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) //#define RATE_IN_BITS (990 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) //#define RATE_IN_BITS (999 * 1000 * 1000ULL)// prod: (unstable) +/* Per packet overhead: two VLAN headers == 8 bytes + * + * skb->wire_len doesn't seem to take the two VLAN headers into + * account. Loading BPF-prog on VLAN net_device is can only see 1 + * VLAN, and this is likely HW offloaded into skb->vlan. 
+ */ +//#define OVERHEAD (8) + + +/* New strategy: Shape at MAC (Medium Access Control) layer with Ethernet + * + * Production use-case is pacing traffic at 1Gbit/s wirespeed, using a + * 10Gbit/s NIC, because 1G end-user switch cannot handle bursts. + * + * (https://en.wikipedia.org/wiki/Interpacket_gap + * 12 bytes = interframe gap (IFG) 96 bit + + * (https://en.wikipedia.org/wiki/Ethernet_frame) + * 8 bytes = MAC preamble + * 4 bytes = Ethernet Frame Check Sequence (FCS) CRC + * 46 bytes = Minimum Payload size + * + * 14 bytes = Ethernet header + * 8 bytes = 2x VLAN headers + */ +//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */ +//#define RATE_IN_BITS (990 * 1000 * 1000ULL) +#define RATE_IN_BITS (950 * 1000 * 1000ULL) +#define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */ +//#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */ +#define ETH_MIN (84) + /* skb->len in bytes, thus easier to keep rate in bytes */ #define RATE_IN_BYTES (RATE_IN_BITS / 8) @@ -59,6 +90,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb) struct edt_val *edt; __u64 t_queue_sz; __u64 t_xmit_ns; + __u64 wire_len; __u64 t_next; __u64 t_curr; int key = 0; @@ -75,7 +107,12 @@ static __always_inline int sched_departure(struct __sk_buff *skb) * added on transmit. Fortunately skb->wire_len at TC-egress hook (not * ingress) include these headers. (See: qdisc_pkt_len_init()) */ - t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES; + wire_len = skb->wire_len + OVERHEAD; + wire_len = wire_len > ETH_MIN ? wire_len : ETH_MIN; + + t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES; + +// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES; // t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate; now = bpf_ktime_get_ns(); From 8714c9a37d0a5f18125f8081f07935ae8656a903 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 16:54:58 +0100 Subject: [PATCH 28/61] traffic-pacing-edt: also pace packets a bit on empty queue Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 8a0b687..f0ee70c 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -49,8 +49,8 @@ char _license[] SEC("license") = "GPL"; * 8 bytes = 2x VLAN headers */ //#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */ -//#define RATE_IN_BITS (990 * 1000 * 1000ULL) -#define RATE_IN_BITS (950 * 1000 * 1000ULL) +#define RATE_IN_BITS (990 * 1000 * 1000ULL) +//#define RATE_IN_BITS (950 * 1000 * 1000ULL) #define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */ //#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */ #define ETH_MIN (84) @@ -129,11 +129,18 @@ static __always_inline int sched_departure(struct __sk_buff *skb) t_next = READ_ONCE(edt->t_last) + t_xmit_ns; /* If packet doesn't get scheduled into the future, then there is - * no-queue and we are not above rate limit. Send packet immediately and - * move forward t_last timestamp to now. + * no-queue and we are not above rate limit. Normally send packet + * immediately and move forward t_last timestamp to now. + * + * But in our use-case the traffic need smoothing at a earlier + * stage, as bursts at lower rates can hurt the crapy switch. + * Thus, schedule SKB transmissing as new + t_xmit_ns. 
*/ if (t_next <= t_curr) { - WRITE_ONCE(edt->t_last, t_curr); + __u64 t_curr_next = t_curr + t_xmit_ns; + + WRITE_ONCE(edt->t_last, t_curr_next); + skb->tstamp = t_curr_next; return BPF_OK; } From 4671be73a8292d67e31cc0eb357a2f4f43b3a5ce Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 18:02:45 +0100 Subject: [PATCH 29/61] traffic-pacing-edt: Minimum delay for all packet if no time-queue Trying to trigger more NET_TX_SOFTIRQ to get packets scheduled out more spaced out in time. It is of-cause important to disable GRO in the first place. E.g. cmdline: sudo ethtool -K ens6f1 gso off tso off gro off Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index f0ee70c..6ff121c 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -137,7 +137,13 @@ static __always_inline int sched_departure(struct __sk_buff *skb) * Thus, schedule SKB transmissing as new + t_xmit_ns. */ if (t_next <= t_curr) { - __u64 t_curr_next = t_curr + t_xmit_ns; + __u64 t_curr_next; + __u32 min_len = 1538 * 2; + + /* Minimum delay for all packet if no time-queue */ + wire_len = (wire_len > min_len) ? wire_len : min_len; + t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES; + t_curr_next = t_curr + t_xmit_ns; WRITE_ONCE(edt->t_last, t_curr_next); skb->tstamp = t_curr_next; From 68505a2dbd5c50e83234f587af017d465d213ced Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 18:27:37 +0100 Subject: [PATCH 30/61] traffic-pacing-edt: tc_fq_pacer.sh select between MQ and single FQ For some reason cannot get correct scheduling with FQ in a MQ setup. In production traffic is Q-in-Q double tagged VLAN traffic. Perhaps the RX-hash is doing strange stuff, or BPF-prog concurrency is wrong. Due to Q-in-Q NIC RSS cause most packets to hit CPU-6 for some strange reason. Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/tc_fq_pacer.sh | 61 +++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index e7a502f..882b146 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -22,6 +22,14 @@ export TC=tc # Default verbose VERBOSE=1 +# Select between multiq or single root qdisc +if [[ -z $1 ]]; then + if [[ -z $REMOVE ]]; then + err 1 "Specify root qdisc system: single or mq (multi-queue)" + fi +fi +TYPE=$1 + # Delete existing root qdisc call_tc_allow_fail qdisc del dev "$DEV" root @@ -29,20 +37,41 @@ if [[ -n $REMOVE ]]; then exit 0 fi -# MQ (Multi-Queue) as root qdisc -call_tc qdisc replace dev $DEV root handle 7FFF: mq +function use_multiq() +{ + # MQ (Multi-Queue) as root qdisc + call_tc qdisc replace dev $DEV root handle 7FFF: mq -# Add FQ-pacer qdisc on each NIC avail TX-queue -i=0 -for dir in /sys/class/net/$DEV/queues/tx-*; do - # Details: cause-off-by-one, as tx-0 becomes handle 1: - ((i++)) || true - #call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq - # - # The higher 'flow_limit' is needed for high-BW pacing - call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ - flow_limit 1000 - # - # quantum $((1514*4)) initial_quantum $((1514*20)) - # call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit -done + # Add FQ-pacer qdisc on each NIC avail TX-queue + i=0 + for dir in /sys/class/net/$DEV/queues/tx-*; do + # Details: cause-off-by-one, as tx-0 becomes handle 1: + ((i++)) || true + #call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + # + # The higher 'flow_limit' is needed for high-BW pacing + call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ + flow_limit 1000 + # + # quantum $((1514*4)) initial_quantum $((1514*20)) + # call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit + done +} + +function use_single_fq_pacer() +{ + call_tc qdisc replace dev $DEV root handle 7FFF: fq \ + flow_limit 1000 +} + +case "$TYPE" in + mq | multiq ) + use_multiq + ;; + single | fq ) + use_single_fq_pacer + ;; + * ) + err 1 "Unknown type: ${TYPE}" + ;; +esac From a45ae39775af21a7b8c631ea20b9febabef4323e Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 18:59:03 +0100 Subject: [PATCH 31/61] traffic-pacing-edt: make it easy to remove minimum delay trick in code Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 6ff121c..727c0b9 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -137,8 +137,9 @@ static __always_inline int sched_departure(struct __sk_buff *skb) * Thus, schedule SKB transmissing as new + t_xmit_ns. */ if (t_next <= t_curr) { +#if 1 __u64 t_curr_next; - __u32 min_len = 1538 * 2; + __u32 min_len = 1538; /* Minimum delay for all packet if no time-queue */ wire_len = (wire_len > min_len) ? wire_len : min_len; @@ -147,7 +148,11 @@ static __always_inline int sched_departure(struct __sk_buff *skb) WRITE_ONCE(edt->t_last, t_curr_next); skb->tstamp = t_curr_next; +#else + WRITE_ONCE(edt->t_last, t_curr); +#endif return BPF_OK; + } /* Calc queue size measured in time */ From eacff13518b960ed9f0bc8f048de771714e7cb0c Mon Sep 17 00:00:00 2001 From: "Jesper D. 
Brouer" Date: Sun, 22 Nov 2020 14:45:13 +0100 Subject: [PATCH 32/61] traffic-pacing-edt: Experiment random drop packets exceeding 10 ms queue Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 727c0b9..48d10c9 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -60,9 +60,12 @@ char _license[] SEC("license") = "GPL"; //#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) -#define T_HORIZON_DROP (15 * 1000 * 1000ULL) -#define T_HORIZON_ECN (5 * 1000 * 1000ULL) +#define T_HORIZON_DROP (15 * 1000 * 1000ULL) + +#define T_HORIZON_DROP_SOME (10 * 1000 * 1000ULL) + +#define T_HORIZON_ECN (5 * 1000 * 1000ULL) struct edt_val { __u64 rate; @@ -165,6 +168,22 @@ static __always_inline int sched_departure(struct __sk_buff *skb) if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */) return BPF_DROP; + /* If TCP didn't react to ECN marking, then start dropping some */ + if (t_queue_sz >= T_HORIZON_DROP_SOME) { + __u32 random = (bpf_get_prandom_u32() >> 4) & 0x0f; + + if (random >= 8) + return BPF_DROP; + + // TODO If horizon have been exceed for a while, then + + + // "next drop time" + } else { + /* TODO: Queue delay drops below reset */ + } + + /* ECN marking horizon */ if (t_queue_sz >= T_HORIZON_ECN) bpf_skb_ecn_set_ce(skb); From d8845714daf6432562772cb8a488a0357837fede Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sun, 22 Nov 2020 14:53:27 +0100 Subject: [PATCH 33/61] traffic-pacing-edt: Codel like scheme Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 52 ++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 48d10c9..2fbdd0c 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -63,15 +63,27 @@ char _license[] SEC("license") = "GPL"; #define T_HORIZON_DROP (15 * 1000 * 1000ULL) -#define T_HORIZON_DROP_SOME (10 * 1000 * 1000ULL) +#define T_HORIZON_TARGET (10 * 1000 * 1000ULL) #define T_HORIZON_ECN (5 * 1000 * 1000ULL) +/* Codel like dropping scheme, inspired by: + * - RFC: https://queue.acm.org/detail.cfm?id=2209336 + * - Code: https://queue.acm.org/appendices/codel.html + */ + struct edt_val { __u64 rate; __u64 t_last; __u64 t_horizon_drop; __u64 t_horizon_ecn; + + /* codel like dropping scheme */ + __u64 first_above_time; /* Time when above target (0 if below)*/ + //__u64 drop_next; /* Time to drop next packet */ + uint32_t count; /* Packets dropped since going into drop state */ + uint32_t dropping; /*/ Equal to 1 if in drop state */ + } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ /* The tc tool (iproute2) use another ELF map layout than libbpf (struct @@ -85,6 +97,42 @@ struct bpf_elf_map SEC("maps") time_delay_map = { //.pinning = PIN_GLOBAL_NS, }; +/* */ +#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ + +/* Table lookup for square-root shifted 16 bit */ +static __always_inline __u32 get_sqrt_sh16(__u64 cnt) +{ + switch (cnt) { + case 1: return 65536; /* 65536 * sqrt(1) */ + case 2: return 92682; /* 65536 * sqrt(2) */ + case 3: return 113512; /* 65536 * sqrt(3) */ + case 4: return 131072; /* 65536 * sqrt(4) */ + case 5: return 146543; /* 65536 * sqrt(5) */ + case 6: return 160530; /* 65536 * sqrt(6) */ + 
case 7: return 173392; + case 8: return 185364; + case 9: return 196608; + case 10: return 207243; + case 11: return 217358; + case 12: return 227023; + case 13: return 236293; + case 14: return 245213; + case 15: return 253820; + case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ + default: + return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ + } +} + +static __always_inline __u64 get_next_interval(__u64 cnt) +{ + __u64 val = (__u64)T_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); + return val; +} + + + /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. */ @@ -169,7 +217,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb) return BPF_DROP; /* If TCP didn't react to ECN marking, then start dropping some */ - if (t_queue_sz >= T_HORIZON_DROP_SOME) { + if (t_queue_sz >= T_HORIZON_TARGET) { __u32 random = (bpf_get_prandom_u32() >> 4) & 0x0f; if (random >= 8) From f50c74101185b2162cee0b7ca5f120f92e648a72 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sun, 22 Nov 2020 18:11:11 +0100 Subject: [PATCH 34/61] traffic-pacing-edt: finished codel implementation based on [1] [1] https://queue.acm.org/appendices/codel.html Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 103 +++++++++++++++++++++++++------ 1 file changed, 85 insertions(+), 18 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 2fbdd0c..64838e5 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -4,6 +4,8 @@ #include #include "iproute2_compat.h" +#include + #define VLAN_MAX_DEPTH 2 #include @@ -70,6 +72,7 @@ char _license[] SEC("license") = "GPL"; /* Codel like dropping scheme, inspired by: * - RFC: https://queue.acm.org/detail.cfm?id=2209336 * - Code: https://queue.acm.org/appendices/codel.html + * - Kernel: include/net/codel_impl.h */ struct edt_val { @@ -80,9 +83,9 @@ struct edt_val { /* codel like dropping scheme */ __u64 first_above_time; /* Time when above target (0 if below)*/ - //__u64 drop_next; /* Time to drop next packet */ - uint32_t count; /* Packets dropped since going into drop state */ - uint32_t dropping; /*/ Equal to 1 if in drop state */ + __u64 drop_next; /* Time to drop next packet */ + __u32 count; /* Packets dropped since going into drop state */ + __u32 dropping; /* Equal to 1 if in drop state */ } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ @@ -125,13 +128,88 @@ static __always_inline __u32 get_sqrt_sh16(__u64 cnt) } } -static __always_inline __u64 get_next_interval(__u64 cnt) +static __always_inline __u64 get_next_interval_sqrt(__u64 cnt) { __u64 val = (__u64)T_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); return val; } +static __always_inline __u64 +codel_control_law(__u64 t, __u64 cnt) +{ + return t + get_next_interval_sqrt(cnt); +} +static __always_inline +bool codel_should_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = T_EXCEED_INTERVAL; + + if (t_queue_sz < T_HORIZON_TARGET) { + /* went below so we'll stay below for at least interval */ + edt->first_above_time = 0; + return false; + } + + if (edt->first_above_time == 0) { + /* just went above from below. 
If we stay above + * for at least interval we'll say it's ok to drop + */ + edt->first_above_time = now + interval; + return false; + } else if (now >= edt->first_above_time) { + return true; + } + return false; +} + +static __always_inline +bool codel_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = T_EXCEED_INTERVAL; + + /* If horizon have been exceed for a while, inc drop intensity*/ + bool drop = codel_should_drop(edt, t_queue_sz, now); + + if (edt->dropping) { /* In dropping state */ + if (!drop) { + /* time below target - leave dropping state */ + edt->dropping = false; + return false; + } else if (now >= edt->drop_next) { + /* It's time for the next drop. Drop the current + * packet. Schedule the next drop + */ + edt->count += 1; + // schedule the next drop. + edt->drop_next = + codel_control_law(edt->drop_next, edt->count); + return true; + } + } else if (drop && + ((now - edt->drop_next < interval) || + (now - edt->first_above_time >= interval))) { + /* If we get here, then we're not in dropping state. + * Decide whether it's time to enter dropping state. + */ + __u32 count = edt->count; + + edt->dropping = true; + + /* If we're in a drop cycle, drop rate that controlled queue + * on the last cycle is a good starting point to control it now. + */ + if (now - edt->drop_next < interval) + count = count > 2 ? (count - 2) : 1; + else + count = 1; + + edt->count = count; + edt->drop_next = codel_control_law(now, count); + return true; + } + return false; +} /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. @@ -217,20 +295,9 @@ static __always_inline int sched_departure(struct __sk_buff *skb) return BPF_DROP; /* If TCP didn't react to ECN marking, then start dropping some */ - if (t_queue_sz >= T_HORIZON_TARGET) { - __u32 random = (bpf_get_prandom_u32() >> 4) & 0x0f; - - if (random >= 8) - return BPF_DROP; - - // TODO If horizon have been exceed for a while, then - - - // "next drop time" - } else { - /* TODO: Queue delay drops below reset */ - } - + // if (codel_drop(edt, t_queue_sz, now)) + if (codel_drop(edt, t_queue_sz, t_next)) + return BPF_DROP; /* ECN marking horizon */ if (t_queue_sz >= T_HORIZON_ECN) From 2f6580dea46439a0091a7fc7b1ce6c30d822d9c2 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 13:37:07 +0100 Subject: [PATCH 35/61] Factor out codel structure Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 57 ++++++++++++++++---------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 64838e5..fc21ed4 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -74,19 +74,20 @@ char _license[] SEC("license") = "GPL"; * - Code: https://queue.acm.org/appendices/codel.html * - Kernel: include/net/codel_impl.h */ +struct codel_state { + /* codel like dropping scheme */ + __u64 first_above_time; /* Time when above target (0 if below)*/ + __u64 drop_next; /* Time to drop next packet */ + __u32 count; /* Packets dropped since going into drop state */ + __u32 dropping; /* Equal to 1 if in drop state */ +}; struct edt_val { __u64 rate; __u64 t_last; __u64 t_horizon_drop; __u64 t_horizon_ecn; - - /* codel like dropping scheme */ - __u64 first_above_time; /* Time when above target (0 if below)*/ - __u64 drop_next; /* Time to drop next packet */ - __u32 count; /* Packets dropped since going into drop state */ - __u32 dropping; /* Equal to 1 if in drop state */ - + struct codel_state codel; } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ /* The tc tool (iproute2) use another ELF map layout than libbpf (struct @@ -141,71 +142,71 @@ codel_control_law(__u64 t, __u64 cnt) } static __always_inline -bool codel_should_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) { __u64 interval = T_EXCEED_INTERVAL; if (t_queue_sz < T_HORIZON_TARGET) { /* went below so we'll stay below for at least interval */ - edt->first_above_time = 0; + codel->first_above_time = 0; return false; } - if (edt->first_above_time == 0) { + if (codel->first_above_time == 0) { /* just went above from below. If we stay above * for at least interval we'll say it's ok to drop */ - edt->first_above_time = now + interval; + codel->first_above_time = now + interval; return false; - } else if (now >= edt->first_above_time) { + } else if (now >= codel->first_above_time) { return true; } return false; } static __always_inline -bool codel_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) { __u64 interval = T_EXCEED_INTERVAL; /* If horizon have been exceed for a while, inc drop intensity*/ - bool drop = codel_should_drop(edt, t_queue_sz, now); + bool drop = codel_should_drop(codel, t_queue_sz, now); - if (edt->dropping) { /* In dropping state */ + if (codel->dropping) { /* In dropping state */ if (!drop) { /* time below target - leave dropping state */ - edt->dropping = false; + codel->dropping = false; return false; - } else if (now >= edt->drop_next) { + } else if (now >= codel->drop_next) { /* It's time for the next drop. Drop the current * packet. Schedule the next drop */ - edt->count += 1; + codel->count += 1; // schedule the next drop. - edt->drop_next = - codel_control_law(edt->drop_next, edt->count); + codel->drop_next = + codel_control_law(codel->drop_next, codel->count); return true; } } else if (drop && - ((now - edt->drop_next < interval) || - (now - edt->first_above_time >= interval))) { + ((now - codel->drop_next < interval) || + (now - codel->first_above_time >= interval))) { /* If we get here, then we're not in dropping state. * Decide whether it's time to enter dropping state. 
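 * Two ways lead into dropping state: either we dropped recently
 * (now is within one interval of drop_next), so the previous drop
 * count is resumed with a small credit (count - 2), or the queue
 * has stayed above target for a whole interval, which starts a
 * fresh cycle at count = 1.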
*/ - __u32 count = edt->count; + __u32 count = codel->count; - edt->dropping = true; + codel->dropping = true; /* If we're in a drop cycle, drop rate that controlled queue * on the last cycle is a good starting point to control it now. */ - if (now - edt->drop_next < interval) + if (now - codel->drop_next < interval) count = count > 2 ? (count - 2) : 1; else count = 1; - edt->count = count; - edt->drop_next = codel_control_law(now, count); + codel->count = count; + codel->drop_next = codel_control_law(now, count); return true; } return false; @@ -296,7 +297,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb) /* If TCP didn't react to ECN marking, then start dropping some */ // if (codel_drop(edt, t_queue_sz, now)) - if (codel_drop(edt, t_queue_sz, t_next)) + if (codel_drop(&edt->codel, t_queue_sz, t_next)) return BPF_DROP; /* ECN marking horizon */ From 516668c62c9f4de91c1d3d6f5718461e4d21b5ba Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 13:51:49 +0100 Subject: [PATCH 36/61] Move codel implementation to header file Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/codel_impl.h | 133 +++++++++++++++++++++++++++++++ traffic-pacing-edt/edt_pacer02.c | 128 ++--------------------------- 2 files changed, 138 insertions(+), 123 deletions(-) create mode 100644 traffic-pacing-edt/codel_impl.h diff --git a/traffic-pacing-edt/codel_impl.h b/traffic-pacing-edt/codel_impl.h new file mode 100644 index 0000000..6970646 --- /dev/null +++ b/traffic-pacing-edt/codel_impl.h @@ -0,0 +1,133 @@ +#ifndef __CODEL_IMPL_H +#define __CODEL_IMPL_H + +#ifndef CODEL_TARGET +#define CODEL_TARGET (10 * 1000 * 1000ULL) /* 10 ms in nanosec */ +#endif + +#ifndef CODEL_EXCEED_INTERVAL +#define CODEL_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ +#endif + +/* Codel like dropping scheme, inspired by: + * - RFC: https://queue.acm.org/detail.cfm?id=2209336 + * - Code: https://queue.acm.org/appendices/codel.html + * - Kernel: include/net/codel_impl.h + */ +struct codel_state { + /* codel like dropping scheme */ + __u64 first_above_time; /* Time when above target (0 if below)*/ + __u64 drop_next; /* Time to drop next packet */ + __u32 count; /* Packets dropped since going into drop state */ + __u32 dropping; /* Equal to 1 if in drop state */ +}; + +/* Table lookup for square-root shifted 16 bit */ +static __always_inline __u32 get_sqrt_sh16(__u64 cnt) +{ + switch (cnt) { + case 1: return 65536; /* 65536 * sqrt(1) */ + case 2: return 92682; /* 65536 * sqrt(2) */ + case 3: return 113512; /* 65536 * sqrt(3) */ + case 4: return 131072; /* 65536 * sqrt(4) */ + case 5: return 146543; /* 65536 * sqrt(5) */ + case 6: return 160530; /* 65536 * sqrt(6) */ + case 7: return 173392; + case 8: return 185364; + case 9: return 196608; + case 10: return 207243; + case 11: return 217358; + case 12: return 227023; + case 13: return 236293; + case 14: return 245213; + case 15: return 253820; + case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ + default: + return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ + } +} + +static __always_inline __u64 get_next_interval_sqrt(__u64 cnt) +{ + __u64 val = (__u64)CODEL_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); + return val; +} + +static __always_inline __u64 +codel_control_law(__u64 t, __u64 cnt) +{ + return t + get_next_interval_sqrt(cnt); +} + +static __always_inline +bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = CODEL_EXCEED_INTERVAL; + + if (t_queue_sz < CODEL_TARGET) { + 
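+		/* Note: t_queue_sz is the backlog measured in time, i.e.
+		 * how far the scheduled departure horizon already sits in
+		 * the future. Classic CoDel tracks per-packet sojourn
+		 * time; this EDT variant compares the time-queue length
+		 * directly against the same target.
+		 */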
/* went below so we'll stay below for at least interval */ + codel->first_above_time = 0; + return false; + } + + if (codel->first_above_time == 0) { + /* just went above from below. If we stay above + * for at least interval we'll say it's ok to drop + */ + codel->first_above_time = now + interval; + return false; + } else if (now >= codel->first_above_time) { + return true; + } + return false; +} + +static __always_inline +bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = CODEL_EXCEED_INTERVAL; + + /* If horizon have been exceed for a while, inc drop intensity*/ + bool drop = codel_should_drop(codel, t_queue_sz, now); + + if (codel->dropping) { /* In dropping state */ + if (!drop) { + /* time below target - leave dropping state */ + codel->dropping = false; + return false; + } else if (now >= codel->drop_next) { + /* It's time for the next drop. Drop the current + * packet. Schedule the next drop + */ + codel->count += 1; + // schedule the next drop. + codel->drop_next = + codel_control_law(codel->drop_next, codel->count); + return true; + } + } else if (drop && + ((now - codel->drop_next < interval) || + (now - codel->first_above_time >= interval))) { + /* If we get here, then we're not in dropping state. + * Decide whether it's time to enter dropping state. + */ + __u32 count = codel->count; + + codel->dropping = true; + + /* If we're in a drop cycle, drop rate that controlled queue + * on the last cycle is a good starting point to control it now. + */ + if (now - codel->drop_next < interval) + count = count > 2 ? (count - 2) : 1; + else + count = 1; + + codel->count = count; + codel->drop_next = codel_control_law(now, count); + return true; + } + return false; +} + +#endif /* __CODEL_IMPL_H */ diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index fc21ed4..8a0d54d 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -69,18 +69,11 @@ char _license[] SEC("license") = "GPL"; #define T_HORIZON_ECN (5 * 1000 * 1000ULL) -/* Codel like dropping scheme, inspired by: - * - RFC: https://queue.acm.org/detail.cfm?id=2209336 - * - Code: https://queue.acm.org/appendices/codel.html - * - Kernel: include/net/codel_impl.h - */ -struct codel_state { - /* codel like dropping scheme */ - __u64 first_above_time; /* Time when above target (0 if below)*/ - __u64 drop_next; /* Time to drop next packet */ - __u32 count; /* Packets dropped since going into drop state */ - __u32 dropping; /* Equal to 1 if in drop state */ -}; +#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ + +#define CODEL_TARGET T_HORIZON_TARGET +#define CODEL_EXCEED_INTERVAL T_EXCEED_INTERVAL +#include "codel_impl.h" struct edt_val { __u64 rate; @@ -101,117 +94,6 @@ struct bpf_elf_map SEC("maps") time_delay_map = { //.pinning = PIN_GLOBAL_NS, }; -/* */ -#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ - -/* Table lookup for square-root shifted 16 bit */ -static __always_inline __u32 get_sqrt_sh16(__u64 cnt) -{ - switch (cnt) { - case 1: return 65536; /* 65536 * sqrt(1) */ - case 2: return 92682; /* 65536 * sqrt(2) */ - case 3: return 113512; /* 65536 * sqrt(3) */ - case 4: return 131072; /* 65536 * sqrt(4) */ - case 5: return 146543; /* 65536 * sqrt(5) */ - case 6: return 160530; /* 65536 * sqrt(6) */ - case 7: return 173392; - case 8: return 185364; - case 9: return 196608; - case 10: return 207243; - case 11: return 217358; - case 12: return 227023; - case 13: return 236293; - case 14: return 
245213; - case 15: return 253820; - case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ - default: - return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ - } -} - -static __always_inline __u64 get_next_interval_sqrt(__u64 cnt) -{ - __u64 val = (__u64)T_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); - return val; -} - -static __always_inline __u64 -codel_control_law(__u64 t, __u64 cnt) -{ - return t + get_next_interval_sqrt(cnt); -} - -static __always_inline -bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) -{ - __u64 interval = T_EXCEED_INTERVAL; - - if (t_queue_sz < T_HORIZON_TARGET) { - /* went below so we'll stay below for at least interval */ - codel->first_above_time = 0; - return false; - } - - if (codel->first_above_time == 0) { - /* just went above from below. If we stay above - * for at least interval we'll say it's ok to drop - */ - codel->first_above_time = now + interval; - return false; - } else if (now >= codel->first_above_time) { - return true; - } - return false; -} - -static __always_inline -bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) -{ - __u64 interval = T_EXCEED_INTERVAL; - - /* If horizon have been exceed for a while, inc drop intensity*/ - bool drop = codel_should_drop(codel, t_queue_sz, now); - - if (codel->dropping) { /* In dropping state */ - if (!drop) { - /* time below target - leave dropping state */ - codel->dropping = false; - return false; - } else if (now >= codel->drop_next) { - /* It's time for the next drop. Drop the current - * packet. Schedule the next drop - */ - codel->count += 1; - // schedule the next drop. - codel->drop_next = - codel_control_law(codel->drop_next, codel->count); - return true; - } - } else if (drop && - ((now - codel->drop_next < interval) || - (now - codel->first_above_time >= interval))) { - /* If we get here, then we're not in dropping state. - * Decide whether it's time to enter dropping state. - */ - __u32 count = codel->count; - - codel->dropping = true; - - /* If we're in a drop cycle, drop rate that controlled queue - * on the last cycle is a good starting point to control it now. - */ - if (now - codel->drop_next < interval) - count = count > 2 ? (count - 2) : 1; - else - count = 1; - - codel->count = count; - codel->drop_next = codel_control_law(now, count); - return true; - } - return false; -} - /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. */ From 3e0ac4f24dc62ea3f31ed9d72e51dd901bfc7858 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 14:02:52 +0100 Subject: [PATCH 37/61] Cleanup some comments Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 8a0d54d..84f7e55 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -13,28 +13,7 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 -//#define RATE_IN_BITS (998 * 1000 * 1000ULL) - -/* Test different rates in production machine, and measure iperf3 TCP-goodput */ -//#define RATE_IN_BITS (800 * 1000 * 1000ULL)// prod: 765 Mbits/sec (stable) -//#define RATE_IN_BITS (900 * 1000 * 1000ULL)// prod: 861 Mbits/sec (stable) -///#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) -//#define RATE_IN_BITS (960 * 1000 * 1000ULL)// prod: 918 Mbits/sec -//#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec -//#define RATE_IN_BITS (980 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) -//#define RATE_IN_BITS (990 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) -//#define RATE_IN_BITS (999 * 1000 * 1000ULL)// prod: (unstable) - -/* Per packet overhead: two VLAN headers == 8 bytes - * - * skb->wire_len doesn't seem to take the two VLAN headers into - * account. Loading BPF-prog on VLAN net_device is can only see 1 - * VLAN, and this is likely HW offloaded into skb->vlan. - */ -//#define OVERHEAD (8) - - -/* New strategy: Shape at MAC (Medium Access Control) layer with Ethernet +/* Strategy: Shape at MAC (Medium Access Control) layer with Ethernet * * Production use-case is pacing traffic at 1Gbit/s wirespeed, using a * 10Gbit/s NIC, because 1G end-user switch cannot handle bursts. @@ -57,18 +36,15 @@ char _license[] SEC("license") = "GPL"; //#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */ #define ETH_MIN (84) -/* skb->len in bytes, thus easier to keep rate in bytes */ +/* skb->len in bytes, thus convert rate to bytes */ #define RATE_IN_BYTES (RATE_IN_BITS / 8) -//#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) -//#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) - +/* Controlling how large queue (in time) is allow to grow */ #define T_HORIZON_DROP (15 * 1000 * 1000ULL) - #define T_HORIZON_TARGET (10 * 1000 * 1000ULL) - #define T_HORIZON_ECN (5 * 1000 * 1000ULL) +/* Codel: If queue exceed target for more than one interval, start dropping */ #define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ #define CODEL_TARGET T_HORIZON_TARGET From 60a851c2a00513b8c5d0fe7b7a51a7b081666cd4 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 14:22:29 +0100 Subject: [PATCH 38/61] Now that codel works adjust horizons The hard drop horizon (T_HORIZON_DROP) can be increased (to 40ms) as codel target latency (T_HORIZON_TARGET) is taking care of signaling TCP downloads via drops (after codel scheme). Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 84f7e55..269835b 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -40,9 +40,9 @@ char _license[] SEC("license") = "GPL"; #define RATE_IN_BYTES (RATE_IN_BITS / 8) /* Controlling how large queue (in time) is allow to grow */ -#define T_HORIZON_DROP (15 * 1000 * 1000ULL) -#define T_HORIZON_TARGET (10 * 1000 * 1000ULL) -#define T_HORIZON_ECN (5 * 1000 * 1000ULL) +#define T_HORIZON_DROP (40 * 1000 * 1000ULL) +#define T_HORIZON_TARGET (5 * 1000 * 1000ULL) +#define T_HORIZON_ECN (1 * 1000 * 1000ULL) /* Codel: If queue exceed target for more than one interval, start dropping */ #define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ From 2786f8af65b274a9aa0e9f470484c14303d83f8b Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 14:38:48 +0100 Subject: [PATCH 39/61] Extend the sqrt lookup table with more entries Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/codel_impl.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/codel_impl.h b/traffic-pacing-edt/codel_impl.h index 6970646..549dc61 100644 --- a/traffic-pacing-edt/codel_impl.h +++ b/traffic-pacing-edt/codel_impl.h @@ -42,8 +42,28 @@ static __always_inline __u32 get_sqrt_sh16(__u64 cnt) case 14: return 245213; case 15: return 253820; case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ + case 17: return 270212; + case 18: return 278046; + case 19: return 285664; + case 20: return 293086; + case 21: return 300324; + case 22: return 307391; + case 23: return 314300; + case 24: return 321060; + case 25: return 327680; /* 100 ms / sqrt(25) = 20 ms */ + case 26: return 334169; + case 27: return 340535; + case 28: return 346784; + case 29: return 352922; + case 30: return 358955; + case 31: return 364889; + case 32: return 370728; + case 33: return 376476; + case 34: return 382137; + case 35: return 387716; + case 36: return 393216; /* 100 / sqrt(36) = 16.66 ms */ default: - return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ + return 463410; /* 65536*sqrt(50) => 100/sqrt(50) = 14.14 ms */ } } From 3248b602486a32e118d267551999f359e20607d6 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 15:09:12 +0100 Subject: [PATCH 40/61] Do EDT pacing on all inner VLAN ids Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 269835b..eb1b997 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -66,14 +66,14 @@ struct bpf_elf_map SEC("maps") time_delay_map = { .type = BPF_MAP_TYPE_ARRAY, .size_key = sizeof(__u32), .size_value = sizeof(struct edt_val), - .max_elem = 1, + .max_elem = 4096, /* Max possible VLANs */ //.pinning = PIN_GLOBAL_NS, }; /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. 
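 * The BPF-prog never holds packets itself: it only sets skb->tstamp
 * to the computed departure time, and relies on the FQ qdisc to
 * delay each packet until its timestamp is reached.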
*/ -static __always_inline int sched_departure(struct __sk_buff *skb) +static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) { struct edt_val *edt; __u64 t_queue_sz; @@ -81,7 +81,6 @@ __u64 wire_len; __u64 t_next; __u64 t_curr; - int key = 0; __u64 now; edt = bpf_map_lookup_elem(&time_delay_map, &key); @@ -253,9 +252,8 @@ SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb) vlan_key = extract_vlan_key(skb, &vlans); - /* For-now: Match on vlan16 and only apply EDT on that */ - if (vlan_key == 16) - return sched_departure(skb); + /* Each (inner) VLAN id gets its own EDT pacing */ + return sched_departure(skb, vlan_key); out: return ret; From dea36b9d8fd919202a31dae98d51c737df96fd7d Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 15:45:52 +0100 Subject: [PATCH 41/61] Add practical script for loading on all outer VLAN devices Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/vlans_load_edt.sh | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 traffic-pacing-edt/vlans_load_edt.sh diff --git a/traffic-pacing-edt/vlans_load_edt.sh b/traffic-pacing-edt/vlans_load_edt.sh new file mode 100755 index 0000000..3dc1fab --- /dev/null +++ b/traffic-pacing-edt/vlans_load_edt.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Script for loading EDT-pacer BPF-prog on all downstream VLANs +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +# Default verbose +VERBOSE=1 + +# Downstream dev: ens6f0 +VLAN_START=168 +VLAN_END=205 + +cmd=${basedir}/bpf_egress_loader.sh + +options="" + +if [[ -n $REMOVE ]]; then + options+=" --remove" +fi +if [[ -n $DRYRUN ]]; then + options+=" --dry-run" + #cmd="echo $cmd" +fi +if [[ -n $VERBOSE ]]; then + options+=" --verbose" +fi + +for (( vlan=${VLAN_START}; vlan<=${VLAN_END}; vlan++ )) +do + VLAN=${DEV}.$vlan + $cmd --dev $VLAN $options +done From 93116e0fb25dd501a1ab7b1b99e4c455acb06b6d Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Mon, 30 Nov 2020 12:41:48 +0100 Subject: [PATCH 42/61] Add bpftrace dir and program developed last night Signed-off-by: Jesper D. Brouer --- .../bpftrace/edt_tstamp_diff.bt | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt new file mode 100755 index 0000000..15c3c4d --- /dev/null +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt @@ -0,0 +1,31 @@ +#!/usr/local/bin/bpftrace + +#include + +/* Measure time difference between EDT-time and real "NIC" TX-time. + * + * Assuming packets are EDT timestamped by the BPF-program, we can + * detect/measure how accurately packets are actually transmitted + * towards the NIC driver, by comparing EDT-time against "now" + * timestamp in the function transmitting to the NIC driver.
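+ * A "late" diff means the packet hit the driver after its EDT
+ * timestamp; an "ahead" diff means it was released early. Both are
+ * turned into log2 histograms in usec below. Note (an assumption
+ * about bpftrace semantics): scratch variables default to 0, so the
+ * branch not taken still records a zero-bucket entry in its map.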
+ */ + +// tracepoint:net:net_dev_start_xmit +tracepoint:net:net_dev_xmit +{ + $skb = (struct sk_buff *)args->skbaddr; + //$tstamp = (uint64)$skb->tstamp; + $tstamp = $skb->skb_mstamp_ns; + $now = nsecs; + + // if ($skb->mark > 0) { + if ($tstamp > 0) { + if ($now >= $tstamp) { + $diff_late = $now - $tstamp; + } else { + $diff_ahead = $tstamp - $now; + } + @tstamp_diff_late = hist($diff_late / 1000); + @tstamp_diff_ahead = hist($diff_ahead / 1000); + } +} From 381dd9a512dfaf64d875c2a136e37218bbccfe77 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Mon, 30 Nov 2020 12:43:14 +0100 Subject: [PATCH 43/61] Add more advanced version of script edt_tstamp_diff_advanced.bt Signed-off-by: Jesper D. Brouer --- .../bpftrace/edt_tstamp_diff_advanced.bt | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100755 traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt new file mode 100755 index 0000000..0029055 --- /dev/null +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt @@ -0,0 +1,59 @@ +#!/usr/local/bin/bpftrace + +#include + +// tracepoint:net:net_dev_start_xmit +tracepoint:net:net_dev_xmit +{ + $skb = (struct sk_buff *)args->skbaddr; + //$tstamp = (uint64)$skb->tstamp; + $tstamp = $skb->skb_mstamp_ns; + $now = nsecs; + +// if ($skb->mark > 0) { + if ($tstamp > 0) { + if ($now >= $tstamp) { + $diff_late = $now - $tstamp; + } else { + $diff_ahead = $tstamp - $now; + } + @tstamp_usec_diff_late = hist($diff_late / 1000); + @tstamp_usec_diff_ahead = hist($diff_ahead / 1000); + } + + //$period = $now / 10000; /* 10000 = 10 usec */ + $period = $now / 30000; /* 30000 = 30 usec */ + if (@state[cpu] == $period) { + @state_bytes[cpu] += $skb->len; + } else { + @state[cpu] = $period; + if (@state_bytes[cpu] > 0) { + @byte_burst[cpu] = hist(@state_bytes[cpu]); + } + @state_bytes[cpu] = $skb->len; /* Reset counter */ + } +} + +tracepoint:qdisc:qdisc_dequeue +{ + @qdisc_bulk_dequeue = lhist(args->packets, 0,64,1); +} + + +/* +kretfunc:dev_hard_start_xmit +{ +// Wanted to know if ret == NETDEV_TX_BUSY +# ERROR: kfunc/kretfunc not available for your linked against bcc version. +} +*/ + +kprobe:qdisc_watchdog_schedule_range_ns +{ + @qdisc_watchdog[cpu] = count(); +} + +kprobe:__netif_schedule +{ + @__netif_schedule[cpu] = count(); +} From b84b89dc4baee4dfbb0604ba76571aff15b537a6 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 09:34:18 +0100 Subject: [PATCH 44/61] bpftrace/edt_tstamp_diff_advanced.bt: add doc comments Also found measurement tool can disturb timing. I might have to write this in BPF-C directly to avoid overhead. Signed-off-by: Jesper D. Brouer --- .../bpftrace/edt_tstamp_diff_advanced.bt | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt index 0029055..add3270 100755 --- a/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt @@ -21,6 +21,14 @@ tracepoint:net:net_dev_xmit @tstamp_usec_diff_ahead = hist($diff_ahead / 1000); } + /* Capture burstiness over a time period, by dividing nanosec + * timestamp with wanted period, and keeping state byte counter as + * long as timestamp match. 
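+ * Worked example: at the 1 Gbit/s production pacing rate, a 30 usec
+ * period corresponds to roughly 125000000 B/s * 30e-6 s = 3750
+ * bytes, so @byte_burst buckets far above that indicate bursty
+ * dequeueing rather than smooth pacing.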
+ * + * Practical usage shows that bpftrace uses a hash-map to implement + * this, which unfortunately costs too much (shows 5% jhash cpu + * usage), enough overhead to change the behavior of the prod system. + */ //$period = $now / 10000; /* 10000 = 10 usec */ $period = $now / 30000; /* 30000 = 30 usec */ if (@state[cpu] == $period) { @@ -34,11 +42,12 @@ tracepoint:net:net_dev_xmit } } +/* tracepoint:qdisc:qdisc_dequeue { @qdisc_bulk_dequeue = lhist(args->packets, 0,64,1); } - +*/ /* kretfunc:dev_hard_start_xmit { // Wanted to know if ret == NETDEV_TX_BUSY # ERROR: kfunc/kretfunc not available for your linked against bcc version. } */ + + +/* How often does FQ-pacer find that no packets are qualified to be + * scheduled, which leads to scheduling an hrtimer event that will + * restart the qdisc at a later time. + * + * We cannot kprobe fq_dequeue as it is a module. + */ + +/* kprobe:qdisc_watchdog_schedule_range_ns { @qdisc_watchdog[cpu] = count(); @@ -57,3 +75,4 @@ kprobe:__netif_schedule { @__netif_schedule[cpu] = count(); } +*/ From 79466715cfac0479307b3d20e9c1054a2bc58e02 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 10:07:25 +0100 Subject: [PATCH 45/61] traffic-pacing-edt: Use SKB->mark to identify different stages This can be used by bpftrace programs to identify the different stages when trying to determine the EDT accuracy. Signed-off-by: Jesper D.
Brouer --- traffic-pacing-edt/edt_pacer02.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index eb1b997..dc8321b 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -135,6 +135,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) WRITE_ONCE(edt->t_last, t_curr_next); skb->tstamp = t_curr_next; + skb->mark = 1; /* No queue - add minimum delay */ #else WRITE_ONCE(edt->t_last, t_curr); #endif @@ -156,10 +157,14 @@ static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) // if (codel_drop(edt, t_queue_sz, now)) if (codel_drop(&edt->codel, t_queue_sz, t_next)) return BPF_DROP; + + skb->mark = 2; /* (time) queue exists - and is small/below T_HORIZON_ECN */ /* ECN marking horizon */ - if (t_queue_sz >= T_HORIZON_ECN) + if (t_queue_sz >= T_HORIZON_ECN) { + skb->mark = 3; /* (time) queue exists - and is large */ bpf_skb_ecn_set_ce(skb); + } /* Advance "time queue" */ WRITE_ONCE(edt->t_last, t_next); From 23f73c86ac1c2ec7cf6c4d8e0a0b421fbb5d8bef Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 12:13:24 +0100 Subject: [PATCH 46/61] traffic-pacing-edt: Use bpf_ktime_get_boot_ns The bpftrace programs use bpf_ktime_get_boot_ns as the underlying clock for the 'nsecs' keyword. Switch the TC-BPF prog to use the same, to make sure that we don't report false results when detecting/measuring EDT accuracy. Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index dc8321b..5b89d4a 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -102,7 +102,8 @@ static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) // t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES; // t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate; - now = bpf_ktime_get_ns(); + // now = bpf_ktime_get_ns(); + now = bpf_ktime_get_boot_ns(); /* Use same ktime as bpftrace */ /* Allow others to set skb tstamp prior to us */ t_curr = skb->tstamp; From 048c960756eb65301a72d2d7c41218906bd63204 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 14:27:10 +0100 Subject: [PATCH 47/61] iproute2 tc util has recently gotten libbpf support Implement a configure script that detects support, and Makefile defines that propagate to the BPF-C file, making it possible to use and compile with BTF type maps. Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 13 ++++++++++++- traffic-pacing-edt/configure | 29 ++++++++++++++++++++++++++++ traffic-pacing-edt/edt_pacer02.c | 22 +++++++++++++++++---- traffic-pacing-edt/iproute2_compat.h | 6 ++++++ 4 files changed, 65 insertions(+), 5 deletions(-) create mode 100755 traffic-pacing-edt/configure diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index cb3def9..73c1306 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -4,11 +4,20 @@ USER_TARGETS := BPF_TARGETS := edt_pacer01 BPF_TARGETS += edt_pacer02 +EXTRA_DEPS += config.mk + LIB_DIR = ../lib include $(LIB_DIR)/common.mk +include config.mk -# The iproute2 'tc' tool doesn't understand BTF debug info +all: config.mk + +config.mk: configure + @sh configure + +ifndef HAVE_TC_LIBBPF +# If the iproute2 'tc' tool doesn't understand BTF debug info # use llvm-strip to remove this debug info from object file # # *BUT* cannot strip everything as it removes ELF elems needed for # .PHONY: strip_tc_obj strip_tc_obj: ${BPF_TARGETS:=.o} + $(Q) echo "TC doesn't support libbpf - strip BTF info" $(Q) llvm-strip --no-strip-all --remove-section .BTF $?
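# For reference, the configure script's detection (shown in the next
# patch hunk) boils down to inspecting 'tc -V' output; a quick manual
# equivalent (sketch, relying on 'tc -V' printing a libbpf version
# when built against it):
#   tc -V | grep -q libbpf && echo "tc has libbpf support"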
all: strip_tc_obj +endif diff --git a/traffic-pacing-edt/configure b/traffic-pacing-edt/configure new file mode 100755 index 0000000..9b01369 --- /dev/null +++ b/traffic-pacing-edt/configure @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# This is not an autoconf-generated configure +# + +# Output file which is input to Makefile +CONFIG=config.mk + +# Assume tc is in $PATH +TC=tc + +check_tc_libbpf() +{ + tc_version=$($TC -V) + if echo $tc_version | grep -q libbpf; then + libbpf_version=${tc_version##*libbpf } + echo "HAVE_TC_LIBBPF:=y" >> $CONFIG + echo "CFLAGS += -DHAVE_LIBBPF" >> $CONFIG + echo "yes ($libbpf_version)" + else + echo "no" + fi +} + +echo "# Generated config" > $CONFIG +echo "Detecting available features on system" + +echo -n " - libbpf support in tc tool: " +check_tc_libbpf diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 5b89d4a..a361079 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -2,7 +2,6 @@ #include #include #include -#include "iproute2_compat.h" #include @@ -59,16 +58,31 @@ struct edt_val { struct codel_state codel; } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ -/* The tc tool (iproute2) use another ELF map layout than libbpf (struct - * bpf_map_def), see struct bpf_elf_map from iproute2. +#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */ +/* Use BTF format to create map */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4096); /* Max possible VLANs */ + __type(key, __u32); + __type(value, struct edt_val); +// __uint(pinning, LIBBPF_PIN_BY_NAME); +} time_delay_map SEC(".maps"); + +#else +/* The (iproute2) tc tool (without libbpf support) uses another ELF map + * layout than libbpf (struct bpf_map_def), see struct bpf_elf_map + * from iproute2. */ +#include "iproute2_compat.h" struct bpf_elf_map SEC("maps") time_delay_map = { .type = BPF_MAP_TYPE_ARRAY, .size_key = sizeof(__u32), .size_value = sizeof(struct edt_val), .max_elem = 4096, /* Max possible VLANs */ - //.pinning = PIN_GLOBAL_NS, +// .pinning = PIN_GLOBAL_NS, }; +#endif + /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. diff --git a/traffic-pacing-edt/iproute2_compat.h b/traffic-pacing-edt/iproute2_compat.h index a535f5f..3d72546 100644 --- a/traffic-pacing-edt/iproute2_compat.h +++ b/traffic-pacing-edt/iproute2_compat.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Taken from #include */ #ifndef __IPROUTE2_COMPAT_H #define __IPROUTE2_COMPAT_H @@ -8,6 +9,11 @@ * binary layout until "flags". Thus, BPF-progs can use both if careful. */ +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + /* ELF map definition (copied from iproute2 source code) */ struct bpf_elf_map { __u32 type; From 9d52254be6bb5a148c26f6908a8639a40152cd4d Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 15:29:04 +0100 Subject: [PATCH 48/61] traffic-pacing-edt: rename edt_pacer02.c to edt_pacer_vlan.c Signed-off-by: Jesper D.
Brouer --- traffic-pacing-edt/{edt_pacer02.c => edt_pacer_vlan.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename traffic-pacing-edt/{edt_pacer02.c => edt_pacer_vlan.c} (100%) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer_vlan.c similarity index 100% rename from traffic-pacing-edt/edt_pacer02.c rename to traffic-pacing-edt/edt_pacer_vlan.c From 5aab70b25dc03571a6376920122addbc23345c50 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 15:31:08 +0100 Subject: [PATCH 49/61] traffic-pacing-edt: Adjust after file rename Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 2 +- traffic-pacing-edt/bpf_egress_loader.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index 73c1306..4190dfe 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -2,7 +2,7 @@ USER_TARGETS := BPF_TARGETS := edt_pacer01 -BPF_TARGETS += edt_pacer02 +BPF_TARGETS += edt_pacer_vlan EXTRA_DEPS += config.mk diff --git a/traffic-pacing-edt/bpf_egress_loader.sh b/traffic-pacing-edt/bpf_egress_loader.sh index 934117d..efaf597 100755 --- a/traffic-pacing-edt/bpf_egress_loader.sh +++ b/traffic-pacing-edt/bpf_egress_loader.sh @@ -16,7 +16,7 @@ export TC=tc # This can be changed via --file or --obj if [[ -z ${BPF_OBJ} ]]; then # Fallback default - BPF_OBJ=edt_pacer02.o + BPF_OBJ=edt_pacer_vlan.o fi info "Applying TC-BPF egress setup on device: $DEV with object file: $BPF_OBJ" From e7401bb5004fa14af00d48d9a8bf9239f4f4fd17 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 15:36:54 +0100 Subject: [PATCH 50/61] traffic-pacing-edt: Remove test program edt_pacer01.c Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 3 +-- traffic-pacing-edt/edt_pacer01.c | 40 -------------------------------- 2 files changed, 1 insertion(+), 42 deletions(-) delete mode 100644 traffic-pacing-edt/edt_pacer01.c diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index 4190dfe..09cdd24 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) USER_TARGETS := -BPF_TARGETS := edt_pacer01 -BPF_TARGETS += edt_pacer_vlan +BPF_TARGETS := edt_pacer_vlan EXTRA_DEPS += config.mk diff --git a/traffic-pacing-edt/edt_pacer01.c b/traffic-pacing-edt/edt_pacer01.c deleted file mode 100644 index 044158f..0000000 --- a/traffic-pacing-edt/edt_pacer01.c +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include -#include "iproute2_compat.h" - -char _license[] SEC("license") = "GPL"; - -/* The tc tool (iproute2) use another ELF map layout than libbpf (struct - * bpf_map_def), see struct bpf_elf_map from iproute2. - */ -struct bpf_elf_map SEC("maps") cnt_map = { - .type = BPF_MAP_TYPE_ARRAY, - .size_key = sizeof(__u32), - .size_value = sizeof(__u64), - .max_elem = 1, - //.pinning = PIN_GLOBAL_NS, -}; - -SEC("classifier") int tc_dummy(struct __sk_buff *skb) -{ - volatile void *data, *data_end; - int ret = BPF_OK; - struct ethhdr *eth; - - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - eth = (struct ethhdr *)data; - - if (data + sizeof(*eth) > data_end) - return BPF_DROP; - - /* Keep ARP resolution working */ - if (eth->h_proto == bpf_htons(ETH_P_ARP)) { - ret = BPF_OK; - goto out; - } - - out: - return ret; -} From 89aeeafa0e40425d05ee5325b89458d163f9d5dc Mon Sep 17 00:00:00 2001 From: "Jesper D. 
Brouer" Date: Tue, 15 Dec 2020 16:34:26 +0100 Subject: [PATCH 51/61] Update UAPI header file bpf.h I need the struct bpf_cpumap_val definition for the next example. Signed-off-by: Jesper D. Brouer --- headers/linux/bpf.h | 818 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 689 insertions(+), 129 deletions(-) diff --git a/headers/linux/bpf.h b/headers/linux/bpf.h index b9ed9f1..1bc3738 100644 --- a/headers/linux/bpf.h +++ b/headers/linux/bpf.h @@ -81,6 +81,12 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type */ }; +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -117,6 +123,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -189,6 +196,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, }; enum bpf_attach_type { @@ -226,6 +234,10 @@ enum bpf_attach_type { BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -238,6 +250,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -603,9 +616,14 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ @@ -618,6 +636,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; @@ -653,7 +675,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +693,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +717,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. 
It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -745,7 +767,7 @@ union bpf_attr { * * Also, note that **bpf_trace_printk**\ () is slow, and should * only be used for debugging purposes. For this reason, a notice - * bloc (spanning several lines) is printed to kernel logs and + * block (spanning several lines) is printed to kernel logs and * states that the helper should not be used "for production use" * the first time this helper is used (or more precisely, when * **trace_printk**\ () buffers are allocated). For passing values @@ -775,7 +797,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +814,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +839,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the @@ -849,7 +871,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +902,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +938,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +975,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +991,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * @@ -981,7 +1003,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1011,14 +1033,14 @@ union bpf_attr { * * int ret; * struct bpf_tunnel_key key = {}; - * + * * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); * if (ret < 0) * return TC_ACT_SHOT; // drop packet - * + * * if (key.remote_ipv4 != 0x0a000001) * return TC_ACT_SHOT; // drop packet - * + * * return TC_ACT_OK; // accept packet * * This interface can also be used with all encapsulation devices @@ -1032,7 +1054,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1120,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1125,7 +1147,7 @@ union bpf_attr { * Description * Retrieve the realm or the route, that is to say the * **tclassid** field of the destination for the *skb*. The - * indentifier retrieved is a user-provided tag, similar to the + * identifier retrieved is a user-provided tag, similar to the * one used with the net_cls cgroup (see description for * **bpf_get_cgroup_classid**\ () helper), but here this tag is * held by a route (a destination entry), not by a task. @@ -1145,7 +1167,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1212,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1229,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. 
To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1298,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1316,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1326,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1353,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1380,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1411,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1430,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run is the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1442,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1466,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes @@ -1500,7 +1522,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1533,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1554,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1569,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1617,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1621,20 +1643,30 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. 
* + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -1669,7 +1701,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1690,7 +1722,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1701,7 +1733,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1720,7 +1752,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1749,7 +1781,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1799,7 +1831,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1810,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1835,7 +1867,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
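 *
 *		A short sketch for **bpf_getsockopt**\ () (editor's
 *		addition, not from the kernel sources), reading the
 *		congestion control name of the current connection from
 *		a **BPF_PROG_TYPE_SOCK_OPS** program; the 16-byte
 *		buffer matches TCP_CA_NAME_MAX:
 *
 *		::
 *
 *			SEC("sockops")
 *			int log_cong(struct bpf_sock_ops *skops)
 *			{
 *				char cc[16] = {};
 *
 *				if (!bpf_getsockopt(skops, IPPROTO_TCP,
 *						    TCP_CONGESTION,
 *						    cc, sizeof(cc)))
 *					bpf_printk("cc: %s", cc);
 *				return 1;
 *			}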
* - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1860,7 +1892,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1904,7 +1936,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1918,7 +1950,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. @@ -1952,7 +1984,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1970,7 +2002,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2001,7 +2033,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2019,7 +2051,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2033,7 +2065,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. @@ -2049,7 +2081,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
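 *
 *		A minimal sketch for **bpf_xdp_adjust_tail**\ () above
 *		(editor's addition, not from the kernel sources),
 *		shrinking frames to an arbitrary 256-byte snap length;
 *		a negative *delta* trims the tail:
 *
 *		::
 *
 *			SEC("xdp")
 *			int xdp_snap(struct xdp_md *ctx)
 *			{
 *				int len = ctx->data_end - ctx->data;
 *
 *				if (len > 256 &&
 *				    bpf_xdp_adjust_tail(ctx, 256 - len) < 0)
 *					return XDP_ABORTED;
 *				return XDP_PASS;
 *			}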
* - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2082,7 +2114,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2104,7 +2136,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2135,7 +2167,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2154,7 +2186,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2168,7 +2200,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2182,7 +2214,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2219,7 +2251,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2234,7 +2266,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
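 *
 *		A minimal sketch of the **bpf_get_stack**\ () usage
 *		described above (editor's addition, not from the kernel
 *		sources), collecting up to 32 user-space frames from a
 *		tracing program:
 *
 *		::
 *
 *			__u64 ips[32];
 *			long len;
 *
 *			len = bpf_get_stack(ctx, ips, sizeof(ips),
 *					    BPF_F_USER_STACK);
 *			if (len < 0)
 *				return 0; // stack unavailable
 *			// len bytes of instruction pointers now in ips[]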
* - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2250,7 +2282,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2279,7 +2311,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2298,7 +2330,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2363,7 +2395,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2408,7 +2440,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2445,7 +2477,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2464,7 +2496,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2472,7 +2504,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2482,19 +2514,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
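 *
 *		A sketch of the lookup/release contract for
 *		**bpf_sk_lookup_tcp**\ () and **bpf_sk_release**\ ()
 *		above (editor's addition, not from the kernel sources);
 *		every non-NULL result must be released before the
 *		program exits, or the verifier rejects the program:
 *
 *		::
 *
 *			struct bpf_sock_tuple tuple = {};
 *			struct bpf_sock *sk;
 *
 *			tuple.ipv4.daddr = bpf_htonl(0x0a000001); // 10.0.0.1
 *			tuple.ipv4.dport = bpf_htons(80);
 *
 *			sk = bpf_sk_lookup_tcp(skb, &tuple,
 *					       sizeof(tuple.ipv4),
 *					       BPF_F_CURRENT_NETNS, 0);
 *			if (sk)
 *				bpf_sk_release(sk);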
* - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2510,7 +2542,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2522,7 +2554,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2536,7 +2568,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2584,7 +2616,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2607,7 +2639,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2644,7 +2676,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2659,7 +2691,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. * - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2675,7 +2707,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). 
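 *
 *		A minimal sketch of the **bpf_spin_lock**\ () pattern
 *		above (editor's addition, not from the kernel sources);
 *		the lock must be embedded in a map value, and *my_map*
 *		here is hypothetical:
 *
 *		::
 *
 *			struct val {
 *				struct bpf_spin_lock lock;
 *				__u64 counter;
 *			};
 *
 *			struct val *v = bpf_map_lookup_elem(&my_map, &key);
 *
 *			if (v) {
 *				bpf_spin_lock(&v->lock);
 *				v->counter++;
 *				bpf_spin_unlock(&v->lock);
 *			}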
* - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2694,7 +2726,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2711,7 +2743,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2728,7 +2760,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2752,7 +2784,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2803,7 +2835,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2811,7 +2843,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2852,7 +2884,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2876,21 +2908,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. 
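 *
 *		A combined sketch for **bpf_sysctl_get_new_value**\ ()
 *		and **bpf_strtol**\ () above (editor's addition, not
 *		from the kernel sources), clamping a value written to a
 *		sysctl from a **BPF_CGROUP_SYSCTL** program; returning
 *		1 allows the write, 0 rejects it:
 *
 *		::
 *
 *			SEC("cgroup/sysctl")
 *			int sysctl_clamp(struct bpf_sysctl *ctx)
 *			{
 *				char buf[16] = {};
 *				long val;
 *
 *				if (bpf_sysctl_get_new_value(ctx, buf,
 *							     sizeof(buf)) < 0)
 *					return 1; // not a write, allow
 *				if (bpf_strtol(buf, sizeof(buf), 0, &val) < 0)
 *					return 0; // unparseable, reject
 *				return val <= 4096 ? 1 : 0;
 *			}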
* - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2934,7 +2966,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2942,14 +2974,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2969,7 +3001,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2988,7 +3020,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3000,7 +3032,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3055,8 +3087,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * * Assign the *sk* to the *skb*. 
When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. @@ -3082,6 +3118,56 @@ union bpf_attr { * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. @@ -3090,7 +3176,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3119,7 +3205,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3161,16 +3247,15 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. 
- * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3182,20 +3267,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3203,16 +3288,227 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. + * + * long bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. 
Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. 
Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. 
+ * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3349,7 +3645,18 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), \ + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3426,6 +3733,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), @@ -3433,6 +3748,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { @@ -3482,6 +3798,12 @@ enum { BPF_RINGBUF_HDR_SZ = 8, }; +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -3712,6 +4034,32 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + enum sk_action { SK_DROP = 0, SK_PASS, @@ -3840,16 +4188,26 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; } netns; + struct { + __u32 ifindex; + } xdp; }; } __attribute__((aligned(8))); /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ @@ -3924,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. 
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -3932,8 +4320,51 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. @@ -3989,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. 
+ * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4016,6 +4504,63 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { @@ -4198,4 +4743,19 @@ struct bpf_pidns_info { __u32 pid; __u32 tgid; }; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ +struct bpf_sk_lookup { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __u32 remote_port; /* Network byte order */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ From e9c45d7f648f1f1be3217868ee863d8979380e37 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 15 Dec 2020 16:49:57 +0100 Subject: [PATCH 52/61] traffic-pacing-edt: start working on xdp_cpumap_qinq Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 1 + traffic-pacing-edt/xdp_cpumap_qinq.c | 60 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 traffic-pacing-edt/xdp_cpumap_qinq.c diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index 09cdd24..fdf2613 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -2,6 +2,7 @@ USER_TARGETS := BPF_TARGETS := edt_pacer_vlan +BPF_TARGETS += xdp_cpumap_qinq EXTRA_DEPS += config.mk diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c new file mode 100644 index 0000000..1fc98b0 --- /dev/null +++ b/traffic-pacing-edt/xdp_cpumap_qinq.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#include +#include /* struct bpf_cpumap_val */ +#include +#include + + +#include + +#define VLAN_MAX_DEPTH 2 +#include + +#define MAX_CPUS 24 + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct { + __uint(type, BPF_MAP_TYPE_CPUMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_cpumap_val)); + __uint(max_entries, MAX_CPUS); +} cpumap SEC(".maps"); + +SEC("xdp") +int xdp_cpumap_qinq(struct xdp_md *ctx) +{ + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; + struct collect_vlans vlans = { 0 }; + struct ethhdr *eth; + __u32 cpu_dest = 0; + __u64 action; + + /* These keep track of the next header type and iterator pointer */ + struct hdr_cursor nh; + int eth_type; + nh.pos = data; + + eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); + if (eth_type < 0) { + action = XDP_ABORTED; + goto out; + } + + /* Keep ARP resolution working */ + if (eth_type == bpf_htons(ETH_P_ARP)) { + action = XDP_PASS; + goto out; + } + + if (!proto_is_vlan(eth->h_proto)) { + /* Skip non-VLAN frames */ + action = XDP_PASS; + goto out; + } + + // WARNING: Userspace MUST insert entries into cpumap + action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS); +out: + return action; +} From c8682ec27f2f6e20db9bfdf2e4aa5673957b2b2e Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 15 Dec 2020 17:55:24 +0100 Subject: [PATCH 53/61] traffic-pacing-edt: userspace loader for xdp_cpumap_qinq Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/Makefile | 2 +- traffic-pacing-edt/xdp_cpumap_loader.c | 240 +++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 traffic-pacing-edt/xdp_cpumap_loader.c diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index fdf2613..aa6d3aa 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) -USER_TARGETS := +USER_TARGETS := xdp_cpumap_loader BPF_TARGETS := edt_pacer_vlan BPF_TARGETS += xdp_cpumap_qinq diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c new file mode 100644 index 0000000..8196bac --- /dev/null +++ b/traffic-pacing-edt/xdp_cpumap_loader.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0+ +static const char *__doc__ = + " XDP load-balancing with CPU-map"; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include /* XDP defines */ + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 +#define EXIT_FAIL_FILE 6 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"qsize", required_argument, NULL, 'q' }, + {"force", no_argument, NULL, 'F' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +static int create_cpu_entry(int cpumap_fd, __u32 cpu, + struct bpf_cpumap_val *value) +{ + int err; + + /* Add a CPU entry to cpumap, as this allocate a cpu entry in + * the kernel for the cpu. 
+	 */
+	err = bpf_map_update_elem(cpumap_fd, &cpu, value, 0);
+	if (err) {
+		fprintf(stderr, "Create CPU entry failed (err:%d)\n", err);
+		exit(EXIT_FAIL_BPF);
+	}
+
+	return 0;
+}
+
+/* Userspace MUST create/populate CPUMAP entries for redirect to work
+ */
+static void enable_all_cpus(int cpumap_fd, __u32 qsize)
+{
+	struct bpf_cpumap_val value = { 0 };
+	int n_cpus = get_nprocs_conf();
+	int i;
+
+	value.qsize = qsize;
+
+	for (i = 0; i < n_cpus; i++) {
+		printf("Enable CPU:%d\n", i);
+		create_cpu_entry(cpumap_fd, i, &value);
+	}
+}
+
+struct bpf_object *do_load_bpf_obj(struct bpf_object *obj)
+{
+	char buf[200];
+	int err;
+
+	err = bpf_object__load(obj);
+	if (err) {
+		libbpf_strerror(err, buf, sizeof(buf));
+		printf("Error loading: %s\n", buf);
+		return NULL;
+	}
+	return obj;
+}
+
+int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags)
+{
+	int prog_fd = bpf_program__fd(prog);
+	int err;
+
+	if (prog_fd < 0) {
+		fprintf(stderr, "bpf_program__fd failed\n");
+		return EXIT_FAIL_BPF;
+	}
+
+	err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
+	if (err) {
+		fprintf(stderr, "link set xdp fd failed (err:%d)\n", err);
+		return EXIT_FAIL_XDP;
+	}
+	return EXIT_OK;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	int opt, longindex = 0;
+	__u32 cfg_qsize = 512;
+	char buf[100];
+	int err;
+
+	struct bpf_object *obj = NULL;
+	struct bpf_program *prog;
+	int cpumap_fd = -1;
+
+	int n_cpus = get_nprocs_conf();
+
+	obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL);
+	err = libbpf_get_error(obj);
+	if (err) {
+		libbpf_strerror(err, buf, sizeof(buf));
+		printf("Error opening file: %s\n", buf);
+		return EXIT_FAIL_FILE;
+	}
+	err = EXIT_OK;
+
+	/* Parse command line args */
+	while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'd':
+			if (strlen(optarg) >= IF_NAMESIZE) {
+				fprintf(stderr, "ERR: --dev name too long\n");
+				goto error;
+			}
+			ifname = (char *)&ifname_buf;
+			strncpy(ifname, optarg, IF_NAMESIZE);
+			ifindex = if_nametoindex(ifname);
+			if (ifindex == 0) {
+				fprintf(stderr,
+					"ERR: --dev name unknown err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			break;
+		case 'q':
+			cfg_qsize = strtol(optarg, NULL, 10);
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		case 'h':
+		error:
+		default:
+			usage(argv);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	/* Required option */
+	if (ifindex == -1) {
+		fprintf(stderr, "ERR: required option --dev missing\n");
+		usage(argv);
+		return EXIT_FAIL_OPTION;
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return EXIT_FAIL_MEM;
+	}
+
+	/* Always use XDP native driver mode */
+	xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+	obj = do_load_bpf_obj(obj);
+	if (!obj)
+		return EXIT_FAIL_BPF;
+
+	/* Pick up first BPF-program */
+	prog = bpf_program__next(NULL, obj);
+	if (!prog) {
+		printf("No program!\n");
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
+
+	/* Get file descriptor to BPF-map */
+	cpumap_fd = bpf_object__find_map_fd_by_name(obj, "cpumap");
+	if (cpumap_fd < 0) {
+		printf("No cpumap found!\n");
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
+	/* Configure cpumap */
+	enable_all_cpus(cpumap_fd, cfg_qsize);
+
+	/* Attach XDP program */
+	err = do_xdp_attach(ifindex, prog, xdp_flags);
+	if (err)
+		goto out;
+
+	printf("Attached XDP program:\"%s\" on netdev:%s (ifindex:%d)\n",
+	       bpf_program__name(prog), ifname, ifindex);
+	printf("CPUs: %d\n", n_cpus);
+
+out:
+	if (obj)
+		bpf_object__close(obj);
+
+	return err;
+}

From b3ebc2c18ce6ce1988bd5cc9a5fca458aec76032 Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Tue, 15 Dec 2020 19:34:03 +0100
Subject: [PATCH 54/61] traffic-pacing-edt: implement option for --remove

Need a quick way to remove the XDP program again before testing on
production.

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_loader.c | 29 +++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c
index 8196bac..fb6f08b 100644
--- a/traffic-pacing-edt/xdp_cpumap_loader.c
+++ b/traffic-pacing-edt/xdp_cpumap_loader.c
@@ -42,6 +42,7 @@ static const struct option long_options[] = {
 	{"dev",		required_argument,	NULL, 'd' },
 	{"qsize",	required_argument,	NULL, 'q' },
 	{"force",	no_argument,		NULL, 'F' },
+	{"remove",	no_argument,		NULL, 'r' },
 	{0, 0, NULL, 0 }
 };
 
@@ -125,7 +126,21 @@ int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags)
 
 	err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
 	if (err) {
-		fprintf(stderr, "link set xdp fd failed (err:%d)\n", err);
+		fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
+			__func__, err);
+		return EXIT_FAIL_XDP;
+	}
+	return EXIT_OK;
+}
+
+int do_xdp_detach(int ifindex, __u32 xdp_flags)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+	if (err) {
+		fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
+			__func__, err);
 		return EXIT_FAIL_XDP;
 	}
 	return EXIT_OK;
@@ -134,6 +149,7 @@
 int main(int argc, char **argv)
 {
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	bool do_detach = false;
 	int opt, longindex = 0;
 	__u32 cfg_qsize = 512;
 	char buf[100];
@@ -145,6 +161,9 @@ int main(int argc, char **argv)
 
 	int n_cpus = get_nprocs_conf();
 
+	/* Always use XDP native driver mode */
+	xdp_flags |= XDP_FLAGS_DRV_MODE;
+
 	obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL);
 	err = libbpf_get_error(obj);
 	if (err) {
@@ -179,6 +198,9 @@ int main(int argc, char **argv)
 		case 'F':
 			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
 			break;
+		case 'r':
+			do_detach = true;
+			break;
 		case 'h':
 		error:
 		default:
@@ -193,13 +215,14 @@ int main(int argc, char **argv)
 		return EXIT_FAIL_OPTION;
 	}
 
+	if (do_detach)
+		return do_xdp_detach(ifindex, xdp_flags);
+
 	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
 		perror("setrlimit(RLIMIT_MEMLOCK)");
 		return EXIT_FAIL_MEM;
 	}
 
-	/* Always use XDP native driver mode */
-	xdp_flags |= XDP_FLAGS_DRV_MODE;
 
 	obj = do_load_bpf_obj(obj);
 	if (!obj)
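Usage note: detach is implemented as attaching prog fd -1 with the same xdp_flags, which is also why the XDP_FLAGS_DRV_MODE assignment moves above the option parsing here; the early-return remove path needs the flag set too. A quick round-trip on a test box then looks like this (interface name is a placeholder):

	sudo ./xdp_cpumap_loader --dev ens6f1           # attach + configure cpumap
	sudo ./xdp_cpumap_loader --dev ens6f1 --remove  # detach again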
From e8ae6a92870a805d6008b4281c1cb345f1da1f2b Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 16:40:15 +0100
Subject: [PATCH 55/61] traffic-pacing-edt: implement spread across CPUs

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_loader.c |  2 +-
 traffic-pacing-edt/xdp_cpumap_qinq.c   | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c
index fb6f08b..3a954b9 100644
--- a/traffic-pacing-edt/xdp_cpumap_loader.c
+++ b/traffic-pacing-edt/xdp_cpumap_loader.c
@@ -78,7 +78,7 @@ static int create_cpu_entry(int cpumap_fd, __u32 cpu,
 	err = bpf_map_update_elem(cpumap_fd, &cpu, value, 0);
 	if (err) {
 		fprintf(stderr, "Create CPU entry failed (err:%d)\n", err);
-		exit(EXIT_FAIL_BPF);
+		return EXIT_FAIL_BPF;
 	}
 
 	return 0;
diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 1fc98b0..5803c95 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -12,6 +12,9 @@
 
 #define MAX_CPUS 24
 
+/* This global variable limits which CPUs can be selected */
+__u32 global_max_cpus = 12; /* TODO: Allow userspace to adjust this */
+
 /* Special map type that can XDP_REDIRECT frames to another CPU */
 struct {
 	__uint(type, BPF_MAP_TYPE_CPUMAP);
@@ -20,6 +23,22 @@ struct {
 	__uint(max_entries, MAX_CPUS);
 } cpumap SEC(".maps");
 
+static __always_inline
+__u16 extract_vlan_key(struct collect_vlans *vlans)
+{
+	__u16 vlan_key = 0;
+
+	if (vlans->id[1]) {
+		/* Inner Q-in-Q VLAN present; use that as key */
+		vlan_key = vlans->id[1];
+	} else {
+		/* If only one VLAN tag, use it as key */
+		vlan_key = vlans->id[0];
+	}
+
+	return vlan_key;
+}
+
 SEC("xdp")
 int xdp_cpumap_qinq(struct xdp_md *ctx)
 {
@@ -53,7 +72,10 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 		goto out;
 	}
 
-	// WARNING: Userspace MUST insert entries into cpumap
+	/* Use inner VLAN as key and hash based on max_cpus */
+	cpu_dest = extract_vlan_key(&vlans) % global_max_cpus;
+
+	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
 out:
 	return action;
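A worked example of the spreading above, with global_max_cpus = 12: VLAN 16 lands on CPU 4 (16 % 12), VLAN 17 on CPU 5, while VLAN 28 collides with VLAN 16 (28 % 12 == 4). Plain modulo keeps each VLAN pinned to a stable CPU, but any stride pattern in how VLAN IDs happen to be allocated repeats in the CPU assignment, and IDs exactly 12 apart always pile onto the same CPU. The next patch therefore mixes the key through a hash function before taking the modulo.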
From 74a47be6973ba3bb69ea19851d1bed5fe5d9a929 Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 20:42:37 +0100
Subject: [PATCH 56/61] traffic-pacing-edt: Add "SuperFastHash" based on Paul Hsieh design

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/hash_func01.h | 55 ++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 traffic-pacing-edt/hash_func01.h

diff --git a/traffic-pacing-edt/hash_func01.h b/traffic-pacing-edt/hash_func01.h
new file mode 100644
index 0000000..3825581
--- /dev/null
+++ b/traffic-pacing-edt/hash_func01.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1
+ *
+ * Based on Paul Hsieh's (LGPL 2.1) hash function
+ * From: http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+#define get16bits(d) (*((const __u16 *) (d)))
+
+static __always_inline
+__u32 SuperFastHash (const char *data, int len, __u32 initval) {
+	__u32 hash = initval;
+	__u32 tmp;
+	int rem;
+
+	if (len <= 0 || data == NULL) return 0;
+
+	rem = len & 3;
+	len >>= 2;
+
+	/* Main loop */
+#pragma clang loop unroll(full)
+	for (;len > 0; len--) {
+		hash += get16bits (data);
+		tmp   = (get16bits (data+2) << 11) ^ hash;
+		hash  = (hash << 16) ^ tmp;
+		data += 2*sizeof (__u16);
+		hash += hash >> 11;
+	}
+
+	/* Handle end cases */
+	switch (rem) {
+	case 3: hash += get16bits (data);
+		hash ^= hash << 16;
+		hash ^= ((signed char)data[sizeof (__u16)]) << 18;
+		hash += hash >> 11;
+		break;
+	case 2: hash += get16bits (data);
+		hash ^= hash << 11;
+		hash += hash >> 17;
+		break;
+	case 1: hash += (signed char)*data;
+		hash ^= hash << 10;
+		hash += hash >> 1;
+	}
+
+	/* Force "avalanching" of final 127 bits */
+	hash ^= hash << 3;
+	hash += hash >> 5;
+	hash ^= hash << 4;
+	hash += hash >> 17;
+	hash ^= hash << 25;
+	hash += hash >> 6;
+
+	return hash;
+}
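Since the hash quality directly decides how evenly VLANs spread over CPUs, the distribution is worth sanity-checking offline. A small userspace harness along these lines (not part of the patch set; the typedefs stand in for the kernel types so hash_func01.h compiles unmodified, the initval is the prime the next patch selects, and clang should be used so the unroll pragma is honoured while gcc merely warns):

	/* hash_check.c - count how the 4094 possible VLAN IDs spread
	 * over 12 CPUs under SuperFastHash + modulo. */
	#include <stdio.h>

	typedef unsigned short __u16;
	typedef unsigned int __u32;
	#define __always_inline inline

	#include "hash_func01.h"

	int main(void)
	{
		int buckets[12] = { 0 };
		__u16 vlan;
		int i;

		for (vlan = 1; vlan < 4095; vlan++)
			buckets[SuperFastHash((char *)&vlan, sizeof(vlan),
					      15485863) % 12]++;

		for (i = 0; i < 12; i++)
			printf("CPU:%2d gets %d VLANs\n", i, buckets[i]);
		return 0;
	}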
From a16ab11e70cb8c1fb71c44cc4faffd4ce6288b8d Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 21:09:28 +0100
Subject: [PATCH 57/61] traffic-pacing-edt: Use hash function to calc cpu_dest

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_qinq.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 5803c95..3ecc623 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -4,6 +4,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#define INITVAL 15485863
+#include "hash_func01.h" /* SuperFastHash */
 
 #include <linux/if_ether.h>
 
@@ -24,18 +26,10 @@ struct {
 	__uint(max_entries, MAX_CPUS);
 } cpumap SEC(".maps");
 
 static __always_inline
-__u16 extract_vlan_key(struct collect_vlans *vlans)
+__u32 extract_vlan_key(struct collect_vlans *vlans)
 {
-	__u16 vlan_key = 0;
-
-	if (vlans->id[1]) {
-		/* Inner Q-in-Q VLAN present; use that as key */
-		vlan_key = vlans->id[1];
-	} else {
-		/* If only one VLAN tag, use it as key */
-		vlan_key = vlans->id[0];
-	}
-
+	/* Combine inner and outer VLAN as a key */
+	__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
 	return vlan_key;
 }
@@ -45,6 +39,7 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 	void *data     = (void *)(long)ctx->data;
 	void *data_end = (void *)(long)ctx->data_end;
 	struct collect_vlans vlans = { 0 };
+	__u32 hash_key, vlan_key;
 	struct ethhdr *eth;
 	__u32 cpu_dest = 0;
 	__u64 action;
@@ -72,8 +67,10 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 		goto out;
 	}
 
-	/* Use inner VLAN as key and hash based on max_cpus */
-	cpu_dest = extract_vlan_key(&vlans) % global_max_cpus;
+	/* Use inner+outer VLAN as key and hash based on max_cpus */
+	vlan_key = extract_vlan_key(&vlans);
+	hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
+	cpu_dest = hash_key % global_max_cpus;
 
 	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);

From 3b6a0c0aa969e247abc80834f68c89343024a6e6 Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 21:24:14 +0100
Subject: [PATCH 58/61] traffic-pacing-edt: Exclude CPU-6 in the code

On the production setup, the i40e driver sends all packets to CPU-6
(RX). Thus, we want to exclude CPU-6 from also processing/pacing the
packets.

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_qinq.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 3ecc623..63c8138 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -72,6 +72,10 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 	hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
 	cpu_dest = hash_key % global_max_cpus;
 
+	/* TODO: Find more generic way to exclude CPU-6 */
+	if (cpu_dest == 6)
+		cpu_dest = 11;
+
 	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
 out:
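A caveat worth recording here: statically remapping every hash bucket that lands on CPU-6 over to CPU-11 means CPU-11 now serves two of the twelve buckets, i.e. roughly double its fair share of VLANs. A generic exclusion instead hashes into the number of *enabled* CPUs and translates that index to a real CPU id through a compacted table. A sketch of the idea with illustrative names (patch 60 below implements this properly via BPF maps):

	/* Illustrative only: exclude CPU-6 on a 12-CPU box by hashing
	 * into the 11 enabled CPUs, then translating index -> CPU id. */
	static const __u32 cpus_enabled[] = { 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11 };

	static __u32 pick_cpu(__u32 hash_key)
	{
		return cpus_enabled[hash_key % 11];
	}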
From 47e5cb1c391ccea12cb97c9fa69d73deb19604aa Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Fri, 18 Dec 2020 20:13:55 +0100
Subject: [PATCH 59/61] traffic-pacing-edt: playing with hash initval

It didn't help; kept the original value.

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_qinq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 63c8138..f38d61b 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -5,6 +5,8 @@
 #include <bpf/bpf_endian.h>
 
 #define INITVAL 15485863
+//#define INITVAL 2654435761
+
 #include "hash_func01.h" /* SuperFastHash */
 
 #include <linux/if_ether.h>
@@ -29,7 +31,7 @@ static __always_inline
 __u32 extract_vlan_key(struct collect_vlans *vlans)
 {
 	/* Combine inner and outer VLAN as a key */
-	__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
+	__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
 	return vlan_key;
 }

From 39ab41d0d61b10928a2eae42f8fb6f19bbcf6a5d Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Fri, 18 Dec 2020 22:09:24 +0100
Subject: [PATCH 60/61] Add CPU mapping layer to allow excluding some CPUs

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_loader.c | 172 +++++++++++++++++++++----
 traffic-pacing-edt/xdp_cpumap_qinq.c   |  36 +++++-
 2 files changed, 177 insertions(+), 31 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c
index 3a954b9..0afd46a 100644
--- a/traffic-pacing-edt/xdp_cpumap_loader.c
+++ b/traffic-pacing-edt/xdp_cpumap_loader.c
@@ -43,6 +43,8 @@ static const struct option long_options[] = {
 	{"qsize",	required_argument,	NULL, 'q' },
 	{"force",	no_argument,		NULL, 'F' },
 	{"remove",	no_argument,		NULL, 'r' },
+	{"non-cpu",	required_argument,	NULL, 'x' },
+	{"exclude-cpu",	required_argument,	NULL, 'x' },
 	{0, 0, NULL, 0 }
 };
 
@@ -67,37 +69,130 @@ static void usage(char *argv[])
 	printf("\n");
 }
 
-static int create_cpu_entry(int cpumap_fd, __u32 cpu,
-			    struct bpf_cpumap_val *value)
+struct cpumap_config {
+	int fd_cpumap;
+	int fd_cpus_enabled;
+	int fd_cpus_count;
+	int *cpu_exclude;
+	int max_cpus;
+	__u32 qsize;
+};
+
+static int cpumap_config_init(struct cpumap_config *cfg)
 {
-	int err;
+	int n_cpus = get_nprocs_conf();
+	int *cpu_exclude;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	cpu_exclude = malloc(n_cpus * sizeof(int));
+	if (!cpu_exclude) {
+		fprintf(stderr, "failed to allocate array\n");
+		return EXIT_FAIL_MEM;
+	}
+	memset(cpu_exclude, 0, n_cpus * sizeof(int));
+
+	cfg->cpu_exclude = cpu_exclude;
+	cfg->max_cpus = n_cpus;
+	return 0;
+}
+
+int __find_map_fd_by_name(struct bpf_object *obj, char *name)
+{
+	int fd;
+
+	fd = bpf_object__find_map_fd_by_name(obj, name);
+	if (fd < 0) {
+		printf("No map found! - named: %s\n", name);
+		exit(EXIT_FAIL_BPF);
+	}
+	return fd;
+}
+
+/* Get file descriptors to BPF-maps */
+static int cpumap_config_find_maps(struct bpf_object *obj,
+				   struct cpumap_config *cfg)
+{
+	cfg->fd_cpumap = __find_map_fd_by_name(obj, "cpumap");
+	cfg->fd_cpus_enabled = __find_map_fd_by_name(obj, "cpus_enabled");
+	cfg->fd_cpus_count = __find_map_fd_by_name(obj, "cpus_count");
+	return 0;
+}
+
+static int create_cpu_entry(struct cpumap_config *cfg, __u32 cpu,
+			    struct bpf_cpumap_val *value,
+			    __u32 enabled_idx, bool new)
+{
+	__u32 curr_cpus_count = 0;
+	__u32 key = 0;
+	int err, fd;
 
 	/* Add a CPU entry to cpumap, as this allocates a cpu entry in
 	 * the kernel for the cpu.
 	 */
-	err = bpf_map_update_elem(cpumap_fd, &cpu, value, 0);
+	fd = cfg->fd_cpumap;
+	err = bpf_map_update_elem(fd, &cpu, value, 0);
 	if (err) {
-		fprintf(stderr, "Create CPU entry failed (err:%d)\n", err);
+		fprintf(stderr, "Create(fd:%d) CPU(%d) entry failed (err:%d)\n",
+			fd, cpu, err);
 		return EXIT_FAIL_BPF;
 	}
 
+	/* Inform the bpf_prog that a new CPU is enabled and available
+	 * to be selected from the map, which maps index to actual CPU.
+	 */
+	fd = cfg->fd_cpus_enabled;
+	err = bpf_map_update_elem(fd, &enabled_idx, &cpu, 0);
+	if (err) {
+		fprintf(stderr, "Add to enabled avail CPUs failed\n");
+		return EXIT_FAIL_BPF;
+	}
+
+	/* When not replacing/updating existing entry, bump the count */
+	fd = cfg->fd_cpus_count;
+	err = bpf_map_lookup_elem(fd, &key, &curr_cpus_count);
+	if (err) {
+		fprintf(stderr, "Failed reading curr cpus_count\n");
+		return EXIT_FAIL_BPF;
+	}
+	if (new) {
+		curr_cpus_count++;
+		err = bpf_map_update_elem(fd, &key, &curr_cpus_count, 0);
+		if (err) {
+			fprintf(stderr, "Failed write curr cpus_count\n");
+			return EXIT_FAIL_BPF;
+		}
+	}
+
 	return 0;
 }
 
 /* Userspace MUST create/populate CPUMAP entries for redirect to work
  */
-static void enable_all_cpus(int cpumap_fd, __u32 qsize)
+static int configure_cpus(struct cpumap_config *cfg)
 {
 	struct bpf_cpumap_val value = { 0 };
-	int n_cpus = get_nprocs_conf();
-	int i;
+	int n_cpus = cfg->max_cpus;
+	int *exclude = cfg->cpu_exclude;
+	int enabled_idx = 0;
+	bool new = true;
+	int cpu, err;
 
-	value.qsize = qsize;
+	value.qsize = cfg->qsize;
 
-	for (i = 0; i < n_cpus; i++) {
-		printf("Enable CPU:%d\n", i);
-		create_cpu_entry(cpumap_fd, i, &value);
+	for (cpu = 0; cpu < n_cpus; cpu++) {
+
+		if (exclude[cpu] == -1) {
+			printf("Excluding CPU:%d\n", cpu);
+			continue;
+		}
+		printf("Enable CPU:%d\n", cpu);
+		err = create_cpu_entry(cfg, cpu, &value, enabled_idx, new);
+		if (err)
+			return err;
+		enabled_idx++;
 	}
+	return 0;
 }
 
 struct bpf_object *do_load_bpf_obj(struct bpf_object *obj)
@@ -151,15 +246,21 @@ int main(int argc, char **argv)
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
 	bool do_detach = false;
 	int opt, longindex = 0;
-	__u32 cfg_qsize = 512;
 	char buf[100];
 	int err;
 
 	struct bpf_object *obj = NULL;
 	struct bpf_program *prog;
-	int cpumap_fd = -1;
 
+	/* System to setup and exclude some CPUs */
+	struct cpumap_config cfg;
 	int n_cpus = get_nprocs_conf();
+	int non_cpu = -1;
+	int *cpu_exclude;
+
+	cpumap_config_init(&cfg);
+	cpu_exclude = cfg.cpu_exclude;
+	cfg.qsize = 512; /* Default queue size */
 
 	/* Always use XDP native driver mode */
 	xdp_flags |= XDP_FLAGS_DRV_MODE;
@@ -174,7 +275,7 @@ int main(int argc, char **argv)
 	err = EXIT_OK;
 
 	/* Parse command line args */
-	while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:",
+	while ((opt = getopt_long(argc, argv, "hd:q:Frx:",
 				  long_options, &longindex)) != -1) {
 		switch (opt) {
 		case 'd':
@@ -193,7 +294,7 @@ int main(int argc, char **argv)
 			}
 			break;
 		case 'q':
-			cfg_qsize = strtol(optarg, NULL, 10);
+			cfg.qsize = strtol(optarg, NULL, 10);
 			break;
 		case 'F':
 			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
@@ -201,10 +302,23 @@ int main(int argc, char **argv)
 		case 'r':
 			do_detach = true;
 			break;
+		case 'x': /* --exclude-cpu or --non-cpu */
+			/* Possible to exclude multiple CPUs on cmdline */
+			non_cpu = strtoul(optarg, NULL, 0);
+			if (non_cpu >= n_cpus) {
+				fprintf(stderr,
					"--cpu nr too large for cpumap err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			cpu_exclude[non_cpu] = -1;
+			break;
+
 		case 'h':
 		error:
 		default:
 			usage(argv);
+			free(cpu_exclude);
 			return EXIT_FAIL_OPTION;
 		}
 	}
@@ -212,7 +326,8 @@ int main(int argc, char **argv)
 	if (ifindex == -1) {
 		fprintf(stderr, "ERR: required option --dev missing\n");
 		usage(argv);
-		return EXIT_FAIL_OPTION;
+		err = EXIT_FAIL_OPTION;
+		goto out;
 	}
 
 	if (do_detach)
@@ -220,13 +335,15 @@ int main(int argc, char **argv)
 
 	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
 		perror("setrlimit(RLIMIT_MEMLOCK)");
-		return EXIT_FAIL_MEM;
+		err = EXIT_FAIL_MEM;
+		goto out;
 	}
 
-
 	obj = do_load_bpf_obj(obj);
-	if (!obj)
-		return EXIT_FAIL_BPF;
+	if (!obj) {
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
 
 	/* Pick up first BPF-program */
 	prog = bpf_program__next(NULL, obj);
@@ -236,15 +353,17 @@ int main(int argc, char **argv)
 		goto out;
 	}
 
-	/* Get file descriptor to BPF-map */
-	cpumap_fd = bpf_object__find_map_fd_by_name(obj, "cpumap");
-	if (cpumap_fd < 0) {
-		printf("No cpumap found!\n");
+	/* Find BPF maps */
+	if (cpumap_config_find_maps(obj, &cfg)) {
 		err = EXIT_FAIL_BPF;
 		goto out;
 	}
+
 	/* Configure cpumap */
-	enable_all_cpus(cpumap_fd, cfg_qsize);
+	if (configure_cpus(&cfg)) {
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
 
 	/* Attach XDP program */
 	err = do_xdp_attach(ifindex, prog, xdp_flags);
@@ -259,5 +378,6 @@ out:
 	if (obj)
 		bpf_object__close(obj);
 
+	free(cpu_exclude);
 	return err;
 }
diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index f38d61b..eada801 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -27,6 +27,20 @@ struct {
 	__uint(max_entries, MAX_CPUS);
 } cpumap SEC(".maps");
 
+/* Mapping table with enabled CPUs, for hashing between them */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, MAX_CPUS);
+} cpus_enabled SEC(".maps");
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, 1);
+} cpus_count SEC(".maps");
+
 static __always_inline
 __u32 extract_vlan_key(struct collect_vlans *vlans)
 {
@@ -43,8 +57,11 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 	struct collect_vlans vlans = { 0 };
 	__u32 hash_key, vlan_key;
 	struct ethhdr *eth;
-	__u32 cpu_dest = 0;
+	__u32 cpu_idx, cpu_dest = 0;
+	__u32 *cpu_lookup;
 	__u64 action;
+	__u32 *cpu_max;
+
 
 	/* These keep track of the next header type and iterator pointer */
 	struct hdr_cursor nh;
@@ -69,14 +86,23 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 		goto out;
 	}
 
+	int key0 = 0;
+	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
+	if (!cpu_max)
+		return XDP_ABORTED;
+
 	/* Use inner+outer VLAN as key and hash based on max_cpus */
 	vlan_key = extract_vlan_key(&vlans);
 	hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
-	cpu_dest = hash_key % global_max_cpus;
+	cpu_idx = hash_key % *cpu_max;
 
-	/* TODO: Find more generic way to exclude CPU-6 */
-	if (cpu_dest == 6)
-		cpu_dest = 11;
+	/* To allow excluding some CPUs, a mapping table cpus_enabled
+	 * translates cpu_idx to real CPU-id
+	 */
+	cpu_lookup = bpf_map_lookup_elem(&cpus_enabled, &cpu_idx);
+	if (!cpu_lookup)
+		return XDP_ABORTED;
+	cpu_dest = *cpu_lookup;
 
 	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
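With the mapping layer in place, exclusion becomes a pure userspace decision. On the 12-CPU production box discussed earlier, excluding the i40e RX CPU would look like this (device name is a placeholder):

	sudo ./xdp_cpumap_loader --dev ens6f1 --qsize 1024 --exclude-cpu 6

The loader then writes cpus_enabled = {0,1,2,3,4,5,7,8,9,10,11} and cpus_count = 11, and the BPF program computes cpu_idx = hash_key % 11 before translating the index to a real CPU id, so the excluded CPU's share is spread over all remaining CPUs instead of landing on a single neighbour.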
From 904c820e7ee202b58d3e8d3e120d06f0a52c094f Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Tue, 22 Dec 2020 19:16:10 +0100
Subject: [PATCH 61/61] traffic-pacing-edt: Propagate define that enables BTF maps

Two errors:
 - Wrong define written to config.mk
 - Use BPF_CFLAGS to reach the llvm compile cflags

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/traffic-pacing-edt/configure b/traffic-pacing-edt/configure
index 9b01369..248c846 100755
--- a/traffic-pacing-edt/configure
+++ b/traffic-pacing-edt/configure
@@ -15,7 +15,7 @@ check_tc_libbpf()
     if echo $tc_version | grep -q libbpf; then
         libbpf_version=${tc_version##*libbpf }
         echo "HAVE_TC_LIBBPF:=y" >> $CONFIG
-        echo "CFLAGS += -DHAVE_LIBBPF" >> $CONFIG
+        echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG
         echo "yes ($libbpf_version)"
     else
         echo "no"
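For context on what the define gates: a tc binary linked against libbpf can load BTF-defined maps, while older iproute2 releases only understand their own struct bpf_elf_map layout. The BPF programs can key off HAVE_TC_LIBBPF roughly like this sketch (map name and value type are illustrative, not taken from the patches):

	#ifdef HAVE_TC_LIBBPF
	/* Modern tc (linked with libbpf): BTF-defined map */
	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__type(key, __u32);
		__type(value, __u64);
		__uint(max_entries, 1);
	} rate_config SEC(".maps");
	#else
	/* Legacy iproute2 map layout from <iproute2/bpf_elf.h> */
	struct bpf_elf_map SEC("maps") rate_config = {
		.type       = BPF_MAP_TYPE_ARRAY,
		.size_key   = sizeof(__u32),
		.size_value = sizeof(__u64),
		.max_elem   = 1,
	};
	#endif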