From bcd60745674563717757ac26398d88ce9b86591f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 12:41:22 +0100
Subject: [PATCH 01/61] traffic-pacing-edt: Add HTB shaper script

This is primarily for comparing HTB shaper accuracy against EDT.

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_htb_shaper.sh | 84 +++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100755 traffic-pacing-edt/tc_htb_shaper.sh

diff --git a/traffic-pacing-edt/tc_htb_shaper.sh b/traffic-pacing-edt/tc_htb_shaper.sh
new file mode 100755
index 0000000..8652fb0
--- /dev/null
+++ b/traffic-pacing-edt/tc_htb_shaper.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#
+# This HTB shaper setup script makes it easier to compare HTB
+# accuracy against the EDT solution.
+#
+# Author: Jesper Dangaaard Brouer
+# License: GPLv2
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+
+root_check_run_with_sudo "$@"
+
+# Use common parameters
+source ${basedir}/parameters.sh
+
+export TC=/sbin/tc
+
+# It seems measured BW is TCP goodput, but configured BW is wirespeed.
+# Measurements show around 930Mbit best-case. Q-in-Q results in MTU
+# 1522 bytes. TCP goodput segments are 1448 bytes.
+#
+RATE=$((930*1522/1448))Mbit
+##RATE=$((933*1522/1448))Mbit
+##CEIL=$((999*1522/1448))
+#CEIL=1Gbit
+CEIL=980mbit
+
+#RATE=500mbit
+#CEIL=577mbit
+
+# Each of the HTB root-class(es) gets these RATE+CEIL upper bandwidth bounds.
+ROOT_RATE=9000Mbit
+ROOT_CEIL=9500Mbit
+
+DEFAULT_RATE=6000Mbit
+DEFAULT_CEIL=6000Mbit
+
+TC=/usr/sbin/tc
+VERBOSE=1
+
+function tc() {
+	_call_tc "" "$@"
+}
+
+# HTB shaper
+call_tc_allow_fail qdisc del dev "$DEV" root
+#tc qdisc add dev "$DEV" root handle 1: htb default 2
+tc qdisc add dev "$DEV" root handle 1: htb default 16
+
+# The root-class sets the upper bandwidth usage
+tc class add dev "$DEV" parent 1: classid 1:1 \
+   htb rate $ROOT_RATE ceil $ROOT_CEIL
+
+# Default class 1:2
+tc class add dev "$DEV" parent 1: classid 1:2 htb \
+   rate "$DEFAULT_RATE" ceil "$DEFAULT_CEIL"
+#   burst 100000 cburst 100000
+tc qdisc add dev $DEV parent 1:2 fq_codel
+
+
+# Class for vlan 16
+tc class add dev "$DEV" parent 1: classid 1:16 htb rate "$RATE" ceil "$CEIL" \
+   burst $((1522*2)) cburst $((1522*2)) \
+   linklayer ethernet
+#   burst 1522 cburst 1522
+   #burst 1 cburst 1
+#   burst $((1522*2)) cburst $((1522*2))
+#   overhead $((14+4+4)) linklayer ethernet
+#tc qdisc add dev "$DEV" parent 1:16 fq_codel
+tc qdisc add dev "$DEV" parent 1:16 fq_codel quantum $((1514+4+4))
+#tc qdisc add dev "$DEV" parent 1:16 pfifo
+
+# parent filter:
+#tc filter add dev "$DEV" parent 1:0 prio 100 protocol 802.1q u32
+#
+# vlan 16:
+#tc filter add dev "$DEV" parent 1:0 prio 100 \
+#   protocol 802.1q \
+#   u32 match u16 0x0010 0x0fff at -4 \
+#   flowid 1:16
+
+tc filter add dev $DEV protocol all parent 1:0 prio 101 \
+   basic match "meta(vlan mask 0xfff eq 16)" flowid 1:16

From aae2db44962041b6a28454d2bd7e255846dbdfaf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 13:09:39 +0100
Subject: [PATCH 02/61] traffic-pacing-edt/tc_htb_shaper.sh: Make it easy to
 remove HTB qdisc

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_htb_shaper.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/traffic-pacing-edt/tc_htb_shaper.sh b/traffic-pacing-edt/tc_htb_shaper.sh
index 8652fb0..69aedb9 100755
--- a/traffic-pacing-edt/tc_htb_shaper.sh
+++ b/traffic-pacing-edt/tc_htb_shaper.sh
@@ -43,8 +43,14 @@ function tc() {
 	_call_tc "" "$@"
 }
 
-# HTB shaper
+# Delete existing root qdisc
 call_tc_allow_fail qdisc del dev "$DEV" root
+
+if [[ -n $REMOVE ]]; then
+	exit 0
+fi
+
+# HTB shaper
 #tc qdisc add dev "$DEV" root handle 1: htb default 2
 tc qdisc add dev "$DEV" root handle 1: htb default 16
 

From a5ed0071f1368dd8b6e82a73786cf7b255b089de Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 13:43:42 +0100
Subject: [PATCH 03/61] traffic-pacing-edt: Add tc_fq_pacer.sh script for
 MQ-FQ setup

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_fq_pacer.sh | 35 +++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 traffic-pacing-edt/tc_fq_pacer.sh

diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh
new file mode 100755
index 0000000..dc32fdc
--- /dev/null
+++ b/traffic-pacing-edt/tc_fq_pacer.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Loading FQ pacing qdisc in multi-queue MQ setup to avoid root qdisc lock.
+#
+# Author: Jesper Dangaaard Brouer
+# License: GPLv2
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+
+root_check_run_with_sudo "$@"
+
+# Use common parameters
+source ${basedir}/parameters.sh
+
+export TC=/sbin/tc
+function tc() {
+	_call_tc "" "$@"
+}
+
+# Default verbose
+VERBOSE=1
+
+# Delete existing root qdisc
+call_tc_allow_fail qdisc del dev "$DEV" root
+
+# MQ (Multi-Queue) as root qdisc
+tc qdisc replace dev $DEV root handle 7FFF: mq
+
+# Add FQ-pacer qdisc on each available NIC TX-queue
+i=0
+for dir in /sys/class/net/$DEV/queues/tx-*; do
+	((i++)) || true
+	tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
+done

From 3969089c646e8f68bac7f99dddbeff8276907795 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 13:53:49 +0100
Subject: [PATCH 04/61] traffic-pacing-edt/tc_fq_pacer.sh: Add doc explaining

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_fq_pacer.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh
index dc32fdc..9682f12 100755
--- a/traffic-pacing-edt/tc_fq_pacer.sh
+++ b/traffic-pacing-edt/tc_fq_pacer.sh
@@ -2,6 +2,10 @@
 #
 # Loading FQ pacing qdisc in multi-queue MQ setup to avoid root qdisc lock.
 #
+# The FQ pacing qdisc is doing all the work of pacing packets out according to
+# the EDT (Earliest Departure Time) future timestamps set by our BPF-prog that
+# runs at the TC-egress hook.
+#
 # Author: Jesper Dangaaard Brouer
 # License: GPLv2
 #
@@ -30,6 +34,7 @@ tc qdisc replace dev $DEV root handle 7FFF: mq
 # Add FQ-pacer qdisc on each available NIC TX-queue
 i=0
 for dir in /sys/class/net/$DEV/queues/tx-*; do
+	# Deliberate off-by-one: tx-0 becomes handle 1:
 	((i++)) || true
 	tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
 done

From a6294dd946e2147edd3c3231f182e0d9027e6740 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 14:51:30 +0100
Subject: [PATCH 05/61] edt_pacer02: Use skb wire_len

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/edt_pacer02.c  | 12 +++++++++---
 traffic-pacing-edt/tc_fq_pacer.sh |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c
index 47eecc7..b90e780 100644
--- a/traffic-pacing-edt/edt_pacer02.c
+++ b/traffic-pacing-edt/edt_pacer02.c
@@ -53,9 +53,15 @@ static __always_inline int sched_departure(struct __sk_buff *skb)
 	if (!edt)
 		return BPF_DROP;
 
-	/* Calc transmission time it takes to send packet 'bytes' */
-	t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / RATE_IN_BYTES;
-	// t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / edt->rate;
+	/* Calc transmission time it takes to send packet 'bytes'.
+	 *
+	 * Details on getting precise bytes on wire. The skb->len does include
+	 * length of GRO/GSO segments, but not the segment headers that get
+	 * added on transmit. Fortunately skb->wire_len at the TC-egress hook
+	 * (not ingress) includes these headers. (See: qdisc_pkt_len_init())
+	 */
+	t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES;
+	// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate;
 
 	now = bpf_ktime_get_ns();
 

diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh
index 9682f12..dd1c8fa 100755
--- a/traffic-pacing-edt/tc_fq_pacer.sh
+++ b/traffic-pacing-edt/tc_fq_pacer.sh
@@ -37,4 +37,5 @@ for dir in /sys/class/net/$DEV/queues/tx-*; do
 	# Deliberate off-by-one: tx-0 becomes handle 1:
 	((i++)) || true
 	tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
+	# tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit
 done

From 252a40763abb5842555e496efb95de9a6ecd5492 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 14:58:47 +0100
Subject: [PATCH 06/61] traffic-pacing-edt: Adjustments to HTB script to get
 closer to EDT system

These adjustments don't help; EDT is still closer to 1Gbit/s at
wire-level.

Signed-off-by: Jesper Dangaard Brouer
---
 traffic-pacing-edt/tc_htb_shaper.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/traffic-pacing-edt/tc_htb_shaper.sh b/traffic-pacing-edt/tc_htb_shaper.sh
index 69aedb9..08e0b06 100755
--- a/traffic-pacing-edt/tc_htb_shaper.sh
+++ b/traffic-pacing-edt/tc_htb_shaper.sh
@@ -20,11 +20,16 @@ export TC=/sbin/tc
 # Measurements show around 930Mbit best-case. Q-in-Q results in MTU
 # 1522 bytes. TCP goodput segments are 1448 bytes.
 #
-RATE=$((930*1522/1448))Mbit
+#RATE=$((930*1522/1448))Mbit
 ##RATE=$((933*1522/1448))Mbit
 ##CEIL=$((999*1522/1448))
 #CEIL=1Gbit
-CEIL=980mbit
+#CEIL=980mbit
+
+# EDT shaper shows TCP goodput of 956 Mbit/s.
+# echo $((956*1514/1448)) = 999
+RATE=999Mbit
+CEIL=1000Mbit
 
 #RATE=500mbit
 #CEIL=577mbit

From 55a8513e2c9cb946c9acc6c5717d2e880ca0b48c Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Sat, 14 Nov 2020 16:51:11 +0100
Subject: [PATCH 07/61] traffic-pacing-edt: Play with edt_pacer02 drop horizon

This didn't help.
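For intuition: converting a drop horizon into bytes of standing queue at
the configured rate shows how far apart the tried values are (quick
user-space sketch, constants assumed from edt_pacer02.c):

	#include <stdio.h>

	/* Constants assumed from edt_pacer02.c */
	#define NS_PER_SEC    1000000000ULL
	#define RATE_IN_BITS  (1000ULL * 1000 * 1000)
	#define RATE_IN_BYTES (RATE_IN_BITS / 8)

	int main(void)
	{
		/* Two of the drop horizons tried above */
		unsigned long long horizon_ns[] = { 2000000000ULL, 20000000ULL };
		int i;

		for (i = 0; i < 2; i++) {
			/* Max standing time-queue the horizon allows, in bytes */
			unsigned long long bytes =
				horizon_ns[i] * RATE_IN_BYTES / NS_PER_SEC;

			printf("horizon %4llu ms => %9llu bytes of queue\n",
			       horizon_ns[i] / 1000000, bytes);
		}
		return 0;
	}

Even the 20 ms horizon still allows 2.5 MB of queue at 1 Gbit/s; the
2000 ms default allows 250 MB, which effectively never drops.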
Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index b90e780..532efa8 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -9,10 +9,12 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 /* skb->len in bytes, thus easier to keep rate in bytes */ -#define RATE_IN_BITS (1000 * 1000 * 1000) +#define RATE_IN_BITS (1000 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) -#define T_HORIZON_DROP (2000 * 1000 * 1000) +#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) +//#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) +//#define T_HORIZON_DROP (20 * 1000 * 1000ULL) /* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */ #define READ_ONCE(V) (V) From 5a3e52cf430c59f8ab5de3d42d7ab28bc5b20c37 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 16:51:56 +0100 Subject: [PATCH 08/61] traffic-pacing-edt: Make fq script respect --remove Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/tc_fq_pacer.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index dd1c8fa..92aa21d 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -28,6 +28,10 @@ VERBOSE=1 # Delete existing root qdisc call_tc_allow_fail qdisc del dev "$DEV" root +if [[ -n $REMOVE ]]; then + exit 0 +fi + # MQ (Multi-Queue) as root qdisc tc qdisc replace dev $DEV root handle 7FFF: mq From 9f97d984cbda08b17abfac69c2026ce76dcd6d54 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 17:16:03 +0100 Subject: [PATCH 09/61] traffic-pacing-edt: edt_pacer02.c add ECN marking horizon Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 532efa8..ed66ba1 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -10,12 +10,15 @@ char _license[] SEC("license") = "GPL"; /* skb->len in bytes, thus easier to keep rate in bytes */ #define RATE_IN_BITS (1000 * 1000 * 1000ULL) +//#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) #define T_HORIZON_DROP (2000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (20 * 1000 * 1000ULL) +#define T_HORIZON_ECN (5 * 1000 * 1000ULL) + /* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */ #define READ_ONCE(V) (V) #define WRITE_ONCE(X,V) (X) = (V) @@ -97,7 +100,9 @@ static __always_inline int sched_departure(struct __sk_buff *skb) if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */) return BPF_DROP; - // TODO Add ECN marking horizon + /* ECN marking horizon */ + if (t_queue_sz >= T_HORIZON_ECN) + bpf_skb_ecn_set_ce(skb); /* Advance "time queue" */ WRITE_ONCE(edt->t_last, t_next); From 1fb44832079319cd98647bcdc25f2d148431733a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 18:44:40 +0100 Subject: [PATCH 10/61] traffic-pacing-edt: tc_fq_pacer.sh adjust packet per flow_limit This was causing strange issues, where a TCP single flow could not achieve the correct bandwidth. 
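A plausible explanation (an assumption, not verified here): fq's default
flow_limit of 100 packets is smaller than what one EDT-paced flow keeps
scheduled. A back-of-envelope sketch, assuming 1514 byte frames and the
15 ms drop horizon the series settles on:

	#include <stdio.h>

	#define NS_PER_SEC   1000000000ULL
	#define RATE_BYTES   (1000ULL * 1000 * 1000 / 8)	/* 1 Gbit/s */
	#define HORIZON_NS   (15ULL * 1000 * 1000)	/* T_HORIZON_DROP */
	#define FRAME_BYTES  1514ULL

	int main(void)
	{
		/* Packets one flow can have scheduled inside the EDT horizon */
		unsigned long long frames =
			HORIZON_NS * RATE_BYTES / NS_PER_SEC / FRAME_BYTES;

		/* ~1238 frames: far beyond fq's flow_limit of 100 packets */
		printf("frames inside horizon: %llu\n", frames);
		return 0;
	}

Raising flow_limit to 1000, as this patch does, keeps fq from dropping
the pre-scheduled packets of a single flow.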
Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 1 + traffic-pacing-edt/tc_fq_pacer.sh | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index ed66ba1..83e2823 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -10,6 +10,7 @@ char _license[] SEC("license") = "GPL"; /* skb->len in bytes, thus easier to keep rate in bytes */ #define RATE_IN_BITS (1000 * 1000 * 1000ULL) +//#define RATE_IN_BITS (200 * 1000 * 1000ULL) //#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index 92aa21d..1ad9805 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -40,6 +40,12 @@ i=0 for dir in /sys/class/net/$DEV/queues/tx-*; do # Details: cause-off-by-one, as tx-0 becomes handle 1: ((i++)) || true - tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + #tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + # + # The higher 'flow_limit' is needed for high-BW pacing + tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ + flow_limit 1000 + # + # quantum $((1514*4)) initial_quantum $((1514*20)) # tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit done From 4ded8f7015f25d90cc2a7787620f6aa7f053a50e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 14 Nov 2020 19:33:42 +0100 Subject: [PATCH 11/61] traffic-pacing-edt: control latency via horizon drop When number of parallel (iperf -P N) flows increase, then the latency increase as well (measured via simple ping through router). This can be controlled via a much tigher drop horizon. Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 83e2823..779c6eb 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -14,9 +14,9 @@ char _license[] SEC("license") = "GPL"; //#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) -#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) +//#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) -//#define T_HORIZON_DROP (20 * 1000 * 1000ULL) +#define T_HORIZON_DROP (15 * 1000 * 1000ULL) #define T_HORIZON_ECN (5 * 1000 * 1000ULL) From 6ee640393be650a8ad7e2228bd2343895c197254 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:12:20 +0100 Subject: [PATCH 12/61] Update parsing_helpers.h from xdp-tutorial Signed-off-by: Jesper Dangaard Brouer --- headers/xdp/parsing_helpers.h | 93 ++++++++++++++++------------------- 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h index c29f23b..f889fb3 100644 --- a/headers/xdp/parsing_helpers.h +++ b/headers/xdp/parsing_helpers.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */ /* - * This file contains parsing functions that can be used in eXDP programs. The - * functions are marked as __always_inline, and fully defined in this header - * file to be included in the BPF program. + * This file contains parsing functions that are used in the packetXX XDP + * programs. 
The functions are marked as __always_inline, and fully defined in + * this header file to be included in the BPF program. * * Each helper parses a packet header, including doing bounds checking, and * returns the type of its contents if successful, and -1 otherwise. @@ -10,6 +10,10 @@ * For Ethernet and IP headers, the content type is the type of the payload * (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field. * All return values are in host byte order. + * + * The versions of the functions included here are slightly expanded versions of + * the functions in the packet01 lesson. For instance, the Ethernet header + * parsing has support for parsing VLAN tags. */ #ifndef __PARSING_HELPERS_H @@ -24,8 +28,6 @@ #include #include #include -#include -#include /* Header cursor to keep track of current parsing position */ struct hdr_cursor { @@ -54,14 +56,14 @@ struct icmphdr_common { /* Allow users of header file to redefine VLAN max depth */ #ifndef VLAN_MAX_DEPTH -#define VLAN_MAX_DEPTH 4 -#endif - -/* Longest chain of IPv6 extension headers to resolve */ -#ifndef IPV6_EXT_MAX_CHAIN -#define IPV6_EXT_MAX_CHAIN 6 +#define VLAN_MAX_DEPTH 2 #endif +#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ +/* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */ +struct collect_vlans { + __u16 id[VLAN_MAX_DEPTH]; +}; static __always_inline int proto_is_vlan(__u16 h_proto) { @@ -74,18 +76,24 @@ static __always_inline int proto_is_vlan(__u16 h_proto) * Ethernet header. Thus, caller can look at eth->h_proto to see if this was a * VLAN tagged packet. */ -static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, - struct ethhdr **ethhdr) +static __always_inline int parse_ethhdr_vlan(struct hdr_cursor *nh, + void *data_end, + struct ethhdr **ethhdr, + struct collect_vlans *vlans) { struct ethhdr *eth = nh->pos; + int hdrsize = sizeof(*eth); struct vlan_hdr *vlh; __u16 h_proto; int i; - if (eth + 1 > data_end) + /* Byte-count bounds check; check if current pointer + size of header + * is after data_end. 
+ */ + if (nh->pos + hdrsize > data_end) return -1; - nh->pos = eth + 1; + nh->pos += hdrsize; *ethhdr = eth; vlh = nh->pos; h_proto = eth->h_proto; @@ -102,6 +110,10 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, break; h_proto = vlh->h_vlan_encapsulated_proto; + if (vlans) /* collect VLAN ids */ + vlans->id[i] = + (bpf_ntohs(vlh->h_vlan_TCI) & VLAN_VID_MASK); + vlh++; } @@ -109,39 +121,12 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, return h_proto; /* network-byte-order */ } -static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh, - void *data_end, - __u8 next_hdr_type) +static __always_inline int parse_ethhdr(struct hdr_cursor *nh, + void *data_end, + struct ethhdr **ethhdr) { - for (int i = 0; i < IPV6_EXT_MAX_CHAIN; ++i) { - struct ipv6_opt_hdr *hdr = nh->pos; - - if (hdr + 1 > data_end) - return -1; - - switch (next_hdr_type) { - case IPPROTO_HOPOPTS: - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_MH: - nh->pos = (char *)hdr + (hdr->hdrlen + 1) * 8; - next_hdr_type = hdr->nexthdr; - break; - case IPPROTO_AH: - nh->pos = (char *)hdr + (hdr->hdrlen + 2) * 4; - next_hdr_type = hdr->nexthdr; - break; - case IPPROTO_FRAGMENT: - nh->pos = (char *)hdr + 8; - next_hdr_type = hdr->nexthdr; - break; - default: - /* Found a header that is not an IPv6 extension header */ - return next_hdr_type; - } - } - - return -1; + /* Expect compiler removes the code that collects VLAN ids */ + return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL); } static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, @@ -160,7 +145,7 @@ static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, nh->pos = ip6h + 1; *ip6hdr = ip6h; - return skip_ip6hdrext(nh, data_end, ip6h->nexthdr); + return ip6h->nexthdr; } static __always_inline int parse_iphdr(struct hdr_cursor *nh, @@ -174,6 +159,9 @@ static __always_inline int parse_iphdr(struct hdr_cursor *nh, return -1; hdrsize = iph->ihl * 4; + /* Sanity check packet field is valid */ + if(hdrsize < sizeof(iph)) + return -1; /* Variable-length IPv4 header, need to use byte-based arithmetic */ if (nh->pos + hdrsize > data_end) @@ -267,10 +255,15 @@ static __always_inline int parse_tcphdr(struct hdr_cursor *nh, return -1; len = h->doff * 4; - if ((void *) h + len > data_end) + /* Sanity check packet field is valid */ + if(len < sizeof(h)) return -1; - nh->pos = h + 1; + /* Variable-length TCP header, need to use byte-based arithmetic */ + if (nh->pos + len > data_end) + return -1; + + nh->pos += len; *tcphdr = h; return len; From 097079cde19a5b7c45804b217e854c2fa632ee92 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:17:00 +0100 Subject: [PATCH 13/61] Fix includes in parsing_helpers.h Signed-off-by: Jesper Dangaard Brouer --- headers/xdp/parsing_helpers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h index f889fb3..c9d6363 100644 --- a/headers/xdp/parsing_helpers.h +++ b/headers/xdp/parsing_helpers.h @@ -28,6 +28,8 @@ #include #include #include +#include +#include /* Header cursor to keep track of current parsing position */ struct hdr_cursor { From c0cd6aedba40535857ae5b4c6043115671ed68be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:26:12 +0100 Subject: [PATCH 14/61] parsing_helpers.h re-add IPv6 skip of extension headers This code comes from xdp-tools repo. 
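A minimal caller sketch (hypothetical find_tcp6() helper, assuming the
usual data/data_end pointers from the BPF context and the includes of
parsing_helpers.h) showing what the re-added skipping buys:

	/* Locate TCP behind any IPv6 extension headers */
	static __always_inline int find_tcp6(void *data, void *data_end)
	{
		struct hdr_cursor nh = { .pos = data };
		struct ethhdr *eth;
		struct ipv6hdr *ip6h;
		struct tcphdr *tcph;

		if (parse_ethhdr(&nh, data_end, &eth) != bpf_htons(ETH_P_IPV6))
			return -1;

		/* parse_ip6hdr() ends with skip_ip6hdrext(), so HOPOPTS,
		 * FRAGMENT etc. are stepped over and the final upper-layer
		 * protocol comes back here.
		 */
		if (parse_ip6hdr(&nh, data_end, &ip6h) != IPPROTO_TCP)
			return -1;

		return parse_tcphdr(&nh, data_end, &tcph); /* TCP hdr len or -1 */
	}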
Signed-off-by: Jesper Dangaard Brouer --- headers/xdp/parsing_helpers.h | 42 ++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/headers/xdp/parsing_helpers.h b/headers/xdp/parsing_helpers.h index c9d6363..de6705b 100644 --- a/headers/xdp/parsing_helpers.h +++ b/headers/xdp/parsing_helpers.h @@ -61,6 +61,11 @@ struct icmphdr_common { #define VLAN_MAX_DEPTH 2 #endif +/* Longest chain of IPv6 extension headers to resolve */ +#ifndef IPV6_EXT_MAX_CHAIN +#define IPV6_EXT_MAX_CHAIN 6 +#endif + #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ /* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */ struct collect_vlans { @@ -131,6 +136,41 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL); } +static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh, + void *data_end, + __u8 next_hdr_type) +{ + for (int i = 0; i < IPV6_EXT_MAX_CHAIN; ++i) { + struct ipv6_opt_hdr *hdr = nh->pos; + + if (hdr + 1 > data_end) + return -1; + + switch (next_hdr_type) { + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_MH: + nh->pos = (char *)hdr + (hdr->hdrlen + 1) * 8; + next_hdr_type = hdr->nexthdr; + break; + case IPPROTO_AH: + nh->pos = (char *)hdr + (hdr->hdrlen + 2) * 4; + next_hdr_type = hdr->nexthdr; + break; + case IPPROTO_FRAGMENT: + nh->pos = (char *)hdr + 8; + next_hdr_type = hdr->nexthdr; + break; + default: + /* Found a header that is not an IPv6 extension header */ + return next_hdr_type; + } + } + + return -1; +} + static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, void *data_end, struct ipv6hdr **ip6hdr) @@ -147,7 +187,7 @@ static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, nh->pos = ip6h + 1; *ip6hdr = ip6h; - return ip6h->nexthdr; + return skip_ip6hdrext(nh, data_end, ip6h->nexthdr); } static __always_inline int parse_iphdr(struct hdr_cursor *nh, From 9ea235637ecab158a895bbae25bb096316fe9749 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:38:15 +0100 Subject: [PATCH 15/61] Add compiler.h to define some common compiler directives This is taken from the Cilium project: https://github.com/cilium/cilium/blob/master/bpf/include/bpf/compiler.h The use-case was adding READ_ONCE and WRITE_ONCE, but via re-using the Cilium version we get a lot more useful compiler annotations. 
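The immediate use-case in this series looks like the following sketch
(field names assumed from edt_pacer02.c):

	/* edt->t_last is shared across CPUs, so access it via the
	 * volatile helpers from compiler.h.
	 */
	static __always_inline void advance_t_last(struct edt_val *edt,
						   __u64 t_xmit_ns)
	{
		/* Forces a real load; clang cannot reuse a stale register copy */
		__u64 t_last = READ_ONCE(edt->t_last);

		/* Forces a real store. Note: load+store is still not atomic;
		 * the macros only pin down that the accesses happen.
		 */
		WRITE_ONCE(edt->t_last, t_last + t_xmit_ns);
	}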
Signed-off-by: Jesper Dangaard Brouer --- headers/bpf/compiler.h | 124 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 headers/bpf/compiler.h diff --git a/headers/bpf/compiler.h b/headers/bpf/compiler.h new file mode 100644 index 0000000..2588023 --- /dev/null +++ b/headers/bpf/compiler.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2016-2020 Authors of Cilium */ + +#ifndef __BPF_COMPILER_H_ +#define __BPF_COMPILER_H_ + +#ifndef __non_bpf_context +# include "stddef.h" +#endif + +#ifndef __section +# define __section(X) __attribute__((section(X), used)) +#endif + +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif + +#ifndef offsetof +# define offsetof(T, M) __builtin_offsetof(T, M) +#endif + +#ifndef field_sizeof +# define field_sizeof(T, M) sizeof((((T *)NULL)->M)) +#endif + +#ifndef __packed +# define __packed __attribute__((packed)) +#endif + +#ifndef __nobuiltin +# if __clang_major__ >= 10 +# define __nobuiltin(X) __attribute__((no_builtin(X))) +# else +# define __nobuiltin(X) +# endif +#endif + +#ifndef likely +# define likely(X) __builtin_expect(!!(X), 1) +#endif + +#ifndef unlikely +# define unlikely(X) __builtin_expect(!!(X), 0) +#endif + +#ifndef always_succeeds /* Mainly for documentation purpose. */ +# define always_succeeds(X) likely(X) +#endif + +#undef __always_inline /* stddef.h defines its own */ +#define __always_inline inline __attribute__((always_inline)) + +#ifndef __stringify +# define __stringify(X) #X +#endif + +#ifndef __fetch +# define __fetch(X) (__u32)(__u64)(&(X)) +#endif + +#ifndef __aligned +# define __aligned(X) __attribute__((aligned(X))) +#endif + +#ifndef build_bug_on +# define build_bug_on(E) ((void)sizeof(char[1 - 2*!!(E)])) +#endif + +#ifndef __throw_build_bug +# define __throw_build_bug() __builtin_trap() +#endif + +#ifndef __printf +# define __printf(X, Y) __attribute__((__format__(printf, X, Y))) +#endif + +#ifndef barrier +# define barrier() asm volatile("": : :"memory") +#endif + +#ifndef barrier_data +# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") +#endif + +static __always_inline void bpf_barrier(void) +{ + /* Workaround to avoid verifier complaint: + * "dereference of modified ctx ptr R5 off=48+0, ctx+const is allowed, + * ctx+const+const is not" + */ + barrier(); +} + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0])) +#endif + +#ifndef __READ_ONCE +# define __READ_ONCE(X) (*(volatile typeof(X) *)&X) +#endif + +#ifndef __WRITE_ONCE +# define __WRITE_ONCE(X, V) (*(volatile typeof(X) *)&X) = (V) +#endif + +/* {READ,WRITE}_ONCE() with verifier workaround via bpf_barrier(). 
*/ + +#ifndef READ_ONCE +# define READ_ONCE(X) \ + ({ typeof(X) __val = __READ_ONCE(X); \ + bpf_barrier(); \ + __val; }) +#endif + +#ifndef WRITE_ONCE +# define WRITE_ONCE(X, V) \ + ({ typeof(X) __val = (V); \ + __WRITE_ONCE(X, __val); \ + bpf_barrier(); \ + __val; }) +#endif + +#endif /* __BPF_COMPILER_H_ */ From 692202e60ed194a2df5df7d22f639b6ca1b8bbeb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:40:42 +0100 Subject: [PATCH 16/61] traffic-pacing-edt: use READ_ONCE and WRITE_ONCE via compiler.h Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 779c6eb..c742d27 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0+ */ #include #include +#include #include #include "iproute2_compat.h" @@ -20,10 +21,6 @@ char _license[] SEC("license") = "GPL"; #define T_HORIZON_ECN (5 * 1000 * 1000ULL) -/* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */ -#define READ_ONCE(V) (V) -#define WRITE_ONCE(X,V) (X) = (V) - struct edt_val { __u64 rate; __u64 t_last; From 21ebc4d8cd2aaa8a70d210f4c04c1027d9376e72 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 15:56:31 +0100 Subject: [PATCH 17/61] traffic-pacing-edt: Align map struct to cache-line size Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index c742d27..b8f22c9 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -26,7 +26,7 @@ struct edt_val { __u64 t_last; __u64 t_horizon_drop; __u64 t_horizon_ecn; -}; +} __aligned(64); /* Align struct to cache-size to avoid false-sharing */ /* The tc tool (iproute2) use another ELF map layout than libbpf (struct * bpf_map_def), see struct bpf_elf_map from iproute2. From 9c5ccaed9be3818f44b7c6aa1b0fce1bd5af1bc2 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 16:23:46 +0100 Subject: [PATCH 18/61] traffic-pacing-edt: Simple VLAN parsing via parse_ethhdr_vlan Using the XDP based ethhdr VLAN parser. This cannot handle if the SKB don't have the VLAN inlined. Static match on VLAN 16 as test case. 
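For reference, the 16-bit TCI behind the VLAN id being matched follows
the standard 802.1Q layout PCP(3) | DEI(1) | VID(12); the helpers below
are illustrative only (the real parser just applies VLAN_VID_MASK):

	static __always_inline __u16 tci_to_vid(__u16 tci)
	{
		return tci & 0x0fff;		/* same as VLAN_VID_MASK */
	}

	static __always_inline __u8 tci_to_pcp(__u16 tci)
	{
		return (tci >> 13) & 0x07;	/* priority code point */
	}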
Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 35 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index b8f22c9..52137f1 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -2,9 +2,11 @@ #include #include #include -#include #include "iproute2_compat.h" +#define VLAN_MAX_DEPTH 2 +#include + char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 @@ -112,25 +114,36 @@ static __always_inline int sched_departure(struct __sk_buff *skb) SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) { - volatile void *data, *data_end; - int ret = BPF_OK; + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + struct collect_vlans vlans = { 0 }; struct ethhdr *eth; + int ret = BPF_OK; - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - eth = (struct ethhdr *)data; + /* These keep track of the next header type and iterator pointer */ + struct hdr_cursor nh; + int eth_type; + nh.pos = data; - if (data + sizeof(*eth) > data_end) - return BPF_DROP; + eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); + if (eth_type < 0) + return XDP_ABORTED; /* Keep ARP resolution working */ - if (eth->h_proto == bpf_htons(ETH_P_ARP)) { + if (eth_type == bpf_htons(ETH_P_ARP)) { ret = BPF_OK; goto out; } - // TODO: match on vlan16 and only apply EDT on that - return sched_departure(skb); + if (!proto_is_vlan(eth->h_proto)) { + /* Skip non-VLAN frames */ + return BPF_OK; + } + + /* Match on vlan16 and only apply EDT on that */ + // FIXME: handle if VLAN is not inlined in packet + if (vlans.id[0] == 16) + return sched_departure(skb); out: return ret; From 82186cfe72fc8e51d099c1c072230fd00eba11eb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 17:03:04 +0100 Subject: [PATCH 19/61] traffic-pacing-edt: script for VLAN setup Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/testlab_vlan_setup.sh | 65 ++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100755 traffic-pacing-edt/testlab_vlan_setup.sh diff --git a/traffic-pacing-edt/testlab_vlan_setup.sh b/traffic-pacing-edt/testlab_vlan_setup.sh new file mode 100755 index 0000000..dfcfbfc --- /dev/null +++ b/traffic-pacing-edt/testlab_vlan_setup.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# +# Testlab setup script for VLAN Q-in-Q (double tagged VLAN) config. 
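+#
+# Example usage (hypothetical NIC name), outer VLAN 1 + inner VLAN 16:
+#   ./testlab_vlan_setup.sh --dev ixgbe1 1 16
+# Tear down again by adding --remove.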
+# +# Author: Jesper Dangaaard Brouer +# License: GPLv2 +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +export IP=/sbin/ip +function ip() { + echo $IP "$@" + $IP "$@" +} + +function create_vlan_device() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} + ip link set ${device}.${vlan} up +} + +function delete_vlan_device() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link del ${device}.${vlan} +} + + +if [[ -z "$1" ]]; then + err 3 "Missing arg#1 for outer vlan" +fi +OUTER=$1 + +if [[ -z "$2" ]]; then + err 3 "Missing arg#2 for inner vlan" +fi +INNER=$2 + +if [[ -n $REMOVE ]]; then + delete_vlan_device $INNER ${DEV}.${OUTER} + delete_vlan_device $OUTER $DEV + exit 0 +fi + +create_vlan_device $OUTER $DEV +create_vlan_device $INNER ${DEV}.${OUTER} From 1196c6cf14e2419dc451ed111be378119e2f9bfa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 17:23:41 +0100 Subject: [PATCH 20/61] traffic-pacing-edt: adjust parameters help txt to be more general Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/parameters.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/traffic-pacing-edt/parameters.sh b/traffic-pacing-edt/parameters.sh index 6d0841d..d0077ab 100644 --- a/traffic-pacing-edt/parameters.sh +++ b/traffic-pacing-edt/parameters.sh @@ -10,10 +10,10 @@ function usage() { echo "Usage: $0 [-vh] --dev ethX" echo " -d | --dev : (\$DEV) Interface/device (required)" echo " -v | --verbose : (\$VERBOSE) verbose" - echo " --remove : (\$REMOVE) Remove the TC rules" + echo " --remove : (\$REMOVE) Remove the rules" echo " --dry-run : (\$DRYRUN) Dry-run only (echo tc commands)" - echo " -s | --stats : (\$STATS_ONLY) Call TC statistics command" - echo " -l | --list : (\$LIST) List TC filter setup after setup" + echo " -s | --stats : (\$STATS_ONLY) Call statistics command" + echo " -l | --list : (\$LIST) List setup after setup" echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load" echo "" } @@ -80,5 +80,5 @@ done if [ -z "$DEV" ]; then usage - err 2 "Please specify TC net_device" + err 2 "Please specify net_device (\$DEV)" fi From d8a992aab48be36e0b75bfb06a18cdfbb9ca40f0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 17:27:09 +0100 Subject: [PATCH 21/61] traffic-pacing-edt: Add IP wrapper functions Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/functions.sh | 25 ++++++++++++++++++++++++ traffic-pacing-edt/testlab_vlan_setup.sh | 3 +-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/functions.sh b/traffic-pacing-edt/functions.sh index a92f482..32cbdde 100644 --- a/traffic-pacing-edt/functions.sh +++ b/traffic-pacing-edt/functions.sh @@ -62,3 +62,28 @@ function call_tc() { function call_tc_allow_fail() { _call_tc "allow_fail" "$@" } + +## -- Wrapper calls for IP -- +function _call_ip() { + local allow_fail="$1" + shift + if [[ -n "$VERBOSE" ]]; then + echo "ip $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $IP "$@" + local status=$? 
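+	# NB: $? must be captured on the line above; any other command
+	# in between would overwrite the exit code of $IP.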
+ if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 3 "Exec error($status) occurred cmd: \"$IP $@\"" + fi + fi +} +function call_ip() { + _call_ip "" "$@" +} +function call_ip_allow_fail() { + _call_ip "allow_fail" "$@" +} diff --git a/traffic-pacing-edt/testlab_vlan_setup.sh b/traffic-pacing-edt/testlab_vlan_setup.sh index dfcfbfc..8c1b33b 100755 --- a/traffic-pacing-edt/testlab_vlan_setup.sh +++ b/traffic-pacing-edt/testlab_vlan_setup.sh @@ -15,8 +15,7 @@ source ${basedir}/parameters.sh export IP=/sbin/ip function ip() { - echo $IP "$@" - $IP "$@" + call_ip "$@" } function create_vlan_device() { From 6a67b105ee3f9cc9743d80fc873f1d439279adb1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 20:53:22 +0100 Subject: [PATCH 22/61] traffic-pacing-edt: Remember MTU setting on netdevices Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/testlab_vlan_setup.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/traffic-pacing-edt/testlab_vlan_setup.sh b/traffic-pacing-edt/testlab_vlan_setup.sh index 8c1b33b..2228af5 100755 --- a/traffic-pacing-edt/testlab_vlan_setup.sh +++ b/traffic-pacing-edt/testlab_vlan_setup.sh @@ -31,6 +31,21 @@ function create_vlan_device() { ip link set ${device}.${vlan} up } +function create_vlan_device_802_1ad() { + local vlan=${1} + local device=${2:-$DEV} + shift 2 + + if [[ -z "$vlan" ]]; then + err 2 "Missing VLAN is as input" + fi + + ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} \ + protocol 802.1ad + ip link set ${device}.${vlan} up +} + + function delete_vlan_device() { local vlan=${1} local device=${2:-$DEV} @@ -62,3 +77,8 @@ fi create_vlan_device $OUTER $DEV create_vlan_device $INNER ${DEV}.${OUTER} + +# Set MTU to handle extra VLAN headers, NICs usually allow one VLAN +# header even though they have configured MTU 1500. 
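+# Q-in-Q needs 2 x 4 bytes extra: 1500 + 8 = 1508 on the physical dev,
+# and 1500 + 4 = 1504 on the outer-VLAN dev (one tag still to be added).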
+ip link set $DEV mtu 1508 +ip link set ${DEV}.${OUTER} mtu 1504 From a0f3760d6c9f9d962f382f12df2ac84672f8313e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 21:19:10 +0100 Subject: [PATCH 23/61] traffic-pacing-edt: Handle if VLAN is offloaded to SKB metadata Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 52137f1..8943973 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -119,6 +119,7 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) struct collect_vlans vlans = { 0 }; struct ethhdr *eth; int ret = BPF_OK; + __u16 vlan_key; /* These keep track of the next header type and iterator pointer */ struct hdr_cursor nh; @@ -135,14 +136,19 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) goto out; } - if (!proto_is_vlan(eth->h_proto)) { + if (!proto_is_vlan(eth->h_proto) && !skb->vlan_present) { /* Skip non-VLAN frames */ return BPF_OK; } - /* Match on vlan16 and only apply EDT on that */ - // FIXME: handle if VLAN is not inlined in packet - if (vlans.id[0] == 16) + /* NIC can HW "offload" the outer VLAN, moving it to skb context */ + if (skb->vlan_present) + vlan_key = vlans.id[0]; /* Inner vlan placed as first inline */ + else + vlan_key = vlans.id[1]; /* All VLAN headers inline */ + + /* For-now: Match on vlan16 and only apply EDT on that */ + if (vlan_key == 16) return sched_departure(skb); out: From 71db45b28ecb78b1c0bc92b3ea27a218b4ca0feb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 15 Nov 2020 21:54:39 +0100 Subject: [PATCH 24/61] traffic-pacing-edt: Handle if loaded on outer VLAN net_device Signed-off-by: Jesper Dangaard Brouer --- traffic-pacing-edt/edt_pacer02.c | 62 ++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 8943973..a3893b3 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -112,7 +112,59 @@ static __always_inline int sched_departure(struct __sk_buff *skb) return BPF_OK; } -SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) +static __always_inline +__u16 get_inner_qinq_vlan(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + __u16 vlan_key; + + /* NIC can HW "offload" the outer VLAN, moving it to skb context */ + if (skb->vlan_present) + vlan_key = vlans->id[0]; /* Inner vlan placed as first inline */ + else + vlan_key = vlans->id[1]; /* All VLAN headers inline */ + + return vlan_key; +} + +static __always_inline +__u16 get_vlan(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + __u16 vlan_key; + + /* Handle extracting VLAN if skb context have VLAN offloaded */ + if (skb->vlan_present) + vlan_key = skb->vlan_tci & VLAN_VID_MASK; + else + vlan_key = vlans->id[0]; + + return vlan_key; +} + +static __always_inline +__u16 extract_vlan_key(struct __sk_buff *skb, struct collect_vlans *vlans) +{ + int QinQ = 0; + + /* The inner VLAN is the key to extract. But it is complicated + * due to NIC "offloaded" VLAN (skb->vlan_present). In case + * BPF-prog is loaded on outer VLAN net_device, the BPF-prog + * sees the inner-VLAN at the first and only VLAN. 
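+	 * (That single-tagged case then takes the get_vlan() path below.)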
+ */ + if (skb->vlan_present) { + if (vlans->id[0]) + QinQ = 1; + } else { + if (vlans->id[1]) + QinQ = 1; + } + + if (QinQ) + return get_inner_qinq_vlan(skb, vlans); + else + return get_vlan(skb, vlans); +} + +SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb) { void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; @@ -128,7 +180,7 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); if (eth_type < 0) - return XDP_ABORTED; + return BPF_DROP; /* Keep ARP resolution working */ if (eth_type == bpf_htons(ETH_P_ARP)) { @@ -141,11 +193,7 @@ SEC("classifier") int tc_edt_simple(struct __sk_buff *skb) return BPF_OK; } - /* NIC can HW "offload" the outer VLAN, moving it to skb context */ - if (skb->vlan_present) - vlan_key = vlans.id[0]; /* Inner vlan placed as first inline */ - else - vlan_key = vlans.id[1]; /* All VLAN headers inline */ + vlan_key = extract_vlan_key(skb, &vlans); /* For-now: Match on vlan16 and only apply EDT on that */ if (vlan_key == 16) From 740416975ffb83cc5eb162397cf5a5e5d1069aae Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 13:05:47 +0100 Subject: [PATCH 25/61] traffic-pacing-edt: allow tc util to be install in other places Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/bpf_egress_loader.sh | 2 +- traffic-pacing-edt/tc_fq_pacer.sh | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/traffic-pacing-edt/bpf_egress_loader.sh b/traffic-pacing-edt/bpf_egress_loader.sh index 316cddd..934117d 100755 --- a/traffic-pacing-edt/bpf_egress_loader.sh +++ b/traffic-pacing-edt/bpf_egress_loader.sh @@ -11,7 +11,7 @@ root_check_run_with_sudo "$@" # Use common parameters source ${basedir}/parameters.sh -export TC=/sbin/tc +export TC=tc # This can be changed via --file or --obj if [[ -z ${BPF_OBJ} ]]; then diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index 1ad9805..e7a502f 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -17,10 +17,7 @@ root_check_run_with_sudo "$@" # Use common parameters source ${basedir}/parameters.sh -export TC=/sbin/tc -function tc() { - _call_tc "" "$@" -} +export TC=tc # Default verbose VERBOSE=1 @@ -33,19 +30,19 @@ if [[ -n $REMOVE ]]; then fi # MQ (Multi-Queue) as root qdisc -tc qdisc replace dev $DEV root handle 7FFF: mq +call_tc qdisc replace dev $DEV root handle 7FFF: mq # Add FQ-pacer qdisc on each NIC avail TX-queue i=0 for dir in /sys/class/net/$DEV/queues/tx-*; do # Details: cause-off-by-one, as tx-0 becomes handle 1: ((i++)) || true - #tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + #call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq # # The higher 'flow_limit' is needed for high-BW pacing - tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ + call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ flow_limit 1000 # # quantum $((1514*4)) initial_quantum $((1514*20)) - # tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit + # call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit done From 6b5648158bf4f90efe2bc6c16f64b66b44c68703 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 13:53:53 +0100 Subject: [PATCH 26/61] traffic-pacing-edt: Testing rates in production Test different rates in production machine, and measure iperf3 TCP-goodput Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index a3893b3..ebb502a 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -11,10 +11,20 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 +//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) +//#define RATE_IN_BITS (998 * 1000 * 1000ULL) + +/* Test different rates in production machine, and measure iperf3 TCP-goodput */ +//#define RATE_IN_BITS (800 * 1000 * 1000ULL)// prod: 765 Mbits/sec (stable) +//#define RATE_IN_BITS (900 * 1000 * 1000ULL)// prod: 861 Mbits/sec (stable) +//#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) +//#define RATE_IN_BITS (960 * 1000 * 1000ULL)// prod: 918 Mbits/sec +#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec +//#define RATE_IN_BITS (980 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) +//#define RATE_IN_BITS (990 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) +//#define RATE_IN_BITS (999 * 1000 * 1000ULL)// prod: (unstable) + /* skb->len in bytes, thus easier to keep rate in bytes */ -#define RATE_IN_BITS (1000 * 1000 * 1000ULL) -//#define RATE_IN_BITS (200 * 1000 * 1000ULL) -//#define RATE_IN_BITS (500 * 1000 * 1000ULL) #define RATE_IN_BYTES (RATE_IN_BITS / 8) //#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) From 794c074d7d6db58837d70c96c9fb7499ab915af1 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 15:57:14 +0100 Subject: [PATCH 27/61] traffic-pacing-edt: New strategy: Shape at MAC layer with Ethernet Take into account MAC layer overhead per packet. Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 45 +++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index ebb502a..8a0b687 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -11,19 +11,50 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 -//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) //#define RATE_IN_BITS (998 * 1000 * 1000ULL) /* Test different rates in production machine, and measure iperf3 TCP-goodput */ //#define RATE_IN_BITS (800 * 1000 * 1000ULL)// prod: 765 Mbits/sec (stable) //#define RATE_IN_BITS (900 * 1000 * 1000ULL)// prod: 861 Mbits/sec (stable) -//#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) +///#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) //#define RATE_IN_BITS (960 * 1000 * 1000ULL)// prod: 918 Mbits/sec -#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec +//#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec //#define RATE_IN_BITS (980 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) //#define RATE_IN_BITS (990 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) //#define RATE_IN_BITS (999 * 1000 * 1000ULL)// prod: (unstable) +/* Per packet overhead: two VLAN headers == 8 bytes + * + * skb->wire_len doesn't seem to take the two VLAN headers into + * account. Loading BPF-prog on VLAN net_device is can only see 1 + * VLAN, and this is likely HW offloaded into skb->vlan. 
+ */ +//#define OVERHEAD (8) + + +/* New strategy: Shape at MAC (Medium Access Control) layer with Ethernet + * + * Production use-case is pacing traffic at 1Gbit/s wirespeed, using a + * 10Gbit/s NIC, because 1G end-user switch cannot handle bursts. + * + * (https://en.wikipedia.org/wiki/Interpacket_gap + * 12 bytes = interframe gap (IFG) 96 bit + + * (https://en.wikipedia.org/wiki/Ethernet_frame) + * 8 bytes = MAC preamble + * 4 bytes = Ethernet Frame Check Sequence (FCS) CRC + * 46 bytes = Minimum Payload size + * + * 14 bytes = Ethernet header + * 8 bytes = 2x VLAN headers + */ +//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */ +//#define RATE_IN_BITS (990 * 1000 * 1000ULL) +#define RATE_IN_BITS (950 * 1000 * 1000ULL) +#define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */ +//#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */ +#define ETH_MIN (84) + /* skb->len in bytes, thus easier to keep rate in bytes */ #define RATE_IN_BYTES (RATE_IN_BITS / 8) @@ -59,6 +90,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb) struct edt_val *edt; __u64 t_queue_sz; __u64 t_xmit_ns; + __u64 wire_len; __u64 t_next; __u64 t_curr; int key = 0; @@ -75,7 +107,12 @@ static __always_inline int sched_departure(struct __sk_buff *skb) * added on transmit. Fortunately skb->wire_len at TC-egress hook (not * ingress) include these headers. (See: qdisc_pkt_len_init()) */ - t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES; + wire_len = skb->wire_len + OVERHEAD; + wire_len = wire_len > ETH_MIN ? wire_len : ETH_MIN; + + t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES; + +// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES; // t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate; now = bpf_ktime_get_ns(); From 8714c9a37d0a5f18125f8081f07935ae8656a903 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 16:54:58 +0100 Subject: [PATCH 28/61] traffic-pacing-edt: also pace packets a bit on empty queue Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 8a0b687..f0ee70c 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -49,8 +49,8 @@ char _license[] SEC("license") = "GPL"; * 8 bytes = 2x VLAN headers */ //#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */ -//#define RATE_IN_BITS (990 * 1000 * 1000ULL) -#define RATE_IN_BITS (950 * 1000 * 1000ULL) +#define RATE_IN_BITS (990 * 1000 * 1000ULL) +//#define RATE_IN_BITS (950 * 1000 * 1000ULL) #define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */ //#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */ #define ETH_MIN (84) @@ -129,11 +129,18 @@ static __always_inline int sched_departure(struct __sk_buff *skb) t_next = READ_ONCE(edt->t_last) + t_xmit_ns; /* If packet doesn't get scheduled into the future, then there is - * no-queue and we are not above rate limit. Send packet immediately and - * move forward t_last timestamp to now. + * no-queue and we are not above rate limit. Normally send packet + * immediately and move forward t_last timestamp to now. + * + * But in our use-case the traffic need smoothing at a earlier + * stage, as bursts at lower rates can hurt the crapy switch. + * Thus, schedule SKB transmissing as new + t_xmit_ns. 
*/ if (t_next <= t_curr) { - WRITE_ONCE(edt->t_last, t_curr); + __u64 t_curr_next = t_curr + t_xmit_ns; + + WRITE_ONCE(edt->t_last, t_curr_next); + skb->tstamp = t_curr_next; return BPF_OK; } From 4671be73a8292d67e31cc0eb357a2f4f43b3a5ce Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 18:02:45 +0100 Subject: [PATCH 29/61] traffic-pacing-edt: Minimum delay for all packet if no time-queue Trying to trigger more NET_TX_SOFTIRQ to get packets scheduled out more spaced out in time. It is of-cause important to disable GRO in the first place. E.g. cmdline: sudo ethtool -K ens6f1 gso off tso off gro off Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index f0ee70c..6ff121c 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -137,7 +137,13 @@ static __always_inline int sched_departure(struct __sk_buff *skb) * Thus, schedule SKB transmissing as new + t_xmit_ns. */ if (t_next <= t_curr) { - __u64 t_curr_next = t_curr + t_xmit_ns; + __u64 t_curr_next; + __u32 min_len = 1538 * 2; + + /* Minimum delay for all packet if no time-queue */ + wire_len = (wire_len > min_len) ? wire_len : min_len; + t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES; + t_curr_next = t_curr + t_xmit_ns; WRITE_ONCE(edt->t_last, t_curr_next); skb->tstamp = t_curr_next; From 68505a2dbd5c50e83234f587af017d465d213ced Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 18:27:37 +0100 Subject: [PATCH 30/61] traffic-pacing-edt: tc_fq_pacer.sh select between MQ and single FQ For some reason cannot get correct scheduling with FQ in a MQ setup. In production traffic is Q-in-Q double tagged VLAN traffic. Perhaps the RX-hash is doing strange stuff, or BPF-prog concurrency is wrong. Due to Q-in-Q NIC RSS cause most packets to hit CPU-6 for some strange reason. Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/tc_fq_pacer.sh | 61 +++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/traffic-pacing-edt/tc_fq_pacer.sh b/traffic-pacing-edt/tc_fq_pacer.sh index e7a502f..882b146 100755 --- a/traffic-pacing-edt/tc_fq_pacer.sh +++ b/traffic-pacing-edt/tc_fq_pacer.sh @@ -22,6 +22,14 @@ export TC=tc # Default verbose VERBOSE=1 +# Select between multiq or single root qdisc +if [[ -z $1 ]]; then + if [[ -z $REMOVE ]]; then + err 1 "Specify root qdisc system: single or mq (multi-queue)" + fi +fi +TYPE=$1 + # Delete existing root qdisc call_tc_allow_fail qdisc del dev "$DEV" root @@ -29,20 +37,41 @@ if [[ -n $REMOVE ]]; then exit 0 fi -# MQ (Multi-Queue) as root qdisc -call_tc qdisc replace dev $DEV root handle 7FFF: mq +function use_multiq() +{ + # MQ (Multi-Queue) as root qdisc + call_tc qdisc replace dev $DEV root handle 7FFF: mq -# Add FQ-pacer qdisc on each NIC avail TX-queue -i=0 -for dir in /sys/class/net/$DEV/queues/tx-*; do - # Details: cause-off-by-one, as tx-0 becomes handle 1: - ((i++)) || true - #call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq - # - # The higher 'flow_limit' is needed for high-BW pacing - call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ - flow_limit 1000 - # - # quantum $((1514*4)) initial_quantum $((1514*20)) - # call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit -done + # Add FQ-pacer qdisc on each NIC avail TX-queue + i=0 + for dir in /sys/class/net/$DEV/queues/tx-*; do + # Details: cause-off-by-one, as tx-0 becomes handle 1: + ((i++)) || true + #call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq + # + # The higher 'flow_limit' is needed for high-BW pacing + call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \ + flow_limit 1000 + # + # quantum $((1514*4)) initial_quantum $((1514*20)) + # call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit + done +} + +function use_single_fq_pacer() +{ + call_tc qdisc replace dev $DEV root handle 7FFF: fq \ + flow_limit 1000 +} + +case "$TYPE" in + mq | multiq ) + use_multiq + ;; + single | fq ) + use_single_fq_pacer + ;; + * ) + err 1 "Unknown type: ${TYPE}" + ;; +esac From a45ae39775af21a7b8c631ea20b9febabef4323e Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 21 Nov 2020 18:59:03 +0100 Subject: [PATCH 31/61] traffic-pacing-edt: make it easy to remove minimum delay trick in code Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 6ff121c..727c0b9 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -137,8 +137,9 @@ static __always_inline int sched_departure(struct __sk_buff *skb) * Thus, schedule SKB transmissing as new + t_xmit_ns. */ if (t_next <= t_curr) { +#if 1 __u64 t_curr_next; - __u32 min_len = 1538 * 2; + __u32 min_len = 1538; /* Minimum delay for all packet if no time-queue */ wire_len = (wire_len > min_len) ? wire_len : min_len; @@ -147,7 +148,11 @@ static __always_inline int sched_departure(struct __sk_buff *skb) WRITE_ONCE(edt->t_last, t_curr_next); skb->tstamp = t_curr_next; +#else + WRITE_ONCE(edt->t_last, t_curr); +#endif return BPF_OK; + } /* Calc queue size measured in time */ From eacff13518b960ed9f0bc8f048de771714e7cb0c Mon Sep 17 00:00:00 2001 From: "Jesper D. 
Brouer" Date: Sun, 22 Nov 2020 14:45:13 +0100 Subject: [PATCH 32/61] traffic-pacing-edt: Experiment random drop packets exceeding 10 ms queue Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 727c0b9..48d10c9 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -60,9 +60,12 @@ char _license[] SEC("license") = "GPL"; //#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) //#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) -#define T_HORIZON_DROP (15 * 1000 * 1000ULL) -#define T_HORIZON_ECN (5 * 1000 * 1000ULL) +#define T_HORIZON_DROP (15 * 1000 * 1000ULL) + +#define T_HORIZON_DROP_SOME (10 * 1000 * 1000ULL) + +#define T_HORIZON_ECN (5 * 1000 * 1000ULL) struct edt_val { __u64 rate; @@ -165,6 +168,22 @@ static __always_inline int sched_departure(struct __sk_buff *skb) if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */) return BPF_DROP; + /* If TCP didn't react to ECN marking, then start dropping some */ + if (t_queue_sz >= T_HORIZON_DROP_SOME) { + __u32 random = (bpf_get_prandom_u32() >> 4) & 0x0f; + + if (random >= 8) + return BPF_DROP; + + // TODO If horizon have been exceed for a while, then + + + // "next drop time" + } else { + /* TODO: Queue delay drops below reset */ + } + + /* ECN marking horizon */ if (t_queue_sz >= T_HORIZON_ECN) bpf_skb_ecn_set_ce(skb); From d8845714daf6432562772cb8a488a0357837fede Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sun, 22 Nov 2020 14:53:27 +0100 Subject: [PATCH 33/61] traffic-pacing-edt: Codel like scheme Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 52 ++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 48d10c9..2fbdd0c 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -63,15 +63,27 @@ char _license[] SEC("license") = "GPL"; #define T_HORIZON_DROP (15 * 1000 * 1000ULL) -#define T_HORIZON_DROP_SOME (10 * 1000 * 1000ULL) +#define T_HORIZON_TARGET (10 * 1000 * 1000ULL) #define T_HORIZON_ECN (5 * 1000 * 1000ULL) +/* Codel like dropping scheme, inspired by: + * - RFC: https://queue.acm.org/detail.cfm?id=2209336 + * - Code: https://queue.acm.org/appendices/codel.html + */ + struct edt_val { __u64 rate; __u64 t_last; __u64 t_horizon_drop; __u64 t_horizon_ecn; + + /* codel like dropping scheme */ + __u64 first_above_time; /* Time when above target (0 if below)*/ + //__u64 drop_next; /* Time to drop next packet */ + uint32_t count; /* Packets dropped since going into drop state */ + uint32_t dropping; /*/ Equal to 1 if in drop state */ + } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ /* The tc tool (iproute2) use another ELF map layout than libbpf (struct @@ -85,6 +97,42 @@ struct bpf_elf_map SEC("maps") time_delay_map = { //.pinning = PIN_GLOBAL_NS, }; +/* */ +#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ + +/* Table lookup for square-root shifted 16 bit */ +static __always_inline __u32 get_sqrt_sh16(__u64 cnt) +{ + switch (cnt) { + case 1: return 65536; /* 65536 * sqrt(1) */ + case 2: return 92682; /* 65536 * sqrt(2) */ + case 3: return 113512; /* 65536 * sqrt(3) */ + case 4: return 131072; /* 65536 * sqrt(4) */ + case 5: return 146543; /* 65536 * sqrt(5) */ + case 6: return 160530; /* 65536 * sqrt(6) */ + 
case 7: return 173392; + case 8: return 185364; + case 9: return 196608; + case 10: return 207243; + case 11: return 217358; + case 12: return 227023; + case 13: return 236293; + case 14: return 245213; + case 15: return 253820; + case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ + default: + return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ + } +} + +static __always_inline __u64 get_next_interval(__u64 cnt) +{ + __u64 val = (__u64)T_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); + return val; +} + + + /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. */ @@ -169,7 +217,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb) return BPF_DROP; /* If TCP didn't react to ECN marking, then start dropping some */ - if (t_queue_sz >= T_HORIZON_DROP_SOME) { + if (t_queue_sz >= T_HORIZON_TARGET) { __u32 random = (bpf_get_prandom_u32() >> 4) & 0x0f; if (random >= 8) From f50c74101185b2162cee0b7ca5f120f92e648a72 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sun, 22 Nov 2020 18:11:11 +0100 Subject: [PATCH 34/61] traffic-pacing-edt: finished codel implementation based on [1] [1] https://queue.acm.org/appendices/codel.html Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 103 +++++++++++++++++++++++++------ 1 file changed, 85 insertions(+), 18 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 2fbdd0c..64838e5 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -4,6 +4,8 @@ #include #include "iproute2_compat.h" +#include + #define VLAN_MAX_DEPTH 2 #include @@ -70,6 +72,7 @@ char _license[] SEC("license") = "GPL"; /* Codel like dropping scheme, inspired by: * - RFC: https://queue.acm.org/detail.cfm?id=2209336 * - Code: https://queue.acm.org/appendices/codel.html + * - Kernel: include/net/codel_impl.h */ struct edt_val { @@ -80,9 +83,9 @@ struct edt_val { /* codel like dropping scheme */ __u64 first_above_time; /* Time when above target (0 if below)*/ - //__u64 drop_next; /* Time to drop next packet */ - uint32_t count; /* Packets dropped since going into drop state */ - uint32_t dropping; /*/ Equal to 1 if in drop state */ + __u64 drop_next; /* Time to drop next packet */ + __u32 count; /* Packets dropped since going into drop state */ + __u32 dropping; /* Equal to 1 if in drop state */ } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ @@ -125,13 +128,88 @@ static __always_inline __u32 get_sqrt_sh16(__u64 cnt) } } -static __always_inline __u64 get_next_interval(__u64 cnt) +static __always_inline __u64 get_next_interval_sqrt(__u64 cnt) { __u64 val = (__u64)T_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); return val; } +static __always_inline __u64 +codel_control_law(__u64 t, __u64 cnt) +{ + return t + get_next_interval_sqrt(cnt); +} +static __always_inline +bool codel_should_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = T_EXCEED_INTERVAL; + + if (t_queue_sz < T_HORIZON_TARGET) { + /* went below so we'll stay below for at least interval */ + edt->first_above_time = 0; + return false; + } + + if (edt->first_above_time == 0) { + /* just went above from below. 
If we stay above + * for at least interval we'll say it's ok to drop + */ + edt->first_above_time = now + interval; + return false; + } else if (now >= edt->first_above_time) { + return true; + } + return false; +} + +static __always_inline +bool codel_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = T_EXCEED_INTERVAL; + + /* If horizon have been exceed for a while, inc drop intensity*/ + bool drop = codel_should_drop(edt, t_queue_sz, now); + + if (edt->dropping) { /* In dropping state */ + if (!drop) { + /* time below target - leave dropping state */ + edt->dropping = false; + return false; + } else if (now >= edt->drop_next) { + /* It's time for the next drop. Drop the current + * packet. Schedule the next drop + */ + edt->count += 1; + // schedule the next drop. + edt->drop_next = + codel_control_law(edt->drop_next, edt->count); + return true; + } + } else if (drop && + ((now - edt->drop_next < interval) || + (now - edt->first_above_time >= interval))) { + /* If we get here, then we're not in dropping state. + * Decide whether it's time to enter dropping state. + */ + __u32 count = edt->count; + + edt->dropping = true; + + /* If we're in a drop cycle, drop rate that controlled queue + * on the last cycle is a good starting point to control it now. + */ + if (now - edt->drop_next < interval) + count = count > 2 ? (count - 2) : 1; + else + count = 1; + + edt->count = count; + edt->drop_next = codel_control_law(now, count); + return true; + } + return false; +} /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. @@ -217,20 +295,9 @@ static __always_inline int sched_departure(struct __sk_buff *skb) return BPF_DROP; /* If TCP didn't react to ECN marking, then start dropping some */ - if (t_queue_sz >= T_HORIZON_TARGET) { - __u32 random = (bpf_get_prandom_u32() >> 4) & 0x0f; - - if (random >= 8) - return BPF_DROP; - - // TODO If horizon have been exceed for a while, then - - - // "next drop time" - } else { - /* TODO: Queue delay drops below reset */ - } - + // if (codel_drop(edt, t_queue_sz, now)) + if (codel_drop(edt, t_queue_sz, t_next)) + return BPF_DROP; /* ECN marking horizon */ if (t_queue_sz >= T_HORIZON_ECN) From 2f6580dea46439a0091a7fc7b1ce6c30d822d9c2 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 13:37:07 +0100 Subject: [PATCH 35/61] Factor out codel structure Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 57 ++++++++++++++++---------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 64838e5..fc21ed4 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -74,19 +74,20 @@ char _license[] SEC("license") = "GPL"; * - Code: https://queue.acm.org/appendices/codel.html * - Kernel: include/net/codel_impl.h */ +struct codel_state { + /* codel like dropping scheme */ + __u64 first_above_time; /* Time when above target (0 if below)*/ + __u64 drop_next; /* Time to drop next packet */ + __u32 count; /* Packets dropped since going into drop state */ + __u32 dropping; /* Equal to 1 if in drop state */ +}; struct edt_val { __u64 rate; __u64 t_last; __u64 t_horizon_drop; __u64 t_horizon_ecn; - - /* codel like dropping scheme */ - __u64 first_above_time; /* Time when above target (0 if below)*/ - __u64 drop_next; /* Time to drop next packet */ - __u32 count; /* Packets dropped since going into drop state */ - __u32 dropping; /* Equal to 1 if in drop state */ - + struct codel_state codel; } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ /* The tc tool (iproute2) use another ELF map layout than libbpf (struct @@ -141,71 +142,71 @@ codel_control_law(__u64 t, __u64 cnt) } static __always_inline -bool codel_should_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) { __u64 interval = T_EXCEED_INTERVAL; if (t_queue_sz < T_HORIZON_TARGET) { /* went below so we'll stay below for at least interval */ - edt->first_above_time = 0; + codel->first_above_time = 0; return false; } - if (edt->first_above_time == 0) { + if (codel->first_above_time == 0) { /* just went above from below. If we stay above * for at least interval we'll say it's ok to drop */ - edt->first_above_time = now + interval; + codel->first_above_time = now + interval; return false; - } else if (now >= edt->first_above_time) { + } else if (now >= codel->first_above_time) { return true; } return false; } static __always_inline -bool codel_drop(struct edt_val *edt, __u64 t_queue_sz, __u64 now) +bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) { __u64 interval = T_EXCEED_INTERVAL; /* If horizon have been exceed for a while, inc drop intensity*/ - bool drop = codel_should_drop(edt, t_queue_sz, now); + bool drop = codel_should_drop(codel, t_queue_sz, now); - if (edt->dropping) { /* In dropping state */ + if (codel->dropping) { /* In dropping state */ if (!drop) { /* time below target - leave dropping state */ - edt->dropping = false; + codel->dropping = false; return false; - } else if (now >= edt->drop_next) { + } else if (now >= codel->drop_next) { /* It's time for the next drop. Drop the current * packet. Schedule the next drop */ - edt->count += 1; + codel->count += 1; // schedule the next drop. - edt->drop_next = - codel_control_law(edt->drop_next, edt->count); + codel->drop_next = + codel_control_law(codel->drop_next, codel->count); return true; } } else if (drop && - ((now - edt->drop_next < interval) || - (now - edt->first_above_time >= interval))) { + ((now - codel->drop_next < interval) || + (now - codel->first_above_time >= interval))) { /* If we get here, then we're not in dropping state. * Decide whether it's time to enter dropping state. 
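 * Two ways lead into dropping state: either we dropped recently
 * (now is within one interval of drop_next), so the previous drop
 * count is resumed with a small credit (count - 2), or the queue
 * has stayed above target for a whole interval, which starts a
 * fresh cycle at count = 1.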
*/ - __u32 count = edt->count; + __u32 count = codel->count; - edt->dropping = true; + codel->dropping = true; /* If we're in a drop cycle, drop rate that controlled queue * on the last cycle is a good starting point to control it now. */ - if (now - edt->drop_next < interval) + if (now - codel->drop_next < interval) count = count > 2 ? (count - 2) : 1; else count = 1; - edt->count = count; - edt->drop_next = codel_control_law(now, count); + codel->count = count; + codel->drop_next = codel_control_law(now, count); return true; } return false; @@ -296,7 +297,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb) /* If TCP didn't react to ECN marking, then start dropping some */ // if (codel_drop(edt, t_queue_sz, now)) - if (codel_drop(edt, t_queue_sz, t_next)) + if (codel_drop(&edt->codel, t_queue_sz, t_next)) return BPF_DROP; /* ECN marking horizon */ From 516668c62c9f4de91c1d3d6f5718461e4d21b5ba Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 13:51:49 +0100 Subject: [PATCH 36/61] Move codel implementation to header file Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/codel_impl.h | 133 +++++++++++++++++++++++++++++++ traffic-pacing-edt/edt_pacer02.c | 128 ++--------------------------- 2 files changed, 138 insertions(+), 123 deletions(-) create mode 100644 traffic-pacing-edt/codel_impl.h diff --git a/traffic-pacing-edt/codel_impl.h b/traffic-pacing-edt/codel_impl.h new file mode 100644 index 0000000..6970646 --- /dev/null +++ b/traffic-pacing-edt/codel_impl.h @@ -0,0 +1,133 @@ +#ifndef __CODEL_IMPL_H +#define __CODEL_IMPL_H + +#ifndef CODEL_TARGET +#define CODEL_TARGET (10 * 1000 * 1000ULL) /* 10 ms in nanosec */ +#endif + +#ifndef CODEL_EXCEED_INTERVAL +#define CODEL_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ +#endif + +/* Codel like dropping scheme, inspired by: + * - RFC: https://queue.acm.org/detail.cfm?id=2209336 + * - Code: https://queue.acm.org/appendices/codel.html + * - Kernel: include/net/codel_impl.h + */ +struct codel_state { + /* codel like dropping scheme */ + __u64 first_above_time; /* Time when above target (0 if below)*/ + __u64 drop_next; /* Time to drop next packet */ + __u32 count; /* Packets dropped since going into drop state */ + __u32 dropping; /* Equal to 1 if in drop state */ +}; + +/* Table lookup for square-root shifted 16 bit */ +static __always_inline __u32 get_sqrt_sh16(__u64 cnt) +{ + switch (cnt) { + case 1: return 65536; /* 65536 * sqrt(1) */ + case 2: return 92682; /* 65536 * sqrt(2) */ + case 3: return 113512; /* 65536 * sqrt(3) */ + case 4: return 131072; /* 65536 * sqrt(4) */ + case 5: return 146543; /* 65536 * sqrt(5) */ + case 6: return 160530; /* 65536 * sqrt(6) */ + case 7: return 173392; + case 8: return 185364; + case 9: return 196608; + case 10: return 207243; + case 11: return 217358; + case 12: return 227023; + case 13: return 236293; + case 14: return 245213; + case 15: return 253820; + case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ + default: + return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ + } +} + +static __always_inline __u64 get_next_interval_sqrt(__u64 cnt) +{ + __u64 val = (__u64)CODEL_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); + return val; +} + +static __always_inline __u64 +codel_control_law(__u64 t, __u64 cnt) +{ + return t + get_next_interval_sqrt(cnt); +} + +static __always_inline +bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = CODEL_EXCEED_INTERVAL; + + if (t_queue_sz < CODEL_TARGET) { + 
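+		/* Note: t_queue_sz is the backlog measured in time, i.e.
+		 * how far the scheduled departure horizon already sits in
+		 * the future. Classic CoDel tracks per-packet sojourn
+		 * time; this EDT variant compares the time-queue length
+		 * directly against the same target.
+		 */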
/* went below so we'll stay below for at least interval */ + codel->first_above_time = 0; + return false; + } + + if (codel->first_above_time == 0) { + /* just went above from below. If we stay above + * for at least interval we'll say it's ok to drop + */ + codel->first_above_time = now + interval; + return false; + } else if (now >= codel->first_above_time) { + return true; + } + return false; +} + +static __always_inline +bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) +{ + __u64 interval = CODEL_EXCEED_INTERVAL; + + /* If horizon have been exceed for a while, inc drop intensity*/ + bool drop = codel_should_drop(codel, t_queue_sz, now); + + if (codel->dropping) { /* In dropping state */ + if (!drop) { + /* time below target - leave dropping state */ + codel->dropping = false; + return false; + } else if (now >= codel->drop_next) { + /* It's time for the next drop. Drop the current + * packet. Schedule the next drop + */ + codel->count += 1; + // schedule the next drop. + codel->drop_next = + codel_control_law(codel->drop_next, codel->count); + return true; + } + } else if (drop && + ((now - codel->drop_next < interval) || + (now - codel->first_above_time >= interval))) { + /* If we get here, then we're not in dropping state. + * Decide whether it's time to enter dropping state. + */ + __u32 count = codel->count; + + codel->dropping = true; + + /* If we're in a drop cycle, drop rate that controlled queue + * on the last cycle is a good starting point to control it now. + */ + if (now - codel->drop_next < interval) + count = count > 2 ? (count - 2) : 1; + else + count = 1; + + codel->count = count; + codel->drop_next = codel_control_law(now, count); + return true; + } + return false; +} + +#endif /* __CODEL_IMPL_H */ diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index fc21ed4..8a0d54d 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -69,18 +69,11 @@ char _license[] SEC("license") = "GPL"; #define T_HORIZON_ECN (5 * 1000 * 1000ULL) -/* Codel like dropping scheme, inspired by: - * - RFC: https://queue.acm.org/detail.cfm?id=2209336 - * - Code: https://queue.acm.org/appendices/codel.html - * - Kernel: include/net/codel_impl.h - */ -struct codel_state { - /* codel like dropping scheme */ - __u64 first_above_time; /* Time when above target (0 if below)*/ - __u64 drop_next; /* Time to drop next packet */ - __u32 count; /* Packets dropped since going into drop state */ - __u32 dropping; /* Equal to 1 if in drop state */ -}; +#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ + +#define CODEL_TARGET T_HORIZON_TARGET +#define CODEL_EXCEED_INTERVAL T_EXCEED_INTERVAL +#include "codel_impl.h" struct edt_val { __u64 rate; @@ -101,117 +94,6 @@ struct bpf_elf_map SEC("maps") time_delay_map = { //.pinning = PIN_GLOBAL_NS, }; -/* */ -#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ - -/* Table lookup for square-root shifted 16 bit */ -static __always_inline __u32 get_sqrt_sh16(__u64 cnt) -{ - switch (cnt) { - case 1: return 65536; /* 65536 * sqrt(1) */ - case 2: return 92682; /* 65536 * sqrt(2) */ - case 3: return 113512; /* 65536 * sqrt(3) */ - case 4: return 131072; /* 65536 * sqrt(4) */ - case 5: return 146543; /* 65536 * sqrt(5) */ - case 6: return 160530; /* 65536 * sqrt(6) */ - case 7: return 173392; - case 8: return 185364; - case 9: return 196608; - case 10: return 207243; - case 11: return 217358; - case 12: return 227023; - case 13: return 236293; - case 14: return 
245213; - case 15: return 253820; - case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ - default: - return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ - } -} - -static __always_inline __u64 get_next_interval_sqrt(__u64 cnt) -{ - __u64 val = (__u64)T_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt); - return val; -} - -static __always_inline __u64 -codel_control_law(__u64 t, __u64 cnt) -{ - return t + get_next_interval_sqrt(cnt); -} - -static __always_inline -bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) -{ - __u64 interval = T_EXCEED_INTERVAL; - - if (t_queue_sz < T_HORIZON_TARGET) { - /* went below so we'll stay below for at least interval */ - codel->first_above_time = 0; - return false; - } - - if (codel->first_above_time == 0) { - /* just went above from below. If we stay above - * for at least interval we'll say it's ok to drop - */ - codel->first_above_time = now + interval; - return false; - } else if (now >= codel->first_above_time) { - return true; - } - return false; -} - -static __always_inline -bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now) -{ - __u64 interval = T_EXCEED_INTERVAL; - - /* If horizon have been exceed for a while, inc drop intensity*/ - bool drop = codel_should_drop(codel, t_queue_sz, now); - - if (codel->dropping) { /* In dropping state */ - if (!drop) { - /* time below target - leave dropping state */ - codel->dropping = false; - return false; - } else if (now >= codel->drop_next) { - /* It's time for the next drop. Drop the current - * packet. Schedule the next drop - */ - codel->count += 1; - // schedule the next drop. - codel->drop_next = - codel_control_law(codel->drop_next, codel->count); - return true; - } - } else if (drop && - ((now - codel->drop_next < interval) || - (now - codel->first_above_time >= interval))) { - /* If we get here, then we're not in dropping state. - * Decide whether it's time to enter dropping state. - */ - __u32 count = codel->count; - - codel->dropping = true; - - /* If we're in a drop cycle, drop rate that controlled queue - * on the last cycle is a good starting point to control it now. - */ - if (now - codel->drop_next < interval) - count = count > 2 ? (count - 2) : 1; - else - count = 1; - - codel->count = count; - codel->drop_next = codel_control_law(now, count); - return true; - } - return false; -} - /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. */ From 3e0ac4f24dc62ea3f31ed9d72e51dd901bfc7858 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 14:02:52 +0100 Subject: [PATCH 37/61] Cleanup some comments Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 8a0d54d..84f7e55 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -13,28 +13,7 @@ char _license[] SEC("license") = "GPL"; #define NS_PER_SEC 1000000000 -//#define RATE_IN_BITS (998 * 1000 * 1000ULL) - -/* Test different rates in production machine, and measure iperf3 TCP-goodput */ -//#define RATE_IN_BITS (800 * 1000 * 1000ULL)// prod: 765 Mbits/sec (stable) -//#define RATE_IN_BITS (900 * 1000 * 1000ULL)// prod: 861 Mbits/sec (stable) -///#define RATE_IN_BITS (950 * 1000 * 1000ULL)// prod: 908 Mbits/sec (stable) -//#define RATE_IN_BITS (960 * 1000 * 1000ULL)// prod: 918 Mbits/sec -//#define RATE_IN_BITS (970 * 1000 * 1000ULL)// prod: 928 Mbits/sec -//#define RATE_IN_BITS (980 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) -//#define RATE_IN_BITS (990 * 1000 * 1000ULL)// prod: 920 Mbits/sec (unstable) -//#define RATE_IN_BITS (999 * 1000 * 1000ULL)// prod: (unstable) - -/* Per packet overhead: two VLAN headers == 8 bytes - * - * skb->wire_len doesn't seem to take the two VLAN headers into - * account. Loading BPF-prog on VLAN net_device is can only see 1 - * VLAN, and this is likely HW offloaded into skb->vlan. - */ -//#define OVERHEAD (8) - - -/* New strategy: Shape at MAC (Medium Access Control) layer with Ethernet +/* Strategy: Shape at MAC (Medium Access Control) layer with Ethernet * * Production use-case is pacing traffic at 1Gbit/s wirespeed, using a * 10Gbit/s NIC, because 1G end-user switch cannot handle bursts. @@ -57,18 +36,15 @@ char _license[] SEC("license") = "GPL"; //#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */ #define ETH_MIN (84) -/* skb->len in bytes, thus easier to keep rate in bytes */ +/* skb->len in bytes, thus convert rate to bytes */ #define RATE_IN_BYTES (RATE_IN_BITS / 8) -//#define T_HORIZON_DROP (2000 * 1000 * 1000ULL) -//#define T_HORIZON_DROP (200000 * 1000 * 1000ULL) - +/* Controlling how large queue (in time) is allow to grow */ #define T_HORIZON_DROP (15 * 1000 * 1000ULL) - #define T_HORIZON_TARGET (10 * 1000 * 1000ULL) - #define T_HORIZON_ECN (5 * 1000 * 1000ULL) +/* Codel: If queue exceed target for more than one interval, start dropping */ #define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ #define CODEL_TARGET T_HORIZON_TARGET From 60a851c2a00513b8c5d0fe7b7a51a7b081666cd4 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 14:22:29 +0100 Subject: [PATCH 38/61] Now that codel works adjust horizons The hard drop horizon (T_HORIZON_DROP) can be increased (to 40ms) as codel target latency (T_HORIZON_TARGET) is taking care of signaling TCP downloads via drops (after codel scheme). Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/edt_pacer02.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 84f7e55..269835b 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -40,9 +40,9 @@ char _license[] SEC("license") = "GPL"; #define RATE_IN_BYTES (RATE_IN_BITS / 8) /* Controlling how large queue (in time) is allow to grow */ -#define T_HORIZON_DROP (15 * 1000 * 1000ULL) -#define T_HORIZON_TARGET (10 * 1000 * 1000ULL) -#define T_HORIZON_ECN (5 * 1000 * 1000ULL) +#define T_HORIZON_DROP (40 * 1000 * 1000ULL) +#define T_HORIZON_TARGET (5 * 1000 * 1000ULL) +#define T_HORIZON_ECN (1 * 1000 * 1000ULL) /* Codel: If queue exceed target for more than one interval, start dropping */ #define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/ From 2786f8af65b274a9aa0e9f470484c14303d83f8b Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 14:38:48 +0100 Subject: [PATCH 39/61] Extend the sqrt lookup table with more entries Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/codel_impl.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/codel_impl.h b/traffic-pacing-edt/codel_impl.h index 6970646..549dc61 100644 --- a/traffic-pacing-edt/codel_impl.h +++ b/traffic-pacing-edt/codel_impl.h @@ -42,8 +42,28 @@ static __always_inline __u32 get_sqrt_sh16(__u64 cnt) case 14: return 245213; case 15: return 253820; case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */ + case 17: return 270212; + case 18: return 278046; + case 19: return 285664; + case 20: return 293086; + case 21: return 300324; + case 22: return 307391; + case 23: return 314300; + case 24: return 321060; + case 25: return 327680; /* 100 ms / sqrt(25) = 20 ms */ + case 26: return 334169; + case 27: return 340535; + case 28: return 346784; + case 29: return 352922; + case 30: return 358955; + case 31: return 364889; + case 32: return 370728; + case 33: return 376476; + case 34: return 382137; + case 35: return 387716; + case 36: return 393216; /* 100 / sqrt(36) = 16.66 ms */ default: - return 370728; /* 65536*sqrt(32) => 100/sqrt(32) = 17.68 ms */ + return 463410; /* 65536*sqrt(50) => 100/sqrt(50) = 14.14 ms */ } } From 3248b602486a32e118d267551999f359e20607d6 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 15:09:12 +0100 Subject: [PATCH 40/61] Do EDT pacing on all inner VLAN ids Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 269835b..eb1b997 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -66,14 +66,14 @@ struct bpf_elf_map SEC("maps") time_delay_map = { .type = BPF_MAP_TYPE_ARRAY, .size_key = sizeof(__u32), .size_value = sizeof(struct edt_val), - .max_elem = 1, + .max_elem = 4096, /* Max possible VLANs */ //.pinning = PIN_GLOBAL_NS, }; /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. 
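 * The BPF-prog never holds packets itself: it only sets skb->tstamp
 * to the computed departure time, and relies on the FQ qdisc to
 * delay each packet until its timestamp is reached.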
*/ -static __always_inline int sched_departure(struct __sk_buff *skb) +static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) { struct edt_val *edt; __u64 t_queue_sz; @@ -81,7 +81,6 @@ __u64 wire_len; __u64 t_next; __u64 t_curr; - int key = 0; __u64 now; edt = bpf_map_lookup_elem(&time_delay_map, &key); @@ -253,9 +252,8 @@ SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb) vlan_key = extract_vlan_key(skb, &vlans); - /* For-now: Match on vlan16 and only apply EDT on that */ - if (vlan_key == 16) - return sched_departure(skb); + /* Each (inner) VLAN id gets its own EDT pacing */ + return sched_departure(skb, vlan_key); out: return ret; From dea36b9d8fd919202a31dae98d51c737df96fd7d Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Sat, 28 Nov 2020 15:45:52 +0100 Subject: [PATCH 41/61] Add practical script for loading on all outer VLAN devices Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/vlans_load_edt.sh | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 traffic-pacing-edt/vlans_load_edt.sh diff --git a/traffic-pacing-edt/vlans_load_edt.sh b/traffic-pacing-edt/vlans_load_edt.sh new file mode 100755 index 0000000..3dc1fab --- /dev/null +++ b/traffic-pacing-edt/vlans_load_edt.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Script for loading EDT-pacer BPF-prog on all downstream VLANs +# +basedir=`dirname $0` +source ${basedir}/functions.sh + +root_check_run_with_sudo "$@" + +# Use common parameters +source ${basedir}/parameters.sh + +# Default verbose +VERBOSE=1 + +# Downstream dev: ens6f0 +VLAN_START=168 +VLAN_END=205 + +cmd=${basedir}/bpf_egress_loader.sh + +options="" + +if [[ -n $REMOVE ]]; then + options+=" --remove" +fi +if [[ -n $DRYRUN ]]; then + options+=" --dry-run" + #cmd="echo $cmd" +fi +if [[ -n $VERBOSE ]]; then + options+=" --verbose" +fi + +for (( vlan=${VLAN_START}; vlan<=${VLAN_END}; vlan++ )) +do + VLAN=${DEV}.$vlan + $cmd --dev $VLAN $options +done From 93116e0fb25dd501a1ab7b1b99e4c455acb06b6d Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Mon, 30 Nov 2020 12:41:48 +0100 Subject: [PATCH 42/61] Add bpftrace dir and program developed last night Signed-off-by: Jesper D. Brouer --- .../bpftrace/edt_tstamp_diff.bt | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt new file mode 100755 index 0000000..15c3c4d --- /dev/null +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt @@ -0,0 +1,31 @@ +#!/usr/local/bin/bpftrace + +#include + +/* Measure time difference between EDT-time and real "NIC" TX-time. + * + * Assuming packets are EDT timestamped by the BPF-program, we can + * detect/measure how accurately packets are actually transmitted + * towards the NIC driver, by comparing EDT-time against "now" + * timestamp in the function transmitting to the NIC driver.
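+ * A "late" diff means the packet hit the driver after its EDT
+ * timestamp; an "ahead" diff means it was released early. Both are
+ * turned into log2 histograms in usec below. Note (an assumption
+ * about bpftrace semantics): scratch variables default to 0, so the
+ * branch not taken still records a zero-bucket entry in its map.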
+ */ + +// tracepoint:net:net_dev_start_xmit +tracepoint:net:net_dev_xmit +{ + $skb = (struct sk_buff *)args->skbaddr; + //$tstamp = (uint64)$skb->tstamp; + $tstamp = $skb->skb_mstamp_ns; + $now = nsecs; + + // if ($skb->mark > 0) { + if ($tstamp > 0) { + if ($now >= $tstamp) { + $diff_late = $now - $tstamp; + } else { + $diff_ahead = $tstamp - $now; + } + @tstamp_diff_late = hist($diff_late / 1000); + @tstamp_diff_ahead = hist($diff_ahead / 1000); + } +} From 381dd9a512dfaf64d875c2a136e37218bbccfe77 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Mon, 30 Nov 2020 12:43:14 +0100 Subject: [PATCH 43/61] Add more advanced version of script edt_tstamp_diff_advanced.bt Signed-off-by: Jesper D. Brouer --- .../bpftrace/edt_tstamp_diff_advanced.bt | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100755 traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt new file mode 100755 index 0000000..0029055 --- /dev/null +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt @@ -0,0 +1,59 @@ +#!/usr/local/bin/bpftrace + +#include + +// tracepoint:net:net_dev_start_xmit +tracepoint:net:net_dev_xmit +{ + $skb = (struct sk_buff *)args->skbaddr; + //$tstamp = (uint64)$skb->tstamp; + $tstamp = $skb->skb_mstamp_ns; + $now = nsecs; + +// if ($skb->mark > 0) { + if ($tstamp > 0) { + if ($now >= $tstamp) { + $diff_late = $now - $tstamp; + } else { + $diff_ahead = $tstamp - $now; + } + @tstamp_usec_diff_late = hist($diff_late / 1000); + @tstamp_usec_diff_ahead = hist($diff_ahead / 1000); + } + + //$period = $now / 10000; /* 10000 = 10 usec */ + $period = $now / 30000; /* 30000 = 30 usec */ + if (@state[cpu] == $period) { + @state_bytes[cpu] += $skb->len; + } else { + @state[cpu] = $period; + if (@state_bytes[cpu] > 0) { + @byte_burst[cpu] = hist(@state_bytes[cpu]); + } + @state_bytes[cpu] = $skb->len; /* Reset counter */ + } +} + +tracepoint:qdisc:qdisc_dequeue +{ + @qdisc_bulk_dequeue = lhist(args->packets, 0,64,1); +} + + +/* +kretfunc:dev_hard_start_xmit +{ +// Wanted to know if ret == NETDEV_TX_BUSY +# ERROR: kfunc/kretfunc not available for your linked against bcc version. +} +*/ + +kprobe:qdisc_watchdog_schedule_range_ns +{ + @qdisc_watchdog[cpu] = count(); +} + +kprobe:__netif_schedule +{ + @__netif_schedule[cpu] = count(); +} From b84b89dc4baee4dfbb0604ba76571aff15b537a6 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 09:34:18 +0100 Subject: [PATCH 44/61] bpftrace/edt_tstamp_diff_advanced.bt: add doc comments Also found measurement tool can disturb timing. I might have to write this in BPF-C directly to avoid overhead. Signed-off-by: Jesper D. Brouer --- .../bpftrace/edt_tstamp_diff_advanced.bt | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt index 0029055..add3270 100755 --- a/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt +++ b/traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt @@ -21,6 +21,14 @@ tracepoint:net:net_dev_xmit @tstamp_usec_diff_ahead = hist($diff_ahead / 1000); } + /* Capture burstiness over a time period, by dividing nanosec + * timestamp with wanted period, and keeping state byte counter as + * long as timestamp match. 
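+ * Worked example: at the 1 Gbit/s production pacing rate, a 30 usec
+ * period corresponds to roughly 125000000 B/s * 30e-6 s = 3750
+ * bytes, so @byte_burst buckets far above that indicate bursty
+ * dequeueing rather than smooth pacing.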
+ * + * Practical usage shows that bpftrace uses a hash-map to implement + * this, which unfortunately costs too much (shows 5% jhash cpu + * usage), enough overhead to change the behavior of the prod system. + */ //$period = $now / 10000; /* 10000 = 10 usec */ $period = $now / 30000; /* 30000 = 30 usec */ if (@state[cpu] == $period) { @@ -34,11 +42,12 @@ tracepoint:net:net_dev_xmit } } +/* tracepoint:qdisc:qdisc_dequeue { @qdisc_bulk_dequeue = lhist(args->packets, 0,64,1); } - +*/ /* kretfunc:dev_hard_start_xmit { // Wanted to know if ret == NETDEV_TX_BUSY # ERROR: kfunc/kretfunc not available for your linked against bcc version. } */ + + +/* How often does FQ-pacer find that no packets are qualified to be + * scheduled, which leads to scheduling an hrtimer event that will + * restart the qdisc at a later time. + * + * We cannot kprobe fq_dequeue as it is a module. + */ + +/* kprobe:qdisc_watchdog_schedule_range_ns { @qdisc_watchdog[cpu] = count(); @@ -57,3 +75,4 @@ kprobe:__netif_schedule { @__netif_schedule[cpu] = count(); } +*/ From 79466715cfac0479307b3d20e9c1054a2bc58e02 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 10:07:25 +0100 Subject: [PATCH 45/61] traffic-pacing-edt: Use SKB->mark to identify different stages This can be used by bpftrace programs to identify the different stages when trying to determine the EDT accuracy. Signed-off-by: Jesper D.
Brouer --- traffic-pacing-edt/edt_pacer02.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index eb1b997..dc8321b 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -135,6 +135,7 @@ static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) WRITE_ONCE(edt->t_last, t_curr_next); skb->tstamp = t_curr_next; + skb->mark = 1; /* No queue - add minimum delay */ #else WRITE_ONCE(edt->t_last, t_curr); #endif @@ -156,10 +157,14 @@ static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) // if (codel_drop(edt, t_queue_sz, now)) if (codel_drop(&edt->codel, t_queue_sz, t_next)) return BPF_DROP; + + skb->mark = 2; /* (time) queue exists - and is small/below T_HORIZON_ECN */ /* ECN marking horizon */ - if (t_queue_sz >= T_HORIZON_ECN) + if (t_queue_sz >= T_HORIZON_ECN) { + skb->mark = 3; /* (time) queue exists - and is large */ bpf_skb_ecn_set_ce(skb); + } /* Advance "time queue" */ WRITE_ONCE(edt->t_last, t_next); From 23f73c86ac1c2ec7cf6c4d8e0a0b421fbb5d8bef Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 12:13:24 +0100 Subject: [PATCH 46/61] traffic-pacing-edt: Use bpf_ktime_get_boot_ns The bpftrace programs use bpf_ktime_get_boot_ns as the underlying clock for the 'nsecs' keyword. Switch the TC-BPF prog to use the same, to make sure that we don't report false results when detecting/measuring EDT accuracy. Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/edt_pacer02.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index dc8321b..5b89d4a 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -102,7 +102,8 @@ static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key) // t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES; // t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate; - now = bpf_ktime_get_ns(); + // now = bpf_ktime_get_ns(); + now = bpf_ktime_get_boot_ns(); /* Use same ktime as bpftrace */ /* Allow others to set skb tstamp prior to us */ t_curr = skb->tstamp; From 048c960756eb65301a72d2d7c41218906bd63204 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 14:27:10 +0100 Subject: [PATCH 47/61] iproute2 tc util has recently gotten libbpf support Implement a configure script that detects support, and Makefile defines that propagate to the BPF-C file, making it possible to use and compile with BTF type maps. Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 13 ++++++++++++- traffic-pacing-edt/configure | 29 ++++++++++++++++++++++++++++ traffic-pacing-edt/edt_pacer02.c | 22 +++++++++++++++++---- traffic-pacing-edt/iproute2_compat.h | 6 ++++++ 4 files changed, 65 insertions(+), 5 deletions(-) create mode 100755 traffic-pacing-edt/configure diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index cb3def9..73c1306 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -4,11 +4,20 @@ USER_TARGETS := BPF_TARGETS := edt_pacer01 BPF_TARGETS += edt_pacer02 +EXTRA_DEPS += config.mk + LIB_DIR = ../lib include $(LIB_DIR)/common.mk +include config.mk -# The iproute2 'tc' tool doesn't understand BTF debug info +all: config.mk + +config.mk: configure + @sh configure + +ifndef HAVE_TC_LIBBPF +# If the iproute2 'tc' tool doesn't understand BTF debug info # use llvm-strip to remove this debug info from object file # # *BUT* cannot strip everything as it removes ELF elems needed for # .PHONY: strip_tc_obj strip_tc_obj: ${BPF_TARGETS:=.o} + $(Q) echo "TC doesn't support libbpf - strip BTF info" $(Q) llvm-strip --no-strip-all --remove-section .BTF $?
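# For reference, the configure script's detection (shown in the next
# patch hunk) boils down to inspecting 'tc -V' output; a quick manual
# equivalent (sketch, relying on 'tc -V' printing a libbpf version
# when built against it):
#   tc -V | grep -q libbpf && echo "tc has libbpf support"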
all: strip_tc_obj +endif diff --git a/traffic-pacing-edt/configure b/traffic-pacing-edt/configure new file mode 100755 index 0000000..9b01369 --- /dev/null +++ b/traffic-pacing-edt/configure @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# This is not an autoconf-generated configure +# + +# Output file which is input to Makefile +CONFIG=config.mk + +# Assume tc is in $PATH +TC=tc + +check_tc_libbpf() +{ + tc_version=$($TC -V) + if echo $tc_version | grep -q libbpf; then + libbpf_version=${tc_version##*libbpf } + echo "HAVE_TC_LIBBPF:=y" >> $CONFIG + echo "CFLAGS += -DHAVE_LIBBPF" >> $CONFIG + echo "yes ($libbpf_version)" + else + echo "no" + fi +} + +echo "# Generated config" > $CONFIG +echo "Detecting available features on system" + +echo -n " - libbpf support in tc tool: " +check_tc_libbpf diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer02.c index 5b89d4a..a361079 100644 --- a/traffic-pacing-edt/edt_pacer02.c +++ b/traffic-pacing-edt/edt_pacer02.c @@ -2,7 +2,6 @@ #include #include #include -#include "iproute2_compat.h" #include @@ -59,16 +58,31 @@ struct edt_val { struct codel_state codel; } __aligned(64); /* Align struct to cache-size to avoid false-sharing */ -/* The tc tool (iproute2) use another ELF map layout than libbpf (struct - * bpf_map_def), see struct bpf_elf_map from iproute2. +#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */ +/* Use BTF format to create map */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4096); /* Max possible VLANs */ + __type(key, __u32); + __type(value, struct edt_val); +// __uint(pinning, LIBBPF_PIN_BY_NAME); +} time_delay_map SEC(".maps"); + +#else +/* The (iproute2) tc tool (without libbpf support) uses another ELF map + * layout than libbpf (struct bpf_map_def), see struct bpf_elf_map + * from iproute2. */ +#include "iproute2_compat.h" struct bpf_elf_map SEC("maps") time_delay_map = { .type = BPF_MAP_TYPE_ARRAY, .size_key = sizeof(__u32), .size_value = sizeof(struct edt_val), .max_elem = 4096, /* Max possible VLANs */ - //.pinning = PIN_GLOBAL_NS, +// .pinning = PIN_GLOBAL_NS, }; +#endif + /* Role of EDT (Earliest Departure Time) is to schedule departure of packets to * be send in the future. diff --git a/traffic-pacing-edt/iproute2_compat.h b/traffic-pacing-edt/iproute2_compat.h index a535f5f..3d72546 100644 --- a/traffic-pacing-edt/iproute2_compat.h +++ b/traffic-pacing-edt/iproute2_compat.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Taken from #include */ #ifndef __IPROUTE2_COMPAT_H #define __IPROUTE2_COMPAT_H @@ -8,6 +9,11 @@ * binary layout until "flags". Thus, BPF-progs can use both if careful. */ +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + /* ELF map definition (copied from iproute2 source code) */ struct bpf_elf_map { __u32 type; From 9d52254be6bb5a148c26f6908a8639a40152cd4d Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 15:29:04 +0100 Subject: [PATCH 48/61] traffic-pacing-edt: rename edt_pacer02.c to edt_pacer_vlan.c Signed-off-by: Jesper D.
Brouer --- traffic-pacing-edt/{edt_pacer02.c => edt_pacer_vlan.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename traffic-pacing-edt/{edt_pacer02.c => edt_pacer_vlan.c} (100%) diff --git a/traffic-pacing-edt/edt_pacer02.c b/traffic-pacing-edt/edt_pacer_vlan.c similarity index 100% rename from traffic-pacing-edt/edt_pacer02.c rename to traffic-pacing-edt/edt_pacer_vlan.c From 5aab70b25dc03571a6376920122addbc23345c50 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 15:31:08 +0100 Subject: [PATCH 49/61] traffic-pacing-edt: Adjust after file rename Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 2 +- traffic-pacing-edt/bpf_egress_loader.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index 73c1306..4190dfe 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -2,7 +2,7 @@ USER_TARGETS := BPF_TARGETS := edt_pacer01 -BPF_TARGETS += edt_pacer02 +BPF_TARGETS += edt_pacer_vlan EXTRA_DEPS += config.mk diff --git a/traffic-pacing-edt/bpf_egress_loader.sh b/traffic-pacing-edt/bpf_egress_loader.sh index 934117d..efaf597 100755 --- a/traffic-pacing-edt/bpf_egress_loader.sh +++ b/traffic-pacing-edt/bpf_egress_loader.sh @@ -16,7 +16,7 @@ export TC=tc # This can be changed via --file or --obj if [[ -z ${BPF_OBJ} ]]; then # Fallback default - BPF_OBJ=edt_pacer02.o + BPF_OBJ=edt_pacer_vlan.o fi info "Applying TC-BPF egress setup on device: $DEV with object file: $BPF_OBJ" From e7401bb5004fa14af00d48d9a8bf9239f4f4fd17 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 1 Dec 2020 15:36:54 +0100 Subject: [PATCH 50/61] traffic-pacing-edt: Remove test program edt_pacer01.c Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 3 +-- traffic-pacing-edt/edt_pacer01.c | 40 -------------------------------- 2 files changed, 1 insertion(+), 42 deletions(-) delete mode 100644 traffic-pacing-edt/edt_pacer01.c diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index 4190dfe..09cdd24 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) USER_TARGETS := -BPF_TARGETS := edt_pacer01 -BPF_TARGETS += edt_pacer_vlan +BPF_TARGETS := edt_pacer_vlan EXTRA_DEPS += config.mk diff --git a/traffic-pacing-edt/edt_pacer01.c b/traffic-pacing-edt/edt_pacer01.c deleted file mode 100644 index 044158f..0000000 --- a/traffic-pacing-edt/edt_pacer01.c +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include -#include "iproute2_compat.h" - -char _license[] SEC("license") = "GPL"; - -/* The tc tool (iproute2) use another ELF map layout than libbpf (struct - * bpf_map_def), see struct bpf_elf_map from iproute2. - */ -struct bpf_elf_map SEC("maps") cnt_map = { - .type = BPF_MAP_TYPE_ARRAY, - .size_key = sizeof(__u32), - .size_value = sizeof(__u64), - .max_elem = 1, - //.pinning = PIN_GLOBAL_NS, -}; - -SEC("classifier") int tc_dummy(struct __sk_buff *skb) -{ - volatile void *data, *data_end; - int ret = BPF_OK; - struct ethhdr *eth; - - data = (void *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - eth = (struct ethhdr *)data; - - if (data + sizeof(*eth) > data_end) - return BPF_DROP; - - /* Keep ARP resolution working */ - if (eth->h_proto == bpf_htons(ETH_P_ARP)) { - ret = BPF_OK; - goto out; - } - - out: - return ret; -} From 89aeeafa0e40425d05ee5325b89458d163f9d5dc Mon Sep 17 00:00:00 2001 From: "Jesper D. 
Brouer" Date: Tue, 15 Dec 2020 16:34:26 +0100 Subject: [PATCH 51/61] Update UAPI header file bpf.h I need the struct bpf_cpumap_val definition for the next example. Signed-off-by: Jesper D. Brouer --- headers/linux/bpf.h | 818 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 689 insertions(+), 129 deletions(-) diff --git a/headers/linux/bpf.h b/headers/linux/bpf.h index b9ed9f1..1bc3738 100644 --- a/headers/linux/bpf.h +++ b/headers/linux/bpf.h @@ -81,6 +81,12 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type */ }; +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -117,6 +123,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -189,6 +196,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, }; enum bpf_attach_type { @@ -226,6 +234,10 @@ enum bpf_attach_type { BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -238,6 +250,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -603,9 +616,14 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ @@ -618,6 +636,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; @@ -653,7 +675,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +693,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +717,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. 
It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -745,7 +767,7 @@ union bpf_attr { * * Also, note that **bpf_trace_printk**\ () is slow, and should * only be used for debugging purposes. For this reason, a notice - * bloc (spanning several lines) is printed to kernel logs and + * block (spanning several lines) is printed to kernel logs and * states that the helper should not be used "for production use" * the first time this helper is used (or more precisely, when * **trace_printk**\ () buffers are allocated). For passing values @@ -775,7 +797,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +814,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +839,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the @@ -849,7 +871,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +902,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +938,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +975,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +991,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * @@ -981,7 +1003,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1011,14 +1033,14 @@ union bpf_attr { * * int ret; * struct bpf_tunnel_key key = {}; - * + * * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); * if (ret < 0) * return TC_ACT_SHOT; // drop packet - * + * * if (key.remote_ipv4 != 0x0a000001) * return TC_ACT_SHOT; // drop packet - * + * * return TC_ACT_OK; // accept packet * * This interface can also be used with all encapsulation devices @@ -1032,7 +1054,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1120,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1125,7 +1147,7 @@ union bpf_attr { * Description * Retrieve the realm or the route, that is to say the * **tclassid** field of the destination for the *skb*. The - * indentifier retrieved is a user-provided tag, similar to the + * identifier retrieved is a user-provided tag, similar to the * one used with the net_cls cgroup (see description for * **bpf_get_cgroup_classid**\ () helper), but here this tag is * held by a route (a destination entry), not by a task. @@ -1145,7 +1167,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1212,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1229,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. 
To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1298,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1316,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1326,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1353,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1380,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1411,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1430,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run is the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1442,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1466,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes @@ -1500,7 +1522,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1533,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1554,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1569,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1617,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1621,20 +1643,30 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. 
* + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -1669,7 +1701,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1690,7 +1722,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1701,7 +1733,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1720,7 +1752,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1749,7 +1781,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1799,7 +1831,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1810,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1835,7 +1867,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
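 *
 *		A short sketch for **bpf_getsockopt**\ () (editor's
 *		addition, not from the kernel sources), reading the
 *		congestion control name of the current connection from
 *		a **BPF_PROG_TYPE_SOCK_OPS** program; the 16-byte
 *		buffer matches TCP_CA_NAME_MAX:
 *
 *		::
 *
 *			SEC("sockops")
 *			int log_cong(struct bpf_sock_ops *skops)
 *			{
 *				char cc[16] = {};
 *
 *				if (!bpf_getsockopt(skops, IPPROTO_TCP,
 *						    TCP_CONGESTION,
 *						    cc, sizeof(cc)))
 *					bpf_printk("cc: %s", cc);
 *				return 1;
 *			}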
* - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1860,7 +1892,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1904,7 +1936,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1918,7 +1950,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. @@ -1952,7 +1984,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1970,7 +2002,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2001,7 +2033,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2019,7 +2051,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2033,7 +2065,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. @@ -2049,7 +2081,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
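 *
 *		A minimal sketch for **bpf_xdp_adjust_tail**\ () above
 *		(editor's addition, not from the kernel sources),
 *		shrinking frames to an arbitrary 256-byte snap length;
 *		a negative *delta* trims the tail:
 *
 *		::
 *
 *			SEC("xdp")
 *			int xdp_snap(struct xdp_md *ctx)
 *			{
 *				int len = ctx->data_end - ctx->data;
 *
 *				if (len > 256 &&
 *				    bpf_xdp_adjust_tail(ctx, 256 - len) < 0)
 *					return XDP_ABORTED;
 *				return XDP_PASS;
 *			}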
* - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2082,7 +2114,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2104,7 +2136,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2135,7 +2167,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2154,7 +2186,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2168,7 +2200,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2182,7 +2214,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2219,7 +2251,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2234,7 +2266,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
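 *
 *		A minimal sketch of the **bpf_get_stack**\ () usage
 *		described above (editor's addition, not from the kernel
 *		sources), collecting up to 32 user-space frames from a
 *		tracing program:
 *
 *		::
 *
 *			__u64 ips[32];
 *			long len;
 *
 *			len = bpf_get_stack(ctx, ips, sizeof(ips),
 *					    BPF_F_USER_STACK);
 *			if (len < 0)
 *				return 0; // stack unavailable
 *			// len bytes of instruction pointers now in ips[]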
* - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2250,7 +2282,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2279,7 +2311,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2298,7 +2330,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2363,7 +2395,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2408,7 +2440,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2445,7 +2477,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2464,7 +2496,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2472,7 +2504,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2482,19 +2514,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
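 *
 *		A sketch of the lookup/release contract for
 *		**bpf_sk_lookup_tcp**\ () and **bpf_sk_release**\ ()
 *		above (editor's addition, not from the kernel sources);
 *		every non-NULL result must be released before the
 *		program exits, or the verifier rejects the program:
 *
 *		::
 *
 *			struct bpf_sock_tuple tuple = {};
 *			struct bpf_sock *sk;
 *
 *			tuple.ipv4.daddr = bpf_htonl(0x0a000001); // 10.0.0.1
 *			tuple.ipv4.dport = bpf_htons(80);
 *
 *			sk = bpf_sk_lookup_tcp(skb, &tuple,
 *					       sizeof(tuple.ipv4),
 *					       BPF_F_CURRENT_NETNS, 0);
 *			if (sk)
 *				bpf_sk_release(sk);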
* - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2510,7 +2542,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2522,7 +2554,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2536,7 +2568,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2584,7 +2616,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2607,7 +2639,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2644,7 +2676,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2659,7 +2691,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. * - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2675,7 +2707,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). 
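 *
 *		A minimal sketch of the **bpf_spin_lock**\ () pattern
 *		above (editor's addition, not from the kernel sources);
 *		the lock must be embedded in a map value, and *my_map*
 *		here is hypothetical:
 *
 *		::
 *
 *			struct val {
 *				struct bpf_spin_lock lock;
 *				__u64 counter;
 *			};
 *
 *			struct val *v = bpf_map_lookup_elem(&my_map, &key);
 *
 *			if (v) {
 *				bpf_spin_lock(&v->lock);
 *				v->counter++;
 *				bpf_spin_unlock(&v->lock);
 *			}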
* - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2694,7 +2726,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2711,7 +2743,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2728,7 +2760,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2752,7 +2784,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2803,7 +2835,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2811,7 +2843,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2852,7 +2884,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2876,21 +2908,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. 
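 *
 *		A combined sketch for **bpf_sysctl_get_new_value**\ ()
 *		and **bpf_strtol**\ () above (editor's addition, not
 *		from the kernel sources), clamping a value written to a
 *		sysctl from a **BPF_CGROUP_SYSCTL** program; returning
 *		1 allows the write, 0 rejects it:
 *
 *		::
 *
 *			SEC("cgroup/sysctl")
 *			int sysctl_clamp(struct bpf_sysctl *ctx)
 *			{
 *				char buf[16] = {};
 *				long val;
 *
 *				if (bpf_sysctl_get_new_value(ctx, buf,
 *							     sizeof(buf)) < 0)
 *					return 1; // not a write, allow
 *				if (bpf_strtol(buf, sizeof(buf), 0, &val) < 0)
 *					return 0; // unparseable, reject
 *				return val <= 4096 ? 1 : 0;
 *			}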
* - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2934,7 +2966,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2942,14 +2974,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2969,7 +3001,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2988,7 +3020,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3000,7 +3032,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3055,8 +3087,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * * Assign the *sk* to the *skb*. 
When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. @@ -3082,6 +3118,56 @@ union bpf_attr { * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. @@ -3090,7 +3176,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3119,7 +3205,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3161,16 +3247,15 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * void *bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. 
- * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3182,20 +3267,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3203,16 +3288,227 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. + * + * long bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. 
Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. 
Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. 
+ * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3349,7 +3645,18 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), \ + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3426,6 +3733,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), @@ -3433,6 +3748,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { @@ -3482,6 +3798,12 @@ enum { BPF_RINGBUF_HDR_SZ = 8, }; +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -3712,6 +4034,32 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + enum sk_action { SK_DROP = 0, SK_PASS, @@ -3840,16 +4188,26 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; } netns; + struct { + __u32 ifindex; + } xdp; }; } __attribute__((aligned(8))); /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ @@ -3924,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. 
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -3932,8 +4320,51 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. @@ -3989,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. 
+ * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4016,6 +4504,63 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { @@ -4198,4 +4743,19 @@ struct bpf_pidns_info { __u32 pid; __u32 tgid; }; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ +struct bpf_sk_lookup { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __u32 remote_port; /* Network byte order */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ From e9c45d7f648f1f1be3217868ee863d8979380e37 Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 15 Dec 2020 16:49:57 +0100 Subject: [PATCH 52/61] traffic-pacing-edt: start working on xdp_cpumap_qinq Signed-off-by: Jesper D. Brouer --- traffic-pacing-edt/Makefile | 1 + traffic-pacing-edt/xdp_cpumap_qinq.c | 60 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 traffic-pacing-edt/xdp_cpumap_qinq.c diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index 09cdd24..fdf2613 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -2,6 +2,7 @@ USER_TARGETS := BPF_TARGETS := edt_pacer_vlan +BPF_TARGETS += xdp_cpumap_qinq EXTRA_DEPS += config.mk diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c new file mode 100644 index 0000000..1fc98b0 --- /dev/null +++ b/traffic-pacing-edt/xdp_cpumap_qinq.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#include +#include /* struct bpf_cpumap_val */ +#include +#include + + +#include + +#define VLAN_MAX_DEPTH 2 +#include + +#define MAX_CPUS 24 + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct { + __uint(type, BPF_MAP_TYPE_CPUMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_cpumap_val)); + __uint(max_entries, MAX_CPUS); +} cpumap SEC(".maps"); + +SEC("xdp") +int xdp_cpumap_qinq(struct xdp_md *ctx) +{ + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; + struct collect_vlans vlans = { 0 }; + struct ethhdr *eth; + __u32 cpu_dest = 0; + __u64 action; + + /* These keep track of the next header type and iterator pointer */ + struct hdr_cursor nh; + int eth_type; + nh.pos = data; + + eth_type = parse_ethhdr_vlan(&nh, data_end, ð, &vlans); + if (eth_type < 0) { + action = XDP_ABORTED; + goto out; + } + + /* Keep ARP resolution working */ + if (eth_type == bpf_htons(ETH_P_ARP)) { + action = XDP_PASS; + goto out; + } + + if (!proto_is_vlan(eth->h_proto)) { + /* Skip non-VLAN frames */ + action = XDP_PASS; + goto out; + } + + // WARNING: Userspace MUST insert entries into cpumap + action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS); +out: + return action; +} From c8682ec27f2f6e20db9bfdf2e4aa5673957b2b2e Mon Sep 17 00:00:00 2001 From: "Jesper D. Brouer" Date: Tue, 15 Dec 2020 17:55:24 +0100 Subject: [PATCH 53/61] traffic-pacing-edt: userspace loader for xdp_cpumap_qinq Signed-off-by: Jesper D. 
Brouer --- traffic-pacing-edt/Makefile | 2 +- traffic-pacing-edt/xdp_cpumap_loader.c | 240 +++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 traffic-pacing-edt/xdp_cpumap_loader.c diff --git a/traffic-pacing-edt/Makefile b/traffic-pacing-edt/Makefile index fdf2613..aa6d3aa 100644 --- a/traffic-pacing-edt/Makefile +++ b/traffic-pacing-edt/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) -USER_TARGETS := +USER_TARGETS := xdp_cpumap_loader BPF_TARGETS := edt_pacer_vlan BPF_TARGETS += xdp_cpumap_qinq diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c new file mode 100644 index 0000000..8196bac --- /dev/null +++ b/traffic-pacing-edt/xdp_cpumap_loader.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0+ +static const char *__doc__ = + " XDP load-balancing with CPU-map"; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include /* XDP defines */ + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 +#define EXIT_FAIL_FILE 6 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"qsize", required_argument, NULL, 'q' }, + {"force", no_argument, NULL, 'F' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +static int create_cpu_entry(int cpumap_fd, __u32 cpu, + struct bpf_cpumap_val *value) +{ + int err; + + /* Add a CPU entry to cpumap, as this allocate a cpu entry in + * the kernel for the cpu. 
+	 */
+	err = bpf_map_update_elem(cpumap_fd, &cpu, value, 0);
+	if (err) {
+		fprintf(stderr, "Create CPU entry failed (err:%d)\n", err);
+		exit(EXIT_FAIL_BPF);
+	}
+
+	return 0;
+}
+
+/* Userspace MUST create/populate CPUMAP entries for redirect to work
+ */
+static void enable_all_cpus(int cpumap_fd, __u32 qsize)
+{
+	struct bpf_cpumap_val value = { 0 };
+	int n_cpus = get_nprocs_conf();
+	int i;
+
+	value.qsize = qsize;
+
+	for (i = 0; i < n_cpus; i++) {
+		printf("Enable CPU:%d\n", i);
+		create_cpu_entry(cpumap_fd, i, &value);
+	}
+}
+
+struct bpf_object *do_load_bpf_obj(struct bpf_object *obj)
+{
+	char buf[200];
+	int err;
+
+	err = bpf_object__load(obj);
+	if (err) {
+		libbpf_strerror(err, buf, sizeof(buf));
+		printf("Error loading: %s\n", buf);
+		return NULL;
+	}
+	return obj;
+}
+
+int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags)
+{
+	int prog_fd = bpf_program__fd(prog);
+	int err;
+
+	if (prog_fd < 0) {
+		fprintf(stderr, "bpf_program__fd failed\n");
+		return EXIT_FAIL_BPF;
+	}
+
+	err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
+	if (err) {
+		fprintf(stderr, "link set xdp fd failed (err:%d)\n", err);
+		return EXIT_FAIL_XDP;
+	}
+	return EXIT_OK;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	int opt, longindex = 0;
+	__u32 cfg_qsize = 512;
+	char buf[100];
+	int err;
+
+	struct bpf_object *obj = NULL;
+	struct bpf_program *prog;
+	int cpumap_fd = -1;
+
+	int n_cpus = get_nprocs_conf();
+
+	obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL);
+	err = libbpf_get_error(obj);
+	if (err) {
+		libbpf_strerror(err, buf, sizeof(buf));
+		printf("Error opening file: %s\n", buf);
+		return EXIT_FAIL_FILE;
+	}
+	err = EXIT_OK;
+
+	/* Parse command line args */
+	while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'd':
+			if (strlen(optarg) >= IF_NAMESIZE) {
+				fprintf(stderr, "ERR: --dev name too long\n");
+				goto error;
+			}
+			ifname = (char *)&ifname_buf;
+			strncpy(ifname, optarg, IF_NAMESIZE);
+			ifindex = if_nametoindex(ifname);
+			if (ifindex == 0) {
+				fprintf(stderr,
+					"ERR: --dev name unknown err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			break;
+		case 'q':
+			cfg_qsize = strtol(optarg, NULL, 10);
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		case 'h':
+		error:
+		default:
+			usage(argv);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	/* Required option */
+	if (ifindex == -1) {
+		fprintf(stderr, "ERR: required option --dev missing\n");
+		usage(argv);
+		return EXIT_FAIL_OPTION;
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return EXIT_FAIL_MEM;
+	}
+
+	/* Always use XDP native driver mode */
+	xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+	obj = do_load_bpf_obj(obj);
+	if (!obj)
+		return EXIT_FAIL_BPF;
+
+	/* Pick up first BPF-program */
+	prog = bpf_program__next(NULL, obj);
+	if (!prog) {
+		printf("No program!\n");
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
+
+	/* Get file descriptor to BPF-map */
+	cpumap_fd = bpf_object__find_map_fd_by_name(obj, "cpumap");
+	if (cpumap_fd < 0) {
+		printf("No cpumap found!\n");
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
+	/* Configure cpumap */
+	enable_all_cpus(cpumap_fd, cfg_qsize);
+
+	/* Attach XDP program */
+	err = do_xdp_attach(ifindex, prog, xdp_flags);
+	if (err)
+		goto out;
+
+	printf("Attached XDP program:\"%s\" on netdev:%s (ifindex:%d)\n",
+	       bpf_program__name(prog), ifname, ifindex);
+	printf("CPUs: %d\n", n_cpus);
+
+out:
+	if (obj)
+		bpf_object__close(obj);
+
+	return err;
+}

From b3ebc2c18ce6ce1988bd5cc9a5fca458aec76032 Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Tue, 15 Dec 2020 19:34:03 +0100
Subject: [PATCH 54/61] traffic-pacing-edt: implement option for --remove

Need a quick way to remove the XDP program again before testing on
production.

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_loader.c | 29 +++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c
index 8196bac..fb6f08b 100644
--- a/traffic-pacing-edt/xdp_cpumap_loader.c
+++ b/traffic-pacing-edt/xdp_cpumap_loader.c
@@ -42,6 +42,7 @@ static const struct option long_options[] = {
 	{"dev",		required_argument,	NULL, 'd' },
 	{"qsize",	required_argument,	NULL, 'q' },
 	{"force",	no_argument,		NULL, 'F' },
+	{"remove",	no_argument,		NULL, 'r' },
 	{0, 0, NULL, 0 }
 };
 
@@ -125,7 +126,21 @@ int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags)
 
 	err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
 	if (err) {
-		fprintf(stderr, "link set xdp fd failed (err:%d)\n", err);
+		fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
+			__func__, err);
+		return EXIT_FAIL_XDP;
+	}
+	return EXIT_OK;
+}
+
+int do_xdp_detach(int ifindex, __u32 xdp_flags)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+	if (err) {
+		fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
+			__func__, err);
 		return EXIT_FAIL_XDP;
 	}
 	return EXIT_OK;
@@ -134,6 +149,7 @@
 int main(int argc, char **argv)
 {
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	bool do_detach = false;
 	int opt, longindex = 0;
 	__u32 cfg_qsize = 512;
 	char buf[100];
@@ -145,6 +161,9 @@ int main(int argc, char **argv)
 
 	int n_cpus = get_nprocs_conf();
 
+	/* Always use XDP native driver mode */
+	xdp_flags |= XDP_FLAGS_DRV_MODE;
+
 	obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL);
 	err = libbpf_get_error(obj);
 	if (err) {
@@ -179,6 +198,9 @@ int main(int argc, char **argv)
 		case 'F':
 			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
 			break;
+		case 'r':
+			do_detach = true;
+			break;
 		case 'h':
 		error:
 		default:
@@ -193,13 +215,14 @@ int main(int argc, char **argv)
 		return EXIT_FAIL_OPTION;
 	}
 
+	if (do_detach)
+		return do_xdp_detach(ifindex, xdp_flags);
+
 	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
 		perror("setrlimit(RLIMIT_MEMLOCK)");
 		return EXIT_FAIL_MEM;
 	}
 
-	/* Always use XDP native driver mode */
-	xdp_flags |= XDP_FLAGS_DRV_MODE;
 
 	obj = do_load_bpf_obj(obj);
 	if (!obj)
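Usage note: detach is implemented as attaching prog fd -1 with the same xdp_flags, which is also why the XDP_FLAGS_DRV_MODE assignment moves above the option parsing here; the early-return remove path needs the flag set too. A quick round-trip on a test box then looks like this (interface name is a placeholder):

	sudo ./xdp_cpumap_loader --dev ens6f1           # attach + configure cpumap
	sudo ./xdp_cpumap_loader --dev ens6f1 --remove  # detach again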
From e8ae6a92870a805d6008b4281c1cb345f1da1f2b Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 16:40:15 +0100
Subject: [PATCH 55/61] traffic-pacing-edt: implement spread across CPUs

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_loader.c |  2 +-
 traffic-pacing-edt/xdp_cpumap_qinq.c   | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c
index fb6f08b..3a954b9 100644
--- a/traffic-pacing-edt/xdp_cpumap_loader.c
+++ b/traffic-pacing-edt/xdp_cpumap_loader.c
@@ -78,7 +78,7 @@ static int create_cpu_entry(int cpumap_fd, __u32 cpu,
 	err = bpf_map_update_elem(cpumap_fd, &cpu, value, 0);
 	if (err) {
 		fprintf(stderr, "Create CPU entry failed (err:%d)\n", err);
-		exit(EXIT_FAIL_BPF);
+		return EXIT_FAIL_BPF;
 	}
 
 	return 0;
diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 1fc98b0..5803c95 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -12,6 +12,9 @@
 
 #define MAX_CPUS 24
 
+/* This global variable limits which CPUs can be selected */
+__u32 global_max_cpus = 12; /* TODO: Allow userspace to adjust this */
+
 /* Special map type that can XDP_REDIRECT frames to another CPU */
 struct {
 	__uint(type, BPF_MAP_TYPE_CPUMAP);
@@ -20,6 +23,22 @@ struct {
 	__uint(max_entries, MAX_CPUS);
 } cpumap SEC(".maps");
 
+static __always_inline
+__u16 extract_vlan_key(struct collect_vlans *vlans)
+{
+	__u16 vlan_key = 0;
+
+	if (vlans->id[1]) {
+		/* Inner Q-in-Q VLAN present; use that as key */
+		vlan_key = vlans->id[1];
+	} else {
+		/* If only one VLAN tag, use it as key */
+		vlan_key = vlans->id[0];
+	}
+
+	return vlan_key;
+}
+
 SEC("xdp")
 int xdp_cpumap_qinq(struct xdp_md *ctx)
 {
@@ -53,7 +72,10 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 		goto out;
 	}
 
-	// WARNING: Userspace MUST insert entries into cpumap
+	/* Use inner VLAN as key and hash based on max_cpus */
+	cpu_dest = extract_vlan_key(&vlans) % global_max_cpus;
+
+	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
 out:
 	return action;
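A worked example of the spreading above, with global_max_cpus = 12: VLAN 16 lands on CPU 4 (16 % 12), VLAN 17 on CPU 5, while VLAN 28 collides with VLAN 16 (28 % 12 == 4). Plain modulo keeps each VLAN pinned to a stable CPU, but any stride pattern in how VLAN IDs happen to be allocated repeats in the CPU assignment, and IDs exactly 12 apart always pile onto the same CPU. The next patch therefore mixes the key through a hash function before taking the modulo.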
From 74a47be6973ba3bb69ea19851d1bed5fe5d9a929 Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 20:42:37 +0100
Subject: [PATCH 56/61] traffic-pacing-edt: Add "SuperFastHash" based on Paul Hsieh design

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/hash_func01.h | 55 ++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 traffic-pacing-edt/hash_func01.h

diff --git a/traffic-pacing-edt/hash_func01.h b/traffic-pacing-edt/hash_func01.h
new file mode 100644
index 0000000..3825581
--- /dev/null
+++ b/traffic-pacing-edt/hash_func01.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1
+ *
+ * Based on Paul Hsieh's (LGPL 2.1) hash function
+ * From: http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+#define get16bits(d) (*((const __u16 *) (d)))
+
+static __always_inline
+__u32 SuperFastHash (const char *data, int len, __u32 initval) {
+	__u32 hash = initval;
+	__u32 tmp;
+	int rem;
+
+	if (len <= 0 || data == NULL) return 0;
+
+	rem = len & 3;
+	len >>= 2;
+
+	/* Main loop */
+#pragma clang loop unroll(full)
+	for (;len > 0; len--) {
+		hash += get16bits (data);
+		tmp   = (get16bits (data+2) << 11) ^ hash;
+		hash  = (hash << 16) ^ tmp;
+		data += 2*sizeof (__u16);
+		hash += hash >> 11;
+	}
+
+	/* Handle end cases */
+	switch (rem) {
+	case 3: hash += get16bits (data);
+		hash ^= hash << 16;
+		hash ^= ((signed char)data[sizeof (__u16)]) << 18;
+		hash += hash >> 11;
+		break;
+	case 2: hash += get16bits (data);
+		hash ^= hash << 11;
+		hash += hash >> 17;
+		break;
+	case 1: hash += (signed char)*data;
+		hash ^= hash << 10;
+		hash += hash >> 1;
+	}
+
+	/* Force "avalanching" of final 127 bits */
+	hash ^= hash << 3;
+	hash += hash >> 5;
+	hash ^= hash << 4;
+	hash += hash >> 17;
+	hash ^= hash << 25;
+	hash += hash >> 6;
+
+	return hash;
+}
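Since the hash quality directly decides how evenly VLANs spread over CPUs, the distribution is worth sanity-checking offline. A small userspace harness along these lines (not part of the patch set; the typedefs stand in for the kernel types so hash_func01.h compiles unmodified, the initval is the prime the next patch selects, and clang should be used so the unroll pragma is honoured while gcc merely warns):

	/* hash_check.c - count how the 4094 possible VLAN IDs spread
	 * over 12 CPUs under SuperFastHash + modulo. */
	#include <stdio.h>

	typedef unsigned short __u16;
	typedef unsigned int __u32;
	#define __always_inline inline

	#include "hash_func01.h"

	int main(void)
	{
		int buckets[12] = { 0 };
		__u16 vlan;
		int i;

		for (vlan = 1; vlan < 4095; vlan++)
			buckets[SuperFastHash((char *)&vlan, sizeof(vlan),
					      15485863) % 12]++;

		for (i = 0; i < 12; i++)
			printf("CPU:%2d gets %d VLANs\n", i, buckets[i]);
		return 0;
	}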
From a16ab11e70cb8c1fb71c44cc4faffd4ce6288b8d Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 21:09:28 +0100
Subject: [PATCH 57/61] traffic-pacing-edt: Use hash function to calc cpu_dest

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_qinq.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 5803c95..3ecc623 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -4,6 +4,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#define INITVAL 15485863
+#include "hash_func01.h" /* SuperFastHash */
 
 #include <linux/if_ether.h>
 
@@ -24,18 +26,10 @@ struct {
 	__uint(max_entries, MAX_CPUS);
 } cpumap SEC(".maps");
 
 static __always_inline
-__u16 extract_vlan_key(struct collect_vlans *vlans)
+__u32 extract_vlan_key(struct collect_vlans *vlans)
 {
-	__u16 vlan_key = 0;
-
-	if (vlans->id[1]) {
-		/* Inner Q-in-Q VLAN present; use that as key */
-		vlan_key = vlans->id[1];
-	} else {
-		/* If only one VLAN tag, use it as key */
-		vlan_key = vlans->id[0];
-	}
-
+	/* Combine inner and outer VLAN as a key */
+	__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
 	return vlan_key;
 }
@@ -45,6 +39,7 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 	void *data     = (void *)(long)ctx->data;
 	void *data_end = (void *)(long)ctx->data_end;
 	struct collect_vlans vlans = { 0 };
+	__u32 hash_key, vlan_key;
 	struct ethhdr *eth;
 	__u32 cpu_dest = 0;
 	__u64 action;
@@ -72,8 +67,10 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 		goto out;
 	}
 
-	/* Use inner VLAN as key and hash based on max_cpus */
-	cpu_dest = extract_vlan_key(&vlans) % global_max_cpus;
+	/* Use inner+outer VLAN as key and hash based on max_cpus */
+	vlan_key = extract_vlan_key(&vlans);
+	hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
+	cpu_dest = hash_key % global_max_cpus;
 
 	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);

From 3b6a0c0aa969e247abc80834f68c89343024a6e6 Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Wed, 16 Dec 2020 21:24:14 +0100
Subject: [PATCH 58/61] traffic-pacing-edt: Exclude CPU-6 in the code

On the production setup, the i40e driver sends all packets to CPU-6
(RX). Thus, we want to exclude CPU-6 from also processing/pacing the
packets.

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_qinq.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 3ecc623..63c8138 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -72,6 +72,10 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 	hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
 	cpu_dest = hash_key % global_max_cpus;
 
+	/* TODO: Find more generic way to exclude CPU-6 */
+	if (cpu_dest == 6)
+		cpu_dest = 11;
+
 	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
 out:
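A caveat worth recording here: statically remapping every hash bucket that lands on CPU-6 over to CPU-11 means CPU-11 now serves two of the twelve buckets, i.e. roughly double its fair share of VLANs. A generic exclusion instead hashes into the number of *enabled* CPUs and translates that index to a real CPU id through a compacted table. A sketch of the idea with illustrative names (patch 60 below implements this properly via BPF maps):

	/* Illustrative only: exclude CPU-6 on a 12-CPU box by hashing
	 * into the 11 enabled CPUs, then translating index -> CPU id. */
	static const __u32 cpus_enabled[] = { 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11 };

	static __u32 pick_cpu(__u32 hash_key)
	{
		return cpus_enabled[hash_key % 11];
	}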
From 47e5cb1c391ccea12cb97c9fa69d73deb19604aa Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Fri, 18 Dec 2020 20:13:55 +0100
Subject: [PATCH 59/61] traffic-pacing-edt: playing with hash initval

It didn't help; kept the original value.

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_qinq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index 63c8138..f38d61b 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -5,6 +5,8 @@
 #include <bpf/bpf_endian.h>
 
 #define INITVAL 15485863
+//#define INITVAL 2654435761
+
 #include "hash_func01.h" /* SuperFastHash */
 
 #include <linux/if_ether.h>
@@ -29,7 +31,7 @@ static __always_inline
 __u32 extract_vlan_key(struct collect_vlans *vlans)
 {
 	/* Combine inner and outer VLAN as a key */
-	__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
+	__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
 	return vlan_key;
 }

From 39ab41d0d61b10928a2eae42f8fb6f19bbcf6a5d Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Fri, 18 Dec 2020 22:09:24 +0100
Subject: [PATCH 60/61] Add CPU mapping layer to allow excluding some CPUs

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/xdp_cpumap_loader.c | 172 +++++++++++++++++++++----
 traffic-pacing-edt/xdp_cpumap_qinq.c   |  36 +++++-
 2 files changed, 177 insertions(+), 31 deletions(-)

diff --git a/traffic-pacing-edt/xdp_cpumap_loader.c b/traffic-pacing-edt/xdp_cpumap_loader.c
index 3a954b9..0afd46a 100644
--- a/traffic-pacing-edt/xdp_cpumap_loader.c
+++ b/traffic-pacing-edt/xdp_cpumap_loader.c
@@ -43,6 +43,8 @@ static const struct option long_options[] = {
 	{"qsize",	required_argument,	NULL, 'q' },
 	{"force",	no_argument,		NULL, 'F' },
 	{"remove",	no_argument,		NULL, 'r' },
+	{"non-cpu",	required_argument,	NULL, 'x' },
+	{"exclude-cpu",	required_argument,	NULL, 'x' },
 	{0, 0, NULL, 0 }
 };
 
@@ -67,37 +69,130 @@ static void usage(char *argv[])
 	printf("\n");
 }
 
-static int create_cpu_entry(int cpumap_fd, __u32 cpu,
-			    struct bpf_cpumap_val *value)
+struct cpumap_config {
+	int fd_cpumap;
+	int fd_cpus_enabled;
+	int fd_cpus_count;
+	int *cpu_exclude;
+	int max_cpus;
+	__u32 qsize;
+};
+
+static int cpumap_config_init(struct cpumap_config *cfg)
 {
-	int err;
+	int n_cpus = get_nprocs_conf();
+	int *cpu_exclude;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	cpu_exclude = malloc(n_cpus * sizeof(int));
+	if (!cpu_exclude) {
+		fprintf(stderr, "failed to allocate array\n");
+		return EXIT_FAIL_MEM;
+	}
+	memset(cpu_exclude, 0, n_cpus * sizeof(int));
+
+	cfg->cpu_exclude = cpu_exclude;
+	cfg->max_cpus = n_cpus;
+	return 0;
+}
+
+int __find_map_fd_by_name(struct bpf_object *obj, char *name)
+{
+	int fd;
+
+	fd = bpf_object__find_map_fd_by_name(obj, name);
+	if (fd < 0) {
+		printf("No map found! - named: %s\n", name);
+		exit(EXIT_FAIL_BPF);
+	}
+	return fd;
+}
+
+/* Get file descriptors to BPF-maps */
+static int cpumap_config_find_maps(struct bpf_object *obj,
+				   struct cpumap_config *cfg)
+{
+	cfg->fd_cpumap = __find_map_fd_by_name(obj, "cpumap");
+	cfg->fd_cpus_enabled = __find_map_fd_by_name(obj, "cpus_enabled");
+	cfg->fd_cpus_count = __find_map_fd_by_name(obj, "cpus_count");
+	return 0;
+}
+
+static int create_cpu_entry(struct cpumap_config *cfg, __u32 cpu,
+			    struct bpf_cpumap_val *value,
+			    __u32 enabled_idx, bool new)
+{
+	__u32 curr_cpus_count = 0;
+	__u32 key = 0;
+	int err, fd;
 
 	/* Add a CPU entry to cpumap, as this allocates a cpu entry in
 	 * the kernel for the cpu.
 	 */
-	err = bpf_map_update_elem(cpumap_fd, &cpu, value, 0);
+	fd = cfg->fd_cpumap;
+	err = bpf_map_update_elem(fd, &cpu, value, 0);
 	if (err) {
-		fprintf(stderr, "Create CPU entry failed (err:%d)\n", err);
+		fprintf(stderr, "Create(fd:%d) CPU(%d) entry failed (err:%d)\n",
+			fd, cpu, err);
 		return EXIT_FAIL_BPF;
 	}
 
+	/* Inform the bpf_prog that a new CPU is enabled and available
+	 * to be selected from the map, which maps index to actual CPU.
+	 */
+	fd = cfg->fd_cpus_enabled;
+	err = bpf_map_update_elem(fd, &enabled_idx, &cpu, 0);
+	if (err) {
+		fprintf(stderr, "Add to enabled avail CPUs failed\n");
+		return EXIT_FAIL_BPF;
+	}
+
+	/* When not replacing/updating existing entry, bump the count */
+	fd = cfg->fd_cpus_count;
+	err = bpf_map_lookup_elem(fd, &key, &curr_cpus_count);
+	if (err) {
+		fprintf(stderr, "Failed reading curr cpus_count\n");
+		return EXIT_FAIL_BPF;
+	}
+	if (new) {
+		curr_cpus_count++;
+		err = bpf_map_update_elem(fd, &key, &curr_cpus_count, 0);
+		if (err) {
+			fprintf(stderr, "Failed write curr cpus_count\n");
+			return EXIT_FAIL_BPF;
+		}
+	}
+
 	return 0;
 }
 
 /* Userspace MUST create/populate CPUMAP entries for redirect to work
  */
-static void enable_all_cpus(int cpumap_fd, __u32 qsize)
+static int configure_cpus(struct cpumap_config *cfg)
 {
 	struct bpf_cpumap_val value = { 0 };
-	int n_cpus = get_nprocs_conf();
-	int i;
+	int n_cpus = cfg->max_cpus;
+	int *exclude = cfg->cpu_exclude;
+	int enabled_idx = 0;
+	bool new = true;
+	int cpu, err;
 
-	value.qsize = qsize;
+	value.qsize = cfg->qsize;
 
-	for (i = 0; i < n_cpus; i++) {
-		printf("Enable CPU:%d\n", i);
-		create_cpu_entry(cpumap_fd, i, &value);
+	for (cpu = 0; cpu < n_cpus; cpu++) {
+
+		if (exclude[cpu] == -1) {
+			printf("Excluding CPU:%d\n", cpu);
+			continue;
+		}
+		printf("Enable CPU:%d\n", cpu);
+		err = create_cpu_entry(cfg, cpu, &value, enabled_idx, new);
+		if (err)
+			return err;
+		enabled_idx++;
 	}
+	return 0;
 }
 
 struct bpf_object *do_load_bpf_obj(struct bpf_object *obj)
@@ -151,15 +246,21 @@ int main(int argc, char **argv)
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
 	bool do_detach = false;
 	int opt, longindex = 0;
-	__u32 cfg_qsize = 512;
 	char buf[100];
 	int err;
 
 	struct bpf_object *obj = NULL;
 	struct bpf_program *prog;
-	int cpumap_fd = -1;
 
+	/* System to setup and exclude some CPUs */
+	struct cpumap_config cfg;
 	int n_cpus = get_nprocs_conf();
+	int non_cpu = -1;
+	int *cpu_exclude;
+
+	cpumap_config_init(&cfg);
+	cpu_exclude = cfg.cpu_exclude;
+	cfg.qsize = 512; /* Default queue size */
 
 	/* Always use XDP native driver mode */
 	xdp_flags |= XDP_FLAGS_DRV_MODE;
@@ -174,7 +275,7 @@ int main(int argc, char **argv)
 	err = EXIT_OK;
 
 	/* Parse command line args */
-	while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:",
+	while ((opt = getopt_long(argc, argv, "hd:q:Frx:",
 				  long_options, &longindex)) != -1) {
 		switch (opt) {
 		case 'd':
@@ -193,7 +294,7 @@ int main(int argc, char **argv)
 			}
 			break;
 		case 'q':
-			cfg_qsize = strtol(optarg, NULL, 10);
+			cfg.qsize = strtol(optarg, NULL, 10);
 			break;
 		case 'F':
 			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
@@ -201,10 +302,23 @@ int main(int argc, char **argv)
 		case 'r':
 			do_detach = true;
 			break;
+		case 'x': /* --exclude-cpu or --non-cpu */
+			/* Possible to exclude multiple CPUs on cmdline */
+			non_cpu = strtoul(optarg, NULL, 0);
+			if (non_cpu >= n_cpus) {
+				fprintf(stderr,
					"--cpu nr too large for cpumap err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			cpu_exclude[non_cpu] = -1;
+			break;
+
 		case 'h':
 		error:
 		default:
 			usage(argv);
+			free(cpu_exclude);
 			return EXIT_FAIL_OPTION;
 		}
 	}
@@ -212,7 +326,8 @@ int main(int argc, char **argv)
 	if (ifindex == -1) {
 		fprintf(stderr, "ERR: required option --dev missing\n");
 		usage(argv);
-		return EXIT_FAIL_OPTION;
+		err = EXIT_FAIL_OPTION;
+		goto out;
 	}
 
 	if (do_detach)
@@ -220,13 +335,15 @@ int main(int argc, char **argv)
 
 	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
 		perror("setrlimit(RLIMIT_MEMLOCK)");
-		return EXIT_FAIL_MEM;
+		err = EXIT_FAIL_MEM;
+		goto out;
 	}
 
-
 	obj = do_load_bpf_obj(obj);
-	if (!obj)
-		return EXIT_FAIL_BPF;
+	if (!obj) {
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
 
 	/* Pick up first BPF-program */
 	prog = bpf_program__next(NULL, obj);
@@ -236,15 +353,17 @@ int main(int argc, char **argv)
 		goto out;
 	}
 
-	/* Get file descriptor to BPF-map */
-	cpumap_fd = bpf_object__find_map_fd_by_name(obj, "cpumap");
-	if (cpumap_fd < 0) {
-		printf("No cpumap found!\n");
+	/* Find BPF maps */
+	if (cpumap_config_find_maps(obj, &cfg)) {
 		err = EXIT_FAIL_BPF;
 		goto out;
 	}
+
 	/* Configure cpumap */
-	enable_all_cpus(cpumap_fd, cfg_qsize);
+	if (configure_cpus(&cfg)) {
+		err = EXIT_FAIL_BPF;
+		goto out;
+	}
 
 	/* Attach XDP program */
 	err = do_xdp_attach(ifindex, prog, xdp_flags);
@@ -259,5 +378,6 @@ out:
 	if (obj)
 		bpf_object__close(obj);
 
+	free(cpu_exclude);
 	return err;
 }
diff --git a/traffic-pacing-edt/xdp_cpumap_qinq.c b/traffic-pacing-edt/xdp_cpumap_qinq.c
index f38d61b..eada801 100644
--- a/traffic-pacing-edt/xdp_cpumap_qinq.c
+++ b/traffic-pacing-edt/xdp_cpumap_qinq.c
@@ -27,6 +27,20 @@ struct {
 	__uint(max_entries, MAX_CPUS);
 } cpumap SEC(".maps");
 
+/* Mapping table with enabled CPUs, for hashing between them */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, MAX_CPUS);
+} cpus_enabled SEC(".maps");
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, 1);
+} cpus_count SEC(".maps");
+
 static __always_inline
 __u32 extract_vlan_key(struct collect_vlans *vlans)
 {
@@ -43,8 +57,11 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 	struct collect_vlans vlans = { 0 };
 	__u32 hash_key, vlan_key;
 	struct ethhdr *eth;
-	__u32 cpu_dest = 0;
+	__u32 cpu_idx, cpu_dest = 0;
+	__u32 *cpu_lookup;
 	__u64 action;
+	__u32 *cpu_max;
+
 
 	/* These keep track of the next header type and iterator pointer */
 	struct hdr_cursor nh;
@@ -69,14 +86,23 @@ int xdp_cpumap_qinq(struct xdp_md *ctx)
 		goto out;
 	}
 
+	int key0 = 0;
+	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
+	if (!cpu_max)
+		return XDP_ABORTED;
+
 	/* Use inner+outer VLAN as key and hash based on max_cpus */
 	vlan_key = extract_vlan_key(&vlans);
 	hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
-	cpu_dest = hash_key % global_max_cpus;
+	cpu_idx = hash_key % *cpu_max;
 
-	/* TODO: Find more generic way to exclude CPU-6 */
-	if (cpu_dest == 6)
-		cpu_dest = 11;
+	/* To allow excluding some CPUs, a mapping table cpus_enabled
+	 * translates cpu_idx to real CPU-id
+	 */
+	cpu_lookup = bpf_map_lookup_elem(&cpus_enabled, &cpu_idx);
+	if (!cpu_lookup)
+		return XDP_ABORTED;
+	cpu_dest = *cpu_lookup;
 
 	/* Notice: Userspace MUST insert entries into cpumap */
 	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
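With the mapping layer in place, exclusion becomes a pure userspace decision. On the 12-CPU production box discussed earlier, excluding the i40e RX CPU would look like this (device name is a placeholder):

	sudo ./xdp_cpumap_loader --dev ens6f1 --qsize 1024 --exclude-cpu 6

The loader then writes cpus_enabled = {0,1,2,3,4,5,7,8,9,10,11} and cpus_count = 11, and the BPF program computes cpu_idx = hash_key % 11 before translating the index to a real CPU id, so the excluded CPU's share is spread over all remaining CPUs instead of landing on a single neighbour.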
From 904c820e7ee202b58d3e8d3e120d06f0a52c094f Mon Sep 17 00:00:00 2001
From: "Jesper D. Brouer"
Date: Tue, 22 Dec 2020 19:16:10 +0100
Subject: [PATCH 61/61] traffic-pacing-edt: Propagate define that enables BTF maps

Two errors:
 - Wrong define written to config.mk
 - Use BPF_CFLAGS to reach the llvm compile cflags

Signed-off-by: Jesper D. Brouer
---
 traffic-pacing-edt/configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/traffic-pacing-edt/configure b/traffic-pacing-edt/configure
index 9b01369..248c846 100755
--- a/traffic-pacing-edt/configure
+++ b/traffic-pacing-edt/configure
@@ -15,7 +15,7 @@ check_tc_libbpf()
     if echo $tc_version | grep -q libbpf; then
         libbpf_version=${tc_version##*libbpf }
         echo "HAVE_TC_LIBBPF:=y" >> $CONFIG
-        echo "CFLAGS += -DHAVE_LIBBPF" >> $CONFIG
+        echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG
         echo "yes ($libbpf_version)"
     else
         echo "no"
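For context on what the define gates: a tc binary linked against libbpf can load BTF-defined maps, while older iproute2 releases only understand their own struct bpf_elf_map layout. The BPF programs can key off HAVE_TC_LIBBPF roughly like this sketch (map name and value type are illustrative, not taken from the patches):

	#ifdef HAVE_TC_LIBBPF
	/* Modern tc (linked with libbpf): BTF-defined map */
	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__type(key, __u32);
		__type(value, __u64);
		__uint(max_entries, 1);
	} rate_config SEC(".maps");
	#else
	/* Legacy iproute2 map layout from <iproute2/bpf_elf.h> */
	struct bpf_elf_map SEC("maps") rate_config = {
		.type       = BPF_MAP_TYPE_ARRAY,
		.size_key   = sizeof(__u32),
		.size_value = sizeof(__u64),
		.max_elem   = 1,
	};
	#endif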