Merge branch 'master' of https://github.com/netoptimizer/bpf-examples into netoptimizer-master

Signed-off-by: Jesper Dangaard Brouer <netoptimizer@brouer.com>
Jesper Dangaard Brouer
2021-01-08 14:54:40 +01:00
21 changed files with 1636 additions and 186 deletions

headers/bpf/compiler.h (new file)

@@ -0,0 +1,124 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2016-2020 Authors of Cilium */
#ifndef __BPF_COMPILER_H_
#define __BPF_COMPILER_H_
#ifndef __non_bpf_context
# include "stddef.h"
#endif
#ifndef __section
# define __section(X) __attribute__((section(X), used))
#endif
#ifndef __maybe_unused
# define __maybe_unused __attribute__((__unused__))
#endif
#ifndef offsetof
# define offsetof(T, M) __builtin_offsetof(T, M)
#endif
#ifndef field_sizeof
# define field_sizeof(T, M) sizeof((((T *)NULL)->M))
#endif
#ifndef __packed
# define __packed __attribute__((packed))
#endif
#ifndef __nobuiltin
# if __clang_major__ >= 10
# define __nobuiltin(X) __attribute__((no_builtin(X)))
# else
# define __nobuiltin(X)
# endif
#endif
#ifndef likely
# define likely(X) __builtin_expect(!!(X), 1)
#endif
#ifndef unlikely
# define unlikely(X) __builtin_expect(!!(X), 0)
#endif
#ifndef always_succeeds /* Mainly for documentation purposes. */
# define always_succeeds(X) likely(X)
#endif
#undef __always_inline /* stddef.h defines its own */
#define __always_inline inline __attribute__((always_inline))
#ifndef __stringify
# define __stringify(X) #X
#endif
#ifndef __fetch
# define __fetch(X) (__u32)(__u64)(&(X))
#endif
#ifndef __aligned
# define __aligned(X) __attribute__((aligned(X)))
#endif
#ifndef build_bug_on
# define build_bug_on(E) ((void)sizeof(char[1 - 2*!!(E)]))
#endif
#ifndef __throw_build_bug
# define __throw_build_bug() __builtin_trap()
#endif
#ifndef __printf
# define __printf(X, Y) __attribute__((__format__(printf, X, Y)))
#endif
#ifndef barrier
# define barrier() asm volatile("": : :"memory")
#endif
#ifndef barrier_data
# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory")
#endif
static __always_inline void bpf_barrier(void)
{
/* Workaround to avoid verifier complaint:
* "dereference of modified ctx ptr R5 off=48+0, ctx+const is allowed,
* ctx+const+const is not"
*/
barrier();
}
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
#endif
#ifndef __READ_ONCE
# define __READ_ONCE(X) (*(volatile typeof(X) *)&X)
#endif
#ifndef __WRITE_ONCE
# define __WRITE_ONCE(X, V) (*(volatile typeof(X) *)&X) = (V)
#endif
/* {READ,WRITE}_ONCE() with verifier workaround via bpf_barrier(). */
#ifndef READ_ONCE
# define READ_ONCE(X) \
({ typeof(X) __val = __READ_ONCE(X); \
bpf_barrier(); \
__val; })
#endif
#ifndef WRITE_ONCE
# define WRITE_ONCE(X, V) \
({ typeof(X) __val = (V); \
__WRITE_ONCE(X, __val); \
bpf_barrier(); \
__val; })
#endif
#endif /* __BPF_COMPILER_H_ */
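To show how these macros fit together, here is a minimal usage sketch (not part of the commit; map and program names are invented for illustration):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u64);
	__uint(max_entries, 1);
} pkt_counter SEC(".maps");

__section("classifier")
int count_pkts(struct __sk_buff *skb)
{
	__u32 key = 0;
	__u64 *val = bpf_map_lookup_elem(&pkt_counter, &key);

	if (!val)
		return 0;
	/* READ_ONCE()/WRITE_ONCE() go through a volatile cast plus
	 * bpf_barrier(), preventing clang from merging or reordering
	 * the accesses (the verifier workaround described above). */
	WRITE_ONCE(*val, READ_ONCE(*val) + 1);
	return 0;
}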


@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */
/*
* This file contains parsing functions that can be used in XDP programs. The
* functions are marked as __always_inline, and fully defined in this header
* file to be included in the BPF program.
* This file contains parsing functions that are used in the packetXX XDP
* programs. The functions are marked as __always_inline, and fully defined in
* this header file to be included in the BPF program.
*
* Each helper parses a packet header, including doing bounds checking, and
* returns the type of its contents if successful, and -1 otherwise.
@@ -10,6 +10,10 @@
* For Ethernet and IP headers, the content type is the type of the payload
* (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field.
* All return values are in host byte order.
*
* The versions of the functions included here are slightly expanded versions of
* the functions in the packet01 lesson. For instance, the Ethernet header
* parsing has support for parsing VLAN tags.
*/
#ifndef __PARSING_HELPERS_H
@@ -54,7 +58,7 @@ struct icmphdr_common {
/* Allow users of header file to redefine VLAN max depth */
#ifndef VLAN_MAX_DEPTH
#define VLAN_MAX_DEPTH 4
#define VLAN_MAX_DEPTH 2
#endif
/* Longest chain of IPv6 extension headers to resolve */
@@ -62,6 +66,11 @@ struct icmphdr_common {
#define IPV6_EXT_MAX_CHAIN 6
#endif
#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
/* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */
struct collect_vlans {
__u16 id[VLAN_MAX_DEPTH];
};
static __always_inline int proto_is_vlan(__u16 h_proto)
{
@@ -74,18 +83,24 @@ static __always_inline int proto_is_vlan(__u16 h_proto)
* Ethernet header. Thus, caller can look at eth->h_proto to see if this was a
* VLAN tagged packet.
*/
static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
struct ethhdr **ethhdr)
static __always_inline int parse_ethhdr_vlan(struct hdr_cursor *nh,
void *data_end,
struct ethhdr **ethhdr,
struct collect_vlans *vlans)
{
struct ethhdr *eth = nh->pos;
int hdrsize = sizeof(*eth);
struct vlan_hdr *vlh;
__u16 h_proto;
int i;
if (eth + 1 > data_end)
/* Byte-count bounds check; check if current pointer + size of header
* is after data_end.
*/
if (nh->pos + hdrsize > data_end)
return -1;
nh->pos = eth + 1;
nh->pos += hdrsize;
*ethhdr = eth;
vlh = nh->pos;
h_proto = eth->h_proto;
@@ -102,6 +117,10 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
break;
h_proto = vlh->h_vlan_encapsulated_proto;
if (vlans) /* collect VLAN ids */
vlans->id[i] =
(bpf_ntohs(vlh->h_vlan_TCI) & VLAN_VID_MASK);
vlh++;
}
@@ -109,6 +128,14 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
return h_proto; /* network-byte-order */
}
static __always_inline int parse_ethhdr(struct hdr_cursor *nh,
void *data_end,
struct ethhdr **ethhdr)
{
/* Expect the compiler to remove the code that collects VLAN ids */
return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL);
}
static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh,
void *data_end,
__u8 next_hdr_type)
@@ -174,6 +201,9 @@ static __always_inline int parse_iphdr(struct hdr_cursor *nh,
return -1;
hdrsize = iph->ihl * 4;
/* Sanity check packet field is valid */
if (hdrsize < sizeof(*iph))
return -1;
/* Variable-length IPv4 header, need to use byte-based arithmetic */
if (nh->pos + hdrsize > data_end)
@@ -267,10 +297,15 @@ static __always_inline int parse_tcphdr(struct hdr_cursor *nh,
return -1;
len = h->doff * 4;
if ((void *) h + len > data_end)
/* Sanity check packet field is valid */
if (len < sizeof(*h))
return -1;
nh->pos = h + 1;
/* Variable-length TCP header, need to use byte-based arithmetic */
if (nh->pos + len > data_end)
return -1;
nh->pos += len;
*tcphdr = h;
return len;
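For context, these helpers are meant to be chained through a single hdr_cursor. A rough sketch of the intended calling pattern (my example, assuming an XDP program and the usual <bpf/bpf_endian.h> and <linux/in.h> includes; not part of the commit):

SEC("xdp")
int xdp_parse_example(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct hdr_cursor nh = { .pos = data };
	struct ethhdr *eth;
	struct iphdr *iph;
	struct tcphdr *tcph;

	/* Each helper bounds-checks, advances nh->pos, and returns the
	 * type of the payload, or -1 on failure. */
	if (parse_ethhdr(&nh, data_end, &eth) != bpf_htons(ETH_P_IP))
		return XDP_PASS;
	if (parse_iphdr(&nh, data_end, &iph) != IPPROTO_TCP)
		return XDP_PASS;
	if (parse_tcphdr(&nh, data_end, &tcph) < 0)
		return XDP_ABORTED;
	return XDP_PASS;
}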


@@ -1,14 +1,23 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
USER_TARGETS :=
BPF_TARGETS := edt_pacer01
BPF_TARGETS += edt_pacer02
USER_TARGETS := xdp_cpumap_loader
BPF_TARGETS := edt_pacer_vlan
BPF_TARGETS += xdp_cpumap_qinq
EXTRA_DEPS += config.mk
LIB_DIR = ../lib
include $(LIB_DIR)/common.mk
include config.mk
# The iproute2 'tc' tool doesn't understand BTF debug info
all: config.mk
config.mk: configure
@sh configure
ifndef HAVE_TC_LIBBPF
# If the iproute2 'tc' tool doesn't understand BTF debug info
# use llvm-strip to remove this debug info from object file
#
# *BUT* cannot strip everything as it removes ELF elems needed for
@@ -16,6 +25,8 @@ include $(LIB_DIR)/common.mk
#
.PHONY: strip_tc_obj
strip_tc_obj: ${BPF_TARGETS:=.o}
$(Q) echo "tc doesn't support libbpf - strip BTF info"
$(Q) llvm-strip --no-strip-all --remove-section .BTF $?
all: strip_tc_obj
endif


@@ -11,12 +11,12 @@ root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export TC=/sbin/tc
export TC=tc
# This can be changed via --file or --obj
if [[ -z ${BPF_OBJ} ]]; then
# Fallback default
BPF_OBJ=edt_pacer02.o
BPF_OBJ=edt_pacer_vlan.o
fi
info "Applying TC-BPF egress setup on device: $DEV with object file: $BPF_OBJ"


@@ -0,0 +1,31 @@
#!/usr/local/bin/bpftrace
#include <linux/skbuff.h>
/* Measure time difference between EDT-time and real "NIC" TX-time.
*
* Assuming packets are EDT timestamped by the BPF-program, we can
* detect/measure how accurately packets are actually transmitted
* towards the NIC driver, by comparing EDT-time against "now"
* timestamp in the function transmitting to the NIC driver.
*/
// tracepoint:net:net_dev_start_xmit
tracepoint:net:net_dev_xmit
{
$skb = (struct sk_buff *)args->skbaddr;
//$tstamp = (uint64)$skb->tstamp;
$tstamp = $skb->skb_mstamp_ns;
$now = nsecs;
// if ($skb->mark > 0) {
if ($tstamp > 0) {
if ($now >= $tstamp) {
$diff_late = $now - $tstamp;
} else {
$diff_ahead = $tstamp - $now;
}
@tstamp_diff_late = hist($diff_late / 1000);
@tstamp_diff_ahead = hist($diff_ahead / 1000);
}
}


@@ -0,0 +1,78 @@
#!/usr/local/bin/bpftrace
#include <linux/skbuff.h>
// tracepoint:net:net_dev_start_xmit
tracepoint:net:net_dev_xmit
{
$skb = (struct sk_buff *)args->skbaddr;
//$tstamp = (uint64)$skb->tstamp;
$tstamp = $skb->skb_mstamp_ns;
$now = nsecs;
// if ($skb->mark > 0) {
if ($tstamp > 0) {
if ($now >= $tstamp) {
$diff_late = $now - $tstamp;
} else {
$diff_ahead = $tstamp - $now;
}
@tstamp_usec_diff_late = hist($diff_late / 1000);
@tstamp_usec_diff_ahead = hist($diff_ahead / 1000);
}
/* Capture burstiness over a time period, by dividing the nanosec
 * timestamp by the wanted period, and keeping a per-CPU byte counter
 * as long as the period number matches.
 *
 * Practical usage shows that bpftrace uses a hash-map to implement
 * this, which unfortunately costs too much (shows 5% jhash CPU
 * usage), enough overhead to change the behavior of a production system.
*/
//$period = $now / 10000; /* 10000 = 10 usec */
$period = $now / 30000; /* 30000 = 30 usec */
if (@state[cpu] == $period) {
@state_bytes[cpu] += $skb->len;
} else {
@state[cpu] = $period;
if (@state_bytes[cpu] > 0) {
@byte_burst[cpu] = hist(@state_bytes[cpu]);
}
@state_bytes[cpu] = $skb->len; /* Reset counter */
}
}
/*
tracepoint:qdisc:qdisc_dequeue
{
@qdisc_bulk_dequeue = lhist(args->packets, 0,64,1);
}
*/
/*
kretfunc:dev_hard_start_xmit
{
// Wanted to know if ret == NETDEV_TX_BUSY
# ERROR: kfunc/kretfunc not available for your linked against bcc version.
}
*/
/* How often does the FQ pacer find that no packets are qualified to be
 * scheduled, which leads to scheduling an hrtimer event that will
 * start the qdisc again at a later time.
 *
 * We cannot kprobe fq_dequeue as it is in a module.
*/
/*
kprobe:qdisc_watchdog_schedule_range_ns
{
@qdisc_watchdog[cpu] = count();
}
kprobe:__netif_schedule
{
@__netif_schedule[cpu] = count();
}
*/


@@ -0,0 +1,153 @@
#ifndef __CODEL_IMPL_H
#define __CODEL_IMPL_H
#ifndef CODEL_TARGET
#define CODEL_TARGET (10 * 1000 * 1000ULL) /* 10 ms in nanosec */
#endif
#ifndef CODEL_EXCEED_INTERVAL
#define CODEL_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/
#endif
/* Codel like dropping scheme, inspired by:
* - RFC: https://queue.acm.org/detail.cfm?id=2209336
* - Code: https://queue.acm.org/appendices/codel.html
* - Kernel: include/net/codel_impl.h
*/
struct codel_state {
/* codel like dropping scheme */
__u64 first_above_time; /* Time when above target (0 if below)*/
__u64 drop_next; /* Time to drop next packet */
__u32 count; /* Packets dropped since going into drop state */
__u32 dropping; /* Equal to 1 if in drop state */
};
/* Table lookup for square-root shifted 16 bit */
static __always_inline __u32 get_sqrt_sh16(__u64 cnt)
{
switch (cnt) {
case 1: return 65536; /* 65536 * sqrt(1) */
case 2: return 92682; /* 65536 * sqrt(2) */
case 3: return 113512; /* 65536 * sqrt(3) */
case 4: return 131072; /* 65536 * sqrt(4) */
case 5: return 146543; /* 65536 * sqrt(5) */
case 6: return 160530; /* 65536 * sqrt(6) */
case 7: return 173392;
case 8: return 185364;
case 9: return 196608;
case 10: return 207243;
case 11: return 217358;
case 12: return 227023;
case 13: return 236293;
case 14: return 245213;
case 15: return 253820;
case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */
case 17: return 270212;
case 18: return 278046;
case 19: return 285664;
case 20: return 293086;
case 21: return 300324;
case 22: return 307391;
case 23: return 314300;
case 24: return 321060;
case 25: return 327680; /* 100 ms / sqrt(25) = 20 ms */
case 26: return 334169;
case 27: return 340535;
case 28: return 346784;
case 29: return 352922;
case 30: return 358955;
case 31: return 364889;
case 32: return 370728;
case 33: return 376476;
case 34: return 382137;
case 35: return 387716;
case 36: return 393216; /* 100 ms / sqrt(36) = 16.66 ms */
default:
return 463410; /* 65536*sqrt(50) => 100 ms / sqrt(50) = 14.14 ms */
}
}
static __always_inline __u64 get_next_interval_sqrt(__u64 cnt)
{
__u64 val = ((__u64)CODEL_EXCEED_INTERVAL << 16) / get_sqrt_sh16(cnt);
return val;
}
static __always_inline __u64
codel_control_law(__u64 t, __u64 cnt)
{
return t + get_next_interval_sqrt(cnt);
}
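A worked example of the shifted-16 fixed-point math (my numbers, using the parenthesized expression above):

/* count = 4: get_sqrt_sh16(4) = 131072 = 65536 * sqrt(4), so
 * next interval = (100 ms << 16) / 131072
 *               = 6553600000000 ns / 131072 = 50000000 ns = 50 ms,
 * i.e. CODEL_EXCEED_INTERVAL / sqrt(count), matching the table
 * comments like "100 ms / sqrt(16) = 25 ms".
 */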
static __always_inline
bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now)
{
__u64 interval = CODEL_EXCEED_INTERVAL;
if (t_queue_sz < CODEL_TARGET) {
/* went below so we'll stay below for at least interval */
codel->first_above_time = 0;
return false;
}
if (codel->first_above_time == 0) {
/* just went above from below. If we stay above
* for at least interval we'll say it's ok to drop
*/
codel->first_above_time = now + interval;
return false;
} else if (now >= codel->first_above_time) {
return true;
}
return false;
}
static __always_inline
bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now)
{
__u64 interval = CODEL_EXCEED_INTERVAL;
/* If horizon has been exceeded for a while, increase drop intensity */
bool drop = codel_should_drop(codel, t_queue_sz, now);
if (codel->dropping) { /* In dropping state */
if (!drop) {
/* time below target - leave dropping state */
codel->dropping = false;
return false;
} else if (now >= codel->drop_next) {
/* It's time for the next drop. Drop the current
* packet. Schedule the next drop
*/
codel->count += 1;
// schedule the next drop.
codel->drop_next =
codel_control_law(codel->drop_next, codel->count);
return true;
}
} else if (drop &&
((now - codel->drop_next < interval) ||
(now - codel->first_above_time >= interval))) {
/* If we get here, then we're not in dropping state.
* Decide whether it's time to enter dropping state.
*/
__u32 count = codel->count;
codel->dropping = true;
/* If we're in a drop cycle, the drop rate that controlled the queue
 * on the last cycle is a good starting point to control it now.
*/
if (now - codel->drop_next < interval)
count = count > 2 ? (count - 2) : 1;
else
count = 1;
codel->count = count;
codel->drop_next = codel_control_law(now, count);
return true;
}
return false;
}
#endif /* __CODEL_IMPL_H */

traffic-pacing-edt/configure (new executable file)

@@ -0,0 +1,29 @@
#!/bin/bash
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
# This is not an autoconf generated configure
#
# Output file which is input to Makefile
CONFIG=config.mk
# Assume tc is in $PATH
TC=tc
check_tc_libbpf()
{
tc_version=$($TC -V)
if echo $tc_version | grep -q libbpf; then
libbpf_version=${tc_version##*libbpf }
echo "HAVE_TC_LIBBPF:=y" >> $CONFIG
echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG
echo "yes ($libbpf_version)"
else
echo "no"
fi
}
echo "# Generated config" > $CONFIG
echo "Detecting available features on system"
echo -n " - libbpf support in tc tool: "
check_tc_libbpf
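On a system where the tc binary is built with libbpf, the generated config.mk would contain (per the echo lines above):

# Generated config
HAVE_TC_LIBBPF:=y
BPF_CFLAGS += -DHAVE_TC_LIBBPF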


@@ -1,40 +0,0 @@
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>
#include "iproute2_compat.h"
char _license[] SEC("license") = "GPL";
/* The tc tool (iproute2) uses another ELF map layout than libbpf (struct
* bpf_map_def), see struct bpf_elf_map from iproute2.
*/
struct bpf_elf_map SEC("maps") cnt_map = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(__u32),
.size_value = sizeof(__u64),
.max_elem = 1,
//.pinning = PIN_GLOBAL_NS,
};
SEC("classifier") int tc_dummy(struct __sk_buff *skb)
{
volatile void *data, *data_end;
int ret = BPF_OK;
struct ethhdr *eth;
data = (void *)(long)skb->data;
data_end = (void *)(long)skb->data_end;
eth = (struct ethhdr *)data;
if (data + sizeof(*eth) > data_end)
return BPF_DROP;
/* Keep ARP resolution working */
if (eth->h_proto == bpf_htons(ETH_P_ARP)) {
ret = BPF_OK;
goto out;
}
out:
return ret;
}


@@ -1,126 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0+ */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>
#include "iproute2_compat.h"
char _license[] SEC("license") = "GPL";
#define NS_PER_SEC 1000000000
/* skb->len in bytes, thus easier to keep rate in bytes */
#define RATE_IN_BITS (1000 * 1000 * 1000)
#define RATE_IN_BYTES (RATE_IN_BITS / 8)
#define T_HORIZON_DROP (2000 * 1000 * 1000)
/* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */
#define READ_ONCE(V) (V)
#define WRITE_ONCE(X,V) (X) = (V)
struct edt_val {
__u64 rate;
__u64 t_last;
__u64 t_horizon_drop;
__u64 t_horizon_ecn;
};
/* The tc tool (iproute2) uses another ELF map layout than libbpf (struct
* bpf_map_def), see struct bpf_elf_map from iproute2.
*/
struct bpf_elf_map SEC("maps") time_delay_map = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(__u32),
.size_value = sizeof(struct edt_val),
.max_elem = 1,
//.pinning = PIN_GLOBAL_NS,
};
/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to
* be sent in the future.
*/
static __always_inline int sched_departure(struct __sk_buff *skb)
{
struct edt_val *edt;
__u64 t_queue_sz;
__u64 t_xmit_ns;
__u64 t_next;
__u64 t_curr;
int key = 0;
__u64 now;
edt = bpf_map_lookup_elem(&time_delay_map, &key);
if (!edt)
return BPF_DROP;
/* Calc transmission time it takes to send packet 'bytes' */
t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / RATE_IN_BYTES;
// t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / edt->rate;
now = bpf_ktime_get_ns();
/* Allow others to set skb tstamp prior to us */
t_curr = skb->tstamp;
if (t_curr < now)
t_curr = now;
/* The 't_last' timestamp can be in the future. Packets scheduled ahead
 * of this packet can be seen as the queue size measured in time, via
* correlating this to 'now' timestamp.
*/
t_next = READ_ONCE(edt->t_last) + t_xmit_ns;
/* If packet doesn't get scheduled into the future, then there is
* no-queue and we are not above rate limit. Send packet immediately and
* move forward t_last timestamp to now.
*/
if (t_next <= t_curr) {
WRITE_ONCE(edt->t_last, t_curr);
return BPF_OK;
}
/* Calc queue size measured in time */
t_queue_sz = t_next - now;
/* The FQ-pacing qdisc also has a horizon, but we cannot use that, because
 * this BPF-prog will have updated the map (t_last) for this packet and
 * assumes it got its share of the bandwidth.
*/
if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */)
return BPF_DROP;
// TODO Add ECN marking horizon
/* Advance "time queue" */
WRITE_ONCE(edt->t_last, t_next);
/* Schedule packet to be send at future timestamp */
skb->tstamp = t_next;
return BPF_OK;
}
SEC("classifier") int tc_edt_simple(struct __sk_buff *skb)
{
volatile void *data, *data_end;
int ret = BPF_OK;
struct ethhdr *eth;
data = (void *)(long)skb->data;
data_end = (void *)(long)skb->data_end;
eth = (struct ethhdr *)data;
if (data + sizeof(*eth) > data_end)
return BPF_DROP;
/* Keep ARP resolution working */
if (eth->h_proto == bpf_htons(ETH_P_ARP)) {
ret = BPF_OK;
goto out;
}
// TODO: match on vlan16 and only apply EDT on that
return sched_departure(skb);
out:
return ret;
}


@@ -0,0 +1,280 @@
/* SPDX-License-Identifier: GPL-2.0+ */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>
#include <stdbool.h>
#define VLAN_MAX_DEPTH 2
#include <xdp/parsing_helpers.h>
char _license[] SEC("license") = "GPL";
#define NS_PER_SEC 1000000000
/* Strategy: Shape at MAC (Medium Access Control) layer with Ethernet
*
* Production use-case is pacing traffic at 1Gbit/s wirespeed, using a
* 10Gbit/s NIC, because the 1G end-user switch cannot handle bursts.
*
* (https://en.wikipedia.org/wiki/Interpacket_gap
* 12 bytes = interframe gap (IFG) 96 bit
* (https://en.wikipedia.org/wiki/Ethernet_frame)
* 8 bytes = MAC preamble
* 4 bytes = Ethernet Frame Check Sequence (FCS) CRC
* 46 bytes = Minimum Payload size
*
* 14 bytes = Ethernet header
* 8 bytes = 2x VLAN headers
*/
//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */
#define RATE_IN_BITS (990 * 1000 * 1000ULL)
//#define RATE_IN_BITS (950 * 1000 * 1000ULL)
#define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */
//#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */
#define ETH_MIN (84)
/* skb->len in bytes, thus convert rate to bytes */
#define RATE_IN_BYTES (RATE_IN_BITS / 8)
/* Controls how large the queue (in time) is allowed to grow */
#define T_HORIZON_DROP (40 * 1000 * 1000ULL)
#define T_HORIZON_TARGET (5 * 1000 * 1000ULL)
#define T_HORIZON_ECN (1 * 1000 * 1000ULL)
/* Codel: If queue exceed target for more than one interval, start dropping */
#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/
#define CODEL_TARGET T_HORIZON_TARGET
#define CODEL_EXCEED_INTERVAL T_EXCEED_INTERVAL
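To make the rate math concrete, a worked example (my arithmetic, derived from the defines above):

/* Example: a full-MTU frame has skb->wire_len = 1514 (Ethernet header
 * included, VLAN tags assumed HW-offloaded). Adding OVERHEAD
 * (12 IFG + 8 preamble + 4 FCS + 8 VLAN = 32) gives 1546 bytes on wire:
 *   t_xmit_ns = 1546 * NS_PER_SEC / RATE_IN_BYTES
 *             = 1546 * 10^9 / 123750000 ~= 12493 ns
 * so at 990 Mbit/s such frames depart roughly 12.5 usec apart.
 */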
#include "codel_impl.h"
struct edt_val {
__u64 rate;
__u64 t_last;
__u64 t_horizon_drop;
__u64 t_horizon_ecn;
struct codel_state codel;
} __aligned(64); /* Align struct to cache-size to avoid false-sharing */
#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */
/* Use BTF format to create map */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 4096); /* Max possible VLANs */
__type(key, __u32);
__type(value, struct edt_val);
// __uint(pinning, LIBBPF_PIN_BY_NAME);
} time_delay_map SEC(".maps");
#else
/* The (iproute2) tc tool (without libbpf support) uses another ELF map
* layout than libbpf (struct bpf_map_def), see struct bpf_elf_map
* from iproute2.
*/
#include "iproute2_compat.h"
struct bpf_elf_map SEC("maps") time_delay_map = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(__u32),
.size_value = sizeof(struct edt_val),
.max_elem = 4096, /* Max possible VLANs */
// .pinning = PIN_GLOBAL_NS,
};
#endif
/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to
* be sent in the future.
*/
static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key)
{
struct edt_val *edt;
__u64 t_queue_sz;
__u64 t_xmit_ns;
__u64 wire_len;
__u64 t_next;
__u64 t_curr;
__u64 now;
edt = bpf_map_lookup_elem(&time_delay_map, &key);
if (!edt)
return BPF_DROP;
/* Calc transmission time it takes to send packet 'bytes'.
*
* Details on getting precise bytes on wire. The skb->len does include
* length of GRO/GSO segments, but not the segment headers that gets
* added on transmit. Fortunately skb->wire_len at TC-egress hook (not
* ingress) include these headers. (See: qdisc_pkt_len_init())
*/
wire_len = skb->wire_len + OVERHEAD;
wire_len = wire_len > ETH_MIN ? wire_len : ETH_MIN;
t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES;
// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES;
// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate;
// now = bpf_ktime_get_ns();
now = bpf_ktime_get_boot_ns(); /* Use same ktime as bpftrace */
/* Allow others to set skb tstamp prior to us */
t_curr = skb->tstamp;
if (t_curr < now)
t_curr = now;
/* The 't_last' timestamp can be in the future. Packets scheduled ahead
 * of this packet can be seen as the queue size measured in time, via
* correlating this to 'now' timestamp.
*/
t_next = READ_ONCE(edt->t_last) + t_xmit_ns;
/* If packet doesn't get scheduled into the future, then there is
* no-queue and we are not above rate limit. Normally send packet
* immediately and move forward t_last timestamp to now.
*
* But in our use-case the traffic needs smoothing at an earlier
* stage, as bursts at lower rates can hurt the (crappy) switch.
* Thus, schedule SKB transmission at now + t_xmit_ns.
*/
if (t_next <= t_curr) {
#if 1
__u64 t_curr_next;
__u32 min_len = 1538;
/* Minimum delay for all packet if no time-queue */
wire_len = (wire_len > min_len) ? wire_len : min_len;
t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES;
t_curr_next = t_curr + t_xmit_ns;
WRITE_ONCE(edt->t_last, t_curr_next);
skb->tstamp = t_curr_next;
skb->mark = 1; /* No queue - add minimum delay */
#else
WRITE_ONCE(edt->t_last, t_curr);
#endif
return BPF_OK;
}
/* Calc queue size measured in time */
t_queue_sz = t_next - now;
/* The FQ-pacing qdisc also has a horizon, but we cannot use that, because
 * this BPF-prog will have updated the map (t_last) for this packet and
 * assumes it got its share of the bandwidth.
*/
if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */)
return BPF_DROP;
/* If TCP didn't react to ECN marking, then start dropping some */
// if (codel_drop(edt, t_queue_sz, now))
if (codel_drop(&edt->codel, t_queue_sz, t_next))
return BPF_DROP;
skb->mark = 2; /* (time) queue exist - and small/below T_HORIZON_ECN */
/* ECN marking horizon */
if (t_queue_sz >= T_HORIZON_ECN) {
skb->mark = 3; /* (time) queue exist - and is large */
bpf_skb_ecn_set_ce(skb);
}
/* Advance "time queue" */
WRITE_ONCE(edt->t_last, t_next);
/* Schedule packet to be send at future timestamp */
skb->tstamp = t_next;
return BPF_OK;
}
static __always_inline
__u16 get_inner_qinq_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
{
__u16 vlan_key;
/* NIC can HW "offload" the outer VLAN, moving it to skb context */
if (skb->vlan_present)
vlan_key = vlans->id[0]; /* Inner vlan placed as first inline */
else
vlan_key = vlans->id[1]; /* All VLAN headers inline */
return vlan_key;
}
static __always_inline
__u16 get_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
{
__u16 vlan_key;
/* Handle extracting VLAN if skb context have VLAN offloaded */
if (skb->vlan_present)
vlan_key = skb->vlan_tci & VLAN_VID_MASK;
else
vlan_key = vlans->id[0];
return vlan_key;
}
static __always_inline
__u16 extract_vlan_key(struct __sk_buff *skb, struct collect_vlans *vlans)
{
int QinQ = 0;
/* The inner VLAN is the key to extract. But it is complicated
* due to NIC "offloaded" VLAN (skb->vlan_present). In case the
* BPF-prog is loaded on the outer VLAN net_device, it sees the
* inner VLAN as the first and only VLAN.
*/
if (skb->vlan_present) {
if (vlans->id[0])
QinQ = 1;
} else {
if (vlans->id[1])
QinQ = 1;
}
if (QinQ)
return get_inner_qinq_vlan(skb, vlans);
else
return get_vlan(skb, vlans);
}
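Summarizing the decision tree above as a case table (my summary, not part of the commit):

/* skb->vlan_present | vlans->id[0] | vlans->id[1] | resulting vlan_key
 * yes (outer offl.) | inner (!= 0) |      -       | id[0]   (QinQ)
 * yes               |      0       |      -       | vlan_tci & VLAN_VID_MASK
 * no (all inline)   |    outer     | inner (!= 0) | id[1]   (QinQ)
 * no                |    vlan      |      0       | id[0]
 */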
SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb)
{
void *data = (void *)(long)skb->data;
void *data_end = (void *)(long)skb->data_end;
struct collect_vlans vlans = { 0 };
struct ethhdr *eth;
int ret = BPF_OK;
__u16 vlan_key;
/* These keep track of the next header type and iterator pointer */
struct hdr_cursor nh;
int eth_type;
nh.pos = data;
eth_type = parse_ethhdr_vlan(&nh, data_end, &eth, &vlans);
if (eth_type < 0)
return BPF_DROP;
/* Keep ARP resolution working */
if (eth_type == bpf_htons(ETH_P_ARP)) {
ret = BPF_OK;
goto out;
}
if (!proto_is_vlan(eth->h_proto) && !skb->vlan_present) {
/* Skip non-VLAN frames */
return BPF_OK;
}
vlan_key = extract_vlan_key(skb, &vlans);
/* Each (inner) VLAN id gets its own EDT pacing */
return sched_departure(skb, vlan_key);
out:
return ret;
}


@@ -62,3 +62,28 @@ function call_tc() {
function call_tc_allow_fail() {
_call_tc "allow_fail" "$@"
}
## -- Wrapper calls for IP --
function _call_ip() {
local allow_fail="$1"
shift
if [[ -n "$VERBOSE" ]]; then
echo "ip $@"
fi
if [[ -n "$DRYRUN" ]]; then
return
fi
$IP "$@"
local status=$?
if (( $status != 0 )); then
if [[ "$allow_fail" == "" ]]; then
err 3 "Exec error($status) occurred cmd: \"$IP $@\""
fi
fi
}
function call_ip() {
_call_ip "" "$@"
}
function call_ip_allow_fail() {
_call_ip "allow_fail" "$@"
}


@@ -0,0 +1,55 @@
/* SPDX-License-Identifier: LGPL-2.1
*
* Based on Paul Hsieh's (LGPG 2.1) hash function
* From: http://www.azillionmonkeys.com/qed/hash.html
*/
#define get16bits(d) (*((const __u16 *) (d)))
static __always_inline
__u32 SuperFastHash (const char *data, int len, __u32 initval) {
__u32 hash = initval;
__u32 tmp;
int rem;
if (len <= 0 || data == NULL) return 0;
rem = len & 3;
len >>= 2;
/* Main loop */
#pragma clang loop unroll(full)
for (;len > 0; len--) {
hash += get16bits (data);
tmp = (get16bits (data+2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*sizeof (__u16);
hash += hash >> 11;
}
/* Handle end cases */
switch (rem) {
case 3: hash += get16bits (data);
hash ^= hash << 16;
hash ^= ((signed char)data[sizeof (__u16)]) << 18;
hash += hash >> 11;
break;
case 2: hash += get16bits (data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1: hash += (signed char)*data;
hash ^= hash << 10;
hash += hash >> 1;
}
/* Force "avalanching" of final 127 bits */
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 4;
hash += hash >> 17;
hash ^= hash << 25;
hash += hash >> 6;
return hash;
}


@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Taken from from #include <iproute2/bpf_elf.h> */
#ifndef __IPROUTE2_COMPAT_H
#define __IPROUTE2_COMPAT_H
@@ -8,6 +9,11 @@
* binary layout until "flags". Thus, BPF-progs can use both if careful.
*/
/* Object pinning settings */
#define PIN_NONE 0
#define PIN_OBJECT_NS 1
#define PIN_GLOBAL_NS 2
/* ELF map definition (copied from iproute2 source code) */
struct bpf_elf_map {
__u32 type;


@@ -10,10 +10,10 @@ function usage() {
echo "Usage: $0 [-vh] --dev ethX"
echo " -d | --dev : (\$DEV) Interface/device (required)"
echo " -v | --verbose : (\$VERBOSE) verbose"
echo " --remove : (\$REMOVE) Remove the TC rules"
echo " --remove : (\$REMOVE) Remove the rules"
echo " --dry-run : (\$DRYRUN) Dry-run only (echo tc commands)"
echo " -s | --stats : (\$STATS_ONLY) Call TC statistics command"
echo " -l | --list : (\$LIST) List TC filter setup after setup"
echo " -s | --stats : (\$STATS_ONLY) Call statistics command"
echo " -l | --list : (\$LIST) List setup after setup"
echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load"
echo ""
}
@@ -80,5 +80,5 @@ done
if [ -z "$DEV" ]; then
usage
err 2 "Please specify TC net_device"
err 2 "Please specify net_device (\$DEV)"
fi


@@ -0,0 +1,77 @@
#!/bin/bash
#
# Load the FQ pacing qdisc in a multi-queue (MQ) setup to avoid the root qdisc lock.
#
# The FQ pacing qdisc does all the work of pacing packets out according to
# the EDT (Earliest Departure Time) future timestamps set by our BPF-prog that
# runs at the TC-egress hook.
#
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export TC=tc
# Default verbose
VERBOSE=1
# Select between multiq or single root qdisc
if [[ -z $1 ]]; then
if [[ -z $REMOVE ]]; then
err 1 "Specify root qdisc system: single or mq (multi-queue)"
fi
fi
TYPE=$1
# Delete existing root qdisc
call_tc_allow_fail qdisc del dev "$DEV" root
if [[ -n $REMOVE ]]; then
exit 0
fi
function use_multiq()
{
# MQ (Multi-Queue) as root qdisc
call_tc qdisc replace dev $DEV root handle 7FFF: mq
# Add FQ-pacer qdisc on each available NIC TX-queue
i=0
for dir in /sys/class/net/$DEV/queues/tx-*; do
# Detail: causes off-by-one, as tx-0 becomes handle 1:
((i++)) || true
#call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
#
# The higher 'flow_limit' is needed for high-BW pacing
call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \
flow_limit 1000
#
# quantum $((1514*4)) initial_quantum $((1514*20))
# call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit
done
}
function use_single_fq_pacer()
{
call_tc qdisc replace dev $DEV root handle 7FFF: fq \
flow_limit 1000
}
case "$TYPE" in
mq | multiq )
use_multiq
;;
single | fq )
use_single_fq_pacer
;;
* )
err 1 "Unknown type: ${TYPE}"
;;
esac
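For a NIC with e.g. 4 TX-queues, the multiq branch produces this qdisc layout (a sketch of the loop's result, including the deliberate off-by-one):

  7FFF: mq root
    7FFF:1 -> fq handle 1: (tx-0, flow_limit 1000)
    7FFF:2 -> fq handle 2: (tx-1, flow_limit 1000)
    7FFF:3 -> fq handle 3: (tx-2, flow_limit 1000)
    7FFF:4 -> fq handle 4: (tx-3, flow_limit 1000)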


@@ -0,0 +1,95 @@
#!/bin/bash
#
# This HTB shaper setup script is available for easier comparison of its
# accuracy against the EDT solution.
#
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export TC=/sbin/tc
# It seems measured BW is TCP goodput, while configured BW is wirespeed.
# Measurements show around 930Mbit best-case. Q-in-Q results in frames of
# 1522 bytes. TCP goodput segments are 1448 bytes.
#
#RATE=$((930*1522/1448))Mbit
##RATE=$((933*1522/1448))Mbit
##CEIL=$((999*1522/1448))
#CEIL=1Gbit
#CEIL=980mbit
# EDT shaper show TCP goodput of 956 Mbit/s.
# echo $((956*1514/1448)) = 999
RATE=999Mbit
CEIL=1000Mbit
#RATE=500mbit
#CEIL=577mbit
# Each of the HTB root-class(es) get these RATE+CEIL upper bandwidth bounds.
ROOT_RATE=9000Mbit
ROOT_CEIL=9500Mbit
DEFAULT_RATE=6000Mbit
DEFAULT_CEIL=6000Mbit
TC=/usr/sbin/tc
VERBOSE=1
function tc() {
_call_tc "" "$@"
}
# Delete existing root qdisc
call_tc_allow_fail qdisc del dev "$DEV" root
if [[ -n $REMOVE ]]; then
exit 0
fi
# HTB shaper
#tc qdisc add dev "$DEV" root handle 1: htb default 2
tc qdisc add dev "$DEV" root handle 1: htb default 16
# The root-class set upper bandwidth usage
tc class add dev "$DEV" parent 1: classid 1:1 \
htb rate $ROOT_RATE ceil $ROOT_CEIL
# Default class 1:2
tc class add dev "$DEV" parent 1: classid 1:2 htb \
rate "$DEFAULT_RATE" ceil "$DEFAULT_CEIL"
# burst 100000 cburst 100000
tc qdisc add dev $DEV parent 1:2 fq_codel
# Class for vlan 16
tc class add dev "$DEV" parent 1: classid 1:16 htb rate "$RATE" ceil "$CEIL" \
burst $((1522*2)) cburst $((1522*2)) \
linklayer ethernet
# burst 1522 cburst 1522
#burst 1 cburst 1
# burst $((1522*2)) cburst $((1522*2))
# overhead $((14+4+4)) linklayer ethernet
#tc qdisc add dev "$DEV" parent 1:16 fq_codel
tc qdisc add dev "$DEV" parent 1:16 fq_codel quantum $((1514+4+4))
#tc qdisc add dev "$DEV" parent 1:16 pfifo
# parent filter:
#tc filter add dev "$DEV" parent 1:0 prio 100 protocol 802.1q u32
#
# vlan 16:
#tc filter add dev "$DEV" parent 1:0 prio 100 \
# protocol 802.1q \
# u32 match u16 0x0010 0x0fff at -4 \
# flowid 1:16
tc filter add dev $DEV protocol all parent 1:0 prio 101 \
basic match "meta(vlan mask 0xfff eq 16)" flowid 1:16


@@ -0,0 +1,84 @@
#!/bin/bash
#
# Testlab setup script for VLAN Q-in-Q (double tagged VLAN) config.
#
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export IP=/sbin/ip
function ip() {
call_ip "$@"
}
function create_vlan_device() {
local vlan=${1}
local device=${2:-$DEV}
shift 2
if [[ -z "$vlan" ]]; then
err 2 "Missing VLAN is as input"
fi
ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan}
ip link set ${device}.${vlan} up
}
function create_vlan_device_802_1ad() {
local vlan=${1}
local device=${2:-$DEV}
shift 2
if [[ -z "$vlan" ]]; then
err 2 "Missing VLAN is as input"
fi
ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} \
protocol 802.1ad
ip link set ${device}.${vlan} up
}
function delete_vlan_device() {
local vlan=${1}
local device=${2:-$DEV}
shift 2
if [[ -z "$vlan" ]]; then
err 2 "Missing VLAN is as input"
fi
ip link del ${device}.${vlan}
}
if [[ -z "$1" ]]; then
err 3 "Missing arg#1 for outer vlan"
fi
OUTER=$1
if [[ -z "$2" ]]; then
err 3 "Missing arg#2 for inner vlan"
fi
INNER=$2
if [[ -n $REMOVE ]]; then
delete_vlan_device $INNER ${DEV}.${OUTER}
delete_vlan_device $OUTER $DEV
exit 0
fi
create_vlan_device $OUTER $DEV
create_vlan_device $INNER ${DEV}.${OUTER}
# Set MTU to handle the extra VLAN headers; NICs usually allow one VLAN
# header even though they are configured with MTU 1500.
ip link set $DEV mtu 1508
ip link set ${DEV}.${OUTER} mtu 1504
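The MTU arithmetic (4 bytes per 802.1Q tag): the physical $DEV must carry both tags, 1500 + 2*4 = 1508; the outer-VLAN device ${DEV}.${OUTER} carries one remaining tag, 1500 + 4 = 1504.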


@@ -0,0 +1,39 @@
#!/bin/bash
#
# Script for loading EDT-pacer BPF-prog on all downstream VLANs
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
# Default verbose
VERBOSE=1
# Downstream dev: ens6f0
VLAN_START=168
VLAN_END=205
cmd=${basedir}/bpf_egress_loader.sh
options=""
if [[ -n $REMOVE ]]; then
options+=" --remove"
fi
if [[ -n $DRYRUN ]]; then
options+=" --dry-run"
#cmd="echo $cmd"
fi
if [[ -n $VERBOSE ]]; then
options+=" --verbose"
fi
for (( vlan=${VLAN_START}; vlan<=${VLAN_END}; vlan++ ))
do
VLAN=${DEV}.$vlan
$cmd --dev $VLAN $options
done


@@ -0,0 +1,383 @@
// SPDX-License-Identifier: GPL-2.0+
static const char *__doc__ =
" XDP load-balancing with CPU-map";
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <locale.h>
#include <sys/resource.h>
#include <sys/sysinfo.h>
#include <getopt.h>
#include <net/if.h>
#include <time.h>
#include <linux/limits.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h> /* XDP defines */
static int ifindex = -1;
static char ifname_buf[IF_NAMESIZE];
static char *ifname;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
/* Exit return codes */
#define EXIT_OK 0
#define EXIT_FAIL 1
#define EXIT_FAIL_OPTION 2
#define EXIT_FAIL_XDP 3
#define EXIT_FAIL_BPF 4
#define EXIT_FAIL_MEM 5
#define EXIT_FAIL_FILE 6
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
{"dev", required_argument, NULL, 'd' },
{"qsize", required_argument, NULL, 'q' },
{"force", no_argument, NULL, 'F' },
{"remove", no_argument, NULL, 'r' },
{"non-cpu", required_argument, NULL, 'x' },
{"exclude-cpu", required_argument, NULL, 'x' },
{0, 0, NULL, 0 }
};
static void usage(char *argv[])
{
int i;
printf("\nDOCUMENTATION:\n%s\n", __doc__);
printf("\n");
printf(" Usage: %s (options-see-below)\n", argv[0]);
printf(" Listing options:\n");
for (i = 0; long_options[i].name != 0; i++) {
printf(" --%-12s", long_options[i].name);
if (long_options[i].flag != NULL)
printf(" flag (internal value:%d)",
*long_options[i].flag);
else
printf(" short-option: -%c",
long_options[i].val);
printf("\n");
}
printf("\n");
}
struct cpumap_config {
int fd_cpumap;
int fd_cpus_enabled;
int fd_cpus_count;
int *cpu_exclude;
int max_cpus;
__u32 qsize;
};
static int cpumap_config_init(struct cpumap_config *cfg)
{
int n_cpus = get_nprocs_conf();
int *cpu_exclude;
memset(cfg, 0, sizeof(*cfg));
cpu_exclude = malloc(n_cpus * sizeof(int));
if (!cpu_exclude) {
fprintf(stderr, "failed to allocate array\n");
return EXIT_FAIL_MEM;
}
memset(cpu_exclude, 0, n_cpus * sizeof(int));
cfg->cpu_exclude = cpu_exclude;
cfg->max_cpus = n_cpus;
return 0;
}
int __find_map_fd_by_name(struct bpf_object *obj, char *name)
{
int fd;
fd = bpf_object__find_map_fd_by_name(obj, name);
if (fd < 0) {
printf("No map found! - named: %s\n", name);
exit(EXIT_FAIL_BPF);
}
return fd;
}
/* Get file descriptors to BPF-maps */
static int cpumap_config_find_maps(struct bpf_object *obj,
struct cpumap_config *cfg)
{
cfg->fd_cpumap = __find_map_fd_by_name(obj, "cpumap");
cfg->fd_cpus_enabled = __find_map_fd_by_name(obj, "cpus_enabled");
cfg->fd_cpus_count = __find_map_fd_by_name(obj, "cpus_count");
return 0;
}
static int create_cpu_entry(struct cpumap_config *cfg, __u32 cpu,
struct bpf_cpumap_val *value,
__u32 enabled_idx, bool new)
{
__u32 curr_cpus_count = 0;
__u32 key = 0;
int err, fd;
/* Add a CPU entry to cpumap, as this allocates a CPU entry in
* the kernel for that CPU.
*/
fd = cfg->fd_cpumap;
err = bpf_map_update_elem(fd, &cpu, value, 0);
if (err) {
fprintf(stderr, "Create(fd:%d) CPU(%d) entry failed (err:%d)\n",
fd, cpu, err);
return EXIT_FAIL_BPF;
}
/* Inform the BPF-progs that a new CPU is enabled and available
* to be selected via the map that maps index to actual CPU.
*/
fd = cfg->fd_cpus_enabled;
err = bpf_map_update_elem(fd, &enabled_idx, &cpu, 0);
if (err) {
fprintf(stderr, "Add to enabled avail CPUs failed\n");
return EXIT_FAIL_BPF;
}
/* When not replacing/updating existing entry, bump the count */
fd = cfg->fd_cpus_count;
err = bpf_map_lookup_elem(fd, &key, &curr_cpus_count);
if (err) {
fprintf(stderr, "Failed reading curr cpus_count\n");
return EXIT_FAIL_BPF;
}
if (new) {
curr_cpus_count++;
err = bpf_map_update_elem(fd, &key, &curr_cpus_count, 0);
if (err) {
fprintf(stderr, "Failed write curr cpus_count\n");
return EXIT_FAIL_BPF;
}
}
return 0;
}
/* Userspace MUST create/populate CPUMAP entries for redirect to work
*/
static int configure_cpus(struct cpumap_config *cfg)
{
struct bpf_cpumap_val value = { 0 };
int n_cpus = cfg->max_cpus;
int *exclude = cfg->cpu_exclude;
int enabled_idx = 0;
bool new = true;
int cpu, err;
value.qsize = cfg->qsize;
for (cpu = 0; cpu < n_cpus; cpu++) {
if (exclude[cpu] == -1) {
printf("Excluding CPU:%d\n", cpu);
continue;
}
printf("Enable CPU:%d\n", cpu);
err = create_cpu_entry(cfg, cpu, &value, enabled_idx, new);
if (err)
return err;
enabled_idx++;
}
return 0;
}
struct bpf_object *do_load_bpf_obj(struct bpf_object *obj)
{
char buf[200];
int err;
err = bpf_object__load(obj);
if (err) {
libbpf_strerror(err, buf, sizeof(buf));
printf("Error loading: %s\n", buf);
return NULL;
}
return obj;
}
int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags)
{
int prog_fd = bpf_program__fd(prog);
int err;
if (prog_fd < 0) {
fprintf(stderr, "bpf_program__fd failed\n");
return EXIT_FAIL_BPF;
}
err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
if (err) {
fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
__func__, err);
return EXIT_FAIL_XDP;
}
return EXIT_OK;
}
int do_xdp_detach(int ifindex, __u32 xdp_flags)
{
int err;
err = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
if (err) {
fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
__func__, err);
return EXIT_FAIL_XDP;
}
return EXIT_OK;
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
bool do_detach = false;
int opt, longindex = 0;
char buf[100];
int err;
struct bpf_object *obj = NULL;
struct bpf_program *prog;
/* System to setup and exclude some CPUs */
struct cpumap_config cfg;
int n_cpus = get_nprocs_conf();
int non_cpu = -1;
int *cpu_exclude;
cpumap_config_init(&cfg);
cpu_exclude = cfg.cpu_exclude;
cfg.qsize = 512; /* Default queue size */
/* Always use XDP native driver mode */
xdp_flags |= XDP_FLAGS_DRV_MODE;
obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL);
err = libbpf_get_error(obj);
if (err) {
libbpf_strerror(err, buf, sizeof(buf));
printf("Error opening file: %s\n", buf);
return EXIT_FAIL_FILE;
}
err = EXIT_OK;
/* Parse commands line args */
while ((opt = getopt_long(argc, argv, "hd:q:Frx:",
long_options, &longindex)) != -1) {
switch (opt) {
case 'd':
if (strlen(optarg) >= IF_NAMESIZE) {
fprintf(stderr, "ERR: --dev name too long\n");
goto error;
}
ifname = (char *)&ifname_buf;
strncpy(ifname, optarg, IF_NAMESIZE);
ifindex = if_nametoindex(ifname);
if (ifindex == 0) {
fprintf(stderr,
"ERR: --dev name unknown err(%d):%s\n",
errno, strerror(errno));
goto error;
}
break;
case 'q':
cfg.qsize = strtol(optarg, NULL, 10);
break;
case 'F':
xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
case 'r':
do_detach = true;
break;
case 'x': /* --exclude-cpu or --non-cpu */
/* Possible to exclude multiple CPUs on cmdline */
non_cpu = strtoul(optarg, NULL, 0);
if (non_cpu >= n_cpus) {
fprintf(stderr,
"--cpu nr too large for cpumap err(%d):%s\n",
errno, strerror(errno));
goto error;
}
cpu_exclude[non_cpu] = -1;
break;
case 'h':
error:
default:
usage(argv);
free(cpu_exclude);
return EXIT_FAIL_OPTION;
}
}
/* Required option */
if (ifindex == -1) {
fprintf(stderr, "ERR: required option --dev missing\n");
usage(argv);
err = EXIT_FAIL_OPTION;
goto out;
}
if (do_detach)
return do_xdp_detach(ifindex, xdp_flags);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
err = EXIT_FAIL_MEM;
goto out;
}
obj = do_load_bpf_obj(obj);
if (!obj) {
err = EXIT_FAIL_BPF;
goto out;
}
/* Pickup first BPF-program */
prog = bpf_program__next(NULL, obj);
if (!prog) {
printf("No program!\n");
err = EXIT_FAIL_BPF;
goto out;
}
/* Find maps */
if (cpumap_config_find_maps(obj, &cfg)) {
err = EXIT_FAIL_BPF;
goto out;
}
/* Configure cpumap */
if (configure_cpus(&cfg)) {
err = EXIT_FAIL_BPF;
goto out;
}
/* Attach XDP program */
err = do_xdp_attach(ifindex, prog, xdp_flags);
if (err)
goto out;
printf("Attached XDP program:\"%s\" on netdev:%s (ifindex:%d)\n",
bpf_program__name(prog), ifname, ifindex);
printf("CPUs: %d\n", n_cpus);
out:
if (obj)
bpf_object__close(obj);
free(cpu_exclude);
return err;
}
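A hypothetical invocation, based on the options parsed above (device name and CPU numbers are examples only):

  sudo ./xdp_cpumap_loader --dev eth0 --qsize 1024 --exclude-cpu 0 --exclude-cpu 1
  sudo ./xdp_cpumap_loader --dev eth0 --remove    # detach the XDP program again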


@@ -0,0 +1,111 @@
/* SPDX-License-Identifier: GPL-2.0+ */
#include <linux/types.h>
#include <linux/bpf.h> /* struct bpf_cpumap_val */
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>
#define INITVAL 15485863
//#define INITVAL 2654435761
#include "hash_func01.h" /* SuperFastHash */
#include <bpf/bpf_helpers.h>
#define VLAN_MAX_DEPTH 2
#include <xdp/parsing_helpers.h>
#define MAX_CPUS 24
/* This global variable is used for limiting CPU that can be selected */
__u32 global_max_cpus = 12; /* TODO: Allow userspace to adjust this */
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct {
__uint(type, BPF_MAP_TYPE_CPUMAP);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_cpumap_val));
__uint(max_entries, MAX_CPUS);
} cpumap SEC(".maps");
/* Mapping table with CPUs enabled, for hashing between */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, __u32);
__type(value, __u32);
__uint(max_entries, MAX_CPUS);
} cpus_enabled SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, __u32);
__type(value, __u32);
__uint(max_entries, 1);
} cpus_count SEC(".maps");
static __always_inline
__u32 extract_vlan_key(struct collect_vlans *vlans)
{
/* Combine inner and outer VLAN as a key */
__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
return vlan_key;
}
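A worked example of the key construction (illustrative VLAN ids):

/* Example: an inline QinQ frame with outer VLAN 168 and inner VLAN 16
 * parses to vlans.id[0] = 168 (outer, seen first) and vlans.id[1] = 16,
 * giving vlan_key = (16 << 16) | 168 = 0x001000a8. This key is hashed
 * with SuperFastHash() below and reduced modulo the enabled-CPU count.
 */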
SEC("xdp")
int xdp_cpumap_qinq(struct xdp_md *ctx)
{
void *data = (void *)(long)ctx->data;
void *data_end = (void *)(long)ctx->data_end;
struct collect_vlans vlans = { 0 };
__u32 hash_key, vlan_key;
struct ethhdr *eth;
__u32 cpu_idx, cpu_dest = 0;
__u32 *cpu_lookup;
__u64 action;
__u32 *cpu_max;
/* These keep track of the next header type and iterator pointer */
struct hdr_cursor nh;
int eth_type;
nh.pos = data;
eth_type = parse_ethhdr_vlan(&nh, data_end, &eth, &vlans);
if (eth_type < 0) {
action = XDP_ABORTED;
goto out;
}
/* Keep ARP resolution working */
if (eth_type == bpf_htons(ETH_P_ARP)) {
action = XDP_PASS;
goto out;
}
if (!proto_is_vlan(eth->h_proto)) {
/* Skip non-VLAN frames */
action = XDP_PASS;
goto out;
}
int key0 = 0;
cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
if (!cpu_max)
return XDP_ABORTED;
/* Use inner+outer VLAN as key and hash based on max_cpus */
vlan_key = extract_vlan_key(&vlans);
hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
cpu_idx = hash_key % *cpu_max;
/* To allow excluding some CPUs, a mapping table cpus_enabled
* translates cpu_idx to real CPU-id
*/
cpu_lookup = bpf_map_lookup_elem(&cpus_enabled, &cpu_idx);
if (!cpu_lookup)
return XDP_ABORTED;
cpu_dest = *cpu_lookup;
/* Notice: Userspace MUST insert entries into cpumap */
action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
out:
return action;
}