Merge branch 'master' of https://github.com/netoptimizer/bpf-examples into netoptimizer-master

Signed-off-by: Jesper Dangaard Brouer <netoptimizer@brouer.com>
Jesper Dangaard Brouer
2021-01-08 14:54:40 +01:00
21 changed files with 1636 additions and 186 deletions

headers/bpf/compiler.h (new file)

@@ -0,0 +1,124 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2016-2020 Authors of Cilium */
#ifndef __BPF_COMPILER_H_
#define __BPF_COMPILER_H_
#ifndef __non_bpf_context
# include "stddef.h"
#endif
#ifndef __section
# define __section(X) __attribute__((section(X), used))
#endif
#ifndef __maybe_unused
# define __maybe_unused __attribute__((__unused__))
#endif
#ifndef offsetof
# define offsetof(T, M) __builtin_offsetof(T, M)
#endif
#ifndef field_sizeof
# define field_sizeof(T, M) sizeof((((T *)NULL)->M))
#endif
#ifndef __packed
# define __packed __attribute__((packed))
#endif
#ifndef __nobuiltin
# if __clang_major__ >= 10
# define __nobuiltin(X) __attribute__((no_builtin(X)))
# else
# define __nobuiltin(X)
# endif
#endif
#ifndef likely
# define likely(X) __builtin_expect(!!(X), 1)
#endif
#ifndef unlikely
# define unlikely(X) __builtin_expect(!!(X), 0)
#endif
#ifndef always_succeeds /* Mainly for documentation purposes. */
# define always_succeeds(X) likely(X)
#endif
#undef __always_inline /* stddef.h defines its own */
#define __always_inline inline __attribute__((always_inline))
#ifndef __stringify
# define __stringify(X) #X
#endif
#ifndef __fetch
# define __fetch(X) (__u32)(__u64)(&(X))
#endif
#ifndef __aligned
# define __aligned(X) __attribute__((aligned(X)))
#endif
#ifndef build_bug_on
# define build_bug_on(E) ((void)sizeof(char[1 - 2*!!(E)]))
#endif
#ifndef __throw_build_bug
# define __throw_build_bug() __builtin_trap()
#endif
#ifndef __printf
# define __printf(X, Y) __attribute__((__format__(printf, X, Y)))
#endif
#ifndef barrier
# define barrier() asm volatile("": : :"memory")
#endif
#ifndef barrier_data
# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory")
#endif
static __always_inline void bpf_barrier(void)
{
/* Workaround to avoid verifier complaint:
* "dereference of modified ctx ptr R5 off=48+0, ctx+const is allowed,
* ctx+const+const is not"
*/
barrier();
}
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
#endif
#ifndef __READ_ONCE
# define __READ_ONCE(X) (*(volatile typeof(X) *)&X)
#endif
#ifndef __WRITE_ONCE
# define __WRITE_ONCE(X, V) (*(volatile typeof(X) *)&X) = (V)
#endif
/* {READ,WRITE}_ONCE() with verifier workaround via bpf_barrier(). */
#ifndef READ_ONCE
# define READ_ONCE(X) \
({ typeof(X) __val = __READ_ONCE(X); \
bpf_barrier(); \
__val; })
#endif
#ifndef WRITE_ONCE
# define WRITE_ONCE(X, V) \
({ typeof(X) __val = (V); \
__WRITE_ONCE(X, __val); \
bpf_barrier(); \
__val; })
#endif
#endif /* __BPF_COMPILER_H_ */
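To show how these macros fit together, here is a minimal usage sketch (not part of the commit; map and program names are invented for illustration):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, __u32);
	__type(value, __u64);
	__uint(max_entries, 1);
} pkt_counter SEC(".maps");

__section("classifier")
int count_pkts(struct __sk_buff *skb)
{
	__u32 key = 0;
	__u64 *val = bpf_map_lookup_elem(&pkt_counter, &key);

	if (!val)
		return 0;
	/* READ_ONCE()/WRITE_ONCE() go through a volatile cast plus
	 * bpf_barrier(), preventing clang from merging or reordering
	 * the accesses (the verifier workaround described above). */
	WRITE_ONCE(*val, READ_ONCE(*val) + 1);
	return 0;
}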


@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */
/*
* This file contains parsing functions that can be used in XDP programs. The
* functions are marked as __always_inline, and fully defined in this header
* file to be included in the BPF program.
* This file contains parsing functions that are used in the packetXX XDP
* programs. The functions are marked as __always_inline, and fully defined in
* this header file to be included in the BPF program.
*
* Each helper parses a packet header, including doing bounds checking, and
* returns the type of its contents if successful, and -1 otherwise.
@@ -10,6 +10,10 @@
* For Ethernet and IP headers, the content type is the type of the payload
* (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field.
* All return values are in host byte order.
*
* The versions of the functions included here are slightly expanded versions of
* the functions in the packet01 lesson. For instance, the Ethernet header
* parsing has support for parsing VLAN tags.
*/
#ifndef __PARSING_HELPERS_H
@@ -54,7 +58,7 @@ struct icmphdr_common {
/* Allow users of header file to redefine VLAN max depth */
#ifndef VLAN_MAX_DEPTH
#define VLAN_MAX_DEPTH 4
#define VLAN_MAX_DEPTH 2
#endif
/* Longest chain of IPv6 extension headers to resolve */
@@ -62,6 +66,11 @@ struct icmphdr_common {
#define IPV6_EXT_MAX_CHAIN 6
#endif
#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
/* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */
struct collect_vlans {
__u16 id[VLAN_MAX_DEPTH];
};
static __always_inline int proto_is_vlan(__u16 h_proto)
{
@@ -74,18 +83,24 @@ static __always_inline int proto_is_vlan(__u16 h_proto)
* Ethernet header. Thus, caller can look at eth->h_proto to see if this was a
* VLAN tagged packet.
*/
static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
struct ethhdr **ethhdr)
static __always_inline int parse_ethhdr_vlan(struct hdr_cursor *nh,
void *data_end,
struct ethhdr **ethhdr,
struct collect_vlans *vlans)
{
struct ethhdr *eth = nh->pos;
int hdrsize = sizeof(*eth);
struct vlan_hdr *vlh;
__u16 h_proto;
int i;
if (eth + 1 > data_end)
/* Byte-count bounds check; check if current pointer + size of header
* is after data_end.
*/
if (nh->pos + hdrsize > data_end)
return -1;
nh->pos = eth + 1;
nh->pos += hdrsize;
*ethhdr = eth;
vlh = nh->pos;
h_proto = eth->h_proto;
@@ -102,6 +117,10 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
break;
h_proto = vlh->h_vlan_encapsulated_proto;
if (vlans) /* collect VLAN ids */
vlans->id[i] =
(bpf_ntohs(vlh->h_vlan_TCI) & VLAN_VID_MASK);
vlh++;
}
@@ -109,6 +128,14 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
return h_proto; /* network-byte-order */
}
static __always_inline int parse_ethhdr(struct hdr_cursor *nh,
void *data_end,
struct ethhdr **ethhdr)
{
/* Expect the compiler to remove the code that collects VLAN ids */
return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL);
}
static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh,
void *data_end,
__u8 next_hdr_type)
@@ -174,6 +201,9 @@ static __always_inline int parse_iphdr(struct hdr_cursor *nh,
return -1;
hdrsize = iph->ihl * 4;
/* Sanity check packet field is valid */
if (hdrsize < sizeof(*iph))
return -1;
/* Variable-length IPv4 header, need to use byte-based arithmetic */
if (nh->pos + hdrsize > data_end)
@@ -267,10 +297,15 @@ static __always_inline int parse_tcphdr(struct hdr_cursor *nh,
return -1;
len = h->doff * 4;
if ((void *) h + len > data_end)
/* Sanity check packet field is valid */
if (len < sizeof(*h))
return -1;
nh->pos = h + 1;
/* Variable-length TCP header, need to use byte-based arithmetic */
if (nh->pos + len > data_end)
return -1;
nh->pos += len;
*tcphdr = h;
return len;
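For context, these helpers are meant to be chained through a single hdr_cursor. A rough sketch of the intended calling pattern (my example, assuming an XDP program and the usual <bpf/bpf_endian.h> and <linux/in.h> includes; not part of the commit):

SEC("xdp")
int xdp_parse_example(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct hdr_cursor nh = { .pos = data };
	struct ethhdr *eth;
	struct iphdr *iph;
	struct tcphdr *tcph;

	/* Each helper bounds-checks, advances nh->pos, and returns the
	 * type of the payload, or -1 on failure. */
	if (parse_ethhdr(&nh, data_end, &eth) != bpf_htons(ETH_P_IP))
		return XDP_PASS;
	if (parse_iphdr(&nh, data_end, &iph) != IPPROTO_TCP)
		return XDP_PASS;
	if (parse_tcphdr(&nh, data_end, &tcph) < 0)
		return XDP_ABORTED;
	return XDP_PASS;
}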


@@ -1,14 +1,23 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
USER_TARGETS :=
BPF_TARGETS := edt_pacer01
BPF_TARGETS += edt_pacer02
USER_TARGETS := xdp_cpumap_loader
BPF_TARGETS := edt_pacer_vlan
BPF_TARGETS += xdp_cpumap_qinq
EXTRA_DEPS += config.mk
LIB_DIR = ../lib
include $(LIB_DIR)/common.mk
include config.mk
# The iproute2 'tc' tool doesn't understand BTF debug info
all: config.mk
config.mk: configure
@sh configure
ifndef HAVE_TC_LIBBPF
# If the iproute2 'tc' tool doesn't understand BTF debug info
# use llvm-strip to remove this debug info from object file
#
# *BUT* cannot strip everything as it removes ELF elems needed for
@@ -16,6 +25,8 @@ include $(LIB_DIR)/common.mk
#
.PHONY: strip_tc_obj
strip_tc_obj: ${BPF_TARGETS:=.o}
$(Q) echo "tc doesn't support libbpf - strip BTF info"
$(Q) llvm-strip --no-strip-all --remove-section .BTF $?
all: strip_tc_obj
endif


@@ -11,12 +11,12 @@ root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export TC=/sbin/tc
export TC=tc
# This can be changed via --file or --obj
if [[ -z ${BPF_OBJ} ]]; then
# Fallback default
BPF_OBJ=edt_pacer02.o
BPF_OBJ=edt_pacer_vlan.o
fi
info "Applying TC-BPF egress setup on device: $DEV with object file: $BPF_OBJ"


@@ -0,0 +1,31 @@
#!/usr/local/bin/bpftrace
#include <linux/skbuff.h>
/* Measure time difference between EDT-time and real "NIC" TX-time.
*
* Assuming packets are EDT timestamped by the BPF-program, we can
* detect/measure how accurately packets are actually transmitted
* towards the NIC driver, by comparing EDT-time against "now"
* timestamp in the function transmitting to the NIC driver.
*/
// tracepoint:net:net_dev_start_xmit
tracepoint:net:net_dev_xmit
{
$skb = (struct sk_buff *)args->skbaddr;
//$tstamp = (uint64)$skb->tstamp;
$tstamp = $skb->skb_mstamp_ns;
$now = nsecs;
// if ($skb->mark > 0) {
if ($tstamp > 0) {
if ($now >= $tstamp) {
$diff_late = $now - $tstamp;
} else {
$diff_ahead = $tstamp - $now;
}
@tstamp_diff_late = hist($diff_late / 1000);
@tstamp_diff_ahead = hist($diff_ahead / 1000);
}
}


@@ -0,0 +1,78 @@
#!/usr/local/bin/bpftrace
#include <linux/skbuff.h>
// tracepoint:net:net_dev_start_xmit
tracepoint:net:net_dev_xmit
{
$skb = (struct sk_buff *)args->skbaddr;
//$tstamp = (uint64)$skb->tstamp;
$tstamp = $skb->skb_mstamp_ns;
$now = nsecs;
// if ($skb->mark > 0) {
if ($tstamp > 0) {
if ($now >= $tstamp) {
$diff_late = $now - $tstamp;
} else {
$diff_ahead = $tstamp - $now;
}
@tstamp_usec_diff_late = hist($diff_late / 1000);
@tstamp_usec_diff_ahead = hist($diff_ahead / 1000);
}
/* Capture burstiness over a time period, by dividing the nanosec
 * timestamp by the wanted period, and keeping a per-CPU byte counter
 * as long as the period number matches.
 *
 * Practical usage shows that bpftrace uses a hash-map to implement
 * this, which unfortunately costs too much (shows 5% jhash CPU
 * usage), enough overhead to change the behavior of a production system.
*/
//$period = $now / 10000; /* 10000 = 10 usec */
$period = $now / 30000; /* 30000 = 30 usec */
if (@state[cpu] == $period) {
@state_bytes[cpu] += $skb->len;
} else {
@state[cpu] = $period;
if (@state_bytes[cpu] > 0) {
@byte_burst[cpu] = hist(@state_bytes[cpu]);
}
@state_bytes[cpu] = $skb->len; /* Reset counter */
}
}
/*
tracepoint:qdisc:qdisc_dequeue
{
@qdisc_bulk_dequeue = lhist(args->packets, 0,64,1);
}
*/
/*
kretfunc:dev_hard_start_xmit
{
// Wanted to know if ret == NETDEV_TX_BUSY
# ERROR: kfunc/kretfunc not available for your linked against bcc version.
}
*/
/* How often does the FQ pacer find that no packets are qualified to be
 * scheduled, which leads to scheduling an hrtimer event that will
 * start the qdisc again at a later time.
 *
 * We cannot kprobe fq_dequeue as it is in a module.
*/
/*
kprobe:qdisc_watchdog_schedule_range_ns
{
@qdisc_watchdog[cpu] = count();
}
kprobe:__netif_schedule
{
@__netif_schedule[cpu] = count();
}
*/


@@ -0,0 +1,153 @@
#ifndef __CODEL_IMPL_H
#define __CODEL_IMPL_H
#ifndef CODEL_TARGET
#define CODEL_TARGET (10 * 1000 * 1000ULL) /* 10 ms in nanosec */
#endif
#ifndef CODEL_EXCEED_INTERVAL
#define CODEL_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/
#endif
/* Codel like dropping scheme, inspired by:
* - RFC: https://queue.acm.org/detail.cfm?id=2209336
* - Code: https://queue.acm.org/appendices/codel.html
* - Kernel: include/net/codel_impl.h
*/
struct codel_state {
/* codel like dropping scheme */
__u64 first_above_time; /* Time when above target (0 if below)*/
__u64 drop_next; /* Time to drop next packet */
__u32 count; /* Packets dropped since going into drop state */
__u32 dropping; /* Equal to 1 if in drop state */
};
/* Table lookup for square-root shifted 16 bit */
static __always_inline __u32 get_sqrt_sh16(__u64 cnt)
{
switch (cnt) {
case 1: return 65536; /* 65536 * sqrt(1) */
case 2: return 92682; /* 65536 * sqrt(2) */
case 3: return 113512; /* 65536 * sqrt(3) */
case 4: return 131072; /* 65536 * sqrt(4) */
case 5: return 146543; /* 65536 * sqrt(5) */
case 6: return 160530; /* 65536 * sqrt(6) */
case 7: return 173392;
case 8: return 185364;
case 9: return 196608;
case 10: return 207243;
case 11: return 217358;
case 12: return 227023;
case 13: return 236293;
case 14: return 245213;
case 15: return 253820;
case 16: return 262144; /* 100 ms / sqrt(16) = 25 ms */
case 17: return 270212;
case 18: return 278046;
case 19: return 285664;
case 20: return 293086;
case 21: return 300324;
case 22: return 307391;
case 23: return 314300;
case 24: return 321060;
case 25: return 327680; /* 100 ms / sqrt(25) = 20 ms */
case 26: return 334169;
case 27: return 340535;
case 28: return 346784;
case 29: return 352922;
case 30: return 358955;
case 31: return 364889;
case 32: return 370728;
case 33: return 376476;
case 34: return 382137;
case 35: return 387716;
case 36: return 393216; /* 100 ms / sqrt(36) = 16.66 ms */
default:
return 463410; /* 65536*sqrt(50) => 100 ms / sqrt(50) = 14.14 ms */
}
}
static __always_inline __u64 get_next_interval_sqrt(__u64 cnt)
{
__u64 val = ((__u64)CODEL_EXCEED_INTERVAL << 16) / get_sqrt_sh16(cnt);
return val;
}
static __always_inline __u64
codel_control_law(__u64 t, __u64 cnt)
{
return t + get_next_interval_sqrt(cnt);
}
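A worked example of the shifted-16 fixed-point math (my numbers, using the parenthesized expression above):

/* count = 4: get_sqrt_sh16(4) = 131072 = 65536 * sqrt(4), so
 * next interval = (100 ms << 16) / 131072
 *               = 6553600000000 ns / 131072 = 50000000 ns = 50 ms,
 * i.e. CODEL_EXCEED_INTERVAL / sqrt(count), matching the table
 * comments like "100 ms / sqrt(16) = 25 ms".
 */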
static __always_inline
bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now)
{
__u64 interval = CODEL_EXCEED_INTERVAL;
if (t_queue_sz < CODEL_TARGET) {
/* went below so we'll stay below for at least interval */
codel->first_above_time = 0;
return false;
}
if (codel->first_above_time == 0) {
/* just went above from below. If we stay above
* for at least interval we'll say it's ok to drop
*/
codel->first_above_time = now + interval;
return false;
} else if (now >= codel->first_above_time) {
return true;
}
return false;
}
static __always_inline
bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now)
{
__u64 interval = CODEL_EXCEED_INTERVAL;
/* If horizon has been exceeded for a while, increase drop intensity */
bool drop = codel_should_drop(codel, t_queue_sz, now);
if (codel->dropping) { /* In dropping state */
if (!drop) {
/* time below target - leave dropping state */
codel->dropping = false;
return false;
} else if (now >= codel->drop_next) {
/* It's time for the next drop. Drop the current
* packet. Schedule the next drop
*/
codel->count += 1;
// schedule the next drop.
codel->drop_next =
codel_control_law(codel->drop_next, codel->count);
return true;
}
} else if (drop &&
((now - codel->drop_next < interval) ||
(now - codel->first_above_time >= interval))) {
/* If we get here, then we're not in dropping state.
* Decide whether it's time to enter dropping state.
*/
__u32 count = codel->count;
codel->dropping = true;
/* If we're in a drop cycle, the drop rate that controlled the queue
 * on the last cycle is a good starting point to control it now.
*/
if (now - codel->drop_next < interval)
count = count > 2 ? (count - 2) : 1;
else
count = 1;
codel->count = count;
codel->drop_next = codel_control_law(now, count);
return true;
}
return false;
}
#endif /* __CODEL_IMPL_H */

traffic-pacing-edt/configure (new executable file)

@@ -0,0 +1,29 @@
#!/bin/bash
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
# This is not an autoconf generated configure
#
# Output file which is input to Makefile
CONFIG=config.mk
# Assume tc is in $PATH
TC=tc
check_tc_libbpf()
{
tc_version=$($TC -V)
if echo $tc_version | grep -q libbpf; then
libbpf_version=${tc_version##*libbpf }
echo "HAVE_TC_LIBBPF:=y" >> $CONFIG
echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG
echo "yes ($libbpf_version)"
else
echo "no"
fi
}
echo "# Generated config" > $CONFIG
echo "Detecting available features on system"
echo -n " - libbpf support in tc tool: "
check_tc_libbpf
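On a system where the tc binary is built with libbpf, the generated config.mk would contain (per the echo lines above):

# Generated config
HAVE_TC_LIBBPF:=y
BPF_CFLAGS += -DHAVE_TC_LIBBPF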


@@ -1,40 +0,0 @@
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>
#include "iproute2_compat.h"
char _license[] SEC("license") = "GPL";
/* The tc tool (iproute2) uses another ELF map layout than libbpf (struct
* bpf_map_def), see struct bpf_elf_map from iproute2.
*/
struct bpf_elf_map SEC("maps") cnt_map = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(__u32),
.size_value = sizeof(__u64),
.max_elem = 1,
//.pinning = PIN_GLOBAL_NS,
};
SEC("classifier") int tc_dummy(struct __sk_buff *skb)
{
volatile void *data, *data_end;
int ret = BPF_OK;
struct ethhdr *eth;
data = (void *)(long)skb->data;
data_end = (void *)(long)skb->data_end;
eth = (struct ethhdr *)data;
if (data + sizeof(*eth) > data_end)
return BPF_DROP;
/* Keep ARP resolution working */
if (eth->h_proto == bpf_htons(ETH_P_ARP)) {
ret = BPF_OK;
goto out;
}
out:
return ret;
}


@@ -1,126 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0+ */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>
#include "iproute2_compat.h"
char _license[] SEC("license") = "GPL";
#define NS_PER_SEC 1000000000
/* skb->len in bytes, thus easier to keep rate in bytes */
#define RATE_IN_BITS (1000 * 1000 * 1000)
#define RATE_IN_BYTES (RATE_IN_BITS / 8)
#define T_HORIZON_DROP (2000 * 1000 * 1000)
/* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */
#define READ_ONCE(V) (V)
#define WRITE_ONCE(X,V) (X) = (V)
struct edt_val {
__u64 rate;
__u64 t_last;
__u64 t_horizon_drop;
__u64 t_horizon_ecn;
};
/* The tc tool (iproute2) uses another ELF map layout than libbpf (struct
* bpf_map_def), see struct bpf_elf_map from iproute2.
*/
struct bpf_elf_map SEC("maps") time_delay_map = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(__u32),
.size_value = sizeof(struct edt_val),
.max_elem = 1,
//.pinning = PIN_GLOBAL_NS,
};
/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to
* be sent in the future.
*/
static __always_inline int sched_departure(struct __sk_buff *skb)
{
struct edt_val *edt;
__u64 t_queue_sz;
__u64 t_xmit_ns;
__u64 t_next;
__u64 t_curr;
int key = 0;
__u64 now;
edt = bpf_map_lookup_elem(&time_delay_map, &key);
if (!edt)
return BPF_DROP;
/* Calc transmission time it takes to send packet 'bytes' */
t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / RATE_IN_BYTES;
// t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / edt->rate;
now = bpf_ktime_get_ns();
/* Allow others to set skb tstamp prior to us */
t_curr = skb->tstamp;
if (t_curr < now)
t_curr = now;
/* The 't_last' timestamp can be in the future. Packets scheduled ahead
 * of this packet can be seen as the queue size measured in time, via
* correlating this to 'now' timestamp.
*/
t_next = READ_ONCE(edt->t_last) + t_xmit_ns;
/* If packet doesn't get scheduled into the future, then there is
* no-queue and we are not above rate limit. Send packet immediately and
* move forward t_last timestamp to now.
*/
if (t_next <= t_curr) {
WRITE_ONCE(edt->t_last, t_curr);
return BPF_OK;
}
/* Calc queue size measured in time */
t_queue_sz = t_next - now;
/* The FQ-pacing qdisc also has a horizon, but we cannot use that, because
 * this BPF-prog will have updated the map (t_last) for this packet and
 * assumes it got its share of the bandwidth.
*/
if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */)
return BPF_DROP;
// TODO Add ECN marking horizon
/* Advance "time queue" */
WRITE_ONCE(edt->t_last, t_next);
/* Schedule packet to be send at future timestamp */
skb->tstamp = t_next;
return BPF_OK;
}
SEC("classifier") int tc_edt_simple(struct __sk_buff *skb)
{
volatile void *data, *data_end;
int ret = BPF_OK;
struct ethhdr *eth;
data = (void *)(long)skb->data;
data_end = (void *)(long)skb->data_end;
eth = (struct ethhdr *)data;
if (data + sizeof(*eth) > data_end)
return BPF_DROP;
/* Keep ARP resolution working */
if (eth->h_proto == bpf_htons(ETH_P_ARP)) {
ret = BPF_OK;
goto out;
}
// TODO: match on vlan16 and only apply EDT on that
return sched_departure(skb);
out:
return ret;
}


@@ -0,0 +1,280 @@
/* SPDX-License-Identifier: GPL-2.0+ */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>
#include <stdbool.h>
#define VLAN_MAX_DEPTH 2
#include <xdp/parsing_helpers.h>
char _license[] SEC("license") = "GPL";
#define NS_PER_SEC 1000000000
/* Strategy: Shape at MAC (Medium Access Control) layer with Ethernet
*
* Production use-case is pacing traffic at 1Gbit/s wirespeed, using a
* 10Gbit/s NIC, because the 1G end-user switch cannot handle bursts.
*
* (https://en.wikipedia.org/wiki/Interpacket_gap
* 12 bytes = interframe gap (IFG) 96 bit
* (https://en.wikipedia.org/wiki/Ethernet_frame)
* 8 bytes = MAC preamble
* 4 bytes = Ethernet Frame Check Sequence (FCS) CRC
* 46 bytes = Minimum Payload size
*
* 14 bytes = Ethernet header
* 8 bytes = 2x VLAN headers
*/
//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */
#define RATE_IN_BITS (990 * 1000 * 1000ULL)
//#define RATE_IN_BITS (950 * 1000 * 1000ULL)
#define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */
//#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */
#define ETH_MIN (84)
/* skb->len in bytes, thus convert rate to bytes */
#define RATE_IN_BYTES (RATE_IN_BITS / 8)
/* Controls how large the queue (in time) is allowed to grow */
#define T_HORIZON_DROP (40 * 1000 * 1000ULL)
#define T_HORIZON_TARGET (5 * 1000 * 1000ULL)
#define T_HORIZON_ECN (1 * 1000 * 1000ULL)
/* Codel: If queue exceed target for more than one interval, start dropping */
#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/
#define CODEL_TARGET T_HORIZON_TARGET
#define CODEL_EXCEED_INTERVAL T_EXCEED_INTERVAL
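To make the rate math concrete, a worked example (my arithmetic, derived from the defines above):

/* Example: a full-MTU frame has skb->wire_len = 1514 (Ethernet header
 * included, VLAN tags assumed HW-offloaded). Adding OVERHEAD
 * (12 IFG + 8 preamble + 4 FCS + 8 VLAN = 32) gives 1546 bytes on wire:
 *   t_xmit_ns = 1546 * NS_PER_SEC / RATE_IN_BYTES
 *             = 1546 * 10^9 / 123750000 ~= 12493 ns
 * so at 990 Mbit/s such frames depart roughly 12.5 usec apart.
 */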
#include "codel_impl.h"
struct edt_val {
__u64 rate;
__u64 t_last;
__u64 t_horizon_drop;
__u64 t_horizon_ecn;
struct codel_state codel;
} __aligned(64); /* Align struct to cache-size to avoid false-sharing */
#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */
/* Use BTF format to create map */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 4096); /* Max possible VLANs */
__type(key, __u32);
__type(value, struct edt_val);
// __uint(pinning, LIBBPF_PIN_BY_NAME);
} time_delay_map SEC(".maps");
#else
/* The (iproute2) tc tool (without libbpf support) uses another ELF map
* layout than libbpf (struct bpf_map_def), see struct bpf_elf_map
* from iproute2.
*/
#include "iproute2_compat.h"
struct bpf_elf_map SEC("maps") time_delay_map = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(__u32),
.size_value = sizeof(struct edt_val),
.max_elem = 4096, /* Max possible VLANs */
// .pinning = PIN_GLOBAL_NS,
};
#endif
/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to
* be sent in the future.
*/
static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key)
{
struct edt_val *edt;
__u64 t_queue_sz;
__u64 t_xmit_ns;
__u64 wire_len;
__u64 t_next;
__u64 t_curr;
__u64 now;
edt = bpf_map_lookup_elem(&time_delay_map, &key);
if (!edt)
return BPF_DROP;
/* Calc transmission time it takes to send packet 'bytes'.
*
* Details on getting precise bytes on wire. The skb->len does include
* length of GRO/GSO segments, but not the segment headers that gets
* added on transmit. Fortunately skb->wire_len at TC-egress hook (not
* ingress) include these headers. (See: qdisc_pkt_len_init())
*/
wire_len = skb->wire_len + OVERHEAD;
wire_len = wire_len > ETH_MIN ? wire_len : ETH_MIN;
t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES;
// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES;
// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate;
// now = bpf_ktime_get_ns();
now = bpf_ktime_get_boot_ns(); /* Use same ktime as bpftrace */
/* Allow others to set skb tstamp prior to us */
t_curr = skb->tstamp;
if (t_curr < now)
t_curr = now;
/* The 't_last' timestamp can be in the future. Packets scheduled ahead
 * of this packet can be seen as the queue size measured in time, via
* correlating this to 'now' timestamp.
*/
t_next = READ_ONCE(edt->t_last) + t_xmit_ns;
/* If packet doesn't get scheduled into the future, then there is
* no-queue and we are not above rate limit. Normally send packet
* immediately and move forward t_last timestamp to now.
*
* But in our use-case the traffic needs smoothing at an earlier
* stage, as bursts at lower rates can hurt the (crappy) switch.
* Thus, schedule SKB transmission at now + t_xmit_ns.
*/
if (t_next <= t_curr) {
#if 1
__u64 t_curr_next;
__u32 min_len = 1538;
/* Minimum delay for all packet if no time-queue */
wire_len = (wire_len > min_len) ? wire_len : min_len;
t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES;
t_curr_next = t_curr + t_xmit_ns;
WRITE_ONCE(edt->t_last, t_curr_next);
skb->tstamp = t_curr_next;
skb->mark = 1; /* No queue - add minimum delay */
#else
WRITE_ONCE(edt->t_last, t_curr);
#endif
return BPF_OK;
}
/* Calc queue size measured in time */
t_queue_sz = t_next - now;
/* The FQ-pacing qdisc also has a horizon, but we cannot use that, because
 * this BPF-prog will have updated the map (t_last) for this packet and
 * assumes it got its share of the bandwidth.
*/
if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */)
return BPF_DROP;
/* If TCP didn't react to ECN marking, then start dropping some */
// if (codel_drop(edt, t_queue_sz, now))
if (codel_drop(&edt->codel, t_queue_sz, t_next))
return BPF_DROP;
skb->mark = 2; /* (time) queue exist - and small/below T_HORIZON_ECN */
/* ECN marking horizon */
if (t_queue_sz >= T_HORIZON_ECN) {
skb->mark = 3; /* (time) queue exist - and is large */
bpf_skb_ecn_set_ce(skb);
}
/* Advance "time queue" */
WRITE_ONCE(edt->t_last, t_next);
/* Schedule packet to be send at future timestamp */
skb->tstamp = t_next;
return BPF_OK;
}
static __always_inline
__u16 get_inner_qinq_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
{
__u16 vlan_key;
/* NIC can HW "offload" the outer VLAN, moving it to skb context */
if (skb->vlan_present)
vlan_key = vlans->id[0]; /* Inner vlan placed as first inline */
else
vlan_key = vlans->id[1]; /* All VLAN headers inline */
return vlan_key;
}
static __always_inline
__u16 get_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
{
__u16 vlan_key;
/* Handle extracting VLAN if skb context have VLAN offloaded */
if (skb->vlan_present)
vlan_key = skb->vlan_tci & VLAN_VID_MASK;
else
vlan_key = vlans->id[0];
return vlan_key;
}
static __always_inline
__u16 extract_vlan_key(struct __sk_buff *skb, struct collect_vlans *vlans)
{
int QinQ = 0;
/* The inner VLAN is the key to extract. But it is complicated
* due to NIC "offloaded" VLAN (skb->vlan_present). In case the
* BPF-prog is loaded on the outer VLAN net_device, it sees the
* inner VLAN as the first and only VLAN.
*/
if (skb->vlan_present) {
if (vlans->id[0])
QinQ = 1;
} else {
if (vlans->id[1])
QinQ = 1;
}
if (QinQ)
return get_inner_qinq_vlan(skb, vlans);
else
return get_vlan(skb, vlans);
}
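Summarizing the decision tree above as a case table (my summary, not part of the commit):

/* skb->vlan_present | vlans->id[0] | vlans->id[1] | resulting vlan_key
 * yes (outer offl.) | inner (!= 0) |      -       | id[0]   (QinQ)
 * yes               |      0       |      -       | vlan_tci & VLAN_VID_MASK
 * no (all inline)   |    outer     | inner (!= 0) | id[1]   (QinQ)
 * no                |    vlan      |      0       | id[0]
 */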
SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb)
{
void *data = (void *)(long)skb->data;
void *data_end = (void *)(long)skb->data_end;
struct collect_vlans vlans = { 0 };
struct ethhdr *eth;
int ret = BPF_OK;
__u16 vlan_key;
/* These keep track of the next header type and iterator pointer */
struct hdr_cursor nh;
int eth_type;
nh.pos = data;
eth_type = parse_ethhdr_vlan(&nh, data_end, &eth, &vlans);
if (eth_type < 0)
return BPF_DROP;
/* Keep ARP resolution working */
if (eth_type == bpf_htons(ETH_P_ARP)) {
ret = BPF_OK;
goto out;
}
if (!proto_is_vlan(eth->h_proto) && !skb->vlan_present) {
/* Skip non-VLAN frames */
return BPF_OK;
}
vlan_key = extract_vlan_key(skb, &vlans);
/* Each (inner) VLAN id gets its own EDT pacing */
return sched_departure(skb, vlan_key);
out:
return ret;
}


@@ -62,3 +62,28 @@ function call_tc() {
function call_tc_allow_fail() {
_call_tc "allow_fail" "$@"
}
## -- Wrapper calls for IP --
function _call_ip() {
local allow_fail="$1"
shift
if [[ -n "$VERBOSE" ]]; then
echo "ip $@"
fi
if [[ -n "$DRYRUN" ]]; then
return
fi
$IP "$@"
local status=$?
if (( $status != 0 )); then
if [[ "$allow_fail" == "" ]]; then
err 3 "Exec error($status) occurred cmd: \"$IP $@\""
fi
fi
}
function call_ip() {
_call_ip "" "$@"
}
function call_ip_allow_fail() {
_call_ip "allow_fail" "$@"
}


@@ -0,0 +1,55 @@
/* SPDX-License-Identifier: LGPL-2.1
*
* Based on Paul Hsieh's (LGPG 2.1) hash function
* From: http://www.azillionmonkeys.com/qed/hash.html
*/
#define get16bits(d) (*((const __u16 *) (d)))
static __always_inline
__u32 SuperFastHash (const char *data, int len, __u32 initval) {
__u32 hash = initval;
__u32 tmp;
int rem;
if (len <= 0 || data == NULL) return 0;
rem = len & 3;
len >>= 2;
/* Main loop */
#pragma clang loop unroll(full)
for (;len > 0; len--) {
hash += get16bits (data);
tmp = (get16bits (data+2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*sizeof (__u16);
hash += hash >> 11;
}
/* Handle end cases */
switch (rem) {
case 3: hash += get16bits (data);
hash ^= hash << 16;
hash ^= ((signed char)data[sizeof (__u16)]) << 18;
hash += hash >> 11;
break;
case 2: hash += get16bits (data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1: hash += (signed char)*data;
hash ^= hash << 10;
hash += hash >> 1;
}
/* Force "avalanching" of final 127 bits */
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 4;
hash += hash >> 17;
hash ^= hash << 25;
hash += hash >> 6;
return hash;
}


@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Taken from from #include <iproute2/bpf_elf.h> */
#ifndef __IPROUTE2_COMPAT_H
#define __IPROUTE2_COMPAT_H
@@ -8,6 +9,11 @@
* binary layout until "flags". Thus, BPF-progs can use both if careful.
*/
/* Object pinning settings */
#define PIN_NONE 0
#define PIN_OBJECT_NS 1
#define PIN_GLOBAL_NS 2
/* ELF map definition (copied from iproute2 source code) */
struct bpf_elf_map {
__u32 type;


@@ -10,10 +10,10 @@ function usage() {
echo "Usage: $0 [-vh] --dev ethX"
echo " -d | --dev : (\$DEV) Interface/device (required)"
echo " -v | --verbose : (\$VERBOSE) verbose"
echo " --remove : (\$REMOVE) Remove the TC rules"
echo " --remove : (\$REMOVE) Remove the rules"
echo " --dry-run : (\$DRYRUN) Dry-run only (echo tc commands)"
echo " -s | --stats : (\$STATS_ONLY) Call TC statistics command"
echo " -l | --list : (\$LIST) List TC filter setup after setup"
echo " -s | --stats : (\$STATS_ONLY) Call statistics command"
echo " -l | --list : (\$LIST) List setup after setup"
echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load"
echo ""
}
@@ -80,5 +80,5 @@ done
if [ -z "$DEV" ]; then
usage
err 2 "Please specify TC net_device"
err 2 "Please specify net_device (\$DEV)"
fi


@@ -0,0 +1,77 @@
#!/bin/bash
#
# Load the FQ pacing qdisc in a multi-queue (MQ) setup to avoid the root qdisc lock.
#
# The FQ pacing qdisc does all the work of pacing packets out according to
# the EDT (Earliest Departure Time) future timestamps set by our BPF-prog that
# runs at the TC-egress hook.
#
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export TC=tc
# Default verbose
VERBOSE=1
# Select between multiq or single root qdisc
if [[ -z $1 ]]; then
if [[ -z $REMOVE ]]; then
err 1 "Specify root qdisc system: single or mq (multi-queue)"
fi
fi
TYPE=$1
# Delete existing root qdisc
call_tc_allow_fail qdisc del dev "$DEV" root
if [[ -n $REMOVE ]]; then
exit 0
fi
function use_multiq()
{
# MQ (Multi-Queue) as root qdisc
call_tc qdisc replace dev $DEV root handle 7FFF: mq
# Add FQ-pacer qdisc on each available NIC TX-queue
i=0
for dir in /sys/class/net/$DEV/queues/tx-*; do
# Detail: causes off-by-one, as tx-0 becomes handle 1:
((i++)) || true
#call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq
#
# The higher 'flow_limit' is needed for high-BW pacing
call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq \
flow_limit 1000
#
# quantum $((1514*4)) initial_quantum $((1514*20))
# call_tc qdisc add dev $DEV parent 7FFF:$i handle $i: fq maxrate 930mbit
done
}
function use_single_fq_pacer()
{
call_tc qdisc replace dev $DEV root handle 7FFF: fq \
flow_limit 1000
}
case "$TYPE" in
mq | multiq )
use_multiq
;;
single | fq )
use_single_fq_pacer
;;
* )
err 1 "Unknown type: ${TYPE}"
;;
esac
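For a NIC with e.g. 4 TX-queues, the multiq branch produces this qdisc layout (a sketch of the loop's result, including the deliberate off-by-one):

  7FFF: mq root
    7FFF:1 -> fq handle 1: (tx-0, flow_limit 1000)
    7FFF:2 -> fq handle 2: (tx-1, flow_limit 1000)
    7FFF:3 -> fq handle 3: (tx-2, flow_limit 1000)
    7FFF:4 -> fq handle 4: (tx-3, flow_limit 1000)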


@@ -0,0 +1,95 @@
#!/bin/bash
#
# This HTB shaper setup script is available for easier comparison of its
# accuracy against the EDT solution.
#
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export TC=/sbin/tc
# It seems measured BW is TCP goodput, while configured BW is wirespeed.
# Measurements show around 930Mbit best-case. Q-in-Q results in frames of
# 1522 bytes. TCP goodput segments are 1448 bytes.
#
#RATE=$((930*1522/1448))Mbit
##RATE=$((933*1522/1448))Mbit
##CEIL=$((999*1522/1448))
#CEIL=1Gbit
#CEIL=980mbit
# EDT shaper show TCP goodput of 956 Mbit/s.
# echo $((956*1514/1448)) = 999
RATE=999Mbit
CEIL=1000Mbit
#RATE=500mbit
#CEIL=577mbit
# Each of the HTB root-class(es) get these RATE+CEIL upper bandwidth bounds.
ROOT_RATE=9000Mbit
ROOT_CEIL=9500Mbit
DEFAULT_RATE=6000Mbit
DEFAULT_CEIL=6000Mbit
TC=/usr/sbin/tc
VERBOSE=1
function tc() {
_call_tc "" "$@"
}
# Delete existing root qdisc
call_tc_allow_fail qdisc del dev "$DEV" root
if [[ -n $REMOVE ]]; then
exit 0
fi
# HTB shaper
#tc qdisc add dev "$DEV" root handle 1: htb default 2
tc qdisc add dev "$DEV" root handle 1: htb default 16
# The root-class set upper bandwidth usage
tc class add dev "$DEV" parent 1: classid 1:1 \
htb rate $ROOT_RATE ceil $ROOT_CEIL
# Default class 1:2
tc class add dev "$DEV" parent 1: classid 1:2 htb \
rate "$DEFAULT_RATE" ceil "$DEFAULT_CEIL"
# burst 100000 cburst 100000
tc qdisc add dev $DEV parent 1:2 fq_codel
# Class for vlan 16
tc class add dev "$DEV" parent 1: classid 1:16 htb rate "$RATE" ceil "$CEIL" \
burst $((1522*2)) cburst $((1522*2)) \
linklayer ethernet
# burst 1522 cburst 1522
#burst 1 cburst 1
# burst $((1522*2)) cburst $((1522*2))
# overhead $((14+4+4)) linklayer ethernet
#tc qdisc add dev "$DEV" parent 1:16 fq_codel
tc qdisc add dev "$DEV" parent 1:16 fq_codel quantum $((1514+4+4))
#tc qdisc add dev "$DEV" parent 1:16 pfifo
# parent filter:
#tc filter add dev "$DEV" parent 1:0 prio 100 protocol 802.1q u32
#
# vlan 16:
#tc filter add dev "$DEV" parent 1:0 prio 100 \
# protocol 802.1q \
# u32 match u16 0x0010 0x0fff at -4 \
# flowid 1:16
tc filter add dev $DEV protocol all parent 1:0 prio 101 \
basic match "meta(vlan mask 0xfff eq 16)" flowid 1:16


@@ -0,0 +1,84 @@
#!/bin/bash
#
# Testlab setup script for VLAN Q-in-Q (double tagged VLAN) config.
#
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
# License: GPLv2
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
export IP=/sbin/ip
function ip() {
call_ip "$@"
}
function create_vlan_device() {
local vlan=${1}
local device=${2:-$DEV}
shift 2
if [[ -z "$vlan" ]]; then
err 2 "Missing VLAN is as input"
fi
ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan}
ip link set ${device}.${vlan} up
}
function create_vlan_device_802_1ad() {
local vlan=${1}
local device=${2:-$DEV}
shift 2
if [[ -z "$vlan" ]]; then
err 2 "Missing VLAN is as input"
fi
ip link add link "$device" name ${device}.${vlan} type vlan id ${vlan} \
protocol 802.1ad
ip link set ${device}.${vlan} up
}
function delete_vlan_device() {
local vlan=${1}
local device=${2:-$DEV}
shift 2
if [[ -z "$vlan" ]]; then
err 2 "Missing VLAN is as input"
fi
ip link del ${device}.${vlan}
}
if [[ -z "$1" ]]; then
err 3 "Missing arg#1 for outer vlan"
fi
OUTER=$1
if [[ -z "$2" ]]; then
err 3 "Missing arg#2 for inner vlan"
fi
INNER=$2
if [[ -n $REMOVE ]]; then
delete_vlan_device $INNER ${DEV}.${OUTER}
delete_vlan_device $OUTER $DEV
exit 0
fi
create_vlan_device $OUTER $DEV
create_vlan_device $INNER ${DEV}.${OUTER}
# Set MTU to handle the extra VLAN headers; NICs usually allow one VLAN
# header even though they are configured with MTU 1500.
ip link set $DEV mtu 1508
ip link set ${DEV}.${OUTER} mtu 1504
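The MTU arithmetic (4 bytes per 802.1Q tag): the physical $DEV must carry both tags, 1500 + 2*4 = 1508; the outer-VLAN device ${DEV}.${OUTER} carries one remaining tag, 1500 + 4 = 1504.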


@@ -0,0 +1,39 @@
#!/bin/bash
#
# Script for loading EDT-pacer BPF-prog on all downstream VLANs
#
basedir=`dirname $0`
source ${basedir}/functions.sh
root_check_run_with_sudo "$@"
# Use common parameters
source ${basedir}/parameters.sh
# Default verbose
VERBOSE=1
# Downstream dev: ens6f0
VLAN_START=168
VLAN_END=205
cmd=${basedir}/bpf_egress_loader.sh
options=""
if [[ -n $REMOVE ]]; then
options+=" --remove"
fi
if [[ -n $DRYRUN ]]; then
options+=" --dry-run"
#cmd="echo $cmd"
fi
if [[ -n $VERBOSE ]]; then
options+=" --verbose"
fi
for (( vlan=${VLAN_START}; vlan<=${VLAN_END}; vlan++ ))
do
VLAN=${DEV}.$vlan
$cmd --dev $VLAN $options
done


@@ -0,0 +1,383 @@
// SPDX-License-Identifier: GPL-2.0+
static const char *__doc__ =
" XDP load-balancing with CPU-map";
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <locale.h>
#include <sys/resource.h>
#include <sys/sysinfo.h>
#include <getopt.h>
#include <net/if.h>
#include <time.h>
#include <linux/limits.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h> /* XDP defines */
static int ifindex = -1;
static char ifname_buf[IF_NAMESIZE];
static char *ifname;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
/* Exit return codes */
#define EXIT_OK 0
#define EXIT_FAIL 1
#define EXIT_FAIL_OPTION 2
#define EXIT_FAIL_XDP 3
#define EXIT_FAIL_BPF 4
#define EXIT_FAIL_MEM 5
#define EXIT_FAIL_FILE 6
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
{"dev", required_argument, NULL, 'd' },
{"qsize", required_argument, NULL, 'q' },
{"force", no_argument, NULL, 'F' },
{"remove", no_argument, NULL, 'r' },
{"non-cpu", required_argument, NULL, 'x' },
{"exclude-cpu", required_argument, NULL, 'x' },
{0, 0, NULL, 0 }
};
static void usage(char *argv[])
{
int i;
printf("\nDOCUMENTATION:\n%s\n", __doc__);
printf("\n");
printf(" Usage: %s (options-see-below)\n", argv[0]);
printf(" Listing options:\n");
for (i = 0; long_options[i].name != 0; i++) {
printf(" --%-12s", long_options[i].name);
if (long_options[i].flag != NULL)
printf(" flag (internal value:%d)",
*long_options[i].flag);
else
printf(" short-option: -%c",
long_options[i].val);
printf("\n");
}
printf("\n");
}
struct cpumap_config {
int fd_cpumap;
int fd_cpus_enabled;
int fd_cpus_count;
int *cpu_exclude;
int max_cpus;
__u32 qsize;
};
static int cpumap_config_init(struct cpumap_config *cfg)
{
int n_cpus = get_nprocs_conf();
int *cpu_exclude;
memset(cfg, 0, sizeof(*cfg));
cpu_exclude = malloc(n_cpus * sizeof(int));
if (!cpu_exclude) {
fprintf(stderr, "failed to allocate array\n");
return EXIT_FAIL_MEM;
}
memset(cpu_exclude, 0, n_cpus * sizeof(int));
cfg->cpu_exclude = cpu_exclude;
cfg->max_cpus = n_cpus;
return 0;
}
int __find_map_fd_by_name(struct bpf_object *obj, char *name)
{
int fd;
fd = bpf_object__find_map_fd_by_name(obj, name);
if (fd < 0) {
printf("No map found! - named: %s\n", name);
exit(EXIT_FAIL_BPF);
}
return fd;
}
/* Get file descriptors to BPF-maps */
static int cpumap_config_find_maps(struct bpf_object *obj,
struct cpumap_config *cfg)
{
cfg->fd_cpumap = __find_map_fd_by_name(obj, "cpumap");
cfg->fd_cpus_enabled = __find_map_fd_by_name(obj, "cpus_enabled");
cfg->fd_cpus_count = __find_map_fd_by_name(obj, "cpus_count");
return 0;
}
static int create_cpu_entry(struct cpumap_config *cfg, __u32 cpu,
struct bpf_cpumap_val *value,
__u32 enabled_idx, bool new)
{
__u32 curr_cpus_count = 0;
__u32 key = 0;
int err, fd;
/* Add a CPU entry to cpumap, as this allocates a CPU entry in
* the kernel for that CPU.
*/
fd = cfg->fd_cpumap;
err = bpf_map_update_elem(fd, &cpu, value, 0);
if (err) {
fprintf(stderr, "Create(fd:%d) CPU(%d) entry failed (err:%d)\n",
fd, cpu, err);
return EXIT_FAIL_BPF;
}
/* Inform the BPF-progs that a new CPU is enabled and available
* to be selected via the map that maps index to actual CPU.
*/
fd = cfg->fd_cpus_enabled;
err = bpf_map_update_elem(fd, &enabled_idx, &cpu, 0);
if (err) {
fprintf(stderr, "Add to enabled avail CPUs failed\n");
return EXIT_FAIL_BPF;
}
/* When not replacing/updating existing entry, bump the count */
fd = cfg->fd_cpus_count;
err = bpf_map_lookup_elem(fd, &key, &curr_cpus_count);
if (err) {
fprintf(stderr, "Failed reading curr cpus_count\n");
return EXIT_FAIL_BPF;
}
if (new) {
curr_cpus_count++;
err = bpf_map_update_elem(fd, &key, &curr_cpus_count, 0);
if (err) {
fprintf(stderr, "Failed write curr cpus_count\n");
return EXIT_FAIL_BPF;
}
}
return 0;
}
/* Userspace MUST create/populate CPUMAP entries for redirect to work
*/
static int configure_cpus(struct cpumap_config *cfg)
{
struct bpf_cpumap_val value = { 0 };
int n_cpus = cfg->max_cpus;
int *exclude = cfg->cpu_exclude;
int enabled_idx = 0;
bool new = true;
int cpu, err;
value.qsize = cfg->qsize;
for (cpu = 0; cpu < n_cpus; cpu++) {
if (exclude[cpu] == -1) {
printf("Excluding CPU:%d\n", cpu);
continue;
}
printf("Enable CPU:%d\n", cpu);
err = create_cpu_entry(cfg, cpu, &value, enabled_idx, new);
if (err)
return err;
enabled_idx++;
}
return 0;
}
struct bpf_object *do_load_bpf_obj(struct bpf_object *obj)
{
char buf[200];
int err;
err = bpf_object__load(obj);
if (err) {
libbpf_strerror(err, buf, sizeof(buf));
printf("Error loading: %s\n", buf);
return NULL;
}
return obj;
}
int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags)
{
int prog_fd = bpf_program__fd(prog);
int err;
if (prog_fd < 0) {
fprintf(stderr, "bpf_program__fd failed\n");
return EXIT_FAIL_BPF;
}
err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
if (err) {
fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
__func__, err);
return EXIT_FAIL_XDP;
}
return EXIT_OK;
}
int do_xdp_detach(int ifindex, __u32 xdp_flags)
{
int err;
err = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
if (err) {
fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
__func__, err);
return EXIT_FAIL_XDP;
}
return EXIT_OK;
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
bool do_detach = false;
int opt, longindex = 0;
char buf[100];
int err;
struct bpf_object *obj = NULL;
struct bpf_program *prog;
/* System to setup and exclude some CPUs */
struct cpumap_config cfg;
int n_cpus = get_nprocs_conf();
int non_cpu = -1;
int *cpu_exclude;
cpumap_config_init(&cfg);
cpu_exclude = cfg.cpu_exclude;
cfg.qsize = 512; /* Default queue size */
/* Always use XDP native driver mode */
xdp_flags |= XDP_FLAGS_DRV_MODE;
obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL);
err = libbpf_get_error(obj);
if (err) {
libbpf_strerror(err, buf, sizeof(buf));
printf("Error opening file: %s\n", buf);
return EXIT_FAIL_FILE;
}
err = EXIT_OK;
/* Parse commands line args */
while ((opt = getopt_long(argc, argv, "hd:q:Frx:",
long_options, &longindex)) != -1) {
switch (opt) {
case 'd':
if (strlen(optarg) >= IF_NAMESIZE) {
fprintf(stderr, "ERR: --dev name too long\n");
goto error;
}
ifname = (char *)&ifname_buf;
strncpy(ifname, optarg, IF_NAMESIZE);
ifindex = if_nametoindex(ifname);
if (ifindex == 0) {
fprintf(stderr,
"ERR: --dev name unknown err(%d):%s\n",
errno, strerror(errno));
goto error;
}
break;
case 'q':
cfg.qsize = strtol(optarg, NULL, 10);
break;
case 'F':
xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
case 'r':
do_detach = true;
break;
case 'x': /* --exclude-cpu or --non-cpu */
/* Possible to exclude multiple CPUs on cmdline */
non_cpu = strtoul(optarg, NULL, 0);
if (non_cpu >= n_cpus) {
fprintf(stderr,
"--cpu nr too large for cpumap err(%d):%s\n",
errno, strerror(errno));
goto error;
}
cpu_exclude[non_cpu] = -1;
break;
case 'h':
error:
default:
usage(argv);
free(cpu_exclude);
return EXIT_FAIL_OPTION;
}
}
/* Required option */
if (ifindex == -1) {
fprintf(stderr, "ERR: required option --dev missing\n");
usage(argv);
err = EXIT_FAIL_OPTION;
goto out;
}
if (do_detach)
return do_xdp_detach(ifindex, xdp_flags);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
err = EXIT_FAIL_MEM;
goto out;
}
obj = do_load_bpf_obj(obj);
if (!obj) {
err = EXIT_FAIL_BPF;
goto out;
}
/* Pickup first BPF-program */
prog = bpf_program__next(NULL, obj);
if (!prog) {
printf("No program!\n");
err = EXIT_FAIL_BPF;
goto out;
}
/* Find maps */
if (cpumap_config_find_maps(obj, &cfg)) {
err = EXIT_FAIL_BPF;
goto out;
}
/* Configure cpumap */
if (configure_cpus(&cfg)) {
err = EXIT_FAIL_BPF;
goto out;
}
/* Attach XDP program */
err = do_xdp_attach(ifindex, prog, xdp_flags);
if (err)
goto out;
printf("Attached XDP program:\"%s\" on netdev:%s (ifindex:%d)\n",
bpf_program__name(prog), ifname, ifindex);
printf("CPUs: %d\n", n_cpus);
out:
if (obj)
bpf_object__close(obj);
free(cpu_exclude);
return err;
}
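A hypothetical invocation, based on the options parsed above (device name and CPU numbers are examples only):

  sudo ./xdp_cpumap_loader --dev eth0 --qsize 1024 --exclude-cpu 0 --exclude-cpu 1
  sudo ./xdp_cpumap_loader --dev eth0 --remove    # detach the XDP program again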


@@ -0,0 +1,111 @@
/* SPDX-License-Identifier: GPL-2.0+ */
#include <linux/types.h>
#include <linux/bpf.h> /* struct bpf_cpumap_val */
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>
#define INITVAL 15485863
//#define INITVAL 2654435761
#include "hash_func01.h" /* SuperFastHash */
#include <bpf/bpf_helpers.h>
#define VLAN_MAX_DEPTH 2
#include <xdp/parsing_helpers.h>
#define MAX_CPUS 24
/* This global variable is used for limiting CPU that can be selected */
__u32 global_max_cpus = 12; /* TODO: Allow userspace to adjust this */
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct {
__uint(type, BPF_MAP_TYPE_CPUMAP);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_cpumap_val));
__uint(max_entries, MAX_CPUS);
} cpumap SEC(".maps");
/* Mapping table with CPUs enabled, for hashing between */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, __u32);
__type(value, __u32);
__uint(max_entries, MAX_CPUS);
} cpus_enabled SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, __u32);
__type(value, __u32);
__uint(max_entries, 1);
} cpus_count SEC(".maps");
static __always_inline
__u32 extract_vlan_key(struct collect_vlans *vlans)
{
/* Combine inner and outer VLAN as a key */
__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
return vlan_key;
}
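A worked example of the key construction (illustrative VLAN ids):

/* Example: an inline QinQ frame with outer VLAN 168 and inner VLAN 16
 * parses to vlans.id[0] = 168 (outer, seen first) and vlans.id[1] = 16,
 * giving vlan_key = (16 << 16) | 168 = 0x001000a8. This key is hashed
 * with SuperFastHash() below and reduced modulo the enabled-CPU count.
 */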
SEC("xdp")
int xdp_cpumap_qinq(struct xdp_md *ctx)
{
void *data = (void *)(long)ctx->data;
void *data_end = (void *)(long)ctx->data_end;
struct collect_vlans vlans = { 0 };
__u32 hash_key, vlan_key;
struct ethhdr *eth;
__u32 cpu_idx, cpu_dest = 0;
__u32 *cpu_lookup;
__u64 action;
__u32 *cpu_max;
/* These keep track of the next header type and iterator pointer */
struct hdr_cursor nh;
int eth_type;
nh.pos = data;
eth_type = parse_ethhdr_vlan(&nh, data_end, &eth, &vlans);
if (eth_type < 0) {
action = XDP_ABORTED;
goto out;
}
/* Keep ARP resolution working */
if (eth_type == bpf_htons(ETH_P_ARP)) {
action = XDP_PASS;
goto out;
}
if (!proto_is_vlan(eth->h_proto)) {
/* Skip non-VLAN frames */
action = XDP_PASS;
goto out;
}
int key0 = 0;
cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
if (!cpu_max)
return XDP_ABORTED;
/* Use inner+outer VLAN as key and hash based on max_cpus */
vlan_key = extract_vlan_key(&vlans);
hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
cpu_idx = hash_key % *cpu_max;
/* To allow excluding some CPUs, a mapping table cpus_enabled
* translates cpu_idx to real CPU-id
*/
cpu_lookup = bpf_map_lookup_elem(&cpus_enabled, &cpu_idx);
if (!cpu_lookup)
return XDP_ABORTED;
cpu_dest = *cpu_lookup;
/* Notice: Userspace MUST insert entries into cpumap */
action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
out:
return action;
}