mirror of
https://github.com/xdp-project/bpf-examples.git
synced 2024-05-06 15:54:53 +00:00
Merge branch 'master' of https://github.com/netoptimizer/bpf-examples into netoptimizer-master
Signed-off-by: Jesper Dangaard Brouer <netoptimizer@brouer.com>
This commit is contained in:
124
headers/bpf/compiler.h
Normal file
124
headers/bpf/compiler.h
Normal file
@@ -0,0 +1,124 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* Copyright (C) 2016-2020 Authors of Cilium */
|
||||
|
||||
#ifndef __BPF_COMPILER_H_
|
||||
#define __BPF_COMPILER_H_
|
||||
|
||||
#ifndef __non_bpf_context
|
||||
# include "stddef.h"
|
||||
#endif
|
||||
|
||||
#ifndef __section
|
||||
# define __section(X) __attribute__((section(X), used))
|
||||
#endif
|
||||
|
||||
#ifndef __maybe_unused
|
||||
# define __maybe_unused __attribute__((__unused__))
|
||||
#endif
|
||||
|
||||
#ifndef offsetof
|
||||
# define offsetof(T, M) __builtin_offsetof(T, M)
|
||||
#endif
|
||||
|
||||
#ifndef field_sizeof
|
||||
# define field_sizeof(T, M) sizeof((((T *)NULL)->M))
|
||||
#endif
|
||||
|
||||
#ifndef __packed
|
||||
# define __packed __attribute__((packed))
|
||||
#endif
|
||||
|
||||
#ifndef __nobuiltin
|
||||
# if __clang_major__ >= 10
|
||||
# define __nobuiltin(X) __attribute__((no_builtin(X)))
|
||||
# else
|
||||
# define __nobuiltin(X)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef likely
|
||||
# define likely(X) __builtin_expect(!!(X), 1)
|
||||
#endif
|
||||
|
||||
#ifndef unlikely
|
||||
# define unlikely(X) __builtin_expect(!!(X), 0)
|
||||
#endif
|
||||
|
||||
#ifndef always_succeeds /* Mainly for documentation purpose. */
|
||||
# define always_succeeds(X) likely(X)
|
||||
#endif
|
||||
|
||||
#undef __always_inline /* stddef.h defines its own */
|
||||
#define __always_inline inline __attribute__((always_inline))
|
||||
|
||||
#ifndef __stringify
|
||||
# define __stringify(X) #X
|
||||
#endif
|
||||
|
||||
#ifndef __fetch
|
||||
# define __fetch(X) (__u32)(__u64)(&(X))
|
||||
#endif
|
||||
|
||||
#ifndef __aligned
|
||||
# define __aligned(X) __attribute__((aligned(X)))
|
||||
#endif
|
||||
|
||||
#ifndef build_bug_on
|
||||
# define build_bug_on(E) ((void)sizeof(char[1 - 2*!!(E)]))
|
||||
#endif
|
||||
|
||||
#ifndef __throw_build_bug
|
||||
# define __throw_build_bug() __builtin_trap()
|
||||
#endif
|
||||
|
||||
#ifndef __printf
|
||||
# define __printf(X, Y) __attribute__((__format__(printf, X, Y)))
|
||||
#endif
|
||||
|
||||
#ifndef barrier
|
||||
# define barrier() asm volatile("": : :"memory")
|
||||
#endif
|
||||
|
||||
#ifndef barrier_data
|
||||
# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory")
|
||||
#endif
|
||||
|
||||
/* Emit a barrier() to work around the verifier complaint:
 *
 *   "dereference of modified ctx ptr R5 off=48+0, ctx+const is allowed,
 *    ctx+const+const is not"
 */
static __always_inline void bpf_barrier(void)
{
	barrier();
}
|
||||
|
||||
/* Element count of array A. A must be a true array, not a pointer. */
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(A)		(sizeof(A) / sizeof(*(A)))
#endif
|
||||
|
||||
/* Access X exactly once through a volatile-qualified lvalue.
 *
 * The macro argument is parenthesised before its address is taken, and
 * __WRITE_ONCE() is wrapped as a whole, so that the expansion stays a single
 * expression when it is embedded in a larger one (for example
 * "__WRITE_ONCE(x, v) + 1" stores v, not v + 1).
 */
#ifndef __READ_ONCE
# define __READ_ONCE(X)		(*(volatile typeof(X) *)&(X))
#endif

#ifndef __WRITE_ONCE
# define __WRITE_ONCE(X, V)	((*(volatile typeof(X) *)&(X)) = (V))
#endif
|
||||
|
||||
/* {READ,WRITE}_ONCE() with verifier workaround via bpf_barrier(). */
|
||||
|
||||
#ifndef READ_ONCE
|
||||
# define READ_ONCE(X) \
|
||||
({ typeof(X) __val = __READ_ONCE(X); \
|
||||
bpf_barrier(); \
|
||||
__val; })
|
||||
#endif
|
||||
|
||||
#ifndef WRITE_ONCE
|
||||
# define WRITE_ONCE(X, V) \
|
||||
({ typeof(X) __val = (V); \
|
||||
__WRITE_ONCE(X, __val); \
|
||||
bpf_barrier(); \
|
||||
__val; })
|
||||
#endif
|
||||
|
||||
#endif /* __BPF_COMPILER_H_ */
|
@@ -1,8 +1,8 @@
|
||||
/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */
|
||||
/*
|
||||
* This file contains parsing functions that can be used in eXDP programs. The
|
||||
* functions are marked as __always_inline, and fully defined in this header
|
||||
* file to be included in the BPF program.
|
||||
* This file contains parsing functions that are used in the packetXX XDP
|
||||
* programs. The functions are marked as __always_inline, and fully defined in
|
||||
* this header file to be included in the BPF program.
|
||||
*
|
||||
* Each helper parses a packet header, including doing bounds checking, and
|
||||
* returns the type of its contents if successful, and -1 otherwise.
|
||||
@@ -10,6 +10,10 @@
|
||||
* For Ethernet and IP headers, the content type is the type of the payload
|
||||
* (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field.
|
||||
* All return values are in host byte order.
|
||||
*
|
||||
* The versions of the functions included here are slightly expanded versions of
|
||||
* the functions in the packet01 lesson. For instance, the Ethernet header
|
||||
* parsing has support for parsing VLAN tags.
|
||||
*/
|
||||
|
||||
#ifndef __PARSING_HELPERS_H
|
||||
@@ -54,7 +58,7 @@ struct icmphdr_common {
|
||||
|
||||
/* Allow users of header file to redefine VLAN max depth */
|
||||
#ifndef VLAN_MAX_DEPTH
|
||||
#define VLAN_MAX_DEPTH 4
|
||||
#define VLAN_MAX_DEPTH 2
|
||||
#endif
|
||||
|
||||
/* Longest chain of IPv6 extension headers to resolve */
|
||||
@@ -62,6 +66,11 @@ struct icmphdr_common {
|
||||
#define IPV6_EXT_MAX_CHAIN 6
|
||||
#endif
|
||||
|
||||
#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
|
||||
/* Struct for collecting VLANs after parsing via parse_ethhdr_vlan */
|
||||
struct collect_vlans {
|
||||
__u16 id[VLAN_MAX_DEPTH];
|
||||
};
|
||||
|
||||
static __always_inline int proto_is_vlan(__u16 h_proto)
|
||||
{
|
||||
@@ -74,18 +83,24 @@ static __always_inline int proto_is_vlan(__u16 h_proto)
|
||||
* Ethernet header. Thus, caller can look at eth->h_proto to see if this was a
|
||||
* VLAN tagged packet.
|
||||
*/
|
||||
static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
|
||||
struct ethhdr **ethhdr)
|
||||
static __always_inline int parse_ethhdr_vlan(struct hdr_cursor *nh,
|
||||
void *data_end,
|
||||
struct ethhdr **ethhdr,
|
||||
struct collect_vlans *vlans)
|
||||
{
|
||||
struct ethhdr *eth = nh->pos;
|
||||
int hdrsize = sizeof(*eth);
|
||||
struct vlan_hdr *vlh;
|
||||
__u16 h_proto;
|
||||
int i;
|
||||
|
||||
if (eth + 1 > data_end)
|
||||
/* Byte-count bounds check; check if current pointer + size of header
|
||||
* is after data_end.
|
||||
*/
|
||||
if (nh->pos + hdrsize > data_end)
|
||||
return -1;
|
||||
|
||||
nh->pos = eth + 1;
|
||||
nh->pos += hdrsize;
|
||||
*ethhdr = eth;
|
||||
vlh = nh->pos;
|
||||
h_proto = eth->h_proto;
|
||||
@@ -102,6 +117,10 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
|
||||
break;
|
||||
|
||||
h_proto = vlh->h_vlan_encapsulated_proto;
|
||||
if (vlans) /* collect VLAN ids */
|
||||
vlans->id[i] =
|
||||
(bpf_ntohs(vlh->h_vlan_TCI) & VLAN_VID_MASK);
|
||||
|
||||
vlh++;
|
||||
}
|
||||
|
||||
@@ -109,6 +128,14 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end,
|
||||
return h_proto; /* network-byte-order */
|
||||
}
|
||||
|
||||
static __always_inline int parse_ethhdr(struct hdr_cursor *nh,
|
||||
void *data_end,
|
||||
struct ethhdr **ethhdr)
|
||||
{
|
||||
/* Expect compiler removes the code that collects VLAN ids */
|
||||
return parse_ethhdr_vlan(nh, data_end, ethhdr, NULL);
|
||||
}
|
||||
|
||||
static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh,
|
||||
void *data_end,
|
||||
__u8 next_hdr_type)
|
||||
@@ -174,6 +201,9 @@ static __always_inline int parse_iphdr(struct hdr_cursor *nh,
|
||||
return -1;
|
||||
|
||||
hdrsize = iph->ihl * 4;
|
||||
/* Sanity check packet field is valid */
|
||||
if(hdrsize < sizeof(iph))
|
||||
return -1;
|
||||
|
||||
/* Variable-length IPv4 header, need to use byte-based arithmetic */
|
||||
if (nh->pos + hdrsize > data_end)
|
||||
@@ -267,10 +297,15 @@ static __always_inline int parse_tcphdr(struct hdr_cursor *nh,
|
||||
return -1;
|
||||
|
||||
len = h->doff * 4;
|
||||
if ((void *) h + len > data_end)
|
||||
/* Sanity check packet field is valid */
|
||||
if(len < sizeof(h))
|
||||
return -1;
|
||||
|
||||
nh->pos = h + 1;
|
||||
/* Variable-length TCP header, need to use byte-based arithmetic */
|
||||
if (nh->pos + len > data_end)
|
||||
return -1;
|
||||
|
||||
nh->pos += len;
|
||||
*tcphdr = h;
|
||||
|
||||
return len;
|
||||
|
@@ -1,14 +1,23 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
|
||||
|
||||
USER_TARGETS :=
|
||||
BPF_TARGETS := edt_pacer01
|
||||
BPF_TARGETS += edt_pacer02
|
||||
USER_TARGETS := xdp_cpumap_loader
|
||||
BPF_TARGETS := edt_pacer_vlan
|
||||
BPF_TARGETS += xdp_cpumap_qinq
|
||||
|
||||
EXTRA_DEPS += config.mk
|
||||
|
||||
LIB_DIR = ../lib
|
||||
|
||||
include $(LIB_DIR)/common.mk
|
||||
include config.mk
|
||||
|
||||
# The iproute2 'tc' tool doesn't understand BTF debug info
|
||||
all: config.mk
|
||||
|
||||
config.mk: configure
|
||||
@sh configure
|
||||
|
||||
ifndef HAVE_TC_LIBBPF
|
||||
# If the iproute2 'tc' tool doesn't understand BTF debug info
|
||||
# use llvm-strip to remove this debug info from object file
|
||||
#
|
||||
# *BUT* cannot strip everything as it removes ELF elems needed for
|
||||
@@ -16,6 +25,8 @@ include $(LIB_DIR)/common.mk
|
||||
#
|
||||
.PHONY: strip_tc_obj
|
||||
strip_tc_obj: ${BPF_TARGETS:=.o}
|
||||
$(Q) echo "TC don't support libbpf - strip BTF info"
|
||||
$(Q) llvm-strip --no-strip-all --remove-section .BTF $?
|
||||
|
||||
all: strip_tc_obj
|
||||
endif
|
||||
|
@@ -11,12 +11,12 @@ root_check_run_with_sudo "$@"
|
||||
# Use common parameters
|
||||
source ${basedir}/parameters.sh
|
||||
|
||||
export TC=/sbin/tc
|
||||
export TC=tc
|
||||
|
||||
# This can be changed via --file or --obj
|
||||
if [[ -z ${BPF_OBJ} ]]; then
|
||||
# Fallback default
|
||||
BPF_OBJ=edt_pacer02.o
|
||||
BPF_OBJ=edt_pacer_vlan.o
|
||||
fi
|
||||
|
||||
info "Applying TC-BPF egress setup on device: $DEV with object file: $BPF_OBJ"
|
||||
|
31
traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt
Executable file
31
traffic-pacing-edt/bpftrace/edt_tstamp_diff.bt
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/usr/local/bin/bpftrace
|
||||
|
||||
#include <linux/skbuff.h>
|
||||
|
||||
/* Measure time difference between EDT-time and real "NIC" TX-time.
|
||||
*
|
||||
* Assuming packets are EDT timestamped by the BPF-program, we can
|
||||
* detect/measure how accurately packets are actually transmitted
|
||||
* towards the NIC driver, by comparing EDT-time against "now"
|
||||
* timestamp in the function transmitting to the NIC driver.
|
||||
*/
|
||||
|
||||
// tracepoint:net:net_dev_start_xmit
|
||||
/* For every transmitted skb that carries a timestamp, record (in usec) how
 * late or how far ahead of that timestamp it reached this tracepoint.
 * Each packet updates exactly one of the two histograms.
 */
tracepoint:net:net_dev_xmit
{
	$skb = (struct sk_buff *)args->skbaddr;
	//$tstamp = (uint64)$skb->tstamp;
	$tstamp = $skb->skb_mstamp_ns;
	$now = nsecs;

	// if ($skb->mark > 0) {
	if ($tstamp > 0) {
		if ($now >= $tstamp) {
			/* Transmitted at or after the timestamp */
			@tstamp_diff_late = hist(($now - $tstamp) / 1000);
		} else {
			/* Transmitted before the timestamp */
			@tstamp_diff_ahead = hist(($tstamp - $now) / 1000);
		}
	}
}
|
78
traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt
Executable file
78
traffic-pacing-edt/bpftrace/edt_tstamp_diff_advanced.bt
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/usr/local/bin/bpftrace
|
||||
|
||||
#include <linux/skbuff.h>
|
||||
|
||||
// tracepoint:net:net_dev_start_xmit
|
||||
/* For every transmitted skb: (1) if it carries a timestamp, record (in usec)
 * how late or how far ahead of that timestamp it reached this tracepoint,
 * updating exactly one of the two histograms; (2) account bytes per CPU in
 * fixed time periods to expose burstiness.
 */
tracepoint:net:net_dev_xmit
{
	$skb = (struct sk_buff *)args->skbaddr;
	//$tstamp = (uint64)$skb->tstamp;
	$tstamp = $skb->skb_mstamp_ns;
	$now = nsecs;

	// if ($skb->mark > 0) {
	if ($tstamp > 0) {
		if ($now >= $tstamp) {
			/* Transmitted at or after the timestamp */
			@tstamp_usec_diff_late = hist(($now - $tstamp) / 1000);
		} else {
			/* Transmitted before the timestamp */
			@tstamp_usec_diff_ahead = hist(($tstamp - $now) / 1000);
		}
	}

	/* Capture burstiness over a time period, by dividing nanosec
	 * timestamp with wanted period, and keeping state byte counter as
	 * long as timestamp match.
	 *
	 * Practical usage shows that bpftrace uses a hash-map to implement
	 * this, which unfortunately cost too much (shows 5% jhash cpu
	 * usage), enough overhead to change behavior of prod system.
	 */
	//$period = $now / 10000; /* 10000 = 10 usec */
	$period = $now / 30000; /* 30000 = 30 usec */
	if (@state[cpu] == $period) {
		@state_bytes[cpu] += $skb->len;
	} else {
		@state[cpu] = $period;
		if (@state_bytes[cpu] > 0) {
			@byte_burst[cpu] = hist(@state_bytes[cpu]);
		}
		@state_bytes[cpu] = $skb->len; /* Reset counter */
	}
}
|
||||
|
||||
/*
|
||||
tracepoint:qdisc:qdisc_dequeue
|
||||
{
|
||||
@qdisc_bulk_dequeue = lhist(args->packets, 0,64,1);
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
kretfunc:dev_hard_start_xmit
|
||||
{
|
||||
// Wanted to know if ret == NETDEV_TX_BUSY
|
||||
# ERROR: kfunc/kretfunc not available for your linked against bcc version.
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/* How often does FQ-pacer find no-packets are qualified to be
|
||||
* scheduled, which leads to scheduling an hrtimer event, that will
|
||||
* start qdisc again at a later time.
|
||||
*
|
||||
* We cannot kprobe fq_dequeue as it is a module.
|
||||
*/
|
||||
|
||||
/*
|
||||
kprobe:qdisc_watchdog_schedule_range_ns
|
||||
{
|
||||
@qdisc_watchdog[cpu] = count();
|
||||
}
|
||||
|
||||
kprobe:__netif_schedule
|
||||
{
|
||||
@__netif_schedule[cpu] = count();
|
||||
}
|
||||
*/
|
153
traffic-pacing-edt/codel_impl.h
Normal file
153
traffic-pacing-edt/codel_impl.h
Normal file
@@ -0,0 +1,153 @@
|
||||
#ifndef __CODEL_IMPL_H
|
||||
#define __CODEL_IMPL_H
|
||||
|
||||
#ifndef CODEL_TARGET
|
||||
#define CODEL_TARGET (10 * 1000 * 1000ULL) /* 10 ms in nanosec */
|
||||
#endif
|
||||
|
||||
#ifndef CODEL_EXCEED_INTERVAL
|
||||
#define CODEL_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/
|
||||
#endif
|
||||
|
||||
/* Codel like dropping scheme, inspired by:
|
||||
* - RFC: https://queue.acm.org/detail.cfm?id=2209336
|
||||
* - Code: https://queue.acm.org/appendices/codel.html
|
||||
* - Kernel: include/net/codel_impl.h
|
||||
*/
|
||||
/* Per-queue state for the CoDel-like dropping scheme below. */
struct codel_state {
	/* Deadline set when the queue first goes above target; 0 while below */
	__u64 first_above_time;
	/* Time at which the next packet should be dropped */
	__u64 drop_next;
	/* Number of packets dropped since entering the drop state */
	__u32 count;
	/* Non-zero (1) while in the drop state */
	__u32 dropping;
};
|
||||
|
||||
/* Square root of cnt scaled by 65536 (i.e. shifted left 16 bits), taken from
 * a fixed table for cnt in 1..36. Any other value of cnt (including 0) yields
 * the entry for sqrt(50).
 */
static __always_inline __u32 get_sqrt_sh16(__u64 cnt)
{
	__u32 root_sh16;

	switch (cnt) {
	case 1:  root_sh16 = 65536;  break; /* 65536 * sqrt(1) */
	case 2:  root_sh16 = 92682;  break; /* 65536 * sqrt(2) */
	case 3:  root_sh16 = 113512; break; /* 65536 * sqrt(3) */
	case 4:  root_sh16 = 131072; break; /* 65536 * sqrt(4) */
	case 5:  root_sh16 = 146543; break; /* 65536 * sqrt(5) */
	case 6:  root_sh16 = 160530; break; /* 65536 * sqrt(6) */
	case 7:  root_sh16 = 173392; break;
	case 8:  root_sh16 = 185364; break;
	case 9:  root_sh16 = 196608; break;
	case 10: root_sh16 = 207243; break;
	case 11: root_sh16 = 217358; break;
	case 12: root_sh16 = 227023; break;
	case 13: root_sh16 = 236293; break;
	case 14: root_sh16 = 245213; break;
	case 15: root_sh16 = 253820; break;
	case 16: root_sh16 = 262144; break; /* 100 ms / sqrt(16) = 25 ms */
	case 17: root_sh16 = 270212; break;
	case 18: root_sh16 = 278046; break;
	case 19: root_sh16 = 285664; break;
	case 20: root_sh16 = 293086; break;
	case 21: root_sh16 = 300324; break;
	case 22: root_sh16 = 307391; break;
	case 23: root_sh16 = 314300; break;
	case 24: root_sh16 = 321060; break;
	case 25: root_sh16 = 327680; break; /* 100 ms / sqrt(25) = 20 ms */
	case 26: root_sh16 = 334169; break;
	case 27: root_sh16 = 340535; break;
	case 28: root_sh16 = 346784; break;
	case 29: root_sh16 = 352922; break;
	case 30: root_sh16 = 358955; break;
	case 31: root_sh16 = 364889; break;
	case 32: root_sh16 = 370728; break;
	case 33: root_sh16 = 376476; break;
	case 34: root_sh16 = 382137; break;
	case 35: root_sh16 = 387716; break;
	case 36: root_sh16 = 393216; break; /* 100 / sqrt(36) = 16.66 ms */
	default:
		/* 65536*sqrt(50) => 100/sqrt(50) = 14.14 ms */
		root_sh16 = 463410;
		break;
	}

	return root_sh16;
}
|
||||
|
||||
static __always_inline __u64 get_next_interval_sqrt(__u64 cnt)
|
||||
{
|
||||
__u64 val = (__u64)CODEL_EXCEED_INTERVAL << 16 / get_sqrt_sh16(cnt);
|
||||
return val;
|
||||
}
|
||||
|
||||
/* Time of the next drop: t advanced by the interval computed for drop
 * count cnt.
 */
static __always_inline __u64
codel_control_law(__u64 t, __u64 cnt)
{
	__u64 step = get_next_interval_sqrt(cnt);

	return t + step;
}
|
||||
|
||||
static __always_inline
|
||||
bool codel_should_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now)
|
||||
{
|
||||
__u64 interval = CODEL_EXCEED_INTERVAL;
|
||||
|
||||
if (t_queue_sz < CODEL_TARGET) {
|
||||
/* went below so we'll stay below for at least interval */
|
||||
codel->first_above_time = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (codel->first_above_time == 0) {
|
||||
/* just went above from below. If we stay above
|
||||
* for at least interval we'll say it's ok to drop
|
||||
*/
|
||||
codel->first_above_time = now + interval;
|
||||
return false;
|
||||
} else if (now >= codel->first_above_time) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
bool codel_drop(struct codel_state *codel, __u64 t_queue_sz, __u64 now)
|
||||
{
|
||||
__u64 interval = CODEL_EXCEED_INTERVAL;
|
||||
|
||||
/* If horizon have been exceed for a while, inc drop intensity*/
|
||||
bool drop = codel_should_drop(codel, t_queue_sz, now);
|
||||
|
||||
if (codel->dropping) { /* In dropping state */
|
||||
if (!drop) {
|
||||
/* time below target - leave dropping state */
|
||||
codel->dropping = false;
|
||||
return false;
|
||||
} else if (now >= codel->drop_next) {
|
||||
/* It's time for the next drop. Drop the current
|
||||
* packet. Schedule the next drop
|
||||
*/
|
||||
codel->count += 1;
|
||||
// schedule the next drop.
|
||||
codel->drop_next =
|
||||
codel_control_law(codel->drop_next, codel->count);
|
||||
return true;
|
||||
}
|
||||
} else if (drop &&
|
||||
((now - codel->drop_next < interval) ||
|
||||
(now - codel->first_above_time >= interval))) {
|
||||
/* If we get here, then we're not in dropping state.
|
||||
* Decide whether it's time to enter dropping state.
|
||||
*/
|
||||
__u32 count = codel->count;
|
||||
|
||||
codel->dropping = true;
|
||||
|
||||
/* If we're in a drop cycle, drop rate that controlled queue
|
||||
* on the last cycle is a good starting point to control it now.
|
||||
*/
|
||||
if (now - codel->drop_next < interval)
|
||||
count = count > 2 ? (count - 2) : 1;
|
||||
else
|
||||
count = 1;
|
||||
|
||||
codel->count = count;
|
||||
codel->drop_next = codel_control_law(now, count);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif /* __CODEL_IMPL_H */
|
29
traffic-pacing-edt/configure
vendored
Executable file
29
traffic-pacing-edt/configure
vendored
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
|
||||
# This is not an autoconf generated configure
|
||||
#
|
||||
|
||||
# Output file which is input to Makefile
|
||||
CONFIG=config.mk
|
||||
|
||||
# Assume tc is in $PATH
|
||||
TC=tc
|
||||
|
||||
# Detect whether the tc tool reports libbpf in its version string
# ("$TC -V"). When it does, append HAVE_TC_LIBBPF settings to $CONFIG and
# print "yes (<version>)"; otherwise print "no".
check_tc_libbpf()
{
    tc_version=$("$TC" -V)
    # Quote expansions so the version string is never word-split or globbed
    if echo "$tc_version" | grep -q libbpf; then
	# Keep only the text following the last "libbpf "
	libbpf_version=${tc_version##*libbpf }
	echo "HAVE_TC_LIBBPF:=y" >> "$CONFIG"
	echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> "$CONFIG"
	echo "yes ($libbpf_version)"
    else
	echo "no"
    fi
}
|
||||
|
||||
echo "# Generated config" > $CONFIG
|
||||
echo "Detecting available features on system"
|
||||
|
||||
echo -n " - libbpf support in tc tool: "
|
||||
check_tc_libbpf
|
@@ -1,40 +0,0 @@
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <xdp/parsing_helpers.h>
|
||||
#include "iproute2_compat.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
/* The tc tool (iproute2) use another ELF map layout than libbpf (struct
|
||||
* bpf_map_def), see struct bpf_elf_map from iproute2.
|
||||
*/
|
||||
struct bpf_elf_map SEC("maps") cnt_map = {
|
||||
.type = BPF_MAP_TYPE_ARRAY,
|
||||
.size_key = sizeof(__u32),
|
||||
.size_value = sizeof(__u64),
|
||||
.max_elem = 1,
|
||||
//.pinning = PIN_GLOBAL_NS,
|
||||
};
|
||||
|
||||
SEC("classifier") int tc_dummy(struct __sk_buff *skb)
|
||||
{
|
||||
volatile void *data, *data_end;
|
||||
int ret = BPF_OK;
|
||||
struct ethhdr *eth;
|
||||
|
||||
data = (void *)(long)skb->data;
|
||||
data_end = (void *)(long)skb->data_end;
|
||||
eth = (struct ethhdr *)data;
|
||||
|
||||
if (data + sizeof(*eth) > data_end)
|
||||
return BPF_DROP;
|
||||
|
||||
/* Keep ARP resolution working */
|
||||
if (eth->h_proto == bpf_htons(ETH_P_ARP)) {
|
||||
ret = BPF_OK;
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
@@ -1,126 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <xdp/parsing_helpers.h>
|
||||
#include "iproute2_compat.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
#define NS_PER_SEC 1000000000
|
||||
|
||||
/* skb->len in bytes, thus easier to keep rate in bytes */
|
||||
#define RATE_IN_BITS (1000 * 1000 * 1000)
|
||||
#define RATE_IN_BYTES (RATE_IN_BITS / 8)
|
||||
|
||||
#define T_HORIZON_DROP (2000 * 1000 * 1000)
|
||||
|
||||
/* FIXME add proper READ_ONCE / WRITE_ONCE macros, for now use for annotation */
|
||||
#define READ_ONCE(V) (V)
|
||||
#define WRITE_ONCE(X,V) (X) = (V)
|
||||
|
||||
struct edt_val {
|
||||
__u64 rate;
|
||||
__u64 t_last;
|
||||
__u64 t_horizon_drop;
|
||||
__u64 t_horizon_ecn;
|
||||
};
|
||||
|
||||
/* The tc tool (iproute2) use another ELF map layout than libbpf (struct
|
||||
* bpf_map_def), see struct bpf_elf_map from iproute2.
|
||||
*/
|
||||
struct bpf_elf_map SEC("maps") time_delay_map = {
|
||||
.type = BPF_MAP_TYPE_ARRAY,
|
||||
.size_key = sizeof(__u32),
|
||||
.size_value = sizeof(struct edt_val),
|
||||
.max_elem = 1,
|
||||
//.pinning = PIN_GLOBAL_NS,
|
||||
};
|
||||
|
||||
/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to
|
||||
* be send in the future.
|
||||
*/
|
||||
static __always_inline int sched_departure(struct __sk_buff *skb)
|
||||
{
|
||||
struct edt_val *edt;
|
||||
__u64 t_queue_sz;
|
||||
__u64 t_xmit_ns;
|
||||
__u64 t_next;
|
||||
__u64 t_curr;
|
||||
int key = 0;
|
||||
__u64 now;
|
||||
|
||||
edt = bpf_map_lookup_elem(&time_delay_map, &key);
|
||||
if (!edt)
|
||||
return BPF_DROP;
|
||||
|
||||
/* Calc transmission time it takes to send packet 'bytes' */
|
||||
t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / RATE_IN_BYTES;
|
||||
// t_xmit_ns = ((__u64)skb->len) * NS_PER_SEC / edt->rate;
|
||||
|
||||
now = bpf_ktime_get_ns();
|
||||
|
||||
/* Allow others to set skb tstamp prior to us */
|
||||
t_curr = skb->tstamp;
|
||||
if (t_curr < now)
|
||||
t_curr = now;
|
||||
|
||||
/* The 't_last' timestamp can be in the future. Packets scheduled a head
|
||||
* of his packet can be seen as the queue size measured in time, via
|
||||
* correlating this to 'now' timestamp.
|
||||
*/
|
||||
t_next = READ_ONCE(edt->t_last) + t_xmit_ns;
|
||||
|
||||
/* If packet doesn't get scheduled into the future, then there is
|
||||
* no-queue and we are not above rate limit. Send packet immediately and
|
||||
* move forward t_last timestamp to now.
|
||||
*/
|
||||
if (t_next <= t_curr) {
|
||||
WRITE_ONCE(edt->t_last, t_curr);
|
||||
return BPF_OK;
|
||||
}
|
||||
|
||||
/* Calc queue size measured in time */
|
||||
t_queue_sz = t_next - now;
|
||||
|
||||
/* FQ-pacing qdisc also have horizon, but cannot use that, because this
|
||||
* BPF-prog will have updated map (t_last) on packet and assumes it got
|
||||
* its part of bandwidth.
|
||||
*/
|
||||
if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */)
|
||||
return BPF_DROP;
|
||||
|
||||
// TODO Add ECN marking horizon
|
||||
|
||||
/* Advance "time queue" */
|
||||
WRITE_ONCE(edt->t_last, t_next);
|
||||
|
||||
/* Schedule packet to be send at future timestamp */
|
||||
skb->tstamp = t_next;
|
||||
return BPF_OK;
|
||||
}
|
||||
|
||||
SEC("classifier") int tc_edt_simple(struct __sk_buff *skb)
|
||||
{
|
||||
volatile void *data, *data_end;
|
||||
int ret = BPF_OK;
|
||||
struct ethhdr *eth;
|
||||
|
||||
data = (void *)(long)skb->data;
|
||||
data_end = (void *)(long)skb->data_end;
|
||||
eth = (struct ethhdr *)data;
|
||||
|
||||
if (data + sizeof(*eth) > data_end)
|
||||
return BPF_DROP;
|
||||
|
||||
/* Keep ARP resolution working */
|
||||
if (eth->h_proto == bpf_htons(ETH_P_ARP)) {
|
||||
ret = BPF_OK;
|
||||
goto out;
|
||||
}
|
||||
|
||||
// TODO: match on vlan16 and only apply EDT on that
|
||||
return sched_departure(skb);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
280
traffic-pacing-edt/edt_pacer_vlan.c
Normal file
280
traffic-pacing-edt/edt_pacer_vlan.c
Normal file
@@ -0,0 +1,280 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/compiler.h>
|
||||
|
||||
#include <stdbool.h>
|
||||
|
||||
#define VLAN_MAX_DEPTH 2
|
||||
#include <xdp/parsing_helpers.h>
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
#define NS_PER_SEC 1000000000
|
||||
|
||||
/* Strategy: Shape at MAC (Medium Access Control) layer with Ethernet
|
||||
*
|
||||
* Production use-case is pacing traffic at 1Gbit/s wirespeed, using a
|
||||
* 10Gbit/s NIC, because 1G end-user switch cannot handle bursts.
|
||||
*
|
||||
* (https://en.wikipedia.org/wiki/Interpacket_gap
|
||||
* 12 bytes = interframe gap (IFG) 96 bit
|
||||
|
||||
* (https://en.wikipedia.org/wiki/Ethernet_frame)
|
||||
* 8 bytes = MAC preamble
|
||||
* 4 bytes = Ethernet Frame Check Sequence (FCS) CRC
|
||||
* 46 bytes = Minimum Payload size
|
||||
*
|
||||
* 14 bytes = Ethernet header
|
||||
* 8 bytes = 2x VLAN headers
|
||||
*/
|
||||
//#define RATE_IN_BITS (1000 * 1000 * 1000ULL) /* Full 1Gbit/s */
|
||||
#define RATE_IN_BITS (990 * 1000 * 1000ULL)
|
||||
//#define RATE_IN_BITS (950 * 1000 * 1000ULL)
|
||||
#define OVERHEAD (12 + 8 + 4 + 8) /* 14 already in wire_len */
|
||||
//#define OVERHEAD (12 + 8 + 4) /* 14 already in wire_len */
|
||||
#define ETH_MIN (84)
|
||||
|
||||
/* skb->len in bytes, thus convert rate to bytes */
|
||||
#define RATE_IN_BYTES (RATE_IN_BITS / 8)
|
||||
|
||||
/* Controls how large the queue (measured in time) is allowed to grow */
|
||||
#define T_HORIZON_DROP (40 * 1000 * 1000ULL)
|
||||
#define T_HORIZON_TARGET (5 * 1000 * 1000ULL)
|
||||
#define T_HORIZON_ECN (1 * 1000 * 1000ULL)
|
||||
|
||||
/* Codel: If queue exceed target for more than one interval, start dropping */
|
||||
#define T_EXCEED_INTERVAL (100 * 1000 * 1000ULL) /* 100 ms in ns*/
|
||||
|
||||
#define CODEL_TARGET T_HORIZON_TARGET
|
||||
#define CODEL_EXCEED_INTERVAL T_EXCEED_INTERVAL
|
||||
#include "codel_impl.h"
|
||||
|
||||
struct edt_val {
|
||||
__u64 rate;
|
||||
__u64 t_last;
|
||||
__u64 t_horizon_drop;
|
||||
__u64 t_horizon_ecn;
|
||||
struct codel_state codel;
|
||||
} __aligned(64); /* Align struct to cache-size to avoid false-sharing */
|
||||
|
||||
#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */
|
||||
/* Use BTF format to create map */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__uint(max_entries, 4096); /* Max possible VLANs */
|
||||
__type(key, __u32);
|
||||
__type(value, struct edt_val);
|
||||
// __uint(pinning, LIBBPF_PIN_BY_NAME);
|
||||
} time_delay_map SEC(".maps");
|
||||
|
||||
#else
|
||||
/* The (iproute2) tc tool (without libbpf support) use another ELF map
|
||||
* layout than libbpf (struct bpf_map_def), see struct bpf_elf_map
|
||||
* from iproute2.
|
||||
*/
|
||||
#include "iproute2_compat.h"
|
||||
struct bpf_elf_map SEC("maps") time_delay_map = {
|
||||
.type = BPF_MAP_TYPE_ARRAY,
|
||||
.size_key = sizeof(__u32),
|
||||
.size_value = sizeof(struct edt_val),
|
||||
.max_elem = 4096, /* Max possible VLANs */
|
||||
// .pinning = PIN_GLOBAL_NS,
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
/* Role of EDT (Earliest Departure Time) is to schedule departure of packets to
|
||||
* be send in the future.
|
||||
*/
|
||||
static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key)
|
||||
{
|
||||
struct edt_val *edt;
|
||||
__u64 t_queue_sz;
|
||||
__u64 t_xmit_ns;
|
||||
__u64 wire_len;
|
||||
__u64 t_next;
|
||||
__u64 t_curr;
|
||||
__u64 now;
|
||||
|
||||
edt = bpf_map_lookup_elem(&time_delay_map, &key);
|
||||
if (!edt)
|
||||
return BPF_DROP;
|
||||
|
||||
/* Calc transmission time it takes to send packet 'bytes'.
|
||||
*
|
||||
* Details on getting precise bytes on wire. The skb->len does include
|
||||
* length of GRO/GSO segments, but not the segment headers that gets
|
||||
* added on transmit. Fortunately skb->wire_len at TC-egress hook (not
|
||||
* ingress) include these headers. (See: qdisc_pkt_len_init())
|
||||
*/
|
||||
wire_len = skb->wire_len + OVERHEAD;
|
||||
wire_len = wire_len > ETH_MIN ? wire_len : ETH_MIN;
|
||||
|
||||
t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES;
|
||||
|
||||
// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / RATE_IN_BYTES;
|
||||
// t_xmit_ns = ((__u64)skb->wire_len) * NS_PER_SEC / edt->rate;
|
||||
|
||||
// now = bpf_ktime_get_ns();
|
||||
now = bpf_ktime_get_boot_ns(); /* Use same ktime as bpftrace */
|
||||
|
||||
/* Allow others to set skb tstamp prior to us */
|
||||
t_curr = skb->tstamp;
|
||||
if (t_curr < now)
|
||||
t_curr = now;
|
||||
|
||||
/* The 't_last' timestamp can be in the future. Packets scheduled a head
|
||||
* of his packet can be seen as the queue size measured in time, via
|
||||
* correlating this to 'now' timestamp.
|
||||
*/
|
||||
t_next = READ_ONCE(edt->t_last) + t_xmit_ns;
|
||||
|
||||
/* If packet doesn't get scheduled into the future, then there is
|
||||
* no-queue and we are not above rate limit. Normally send packet
|
||||
* immediately and move forward t_last timestamp to now.
|
||||
*
|
||||
* But in our use-case the traffic need smoothing at a earlier
|
||||
* stage, as bursts at lower rates can hurt the crapy switch.
|
||||
* Thus, schedule SKB transmissing as new + t_xmit_ns.
|
||||
*/
|
||||
if (t_next <= t_curr) {
|
||||
#if 1
|
||||
__u64 t_curr_next;
|
||||
__u32 min_len = 1538;
|
||||
|
||||
/* Minimum delay for all packet if no time-queue */
|
||||
wire_len = (wire_len > min_len) ? wire_len : min_len;
|
||||
t_xmit_ns = (wire_len) * NS_PER_SEC / RATE_IN_BYTES;
|
||||
t_curr_next = t_curr + t_xmit_ns;
|
||||
|
||||
WRITE_ONCE(edt->t_last, t_curr_next);
|
||||
skb->tstamp = t_curr_next;
|
||||
skb->mark = 1; /* No queue - add minimum delay */
|
||||
#else
|
||||
WRITE_ONCE(edt->t_last, t_curr);
|
||||
#endif
|
||||
return BPF_OK;
|
||||
|
||||
}
|
||||
|
||||
/* Calc queue size measured in time */
|
||||
t_queue_sz = t_next - now;
|
||||
|
||||
/* FQ-pacing qdisc also have horizon, but cannot use that, because this
|
||||
* BPF-prog will have updated map (t_last) on packet and assumes it got
|
||||
* its part of bandwidth.
|
||||
*/
|
||||
if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */)
|
||||
return BPF_DROP;
|
||||
|
||||
/* If TCP didn't react to ECN marking, then start dropping some */
|
||||
// if (codel_drop(edt, t_queue_sz, now))
|
||||
if (codel_drop(&edt->codel, t_queue_sz, t_next))
|
||||
return BPF_DROP;
|
||||
|
||||
skb->mark = 2; /* (time) queue exist - and small/below T_HORIZON_ECN */
|
||||
|
||||
/* ECN marking horizon */
|
||||
if (t_queue_sz >= T_HORIZON_ECN) {
|
||||
skb->mark = 3; /* (time) queue exist - and is large */
|
||||
bpf_skb_ecn_set_ce(skb);
|
||||
}
|
||||
|
||||
/* Advance "time queue" */
|
||||
WRITE_ONCE(edt->t_last, t_next);
|
||||
|
||||
/* Schedule packet to be send at future timestamp */
|
||||
skb->tstamp = t_next;
|
||||
return BPF_OK;
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
__u16 get_inner_qinq_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
|
||||
{
|
||||
__u16 vlan_key;
|
||||
|
||||
/* NIC can HW "offload" the outer VLAN, moving it to skb context */
|
||||
if (skb->vlan_present)
|
||||
vlan_key = vlans->id[0]; /* Inner vlan placed as first inline */
|
||||
else
|
||||
vlan_key = vlans->id[1]; /* All VLAN headers inline */
|
||||
|
||||
return vlan_key;
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
__u16 get_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
|
||||
{
|
||||
__u16 vlan_key;
|
||||
|
||||
/* Handle extracting VLAN if skb context have VLAN offloaded */
|
||||
if (skb->vlan_present)
|
||||
vlan_key = skb->vlan_tci & VLAN_VID_MASK;
|
||||
else
|
||||
vlan_key = vlans->id[0];
|
||||
|
||||
return vlan_key;
|
||||
}
|
||||
|
||||
static __always_inline
|
||||
__u16 extract_vlan_key(struct __sk_buff *skb, struct collect_vlans *vlans)
|
||||
{
|
||||
int QinQ = 0;
|
||||
|
||||
/* The inner VLAN is the key to extract. But it is complicated
|
||||
* due to NIC "offloaded" VLAN (skb->vlan_present). In case
|
||||
* BPF-prog is loaded on outer VLAN net_device, the BPF-prog
|
||||
* sees the inner-VLAN at the first and only VLAN.
|
||||
*/
|
||||
if (skb->vlan_present) {
|
||||
if (vlans->id[0])
|
||||
QinQ = 1;
|
||||
} else {
|
||||
if (vlans->id[1])
|
||||
QinQ = 1;
|
||||
}
|
||||
|
||||
if (QinQ)
|
||||
return get_inner_qinq_vlan(skb, vlans);
|
||||
else
|
||||
return get_vlan(skb, vlans);
|
||||
}
|
||||
|
||||
/* TC classifier entry point: pace traffic per (inner) VLAN id.
 *
 * Frames whose Ethernet/VLAN headers cannot be parsed are dropped. ARP
 * frames and frames carrying no VLAN tag (neither inline nor offloaded to
 * the skb context) pass untouched. Every other frame is handed to
 * sched_departure() together with its VLAN key.
 */
SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb)
{
	void *data     = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct collect_vlans vlans = { 0 };
	struct ethhdr *eth;
	int ret = BPF_OK;
	__u16 vlan_key;

	/* These keep track of the next header type and iterator pointer */
	struct hdr_cursor nh;
	int eth_type;
	nh.pos = data;

	/* Pass the address of 'eth' so the parser can point it at the header */
	eth_type = parse_ethhdr_vlan(&nh, data_end, &eth, &vlans);
	if (eth_type < 0)
		return BPF_DROP;

	/* Keep ARP resolution working */
	if (eth_type == bpf_htons(ETH_P_ARP)) {
		ret = BPF_OK;
		goto out;
	}

	if (!proto_is_vlan(eth->h_proto) && !skb->vlan_present) {
		/* Skip non-VLAN frames */
		return BPF_OK;
	}

	vlan_key = extract_vlan_key(skb, &vlans);

	/* Each (inner) VLAN id gets its own EDT pacing */
	return sched_departure(skb, vlan_key);

 out:
	return ret;
}
|
@@ -62,3 +62,28 @@ function call_tc() {
|
||||
# Forward all arguments to _call_tc with "allow_fail" as the mode argument.
call_tc_allow_fail() {
    _call_tc "allow_fail" "$@"
}
|
||||
|
||||
## -- Wrapper calls for IP --
|
||||
# Run the command stored in $IP with the given arguments.
#   $1   - "allow_fail" to tolerate a non-zero exit status; empty string
#          to report a failure via: err 3 "<message>"
#   $2.. - arguments passed on to $IP
# When $VERBOSE is set the command line is echoed first; when $DRYRUN is
# set nothing is executed.
function _call_ip() {
    local allow_fail="$1"
    shift
    if [[ -n "$VERBOSE" ]]; then
        echo "ip $*"
    fi
    if [[ -n "$DRYRUN" ]]; then
        return
    fi
    $IP "$@"
    local status=$?
    if (( status != 0 )) && [[ -z "$allow_fail" ]]; then
        # "$*" keeps the message one single word; "$@" inside a string
        # would expand to one word per argument.
        err 3 "Exec error($status) occurred cmd: \"$IP $*\""
    fi
}
|
||||
# Forward all arguments to _call_ip with an empty mode argument, so a
# failing command is not tolerated.
call_ip() {
    _call_ip "" "$@"
}
|
||||
# Forward all arguments to _call_ip in "allow_fail" mode, so a failing
# command is tolerated.
call_ip_allow_fail() {
    _call_ip "allow_fail" "$@"
}
|
||||
|
55
traffic-pacing-edt/hash_func01.h
Normal file
55
traffic-pacing-edt/hash_func01.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1
|
||||
*
|
||||
* Based on Paul Hsieh's (LGPL 2.1) hash function
|
||||
* From: http://www.azillionmonkeys.com/qed/hash.html
|
||||
*/
|
||||
|
||||
/* Read 16 bits at address d in host byte order */
#define get16bits(d) (*((const __u16 *) (d)))

/* Hash 'len' bytes starting at 'data', seeded with 'initval'.
 *
 * Returns 0 when data is NULL or len <= 0. Input is consumed in 4-byte
 * steps; the remaining 1-3 bytes are mixed in by the switch below, and the
 * result is finished with a fixed sequence of shift/xor/add rounds.
 */
static __always_inline
__u32 SuperFastHash (const char *data, int len, __u32 initval) {
	__u32 hash = initval;
	__u32 tmp;
	int rem;

	if (len <= 0 || data == NULL) return 0;

	rem = len & 3;
	len >>= 2;

	/* Main loop */
#pragma clang loop unroll(full)
	for (;len > 0; len--) {
		hash  += get16bits (data);
		tmp    = (get16bits (data+2) << 11) ^ hash;
		hash   = (hash << 16) ^ tmp;
		data  += 2*sizeof (__u16);
		hash  += hash >> 11;
	}

	/* Handle end cases */
	switch (rem) {
	case 3: hash += get16bits (data);
		hash ^= hash << 16;
		/* Sign-extend the byte first, then shift as unsigned:
		 * shifting a negative signed value left is undefined.
		 */
		hash ^= ((__u32)(signed char)data[sizeof (__u16)]) << 18;
		hash += hash >> 11;
		break;
	case 2: hash += get16bits (data);
		hash ^= hash << 11;
		hash += hash >> 17;
		break;
	case 1: hash += (signed char)*data;
		hash ^= hash << 10;
		hash += hash >> 1;
		break;
	default: /* rem == 0: nothing left to mix in */
		break;
	}

	/* Force "avalanching" of final 127 bits */
	hash ^= hash << 3;
	hash += hash >> 5;
	hash ^= hash << 4;
	hash += hash >> 17;
	hash ^= hash << 25;
	hash += hash >> 6;

	return hash;
}
|
@@ -1,4 +1,5 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* Taken from #include <iproute2/bpf_elf.h> */
|
||||
|
||||
#ifndef __IPROUTE2_COMPAT_H
|
||||
#define __IPROUTE2_COMPAT_H
|
||||
@@ -8,6 +9,11 @@
|
||||
* binary layout until "flags". Thus, BPF-progs can use both if careful.
|
||||
*/
|
||||
|
||||
/* Object pinning settings */
|
||||
#define PIN_NONE 0
|
||||
#define PIN_OBJECT_NS 1
|
||||
#define PIN_GLOBAL_NS 2
|
||||
|
||||
/* ELF map definition (copied from iproute2 source code) */
|
||||
struct bpf_elf_map {
|
||||
__u32 type;
|
||||
|
@@ -10,10 +10,10 @@ function usage() {
|
||||
echo "Usage: $0 [-vh] --dev ethX"
|
||||
echo " -d | --dev : (\$DEV) Interface/device (required)"
|
||||
echo " -v | --verbose : (\$VERBOSE) verbose"
|
||||
echo " --remove : (\$REMOVE) Remove the TC rules"
|
||||
echo " --remove : (\$REMOVE) Remove the rules"
|
||||
echo " --dry-run : (\$DRYRUN) Dry-run only (echo tc commands)"
|
||||
echo " -s | --stats : (\$STATS_ONLY) Call TC statistics command"
|
||||
echo " -l | --list : (\$LIST) List TC filter setup after setup"
|
||||
echo " -s | --stats : (\$STATS_ONLY) Call statistics command"
|
||||
echo " -l | --list : (\$LIST) List setup after setup"
|
||||
echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load"
|
||||
echo ""
|
||||
}
|
||||
@@ -80,5 +80,5 @@ done
|
||||
|
||||
if [ -z "$DEV" ]; then
|
||||
usage
|
||||
err 2 "Please specify TC net_device"
|
||||
err 2 "Please specify net_device (\$DEV)"
|
||||
fi
|
||||
|
77
traffic-pacing-edt/tc_fq_pacer.sh
Executable file
77
traffic-pacing-edt/tc_fq_pacer.sh
Executable file
@@ -0,0 +1,77 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Loading FQ pacing qdisc in multi-queue MQ setup to avoid root qdisc lock.
|
||||
#
|
||||
# The FQ pacing qdisc is doing all the work of pacing packet out according to
|
||||
# the EDT (Earliest Departure Time) future timestamps set by our BPF-prog that
|
||||
# runs a TC-egress hook.
|
||||
#
|
||||
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
|
||||
# License: GPLv2
|
||||
#
|
||||
# Locate the helper scripts next to this script; quoted so that a path
# containing whitespace still works.
basedir=$(dirname "$0")
source "${basedir}/functions.sh"

root_check_run_with_sudo "$@"

# Use common parameters
source "${basedir}/parameters.sh"

export TC=tc

# Default verbose
VERBOSE=1

# Select between multiq or single root qdisc.
# The type argument is only optional when removing.
if [[ -z $1 && -z $REMOVE ]]; then
    err 1 "Specify root qdisc system: single or mq (multi-queue)"
fi
TYPE=$1

# Delete existing root qdisc
call_tc_allow_fail qdisc del dev "$DEV" root

if [[ -n $REMOVE ]]; then
    exit 0
fi
|
||||
|
||||
# Install MQ as root qdisc on $DEV and attach one FQ qdisc per TX-queue
# found under /sys/class/net/$DEV/queues/.
function use_multiq()
{
    local txq_dir hex
    local qid=0

    # MQ (Multi-Queue) as root qdisc
    call_tc qdisc replace dev "$DEV" root handle 7FFF: mq

    # Add FQ-pacer qdisc on each NIC avail TX-queue
    for txq_dir in /sys/class/net/"$DEV"/queues/tx-*; do
        # An unmatched glob is left as a literal pattern; skip it
        [[ -e $txq_dir ]] || continue

        # Details: cause-off-by-one, as tx-0 becomes handle 1:
        qid=$((qid + 1))

        # tc parses handle and classid numbers as hexadecimal, so the
        # decimal counter must be converted (10 -> a, 16 -> 10, ...)
        printf -v hex '%x' "$qid"

        # The higher 'flow_limit' is needed for high-BW pacing
        call_tc qdisc add dev "$DEV" parent "7FFF:$hex" handle "$hex:" fq \
            flow_limit 1000
        #
        # Alternatives that were tried:
        #   fq (defaults only)
        #   fq quantum $((1514*4)) initial_quantum $((1514*20))
        #   fq maxrate 930mbit
    done
}
|
||||
|
||||
# Install a single FQ qdisc as root qdisc on $DEV.
use_single_fq_pacer() {
    call_tc qdisc replace dev $DEV root handle 7FFF: fq flow_limit 1000
}
|
||||
|
||||
# Dispatch on the requested root qdisc type
if [[ $TYPE == mq || $TYPE == multiq ]]; then
    use_multiq
elif [[ $TYPE == single || $TYPE == fq ]]; then
    use_single_fq_pacer
else
    err 1 "Unknown type: ${TYPE}"
fi
|
95
traffic-pacing-edt/tc_htb_shaper.sh
Executable file
95
traffic-pacing-edt/tc_htb_shaper.sh
Executable file
@@ -0,0 +1,95 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# This HTB shaper setup script is available for easier comparing the
|
||||
# accuracy against the EDT solution.
|
||||
#
|
||||
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
|
||||
# License: GPLv2
|
||||
#
|
||||
# Locate the helper scripts next to this script; quoted so that a path
# containing whitespace still works.
basedir=$(dirname "$0")
source "${basedir}/functions.sh"

root_check_run_with_sudo "$@"

# Use common parameters
source "${basedir}/parameters.sh"

# tc binary used by the _call_tc wrappers. (The script used to assign
# /sbin/tc here and overwrite it with this value further down.)
export TC=/usr/sbin/tc

# It seems measured BW is TCP goodput, but configured BW is wirespeed.
# Measurements show around 930Mbit best-case.  Q-in-Q result in MTU
# 1522 bytes.  TCP goodput segments are 1448 bytes.
#
#RATE=$((930*1522/1448))Mbit
##RATE=$((933*1522/1448))Mbit
##CEIL=$((999*1522/1448))
#CEIL=1Gbit
#CEIL=980mbit

# EDT shaper show TCP goodput of 956 Mbit/s.
#  echo $((956*1514/1448)) = 999
RATE=999Mbit
CEIL=1000Mbit

#RATE=500mbit
#CEIL=577mbit

# Each of the HTB root-class(es) get these RATE+CEIL upper bandwidth bounds.
ROOT_RATE=9000Mbit
ROOT_CEIL=9500Mbit

DEFAULT_RATE=6000Mbit
DEFAULT_CEIL=6000Mbit

VERBOSE=1
|
||||
|
||||
# Shadow the tc command inside this script: every "tc ..." line below is
# routed through _call_tc with an empty mode argument.
tc() {
    _call_tc "" "$@"
}
|
||||
|
||||
# Delete existing root qdisc
call_tc_allow_fail qdisc del dev "$DEV" root

if [[ -n $REMOVE ]]; then
    exit 0
fi

# HTB shaper
# "default 16" selects class 1:16 for traffic that no filter classifies;
# the commented-out variant would select class 1:2 instead.
#tc qdisc add dev "$DEV" root handle 1: htb default 2
tc qdisc add dev "$DEV" root handle 1: htb default 16

# The root-class set upper bandwidth usage
tc class add dev "$DEV" parent 1: classid 1:1 \
    htb rate "$ROOT_RATE" ceil "$ROOT_CEIL"

# Class 1:2 (the default class only with the "default 2" variant above)
tc class add dev "$DEV" parent 1: classid 1:2 htb \
    rate "$DEFAULT_RATE" ceil "$DEFAULT_CEIL"
#  burst 100000 cburst 100000
tc qdisc add dev "$DEV" parent 1:2 fq_codel


# Class for vlan 16
tc class add dev "$DEV" parent 1: classid 1:16 htb rate "$RATE" ceil "$CEIL" \
    burst $((1522*2)) cburst $((1522*2)) \
    linklayer ethernet
# burst 1522 cburst 1522
#burst 1 cburst 1
# burst $((1522*2)) cburst $((1522*2))
# overhead $((14+4+4)) linklayer ethernet
#tc qdisc add dev "$DEV" parent 1:16 fq_codel
tc qdisc add dev "$DEV" parent 1:16 fq_codel quantum $((1514+4+4))
#tc qdisc add dev "$DEV" parent 1:16 pfifo

# parent filter:
#tc filter add dev "$DEV" parent 1:0 prio 100 protocol 802.1q u32
#
# vlan 16:
#tc filter add dev "$DEV" parent 1:0 prio 100 \
#   protocol 802.1q \
#   u32 match u16 0x0010 0x0fff at -4 \
#   flowid 1:16

tc filter add dev "$DEV" protocol all parent 1:0 prio 101 \
    basic match "meta(vlan mask 0xfff eq 16)" flowid 1:16
|
84
traffic-pacing-edt/testlab_vlan_setup.sh
Executable file
84
traffic-pacing-edt/testlab_vlan_setup.sh
Executable file
@@ -0,0 +1,84 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Testlab setup script for VLAN Q-in-Q (double tagged VLAN) config.
|
||||
#
|
||||
# Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
|
||||
# License: GPLv2
|
||||
#
|
||||
# Locate the helper scripts next to this script; quoted so that a path
# containing whitespace still works.
basedir=$(dirname "$0")
source "${basedir}/functions.sh"

root_check_run_with_sudo "$@"

# Use common parameters
source "${basedir}/parameters.sh"

export IP=/sbin/ip
|
||||
# Shadow the ip command inside this script: every "ip ..." line below is
# routed through call_ip.
ip() {
    call_ip "$@"
}
|
||||
|
||||
# Create VLAN device <device>.<vlan> on top of <device> and set it up.
#   $1 - VLAN id (required)
#   $2 - parent device (optional, defaults to $DEV)
function create_vlan_device() {
    local vlan=${1}
    local device=${2:-$DEV}

    if [[ -z "$vlan" ]]; then
        err 2 "Missing VLAN id as input"
    fi

    ip link add link "$device" name "${device}.${vlan}" type vlan id "${vlan}"
    ip link set "${device}.${vlan}" up
}
|
||||
|
||||
# Create VLAN device <device>.<vlan> using protocol 802.1ad on top of
# <device> and set it up.
#   $1 - VLAN id (required)
#   $2 - parent device (optional, defaults to $DEV)
function create_vlan_device_802_1ad() {
    local vlan=${1}
    local device=${2:-$DEV}

    if [[ -z "$vlan" ]]; then
        err 2 "Missing VLAN id as input"
    fi

    ip link add link "$device" name "${device}.${vlan}" type vlan id "${vlan}" \
        protocol 802.1ad
    ip link set "${device}.${vlan}" up
}
|
||||
|
||||
|
||||
# Delete VLAN device <device>.<vlan>.
#   $1 - VLAN id (required)
#   $2 - parent device (optional, defaults to $DEV)
function delete_vlan_device() {
    local vlan=${1}
    local device=${2:-$DEV}

    if [[ -z "$vlan" ]]; then
        err 2 "Missing VLAN id as input"
    fi

    ip link del "${device}.${vlan}"
}
|
||||
|
||||
|
||||
# Positional arguments: outer VLAN id, then inner VLAN id
[[ -n "$1" ]] || err 3 "Missing arg#1 for outer vlan"
OUTER=$1

[[ -n "$2" ]] || err 3 "Missing arg#2 for inner vlan"
INNER=$2

if [[ -z $REMOVE ]]; then
    # Stack the devices: $DEV -> $DEV.$OUTER -> $DEV.$OUTER.$INNER
    create_vlan_device $OUTER $DEV
    create_vlan_device $INNER ${DEV}.${OUTER}

    # Set MTU to handle extra VLAN headers, NICs usually allow one VLAN
    # header even though they have configured MTU 1500.
    ip link set $DEV mtu 1508
    ip link set ${DEV}.${OUTER} mtu 1504
else
    # Tear down in reverse order: inner device first, then outer
    delete_vlan_device $INNER ${DEV}.${OUTER}
    delete_vlan_device $OUTER $DEV
    exit 0
fi
|
39
traffic-pacing-edt/vlans_load_edt.sh
Executable file
39
traffic-pacing-edt/vlans_load_edt.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Script for loading EDT-pacer BPF-prog on all downstream VLANs
|
||||
#
|
||||
# Locate the helper scripts next to this script; quoted so that a path
# containing whitespace still works.
basedir=$(dirname "$0")
source "${basedir}/functions.sh"

root_check_run_with_sudo "$@"

# Use common parameters
source "${basedir}/parameters.sh"

# Default verbose
VERBOSE=1

# Downstream dev: ens6f0
VLAN_START=168
VLAN_END=205

cmd=${basedir}/bpf_egress_loader.sh

# Flags passed on to the loader; an array keeps each flag a separate
# word without relying on word-splitting of a string.
options=()

if [[ -n $REMOVE ]]; then
    options+=(--remove)
fi
if [[ -n $DRYRUN ]]; then
    options+=(--dry-run)
    #cmd="echo $cmd"
fi
if [[ -n $VERBOSE ]]; then
    options+=(--verbose)
fi

# Run the loader once per VLAN device ${DEV}.<vlan> in the range
for (( vlan = VLAN_START; vlan <= VLAN_END; vlan++ ))
do
    VLAN=${DEV}.$vlan
    "$cmd" --dev "$VLAN" "${options[@]}"
done
|
383
traffic-pacing-edt/xdp_cpumap_loader.c
Normal file
383
traffic-pacing-edt/xdp_cpumap_loader.c
Normal file
@@ -0,0 +1,383 @@
|
||||
// SPDX-License-Identifier: GPL-2.0+
|
||||
static const char *__doc__ =
|
||||
" XDP load-balancing with CPU-map";
|
||||
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <locale.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/sysinfo.h>
|
||||
#include <getopt.h>
|
||||
#include <net/if.h>
|
||||
#include <time.h>
|
||||
#include <linux/limits.h>
|
||||
|
||||
#include <bpf/bpf.h>
|
||||
#include <bpf/libbpf.h>
|
||||
|
||||
#include <linux/if_link.h> /* XDP defines */
|
||||
|
||||
static int ifindex = -1;
|
||||
static char ifname_buf[IF_NAMESIZE];
|
||||
static char *ifname;
|
||||
|
||||
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
|
||||
|
||||
/* Exit return codes */
|
||||
#define EXIT_OK 0
|
||||
#define EXIT_FAIL 1
|
||||
#define EXIT_FAIL_OPTION 2
|
||||
#define EXIT_FAIL_XDP 3
|
||||
#define EXIT_FAIL_BPF 4
|
||||
#define EXIT_FAIL_MEM 5
|
||||
#define EXIT_FAIL_FILE 6
|
||||
|
||||
static const struct option long_options[] = {
|
||||
{"help", no_argument, NULL, 'h' },
|
||||
{"dev", required_argument, NULL, 'd' },
|
||||
{"qsize", required_argument, NULL, 'q' },
|
||||
{"force", no_argument, NULL, 'F' },
|
||||
{"remove", no_argument, NULL, 'r' },
|
||||
{"non-cpu", required_argument, NULL, 'x' },
|
||||
{"exclude-cpu", required_argument, NULL, 'x' },
|
||||
{0, 0, NULL, 0 }
|
||||
};
|
||||
|
||||
static void usage(char *argv[])
|
||||
{
|
||||
int i;
|
||||
|
||||
printf("\nDOCUMENTATION:\n%s\n", __doc__);
|
||||
printf("\n");
|
||||
printf(" Usage: %s (options-see-below)\n", argv[0]);
|
||||
printf(" Listing options:\n");
|
||||
for (i = 0; long_options[i].name != 0; i++) {
|
||||
printf(" --%-12s", long_options[i].name);
|
||||
if (long_options[i].flag != NULL)
|
||||
printf(" flag (internal value:%d)",
|
||||
*long_options[i].flag);
|
||||
else
|
||||
printf(" short-option: -%c",
|
||||
long_options[i].val);
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
/* Loader state shared by the cpumap setup helpers */
struct cpumap_config {
	int    fd_cpumap;	/* fd of BPF map "cpumap" */
	int    fd_cpus_enabled;	/* fd of BPF map "cpus_enabled" */
	int    fd_cpus_count;	/* fd of BPF map "cpus_count" */
	int   *cpu_exclude;	/* max_cpus entries; -1 marks an excluded CPU */
	int    max_cpus;	/* number of entries in cpu_exclude */
	__u32  qsize;		/* copied into bpf_cpumap_val.qsize per CPU */
};
|
||||
|
||||
static int cpumap_config_init(struct cpumap_config *cfg)
|
||||
{
|
||||
int n_cpus = get_nprocs_conf();
|
||||
int *cpu_exclude;
|
||||
|
||||
memset(cfg, 0, sizeof(*cfg));
|
||||
|
||||
cpu_exclude = malloc(n_cpus * sizeof(int));
|
||||
if (!cpu_exclude) {
|
||||
fprintf(stderr, "failed to allocate array\n");
|
||||
return EXIT_FAIL_MEM;
|
||||
}
|
||||
memset(cpu_exclude, 0, n_cpus * sizeof(int));
|
||||
|
||||
cfg->cpu_exclude = cpu_exclude;
|
||||
cfg->max_cpus = n_cpus;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __find_map_fd_by_name(struct bpf_object *obj, char *name)
|
||||
{
|
||||
int fd;
|
||||
|
||||
fd = bpf_object__find_map_fd_by_name(obj, name);
|
||||
if (fd < 0) {
|
||||
printf("No map found! - named: %s\n", name);
|
||||
exit(EXIT_FAIL_BPF);
|
||||
}
|
||||
return fd;
|
||||
}
|
||||
|
||||
/* Get file descriptors to BPF-maps */
|
||||
static int cpumap_config_find_maps(struct bpf_object *obj,
|
||||
struct cpumap_config *cfg)
|
||||
{
|
||||
cfg->fd_cpumap = __find_map_fd_by_name(obj, "cpumap");
|
||||
cfg->fd_cpus_enabled = __find_map_fd_by_name(obj, "cpus_enabled");
|
||||
cfg->fd_cpus_count = __find_map_fd_by_name(obj, "cpus_count");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Register one CPU in the three BPF maps.
 *
 * cfg:         holds the map file descriptors
 * cpu:         CPU id, used as key into the cpumap
 * value:       cpumap value (queue size) stored for this CPU
 * enabled_idx: index in cpus_enabled that will map to 'cpu'
 * new:         when true, cpus_count[0] is incremented
 *
 * Returns 0 on success and EXIT_FAIL_BPF when any map operation fails.
 */
static int create_cpu_entry(struct cpumap_config *cfg, __u32 cpu,
			    struct bpf_cpumap_val *value,
			    __u32 enabled_idx, bool new)
{
	__u32 curr_cpus_count = 0;
	__u32 key = 0;
	int err, fd;

	/* Add a CPU entry to cpumap, as this allocate a cpu entry in
	 * the kernel for the cpu.
	 */
	fd = cfg->fd_cpumap;
	err = bpf_map_update_elem(fd, &cpu, value, 0);
	if (err) {
		/* cpu is unsigned, hence %u */
		fprintf(stderr, "Create(fd:%d) CPU(%u) entry failed (err:%d)\n",
			fd, cpu, err);
		return EXIT_FAIL_BPF;
	}

	/* Inform bpf_prog's that a new CPU is enabled and available
	 * to be select from the map, that maps index to actual CPU.
	 */
	fd = cfg->fd_cpus_enabled;
	err = bpf_map_update_elem(fd, &enabled_idx, &cpu, 0);
	if (err) {
		fprintf(stderr, "Add to enabled avail CPUs failed\n");
		return EXIT_FAIL_BPF;
	}

	/* The current count is always read; it is only bumped and written
	 * back when not replacing/updating an existing entry.
	 */
	fd = cfg->fd_cpus_count;
	err = bpf_map_lookup_elem(fd, &key, &curr_cpus_count);
	if (err) {
		fprintf(stderr, "Failed reading curr cpus_count\n");
		return EXIT_FAIL_BPF;
	}
	if (new) {
		curr_cpus_count++;
		err = bpf_map_update_elem(fd, &key, &curr_cpus_count, 0);
		if (err) {
			fprintf(stderr, "Failed write curr cpus_count\n");
			return EXIT_FAIL_BPF;
		}
	}

	return 0;
}
|
||||
|
||||
/* Userspace MUST create/populate CPUMAP entries for redirect to work
|
||||
*/
|
||||
static int configure_cpus(struct cpumap_config *cfg)
|
||||
{
|
||||
struct bpf_cpumap_val value = { 0 };
|
||||
int n_cpus = cfg->max_cpus;
|
||||
int *exclude = cfg->cpu_exclude;
|
||||
int enabled_idx = 0;
|
||||
bool new = true;
|
||||
int cpu, err;
|
||||
|
||||
value.qsize = cfg->qsize;
|
||||
|
||||
for (cpu = 0; cpu < n_cpus; cpu++) {
|
||||
|
||||
if (exclude[cpu] == -1) {
|
||||
printf("Excluding CPU:%d\n", cpu);
|
||||
continue;
|
||||
}
|
||||
printf("Enable CPU:%d\n", cpu);
|
||||
err = create_cpu_entry(cfg, cpu, &value, enabled_idx, new);
|
||||
if (err)
|
||||
return err;
|
||||
enabled_idx++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Load an opened BPF object.
 *
 * Returns 'obj' on success. On failure the libbpf error text is printed
 * and NULL is returned; 'obj' itself is not closed here.
 */
struct bpf_object *do_load_bpf_obj(struct bpf_object *obj)
{
	char errmsg[200];
	int rc = bpf_object__load(obj);

	if (!rc)
		return obj;

	libbpf_strerror(rc, errmsg, sizeof(errmsg));
	printf("Error loading: %s\n", errmsg);
	return NULL;
}
|
||||
|
||||
/* Attach 'prog' as XDP program on interface 'ifindex' using 'xdp_flags'.
 *
 * Returns EXIT_OK on success, EXIT_FAIL_BPF when the program has no file
 * descriptor, and EXIT_FAIL_XDP when attaching fails.
 */
int do_xdp_attach(int ifindex, struct bpf_program *prog, __u32 xdp_flags)
{
	int fd = bpf_program__fd(prog);
	int rc;

	if (fd < 0) {
		fprintf(stderr, "bpf_program__fd failed\n");
		return EXIT_FAIL_BPF;
	}

	rc = bpf_set_link_xdp_fd(ifindex, fd, xdp_flags);
	if (!rc)
		return EXIT_OK;

	fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
		__func__, rc);
	return EXIT_FAIL_XDP;
}
|
||||
|
||||
/* Detach the XDP program from interface 'ifindex' by setting the program
 * fd to -1. Returns EXIT_OK on success and EXIT_FAIL_XDP on failure.
 */
int do_xdp_detach(int ifindex, __u32 xdp_flags)
{
	int rc = bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);

	if (!rc)
		return EXIT_OK;

	fprintf(stderr, "%s(): link set xdp fd failed (err:%d)\n",
		__func__, rc);
	return EXIT_FAIL_XDP;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
|
||||
bool do_detach = false;
|
||||
int opt, longindex = 0;
|
||||
char buf[100];
|
||||
int err;
|
||||
|
||||
struct bpf_object *obj = NULL;
|
||||
struct bpf_program *prog;
|
||||
|
||||
/* System to setup and exclude some CPUs */
|
||||
struct cpumap_config cfg;
|
||||
int n_cpus = get_nprocs_conf();
|
||||
int non_cpu = -1;
|
||||
int *cpu_exclude;
|
||||
|
||||
cpumap_config_init(&cfg);
|
||||
cpu_exclude = cfg.cpu_exclude;
|
||||
cfg.qsize = 512; /* Default queue size */
|
||||
|
||||
/* Always use XDP native driver mode */
|
||||
xdp_flags |= XDP_FLAGS_DRV_MODE;
|
||||
|
||||
obj = bpf_object__open_file("xdp_cpumap_qinq.o", NULL);
|
||||
err = libbpf_get_error(obj);
|
||||
if (err) {
|
||||
libbpf_strerror(err, buf, sizeof(buf));
|
||||
printf("Error opening file: %s\n", buf);
|
||||
return EXIT_FAIL_FILE;
|
||||
}
|
||||
err = EXIT_OK;
|
||||
|
||||
/* Parse commands line args */
|
||||
while ((opt = getopt_long(argc, argv, "hd:q:Frx:",
|
||||
long_options, &longindex)) != -1) {
|
||||
switch (opt) {
|
||||
case 'd':
|
||||
if (strlen(optarg) >= IF_NAMESIZE) {
|
||||
fprintf(stderr, "ERR: --dev name too long\n");
|
||||
goto error;
|
||||
}
|
||||
ifname = (char *)&ifname_buf;
|
||||
strncpy(ifname, optarg, IF_NAMESIZE);
|
||||
ifindex = if_nametoindex(ifname);
|
||||
if (ifindex == 0) {
|
||||
fprintf(stderr,
|
||||
"ERR: --dev name unknown err(%d):%s\n",
|
||||
errno, strerror(errno));
|
||||
goto error;
|
||||
}
|
||||
break;
|
||||
case 'q':
|
||||
cfg.qsize = strtol(optarg, NULL, 10);
|
||||
break;
|
||||
case 'F':
|
||||
xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
|
||||
break;
|
||||
case 'r':
|
||||
do_detach = true;
|
||||
break;
|
||||
case 'x': /* --exclude-cpu or --non-cpu */
|
||||
/* Possible to exclude multiple CPUs on cmdline */
|
||||
non_cpu = strtoul(optarg, NULL, 0);
|
||||
if (non_cpu >= n_cpus) {
|
||||
fprintf(stderr,
|
||||
"--cpu nr too large for cpumap err(%d):%s\n",
|
||||
errno, strerror(errno));
|
||||
goto error;
|
||||
}
|
||||
cpu_exclude[non_cpu] = -1;
|
||||
break;
|
||||
|
||||
case 'h':
|
||||
error:
|
||||
default:
|
||||
usage(argv);
|
||||
free(cpu_exclude);
|
||||
return EXIT_FAIL_OPTION;
|
||||
}
|
||||
}
|
||||
/* Required option */
|
||||
if (ifindex == -1) {
|
||||
fprintf(stderr, "ERR: required option --dev missing\n");
|
||||
usage(argv);
|
||||
err = EXIT_FAIL_OPTION;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (do_detach)
|
||||
return do_xdp_detach(ifindex, xdp_flags);
|
||||
|
||||
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
|
||||
perror("setrlimit(RLIMIT_MEMLOCK)");
|
||||
err = EXIT_FAIL_MEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
obj = do_load_bpf_obj(obj);
|
||||
if (!obj) {
|
||||
err = EXIT_FAIL_BPF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Pickup first BPF-program */
|
||||
prog = bpf_program__next(NULL, obj);
|
||||
if (!prog) {
|
||||
printf("No program!\n");
|
||||
err = EXIT_FAIL_BPF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Find maps maps */
|
||||
if (cpumap_config_find_maps(obj, &cfg)) {
|
||||
err = EXIT_FAIL_BPF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Configure cpumap */
|
||||
if (configure_cpus(&cfg)) {
|
||||
err = EXIT_FAIL_BPF;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Attach XDP program */
|
||||
err = do_xdp_attach(ifindex, prog, xdp_flags);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
printf("Attached XDP program:\"%s\" on netdev:%s (ifindex:%d)\n",
|
||||
bpf_program__name(prog), ifname, ifindex);
|
||||
printf("CPUs: %d\n", n_cpus);
|
||||
|
||||
out:
|
||||
if (obj)
|
||||
bpf_object__close(obj);
|
||||
|
||||
free(cpu_exclude);
|
||||
return err;
|
||||
}
|
111
traffic-pacing-edt/xdp_cpumap_qinq.c
Normal file
111
traffic-pacing-edt/xdp_cpumap_qinq.c
Normal file
@@ -0,0 +1,111 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||
#include <linux/types.h>
|
||||
#include <linux/bpf.h> /* struct bpf_cpumap_val */
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/compiler.h>
|
||||
|
||||
#define INITVAL 15485863
|
||||
//#define INITVAL 2654435761
|
||||
|
||||
#include "hash_func01.h" /* SuperFastHash */
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
#define VLAN_MAX_DEPTH 2
|
||||
#include <xdp/parsing_helpers.h>
|
||||
|
||||
#define MAX_CPUS 24
|
||||
|
||||
/* This global variable is used for limiting CPU that can be selected */
|
||||
__u32 global_max_cpus = 12; /* TODO: Allow userspace to adjust this */
|
||||
|
||||
/* Special map type that can XDP_REDIRECT frames to another CPU */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_CPUMAP);
|
||||
__uint(key_size, sizeof(__u32));
|
||||
__uint(value_size, sizeof(struct bpf_cpumap_val));
|
||||
__uint(max_entries, MAX_CPUS);
|
||||
} cpumap SEC(".maps");
|
||||
|
||||
/* Mapping table with CPUs enabled, for hashing between */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, __u32);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, MAX_CPUS);
|
||||
} cpus_enabled SEC(".maps");
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, __u32);
|
||||
__type(value, __u32);
|
||||
__uint(max_entries, 1);
|
||||
} cpus_count SEC(".maps");
|
||||
|
||||
static __always_inline
|
||||
__u32 extract_vlan_key(struct collect_vlans *vlans)
|
||||
{
|
||||
/* Combine inner and outer VLAN as a key */
|
||||
__u32 vlan_key = (vlans->id[1] << 16) | vlans->id[0];
|
||||
return vlan_key;
|
||||
}
|
||||
|
||||
/* XDP entry point: spread VLAN tagged traffic over CPUs via the cpumap.
 *
 * Frames that fail header parsing are aborted; ARP and non-VLAN frames
 * are passed to the normal stack. For all other frames the combined VLAN
 * key is hashed, reduced modulo the number of enabled CPUs (cpus_count),
 * translated to a CPU id through cpus_enabled and redirected via cpumap.
 */
SEC("xdp")
int xdp_cpumap_qinq(struct xdp_md *ctx)
{
	void *data     = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct collect_vlans vlans = { 0 };
	__u32 hash_key, vlan_key;
	struct ethhdr *eth;
	__u32 cpu_idx, cpu_dest = 0;
	__u32 *cpu_lookup;
	__u64 action;
	__u32 *cpu_max;
	__u32 n_enabled;
	__u32 key0 = 0;

	/* These keep track of the next header type and iterator pointer */
	struct hdr_cursor nh;
	int eth_type;
	nh.pos = data;

	/* Pass the address of 'eth' so the parser can point it at the header */
	eth_type = parse_ethhdr_vlan(&nh, data_end, &eth, &vlans);
	if (eth_type < 0) {
		action = XDP_ABORTED;
		goto out;
	}

	/* Keep ARP resolution working */
	if (eth_type == bpf_htons(ETH_P_ARP)) {
		action = XDP_PASS;
		goto out;
	}

	if (!proto_is_vlan(eth->h_proto)) {
		/* Skip non-VLAN frames */
		action = XDP_PASS;
		goto out;
	}

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	/* Read the count once; zero means userspace has not enabled any
	 * CPU yet and must not be used as modulo divisor.
	 */
	n_enabled = *cpu_max;
	if (n_enabled == 0)
		return XDP_ABORTED;

	/* Use inner+outer VLAN as key and hash based on max_cpus */
	vlan_key = extract_vlan_key(&vlans);
	hash_key = SuperFastHash((char *)&vlan_key, 4, INITVAL);
	cpu_idx = hash_key % n_enabled;

	/* To allow excluding some CPUs, a mapping table cpus_enabled
	 * translates cpu_idx to real CPU-id
	 */
	cpu_lookup = bpf_map_lookup_elem(&cpus_enabled, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	/* Notice: Userspace MUST insert entries into cpumap */
	action = bpf_redirect_map(&cpumap, cpu_dest, XDP_PASS);
out:
	return action;
}
|
Reference in New Issue
Block a user