/* SPDX-License-Identifier: GPL-2.0+ */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>
#include <stdbool.h>

#define VLAN_MAX_DEPTH 2
#include <xdp/parsing_helpers.h>

char _license[] SEC("license") = "GPL";

#define NS_PER_SEC 1000000000

/* Strategy: Shape at MAC (Medium Access Control) layer with Ethernet
 * overhead included.
 *
 * Production use-case is pacing traffic at 1Gbit/s wirespeed, using a
 * 10Gbit/s NIC, because a 1G end-user switch cannot handle bursts.
 *
 * (https://en.wikipedia.org/wiki/Interpacket_gap)
 *  12 bytes = interframe gap (IFG) 96 bit
 *
 * (https://en.wikipedia.org/wiki/Ethernet_frame)
 *   8 bytes = MAC preamble
 *   4 bytes = Ethernet Frame Check Sequence (FCS) CRC
 *  46 bytes = Minimum Payload size
 *
 *  14 bytes = Ethernet header
 *   8 bytes = 2x VLAN headers
 */

//#define RATE_IN_BITS	(1000 * 1000 * 1000ULL) /* Full 1Gbit/s */
#define RATE_IN_BITS	(990 * 1000 * 1000ULL)
//#define RATE_IN_BITS	(950 * 1000 * 1000ULL)

#define OVERHEAD	(12 + 8 + 4 + 8) /* 14 already in wire_len */
//#define OVERHEAD	(12 + 8 + 4) /* 14 already in wire_len */

#define ETH_MIN		(84)
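
/* Sanity check of the numbers above: the minimum Ethernet frame is
 * 46 (payload) + 14 (header) + 4 (FCS) = 64 bytes, plus 12 (IFG) and
 * 8 (preamble), giving the 84 bytes behind ETH_MIN. At 990 Mbit/s such
 * a frame occupies 84 * 8 / 990e6 s ~= 679 ns of wire time.
 */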

/* skb->len in bytes, thus convert rate to bytes */
#define RATE_IN_BYTES	(RATE_IN_BITS / 8)

/* Controlling how large the queue (in time) is allowed to grow */
#define T_HORIZON_DROP		(40 * 1000 * 1000ULL)
#define T_HORIZON_TARGET	(5 * 1000 * 1000ULL)
#define T_HORIZON_ECN		(1 * 1000 * 1000ULL)
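
/* For reference: at 990 Mbit/s (123.75 MB/s) these horizons correspond
 * to ~4.95 MB (40 ms drop), ~619 KB (5 ms target) and ~124 KB (1 ms ECN)
 * of queued bytes.
 */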

/* Codel: If queue exceeds target for more than one interval, start dropping */
#define T_EXCEED_INTERVAL	(100 * 1000 * 1000ULL) /* 100 ms in ns */

#define CODEL_TARGET		T_HORIZON_TARGET
#define CODEL_EXCEED_INTERVAL	T_EXCEED_INTERVAL
#include "codel_impl.h"
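
/* Interface assumed from codel_impl.h (as used in sched_departure below):
 * it provides struct codel_state plus codel_drop(state, t_queue_sz, now),
 * which returns true when the queue has stayed above CODEL_TARGET for
 * longer than CODEL_EXCEED_INTERVAL, signalling the packet should drop.
 */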

struct edt_val {
	__u64 rate;
	__u64 t_last;
	__u64 t_horizon_drop;
	__u64 t_horizon_ecn;
	struct codel_state codel;
} __aligned(64); /* Align struct to cache-line size to avoid false-sharing */

#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */
/* Use BTF format to create map */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 4096); /* Max possible VLANs */
	__type(key, __u32);
	__type(value, struct edt_val);
//	__uint(pinning, LIBBPF_PIN_BY_NAME);
} time_delay_map SEC(".maps");

#else
/* The (iproute2) tc tool (without libbpf support) uses another ELF map
 * layout than libbpf (struct bpf_map_def), see struct bpf_elf_map
 * from iproute2.
 */
#include "iproute2_compat.h"
struct bpf_elf_map SEC("maps") time_delay_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.size_key = sizeof(__u32),
	.size_value = sizeof(struct edt_val),
	.max_elem = 4096, /* Max possible VLANs */
//	.pinning = PIN_GLOBAL_NS,
};
#endif

/* Role of EDT (Earliest Departure Time) is to schedule departure of packets
 * to be sent in the future.
 */
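
/* Worked example (using the defaults above): at 990 Mbit/s the rate is
 * 123,750,000 bytes/s, so a full-size frame of 1538 bytes on the wire
 * costs t_xmit_ns = 1538 * NS_PER_SEC / 123750000 ~= 12428 ns, and each
 * such packet advances t_last ~12.4 us into the future.
 */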

static __always_inline int sched_departure(struct __sk_buff *skb, __u32 key)
{
	struct edt_val *edt;
	__u64 rate_in_bytes;
	__u64 t_queue_sz;
	__u64 t_xmit_ns;
	__u64 wire_len;
	__u64 t_next;
	__u64 t_curr;
	__u64 now;

	edt = bpf_map_lookup_elem(&time_delay_map, &key);
	if (!edt)
		return BPF_DROP;

	/* Compile-time default rate, overridable per-VLAN via the map */
	rate_in_bytes = RATE_IN_BYTES;
	if (edt->rate > 0)
		rate_in_bytes = edt->rate;

	/* Calculate the transmission time it takes to send this packet's
	 * bytes on the wire.
	 *
	 * Details on getting precise bytes on wire: skb->len does include
	 * the length of GRO/GSO segments, but not the segment headers that
	 * get added on transmit. Fortunately skb->wire_len at the TC-egress
	 * hook (not ingress) includes these headers.
	 * (See: qdisc_pkt_len_init())
	 */
	wire_len = skb->wire_len + OVERHEAD;
	wire_len = wire_len > ETH_MIN ? wire_len : ETH_MIN;

	t_xmit_ns = wire_len * NS_PER_SEC / rate_in_bytes;

	// now = bpf_ktime_get_ns();
	now = bpf_ktime_get_boot_ns(); /* Use same ktime as bpftrace */

	/* Allow others to set skb tstamp prior to us */
	t_curr = skb->tstamp;
	if (t_curr < now)
		t_curr = now;

	/* The 't_last' timestamp can be in the future. Packets scheduled
	 * ahead of this packet can be seen as the queue size measured in
	 * time, by correlating it to the 'now' timestamp.
	 */
	t_next = READ_ONCE(edt->t_last) + t_xmit_ns;

	/* If the packet doesn't get scheduled into the future, then there
	 * is no queue and we are not above the rate limit. Normally the
	 * packet would be sent immediately, moving the t_last timestamp
	 * forward to now.
	 *
	 * But in our use-case the traffic needs smoothing at an earlier
	 * stage, as bursts at lower rates can hurt the crappy switch.
	 * Thus, schedule the SKB transmission as now + t_xmit_ns.
	 */
	if (t_next <= t_curr) {
#if 1
		__u64 t_curr_next;
		__u32 min_len = 1538;

		/* Minimum delay for all packets if no time-queue */
		wire_len = (wire_len > min_len) ? wire_len : min_len;
		t_xmit_ns = wire_len * NS_PER_SEC / rate_in_bytes;
		t_curr_next = t_curr + t_xmit_ns;

		WRITE_ONCE(edt->t_last, t_curr_next);
		skb->tstamp = t_curr_next;
		skb->mark = 1; /* No queue - add minimum delay */
#else
		WRITE_ONCE(edt->t_last, t_curr);
#endif
		return BPF_OK;
	}
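
	/* Worked example: with min_len 1538 and RATE_IN_BITS at 990 Mbit/s
	 * (123,750,000 bytes/s), the minimum inter-packet delay above is
	 * 1538 * NS_PER_SEC / 123750000 ~= 12428 ns, i.e. roughly 80.4
	 * kpps per VLAN key while the time-queue is empty.
	 */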

	/* Calculate queue size measured in time */
	t_queue_sz = t_next - now;

	/* The FQ-pacing qdisc also has a horizon, but we cannot use that,
	 * because this BPF-prog will already have updated the map (t_last)
	 * for the packet and assumed it got its part of the bandwidth.
	 */
	if (t_queue_sz >= T_HORIZON_DROP /* edt->t_horizon_drop */)
		return BPF_DROP;

	/* If TCP didn't react to ECN marking, then start dropping some */
	// if (codel_drop(edt, t_queue_sz, now))
	if (codel_drop(&edt->codel, t_queue_sz, t_next))
		return BPF_DROP;

	skb->mark = 2; /* (time) queue exists - and small/below T_HORIZON_ECN */

	/* ECN marking horizon */
	if (t_queue_sz >= T_HORIZON_ECN) {
		skb->mark = 3; /* (time) queue exists - and is large */
		bpf_skb_ecn_set_ce(skb);
	}

	/* Advance "time queue" */
	WRITE_ONCE(edt->t_last, t_next);

	/* Schedule packet to be sent at a future timestamp */
	skb->tstamp = t_next;
	return BPF_OK;
}

static __always_inline
__u16 get_inner_qinq_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
{
	__u16 vlan_key;

	/* NIC can HW "offload" the outer VLAN, moving it to skb context */
	if (skb->vlan_present)
		vlan_key = vlans->id[0]; /* Inner vlan placed as first inline */
	else
		vlan_key = vlans->id[1]; /* All VLAN headers inline */

	return vlan_key;
}

static __always_inline
__u16 get_vlan(struct __sk_buff *skb, struct collect_vlans *vlans)
{
	__u16 vlan_key;

	/* Handle extracting VLAN if skb context has VLAN offloaded */
	if (skb->vlan_present)
		vlan_key = skb->vlan_tci & VLAN_VID_MASK;
	else
		vlan_key = vlans->id[0];

	return vlan_key;
}

static __always_inline
__u16 extract_vlan_key(struct __sk_buff *skb, struct collect_vlans *vlans)
{
	int QinQ = 0;

	/* The inner VLAN is the key to extract. But it is complicated
	 * due to NIC "offloaded" VLAN (skb->vlan_present). In case the
	 * BPF-prog is loaded on the outer VLAN net_device, the BPF-prog
	 * sees the inner VLAN as the first and only VLAN.
	 */
	if (skb->vlan_present) {
		if (vlans->id[0])
			QinQ = 1;
	} else {
		if (vlans->id[1])
			QinQ = 1;
	}

	if (QinQ)
		return get_inner_qinq_vlan(skb, vlans);
	else
		return get_vlan(skb, vlans);
}
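
/* Example (illustrative values, not from this file): for a QinQ frame
 * with outer VLAN 100 and inner VLAN 16, where the NIC offloaded the
 * outer tag, skb->vlan_present is set and vlans->id[0] holds the inner
 * tag, so the key becomes 16. Without offload both tags are parsed
 * inline and the inner tag is in vlans->id[1].
 */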

SEC("classifier") int tc_edt_vlan(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct collect_vlans vlans = { 0 };
	struct ethhdr *eth;
	int ret = BPF_OK;
	__u16 vlan_key;

	/* These keep track of the next header type and iterator pointer */
	struct hdr_cursor nh;
	int eth_type;
	nh.pos = data;

	eth_type = parse_ethhdr_vlan(&nh, data_end, &eth, &vlans);
	if (eth_type < 0)
		return BPF_DROP;

	/* Keep ARP resolution working */
	if (eth_type == bpf_htons(ETH_P_ARP)) {
		ret = BPF_OK;
		goto out;
	}

	if (!proto_is_vlan(eth->h_proto) && !skb->vlan_present) {
		/* Skip non-VLAN frames */
		return BPF_OK;
	}

	vlan_key = extract_vlan_key(skb, &vlans);

	/* Each (inner) VLAN id gets its own EDT pacing */
	return sched_departure(skb, vlan_key);

out:
	return ret;
}
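
/* Example usage (a sketch; object/dev names are hypothetical, not from
 * this file). Assuming this program compiles to edt_pacer_vlan.o, attach
 * it at TC egress. The skb->tstamp (EDT) values set above only take
 * effect when an fq qdisc is installed, e.g. one per TX-queue under mq:
 *
 *   tc qdisc replace dev eth0 root handle 7fff: mq
 *   tc qdisc add dev eth0 parent 7fff:1 fq  # repeat per TX-queue
 *   tc qdisc add dev eth0 clsact
 *   tc filter add dev eth0 egress bpf da obj edt_pacer_vlan.o sec classifier
 */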