Mirror of https://github.com/xdp-project/bpf-examples.git (synced 2024-05-06 15:54:53 +00:00)

Merge pull request #13 from simosund/pping_Add_Sampling

Add sampling to pping
pping/Makefile

@@ -1,34 +1,11 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)

USER_TARGETS := pping
TC_BPF_TARGETS := pping_kern_tc
BPF_TARGETS := pping_kern_xdp
BPF_TARGETS += $(TC_BPF_TARGETS)
BPF_TARGETS := pping_kern

LDFLAGS += -pthread

EXTRA_DEPS += config.mk pping.h pping_helpers.h
EXTRA_DEPS += pping.h

LIB_DIR = ../lib

include $(LIB_DIR)/common.mk
include config.mk

all: config.mk

config.mk: configure
	@sh configure

ifndef HAVE_TC_LIBBPF
# If the iproute2 'tc' tool doesn't understand BTF debug info
# use llvm-strip to remove this debug info from object file
#
# *BUT* cannot strip everything as it removes ELF elems needed for
# creating maps
#
.PHONY: strip_tc_obj
strip_tc_obj: ${TC_BPF_TARGETS:=.o}
	$(Q) echo "TC don't support libbpf - strip BTF info"
	$(Q) llvm-strip --no-strip-all --remove-section .BTF $?

all: strip_tc_obj
endif
pping/README.md

@@ -1,21 +1,99 @@
# PPing using XDP and TC-BPF

A re-implementation of [Kathie Nichols' passive ping
(pping)](https://github.com/pollere/pping) utility using XDP (on ingress) and
TC-BPF (on egress) for the packet capture logic.

## Simple description
Passive Ping (PPing) is a simple tool for passively measuring per-flow RTTs. It
can be used on end hosts as well as on any (BPF-capable Linux) device which can
see both directions of the traffic (e.g. a router or middlebox). Currently it
only works for TCP traffic which uses the TCP timestamp option, but could be
extended to also work with, for example, TCP seq/ACK numbers, the QUIC spinbit
and ICMP echo-reply messages. See the [TODO-list](./TODO.md) for more potential
features (which may or may not ever get implemented).

The fundamental logic of pping is to timestamp a pseudo-unique identifier for
outgoing packets, and then look for matches in the incoming packets. If a match
is found, the RTT is simply calculated as the time difference between the
current time and the stored timestamp.

This tool, just as Kathie's original pping implementation, uses TCP timestamps
as identifiers. For outgoing packets, the TSval (which is a timestamp in and of
itself) is timestamped. Incoming packets are then parsed for the TSecr, which
is the echoed TSval value from the receiver. The TCP timestamps are not
necessarily unique for every packet (they have a limited update frequency,
which appears to be 1000 Hz for modern Linux systems), so only the first
instance of an identifier is timestamped and matched against the first incoming
packet carrying that identifier. The mechanism to ensure that only the first
packet is timestamped and matched differs from the one in Kathie's pping, and
is further described in [SAMPLING_DESIGN](./SAMPLING_DESIGN.md).
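To make the identifier extraction concrete, below is a minimal sketch of pulling the TSval/TSecr fields out of a TCP option block. It is a plain C helper written for illustration only; the real parser in `pping_kern.c` has to do the equivalent work under the BPF verifier's bounds-checking rules, so its shape differs.

```c
#include <stdint.h>
#include <stddef.h>

#define TCP_OPT_EOL       0
#define TCP_OPT_NOP       1
#define TCP_OPT_TIMESTAMP 8 /* kind = 8, length = 10 */

/* Parse a TCP option block (opts, opts_len) and extract TSval/TSecr.
 * Returns 0 on success, -1 if no timestamp option was found. */
static int parse_tcp_timestamp(const uint8_t *opts, size_t opts_len,
			       uint32_t *tsval, uint32_t *tsecr)
{
	size_t i = 0;

	while (i + 1 < opts_len) {
		uint8_t kind = opts[i];

		if (kind == TCP_OPT_EOL)
			break;
		if (kind == TCP_OPT_NOP) {
			i++;
			continue;
		}

		uint8_t len = opts[i + 1];
		if (len < 2 || i + len > opts_len)
			break; /* malformed option list */

		if (kind == TCP_OPT_TIMESTAMP && len == 10) {
			/* TSval and TSecr are stored in network byte order */
			*tsval = (uint32_t)opts[i + 2] << 24 |
				 (uint32_t)opts[i + 3] << 16 |
				 (uint32_t)opts[i + 4] << 8 | opts[i + 5];
			*tsecr = (uint32_t)opts[i + 6] << 24 |
				 (uint32_t)opts[i + 7] << 16 |
				 (uint32_t)opts[i + 8] << 8 | opts[i + 9];
			return 0;
		}
		i += len;
	}
	return -1;
}
```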
## Design and technical description

![Design of eBPF pping](./eBPF_pping_design.png)

### Files:
- **pping.c:** Userspace program that loads and attaches the BPF programs, polls
  the perf-buffer `rtt_events` to print out RTT messages, and periodically cleans
  up the hash maps from old entries. Also passes user options to the BPF
  programs by setting a "global variable" (stored in the programs' .rodata
  section).
- **pping_kern.c:** Contains the BPF programs that are loaded on tc (egress) and
  XDP (ingress), as well as several common functions, a global constant `config`
  (set from userspace) and map definitions. The tc program `pping_egress()`
  parses outgoing packets for identifiers. If an identifier is found and the
  sampling strategy allows it, a timestamp for the packet is created in
  `packet_ts`. The XDP program `pping_ingress()` parses incoming packets for an
  identifier. If one is found, it looks up the `packet_ts` map for a match on
  the reverse flow (to match source/dest on egress). If there is a match, it
  calculates the RTT from the stored timestamp and deletes the entry. The
  calculated RTT (together with the flow tuple) is pushed to the perf-buffer
  `rtt_events`.
- **bpf_egress_loader.sh:** A shell script that's used by `pping.c` to set up a
  clsact qdisc and attach the `pping_egress()` program to egress using
  tc. **Note**: Unless your iproute2 comes with libbpf support, tc will use
  iproute's own loading mechanism when loading and attaching object files
  directly through the tc command line. To ensure that libbpf is always used to
  load `pping_egress()`, `pping.c` actually loads the program and pins it to
  `/sys/fs/bpf/pping/classifier`, and tc only attaches the pinned program.
- **functions.sh and parameters.sh:** Imported by `bpf_egress_loader.sh`.
- **pping.h:** Common header file included by `pping.c` and
  `pping_kern.c`. Contains some common structs used by both (which are part of
  the maps).
### BPF Maps:
- **flow_state:** A hash map storing some basic state for each flow, such as the
  last seen identifier for the flow and when the last timestamp entry for the
  flow was created. Entries are created by `pping_egress()`, and can be updated
  or deleted by both `pping_egress()` and `pping_ingress()`. Leftover entries
  are eventually removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`.
- **packet_ts:** A hash map storing a timestamp for a specific packet
  identifier. Entries are created by `pping_egress()` and removed by
  `pping_ingress()` if a match is found. Leftover entries are eventually
  removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`.
- **rtt_events:** A perf-buffer used by `pping_ingress()` to push calculated RTTs
  to `pping.c`, which continuously polls the map to print out the RTTs.
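For orientation, here is a sketch of what the shared types and BTF-defined maps could look like. Only the map names, the perf-buffer, the `last_timestamp`/last-seen-identifier concepts and the `__u64` timestamp values are taken from this commit; the remaining field names and the map sizes are illustrative assumptions, and the authoritative definitions live in `pping.h` and `pping_kern.c`.

```c
#include <linux/bpf.h>
#include <linux/in6.h>
#include <bpf/bpf_helpers.h>

struct network_tuple {
	/* Assumed layout: addresses, ports and protocol identifying the flow */
	struct in6_addr saddr, daddr; /* IPv4 could be stored as mapped IPv6 */
	__u16 sport, dport;
	__u16 proto;
};

struct packet_id {
	struct network_tuple flow;
	__u32 identifier; /* TSval on egress, TSecr on ingress */
};

struct flow_state {
	__u64 last_timestamp;  /* when the last packet_ts entry was created */
	__u32 last_identifier; /* last seen identifier for the flow */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384); /* illustrative size */
	__type(key, struct network_tuple);
	__type(value, struct flow_state);
} flow_state SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384); /* illustrative size */
	__type(key, struct packet_id);
	__type(value, __u64); /* nanosecond timestamp from bpf_ktime_get_ns() */
} packet_ts SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} rtt_events SEC(".maps");
```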
## Similar projects
Passively measuring the RTT for TCP traffic is not a novel concept, and there
exist a number of other tools that can do so. A good overview of how passive
RTT calculation using TCP timestamps (as in this project) works is provided in
[this paper](https://doi.org/10.1145/2523426.2539132) from 2013.

- [pping](https://github.com/pollere/pping): This project is largely a
  re-implementation of Kathie's pping, but by using BPF and XDP, as well as
  implementing some filtering logic, the hope is to be able to create an
  always-on tool that can scale well even to large numbers of massive flows.
- [ppviz](https://github.com/pollere/ppviz): Web-based visualization tool for
  the "machine-friendly" output from Kathie's pping tool. If/when we implement a
  similar machine-readable output option it should hopefully work with this
  implementation as well.
- [tcptrace](https://github.com/blitz/tcptrace): A post-processing tool which
  can analyze a tcpdump file and among other things calculate RTTs based on
  seq/ACK numbers (`-r` or `-R` flag).
- **Dapper**: A passive TCP data plane monitoring tool implemented in P4 which
  can among other things calculate the RTT based on matching seq/ACK
  numbers. [Paper](https://doi.org/10.1145/3050220.3050228). [Unofficial
  source](https://github.com/muhe1991/p4-programs-survey/tree/master/dapper).
- [P4 Tofino TCP RTT measurement](https://github.com/Princeton-Cabernet/p4-projects/tree/master/RTT-tofino):
  A passive TCP RTT monitor based on seq/ACK numbers implemented in P4 for
  Tofino programmable switches. [Paper](https://doi.org/10.1145/3405669.3405823).
pping/SAMPLING_DESIGN.md (new file)

@@ -0,0 +1,386 @@
# Introduction
This file is intended to document some of the challenges and design
decisions for adding sampling functionality to pping. It is partly
based on discussions from my supervisor meeting on 2021-02-22, and the
contents of my
[status slides](https://github.com/xdp-project/bpf-research/blob/master/meetings/simon/work_summary_20210222.org)
from that meeting.

## Purpose of sampling
The main purpose of adding sampling to pping is to prevent a massive
amount of timestamp entries from being created and quickly filling up
the map. That prevents new entries from being made until old ones can
be cleared out. A few large flows could thus "hog" all the map
entries, and prevent RTTs from other flows from being reported.
Sampling is therefore only used on egress, to determine if a timestamp
entry should be created for a packet. All packets on ingress will
still be parsed and checked for a potential match.

A secondary purpose of the sampling is to reduce the amount of output
that pping creates. In most circumstances, getting 1000 RTT reports
per second from a single flow will probably not be of interest, making
it less useful as a direct command-line utility.
# Considered sampling approaches
There are a number of different ways that the sampling could be
performed, for example:

- Sample every N packets per flow
  - Not very flexible
  - If the same rate is used for all flows, small flows would get very
    few samples.
- Sample completely random packets
  - Probably not a good idea...
- Head sampling (sample the first few packets of each flow)
  - Not suitable for monitoring long flows
  - RTT may change over the lifetime of a flow (due to buffer bloat)
- Probabilistic approach
  - Probabilistic approaches have been used to, for example, capture
    the most relevant information with limited overhead in INT
  - Could potentially be configured across multiple devices, so that
    pping on all of the devices together captures the most relevant
    traffic.
  - While it could potentially work well, I'm not very familiar with
    these approaches. It would take considerable research from my side
    to figure out how these methods work, how to best apply them to
    pping, and how to implement them in BPF.
- Use time-based sampling, limiting the rate at which entries
  can be created per flow
  - Intuitively simple
  - Should correspond quite well with the output you would probably
    want? I.e. a few entries per flow (regardless of how heavy they
    are) stating their current RTT.

I believe that time-based sampling is the most promising solution that
I can implement in a reasonable time. In the future, additional
sampling methods could potentially be added.
# Considerations for time-based sampling

## Time interval
For the time-based sampling, we must determine how the interval
between when new timestamp entries are allowed should be set.

### Static time interval
The simplest alternative is probably to use a static limit, e.g.
100 ms. This would provide a rather simple and predictable limit for
how often entries can be created (per flow), and how much output you
would get (per flow).

### RTT-based time interval
It may be desirable to use a more dynamic time limit, which is
adapted to each flow. One way to do this would be to base the time
limit on the RTT of the flow. Flows with short RTTs could be expected
to undergo more rapid changes than flows with long RTTs. This would
require keeping track of the RTT for each flow, for example as a moving
average. Additionally, some fallback is required before the RTT for
the flow is known.

### User configurable
Regardless of whether a static or RTT-based (or some other alternative)
interval is used, it should probably be user configurable (including
allowing the user to disable sampling entirely).
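As an illustration of the static, user-configurable variant, here is a minimal sketch of the per-flow check on egress. The `rate_limit` value (in nanoseconds) corresponds to the `-r`/`--rate-limit` option handled in `pping.c`; treating 0 as "sampling disabled" is an assumption made only for the sketch.

```c
#include <linux/types.h>
#include <stdbool.h>

/* last_timestamp points at the per-flow "time of last timestamp entry";
 * both values are nanoseconds from bpf_ktime_get_ns(). */
static inline bool rate_limit_allows(__u64 *last_timestamp, __u64 now,
				     __u64 rate_limit)
{
	if (rate_limit == 0 || now - *last_timestamp >= rate_limit) {
		/* Unsynchronized update; see the concurrency discussion
		 * later in this document. */
		*last_timestamp = now;
		return true;
	}
	return false;
}
```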
## Allowing bursts
It may be desirable to allow multiple packets in a short burst to be
timestamped. Due to delayed ACKs, one may only get a response for
every other packet. If the first packet is timestamped, and shortly
after a second packet is sent (that has a different identifier), then
the response will effectively be for the second packet, and no match
for the timestamped identifier will be found. For flows of the right
(or wrong, depending on how you look at it) intensity, slow enough
that consecutive packets are likely to get different TCP timestamps,
but fast enough for the delayed ACKs to acknowledge multiple packets,
you essentially have a 50/50 chance of timestamping the wrong
identifier and missing the RTT.

To handle this, you could timestamp multiple consecutive packets (with
unique identifiers) in a short burst. You probably need to limit this
burst both in number of packets and in the timeframe after the first
packet during which additional packets may be included. For example,
allow up to 3 packets (with different identifiers) to get a timestamp
for up to 4 ms after the first one of them is timestamped.

If allowing bursts of timestamps to be created, it may also be
desirable to rate limit the output, in order to not get a burst of
similar RTTs for the flow in the output (which may also skew averages
and other post-processing).
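A sketch of how such a burst allowance could sit on top of the rate limit. The `burst_state` struct and the `burst_packets`/`burst_window` parameters are hypothetical; this is a TODO item, not something the commit implements.

```c
#include <linux/types.h>
#include <stdbool.h>

struct burst_state {            /* hypothetical extension of the flow state */
	__u64 last_timestamp;   /* when the current burst started */
	__u32 burst_count;      /* timestamps created in the current burst */
};

/* Allow up to burst_packets timestamps within burst_window ns of the first
 * one, then fall back to the normal per-flow rate limit. */
static inline bool burst_allows(struct burst_state *bs, __u64 now,
				__u64 rate_limit, __u32 burst_packets,
				__u64 burst_window)
{
	if (now - bs->last_timestamp >= rate_limit) {
		bs->last_timestamp = now; /* start a new burst */
		bs->burst_count = 1;
		return true;
	}
	if (bs->burst_count < burst_packets &&
	    now - bs->last_timestamp < burst_window) {
		bs->burst_count++;
		return true;
	}
	return false;
}
```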
## Handling duplicate identifiers
TCP timestamps are only updated at a limited rate (e.g. 1000 Hz), and
thus you can have multiple consecutive packets with the same TCP
timestamp if they're sent fast enough. For the calculated RTT to be
correct, you should only match the first sent packet with a unique
identifier against the first received packet with a matching
identifier. Otherwise, you may for example have a sequence of 100
packets with the same identifier, and match the last of the outgoing
packets with the first incoming response, which may underestimate the
RTT by as much as the TCP timestamp update period (e.g. 1 ms).

### Current solution
The current solution to this is very simple. For outgoing packets, a
timestamp entry is only allowed to be created if no previous entry for
the identifier exists (realized through the `BPF_NOEXIST` flag to the
`bpf_map_update_elem()` call). Thus only the first outgoing packet with
a specific identifier can be timestamped. On ingress, the first packet
with a matching identifier will mark the timestamp as used, preventing
later incoming responses from using that timestamp. The reason why the
timestamp is marked as used rather than directly deleted once a
matching packet on ingress is found is to avoid the egress side
creating a new entry for the same identifier. This could occur if the
RTT is shorter than the TCP timestamp update period, and could result
in a massively underestimated RTT. This is the same mechanism that is
used in the original pping, as explained
[here](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L155-L168).
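The `BPF_NOEXIST` mechanism in code form, as a minimal sketch of the egress-side timestamping (assuming the `packet_ts` map and `struct packet_id` sketched in the README section above):

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* pid identifies (flow, TSval); packet_ts maps struct packet_id -> __u64 */
static __always_inline void try_timestamp_packet(struct packet_id *pid)
{
	__u64 now = bpf_ktime_get_ns();

	/* With BPF_NOEXIST the update fails with -EEXIST if an entry for
	 * this identifier already exists, so only the first outgoing packet
	 * carrying a given TSval gets a timestamp entry. */
	bpf_map_update_elem(&packet_ts, pid, &now, BPF_NOEXIST);
}
```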
### New solution
The current solution will no longer work if sampling is
introduced. With sampling, there's no guarantee that the sampled
packet will be the first outgoing packet in the sequence of packets
with identical timestamps. Thus the RTT may still be underestimated by
as much as the TCP timestamp update period (e.g. 1 ms). Therefore, a
new solution is needed. The current idea is to keep track of the
last-seen identifier of each flow, and only allow a packet to be
sampled for timestamping if its identifier differs from the last-seen
identifier of the flow, i.e. it is the first packet in the flow with
that identifier. This would perhaps be problematic with some sampling
approaches, as it requires that the packet is both the first one with
a specific identifier and elected for sampling. However, for the
rate-limited sampling it should work quite well, as it will only delay
the sampling until a packet with a new identifier is found.

Another advantage of this solution is that it should allow timestamp
entries to be deleted as soon as the matching response is found on
ingress. The timestamp no longer needs to be kept around only to
prevent egress from creating a new timestamp with the same identifier,
as this new solution should take care of that. This would help a lot
with keeping the map clean, as the timestamp entries would then
automatically be removed as soon as they are no longer needed. The
periodic cleanup from userspace would only be needed to remove the
occasional entries that were never matched for some reason (e.g. the
previously mentioned issue with delayed ACKs, the flow stopping, the
reverse flow not being observable, etc.).

One issue for this new solution is handling out-of-order packets. If a
packet with an older identifier is a bit delayed, it may arrive after
the last-seen identifier for the flow has been updated. This old
identifier may then be considered new (as it differs from the current
one), allowing an entry to be created for it and reverting the
last-seen identifier to a previous one. Additionally, this may allow
the next packet carrying what used to be the current identifier to
also be detected as a new identifier (as the out-of-order packet
reverted the last-seen identifier to an old one), creating a bit of a
ping-pong effect. For TCP timestamps this can easily be avoided by
simply requiring the new identifier to be greater than the last-seen
identifier (as TCP timestamps should be monotonically increasing).
That solution may however not be suitable if one wants to reuse this
mechanism for other protocols, such as the QUIC spinbit.
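A sketch of the proposed egress-side check. The `last_identifier` field name is an assumption; TCP timestamps are treated as monotonically increasing, which also filters the out-of-order case described above.

```c
#include <linux/types.h>
#include <stdbool.h>

/* Only sample a packet if it is the first one in the flow carrying this
 * identifier, i.e. the identifier is newer than the last one we saw. */
static inline bool is_new_identifier(__u32 *last_identifier, __u32 identifier)
{
	if (identifier > *last_identifier) {
		*last_identifier = identifier;
		return true;
	}
	return false;
}
```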
## Keeping per-flow information
In order for the per-flow rate limiting to work, some per-flow state
must be maintained, namely when the last timestamp for that flow was
added (so that one can check that sufficient time has passed before
attempting to add another one).

There may be some drawbacks with having to keep per-flow state. First
off, there will be some additional overhead from having to keep track
of this state. However, the savings from sampling the per-packet state
(the identifier/timestamp mappings) should hopefully cover the
overhead from keeping some per-flow state (and then some).

Another issue that is worth keeping in mind is that this flow state
will also need to be cleaned up eventually. This cleanup could be
handled in a similar manner as the current per-packet state is cleaned
up, by having the userspace process occasionally remove old
entries. In this case, entries could be deemed old if a long time has
passed since the last timestamp was added for the flow, e.g. 300
seconds as used by the [original
pping](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L117).
Additionally, one could parse the packets for indications that the
connection is being closed (e.g. TCP FIN/RST), and then directly delete
the flow state for that flow from the BPF programs, as sketched below.

Later on, this per-flow state could potentially be expanded to include
other information deemed useful (such as e.g. minimum and average RTT).
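A sketch of the FIN/RST idea (not implemented by this commit): the BPF programs could delete the flow entry directly when they see the connection being torn down, assuming the `flow_state` map keyed on `struct network_tuple` from the README section.

```c
#include <linux/tcp.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static __always_inline void maybe_delete_flow(struct tcphdr *tcph,
					      struct network_tuple *flow)
{
	/* FIN or RST means the flow is closing; drop its state right away
	 * instead of waiting for the userspace cleanup to time it out. */
	if (tcph->fin || tcph->rst)
		bpf_map_delete_elem(&flow_state, flow);
}
```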
### Alternative solution - keeping identifiers in the flow state
One idea that came up during my supervisor meeting was that instead
of creating timestamps for individual packets as is currently done,
you only create a number of timestamps for each flow. That is, instead
of creating per-packet entries in a separate map, you include a number
of timestamp/identifier pairs in the flow-state information itself.

While this would potentially be rather efficient, limiting the number
of timestamp entries to a fixed number per flow, I'm opposed to this
idea for a few reasons:

1. The sampling rate would be inherently tied to the RTT of the
   flow. While this may in many cases be desirable, it is not very
   flexible. It would also make it hard to, e.g., turn off sampling
   completely.
2. The number of timestamps per flow would need to be fixed and known
   at compile time(?), as the timestamp/identifier pairs are kept in
   the flow-state information itself, and the flow-state information
   needs to be of a known and fixed size when creating the maps. This
   may also result in some wasted space if the flow state includes
   slots for several timestamp/identifier pairs but most flows only
   make use of a few (although having an additional timestamp entry
   map of fixed size wastes space in a similar manner).
3. If a low number of timestamp/identifier pairs is kept, selecting
   an identifier that is missed (e.g. due to delayed ACKs) could
   effectively block new timestamps from being created (and thus
   RTTs from being calculated) for the flow for a relatively long
   while. New timestamps can only be created if you have a free slot,
   and you can only free a slot by either getting a matching reply or
   waiting until it can be safely assumed that the response was missed
   (and not just delayed).
## Graceful degradation
Another aspect I've been asked to consider is how to gracefully reduce
the functionality of pping as the timestamp entry map gets full (which,
with sufficiently many and heavy flows, is likely inevitable).

What currently happens when the timestamp entry map is full is simply
that no more entries can be made until some have been cleared
out. When adding a rate limit to the number of entries per flow, as
well as directly deleting entries upon match, I believe this is a
reasonable way to handle the situation. As soon as some RTTs for
current flows have been reported, space for new entries will be
available. The next outgoing packet with a valid identifier from any
flow that does not currently have to wait for its rate limit will then
be able to grab the next spot. However, this will still favor heavy
flows over smaller flows, as heavy flows are more likely to be able to
get a packet in first, but they will at least still be limited by the
rate limit, and thus have to take turns with other flows.

It is also worth noting that as per-flow state will need to be kept,
there will be a strict limit to the number of concurrent flows that can
be monitored, corresponding to the number of entries that can be held
by the map for the per-flow state. Once the per-flow state map is
full, no new flows can be added until one is cleared. It also doesn't
make sense to add packet timestamp entries for flows whose state
cannot be tracked, as the rate limit cannot be enforced then.

I see a few ways to more actively handle degradation, depending on what
one views as desirable:

1. One can attempt to monitor many flows, with infrequent RTT
   calculations for each. In this case, the userspace process that
   occasionally clears out the timestamp map could automatically
   decrease the per-flow rate limit if it detects that the map is
   getting close to full. That way, fewer entries would be generated
   per flow, and flows would be forced to take turns to a greater
   degree when the map is completely full. Similarly, one may wish to
   reduce the timeout for old flows if the per-flow map is getting
   full, in order to more quickly allow new flows to be monitored,
   keeping only the most active flows around.
2. One can attempt to monitor fewer flows, but with more frequent RTT
   calculations for each. The easiest way to achieve this is probably
   to set a smaller size on the per-flow map relative to the
   per-packet timestamp map. In case one wants to primarily focus on
   heavier flows, one could possibly add e.g. packet rate to the
   per-flow information, and remove the flows with the lowest packet
   rates.
3. One can attempt to focus on flows with shorter RTTs. Flows with
   shorter RTTs should make more efficient use of timestamp entries,
   as they can be cleared out faster, allowing for new entries. On the
   other hand, flows with longer RTTs may be the more interesting
   ones, as they are more likely to indicate some issue.
4. One can simply try to create a larger map (and copy over the old
   contents) once the map is approaching full. This way one can start
   with reasonably small maps, and only start eating up more memory if
   required.

While I'm leaning towards option 1 or 4, I don't have a very strong
personal opinion here, and would like some input on what others (who
may have more experience with network measurements) think are
reasonable trade-offs.
# Implementation considerations
There are of course several more practical considerations as well when
implementing the sampling, some of which I'll try to address here.

## "Global" vs PERCPU maps
In general, it's likely wise to go with PERCPU maps over "global" (aka
non-PERCPU) maps whenever possible, as PERCPU maps should be more
performant and also avoid concurrency issues. But this of course only
applies if the BPF programs don't need to act on global state.

For pping, I unfortunately see no way for the program to work with
only information local to each CPU core individually. The per-packet
identifiers and timestamps need to be global, as there is no guarantee
that the same core that timestamped a packet will process the response
for that packet. Likewise, the per-flow information, like the time of
the last timestamping, also needs to be global. Otherwise the rate
limit would be per-CPU-per-flow rather than just per-flow.

In practice, packets from the same flow are apparently often handled
by the same CPU, but this is not guaranteed, and therefore not
something we can rely on (especially as state needs to be shared by
both ingress and egress). One could try to use a CPU map to enforce
this behavior, but that is probably not a great idea.
## Concurrency issues
In addition to the performance hit, sharing global state between
multiple concurrent processes risks running into concurrency issues
unless access is synchronized in some manner (in BPF, the two
mechanisms I know of are atomic adds and spin locks for maps). With the
risk of misunderstanding the memory model for BPF programs (which,
from what I can tell, I'm probably not alone about), I will attempt to
explain the potential concurrency issues I see with the pping
implementation.

The current pping implementation already has a potential concurrency
issue. When matches for identifiers are found on ingress, a check is
performed to see if the timestamp has already been used or
not. Multiple packets processed in parallel could potentially all
find that the timestamp is unused before any of them manages to mark
it as used for the others. This may result in pping matching several
responses to a single timestamp entry and reporting the RTTs for each
of them. I do not consider this a significant issue, however: if they
are concurrent enough that they manage to look up the used status
before another has time to set it, the difference in time between them
should be very small, and they will therefore compute very similar
RTTs. So the reported RTTs should still be rather accurate, just
over-reported.

When adding sampling and per-flow information, some additional
concurrency issues may be encountered. Mainly, multiple packets may
find that they are allowed to add a new timestamp before they manage
to update the time of the last added timestamp in the per-flow
state. This may lead to multiple attempts at creating a timestamp at
approximately the same time. For TCP timestamps, all the identifiers
are likely to be identical (as the TCP timestamp itself is only
updated at a limited rate), so only one of them should succeed
anyway. If using identifiers that are more unique, however, such as
TCP sequence numbers, it's possible that a short burst of entries
would be created instead of just a single entry within the rate limit
for the flow.

Overall, I don't think these concurrency issues are that severe, as
they should still result in accurate RTTs, just some possible
over-reporting. I don't believe these issues warrant the performance
impact and potential code complexity of trying to synchronize
access. Furthermore, from what I understand, these concurrency issues
are not too likely to occur in practice, as packets from the same flow
are often processed on the same core.
## Global variable vs single-entry map
With BTF, it seems like BPF programs now support the use of global
variables. These global variables can supposedly be modified from user
space, and should from what I've heard also be more efficient than map
lookups. They therefore seem like a promising way to pass some
user-configured options from userspace to the BPF programs.

I would however need to look up how to actually use these, as the
examples I've seen have used a slightly different libbpf setup, where
a "skeleton" header file is compiled and imported into the userspace
program. There should be some examples in the [xdp-tools
repository](https://github.com/xdp-project/xdp-tools).

The alternative, I guess, would be to use a
`BPF_MAP_TYPE_PERCPU_ARRAY` with a single entry, which is filled in
with the user-configured options by the userspace program.
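To make the global-variable approach concrete: the BPF side declares a read-only global, and the userspace loader (as done by `init_rodata()` in the updated `pping.c` below) writes the user's options into the object's `.rodata` map after `bpf_object__open()` but before `bpf_object__load()`. The exact `bpf_config` layout beyond `rate_limit` is an assumption here.

```c
#include <bpf/libbpf.h>
#include <linux/types.h>
#include <string.h>
#include <errno.h>

/* --- BPF side (pping_kern.c): read-only config set before load ---
 *
 * struct bpf_config {
 *         __u64 rate_limit;  // ns between timestamp entries per flow
 * };
 * volatile const struct bpf_config config = {};
 *
 * ...and in the egress program:
 *         if (now - flow->last_timestamp < config.rate_limit)
 *                 return;
 */

/* --- Userspace side (pping.c): copy the values into .rodata --- */
static int init_rodata(struct bpf_object *obj, void *src, size_t size)
{
	struct bpf_map *map = NULL;

	bpf_object__for_each_map(map, obj) {
		if (strstr(bpf_map__name(map), ".rodata"))
			return bpf_map__set_initial_value(map, src, size);
	}
	return -EINVAL; /* no .rodata map found */
}
```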
pping/TODO.md

@@ -2,27 +2,60 @@

## Protocols
- [x] TCP (based on timestamp options)
  - [x] Skip pure ACKs for egress
    - Timestamping pure ACKs may lead to erroneous RTTs (ex. delay
      between the application attempting to send data being recognized as
      an RTT)
  - [ ] Add fallback to SEQ/ACK in case of no timestamp?
    - Some machines may not use TCP timestamps (either not supported
      at all, or disabled as in ex. Windows 10)
    - If one only considers SEQ/ACK (and doesn't check for SACK
      options), this could result in ex. delay from retransmission being
      included in the RTT
- [ ] ICMP (ex. Echo/Reply)
- [ ] QUIC (based on spinbit)

## General pping
- [x] Add sampling so that RTT is not calculated for every packet
      (with unique value) for large flows
  - [ ] Allow short bursts to bypass sampling in order to handle
        delayed ACKs
- [x] Keep some per-flow state
  - Will likely be needed for the sampling
  - [ ] Could potentially include keeping track of average RTT, which
        may be useful for some decisions (ex. how often to sample,
        when an entry can be removed, etc.)
  - [ ] Could potentially include keeping track of minimum RTT (as
        done by the original pping), ex. to track bufferbloat
  - [ ] Could potentially include keeping track of whether the flow is
        bi-directional
    - Original pping checks if the flow is bi-directional before adding
      timestamps, but this could miss shorter flows
- [ ] Dynamically grow the maps if they are starting to get full
- [ ] Improve map cleaning: Use a dynamic time to live for map entries
      based on the flow's RTT, instead of a static 10s limit
  - Keeping entries around for a long time allows the map to grow
    unnecessarily large, which slows down the cleaning and may block
    new entries
- [ ] Use libxdp to load XDP program
- [ ] Check for existence of reverse flow before adding to hash map (to avoid
      adding identifiers for flows that we can't see the reverse traffic for)?
  - This could miss the first few packets, so it would not be ideal for short flows
- [ ] Add support for automatically deleting entries if they are unique
  - TCP timestamps need to be kept for a while (because multiple packets can
    have the same timestamp), but identifiers that are unique per packet can be
    removed directly after the RTT is calculated
- [ ] Add option for machine-readable output (as original pping)
  - It may be a good idea to keep the same format as original pping,
    so that tools such as [ppviz](https://github.com/pollere/ppviz)
    work for both pping implementations.
- [ ] Add timestamps to output (as original pping)
- [ ] Add support for other hooks
  - Ex. TC-BPF on ingress instead of XDP?

## Done
- [x] Clean up commits and add signed-off-by tags
- [x] Add SPDX-license-identifier tags
- [x] Format C-code in kernel style
- [x] Use existing functionality to reuse maps by using BTF-defined
      maps
- [x] Use BTF-defined maps for TC-BPF as well if iproute has libbpf
      support
- [x] Cleanup: Unload TC-BPF at program shutdown, and unpin map - in
      the userspace part
- [x] Add IPv6 support
- [x] Refactor to support easy addition of other protocols
- [x] Load tc-bpf program with libbpf (only attach it with tc)
@@ -4,7 +4,7 @@
# License: GPLv2
#
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option and changed default BPF_OBJ
# of optional section (--sec) option or attaching a pinned program
#
basedir=`dirname $0`
source ${basedir}/functions.sh
@@ -64,6 +64,16 @@ function tc_egress_bpf_attach()
        egress bpf da obj "$objfile" sec "$section"
}

function tc_egress_bpf_attach_pinned()
{
    local device=${1:-$DEV}
    local pinprog=${2:-$PIN_PROG}
    shift 2

    call_tc filter add dev "$device" pref 2 handle 2 \
        egress bpf da pinned "$pinprog"
}

function tc_egress_list()
{
    local device=${1:-$DEV}
@@ -77,7 +87,12 @@ if [[ -n $REMOVE ]]; then
fi

tc_init_clsact $DEV
tc_egress_bpf_attach $DEV $BPF_OBJ $SEC

if [[ -n $PIN_PROG ]]; then
    tc_egress_bpf_attach_pinned $DEV $PIN_PROG
else
    tc_egress_bpf_attach $DEV $BPF_OBJ $SEC
fi

# Practical to list egress filters after setup.
# (It's a common mistake to have several progs loaded)
pping/configure (vendored, deleted)

@@ -1,29 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
# This is not an autoconf generated configure
#

# Output file which is input to Makefile
CONFIG=config.mk

# Assume tc is in $PATH
TC=tc

check_tc_libbpf()
{
    tc_version=$($TC -V)
    if echo $tc_version | grep -q libbpf; then
        libbpf_version=${tc_version##*libbpf }
        echo "HAVE_TC_LIBBPF:=y" >> $CONFIG
        echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG
        echo "yes ($libbpf_version)"
    else
        echo "no"
    fi
}

echo "# Generated config" > $CONFIG
echo "Detecting available features on system"

echo -n " - libbpf support in tc tool: "
check_tc_libbpf
Binary file changed, not shown (size before: 49 KiB, after: 54 KiB).
@@ -6,7 +6,7 @@
# License: GPLv2
#
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option
# of optional section (--sec) option or attaching a pinned program
#

function usage() {
@@ -20,12 +20,13 @@ function usage() {
    echo " -l | --list : (\$LIST) List setup after setup"
    echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load"
    echo " --sec : (\$SEC) Section of BPF-object to load"
    echo " --pinned : (\$PIN_PROG) Path to pinned program to attach"
    echo ""
}

# Using external program "getopt" to get --long-options
OPTIONS=$(getopt -o vshd:l \
    --long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec: -- "$@")
    --long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec:,pinned: -- "$@")
if (( $? != 0 )); then
    usage
    err 2 "Error calling getopt"
@@ -50,6 +51,11 @@ while true; do
            info "Section to load: $SEC" >&2
            shift 2
            ;;
        --pinned )
            export PIN_PROG=$2
            info "Pinned program path: $PIN_PROG" >&2
            shift 2
            ;;
        -v | --verbose)
            export VERBOSE=yes
            # info "Verbose mode: VERBOSE=$VERBOSE" >&2
pping/pping.c
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
static const char *__doc__ =
	"Passive Ping - monitor flow RTT based on TCP timestamps";

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h>
@@ -10,7 +13,9 @@
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <stdbool.h>
#include <limits.h>
#include <signal.h> // For detecting Ctrl-C
#include <sys/resource.h> // For setting rlmit
#include <sys/wait.h>
@@ -18,25 +23,18 @@
#include <time.h>
#include <pthread.h>

#include "pping.h" //key and value structs for the ts_start map
#include "pping.h" //common structs for user-space and BPF parts

#define NS_PER_SECOND 1000000000UL
#define NS_PER_MS 1000000UL

#define TCBPF_LOADER_SCRIPT "./bpf_egress_loader.sh"
#define PINNED_DIR "/sys/fs/bpf/tc/globals"
#define PPING_XDP_OBJ "pping_kern_xdp.o"
#define PPING_TCBPF_OBJ "pping_kern_tc.o"

#define XDP_FLAGS XDP_FLAGS_UPDATE_IF_NOEXIST

#define TS_MAP "ts_start"
#define MAP_CLEANUP_INTERVAL \
	(1 * NS_PER_SECOND) // Clean timestamp map once per second
#define TIMESTAMP_LIFETIME \
	(10 * NS_PER_SECOND) // Clear out entries from ts_start if they're over 10 seconds
	(10 * NS_PER_SECOND) // Clear out packet timestamps if they're over 10 seconds
#define FLOW_LIFETIME \
	(300 * NS_PER_SECOND) // Clear out flows if they're inactive over 300 seconds

#define PERF_BUFFER "rtt_events"
#define PERF_BUFFER_PAGES 64 // Related to the perf-buffer size?
#define PERF_POLL_TIMEOUT_MS 100
@@ -57,12 +55,146 @@

// Structure to contain arguments for clean_map (for passing to pthread_create)
struct map_cleanup_args {
	int map_fd;
	__u64 max_age_ns;
	__u64 cleanup_interval;
	int packet_map_fd;
	int flow_map_fd;
};

// Store configuration values in struct to easily pass around
struct pping_config {
	struct bpf_config bpf_config;
	__u64 cleanup_interval;
	int xdp_flags;
	int ifindex;
	char ifname[IF_NAMESIZE];
	bool force;
	char *object_path;
	char *ingress_sec;
	char *egress_sec;
	char *pin_dir;
	char *packet_map;
	char *flow_map;
	char *rtt_map;
};

static volatile int keep_running = 1;

static const struct option long_options[] = {
	{ "help", no_argument, NULL, 'h' },
	{ "interface", required_argument, NULL, 'i' }, // Name of interface to run on
	{ "rate-limit", required_argument, NULL, 'r' }, // Sampling rate-limit in ms
	{ "force", no_argument, NULL, 'f' }, // Detach any existing XDP program on interface
	{ "cleanup-interval", required_argument, NULL, 'c' }, // Map cleaning interval in s
	{ 0, 0, NULL, 0 }
};

/*
 * Copied from Jesper Dangaaard Brouer's traffic-pacing-edt example
 */
static void print_usage(char *argv[])
{
	int i;

	printf("\nDOCUMENTATION:\n%s\n", __doc__);
	printf("\n");
	printf(" Usage: %s (options-see-below)\n", argv[0]);
	printf(" Listing options:\n");
	for (i = 0; long_options[i].name != 0; i++) {
		printf(" --%-12s", long_options[i].name);
		if (long_options[i].flag != NULL)
			printf(" flag (internal value:%d)",
			       *long_options[i].flag);
		else
			printf(" short-option: -%c", long_options[i].val);
		printf("\n");
	}
	printf("\n");
}

static double parse_positive_double_argument(const char *str,
					     const char *parname)
{
	char *endptr;
	double val;
	val = strtod(str, &endptr);
	if (strlen(str) != endptr - str) {
		fprintf(stderr, "%s %s is not a valid number\n", parname, str);
		return -EINVAL;
	}
	if (val < 0) {
		fprintf(stderr, "%s must be positive\n", parname);
		return -EINVAL;
	}

	return val;
}

static int parse_arguments(int argc, char *argv[], struct pping_config *config)
{
	int err, opt;
	double rate_limit_ms, cleanup_interval_s;

	config->ifindex = 0;

	while ((opt = getopt_long(argc, argv, "hfi:r:c:", long_options,
				  NULL)) != -1) {
		switch (opt) {
		case 'i':
			if (strlen(optarg) > IF_NAMESIZE) {
				fprintf(stderr, "interface name too long\n");
				return -EINVAL;
			}
			strncpy(config->ifname, optarg, IF_NAMESIZE);

			config->ifindex = if_nametoindex(config->ifname);
			if (config->ifindex == 0) {
				err = -errno;
				fprintf(stderr,
					"Could not get index of interface %s: %s\n",
					config->ifname, strerror(err));
				return err;
			}
			break;
		case 'r':
			rate_limit_ms = parse_positive_double_argument(
				optarg, "rate-limit");
			if (rate_limit_ms < 0)
				return -EINVAL;

			config->bpf_config.rate_limit =
				rate_limit_ms * NS_PER_MS;
			break;
		case 'c':
			cleanup_interval_s = parse_positive_double_argument(
				optarg, "cleanup-interval");
			if (cleanup_interval_s < 0)
				return -EINVAL;

			config->cleanup_interval =
				cleanup_interval_s * NS_PER_SECOND;
			break;
		case 'f':
			config->force = true;
			break;
		case 'h':
			printf("HELP:\n");
			print_usage(argv);
			exit(0);
		default:
			fprintf(stderr, "Unknown option %s\n", argv[optind]);
			return -EINVAL;
		}
	}

	if (config->ifindex == 0) {
		fprintf(stderr,
			"An interface (-i or --interface) must be provided\n");
		return -EINVAL;
	}

	return 0;
}

void abort_program(int sig)
{
	keep_running = 0;
@@ -78,28 +210,48 @@ static int set_rlimit(long int lim)
	return !setrlimit(RLIMIT_MEMLOCK, &rlim) ? 0 : -errno;
}

static int mkdir_if_noexist(const char *path)
static int
bpf_obj_run_prog_pindir_func(struct bpf_object *obj, const char *prog_title,
			     const char *pin_dir,
			     int (*func)(struct bpf_program *, const char *))
{
	int ret;
	struct stat st = { 0 };
	int len;
	struct bpf_program *prog;
	char path[MAX_PATH_LEN];

	ret = stat(path, &st);
	if (ret) {
		if (errno != ENOENT)
			return -errno;
	len = snprintf(path, MAX_PATH_LEN, "%s/%s", pin_dir, prog_title);
	if (len < 0)
		return len;
	if (len > MAX_PATH_LEN)
		return -ENAMETOOLONG;

		return mkdir(path, 0700) ? -errno : 0;
	}
	return S_ISDIR(st.st_mode) ? 0 : -EEXIST;
	prog = bpf_object__find_program_by_title(obj, prog_title);
	if (!prog || libbpf_get_error(prog))
		return prog ? libbpf_get_error(prog) : -EINVAL;

	return func(prog, path);
}

static int bpf_obj_open(struct bpf_object **obj, const char *obj_path,
			char *map_path)
/*
 * Similar to bpf_object__pin_programs, but only attemps to pin a
 * single program prog_title at path pin_dir/prog_title
 */
static int bpf_obj_pin_program(struct bpf_object *obj, const char *prog_title,
			       const char *pin_dir)
{
	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
			    .pin_root_path = map_path);
	*obj = bpf_object__open_file(obj_path, map_path ? &opts : NULL);
	return libbpf_get_error(*obj);
	return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir,
					    bpf_program__pin);
}

/*
 * Similar to bpf_object__unpin_programs, but only attempts to unpin a
 * single program prog_title at path pin_dir/prog_title.
 */
static int bpf_obj_unpin_program(struct bpf_object *obj, const char *prog_title,
				 const char *pin_dir)
{
	return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir,
					    bpf_program__unpin);
}

static int xdp_detach(int ifindex, __u32 xdp_flags)
@@ -112,7 +264,6 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
{
	struct bpf_program *prog;
	int prog_fd;
	int err;

	if (sec)
		prog = bpf_object__find_program_by_title(obj, sec);
@@ -120,24 +271,28 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
		prog = bpf_program__next(NULL, obj);

	prog_fd = bpf_program__fd(prog);
	if (prog_fd < 0) {
		fprintf(stderr, "Could not find program to attach\n");
	if (prog_fd < 0)
		return prog_fd;
	}

	if (force) // detach current (if any) xdp-program first
		xdp_detach(ifindex, xdp_flags);

	err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
	if (err < 0) {
		fprintf(stderr, "Failed loading xdp-program on interface %d\n",
			ifindex);
		return err;
	}
	return 0;
	return bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
}

static int run_program(const char *path, char *const argv[])
static int init_rodata(struct bpf_object *obj, void *src, size_t size)
{
	struct bpf_map *map = NULL;
	bpf_object__for_each_map(map, obj) {
		if (strstr(bpf_map__name(map), ".rodata"))
			return bpf_map__set_initial_value(map, src, size);
	}

	// No .rodata map found
	return -EINVAL;
}

static int run_external_program(const char *path, char *const argv[])
{
	int status;
	int ret = -1;
@@ -157,18 +312,24 @@ static int run_program(const char *path, char *const argv[])
	}
}

static int tc_bpf_load(char *bpf_object, char *section, char *interface)
static int tc_bpf_attach(const char *pin_dir, const char *section,
			 char *interface)
{
	char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface, "--obj",
			       bpf_object, "--sec", section, NULL };
	return run_program(TCBPF_LOADER_SCRIPT, argv);
	char prog_path[MAX_PATH_LEN];
	char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface,
			       "--pinned", prog_path, NULL };

	if (snprintf(prog_path, sizeof(prog_path), "%s/%s", pin_dir, section) < 0)
		return -EINVAL;

	return run_external_program(TCBPF_LOADER_SCRIPT, argv);
}

static int tc_bpf_clear(char *interface)
{
	char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface,
			       "--remove", NULL };
	return run_program(TCBPF_LOADER_SCRIPT, argv);
	return run_external_program(TCBPF_LOADER_SCRIPT, argv);
}

/*
@@ -184,45 +345,82 @@
	return (__u64)t.tv_sec * NS_PER_SECOND + (__u64)t.tv_nsec;
}

static int clean_map(int map_fd, __u64 max_age)
static bool packet_ts_timeout(void *val_ptr, __u64 now)
{
	__u64 ts = *(__u64 *)val_ptr;
	if (now > ts && now - ts > TIMESTAMP_LIFETIME)
		return true;
	return false;
}

static bool flow_timeout(void *val_ptr, __u64 now)
{
	__u64 ts = ((struct flow_state *)val_ptr)->last_timestamp;
	if (now > ts && now - ts > FLOW_LIFETIME)
		return true;
	return false;
}

/*
 * Loops through all entries in a map, running del_decision_func(value, time)
 * on every entry, and deleting those for which it returns true.
 * On sucess, returns the number of entries deleted, otherwise returns the
 * (negative) error code.
 */
//TODO - maybe add some pointer to arguments for del_decision_func?
static int clean_map(int map_fd, size_t key_size, size_t value_size,
		     bool (*del_decision_func)(void *, __u64))
{
	int removed = 0;
	struct packet_id key, prev_key = { 0 };
	struct packet_timestamp value;
	void *key, *prev_key, *value;
	bool delete_prev = false;
	__u64 now_nsec = get_time_ns();

	int entries = 0; // Just for debug
	__u64 duration; // Just for debug
#ifdef DEBUG
	int entries = 0;
	__u64 duration;
#endif

	if (now_nsec == 0)
		return -errno;

	key = malloc(key_size);
	prev_key = malloc(key_size);
	value = malloc(value_size);
	if (!key || !prev_key || !value) {
		removed = -ENOMEM;
		goto cleanup;
	}

	// Cannot delete current key because then loop will reset, see https://www.bouncybouncy.net/blog/bpf_map_get_next_key-pitfalls/
	while (bpf_map_get_next_key(map_fd, &prev_key, &key) == 0) {
	while (bpf_map_get_next_key(map_fd, prev_key, key) == 0) {
		if (delete_prev) {
			bpf_map_delete_elem(map_fd, &prev_key);
			bpf_map_delete_elem(map_fd, prev_key);
			removed++;
			delete_prev = false;
		}

		if (bpf_map_lookup_elem(map_fd, &key, &value) == 0) {
			if (now_nsec > value.timestamp &&
			    now_nsec - value.timestamp > max_age) {
				delete_prev = true;
			}
		}
		if (bpf_map_lookup_elem(map_fd, key, value) == 0)
			delete_prev = del_decision_func(value, now_nsec);
#ifdef DEBUG
		entries++;
		prev_key = key;
#endif
		memcpy(prev_key, key, key_size);
	}
	if (delete_prev) {
		bpf_map_delete_elem(map_fd, &prev_key);
		bpf_map_delete_elem(map_fd, prev_key);
		removed++;
	}
#ifdef DEBUG
	duration = get_time_ns() - now_nsec;
	printf("Gone through %d entries and removed %d of them in %llu.%09llu s\n",
	       entries, removed, duration / NS_PER_SECOND,
	printf("%d: Gone through %d entries and removed %d of them in %llu.%09llu s\n",
	       map_fd, entries, removed, duration / NS_PER_SECOND,
	       duration % NS_PER_SECOND);
#endif
cleanup:
	free(key);
	free(prev_key);
	free(value);
	return removed;
}

@@ -230,11 +428,14 @@ static void *periodic_map_cleanup(void *args)
{
	struct map_cleanup_args *argp = args;
	struct timespec interval;
	interval.tv_sec = MAP_CLEANUP_INTERVAL / NS_PER_SECOND;
	interval.tv_nsec = MAP_CLEANUP_INTERVAL % NS_PER_SECOND;
	interval.tv_sec = argp->cleanup_interval / NS_PER_SECOND;
	interval.tv_nsec = argp->cleanup_interval % NS_PER_SECOND;

	while (keep_running) {
		clean_map(argp->map_fd, argp->max_age_ns);
		clean_map(argp->packet_map_fd, sizeof(struct packet_id),
			  sizeof(__u64), packet_ts_timeout);
		clean_map(argp->flow_map_fd, sizeof(struct network_tuple),
			  sizeof(struct flow_state), flow_timeout);
		nanosleep(&interval, NULL);
	}
	pthread_exit(NULL);
@@ -274,28 +475,134 @@ static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt)
|
||||
fprintf(stderr, "Lost %llu RTT events on CPU %d\n", lost_cnt, cpu);
|
||||
}
|
||||
|
||||
static int load_attach_bpfprogs(struct bpf_object **obj,
|
||||
struct pping_config *config, bool *tc_attached,
|
||||
bool *xdp_attached)
|
||||
{
|
||||
int err;
|
||||
|
||||
// Open and load ELF file
|
||||
*obj = bpf_object__open(config->object_path);
|
||||
err = libbpf_get_error(*obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed opening object file %s: %s\n",
|
||||
config->object_path, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
err = init_rodata(*obj, &config->bpf_config,
|
||||
sizeof(config->bpf_config));
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed pushing user-configration to %s: %s\n",
|
||||
config->object_path, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
err = bpf_object__load(*obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed loading bpf program in %s: %s\n",
|
||||
config->object_path, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
// Attach tc program
|
||||
err = bpf_obj_pin_program(*obj, config->egress_sec, config->pin_dir);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed pinning tc program to %s/%s: %s\n",
|
||||
config->pin_dir, config->egress_sec, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
err = tc_bpf_attach(config->pin_dir, config->egress_sec,
|
||||
config->ifname);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed attaching tc program on interface %s: %s\n",
|
||||
config->ifname, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
*tc_attached = true;
|
||||
|
||||
// Attach XDP program
|
||||
err = xdp_attach(*obj, config->ingress_sec, config->ifindex,
|
||||
config->xdp_flags, config->force);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed attaching XDP program to %s%s: %s\n",
|
||||
config->ifname,
|
||||
config->force ? "" : ", ensure no other XDP program is already running on interface",
|
||||
strerror(-err));
|
||||
return err;
|
||||
}
|
||||
*xdp_attached = true;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int setup_periodical_map_cleaning(struct bpf_object *obj,
|
||||
struct pping_config *config)
|
||||
{
|
||||
pthread_t tid;
|
||||
struct map_cleanup_args clean_args = {
|
||||
.cleanup_interval = config->cleanup_interval
|
||||
};
|
||||
int err;
|
||||
|
||||
clean_args.packet_map_fd =
|
||||
bpf_object__find_map_fd_by_name(obj, config->packet_map);
|
||||
if (clean_args.packet_map_fd < 0) {
|
||||
fprintf(stderr, "Could not get file descriptor of map %s: %s\n",
|
||||
config->packet_map,
|
||||
strerror(-clean_args.packet_map_fd));
|
||||
return clean_args.packet_map_fd;
|
||||
}
|
||||
|
||||
clean_args.flow_map_fd =
|
||||
bpf_object__find_map_fd_by_name(obj, config->flow_map);
|
||||
if (clean_args.flow_map_fd < 0) {
|
||||
fprintf(stderr, "Could not get file descriptor of map %s: %s\n",
|
||||
config->flow_map, strerror(-clean_args.flow_map_fd));
|
||||
return clean_args.packet_map_fd;
|
||||
}
|
||||
|
||||
err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed starting thread to perform periodic map cleanup: %s\n",
|
||||
strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int err = 0;
|
||||
int ifindex = 0;
|
||||
bool xdp_attached = false;
|
||||
|
||||
bool tc_attached = false;
|
||||
char map_path[MAX_PATH_LEN];
|
||||
bool xdp_attached = false;
|
||||
|
||||
struct bpf_object *obj = NULL;
|
||||
struct bpf_map *map = NULL;
|
||||
|
||||
pthread_t tid;
|
||||
struct map_cleanup_args clean_args;
|
||||
struct pping_config config = {
|
||||
.bpf_config = { .rate_limit = 100 * NS_PER_MS },
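// i.e. by default each flow gets at most one new timestamp entry every 100 ms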
|
||||
.cleanup_interval = 1 * NS_PER_SECOND,
|
||||
.object_path = "pping_kern.o",
|
||||
.ingress_sec = INGRESS_PROG_SEC,
|
||||
.egress_sec = EGRESS_PROG_SEC,
|
||||
.pin_dir = "/sys/fs/bpf/pping",
|
||||
.packet_map = "packet_ts",
|
||||
.flow_map = "flow_state",
|
||||
.rtt_map = "rtt_events",
|
||||
.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
|
||||
.force = false,
|
||||
};
|
||||
|
||||
struct perf_buffer *pb = NULL;
|
||||
struct perf_buffer_opts pb_opts;
|
||||
|
||||
// TODO - better argument parsing (more relevant as features are added)
|
||||
if (argc < 2) {
|
||||
printf("Usage: ./pping_user <dev>\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
struct perf_buffer_opts pb_opts = {
|
||||
.sample_cb = handle_rtt_event,
|
||||
.lost_cb = handle_missed_rtt_event,
|
||||
};
|
||||
|
||||
// Detect if running as root
|
||||
if (geteuid() != 0) {
|
||||
@@ -308,98 +615,41 @@ int main(int argc, char *argv[])
|
||||
if (err) {
|
||||
fprintf(stderr, "Could not set rlimit to infinity: %s\n",
|
||||
strerror(-err));
|
||||
goto cleanup;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
// Get index of interface
|
||||
ifindex = if_nametoindex(argv[1]);
|
||||
if (ifindex == 0) {
|
||||
err = -errno;
|
||||
fprintf(stderr, "Could not get index of interface %s: %s\n",
|
||||
argv[1], strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Load and attach the XDP program
|
||||
err = mkdir_if_noexist("/sys/fs/bpf/tc");
|
||||
err = parse_arguments(argc, argv, &config);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed creating directory %s in which to pin map: %s\n",
|
||||
"/sys/fs/bpf/tc", strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
err = bpf_obj_open(&obj, PPING_XDP_OBJ, PINNED_DIR);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed opening object file %s: %s\n",
|
||||
PPING_XDP_OBJ, strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Get map here to allow for unpinning at cleanup
|
||||
map = bpf_object__find_map_by_name(obj, TS_MAP);
|
||||
err = libbpf_get_error(map);
|
||||
if (err) {
|
||||
fprintf(stderr, "Could not find map %s in %s: %s\n", TS_MAP,
|
||||
PPING_XDP_OBJ, strerror(err));
|
||||
map = NULL;
|
||||
}
|
||||
|
||||
err = bpf_object__load(obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed loading XDP program: %s\n",
|
||||
fprintf(stderr, "Failed parsing arguments: %s\n",
|
||||
strerror(-err));
|
||||
goto cleanup;
|
||||
print_usage(argv);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
err = xdp_attach(obj, XDP_PROG_SEC, ifindex, XDP_FLAGS, false);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed attaching XDP program to %s: %s\n",
|
||||
argv[1], strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
xdp_attached = true;
|
||||
|
||||
// Load tc-bpf section on interface egress
|
||||
err = tc_bpf_load(PPING_TCBPF_OBJ, TCBPF_PROG_SEC, argv[1]);
|
||||
err = load_attach_bpfprogs(&obj, &config, &tc_attached, &xdp_attached);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Could not load section %s of %s on interface %s: %s\n",
|
||||
TCBPF_PROG_SEC, PPING_TCBPF_OBJ, argv[1],
|
||||
"Failed loading and attaching BPF programs in %s\n",
|
||||
config.object_path);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
err = setup_periodical_map_cleaning(obj, &config);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed setting up map cleaning: %s\n",
|
||||
strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
tc_attached = true;
|
||||
|
||||
// Set up the periodical map cleaning
|
||||
clean_args.max_age_ns = TIMESTAMP_LIFETIME;
|
||||
clean_args.map_fd = bpf_map__fd(map);
|
||||
if (clean_args.map_fd < 0) {
|
||||
fprintf(stderr,
|
||||
"Could not get file descriptor of map %s in object %s: %s\n",
|
||||
TS_MAP, PPING_XDP_OBJ, strerror(-clean_args.map_fd));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed starting thread to perform periodic map cleanup: %s\n",
|
||||
strerror(err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Set up perf buffer
|
||||
pb_opts.sample_cb = handle_rtt_event;
|
||||
pb_opts.lost_cb = handle_missed_rtt_event;
|
||||
|
||||
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj, PERF_BUFFER),
|
||||
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj,
|
||||
config.rtt_map),
|
||||
PERF_BUFFER_PAGES, &pb_opts);
|
||||
err = libbpf_get_error(pb);
|
||||
if (err) {
|
||||
pb = NULL;
|
||||
fprintf(stderr, "Failed to open perf buffer %s: %s\n",
|
||||
PERF_BUFFER, strerror(err));
|
||||
config.rtt_map, strerror(err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
@@ -419,30 +669,30 @@ int main(int argc, char *argv[])
|
||||
|
||||
cleanup:
|
||||
perf_buffer__free(pb);
|
||||
if (map && bpf_map__is_pinned(map)) {
|
||||
snprintf(map_path, sizeof(map_path), "%s/%s", PINNED_DIR,
|
||||
TS_MAP);
|
||||
err = bpf_map__unpin(map, map_path);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed unpinning map from %s: %s\n",
|
||||
map_path, strerror(-err));
|
||||
}
|
||||
}
|
||||
|
||||
if (xdp_attached) {
|
||||
err = xdp_detach(ifindex, XDP_FLAGS);
|
||||
if (err) {
|
||||
err = xdp_detach(config.ifindex, config.xdp_flags);
|
||||
if (err)
|
||||
fprintf(stderr,
|
||||
"Failed deatching program from ifindex %d: %s\n",
|
||||
ifindex, strerror(-err));
|
||||
}
|
||||
"Failed deatching program from ifindex %s: %s\n",
|
||||
config.ifname, strerror(-err));
|
||||
}
|
||||
|
||||
if (tc_attached) {
|
||||
err = tc_bpf_clear(argv[1]); //system(tc_cmd);
|
||||
if (err) {
|
||||
err = tc_bpf_clear(config.ifname);
|
||||
if (err)
|
||||
fprintf(stderr,
|
||||
"Failed removing tc-bpf program from interface %s: %s\n",
|
||||
argv[1], strerror(-err));
|
||||
}
|
||||
config.ifname, strerror(-err));
|
||||
}
|
||||
|
||||
if (obj && !libbpf_get_error(obj)) {
|
||||
err = bpf_obj_unpin_program(obj, config.egress_sec,
|
||||
config.pin_dir);
|
||||
if (err)
|
||||
fprintf(stderr,
|
||||
"Failed unpinning tc program from %s: %s\n",
|
||||
config.pin_dir, strerror(-err));
|
||||
}
|
||||
|
||||
return err != 0;
|
||||
|
@@ -5,8 +5,12 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/in6.h>
|
||||
|
||||
#define XDP_PROG_SEC "xdp"
|
||||
#define TCBPF_PROG_SEC "pping_egress"
|
||||
#define INGRESS_PROG_SEC "xdp"
|
||||
#define EGRESS_PROG_SEC "classifier"
|
||||
|
||||
struct bpf_config {
|
||||
__u64 rate_limit;
|
||||
};
|
||||
|
||||
/*
|
||||
* Struct that can hold the source or destination address for a flow (l3+l4).
|
||||
@@ -34,17 +38,17 @@ struct network_tuple {
|
||||
__u8 reserved;
|
||||
};
|
||||
|
||||
struct flow_state {
|
||||
__u64 last_timestamp;
|
||||
__u32 last_id;
|
||||
__u32 reserved;
|
||||
};
|
||||
|
||||
struct packet_id {
|
||||
struct network_tuple flow;
|
||||
__u32 identifier; //tsval for TCP packets
|
||||
};
|
||||
|
||||
struct packet_timestamp {
|
||||
__u64 timestamp;
|
||||
__u8 used;
|
||||
__u8 reserved[7];
|
||||
};
|
||||
|
||||
struct rtt_event {
|
||||
__u64 rtt;
|
||||
struct network_tuple flow;
|
||||
|
@@ -1,187 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef PPING_HELPERS_H
|
||||
#define PPING_HELPERS_H
|
||||
|
||||
#include <linux/bpf.h>
|
||||
#include <xdp/parsing_helpers.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/in6.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/tcp.h>
|
||||
|
||||
#include <stdbool.h>
|
||||
#include "pping.h"
|
||||
|
||||
#define AF_INET 2
|
||||
#define AF_INET6 10
|
||||
#define MAX_TCP_OPTIONS 10
|
||||
|
||||
/*
|
||||
* This struct keeps track of the data and data_end pointers from the xdp_md or
|
||||
* __skb_buff contexts, as well as the position parsed to so far, kept in nh.
|
||||
* Additionally, it also keeps the length of the entire packet, which together
|
||||
* with the other members can be used to determine ex. how much data each
|
||||
* header encloses.
|
||||
*/
|
||||
struct parsing_context {
|
||||
void *data; //Start of eth hdr
|
||||
void *data_end; //End of safe accessible area
|
||||
struct hdr_cursor nh; //Position to parse next
|
||||
__u32 pkt_len; //Full packet length (headers+data)
|
||||
};
|
||||
|
||||
/*
|
||||
* Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
|
||||
*/
|
||||
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
|
||||
{
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
|
||||
ipv6->in6_u.u6_addr32[3] = ipv4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parses the TSval and TSecr values from the TCP options field. If successful
|
||||
* the TSval and TSecr values will be stored at tsval and tsecr (in network
|
||||
* byte order).
|
||||
* Returns 0 if successful and -1 on failure
|
||||
*/
|
||||
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
|
||||
__u32 *tsecr)
|
||||
{
|
||||
int len = tcph->doff << 2;
|
||||
void *opt_end = (void *)tcph + len;
|
||||
__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
|
||||
__u8 i, opt, opt_size;
|
||||
|
||||
if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
|
||||
return -1;
|
||||
|
||||
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
|
||||
if (pos + 1 > opt_end || pos + 1 > data_end)
|
||||
return -1;
|
||||
|
||||
opt = *pos;
|
||||
if (opt == 0) // Reached end of TCP options
|
||||
return -1;
|
||||
|
||||
if (opt == 1) { // TCP NOP option - advance one byte
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Option > 1, should have option size
|
||||
if (pos + 2 > opt_end || pos + 2 > data_end)
|
||||
return -1;
|
||||
opt_size = *(pos + 1);
|
||||
|
||||
// Option-kind is TCP timestamp (yey!)
|
||||
if (opt == 8 && opt_size == 10) {
|
||||
if (pos + opt_size > opt_end ||
|
||||
pos + opt_size > data_end)
|
||||
return -1;
|
||||
*tsval = *(__u32 *)(pos + 2);
|
||||
*tsecr = *(__u32 *)(pos + 6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Some other TCP option - advance option-length bytes
|
||||
pos += opt_size;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
/*
|
||||
* Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
|
||||
* option. If successful, identifier will be set to TSval if is_egress, TSecr
|
||||
* otherwise, the port-members of saddr and daddr will be set to the TCP source
|
||||
* and dest, respectively, and 0 will be returned. On failure, -1 will be
|
||||
* returned.
|
||||
*/
|
||||
static int parse_tcp_identifier(struct parsing_context *ctx, bool is_egress,
|
||||
__be16 *sport, __be16 *dport, __u32 *identifier)
|
||||
{
|
||||
__u32 tsval, tsecr;
|
||||
struct tcphdr *tcph;
|
||||
|
||||
if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
|
||||
return -1;
|
||||
|
||||
// Do not timestamp pure ACKs
|
||||
if (is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len && !tcph->syn)
|
||||
return -1;
|
||||
|
||||
if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
|
||||
return -1; //Possible TODO, fall back on seq/ack instead
|
||||
|
||||
*sport = tcph->source;
|
||||
*dport = tcph->dest;
|
||||
*identifier = is_egress ? tsval : tsecr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempts to parse the packet limited by the data and data_end pointers,
|
||||
* to retrieve a protocol dependent packet identifier. If successful, the
|
||||
* pointed to p_id will be filled with parsed information from the
|
||||
* packet, and 0 will be returned. On failure, -1 will be returned.
|
||||
* If is_egress saddr and daddr will match source and destination of packet,
|
||||
* respectively, and identifier will be set to the identifier for an outgoing
|
||||
* packet. Otherwise, saddr and daddr will be swapped (will match
|
||||
* destination and source of packet, respectively), and identifier will be
|
||||
* set to the identifier of a response.
|
||||
*/
|
||||
static int parse_packet_identifier(struct parsing_context *ctx, bool is_egress,
|
||||
struct packet_id *p_id)
|
||||
{
|
||||
int proto, err;
|
||||
struct ethhdr *eth;
|
||||
struct iphdr *iph;
|
||||
struct ipv6hdr *ip6h;
|
||||
struct flow_address *saddr, *daddr;
|
||||
|
||||
// Switch saddr <--> daddr on ingress to match egress
|
||||
if (is_egress) {
|
||||
saddr = &p_id->flow.saddr;
|
||||
daddr = &p_id->flow.daddr;
|
||||
} else {
|
||||
saddr = &p_id->flow.daddr;
|
||||
daddr = &p_id->flow.saddr;
|
||||
}
|
||||
|
||||
proto = parse_ethhdr(&ctx->nh, ctx->data_end, ð);
|
||||
|
||||
// Parse IPv4/6 header
|
||||
if (proto == bpf_htons(ETH_P_IP)) {
|
||||
p_id->flow.ipv = AF_INET;
|
||||
proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
|
||||
} else if (proto == bpf_htons(ETH_P_IPV6)) {
|
||||
p_id->flow.ipv = AF_INET6;
|
||||
proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add new protocols here
|
||||
if (proto == IPPROTO_TCP) {
|
||||
err = parse_tcp_identifier(ctx, is_egress, &saddr->port,
|
||||
&daddr->port, &p_id->identifier);
|
||||
if (err)
|
||||
return -1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Successfully parsed packet identifier - fill in IP-addresses and return
|
||||
if (p_id->flow.ipv == AF_INET) {
|
||||
map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
|
||||
map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
|
||||
} else { // IPv6
|
||||
saddr->ip = ip6h->saddr;
|
||||
daddr->ip = ip6h->daddr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
361
pping/pping_kern.c
Normal file
@@ -0,0 +1,361 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/in6.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
|
||||
#ifdef IPV6_EXT_MAX_CHAIN
|
||||
#undef IPV6_EXT_MAX_CHAIN
|
||||
#endif
|
||||
#define IPV6_EXT_MAX_CHAIN 3
|
||||
|
||||
#include <xdp/parsing_helpers.h>
|
||||
#include "pping.h"
|
||||
|
||||
#define AF_INET 2
|
||||
#define AF_INET6 10
|
||||
#define MAX_TCP_OPTIONS 10
|
||||
|
||||
/*
|
||||
* This struct keeps track of the data and data_end pointers from the xdp_md or
|
||||
* __skb_buff contexts, as well as the position parsed to so far, kept in nh.
|
||||
* Additionally, it also keeps the length of the entire packet, which together
|
||||
* with the other members can be used to determine ex. how much data each
|
||||
* header encloses.
|
||||
*/
|
||||
struct parsing_context {
|
||||
void *data; //Start of eth hdr
|
||||
void *data_end; //End of safe accessible area
|
||||
struct hdr_cursor nh; //Position to parse next
|
||||
__u32 pkt_len; //Full packet length (headers+data)
|
||||
bool is_egress; //Is packet on egress or ingress?
|
||||
};
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
// Global config struct - set from userspace
|
||||
static volatile const struct bpf_config config = {};
|
||||
|
||||
// Map definitions
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, struct packet_id);
|
||||
__type(value, __u64);
|
||||
__uint(max_entries, 16384);
|
||||
} packet_ts SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, struct network_tuple);
|
||||
__type(value, struct flow_state);
|
||||
__uint(max_entries, 16384);
|
||||
} flow_state SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(__u32));
|
||||
__uint(value_size, sizeof(__u32));
|
||||
} rtt_events SEC(".maps");
|
||||
|
||||
// Help functions
|
||||
|
||||
/*
|
||||
* Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
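* (e.g. 192.0.2.1 is mapped to ::ffff:192.0.2.1)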
|
||||
*/
|
||||
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
|
||||
{
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
|
||||
ipv6->in6_u.u6_addr32[3] = ipv4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parses the TSval and TSecr values from the TCP options field. If successful
|
||||
* the TSval and TSecr values will be stored at tsval and tsecr (in network
|
||||
* byte order).
|
||||
* Returns 0 if successful and -1 on failure
|
||||
*/
|
||||
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
|
||||
__u32 *tsecr)
|
||||
{
|
||||
int len = tcph->doff << 2;
|
||||
void *opt_end = (void *)tcph + len;
|
||||
__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
|
||||
__u8 i, opt;
|
||||
volatile __u8
|
||||
opt_size; // Seems to ensure it's always read off the stack as a u8
|
||||
|
||||
if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
|
||||
return -1;
|
||||
#pragma unroll //temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
|
||||
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
|
||||
if (pos + 1 > opt_end || pos + 1 > data_end)
|
||||
return -1;
|
||||
|
||||
opt = *pos;
|
||||
if (opt == 0) // Reached end of TCP options
|
||||
return -1;
|
||||
|
||||
if (opt == 1) { // TCP NOP option - advance one byte
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Option > 1, should have option size
|
||||
if (pos + 2 > opt_end || pos + 2 > data_end)
|
||||
return -1;
|
||||
opt_size = *(pos + 1);
|
||||
if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
|
||||
return -1;
|
||||
|
||||
// Option-kind is TCP timestamp (yey!)
|
||||
if (opt == 8 && opt_size == 10) {
|
||||
if (pos + 10 > opt_end || pos + 10 > data_end)
|
||||
return -1;
|
||||
*tsval = *(__u32 *)(pos + 2);
|
||||
*tsecr = *(__u32 *)(pos + 6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Some other TCP option - advance option-length bytes
|
||||
pos += opt_size;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
|
||||
* option. If successful, identifier will be set to TSval on egress and TSecr
|
||||
* otherwise, the port-members of saddr and daddr will be set to the TCP source
|
||||
* and dest, respectively, and 0 will be returned. On failure, -1 will be
|
||||
* returned. Additionally, if the connection is closing (FIN or RST flag), sets
|
||||
* flow_closing to true.
|
||||
*/
|
||||
static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
|
||||
__be16 *dport, bool *flow_closing,
|
||||
__u32 *identifier)
|
||||
{
|
||||
__u32 tsval, tsecr;
|
||||
struct tcphdr *tcph;
|
||||
|
||||
if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
|
||||
return -1;
|
||||
|
||||
// Check if connection is closing
|
||||
if (tcph->fin || tcph->rst) {
|
||||
*flow_closing = true;
|
||||
/* bpf_printk("Detected connection closing on %d\n", */
|
||||
/* ctx->is_egress); //Upsets verifier? */
|
||||
}
|
||||
|
||||
// Do not timestamp pure ACKs
|
||||
if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
|
||||
!tcph->syn)
|
||||
return -1;
|
||||
|
||||
if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
|
||||
return -1; //Possible TODO, fall back on seq/ack instead
|
||||
|
||||
*sport = tcph->source;
|
||||
*dport = tcph->dest;
|
||||
*identifier = ctx->is_egress ? tsval : tsecr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempts to parse the packet limited by the data and data_end pointers,
|
||||
* to retrieve a protocol dependent packet identifier. If successful, the
|
||||
* pointed to p_id will be filled with parsed information from the
|
||||
* packet, and 0 will be returned. On failure, -1 will be returned.
|
||||
* If is_egress saddr and daddr will match source and destination of packet,
|
||||
* respectively, and identifier will be set to the identifier for an outgoing
|
||||
* packet. Otherwise, saddr and daddr will be swapped (will match
|
||||
* destination and source of packet, respectively), and identifier will be
|
||||
* set to the identifier of a response.
|
||||
*/
|
||||
static int parse_packet_identifier(struct parsing_context *ctx,
|
||||
struct packet_id *p_id, bool *flow_closing)
|
||||
{
|
||||
int proto, err;
|
||||
struct ethhdr *eth;
|
||||
struct iphdr *iph;
|
||||
struct ipv6hdr *ip6h;
|
||||
struct flow_address *saddr, *daddr;
|
||||
|
||||
// Switch saddr <--> daddr on ingress to match egress
|
||||
if (ctx->is_egress) {
|
||||
saddr = &p_id->flow.saddr;
|
||||
daddr = &p_id->flow.daddr;
|
||||
} else {
|
||||
saddr = &p_id->flow.daddr;
|
||||
daddr = &p_id->flow.saddr;
|
||||
}
|
||||
|
||||
proto = parse_ethhdr(&ctx->nh, ctx->data_end, ð);
|
||||
|
||||
// Parse IPv4/6 header
|
||||
if (proto == bpf_htons(ETH_P_IP)) {
|
||||
p_id->flow.ipv = AF_INET;
|
||||
proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
|
||||
} else if (proto == bpf_htons(ETH_P_IPV6)) {
|
||||
p_id->flow.ipv = AF_INET6;
|
||||
proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add new protocols here
|
||||
if (proto == IPPROTO_TCP) {
|
||||
err = parse_tcp_identifier(ctx, &saddr->port, &daddr->port,
|
||||
flow_closing, &p_id->identifier);
|
||||
if (err)
|
||||
return -1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Successfully parsed packet identifier - fill in IP-addresses and return
|
||||
if (p_id->flow.ipv == AF_INET) {
|
||||
map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
|
||||
map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
|
||||
} else { // IPv6
|
||||
saddr->ip = ip6h->saddr;
|
||||
daddr->ip = ip6h->daddr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Programs
|
||||
|
||||
// TC-BPF for parsing packet identifier from egress traffic and add to map
|
||||
SEC(EGRESS_PROG_SEC)
|
||||
int pping_egress(struct __sk_buff *skb)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
__u64 p_ts;
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)skb->data,
|
||||
.data_end = (void *)(long)skb->data_end,
|
||||
.pkt_len = skb->len,
|
||||
.nh = { .pos = pctx.data },
|
||||
.is_egress = true,
|
||||
};
|
||||
bool flow_closing = false;
|
||||
struct flow_state *f_state;
|
||||
struct flow_state new_state = { 0 };
|
||||
|
||||
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
|
||||
goto out;
|
||||
|
||||
// If the flow is closing, delete it and do not create a timestamp entry
|
||||
if (flow_closing) {
|
||||
bpf_map_delete_elem(&flow_state, &p_id.flow);
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Check flow state
|
||||
f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
|
||||
if (!f_state) { // No previous state - attempt to create it
|
||||
bpf_map_update_elem(&flow_state, &p_id.flow, &new_state,
|
||||
BPF_NOEXIST);
|
||||
f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
|
||||
if (!f_state)
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Check if identifier is new
|
||||
/* The gap between checking and updating last_id may cause concurrency
|
||||
* issues where multiple packets may simultaneously think they are the
|
||||
* first with a new identifier. As long as all of the identifiers are
|
||||
* the same though, only one should be able to create a timestamp entry.
|
||||
|
||||
* A bigger issue is that older identifiers (for example due to
|
||||
* out-of-order packets) may pass this check and update the current
|
||||
* identifier to an old one. This means that both the packet with the
|
||||
* old identifier itself as well as the next packet with the current
|
||||
* identifier may be considered packets with new identifiers (even if
|
||||
* both have been seen before). For TCP timestamps this could be
|
||||
* prevented by changing the check to '>=' instead, but it may not be
|
||||
* suitable for other protocols, such as QUIC and its spinbit.
|
||||
*
|
||||
* For now, just hope that the rate limit saves us from creating an
|
||||
* incorrect timestamp. That may however also fail, either due to
|
||||
* it happening at a time when it's not limited by rate sampling, or
|
||||
* because of the rate check failing due to concurrency issues.
|
||||
*/
|
||||
if (f_state->last_id == p_id.identifier)
|
||||
goto out;
|
||||
f_state->last_id = p_id.identifier;
|
||||
|
||||
// Check rate-limit
|
||||
/*
|
||||
* The window between checking and updating last_timestamp may cause
|
||||
* concurrency issues, where multiple packets simultaneously pass the
|
||||
* rate limit. However, as long as they have the same identifier, only
|
||||
* a single timestamp entry should successfully be created.
|
||||
*/
|
||||
p_ts = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
|
||||
if (p_ts < f_state->last_timestamp ||
|
||||
p_ts - f_state->last_timestamp < config.rate_limit)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Updates attempt at creating timestamp, even if creation of timestamp
|
||||
* fails (due to map being full). This should make the competition for
|
||||
* the next available map slot somewhat fairer between heavy and sparse
|
||||
* flows.
|
||||
*/
|
||||
f_state->last_timestamp = p_ts;
|
||||
bpf_map_update_elem(&packet_ts, &p_id, &p_ts, BPF_NOEXIST);
|
||||
|
||||
out:
|
||||
return BPF_OK;
|
||||
}
|
||||
|
||||
// XDP program for parsing identifier in ingress traffic and check for match in map
|
||||
SEC(INGRESS_PROG_SEC)
|
||||
int pping_ingress(struct xdp_md *ctx)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
__u64 *p_ts;
|
||||
struct rtt_event event = { 0 };
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)ctx->data,
|
||||
.data_end = (void *)(long)ctx->data_end,
|
||||
.pkt_len = pctx.data_end - pctx.data,
|
||||
.nh = { .pos = pctx.data },
|
||||
.is_egress = false,
|
||||
};
|
||||
bool flow_closing = false;
|
||||
|
||||
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
|
||||
goto out;
|
||||
|
||||
// Delete flow, but allow final attempt at RTT calculation
|
||||
if (flow_closing)
|
||||
bpf_map_delete_elem(&flow_state, &p_id.flow);
|
||||
|
||||
p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
|
||||
if (!p_ts)
|
||||
goto out;
|
||||
|
||||
event.rtt = bpf_ktime_get_ns() - *p_ts;
|
||||
/*
|
||||
* Attempt to delete timestamp entry as soon as RTT is calculated.
|
||||
* But could have potential concurrency issue where multiple packets
|
||||
* manage to match against the identifier before it can be deleted.
|
||||
*/
|
||||
bpf_map_delete_elem(&packet_ts, &p_id);
|
||||
|
||||
__builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple));
|
||||
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event,
|
||||
sizeof(event));
|
||||
|
||||
out:
|
||||
return XDP_PASS;
|
||||
}
|
@@ -1,51 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <iproute2/bpf_elf.h>
|
||||
|
||||
#include "pping.h"
|
||||
#include "pping_helpers.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(key_size, sizeof(struct packet_id));
|
||||
__uint(value_size, sizeof(struct packet_timestamp));
|
||||
__uint(max_entries, 16384);
|
||||
__uint(pinning, LIBBPF_PIN_BY_NAME);
|
||||
} ts_start SEC(".maps");
|
||||
|
||||
#else
|
||||
struct bpf_elf_map SEC("maps") ts_start = {
|
||||
.type = BPF_MAP_TYPE_HASH,
|
||||
.size_key = sizeof(struct packet_id),
|
||||
.size_value = sizeof(struct packet_timestamp),
|
||||
.max_elem = 16384,
|
||||
.pinning = PIN_GLOBAL_NS,
|
||||
};
|
||||
#endif
|
||||
|
||||
// TC-BPF for parsing packet identifier from egress traffic and add to map
|
||||
SEC(TCBPF_PROG_SEC)
|
||||
int tc_bpf_prog_egress(struct __sk_buff *skb)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
struct packet_timestamp p_ts = { 0 };
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)skb->data,
|
||||
.data_end = (void *)(long)skb->data_end,
|
||||
.pkt_len = skb->len,
|
||||
.nh = { .pos = pctx.data },
|
||||
};
|
||||
|
||||
if (parse_packet_identifier(&pctx, true, &p_id) < 0)
|
||||
goto end;
|
||||
|
||||
p_ts.timestamp = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
|
||||
bpf_map_update_elem(&ts_start, &p_id, &p_ts, BPF_NOEXIST);
|
||||
|
||||
end:
|
||||
return BPF_OK;
|
||||
}
|
@@ -1,63 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
#include "pping.h"
|
||||
#include "pping_helpers.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(key_size, sizeof(struct packet_id));
|
||||
__uint(value_size, sizeof(struct packet_timestamp));
|
||||
__uint(max_entries, 16384);
|
||||
__uint(pinning, LIBBPF_PIN_BY_NAME);
|
||||
} ts_start SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(__u32));
|
||||
__uint(value_size, sizeof(__u32));
|
||||
} rtt_events SEC(".maps");
|
||||
|
||||
// XDP program for parsing identifier in ingress traffic and check for match in map
|
||||
SEC(XDP_PROG_SEC)
|
||||
int xdp_prog_ingress(struct xdp_md *ctx)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
struct packet_timestamp *p_ts;
|
||||
struct rtt_event event = { 0 };
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)ctx->data,
|
||||
.data_end = (void *)(long)ctx->data_end,
|
||||
.pkt_len = pctx.data_end - pctx.data,
|
||||
.nh = { .pos = pctx.data },
|
||||
};
|
||||
|
||||
if (parse_packet_identifier(&pctx, false, &p_id) < 0)
|
||||
goto end;
|
||||
|
||||
p_ts = bpf_map_lookup_elem(&ts_start, &p_id);
|
||||
|
||||
// Only calculate RTT for first packet with matching identifier
|
||||
if (p_ts && p_ts->used == 0) {
|
||||
/*
|
||||
* As used is not set atomically with the lookup, could
|
||||
* potentially have multiple "first" packets (on different
|
||||
* CPUs), but all those should then also have very similar RTT,
|
||||
* so don't consider it a significant issue
|
||||
*/
|
||||
p_ts->used = 1;
|
||||
// TODO - Optional delete of entry (if identifier is guaranteed unique)
|
||||
|
||||
__builtin_memcpy(&event.flow, &p_id.flow,
|
||||
sizeof(struct network_tuple));
|
||||
event.rtt = bpf_ktime_get_ns() - p_ts->timestamp;
|
||||
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU,
|
||||
&event, sizeof(event));
|
||||
}
|
||||
|
||||
end:
|
||||
return XDP_PASS;
|
||||
}
|