Merge pull request #13 from simosund/pping_Add_Sampling

Add sampling to pping
Toke Høiland-Jørgensen
2021-04-23 14:51:37 +02:00
committed by GitHub
14 changed files with 1342 additions and 562 deletions


@@ -1,34 +1,11 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
USER_TARGETS := pping
TC_BPF_TARGETS := pping_kern_tc
BPF_TARGETS := pping_kern_xdp
BPF_TARGETS += $(TC_BPF_TARGETS)
BPF_TARGETS := pping_kern
LDFLAGS += -pthread
EXTRA_DEPS += config.mk pping.h pping_helpers.h
EXTRA_DEPS += pping.h
LIB_DIR = ../lib
include $(LIB_DIR)/common.mk
include config.mk
all: config.mk
config.mk: configure
@sh configure
ifndef HAVE_TC_LIBBPF
# If the iproute2 'tc' tool doesn't understand BTF debug info
# use llvm-strip to remove this debug info from object file
#
# *BUT* cannot strip everything as it removes ELF elems needed for
# creating maps
#
.PHONY: strip_tc_obj
strip_tc_obj: ${TC_BPF_TARGETS:=.o}
$(Q) echo "TC doesn't support libbpf - strip BTF info"
$(Q) llvm-strip --no-strip-all --remove-section .BTF $?
all: strip_tc_obj
endif


@@ -1,21 +1,99 @@
# PPing using XDP and TC-BPF
A re-implementation of [Kathie Nichols' passive ping
(pping)](https://github.com/pollere/pping) utility using XDP (on ingress) and
TC-BPF (on egress) for the packet capture logic.
## Simple description
Passive Ping (PPing) makes use of the TCP Timestamp option to calculate the RTT for TCP traffic passing through.
PPing can be used to measure RTTs on end hosts or any device which sees both directions of the TCP flow.
Passive Ping (PPing) is a simple tool for passively measuring per-flow RTTs. It
can be used on end hosts as well as any (BPF-capable Linux) device which can see
both directions of the traffic (e.g. a router or middlebox). Currently it only works
for TCP traffic which uses the TCP timestamp option, but could be extended to
also work with for example TCP seq/ACK numbers, the QUIC spinbit and ICMP
echo-reply messages. See the [TODO-list](./TODO.md) for more potential features
(which may or may not ever get implemented).
For outgoing packets, it checks for the TCP timestamp TSval in the TCP header. If it finds one it creates a timestamp
for when it saw that TSval in a particular flow. On incoming packets it parses the TCP timestamp TSecr (which
is the TSval echoed by the receiving host) and checks if it has seen any previous outgoing packets with that TCP
timestamp. If it has, an RTT is calculated as the difference in time between when it saw an outgoing packet
with a TSval, and when it received an incoming packet from the reverse flow with a matching TSecr.
The fundamental logic of pping is to timestamp a pseudo-unique identifier for
outgoing packets, and then look for matches in the incoming packets. If a match
is found, the RTT is simply calculated as the time difference between the
current time and the timestamp.
Note that TCP timestamps may not be unique for every packet in a flow, therefore it only matches the first
outgoing packet with a particular TSval with the first incoming packet with a matching TSecr. Duplicate
TSvals/TSecrs are ignored.
This tool, just as Kathie's original pping implementation, uses TCP timestamps
as identifiers. For outgoing packets, the TSval (which is a timestamp in and of
itself) is timestamped. Incoming packets are then parsed for the TSecr, which
is the echoed TSval from the receiver. The TCP timestamps are not
necessarily unique for every packet (they have a limited update frequency,
apparently 1000 Hz on modern Linux systems), so only the first instance of
an identifier is timestamped, and matched against the first incoming packet with
the identifier. The mechanism to ensure only the first packet is timestamped and
matched differs from the one in Kathie's pping, and is further described in
[SAMPLING_DESIGN](./SAMPLING_DESIGN.md).
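
To make the matching concrete, the ingress-side logic boils down to roughly the
following sketch (a sketch only: it assumes the `packet_ts` map and the
`packet_id`/`rtt_event` structs defined elsewhere in this PR, and elides all
packet parsing and error handling):

```c
/* Sketch of the ingress-side matching: look up the timestamp stored for
 * this flow's echoed identifier (TSecr) and compute the RTT from it. */
static __always_inline int compute_rtt(struct packet_id *p_id,
				       struct rtt_event *event)
{
	__u64 *ts = bpf_map_lookup_elem(&packet_ts, p_id);

	if (!ts)
		return -1; /* no timestamped outgoing packet to match */

	event->rtt = bpf_ktime_get_ns() - *ts;
	bpf_map_delete_elem(&packet_ts, p_id); /* entry no longer needed */
	return 0;
}
```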
## Planned design
## Design and technical description
!["Design of eBPF pping](./eBPF_pping_design.png)
### Files:
- **pping.c:** Userspace program that loads and attaches the BPF programs, polls
the perf-buffer `rtt_events` to print out RTT messages and periodically cleans
up the hash-maps from old entries. Also passes user options to the BPF
programs by setting a "global variable" (stored in the program's .rodata
section).
- **pping_kern.c:** Contains the BPF programs that are loaded on tc (egress) and
XDP (ingress), as well as several common functions, a global constant `config`
(set from userspace) and map definitions. The tc program `pping_egress()`
parses outgoing packets for identifiers. If an identifier is found and the
sampling strategy allows it, a timestamp for the packet is created in
`packet_ts`. The XDP program `pping_ingress()` parses incoming packets for an
identifier. If found, it looks up the `packet_ts` map for a match on the
reverse flow (to match source/dest on egress). If there is a match, it
calculates the RTT from the stored timestamp and deletes the entry. The
calculated RTT (together with the flow-tuple) is pushed to the perf-buffer
`rtt_events`.
- **bpf_egress_loader.sh:** A shell script that's used by `pping.c` to set up a
clsact qdisc and attach the `pping_egress()` program to egress using
tc. **Note**: Unless your iproute2 comes with libbpf support, tc will use
iproute's own loading mechanism when loading and attaching object files
directly through the tc command line. To ensure that libbpf is always used to
load `pping_egress()`, `pping.c` actually loads the program and pins it to
`/sys/fs/bpf/pping/classifier`, and tc only attaches the pinned program.
- **functions.sh and parameters.sh:** Imported by `bpf_egress_loader.sh`.
- **pping.h:** Common header file included by `pping.c` and
`pping_kern.c`. Contains some common structs used by both (which are
part of the maps).
### BPF Maps:
- **flow_state:** A hash-map storing some basic state for each flow, such as the
last seen identifier for the flow and when the last timestamp entry for the
flow was created. Entries are created by `pping_egress()`, and can be updated
or deleted by both `pping_egress()` and `pping_ingress()`. Leftover entries
are eventually removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`.
- **packet_ts:** A hash-map storing a timestamp for a specific packet
identifier. Entries are created by `pping_egress()` and removed by
`pping_ingress()` if a match is found. Leftover entries are eventually
removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`.
- **rtt_events:** A perf-buffer used by `pping_ingress()` to push calculated RTTs
to `pping.c`, which continuously polls the map to print out the RTTs.
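
As a sketch of what the BTF-defined maps along these lines look like (the
`max_entries` values are placeholder assumptions of mine, not the PR's; the
key/value structs are the ones from `pping.h` shown further down):

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "pping.h"

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384); /* assumed capacity */
	__type(key, struct network_tuple);
	__type(value, struct flow_state); /* last_timestamp, last_id */
} flow_state SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384); /* assumed capacity */
	__type(key, struct packet_id); /* flow tuple + identifier (TSval) */
	__type(value, __u64); /* timestamp in ns */
} packet_ts SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} rtt_events SEC(".maps");
```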
## Similar projects
Passively measuring the RTT for TCP traffic is not a novel concept, and there
exist a number of other tools that can do so. A good overview of how passive
RTT calculation using TCP timestamps (as in this project) works is provided in
[this paper](https://doi.org/10.1145/2523426.2539132) from 2013.
- [pping](https://github.com/pollere/pping): This project is largely a
re-implementation of Kathie's pping, but by using BPF and XDP as well as
implementing some filtering logic, the hope is to be able to create an always-on
tool that can scale well even to large numbers of massive flows.
- [ppviz](https://github.com/pollere/ppviz): Web-based visualization tool for
the "machine-friendly" output from Kathie's pping tool. If/when we implement a
similar machine-readable output option it should hopefully work with this
implementation as well.
- [tcptrace](https://github.com/blitz/tcptrace): A post-processing tool which
can analyze a tcpdump file and among other things calculate RTTs based on
seq/ACK numbers (`-r` or `-R` flag).
- **Dapper**: A passive TCP data plane monitoring tool implemented in P4 which
can among other things calculate the RTT based on the matching seq/ACK
numbers. [Paper](https://doi.org/10.1145/3050220.3050228). [Unofficial
source](https://github.com/muhe1991/p4-programs-survey/tree/master/dapper).
- [P4 Tofino TCP RTT measurement](https://github.com/Princeton-Cabernet/p4-projects/tree/master/RTT-tofino):
A passive TCP RTT monitor based on seq/ACK numbers implemented in P4 for
Tofino programmable switches. [Paper](https://doi.org/10.1145/3405669.3405823).

pping/SAMPLING_DESIGN.md (new file, 386 lines)

@@ -0,0 +1,386 @@
# Introduction
This file is intended to document some of the challenges and design
decisions for adding sampling functionality to pping. It is partly
based on discussions from my supervisor meeting on 2021-02-22, and the
contents of my
[status slides](https://github.com/xdp-project/bpf-research/blob/master/meetings/simon/work_summary_20210222.org)
from that meeting.
## Purpose of sampling
The main purpose of adding sampling to pping is to prevent a massive
amount of timestamp entries from being created and quickly filling up
the map. A full map prevents new entries from being made until old
ones can be cleared out. A few large flows could thus "hog" all the
map entries, and prevent RTTs from other flows from being
reported. Sampling is therefore only used on egress, to determine if a
timestamp entry should be created for a packet. All packets on ingress
will still be parsed and checked for a potential match.
A secondary purpose of the sampling is to reduce the amount of output
that pping creates. In most circumstances, getting 1000 RTT reports
per second from a single flow will probably not be of interest, making
it less useful as a direct command-line utility.
# Considered sampling approaches
There are a number of different ways that the sampling could be
performed, for example:
- Sample every N packets per flow
- Not very flexible
- If the same rate is used for all flows, small flows would get very few
samples.
- Sample completely random packets
- Probably not a good idea...
- Head sampling (sample the first few packets of each flow)
- Not suitable for monitoring long flows
- RTT may change over lifetime of flow (due to buffer bloat)
- Probabilistic approach
- Probabilistic approaches have been used to, for example, capture
the most relevant information with limited overhead in INT
- Could potentially be configured across multiple devices, so that
pping on all of the devices together capture the most relevant
traffic.
- While they could potentially work well, I'm not very familiar with
these approaches. It would take considerable research on my side
to figure out how these methods work, how to best apply them to
pping, and how to implement them in BPF.
- Use time-based sampling, limiting the rate of how often entries
can be created per flow
- Intuitively simple
- Should correspond quite well with the output you would probably
want? I.e. a few entries per flow (regardless of how heavy they
are) stating their current RTT.
I believe that time-based sampling is the most promising solution that
I can implement in a reasonable time. In the future additional
sampling methods could potentially be added.
# Considerations for time-based sampling
## Time interval
For the time-based sampling, we must determine how to set the
interval between when new timestamp entries are allowed.
### Static time interval
The simplest alternative is probably to use a static limit, e.g.
100 ms. This would provide a rather simple and predictable limit for
how often entries can be created (per flow), and how much output you
would get (per flow).
### RTT-based time interval
It may be desirable to use a more dynamic time limit, which is
adapted to each flow. One way to do this would be to base the time
limit on the RTT of the flow. Flows with short RTTs could be expected
to undergo more rapid changes than flows with long RTTs. This would
require keeping track of the RTT for each flow, for example as a moving
average. Additionally, some fallback is required before the RTT for
the flow is known.
### User configurable
Regardless of whether a static or RTT-based (or some other) interval is
used, it should probably be user configurable (including allowing the
user to disable sampling entirely).
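
To make the idea concrete, a minimal sketch of such a per-flow rate-limit check
on egress could look as follows (`struct flow_state` and the nanosecond
`rate_limit` follow the definitions elsewhere in this PR; everything around the
check is elided):

```c
/* Sketch: allow a new timestamp entry only if at least rate_limit ns
 * have passed since the last entry was created for this flow. A
 * rate_limit of 0 disables sampling entirely. */
static __always_inline bool allow_timestamp(struct flow_state *f_state,
					    __u64 now, __u64 rate_limit)
{
	if (now < f_state->last_timestamp ||
	    now - f_state->last_timestamp < rate_limit)
		return false;

	f_state->last_timestamp = now;
	return true;
}
```

On egress, such a check would run after the flow-state lookup and before
creating the `packet_ts` entry.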
## Allowing bursts
It may be desirable to allow multiple packets in a short
burst to be timestamped. Due to delayed ACKs, one may only get a
response for every other packet. If the first packet is timestamped,
and shortly after a second packet is sent (that has a different
identifier), then the response will effectively be for the second
packet, and no match for the timestamped identifier will be found. For
flows of the right (or wrong, depending on how you look at it)
intensity, slow enough that consecutive packets are likely to get
different TCP timestamps, but fast enough for the delayed ACKs to
acknowledge multiple packets, you essentially have a 50/50 chance
of timestamping the wrong identifier and missing the RTT.
To handle this, you could timestamp multiple consecutive packets (with
unique identifiers) in a short burst. You probably need to limit this
burst both in number of packets and in the timeframe after the first
packet during which additional packets may be included. For example, allowing
up to 3 packets (with different identifiers) to get a timestamp for up to
4 ms after the first one of them is timestamped.
If allowing bursts of timestamps to be created, it may also be
desirable to rate limit the output, in order to not get a burst of
similar RTTs for the flow in the output (which may also skew averages
and other post-processing).
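
As a sketch of what such a burst allowance could look like (not part of this
PR; `struct burst_state` and both constants are hypothetical illustrations):

```c
#define BURST_PKTS 3                 /* hypothetical burst size */
#define BURST_NS (4 * 1000 * 1000UL) /* hypothetical 4 ms burst window */

/* Hypothetical extra per-flow state for burst handling */
struct burst_state {
	__u64 first_ts; /* time of the first timestamp in the current burst */
	__u32 pkts;     /* packets timestamped in the current burst */
};

/* Sketch: allow up to BURST_PKTS timestamps within BURST_NS of the
 * first one, instead of a single timestamp per rate-limit period. */
static __always_inline bool allow_burst(struct burst_state *burst, __u64 now)
{
	if (now - burst->first_ts <= BURST_NS && burst->pkts < BURST_PKTS) {
		burst->pkts++;
		return true;
	}
	return false;
}
```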
## Handling duplicate identifiers
TCP timestamps are only updated at a limited rate (e.g. 1000 Hz), and
thus you can have multiple consecutive packets with the same TCP
timestamp if they're sent fast enough. For the calculated RTT to be
correct, you should only match the first sent packet with a unique
identifier against the first received packet with a matching
identifier. Otherwise, you may for example have a sequence of 100
packets with the same identifier, and match the last of the outgoing
packets with the first incoming response, which may underestimate the
RTT by as much as the TCP timestamp clock period (e.g. 1 ms).
### Current solution
The current solution to this is very simple. For outgoing packets, a
timestamp entry is only allowed to be created if no previous entry for
the identifier exists (realized through the `BPF_NOEXIST` flag to the
`bpf_map_update_elem()` call). Thus only the first outgoing packet with
a specific identifier can be timestamped. On ingress, the first packet
with a matching identifier will mark the timestamp as used, preventing
later incoming responses from using that timestamp. The reason why the
timestamp is marked as used rather than directly deleted once a
matching packet on ingress is found, is to avoid the egress side
creating a new entry for the same identifier. This could occur if the
RTT is shorter than the TCP timestamp clock period, and could result in
a massively underestimated RTT. This is the same mechanism that is used
in the original pping, as explained
[here](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L155-L168).
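
In code, the core of that mechanism is a single flag to the map update (a
sketch, using the `ts_start` map and `packet_timestamp` struct from the pre-PR
implementation):

```c
/* Sketch of the BPF_NOEXIST mechanism: the update fails if an entry
 * already exists for this packet identifier, so only the first outgoing
 * packet with a given TSval in a flow gets timestamped. */
struct packet_timestamp ts = { .timestamp = bpf_ktime_get_ns(), .used = 0 };

bpf_map_update_elem(&ts_start, &p_id, &ts, BPF_NOEXIST);
```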
### New solution
The current solution will no longer work if sampling is
introduced. With sampling, there's no guarantee that the sampled
packet will be the first outgoing packet in the sequence of packets
with identical timestamps. Thus the RTT may still be underestimated by
as much as the TCP timestamp clock period (e.g. 1 ms). Therefore, a new
solution is needed. The current idea is to keep track of the last-seen
identifier of each flow, and only allow a packet to be sampled for
timestamping if its identifier differs from the last-seen identifier
of the flow, i.e. it is the first packet in the flow with that
identifier. This would perhaps be problematic with some sampling
approaches, as it requires that the packet is both the first one with a
specific identifier as well as being elected for sampling. However,
for the rate-limited sampling it should work quite well, as it will
only delay the sampling until a packet with a new identifier is found.
Another advantage with this solution is that it should allow for
timestamp entries to be deleted as soon as the matching response is
found on ingress. The timestamp no longer needs to be kept around only
to prevent egress from creating a new timestamp with the same identifier,
as this new solution should take care of that. This would help a lot
with keeping the map clean, as the timestamp entries would then
automatically be removed as soon as they are no longer needed. The
periodic cleanup from userspace would only be needed to remove the
occasional entries that were never matched for some reason (e.g. the
previously mentioned issue with delayed ACKs, flow stopped, the
reverse flow can't be observed etc.).
One issue for this new solution is handling out-of-order packets. If
a packet with an older identifier is a bit delayed, it may arrive after
the last-seen identifier for the flow has been updated. This old
identifier may then be considered new (as it differs from the current
one), allowing an entry to be created for it and reverting the last-seen
identifier to a previous one. Additionally, this may allow the next
packet carrying what used to be the current identifier to also be
detected as a new identifier (as the out-of-order packet reverted the
last-seen identifier to an old one), creating a bit of a ping-pong
effect. For TCP timestamps this can easily be avoided by simply
requiring the new identifier to be greater than the last-seen
identifier (as TCP timestamps should be monotonically increasing). That
solution may however not be suitable if one wants to reuse this
mechanism for other protocols, such as the QUIC spinbit.
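
A sketch of such a check (the wraparound-safe signed comparison is my own
assumption of how "greater than" would be implemented; `struct flow_state` is
the one from `pping.h` in this PR):

```c
/* Sketch: only treat id as new if it is strictly newer than the flow's
 * last-seen identifier. The signed subtraction keeps the comparison
 * correct across 32-bit TSval wraparound. */
static __always_inline bool is_new_identifier(__u32 id,
					      struct flow_state *f_state)
{
	if ((__s32)(id - f_state->last_id) <= 0)
		return false; /* old or duplicate identifier */

	f_state->last_id = id;
	return true;
}
```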
## Keeping per-flow information
In order for the per-flow rate limiting to work, some per-flow state
must be maintained, namely when the last timestamp for that flow was
added (so that one can check that sufficient time has passed before
attempting to add another one).
There may be some drawbacks with having to keep per-flow state. First
off, there will be some additional overhead from having to keep track
of this state. However, the savings from sampling the per-packet state
(the identifier/timestamp mappings) should hopefully cover the
overhead from keeping some per-flow state (and then some).
Another issue that is worth keeping in mind is that this flow-state
will also need to be cleaned up eventually. This cleanup could be
handled in a similar manner as the current per-packet state is cleaned
up, by having the userspace process occasionally remove old
entries. In this case, the entries could be deemed old if a long
time has passed since the last timestamp was added for the flow, e.g. 300
seconds as used by the [original
pping](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L117).
Additionally, one can parse the packets for indications that the
connection is being closed (e.g. TCP FIN/RST), and then directly delete
the flow-state for that flow from the BPF programs.
Later on, this per-flow state could potentially be expanded to include
other information deemed useful (such as minimum and average RTT).
### Alternative solution - keeping identifier in flow-state
One idea that came up during my supervisor meeting, was that instead
of creating timestamps for individual packets as is currently done,
you only create a number of timestamps for each flow. That is, instead
of creating per-packet entries in a separate map, you include a number
of timestamp/identifier pairs in the flow-state information itself.
While this would potentially be rather efficient, limiting the number
of timestamp entries to a fixed number per flow, I'm opposed to this
idea for a few reasons:
1. The sampling rate would be inherently tied to the RTT of the
flow. While this may in many cases be desirable, it is not very
flexible. It would also make it hard to e.g. turn off sampling
completely.
2. The number of timestamps per flow would need to be fixed and known
at compile time(?). As the timestamp/identifier pairs are kept in
the flow-state information itself, the flow-state information
needs to be of a known and fixed size when creating the maps. This
may also result in some wasted space if the flow-state includes
spots for several timestamp/identifier pairs, but most flows only
make use of a few (although having an additional timestamp entry
map of fixed size wastes space in a similar manner).
3. If a low number of timestamp/identifier pairs are kept, selecting
an identifier that is missed (e.g. due to delayed ACKs) could
effectively block new timestamps from being created (and thus
RTTs from being calculated) for the flow for a relatively long
while. New timestamps can only be created if you have a free slot,
and you can only free a slot by either getting a matching reply, or
waiting until it can be safely assumed that the response was missed
(and not just delayed).
## Graceful degradation
Another aspect I've been asked to consider is how to gracefully reduce
the functionality of pping as the timestamp entry map gets full
(which, with sufficiently many and heavy flows, is likely inevitable).
What currently happens when the timestamp entry map is full, is simply
that no more entries can be made until some have been cleared
out. When adding a rate-limit to the number of entries per flow, as
well as directly deleting entries upon match, I believe this is a
reasonable way to handle the situation. As soon as some RTTs for
current flows have been reported, space for new entries will be
available. The next outgoing packet with a valid identifier from any
flow that is not currently held back by its rate limit will then
be able to grab the next spot. However, this will still favor heavy
flows over smaller flows, as heavy flows are more likely to be able to
get a packet in first, but they will at least still be limited by the
rate limit, and thus have to take turns with other flows.
It is also worth noting that as per-flow state will need to be kept,
there will be a strict limit to the number of concurrent flows that can
be monitored, corresponding to the number of entries that can be held
by the map for the per-flow state. Once the per-flow state map is
full, no new flows can be added until one is cleared. It also doesn't
make sense to add packet timestamp entries for flows whose state
cannot be tracked, as the rate limit cannot be enforced for them.
I see a few ways to more actively handle degradation, depending on what
one views as desirable:
1. One can attempt to monitor many flows, with infrequent RTT
calculations for each. In this case, the userspace process that
occasionally clears out the timestamp map could automatically
decrease the per-flow rate limit if it detects the map is getting
close to full. That way, fewer entries would be generated per flow,
and flows would be forced to take turns to a greater degree when
the map is completely full. Similarly, one may wish to reduce the
timeout for old flows if the per-flow map is getting full, in order
to more quickly allow new flows to be monitored, keeping only
the most active flows around.
2. One can attempt to monitor fewer flows, but with more frequent RTT
calculations for each. The easiest way to achieve this is probably
to set a smaller size on the per-flow map relative to the
per-packet timestamp map. In case one wants to primarily focus on
heavier flows, one could possibly add e.g. packet rate to the
per-flow information, and remove the flows with the lowest packet
rates.
3. One can attempt to focus on flows with shorter RTTs. Flows with
shorter RTTs should make more efficient use of timestamp entries,
as they can be cleared out faster allowing for new entries. On the
other hand, flows with longer RTTs may be the more interesting
ones, as they are more likely to indicate some issue.
4. One can simply try to create a larger map (and copy over the old
contents) once the map is approaching full. This way one can start
with reasonably small maps, and only start eating up more memory if
required.
While I'm leaning towards option 1 or 4, I don't have a very strong
personal opinion here, and would like some input on what others (who
may have more experience with network measurements) think are
reasonable trade-offs to make.
# Implementation considerations
There are of course several more practical considerations as well when
implementing the sampling, some of which I'll try to address here.
## "Global" vs PERCPU maps
In general, it's likely wise to go with PERCPU maps over "global" (aka
non-PERCPU) maps whenever possible, as PERCPU maps should be more
performant and also avoid concurrency issues. But of course, this only
applies if the BPF programs don't need to act on global state.
For pping, I unfortunately see no way for the program to work with
only information local to each CPU core individually. The per-packet
identifier and timestamps need to be global, as there is no guarantee
that the same core that timestamped a packet will process the response
for that packet. Likewise, the per-flow information, like the time of
the last timestamping, also needs to be global. Otherwise the rate
limit would be per-CPU-per-flow rather than just per-flow.
In practice, packets from the same flow are apparently often handled
by the same CPU, but this is not guaranteed, and therefore not
something we can rely on (especially when state needs to be shared by
both ingress and egress). One could try to use a CPUMAP to enforce
this behavior, but that's probably not a great idea.
## Concurrency issues
In addition to the performance hit, sharing global state between
multiple concurrent processes risks running into concurrency issues
unless access is synchronized in some manner (in BPF, the two
mechanisms I know of are atomic adds and spin-locks for maps). At the
risk of misunderstanding the memory model for BPF programs (where,
from what I can tell, I'm probably not alone), I will attempt to
explain the potential concurrency issues I see with the pping
implementation.
The current pping implementation already has a potential concurrency
issue. When matches for identifiers are found on ingress, a check is
performed to see if the timestamp has already been used or
not. Multiple packets processed in parallel could potentially all
find that the timestamp is unused before any of them manages to mark
it as used. This may result in pping matching several
responses to a single timestamp entry and reporting the RTTs for each
of them. I do not consider this a significant issue however, as if
they are concurrent enough to look up the used status
before another has time to set it, the difference in time between them
should be very small, and they should therefore compute very similar
RTTs. So the reported RTTs should still be rather accurate, just
over-reported.
When adding sampling and per-flow information, some additional
concurrency issues may be encountered. Mainly, multiple packets may
find that they are allowed to add a new timestamp before they manage
to update the time of the last added timestamp in the per-flow
state. This may lead to multiple attempts at creating a timestamp at
approximately the same time. For TCP timestamps, all the identifiers
are likely to be identical (as the TCP timestamp itself is only
updated at a limited rate), so only one of them should succeed
anyway. If using identifiers that are more unique however, such as
TCP sequence numbers, then it's possible that a short burst of entries
would be created instead of just a single entry within the rate limit
for the flow.
Overall, I don't think these concurrency issues are that severe, as
they should still result in accurate RTTs, just some possible
over-reporting. I don't believe these issues warrant the performance
impact and potential code complexity of trying to synchronize
access. Furthermore, from what I understand these concurrency issues
are not too likely to occur in practice, as packets from the same flow
are often processed on the same core.
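
For completeness, if synchronizing access were deemed worth it after all, map
values can embed a spin-lock (a sketch only, not something this PR does;
`flow_state_locked` is a hypothetical variant of the flow state):

```c
/* Hypothetical locked variant of the per-flow state */
struct flow_state_locked {
	struct bpf_spin_lock lock;
	__u64 last_timestamp;
	__u32 last_id;
};

/* Sketch: make the check-then-update of the per-flow state atomic, at
 * the cost of serializing access to the entry. */
static __always_inline bool try_timestamp_locked(struct flow_state_locked *f,
						 __u64 now, __u64 rate_limit)
{
	bool allowed = false;

	bpf_spin_lock(&f->lock);
	if (now - f->last_timestamp >= rate_limit) {
		f->last_timestamp = now;
		allowed = true;
	}
	bpf_spin_unlock(&f->lock);

	return allowed;
}
```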
## Global variable vs single-entry map
With BTF, it seems BPF programs now support the use of global
variables. These can supposedly be modified from userspace,
and should, from what I've heard, also be more efficient than map
lookups. They therefore seem like a promising way to pass some
user-configured options from userspace to the BPF programs.
I would however need to look up how to actually use these, as the
examples I've seen have used a slightly different libbpf setup, where
a "skeleton" header-file is compiled and imported to the userspace
program. There should be some examples in the [xdp-tools
repository](https://github.com/xdp-project/xdp-tools).
The alternative I guess would be to use a
`BPF_MAP_TYPE_PERCPU_ARRAY` with a single entry, which is filled in
with the user-configured option by the userspace program.
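
That alternative would look roughly like this sketch (using the `bpf_config`
struct from `pping.h` in this PR; `config_map` and `get_rate_limit()` are
illustrative names):

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "pping.h"

/* Sketch: a one-element array holding the user configuration, filled
 * in from userspace before the programs are attached. */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct bpf_config);
} config_map SEC(".maps");

static __always_inline __u64 get_rate_limit(void)
{
	__u32 key = 0;
	struct bpf_config *conf = bpf_map_lookup_elem(&config_map, &key);

	return conf ? conf->rate_limit : 0;
}
```

For what it's worth, the PR ends up going the global-variable route; see the
`init_rodata()` helper in the pping.c diff below, which fills in the `.rodata`
map before the object is loaded.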


@@ -2,27 +2,60 @@
## Protocols
- [x] TCP (based on timestamp options)
- [ ] Skip pure ACKs for egress?
- [x] Skip pure ACKs for egress
- Timestamping pure ACKs may lead to erroneous RTTs (e.g. the delay
before the application attempts to send data being recognized as
an RTT)
- [ ] Add fallback to SEQ/ACK in case of no timestamp?
- Some machines may not use TCP timestamps (either not supported
at all, or disabled as in ex. Windows 10)
- If one only considers SEQ/ACK (and doesn't check for SACK
options), it could result in e.g. the delay from a retransmission being
included in the RTT
- [ ] ICMP (e.g. Echo/Reply)
- [ ] QUIC (based on spinbit)
## General pping
- [x] Add sampling so that RTT is not calculated for every packet
(with unique value) for large flows
- [ ] Allow short bursts to bypass sampling in order to handle
delayed ACKs
- [x] Keep some per-flow state
- Will likely be needed for the sampling
- [ ] Could potentially include keeping track of average RTT, which
may be useful for some decisions (e.g. how often to sample,
when an entry can be removed, etc.)
- [ ] Could potentially include keeping track of minimum RTT (as
done by the original pping), e.g. to track bufferbloat
- [ ] Could potentially include keeping track of if flow is
bi-directional
- Original pping checks if flow is bi-directional before adding
timestamps, but this could miss shorter flows
- [ ] Dynamically grow the maps if they are starting to get full
- [ ] Improve map cleaning: Use a dynamic time to live for map entries
based on flow's RTT, instead of static 10s limit
- Keeping entries around for a long time allows the map to grow
unnecessarily large, which slows down the cleaning and may block
new entries
- [ ] Use libxdp to load XDP program
- [ ] Check for existence of reverse flow before adding to hash-map (to avoid adding identifiers for flows that we can't see the reverse traffic for)?
- This could miss the first few packets, and would not be ideal for short flows
- [ ] Keep track of minimum RTT for each flow (done by Pollere's pping, and helps identify buffer bloat)
- [ ] Add configurable rate-limit for how often each flow can add entries to the map (prevent high-rate flows from quickly filling up the map)
- [ ] Improve map cleaning: Use a dynamic time to live for hash map entries based on flow's RTT, instead of static 10s limit
- [ ] Add support for automatically deleting entries if they are unique
- TCP timestamps need to be kept for a while (because multiple packets can have the same timestamp), but identifiers that are unique per packet can be removed directly after the RTT is calculated
- [ ] Add option for machine-readable output (as original pping)
- It may be a good idea to keep the same format as original pping,
so that tools such as [ppviz](https://github.com/pollere/ppviz)
works for both pping implementations.
- [ ] Add timestamps to output (as original pping)
- [ ] Add support for other hooks
- E.g. TC-BPF on ingress instead of XDP?
## Done
- [x] Clean up commits and add signed-off-by tags
- [x] Add SPDX-license-identifier tags
- [x] Format C-code in kernel style
- [x] Use existing functionality to reuse maps by using BTF-defined
maps
- [x] Use BTF-defined maps for TC-BPF as well if iproute has libbpf
support
- [x] Cleanup: Unload TC-BPF at program shutdown, and unpin map - In
userspace part
- [x] Add IPv6 support
- [x] Refactor to support easy addition of other protocols
- [x] Load tc-bpf program with libbpf (only attach it with tc)


@@ -4,7 +4,7 @@
# License: GPLv2
#
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option and changed default BPF_OBJ
# of optional section (--sec) option or attaching a pinned program
#
basedir=`dirname $0`
source ${basedir}/functions.sh
@@ -64,6 +64,16 @@ function tc_egress_bpf_attach()
egress bpf da obj "$objfile" sec "$section"
}
function tc_egress_bpf_attach_pinned()
{
local device=${1:-$DEV}
local pinprog=${2:-$PIN_PROG}
shift 2
call_tc filter add dev "$device" pref 2 handle 2 \
egress bpf da pinned "$pinprog"
}
function tc_egress_list()
{
local device=${1:-$DEV}
@@ -77,7 +87,12 @@ if [[ -n $REMOVE ]]; then
fi
tc_init_clsact $DEV
tc_egress_bpf_attach $DEV $BPF_OBJ $SEC
if [[ -n $PIN_PROG ]]; then
tc_egress_bpf_attach_pinned $DEV $PIN_PROG
else
tc_egress_bpf_attach $DEV $BPF_OBJ $SEC
fi
# Practical to list egress filters after setup.
# (It's a common mistake to have several progs loaded)

pping/configure (vendored, 29 lines deleted)

@@ -1,29 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
# This is not an autoconf generated configure
#
# Output file which is input to Makefile
CONFIG=config.mk
# Assume tc is in $PATH
TC=tc
check_tc_libbpf()
{
tc_version=$($TC -V)
if echo $tc_version | grep -q libbpf; then
libbpf_version=${tc_version##*libbpf }
echo "HAVE_TC_LIBBPF:=y" >> $CONFIG
echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG
echo "yes ($libbpf_version)"
else
echo "no"
fi
}
echo "# Generated config" > $CONFIG
echo "Detecting available features on system"
echo -n " - libbpf support in tc tool: "
check_tc_libbpf

Binary image file changed (not shown): 49 KiB before, 54 KiB after.


@@ -6,7 +6,7 @@
# License: GPLv2
#
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option
# of optional section (--sec) option or attaching a pinned program
#
function usage() {
@@ -20,12 +20,13 @@ function usage() {
echo " -l | --list : (\$LIST) List setup after setup"
echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load"
echo " --sec : (\$SEC) Section of BPF-object to load"
echo " --pinned : (\$PIN_PROG) Path to pinned program to attach"
echo ""
}
# Using external program "getopt" to get --long-options
OPTIONS=$(getopt -o vshd:l \
--long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec: -- "$@")
--long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec:,pinned: -- "$@")
if (( $? != 0 )); then
usage
err 2 "Error calling getopt"
@@ -50,6 +51,11 @@ while true; do
info "Section to load: $SEC" >&2
shift 2
;;
--pinned )
export PIN_PROG=$2
info "Pinned program path: $PIN_PROG" >&2
shift 2
;;
-v | --verbose)
export VERBOSE=yes
# info "Verbose mode: VERBOSE=$VERBOSE" >&2


@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
static const char *__doc__ =
"Passive Ping - monitor flow RTT based on TCP timestamps";
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h>
@@ -10,7 +13,9 @@
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <stdbool.h>
#include <limits.h>
#include <signal.h> // For detecting Ctrl-C
#include <sys/resource.h> // For setting rlimit
#include <sys/wait.h>
@@ -18,25 +23,18 @@
#include <time.h>
#include <pthread.h>
#include "pping.h" //key and value structs for the ts_start map
#include "pping.h" //common structs for user-space and BPF parts
#define NS_PER_SECOND 1000000000UL
#define NS_PER_MS 1000000UL
#define TCBPF_LOADER_SCRIPT "./bpf_egress_loader.sh"
#define PINNED_DIR "/sys/fs/bpf/tc/globals"
#define PPING_XDP_OBJ "pping_kern_xdp.o"
#define PPING_TCBPF_OBJ "pping_kern_tc.o"
#define XDP_FLAGS XDP_FLAGS_UPDATE_IF_NOEXIST
#define TS_MAP "ts_start"
#define MAP_CLEANUP_INTERVAL \
(1 * NS_PER_SECOND) // Clean timestamp map once per second
#define TIMESTAMP_LIFETIME \
(10 * NS_PER_SECOND) // Clear out entries from ts_start if they're over 10 seconds
(10 * NS_PER_SECOND) // Clear out packet timestamps if they're over 10 seconds
#define FLOW_LIFETIME \
(300 * NS_PER_SECOND) // Clear out flows if they're inactive over 300 seconds
#define PERF_BUFFER "rtt_events"
#define PERF_BUFFER_PAGES 64 // Related to the perf-buffer size?
#define PERF_POLL_TIMEOUT_MS 100
@@ -57,12 +55,146 @@
// Structure to contain arguments for clean_map (for passing to pthread_create)
struct map_cleanup_args {
int map_fd;
__u64 max_age_ns;
__u64 cleanup_interval;
int packet_map_fd;
int flow_map_fd;
};
// Store configuration values in struct to easily pass around
struct pping_config {
struct bpf_config bpf_config;
__u64 cleanup_interval;
int xdp_flags;
int ifindex;
char ifname[IF_NAMESIZE];
bool force;
char *object_path;
char *ingress_sec;
char *egress_sec;
char *pin_dir;
char *packet_map;
char *flow_map;
char *rtt_map;
};
static volatile int keep_running = 1;
static const struct option long_options[] = {
{ "help", no_argument, NULL, 'h' },
{ "interface", required_argument, NULL, 'i' }, // Name of interface to run on
{ "rate-limit", required_argument, NULL, 'r' }, // Sampling rate-limit in ms
{ "force", no_argument, NULL, 'f' }, // Detach any existing XDP program on interface
{ "cleanup-interval", required_argument, NULL, 'c' }, // Map cleaning interval in s
{ 0, 0, NULL, 0 }
};
/*
* Copied from Jesper Dangaard Brouer's traffic-pacing-edt example
*/
static void print_usage(char *argv[])
{
int i;
printf("\nDOCUMENTATION:\n%s\n", __doc__);
printf("\n");
printf(" Usage: %s (options-see-below)\n", argv[0]);
printf(" Listing options:\n");
for (i = 0; long_options[i].name != 0; i++) {
printf(" --%-12s", long_options[i].name);
if (long_options[i].flag != NULL)
printf(" flag (internal value:%d)",
*long_options[i].flag);
else
printf(" short-option: -%c", long_options[i].val);
printf("\n");
}
printf("\n");
}
static double parse_positive_double_argument(const char *str,
const char *parname)
{
char *endptr;
double val;
val = strtod(str, &endptr);
if (strlen(str) != endptr - str) {
fprintf(stderr, "%s %s is not a valid number\n", parname, str);
return -EINVAL;
}
if (val < 0) {
fprintf(stderr, "%s must be positive\n", parname);
return -EINVAL;
}
return val;
}
static int parse_arguments(int argc, char *argv[], struct pping_config *config)
{
int err, opt;
double rate_limit_ms, cleanup_interval_s;
config->ifindex = 0;
while ((opt = getopt_long(argc, argv, "hfi:r:c:", long_options,
NULL)) != -1) {
switch (opt) {
case 'i':
if (strlen(optarg) > IF_NAMESIZE) {
fprintf(stderr, "interface name too long\n");
return -EINVAL;
}
strncpy(config->ifname, optarg, IF_NAMESIZE);
config->ifindex = if_nametoindex(config->ifname);
if (config->ifindex == 0) {
err = -errno;
fprintf(stderr,
"Could not get index of interface %s: %s\n",
config->ifname, strerror(err));
return err;
}
break;
case 'r':
rate_limit_ms = parse_positive_double_argument(
optarg, "rate-limit");
if (rate_limit_ms < 0)
return -EINVAL;
config->bpf_config.rate_limit =
rate_limit_ms * NS_PER_MS;
break;
case 'c':
cleanup_interval_s = parse_positive_double_argument(
optarg, "cleanup-interval");
if (cleanup_interval_s < 0)
return -EINVAL;
config->cleanup_interval =
cleanup_interval_s * NS_PER_SECOND;
break;
case 'f':
config->force = true;
break;
case 'h':
printf("HELP:\n");
print_usage(argv);
exit(0);
default:
fprintf(stderr, "Unknown option %s\n", argv[optind]);
return -EINVAL;
}
}
if (config->ifindex == 0) {
fprintf(stderr,
"An interface (-i or --interface) must be provided\n");
return -EINVAL;
}
return 0;
}
void abort_program(int sig)
{
keep_running = 0;
@@ -78,28 +210,48 @@ static int set_rlimit(long int lim)
return !setrlimit(RLIMIT_MEMLOCK, &rlim) ? 0 : -errno;
}
static int mkdir_if_noexist(const char *path)
static int
bpf_obj_run_prog_pindir_func(struct bpf_object *obj, const char *prog_title,
const char *pin_dir,
int (*func)(struct bpf_program *, const char *))
{
int ret;
struct stat st = { 0 };
int len;
struct bpf_program *prog;
char path[MAX_PATH_LEN];
ret = stat(path, &st);
if (ret) {
if (errno != ENOENT)
return -errno;
len = snprintf(path, MAX_PATH_LEN, "%s/%s", pin_dir, prog_title);
if (len < 0)
return len;
if (len > MAX_PATH_LEN)
return -ENAMETOOLONG;
return mkdir(path, 0700) ? -errno : 0;
}
return S_ISDIR(st.st_mode) ? 0 : -EEXIST;
prog = bpf_object__find_program_by_title(obj, prog_title);
if (!prog || libbpf_get_error(prog))
return prog ? libbpf_get_error(prog) : -EINVAL;
return func(prog, path);
}
static int bpf_obj_open(struct bpf_object **obj, const char *obj_path,
char *map_path)
/*
* Similar to bpf_object__pin_programs, but only attempts to pin a
* single program prog_title at path pin_dir/prog_title
*/
static int bpf_obj_pin_program(struct bpf_object *obj, const char *prog_title,
const char *pin_dir)
{
DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
.pin_root_path = map_path);
*obj = bpf_object__open_file(obj_path, map_path ? &opts : NULL);
return libbpf_get_error(*obj);
return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir,
bpf_program__pin);
}
/*
* Similar to bpf_object__unpin_programs, but only attempts to unpin a
* single program prog_title at path pin_dir/prog_title.
*/
static int bpf_obj_unpin_program(struct bpf_object *obj, const char *prog_title,
const char *pin_dir)
{
return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir,
bpf_program__unpin);
}
static int xdp_detach(int ifindex, __u32 xdp_flags)
@@ -112,7 +264,6 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
{
struct bpf_program *prog;
int prog_fd;
int err;
if (sec)
prog = bpf_object__find_program_by_title(obj, sec);
@@ -120,24 +271,28 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
prog = bpf_program__next(NULL, obj);
prog_fd = bpf_program__fd(prog);
if (prog_fd < 0) {
fprintf(stderr, "Could not find program to attach\n");
if (prog_fd < 0)
return prog_fd;
}
if (force) // detach current (if any) xdp-program first
xdp_detach(ifindex, xdp_flags);
err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
if (err < 0) {
fprintf(stderr, "Failed loading xdp-program on interface %d\n",
ifindex);
return err;
}
return 0;
return bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
}
static int run_program(const char *path, char *const argv[])
static int init_rodata(struct bpf_object *obj, void *src, size_t size)
{
struct bpf_map *map = NULL;
bpf_object__for_each_map(map, obj) {
if (strstr(bpf_map__name(map), ".rodata"))
return bpf_map__set_initial_value(map, src, size);
}
// No .rodata map found
return -EINVAL;
}
static int run_external_program(const char *path, char *const argv[])
{
int status;
int ret = -1;
@@ -157,18 +312,24 @@ static int run_program(const char *path, char *const argv[])
}
}
static int tc_bpf_load(char *bpf_object, char *section, char *interface)
static int tc_bpf_attach(const char *pin_dir, const char *section,
char *interface)
{
char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface, "--obj",
bpf_object, "--sec", section, NULL };
return run_program(TCBPF_LOADER_SCRIPT, argv);
char prog_path[MAX_PATH_LEN];
char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface,
"--pinned", prog_path, NULL };
if (snprintf(prog_path, sizeof(prog_path), "%s/%s", pin_dir, section) < 0)
return -EINVAL;
return run_external_program(TCBPF_LOADER_SCRIPT, argv);
}
static int tc_bpf_clear(char *interface)
{
char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface,
"--remove", NULL };
return run_program(TCBPF_LOADER_SCRIPT, argv);
return run_external_program(TCBPF_LOADER_SCRIPT, argv);
}
/*
@@ -184,45 +345,82 @@ static __u64 get_time_ns(void)
return (__u64)t.tv_sec * NS_PER_SECOND + (__u64)t.tv_nsec;
}
static int clean_map(int map_fd, __u64 max_age)
static bool packet_ts_timeout(void *val_ptr, __u64 now)
{
__u64 ts = *(__u64 *)val_ptr;
if (now > ts && now - ts > TIMESTAMP_LIFETIME)
return true;
return false;
}
static bool flow_timeout(void *val_ptr, __u64 now)
{
__u64 ts = ((struct flow_state *)val_ptr)->last_timestamp;
if (now > ts && now - ts > FLOW_LIFETIME)
return true;
return false;
}
/*
* Loops through all entries in a map, running del_decision_func(value, time)
* on every entry, and deleting those for which it returns true.
* On success, returns the number of entries deleted, otherwise returns the
* (negative) error code.
*/
//TODO - maybe add some pointer to arguments for del_decision_func?
static int clean_map(int map_fd, size_t key_size, size_t value_size,
bool (*del_decision_func)(void *, __u64))
{
int removed = 0;
struct packet_id key, prev_key = { 0 };
struct packet_timestamp value;
void *key, *prev_key, *value;
bool delete_prev = false;
__u64 now_nsec = get_time_ns();
int entries = 0; // Just for debug
__u64 duration; // Just for debug
#ifdef DEBUG
int entries = 0;
__u64 duration;
#endif
if (now_nsec == 0)
return -errno;
key = malloc(key_size);
prev_key = malloc(key_size);
value = malloc(value_size);
if (!key || !prev_key || !value) {
removed = -ENOMEM;
goto cleanup;
}
// Cannot delete current key because then loop will reset, see https://www.bouncybouncy.net/blog/bpf_map_get_next_key-pitfalls/
while (bpf_map_get_next_key(map_fd, &prev_key, &key) == 0) {
while (bpf_map_get_next_key(map_fd, prev_key, key) == 0) {
if (delete_prev) {
bpf_map_delete_elem(map_fd, &prev_key);
bpf_map_delete_elem(map_fd, prev_key);
removed++;
delete_prev = false;
}
if (bpf_map_lookup_elem(map_fd, &key, &value) == 0) {
if (now_nsec > value.timestamp &&
now_nsec - value.timestamp > max_age) {
delete_prev = true;
}
}
if (bpf_map_lookup_elem(map_fd, key, value) == 0)
delete_prev = del_decision_func(value, now_nsec);
#ifdef DEBUG
entries++;
prev_key = key;
#endif
memcpy(prev_key, key, key_size);
}
if (delete_prev) {
bpf_map_delete_elem(map_fd, &prev_key);
bpf_map_delete_elem(map_fd, prev_key);
removed++;
}
#ifdef DEBUG
duration = get_time_ns() - now_nsec;
printf("Gone through %d entries and removed %d of them in %llu.%09llu s\n",
entries, removed, duration / NS_PER_SECOND,
printf("%d: Gone through %d entries and removed %d of them in %llu.%09llu s\n",
map_fd, entries, removed, duration / NS_PER_SECOND,
duration % NS_PER_SECOND);
#endif
cleanup:
free(key);
free(prev_key);
free(value);
return removed;
}
@@ -230,11 +428,14 @@ static void *periodic_map_cleanup(void *args)
{
struct map_cleanup_args *argp = args;
struct timespec interval;
interval.tv_sec = MAP_CLEANUP_INTERVAL / NS_PER_SECOND;
interval.tv_nsec = MAP_CLEANUP_INTERVAL % NS_PER_SECOND;
interval.tv_sec = argp->cleanup_interval / NS_PER_SECOND;
interval.tv_nsec = argp->cleanup_interval % NS_PER_SECOND;
while (keep_running) {
clean_map(argp->map_fd, argp->max_age_ns);
clean_map(argp->packet_map_fd, sizeof(struct packet_id),
sizeof(__u64), packet_ts_timeout);
clean_map(argp->flow_map_fd, sizeof(struct network_tuple),
sizeof(struct flow_state), flow_timeout);
nanosleep(&interval, NULL);
}
pthread_exit(NULL);
@@ -274,28 +475,134 @@ static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt)
fprintf(stderr, "Lost %llu RTT events on CPU %d\n", lost_cnt, cpu);
}
static int load_attach_bpfprogs(struct bpf_object **obj,
struct pping_config *config, bool *tc_attached,
bool *xdp_attached)
{
int err;
// Open and load ELF file
*obj = bpf_object__open(config->object_path);
err = libbpf_get_error(*obj);
if (err) {
fprintf(stderr, "Failed opening object file %s: %s\n",
config->object_path, strerror(-err));
return err;
}
err = init_rodata(*obj, &config->bpf_config,
sizeof(config->bpf_config));
if (err) {
fprintf(stderr, "Failed pushing user-configuration to %s: %s\n",
config->object_path, strerror(-err));
return err;
}
err = bpf_object__load(*obj);
if (err) {
fprintf(stderr, "Failed loading bpf program in %s: %s\n",
config->object_path, strerror(-err));
return err;
}
// Attach tc program
err = bpf_obj_pin_program(*obj, config->egress_sec, config->pin_dir);
if (err) {
fprintf(stderr, "Failed pinning tc program to %s/%s: %s\n",
config->pin_dir, config->egress_sec, strerror(-err));
return err;
}
err = tc_bpf_attach(config->pin_dir, config->egress_sec,
config->ifname);
if (err) {
fprintf(stderr,
"Failed attaching tc program on interface %s: %s\n",
config->ifname, strerror(-err));
return err;
}
*tc_attached = true;
// Attach XDP program
err = xdp_attach(*obj, config->ingress_sec, config->ifindex,
config->xdp_flags, config->force);
if (err) {
fprintf(stderr, "Failed attaching XDP program to %s%s: %s\n",
config->ifname,
config->force ? "" : ", ensure no other XDP program is already running on interface",
strerror(-err));
return err;
}
*xdp_attached = true;
return 0;
}
static int setup_periodical_map_cleaning(struct bpf_object *obj,
struct pping_config *config)
{
pthread_t tid;
struct map_cleanup_args clean_args = {
.cleanup_interval = config->cleanup_interval
};
int err;
clean_args.packet_map_fd =
bpf_object__find_map_fd_by_name(obj, config->packet_map);
if (clean_args.packet_map_fd < 0) {
fprintf(stderr, "Could not get file descriptor of map %s: %s\n",
config->packet_map,
strerror(-clean_args.packet_map_fd));
return clean_args.packet_map_fd;
}
clean_args.flow_map_fd =
bpf_object__find_map_fd_by_name(obj, config->flow_map);
if (clean_args.flow_map_fd < 0) {
fprintf(stderr, "Could not get file descriptor of map %s: %s\n",
config->flow_map, strerror(-clean_args.flow_map_fd));
return clean_args.flow_map_fd;
}
err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args);
if (err) {
fprintf(stderr,
"Failed starting thread to perform periodic map cleanup: %s\n",
strerror(err));
return err;
}
return 0;
}
int main(int argc, char *argv[])
{
int err = 0;
int ifindex = 0;
bool xdp_attached = false;
bool tc_attached = false;
char map_path[MAX_PATH_LEN];
bool xdp_attached = false;
struct bpf_object *obj = NULL;
struct bpf_map *map = NULL;
pthread_t tid;
struct map_cleanup_args clean_args;
struct pping_config config = {
.bpf_config = { .rate_limit = 100 * NS_PER_MS },
.cleanup_interval = 1 * NS_PER_SECOND,
.object_path = "pping_kern.o",
.ingress_sec = INGRESS_PROG_SEC,
.egress_sec = EGRESS_PROG_SEC,
.pin_dir = "/sys/fs/bpf/pping",
.packet_map = "packet_ts",
.flow_map = "flow_state",
.rtt_map = "rtt_events",
.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
.force = false,
};
struct perf_buffer *pb = NULL;
struct perf_buffer_opts pb_opts;
// TODO - better argument parsing (more relevant as features are added)
if (argc < 2) {
printf("Usage: ./pping_user <dev>\n");
return EXIT_FAILURE;
}
struct perf_buffer_opts pb_opts = {
.sample_cb = handle_rtt_event,
.lost_cb = handle_missed_rtt_event,
};
// Detect if running as root
if (geteuid() != 0) {
@@ -308,98 +615,41 @@ int main(int argc, char *argv[])
if (err) {
fprintf(stderr, "Could not set rlimit to infinity: %s\n",
strerror(-err));
goto cleanup;
return EXIT_FAILURE;
}
// Get index of interface
ifindex = if_nametoindex(argv[1]);
if (ifindex == 0) {
err = -errno;
fprintf(stderr, "Could not get index of interface %s: %s\n",
argv[1], strerror(-err));
goto cleanup;
}
// Load and attach the XDP program
err = mkdir_if_noexist("/sys/fs/bpf/tc");
err = parse_arguments(argc, argv, &config);
if (err) {
fprintf(stderr,
"Failed creating directory %s in which to pin map: %s\n",
"/sys/fs/bpf/tc", strerror(-err));
goto cleanup;
}
err = bpf_obj_open(&obj, PPING_XDP_OBJ, PINNED_DIR);
if (err) {
fprintf(stderr, "Failed opening object file %s: %s\n",
PPING_XDP_OBJ, strerror(-err));
goto cleanup;
}
// Get map here to allow for unpinning at cleanup
map = bpf_object__find_map_by_name(obj, TS_MAP);
err = libbpf_get_error(map);
if (err) {
fprintf(stderr, "Could not find map %s in %s: %s\n", TS_MAP,
PPING_XDP_OBJ, strerror(err));
map = NULL;
}
err = bpf_object__load(obj);
if (err) {
fprintf(stderr, "Failed loading XDP program: %s\n",
fprintf(stderr, "Failed parsing arguments: %s\n",
strerror(-err));
goto cleanup;
print_usage(argv);
return EXIT_FAILURE;
}
err = xdp_attach(obj, XDP_PROG_SEC, ifindex, XDP_FLAGS, false);
if (err) {
fprintf(stderr, "Failed attaching XDP program to %s: %s\n",
argv[1], strerror(-err));
goto cleanup;
}
xdp_attached = true;
// Load tc-bpf section on interface egress
err = tc_bpf_load(PPING_TCBPF_OBJ, TCBPF_PROG_SEC, argv[1]);
err = load_attach_bpfprogs(&obj, &config, &tc_attached, &xdp_attached);
if (err) {
fprintf(stderr,
"Could not load section %s of %s on interface %s: %s\n",
TCBPF_PROG_SEC, PPING_TCBPF_OBJ, argv[1],
"Failed loading and attaching BPF programs in %s\n",
config.object_path);
goto cleanup;
}
err = setup_periodical_map_cleaning(obj, &config);
if (err) {
fprintf(stderr, "Failed setting up map cleaning: %s\n",
strerror(-err));
goto cleanup;
}
tc_attached = true;
// Set up the periodical map cleaning
clean_args.max_age_ns = TIMESTAMP_LIFETIME;
clean_args.map_fd = bpf_map__fd(map);
if (clean_args.map_fd < 0) {
fprintf(stderr,
"Could not get file descriptor of map %s in object %s: %s\n",
TS_MAP, PPING_XDP_OBJ, strerror(-clean_args.map_fd));
goto cleanup;
}
err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args);
if (err) {
fprintf(stderr,
"Failed starting thread to perform periodic map cleanup: %s\n",
strerror(err));
goto cleanup;
}
// Set up perf buffer
pb_opts.sample_cb = handle_rtt_event;
pb_opts.lost_cb = handle_missed_rtt_event;
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj, PERF_BUFFER),
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj,
config.rtt_map),
PERF_BUFFER_PAGES, &pb_opts);
err = libbpf_get_error(pb);
if (err) {
pb = NULL;
fprintf(stderr, "Failed to open perf buffer %s: %s\n",
PERF_BUFFER, strerror(err));
config.rtt_map, strerror(err));
goto cleanup;
}
@@ -419,30 +669,30 @@ int main(int argc, char *argv[])
cleanup:
perf_buffer__free(pb);
if (map && bpf_map__is_pinned(map)) {
snprintf(map_path, sizeof(map_path), "%s/%s", PINNED_DIR,
TS_MAP);
err = bpf_map__unpin(map, map_path);
if (err) {
fprintf(stderr, "Failed unpinning map from %s: %s\n",
map_path, strerror(-err));
}
}
if (xdp_attached) {
err = xdp_detach(ifindex, XDP_FLAGS);
if (err) {
err = xdp_detach(config.ifindex, config.xdp_flags);
if (err)
fprintf(stderr,
"Failed deatching program from ifindex %d: %s\n",
ifindex, strerror(-err));
}
"Failed deatching program from ifindex %s: %s\n",
config.ifname, strerror(-err));
}
if (tc_attached) {
err = tc_bpf_clear(argv[1]); //system(tc_cmd);
if (err) {
err = tc_bpf_clear(config.ifname);
if (err)
fprintf(stderr,
"Failed removing tc-bpf program from interface %s: %s\n",
argv[1], strerror(-err));
}
config.ifname, strerror(-err));
}
if (obj && !libbpf_get_error(obj)) {
err = bpf_obj_unpin_program(obj, config.egress_sec,
config.pin_dir);
if (err)
fprintf(stderr,
"Failed unpinning tc program from %s: %s\n",
config.pin_dir, strerror(-err));
}
return err != 0;


@@ -5,8 +5,12 @@
#include <linux/types.h>
#include <linux/in6.h>
#define XDP_PROG_SEC "xdp"
#define TCBPF_PROG_SEC "pping_egress"
#define INGRESS_PROG_SEC "xdp"
#define EGRESS_PROG_SEC "classifier"
struct bpf_config {
__u64 rate_limit;
};
/*
* Struct that can hold the source or destination address for a flow (l3+l4).
@@ -34,17 +38,17 @@ struct network_tuple {
__u8 reserved;
};
struct flow_state {
__u64 last_timestamp;
__u32 last_id;
__u32 reserved;
};
struct packet_id {
struct network_tuple flow;
__u32 identifier; //tsval for TCP packets
};
struct packet_timestamp {
__u64 timestamp;
__u8 used;
__u8 reserved[7];
};
struct rtt_event {
__u64 rtt;
struct network_tuple flow;


@@ -1,187 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef PPING_HELPERS_H
#define PPING_HELPERS_H
#include <linux/bpf.h>
#include <xdp/parsing_helpers.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <stdbool.h>
#include "pping.h"
#define AF_INET 2
#define AF_INET6 10
#define MAX_TCP_OPTIONS 10
/*
* This struct keeps track of the data and data_end pointers from the xdp_md or
* __sk_buff contexts, as well as the current parsing position kept in nh.
* Additionally, it also keeps the length of the entire packet, which together
* with the other members can be used to determine e.g. how much data each
* header encloses.
*/
struct parsing_context {
void *data; //Start of eth hdr
void *data_end; //End of safe accessible area
struct hdr_cursor nh; //Position to parse next
__u32 pkt_len; //Full packet length (headers+data)
};
/*
* Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
*/
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
{
__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
ipv6->in6_u.u6_addr32[3] = ipv4;
}
/*
* Parses the TSval and TSecr values from the TCP options field. If successful
* the TSval and TSecr values will be stored at tsval and tsecr (in network
* byte order).
* Returns 0 if successful and -1 on failure
*/
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
__u32 *tsecr)
{
int len = tcph->doff << 2;
void *opt_end = (void *)tcph + len;
__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
__u8 i, opt, opt_size;
if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
return -1;
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
if (pos + 1 > opt_end || pos + 1 > data_end)
return -1;
opt = *pos;
if (opt == 0) // Reached end of TCP options
return -1;
if (opt == 1) { // TCP NOP option - advance one byte
pos++;
continue;
}
// Option > 1, should have option size
if (pos + 2 > opt_end || pos + 2 > data_end)
return -1;
opt_size = *(pos + 1);
// Option-kind is TCP timestamp (yey!)
if (opt == 8 && opt_size == 10) {
if (pos + opt_size > opt_end ||
pos + opt_size > data_end)
return -1;
*tsval = *(__u32 *)(pos + 2);
*tsecr = *(__u32 *)(pos + 6);
return 0;
}
// Some other TCP option - advance option-length bytes
pos += opt_size;
}
return -1;
}
/*
* Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
* option. If successful, identifier will be set to TSval on egress and TSecr
* on ingress, sport and dport will be set to the TCP source and dest ports,
* respectively, and 0 will be returned. On failure, -1 will be returned.
*/
static int parse_tcp_identifier(struct parsing_context *ctx, bool is_egress,
__be16 *sport, __be16 *dport, __u32 *identifier)
{
__u32 tsval, tsecr;
struct tcphdr *tcph;
if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
return -1;
// Do not timestamp pure ACKs
if (is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len && !tcph->syn)
return -1;
if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
return -1; //Possible TODO, fall back on seq/ack instead
*sport = tcph->source;
*dport = tcph->dest;
*identifier = is_egress ? tsval : tsecr;
return 0;
}
/*
* Attempts to parse the packet limited by the data and data_end pointers,
* to retrieve a protocol dependent packet identifier. If successful, the
* pointed-to p_id will be filled with parsed information from the packet,
* and 0 will be returned. On failure, -1 will be returned.
* If is_egress, saddr and daddr will match the source and destination of the
* packet, respectively, and identifier will be set to the identifier for an
* outgoing packet. Otherwise, saddr and daddr will be swapped (matching the
* destination and source of the packet, respectively), and identifier will
* be set to the identifier of a response.
*/
static int parse_packet_identifier(struct parsing_context *ctx, bool is_egress,
struct packet_id *p_id)
{
int proto, err;
struct ethhdr *eth;
struct iphdr *iph;
struct ipv6hdr *ip6h;
struct flow_address *saddr, *daddr;
// Switch saddr <--> daddr on ingress to match egress
if (is_egress) {
saddr = &p_id->flow.saddr;
daddr = &p_id->flow.daddr;
} else {
saddr = &p_id->flow.daddr;
daddr = &p_id->flow.saddr;
}
proto = parse_ethhdr(&ctx->nh, ctx->data_end, &eth);
// Parse IPv4/6 header
if (proto == bpf_htons(ETH_P_IP)) {
p_id->flow.ipv = AF_INET;
proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
} else if (proto == bpf_htons(ETH_P_IPV6)) {
p_id->flow.ipv = AF_INET6;
proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
} else {
return -1;
}
// Add new protocols here
if (proto == IPPROTO_TCP) {
err = parse_tcp_identifier(ctx, is_egress, &saddr->port,
&daddr->port, &p_id->identifier);
if (err)
return -1;
} else {
return -1;
}
// Successfully parsed packet identifier - fill in IP-addresses and return
if (p_id->flow.ipv == AF_INET) {
map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
} else { // IPv6
saddr->ip = ip6h->saddr;
daddr->ip = ip6h->daddr;
}
return 0;
}
#endif

pping/pping_kern.c Normal file (361 lines)
View File

@@ -0,0 +1,361 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <stdbool.h>
// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
#ifdef IPV6_EXT_MAX_CHAIN
#undef IPV6_EXT_MAX_CHAIN
#endif
#define IPV6_EXT_MAX_CHAIN 3
#include <xdp/parsing_helpers.h>
#include "pping.h"
#define AF_INET 2
#define AF_INET6 10
#define MAX_TCP_OPTIONS 10
/*
 * This struct keeps track of the data and data_end pointers from the xdp_md or
 * __skb_buff contexts, as well as the current parse position, kept in nh.
 * Additionally, it also keeps the length of the entire packet, which together
 * with the other members can be used to determine e.g. how much data each
 * header encloses.
 */
struct parsing_context {
	void *data;           //Start of eth hdr
	void *data_end;       //End of safely accessible area
	struct hdr_cursor nh; //Position to parse next
	__u32 pkt_len;        //Full packet length (headers+data)
	bool is_egress;       //Is packet on egress or ingress?
};
char _license[] SEC("license") = "GPL";
// Global config struct - set from userspace
static volatile const struct bpf_config config = {};
// Map definitions
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, struct packet_id);
__type(value, __u64);
__uint(max_entries, 16384);
} packet_ts SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, struct network_tuple);
__type(value, struct flow_state);
__uint(max_entries, 16384);
} flow_state SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} rtt_events SEC(".maps");
// Help functions
/*
* Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
*/
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
{
__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
ipv6->in6_u.u6_addr32[3] = ipv4;
}
/*
* Parses the TSval and TSecr values from the TCP options field. If successful
* the TSval and TSecr values will be stored at tsval and tsecr (in network
* byte order).
* Returns 0 if successful and -1 on failure
*/
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
__u32 *tsecr)
{
int len = tcph->doff << 2;
void *opt_end = (void *)tcph + len;
__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
__u8 i, opt;
volatile __u8
	opt_size; // volatile seems to ensure it's always read off the stack as a u8
if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
return -1;
#pragma unroll //temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
if (pos + 1 > opt_end || pos + 1 > data_end)
return -1;
opt = *pos;
if (opt == 0) // Reached end of TCP options
return -1;
if (opt == 1) { // TCP NOP option - advance one byte
pos++;
continue;
}
// Option > 1, should have option size
if (pos + 2 > opt_end || pos + 2 > data_end)
return -1;
opt_size = *(pos + 1);
if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
return -1;
// Option-kind is TCP timestamp (yey!)
if (opt == 8 && opt_size == 10) {
if (pos + 10 > opt_end || pos + 10 > data_end)
return -1;
*tsval = *(__u32 *)(pos + 2);
*tsecr = *(__u32 *)(pos + 6);
return 0;
}
// Some other TCP option - advance option-length bytes
pos += opt_size;
}
return -1;
}
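/* Illustration (not part of the original source): the options region of a
 * TCP header as parse_tcp_ts() above would encounter it, here with two NOPs
 * for alignment followed by the timestamp option (kind = 8, length = 10): */
static const __u8 example_ts_opts[12] = {
	1, 1,                   /* NOP, NOP - single-byte padding options */
	8, 10,                  /* kind = 8 (timestamp), option length = 10 */
	0x12, 0x34, 0x56, 0x78, /* TSval, network byte order */
	0x9a, 0xbc, 0xde, 0xf0, /* TSecr, network byte order */
};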
/*
* Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
* option. If successful, identifier will be set to TSval on egress and TSecr
* on ingress, sport and dport will be set to the TCP source and dest ports,
* respectively, and 0 will be returned. On failure, -1 will be
* returned. Additionally, if the connection is closing (FIN or RST flag), sets
* flow_closing to true.
*/
static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
__be16 *dport, bool *flow_closing,
__u32 *identifier)
{
__u32 tsval, tsecr;
struct tcphdr *tcph;
if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
return -1;
// Check if connection is closing
if (tcph->fin || tcph->rst) {
*flow_closing = true;
/* bpf_printk("Detected connection closing on %d\n", */
/* ctx->is_egress); //Upsets verifier? */
}
// Do not timestamp pure ACKs
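// (nh.pos - data >= pkt_len means the entire packet was consumed by
// headers, i.e. there is no payload, so a non-SYN packet here is a pure ACK)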
if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
!tcph->syn)
return -1;
if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
return -1; //Possible TODO, fall back on seq/ack instead
*sport = tcph->source;
*dport = tcph->dest;
*identifier = ctx->is_egress ? tsval : tsecr;
return 0;
}
/*
* Attempts to parse the packet limited by the data and data_end pointers,
* to retrieve a protocol dependent packet identifier. If successful, the
* pointed-to p_id will be filled with parsed information from the packet,
* and 0 will be returned. On failure, -1 will be returned.
* On egress, saddr and daddr will match the source and destination of the
* packet, respectively, and identifier will be set to the identifier for an
* outgoing packet. Otherwise, saddr and daddr will be swapped (matching the
* destination and source of the packet, respectively), and identifier will
* be set to the identifier of a response.
*/
static int parse_packet_identifier(struct parsing_context *ctx,
struct packet_id *p_id, bool *flow_closing)
{
int proto, err;
struct ethhdr *eth;
struct iphdr *iph;
struct ipv6hdr *ip6h;
struct flow_address *saddr, *daddr;
// Switch saddr <--> daddr on ingress to match egress
if (ctx->is_egress) {
saddr = &p_id->flow.saddr;
daddr = &p_id->flow.daddr;
} else {
saddr = &p_id->flow.daddr;
daddr = &p_id->flow.saddr;
}
proto = parse_ethhdr(&ctx->nh, ctx->data_end, &eth);
// Parse IPv4/6 header
if (proto == bpf_htons(ETH_P_IP)) {
p_id->flow.ipv = AF_INET;
proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
} else if (proto == bpf_htons(ETH_P_IPV6)) {
p_id->flow.ipv = AF_INET6;
proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
} else {
return -1;
}
// Add new protocols here
if (proto == IPPROTO_TCP) {
err = parse_tcp_identifier(ctx, &saddr->port, &daddr->port,
flow_closing, &p_id->identifier);
if (err)
return -1;
} else {
return -1;
}
// Successfully parsed packet identifier - fill in IP-addresses and return
if (p_id->flow.ipv == AF_INET) {
map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
} else { // IPv6
saddr->ip = ip6h->saddr;
daddr->ip = ip6h->daddr;
}
return 0;
}
// Programs
// TC-BPF program for parsing the packet identifier from egress traffic and adding it to the map
SEC(EGRESS_PROG_SEC)
int pping_egress(struct __sk_buff *skb)
{
struct packet_id p_id = { 0 };
__u64 p_ts;
struct parsing_context pctx = {
.data = (void *)(long)skb->data,
.data_end = (void *)(long)skb->data_end,
.pkt_len = skb->len,
.nh = { .pos = pctx.data },
.is_egress = true,
};
bool flow_closing = false;
struct flow_state *f_state;
struct flow_state new_state = { 0 };
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
goto out;
// If the flow is closing, delete the flow state and skip creating a timestamp entry
if (flow_closing) {
bpf_map_delete_elem(&flow_state, &p_id.flow);
goto out;
}
// Check flow state
f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
if (!f_state) { // No previous state - attempt to create it
bpf_map_update_elem(&flow_state, &p_id.flow, &new_state,
BPF_NOEXIST);
f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
if (!f_state)
goto out;
}
// Check if identifier is new
/* The gap between checking and updating last_id may cause concurrency
* issues where multiple packets may simultaneously think they are the
* first with a new identifier. As long as all of the identifiers are
* the same though, only one should be able to create a timestamp entry.
* A bigger issue is that older identifiers (for example due to
* out-of-order packets) may pass this check and update the current
* identifier to an old one. This means that both the packet with the
* old identifier itself as well as the next packet with the current
* identifier may be considered packets with new identifiers (even if
* both have been seen before). For TCP timestamps this could be
* prevented by changing the check to '>=' instead, but it may not be
* suitable for other protocols, such as QUIC and its spinbit.
*
* For now, just hope that the rate limit saves us from creating an
* incorrect timestamp. That may however also fail, either because it
* happens at a time when it is not limited by the rate sampling, or
* because the rate check itself fails due to concurrency issues.
*/
if (f_state->last_id == p_id.identifier)
goto out;
f_state->last_id = p_id.identifier;
// Check rate-limit
/*
* The window between checking and updating last_timestamp may cause
* concurrency issues, where multiple packets simultaneously pass the
* rate limit. However, as long as they have the same identifier, only
* a single timestamp entry should successfully be created.
*/
p_ts = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
if (p_ts < f_state->last_timestamp ||
p_ts - f_state->last_timestamp < config.rate_limit)
goto out;
/*
* Update last_timestamp on each timestamping attempt, even if creating
* the timestamp entry fails (due to the map being full). This should
* make the competition for the next available map slot somewhat fairer
* between heavy and sparse flows.
*/
f_state->last_timestamp = p_ts;
bpf_map_update_elem(&packet_ts, &p_id, &p_ts, BPF_NOEXIST);
out:
return BPF_OK;
}
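/*
 * Sketch (not part of the original source) of the '>=' style check discussed
 * above: treating a TCP timestamp identifier as a 32-bit serial number gives
 * a wraparound-safe "strictly newer" test, which avoids re-arming on old,
 * out-of-order identifiers. Note that pping as written stores TSval in
 * network byte order, so both values would need a bpf_ntohl() first.
 */
static bool identifier_is_newer(__u32 cur, __u32 last)
{
	/* true iff cur comes after last, modulo 2^32 */
	return (__s32)(cur - last) > 0;
}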
// XDP program for parsing the identifier in ingress traffic and checking for a match in the map
SEC(INGRESS_PROG_SEC)
int pping_ingress(struct xdp_md *ctx)
{
struct packet_id p_id = { 0 };
__u64 *p_ts;
struct rtt_event event = { 0 };
struct parsing_context pctx = {
.data = (void *)(long)ctx->data,
.data_end = (void *)(long)ctx->data_end,
.pkt_len = pctx.data_end - pctx.data,
.nh = { .pos = pctx.data },
.is_egress = false,
};
bool flow_closing = false;
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
goto out;
// Delete flow state, but allow a final attempt at RTT calculation
if (flow_closing)
bpf_map_delete_elem(&flow_state, &p_id.flow);
p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
if (!p_ts)
goto out;
event.rtt = bpf_ktime_get_ns() - *p_ts;
/*
* Attempt to delete timestamp entry as soon as RTT is calculated.
* There is however a potential concurrency issue, where multiple packets
* may match against the identifier before it can be deleted.
*/
bpf_map_delete_elem(&packet_ts, &p_id);
__builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple));
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event,
sizeof(event));
out:
return XDP_PASS;
}
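/*
 * For reference, a sketch of attaching both programs with plain libbpf (the
 * pping loader in this commit instead uses its own xdp_attach() helper and
 * iproute2 for tc; bpf_tc_hook_create()/bpf_tc_attach() require a newer
 * libbpf than this commit targets, so treat this as an assumption-laden
 * alternative, not the project's actual loading path):
 */
static int attach_sketch(struct bpf_object *obj, int ifindex)
{
	struct bpf_tc_hook hook = { .sz = sizeof(hook), .ifindex = ifindex,
				    .attach_point = BPF_TC_EGRESS };
	struct bpf_tc_opts opts = { .sz = sizeof(opts) };
	int err;

	err = bpf_set_link_xdp_fd(ifindex,
				  bpf_program__fd(bpf_object__find_program_by_title(
					  obj, INGRESS_PROG_SEC)),
				  0);
	if (err)
		return err;

	err = bpf_tc_hook_create(&hook); /* -EEXIST if clsact already exists */
	if (err && err != -EEXIST)
		return err;

	opts.prog_fd = bpf_program__fd(
		bpf_object__find_program_by_title(obj, EGRESS_PROG_SEC));
	return bpf_tc_attach(&hook, &opts);
}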

View File

@@ -1,51 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <iproute2/bpf_elf.h>
#include "pping.h"
#include "pping_helpers.h"
char _license[] SEC("license") = "GPL";
#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(struct packet_id));
__uint(value_size, sizeof(struct packet_timestamp));
__uint(max_entries, 16384);
__uint(pinning, LIBBPF_PIN_BY_NAME);
} ts_start SEC(".maps");
#else
struct bpf_elf_map SEC("maps") ts_start = {
.type = BPF_MAP_TYPE_HASH,
.size_key = sizeof(struct packet_id),
.size_value = sizeof(struct packet_timestamp),
.max_elem = 16384,
.pinning = PIN_GLOBAL_NS,
};
#endif
// TC-BPF program for parsing the packet identifier from egress traffic and adding it to the map
SEC(TCBPF_PROG_SEC)
int tc_bpf_prog_egress(struct __sk_buff *skb)
{
struct packet_id p_id = { 0 };
struct packet_timestamp p_ts = { 0 };
struct parsing_context pctx = {
.data = (void *)(long)skb->data,
.data_end = (void *)(long)skb->data_end,
.pkt_len = skb->len,
.nh = { .pos = pctx.data },
};
if (parse_packet_identifier(&pctx, true, &p_id) < 0)
goto end;
p_ts.timestamp = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
bpf_map_update_elem(&ts_start, &p_id, &p_ts, BPF_NOEXIST);
end:
return BPF_OK;
}

View File

@@ -1,63 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "pping.h"
#include "pping_helpers.h"
char _license[] SEC("license") = "GPL";
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(struct packet_id));
__uint(value_size, sizeof(struct packet_timestamp));
__uint(max_entries, 16384);
__uint(pinning, LIBBPF_PIN_BY_NAME);
} ts_start SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} rtt_events SEC(".maps");
// XDP program for parsing the identifier in ingress traffic and checking for a match in the map
SEC(XDP_PROG_SEC)
int xdp_prog_ingress(struct xdp_md *ctx)
{
struct packet_id p_id = { 0 };
struct packet_timestamp *p_ts;
struct rtt_event event = { 0 };
struct parsing_context pctx = {
.data = (void *)(long)ctx->data,
.data_end = (void *)(long)ctx->data_end,
.pkt_len = pctx.data_end - pctx.data,
.nh = { .pos = pctx.data },
};
if (parse_packet_identifier(&pctx, false, &p_id) < 0)
goto end;
p_ts = bpf_map_lookup_elem(&ts_start, &p_id);
// Only calculate RTT for the first packet with a matching identifier
if (p_ts && p_ts->used == 0) {
/*
* As used is not set atomically with the lookup, we could
* potentially have multiple "first" packets (on different
* CPUs), but all those should then also have very similar RTT,
* so don't consider it a significant issue
*/
p_ts->used = 1;
// TODO - Optional delete of entry (if identifier is guaranteed unique)
__builtin_memcpy(&event.flow, &p_id.flow,
sizeof(struct network_tuple));
event.rtt = bpf_ktime_get_ns() - p_ts->timestamp;
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU,
&event, sizeof(event));
}
end:
return XDP_PASS;
}