mirror of
https://github.com/xdp-project/bpf-examples.git
synced 2024-05-06 15:54:53 +00:00
Merge pull request #32 from simosund/pping_improve_capabilities
PPing core improvements
189 pping/README.md
@@ -6,28 +6,36 @@ TC-BPF (on egress) for the packet capture logic.
|
||||
## Simple description
|
||||
Passive Ping (PPing) is a simple tool for passively measuring per-flow RTTs. It
|
||||
can be used on endhosts as well as any (BPF-capable Linux) device which can see
|
||||
both directions of the traffic (ex router or middlebox). Currently it only works
|
||||
for TCP traffic which uses the TCP timestamp option, but could be extended to
|
||||
also work with for example TCP seq/ACK numbers, the QUIC spinbit and ICMP
|
||||
echo-reply messages. See the [TODO-list](./TODO.md) for more potential features
|
||||
(which may or may not ever get implemented).
|
||||
both directions of the traffic (ex router or middlebox). Currently it works for
|
||||
TCP traffic which uses the TCP timestamp option and ICMP echo messages, but
|
||||
could be extended to also work with for example TCP seq/ACK numbers, the QUIC
|
||||
spinbit and DNS queries. See the [TODO-list](./TODO.md) for more potential
|
||||
features (which may or may not ever get implemented).
|
||||
|
||||
The fundamental logic of pping is to timestamp a pseudo-unique identifier for
|
||||
outgoing packets, and then look for matches in the incoming packets. If a match
|
||||
is found, the RTT is simply calculated as the time difference between the
|
||||
current time and the stored timestamp.
|
||||
packets, and then look for matches in the reply packets. If a match is found,
|
||||
the RTT is simply calculated as the time difference between the current time and
|
||||
the stored timestamp.
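A minimal sketch of that loop is shown below (the `map_insert`,
`map_lookup_and_delete` and `now_ns` helpers are hypothetical stand-ins for the
BPF hash-map operations and `bpf_ktime_get_ns()` used by the real programs):

```c
#include <stdint.h>

struct packet_id; /* flow + pseudo-unique identifier, see pping.h */

/* Hypothetical stand-ins for the BPF map helpers and clock */
extern int map_insert(const struct packet_id *key, uint64_t timestamp_ns);
extern int map_lookup_and_delete(const struct packet_id *key, uint64_t *timestamp_ns);
extern uint64_t now_ns(void);

static void on_outgoing_packet(const struct packet_id *pid)
{
	map_insert(pid, now_ns()); /* remember when this identifier was sent */
}

static int on_reply_packet(const struct packet_id *reply_pid, uint64_t *rtt_ns)
{
	uint64_t sent_ns;

	if (map_lookup_and_delete(reply_pid, &sent_ns) < 0)
		return -1; /* no timestamped outgoing packet to match against */

	*rtt_ns = now_ns() - sent_ns; /* RTT = current time - stored timestamp */
	return 0;
}
```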
|
||||
|
||||
This tool, just as Kathie's original pping implementation, uses TCP timestamps
|
||||
as identifiers. For outgoing packets, the TSval (which is a timestamp in and of
|
||||
itself) is timestamped. Incoming packets are then parsed for the TSecr, which
|
||||
are the echoed TSval values from the receiver. The TCP timestamps are not
|
||||
necessarily unique for every packet (they have a limited update frequency,
|
||||
appears to be 1000 Hz for modern Linux systems), so only the first instance of
|
||||
an identifier is timestamped, and matched against the first incoming packet with
|
||||
the identifier. The mechanism to ensure only the first packet is timestamped and
|
||||
matched differs from the one in Kathie's pping, and is further described in
|
||||
as identifiers for TCP traffic. The TSval (which is a timestamp in and of
|
||||
itself) is used as an identifier and timestamped. Reply packets in the reverse
|
||||
flow are then parsed for the TSecr, which are the echoed TSval values from the
|
||||
receiver. The TCP timestamps are not necessarily unique for every packet (they
|
||||
have a limited update frequency, which appears to be 1000 Hz for modern Linux
|
||||
systems), so only the first instance of an identifier is timestamped, and
|
||||
matched against the first incoming packet with a matching reply identifier. The
|
||||
mechanism to ensure only the first packet is timestamped and matched differs
|
||||
from the one in Kathie's pping, and is further described in
|
||||
[SAMPLING_DESIGN](./SAMPLING_DESIGN.md).
|
||||
|
||||
For ICMP echo, it uses the echo identifier as the port numbers, and the echo
sequence number as the identifier to match against. Linux systems will typically
use different echo identifiers for different instances of ping, and thus each
ping instance will be recognized as a separate flow. Windows systems typically
use a static echo identifier, and thus all instances of ping originating from a
particular Windows host towards the same target host will be considered a single
flow.
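As a rough sketch of that mapping (illustrative only, reusing the `packet_id`
naming from `pping.h` further down in this diff rather than the actual
`parse_icmp_identifier` code):

```c
/* Illustrative only: fold an ICMP echo packet into the same flow + identifier
 * model used for TCP. The echo identifier acts as the "port" and the echo
 * sequence number as the packet identifier. */
struct icmp_echo {
	__u16 id;       /* echo identifier (per ping instance on Linux) */
	__u16 sequence; /* echo sequence number, increases for each probe */
};

static void icmp_echo_to_packet_id(const struct icmp_echo *echo,
				   struct packet_id *pid)
{
	pid->flow.saddr.port = echo->id;  /* same "port" for both directions */
	pid->flow.daddr.port = echo->id;
	pid->identifier = echo->sequence; /* matched between request and reply */
}
```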
|
||||
|
||||
## Output formats
|
||||
pping currently supports 3 different formats, *standard*, *ppviz* and *json*. In
|
||||
general, the output consists of two different types of events, flow-events which
|
||||
@@ -41,12 +49,12 @@ single line per event.
|
||||
|
||||
An example of the format is provided below:
|
||||
```shell
|
||||
16:00:46.142279766 10.11.1.1:5201+10.11.1.2:59528 opening due to SYN-ACK from src
|
||||
16:00:46.147705205 5.425439 ms 5.425439 ms 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:47.148905125 5.261430 ms 5.261430 ms 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:48.151666385 5.972284 ms 5.261430 ms 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:49.152489316 6.017589 ms 5.261430 ms 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:49.878508114 10.11.1.1:5201+10.11.1.2:59528 closing due to RST from dest
|
||||
16:00:46.142279766 TCP 10.11.1.1:5201+10.11.1.2:59528 opening due to SYN-ACK from dest
|
||||
16:00:46.147705205 5.425439 ms 5.425439 ms TCP 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:47.148905125 5.261430 ms 5.261430 ms TCP 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:48.151666385 5.972284 ms 5.261430 ms TCP 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:49.152489316 6.017589 ms 5.261430 ms TCP 10.11.1.1:5201+10.11.1.2:59528
|
||||
16:00:49.878508114 TCP 10.11.1.1:5201+10.11.1.2:59528 closing due to RST from dest
|
||||
```
|
||||
|
||||
### ppviz format
|
||||
@@ -89,7 +97,7 @@ An example of a (pretty-printed) flow-event is provided below:
|
||||
"protocol": "TCP",
|
||||
"flow_event": "opening",
|
||||
"reason": "SYN-ACK",
|
||||
"triggered_by": "src"
|
||||
"triggered_by": "dest"
|
||||
}
|
||||
```
|
||||
|
||||
@@ -107,7 +115,8 @@ An example of a (pretty-printed) RTT-even is provided below:
|
||||
"sent_packets": 9393,
|
||||
"sent_bytes": 492457296,
|
||||
"rec_packets": 5922,
|
||||
"rec_bytes": 37
|
||||
"rec_bytes": 37,
|
||||
"match_on_egress": false
|
||||
}
|
||||
```
|
||||
|
||||
@@ -116,22 +125,20 @@ An example of a (pretty-printed) RTT-even is provided below:
|
||||
|
||||
### Files:
|
||||
- **pping.c:** Userspace program that loads and attaches the BPF programs, pulls
|
||||
the perf-buffer `rtt_events` to print out RTT messages and periodically cleans
|
||||
the perf-buffer `events` to print out RTT messages and periodically cleans
|
||||
up the hash-maps from old entries. Also passes user options to the BPF
|
||||
programs by setting a "global variable" (stored in the programs .rodata
|
||||
section).
|
||||
- **pping_kern.c:** Contains the BPF programs that are loaded on tc (egress) and
|
||||
XDP (ingress), as well as several common functions, a global constant `config`
|
||||
(set from userspace) and map definitions. The tc program `pping_egress()`
|
||||
parses outgoing packets for identifiers. If an identifier is found and the
|
||||
sampling strategy allows it, a timestamp for the packet is created in
|
||||
`packet_ts`. The XDP program `pping_ingress()` parses incoming packets for an
|
||||
identifier. If found, it looks up the `packet_ts` map for a match on the
|
||||
reverse flow (to match source/dest on egress). If there is a match, it
|
||||
calculates the RTT from the stored timestamp and deletes the entry. The
|
||||
calculated RTT (together with the flow-tuple) is pushed to the perf-buffer
|
||||
`events`. Both `pping_egress()` and `pping_ingress` can also push flow-events
|
||||
to the `events` buffer.
|
||||
- **pping_kern.c:** Contains the BPF programs that are loaded on egress (tc) and
|
||||
ingress (XDP or tc), as well as several common functions, a global constant
|
||||
`config` (set from userspace) and map definitions. Essentially the same pping
|
||||
program is loaded on both ingress and egress. All packets are parsed for both
|
||||
an identifier that can be used to create a timestamp entry `packet_ts`, and a
|
||||
reply identifier that can be used to match the packet with a previously
|
||||
timestamped one in the reverse flow. If a match is found, an RTT is calculated
|
||||
and an RTT-event is pushed to userspace through the perf-buffer `events`. For
|
||||
each packet with a valid identifier, the program also keeps track of and
|
||||
updates the state of the flow and the reverse flow, stored in the `flow_state` map.
|
||||
- **pping.h:** Common header file included by `pping.c` and
|
||||
`pping_kern.c`. Contains some common structs used by both (which are part of the
|
||||
maps).
|
||||
@@ -139,113 +146,15 @@ An example of a (pretty-printed) RTT-even is provided below:
|
||||
### BPF Maps:
|
||||
- **flow_state:** A hash-map storing some basic state for each flow, such as the
|
||||
last seen identifier for the flow and when the last timestamp entry for the
|
||||
flow was created. Entries are created by `pping_egress()`, and can be updated
|
||||
or deleted by both `pping_egress()` and `pping_ingress()`. Leftover entries
|
||||
are eventually removed by `pping.c`.
|
||||
flow was created. Entries are created, updated and deleted by the BPF pping
|
||||
programs. Leftover entries are eventually removed by userspace (`pping.c`).
|
||||
- **packet_ts:** A hash-map storing a timestamp for a specific packet
|
||||
identifier. Entries are created by `pping_egress()` and removed by
|
||||
`pping_ingress()` if a match is found. Leftover entries are eventually removed
|
||||
by `pping.c`.
|
||||
identifier. Entries are created by the BPF pping program if a valid identifier
|
||||
is found, and removed if a match is found. Leftover entries are eventually
|
||||
removed by userspace (`pping.c`).
|
||||
- **events:** A perf-buffer used by the BPF programs to push flow or RTT events
|
||||
to `pping.c`, which continuously polls the map and prints them out.
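For reference, the userspace side of this perf-buffer plumbing looks roughly
like the libbpf sketch below (simplified; `handle_event` and
`handle_missed_events` are the callbacks defined in `pping.c`, shown later in
this diff):

```c
#include <bpf/libbpf.h>

/* Minimal sketch of polling the "events" perf buffer from userspace */
static int poll_events(struct bpf_object *obj, volatile int *keep_running)
{
	struct perf_buffer *pb;
	int err;

	pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj, "events"),
			      64 /* pages */, handle_event,
			      handle_missed_events, NULL, NULL);
	err = libbpf_get_error(pb);
	if (err)
		return err;

	while (*keep_running)
		perf_buffer__poll(pb, 100 /* timeout in ms */);

	perf_buffer__free(pb);
	return 0;
}
```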
|
||||
|
||||
### A note on concurrency
|
||||
The program uses "global" (not `PERCPU`) hash maps to keep state. As the BPF
|
||||
programs need to see the global view to function properly, using `PERCPU` maps
|
||||
is not an option. The program must be able to match against stored packet
|
||||
timestamps regardless of the CPU the packets are processed on, and must also
|
||||
have a global view of the flow state in order for the sampling to work
|
||||
correctly.
|
||||
|
||||
As the BPF programs may run concurrently on different CPU cores accessing these
|
||||
global hash maps, this may result in some concurrency issues. In practice, I do
|
||||
not believe these will occur particularly often, as I'm under the impression
|
||||
that packets from the same flow will typically be processed by the same
|
||||
CPU. Furthermore, most of the concurrency issues will not be that problematic
|
||||
even if they do occur. For now, I've therefore left these concurrency issues
|
||||
unattended, even if some of them could be avoided with atomic operations and/or
|
||||
spinlocks, in order to keep things simple and not hurt performance.
|
||||
|
||||
The (known) potential concurrency issues are:
|
||||
|
||||
#### Tracking last seen identifier
|
||||
The tc/egress program keeps track of the last seen outgoing identifier for each
|
||||
flow, by storing it in the `flow_state` map. This is done to detect the first
|
||||
packet with a new identifier. If multiple packets are processed concurrently,
|
||||
several of them could potentially detect themselves as being first with the same
|
||||
identifier (which only matters if they also pass rate-limit check as well),
|
||||
alternatively if the concurrent packets have different identifiers there may be
|
||||
a lost update (but for TCP timestamps, concurrent packets would typically be
|
||||
expected to have the same timestamp).
|
||||
|
||||
A possibly more severe issue is out-of-order packets. If a packet with an old
|
||||
identifier arrives out of order, that identifier could be detected as a new
|
||||
identifier. If for example the following flow of four packets with just two
|
||||
different identifiers (id1 and id2) were to occur:
|
||||
|
||||
id1 -> id2 -> id1 -> id2
|
||||
|
||||
Then the tc/egress program would consider each of these packets to have new
|
||||
identifiers and try to create a new timestamp for each of them if the sampling
|
||||
strategy allows it. However even if the sampling strategy allows it, the
|
||||
(incorrect) creation of timestamps for id1 and id2 the second time would only be
|
||||
successful in case the first timestamps for id1 and id2 have already been
|
||||
matched against (and thus deleted). Even if that is the case, they would only
|
||||
result in reporting an incorrect RTT in case there are also new matches against
|
||||
these identifiers.
|
||||
|
||||
This issue could be avoided entirely by requiring that new-id > old-id instead
|
||||
of simply checking that new-id != old-id, as TCP timestamps should monotonically
|
||||
increase. That may however not be a suitable solution if/when we add support for
|
||||
other types of identifiers.
|
||||
|
||||
#### Rate-limiting new timestamps
|
||||
In the tc/egress program packets to timestamp are sampled by using a per-flow
|
||||
rate-limit, which is enforced by storing when the last timestamp was created in
|
||||
the `flow_state` map. If multiple packets perform this check concurrently, it's
|
||||
possible that multiple packets think they are allowed to create timestamps
|
||||
before any of them are able to update the `last_timestamp`. When they update
|
||||
`last_timestamp` it might also be slightly incorrect, however if they are
|
||||
processed concurrently then they should also generate very similar timestamps.
|
||||
|
||||
If the packets have different identifiers, (which would typically not be
|
||||
expected for concurrent TCP timestamps), then this would allow some packets to
|
||||
bypass the rate-limit. By bypassing the rate-limit, the flow would use up some
|
||||
additional map space and report some additional RTT(s) more than expected
|
||||
(however the reported RTTs should still be correct).
|
||||
|
||||
If the packets have the same identifier, they must first have managed to bypass
|
||||
the previous check for unique identifiers (see [previous point](#Tracking last
|
||||
seen identifier)), and only one of them will be able to successfully store a
|
||||
timestamp entry.
|
||||
|
||||
#### Matching against stored timestamps
|
||||
The XDP/ingress program could potentially match multiple concurrent packets with
|
||||
the same identifier against a single timestamp entry in `packet_ts`, before any
|
||||
of them manage to delete the timestamp entry. This would result in multiple RTTs
|
||||
being reported for the same identifier, but if they are processed concurrently
|
||||
these RTTs should be very similar, so would mainly result in over-reporting
|
||||
rather than reporting incorrect RTTs.
|
||||
|
||||
#### Updating flow statistics
|
||||
Both the tc/egress and XDP/ingress programs will try to update some flow
|
||||
statistics each time they successfully parse a packet with an
|
||||
identifier. Specifically, they'll update the number of packets and bytes
|
||||
sent/received. This is not done in an atomic fashion, so there could potentially
|
||||
be some lost updates resulting in an underestimate.
|
||||
|
||||
Furthermore, whenever the XDP/ingress program calculates an RTT, it will check
|
||||
if this is the lowest RTT seen so far for the flow. If multiple RTTs are
|
||||
calculated concurrently, then several could pass this check concurrently and
|
||||
there may be a lost update. It should only be possible for multiple RTTs to be
|
||||
calculated concurrently in case either the [timestamp rate-limit was
|
||||
bypassed](#Rate-limiting new timestamps) or [multiple packets managed to match
|
||||
against the same timestamp](#Matching against stored timestamps).
|
||||
|
||||
It's worth noting that with sampling the reported minimum-RTT is only an
|
||||
estimate anyways (may never calculate RTT for packet with the true minimum
|
||||
RTT). And even without sampling there is some inherent sampling due to TCP
|
||||
timestamps only being updated at a limited rate (1000 Hz).
|
||||
|
||||
## Similar projects
|
||||
Passively measuring the RTT for TCP traffic is not a novel concept, and there
|
||||
|
185 pping/TODO.md
@@ -6,23 +6,26 @@
|
||||
- Timestamping pure ACKs may lead to erroneous RTTs (ex. delay
|
||||
between application attempting to send data being recognized as
|
||||
an RTT)
|
||||
- [x] Skip non-ACKs for ingress
|
||||
- The echoed TSecr is not valid if the ACK-flag is not set
|
||||
- [ ] Add fallback to SEQ/ACK in case of no timestamp?
|
||||
- Some machines may not use TCP timestamps (either not supported
|
||||
at all, or disabled as in ex. Windows 10)
|
||||
If one only considers SEQ/ACK (and doesn't check for SACK
|
||||
options), could result in ex. delay from retransmission being
|
||||
included in RTT
|
||||
- [ ] ICMP (ex Echo/Reply)
|
||||
- [x] ICMP (ex Echo/Reply)
|
||||
- [ ] QUIC (based on spinbit)
|
||||
- [ ] DNS queries
|
||||
|
||||
## General pping
|
||||
- [x] Add sampling so that RTT is not calculated for every packet
|
||||
(with unique value) for large flows
|
||||
- [ ] Allow short bursts to bypass sampling in order to handle
|
||||
delayed ACKs
|
||||
delayed ACKs, reordered or lost packets etc.
|
||||
- [x] Keep some per-flow state
|
||||
- Will likely be needed for the sampling
|
||||
- [ ] Could potentially include keeping track of average RTT, which
|
||||
- [x] Could potentially include keeping track of average RTT, which
|
||||
may be useful for some decisions (ex. how often to sample,
|
||||
when entry can be removed etc)
|
||||
- [x] Could potentially include keeping track of minimum RTT (as
|
||||
@@ -38,8 +41,6 @@
|
||||
unnecessarily large, which slows down the cleaning and may block
|
||||
new entries
|
||||
- [ ] Use libxdp to load XDP program
|
||||
- [ ] Add support for other hooks
|
||||
- Ex TC-BFP on ingress instead of XDP?
|
||||
|
||||
## Done
|
||||
- [x] Clean up commits and add signed-off-by tags
|
||||
@@ -60,3 +61,177 @@
|
||||
so that tools such as [ppviz](https://github.com/pollere/ppviz)
|
||||
works for both pping implementations.
|
||||
- [x] Add timestamps to output (as original pping)
|
||||
- [x] Add support for other hooks
|
||||
- TC-BFP on ingress instead of XDP
|
||||
|
||||
# Potential issues
|
||||
## Limited information in different output formats
|
||||
The ppviz format is a bit limited in what information it can
|
||||
include. One of these limitations is that it does not include any
|
||||
protocol information as it was designed with only TCP in mind. If
using PPing with protocols other than TCP, it may therefore not be
possible to distinguish flows with different protocols. PPing will
|
||||
therefore emit a warning if attempting to use the ppviz format with
|
||||
protocols other than TCP, but will still allow it.
|
||||
|
||||
Another piece of information tracked by PPing which can't be included
|
||||
in the ppviz format is if the calculated RTT includes the local
|
||||
processing delay or not (that is, it was timestamped on ingress and
|
||||
matched on egress instead of being timestamped on egress and matched
|
||||
on ingress). Currently this information is only included in the JSON
|
||||
format, but could potentially be added to the standard format if
|
||||
deemed important.
|
||||
|
||||
## Cannot detect end of ICMP "flow"
|
||||
ICMP is not a flow-based protocol, and therefore there is no signaling
|
||||
that the ICMP "flow" is about to close. Subsequently, there is no way
for PPing to automatically detect that an ICMP flow has stopped and
|
||||
delete its flow-state entry (and send a timely flow closing event).
|
||||
|
||||
A consequence of this is that the ICMP flow entries will stick around
|
||||
and occupy space in the flow state map until they are cleaned out by
|
||||
the periodic cleanup process. The current timeout for inactive flows
|
||||
is a very generous 5 minutes, which means a lot of short ICMP flows
|
||||
could quickly fill up the flow map and potentially block other flows
|
||||
for a long while.
|
||||
|
||||
## RTT-based sampling
|
||||
The RTT-based sampling feature means that timestamp entries may only
|
||||
be created at an interval proportional to the flow's RTT. This allows
|
||||
flows with shorter RTTs to get more frequent RTT samples than flows
|
||||
with long RTTs. However, as the flow's RTT can only be updated based on
|
||||
the calculated RTT samples, this creates a situation where the RTT's
|
||||
update rate is dependent on itself. Flows with short RTTs will update
|
||||
the RTT more often, which in turn affects how often they can update
|
||||
the RTT.
|
||||
|
||||
This mainly becomes problematic if the sampling rate is based on the
sRTT, which may grow. In that case the sRTT will generally be prone to
growing faster than it shrinks: if it starts at a low RTT it will
quickly be updated to higher RTTs, but at a high RTT it will take
longer for it to decrease to a lower RTT again.
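A minimal sketch of the check this describes (an assumed simplification; the
real logic in `pping_kern.c` uses the `fixpoint64` helpers from `pping.h`):

```c
#include <stdint.h>

/* Allow a new timestamp only if at least rtt_rate * flow_rtt nanoseconds have
 * passed since the last one. rtt_rate is a 16.16 fixed-point factor, so a
 * value of 1.0 (65536) aims for roughly one RTT sample per RTT. */
static int rtt_rate_allows_timestamp(uint64_t now_ns, uint64_t last_ts_ns,
				     uint64_t flow_rtt_ns,
				     uint64_t rtt_rate_fixpoint)
{
	uint64_t interval_ns = (rtt_rate_fixpoint * flow_rtt_ns) >> 16;

	return now_ns - last_ts_ns >= interval_ns;
}
```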
|
||||
|
||||
## Concurrency issues
|
||||
|
||||
The program uses "global" (not `PERCPU`) hash maps to keep state. As
|
||||
the BPF programs need to see the global view to function properly,
|
||||
using `PERCPU` maps is not an option. The program must be able to
|
||||
match against stored packet timestamps regardless of the CPU the
|
||||
packets are processed on, and must also have a global view of the flow
|
||||
state in order for the sampling to work correctly.
|
||||
|
||||
As the BPF programs may run concurrently on different CPU cores
|
||||
accessing these global hash maps, this may result in some concurrency
|
||||
issues. In practice, I do not believe these will occur particularly
|
||||
often as the hash-map entries are per-flow, and I'm under the
|
||||
impression that packets from the same flow will typically be processed
|
||||
by the same CPU. Furthermore, most of the concurrency issues will not
|
||||
be that problematic even if they do occur. For now, I've therefore
|
||||
left these concurrency issues unattended, even if some of them could
|
||||
be avoided with atomic operations and/or spinlocks, in order to keep
|
||||
things simple and not hurt performance.
|
||||
|
||||
The (known) potential concurrency issues are:
|
||||
|
||||
### Tracking last seen identifier
|
||||
The tc/egress program keeps track of the last seen outgoing identifier
|
||||
for each flow, by storing it in the `flow_state` map. This is done to
|
||||
detect the first packet with a new identifier. If multiple packets are
|
||||
processed concurrently, several of them could potentially detect
|
||||
themselves as being first with the same identifier (which only matters
|
||||
if they also pass the rate-limit check), alternatively if the
|
||||
concurrent packets have different identifiers there may be a lost
|
||||
update (but for TCP timestamps, concurrent packets would typically be
|
||||
expected to have the same timestamp).
|
||||
|
||||
A possibly more severe issue is out-of-order packets. If a packet with
|
||||
an old identifier arrives out of order, that identifier could be
|
||||
detected as a new identifier. If for example the following flow of
|
||||
four packets with just two different identifiers (id1 and id2) were to
|
||||
occur:
|
||||
|
||||
id1 -> id2 -> id1 -> id2
|
||||
|
||||
Then the tc/egress program would consider each of these packets to
|
||||
have new identifiers and try to create a new timestamp for each of
|
||||
them if the sampling strategy allows it. However even if the sampling
|
||||
strategy allows it, the (incorrect) creation of timestamps for id1 and
|
||||
id2 the second time would only be successful in case the first
|
||||
timestamps for id1 and id2 have already been matched against (and thus
|
||||
deleted). Even if that is the case, they would only result in
|
||||
reporting an incorrect RTT in case there are also new matches against
|
||||
these identifiers.
|
||||
|
||||
This issue could be avoided entirely by requiring that new-id > old-id
|
||||
instead of simply checking that new-id != old-id, as TCP timestamps
|
||||
should monotonically increase. That may however not be a suitable
|
||||
solution for other types of identifiers.
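In code, the difference between the two checks would be something like the
following sketch (not the exact `pping_kern.c` logic):

```c
#include <linux/types.h>

/* Current behaviour: any change of identifier counts as "new" */
static int is_new_identifier_any(__u32 new_id, __u32 old_id)
{
	return new_id != old_id;
}

/* Stricter alternative: only accept increasing identifiers (serial-number
 * comparison to handle wrap-around). This rejects reordered old identifiers,
 * but assumes identifiers grow monotonically, which holds for TCP timestamps
 * but not necessarily for other identifier types. */
static int is_new_identifier_monotonic(__u32 new_id, __u32 old_id)
{
	return (__s32)(new_id - old_id) > 0;
}
```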
|
||||
|
||||
### Rate-limiting new timestamps
|
||||
In the tc/egress program packets to timestamp are sampled by using a
|
||||
per-flow rate-limit, which is enforced by storing when the last
|
||||
timestamp was created in the `flow_state` map. If multiple packets
|
||||
perform this check concurrently, it's possible that multiple packets
|
||||
think they are allowed to create timestamps before any of them are
|
||||
able to update the `last_timestamp`. When they update `last_timestamp`
|
||||
it might also be slightly incorrect, however if they are processed
|
||||
concurrently then they should also generate very similar timestamps.
|
||||
|
||||
If the packets have different identifiers, (which would typically not
|
||||
be expected for concurrent TCP timestamps), then this would allow some
|
||||
packets to bypass the rate-limit. By bypassing the rate-limit, the
|
||||
flow would use up some additional map space and report some additional
|
||||
RTT(s) more than expected (however the reported RTTs should still be
|
||||
correct).
|
||||
|
||||
If the packets have the same identifier, they must first have managed
|
||||
to bypass the previous check for unique identifiers (see [previous
|
||||
point](#tracking-last-seen-identifier)), and only one of them will be
|
||||
able to successfully store a timestamp entry.
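Expressed as code, the racy check-then-update pattern described above is
roughly the following sketch (using the `flow_state` fields from `pping.h`):

```c
/* The check and the update of last_timestamp are separate steps, so two CPUs
 * can both pass the check before either has updated last_timestamp. */
static int may_create_timestamp(__u64 now, struct flow_state *f_state,
				__u64 rate_limit)
{
	if (now - f_state->last_timestamp < rate_limit)
		return 0; /* too soon since the last timestamp */

	f_state->last_timestamp = now; /* not atomic with the check above */
	return 1;
}
```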
|
||||
|
||||
### Matching against stored timestamps
|
||||
The XDP/ingress program could potentially match multiple concurrent
|
||||
packets with the same identifier against a single timestamp entry in
|
||||
`packet_ts`, before any of them manage to delete the timestamp
|
||||
entry. This would result in multiple RTTs being reported for the same
|
||||
identifier, but if they are processed concurrently these RTTs should
|
||||
be very similar, so would mainly result in over-reporting rather than
|
||||
reporting incorrect RTTs.
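As a sketch of where the window is (using the standard BPF map helpers;
`report_rtt` is a hypothetical placeholder for pushing the RTT event):

```c
__u64 *sent_ts = bpf_map_lookup_elem(&packet_ts, &p_info.reply_pid);
if (sent_ts) {
	/* Several CPUs can reach this point with the same entry... */
	report_rtt(p_info.time - *sent_ts);
	/* ...before any of them removes it. The delete only succeeds once, but
	 * each concurrent reader has already reported an RTT. */
	bpf_map_delete_elem(&packet_ts, &p_info.reply_pid);
}
```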
|
||||
|
||||
### Updating flow statistics
|
||||
Both the tc/egress and XDP/ingress programs will try to update some
|
||||
flow statistics each time they successfully parse a packet with an
|
||||
identifier. Specifically, they'll update the number of packets and
|
||||
bytes sent/received. This is not done in an atomic fashion, so there
|
||||
could potentially be some lost updates, resulting in an underestimate.
|
||||
|
||||
Furthermore, whenever the XDP/ingress program calculates an RTT, it
|
||||
will check if this is the lowest RTT seen so far for the flow. If
|
||||
multiple RTTs are calculated concurrently, then several could pass
|
||||
this check concurrently and there may be a lost update. It should only
|
||||
be possible for multiple RTTs to be calculated concurrently in case
|
||||
either the [timestamp rate-limit was
|
||||
bypassed](#rate-limiting-new-timestamps) or [multiple packets managed
|
||||
to match against the same
|
||||
timestamp](#matching-against-stored-timestamps).
|
||||
|
||||
It's worth noting that with sampling the reported minimum-RTT is only
|
||||
an estimate anyway (it may never calculate an RTT for the packet with the true
|
||||
minimum RTT). And even without sampling there is some inherent
|
||||
sampling due to TCP timestamps only being updated at a limited rate
|
||||
(1000 Hz).
|
||||
|
||||
### Outputting flow opening/closing events
|
||||
A flow is not considered opened until a reply has been seen for
|
||||
it. The `flow_state` map keeps information about if the flow has been
|
||||
opened or not, which is checked and updated for each reply. The check
|
||||
and update of this information is not performed atomically, which may
|
||||
result in multiple replies thinking they are the first, emitting
|
||||
multiple flow-opened events, in case they are processed concurrently.
|
||||
|
||||
Likewise, when flows are closed it checks if the flow has been opened
|
||||
to determine if a flow closing message should be sent. If multiple
|
||||
replies are processed concurrently, it's possible one of them will
|
||||
update the flow-open information and emit a flow opening message, but
|
||||
another reply closing the flow without thinking it's ever been opened,
|
||||
thus not sending a flow closing message.
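In pseudo-C, the non-atomic check and update looks roughly like the following
(`send_flow_event` is a hypothetical placeholder):

```c
/* Two concurrently processed replies can both see has_opened == false here
 * and both emit a flow-opened event. */
if (!f_state->has_opened) {
	f_state->has_opened = true;
	send_flow_event(FLOW_EVENT_OPENING);
}
```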
|
||||
|
Binary image file not shown (54 KiB before, 45 KiB after).
201 pping/pping.c
@@ -1,6 +1,6 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
static const char *__doc__ =
|
||||
"Passive Ping - monitor flow RTT based on TCP timestamps";
|
||||
"Passive Ping - monitor flow RTT based on header inspection";
|
||||
|
||||
#include <bpf/bpf.h>
|
||||
#include <bpf/libbpf.h>
|
||||
@@ -15,11 +15,8 @@ static const char *__doc__ =
|
||||
#include <unistd.h>
|
||||
#include <getopt.h>
|
||||
#include <stdbool.h>
|
||||
#include <limits.h>
|
||||
#include <signal.h> // For detecting Ctrl-C
|
||||
#include <sys/resource.h> // For setting rlimit
|
||||
#include <sys/wait.h>
|
||||
#include <sys/stat.h>
|
||||
#include <time.h>
|
||||
#include <pthread.h>
|
||||
|
||||
@@ -51,16 +48,16 @@ enum PPING_OUTPUT_FORMAT {
|
||||
};
|
||||
|
||||
/*
|
||||
* BPF implementation of pping using libbpf
|
||||
* Uses TC-BPF for egress and XDP for ingress
|
||||
* - On egrees, packets are parsed for TCP TSval,
|
||||
* if found added to hashmap using flow+TSval as key,
|
||||
* and current time as value
|
||||
* - On ingress, packets are parsed for TCP TSecr,
|
||||
* if found looksup hashmap using reverse-flow+TSecr as key,
|
||||
* and calculates RTT as different between now map value
|
||||
* - Calculated RTTs are pushed to userspace
|
||||
* (together with the related flow) and printed out
|
||||
* BPF implementation of pping using libbpf.
|
||||
* Uses TC-BPF for egress and XDP for ingress.
|
||||
* - On egress, packets are parsed for an identifier,
* if found, added to hashmap using flow+identifier as key,
* and current time as value.
* - On ingress, packets are parsed for a reply identifier,
* if found, looks up the hashmap using reverse-flow+identifier as key,
* and calculates RTT as the difference between now and the stored timestamp.
|
||||
* - Calculated RTTs are pushed to userspace
|
||||
* (together with the related flow) and printed out.
|
||||
*/
|
||||
|
||||
// Structure to contain arguments for clean_map (for passing to pthread_create)
|
||||
@@ -94,16 +91,21 @@ struct pping_config {
|
||||
|
||||
static volatile int keep_running = 1;
|
||||
static json_writer_t *json_ctx = NULL;
|
||||
static void (*print_event_func)(void *, int, void *, __u32) = NULL;
|
||||
static void (*print_event_func)(const union pping_event *) = NULL;
|
||||
|
||||
static const struct option long_options[] = {
|
||||
{ "help", no_argument, NULL, 'h' },
|
||||
{ "interface", required_argument, NULL, 'i' }, // Name of interface to run on
|
||||
{ "rate-limit", required_argument, NULL, 'r' }, // Sampling rate-limit in ms
|
||||
{ "rtt-rate", required_argument, NULL, 'R' }, // Sampling rate in terms of flow-RTT (ex 1 sample per RTT-interval)
|
||||
{ "rtt-type", required_argument, NULL, 't' }, // What type of RTT the RTT-rate should be applied to ("min" or "smoothed"), only relevant if rtt-rate is provided
|
||||
{ "force", no_argument, NULL, 'f' }, // Overwrite any existing XDP program on interface, remove qdisc on cleanup
|
||||
{ "cleanup-interval", required_argument, NULL, 'c' }, // Map cleaning interval in s, 0 to disable
|
||||
{ "format", required_argument, NULL, 'F' }, // Which format to output in (standard/json/ppviz)
|
||||
{ "ingress-hook", required_argument, NULL, 'I' }, // Use tc or XDP as ingress hook
|
||||
{ "tcp", no_argument, NULL, 'T' }, // Calculate and report RTTs for TCP traffic (with TCP timestamps)
|
||||
{ "icmp", no_argument, NULL, 'C' }, // Calculate and report RTTs for ICMP echo-reply traffic
|
||||
{ "include-local", no_argument, NULL, 'l' }, // Also report "internal" RTTs
|
||||
{ 0, 0, NULL, 0 }
|
||||
};
|
||||
|
||||
@@ -165,12 +167,15 @@ static int parse_bounded_double(double *res, const char *str, double low,
|
||||
static int parse_arguments(int argc, char *argv[], struct pping_config *config)
|
||||
{
|
||||
int err, opt;
|
||||
double rate_limit_ms, cleanup_interval_s;
|
||||
double rate_limit_ms, cleanup_interval_s, rtt_rate;
|
||||
|
||||
config->ifindex = 0;
|
||||
config->bpf_config.localfilt = true;
|
||||
config->force = false;
|
||||
config->bpf_config.track_tcp = false;
|
||||
config->bpf_config.track_icmp = false;
|
||||
|
||||
while ((opt = getopt_long(argc, argv, "hfi:r:c:F:I:", long_options,
|
||||
while ((opt = getopt_long(argc, argv, "hflTCi:r:R:t:c:F:I:", long_options,
|
||||
NULL)) != -1) {
|
||||
switch (opt) {
|
||||
case 'i':
|
||||
@@ -199,6 +204,26 @@ static int parse_arguments(int argc, char *argv[], struct pping_config *config)
|
||||
config->bpf_config.rate_limit =
|
||||
rate_limit_ms * NS_PER_MS;
|
||||
break;
|
||||
case 'R':
|
||||
err = parse_bounded_double(&rtt_rate, optarg, 0, 10000,
|
||||
"rtt-rate");
|
||||
if (err)
|
||||
return -EINVAL;
|
||||
config->bpf_config.rtt_rate =
|
||||
DOUBLE_TO_FIXPOINT(rtt_rate);
|
||||
break;
|
||||
case 't':
|
||||
if (strcmp(optarg, "min") == 0) {
|
||||
config->bpf_config.use_srtt = false;
|
||||
}
|
||||
else if (strcmp(optarg, "smoothed") == 0) {
|
||||
config->bpf_config.use_srtt = true;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"rtt-type must be \"min\" or \"smoothed\"\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
break;
|
||||
case 'c':
|
||||
err = parse_bounded_double(&cleanup_interval_s, optarg,
|
||||
0, 7 * S_PER_DAY,
|
||||
@@ -231,10 +256,19 @@ static int parse_arguments(int argc, char *argv[], struct pping_config *config)
|
||||
return -EINVAL;
|
||||
}
|
||||
break;
|
||||
case 'l':
|
||||
config->bpf_config.localfilt = false;
|
||||
break;
|
||||
case 'f':
|
||||
config->force = true;
|
||||
config->xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
|
||||
break;
|
||||
case 'T':
|
||||
config->bpf_config.track_tcp = true;
|
||||
break;
|
||||
case 'C':
|
||||
config->bpf_config.track_icmp = true;
|
||||
break;
|
||||
case 'h':
|
||||
printf("HELP:\n");
|
||||
print_usage(argv);
|
||||
@@ -254,6 +288,23 @@ static int parse_arguments(int argc, char *argv[], struct pping_config *config)
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char *tracked_protocols_to_str(struct pping_config *config)
|
||||
{
|
||||
bool tcp = config->bpf_config.track_tcp;
|
||||
bool icmp = config->bpf_config.track_icmp;
|
||||
return tcp && icmp ? "TCP, ICMP" : tcp ? "TCP" : "ICMP";
|
||||
}
|
||||
|
||||
const char *output_format_to_str(enum PPING_OUTPUT_FORMAT format)
|
||||
{
|
||||
switch (format) {
|
||||
case PPING_OUTPUT_STANDARD: return "standard";
|
||||
case PPING_OUTPUT_JSON: return "json";
|
||||
case PPING_OUTPUT_PPVIZ: return "ppviz";
|
||||
default: return "unknown format";
|
||||
}
|
||||
}
|
||||
|
||||
void abort_program(int sig)
|
||||
{
|
||||
keep_running = 0;
|
||||
@@ -449,17 +500,18 @@ static bool packet_ts_timeout(void *key_ptr, void *val_ptr, __u64 now)
|
||||
static bool flow_timeout(void *key_ptr, void *val_ptr, __u64 now)
|
||||
{
|
||||
struct flow_event fe;
|
||||
__u64 ts = ((struct flow_state *)val_ptr)->last_timestamp;
|
||||
struct flow_state *f_state = val_ptr;
|
||||
|
||||
if (now > ts && now - ts > FLOW_LIFETIME) {
|
||||
if (print_event_func) {
|
||||
if (now > f_state->last_timestamp &&
|
||||
now - f_state->last_timestamp > FLOW_LIFETIME) {
|
||||
if (print_event_func && f_state->has_opened) {
|
||||
fe.event_type = EVENT_TYPE_FLOW;
|
||||
fe.timestamp = now;
|
||||
memcpy(&fe.flow, key_ptr, sizeof(struct network_tuple));
|
||||
fe.event_info.event = FLOW_EVENT_CLOSING;
|
||||
fe.event_info.reason = EVENT_REASON_FLOW_TIMEOUT;
|
||||
reverse_flow(&fe.flow, key_ptr);
|
||||
fe.flow_event_type = FLOW_EVENT_CLOSING;
|
||||
fe.reason = EVENT_REASON_FLOW_TIMEOUT;
|
||||
fe.source = EVENT_SOURCE_USERSPACE;
|
||||
print_event_func(NULL, 0, &fe, sizeof(fe));
|
||||
print_event_func((union pping_event *)&fe);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -608,6 +660,7 @@ static const char *flowevent_to_str(enum flow_event_type fe)
|
||||
case FLOW_EVENT_OPENING:
|
||||
return "opening";
|
||||
case FLOW_EVENT_CLOSING:
|
||||
case FLOW_EVENT_CLOSING_BOTH:
|
||||
return "closing";
|
||||
default:
|
||||
return "unknown";
|
||||
@@ -625,8 +678,6 @@ static const char *eventreason_to_str(enum flow_event_reason er)
|
||||
return "first observed packet";
|
||||
case EVENT_REASON_FIN:
|
||||
return "FIN";
|
||||
case EVENT_REASON_FIN_ACK:
|
||||
return "FIN-ACK";
|
||||
case EVENT_REASON_RST:
|
||||
return "RST";
|
||||
case EVENT_REASON_FLOW_TIMEOUT:
|
||||
@@ -639,9 +690,9 @@ static const char *eventreason_to_str(enum flow_event_reason er)
|
||||
static const char *eventsource_to_str(enum flow_event_source es)
|
||||
{
|
||||
switch (es) {
|
||||
case EVENT_SOURCE_EGRESS:
|
||||
case EVENT_SOURCE_PKT_SRC:
|
||||
return "src";
|
||||
case EVENT_SOURCE_INGRESS:
|
||||
case EVENT_SOURCE_PKT_DEST:
|
||||
return "dest";
|
||||
case EVENT_SOURCE_USERSPACE:
|
||||
return "userspace-cleanup";
|
||||
@@ -671,43 +722,42 @@ static void print_ns_datetime(FILE *stream, __u64 monotonic_ns)
|
||||
fprintf(stream, "%s.%09llu", timestr, ts % NS_PER_SECOND);
|
||||
}
|
||||
|
||||
static void print_event_standard(void *ctx, int cpu, void *data,
|
||||
__u32 data_size)
|
||||
static void print_event_standard(const union pping_event *e)
|
||||
{
|
||||
const union pping_event *e = data;
|
||||
|
||||
if (e->event_type == EVENT_TYPE_RTT) {
|
||||
print_ns_datetime(stdout, e->rtt_event.timestamp);
|
||||
printf(" %llu.%06llu ms %llu.%06llu ms ",
|
||||
printf(" %llu.%06llu ms %llu.%06llu ms %s ",
|
||||
e->rtt_event.rtt / NS_PER_MS,
|
||||
e->rtt_event.rtt % NS_PER_MS,
|
||||
e->rtt_event.min_rtt / NS_PER_MS,
|
||||
e->rtt_event.min_rtt % NS_PER_MS);
|
||||
e->rtt_event.min_rtt % NS_PER_MS,
|
||||
proto_to_str(e->rtt_event.flow.proto));
|
||||
print_flow_ppvizformat(stdout, &e->rtt_event.flow);
|
||||
printf("\n");
|
||||
} else if (e->event_type == EVENT_TYPE_FLOW) {
|
||||
print_ns_datetime(stdout, e->flow_event.timestamp);
|
||||
printf(" ");
|
||||
printf(" %s ", proto_to_str(e->rtt_event.flow.proto));
|
||||
print_flow_ppvizformat(stdout, &e->flow_event.flow);
|
||||
printf(" %s due to %s from %s\n",
|
||||
flowevent_to_str(e->flow_event.event_info.event),
|
||||
eventreason_to_str(e->flow_event.event_info.reason),
|
||||
flowevent_to_str(e->flow_event.flow_event_type),
|
||||
eventreason_to_str(e->flow_event.reason),
|
||||
eventsource_to_str(e->flow_event.source));
|
||||
}
|
||||
}
|
||||
|
||||
static void print_event_ppviz(void *ctx, int cpu, void *data, __u32 data_size)
|
||||
static void print_event_ppviz(const union pping_event *e)
|
||||
{
|
||||
const struct rtt_event *e = data;
|
||||
__u64 time = convert_monotonic_to_realtime(e->timestamp);
|
||||
|
||||
// ppviz format does not support flow events
|
||||
if (e->event_type != EVENT_TYPE_RTT)
|
||||
return;
|
||||
|
||||
const struct rtt_event *re = &e->rtt_event;
|
||||
__u64 time = convert_monotonic_to_realtime(re->timestamp);
|
||||
|
||||
printf("%llu.%09llu %llu.%09llu %llu.%09llu ", time / NS_PER_SECOND,
|
||||
time % NS_PER_SECOND, e->rtt / NS_PER_SECOND,
|
||||
e->rtt % NS_PER_SECOND, e->min_rtt / NS_PER_SECOND, e->min_rtt);
|
||||
print_flow_ppvizformat(stdout, &e->flow);
|
||||
time % NS_PER_SECOND, re->rtt / NS_PER_SECOND,
|
||||
re->rtt % NS_PER_SECOND, re->min_rtt / NS_PER_SECOND, re->min_rtt);
|
||||
print_flow_ppvizformat(stdout, &re->flow);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -739,22 +789,21 @@ static void print_rttevent_fields_json(json_writer_t *ctx,
|
||||
jsonw_u64_field(ctx, "sent_bytes", re->sent_bytes);
|
||||
jsonw_u64_field(ctx, "rec_packets", re->rec_pkts);
|
||||
jsonw_u64_field(ctx, "rec_bytes", re->rec_bytes);
|
||||
jsonw_bool_field(ctx, "match_on_egress", re->match_on_egress);
|
||||
}
|
||||
|
||||
static void print_flowevent_fields_json(json_writer_t *ctx,
|
||||
const struct flow_event *fe)
|
||||
{
|
||||
jsonw_string_field(ctx, "flow_event",
|
||||
flowevent_to_str(fe->event_info.event));
|
||||
flowevent_to_str(fe->flow_event_type));
|
||||
jsonw_string_field(ctx, "reason",
|
||||
eventreason_to_str(fe->event_info.reason));
|
||||
eventreason_to_str(fe->reason));
|
||||
jsonw_string_field(ctx, "triggered_by", eventsource_to_str(fe->source));
|
||||
}
|
||||
|
||||
static void print_event_json(void *ctx, int cpu, void *data, __u32 data_size)
|
||||
static void print_event_json(const union pping_event *e)
|
||||
{
|
||||
const union pping_event *e = data;
|
||||
|
||||
if (e->event_type != EVENT_TYPE_RTT && e->event_type != EVENT_TYPE_FLOW)
|
||||
return;
|
||||
|
||||
@@ -772,9 +821,39 @@ static void print_event_json(void *ctx, int cpu, void *data, __u32 data_size)
|
||||
jsonw_end_object(json_ctx);
|
||||
}
|
||||
|
||||
static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt)
|
||||
static void warn_map_full(const struct map_full_event *e)
|
||||
{
|
||||
fprintf(stderr, "Lost %llu RTT events on CPU %d\n", lost_cnt, cpu);
|
||||
print_ns_datetime(stderr, e->timestamp);
|
||||
fprintf(stderr, " Warning: Unable to create %s entry for flow ",
|
||||
e->map == PPING_MAP_FLOWSTATE ? "flow" : "timestamp");
|
||||
print_flow_ppvizformat(stderr, &e->flow);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
static void handle_event(void *ctx, int cpu, void *data, __u32 data_size)
|
||||
{
|
||||
const union pping_event *e = data;
|
||||
|
||||
if (data_size < sizeof(e->event_type))
|
||||
return;
|
||||
|
||||
switch (e->event_type) {
|
||||
case EVENT_TYPE_MAP_FULL:
|
||||
warn_map_full(&e->map_event);
|
||||
break;
|
||||
case EVENT_TYPE_RTT:
|
||||
case EVENT_TYPE_FLOW:
|
||||
print_event_func(e);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Warning: Unknown event type %llu\n",
|
||||
e->event_type);
|
||||
};
|
||||
}
|
||||
|
||||
static void handle_missed_events(void *ctx, int cpu, __u64 lost_cnt)
|
||||
{
|
||||
fprintf(stderr, "Lost %llu events on CPU %d\n", lost_cnt, cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -947,7 +1026,9 @@ int main(int argc, char *argv[])
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_opts, tc_egress_opts);
|
||||
|
||||
struct pping_config config = {
|
||||
.bpf_config = { .rate_limit = 100 * NS_PER_MS },
|
||||
.bpf_config = { .rate_limit = 100 * NS_PER_MS,
|
||||
.rtt_rate = 0,
|
||||
.use_srtt = false },
|
||||
.cleanup_interval = 1 * NS_PER_SECOND,
|
||||
.object_path = "pping_kern.o",
|
||||
.ingress_prog = "pping_xdp_ingress",
|
||||
@@ -983,6 +1064,14 @@ int main(int argc, char *argv[])
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
if (!config.bpf_config.track_tcp && !config.bpf_config.track_icmp)
|
||||
config.bpf_config.track_tcp = true;
|
||||
|
||||
if (config.bpf_config.track_icmp &&
|
||||
config.output_format == PPING_OUTPUT_PPVIZ)
|
||||
fprintf(stderr,
|
||||
"Warning: ppviz format mainly intended for TCP traffic, but may now include ICMP traffic as well\n");
|
||||
|
||||
switch (config.output_format) {
|
||||
case PPING_OUTPUT_STANDARD:
|
||||
print_event_func = print_event_standard;
|
||||
@@ -995,6 +1084,10 @@ int main(int argc, char *argv[])
|
||||
break;
|
||||
}
|
||||
|
||||
fprintf(stderr, "Starting ePPing in %s mode tracking %s on %s\n",
|
||||
output_format_to_str(config.output_format),
|
||||
tracked_protocols_to_str(&config), config.ifname);
|
||||
|
||||
err = load_attach_bpfprogs(&obj, &config);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
@@ -1013,8 +1106,8 @@ int main(int argc, char *argv[])
|
||||
// Set up perf buffer
|
||||
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj,
|
||||
config.event_map),
|
||||
PERF_BUFFER_PAGES, print_event_func,
|
||||
handle_missed_rtt_event, NULL, NULL);
|
||||
PERF_BUFFER_PAGES, handle_event,
|
||||
handle_missed_events, NULL, NULL);
|
||||
err = libbpf_get_error(pb);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed to open perf buffer %s: %s\n",
|
||||
|
@@ -6,14 +6,21 @@
|
||||
#include <linux/in6.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
typedef __u64 fixpoint64;
|
||||
#define FIXPOINT_SHIFT 16
|
||||
#define DOUBLE_TO_FIXPOINT(X) ((fixpoint64)((X) * (1UL << FIXPOINT_SHIFT)))
|
||||
#define FIXPOINT_TO_UINT(X) ((X) >> FIXPOINT_SHIFT)
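/* ex: DOUBLE_TO_FIXPOINT(1.5) == 98304 (1.5 * 2^16), FIXPOINT_TO_UINT(98304) == 1 */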
|
||||
|
||||
/* For the event_type members of rtt_event and flow_event */
|
||||
#define EVENT_TYPE_FLOW 1
|
||||
#define EVENT_TYPE_RTT 2
|
||||
#define EVENT_TYPE_MAP_FULL 3
|
||||
|
||||
enum __attribute__((__packed__)) flow_event_type {
|
||||
FLOW_EVENT_NONE,
|
||||
FLOW_EVENT_OPENING,
|
||||
FLOW_EVENT_CLOSING
|
||||
FLOW_EVENT_CLOSING,
|
||||
FLOW_EVENT_CLOSING_BOTH
|
||||
};
|
||||
|
||||
enum __attribute__((__packed__)) flow_event_reason {
|
||||
@@ -21,19 +28,29 @@ enum __attribute__((__packed__)) flow_event_reason {
|
||||
EVENT_REASON_SYN_ACK,
|
||||
EVENT_REASON_FIRST_OBS_PCKT,
|
||||
EVENT_REASON_FIN,
|
||||
EVENT_REASON_FIN_ACK,
|
||||
EVENT_REASON_RST,
|
||||
EVENT_REASON_FLOW_TIMEOUT
|
||||
};
|
||||
|
||||
enum __attribute__((__packed__)) flow_event_source {
|
||||
EVENT_SOURCE_EGRESS,
|
||||
EVENT_SOURCE_INGRESS,
|
||||
EVENT_SOURCE_PKT_SRC,
|
||||
EVENT_SOURCE_PKT_DEST,
|
||||
EVENT_SOURCE_USERSPACE
|
||||
};
|
||||
|
||||
enum __attribute__((__packed__)) pping_map {
|
||||
PPING_MAP_FLOWSTATE = 0,
|
||||
PPING_MAP_PACKETTS
|
||||
};
|
||||
|
||||
struct bpf_config {
|
||||
__u64 rate_limit;
|
||||
fixpoint64 rtt_rate;
|
||||
bool use_srtt;
|
||||
bool track_tcp;
|
||||
bool track_icmp;
|
||||
bool localfilt;
|
||||
__u32 reserved;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -64,13 +81,16 @@ struct network_tuple {
|
||||
|
||||
struct flow_state {
|
||||
__u64 min_rtt;
|
||||
__u64 srtt;
|
||||
__u64 last_timestamp;
|
||||
__u64 sent_pkts;
|
||||
__u64 sent_bytes;
|
||||
__u64 rec_pkts;
|
||||
__u64 rec_bytes;
|
||||
__u32 last_id;
|
||||
__u32 reserved;
|
||||
bool has_opened;
|
||||
enum flow_event_reason opening_reason;
|
||||
__u16 reserved;
|
||||
};
|
||||
|
||||
struct packet_id {
|
||||
@@ -79,10 +99,15 @@ struct packet_id {
|
||||
};
|
||||
|
||||
/*
|
||||
* An RTT event message that can be passed from the bpf-programs to user-space.
|
||||
* Events that can be passed from the BPF-programs to the user space
|
||||
* application.
|
||||
* The initial event_type member is used to allow multiplexing between
* different event types in a single perf buffer. Members up to and including
|
||||
* flow are identical to other event types.
|
||||
* flow are identical for all event types.
|
||||
*/
|
||||
|
||||
/*
|
||||
* An RTT event message passed when an RTT has been calculated
|
||||
* Uses explicit padding instead of packing based on recommendations in cilium's
|
||||
* BPF reference documentation at https://docs.cilium.io/en/stable/bpf/#llvm.
|
||||
*/
|
||||
@@ -97,33 +122,55 @@ struct rtt_event {
|
||||
__u64 sent_bytes;
|
||||
__u64 rec_pkts;
|
||||
__u64 rec_bytes;
|
||||
__u32 reserved;
|
||||
};
|
||||
|
||||
struct flow_event_info {
|
||||
enum flow_event_type event;
|
||||
enum flow_event_reason reason;
|
||||
bool match_on_egress;
|
||||
__u8 reserved[7];
|
||||
};
|
||||
|
||||
/*
|
||||
* A flow event message that can be passed from the bpf-programs to user-space.
|
||||
* The initial event_type memeber is used to allow multiplexing between
|
||||
* different event types in a single perf buffer. Memebers up to and including
|
||||
* flow are identical to other event types.
|
||||
* A flow event message passed when a flow has changed state (opened/closed)
|
||||
*/
|
||||
struct flow_event {
|
||||
__u64 event_type;
|
||||
__u64 timestamp;
|
||||
struct network_tuple flow;
|
||||
struct flow_event_info event_info;
|
||||
enum flow_event_type flow_event_type;
|
||||
enum flow_event_reason reason;
|
||||
enum flow_event_source source;
|
||||
__u8 reserved;
|
||||
};
|
||||
|
||||
/*
|
||||
* An event indicating that a new entry could not be created in the map due to the
|
||||
* map being full.
|
||||
*/
|
||||
struct map_full_event {
|
||||
__u64 event_type;
|
||||
__u64 timestamp;
|
||||
struct network_tuple flow;
|
||||
enum pping_map map;
|
||||
__u8 reserved[3];
|
||||
};
|
||||
|
||||
union pping_event {
|
||||
__u64 event_type;
|
||||
struct rtt_event rtt_event;
|
||||
struct flow_event flow_event;
|
||||
struct map_full_event map_event;
|
||||
};
|
||||
|
||||
/*
|
||||
* Convenience function for getting the corresponding reverse flow.
|
||||
* PPing needs to keep track of flows in both directions, and sometimes
|
||||
* also needs to reverse the flow to report the "correct" (consistent
|
||||
* with Kathie's PPing) src and dest address.
|
||||
*/
|
||||
static void reverse_flow(struct network_tuple *dest, struct network_tuple *src)
|
||||
{
|
||||
dest->ipv = src->ipv;
|
||||
dest->proto = src->proto;
|
||||
dest->saddr = src->daddr;
|
||||
dest->daddr = src->saddr;
|
||||
dest->reserved = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -8,6 +8,8 @@
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/icmpv6.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
|
||||
@@ -23,6 +25,12 @@
|
||||
#define AF_INET6 10
|
||||
#define MAX_TCP_OPTIONS 10
|
||||
|
||||
// Mask for IPv6 flowlabel + traffic class - used in fib lookup
|
||||
#define IPV6_FLOWINFO_MASK __cpu_to_be32(0x0FFFFFFF)
|
||||
|
||||
// Emit a warning max once per second when failing to add entry to map
|
||||
#define WARN_MAP_FULL_INTERVAL 1000000000UL
|
||||
|
||||
/*
|
||||
* This struct keeps track of the data and data_end pointers from the xdp_md or
|
||||
* __skb_buff contexts, as well as a currently parsed to position kept in nh.
|
||||
@@ -31,16 +39,50 @@
|
||||
* header encloses.
|
||||
*/
|
||||
struct parsing_context {
|
||||
void *data; //Start of eth hdr
|
||||
void *data_end; //End of safe acessible area
|
||||
struct hdr_cursor nh; //Position to parse next
|
||||
__u32 pkt_len; //Full packet length (headers+data)
|
||||
bool is_egress; //Is packet on egress or ingress?
|
||||
void *data; // Start of eth hdr
|
||||
void *data_end; // End of safe accessible area
|
||||
struct hdr_cursor nh; // Position to parse next
|
||||
__u32 pkt_len; // Full packet length (headers+data)
|
||||
__u32 ingress_ifindex; // Interface packet arrived on
|
||||
bool is_egress; // Is packet on egress or ingress?
|
||||
};
|
||||
|
||||
/*
|
||||
* Struct filled in by parse_packet_id.
|
||||
*
|
||||
* Note: As long as parse_packet_id is successful, the flow-parts of pid
|
||||
* and reply_pid should be valid, regardless of value for pid_valid and
|
||||
* reply_pid valid. The *pid_valid members are there to indicate that the
|
||||
* identifier part of *pid are valid and can be used for timestamping/lookup.
|
||||
* The reason for not keeping the flow parts as an entirely separate members
|
||||
* is to save some performance by avoiding a copy for lookup/insertion
|
||||
* in the packet_ts map.
|
||||
*/
|
||||
struct packet_info {
|
||||
union {
|
||||
struct iphdr *iph;
|
||||
struct ipv6hdr *ip6h;
|
||||
};
|
||||
union {
|
||||
struct icmphdr *icmph;
|
||||
struct icmp6hdr *icmp6h;
|
||||
struct tcphdr *tcph;
|
||||
};
|
||||
__u64 time; // Arrival time of packet
|
||||
__u32 payload; // Size of packet data (excluding headers)
|
||||
struct packet_id pid; // identifier to timestamp (ex. TSval)
|
||||
struct packet_id reply_pid; // identifier to match against (ex. TSecr)
|
||||
bool pid_valid; // identifier can be used to timestamp packet
|
||||
bool reply_pid_valid; // reply_identifier can be used to match packet
|
||||
enum flow_event_type event_type; // flow event triggered by packet
|
||||
enum flow_event_reason event_reason; // reason for triggering flow event
|
||||
};
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
// Global config struct - set from userspace
|
||||
static volatile const struct bpf_config config = {};
|
||||
static volatile __u64 last_warn_time[2] = { 0 };
|
||||
|
||||
|
||||
// Map definitions
|
||||
struct {
|
||||
@@ -68,13 +110,24 @@ struct {
|
||||
/*
|
||||
* Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
|
||||
*/
|
||||
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
|
||||
static void map_ipv4_to_ipv6(struct in6_addr *ipv6, __be32 ipv4)
|
||||
{
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
|
||||
ipv6->in6_u.u6_addr32[3] = ipv4;
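/* ex: the IPv4 address 10.11.1.1 becomes the IPv4-mapped IPv6 address ::ffff:10.11.1.1 */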
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the number of unparsed bytes left in the packet (bytes after nh.pos)
|
||||
*/
|
||||
static __u32 remaining_pkt_payload(struct parsing_context *ctx)
|
||||
{
|
||||
// pkt_len - (pos - data) fails because compiler transforms it to pkt_len - pos + data (pkt_len - pos not ok because value - pointer)
|
||||
// data + pkt_len - pos fails on (data+pkt_len) - pos due to math between pkt_pointer and unbounded register
|
||||
__u32 parsed_bytes = ctx->nh.pos - ctx->data;
|
||||
return parsed_bytes < ctx->pkt_len ? ctx->pkt_len - parsed_bytes : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parses the TSval and TSecr values from the TCP options field. If successful
|
||||
* the TSval and TSecr values will be stored at tsval and tsecr (in network
|
||||
@@ -132,202 +185,452 @@ static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
|
||||
/*
|
||||
* Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
|
||||
* option.
|
||||
* If successful, identifier will be set to TSval if is_ingress, or TSecr
|
||||
* otherwise, the port-members of saddr and daddr will be set to the TCP source
|
||||
* and dest, respectively, fei will be filled appropriately (based on
|
||||
* SYN/FIN/RST) and 0 will be returned.
|
||||
* On failure, -1 will be returned.
|
||||
*
|
||||
* Will use the TSval as pid and TSecr as reply_pid, and the TCP source and dest
|
||||
* as port numbers.
|
||||
*
|
||||
* If successful, the pid (identifier + flow.port), reply_pid, pid_valid,
|
||||
* reply_pid_valid, event_type and event_reason members of p_info will be set
|
||||
* appropriately and 0 will be returned.
|
||||
* On failure -1 will be returned (no guarantees on values set in p_info).
|
||||
*/
|
||||
static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
|
||||
__be16 *dport, struct flow_event_info *fei,
|
||||
__u32 *identifier)
|
||||
static int parse_tcp_identifier(struct parsing_context *pctx,
|
||||
struct packet_info *p_info)
|
||||
{
|
||||
__u32 tsval, tsecr;
|
||||
struct tcphdr *tcph;
|
||||
|
||||
if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
|
||||
if (parse_tcphdr(&pctx->nh, pctx->data_end, &p_info->tcph) < 0)
|
||||
return -1;
|
||||
|
||||
// Do not timestamp pure ACKs
|
||||
if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
|
||||
!tcph->syn)
|
||||
return -1;
|
||||
|
||||
// Check if connection is opening/closing
|
||||
if (tcph->syn) {
|
||||
fei->event = FLOW_EVENT_OPENING;
|
||||
fei->reason =
|
||||
tcph->ack ? EVENT_REASON_SYN_ACK : EVENT_REASON_SYN;
|
||||
} else if (tcph->rst) {
|
||||
fei->event = FLOW_EVENT_CLOSING;
|
||||
fei->reason = EVENT_REASON_RST;
|
||||
} else if (!ctx->is_egress && tcph->fin) {
|
||||
fei->event = FLOW_EVENT_CLOSING;
|
||||
fei->reason =
|
||||
tcph->ack ? EVENT_REASON_FIN_ACK : EVENT_REASON_FIN;
|
||||
} else {
|
||||
fei->event = FLOW_EVENT_NONE;
|
||||
}
|
||||
|
||||
if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
|
||||
if (parse_tcp_ts(p_info->tcph, pctx->data_end, &p_info->pid.identifier,
|
||||
&p_info->reply_pid.identifier) < 0)
|
||||
return -1; //Possible TODO, fall back on seq/ack instead
|
||||
|
||||
*sport = tcph->source;
|
||||
*dport = tcph->dest;
|
||||
*identifier = ctx->is_egress ? tsval : tsecr;
|
||||
p_info->pid.flow.saddr.port = p_info->tcph->source;
|
||||
p_info->pid.flow.daddr.port = p_info->tcph->dest;
|
||||
|
||||
// Do not timestamp pure ACKs (no payload)
|
||||
p_info->pid_valid =
|
||||
pctx->nh.pos - pctx->data < pctx->pkt_len || p_info->tcph->syn;
|
||||
|
||||
// Do not match on non-ACKs (TSecr not valid)
|
||||
p_info->reply_pid_valid = p_info->tcph->ack;
|
||||
|
||||
// Check if connection is opening/closing
|
||||
if (p_info->tcph->rst) {
|
||||
p_info->event_type = FLOW_EVENT_CLOSING_BOTH;
|
||||
p_info->event_reason = EVENT_REASON_RST;
|
||||
} else if (p_info->tcph->fin) {
|
||||
p_info->event_type = FLOW_EVENT_CLOSING;
|
||||
p_info->event_reason = EVENT_REASON_FIN;
|
||||
} else if (p_info->tcph->syn) {
|
||||
p_info->event_type = FLOW_EVENT_OPENING;
|
||||
p_info->event_reason = p_info->tcph->ack ?
|
||||
EVENT_REASON_SYN_ACK :
|
||||
EVENT_REASON_SYN;
|
||||
} else {
|
||||
p_info->event_type = FLOW_EVENT_NONE;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
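
To see how the TSval/TSecr pairing above turns into an RTT sample, here is a
minimal standalone userspace sketch of the same matching logic (the names and
the single-slot "map" are made up for illustration; pping itself keys a BPF
hash map on the full flow tuple plus identifier):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical single-slot stand-in for the packet_ts map */
static uint32_t stored_tsval;
static uint64_t stored_time_ns;

static void on_egress(uint32_t tsval, uint64_t now_ns)
{
        /* Only the first packet carrying a new TSval gets timestamped */
        if (tsval == stored_tsval)
                return;
        stored_tsval = tsval;
        stored_time_ns = now_ns;
}

static void on_ingress(uint32_t tsecr, uint64_t now_ns)
{
        /* The receiver echoes the TSval back as TSecr - a match gives an RTT */
        if (tsecr == stored_tsval)
                printf("RTT: %llu ns\n",
                       (unsigned long long)(now_ns - stored_time_ns));
}

int main(void)
{
        on_egress(1000, 50000000);  /* outgoing packet with TSval 1000 */
        on_ingress(1000, 75000000); /* reply with TSecr 1000 -> RTT = 25 ms */
        return 0;
}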

/*
 * Attempts to parse the packet limited by the data and data_end pointers,
 * to retrieve a protocol dependent packet identifier. If successful, the
 * pointed to p_id and fei will be filled with parsed information from the
 * packet, and 0 will be returned. On failure, -1 will be returned.
 * If is_egress saddr and daddr will match source and destination of packet,
 * respectively, and identifier will be set to the identifier for an outgoing
 * packet. Otherwise, saddr and daddr will be swapped (will match
 * destination and source of packet, respectively), and identifier will be
 * set to the identifier of a response.
 * Attempts to fetch an identifier for an ICMPv6 header, based on the echo
 * request/reply sequence number.
 *
 * Will use the echo sequence number as pid/reply_pid and the echo identifier
 * as port numbers. Echo requests will only generate a valid pid and echo
 * replies will only generate a valid reply_pid.
 *
 * If successful, the pid (identifier + flow.port), reply_pid, pid_valid,
 * reply_pid_valid and event_type of p_info will be set appropriately and 0
 * will be returned.
 * On failure, -1 will be returned (no guarantees on p_info members).
 *
 * Note: Will store the 16-bit sequence number in network byte order
 * in the 32-bit (reply_)pid.identifier.
 */
static int parse_packet_identifier(struct parsing_context *ctx,
                                   struct packet_id *p_id,
                                   struct flow_event_info *fei)
static int parse_icmp6_identifier(struct parsing_context *pctx,
                                  struct packet_info *p_info)
{
        if (parse_icmp6hdr(&pctx->nh, pctx->data_end, &p_info->icmp6h) < 0)
                return -1;

        if (p_info->icmp6h->icmp6_code != 0)
                return -1;

        if (p_info->icmp6h->icmp6_type == ICMPV6_ECHO_REQUEST) {
                p_info->pid.identifier = p_info->icmp6h->icmp6_sequence;
                p_info->pid_valid = true;
                p_info->reply_pid_valid = false;
        } else if (p_info->icmp6h->icmp6_type == ICMPV6_ECHO_REPLY) {
                p_info->reply_pid.identifier = p_info->icmp6h->icmp6_sequence;
                p_info->reply_pid_valid = true;
                p_info->pid_valid = false;
        } else {
                return -1;
        }

        p_info->event_type = FLOW_EVENT_NONE;
        p_info->pid.flow.saddr.port = p_info->icmp6h->icmp6_identifier;
        p_info->pid.flow.daddr.port = p_info->pid.flow.saddr.port;
        return 0;
}

/*
 * Same as parse_icmp6_identifier, but for an ICMP(v4) header instead.
 */
static int parse_icmp_identifier(struct parsing_context *pctx,
                                 struct packet_info *p_info)
{
        if (parse_icmphdr(&pctx->nh, pctx->data_end, &p_info->icmph) < 0)
                return -1;

        if (p_info->icmph->code != 0)
                return -1;

        if (p_info->icmph->type == ICMP_ECHO) {
                p_info->pid.identifier = p_info->icmph->un.echo.sequence;
                p_info->pid_valid = true;
                p_info->reply_pid_valid = false;
        } else if (p_info->icmph->type == ICMP_ECHOREPLY) {
                p_info->reply_pid.identifier = p_info->icmph->un.echo.sequence;
                p_info->reply_pid_valid = true;
                p_info->pid_valid = false;
        } else {
                return -1;
        }

        p_info->event_type = FLOW_EVENT_NONE;
        p_info->pid.flow.saddr.port = p_info->icmph->un.echo.id;
        p_info->pid.flow.daddr.port = p_info->pid.flow.saddr.port;
        return 0;
}
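
Since ICMP echo has no port numbers, the two functions above reuse the echo
identifier as both "port" members of the flow tuple and the echo sequence
number as the packet identifier (kept in network byte order, as noted in the
comment). A simplified stand-in for that layout, purely for illustration
(these are not the pping struct definitions):

#include <stdint.h>
#include <stdio.h>

struct mini_flow { uint16_t sport, dport; };
struct mini_pid { struct mini_flow flow; uint32_t identifier; };

int main(void)
{
        /* ping with echo identifier 0x1234 and sequence number 7 */
        struct mini_pid pid = {
                .flow = { .sport = 0x1234, .dport = 0x1234 }, /* echo id as both "ports" */
                .identifier = 7, /* echo sequence (big-endian in the real maps) */
        };

        printf("flow ports %#x/%#x, identifier %u\n",
               pid.flow.sport, pid.flow.dport, pid.identifier);
        return 0;
}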

/*
 * Attempts to parse the packet defined by pctx for a valid packet identifier
 * and reply identifier, filling in p_info.
 *
 * If successful, all members of p_info will be set appropriately and 0 will
 * be returned.
 * On failure -1 will be returned (no guarantees on p_info members).
 */
static int parse_packet_identifier(struct parsing_context *pctx,
                                   struct packet_info *p_info)
{
        int proto, err;
        struct ethhdr *eth;
        struct iphdr *iph;
        struct ipv6hdr *ip6h;
        struct flow_address *saddr, *daddr;

        // Switch saddr <--> daddr on ingress to match egress
        if (ctx->is_egress) {
                saddr = &p_id->flow.saddr;
                daddr = &p_id->flow.daddr;
        } else {
                saddr = &p_id->flow.daddr;
                daddr = &p_id->flow.saddr;
        }

        proto = parse_ethhdr(&ctx->nh, ctx->data_end, &eth);
        p_info->time = bpf_ktime_get_ns();
        proto = parse_ethhdr(&pctx->nh, pctx->data_end, &eth);

        // Parse IPv4/6 header
        if (proto == bpf_htons(ETH_P_IP)) {
                p_id->flow.ipv = AF_INET;
                p_id->flow.proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
                p_info->pid.flow.ipv = AF_INET;
                p_info->pid.flow.proto =
                        parse_iphdr(&pctx->nh, pctx->data_end, &p_info->iph);
        } else if (proto == bpf_htons(ETH_P_IPV6)) {
                p_id->flow.ipv = AF_INET6;
                p_id->flow.proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
                p_info->pid.flow.ipv = AF_INET6;
                p_info->pid.flow.proto =
                        parse_ip6hdr(&pctx->nh, pctx->data_end, &p_info->ip6h);
        } else {
                return -1;
        }

        // Add new protocols here
        if (p_id->flow.proto == IPPROTO_TCP) {
                err = parse_tcp_identifier(ctx, &saddr->port, &daddr->port,
                                           fei, &p_id->identifier);
                if (err)
                        return -1;
        } else {
                return -1;
        }
        // Parse identifier from suitable protocol
        if (config.track_tcp && p_info->pid.flow.proto == IPPROTO_TCP)
                err = parse_tcp_identifier(pctx, p_info);
        else if (config.track_icmp &&
                 p_info->pid.flow.proto == IPPROTO_ICMPV6 &&
                 p_info->pid.flow.ipv == AF_INET6)
                err = parse_icmp6_identifier(pctx, p_info);
        else if (config.track_icmp && p_info->pid.flow.proto == IPPROTO_ICMP &&
                 p_info->pid.flow.ipv == AF_INET)
                err = parse_icmp_identifier(pctx, p_info);
        else
                return -1; // No matching protocol
        if (err)
                return -1; // Failed parsing protocol

        // Successfully parsed packet identifier - fill in IP-addresses and return
        if (p_id->flow.ipv == AF_INET) {
                map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
                map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
        if (p_info->pid.flow.ipv == AF_INET) {
                map_ipv4_to_ipv6(&p_info->pid.flow.saddr.ip,
                                 p_info->iph->saddr);
                map_ipv4_to_ipv6(&p_info->pid.flow.daddr.ip,
                                 p_info->iph->daddr);
        } else { // IPv6
                saddr->ip = ip6h->saddr;
                daddr->ip = ip6h->daddr;
                p_info->pid.flow.saddr.ip = p_info->ip6h->saddr;
                p_info->pid.flow.daddr.ip = p_info->ip6h->daddr;
        }

        reverse_flow(&p_info->reply_pid.flow, &p_info->pid.flow);
        p_info->payload = remaining_pkt_payload(pctx);

        return 0;
}
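
parse_packet_identifier() stores IPv4 addresses in the same fields as IPv6
ones by converting them to IPv4-mapped IPv6 form (::ffff:a.b.c.d), which is
what map_ipv4_to_ipv6() is used for. A standalone sketch of such a conversion
(an illustration of the address format, not the pping helper itself):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Build an IPv4-mapped IPv6 address (::ffff:a.b.c.d) from an IPv4 address */
static void ipv4_to_mapped_ipv6(struct in6_addr *dst, uint32_t ipv4_be)
{
        memset(dst, 0, sizeof(*dst));
        dst->s6_addr[10] = 0xff;
        dst->s6_addr[11] = 0xff;
        memcpy(&dst->s6_addr[12], &ipv4_be, sizeof(ipv4_be));
}

int main(void)
{
        struct in6_addr v6;
        char buf[INET6_ADDRSTRLEN];

        ipv4_to_mapped_ipv6(&v6, inet_addr("10.0.0.1"));
        printf("%s\n", inet_ntop(AF_INET6, &v6, buf, sizeof(buf)));
        /* prints ::ffff:10.0.0.1 */
        return 0;
}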

/*
 * Returns the number of unparsed bytes left in the packet (bytes after nh.pos)
 * Calculate a smoothed rtt similar to how TCP stack does it in
 * net/ipv4/tcp_input.c/tcp_rtt_estimator().
 *
 * NOTE: Will cause roundoff errors, but if RTTs > 1000ns errors should be small
 */
static __u32 remaining_pkt_payload(struct parsing_context *ctx)
static __u64 calculate_srtt(__u64 prev_srtt, __u64 rtt)
{
        // pkt_len - (pos - data) fails because compiler transforms it to pkt_len - pos + data (pkt_len - pos not ok because value - pointer)
        // data + pkt_len - pos fails on (data+pkt_len) - pos due to math between pkt_pointer and unbounded register
        __u32 parsed_bytes = ctx->nh.pos - ctx->data;
        return parsed_bytes < ctx->pkt_len ? ctx->pkt_len - parsed_bytes : 0;
        if (!prev_srtt)
                return rtt;
        // srtt = 7/8*prev_srtt + 1/8*rtt
        return prev_srtt - (prev_srtt >> 3) + (rtt >> 3);
}
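
calculate_srtt() keeps 7/8 of the previous estimate and mixes in 1/8 of the
new sample, mirroring the kernel's tcp_rtt_estimator(). A quick standalone
check of the arithmetic (the function body is simply copied into userspace):

#include <assert.h>
#include <stdint.h>

static uint64_t srtt_update(uint64_t prev_srtt, uint64_t rtt)
{
        if (!prev_srtt)
                return rtt;
        return prev_srtt - (prev_srtt >> 3) + (rtt >> 3);
}

int main(void)
{
        assert(srtt_update(0, 40000) == 40000);     /* first sample used as-is */
        assert(srtt_update(80000, 40000) == 75000); /* 7/8*80000 + 1/8*40000 */
        assert(srtt_update(80000, 80000) == 80000); /* steady state */
        return 0;
}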

static bool is_rate_limited(__u64 now, __u64 last_ts, __u64 rtt)
{
        if (now < last_ts)
                return true;

        // RTT-based rate limit
        if (config.rtt_rate && rtt)
                return now - last_ts < FIXPOINT_TO_UINT(config.rtt_rate * rtt);

        // Static rate limit
        return now - last_ts < config.rate_limit;
}
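
is_rate_limited() has two modes: an RTT-scaled limit when config.rtt_rate is
set (the fixed-point factor is converted with FIXPOINT_TO_UINT) and a static
limit otherwise. The numbers below are assumed purely to illustrate the
effect; they are not pping defaults:

/*
 * Assume the flow's rtt argument (min_rtt or srtt, depending on
 * config.use_srtt) is 50 ms and rtt_rate works out to a factor of 2:
 *   -> the flow gets at most one new timestamp per 100 ms.
 * With rtt_rate == 0, or before any RTT sample exists, the static limit
 * applies instead, e.g. rate_limit = 100 ms -> one timestamp per 100 ms.
 */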

/*
 * Fills in event_type, timestamp, flow, source and reserved.
 * Does not fill in the flow_info.
 * Send a flow opening event through the perf-buffer.
 * As these events are only sent upon receiving a reply, need to access state
 * of the reverse flow to get reason flow was opened and when the original
 * packet opening the flow was sent.
 */
static void fill_flow_event(struct flow_event *fe, __u64 timestamp,
                            struct network_tuple *flow,
                            enum flow_event_source source)
static void send_flow_open_event(void *ctx, struct packet_info *p_info,
                                 struct flow_state *rev_flow)
{
        fe->event_type = EVENT_TYPE_FLOW;
        fe->timestamp = timestamp;
        __builtin_memcpy(&fe->flow, flow, sizeof(struct network_tuple));
        fe->source = source;
        fe->reserved = 0; // Make sure it's initialized
        struct flow_event fe = {
                .event_type = EVENT_TYPE_FLOW,
                .flow_event_type = FLOW_EVENT_OPENING,
                .source = EVENT_SOURCE_PKT_DEST,
                .flow = p_info->pid.flow,
                .reason = rev_flow->opening_reason,
                .timestamp = rev_flow->last_timestamp,
                .reserved = 0,
        };

        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe, sizeof(fe));
}

/*
 * Main function for handling the pping egress path.
 * Parses the packet for an identifier and attempts to store a timestamp for it
 * in the packet_ts map.
 * Sends a flow-event message based on p_info.
 *
 * The rev_flow argument is used to inform if the message is for the flow
 * in the current direction or the reverse flow, and will adapt the flow and
 * source members accordingly.
 */
static void pping_egress(void *ctx, struct parsing_context *pctx)
static void send_flow_event(void *ctx, struct packet_info *p_info,
                            bool rev_flow)
{
        struct packet_id p_id = { 0 };
        struct flow_event fe;
        struct flow_state *f_state;
        struct flow_state new_state = { 0 };
        __u64 now;
        struct flow_event fe = {
                .event_type = EVENT_TYPE_FLOW,
                .flow_event_type = p_info->event_type,
                .reason = p_info->event_reason,
                .timestamp = p_info->time,
                .reserved = 0, // Make sure it's initialized
        };

        if (parse_packet_identifier(pctx, &p_id, &fe.event_info) < 0)
        if (rev_flow) {
                fe.flow = p_info->pid.flow;
                fe.source = EVENT_SOURCE_PKT_SRC;
        } else {
                fe.flow = p_info->reply_pid.flow;
                fe.source = EVENT_SOURCE_PKT_DEST;
        }

        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe, sizeof(fe));
}

/*
 * Send a map-full event for the map.
 * Will only trigger once every WARN_MAP_FULL_INTERVAL
 */
static void send_map_full_event(void *ctx, struct packet_info *p_info,
                                enum pping_map map)
{
        struct map_full_event me;

        if (p_info->time < last_warn_time[map] ||
            p_info->time - last_warn_time[map] < WARN_MAP_FULL_INTERVAL)
                return;

        now = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
        f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
        last_warn_time[map] = p_info->time;

        __builtin_memset(&me, 0, sizeof(me));
        me.event_type = EVENT_TYPE_MAP_FULL;
        me.timestamp = p_info->time;
        me.flow = p_info->pid.flow;
        me.map = map;

        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &me, sizeof(me));
}

/*
 * Attempt to create a new flow-state.
 * Returns a pointer to the flow_state if successful, NULL otherwise
 */
static struct flow_state *create_flow(void *ctx, struct packet_info *p_info)
{
        struct flow_state new_state = { 0 };

        new_state.last_timestamp = p_info->time;
        new_state.opening_reason = p_info->event_type == FLOW_EVENT_OPENING ?
                                           p_info->event_reason :
                                           EVENT_REASON_FIRST_OBS_PCKT;

        if (bpf_map_update_elem(&flow_state, &p_info->pid.flow, &new_state,
                                BPF_NOEXIST) != 0) {
                send_map_full_event(ctx, p_info, PPING_MAP_FLOWSTATE);
                return NULL;
        }

        return bpf_map_lookup_elem(&flow_state, &p_info->pid.flow);
}

static struct flow_state *update_flow(void *ctx, struct packet_info *p_info,
                                      bool *new_flow)
{
        struct flow_state *f_state;
        *new_flow = false;

        f_state = bpf_map_lookup_elem(&flow_state, &p_info->pid.flow);

        // Attempt to create flow if it does not exist
        if (!f_state && p_info->pid_valid &&
            !(p_info->event_type == FLOW_EVENT_CLOSING ||
              p_info->event_type == FLOW_EVENT_CLOSING_BOTH)) {
                *new_flow = true;
                f_state = create_flow(ctx, p_info);
        }

        if (!f_state)
                return NULL;

        // Update flow state
        f_state->sent_pkts++;
        f_state->sent_bytes += p_info->payload;

        return f_state;
}

static struct flow_state *update_rev_flow(void *ctx, struct packet_info *p_info)
{
        struct flow_state *f_state;

        f_state = bpf_map_lookup_elem(&flow_state, &p_info->reply_pid.flow);
        if (!f_state)
                return NULL;

        // Is a new flow, push opening flow message
        if (!f_state->has_opened &&
            p_info->event_type != FLOW_EVENT_CLOSING_BOTH) {
                f_state->has_opened = true;
                send_flow_open_event(ctx, p_info, f_state);
        }

        // Update flow state
        f_state->rec_pkts++;
        f_state->rec_bytes += p_info->payload;

        return f_state;
}

static void delete_closed_flows(void *ctx, struct packet_info *p_info,
                                struct flow_state *flow,
                                struct flow_state *rev_flow)
{
        bool has_opened;

        // Flow closing - try to delete flow state and push closing-event
        if (fe.event_info.event == FLOW_EVENT_CLOSING) {
                if (!f_state) {
                        bpf_map_delete_elem(&flow_state, &p_id.flow);
                        fill_flow_event(&fe, now, &p_id.flow,
                                        EVENT_SOURCE_EGRESS);
                        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
                                              &fe, sizeof(fe));
                }
        if (flow && (p_info->event_type == FLOW_EVENT_CLOSING ||
                     p_info->event_type == FLOW_EVENT_CLOSING_BOTH)) {
                has_opened = flow->has_opened;
                if (!bpf_map_delete_elem(&flow_state, &p_info->pid.flow) &&
                    has_opened)
                        send_flow_event(ctx, p_info, false);
        }

        // Also close reverse flow
        if (rev_flow && p_info->event_type == FLOW_EVENT_CLOSING_BOTH) {
                has_opened = rev_flow->has_opened;
                if (!bpf_map_delete_elem(&flow_state,
                                         &p_info->reply_pid.flow) &&
                    has_opened)
                        send_flow_event(ctx, p_info, true);
        }
}

/*
 * Return true if p_info->pid.flow.daddr is a "local" address.
 *
 * Works by performing a fib lookup for p_info->pid.flow.
 * Lookup struct filled based on examples from
 * samples/bpf/xdp_fwd_kern.c/xdp_fwd_flags() and
 * tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c
 */
static bool is_local_address(struct packet_info *p_info, void *ctx,
                             struct parsing_context *pctx)
{
        int ret;
        struct bpf_fib_lookup lookup;
        __builtin_memset(&lookup, 0, sizeof(lookup));

        lookup.ifindex = pctx->ingress_ifindex;
        lookup.family = p_info->pid.flow.ipv;

        if (lookup.family == AF_INET) {
                lookup.tos = p_info->iph->tos;
                lookup.tot_len = bpf_ntohs(p_info->iph->tot_len);
                lookup.ipv4_src = p_info->iph->saddr;
                lookup.ipv4_dst = p_info->iph->daddr;
        } else if (lookup.family == AF_INET6) {
                struct in6_addr *src = (struct in6_addr *)lookup.ipv6_src;
                struct in6_addr *dst = (struct in6_addr *)lookup.ipv6_dst;

                lookup.flowinfo = *(__be32 *)p_info->ip6h & IPV6_FLOWINFO_MASK;
                lookup.tot_len = bpf_ntohs(p_info->ip6h->payload_len);
                *src = p_info->pid.flow.saddr.ip; // verifier did not like ip6h->saddr
                *dst = p_info->pid.flow.daddr.ip;
        }

        lookup.l4_protocol = p_info->pid.flow.proto;
        lookup.sport = 0;
        lookup.dport = 0;

        ret = bpf_fib_lookup(ctx, &lookup, sizeof(lookup), 0);

        return ret == BPF_FIB_LKUP_RET_NOT_FWDED ||
               ret == BPF_FIB_LKUP_RET_FWD_DISABLED;
}
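
For reference, the two return codes accepted above are the ones indicating
that the packet would not be forwarded:

/*
 * BPF_FIB_LKUP_RET_NOT_FWDED    - lookup succeeded, but the packet is not
 *                                 forwarded (typically a local destination)
 * BPF_FIB_LKUP_RET_FWD_DISABLED - forwarding is disabled on the interface
 * Either case is treated as a local address, which pping_timestamp_packet()
 * uses (together with config.localfilt) to skip timestamping ingress traffic
 * addressed to the local host.
 */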

/*
 * Attempt to create a timestamp-entry for packet p_info for flow in f_state
 */
static void pping_timestamp_packet(struct flow_state *f_state, void *ctx,
                                   struct parsing_context *pctx,
                                   struct packet_info *p_info, bool new_flow)
{
        if (!f_state || !p_info->pid_valid)
                return;
        }

        // No previous state - attempt to create it and push flow-opening event
        if (!f_state) {
                bpf_map_update_elem(&flow_state, &p_id.flow, &new_state,
                                    BPF_NOEXIST);
                f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);

                if (!f_state) // Creation failed
                        return;

                if (fe.event_info.event != FLOW_EVENT_OPENING) {
                        fe.event_info.event = FLOW_EVENT_OPENING;
                        fe.event_info.reason = EVENT_REASON_FIRST_OBS_PCKT;
                }
                fill_flow_event(&fe, now, &p_id.flow, EVENT_SOURCE_EGRESS);
                bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe,
                                      sizeof(fe));
        }

        f_state->sent_pkts++;
        f_state->sent_bytes += remaining_pkt_payload(pctx);
        if (config.localfilt && !pctx->is_egress &&
            is_local_address(p_info, ctx, pctx))
                return;

        // Check if identifier is new
        if (f_state->last_id == p_id.identifier)
        if (!new_flow && f_state->last_id == p_info->pid.identifier)
                return;
        f_state->last_id = p_id.identifier;
        f_state->last_id = p_info->pid.identifier;

        // Check rate-limit
        if (now < f_state->last_timestamp ||
            now - f_state->last_timestamp < config.rate_limit)
        if (!new_flow &&
            is_rate_limited(p_info->time, f_state->last_timestamp,
                            config.use_srtt ? f_state->srtt : f_state->min_rtt))
                return;

        /*
@@ -336,70 +639,72 @@ static void pping_egress(void *ctx, struct parsing_context *pctx)
         * the next available map slot somewhat fairer between heavy and sparse
         * flows.
         */
        f_state->last_timestamp = now;
        bpf_map_update_elem(&packet_ts, &p_id, &now, BPF_NOEXIST);
        f_state->last_timestamp = p_info->time;

        return;
        if (bpf_map_update_elem(&packet_ts, &p_info->pid, &p_info->time,
                                BPF_NOEXIST) != 0)
                send_map_full_event(ctx, p_info, PPING_MAP_PACKETTS);
}

/*
 * Main function for handling the pping ingress path.
 * Parses the packet for an identifier and tries to look up a stored timestamp.
 * If it finds a match, it pushes an rtt_event to the events buffer.
 * Attempt to match packet in p_info with a timestamp from flow in f_state
 */
static void pping_ingress(void *ctx, struct parsing_context *pctx)
static void pping_match_packet(struct flow_state *f_state, void *ctx,
                               struct parsing_context *pctx,
                               struct packet_info *p_info)
{
        struct packet_id p_id = { 0 };
        struct flow_event fe;
        struct rtt_event re = { 0 };
        struct flow_state *f_state;
        __u64 *p_ts;
        __u64 now;

        if (parse_packet_identifier(pctx, &p_id, &fe.event_info) < 0)
        if (!f_state || !p_info->reply_pid_valid)
                return;

        f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
        if (!f_state)
        p_ts = bpf_map_lookup_elem(&packet_ts, &p_info->reply_pid);
        if (!p_ts || p_info->time < *p_ts)
                return;

        f_state->rec_pkts++;
        f_state->rec_bytes += remaining_pkt_payload(pctx);

        now = bpf_ktime_get_ns();
        p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
        if (!p_ts || now < *p_ts)
                goto validflow_out;

        re.rtt = now - *p_ts;

        re.rtt = p_info->time - *p_ts;
        // Delete timestamp entry as soon as RTT is calculated
        bpf_map_delete_elem(&packet_ts, &p_id);
        bpf_map_delete_elem(&packet_ts, &p_info->reply_pid);

        if (f_state->min_rtt == 0 || re.rtt < f_state->min_rtt)
                f_state->min_rtt = re.rtt;
        f_state->srtt = calculate_srtt(f_state->srtt, re.rtt);

        // Fill event and push to perf-buffer
        re.event_type = EVENT_TYPE_RTT;
        re.timestamp = now;
        re.timestamp = p_info->time;
        re.min_rtt = f_state->min_rtt;
        re.sent_pkts = f_state->sent_pkts;
        re.sent_bytes = f_state->sent_bytes;
        re.rec_pkts = f_state->rec_pkts;
        re.rec_bytes = f_state->rec_bytes;
        re.flow = p_id.flow;
        re.flow = p_info->pid.flow;
        re.match_on_egress = pctx->is_egress;
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &re, sizeof(re));
}

validflow_out:
        // Wait with deleting flow until having pushed final RTT message
        if (fe.event_info.event == FLOW_EVENT_CLOSING && f_state) {
                bpf_map_delete_elem(&flow_state, &p_id.flow);
                fill_flow_event(&fe, now, &p_id.flow, EVENT_SOURCE_INGRESS);
                bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &fe,
                                      sizeof(fe));
        }
/*
 * Will parse the ingress/egress packet in pctx and attempt to create a
 * timestamp for it and match it against the reverse flow.
 */
static void pping(void *ctx, struct parsing_context *pctx)
{
        struct packet_info p_info = { 0 };
        struct flow_state *flow, *rev_flow;
        bool new_flow;

        if (parse_packet_identifier(pctx, &p_info) < 0)
                return;

        flow = update_flow(ctx, &p_info, &new_flow);
        pping_timestamp_packet(flow, ctx, pctx, &p_info, new_flow);

        rev_flow = update_rev_flow(ctx, &p_info);
        pping_match_packet(rev_flow, ctx, pctx, &p_info);

        delete_closed_flows(ctx, &p_info, flow, rev_flow);

        return;
}
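
As a concrete (hypothetical) walk-through of pping() for a new TCP connection:

/*
 * 1. Outgoing SYN: update_flow() creates the flow state and
 *    pping_timestamp_packet() stores (flow + TSval) -> now in packet_ts.
 * 2. Incoming SYN-ACK: update_rev_flow() finds the reverse flow and pushes a
 *    flow-opening event; pping_match_packet() looks up (reverse flow + TSecr),
 *    computes rtt = now - stored timestamp and emits an rtt_event.
 * 3. FIN/RST: delete_closed_flows() removes the affected flow state and emits
 *    the corresponding closing events.
 */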

// Programs

@@ -416,7 +721,7 @@ int pping_tc_egress(struct __sk_buff *skb)
                .is_egress = true,
        };

        pping_egress(skb, &pctx);
        pping(skb, &pctx);

        return TC_ACT_UNSPEC;
}
@@ -430,10 +735,11 @@ int pping_tc_ingress(struct __sk_buff *skb)
                .data_end = (void *)(long)skb->data_end,
                .pkt_len = skb->len,
                .nh = { .pos = pctx.data },
                .ingress_ifindex = skb->ingress_ifindex,
                .is_egress = false,
        };

        pping_ingress(skb, &pctx);
        pping(skb, &pctx);

        return TC_ACT_UNSPEC;
}
@@ -447,10 +753,11 @@ int pping_xdp_ingress(struct xdp_md *ctx)
                .data_end = (void *)(long)ctx->data_end,
                .pkt_len = pctx.data_end - pctx.data,
                .nh = { .pos = pctx.data },
                .ingress_ifindex = ctx->ingress_ifindex,
                .is_egress = false,
        };

        pping_ingress(ctx, &pctx);
        pping(ctx, &pctx);

        return XDP_PASS;
}
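
These three programs are normally loaded and attached by pping's userspace
loader. Purely as an illustration, a manual attachment with standard iproute2
tooling could look roughly like the following; the interface, object file and
section names are placeholders, not taken from this repository:

/*
 * # TC egress (requires a clsact qdisc):
 * tc qdisc add dev eth0 clsact
 * tc filter add dev eth0 egress bpf da obj pping_kern.o sec <egress-section>
 *
 * # TC ingress:
 * tc filter add dev eth0 ingress bpf da obj pping_kern.o sec <ingress-section>
 *
 * # or XDP ingress:
 * ip link set dev eth0 xdp obj pping_kern.o sec <xdp-section>
 */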