diff --git a/pping/Makefile b/pping/Makefile index f323ee9..0c68add 100644 --- a/pping/Makefile +++ b/pping/Makefile @@ -1,34 +1,11 @@ # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) USER_TARGETS := pping -TC_BPF_TARGETS := pping_kern_tc -BPF_TARGETS := pping_kern_xdp -BPF_TARGETS += $(TC_BPF_TARGETS) +BPF_TARGETS := pping_kern LDFLAGS += -pthread -EXTRA_DEPS += config.mk pping.h pping_helpers.h +EXTRA_DEPS += pping.h LIB_DIR = ../lib include $(LIB_DIR)/common.mk -include config.mk - -all: config.mk - -config.mk: configure - @sh configure - -ifndef HAVE_TC_LIBBPF -# If the iproute2 'tc' tool doesn't understand BTF debug info -# use llvm-strip to remove this debug info from object file -# -# *BUT* cannot strip everything as it removes ELF elems needed for -# creating maps -# -.PHONY: strip_tc_obj -strip_tc_obj: ${TC_BPF_TARGETS:=.o} - $(Q) echo "TC don't support libbpf - strip BTF info" - $(Q) llvm-strip --no-strip-all --remove-section .BTF $? - -all: strip_tc_obj -endif diff --git a/pping/README.md b/pping/README.md index c3d3978..ad5508b 100644 --- a/pping/README.md +++ b/pping/README.md @@ -1,21 +1,99 @@ # PPing using XDP and TC-BPF A re-implementation of [Kathie Nichols' passive ping -(pping)](https://github.com/pollere/pping) utility using XDP (on ingress) -and TC-BPF (on egress) for the packet capture logic. +(pping)](https://github.com/pollere/pping) utility using XDP (on ingress) and +TC-BPF (on egress) for the packet capture logic. ## Simple description -Passive Ping (PPing) makes use of the TCP Timestamp option to calculate the RTT for TCP traffic passing through. -PPing can be used on measure RTTs on end hosts or any device which sees both directions of the TCP flow. +Passive Ping (PPing) is a simple tool for passively measuring per-flow RTTs. It +can be used on endhosts as well as any (BPF-capable Linux) device which can see +both directions of the traffic (ex router or middlebox). 
Currently it only works +for TCP traffic which uses the TCP timestamp option, but could be extended to +also work with for example TCP seq/ACK numbers, the QUIC spinbit and ICMP +echo-reply messages. See the [TODO-list](./TODO.md) for more potential features +(which may or may not ever get implemented). -For outgoing packets, it checks for TCP timestamp TSval in the TCP header. If it finds one it creates a timestamp -for when it saw that TSval in a particular flow. On incomming packets it parses the TCP timestamp TSecr (which -is the TSval echoed by the receiving host) and checks it has seen any previous outgoing packets with that TCP -timestamp. If it has, an RTT is calculated as the difference in time between when it saw an outgoing packet -with a TSval, and when it received an incomming packet from the reverse flow with a matching TSecr. +The fundamental logic of pping is to timestamp a pseudo-unique identifier for +outgoing packets, and then look for matches in the incoming packets. If a match +is found, the RTT is simply calculated as the time difference between the +current time and the timestamp. -Note that TCP timestamps may not be unique for every packet in a flow, therefore it only matches the first -outgoing packet with a particular TSval with the first incomming packet with a matching TSecr. Duplicate -TSval/TSecr are ignored. +This tool, just as Kathie's original pping implementation, uses TCP timestamps +as identifiers. For outgoing packets, the TSval (which is a timestamp in and of +itself) is timestamped. Incoming packets are then parsed for the TSecr, which +are the echoed TSval values from the receiver. The TCP timestamps are not +necessarily unique for every packet (they have a limited update frequency, +appears to be 1000 Hz for modern Linux systems), so only the first instance of +an identifier is timestamped, and matched against the first incoming packet with +the identifier. 
The mechanism to ensure only the first packet is timestamped and +matched differs from the one in Kathie's pping, and is further described in +[SAMPLING_DESIGN](./SAMPLING_DESIGN.md). -## Planned design +## Design and technical description !["Design of eBPF pping](./eBPF_pping_design.png) + +### Files: +- **pping.c:** Userspace program that loads and attaches the BPF programs, pulls + the perf-buffer `rtt_events` to print out RTT messages and periodically cleans + up the hash-maps from old entries. Also passes user options to the BPF + programs by setting a "global variable" (stored in the programs .rodata + section). +- **pping_kern.c:** Contains the BPF programs that are loaded on tc (egress) and + XDP (ingress), as well as several common functions, a global constant `config` + (set from userspace) and map definitions. The tc program `pping_egress()` + parses outgoing packets for identifiers. If an identifier is found and the + sampling strategy allows it, a timestamp for the packet is created in + `packet_ts`. The XDP program `pping_ingress()` parses incomming packets for an + identifier. If found, it looks up the `packet_ts` map for a match on the + reverse flow (to match source/dest on egress). If there is a match, it + calculates the RTT from the stored timestamp and deletes the entry. The + calculated RTT (together with the flow-tuple) is pushed to the perf-buffer + `rtt_events`. +- **bpf_egress_loader.sh:** A shell script that's used by `pping.c` to setup a + clsact qdisc and attach the `pping_egress()` program to egress using + tc. **Note**: Unless your iproute2 comes with libbpf support, tc will use + iproute's own loading mechanism when loading and attaching object files + directly through the tc command line. To ensure that libbpf is always used to + load `pping_egress()`, `pping.c` actually loads the program and pins it to + `/sys/fs/bpf/pping/classifier`, and tc only attaches the pinned program. 
+- **functions.sh and parameters.sh:** Imported by `bpf_egress_loader.sh`. +- **pping.h:** Common header file included by `pping.c` and + `pping_kern.c`. Contains some common structs used by both (are part of the + maps). + +### BPF Maps: +- **flow_state:** A hash-map storing some basic state for each flow, such as the + last seen identifier for the flow and when the last timestamp entry for the + flow was created. Entries are created by `pping_egress()`, and can be updated + or deleted by both `pping_egress()` and `pping_ingress()`. Leftover entries + are eventually removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`. +- **packet_ts:** A hash-map storing a timestamp for a specific packet + identifier. Entries are created by `pping_egress()` and removed by + `pping_ingress()` if a match is found. Leftover entries are eventually + removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`. +- **rtt_events:** A perf-buffer used by `pping_ingress()` to push calculated RTTs + to `pping.c`, which continuously polls the map to print out the RTTs. + +## Similar projects +Passively measuring the RTT for TCP traffic is not a novel concept, and there +exists a number of other tools that can do so. A good overview of how passive +RTT calculation using TCP timestamps (as in this project) works is provided in +[this paper](https://doi.org/10.1145/2523426.2539132) from 2013. + +- [pping](https://github.com/pollere/pping): This project is largely a + re-implementation of Kathie's pping, but by using BPF and XDP as well as + implementing some filtering logic the hope is to be able to create an always-on + tool that can scale well even to large amounts of massive flows. +- [ppviz](https://github.com/pollere/ppviz): Web-based visualization tool for + the "machine-friendly" output from Kathie's pping tool. If/when we implement a + similar machine readable output option it should hopefully work with this + implementation as well. 
+- [tcptrace](https://github.com/blitz/tcptrace): A post-processing tool which + can analyze a tcpdump file and among other things calculate RTTs based on + seq/ACK numbers (`-r` or `-R` flag). +- **Dapper**: A passive TCP data plane monitoring tool implemented in P4 which + can among other things calculate the RTT based on the matching seq/ACK + numbers. [Paper](https://doi.org/10.1145/3050220.3050228). [Unofficial + source](https://github.com/muhe1991/p4-programs-survey/tree/master/dapper). +- [P4 Tofino TCP RTT measurement](https://github.com/Princeton-Cabernet/p4-projects/tree/master/RTT-tofino): + A passive TCP RTT monitor based on seq/ACK numbers implemented in P4 for + Tofino programmable switches. [Paper](https://doi.org/10.1145/3405669.3405823). diff --git a/pping/SAMPLING_DESIGN.md b/pping/SAMPLING_DESIGN.md new file mode 100644 index 0000000..00ff6cf --- /dev/null +++ b/pping/SAMPLING_DESIGN.md @@ -0,0 +1,386 @@ +# Introduction +This file is intended to document some of the challenges and design +decisions for adding sampling functionality to pping. It is partly +based on discussions from my supervisor meeting on 2021-02-22, and the +contents of my +[status slides](https://github.com/xdp-project/bpf-research/blob/master/meetings/simon/work_summary_20210222.org) +from that meeting. + +## Purpose of sampling +The main purpose of adding sampling to pping is to prevent a massive +amount of timestamp entries being created and quickly filling up the +map. This prevents new entries from being made until old ones can be +cleared out. A few large flows could thus "hog" all the map entries, +and prevent RTTs from other flows from being reported. Sampling is +therefore only used on egress to determine if a timestamp entry should +be created for a packet. All packets on ingress will still be parsed +and checked for a potential match. + +A secondary purpose of the sampling is the reduce the amount of output +that pping creates. 
In most circumstances, getting 1000 RTT reports +per second from a single flow will probably not be of interest, making +it less useful as a direct command-line utility. + +# Considered sampling approaches +There are a number of different ways that the sampling could be +performed, ex: + +- Sample every N packets per flow + - Not very flexible + - If same rate is used for all flows small flows would get very few + samples. +- Sample completely random packets + - Probably not a good idea... +- Head sampling (sample the first few packets of each flow) + - Not suitable for monitoring long flows + - RTT may change over lifetime of flow (due to buffer bloat) +- Probabilistic approach + - Probabilistic approaches have been used to for example capture + most relevant information with limited overhead in INT + - Could potentially be configured across multiple devices, so that + pping on all of the devices together capture the most relevant + traffic. + - While it could potentially work well, I'm not very familiar with + these approaches. Would take considerable research from my side + to figure out how these methods work, how to best apply it to + pping, and how to implement it in BPF. +- Used time-based sampling, limiting the rate of how often entries + can be created per flow + - Intuitively simple + - Should correspond quite well with the output you would probably + want? I.e. a few entries per flow (regardless of how heavy they + are) stating their current RTT. + +I believe that time-based sampling is the most promising solution that +I can implement in a reasonable time. In the future additional +sampling methods could potentially be added. + +# Considerations for time-based sampling +## Time interval +For the time-based sampling, we must determine how the interval +between when new timestamp entries are allowed should be set. + +### Static time interval +The simplest alternative is probably to use a static limit, ex +100ms. 
This would provide a rather simple and predictable limit for +how often entries can be created (per flow), and how much output you +would get (per flow). + +### RTT-based time interval +It may be desirable to use a more dynamic time limit, which is +adapted to each flow. One way to do this, would be to base the time +limit on the RTT for the flow. Flows with short RTTs could be expected +to undergo more rapid changes than flows with long RTTs. This would +require keeping track of the RTT for each flow, for example a moving +average. Additionally, some fall back is required before the RTT for +the flow is known. + +### User configurable +Regardless if a static or RTT-based (or some other alternative) is +used, it should probably be user configurable (including allowing the +user to disable sampling entirely). + +## Allowing bursts +It may be desirable to allow for multiple packets in a short +burst to be timestamped. Due to delayed ACKs, one may only get a +response for every other packet. If the first packet is timestamped, +and shortly after a second packet is sent (that has a different +identifier), then the response will effectively be for the second +packet, and no match for the timestamped identifier will be found. For +flows of the right (or wrong, depending on how you look at it) +intensity, slow enough where consecutive packets are likely to get +different TCP timestamps, but fast enough for the delayed ACKs to +acknowledge multiple packets, then you essentially have a 50/50 chance +of timestamping the wrong identifier and miss the RTT. + +To handle this, you could timestamp multiple consecutive packets (with +unique identifiers) in a short burst. You probably need to limit this +burst in both number of packets, as well as timeframe after the first +packet that additional packets may be included. For example, allowing +up to 3 packets (with different identifiers) to get a timestamp for up to +4 ms after the first one of them is timestamped. 
+ +If allowing bursts of timestamps to be created, it may also be +desirable to rate limit the output, in order to not get a burst of +similar RTTs for the flow in the output (which may also skew averages +and other post-processing). + +## Handling duplicate identifiers +TCP timestamps are only updated at a limited rate (ex. 1000 Hz), and +thus you can have multiple consecutive packets with the same TCP +timestamp if they're sent fast enough. For the calculated RTT to be +correct, you should only match the first sent packet with a unique +identifier with the first received packet with a matching +identifier. Otherwise, you may for example have a sequence with 100 +packets with the same identifier, and match the last of the outgoing +packets with the first incoming response, which may underestimate the +RTT with as much as the TCP timestamp clock rate (ex. 1 ms). + +### Current solution +The current solution to this is very simple. For outgoing packets, a +timestamp entry is only allowed to be created if no previous entry for +the identifier exists (realized through the `BPF_NOEXIST` flag to +`bpf_map_update_elem()` call). Thus only the first outgoing packet with +a specific identifier can be timestamped. On ingress, the first packet +with a matching identifier will mark the timestamp as used, preventing +later incoming responses from using that timestamp. The reason why the +timestamp is marked as used rather than directly deleted once a +matching packet on ingress is found, is to avoid the egress side +creating a new entry for the same identifier. This could occur if the +RTT is shorter than the TCP timestamp clock rate, and could result in +a massively underestimated RTT. This is the same mechanic that is used +in the original pping, as explained +[here](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L155-L168). + +### New solution +The current solution will no longer work if sampling is +introduced. 
With sampling, there's no guarantee that the sampled +packed will be the first outgoing packet in the sequence of packets +with identical timestamps. Thus the RTT may still be underestimated by +as much as the TCP timestamp clock rate (ex. 1 ms). Therefore, a new +solution is needed. The current idea is to keep track of the last-seen +identifier of each flow, and only allow a packet to be sampled for +timestamping if its identifier differs from the last-seen identifier +of the flow, i.e. it is the first packet in the flow with that +identifier. This would perhaps be problematic with some sampling +approaches as it requires that the packet is both the first one with a +specific identifier, as well as being elected for sampling. However +for the rate-limited sampling it should work quite well, as it will +only delay the sampling until a packet with a new identifier is found. + +Another advantage with this solution is that it should allow for +timestamp entries to be deleted as soon as the matching response is +found on ingress. The timestamp no longer needs to be kept around only +to prevent egress to create a new timestamp with the same identifier, +as this new solution should take care of that. This would help a lot +with keeping the map clean, as the timestamp entries would then +automatically be removed as soon as they are no longer needed. The +periodic cleanup from userspace would only be needed to remove the +occasional entries that were never matched for some reason (e.g. the +previously mentioned issue with delayed ACKs, flow stopped, the +reverse flow can't be observed etc.). + +One issue for this new solution is handling out-of-order packets. If +an entry with an older identifier is a bit delayed, it may arrive after +the last seen identifier for the flow has been updated. This old +identifier may then be considered new (as it differs from the current +one), allowing an entry to be created for it and reverting the last +seen identifier to a previous one. 
Additionally, this may +now allow the next packet having what used to be the current +identifier, also being detected as a new identifier (as the out-of +order packet reverted the last-seen identifier to an old one, creating +a bit of a ping-pong effect). For TCP timestamps this can easily be +avoided by simply requiring the new identifier to be greater than the +last-seen identifier (as TCP timestamps should be monotonically +increasing). That solution may however not be suitable if one wants to +reuse this mechanic for other protocols, such as the QUIC spinbit. + +## Keeping per-flow information +In order for the per-flow rate limiting to work, some per-flow state +must be maintained, namely when the last timestamp for that flow was +added (so that one can check that sufficient time has passed before +attempting to add another one). + +There may be some drawbacks with having to keep per-flow state. First +off, there will be some additional overhead from having to keep track +of this state. However, the savings from sampling the per-packet state +(the identifier/timestamps mappings) should hopefully cover the +overhead from keeping some per-flow state (and then some). + +Another issue that is worth keeping in mind is that this flow-state +will also need to be cleaned up eventually. This cleanup could be +handled in a similar manner as the current per-packet state is cleaned +up, by having the userspace process occasionally remove old +entries. In this case, the entries could be deemed as old if there was +a long time since the last timestamp was added for the flow, ex 300 +seconds as used by the [original +pping](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L117). +Additionally, one can parse the packets for indications that the +connection is being closed (ex TCP FIN/RST), and then directly delete +the flow-state for that flow from the BPF programs. 
+ +Later on, this per-flow state could potentially be expanded to include +other information deemed useful (such as ex. minimum and average RTT). + +### Alternative solution - keeping identifier in flow-state +One idea that came up during my supervisor meeting, was that instead +of creating timestamps for individual packets as is currently done, +you only create a number of timestamps for each flow. That is, instead +of creating per-packet entries in a separate map, you include a number +of timestamp/identifier pairs in the flow-state information itself. + +While this would potentially be rather efficient, limiting the number +of timestamp entries to a fixed number per flow, I'm opposed to this +idea for a few reasons: + +1. The sampling rate would be inherently tied to the RTT of the + flow. While this may in many cases be desirable, it is not very + flexible. It would also make it hard to ex. turn off sampling + completely. +2. The number of timestamps per flow would need to be fixed and known + at compile time(?). As the timestamps/identifier pairs are kept in + the flow-state information itself, and the flow-state information + needs to be of a known and fixed size when creating the maps. This + may also result in some wasted space if the flow-state includes + spots for several timestamp/identifier pairs, but most flows only + make use of a few (although having an additional timestamp entry + map of fixed size wastes space in a similar manner). +3. If a low number of timestamp/identifier pairs are kept, selecting + an identifier that is missed (ex due to delayed ACKs) could + effectively block new timestamps from being created (and thus from + RTTs being calculated) for the flow for a relatively long + while. New timestamps can only be created if you have a free slot, + and you can only free a slot by either getting a matching reply, or + waiting until it can be safely assumed that the response was missed + (and not just delayed). 
+ +## Graceful degradation +Another aspect I've been asked to consider is how to gracefully reduce +the functionality of pping as the timestamp entry map gets full (as +with sufficiently many and heavy flows, it's likely inevitable). + +What currently happens when the timestamp entry map is full, is simply +that no more entries can be made until some have been cleared +out. When adding a rate-limit to the number of entries per flow, as +well as directly deleting entries upon match, I believe this is a +reasonable way to handle the situation. As soon as some RTTs for +current flows have been reported, space for new entries will be +available. The next outgoing packet with a valid identifier from any +flow that does not have to currently wait for its rate limit will then +be able to grab the next spot. However this will still favor heavy +flows over smaller flows, as heavy flows are more likely to be able to +get in a packet first, but they will at least still be limited by the +rate limit, and thus have to take turns with other flows. + +It also worth noting that as per-flow state will need to be kept, +there will be strict limit to the number of concurrent flows that can +be monitored, corresponding to the number of entries that can be held +by the map for the per-flow state. Once the per-flow state map is +full, no new flows can be added until one is cleared. It also doesn't +make sense to add packet timestamp entries for flows which state +cannot be tracked, as the rate limit cannot be enforced then. + +I see a few ways to more actively handle degradation, depending on what +one views as desirable: + +1. One can attempt to monitor many flows, with infrequent RTT + calculations for each. In this case, the userspace process that + occasionally clears out the timestamp map could automatically + decrease the per-flow rate limit if it detects the map is getting + close to full. 
That way, fewer entries would be generated per flow, + and flows would be forced to take turns to a greater degree when + the map is completely full. Similarly, one may wish to reduce the + timeout for old flows if the per-flow map is getting full, in order + to more quickly allow new flows to be monitored, and only keeping + the most active flows around. +2. One can attempt to monitor fewer flows, but with more frequent RTT + calculations for each. The easiest way to achieve this is to + probably to set a smaller size on the per-flow map relative to the + per-packet timestamp map. In case one wants to primarily focus on + heavier flows, one could possibly add ex. packet rate to the + per-flow information, and remove the flows with the lowest packet + rates. +3. One can attempt to focus on flows with shorter RTTs. Flows with + shorter RTTs should make more efficient use of timestamp entries, + as they can be cleared out faster allowing for new entries. On the + other hand, flows with longer RTTs may be the more interesting + ones, as they are more likely to indicate some issue. +4. One can simply try to create a larger map (and copy over the old + contents) once the map is approaching full. This way one can start + with reasonably small maps, and only start eating up more memory if + required. + +While I'm leaning towards option 1 or 4, I don't have a very strong +personal opinion here, and would like some input on what others (who +may have more experience with network measurements) think are +reasonable trade-offs to do. + +# Implementation considerations +There are of course several more practical considerations as well when +implementing the sampling, some of which I'll try to address here. + +## "Global" vs PERCPU maps +In general, it's likely wise to go with PERCPU maps over "global" (aka +non-PERCPU) maps whenever possible, as PERCPU maps should be more +performant, and also avoids concurrency issues. 
But this only applies +of course, if the BPF programs don't need to act on global state. + +For pping, I unfortunately see no way for the program to work with +only information local to each CPU core individually. The per-packet +identifier and timestamps need to be global, as there is no guarantee +that the same core that timestamped a packet will process the response +for that packet. Likewise, the per-flow information, like the time of +the last timestamping, also needs to be global. Otherwise rate limit +would be per-CPU-per-flow rather than just per-flow. + +In practice, packets from the same flow are apparently often handled +by the same CPU, but this is not guaranteed, and therefore not +something we can rely on (especially when state needs to be shared by +both ingress and egress). Could try to use a CPU map to enforce this +behavior, but probably not a great idea. + +## Concurrency issues +In addition to the performance hit, sharing global state between +multiple concurrent processes risks running into concurrency issues +unless access is synchronized in some manner (in BPF, the two +mechanics I know of are atomic adds and spin-locks for maps). With the +risk of me misunderstanding the memory model for BPF programs (which +from what I can tell I'm probably not alone about), I will attempt to +explain the potential concurrency issues I see with the pping +implementation. + +The current pping implementation already has a potential concurrency +issue. When matches for identifiers are found on ingress, a check is +performed to see if the timestamp has already been used or +not. Multiple packets processed in parallel could potentially all +find that the timestamp is unused, before any of them manage to mark +it as used for the others. This may result in pping matching several +responses to a single timestamp entry and reporting the RTTs for each +of them. 
I do not consider this a significant issue however, as if +they are concurrent enough that they manage to lookup the used status +before another has time to set it, the difference in time between them +should be very small, and therefore compute very similar RTTs. So the +reported RTTs should still be rather accurate, just over-reported. + +When adding sampling and per-flow information, some additional +concurrency issues may be encountered. Mainly, multiple packets may +find that they are allowed to add a new timestamp, before they manage +to update the time of last added time-stamp in the per-flow +state. This may lead to multiple attempts at creating a timestamp at +approximately the same time. For TCP timestamps, all the identifiers +are likely to be identical (as the TCP timestamp itself is only +updated at limited rate), so only one of them should succeed +anyways. If using identifiers that are more unique however, such as +TCP sequence numbers, then it's possible that a short burst of entries +would be created instead of just a single entry within the rate-limit +for the flow. + +Overall, I don't think these concurrency issues are that severe, as +they should still result in accurate RTTs, just some possible +over-reporting. I don't believe these issues warrants the performance +impact and potential code complexity of trying to synchronize +access. Furthermore, from what I understand these concurrency issues +are not too likely to occur in reality, as packets from the same flow +are often processed on the same core. + +## Global variable vs single-entry map +With BTF, there seems like BPF programs now support the use of global +variables. These global variables can supposedly be modified from user +space, and should from what I've heard also be more efficient than map +lookups. They therefore seem like promising way to pass some +user-configured options from userspace to the BPF programs. 
+ +I would however need to lookup how to actually use these, as the +examples I've seen have used a slightly different libbpf setup, where +a "skeleton" header-file is compiled and imported to the userspace +program. There should be some examples in the [xdp-tools +repository](https://github.com/xdp-project/xdp-tools). + +The alternative I guess would be to use a +`BPF_MAP_TYPE_PERCPU_ARRAY` with a single entry, which is filled in +with the user-configured option by the userspace program. + + + + + diff --git a/pping/TODO.md b/pping/TODO.md index b14a563..546ee34 100644 --- a/pping/TODO.md +++ b/pping/TODO.md @@ -2,27 +2,60 @@ ## Protocols - [x] TCP (based on timestamp options) - - [ ] Skip pure ACKs for egress? + - [x] Skip pure ACKs for egress + - Timestamping pure ACKs may lead to erroneous RTTs (ex. delay + between application attempting to send data being recognized as + an RTT) - [ ] Add fallback to SEQ/ACK in case of no timestamp? + - Some machines may not use TCP timestamps (either not supported + at all, or disabled as in ex. Windows 10) + - If one only considers SEQ/ACK (and don't check for SACK + options), could result in ex. delay from retransmission being + included in RTT - [ ] ICMP (ex Echo/Reply) - [ ] QUIC (based on spinbit) ## General pping +- [x] Add sampling so that RTT is not calculated for every packet + (with unique value) for large flows + - [ ] Allow short bursts to bypass sampling in order to handle + delayed ACKs +- [x] Keep some per-flow state + - Will likely be needed for the sampling + - [ ] Could potentially include keeping track of average RTT, which + may be useful for some decisions (ex. how often to sample, + when entry can be removed etc) + - [ ] Could potentially include keeping track of minimum RTT (as + done by the original pping), ex. 
to track bufferbloat + - [ ] Could potentially include keeping track of if flow is + bi-directional + - Original pping checks if flow is bi-directional before adding + timestamps, but this could miss shorter flows +- [ ] Dynamically grow the maps if they are starting to get full +- [ ] Improve map cleaning: Use a dynamic time to live for map entries + based on flow's RTT, instead of static 10s limit + - Keeping entries around for a long time allows the map to grow + unnecessarily large, which slows down the cleaning and may block + new entries - [ ] Use libxdp to load XDP program -- [ ] Check for existance of reverse flow before adding to hash-map (to avoid adding identifiers for flows that we can't see the reverse traffic for)? - - This could miss the first few packets, would not be ideal for short flows -- [ ] Keep track of minimum RTT for each flow (done by Pollere's pping, and helps identify buffer bloat) -- [ ] Add configurable rate-limit for how often each flow can add entries to the map (prevent high-rate flows from quickly filling up the map) -- [ ] Improve map cleaning: Use a dynamic time to live for hash map entries based on flow's RTT, instead of static 10s limit -- [ ] Add support for automatically deleting entries if they are unique - - TCP timestamp need to be kept for a while (because multiple packets can have the same timestamp), but for identifiers that are unique per packet, they can be removed directly after RTT is calculated +- [ ] Add option for machine-readable output (as original pping) + - It may be a good idea to keep the same format as original pping, + so that tools such as [ppviz](https://github.com/pollere/ppviz) + work for both pping implementations. +- [ ] Add timestamps to output (as original pping) +- [ ] Add support for other hooks + - Ex TC-BPF on ingress instead of XDP? 
## Done - [x] Clean up commits and add signed-off-by tags - [x] Add SPDX-license-identifier tags - [x] Format C-code in kernel style -- [x] Use existing funcionality to reuse maps by using BTF-defined maps - - [x] Use BTF-defined maps for TC-BPF as well if iproute has libbpf support -- [x] Cleanup: Unload TC-BPF at program shutdown, and unpin map - In userspace part +- [x] Use existing functionality to reuse maps by using BTF-defined + maps + - [x] Use BTF-defined maps for TC-BPF as well if iproute has libbpf + support +- [x] Cleanup: Unload TC-BPF at program shutdown, and unpin map - In + userspace part - [x] Add IPv6 support - [x] Refactor to support easy addition of other protocols +- [x] Load tc-bpf program with libbpf (only attach it with tc) diff --git a/pping/bpf_egress_loader.sh b/pping/bpf_egress_loader.sh index 9332ad7..f15de65 100755 --- a/pping/bpf_egress_loader.sh +++ b/pping/bpf_egress_loader.sh @@ -4,7 +4,7 @@ # License: GPLv2 # # Modified by Simon Sundberg to add support -# of optional section (--sec) option and changed default BPF_OBJ +# of optional section (--sec) option or attaching a pinned program # basedir=`dirname $0` source ${basedir}/functions.sh @@ -64,6 +64,16 @@ function tc_egress_bpf_attach() egress bpf da obj "$objfile" sec "$section" } +function tc_egress_bpf_attach_pinned() +{ + local device=${1:-$DEV} + local pinprog=${2:-$PIN_PROG} + shift 2 + + call_tc filter add dev "$device" pref 2 handle 2 \ + egress bpf da pinned "$pinprog" +} + function tc_egress_list() { local device=${1:-$DEV} @@ -77,7 +87,12 @@ if [[ -n $REMOVE ]]; then fi tc_init_clsact $DEV -tc_egress_bpf_attach $DEV $BPF_OBJ $SEC + +if [[ -n $PIN_PROG ]]; then + tc_egress_bpf_attach_pinned $DEV $PIN_PROG +else + tc_egress_bpf_attach $DEV $BPF_OBJ $SEC +fi # Practical to list egress filters after setup. 
# (It's a common mistake to have several progs loaded) diff --git a/pping/configure b/pping/configure deleted file mode 100644 index 2f4c54b..0000000 --- a/pping/configure +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) -# This is not an autoconf generated configure -# - -# Output file which is input to Makefile -CONFIG=config.mk - -# Assume tc is in $PATH -TC=tc - -check_tc_libbpf() -{ - tc_version=$($TC -V) - if echo $tc_version | grep -q libbpf; then - libbpf_version=${tc_version##*libbpf } - echo "HAVE_TC_LIBBPF:=y" >> $CONFIG - echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG - echo "yes ($libbpf_version)" - else - echo "no" - fi -} - -echo "# Generated config" > $CONFIG -echo "Detecting available features on system" - -echo -n " - libbpf support in tc tool: " -check_tc_libbpf \ No newline at end of file diff --git a/pping/eBPF_pping_design.png b/pping/eBPF_pping_design.png index 88a5971..ab91002 100644 Binary files a/pping/eBPF_pping_design.png and b/pping/eBPF_pping_design.png differ diff --git a/pping/parameters.sh b/pping/parameters.sh index c947e27..1a1a49a 100644 --- a/pping/parameters.sh +++ b/pping/parameters.sh @@ -6,7 +6,7 @@ # License: GPLv2 # # Modified by Simon Sundberg to add support -# of optional section (--sec) option +# of optional section (--sec) option or attaching a pinned program # function usage() { @@ -20,12 +20,13 @@ function usage() { echo " -l | --list : (\$LIST) List setup after setup" echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load" echo " --sec : (\$SEC) Section of BPF-object to load" + echo " --pinned : (\$PIN_PROG) Path to pinned program to attach" echo "" } # Using external program "getopt" to get --long-options OPTIONS=$(getopt -o vshd:l \ - --long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec: -- "$@") + --long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec:,pinned: -- "$@") if (( $? 
!= 0 )); then usage err 2 "Error calling getopt" @@ -50,6 +51,11 @@ while true; do info "Section to load: $SEC" >&2 shift 2 ;; + --pinned ) + export PIN_PROG=$2 + info "Pinned program path: $PIN_PROG" >&2 + shift 2 + ;; -v | --verbose) export VERBOSE=yes # info "Verbose mode: VERBOSE=$VERBOSE" >&2 diff --git a/pping/pping.c b/pping/pping.c index 0df6d3b..f834676 100644 --- a/pping/pping.c +++ b/pping/pping.c @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +static const char *__doc__ = + "Passive Ping - monitor flow RTT based on TCP timestamps"; + #include #include #include @@ -10,7 +13,9 @@ #include #include #include +#include #include +#include #include // For detecting Ctrl-C #include // For setting rlmit #include @@ -18,25 +23,18 @@ #include #include -#include "pping.h" //key and value structs for the ts_start map +#include "pping.h" //common structs for user-space and BPF parts #define NS_PER_SECOND 1000000000UL #define NS_PER_MS 1000000UL #define TCBPF_LOADER_SCRIPT "./bpf_egress_loader.sh" -#define PINNED_DIR "/sys/fs/bpf/tc/globals" -#define PPING_XDP_OBJ "pping_kern_xdp.o" -#define PPING_TCBPF_OBJ "pping_kern_tc.o" -#define XDP_FLAGS XDP_FLAGS_UPDATE_IF_NOEXIST - -#define TS_MAP "ts_start" -#define MAP_CLEANUP_INTERVAL \ - (1 * NS_PER_SECOND) // Clean timestamp map once per second #define TIMESTAMP_LIFETIME \ - (10 * NS_PER_SECOND) // Clear out entries from ts_start if they're over 10 seconds + (10 * NS_PER_SECOND) // Clear out packet timestamps if they're over 10 seconds +#define FLOW_LIFETIME \ + (300 * NS_PER_SECOND) // Clear out flows if they're inactive over 300 seconds -#define PERF_BUFFER "rtt_events" #define PERF_BUFFER_PAGES 64 // Related to the perf-buffer size? 
#define PERF_POLL_TIMEOUT_MS 100 @@ -57,12 +55,146 @@ // Structure to contain arguments for clean_map (for passing to pthread_create) struct map_cleanup_args { - int map_fd; - __u64 max_age_ns; + __u64 cleanup_interval; + int packet_map_fd; + int flow_map_fd; +}; + +// Store configuration values in struct to easily pass around +struct pping_config { + struct bpf_config bpf_config; + __u64 cleanup_interval; + int xdp_flags; + int ifindex; + char ifname[IF_NAMESIZE]; + bool force; + char *object_path; + char *ingress_sec; + char *egress_sec; + char *pin_dir; + char *packet_map; + char *flow_map; + char *rtt_map; }; static volatile int keep_running = 1; +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "interface", required_argument, NULL, 'i' }, // Name of interface to run on + { "rate-limit", required_argument, NULL, 'r' }, // Sampling rate-limit in ms + { "force", no_argument, NULL, 'f' }, // Detach any existing XDP program on interface + { "cleanup-interval", required_argument, NULL, 'c' }, // Map cleaning interval in s + { 0, 0, NULL, 0 } +}; + +/* + * Copied from Jesper Dangaaard Brouer's traffic-pacing-edt example + */ +static void print_usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +static double parse_positive_double_argument(const char *str, + const char *parname) +{ + char *endptr; + double val; + val = strtod(str, &endptr); + if (strlen(str) != endptr - str) { + fprintf(stderr, "%s %s is not a valid number\n", parname, str); + return -EINVAL; + } + if (val < 0) { + fprintf(stderr, "%s must be 
positive\n", parname); + return -EINVAL; + } + + return val; +} + +static int parse_arguments(int argc, char *argv[], struct pping_config *config) +{ + int err, opt; + double rate_limit_ms, cleanup_interval_s; + + config->ifindex = 0; + + while ((opt = getopt_long(argc, argv, "hfi:r:c:", long_options, + NULL)) != -1) { + switch (opt) { + case 'i': + if (strlen(optarg) > IF_NAMESIZE) { + fprintf(stderr, "interface name too long\n"); + return -EINVAL; + } + strncpy(config->ifname, optarg, IF_NAMESIZE); + + config->ifindex = if_nametoindex(config->ifname); + if (config->ifindex == 0) { + err = -errno; + fprintf(stderr, + "Could not get index of interface %s: %s\n", + config->ifname, strerror(err)); + return err; + } + break; + case 'r': + rate_limit_ms = parse_positive_double_argument( + optarg, "rate-limit"); + if (rate_limit_ms < 0) + return -EINVAL; + + config->bpf_config.rate_limit = + rate_limit_ms * NS_PER_MS; + break; + case 'c': + cleanup_interval_s = parse_positive_double_argument( + optarg, "cleanup-interval"); + if (cleanup_interval_s < 0) + return -EINVAL; + + config->cleanup_interval = + cleanup_interval_s * NS_PER_SECOND; + break; + case 'f': + config->force = true; + break; + case 'h': + printf("HELP:\n"); + print_usage(argv); + exit(0); + default: + fprintf(stderr, "Unknown option %s\n", argv[optind]); + return -EINVAL; + } + } + + if (config->ifindex == 0) { + fprintf(stderr, + "An interface (-i or --interface) must be provided\n"); + return -EINVAL; + } + + return 0; +} + void abort_program(int sig) { keep_running = 0; @@ -78,28 +210,48 @@ static int set_rlimit(long int lim) return !setrlimit(RLIMIT_MEMLOCK, &rlim) ? 
0 : -errno; } -static int mkdir_if_noexist(const char *path) +static int +bpf_obj_run_prog_pindir_func(struct bpf_object *obj, const char *prog_title, + const char *pin_dir, + int (*func)(struct bpf_program *, const char *)) { - int ret; - struct stat st = { 0 }; + int len; + struct bpf_program *prog; + char path[MAX_PATH_LEN]; - ret = stat(path, &st); - if (ret) { - if (errno != ENOENT) - return -errno; + len = snprintf(path, MAX_PATH_LEN, "%s/%s", pin_dir, prog_title); + if (len < 0) + return len; + if (len > MAX_PATH_LEN) + return -ENAMETOOLONG; - return mkdir(path, 0700) ? -errno : 0; - } - return S_ISDIR(st.st_mode) ? 0 : -EEXIST; + prog = bpf_object__find_program_by_title(obj, prog_title); + if (!prog || libbpf_get_error(prog)) + return prog ? libbpf_get_error(prog) : -EINVAL; + + return func(prog, path); } -static int bpf_obj_open(struct bpf_object **obj, const char *obj_path, - char *map_path) +/* + * Similar to bpf_object__pin_programs, but only attemps to pin a + * single program prog_title at path pin_dir/prog_title + */ +static int bpf_obj_pin_program(struct bpf_object *obj, const char *prog_title, + const char *pin_dir) { - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .pin_root_path = map_path); - *obj = bpf_object__open_file(obj_path, map_path ? &opts : NULL); - return libbpf_get_error(*obj); + return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir, + bpf_program__pin); +} + +/* + * Similar to bpf_object__unpin_programs, but only attempts to unpin a + * single program prog_title at path pin_dir/prog_title. 
+ */ +static int bpf_obj_unpin_program(struct bpf_object *obj, const char *prog_title, + const char *pin_dir) +{ + return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir, + bpf_program__unpin); } static int xdp_detach(int ifindex, __u32 xdp_flags) @@ -112,7 +264,6 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex, { struct bpf_program *prog; int prog_fd; - int err; if (sec) prog = bpf_object__find_program_by_title(obj, sec); @@ -120,24 +271,28 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex, prog = bpf_program__next(NULL, obj); prog_fd = bpf_program__fd(prog); - if (prog_fd < 0) { - fprintf(stderr, "Could not find program to attach\n"); + if (prog_fd < 0) return prog_fd; - } if (force) // detach current (if any) xdp-program first xdp_detach(ifindex, xdp_flags); - err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags); - if (err < 0) { - fprintf(stderr, "Failed loading xdp-program on interface %d\n", - ifindex); - return err; - } - return 0; + return bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags); } -static int run_program(const char *path, char *const argv[]) +static int init_rodata(struct bpf_object *obj, void *src, size_t size) +{ + struct bpf_map *map = NULL; + bpf_object__for_each_map(map, obj) { + if (strstr(bpf_map__name(map), ".rodata")) + return bpf_map__set_initial_value(map, src, size); + } + + // No .rodata map found + return -EINVAL; +} + +static int run_external_program(const char *path, char *const argv[]) { int status; int ret = -1; @@ -157,18 +312,24 @@ static int run_program(const char *path, char *const argv[]) } } -static int tc_bpf_load(char *bpf_object, char *section, char *interface) +static int tc_bpf_attach(const char *pin_dir, const char *section, + char *interface) { - char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface, "--obj", - bpf_object, "--sec", section, NULL }; - return run_program(TCBPF_LOADER_SCRIPT, argv); + char prog_path[MAX_PATH_LEN]; + char *const 
argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface, + "--pinned", prog_path, NULL }; + + if (snprintf(prog_path, sizeof(prog_path), "%s/%s", pin_dir, section) < 0) + return -EINVAL; + + return run_external_program(TCBPF_LOADER_SCRIPT, argv); } static int tc_bpf_clear(char *interface) { char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface, "--remove", NULL }; - return run_program(TCBPF_LOADER_SCRIPT, argv); + return run_external_program(TCBPF_LOADER_SCRIPT, argv); } /* @@ -184,45 +345,82 @@ static __u64 get_time_ns(void) return (__u64)t.tv_sec * NS_PER_SECOND + (__u64)t.tv_nsec; } -static int clean_map(int map_fd, __u64 max_age) +static bool packet_ts_timeout(void *val_ptr, __u64 now) +{ + __u64 ts = *(__u64 *)val_ptr; + if (now > ts && now - ts > TIMESTAMP_LIFETIME) + return true; + return false; +} + +static bool flow_timeout(void *val_ptr, __u64 now) +{ + __u64 ts = ((struct flow_state *)val_ptr)->last_timestamp; + if (now > ts && now - ts > FLOW_LIFETIME) + return true; + return false; +} + +/* + * Loops through all entries in a map, running del_decision_func(value, time) + * on every entry, and deleting those for which it returns true. + * On sucess, returns the number of entries deleted, otherwise returns the + * (negative) error code. + */ +//TODO - maybe add some pointer to arguments for del_decision_func? 
+static int clean_map(int map_fd, size_t key_size, size_t value_size, + bool (*del_decision_func)(void *, __u64)) { int removed = 0; - struct packet_id key, prev_key = { 0 }; - struct packet_timestamp value; + void *key, *prev_key, *value; bool delete_prev = false; __u64 now_nsec = get_time_ns(); - int entries = 0; // Just for debug - __u64 duration; // Just for debug +#ifdef DEBUG + int entries = 0; + __u64 duration; +#endif if (now_nsec == 0) return -errno; + key = malloc(key_size); + prev_key = malloc(key_size); + value = malloc(value_size); + if (!key || !prev_key || !value) { + removed = -ENOMEM; + goto cleanup; + } + // Cannot delete current key because then loop will reset, see https://www.bouncybouncy.net/blog/bpf_map_get_next_key-pitfalls/ - while (bpf_map_get_next_key(map_fd, &prev_key, &key) == 0) { + while (bpf_map_get_next_key(map_fd, prev_key, key) == 0) { if (delete_prev) { - bpf_map_delete_elem(map_fd, &prev_key); + bpf_map_delete_elem(map_fd, prev_key); removed++; delete_prev = false; } - if (bpf_map_lookup_elem(map_fd, &key, &value) == 0) { - if (now_nsec > value.timestamp && - now_nsec - value.timestamp > max_age) { - delete_prev = true; - } - } + if (bpf_map_lookup_elem(map_fd, key, value) == 0) + delete_prev = del_decision_func(value, now_nsec); +#ifdef DEBUG entries++; - prev_key = key; +#endif + memcpy(prev_key, key, key_size); } if (delete_prev) { - bpf_map_delete_elem(map_fd, &prev_key); + bpf_map_delete_elem(map_fd, prev_key); removed++; } +#ifdef DEBUG duration = get_time_ns() - now_nsec; - printf("Gone through %d entries and removed %d of them in %llu.%09llu s\n", - entries, removed, duration / NS_PER_SECOND, + printf("%d: Gone through %d entries and removed %d of them in %llu.%09llu s\n", + map_fd, entries, removed, duration / NS_PER_SECOND, duration % NS_PER_SECOND); +#endif +cleanup: + free(key); + free(prev_key); + free(value); return removed; } @@ -230,11 +428,14 @@ static void *periodic_map_cleanup(void *args) { struct 
map_cleanup_args *argp = args; struct timespec interval; - interval.tv_sec = MAP_CLEANUP_INTERVAL / NS_PER_SECOND; - interval.tv_nsec = MAP_CLEANUP_INTERVAL % NS_PER_SECOND; + interval.tv_sec = argp->cleanup_interval / NS_PER_SECOND; + interval.tv_nsec = argp->cleanup_interval % NS_PER_SECOND; while (keep_running) { - clean_map(argp->map_fd, argp->max_age_ns); + clean_map(argp->packet_map_fd, sizeof(struct packet_id), + sizeof(__u64), packet_ts_timeout); + clean_map(argp->flow_map_fd, sizeof(struct network_tuple), + sizeof(struct flow_state), flow_timeout); nanosleep(&interval, NULL); } pthread_exit(NULL); @@ -274,28 +475,134 @@ static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt) fprintf(stderr, "Lost %llu RTT events on CPU %d\n", lost_cnt, cpu); } +static int load_attach_bpfprogs(struct bpf_object **obj, + struct pping_config *config, bool *tc_attached, + bool *xdp_attached) +{ + int err; + + // Open and load ELF file + *obj = bpf_object__open(config->object_path); + err = libbpf_get_error(*obj); + if (err) { + fprintf(stderr, "Failed opening object file %s: %s\n", + config->object_path, strerror(-err)); + return err; + } + + err = init_rodata(*obj, &config->bpf_config, + sizeof(config->bpf_config)); + if (err) { + fprintf(stderr, "Failed pushing user-configration to %s: %s\n", + config->object_path, strerror(-err)); + return err; + } + + err = bpf_object__load(*obj); + if (err) { + fprintf(stderr, "Failed loading bpf program in %s: %s\n", + config->object_path, strerror(-err)); + return err; + } + + // Attach tc program + err = bpf_obj_pin_program(*obj, config->egress_sec, config->pin_dir); + if (err) { + fprintf(stderr, "Failed pinning tc program to %s/%s: %s\n", + config->pin_dir, config->egress_sec, strerror(-err)); + return err; + } + + err = tc_bpf_attach(config->pin_dir, config->egress_sec, + config->ifname); + if (err) { + fprintf(stderr, + "Failed attaching tc program on interface %s: %s\n", + config->ifname, strerror(-err)); + return 
err; + } + *tc_attached = true; + + // Attach XDP program + err = xdp_attach(*obj, config->ingress_sec, config->ifindex, + config->xdp_flags, config->force); + if (err) { + fprintf(stderr, "Failed attaching XDP program to %s%s: %s\n", + config->ifname, + config->force ? "" : ", ensure no other XDP program is already running on interface", + strerror(-err)); + return err; + } + *xdp_attached = true; + + return 0; +} + +static int setup_periodical_map_cleaning(struct bpf_object *obj, + struct pping_config *config) +{ + pthread_t tid; + struct map_cleanup_args clean_args = { + .cleanup_interval = config->cleanup_interval + }; + int err; + + clean_args.packet_map_fd = + bpf_object__find_map_fd_by_name(obj, config->packet_map); + if (clean_args.packet_map_fd < 0) { + fprintf(stderr, "Could not get file descriptor of map %s: %s\n", + config->packet_map, + strerror(-clean_args.packet_map_fd)); + return clean_args.packet_map_fd; + } + + clean_args.flow_map_fd = + bpf_object__find_map_fd_by_name(obj, config->flow_map); + if (clean_args.flow_map_fd < 0) { + fprintf(stderr, "Could not get file descriptor of map %s: %s\n", + config->flow_map, strerror(-clean_args.flow_map_fd)); + return clean_args.packet_map_fd; + } + + err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args); + if (err) { + fprintf(stderr, + "Failed starting thread to perform periodic map cleanup: %s\n", + strerror(-err)); + return err; + } + + return 0; +} + int main(int argc, char *argv[]) { int err = 0; - int ifindex = 0; - bool xdp_attached = false; + bool tc_attached = false; - char map_path[MAX_PATH_LEN]; + bool xdp_attached = false; struct bpf_object *obj = NULL; - struct bpf_map *map = NULL; - pthread_t tid; - struct map_cleanup_args clean_args; + struct pping_config config = { + .bpf_config = { .rate_limit = 100 * NS_PER_MS }, + .cleanup_interval = 1 * NS_PER_SECOND, + .object_path = "pping_kern.o", + .ingress_sec = INGRESS_PROG_SEC, + .egress_sec = EGRESS_PROG_SEC, + .pin_dir = 
"/sys/fs/bpf/pping", + .packet_map = "packet_ts", + .flow_map = "flow_state", + .rtt_map = "rtt_events", + .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST, + .force = false, + }; struct perf_buffer *pb = NULL; - struct perf_buffer_opts pb_opts; - - // TODO - better argument parsing (more relevant as featureas are added) - if (argc < 2) { - printf("Usage: ./pping_user \n"); - return EXIT_FAILURE; - } + struct perf_buffer_opts pb_opts = { + .sample_cb = handle_rtt_event, + .lost_cb = handle_missed_rtt_event, + }; // Detect if running as root if (geteuid() != 0) { @@ -308,98 +615,41 @@ int main(int argc, char *argv[]) if (err) { fprintf(stderr, "Could not set rlimit to infinity: %s\n", strerror(-err)); - goto cleanup; + return EXIT_FAILURE; } - // Get index of interface - ifindex = if_nametoindex(argv[1]); - if (ifindex == 0) { - err = -errno; - fprintf(stderr, "Could not get index of interface %s: %s\n", - argv[1], strerror(-err)); - goto cleanup; - } - - // Load and attach the XDP program - err = mkdir_if_noexist("/sys/fs/bpf/tc"); + err = parse_arguments(argc, argv, &config); if (err) { - fprintf(stderr, - "Failed creating directory %s in which to pin map: %s\n", - "/sys/fs/bpf/tc", strerror(-err)); - goto cleanup; - } - - err = bpf_obj_open(&obj, PPING_XDP_OBJ, PINNED_DIR); - if (err) { - fprintf(stderr, "Failed opening object file %s: %s\n", - PPING_XDP_OBJ, strerror(-err)); - goto cleanup; - } - - // Get map here to allow for unpinning at cleanup - map = bpf_object__find_map_by_name(obj, TS_MAP); - err = libbpf_get_error(map); - if (err) { - fprintf(stderr, "Could not find map %s in %s: %s\n", TS_MAP, - PPING_XDP_OBJ, strerror(err)); - map = NULL; - } - - err = bpf_object__load(obj); - if (err) { - fprintf(stderr, "Failed loading XDP program: %s\n", + fprintf(stderr, "Failed parsing arguments: %s\n", strerror(-err)); - goto cleanup; + print_usage(argv); + return EXIT_FAILURE; } - err = xdp_attach(obj, XDP_PROG_SEC, ifindex, XDP_FLAGS, false); - if (err) { - 
fprintf(stderr, "Failed attaching XDP program to %s: %s\n", - argv[1], strerror(-err)); - goto cleanup; - } - xdp_attached = true; - - // Load tc-bpf section on interface egress - err = tc_bpf_load(PPING_TCBPF_OBJ, TCBPF_PROG_SEC, argv[1]); + err = load_attach_bpfprogs(&obj, &config, &tc_attached, &xdp_attached); if (err) { fprintf(stderr, - "Could not load section %s of %s on interface %s: %s\n", - TCBPF_PROG_SEC, PPING_TCBPF_OBJ, argv[1], + "Failed loading and attaching BPF programs in %s\n", + config.object_path); + goto cleanup; + } + + err = setup_periodical_map_cleaning(obj, &config); + if (err) { + fprintf(stderr, "Failed setting up map cleaning: %s\n", strerror(-err)); goto cleanup; } - tc_attached = true; - - // Set up the periodical map cleaning - clean_args.max_age_ns = TIMESTAMP_LIFETIME; - clean_args.map_fd = bpf_map__fd(map); - if (clean_args.map_fd < 0) { - fprintf(stderr, - "Could not get file descriptor of map %s in object %s: %s\n", - TS_MAP, PPING_XDP_OBJ, strerror(-clean_args.map_fd)); - goto cleanup; - } - - err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args); - if (err) { - fprintf(stderr, - "Failed starting thread to perform periodic map cleanup: %s\n", - strerror(err)); - goto cleanup; - } // Set up perf buffer - pb_opts.sample_cb = handle_rtt_event; - pb_opts.lost_cb = handle_missed_rtt_event; - - pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj, PERF_BUFFER), + pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj, + config.rtt_map), PERF_BUFFER_PAGES, &pb_opts); err = libbpf_get_error(pb); if (err) { pb = NULL; fprintf(stderr, "Failed to open perf buffer %s: %s\n", - PERF_BUFFER, strerror(err)); + config.rtt_map, strerror(err)); goto cleanup; } @@ -419,30 +669,30 @@ int main(int argc, char *argv[]) cleanup: perf_buffer__free(pb); - if (map && bpf_map__is_pinned(map)) { - snprintf(map_path, sizeof(map_path), "%s/%s", PINNED_DIR, - TS_MAP); - err = bpf_map__unpin(map, map_path); - if (err) { - fprintf(stderr, 
"Failed unpinning map from %s: %s\n", - map_path, strerror(-err)); - } - } + if (xdp_attached) { - err = xdp_detach(ifindex, XDP_FLAGS); - if (err) { + err = xdp_detach(config.ifindex, config.xdp_flags); + if (err) fprintf(stderr, - "Failed deatching program from ifindex %d: %s\n", - ifindex, strerror(-err)); - } + "Failed deatching program from ifindex %s: %s\n", + config.ifname, strerror(-err)); } + if (tc_attached) { - err = tc_bpf_clear(argv[1]); //system(tc_cmd); - if (err) { + err = tc_bpf_clear(config.ifname); + if (err) fprintf(stderr, "Failed removing tc-bpf program from interface %s: %s\n", - argv[1], strerror(-err)); - } + config.ifname, strerror(-err)); + } + + if (obj && !libbpf_get_error(obj)) { + err = bpf_obj_unpin_program(obj, config.egress_sec, + config.pin_dir); + if (err) + fprintf(stderr, + "Failed unpinning tc program from %s: %s\n", + config.pin_dir, strerror(-err)); } return err != 0; diff --git a/pping/pping.h b/pping/pping.h index 755bc11..ac7d188 100644 --- a/pping/pping.h +++ b/pping/pping.h @@ -5,8 +5,12 @@ #include #include -#define XDP_PROG_SEC "xdp" -#define TCBPF_PROG_SEC "pping_egress" +#define INGRESS_PROG_SEC "xdp" +#define EGRESS_PROG_SEC "classifier" + +struct bpf_config { + __u64 rate_limit; +}; /* * Struct that can hold the source or destination address for a flow (l3+l4). 
@@ -34,17 +38,17 @@ struct network_tuple { __u8 reserved; }; +struct flow_state { + __u64 last_timestamp; + __u32 last_id; + __u32 reserved; +}; + struct packet_id { struct network_tuple flow; __u32 identifier; //tsval for TCP packets }; -struct packet_timestamp { - __u64 timestamp; - __u8 used; - __u8 reserved[7]; -}; - struct rtt_event { __u64 rtt; struct network_tuple flow; diff --git a/pping/pping_helpers.h b/pping/pping_helpers.h deleted file mode 100644 index 83d1078..0000000 --- a/pping/pping_helpers.h +++ /dev/null @@ -1,187 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef PPING_HELPERS_H -#define PPING_HELPERS_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "pping.h" - -#define AF_INET 2 -#define AF_INET6 10 -#define MAX_TCP_OPTIONS 10 - -/* - * This struct keeps track of the data and data_end pointers from the xdp_md or - * __skb_buff contexts, as well as a currently parsed to position kept in nh. - * Additionally, it also keeps the length of the entire packet, which together - * with the other members can be used to determine ex. how much data each - * header encloses. - */ -struct parsing_context { - void *data; //Start of eth hdr - void *data_end; //End of safe acessible area - struct hdr_cursor nh; //Position to parse next - __u32 pkt_len; //Full packet length (headers+data) -}; - -/* - * Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2 - */ -static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) -{ - __builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10); - __builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2); - ipv6->in6_u.u6_addr32[3] = ipv4; -} - -/* - * Parses the TSval and TSecr values from the TCP options field. If sucessful - * the TSval and TSecr values will be stored at tsval and tsecr (in network - * byte order). 
- * Returns 0 if sucessful and -1 on failure - */ -static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval, - __u32 *tsecr) -{ - int len = tcph->doff << 2; - void *opt_end = (void *)tcph + len; - __u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options - __u8 i, opt, opt_size; - - if (tcph + 1 > data_end || len <= sizeof(struct tcphdr)) - return -1; - - for (i = 0; i < MAX_TCP_OPTIONS; i++) { - if (pos + 1 > opt_end || pos + 1 > data_end) - return -1; - - opt = *pos; - if (opt == 0) // Reached end of TCP options - return -1; - - if (opt == 1) { // TCP NOP option - advance one byte - pos++; - continue; - } - - // Option > 1, should have option size - if (pos + 2 > opt_end || pos + 2 > data_end) - return -1; - opt_size = *(pos + 1); - - // Option-kind is TCP timestap (yey!) - if (opt == 8 && opt_size == 10) { - if (pos + opt_size > opt_end || - pos + opt_size > data_end) - return -1; - *tsval = *(__u32 *)(pos + 2); - *tsecr = *(__u32 *)(pos + 6); - return 0; - } - - // Some other TCP option - advance option-length bytes - pos += opt_size; - } - return -1; -} -/* - * Attempts to fetch an identifier for TCP packets, based on the TCP timestamp - * option. If sucessful, identifier will be set to TSval if is_ingress, TSecr - * otherwise, the port-members of saddr and daddr will be set the the TCP source - * and dest, respectively, and 0 will be returned. On failure, -1 will be - * returned. 
- */ -static int parse_tcp_identifier(struct parsing_context *ctx, bool is_egress, - __be16 *sport, __be16 *dport, __u32 *identifier) -{ - __u32 tsval, tsecr; - struct tcphdr *tcph; - - if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0) - return -1; - - // Do not timestamp pure ACKs - if (is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len && !tcph->syn) - return -1; - - if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0) - return -1; //Possible TODO, fall back on seq/ack instead - - *sport = tcph->source; - *dport = tcph->dest; - *identifier = is_egress ? tsval : tsecr; - return 0; -} - -/* - * Attempts to parse the packet limited by the data and data_end pointers, - * to retrieve a protocol dependent packet identifier. If sucessful, the - * pointed to p_id will be filled with parsed information from the packet - * packet, and 0 will be returned. On failure, -1 will be returned. - * If is_egress saddr and daddr will match source and destination of packet, - * respectively, and identifier will be set to the identifer for an outgoing - * packet. Otherwise, saddr and daddr will be swapped (will match - * destination and source of packet, respectively), and identifier will be - * set to the identifier of a response. 
- */ -static int parse_packet_identifier(struct parsing_context *ctx, bool is_egress, - struct packet_id *p_id) -{ - int proto, err; - struct ethhdr *eth; - struct iphdr *iph; - struct ipv6hdr *ip6h; - struct flow_address *saddr, *daddr; - - // Switch saddr <--> daddr on ingress to match egress - if (is_egress) { - saddr = &p_id->flow.saddr; - daddr = &p_id->flow.daddr; - } else { - saddr = &p_id->flow.daddr; - daddr = &p_id->flow.saddr; - } - - proto = parse_ethhdr(&ctx->nh, ctx->data_end, ð); - - // Parse IPv4/6 header - if (proto == bpf_htons(ETH_P_IP)) { - p_id->flow.ipv = AF_INET; - proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph); - } else if (proto == bpf_htons(ETH_P_IPV6)) { - p_id->flow.ipv = AF_INET6; - proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h); - } else { - return -1; - } - - // Add new protocols here - if (proto == IPPROTO_TCP) { - err = parse_tcp_identifier(ctx, is_egress, &saddr->port, - &daddr->port, &p_id->identifier); - if (err) - return -1; - } else { - return -1; - } - - // Sucessfully parsed packet identifier - fill in IP-addresses and return - if (p_id->flow.ipv == AF_INET) { - map_ipv4_to_ipv6(iph->saddr, &saddr->ip); - map_ipv4_to_ipv6(iph->daddr, &daddr->ip); - } else { // IPv6 - saddr->ip = ip6h->saddr; - daddr->ip = ip6h->daddr; - } - return 0; -} - -#endif diff --git a/pping/pping_kern.c b/pping/pping_kern.c new file mode 100644 index 0000000..735cec0 --- /dev/null +++ b/pping/pping_kern.c @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit +#ifdef IPV6_EXT_MAX_CHAIN +#undef IPV6_EXT_MAX_CHAIN +#endif +#define IPV6_EXT_MAX_CHAIN 3 + +#include +#include "pping.h" + +#define AF_INET 2 +#define AF_INET6 10 +#define MAX_TCP_OPTIONS 10 + +/* + * This struct keeps track of the data and data_end pointers from the xdp_md or + * __skb_buff contexts, as well 
as a currently parsed to position kept in nh. + * Additionally, it also keeps the length of the entire packet, which together + * with the other members can be used to determine ex. how much data each + * header encloses. + */ +struct parsing_context { + void *data; //Start of eth hdr + void *data_end; //End of safe acessible area + struct hdr_cursor nh; //Position to parse next + __u32 pkt_len; //Full packet length (headers+data) + bool is_egress; //Is packet on egress or ingress? +}; + +char _license[] SEC("license") = "GPL"; +// Global config struct - set from userspace +static volatile const struct bpf_config config = {}; + +// Map definitions +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct packet_id); + __type(value, __u64); + __uint(max_entries, 16384); +} packet_ts SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct network_tuple); + __type(value, struct flow_state); + __uint(max_entries, 16384); +} flow_state SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} rtt_events SEC(".maps"); + +// Help functions + +/* + * Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2 + */ +static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) +{ + __builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10); + __builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2); + ipv6->in6_u.u6_addr32[3] = ipv4; +} + +/* + * Parses the TSval and TSecr values from the TCP options field. If sucessful + * the TSval and TSecr values will be stored at tsval and tsecr (in network + * byte order). 
 * Returns 0 if successful and -1 on failure
 */
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
			__u32 *tsecr)
{
	int len = tcph->doff << 2; // TCP header length incl. options, in bytes
	void *opt_end = (void *)tcph + len;
	__u8 *pos = (__u8 *)(tcph + 1); // Current pos in TCP options
	__u8 i, opt;
	volatile __u8
		opt_size; // volatile seems to ensure it's always read off the stack as u8

	// No options to parse if doff covers only the fixed header
	if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
		return -1;
#pragma unroll // Temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
	for (i = 0; i < MAX_TCP_OPTIONS; i++) {
		// Bounds-check against both end of options and end of packet
		if (pos + 1 > opt_end || pos + 1 > data_end)
			return -1;

		opt = *pos;
		if (opt == 0) // Reached end of TCP options
			return -1;

		if (opt == 1) { // TCP NOP option - advance one byte
			pos++;
			continue;
		}

		// Option > 1, should have option size
		if (pos + 2 > opt_end || pos + 2 > data_end)
			return -1;
		opt_size = *(pos + 1);
		if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
			return -1;

		// Option-kind is TCP timestamp (yay!)
		if (opt == 8 && opt_size == 10) {
			if (pos + 10 > opt_end || pos + 10 > data_end)
				return -1;
			// TSval/TSecr left in network byte order on purpose -
			// they are only used as opaque identifiers
			*tsval = *(__u32 *)(pos + 2);
			*tsecr = *(__u32 *)(pos + 6);
			return 0;
		}

		// Some other TCP option - advance option-length bytes
		pos += opt_size;
	}
	return -1;
}

/*
 * Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
 * option. If successful, identifier will be set to the TSval on egress and to
 * the TSecr on ingress (the TSecr being the echo of a previously sent TSval),
 * the port-members of saddr and daddr will be set to the TCP source and dest,
 * respectively, and 0 will be returned. On failure, -1 will be returned.
 * Additionally, if the connection is closing (FIN or RST flag), sets
 * flow_closing to true.
 */
static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
				__be16 *dport, bool *flow_closing,
				__u32 *identifier)
{
	__u32 tsval, tsecr;
	struct tcphdr *tcph;

	if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
		return -1;

	// Check if connection is closing
	if (tcph->fin || tcph->rst) {
		*flow_closing = true;
		/* bpf_printk("Detected connection closing on %d\n", */
		/* 	   ctx->is_egress); //Upsets verifier? */
	}

	// Do not timestamp pure ACKs (headers reach to end of packet, i.e.
	// no payload), except for SYNs which must still be timestamped.
	// NOTE(review): assumes parse_tcphdr advanced nh.pos past the whole
	// TCP header incl. options - confirm in xdp/parsing_helpers.h
	if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
	    !tcph->syn)
		return -1;

	if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
		return -1; // Possible TODO, fall back on seq/ack instead

	*sport = tcph->source;
	*dport = tcph->dest;
	// Outgoing packets are identified by their TSval, incoming ones by
	// the TSecr echoed back by the remote host
	*identifier = ctx->is_egress ? tsval : tsecr;
	return 0;
}

/*
 * Attempts to parse the packet limited by the data and data_end pointers,
 * to retrieve a protocol dependent packet identifier. If successful, the
 * pointed to p_id will be filled with parsed information from the packet,
 * and 0 will be returned. On failure, -1 will be returned.
 * If is_egress saddr and daddr will match source and destination of packet,
 * respectively, and identifier will be set to the identifier for an outgoing
 * packet. Otherwise, saddr and daddr will be swapped (will match
 * destination and source of packet, respectively), and identifier will be
 * set to the identifier of a response.
 */
static int parse_packet_identifier(struct parsing_context *ctx,
				   struct packet_id *p_id, bool *flow_closing)
{
	int proto, err;
	struct ethhdr *eth;
	struct iphdr *iph;
	struct ipv6hdr *ip6h;
	struct flow_address *saddr, *daddr;

	// Switch saddr <--> daddr on ingress to match egress, so that both
	// directions of the same flow produce the same network_tuple key
	if (ctx->is_egress) {
		saddr = &p_id->flow.saddr;
		daddr = &p_id->flow.daddr;
	} else {
		saddr = &p_id->flow.daddr;
		daddr = &p_id->flow.saddr;
	}

	proto = parse_ethhdr(&ctx->nh, ctx->data_end, &eth);

	// Parse IPv4/6 header
	if (proto == bpf_htons(ETH_P_IP)) {
		p_id->flow.ipv = AF_INET;
		proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
	} else if (proto == bpf_htons(ETH_P_IPV6)) {
		p_id->flow.ipv = AF_INET6;
		proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
	} else {
		return -1;
	}

	// Add new protocols here
	if (proto == IPPROTO_TCP) {
		err = parse_tcp_identifier(ctx, &saddr->port, &daddr->port,
					   flow_closing, &p_id->identifier);
		if (err)
			return -1;
	} else {
		return -1;
	}

	// Successfully parsed packet identifier - fill in IP-addresses and return
	if (p_id->flow.ipv == AF_INET) {
		// IPv4 addresses are stored as IPv4-mapped IPv6 addresses
		map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
		map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
	} else { // IPv6
		saddr->ip = ip6h->saddr;
		daddr->ip = ip6h->daddr;
	}
	return 0;
}

// Programs

// TC-BPF program for parsing packet identifier from egress traffic and
// adding a timestamp entry for it to the packet_ts map
SEC(EGRESS_PROG_SEC)
int pping_egress(struct __sk_buff *skb)
{
	struct packet_id p_id = { 0 };
	__u64 p_ts;
	struct parsing_context pctx = {
		.data = (void *)(long)skb->data,
		.data_end = (void *)(long)skb->data_end,
		.pkt_len = skb->len,
		.nh = { .pos = pctx.data },
		.is_egress = true,
	};
	bool flow_closing = false;
	struct flow_state *f_state;
	struct flow_state new_state = { 0 };

	if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
		goto out;

	// Delete flow and create no timestamp entry if flow is closing
	if (flow_closing) {
		bpf_map_delete_elem(&flow_state, &p_id.flow);
		goto out;
	}

	// Check flow state
	f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
	if (!f_state) { // No previous state - attempt to create it
		bpf_map_update_elem(&flow_state, &p_id.flow, &new_state,
				    BPF_NOEXIST);
		f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
		if (!f_state)
			goto out;
	}

	// Check if identifier is new
	/* The gap between checking and updating last_id may cause concurrency
	 * issues where multiple packets may simultaneously think they are the
	 * first with a new identifier. As long as all of the identifiers are
	 * the same though, only one should be able to create a timestamp entry.
	 *
	 * A bigger issue is that older identifiers (for example due to
	 * out-of-order packets) may pass this check and update the current
	 * identifier to an old one. This means that both the packet with the
	 * old identifier itself as well as the next packet with the current
	 * identifier may be considered packets with new identifiers (even if
	 * both have been seen before). For TCP timestamps this could be
	 * prevented by changing the check to '>=' instead, but it may not be
	 * suitable for other protocols, such as QUIC and its spinbit.
	 *
	 * For now, just hope that the rate limit saves us from creating an
	 * incorrect timestamp. That may however also fail, either due to it
	 * happening at a time it's not limited by rate sampling, or because
	 * the rate check fails due to concurrency issues.
	 */
	if (f_state->last_id == p_id.identifier)
		goto out;
	f_state->last_id = p_id.identifier;

	// Check rate-limit
	/*
	 * The window between checking and updating last_timestamp may cause
	 * concurrency issues, where multiple packets simultaneously pass the
	 * rate limit. However, as long as they have the same identifier, only
	 * a single timestamp entry should successfully be created.
	 */
	p_ts = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
	if (p_ts < f_state->last_timestamp ||
	    p_ts - f_state->last_timestamp < config.rate_limit)
		goto out;

	/*
	 * Updates attempt at creating timestamp, even if creation of timestamp
	 * fails (due to map being full). This should make the competition for
	 * the next available map slot somewhat fairer between heavy and sparse
	 * flows.
	 */
	f_state->last_timestamp = p_ts;
	bpf_map_update_elem(&packet_ts, &p_id, &p_ts, BPF_NOEXIST);

out:
	return BPF_OK;
}

// XDP program for parsing identifier in ingress traffic and check for match in map
SEC(INGRESS_PROG_SEC)
int pping_ingress(struct xdp_md *ctx)
{
	struct packet_id p_id = { 0 };
	__u64 *p_ts;
	struct rtt_event event = { 0 };
	struct parsing_context pctx = {
		.data = (void *)(long)ctx->data,
		.data_end = (void *)(long)ctx->data_end,
		.pkt_len = pctx.data_end - pctx.data,
		.nh = { .pos = pctx.data },
		.is_egress = false,
	};
	bool flow_closing = false;

	if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
		goto out;

	// Delete flow, but allow final attempt at RTT calculation
	if (flow_closing)
		bpf_map_delete_elem(&flow_state, &p_id.flow);

	p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
	if (!p_ts)
		goto out;

	event.rtt = bpf_ktime_get_ns() - *p_ts;
	/*
	 * Attempt to delete timestamp entry as soon as RTT is calculated.
	 * But could have potential concurrency issue where multiple packets
	 * manage to match against the identifier before it can be deleted.
	 */
	bpf_map_delete_elem(&packet_ts, &p_id);

	// Copy flow info into the event and push the RTT sample to
	// userspace through the perf buffer
	__builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple));
	bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event,
			      sizeof(event));

out:
	return XDP_PASS;
}
diff --git a/pping/pping_kern_tc.c b/pping/pping_kern_tc.c deleted file mode 100644 index c750ea3..0000000 --- a/pping/pping_kern_tc.c +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#include -#include -#include - -#include "pping.h" -#include "pping_helpers.h" - -char _license[] SEC("license") = "GPL"; - -#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(key_size, sizeof(struct packet_id)); - __uint(value_size, sizeof(struct packet_timestamp)); - __uint(max_entries, 16384); - __uint(pinning, LIBBPF_PIN_BY_NAME); -} ts_start SEC(".maps"); - -#else -struct bpf_elf_map SEC("maps") ts_start = { - .type = BPF_MAP_TYPE_HASH, - .size_key = sizeof(struct packet_id), - .size_value = sizeof(struct packet_timestamp), - .max_elem = 16384, - .pinning = PIN_GLOBAL_NS, -}; -#endif - -// TC-BFP for parsing packet identifier from egress traffic and add to map -SEC(TCBPF_PROG_SEC) -int tc_bpf_prog_egress(struct __sk_buff *skb) -{ - struct packet_id p_id = { 0 }; - struct packet_timestamp p_ts = { 0 }; - struct parsing_context pctx = { - .data = (void *)(long)skb->data, - .data_end = (void *)(long)skb->data_end, - .pkt_len = skb->len, - .nh = { .pos = pctx.data }, - }; - - if (parse_packet_identifier(&pctx, true, &p_id) < 0) - goto end; - - p_ts.timestamp = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns - bpf_map_update_elem(&ts_start, &p_id, &p_ts, BPF_NOEXIST); - -end: - return BPF_OK; -} diff --git a/pping/pping_kern_xdp.c b/pping/pping_kern_xdp.c deleted file mode 100644 index f0a95aa..0000000 --- a/pping/pping_kern_xdp.c +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#include -#include - -#include
"pping.h" -#include "pping_helpers.h" - -char _license[] SEC("license") = "GPL"; - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(key_size, sizeof(struct packet_id)); - __uint(value_size, sizeof(struct packet_timestamp)); - __uint(max_entries, 16384); - __uint(pinning, LIBBPF_PIN_BY_NAME); -} ts_start SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(__u32)); - __uint(value_size, sizeof(__u32)); -} rtt_events SEC(".maps"); - -// XDP program for parsing identifier in ingress traffic and check for match in map -SEC(XDP_PROG_SEC) -int xdp_prog_ingress(struct xdp_md *ctx) -{ - struct packet_id p_id = { 0 }; - struct packet_timestamp *p_ts; - struct rtt_event event = { 0 }; - struct parsing_context pctx = { - .data = (void *)(long)ctx->data, - .data_end = (void *)(long)ctx->data_end, - .pkt_len = pctx.data_end - pctx.data, - .nh = { .pos = pctx.data }, - }; - - if (parse_packet_identifier(&pctx, false, &p_id) < 0) - goto end; - - p_ts = bpf_map_lookup_elem(&ts_start, &p_id); - - // Only calculate RTT for first packet with matching identifer - if (p_ts && p_ts->used == 0) { - /* - * As used is not set atomically with the lookup, could - * potentially have multiple "first" packets (on different - * CPUs), but all those should then also have very similar RTT, - * so don't consider it a significant issue - */ - p_ts->used = 1; - // TODO - Optional delete of entry (if identifier is garantued unique) - - __builtin_memcpy(&event.flow, &p_id.flow, - sizeof(struct network_tuple)); - event.rtt = bpf_ktime_get_ns() - p_ts->timestamp; - bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, - &event, sizeof(event)); - } - -end: - return XDP_PASS; -}