Mirror of https://github.com/xdp-project/bpf-examples.git (synced 2024-05-06 15:54:53 +00:00)

Merge pull request #13 from simosund/pping_Add_Sampling

Add sampling to pping
pping/Makefile

@@ -1,34 +1,11 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)

USER_TARGETS := pping
TC_BPF_TARGETS := pping_kern_tc
BPF_TARGETS := pping_kern_xdp
BPF_TARGETS += $(TC_BPF_TARGETS)
BPF_TARGETS := pping_kern

LDFLAGS += -pthread

EXTRA_DEPS += config.mk pping.h pping_helpers.h
EXTRA_DEPS += pping.h

LIB_DIR = ../lib

include $(LIB_DIR)/common.mk
include config.mk

all: config.mk

config.mk: configure
	@sh configure

ifndef HAVE_TC_LIBBPF
# If the iproute2 'tc' tool doesn't understand BTF debug info
# use llvm-strip to remove this debug info from object file
#
# *BUT* cannot strip everything as it removes ELF elems needed for
# creating maps
#
.PHONY: strip_tc_obj
strip_tc_obj: ${TC_BPF_TARGETS:=.o}
	$(Q) echo "TC don't support libbpf - strip BTF info"
	$(Q) llvm-strip --no-strip-all --remove-section .BTF $?

all: strip_tc_obj
endif
pping/README.md

@@ -1,21 +1,99 @@
# PPing using XDP and TC-BPF

A re-implementation of [Kathie Nichols' passive ping
(pping)](https://github.com/pollere/pping) utility using XDP (on ingress) and
TC-BPF (on egress) for the packet capture logic.

## Simple description
Passive Ping (PPing) is a simple tool for passively measuring per-flow RTTs. It
can be used on end hosts as well as on any (BPF-capable Linux) device which can
see both directions of the traffic (e.g. a router or middlebox). Currently it
only works for TCP traffic which uses the TCP timestamp option, but could be
extended to also work with, for example, TCP seq/ACK numbers, the QUIC spinbit
and ICMP echo-reply messages. See the [TODO-list](./TODO.md) for more potential
features (which may or may not ever get implemented).

The fundamental logic of pping is to timestamp a pseudo-unique identifier for
outgoing packets, and then look for matches in the incoming packets. If a match
is found, the RTT is simply calculated as the time difference between the
current time and the stored timestamp.

This tool, just as Kathie's original pping implementation, uses TCP timestamps
as identifiers. For outgoing packets, the TSval (which is a timestamp in and of
itself) is timestamped. Incoming packets are then parsed for the TSecr, which
is the echoed TSval value from the receiver. The TCP timestamps are not
necessarily unique for every packet (they have a limited update frequency,
which appears to be 1000 Hz for modern Linux systems), so only the first
instance of an identifier is timestamped and matched against the first incoming
packet carrying that identifier. The mechanism to ensure that only the first
packet is timestamped and matched differs from the one in Kathie's pping, and
is further described in [SAMPLING_DESIGN](./SAMPLING_DESIGN.md).
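To make the identifier extraction concrete, below is a minimal sketch of pulling the TSval/TSecr fields out of a TCP option block. It is a plain C helper written for illustration only; the real parser in `pping_kern.c` has to do the equivalent work under the BPF verifier's bounds-checking rules, so its shape differs.

```c
#include <stdint.h>
#include <stddef.h>

#define TCP_OPT_EOL       0
#define TCP_OPT_NOP       1
#define TCP_OPT_TIMESTAMP 8 /* kind = 8, length = 10 */

/* Parse a TCP option block (opts, opts_len) and extract TSval/TSecr.
 * Returns 0 on success, -1 if no timestamp option was found. */
static int parse_tcp_timestamp(const uint8_t *opts, size_t opts_len,
			       uint32_t *tsval, uint32_t *tsecr)
{
	size_t i = 0;

	while (i + 1 < opts_len) {
		uint8_t kind = opts[i];

		if (kind == TCP_OPT_EOL)
			break;
		if (kind == TCP_OPT_NOP) {
			i++;
			continue;
		}

		uint8_t len = opts[i + 1];
		if (len < 2 || i + len > opts_len)
			break; /* malformed option list */

		if (kind == TCP_OPT_TIMESTAMP && len == 10) {
			/* TSval and TSecr are stored in network byte order */
			*tsval = (uint32_t)opts[i + 2] << 24 |
				 (uint32_t)opts[i + 3] << 16 |
				 (uint32_t)opts[i + 4] << 8 | opts[i + 5];
			*tsecr = (uint32_t)opts[i + 6] << 24 |
				 (uint32_t)opts[i + 7] << 16 |
				 (uint32_t)opts[i + 8] << 8 | opts[i + 9];
			return 0;
		}
		i += len;
	}
	return -1;
}
```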
## Design and technical description

![Design of eBPF pping](./eBPF_pping_design.png)

### Files:
- **pping.c:** Userspace program that loads and attaches the BPF programs, polls
  the perf-buffer `rtt_events` to print out RTT messages, and periodically cleans
  up the hash maps from old entries. Also passes user options to the BPF
  programs by setting a "global variable" (stored in the programs' .rodata
  section).
- **pping_kern.c:** Contains the BPF programs that are loaded on tc (egress) and
  XDP (ingress), as well as several common functions, a global constant `config`
  (set from userspace) and map definitions. The tc program `pping_egress()`
  parses outgoing packets for identifiers. If an identifier is found and the
  sampling strategy allows it, a timestamp for the packet is created in
  `packet_ts`. The XDP program `pping_ingress()` parses incoming packets for an
  identifier. If one is found, it looks up the `packet_ts` map for a match on
  the reverse flow (to match source/dest on egress). If there is a match, it
  calculates the RTT from the stored timestamp and deletes the entry. The
  calculated RTT (together with the flow tuple) is pushed to the perf-buffer
  `rtt_events`.
- **bpf_egress_loader.sh:** A shell script that's used by `pping.c` to set up a
  clsact qdisc and attach the `pping_egress()` program to egress using
  tc. **Note**: Unless your iproute2 comes with libbpf support, tc will use
  iproute's own loading mechanism when loading and attaching object files
  directly through the tc command line. To ensure that libbpf is always used to
  load `pping_egress()`, `pping.c` actually loads the program and pins it to
  `/sys/fs/bpf/pping/classifier`, and tc only attaches the pinned program.
- **functions.sh and parameters.sh:** Imported by `bpf_egress_loader.sh`.
- **pping.h:** Common header file included by `pping.c` and
  `pping_kern.c`. Contains some common structs used by both (which are part of
  the maps).
### BPF Maps:
- **flow_state:** A hash map storing some basic state for each flow, such as the
  last seen identifier for the flow and when the last timestamp entry for the
  flow was created. Entries are created by `pping_egress()`, and can be updated
  or deleted by both `pping_egress()` and `pping_ingress()`. Leftover entries
  are eventually removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`.
- **packet_ts:** A hash map storing a timestamp for a specific packet
  identifier. Entries are created by `pping_egress()` and removed by
  `pping_ingress()` if a match is found. Leftover entries are eventually
  removed by `pping.c`. Pinned at `/sys/fs/bpf/pping`.
- **rtt_events:** A perf-buffer used by `pping_ingress()` to push calculated RTTs
  to `pping.c`, which continuously polls the map to print out the RTTs.
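For orientation, here is a sketch of what the shared types and BTF-defined maps could look like. Only the map names, the perf-buffer, the `last_timestamp`/last-seen-identifier concepts and the `__u64` timestamp values are taken from this commit; the remaining field names and the map sizes are illustrative assumptions, and the authoritative definitions live in `pping.h` and `pping_kern.c`.

```c
#include <linux/bpf.h>
#include <linux/in6.h>
#include <bpf/bpf_helpers.h>

struct network_tuple {
	/* Assumed layout: addresses, ports and protocol identifying the flow */
	struct in6_addr saddr, daddr; /* IPv4 could be stored as mapped IPv6 */
	__u16 sport, dport;
	__u16 proto;
};

struct packet_id {
	struct network_tuple flow;
	__u32 identifier; /* TSval on egress, TSecr on ingress */
};

struct flow_state {
	__u64 last_timestamp;  /* when the last packet_ts entry was created */
	__u32 last_identifier; /* last seen identifier for the flow */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384); /* illustrative size */
	__type(key, struct network_tuple);
	__type(value, struct flow_state);
} flow_state SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384); /* illustrative size */
	__type(key, struct packet_id);
	__type(value, __u64); /* nanosecond timestamp from bpf_ktime_get_ns() */
} packet_ts SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} rtt_events SEC(".maps");
```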
## Similar projects
Passively measuring the RTT for TCP traffic is not a novel concept, and there
exist a number of other tools that can do so. A good overview of how passive
RTT calculation using TCP timestamps (as in this project) works is provided in
[this paper](https://doi.org/10.1145/2523426.2539132) from 2013.

- [pping](https://github.com/pollere/pping): This project is largely a
  re-implementation of Kathie's pping, but by using BPF and XDP, as well as
  implementing some filtering logic, the hope is to be able to create an
  always-on tool that can scale well even to large numbers of massive flows.
- [ppviz](https://github.com/pollere/ppviz): Web-based visualization tool for
  the "machine-friendly" output from Kathie's pping tool. If/when we implement a
  similar machine-readable output option it should hopefully work with this
  implementation as well.
- [tcptrace](https://github.com/blitz/tcptrace): A post-processing tool which
  can analyze a tcpdump file and among other things calculate RTTs based on
  seq/ACK numbers (`-r` or `-R` flag).
- **Dapper**: A passive TCP data plane monitoring tool implemented in P4 which
  can among other things calculate the RTT based on matching seq/ACK
  numbers. [Paper](https://doi.org/10.1145/3050220.3050228). [Unofficial
  source](https://github.com/muhe1991/p4-programs-survey/tree/master/dapper).
- [P4 Tofino TCP RTT measurement](https://github.com/Princeton-Cabernet/p4-projects/tree/master/RTT-tofino):
  A passive TCP RTT monitor based on seq/ACK numbers implemented in P4 for
  Tofino programmable switches. [Paper](https://doi.org/10.1145/3405669.3405823).
pping/SAMPLING_DESIGN.md (new file)

@@ -0,0 +1,386 @@
# Introduction
This file is intended to document some of the challenges and design
decisions for adding sampling functionality to pping. It is partly
based on discussions from my supervisor meeting on 2021-02-22, and the
contents of my
[status slides](https://github.com/xdp-project/bpf-research/blob/master/meetings/simon/work_summary_20210222.org)
from that meeting.

## Purpose of sampling
The main purpose of adding sampling to pping is to prevent a massive
amount of timestamp entries from being created and quickly filling up
the map. That prevents new entries from being made until old ones can
be cleared out. A few large flows could thus "hog" all the map
entries, and prevent RTTs from other flows from being reported.
Sampling is therefore only used on egress, to determine if a timestamp
entry should be created for a packet. All packets on ingress will
still be parsed and checked for a potential match.

A secondary purpose of the sampling is to reduce the amount of output
that pping creates. In most circumstances, getting 1000 RTT reports
per second from a single flow will probably not be of interest, making
it less useful as a direct command-line utility.
# Considered sampling approaches
There are a number of different ways that the sampling could be
performed, for example:

- Sample every N packets per flow
  - Not very flexible
  - If the same rate is used for all flows, small flows would get very
    few samples.
- Sample completely random packets
  - Probably not a good idea...
- Head sampling (sample the first few packets of each flow)
  - Not suitable for monitoring long flows
  - RTT may change over the lifetime of a flow (due to buffer bloat)
- Probabilistic approach
  - Probabilistic approaches have been used to, for example, capture
    the most relevant information with limited overhead in INT
  - Could potentially be configured across multiple devices, so that
    pping on all of the devices together captures the most relevant
    traffic.
  - While it could potentially work well, I'm not very familiar with
    these approaches. It would take considerable research from my side
    to figure out how these methods work, how to best apply them to
    pping, and how to implement them in BPF.
- Use time-based sampling, limiting the rate at which entries
  can be created per flow
  - Intuitively simple
  - Should correspond quite well with the output you would probably
    want? I.e. a few entries per flow (regardless of how heavy they
    are) stating their current RTT.

I believe that time-based sampling is the most promising solution that
I can implement in a reasonable time. In the future, additional
sampling methods could potentially be added.
# Considerations for time-based sampling

## Time interval
For the time-based sampling, we must determine how the interval
between when new timestamp entries are allowed should be set.

### Static time interval
The simplest alternative is probably to use a static limit, e.g.
100 ms. This would provide a rather simple and predictable limit for
how often entries can be created (per flow), and how much output you
would get (per flow).

### RTT-based time interval
It may be desirable to use a more dynamic time limit, which is
adapted to each flow. One way to do this would be to base the time
limit on the RTT of the flow. Flows with short RTTs could be expected
to undergo more rapid changes than flows with long RTTs. This would
require keeping track of the RTT for each flow, for example as a moving
average. Additionally, some fallback is required before the RTT for
the flow is known.

### User configurable
Regardless of whether a static or RTT-based (or some other alternative)
interval is used, it should probably be user configurable (including
allowing the user to disable sampling entirely).
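As an illustration of the static, user-configurable variant, here is a minimal sketch of the per-flow check on egress. The `rate_limit` value (in nanoseconds) corresponds to the `-r`/`--rate-limit` option handled in `pping.c`; treating 0 as "sampling disabled" is an assumption made only for the sketch.

```c
#include <linux/types.h>
#include <stdbool.h>

/* last_timestamp points at the per-flow "time of last timestamp entry";
 * both values are nanoseconds from bpf_ktime_get_ns(). */
static inline bool rate_limit_allows(__u64 *last_timestamp, __u64 now,
				     __u64 rate_limit)
{
	if (rate_limit == 0 || now - *last_timestamp >= rate_limit) {
		/* Unsynchronized update; see the concurrency discussion
		 * later in this document. */
		*last_timestamp = now;
		return true;
	}
	return false;
}
```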
## Allowing bursts
It may be desirable to allow multiple packets in a short burst to be
timestamped. Due to delayed ACKs, one may only get a response for
every other packet. If the first packet is timestamped, and shortly
after a second packet is sent (that has a different identifier), then
the response will effectively be for the second packet, and no match
for the timestamped identifier will be found. For flows of the right
(or wrong, depending on how you look at it) intensity, slow enough
that consecutive packets are likely to get different TCP timestamps,
but fast enough for the delayed ACKs to acknowledge multiple packets,
you essentially have a 50/50 chance of timestamping the wrong
identifier and missing the RTT.

To handle this, you could timestamp multiple consecutive packets (with
unique identifiers) in a short burst. You probably need to limit this
burst both in number of packets and in the timeframe after the first
packet during which additional packets may be included. For example,
allow up to 3 packets (with different identifiers) to get a timestamp
for up to 4 ms after the first one of them is timestamped.

If allowing bursts of timestamps to be created, it may also be
desirable to rate limit the output, in order to not get a burst of
similar RTTs for the flow in the output (which may also skew averages
and other post-processing).
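A sketch of how such a burst allowance could sit on top of the rate limit. The `burst_state` struct and the `burst_packets`/`burst_window` parameters are hypothetical; this is a TODO item, not something the commit implements.

```c
#include <linux/types.h>
#include <stdbool.h>

struct burst_state {            /* hypothetical extension of the flow state */
	__u64 last_timestamp;   /* when the current burst started */
	__u32 burst_count;      /* timestamps created in the current burst */
};

/* Allow up to burst_packets timestamps within burst_window ns of the first
 * one, then fall back to the normal per-flow rate limit. */
static inline bool burst_allows(struct burst_state *bs, __u64 now,
				__u64 rate_limit, __u32 burst_packets,
				__u64 burst_window)
{
	if (now - bs->last_timestamp >= rate_limit) {
		bs->last_timestamp = now; /* start a new burst */
		bs->burst_count = 1;
		return true;
	}
	if (bs->burst_count < burst_packets &&
	    now - bs->last_timestamp < burst_window) {
		bs->burst_count++;
		return true;
	}
	return false;
}
```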
## Handling duplicate identifiers
TCP timestamps are only updated at a limited rate (e.g. 1000 Hz), and
thus you can have multiple consecutive packets with the same TCP
timestamp if they're sent fast enough. For the calculated RTT to be
correct, you should only match the first sent packet with a unique
identifier against the first received packet with a matching
identifier. Otherwise, you may for example have a sequence of 100
packets with the same identifier, and match the last of the outgoing
packets with the first incoming response, which may underestimate the
RTT by as much as the TCP timestamp update period (e.g. 1 ms).

### Current solution
The current solution to this is very simple. For outgoing packets, a
timestamp entry is only allowed to be created if no previous entry for
the identifier exists (realized through the `BPF_NOEXIST` flag to the
`bpf_map_update_elem()` call). Thus only the first outgoing packet with
a specific identifier can be timestamped. On ingress, the first packet
with a matching identifier will mark the timestamp as used, preventing
later incoming responses from using that timestamp. The reason why the
timestamp is marked as used rather than directly deleted once a
matching packet on ingress is found is to avoid the egress side
creating a new entry for the same identifier. This could occur if the
RTT is shorter than the TCP timestamp update period, and could result
in a massively underestimated RTT. This is the same mechanism that is
used in the original pping, as explained
[here](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L155-L168).
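The `BPF_NOEXIST` mechanism in code form, as a minimal sketch of the egress-side timestamping (assuming the `packet_ts` map and `struct packet_id` sketched in the README section above):

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* pid identifies (flow, TSval); packet_ts maps struct packet_id -> __u64 */
static __always_inline void try_timestamp_packet(struct packet_id *pid)
{
	__u64 now = bpf_ktime_get_ns();

	/* With BPF_NOEXIST the update fails with -EEXIST if an entry for
	 * this identifier already exists, so only the first outgoing packet
	 * carrying a given TSval gets a timestamp entry. */
	bpf_map_update_elem(&packet_ts, pid, &now, BPF_NOEXIST);
}
```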
### New solution
The current solution will no longer work if sampling is
introduced. With sampling, there's no guarantee that the sampled
packet will be the first outgoing packet in the sequence of packets
with identical timestamps. Thus the RTT may still be underestimated by
as much as the TCP timestamp update period (e.g. 1 ms). Therefore, a
new solution is needed. The current idea is to keep track of the
last-seen identifier of each flow, and only allow a packet to be
sampled for timestamping if its identifier differs from the last-seen
identifier of the flow, i.e. it is the first packet in the flow with
that identifier. This would perhaps be problematic with some sampling
approaches, as it requires that the packet is both the first one with
a specific identifier and elected for sampling. However, for the
rate-limited sampling it should work quite well, as it will only delay
the sampling until a packet with a new identifier is found.

Another advantage of this solution is that it should allow timestamp
entries to be deleted as soon as the matching response is found on
ingress. The timestamp no longer needs to be kept around only to
prevent egress from creating a new timestamp with the same identifier,
as this new solution should take care of that. This would help a lot
with keeping the map clean, as the timestamp entries would then
automatically be removed as soon as they are no longer needed. The
periodic cleanup from userspace would only be needed to remove the
occasional entries that were never matched for some reason (e.g. the
previously mentioned issue with delayed ACKs, the flow stopping, the
reverse flow not being observable, etc.).

One issue for this new solution is handling out-of-order packets. If a
packet with an older identifier is a bit delayed, it may arrive after
the last-seen identifier for the flow has been updated. This old
identifier may then be considered new (as it differs from the current
one), allowing an entry to be created for it and reverting the
last-seen identifier to a previous one. Additionally, this may allow
the next packet carrying what used to be the current identifier to
also be detected as a new identifier (as the out-of-order packet
reverted the last-seen identifier to an old one), creating a bit of a
ping-pong effect. For TCP timestamps this can easily be avoided by
simply requiring the new identifier to be greater than the last-seen
identifier (as TCP timestamps should be monotonically increasing).
That solution may however not be suitable if one wants to reuse this
mechanism for other protocols, such as the QUIC spinbit.
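A sketch of the proposed egress-side check. The `last_identifier` field name is an assumption; TCP timestamps are treated as monotonically increasing, which also filters the out-of-order case described above.

```c
#include <linux/types.h>
#include <stdbool.h>

/* Only sample a packet if it is the first one in the flow carrying this
 * identifier, i.e. the identifier is newer than the last one we saw. */
static inline bool is_new_identifier(__u32 *last_identifier, __u32 identifier)
{
	if (identifier > *last_identifier) {
		*last_identifier = identifier;
		return true;
	}
	return false;
}
```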
## Keeping per-flow information
In order for the per-flow rate limiting to work, some per-flow state
must be maintained, namely when the last timestamp for that flow was
added (so that one can check that sufficient time has passed before
attempting to add another one).

There may be some drawbacks with having to keep per-flow state. First
off, there will be some additional overhead from having to keep track
of this state. However, the savings from sampling the per-packet state
(the identifier/timestamp mappings) should hopefully cover the
overhead from keeping some per-flow state (and then some).

Another issue that is worth keeping in mind is that this flow state
will also need to be cleaned up eventually. This cleanup could be
handled in a similar manner as the current per-packet state is cleaned
up, by having the userspace process occasionally remove old
entries. In this case, entries could be deemed old if a long time has
passed since the last timestamp was added for the flow, e.g. 300
seconds as used by the [original
pping](https://github.com/pollere/pping/blob/777eb72fd9b748b4bb628ef97b7fff19b751f1fd/pping.cpp#L117).
Additionally, one could parse the packets for indications that the
connection is being closed (e.g. TCP FIN/RST), and then directly delete
the flow state for that flow from the BPF programs, as sketched below.

Later on, this per-flow state could potentially be expanded to include
other information deemed useful (such as e.g. minimum and average RTT).
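A sketch of the FIN/RST idea (not implemented by this commit): the BPF programs could delete the flow entry directly when they see the connection being torn down, assuming the `flow_state` map keyed on `struct network_tuple` from the README section.

```c
#include <linux/tcp.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static __always_inline void maybe_delete_flow(struct tcphdr *tcph,
					      struct network_tuple *flow)
{
	/* FIN or RST means the flow is closing; drop its state right away
	 * instead of waiting for the userspace cleanup to time it out. */
	if (tcph->fin || tcph->rst)
		bpf_map_delete_elem(&flow_state, flow);
}
```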
### Alternative solution - keeping identifiers in the flow state
One idea that came up during my supervisor meeting was that instead
of creating timestamps for individual packets as is currently done,
you only create a number of timestamps for each flow. That is, instead
of creating per-packet entries in a separate map, you include a number
of timestamp/identifier pairs in the flow-state information itself.

While this would potentially be rather efficient, limiting the number
of timestamp entries to a fixed number per flow, I'm opposed to this
idea for a few reasons:

1. The sampling rate would be inherently tied to the RTT of the
   flow. While this may in many cases be desirable, it is not very
   flexible. It would also make it hard to, e.g., turn off sampling
   completely.
2. The number of timestamps per flow would need to be fixed and known
   at compile time(?), as the timestamp/identifier pairs are kept in
   the flow-state information itself, and the flow-state information
   needs to be of a known and fixed size when creating the maps. This
   may also result in some wasted space if the flow state includes
   slots for several timestamp/identifier pairs but most flows only
   make use of a few (although having an additional timestamp entry
   map of fixed size wastes space in a similar manner).
3. If a low number of timestamp/identifier pairs is kept, selecting
   an identifier that is missed (e.g. due to delayed ACKs) could
   effectively block new timestamps from being created (and thus
   RTTs from being calculated) for the flow for a relatively long
   while. New timestamps can only be created if you have a free slot,
   and you can only free a slot by either getting a matching reply or
   waiting until it can be safely assumed that the response was missed
   (and not just delayed).
## Graceful degradation
Another aspect I've been asked to consider is how to gracefully reduce
the functionality of pping as the timestamp entry map gets full (which,
with sufficiently many and heavy flows, is likely inevitable).

What currently happens when the timestamp entry map is full is simply
that no more entries can be made until some have been cleared
out. When adding a rate limit to the number of entries per flow, as
well as directly deleting entries upon match, I believe this is a
reasonable way to handle the situation. As soon as some RTTs for
current flows have been reported, space for new entries will be
available. The next outgoing packet with a valid identifier from any
flow that does not currently have to wait for its rate limit will then
be able to grab the next spot. However, this will still favor heavy
flows over smaller flows, as heavy flows are more likely to be able to
get a packet in first, but they will at least still be limited by the
rate limit, and thus have to take turns with other flows.

It is also worth noting that as per-flow state will need to be kept,
there will be a strict limit to the number of concurrent flows that can
be monitored, corresponding to the number of entries that can be held
by the map for the per-flow state. Once the per-flow state map is
full, no new flows can be added until one is cleared. It also doesn't
make sense to add packet timestamp entries for flows whose state
cannot be tracked, as the rate limit cannot be enforced then.

I see a few ways to more actively handle degradation, depending on what
one views as desirable:

1. One can attempt to monitor many flows, with infrequent RTT
   calculations for each. In this case, the userspace process that
   occasionally clears out the timestamp map could automatically
   decrease the per-flow rate limit if it detects that the map is
   getting close to full. That way, fewer entries would be generated
   per flow, and flows would be forced to take turns to a greater
   degree when the map is completely full. Similarly, one may wish to
   reduce the timeout for old flows if the per-flow map is getting
   full, in order to more quickly allow new flows to be monitored,
   keeping only the most active flows around.
2. One can attempt to monitor fewer flows, but with more frequent RTT
   calculations for each. The easiest way to achieve this is probably
   to set a smaller size on the per-flow map relative to the
   per-packet timestamp map. In case one wants to primarily focus on
   heavier flows, one could possibly add e.g. packet rate to the
   per-flow information, and remove the flows with the lowest packet
   rates.
3. One can attempt to focus on flows with shorter RTTs. Flows with
   shorter RTTs should make more efficient use of timestamp entries,
   as they can be cleared out faster, allowing for new entries. On the
   other hand, flows with longer RTTs may be the more interesting
   ones, as they are more likely to indicate some issue.
4. One can simply try to create a larger map (and copy over the old
   contents) once the map is approaching full. This way one can start
   with reasonably small maps, and only start eating up more memory if
   required.

While I'm leaning towards option 1 or 4, I don't have a very strong
personal opinion here, and would like some input on what others (who
may have more experience with network measurements) think are
reasonable trade-offs.
# Implementation considerations
There are of course several more practical considerations as well when
implementing the sampling, some of which I'll try to address here.

## "Global" vs PERCPU maps
In general, it's likely wise to go with PERCPU maps over "global" (aka
non-PERCPU) maps whenever possible, as PERCPU maps should be more
performant and also avoid concurrency issues. But this of course only
applies if the BPF programs don't need to act on global state.

For pping, I unfortunately see no way for the program to work with
only information local to each CPU core individually. The per-packet
identifiers and timestamps need to be global, as there is no guarantee
that the same core that timestamped a packet will process the response
for that packet. Likewise, the per-flow information, like the time of
the last timestamping, also needs to be global. Otherwise the rate
limit would be per-CPU-per-flow rather than just per-flow.

In practice, packets from the same flow are apparently often handled
by the same CPU, but this is not guaranteed, and therefore not
something we can rely on (especially as state needs to be shared by
both ingress and egress). One could try to use a CPU map to enforce
this behavior, but that is probably not a great idea.
## Concurrency issues
In addition to the performance hit, sharing global state between
multiple concurrent processes risks running into concurrency issues
unless access is synchronized in some manner (in BPF, the two
mechanisms I know of are atomic adds and spin locks for maps). With the
risk of misunderstanding the memory model for BPF programs (which,
from what I can tell, I'm probably not alone about), I will attempt to
explain the potential concurrency issues I see with the pping
implementation.

The current pping implementation already has a potential concurrency
issue. When matches for identifiers are found on ingress, a check is
performed to see if the timestamp has already been used or
not. Multiple packets processed in parallel could potentially all
find that the timestamp is unused before any of them manages to mark
it as used for the others. This may result in pping matching several
responses to a single timestamp entry and reporting the RTTs for each
of them. I do not consider this a significant issue, however: if they
are concurrent enough that they manage to look up the used status
before another has time to set it, the difference in time between them
should be very small, and they will therefore compute very similar
RTTs. So the reported RTTs should still be rather accurate, just
over-reported.

When adding sampling and per-flow information, some additional
concurrency issues may be encountered. Mainly, multiple packets may
find that they are allowed to add a new timestamp before they manage
to update the time of the last added timestamp in the per-flow
state. This may lead to multiple attempts at creating a timestamp at
approximately the same time. For TCP timestamps, all the identifiers
are likely to be identical (as the TCP timestamp itself is only
updated at a limited rate), so only one of them should succeed
anyway. If using identifiers that are more unique, however, such as
TCP sequence numbers, it's possible that a short burst of entries
would be created instead of just a single entry within the rate limit
for the flow.

Overall, I don't think these concurrency issues are that severe, as
they should still result in accurate RTTs, just some possible
over-reporting. I don't believe these issues warrant the performance
impact and potential code complexity of trying to synchronize
access. Furthermore, from what I understand, these concurrency issues
are not too likely to occur in practice, as packets from the same flow
are often processed on the same core.
## Global variable vs single-entry map
With BTF, it seems like BPF programs now support the use of global
variables. These global variables can supposedly be modified from user
space, and should from what I've heard also be more efficient than map
lookups. They therefore seem like a promising way to pass some
user-configured options from userspace to the BPF programs.

I would however need to look up how to actually use these, as the
examples I've seen have used a slightly different libbpf setup, where
a "skeleton" header file is compiled and imported into the userspace
program. There should be some examples in the [xdp-tools
repository](https://github.com/xdp-project/xdp-tools).

The alternative, I guess, would be to use a
`BPF_MAP_TYPE_PERCPU_ARRAY` with a single entry, which is filled in
with the user-configured options by the userspace program.
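To make the global-variable approach concrete: the BPF side declares a read-only global, and the userspace loader (as done by `init_rodata()` in the updated `pping.c` below) writes the user's options into the object's `.rodata` map after `bpf_object__open()` but before `bpf_object__load()`. The exact `bpf_config` layout beyond `rate_limit` is an assumption here.

```c
#include <bpf/libbpf.h>
#include <linux/types.h>
#include <string.h>
#include <errno.h>

/* --- BPF side (pping_kern.c): read-only config set before load ---
 *
 * struct bpf_config {
 *         __u64 rate_limit;  // ns between timestamp entries per flow
 * };
 * volatile const struct bpf_config config = {};
 *
 * ...and in the egress program:
 *         if (now - flow->last_timestamp < config.rate_limit)
 *                 return;
 */

/* --- Userspace side (pping.c): copy the values into .rodata --- */
static int init_rodata(struct bpf_object *obj, void *src, size_t size)
{
	struct bpf_map *map = NULL;

	bpf_object__for_each_map(map, obj) {
		if (strstr(bpf_map__name(map), ".rodata"))
			return bpf_map__set_initial_value(map, src, size);
	}
	return -EINVAL; /* no .rodata map found */
}
```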
pping/TODO.md

@@ -2,27 +2,60 @@

## Protocols
- [x] TCP (based on timestamp options)
  - [x] Skip pure ACKs for egress
    - Timestamping pure ACKs may lead to erroneous RTTs (ex. delay
      between the application attempting to send data being recognized as
      an RTT)
  - [ ] Add fallback to SEQ/ACK in case of no timestamp?
    - Some machines may not use TCP timestamps (either not supported
      at all, or disabled as in ex. Windows 10)
    - If one only considers SEQ/ACK (and doesn't check for SACK
      options), this could result in ex. delay from retransmission being
      included in the RTT
- [ ] ICMP (ex. Echo/Reply)
- [ ] QUIC (based on spinbit)

## General pping
- [x] Add sampling so that RTT is not calculated for every packet
      (with unique value) for large flows
  - [ ] Allow short bursts to bypass sampling in order to handle
        delayed ACKs
- [x] Keep some per-flow state
  - Will likely be needed for the sampling
  - [ ] Could potentially include keeping track of average RTT, which
        may be useful for some decisions (ex. how often to sample,
        when an entry can be removed, etc.)
  - [ ] Could potentially include keeping track of minimum RTT (as
        done by the original pping), ex. to track bufferbloat
  - [ ] Could potentially include keeping track of whether the flow is
        bi-directional
    - Original pping checks if the flow is bi-directional before adding
      timestamps, but this could miss shorter flows
- [ ] Dynamically grow the maps if they are starting to get full
- [ ] Improve map cleaning: Use a dynamic time to live for map entries
      based on the flow's RTT, instead of a static 10s limit
  - Keeping entries around for a long time allows the map to grow
    unnecessarily large, which slows down the cleaning and may block
    new entries
- [ ] Use libxdp to load XDP program
- [ ] Check for existence of reverse flow before adding to hash map (to avoid
      adding identifiers for flows that we can't see the reverse traffic for)?
  - This could miss the first few packets, so it would not be ideal for short flows
- [ ] Add support for automatically deleting entries if they are unique
  - TCP timestamps need to be kept for a while (because multiple packets can
    have the same timestamp), but identifiers that are unique per packet can be
    removed directly after the RTT is calculated
- [ ] Add option for machine-readable output (as original pping)
  - It may be a good idea to keep the same format as original pping,
    so that tools such as [ppviz](https://github.com/pollere/ppviz)
    work for both pping implementations.
- [ ] Add timestamps to output (as original pping)
- [ ] Add support for other hooks
  - Ex. TC-BPF on ingress instead of XDP?

## Done
- [x] Clean up commits and add signed-off-by tags
- [x] Add SPDX-license-identifier tags
- [x] Format C-code in kernel style
- [x] Use existing functionality to reuse maps by using BTF-defined
      maps
- [x] Use BTF-defined maps for TC-BPF as well if iproute has libbpf
      support
- [x] Cleanup: Unload TC-BPF at program shutdown, and unpin map - in
      the userspace part
- [x] Add IPv6 support
- [x] Refactor to support easy addition of other protocols
- [x] Load tc-bpf program with libbpf (only attach it with tc)
@@ -4,7 +4,7 @@
# License: GPLv2
#
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option and changed default BPF_OBJ
# of optional section (--sec) option or attaching a pinned program
#
basedir=`dirname $0`
source ${basedir}/functions.sh
@@ -64,6 +64,16 @@ function tc_egress_bpf_attach()
        egress bpf da obj "$objfile" sec "$section"
}

function tc_egress_bpf_attach_pinned()
{
    local device=${1:-$DEV}
    local pinprog=${2:-$PIN_PROG}
    shift 2

    call_tc filter add dev "$device" pref 2 handle 2 \
        egress bpf da pinned "$pinprog"
}

function tc_egress_list()
{
    local device=${1:-$DEV}
@@ -77,7 +87,12 @@ if [[ -n $REMOVE ]]; then
fi

tc_init_clsact $DEV
tc_egress_bpf_attach $DEV $BPF_OBJ $SEC

if [[ -n $PIN_PROG ]]; then
    tc_egress_bpf_attach_pinned $DEV $PIN_PROG
else
    tc_egress_bpf_attach $DEV $BPF_OBJ $SEC
fi

# Practical to list egress filters after setup.
# (It's a common mistake to have several progs loaded)
pping/configure (vendored, deleted)

@@ -1,29 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
# This is not an autoconf generated configure
#

# Output file which is input to Makefile
CONFIG=config.mk

# Assume tc is in $PATH
TC=tc

check_tc_libbpf()
{
    tc_version=$($TC -V)
    if echo $tc_version | grep -q libbpf; then
        libbpf_version=${tc_version##*libbpf }
        echo "HAVE_TC_LIBBPF:=y" >> $CONFIG
        echo "BPF_CFLAGS += -DHAVE_TC_LIBBPF" >> $CONFIG
        echo "yes ($libbpf_version)"
    else
        echo "no"
    fi
}

echo "# Generated config" > $CONFIG
echo "Detecting available features on system"

echo -n " - libbpf support in tc tool: "
check_tc_libbpf
Binary file changed, not shown (size before: 49 KiB, after: 54 KiB).
@@ -6,7 +6,7 @@
# License: GPLv2
#
# Modified by Simon Sundberg <simon.sundberg@kau.se> to add support
# of optional section (--sec) option
# of optional section (--sec) option or attaching a pinned program
#

function usage() {
@@ -20,12 +20,13 @@ function usage() {
    echo " -l | --list : (\$LIST) List setup after setup"
    echo " --file | --obj : (\$BPF_OBJ) BPF-object file to load"
    echo " --sec : (\$SEC) Section of BPF-object to load"
    echo " --pinned : (\$PIN_PROG) Path to pinned program to attach"
    echo ""
}

# Using external program "getopt" to get --long-options
OPTIONS=$(getopt -o vshd:l \
    --long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec: -- "$@")
    --long verbose,dry-run,remove,stats,list,help,dev:,file:,obj:,sec:,pinned: -- "$@")
if (( $? != 0 )); then
    usage
    err 2 "Error calling getopt"
@@ -50,6 +51,11 @@ while true; do
            info "Section to load: $SEC" >&2
            shift 2
            ;;
        --pinned )
            export PIN_PROG=$2
            info "Pinned program path: $PIN_PROG" >&2
            shift 2
            ;;
        -v | --verbose)
            export VERBOSE=yes
            # info "Verbose mode: VERBOSE=$VERBOSE" >&2
pping/pping.c
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
static const char *__doc__ =
	"Passive Ping - monitor flow RTT based on TCP timestamps";

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h>
@@ -10,7 +13,9 @@
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <stdbool.h>
#include <limits.h>
#include <signal.h> // For detecting Ctrl-C
#include <sys/resource.h> // For setting rlmit
#include <sys/wait.h>
@@ -18,25 +23,18 @@
#include <time.h>
#include <pthread.h>

#include "pping.h" //key and value structs for the ts_start map
#include "pping.h" //common structs for user-space and BPF parts

#define NS_PER_SECOND 1000000000UL
#define NS_PER_MS 1000000UL

#define TCBPF_LOADER_SCRIPT "./bpf_egress_loader.sh"
#define PINNED_DIR "/sys/fs/bpf/tc/globals"
#define PPING_XDP_OBJ "pping_kern_xdp.o"
#define PPING_TCBPF_OBJ "pping_kern_tc.o"

#define XDP_FLAGS XDP_FLAGS_UPDATE_IF_NOEXIST

#define TS_MAP "ts_start"
#define MAP_CLEANUP_INTERVAL \
	(1 * NS_PER_SECOND) // Clean timestamp map once per second
#define TIMESTAMP_LIFETIME \
	(10 * NS_PER_SECOND) // Clear out entries from ts_start if they're over 10 seconds
	(10 * NS_PER_SECOND) // Clear out packet timestamps if they're over 10 seconds
#define FLOW_LIFETIME \
	(300 * NS_PER_SECOND) // Clear out flows if they're inactive over 300 seconds

#define PERF_BUFFER "rtt_events"
#define PERF_BUFFER_PAGES 64 // Related to the perf-buffer size?
#define PERF_POLL_TIMEOUT_MS 100
@@ -57,12 +55,146 @@

// Structure to contain arguments for clean_map (for passing to pthread_create)
struct map_cleanup_args {
	int map_fd;
	__u64 max_age_ns;
	__u64 cleanup_interval;
	int packet_map_fd;
	int flow_map_fd;
};

// Store configuration values in struct to easily pass around
struct pping_config {
	struct bpf_config bpf_config;
	__u64 cleanup_interval;
	int xdp_flags;
	int ifindex;
	char ifname[IF_NAMESIZE];
	bool force;
	char *object_path;
	char *ingress_sec;
	char *egress_sec;
	char *pin_dir;
	char *packet_map;
	char *flow_map;
	char *rtt_map;
};

static volatile int keep_running = 1;

static const struct option long_options[] = {
	{ "help", no_argument, NULL, 'h' },
	{ "interface", required_argument, NULL, 'i' }, // Name of interface to run on
	{ "rate-limit", required_argument, NULL, 'r' }, // Sampling rate-limit in ms
	{ "force", no_argument, NULL, 'f' }, // Detach any existing XDP program on interface
	{ "cleanup-interval", required_argument, NULL, 'c' }, // Map cleaning interval in s
	{ 0, 0, NULL, 0 }
};

/*
 * Copied from Jesper Dangaaard Brouer's traffic-pacing-edt example
 */
static void print_usage(char *argv[])
{
	int i;

	printf("\nDOCUMENTATION:\n%s\n", __doc__);
	printf("\n");
	printf(" Usage: %s (options-see-below)\n", argv[0]);
	printf(" Listing options:\n");
	for (i = 0; long_options[i].name != 0; i++) {
		printf(" --%-12s", long_options[i].name);
		if (long_options[i].flag != NULL)
			printf(" flag (internal value:%d)",
			       *long_options[i].flag);
		else
			printf(" short-option: -%c", long_options[i].val);
		printf("\n");
	}
	printf("\n");
}

static double parse_positive_double_argument(const char *str,
					     const char *parname)
{
	char *endptr;
	double val;
	val = strtod(str, &endptr);
	if (strlen(str) != endptr - str) {
		fprintf(stderr, "%s %s is not a valid number\n", parname, str);
		return -EINVAL;
	}
	if (val < 0) {
		fprintf(stderr, "%s must be positive\n", parname);
		return -EINVAL;
	}

	return val;
}

static int parse_arguments(int argc, char *argv[], struct pping_config *config)
{
	int err, opt;
	double rate_limit_ms, cleanup_interval_s;

	config->ifindex = 0;

	while ((opt = getopt_long(argc, argv, "hfi:r:c:", long_options,
				  NULL)) != -1) {
		switch (opt) {
		case 'i':
			if (strlen(optarg) > IF_NAMESIZE) {
				fprintf(stderr, "interface name too long\n");
				return -EINVAL;
			}
			strncpy(config->ifname, optarg, IF_NAMESIZE);

			config->ifindex = if_nametoindex(config->ifname);
			if (config->ifindex == 0) {
				err = -errno;
				fprintf(stderr,
					"Could not get index of interface %s: %s\n",
					config->ifname, strerror(err));
				return err;
			}
			break;
		case 'r':
			rate_limit_ms = parse_positive_double_argument(
				optarg, "rate-limit");
			if (rate_limit_ms < 0)
				return -EINVAL;

			config->bpf_config.rate_limit =
				rate_limit_ms * NS_PER_MS;
			break;
		case 'c':
			cleanup_interval_s = parse_positive_double_argument(
				optarg, "cleanup-interval");
			if (cleanup_interval_s < 0)
				return -EINVAL;

			config->cleanup_interval =
				cleanup_interval_s * NS_PER_SECOND;
			break;
		case 'f':
			config->force = true;
			break;
		case 'h':
			printf("HELP:\n");
			print_usage(argv);
			exit(0);
		default:
			fprintf(stderr, "Unknown option %s\n", argv[optind]);
			return -EINVAL;
		}
	}

	if (config->ifindex == 0) {
		fprintf(stderr,
			"An interface (-i or --interface) must be provided\n");
		return -EINVAL;
	}

	return 0;
}

void abort_program(int sig)
{
	keep_running = 0;
@@ -78,28 +210,48 @@ static int set_rlimit(long int lim)
	return !setrlimit(RLIMIT_MEMLOCK, &rlim) ? 0 : -errno;
}

static int mkdir_if_noexist(const char *path)
static int
bpf_obj_run_prog_pindir_func(struct bpf_object *obj, const char *prog_title,
			     const char *pin_dir,
			     int (*func)(struct bpf_program *, const char *))
{
	int ret;
	struct stat st = { 0 };
	int len;
	struct bpf_program *prog;
	char path[MAX_PATH_LEN];

	ret = stat(path, &st);
	if (ret) {
		if (errno != ENOENT)
			return -errno;
	len = snprintf(path, MAX_PATH_LEN, "%s/%s", pin_dir, prog_title);
	if (len < 0)
		return len;
	if (len > MAX_PATH_LEN)
		return -ENAMETOOLONG;

		return mkdir(path, 0700) ? -errno : 0;
	}
	return S_ISDIR(st.st_mode) ? 0 : -EEXIST;
	prog = bpf_object__find_program_by_title(obj, prog_title);
	if (!prog || libbpf_get_error(prog))
		return prog ? libbpf_get_error(prog) : -EINVAL;

	return func(prog, path);
}

static int bpf_obj_open(struct bpf_object **obj, const char *obj_path,
			char *map_path)
/*
 * Similar to bpf_object__pin_programs, but only attemps to pin a
 * single program prog_title at path pin_dir/prog_title
 */
static int bpf_obj_pin_program(struct bpf_object *obj, const char *prog_title,
			       const char *pin_dir)
{
	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
			    .pin_root_path = map_path);
	*obj = bpf_object__open_file(obj_path, map_path ? &opts : NULL);
	return libbpf_get_error(*obj);
	return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir,
					    bpf_program__pin);
}

/*
 * Similar to bpf_object__unpin_programs, but only attempts to unpin a
 * single program prog_title at path pin_dir/prog_title.
 */
static int bpf_obj_unpin_program(struct bpf_object *obj, const char *prog_title,
				 const char *pin_dir)
{
	return bpf_obj_run_prog_pindir_func(obj, prog_title, pin_dir,
					    bpf_program__unpin);
}

static int xdp_detach(int ifindex, __u32 xdp_flags)
@@ -112,7 +264,6 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
{
	struct bpf_program *prog;
	int prog_fd;
	int err;

	if (sec)
		prog = bpf_object__find_program_by_title(obj, sec);
@@ -120,24 +271,28 @@ static int xdp_attach(struct bpf_object *obj, const char *sec, int ifindex,
		prog = bpf_program__next(NULL, obj);

	prog_fd = bpf_program__fd(prog);
	if (prog_fd < 0) {
		fprintf(stderr, "Could not find program to attach\n");
	if (prog_fd < 0)
		return prog_fd;
	}

	if (force) // detach current (if any) xdp-program first
		xdp_detach(ifindex, xdp_flags);

	err = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
	if (err < 0) {
		fprintf(stderr, "Failed loading xdp-program on interface %d\n",
			ifindex);
		return err;
	}
	return 0;
	return bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
}

static int run_program(const char *path, char *const argv[])
static int init_rodata(struct bpf_object *obj, void *src, size_t size)
{
	struct bpf_map *map = NULL;
	bpf_object__for_each_map(map, obj) {
		if (strstr(bpf_map__name(map), ".rodata"))
			return bpf_map__set_initial_value(map, src, size);
	}

	// No .rodata map found
	return -EINVAL;
}

static int run_external_program(const char *path, char *const argv[])
{
	int status;
	int ret = -1;
@@ -157,18 +312,24 @@ static int run_program(const char *path, char *const argv[])
	}
}

static int tc_bpf_load(char *bpf_object, char *section, char *interface)
static int tc_bpf_attach(const char *pin_dir, const char *section,
			 char *interface)
{
	char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface, "--obj",
			       bpf_object, "--sec", section, NULL };
	return run_program(TCBPF_LOADER_SCRIPT, argv);
	char prog_path[MAX_PATH_LEN];
	char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface,
			       "--pinned", prog_path, NULL };

	if (snprintf(prog_path, sizeof(prog_path), "%s/%s", pin_dir, section) < 0)
		return -EINVAL;

	return run_external_program(TCBPF_LOADER_SCRIPT, argv);
}

static int tc_bpf_clear(char *interface)
{
	char *const argv[] = { TCBPF_LOADER_SCRIPT, "--dev", interface,
			       "--remove", NULL };
	return run_program(TCBPF_LOADER_SCRIPT, argv);
	return run_external_program(TCBPF_LOADER_SCRIPT, argv);
}

/*
@@ -184,45 +345,82 @@
	return (__u64)t.tv_sec * NS_PER_SECOND + (__u64)t.tv_nsec;
}

static int clean_map(int map_fd, __u64 max_age)
static bool packet_ts_timeout(void *val_ptr, __u64 now)
{
	__u64 ts = *(__u64 *)val_ptr;
	if (now > ts && now - ts > TIMESTAMP_LIFETIME)
		return true;
	return false;
}

static bool flow_timeout(void *val_ptr, __u64 now)
{
	__u64 ts = ((struct flow_state *)val_ptr)->last_timestamp;
	if (now > ts && now - ts > FLOW_LIFETIME)
		return true;
	return false;
}

/*
 * Loops through all entries in a map, running del_decision_func(value, time)
 * on every entry, and deleting those for which it returns true.
 * On sucess, returns the number of entries deleted, otherwise returns the
 * (negative) error code.
 */
//TODO - maybe add some pointer to arguments for del_decision_func?
static int clean_map(int map_fd, size_t key_size, size_t value_size,
		     bool (*del_decision_func)(void *, __u64))
{
	int removed = 0;
	struct packet_id key, prev_key = { 0 };
	struct packet_timestamp value;
	void *key, *prev_key, *value;
	bool delete_prev = false;
	__u64 now_nsec = get_time_ns();

	int entries = 0; // Just for debug
	__u64 duration; // Just for debug
#ifdef DEBUG
	int entries = 0;
	__u64 duration;
#endif

	if (now_nsec == 0)
		return -errno;

	key = malloc(key_size);
	prev_key = malloc(key_size);
	value = malloc(value_size);
	if (!key || !prev_key || !value) {
		removed = -ENOMEM;
		goto cleanup;
	}

	// Cannot delete current key because then loop will reset, see https://www.bouncybouncy.net/blog/bpf_map_get_next_key-pitfalls/
	while (bpf_map_get_next_key(map_fd, &prev_key, &key) == 0) {
	while (bpf_map_get_next_key(map_fd, prev_key, key) == 0) {
		if (delete_prev) {
			bpf_map_delete_elem(map_fd, &prev_key);
			bpf_map_delete_elem(map_fd, prev_key);
			removed++;
			delete_prev = false;
		}

		if (bpf_map_lookup_elem(map_fd, &key, &value) == 0) {
			if (now_nsec > value.timestamp &&
			    now_nsec - value.timestamp > max_age) {
				delete_prev = true;
			}
		}
		if (bpf_map_lookup_elem(map_fd, key, value) == 0)
			delete_prev = del_decision_func(value, now_nsec);
#ifdef DEBUG
		entries++;
		prev_key = key;
#endif
		memcpy(prev_key, key, key_size);
	}
	if (delete_prev) {
		bpf_map_delete_elem(map_fd, &prev_key);
		bpf_map_delete_elem(map_fd, prev_key);
		removed++;
	}
#ifdef DEBUG
	duration = get_time_ns() - now_nsec;
	printf("Gone through %d entries and removed %d of them in %llu.%09llu s\n",
	       entries, removed, duration / NS_PER_SECOND,
	printf("%d: Gone through %d entries and removed %d of them in %llu.%09llu s\n",
	       map_fd, entries, removed, duration / NS_PER_SECOND,
	       duration % NS_PER_SECOND);
#endif
cleanup:
	free(key);
	free(prev_key);
	free(value);
	return removed;
}

@@ -230,11 +428,14 @@ static void *periodic_map_cleanup(void *args)
{
	struct map_cleanup_args *argp = args;
	struct timespec interval;
	interval.tv_sec = MAP_CLEANUP_INTERVAL / NS_PER_SECOND;
	interval.tv_nsec = MAP_CLEANUP_INTERVAL % NS_PER_SECOND;
	interval.tv_sec = argp->cleanup_interval / NS_PER_SECOND;
	interval.tv_nsec = argp->cleanup_interval % NS_PER_SECOND;

	while (keep_running) {
		clean_map(argp->map_fd, argp->max_age_ns);
		clean_map(argp->packet_map_fd, sizeof(struct packet_id),
			  sizeof(__u64), packet_ts_timeout);
		clean_map(argp->flow_map_fd, sizeof(struct network_tuple),
			  sizeof(struct flow_state), flow_timeout);
		nanosleep(&interval, NULL);
	}
	pthread_exit(NULL);
@@ -274,28 +475,134 @@ static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt)
|
||||
fprintf(stderr, "Lost %llu RTT events on CPU %d\n", lost_cnt, cpu);
|
||||
}
|
||||
|
||||
static int load_attach_bpfprogs(struct bpf_object **obj,
|
||||
struct pping_config *config, bool *tc_attached,
|
||||
bool *xdp_attached)
|
||||
{
|
||||
int err;
|
||||
|
||||
// Open and load ELF file
|
||||
*obj = bpf_object__open(config->object_path);
|
||||
err = libbpf_get_error(*obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed opening object file %s: %s\n",
|
||||
config->object_path, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
err = init_rodata(*obj, &config->bpf_config,
|
||||
sizeof(config->bpf_config));
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed pushing user-configration to %s: %s\n",
|
||||
config->object_path, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
err = bpf_object__load(*obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed loading bpf program in %s: %s\n",
|
||||
config->object_path, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
// Attach tc program
|
||||
err = bpf_obj_pin_program(*obj, config->egress_sec, config->pin_dir);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed pinning tc program to %s/%s: %s\n",
|
||||
config->pin_dir, config->egress_sec, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
err = tc_bpf_attach(config->pin_dir, config->egress_sec,
|
||||
config->ifname);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed attaching tc program on interface %s: %s\n",
|
||||
config->ifname, strerror(-err));
|
||||
return err;
|
||||
}
|
||||
*tc_attached = true;
|
||||
|
||||
// Attach XDP program
|
||||
err = xdp_attach(*obj, config->ingress_sec, config->ifindex,
|
||||
config->xdp_flags, config->force);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed attaching XDP program to %s%s: %s\n",
|
||||
config->ifname,
|
||||
config->force ? "" : ", ensure no other XDP program is already running on interface",
|
||||
strerror(-err));
|
||||
return err;
|
||||
}
|
||||
*xdp_attached = true;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int setup_periodical_map_cleaning(struct bpf_object *obj,
|
||||
struct pping_config *config)
|
||||
{
|
||||
pthread_t tid;
|
||||
struct map_cleanup_args clean_args = {
|
||||
.cleanup_interval = config->cleanup_interval
|
||||
};
|
||||
int err;
|
||||
|
||||
clean_args.packet_map_fd =
|
||||
bpf_object__find_map_fd_by_name(obj, config->packet_map);
|
||||
if (clean_args.packet_map_fd < 0) {
|
||||
fprintf(stderr, "Could not get file descriptor of map %s: %s\n",
|
||||
config->packet_map,
|
||||
strerror(-clean_args.packet_map_fd));
|
||||
return clean_args.packet_map_fd;
|
||||
}
|
||||
|
||||
clean_args.flow_map_fd =
|
||||
bpf_object__find_map_fd_by_name(obj, config->flow_map);
|
||||
if (clean_args.flow_map_fd < 0) {
|
||||
fprintf(stderr, "Could not get file descriptor of map %s: %s\n",
|
||||
config->flow_map, strerror(-clean_args.flow_map_fd));
|
||||
return clean_args.packet_map_fd;
|
||||
}
|
||||
|
||||
err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed starting thread to perform periodic map cleanup: %s\n",
|
||||
strerror(-err));
|
||||
return err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int err = 0;
|
||||
int ifindex = 0;
|
||||
bool xdp_attached = false;
|
||||
|
||||
bool tc_attached = false;
|
||||
char map_path[MAX_PATH_LEN];
|
||||
bool xdp_attached = false;
|
||||
|
||||
struct bpf_object *obj = NULL;
|
||||
struct bpf_map *map = NULL;
|
||||
|
||||
pthread_t tid;
|
||||
struct map_cleanup_args clean_args;
|
||||
struct pping_config config = {
|
||||
.bpf_config = { .rate_limit = 100 * NS_PER_MS },
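// i.e. by default each flow gets at most one new timestamp entry every 100 ms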
|
||||
.cleanup_interval = 1 * NS_PER_SECOND,
|
||||
.object_path = "pping_kern.o",
|
||||
.ingress_sec = INGRESS_PROG_SEC,
|
||||
.egress_sec = EGRESS_PROG_SEC,
|
||||
.pin_dir = "/sys/fs/bpf/pping",
|
||||
.packet_map = "packet_ts",
|
||||
.flow_map = "flow_state",
|
||||
.rtt_map = "rtt_events",
|
||||
.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
|
||||
.force = false,
|
||||
};
|
||||
|
||||
struct perf_buffer *pb = NULL;
|
||||
struct perf_buffer_opts pb_opts;
|
||||
|
||||
// TODO - better argument parsing (more relevant as features are added)
|
||||
if (argc < 2) {
|
||||
printf("Usage: ./pping_user <dev>\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
struct perf_buffer_opts pb_opts = {
|
||||
.sample_cb = handle_rtt_event,
|
||||
.lost_cb = handle_missed_rtt_event,
|
||||
};
|
||||
|
||||
// Detect if running as root
|
||||
if (geteuid() != 0) {
|
||||
@@ -308,98 +615,41 @@ int main(int argc, char *argv[])
|
||||
if (err) {
|
||||
fprintf(stderr, "Could not set rlimit to infinity: %s\n",
|
||||
strerror(-err));
|
||||
goto cleanup;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
// Get index of interface
|
||||
ifindex = if_nametoindex(argv[1]);
|
||||
if (ifindex == 0) {
|
||||
err = -errno;
|
||||
fprintf(stderr, "Could not get index of interface %s: %s\n",
|
||||
argv[1], strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Load and attach the XDP program
|
||||
err = mkdir_if_noexist("/sys/fs/bpf/tc");
|
||||
err = parse_arguments(argc, argv, &config);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed creating directory %s in which to pin map: %s\n",
|
||||
"/sys/fs/bpf/tc", strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
err = bpf_obj_open(&obj, PPING_XDP_OBJ, PINNED_DIR);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed opening object file %s: %s\n",
|
||||
PPING_XDP_OBJ, strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Get map here to allow for unpinning at cleanup
|
||||
map = bpf_object__find_map_by_name(obj, TS_MAP);
|
||||
err = libbpf_get_error(map);
|
||||
if (err) {
|
||||
fprintf(stderr, "Could not find map %s in %s: %s\n", TS_MAP,
|
||||
PPING_XDP_OBJ, strerror(err));
|
||||
map = NULL;
|
||||
}
|
||||
|
||||
err = bpf_object__load(obj);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed loading XDP program: %s\n",
|
||||
fprintf(stderr, "Failed parsing arguments: %s\n",
|
||||
strerror(-err));
|
||||
goto cleanup;
|
||||
print_usage(argv);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
err = xdp_attach(obj, XDP_PROG_SEC, ifindex, XDP_FLAGS, false);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed attaching XDP program to %s: %s\n",
|
||||
argv[1], strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
xdp_attached = true;
|
||||
|
||||
// Load tc-bpf section on interface egress
|
||||
err = tc_bpf_load(PPING_TCBPF_OBJ, TCBPF_PROG_SEC, argv[1]);
|
||||
err = load_attach_bpfprogs(&obj, &config, &tc_attached, &xdp_attached);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Could not load section %s of %s on interface %s: %s\n",
|
||||
TCBPF_PROG_SEC, PPING_TCBPF_OBJ, argv[1],
|
||||
"Failed loading and attaching BPF programs in %s\n",
|
||||
config.object_path);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
err = setup_periodical_map_cleaning(obj, &config);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed setting up map cleaning: %s\n",
|
||||
strerror(-err));
|
||||
goto cleanup;
|
||||
}
|
||||
tc_attached = true;
|
||||
|
||||
// Set up the periodical map cleaning
|
||||
clean_args.max_age_ns = TIMESTAMP_LIFETIME;
|
||||
clean_args.map_fd = bpf_map__fd(map);
|
||||
if (clean_args.map_fd < 0) {
|
||||
fprintf(stderr,
|
||||
"Could not get file descriptor of map %s in object %s: %s\n",
|
||||
TS_MAP, PPING_XDP_OBJ, strerror(-clean_args.map_fd));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
err = pthread_create(&tid, NULL, periodic_map_cleanup, &clean_args);
|
||||
if (err) {
|
||||
fprintf(stderr,
|
||||
"Failed starting thread to perform periodic map cleanup: %s\n",
|
||||
strerror(err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Set up perf buffer
|
||||
pb_opts.sample_cb = handle_rtt_event;
|
||||
pb_opts.lost_cb = handle_missed_rtt_event;
|
||||
|
||||
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj, PERF_BUFFER),
|
||||
pb = perf_buffer__new(bpf_object__find_map_fd_by_name(obj,
|
||||
config.rtt_map),
|
||||
PERF_BUFFER_PAGES, &pb_opts);
|
||||
err = libbpf_get_error(pb);
|
||||
if (err) {
|
||||
pb = NULL;
|
||||
fprintf(stderr, "Failed to open perf buffer %s: %s\n",
|
||||
PERF_BUFFER, strerror(err));
|
||||
config.rtt_map, strerror(err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
@@ -419,30 +669,30 @@ int main(int argc, char *argv[])
|
||||
|
||||
cleanup:
|
||||
perf_buffer__free(pb);
|
||||
if (map && bpf_map__is_pinned(map)) {
|
||||
snprintf(map_path, sizeof(map_path), "%s/%s", PINNED_DIR,
|
||||
TS_MAP);
|
||||
err = bpf_map__unpin(map, map_path);
|
||||
if (err) {
|
||||
fprintf(stderr, "Failed unpinning map from %s: %s\n",
|
||||
map_path, strerror(-err));
|
||||
}
|
||||
}
|
||||
|
||||
if (xdp_attached) {
|
||||
err = xdp_detach(ifindex, XDP_FLAGS);
|
||||
if (err) {
|
||||
err = xdp_detach(config.ifindex, config.xdp_flags);
|
||||
if (err)
|
||||
fprintf(stderr,
|
||||
"Failed deatching program from ifindex %d: %s\n",
|
||||
ifindex, strerror(-err));
|
||||
}
|
||||
"Failed deatching program from ifindex %s: %s\n",
|
||||
config.ifname, strerror(-err));
|
||||
}
|
||||
|
||||
if (tc_attached) {
|
||||
err = tc_bpf_clear(argv[1]); //system(tc_cmd);
|
||||
if (err) {
|
||||
err = tc_bpf_clear(config.ifname);
|
||||
if (err)
|
||||
fprintf(stderr,
|
||||
"Failed removing tc-bpf program from interface %s: %s\n",
|
||||
argv[1], strerror(-err));
|
||||
}
|
||||
config.ifname, strerror(-err));
|
||||
}
|
||||
|
||||
if (obj && !libbpf_get_error(obj)) {
|
||||
err = bpf_obj_unpin_program(obj, config.egress_sec,
|
||||
config.pin_dir);
|
||||
if (err)
|
||||
fprintf(stderr,
|
||||
"Failed unpinning tc program from %s: %s\n",
|
||||
config.pin_dir, strerror(-err));
|
||||
}
|
||||
|
||||
return err != 0;
|
||||
|
@@ -5,8 +5,12 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/in6.h>
|
||||
|
||||
#define XDP_PROG_SEC "xdp"
|
||||
#define TCBPF_PROG_SEC "pping_egress"
|
||||
#define INGRESS_PROG_SEC "xdp"
|
||||
#define EGRESS_PROG_SEC "classifier"
|
||||
|
||||
struct bpf_config {
|
||||
__u64 rate_limit;
|
||||
};
|
||||
|
||||
/*
|
||||
* Struct that can hold the source or destination address for a flow (l3+l4).
|
||||
@@ -34,17 +38,17 @@ struct network_tuple {
|
||||
__u8 reserved;
|
||||
};
|
||||
|
||||
struct flow_state {
|
||||
__u64 last_timestamp;
|
||||
__u32 last_id;
|
||||
__u32 reserved;
|
||||
};
|
||||
|
||||
struct packet_id {
|
||||
struct network_tuple flow;
|
||||
__u32 identifier; //tsval for TCP packets
|
||||
};
|
||||
|
||||
struct packet_timestamp {
|
||||
__u64 timestamp;
|
||||
__u8 used;
|
||||
__u8 reserved[7];
|
||||
};
|
||||
|
||||
struct rtt_event {
|
||||
__u64 rtt;
|
||||
struct network_tuple flow;
|
||||
|
@@ -1,187 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef PPING_HELPERS_H
|
||||
#define PPING_HELPERS_H
|
||||
|
||||
#include <linux/bpf.h>
|
||||
#include <xdp/parsing_helpers.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/in6.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/tcp.h>
|
||||
|
||||
#include <stdbool.h>
|
||||
#include "pping.h"
|
||||
|
||||
#define AF_INET 2
|
||||
#define AF_INET6 10
|
||||
#define MAX_TCP_OPTIONS 10
|
||||
|
||||
/*
|
||||
* This struct keeps track of the data and data_end pointers from the xdp_md or
|
||||
* __skb_buff contexts, as well as the position parsed to so far, kept in nh.
|
||||
* Additionally, it also keeps the length of the entire packet, which together
|
||||
* with the other members can be used to determine ex. how much data each
|
||||
* header encloses.
|
||||
*/
|
||||
struct parsing_context {
|
||||
void *data; //Start of eth hdr
|
||||
void *data_end; //End of safe accessible area
|
||||
struct hdr_cursor nh; //Position to parse next
|
||||
__u32 pkt_len; //Full packet length (headers+data)
|
||||
};
|
||||
|
||||
/*
|
||||
* Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
|
||||
*/
|
||||
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
|
||||
{
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
|
||||
ipv6->in6_u.u6_addr32[3] = ipv4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parses the TSval and TSecr values from the TCP options field. If successful
|
||||
* the TSval and TSecr values will be stored at tsval and tsecr (in network
|
||||
* byte order).
|
||||
* Returns 0 if successful and -1 on failure
|
||||
*/
|
||||
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
|
||||
__u32 *tsecr)
|
||||
{
|
||||
int len = tcph->doff << 2;
|
||||
void *opt_end = (void *)tcph + len;
|
||||
__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
|
||||
__u8 i, opt, opt_size;
|
||||
|
||||
if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
|
||||
return -1;
|
||||
|
||||
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
|
||||
if (pos + 1 > opt_end || pos + 1 > data_end)
|
||||
return -1;
|
||||
|
||||
opt = *pos;
|
||||
if (opt == 0) // Reached end of TCP options
|
||||
return -1;
|
||||
|
||||
if (opt == 1) { // TCP NOP option - advance one byte
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Option > 1, should have option size
|
||||
if (pos + 2 > opt_end || pos + 2 > data_end)
|
||||
return -1;
|
||||
opt_size = *(pos + 1);
|
||||
|
||||
// Option-kind is TCP timestamp (yey!)
|
||||
if (opt == 8 && opt_size == 10) {
|
||||
if (pos + opt_size > opt_end ||
|
||||
pos + opt_size > data_end)
|
||||
return -1;
|
||||
*tsval = *(__u32 *)(pos + 2);
|
||||
*tsecr = *(__u32 *)(pos + 6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Some other TCP option - advance option-length bytes
|
||||
pos += opt_size;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
/*
|
||||
* Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
|
||||
* option. If successful, identifier will be set to TSval if is_egress, TSecr
|
||||
* otherwise, the port-members of saddr and daddr will be set to the TCP source
|
||||
* and dest, respectively, and 0 will be returned. On failure, -1 will be
|
||||
* returned.
|
||||
*/
|
||||
static int parse_tcp_identifier(struct parsing_context *ctx, bool is_egress,
|
||||
__be16 *sport, __be16 *dport, __u32 *identifier)
|
||||
{
|
||||
__u32 tsval, tsecr;
|
||||
struct tcphdr *tcph;
|
||||
|
||||
if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
|
||||
return -1;
|
||||
|
||||
// Do not timestamp pure ACKs
|
||||
if (is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len && !tcph->syn)
|
||||
return -1;
|
||||
|
||||
if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
|
||||
return -1; //Possible TODO, fall back on seq/ack instead
|
||||
|
||||
*sport = tcph->source;
|
||||
*dport = tcph->dest;
|
||||
*identifier = is_egress ? tsval : tsecr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempts to parse the packet limited by the data and data_end pointers,
|
||||
* to retrieve a protocol dependent packet identifier. If successful, the
|
||||
* pointed to p_id will be filled with parsed information from the
|
||||
* packet, and 0 will be returned. On failure, -1 will be returned.
|
||||
* If is_egress saddr and daddr will match source and destination of packet,
|
||||
* respectively, and identifier will be set to the identifier for an outgoing
|
||||
* packet. Otherwise, saddr and daddr will be swapped (will match
|
||||
* destination and source of packet, respectively), and identifier will be
|
||||
* set to the identifier of a response.
|
||||
*/
|
||||
static int parse_packet_identifier(struct parsing_context *ctx, bool is_egress,
|
||||
struct packet_id *p_id)
|
||||
{
|
||||
int proto, err;
|
||||
struct ethhdr *eth;
|
||||
struct iphdr *iph;
|
||||
struct ipv6hdr *ip6h;
|
||||
struct flow_address *saddr, *daddr;
|
||||
|
||||
// Switch saddr <--> daddr on ingress to match egress
|
||||
if (is_egress) {
|
||||
saddr = &p_id->flow.saddr;
|
||||
daddr = &p_id->flow.daddr;
|
||||
} else {
|
||||
saddr = &p_id->flow.daddr;
|
||||
daddr = &p_id->flow.saddr;
|
||||
}
|
||||
|
||||
proto = parse_ethhdr(&ctx->nh, ctx->data_end, ð);
|
||||
|
||||
// Parse IPv4/6 header
|
||||
if (proto == bpf_htons(ETH_P_IP)) {
|
||||
p_id->flow.ipv = AF_INET;
|
||||
proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
|
||||
} else if (proto == bpf_htons(ETH_P_IPV6)) {
|
||||
p_id->flow.ipv = AF_INET6;
|
||||
proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add new protocols here
|
||||
if (proto == IPPROTO_TCP) {
|
||||
err = parse_tcp_identifier(ctx, is_egress, &saddr->port,
|
||||
&daddr->port, &p_id->identifier);
|
||||
if (err)
|
||||
return -1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Successfully parsed packet identifier - fill in IP-addresses and return
|
||||
if (p_id->flow.ipv == AF_INET) {
|
||||
map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
|
||||
map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
|
||||
} else { // IPv6
|
||||
saddr->ip = ip6h->saddr;
|
||||
daddr->ip = ip6h->daddr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
361
pping/pping_kern.c
Normal file
@@ -0,0 +1,361 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/in6.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
// overwrite xdp/parsing_helpers.h value to avoid hitting verifier limit
|
||||
#ifdef IPV6_EXT_MAX_CHAIN
|
||||
#undef IPV6_EXT_MAX_CHAIN
|
||||
#endif
|
||||
#define IPV6_EXT_MAX_CHAIN 3
|
||||
|
||||
#include <xdp/parsing_helpers.h>
|
||||
#include "pping.h"
|
||||
|
||||
#define AF_INET 2
|
||||
#define AF_INET6 10
|
||||
#define MAX_TCP_OPTIONS 10
|
||||
|
||||
/*
|
||||
* This struct keeps track of the data and data_end pointers from the xdp_md or
|
||||
* __skb_buff contexts, as well as the position parsed to so far, kept in nh.
|
||||
* Additionally, it also keeps the length of the entire packet, which together
|
||||
* with the other members can be used to determine ex. how much data each
|
||||
* header encloses.
|
||||
*/
|
||||
struct parsing_context {
|
||||
void *data; //Start of eth hdr
|
||||
void *data_end; //End of safe accessible area
|
||||
struct hdr_cursor nh; //Position to parse next
|
||||
__u32 pkt_len; //Full packet length (headers+data)
|
||||
bool is_egress; //Is packet on egress or ingress?
|
||||
};
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
// Global config struct - set from userspace
|
||||
static volatile const struct bpf_config config = {};
|
||||
|
||||
// Map definitions
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, struct packet_id);
|
||||
__type(value, __u64);
|
||||
__uint(max_entries, 16384);
|
||||
} packet_ts SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, struct network_tuple);
|
||||
__type(value, struct flow_state);
|
||||
__uint(max_entries, 16384);
|
||||
} flow_state SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(__u32));
|
||||
__uint(value_size, sizeof(__u32));
|
||||
} rtt_events SEC(".maps");
|
||||
|
||||
// Help functions
|
||||
|
||||
/*
|
||||
* Maps an IPv4 address into an IPv6 address according to RFC 4291 sec 2.5.5.2
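* (e.g. 192.0.2.1 is mapped to ::ffff:192.0.2.1)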
|
||||
*/
|
||||
static void map_ipv4_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
|
||||
{
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[0], 0x00, 10);
|
||||
__builtin_memset(&ipv6->in6_u.u6_addr8[10], 0xff, 2);
|
||||
ipv6->in6_u.u6_addr32[3] = ipv4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parses the TSval and TSecr values from the TCP options field. If successful
|
||||
* the TSval and TSecr values will be stored at tsval and tsecr (in network
|
||||
* byte order).
|
||||
* Returns 0 if successful and -1 on failure
|
||||
*/
|
||||
static int parse_tcp_ts(struct tcphdr *tcph, void *data_end, __u32 *tsval,
|
||||
__u32 *tsecr)
|
||||
{
|
||||
int len = tcph->doff << 2;
|
||||
void *opt_end = (void *)tcph + len;
|
||||
__u8 *pos = (__u8 *)(tcph + 1); //Current pos in TCP options
|
||||
__u8 i, opt;
|
||||
volatile __u8
|
||||
opt_size; // Seems to ensure it's always read off the stack as a u8
|
||||
|
||||
if (tcph + 1 > data_end || len <= sizeof(struct tcphdr))
|
||||
return -1;
|
||||
#pragma unroll //temporary solution until we can identify why the non-unrolled loop gets stuck in an infinite loop
|
||||
for (i = 0; i < MAX_TCP_OPTIONS; i++) {
|
||||
if (pos + 1 > opt_end || pos + 1 > data_end)
|
||||
return -1;
|
||||
|
||||
opt = *pos;
|
||||
if (opt == 0) // Reached end of TCP options
|
||||
return -1;
|
||||
|
||||
if (opt == 1) { // TCP NOP option - advance one byte
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Option > 1, should have option size
|
||||
if (pos + 2 > opt_end || pos + 2 > data_end)
|
||||
return -1;
|
||||
opt_size = *(pos + 1);
|
||||
if (opt_size < 2) // Stop parsing options if opt_size has an invalid value
|
||||
return -1;
|
||||
|
||||
// Option-kind is TCP timestamp (yey!)
|
||||
if (opt == 8 && opt_size == 10) {
|
||||
if (pos + 10 > opt_end || pos + 10 > data_end)
|
||||
return -1;
|
||||
*tsval = *(__u32 *)(pos + 2);
|
||||
*tsecr = *(__u32 *)(pos + 6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Some other TCP option - advance option-length bytes
|
||||
pos += opt_size;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempts to fetch an identifier for TCP packets, based on the TCP timestamp
|
||||
* option. If successful, identifier will be set to TSval on egress and TSecr
|
||||
* otherwise, the port-members of saddr and daddr will be set to the TCP source
|
||||
* and dest, respectively, and 0 will be returned. On failure, -1 will be
|
||||
* returned. Additionally, if the connection is closing (FIN or RST flag), sets
|
||||
* flow_closing to true.
|
||||
*/
|
||||
static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport,
|
||||
__be16 *dport, bool *flow_closing,
|
||||
__u32 *identifier)
|
||||
{
|
||||
__u32 tsval, tsecr;
|
||||
struct tcphdr *tcph;
|
||||
|
||||
if (parse_tcphdr(&ctx->nh, ctx->data_end, &tcph) < 0)
|
||||
return -1;
|
||||
|
||||
// Check if connection is closing
|
||||
if (tcph->fin || tcph->rst) {
|
||||
*flow_closing = true;
|
||||
/* bpf_printk("Detected connection closing on %d\n", */
|
||||
/* ctx->is_egress); //Upsets verifier? */
|
||||
}
|
||||
|
||||
// Do not timestamp pure ACKs
|
||||
if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len &&
|
||||
!tcph->syn)
|
||||
return -1;
|
||||
|
||||
if (parse_tcp_ts(tcph, ctx->data_end, &tsval, &tsecr) < 0)
|
||||
return -1; //Possible TODO, fall back on seq/ack instead
|
||||
|
||||
*sport = tcph->source;
|
||||
*dport = tcph->dest;
|
||||
*identifier = ctx->is_egress ? tsval : tsecr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempts to parse the packet limited by the data and data_end pointers,
|
||||
* to retrieve a protocol dependent packet identifier. If successful, the
|
||||
* pointed to p_id will be filled with parsed information from the
|
||||
* packet, and 0 will be returned. On failure, -1 will be returned.
|
||||
* If is_egress saddr and daddr will match source and destination of packet,
|
||||
* respectively, and identifier will be set to the identifier for an outgoing
|
||||
* packet. Otherwise, saddr and daddr will be swapped (will match
|
||||
* destination and source of packet, respectively), and identifier will be
|
||||
* set to the identifier of a response.
|
||||
*/
|
||||
static int parse_packet_identifier(struct parsing_context *ctx,
|
||||
struct packet_id *p_id, bool *flow_closing)
|
||||
{
|
||||
int proto, err;
|
||||
struct ethhdr *eth;
|
||||
struct iphdr *iph;
|
||||
struct ipv6hdr *ip6h;
|
||||
struct flow_address *saddr, *daddr;
|
||||
|
||||
// Switch saddr <--> daddr on ingress to match egress
|
||||
if (ctx->is_egress) {
|
||||
saddr = &p_id->flow.saddr;
|
||||
daddr = &p_id->flow.daddr;
|
||||
} else {
|
||||
saddr = &p_id->flow.daddr;
|
||||
daddr = &p_id->flow.saddr;
|
||||
}
|
||||
|
||||
proto = parse_ethhdr(&ctx->nh, ctx->data_end, ð);
|
||||
|
||||
// Parse IPv4/6 header
|
||||
if (proto == bpf_htons(ETH_P_IP)) {
|
||||
p_id->flow.ipv = AF_INET;
|
||||
proto = parse_iphdr(&ctx->nh, ctx->data_end, &iph);
|
||||
} else if (proto == bpf_htons(ETH_P_IPV6)) {
|
||||
p_id->flow.ipv = AF_INET6;
|
||||
proto = parse_ip6hdr(&ctx->nh, ctx->data_end, &ip6h);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add new protocols here
|
||||
if (proto == IPPROTO_TCP) {
|
||||
err = parse_tcp_identifier(ctx, &saddr->port, &daddr->port,
|
||||
flow_closing, &p_id->identifier);
|
||||
if (err)
|
||||
return -1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Successfully parsed packet identifier - fill in IP-addresses and return
|
||||
if (p_id->flow.ipv == AF_INET) {
|
||||
map_ipv4_to_ipv6(iph->saddr, &saddr->ip);
|
||||
map_ipv4_to_ipv6(iph->daddr, &daddr->ip);
|
||||
} else { // IPv6
|
||||
saddr->ip = ip6h->saddr;
|
||||
daddr->ip = ip6h->daddr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Programs
|
||||
|
||||
// TC-BPF for parsing packet identifier from egress traffic and add to map
|
||||
SEC(EGRESS_PROG_SEC)
|
||||
int pping_egress(struct __sk_buff *skb)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
__u64 p_ts;
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)skb->data,
|
||||
.data_end = (void *)(long)skb->data_end,
|
||||
.pkt_len = skb->len,
|
||||
.nh = { .pos = pctx.data },
|
||||
.is_egress = true,
|
||||
};
|
||||
bool flow_closing = false;
|
||||
struct flow_state *f_state;
|
||||
struct flow_state new_state = { 0 };
|
||||
|
||||
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
|
||||
goto out;
|
||||
|
||||
// If the flow is closing, delete it and do not create a timestamp entry
|
||||
if (flow_closing) {
|
||||
bpf_map_delete_elem(&flow_state, &p_id.flow);
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Check flow state
|
||||
f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
|
||||
if (!f_state) { // No previous state - attempt to create it
|
||||
bpf_map_update_elem(&flow_state, &p_id.flow, &new_state,
|
||||
BPF_NOEXIST);
|
||||
f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow);
|
||||
if (!f_state)
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Check if identifier is new
|
||||
/* The gap between checking and updating last_id may cause concurrency
|
||||
* issues where multiple packets may simultaneously think they are the
|
||||
* first with a new identifier. As long as all of the identifiers are
|
||||
* the same though, only one should be able to create a timestamp entry.
|
||||
|
||||
* A bigger issue is that older identifiers (for example due to
|
||||
* out-of-order packets) may pass this check and update the current
|
||||
* identifier to an old one. This means that both the packet with the
|
||||
* old identifier itself as well as the next packet with the current
|
||||
* identifier may be considered packets with new identifiers (even if
|
||||
* both have been seen before). For TCP timestamps this could be
|
||||
* prevented by changing the check to '>=' instead, but it may not be
|
||||
* suitable for other protocols, such as QUIC and its spinbit.
|
||||
*
|
||||
* For now, just hope that the rate limit saves us from creating an
|
||||
* incorrect timestamp. That may however also fail, either due to
|
||||
* it happening at a time when it's not limited by rate sampling, or
|
||||
* because of the rate check failing due to concurrency issues.
|
||||
*/
|
||||
if (f_state->last_id == p_id.identifier)
|
||||
goto out;
|
||||
f_state->last_id = p_id.identifier;
|
||||
|
||||
// Check rate-limit
|
||||
/*
|
||||
* The window between checking and updating last_timestamp may cause
|
||||
* concurrency issues, where multiple packets simultaneously pass the
|
||||
* rate limit. However, as long as they have the same identifier, only
|
||||
* a single timestamp entry should successfully be created.
|
||||
*/
|
||||
p_ts = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
|
||||
if (p_ts < f_state->last_timestamp ||
|
||||
p_ts - f_state->last_timestamp < config.rate_limit)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Updates attempt at creating timestamp, even if creation of timestamp
|
||||
* fails (due to map being full). This should make the competition for
|
||||
* the next available map slot somewhat fairer between heavy and sparse
|
||||
* flows.
|
||||
*/
|
||||
f_state->last_timestamp = p_ts;
|
||||
bpf_map_update_elem(&packet_ts, &p_id, &p_ts, BPF_NOEXIST);
|
||||
|
||||
out:
|
||||
return BPF_OK;
|
||||
}
|
||||
|
||||
// XDP program for parsing identifier in ingress traffic and check for match in map
|
||||
SEC(INGRESS_PROG_SEC)
|
||||
int pping_ingress(struct xdp_md *ctx)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
__u64 *p_ts;
|
||||
struct rtt_event event = { 0 };
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)ctx->data,
|
||||
.data_end = (void *)(long)ctx->data_end,
|
||||
.pkt_len = pctx.data_end - pctx.data,
|
||||
.nh = { .pos = pctx.data },
|
||||
.is_egress = false,
|
||||
};
|
||||
bool flow_closing = false;
|
||||
|
||||
if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0)
|
||||
goto out;
|
||||
|
||||
// Delete flow, but allow final attempt at RTT calculation
|
||||
if (flow_closing)
|
||||
bpf_map_delete_elem(&flow_state, &p_id.flow);
|
||||
|
||||
p_ts = bpf_map_lookup_elem(&packet_ts, &p_id);
|
||||
if (!p_ts)
|
||||
goto out;
|
||||
|
||||
event.rtt = bpf_ktime_get_ns() - *p_ts;
|
||||
/*
|
||||
* Attempt to delete timestamp entry as soon as RTT is calculated.
|
||||
* But could have potential concurrency issue where multiple packets
|
||||
* manage to match against the identifier before it can be deleted.
|
||||
*/
|
||||
bpf_map_delete_elem(&packet_ts, &p_id);
|
||||
|
||||
__builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple));
|
||||
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event,
|
||||
sizeof(event));
|
||||
|
||||
out:
|
||||
return XDP_PASS;
|
||||
}
|
@@ -1,51 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <iproute2/bpf_elf.h>
|
||||
|
||||
#include "pping.h"
|
||||
#include "pping_helpers.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
#ifdef HAVE_TC_LIBBPF /* detected by configure script in config.mk */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(key_size, sizeof(struct packet_id));
|
||||
__uint(value_size, sizeof(struct packet_timestamp));
|
||||
__uint(max_entries, 16384);
|
||||
__uint(pinning, LIBBPF_PIN_BY_NAME);
|
||||
} ts_start SEC(".maps");
|
||||
|
||||
#else
|
||||
struct bpf_elf_map SEC("maps") ts_start = {
|
||||
.type = BPF_MAP_TYPE_HASH,
|
||||
.size_key = sizeof(struct packet_id),
|
||||
.size_value = sizeof(struct packet_timestamp),
|
||||
.max_elem = 16384,
|
||||
.pinning = PIN_GLOBAL_NS,
|
||||
};
|
||||
#endif
|
||||
|
||||
// TC-BPF for parsing packet identifier from egress traffic and add to map
|
||||
SEC(TCBPF_PROG_SEC)
|
||||
int tc_bpf_prog_egress(struct __sk_buff *skb)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
struct packet_timestamp p_ts = { 0 };
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)skb->data,
|
||||
.data_end = (void *)(long)skb->data_end,
|
||||
.pkt_len = skb->len,
|
||||
.nh = { .pos = pctx.data },
|
||||
};
|
||||
|
||||
if (parse_packet_identifier(&pctx, true, &p_id) < 0)
|
||||
goto end;
|
||||
|
||||
p_ts.timestamp = bpf_ktime_get_ns(); // or bpf_ktime_get_boot_ns
|
||||
bpf_map_update_elem(&ts_start, &p_id, &p_ts, BPF_NOEXIST);
|
||||
|
||||
end:
|
||||
return BPF_OK;
|
||||
}
|
@@ -1,63 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
#include "pping.h"
|
||||
#include "pping_helpers.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(key_size, sizeof(struct packet_id));
|
||||
__uint(value_size, sizeof(struct packet_timestamp));
|
||||
__uint(max_entries, 16384);
|
||||
__uint(pinning, LIBBPF_PIN_BY_NAME);
|
||||
} ts_start SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(__u32));
|
||||
__uint(value_size, sizeof(__u32));
|
||||
} rtt_events SEC(".maps");
|
||||
|
||||
// XDP program for parsing identifier in ingress traffic and check for match in map
|
||||
SEC(XDP_PROG_SEC)
|
||||
int xdp_prog_ingress(struct xdp_md *ctx)
|
||||
{
|
||||
struct packet_id p_id = { 0 };
|
||||
struct packet_timestamp *p_ts;
|
||||
struct rtt_event event = { 0 };
|
||||
struct parsing_context pctx = {
|
||||
.data = (void *)(long)ctx->data,
|
||||
.data_end = (void *)(long)ctx->data_end,
|
||||
.pkt_len = pctx.data_end - pctx.data,
|
||||
.nh = { .pos = pctx.data },
|
||||
};
|
||||
|
||||
if (parse_packet_identifier(&pctx, false, &p_id) < 0)
|
||||
goto end;
|
||||
|
||||
p_ts = bpf_map_lookup_elem(&ts_start, &p_id);
|
||||
|
||||
// Only calculate RTT for first packet with matching identifier
|
||||
if (p_ts && p_ts->used == 0) {
|
||||
/*
|
||||
* As used is not set atomically with the lookup, could
|
||||
* potentially have multiple "first" packets (on different
|
||||
* CPUs), but all those should then also have very similar RTT,
|
||||
* so don't consider it a significant issue
|
||||
*/
|
||||
p_ts->used = 1;
|
||||
// TODO - Optional delete of entry (if identifier is guaranteed unique)
|
||||
|
||||
__builtin_memcpy(&event.flow, &p_id.flow,
|
||||
sizeof(struct network_tuple));
|
||||
event.rtt = bpf_ktime_get_ns() - p_ts->timestamp;
|
||||
bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU,
|
||||
&event, sizeof(event));
|
||||
}
|
||||
|
||||
end:
|
||||
return XDP_PASS;
|
||||
}
|