mirror of
				https://github.com/xdp-project/bpf-examples.git
				synced 2024-05-06 15:54:53 +00:00 
			
		
		
		
	pping: Add timestamp and min-RTT to output
To add timestamps to the output, push the time at which the packet was processed in the kernel as part of the RTT event. Also keep track of the minimum RTT encountered for each flow in the kernel, and push that as part of the RTT event as well. Additionally, avoid pushing RTT messages at all if no flow-state information can be found (e.g. because it has already been deleted from the egress side), as no valid min-RTT can then be given. Furthermore, no longer delete flow information upon seeing the FIN flag on egress, in order to keep useful flow state around for RTT messages longer. Due to the FIN handshake process, it is sufficient if the ingress program deletes the flow state upon seeing FIN. However, still delete flow state from either ingress or egress upon seeing the RST flag, as RST does not have a handshake process allowing for delayed deletion. While the minimum RTT could also be tracked by the userspace process, userspace is not aware of when a flow is closed, so it would need additional logic to keep track of the minimum RTT for each flow and periodically clean up stale entries. Furthermore, keeping RTT statistics in the flow-state map is useful for implementing future features, such as an RTT-based sampling interval. It would also be useful in case pping is changed to no longer have a long-running userspace process printing out all the calculated RTTs, but instead simply occasionally look up the RTT from the flow-state map. Signed-off-by: Simon Sundberg <simon.sundberg@kau.se>
This commit is contained in:
		| @@ -25,7 +25,7 @@ | ||||
|   - [ ] Could potentially include keeping track of average RTT, which | ||||
|         may be useful for some decisions (ex. how often to sample, | ||||
|         when entry can be removed etc) | ||||
|   - [ ] Could potentially include keeping track of minimum RTT (as | ||||
|   - [x] Could potentially include keeping track of minimum RTT (as | ||||
|         done by the original pping), ex. to track bufferbloat | ||||
|   - [ ] Could potentially include keeping track of if flow is | ||||
|         bi-directional | ||||
| @@ -42,7 +42,6 @@ | ||||
|   - It may be a good idea to keep the same format as original pping, | ||||
|     so that tools such as [ppviz](https://github.com/pollere/ppviz) | ||||
|     works for both pping implementations. | ||||
| - [ ] Add timestamps to output (as original pping) | ||||
| - [ ] Add support for other hooks | ||||
|   - Ex TC-BPF on ingress instead of XDP? | ||||
|  | ||||
| @@ -59,3 +58,4 @@ | ||||
| - [x] Add IPv6 support | ||||
| - [x] Refactor to support easy addition of other protocols | ||||
| - [x] Load tc-bpf program with libbpf (only attach it with tc) | ||||
| - [x] Add timestamps to output (as original pping) | ||||
|   | ||||
| @@ -40,6 +40,9 @@ static const char *__doc__ = | ||||
|  | ||||
| #define MAX_PATH_LEN 1024 | ||||
|  | ||||
| #define MON_TO_REAL_UPDATE_FREQ                                                \ | ||||
| 	(1 * NS_PER_SECOND) // Update offset between CLOCK_MONOTONIC and CLOCK_REALTIME once per second | ||||
|  | ||||
| /*  | ||||
|  * BPF implementation of pping using libbpf | ||||
|  * Uses TC-BPF for egress and XDP for ingress | ||||
| @@ -336,10 +339,10 @@ static int tc_bpf_clear(char *interface) | ||||
|  * Returns time of the clock given by clockid as nanoseconds in a single __u64. | ||||
|  * On failure, the value 0 is returned (and errno will be set). | ||||
|  */ | ||||
| static __u64 get_time_ns(void) | ||||
| static __u64 get_time_ns(clockid_t clockid) | ||||
| { | ||||
| 	struct timespec t; | ||||
| 	if (clock_gettime(CLOCK_MONOTONIC, &t) != 0) | ||||
| 	if (clock_gettime(clockid, &t) != 0) | ||||
| 		return 0; | ||||
|  | ||||
| 	return (__u64)t.tv_sec * NS_PER_SECOND + (__u64)t.tv_nsec; | ||||
| @@ -374,7 +377,7 @@ static int clean_map(int map_fd, size_t key_size, size_t value_size, | ||||
| 	int removed = 0; | ||||
| 	void *key, *prev_key, *value; | ||||
| 	bool delete_prev = false; | ||||
| 	__u64 now_nsec = get_time_ns(); | ||||
| 	__u64 now_nsec = get_time_ns(CLOCK_MONOTONIC); | ||||
|  | ||||
| #ifdef DEBUG | ||||
| 	int entries = 0; | ||||
| @@ -412,7 +415,7 @@ static int clean_map(int map_fd, size_t key_size, size_t value_size, | ||||
| 		removed++; | ||||
| 	} | ||||
| #ifdef DEBUG | ||||
| 	duration = get_time_ns() - now_nsec; | ||||
| 	duration = get_time_ns(CLOCK_MONOTONIC) - now_nsec; | ||||
| 	printf("%d: Gone through %d entries and removed %d of them in %llu.%09llu s\n", | ||||
| 	       map_fd, entries, removed, duration / NS_PER_SECOND, | ||||
| 	       duration % NS_PER_SECOND); | ||||
| @@ -441,6 +444,28 @@ static void *periodic_map_cleanup(void *args) | ||||
| 	pthread_exit(NULL); | ||||
| } | ||||
|  | ||||
| static __u64 convert_monotonic_to_realtime(__u64 monotonic_time) | ||||
| { | ||||
| 	__u64 now_mon, now_rt; | ||||
| 	static __u64 offset = 0; | ||||
| 	static __u64 offset_updated = 0; | ||||
|  | ||||
| 	now_mon = get_time_ns(CLOCK_MONOTONIC); | ||||
| 	if (offset == 0 || | ||||
| 	    (now_mon > offset_updated && | ||||
| 	     now_mon - offset_updated > MON_TO_REAL_UPDATE_FREQ)) { | ||||
| 		now_mon = get_time_ns(CLOCK_MONOTONIC); | ||||
| 		now_rt = get_time_ns(CLOCK_REALTIME); | ||||
| 		if (now_rt < now_mon) | ||||
| 			return 0; | ||||
|  | ||||
| 		offset = now_rt - now_mon; | ||||
| 		offset_updated = now_mon; | ||||
| 	} | ||||
|  | ||||
| 	return monotonic_time + offset; | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Wrapper around inet_ntop designed to handle the "bug" that mapped IPv4 | ||||
|  * addresses are formatted as IPv6 addresses for AF_INET6 | ||||
| @@ -461,13 +486,18 @@ static void handle_rtt_event(void *ctx, int cpu, void *data, __u32 data_size) | ||||
| 	const struct rtt_event *e = data; | ||||
| 	char saddr[INET6_ADDRSTRLEN]; | ||||
| 	char daddr[INET6_ADDRSTRLEN]; | ||||
| 	char timestr[9]; | ||||
| 	__u64 ts = convert_monotonic_to_realtime(e->timestamp); | ||||
| 	time_t ts_s = ts / NS_PER_SECOND; | ||||
|  | ||||
| 	format_ip_address(e->flow.ipv, &e->flow.saddr.ip, saddr, sizeof(saddr)); | ||||
| 	format_ip_address(e->flow.ipv, &e->flow.daddr.ip, daddr, sizeof(daddr)); | ||||
| 	strftime(timestr, sizeof(timestr), "%H:%M:%S", localtime(&ts_s)); | ||||
|  | ||||
| 	printf("%llu.%06llu ms %s:%d+%s:%d\n", e->rtt / NS_PER_MS, | ||||
| 	       e->rtt % NS_PER_MS, saddr, ntohs(e->flow.saddr.port), daddr, | ||||
| 	       ntohs(e->flow.daddr.port)); | ||||
| 	printf("%s.%09llu %llu.%06llu ms %llu.%06llu ms %s:%d+%s:%d\n", timestr, | ||||
| 	       ts % NS_PER_SECOND, e->rtt / NS_PER_MS, e->rtt % NS_PER_MS, | ||||
| 	       e->min_rtt / NS_PER_MS, e->min_rtt % NS_PER_MS, saddr, | ||||
| 	       ntohs(e->flow.saddr.port), daddr, ntohs(e->flow.daddr.port)); | ||||
| } | ||||
|  | ||||
| static void handle_missed_rtt_event(void *ctx, int cpu, __u64 lost_cnt) | ||||
|   | ||||
| @@ -39,6 +39,7 @@ struct network_tuple { | ||||
| }; | ||||
|  | ||||
| struct flow_state { | ||||
| 	__u64 min_rtt; | ||||
| 	__u64 last_timestamp; | ||||
| 	__u32 last_id; | ||||
| 	__u32 reserved; | ||||
| @@ -51,6 +52,8 @@ struct packet_id { | ||||
|  | ||||
| struct rtt_event { | ||||
| 	__u64 rtt; | ||||
| 	__u64 min_rtt; | ||||
| 	__u64 timestamp; | ||||
| 	struct network_tuple flow; | ||||
| 	__u32 reserved; | ||||
| }; | ||||
|   | ||||
| @@ -147,11 +147,7 @@ static int parse_tcp_identifier(struct parsing_context *ctx, __be16 *sport, | ||||
| 		return -1; | ||||
|  | ||||
| 	// Check if connection is closing | ||||
| 	if (tcph->fin || tcph->rst) { | ||||
| 		*flow_closing = true; | ||||
| 		/* bpf_printk("Detected connection closing on %d\n", */ | ||||
| 		/* 	   ctx->is_egress); //Upsets verifier? */ | ||||
| 	} | ||||
| 	*flow_closing = tcph->rst || (!ctx->is_egress && tcph->fin); | ||||
|  | ||||
| 	// Do not timestamp pure ACKs | ||||
| 	if (ctx->is_egress && ctx->nh.pos - ctx->data >= ctx->pkt_len && | ||||
| @@ -324,6 +320,7 @@ int pping_ingress(struct xdp_md *ctx) | ||||
| 	struct packet_id p_id = { 0 }; | ||||
| 	__u64 *p_ts; | ||||
| 	struct rtt_event event = { 0 }; | ||||
| 	struct flow_state *f_state; | ||||
| 	struct parsing_context pctx = { | ||||
| 		.data = (void *)(long)ctx->data, | ||||
| 		.data_end = (void *)(long)ctx->data_end, | ||||
| @@ -332,19 +329,18 @@ int pping_ingress(struct xdp_md *ctx) | ||||
| 		.is_egress = false, | ||||
| 	}; | ||||
| 	bool flow_closing = false; | ||||
| 	__u64 now; | ||||
|  | ||||
| 	if (parse_packet_identifier(&pctx, &p_id, &flow_closing) < 0) | ||||
| 		goto out; | ||||
|  | ||||
| 	// Delete flow, but allow final attempt at RTT calculation | ||||
| 	if (flow_closing) | ||||
| 		bpf_map_delete_elem(&flow_state, &p_id.flow); | ||||
|  | ||||
| 	now = bpf_ktime_get_ns(); | ||||
| 	p_ts = bpf_map_lookup_elem(&packet_ts, &p_id); | ||||
| 	if (!p_ts) | ||||
| 		goto out; | ||||
| 	if (!p_ts || now < *p_ts) | ||||
| 		goto validflow_out; | ||||
|  | ||||
| 	event.rtt = bpf_ktime_get_ns() - *p_ts; | ||||
| 	event.rtt = now - *p_ts; | ||||
| 	event.timestamp = now; | ||||
| 	/* | ||||
| 	 * Attempt to delete timestamp entry as soon as RTT is calculated. | ||||
| 	 * But could have potential concurrency issue where multiple packets | ||||
| @@ -352,10 +348,25 @@ int pping_ingress(struct xdp_md *ctx) | ||||
| 	 */ | ||||
| 	bpf_map_delete_elem(&packet_ts, &p_id); | ||||
|  | ||||
| 	// Update flow's min-RTT, may have concurrency issues | ||||
| 	f_state = bpf_map_lookup_elem(&flow_state, &p_id.flow); | ||||
| 	if (!f_state) | ||||
| 		goto validflow_out; | ||||
|  | ||||
| 	if (f_state->min_rtt == 0 || event.rtt < f_state->min_rtt) | ||||
| 		f_state->min_rtt = event.rtt; | ||||
|  | ||||
| 	event.min_rtt = f_state->min_rtt; | ||||
|  | ||||
| 	__builtin_memcpy(&event.flow, &p_id.flow, sizeof(struct network_tuple)); | ||||
| 	bpf_perf_event_output(ctx, &rtt_events, BPF_F_CURRENT_CPU, &event, | ||||
| 			      sizeof(event)); | ||||
|  | ||||
| validflow_out: | ||||
| 	// Wait with deleting flow until having pushed final RTT message | ||||
| 	if (flow_closing) | ||||
| 		bpf_map_delete_elem(&flow_state, &p_id.flow); | ||||
|  | ||||
| out: | ||||
| 	return XDP_PASS; | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user