Merge pull request #25 from xdp-project/nat64

Add nat64 example
This commit is contained in:
Toke Høiland-Jørgensen
2021-10-12 20:24:17 +02:00
committed by GitHub
9 changed files with 1260 additions and 2 deletions

1
.gitignore vendored
View File

@@ -3,3 +3,4 @@ config.mk
compile_commands.json
*.ll
*.o
*.skel.h

2
configure vendored
View File

@@ -19,6 +19,7 @@ check_toolchain()
: ${CC=gcc}
: ${CLANG=clang}
: ${LLC=llc}
: ${BPFTOOL=bpftool}
for TOOL in $PKG_CONFIG $CC $CLANG $LLC; do
if [ ! $(command -v ${TOOL} 2>/dev/null) ]; then
@@ -39,6 +40,7 @@ check_toolchain()
echo "CC:=${CC}" >>$CONFIG
echo "CLANG:=${CLANG}" >>$CONFIG
echo "LLC:=${LLC}" >>$CONFIG
echo "BPFTOOL:=${BPFTOOL}" >>$CONFIG
}
check_elf()

View File

@@ -12,6 +12,7 @@
#
BPF_C = ${BPF_TARGETS:=.c}
BPF_OBJ = ${BPF_C:.c=.o}
BPF_SKEL = ${BPF_SKEL_OBJ:.o=.skel.h}
USER_C := ${USER_TARGETS:=.c}
USER_OBJ := ${USER_C:.c=.o}
BPF_OBJ_INSTALL ?= $(BPF_OBJ)
@@ -51,7 +52,7 @@ BPF_CFLAGS += -I$(INCLUDE_DIR) -I$(HEADER_DIR) $(EXTRA_CFLAGS)
BPF_HEADERS := $(wildcard $(HEADER_DIR)/*/*.h) $(wildcard $(INCLUDE_DIR)/*/*.h)
all: $(USER_TARGETS) $(BPF_OBJ) $(EXTRA_TARGETS)
all: $(USER_TARGETS) $(BPF_OBJ) $(EXTRA_TARGETS) $(BPF_SKEL)
.PHONY: clean
clean::
@@ -70,7 +71,7 @@ LIB_H := ${LIB_OBJS:.o=.h}
$(LIB_OBJS): %.o: %.c %.h $(LIB_H)
$(Q)$(MAKE) -C $(dir $@) $(notdir $@)
$(USER_TARGETS): %: %.c $(OBJECT_LIBBPF) $(OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(KERN_USER_H) $(EXTRA_DEPS) $(EXTRA_USER_DEPS)
$(USER_TARGETS): %: %.c $(OBJECT_LIBBPF) $(OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(KERN_USER_H) $(EXTRA_DEPS) $(EXTRA_USER_DEPS) $(BPF_SKEL)
$(QUIET_CC)$(CC) -Wall $(CFLAGS) $(LDFLAGS) -o $@ $(LIB_OBJS) \
$< $(LDLIBS)
@@ -86,6 +87,9 @@ $(BPF_OBJ): %.o: %.c $(KERN_USER_H) $(EXTRA_DEPS) $(BPF_HEADERS) $(LIBMK)
-O2 -emit-llvm -c -g -o ${@:.o=.ll} $<
$(QUIET_LLC)$(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll}
$(BPF_SKEL): %.skel.h: %.o
$(QUIET_GEN)$(BPFTOOL) gen skeleton ${@:.skel.h=.o} > $@
.PHONY: test
ifeq ($(TEST_FILE),)

2
nat64-bpf/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
nat64

13
nat64-bpf/Makefile Normal file
View File

@@ -0,0 +1,13 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)

# Userspace loader binary
USER_TARGETS := nat64
# BPF program object to build
BPF_TARGETS := nat64_kern
# Object to generate a libbpf skeleton header (nat64_kern.skel.h) from
BPF_SKEL_OBJ := nat64_kern.o

#LDLIBS += -pthread
# The loader uses libmnl for the netlink route/neighbour setup
USER_LIBS = -lmnl
# Shared header between loader and BPF program
EXTRA_DEPS += nat64.h

LIB_DIR = ../lib
include $(LIB_DIR)/common.mk

86
nat64-bpf/README.org Normal file
View File

@@ -0,0 +1,86 @@
* NAT64 BPF implementation
This directory contains a stateless NAT64 translator, like that performed by
Tayga, implemented entirely in BPF. It works by
attaching to the TC hooks of an interface and translating incoming IPv6
addresses with a destination in the configured NAT64 prefix, and routing v4
packets back out through that interface based on the (v4) prefix used for
translation.
** Running
To run the translator on =eth0= with an IPv4 prefix of =10.0.1.0/24=, allowing
translation of sources in =fc00::/8=, and using the default well-known v6 prefix
(=64:ff9b::/96=), simply issue
#+begin_src sh
sudo ./nat64 -i eth0 -4 10.0.1.0/24 -a fc00::/8
#+end_src
Run again with a =-u= parameter to unload (but make sure to also specify the
rest of the parameters as they are needed to properly clean up). To specify
another v6 prefix, use =-6=.
The userspace utility will install the necessary routing rules, and setup the
BPF programs, then exit. The translator will then keep running entirely in the
kernel until unloaded (with =-u=).
** Assumptions
The operation of this NAT64 translator makes a few assumptions:
- A single v6 NAT64 prefix is used, and the prefix length is always 96 (i.e.,
the v4 addresses live in the last four bytes). By default the well-known
prefix =64:ff9b::/96= is used.
- IPv6 source addresses are mapped into a configured IPv4 prefix one-to-one.
Regular NAT4 can be applied afterwards to map to a single public IP. A
separate v4 prefix should be used for every interface that the translator runs
on. Source address v6-to-v4 mappings are dynamically created as new sources
appear, and time out after two hours.
- An allowlist of IPv6 source prefixes that should be subject to translation is
maintained.
** How it works
Two BPF programs are attached to the ingress and egress hooks of the interface
being configured. The ingress program will process IPv6 packets, and any packet
with a destination address in the configured NAT64 prefix will be either
translated (if the source is allowed), or dropped. The egress program processes
IPv4 packets and any packet with a destination in the configured v4 prefix will
be either translated (if a v6 address is found in the state map) or dropped.
To make sure the v4 traffic makes it to the right interface, a v4-via-v6 route
is installed on that interface with a gateway address of the network address of
the v6 prefix, and a fake neighbour entry is installed to avoid the kernel doing
neighbour lookups of the gateway. This gets the packets to where the BPF program
can process them, and after translation a new neighbour lookup will be performed
with the new v6 destination.
Note that because of the place of the BPF hook in ingress processing, the
ingress BPF program will need to redirect the packet to the same interface after
translation for re-processing as an IPv4 packet. This means that things like
tcpdump will see first the original IPv6 packet, and then the translated IPv4
packet. On egress the translation happens earlier, so only the translated packet
will be seen.
** Limitations / known issues
At least the first two of these should probably be fixed before deploying this:
- The IP headers in ICMP error message payloads are not translated, which
probably breaks ICMP errors.
- The BPF programs assume the interface is an Ethernet interface, so translation
won't work on layer 3 devices (like Wireguard tunnels).
- IP options are not handled at all. In particular this means that fragmented
IPv6 packets won't pass the translator.
- The BPF programs support specifying multiple allowed source IPv6 prefixes, as
well as doing ahead-of-time static mappings, but the userspace component
doesn't support these yet.
- The userspace program also has no way to print its status, or dump the state
of the translation table. The BPF maps can be inspected with bpftool as a
stopgap measure, though.

454
nat64-bpf/nat64.c Normal file
View File

@@ -0,0 +1,454 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <net/if.h>
#include <linux/if_arp.h>
#include <getopt.h>
#include <linux/in6.h>
#include <arpa/inet.h>
#include <linux/bpf.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include <libmnl/libmnl.h>
#include <linux/rtnetlink.h>
#include "nat64.h"
#include "nat64_kern.skel.h"
#define NS_PER_SECOND 1000000000UL
#define NS_PER_MS 1000000UL

/* Command-line options; the short option letters match the getopt string
 * in parse_arguments().
 */
static const struct option long_options[] = {
	{ "help", no_argument, NULL, 'h' },
	{ "unload", no_argument, NULL, 'u' },
	{ "interface", required_argument, NULL, 'i' }, // Name of interface to run on
	{ "allowed-src", required_argument, NULL, 'a' }, // v6 prefix to allow as source
	{ "v6-prefix", required_argument, NULL, '6' }, // v6 prefix to use for nat64
	{ "v4-prefix", required_argument, NULL, '4' }, // v4 prefix to use for nat64
	{ "timeout", required_argument, NULL, 't' }, // Address mapping timeout interval in s
	{ 0, 0, NULL, 0 }
};

/* Userspace-side configuration: the kernel-shared config (struct
 * nat64_config) plus parameters only the loader needs.
 */
struct nat64_user_config {
	struct nat64_config c;      /* copied into the BPF object's .bss */
	int ifindex;                /* interface to attach to */
	char ifname[IF_NAMESIZE+1]; /* interface name; extra byte for NUL */
	struct in6_addr v6_allow;   /* allowed v6 source prefix */
	__u32 v6_allow_pxlen;       /* prefix length of v6_allow; 0 = unset */
	__u32 v4_pxlen;             /* prefix length of the v4 pool */
	bool unload;                /* true: tear down instead of loading */
};
/* Parse a "<addr>/<pxlen>" IPv6 prefix string.
 *
 * @str: the prefix string; modified in place (the '/' is replaced by NUL).
 * @v6addr: receives the parsed address on success.
 *
 * Returns the prefix length (0-128) on success, -EINVAL on malformed
 * input. FIX: the prefix length is now range-checked; previously any
 * atoi() result (e.g. 200, or garbage parsed as 0) was returned as-is.
 */
static int parse_v6_prefix(char *str, struct in6_addr *v6addr)
{
	char *net;
	int pxlen;

	net = strchr(str, '/');
	if (!net) {
		fprintf(stderr, "Invalid v6 prefix: %s\n", str);
		return -EINVAL;
	}

	pxlen = atoi(net + 1);
	if (pxlen < 0 || pxlen > 128) {
		fprintf(stderr, "Invalid v6 prefix length: %s\n", net + 1);
		return -EINVAL;
	}

	*net = '\0';
	if (inet_pton(AF_INET6, str, v6addr) != 1) {
		fprintf(stderr, "Invalid v6 addr: %s\n", str);
		return -EINVAL;
	}

	return pxlen;
}
/* Parse command-line arguments into @config.
 *
 * An interface (-i) and a v4 prefix (-4) are mandatory; the v6 prefix
 * defaults to the well-known 64:ff9b::/96 and the mapping timeout to two
 * hours. Returns 0 on success, a negative error code on invalid input.
 */
static int parse_arguments(int argc, char *argv[], struct nat64_user_config *config)
{
	struct in6_addr v6addr;
	struct in_addr v4addr;
	int pxlen, seconds;
	int err, opt;
	char *net;

	config->ifindex = 0;
	config->c.timeout_ns = 7200 * NS_PER_SECOND; /* two hours */
	config->c.next_addr = 1; /* skip the network address of the v4 pool */

	/* Default to special prefix 64:ff9b::/96 */
	config->c.v6_prefix.s6_addr[1] = 0x64;
	config->c.v6_prefix.s6_addr[2] = 0xff;
	config->c.v6_prefix.s6_addr[3] = 0x9b;

	while ((opt = getopt_long(argc, argv, "i:6:4:t:a:hu", long_options,
				  NULL)) != -1) {
		switch (opt) {
		case 'i':
			if (strlen(optarg) > IF_NAMESIZE) {
				fprintf(stderr, "interface name too long\n");
				return -EINVAL;
			}
			strncpy(config->ifname, optarg, IF_NAMESIZE);
			/* FIX: strncpy() does not guarantee termination;
			 * ifname is IF_NAMESIZE+1 bytes, so this index is
			 * always in bounds. Previously this relied on the
			 * caller zero-initialising the struct.
			 */
			config->ifname[IF_NAMESIZE] = '\0';
			config->ifindex = if_nametoindex(config->ifname);
			if (config->ifindex == 0) {
				err = -errno;
				fprintf(stderr,
					"Could not get index of interface %s: %s\n",
					config->ifname, strerror(err));
				return err;
			}
			break;
		case 'a':
			pxlen = parse_v6_prefix(optarg, &v6addr);
			if (pxlen < 0)
				return pxlen;
			config->v6_allow = v6addr;
			config->v6_allow_pxlen = pxlen;
			break;
		case '6':
			pxlen = parse_v6_prefix(optarg, &v6addr);
			if (pxlen < 0)
				return pxlen;
			if (pxlen != 96) {
				fprintf(stderr, "v6 prefix must have pxlen 96\n");
				return -EINVAL;
			}
			/* last 32 bits must be zero for a /96 network address */
			if (v6addr.s6_addr32[3]) {
				fprintf(stderr, "Not a /96 network address: %s\n", optarg);
				return -EINVAL;
			}
			config->c.v6_prefix = v6addr;
			break;
		case '4':
			net = strstr(optarg, "/");
			if (!net) {
				/* FIX: message previously said "v6" for the v4 option */
				fprintf(stderr, "Invalid v4 prefix: %s\n", optarg);
				return -EINVAL;
			}
			pxlen = atoi(net + 1);
			if (pxlen < 1 || pxlen > 31) {
				fprintf(stderr, "v4_pxlen must be between 1 and 31\n");
				return -EINVAL;
			}
			*net = '\0';
			if (inet_pton(AF_INET, optarg, &v4addr) != 1) {
				fprintf(stderr, "Invalid v4 addr: %s\n", optarg);
				return -EINVAL;
			}
			config->c.v4_mask = 0xFFFFFFFF << (32 - pxlen);
			config->v4_pxlen = pxlen;
			/* keep prefix/mask in host byte order for arithmetic */
			config->c.v4_prefix = ntohl(v4addr.s_addr);
			if (config->c.v4_prefix & ~config->c.v4_mask) {
				fprintf(stderr, "Not a network address: %s\n", optarg);
				return -EINVAL;
			}
			break;
		case 't':
			seconds = atoi(optarg);
			if (seconds < 1 || seconds > 100000) {
				fprintf(stderr, "Timeout must be in the interval between 1 and 100000 seconds\n");
				return -EINVAL;
			}
			config->c.timeout_ns = (__u64)seconds * NS_PER_SECOND;
			break;
		case 'u':
			config->unload = true;
			break;
		default:
			fprintf(stderr, "Unknown option %s\n", argv[optind]);
			return -EINVAL;
		}
	}

	if (config->ifindex == 0) {
		fprintf(stderr,
			"An interface (-i or --interface) must be provided\n");
		return -EINVAL;
	}

	if (!config->c.v4_prefix) {
		fprintf(stderr,
			"A v4 prefix (-4 or --v4-prefix) must be provided\n");
		return -EINVAL;
	}

	return 0;
}
/* Install (or remove) the v4-via-v6 onlink route that steers packets for
 * the translated v4 prefix out through the configured interface, with the
 * v6 prefix address as gateway (RTA_VIA).
 *
 * NOTE(review): despite its name this function manages the *route*; its
 * sibling do_v4_route() manages the neighbour entry. The names look
 * swapped, but every call site invokes both, so behaviour is unaffected;
 * renaming both would be a follow-up cleanup.
 */
static int do_v4_neigh(struct mnl_socket *nl, struct nat64_user_config *cfg, bool create)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh;
	uint32_t seq, portid;
	struct rtmsg *rtm;
	int ret, err = 0;
	/* RTA_VIA payload: address family followed by the gateway address */
	struct {
		__u16 family;
		struct in6_addr addr;
	} __attribute__((packed)) via = {
		.family = AF_INET6,
		.addr = cfg->c.v6_prefix
	};

	nlh = mnl_nlmsg_put_header(buf);
	if (create) {
		nlh->nlmsg_type = RTM_NEWROUTE;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
	} else {
		nlh->nlmsg_type = RTM_DELROUTE;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	}
	nlh->nlmsg_seq = seq = time(NULL);

	rtm = mnl_nlmsg_put_extra_header(nlh, sizeof(struct rtmsg));
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = cfg->v4_pxlen;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = 0;
	rtm->rtm_protocol = RTPROT_STATIC;
	rtm->rtm_table = RT_TABLE_MAIN;
	rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_flags = RTNH_F_ONLINK;

	mnl_attr_put_u32(nlh, RTA_DST, htonl(cfg->c.v4_prefix));
	mnl_attr_put_u32(nlh, RTA_OIF, cfg->ifindex);
	mnl_attr_put(nlh, RTA_VIA, sizeof(via), &via);

	portid = mnl_socket_get_portid(nl);
	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
		perror("mnl_socket_sendto");
		err = -errno;
		goto out;
	}

	ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
	if (ret < 0) {
		perror("mnl_socket_recvfrom");
		err = -errno;
		goto out;
	}

	ret = mnl_cb_run(buf, ret, seq, portid, NULL, NULL);
	if (ret < 0) {
		/* Ignore "already exists" on create and "not found" on
		 * delete; anything else is a real error.
		 * FIX: the delete branch was written '!(create && ...)',
		 * which wrongly reported ENOENT/ESRCH on delete as errors.
		 */
		if ((create && errno != EEXIST) ||
		    (!create && errno != ENOENT && errno != ESRCH))
			err = -errno;
		goto out;
	}

out:
	return err;
}
/* Install (or remove) a permanent fake neighbour entry for the v6 prefix
 * address, so the kernel never performs neighbour discovery for the
 * gateway used by the v4-via-v6 route.
 *
 * NOTE(review): despite its name this function manages the *neighbour*
 * entry; its sibling do_v4_neigh() manages the route. The names look
 * swapped, but every call site invokes both, so behaviour is unaffected.
 */
static int do_v4_route(struct mnl_socket *nl, struct nat64_user_config *cfg, bool create)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh;
	uint32_t seq, portid;
	struct ndmsg *ndm;
	int ret, err = 0;
	/* arbitrary fixed MAC for the fake neighbour entry */
	__u8 lladdr[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	nlh = mnl_nlmsg_put_header(buf);
	if (create) {
		nlh->nlmsg_type = RTM_NEWNEIGH;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
	} else {
		nlh->nlmsg_type = RTM_DELNEIGH;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	}
	nlh->nlmsg_seq = seq = time(NULL);

	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(struct ndmsg));
	ndm->ndm_family = AF_INET6;
	ndm->ndm_ifindex = cfg->ifindex;
	ndm->ndm_state = NUD_PERMANENT;
	ndm->ndm_type = 0;

	mnl_attr_put(nlh, NDA_LLADDR, sizeof(lladdr), &lladdr);
	mnl_attr_put(nlh, NDA_DST, sizeof(struct in6_addr), &cfg->c.v6_prefix);

	portid = mnl_socket_get_portid(nl);
	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
		perror("mnl_socket_sendto");
		err = -errno;
		goto out;
	}

	ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
	if (ret < 0) {
		perror("mnl_socket_recvfrom");
		err = -errno;
		goto out;
	}

	ret = mnl_cb_run(buf, ret, seq, portid, NULL, NULL);
	if (ret < 0) {
		/* Ignore EEXIST on create and ENOENT/ESRCH on delete.
		 * FIX: the delete branch was written '!(create && ...)',
		 * which wrongly reported ENOENT/ESRCH on delete as errors.
		 */
		if ((create && errno != EEXIST) ||
		    (!create && errno != ENOENT && errno != ESRCH))
			err = -errno;
		goto out;
	}

out:
	return err;
}
/* Open a netlink socket and install (create=true) or remove (false) both
 * the v4-via-v6 route and the fake neighbour entry.
 *
 * Returns 0 on success or the first negative error encountered.
 * FIX: on mnl_socket_open() failure the old code jumped to the cleanup
 * label and called mnl_socket_close(NULL), which dereferences the NULL
 * handle; now it returns directly.
 */
static int do_netlink(struct nat64_user_config *cfg, bool create)
{
	struct mnl_socket *nl;
	int err = 0;

	nl = mnl_socket_open(NETLINK_ROUTE);
	if (nl == NULL) {
		perror("mnl_socket_open");
		return -errno;
	}

	if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
		perror("mnl_socket_bind");
		err = -errno;
		goto out;
	}

	/* note: do_v4_route() installs the neighbour entry and
	 * do_v4_neigh() the route — see the comments on those functions
	 */
	err = do_v4_route(nl, cfg, create);
	err = err ?: do_v4_neigh(nl, cfg, create);

out:
	mnl_socket_close(nl);
	return err;
}
/* Detach the BPF programs and remove the routing state installed at load
 * time. All cleanup steps are always attempted.
 *
 * Returns the result of do_netlink(); NOTE(review): an error from
 * bpf_tc_hook_destroy() is printed but then overwritten by the
 * do_netlink() result.
 */
int teardown(struct nat64_user_config *cfg)
{
	/* both attach points: remove the whole clsact qdisc, not one filter */
	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook,
			    .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS,
			    .ifindex = cfg->ifindex);
	int err;

	err = bpf_tc_hook_destroy(&hook);
	if (err)
		fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", cfg->ifname);

	err = do_netlink(cfg, false);
	if (err)
		fprintf(stderr, "Couldn't remove route on %s: %s\n",
			cfg->ifname, strerror(-err));

	return err;
}
/* Load the BPF skeleton, size the maps to the v4 address pool, install
 * the allowed-source prefix, attach the ingress/egress TC programs, and
 * set up routing; with -u, tear everything down instead. The process
 * exits after setup — translation continues entirely in the kernel.
 */
int main(int argc, char *argv[])
{
	struct v6_trie_key prefix_key = {};
	struct nat64_user_config cfg = {};
	struct nat64_kern *obj;
	unsigned int num_addr;
	char buf[100];
	int err = 0;
	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS);
	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_egress);
	DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_ingress);

	err = parse_arguments(argc, argv, &cfg);
	if (err)
		return EXIT_FAILURE;

	hook.ifindex = cfg.ifindex;
	if (cfg.unload)
		return teardown(&cfg);

	obj = nat64_kern__open();
	err = libbpf_get_error(obj);
	if (err) {
		libbpf_strerror(err, buf, sizeof(buf));
		fprintf(stderr, "Couldn't open BPF skeleton: %s\n", buf);
		return err;
	}

	/* Usable pool: all addresses in the v4 prefix minus network and
	 * broadcast. NOTE(review): for a /31 prefix this underflows to a
	 * huge value — confirm whether /31 should be rejected earlier.
	 */
	num_addr = (cfg.c.v4_prefix | ~cfg.c.v4_mask) - cfg.c.v4_prefix - 2;
	obj->bss->config = cfg.c;
	bpf_map__resize(obj->maps.v6_state_map, num_addr);
	bpf_map__resize(obj->maps.v4_reversemap, num_addr);
	bpf_map__resize(obj->maps.reclaimed_addrs, num_addr);

	err = nat64_kern__load(obj);
	if (err) {
		libbpf_strerror(err, buf, sizeof(buf));
		fprintf(stderr, "Couldn't load BPF skeleton: %s\n", buf);
		goto out;
	}

	/* Insert the single allowed source prefix, if one was given */
	if (cfg.v6_allow_pxlen) {
		__u32 value = 0;

		prefix_key.t.prefixlen = cfg.v6_allow_pxlen;
		prefix_key.addr = cfg.v6_allow;
		err = bpf_map_update_elem(bpf_map__fd(obj->maps.allowed_v6_src),
					  &prefix_key, &value, 0);
		if (err) {
			fprintf(stderr, "Couldn't insert allowed prefix\n");
			goto out;
		}
	}

	attach_ingress.prog_fd = bpf_program__fd(obj->progs.nat64_ingress);
	if (attach_ingress.prog_fd < 0) {
		fprintf(stderr, "Couldn't find ingress program\n");
		err = -ENOENT;
		goto out;
	}

	attach_egress.prog_fd = bpf_program__fd(obj->progs.nat64_egress);
	if (attach_egress.prog_fd < 0) {
		fprintf(stderr, "Couldn't find egress program\n");
		err = -ENOENT;
		goto out;
	}

	/* Create the clsact hook once (tolerating a pre-existing one),
	 * then attach each program at its own attach point.
	 */
	err = bpf_tc_hook_create(&hook);
	if (err && err != -EEXIST) {
		fprintf(stderr, "Couldn't create ingress hook for ifindex %d\n", cfg.ifindex);
		goto out;
	}

	hook.attach_point = BPF_TC_INGRESS;
	err = bpf_tc_attach(&hook, &attach_ingress);
	if (err) {
		fprintf(stderr, "Couldn't attach ingress program to ifindex %d\n",
			hook.ifindex);
		goto out;
	}

	hook.attach_point = BPF_TC_EGRESS;
	err = bpf_tc_attach(&hook, &attach_egress);
	if (err) {
		fprintf(stderr, "Couldn't attach egress program to ifindex %d\n",
			hook.ifindex);
		goto out;
	}

	/* Routing setup last; roll back everything if it fails */
	err = do_netlink(&cfg, true);
	if (err) {
		fprintf(stderr, "Couldn't create route: %s\n", strerror(-err));
		err = teardown(&cfg);
	}

out:
	nat64_kern__destroy(obj);
	return err;
}

25
nat64-bpf/nat64.h Normal file
View File

@@ -0,0 +1,25 @@
#ifndef __NAT64_H__
#define __NAT64_H__

#include <linux/in6.h>

/* Runtime configuration shared between the userspace loader and the BPF
 * programs (the loader copies it into the BPF object's .bss).
 */
struct nat64_config {
	struct in6_addr v6_prefix; /* NAT64 v6 prefix; prefix length is always 96 */
	__u64 timeout_ns;          /* idle timeout for dynamic address mappings */
	__u64 next_addr;           /* next unallocated offset into the v4 pool */
	__u32 v4_prefix;           /* v4 translation prefix (host byte order) */
	__u32 v4_mask;             /* netmask matching v4_prefix (host byte order) */
};

/* Per-source mapping state: which v4 address a v6 source is mapped to. */
struct v6_addr_state {
	__u64 last_seen;   /* bpf_ktime_get_ns() timestamp of last packet */
	__u32 v4_addr;     /* allocated v4 address (host byte order) */
	__u32 static_conf; /* non-zero: static mapping, never reclaimed */
};

/* Key for the LPM trie of allowed v6 source prefixes.
 * NOTE(review): struct bpf_lpm_trie_key comes from <linux/bpf.h>, which
 * this header does not include itself — users must include it first.
 */
struct v6_trie_key {
	struct bpf_lpm_trie_key t;
	struct in6_addr addr;
};

#endif

671
nat64-bpf/nat64_kern.c Normal file
View File

@@ -0,0 +1,671 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2021 Toke Høiland-Jørgensen <toke@toke.dk> */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/compiler.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <stdbool.h>
#include "../include/xdp/parsing_helpers.h"
#include "nat64.h"
char _license[] SEC("license") = "GPL";

/* Global configuration; written into .bss by the userspace loader before
 * the object is loaded.
 */
struct nat64_config config;

/* v6 source address -> allocated v4 address + bookkeeping.
 * max_entries is a placeholder; the loader resizes the dynamic maps to
 * the size of the v4 address pool before load.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, struct in6_addr);
	__type(value, struct v6_addr_state);
	__uint(max_entries, 1);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} v6_state_map SEC(".maps");

/* v4 address (host byte order) -> v6 source it maps back to */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u32);
	__type(value, struct in6_addr);
	__uint(max_entries, 1);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} v4_reversemap SEC(".maps");

/* LPM trie of v6 source prefixes allowed to use the translator.
 * FIX: the key used by both the BPF lookup and the userspace update is
 * struct v6_trie_key (32-bit prefix length + 16-byte address), but
 * key_size was sizeof(struct in6_addr), which truncated the last four
 * address bytes out of the key.
 */
struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__uint(key_size, sizeof(struct v6_trie_key));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} allowed_v6_src SEC(".maps");

/* FIFO of v4 addresses freed by expired mappings, ready for reuse */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(key_size, 0);
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} reclaimed_addrs SEC(".maps");

#ifdef DEBUG
#define DBG(fmt, ...)						\
({								\
	char ____fmt[] = "nat64: " fmt;				\
	bpf_trace_printk(____fmt, sizeof(____fmt),		\
			 ##__VA_ARGS__);			\
})
#else
/* non-debug build: DBG(args) degrades to a harmless comma expression */
#define DBG
#endif

/* Pseudo header used for the ICMPv6 checksum (RFC 2460 section 8.1) */
struct icmpv6_pseudo {
	struct in6_addr saddr;
	struct in6_addr daddr;
	__u32 len;
	__u8 padding[3];
	__u8 nh;
} __attribute__((packed));
/* Incrementally update the ICMP(v6) checksum after the header has been
 * rewritten in place.
 *
 * @skb:         packet being rewritten
 * @ip6h:        v6 header the pseudo header is built from (the new header
 *               for v4->v6 translation, the old one for v6->v4)
 * @icmp_before: copy of the ICMP(v6) header taken before rewriting
 * @icmp_after:  pointer to the rewritten header inside the packet
 * @add:         true when translating to ICMPv6 (pseudo header added to
 *               the checksum), false when translating to ICMPv4
 */
static __always_inline void update_icmp_checksum(struct __sk_buff *skb,
						 struct ipv6hdr *ip6h,
						 void *icmp_before,
						 void *icmp_after,
						 bool add)
{
	void *data = (void *)(unsigned long long)skb->data;
	struct icmpv6_pseudo ph = {
		.nh = IPPROTO_ICMPV6,
		.saddr = ip6h->saddr,
		.daddr = ip6h->daddr,
		.len = ip6h->payload_len
	};
	__u16 h_before, h_after, offset;
	__u32 csum, u_before, u_after;

	/* Do checksum update in two passes: first compute the incremental
	 * checksum update of the ICMPv6 pseudo header, update the checksum
	 * using bpf_l4_csum_replace(), and then do a separate update for the
	 * ICMP type and code (which is two consecutive bytes, so cast them to
	 * u16). The bpf_csum_diff() helper can be used to compute the
	 * incremental update of the full block, whereas the
	 * bpf_l4_csum_replace() helper can do the two-byte diff and update by
	 * itself.
	 */
	csum = bpf_csum_diff((__be32 *)&ph, add ? 0 : sizeof(ph),
			     (__be32 *)&ph, add ? sizeof(ph) : 0,
			     0);
	/* checksum field lives 2 bytes into the ICMP(v6) header */
	offset = ((void *)icmp_after - data) + 2;

	/* first two bytes of ICMP header, type and code */
	h_before = *(__u16 *)icmp_before;
	h_after = *(__u16 *)icmp_after;
	/* last four bytes of ICMP header, the data union */
	u_before = *(__u32 *)(icmp_before + 4);
	u_after = *(__u32 *)(icmp_after + 4);

	bpf_l4_csum_replace(skb, offset, 0, csum, BPF_F_PSEUDO_HDR);
	bpf_l4_csum_replace(skb, offset, h_before, h_after, 2);
	if (u_before != u_after)
		bpf_l4_csum_replace(skb, offset, u_before, u_after, 4);
}
/* Translate an ICMPv4 header into ICMPv6 in place (v4->v6 direction),
 * per RFC 6145 section 4.2, and update the checksum incrementally.
 *
 * Returns 0 on success, -1 when the message cannot be translated (the
 * caller drops the packet).
 */
static int rewrite_icmp(struct iphdr *iph, struct ipv6hdr *ip6h, struct __sk_buff *skb)
{
	void *data_end = (void *)(unsigned long long)skb->data_end;
	struct icmphdr old_icmp, *icmp = (void *)(iph + 1);
	struct icmp6hdr icmp6, *new_icmp6;
	__u32 mtu;

	if (icmp + 1 > data_end)
		return -1;

	old_icmp = *icmp;
	new_icmp6 = (void *)icmp;
	icmp6 = *new_icmp6;

	/* These translations are defined in RFC6145 section 4.2 */
	switch (icmp->type) {
	case ICMP_ECHO:
		icmp6.icmp6_type = ICMPV6_ECHO_REQUEST;
		break;
	case ICMP_ECHOREPLY:
		icmp6.icmp6_type = ICMPV6_ECHO_REPLY;
		break;
	case ICMP_DEST_UNREACH:
		icmp6.icmp6_type = ICMPV6_DEST_UNREACH;
		switch (icmp->code) {
		case ICMP_NET_UNREACH:
		case ICMP_HOST_UNREACH:
		case ICMP_SR_FAILED:
		case ICMP_NET_UNKNOWN:
		case ICMP_HOST_UNKNOWN:
		case ICMP_HOST_ISOLATED:
		case ICMP_NET_UNR_TOS:
		case ICMP_HOST_UNR_TOS:
			icmp6.icmp6_code = ICMPV6_NOROUTE;
			break;
		case ICMP_PROT_UNREACH:
			icmp6.icmp6_type = ICMPV6_PARAMPROB;
			icmp6.icmp6_code = ICMPV6_UNK_NEXTHDR;
			icmp6.icmp6_pointer = bpf_htonl(offsetof(struct ipv6hdr, nexthdr));
			/* FIX: missing break clobbered the code with PORT_UNREACH */
			break;
		case ICMP_PORT_UNREACH:
			icmp6.icmp6_code = ICMPV6_PORT_UNREACH;
			break;
		case ICMP_FRAG_NEEDED:
			icmp6.icmp6_type = ICMPV6_PKT_TOOBIG;
			icmp6.icmp6_code = 0;
			/* adjust MTU for the 20 bytes of extra v6 header */
			mtu = bpf_ntohs(icmp->un.frag.mtu) + 20;
			/* RFC6145 section 6, "second approach" - should not be
			 * necessary, but might as well do this
			 */
			if (mtu < 1280)
				mtu = 1280;
			icmp6.icmp6_mtu = bpf_htonl(mtu);
			/* FIX: missing break fell through to ADM_PROHIBITED
			 * and then into default, dropping the packet
			 */
			break;
		case ICMP_NET_ANO:
		case ICMP_HOST_ANO:
		case ICMP_PKT_FILTERED:
		case ICMP_PREC_CUTOFF:
			icmp6.icmp6_code = ICMPV6_ADM_PROHIBITED;
			/* FIX: missing break fell into default and dropped */
			break;
		default:
			return -1;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		/* FIX: RFC6145 maps Time Exceeded to its v6 equivalent with
		 * the code unchanged; these packets were previously dropped.
		 */
		icmp6.icmp6_type = ICMPV6_TIME_EXCEED;
		break;
	case ICMP_PARAMETERPROB:
		/* code 1 (missing required option) has no v6 equivalent */
		if (icmp->code == 1)
			return -1;
		icmp6.icmp6_type = ICMPV6_PARAMPROB;
		icmp6.icmp6_code = ICMPV6_HDR_FIELD;
		/* The pointer field not defined in the Linux header. This
		 * translation is from Figure 3 of RFC6145.
		 */
		switch (icmp->un.reserved[0]) {
		case 0: /* version/IHL */
			icmp6.icmp6_pointer = 0;
			break;
		case 1: /* Type of Service */
			icmp6.icmp6_pointer = bpf_htonl(1);
			break;
		case 2: /* Total length */
		case 3:
			icmp6.icmp6_pointer = bpf_htonl(4);
			break;
		case 8: /* Time to Live */
			icmp6.icmp6_pointer = bpf_htonl(7);
			break;
		case 9: /* Protocol */
			icmp6.icmp6_pointer = bpf_htonl(6);
			break;
		case 12: /* Source address */
		case 13:
		case 14:
		case 15:
			icmp6.icmp6_pointer = bpf_htonl(8);
			break;
		case 16: /* Destination address */
		case 17:
		case 18:
		case 19:
			icmp6.icmp6_pointer = bpf_htonl(24);
			break;
		default:
			return -1;
		}
		/* FIX: missing break fell into default and dropped */
		break;
	default:
		return -1;
	}

	*new_icmp6 = icmp6;
	update_icmp_checksum(skb, ip6h, &old_icmp, new_icmp6, true);

	/* FIXME: also need to rewrite IP header embedded in ICMP error */
	return 0;
}
/* Egress path: translate an IPv4 packet whose destination lies inside
 * the configured v4 prefix back into IPv6, using the reverse map to find
 * the original v6 destination, then re-send it via a neighbour lookup.
 *
 * Returns TC_ACT_OK for packets outside the prefix, TC_ACT_SHOT for
 * packets inside it that cannot be translated, or the result of
 * bpf_redirect_neigh() on success.
 */
static int nat64_handle_v4(struct __sk_buff *skb, struct hdr_cursor *nh)
{
	void *data_end = (void *)(unsigned long long)skb->data_end;
	void *data = (void *)(unsigned long long)skb->data;
	int ip_type, iphdr_len, ip_offset;
	struct in6_addr *dst_v6;
	struct ipv6hdr *ip6h;
	int ret = TC_ACT_OK;
	struct iphdr *iph;
	struct ethhdr *eth;
	__u32 dst_v4;
	struct ipv6hdr dst_hdr = {
		.version = 6,
		.saddr = config.v6_prefix,
	};

	/* offset of the IP header from the packet start; masked, presumably
	 * to bound the value for the verifier — TODO confirm
	 */
	ip_offset = (nh->pos - data) & 0x1fff;

	ip_type = parse_iphdr(nh, data_end, &iph);
	if (ip_type < 0)
		goto out;

	dst_v4 = bpf_ntohl(iph->daddr);
	if ((dst_v4 & config.v4_mask) != config.v4_prefix)
		goto out;

	/* At this point we know the destination IP is within the configured
	 * subnet, so if we can't rewrite the packet it should be dropped (so as
	 * not to leak traffic in that subnet).
	 */
	ret = TC_ACT_SHOT;

	/* we don't bother dealing with IP options or fragmented packets. The
	 * latter are identified by the 'frag_off' field having a value (either
	 * the MF bit, or the fragment offset, or both). However, this field also
	 * contains the "don't fragment" (DF) bit, which we ignore, so mask that
	 * out. The DF is the second-most-significant bit (as bit 0 is
	 * reserved).
	 */
	iphdr_len = iph->ihl * 4;
	if (iphdr_len != sizeof(struct iphdr) ||
	    (iph->frag_off & ~bpf_htons(1<<14))) {
		DBG("v4: pkt src/dst %pI4/%pI4 has IP options or is fragmented, dropping\n",
		    &iph->daddr, &iph->saddr);
		goto out;
	}

	dst_v6 = bpf_map_lookup_elem(&v4_reversemap, &dst_v4);
	if (!dst_v6) {
		DBG("v4: no mapping found for dst %pI4\n", &iph->daddr);
		goto out;
	}
	DBG("v4: Found mapping for dst %pI4 to %pI6c\n", &iph->daddr, dst_v6);

	// src v4 as last octet of nat64 address
	dst_hdr.saddr.s6_addr32[3] = iph->saddr;
	dst_hdr.daddr = *dst_v6;
	dst_hdr.nexthdr = iph->protocol;
	dst_hdr.hop_limit = iph->ttl;
	/* weird definition in ipv6hdr
	 * NOTE(review): 0x70 keeps only 3 of the 4 high TOS bits; 0xF0 would
	 * preserve the full traffic class — confirm intent
	 */
	dst_hdr.priority = (iph->tos & 0x70) >> 4;
	dst_hdr.flow_lbl[0] = iph->tos << 4;
	dst_hdr.payload_len = bpf_htons(bpf_ntohs(iph->tot_len) - iphdr_len);

	if (dst_hdr.nexthdr == IPPROTO_ICMP) {
		if (rewrite_icmp(iph, &dst_hdr, skb))
			goto out;
		dst_hdr.nexthdr = IPPROTO_ICMPV6;
	}

	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
		goto out;

	/* packet pointers are invalidated by bpf_skb_change_proto(); reload */
	data = (void *)(unsigned long long)skb->data;
	data_end = (void *)(unsigned long long)skb->data_end;

	eth = data;
	ip6h = data + ip_offset;
	if (eth + 1 > data_end || ip6h + 1 > data_end)
		goto out;

	eth->h_proto = bpf_htons(ETH_P_IPV6);
	*ip6h = dst_hdr;

	/* forward with a kernel neighbour lookup on the new v6 destination */
	ret = bpf_redirect_neigh(skb->ifindex, NULL, 0, 0);
out:
	return ret;
}
/* bpf_for_each_map_elem() callback: expire one idle, dynamically created
 * mapping. @ctx points to the expiry threshold (__u64 timestamp); entries
 * last seen before it and not statically configured are removed from both
 * maps and their v4 address pushed onto the reclaimed_addrs queue.
 * Returning 1 stops the iteration after the first reclaimed entry.
 */
static long check_item(struct bpf_map *map, const void *key, void *value, void *ctx)
{
	struct v6_addr_state *state = value;
	__u64 timeout = *((__u64 *)ctx);

	if (state->last_seen < timeout && !state->static_conf) {
		__u32 v4_addr = state->v4_addr;

		bpf_map_delete_elem(map, key);
		bpf_map_delete_elem(&v4_reversemap, &v4_addr);
		bpf_map_push_elem(&reclaimed_addrs, &v4_addr, 0);
		/* only reclaim one address at a time, so mappings don't expire
		 * until they absolutely have to
		 */
		return 1;
	}
	return 0;
}
/* Return a free v4 address from the pool, or 0 if none is available.
 * First try the queue of previously reclaimed addresses; if it is empty,
 * walk the state map to expire one idle mapping and retry the queue.
 */
static __u32 reclaim_v4_addr(void)
{
	__u64 timeout = bpf_ktime_get_ns() - config.timeout_ns;
	__u32 src_v4;

	if (bpf_map_pop_elem(&reclaimed_addrs, &src_v4) == 0)
		return src_v4;

	bpf_for_each_map_elem(&v6_state_map, check_item, &timeout, 0);

	/* bpf_map_pop_elem() returns 0 on success, so this yields the
	 * reclaimed address, or 0 if nothing could be expired
	 */
	return bpf_map_pop_elem(&reclaimed_addrs, &src_v4) ? 0 : src_v4;
}
/* Allocate a v4 address for the v6 source @src_v6 and record the mapping
 * in both direction maps.
 *
 * Fresh addresses come from the monotonically increasing config.next_addr
 * offset, claimed with a compare-and-swap so concurrent CPUs don't hand
 * out the same address; once the pool is exhausted, an idle mapping is
 * reclaimed instead. Returns the new state entry, or NULL if no address
 * could be allocated.
 */
static struct v6_addr_state *alloc_new_state(struct in6_addr *src_v6)
{
	struct v6_addr_state new_v6_state = { .last_seen = bpf_ktime_get_ns() };
	__u32 max_v4 = (config.v4_prefix | ~config.v4_mask) - 1; /* below broadcast */
	__u32 src_v4 = 0;
	int i;

	/* bounded retry loop around the CAS */
	for (i = 0; i < 10; i++) {
		__u32 next_v4, next_addr;

		/* atomic read of the shared counter */
		next_addr = __sync_fetch_and_add(&config.next_addr, 0);
		next_v4 = config.v4_prefix + next_addr;
		if (next_v4 >= max_v4) {
			/* pool exhausted: try to expire an idle mapping */
			src_v4 = reclaim_v4_addr();
			break;
		}

		if (__sync_val_compare_and_swap(&config.next_addr,
						next_addr,
						next_addr + 1) == next_addr) {
			src_v4 = next_v4;
			break;
		}
	}

	/* If src_v4 is 0 here, we failed to find an available addr */
	if (!src_v4)
		return NULL;

	new_v6_state.v4_addr = src_v4;
	if (bpf_map_update_elem(&v6_state_map, src_v6, &new_v6_state, BPF_NOEXIST))
		goto err;
	if (bpf_map_update_elem(&v4_reversemap, &src_v4, src_v6, BPF_NOEXIST))
		goto err_v4;

	return bpf_map_lookup_elem(&v6_state_map, src_v6);

err_v4:
	bpf_map_delete_elem(&v6_state_map, src_v6);
err:
	/* failed to insert entry in maps, put the address back in the queue for
	 * reclaiming
	 */
	bpf_map_push_elem(&reclaimed_addrs, &src_v4, 0);
	return NULL;
}
/* Compare two v6 addresses word by word; returns -1/0/1 like memcmp.
 * The 32-bit words are compared in host byte order, so the sign of the
 * result is only meaningful as an equality test on little-endian hosts.
 */
static int cmp_v6addr(struct in6_addr *a, struct in6_addr *b)
{
	int word;

	for (word = 0; word < 4; word++) {
		__u32 wa = a->s6_addr32[word];
		__u32 wb = b->s6_addr32[word];

		if (wa != wb)
			return wa < wb ? -1 : 1;
	}
	return 0;
}
/* Fold a 32-bit checksum accumulator down to 16 bits and return its
 * one's complement, as required for the IP header checksum field.
 */
static __always_inline __u16 csum_fold_helper(__u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum += csum >> 16; /* fold any carry from the first addition */
	return ~csum;
}
/* Translate an ICMPv6 header into ICMPv4 in place (v6->v4 direction),
 * per RFC 6145 section 5.2, and update the checksum incrementally.
 *
 * Returns 0 on success, -1 when the message cannot be translated (the
 * caller drops the packet).
 */
static int rewrite_icmpv6(struct ipv6hdr *ip6h, struct __sk_buff *skb)
{
	void *data_end = (void *)(unsigned long long)skb->data_end;
	struct icmp6hdr old_icmp6, *icmp6 = (void *)(ip6h + 1);
	struct icmphdr icmp, *new_icmp;
	__u32 mtu, ptr;

	if (icmp6 + 1 > data_end)
		return -1;

	old_icmp6 = *icmp6;
	new_icmp = (void *)icmp6;
	icmp = *new_icmp;

	/* These translations are defined in RFC6145 section 5.2 */
	switch (icmp6->icmp6_type) {
	case ICMPV6_ECHO_REQUEST:
		icmp.type = ICMP_ECHO;
		break;
	case ICMPV6_ECHO_REPLY:
		icmp.type = ICMP_ECHOREPLY;
		break;
	case ICMPV6_DEST_UNREACH:
		icmp.type = ICMP_DEST_UNREACH;
		switch(icmp6->icmp6_code) {
		case ICMPV6_NOROUTE:
		case ICMPV6_NOT_NEIGHBOUR:
		case ICMPV6_ADDR_UNREACH:
			icmp.code = ICMP_HOST_UNREACH;
			break;
		case ICMPV6_ADM_PROHIBITED:
			icmp.code = ICMP_HOST_ANO;
			break;
		case ICMPV6_PORT_UNREACH:
			icmp.code = ICMP_PORT_UNREACH;
			break;
		default:
			return -1;
		}
		break;
	case ICMPV6_PKT_TOOBIG:
		icmp.type = ICMP_DEST_UNREACH;
		icmp.code = ICMP_FRAG_NEEDED;
		/* adjust for the 20 bytes of extra v6 header size */
		mtu = bpf_htonl(icmp6->icmp6_mtu) - 20;
		/* the v4 MTU field is only 16 bits wide */
		if (mtu > 0xffff)
			return -1;
		icmp.un.frag.mtu = bpf_htons(mtu);
		break;
	case ICMPV6_TIME_EXCEED:
		/* code is carried over unchanged */
		icmp.type = ICMP_TIME_EXCEEDED;
		break;
	case ICMPV6_PARAMPROB:
		switch (icmp6->icmp6_code) {
		case 0:
			icmp.type = ICMP_PARAMETERPROB;
			icmp.code = 0;
			break;
		case 1:
			icmp.type = ICMP_DEST_UNREACH;
			icmp.code = ICMP_PROT_UNREACH;
			ptr = bpf_ntohl(icmp6->icmp6_pointer);
			/* Figure 6 in RFC6145 - using if statements b/c of
			 * range at the bottom
			 */
			if (ptr == 0 || ptr == 1)
				icmp.un.reserved[0] = ptr;
			else if (ptr == 4 || ptr == 5)
				icmp.un.reserved[0] = 2;
			else if (ptr == 6)
				icmp.un.reserved[0] = 9;
			else if (ptr == 7)
				icmp.un.reserved[0] = 8;
			else if (ptr >= 8 && ptr <= 23)
				icmp.un.reserved[0] = 12;
			else if (ptr >= 24 && ptr <= 39)
				icmp.un.reserved[0] = 16;
			else
				return -1;
			break;
		default:
			return -1;
		}
		break;
	default:
		return -1;
	}

	*new_icmp = icmp;
	update_icmp_checksum(skb, ip6h, &old_icmp6, new_icmp, false);

	/* FIXME: also need to rewrite IP header embedded in ICMP error */
	return 0;
}
/* Ingress path: translate an IPv6 packet whose destination lies inside
 * the NAT64 prefix into IPv4, allocating (or refreshing) the v4 source
 * mapping for the v6 source, then redirect it back into the same
 * interface for re-processing as v4.
 *
 * Returns TC_ACT_OK for packets outside the prefix, TC_ACT_SHOT for
 * packets inside it that cannot be translated, or the result of
 * bpf_redirect() on success.
 */
static int nat64_handle_v6(struct __sk_buff *skb, struct hdr_cursor *nh)
{
	void *data_end = (void *)(unsigned long long)skb->data_end;
	void *data = (void *)(unsigned long long)skb->data;
	struct v6_trie_key saddr_key = { .t.prefixlen = 128 };
	struct in6_addr *dst_v6, subnet_v6 = {};
	__u32 *allowval, src_v4, dst_v4;
	int ip_type, ip_offset;
	struct ipv6hdr *ip6h;
	int ret = TC_ACT_OK;
	struct ethhdr *eth;
	struct iphdr *iph;
	struct v6_addr_state *v6_state;
	struct iphdr dst_hdr = {
		.version = 4,
		.ihl = 5,
		.frag_off = bpf_htons(1<<14), /* set Don't Fragment bit */
	};

	/* offset of the IP header from the packet start; masked, presumably
	 * to bound the value for the verifier — TODO confirm
	 */
	ip_offset = (nh->pos - data) & 0x1fff;

	ip_type = parse_ip6hdr(nh, data_end, &ip6h);
	if (ip_type < 0)
		goto out;

	dst_v6 = &ip6h->daddr;
	subnet_v6 = *dst_v6;
	/* v6 pxlen is always 96 */
	subnet_v6.s6_addr32[3] = 0;
	if (cmp_v6addr(&subnet_v6, &config.v6_prefix)) {
		DBG("v6: dst subnet %pI6c not in configured prefix %pI6c\n",
		    &subnet_v6, &config.v6_prefix);
		goto out;
	}

	/* At this point we know the destination IP is within the configured
	 * subnet, so if we can't rewrite the packet it should be dropped (so as
	 * not to leak traffic in that subnet).
	 */
	ret = TC_ACT_SHOT;

	/* drop packets with IP options - parser skips options */
	if (ip_type != ip6h->nexthdr) {
		DBG("v6: dropping packet with IP options from %pI6c\n",
		    &ip6h->saddr);
		goto out;
	}

	/* drop a few special addresses */
	dst_v4 = ip6h->daddr.s6_addr32[3];
	if (!dst_v4 || /* 0.0.0.0 */
	    (dst_v4 & bpf_htonl(0xFF000000)) == bpf_htonl(0x7F000000) || /* 127.x.x.x */
	    (dst_v4 & bpf_htonl(0xF0000000)) == bpf_htonl(0xe0000000)) { /* multicast */
		DBG("v6: dropping invalid v4 dst %pI4 from %pI6c\n",
		    &dst_v4, &ip6h->saddr);
		goto out;
	}

	/* only translate sources on the allowlist (exact /128 LPM lookup) */
	saddr_key.addr = ip6h->saddr;
	allowval = bpf_map_lookup_elem(&allowed_v6_src, &saddr_key);
	if (!allowval) {
		DBG("v6: saddr %pI6c not in allowed src\n", &ip6h->saddr);
		goto out;
	}

	v6_state = bpf_map_lookup_elem(&v6_state_map, &ip6h->saddr);
	if (!v6_state) {
		v6_state = alloc_new_state(&ip6h->saddr);
		if (!v6_state) {
			DBG("v6: failed to allocate state for src %pI6c\n",
			    &ip6h->saddr);
			goto out;
		}
		src_v4 = bpf_htonl(v6_state->v4_addr);
		DBG("v6: created new state for v6 %pI6c -> %pI4\n",
		    &ip6h->saddr, &src_v4);
	} else {
		/* refresh the idle timer on the existing mapping */
		v6_state->last_seen = bpf_ktime_get_ns();
		bpf_map_update_elem(&v6_state_map, &ip6h->saddr, v6_state, BPF_EXIST);
		src_v4 = bpf_htonl(v6_state->v4_addr);
		DBG("v6: updated old state for v6 %pI6c -> %pI4\n",
		    &ip6h->saddr, &src_v4);
	}

	/* v4 dst is the last four bytes of the NAT64-prefixed v6 address */
	dst_hdr.daddr = dst_v4;
	dst_hdr.saddr = bpf_htonl(v6_state->v4_addr);
	dst_hdr.protocol = ip6h->nexthdr;
	dst_hdr.ttl = ip6h->hop_limit;
	/* rebuild v4 TOS from the v6 traffic class split across two fields */
	dst_hdr.tos = ip6h->priority << 4 | (ip6h->flow_lbl[0] >> 4);
	dst_hdr.tot_len = bpf_htons(bpf_ntohs(ip6h->payload_len) + sizeof(dst_hdr));

	if (dst_hdr.protocol == IPPROTO_ICMPV6) {
		if (rewrite_icmpv6(ip6h, skb))
			goto out;
		dst_hdr.protocol = IPPROTO_ICMP;
	}

	/* v4 header checksum computed over the newly built header */
	dst_hdr.check = csum_fold_helper(bpf_csum_diff((__be32 *)&dst_hdr, 0,
						       (__be32 *)&dst_hdr, sizeof(dst_hdr),
						       0));

	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
		goto out;

	/* packet pointers are invalidated by bpf_skb_change_proto(); reload */
	data = (void *)(unsigned long long)skb->data;
	data_end = (void *)(unsigned long long)skb->data_end;

	eth = data;
	iph = data + ip_offset;
	if (eth + 1 > data_end || iph + 1 > data_end)
		goto out;

	eth->h_proto = bpf_htons(ETH_P_IP);
	*iph = dst_hdr;

	/* loop the packet back into the same interface as v4 */
	ret = bpf_redirect(skb->ifindex, BPF_F_INGRESS);
out:
	return ret;
}
/* Dispatch a packet to the right translator: IPv4 packets are handled on
 * egress, IPv6 packets on ingress; anything else passes through untouched.
 */
static int nat64_handler(struct __sk_buff *skb, bool egress)
{
	void *data = (void *)(unsigned long long)skb->data;
	void *data_end = (void *)(unsigned long long)skb->data_end;
	struct hdr_cursor nh = { .pos = data };
	struct ethhdr *eth;
	int proto;

	/* Parse the Ethernet header to find the L3 protocol */
	proto = parse_ethhdr(&nh, data_end, &eth);

	if (egress) {
		if (proto == bpf_htons(ETH_P_IP))
			return nat64_handle_v4(skb, &nh);
	} else {
		if (proto == bpf_htons(ETH_P_IPV6))
			return nat64_handle_v6(skb, &nh);
	}

	return TC_ACT_OK;
}
/* TC egress entry point: translates v4 packets for the pool back to v6 */
SEC("classifier")
int nat64_egress(struct __sk_buff *skb)
{
	return nat64_handler(skb, true);
}

/* TC ingress entry point: translates v6 packets in the NAT64 prefix to v4 */
SEC("classifier")
int nat64_ingress(struct __sk_buff *skb)
{
	return nat64_handler(skb, false);
}