mirror of
https://github.com/xdp-project/bpf-examples.git
synced 2024-05-06 15:54:53 +00:00
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,3 +3,4 @@ config.mk
|
||||
compile_commands.json
|
||||
*.ll
|
||||
*.o
|
||||
*.skel.h
|
||||
|
2
configure
vendored
2
configure
vendored
@@ -19,6 +19,7 @@ check_toolchain()
|
||||
: ${CC=gcc}
|
||||
: ${CLANG=clang}
|
||||
: ${LLC=llc}
|
||||
: ${BPFTOOL=bpftool}
|
||||
|
||||
for TOOL in $PKG_CONFIG $CC $CLANG $LLC; do
|
||||
if [ ! $(command -v ${TOOL} 2>/dev/null) ]; then
|
||||
@@ -39,6 +40,7 @@ check_toolchain()
|
||||
echo "CC:=${CC}" >>$CONFIG
|
||||
echo "CLANG:=${CLANG}" >>$CONFIG
|
||||
echo "LLC:=${LLC}" >>$CONFIG
|
||||
echo "BPFTOOL:=${BPFTOOL}" >>$CONFIG
|
||||
}
|
||||
|
||||
check_elf()
|
||||
|
@@ -12,6 +12,7 @@
|
||||
#
|
||||
BPF_C = ${BPF_TARGETS:=.c}
|
||||
BPF_OBJ = ${BPF_C:.c=.o}
|
||||
BPF_SKEL = ${BPF_SKEL_OBJ:.o=.skel.h}
|
||||
USER_C := ${USER_TARGETS:=.c}
|
||||
USER_OBJ := ${USER_C:.c=.o}
|
||||
BPF_OBJ_INSTALL ?= $(BPF_OBJ)
|
||||
@@ -51,7 +52,7 @@ BPF_CFLAGS += -I$(INCLUDE_DIR) -I$(HEADER_DIR) $(EXTRA_CFLAGS)
|
||||
|
||||
BPF_HEADERS := $(wildcard $(HEADER_DIR)/*/*.h) $(wildcard $(INCLUDE_DIR)/*/*.h)
|
||||
|
||||
all: $(USER_TARGETS) $(BPF_OBJ) $(EXTRA_TARGETS)
|
||||
all: $(USER_TARGETS) $(BPF_OBJ) $(EXTRA_TARGETS) $(BPF_SKEL)
|
||||
|
||||
.PHONY: clean
|
||||
clean::
|
||||
@@ -70,7 +71,7 @@ LIB_H := ${LIB_OBJS:.o=.h}
|
||||
$(LIB_OBJS): %.o: %.c %.h $(LIB_H)
|
||||
$(Q)$(MAKE) -C $(dir $@) $(notdir $@)
|
||||
|
||||
$(USER_TARGETS): %: %.c $(OBJECT_LIBBPF) $(OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(KERN_USER_H) $(EXTRA_DEPS) $(EXTRA_USER_DEPS)
|
||||
$(USER_TARGETS): %: %.c $(OBJECT_LIBBPF) $(OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(KERN_USER_H) $(EXTRA_DEPS) $(EXTRA_USER_DEPS) $(BPF_SKEL)
|
||||
$(QUIET_CC)$(CC) -Wall $(CFLAGS) $(LDFLAGS) -o $@ $(LIB_OBJS) \
|
||||
$< $(LDLIBS)
|
||||
|
||||
@@ -86,6 +87,9 @@ $(BPF_OBJ): %.o: %.c $(KERN_USER_H) $(EXTRA_DEPS) $(BPF_HEADERS) $(LIBMK)
|
||||
-O2 -emit-llvm -c -g -o ${@:.o=.ll} $<
|
||||
$(QUIET_LLC)$(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll}
|
||||
|
||||
$(BPF_SKEL): %.skel.h: %.o
|
||||
$(QUIET_GEN)$(BPFTOOL) gen skeleton ${@:.skel.h=.o} > $@
|
||||
|
||||
|
||||
.PHONY: test
|
||||
ifeq ($(TEST_FILE),)
|
||||
|
2
nat64-bpf/.gitignore
vendored
Normal file
2
nat64-bpf/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
nat64
|
||||
|
13
nat64-bpf/Makefile
Normal file
13
nat64-bpf/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
|
||||
|
||||
USER_TARGETS := nat64
|
||||
BPF_TARGETS := nat64_kern
|
||||
BPF_SKEL_OBJ := nat64_kern.o
|
||||
|
||||
#LDLIBS += -pthread
|
||||
USER_LIBS = -lmnl
|
||||
EXTRA_DEPS += nat64.h
|
||||
|
||||
LIB_DIR = ../lib
|
||||
|
||||
include $(LIB_DIR)/common.mk
|
86
nat64-bpf/README.org
Normal file
86
nat64-bpf/README.org
Normal file
@@ -0,0 +1,86 @@
|
||||
* NAT64 BPF implementation
|
||||
|
||||
This directory contains a BPF implementation of a stateless NAT64
|
||||
translator, like Tayga, but implemented entirely in BPF. It works by
|
||||
attaching to the TC hooks of an interface and translating incoming IPv6
|
||||
packets with a destination in the configured NAT64 prefix, and routing v4
|
||||
packets back out through that interface based on the (v4) prefix used for
|
||||
translation.
|
||||
|
||||
** Running
|
||||
|
||||
To run the translator on =eth0= with an IPv4 prefix of =10.0.1.0/24= and using
|
||||
the default well-known v6 prefix (=64:ff9b::/96=), allowing =fc00::/8= as the source prefix, simply issue
|
||||
|
||||
#+begin_src sh
|
||||
sudo ./nat64 -i eth0 -4 10.0.1.0/24 -a fc00::/8
|
||||
#+end_src
|
||||
|
||||
Run again with a =-u= parameter to unload (but make sure to also specify the
|
||||
rest of the parameters as they are needed to properly clean up). To specify
|
||||
another v6 prefix, use =-6=.
|
||||
|
||||
The userspace utility will install the necessary routing rules, and setup the
|
||||
BPF programs, then exit. The translator will then keep running entirely in the
|
||||
kernel until unloaded (with =-u=).
|
||||
|
||||
** Assumptions
|
||||
|
||||
The operation of this NAT64 translator makes a few assumptions:
|
||||
|
||||
- A single v6 NAT64 prefix is used, and the prefix length is always 96 (i.e.,
|
||||
the v4 addresses live in the last four bytes). By default the well-known
|
||||
prefix =64:ff9b::/96= is used.
|
||||
|
||||
- IPv6 source addresses are mapped into a configured IPv4 prefix one-to-one.
|
||||
Regular NAT4 can be applied afterwards to map to a single public IP. A
|
||||
separate v4 prefix should be used for every interface that the translator runs
|
||||
on. Source address v6-to-v4 mappings are dynamically created as new sources
|
||||
appear, and time out after two hours.
|
||||
|
||||
- An allowlist of IPv6 source prefixes that should be subject to translation is
|
||||
maintained.
|
||||
|
||||
** How it works
|
||||
|
||||
Two BPF programs are attached to the ingress and egress hooks of the interface
|
||||
being configured. The ingress program will process IPv6 packets, and any packet
|
||||
with a destination address in the configured NAT64 prefix will be either
|
||||
translated (if the source is allowed), or dropped. The egress program processes
|
||||
IPv4 packets and any packet with a destination in the configured v4 prefix will
|
||||
be either translated (if a v6 address is found in the state map) or dropped.
|
||||
|
||||
To make sure the v4 traffic makes it to the right interface, a v4-via-v6 route
|
||||
is installed on that interface with a gateway address of the network address of
|
||||
the v6 prefix, and a fake neighbour entry is installed to avoid the kernel doing
|
||||
neighbour lookups of the gateway. This gets the packets to where the BPF program
|
||||
can process them, and after translation a new neighbour lookup will be performed
|
||||
with the new v6 destination.
|
||||
|
||||
Note that because of the place of the BPF hook in ingress processing, the
|
||||
ingress BPF program will need to redirect the packet to the same interface after
|
||||
translation for re-processing as an IPv4 packet. This means that things like
|
||||
tcpdump will see first the original IPv6 packet, and then the translated IPv4
|
||||
packet. On egress the translation happens earlier, so only the translated packet
|
||||
will be seen.
|
||||
|
||||
** Limitations / known issues
|
||||
At least the first two of these should probably be fixed before deploying this:
|
||||
|
||||
- The IP headers in ICMP error message payloads are not translated, which
|
||||
probably breaks ICMP errors.
|
||||
|
||||
- The BPF programs assume the interface is an Ethernet interface, so translation
|
||||
won't work on layer 3 devices (like Wireguard tunnels).
|
||||
|
||||
- IP options are not handled at all. In particular this means that fragmented
|
||||
IPv6 packets won't pass the translator.
|
||||
|
||||
- The BPF programs support specifying multiple allowed source IPv6 prefixes, as
|
||||
well as doing ahead-of-time static mappings, but the userspace component
|
||||
doesn't support these yet.
|
||||
|
||||
- The userspace program also has no way to print its status, or dump the state
|
||||
of the translation table. The BPF maps can be inspected with bpftool as a
|
||||
stopgap measure, though.
|
||||
|
454
nat64-bpf/nat64.c
Normal file
454
nat64-bpf/nat64.c
Normal file
@@ -0,0 +1,454 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <net/if.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <getopt.h>
|
||||
#include <linux/in6.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <linux/bpf.h>
|
||||
|
||||
#include <bpf/libbpf.h>
|
||||
#include <bpf/bpf.h>
|
||||
|
||||
#include <libmnl/libmnl.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
|
||||
#include "nat64.h"
|
||||
#include "nat64_kern.skel.h"
|
||||
|
||||
#define NS_PER_SECOND 1000000000UL
|
||||
#define NS_PER_MS 1000000UL
|
||||
|
||||
static const struct option long_options[] = {
|
||||
{ "help", no_argument, NULL, 'h' },
|
||||
{ "unload", no_argument, NULL, 'u' },
|
||||
{ "interface", required_argument, NULL, 'i' }, // Name of interface to run on
|
||||
{ "allowed-src", required_argument, NULL, 'a' }, // v6 prefix to allow as source
|
||||
{ "v6-prefix", required_argument, NULL, '6' }, // v6 prefix to use for nat64
|
||||
{ "v4-prefix", required_argument, NULL, '4' }, // v4 prefix to use for nat64
|
||||
{ "timeout", required_argument, NULL, 't' }, // Address mapping timeout interval in s
|
||||
{ 0, 0, NULL, 0 }
|
||||
};
|
||||
|
||||
struct nat64_user_config {
|
||||
struct nat64_config c;
|
||||
int ifindex;
|
||||
char ifname[IF_NAMESIZE+1];
|
||||
struct in6_addr v6_allow;
|
||||
__u32 v6_allow_pxlen;
|
||||
__u32 v4_pxlen;
|
||||
bool unload;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/* Parse an IPv6 prefix given in "address/length" notation.
 *
 * @str is modified in place: once the length has been validated, the '/'
 * separator is overwritten with a NUL so the address part can be passed to
 * inet_pton().
 *
 * Returns the prefix length (0-128) on success and stores the address in
 * @v6addr. On any parse error a diagnostic is printed to stderr and -EINVAL is
 * returned.
 */
static int parse_v6_prefix(char *str, struct in6_addr *v6addr)
{
	char *slash, *end;
	long pxlen;

	slash = strchr(str, '/');
	if (!slash) {
		fprintf(stderr, "Invalid v6 prefix: %s\n", str);
		return -EINVAL;
	}

	/* strtol() rather than atoi(): empty, non-numeric, negative and
	 * too-large lengths must be rejected instead of silently turning into
	 * 0 or being handed on to the kernel.
	 */
	errno = 0;
	pxlen = strtol(slash + 1, &end, 10);
	if (errno || end == slash + 1 || *end != '\0' ||
	    pxlen < 0 || pxlen > 128) {
		fprintf(stderr, "Invalid v6 prefix length: %s\n", slash + 1);
		return -EINVAL;
	}

	*slash = '\0';
	if (inet_pton(AF_INET6, str, v6addr) != 1) {
		fprintf(stderr, "Invalid v6 addr: %s\n", str);
		return -EINVAL;
	}

	return (int)pxlen;
}
|
||||
|
||||
static int parse_arguments(int argc, char *argv[], struct nat64_user_config *config)
|
||||
{
|
||||
struct in6_addr v6addr;
|
||||
struct in_addr v4addr;
|
||||
int pxlen, seconds;
|
||||
int err, opt;
|
||||
char *net;
|
||||
|
||||
config->ifindex = 0;
|
||||
config->c.timeout_ns = 7200 * NS_PER_SECOND;
|
||||
config->c.next_addr = 1;
|
||||
|
||||
/* Default to special prefix 64:ff9b::/96 */
|
||||
config->c.v6_prefix.s6_addr[1] = 0x64;
|
||||
config->c.v6_prefix.s6_addr[2] = 0xff;
|
||||
config->c.v6_prefix.s6_addr[3] = 0x9b;
|
||||
|
||||
while ((opt = getopt_long(argc, argv, "i:6:4:t:a:hu", long_options,
|
||||
NULL)) != -1) {
|
||||
switch (opt) {
|
||||
case 'i':
|
||||
if (strlen(optarg) > IF_NAMESIZE) {
|
||||
fprintf(stderr, "interface name too long\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
strncpy(config->ifname, optarg, IF_NAMESIZE);
|
||||
|
||||
config->ifindex = if_nametoindex(config->ifname);
|
||||
if (config->ifindex == 0) {
|
||||
err = -errno;
|
||||
fprintf(stderr,
|
||||
"Could not get index of interface %s: %s\n",
|
||||
config->ifname, strerror(err));
|
||||
return err;
|
||||
}
|
||||
break;
|
||||
case 'a':
|
||||
pxlen = parse_v6_prefix(optarg, &v6addr);
|
||||
if (pxlen < 0)
|
||||
return pxlen;
|
||||
config->v6_allow = v6addr;
|
||||
config->v6_allow_pxlen = pxlen;
|
||||
break;
|
||||
case '6':
|
||||
pxlen = parse_v6_prefix(optarg, &v6addr);
|
||||
if (pxlen < 0)
|
||||
return pxlen;
|
||||
if (pxlen != 96) {
|
||||
fprintf(stderr, "v6 prefix must have pxlen 96\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (v6addr.s6_addr32[3]) {
|
||||
fprintf(stderr, "Not a /96 network address: %s\n", optarg);
|
||||
return -EINVAL;
|
||||
}
|
||||
config->c.v6_prefix = v6addr;
|
||||
break;
|
||||
case '4':
|
||||
net = strstr(optarg, "/");
|
||||
if (!net) {
|
||||
fprintf(stderr, "Invalid v6 prefix: %s\n", optarg);
|
||||
return -EINVAL;
|
||||
}
|
||||
pxlen = atoi(net + 1);
|
||||
if (pxlen < 1 || pxlen > 31) {
|
||||
fprintf(stderr, "v4_pxlen must be between 1 and 31\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
*net = '\0';
|
||||
if (inet_pton(AF_INET, optarg, &v4addr) != 1) {
|
||||
fprintf(stderr, "Invalid v4 addr: %s\n", optarg);
|
||||
return -EINVAL;
|
||||
}
|
||||
config->c.v4_mask = 0xFFFFFFFF << (32 - pxlen);
|
||||
config->v4_pxlen = pxlen;
|
||||
config->c.v4_prefix = ntohl(v4addr.s_addr);
|
||||
if (config->c.v4_prefix & ~config->c.v4_mask) {
|
||||
fprintf(stderr, "Not a network address: %s\n", optarg);
|
||||
return -EINVAL;
|
||||
}
|
||||
break;
|
||||
case 't':
|
||||
seconds = atoi(optarg);
|
||||
if (seconds < 1 || seconds > 100000) {
|
||||
fprintf(stderr, "Timeout must be in the interval between 1 and 100000 seconds\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
config->c.timeout_ns = (__u64)seconds * NS_PER_SECOND;
|
||||
break;
|
||||
case 'u':
|
||||
config->unload = true;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Unknown option %s\n", argv[optind]);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
if (config->ifindex == 0) {
|
||||
fprintf(stderr,
|
||||
"An interface (-i or --interface) must be provided\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (!config->c.v4_prefix) {
|
||||
fprintf(stderr,
|
||||
"A v4 prefix (-4 or --v4-prefix) must be provided\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Install (@create == true) or remove the IPv4 route for the translation
 * prefix.
 *
 * NOTE: despite its name this function manages the *route*
 * (RTM_NEWROUTE/RTM_DELROUTE): an on-link route for the v4 prefix on the
 * configured interface, using the NAT64 v6 prefix address as an IPv6 gateway
 * (RTA_VIA).
 *
 * An already-existing route on create, and a missing route on delete, are not
 * treated as errors. Returns 0 on success or a negative errno value.
 */
static int do_v4_neigh(struct mnl_socket *nl, struct nat64_user_config *cfg, bool create)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh;
	uint32_t seq, portid;
	struct rtmsg *rtm;
	int ret;

	/* RTA_VIA payload: address family followed by the gateway address */
	struct {
		__u16 family;
		struct in6_addr addr;
	} __attribute__((packed)) via = {
		.family = AF_INET6,
		.addr = cfg->c.v6_prefix
	};

	nlh = mnl_nlmsg_put_header(buf);
	if (create) {
		nlh->nlmsg_type = RTM_NEWROUTE;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
	} else {
		nlh->nlmsg_type = RTM_DELROUTE;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	}
	nlh->nlmsg_seq = seq = time(NULL);

	rtm = mnl_nlmsg_put_extra_header(nlh, sizeof(struct rtmsg));
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = cfg->v4_pxlen;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = 0;
	rtm->rtm_protocol = RTPROT_STATIC;
	rtm->rtm_table = RT_TABLE_MAIN;
	rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_flags = RTNH_F_ONLINK;

	/* v4_prefix is stored in host byte order */
	mnl_attr_put_u32(nlh, RTA_DST, htonl(cfg->c.v4_prefix));
	mnl_attr_put_u32(nlh, RTA_OIF, cfg->ifindex);
	mnl_attr_put(nlh, RTA_VIA, sizeof(via), &via);

	portid = mnl_socket_get_portid(nl);
	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
		/* save errno first: perror() is allowed to change it */
		ret = -errno;
		perror("mnl_socket_sendto");
		return ret;
	}

	ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
	if (ret < 0) {
		ret = -errno;
		perror("mnl_socket_recvfrom");
		return ret;
	}

	if (mnl_cb_run(buf, ret, seq, portid, NULL, NULL) < 0) {
		/* the desired end state is already in place */
		if (create && errno == EEXIST)
			return 0;
		if (!create && (errno == ENOENT || errno == ESRCH))
			return 0;
		return -errno;
	}

	return 0;
}
|
||||
|
||||
/* Install (@create == true) or remove the permanent IPv6 neighbour entry for
 * the gateway address.
 *
 * NOTE: despite its name this function manages the *neighbour entry*
 * (RTM_NEWNEIGH/RTM_DELNEIGH): the NAT64 v6 prefix address is bound to a dummy
 * link-layer address on the configured interface.
 *
 * An already-existing entry on create, and a missing entry on delete, are not
 * treated as errors. Returns 0 on success or a negative errno value.
 */
static int do_v4_route(struct mnl_socket *nl, struct nat64_user_config *cfg, bool create)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh;
	uint32_t seq, portid;
	struct ndmsg *ndm;
	int ret;

	/* dummy link-layer address used for the gateway entry */
	__u8 lladdr[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	nlh = mnl_nlmsg_put_header(buf);
	if (create) {
		nlh->nlmsg_type = RTM_NEWNEIGH;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
	} else {
		nlh->nlmsg_type = RTM_DELNEIGH;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	}
	nlh->nlmsg_seq = seq = time(NULL);

	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(struct ndmsg));
	ndm->ndm_family = AF_INET6;
	ndm->ndm_ifindex = cfg->ifindex;
	ndm->ndm_state = NUD_PERMANENT;
	ndm->ndm_type = 0;

	mnl_attr_put(nlh, NDA_LLADDR, sizeof(lladdr), &lladdr);
	mnl_attr_put(nlh, NDA_DST, sizeof(struct in6_addr), &cfg->c.v6_prefix);

	portid = mnl_socket_get_portid(nl);
	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
		/* save errno first: perror() is allowed to change it */
		ret = -errno;
		perror("mnl_socket_sendto");
		return ret;
	}

	ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
	if (ret < 0) {
		ret = -errno;
		perror("mnl_socket_recvfrom");
		return ret;
	}

	if (mnl_cb_run(buf, ret, seq, portid, NULL, NULL) < 0) {
		/* the desired end state is already in place */
		if (create && errno == EEXIST)
			return 0;
		if (!create && (errno == ENOENT || errno == ESRCH))
			return 0;
		return -errno;
	}

	return 0;
}
|
||||
|
||||
/* Create or remove the kernel routing state needed by the translator.
 *
 * Opens a NETLINK_ROUTE socket and calls do_v4_route() followed by
 * do_v4_neigh(). When creating, the first failure aborts the sequence. When
 * removing, both calls are always attempted so that one missing object does
 * not leave the other behind; the first error is the one reported.
 *
 * Returns 0 on success or a negative errno value.
 */
static int do_netlink(struct nat64_user_config *cfg, bool create)
{
	struct mnl_socket *nl;
	int err, err2;

	nl = mnl_socket_open(NETLINK_ROUTE);
	if (nl == NULL) {
		/* nothing to close: return directly instead of passing NULL
		 * to mnl_socket_close()
		 */
		err = -errno;
		perror("mnl_socket_open");
		return err;
	}

	if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
		err = -errno;
		perror("mnl_socket_bind");
		goto out;
	}

	err = do_v4_route(nl, cfg, create);
	if (err && create)
		goto out;

	err2 = do_v4_neigh(nl, cfg, create);
	if (!err)
		err = err2;

out:
	mnl_socket_close(nl);
	return err;
}
|
||||
|
||||
|
||||
int teardown(struct nat64_user_config *cfg)
|
||||
{
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook,
|
||||
.attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS,
|
||||
.ifindex = cfg->ifindex);
|
||||
int err;
|
||||
|
||||
err = bpf_tc_hook_destroy(&hook);
|
||||
if (err)
|
||||
fprintf(stderr, "Couldn't remove clsact qdisc on %s\n", cfg->ifname);
|
||||
|
||||
err = do_netlink(cfg, false);
|
||||
if (err)
|
||||
fprintf(stderr, "Couldn't remove route on %s: %s\n",
|
||||
cfg->ifname, strerror(-err));
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
struct v6_trie_key prefix_key = {};
|
||||
struct nat64_user_config cfg = {};
|
||||
struct nat64_kern *obj;
|
||||
unsigned int num_addr;
|
||||
char buf[100];
|
||||
int err = 0;
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS);
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_egress);
|
||||
DECLARE_LIBBPF_OPTS(bpf_tc_opts, attach_ingress);
|
||||
|
||||
err = parse_arguments(argc, argv, &cfg);
|
||||
if (err)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
hook.ifindex = cfg.ifindex;
|
||||
if (cfg.unload)
|
||||
return teardown(&cfg);
|
||||
|
||||
obj = nat64_kern__open();
|
||||
err = libbpf_get_error(obj);
|
||||
if (err) {
|
||||
libbpf_strerror(err, buf, sizeof(buf));
|
||||
fprintf(stderr, "Couldn't open BPF skeleton: %s\n", buf);
|
||||
return err;
|
||||
}
|
||||
|
||||
num_addr = (cfg.c.v4_prefix | ~cfg.c.v4_mask) - cfg.c.v4_prefix - 2;
|
||||
|
||||
obj->bss->config = cfg.c;
|
||||
bpf_map__resize(obj->maps.v6_state_map, num_addr);
|
||||
bpf_map__resize(obj->maps.v4_reversemap, num_addr);
|
||||
bpf_map__resize(obj->maps.reclaimed_addrs, num_addr);
|
||||
|
||||
err = nat64_kern__load(obj);
|
||||
if (err) {
|
||||
libbpf_strerror(err, buf, sizeof(buf));
|
||||
fprintf(stderr, "Couldn't load BPF skeleton: %s\n", buf);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (cfg.v6_allow_pxlen) {
|
||||
__u32 value = 0;
|
||||
|
||||
prefix_key.t.prefixlen = cfg.v6_allow_pxlen;
|
||||
prefix_key.addr = cfg.v6_allow;
|
||||
err = bpf_map_update_elem(bpf_map__fd(obj->maps.allowed_v6_src),
|
||||
&prefix_key, &value, 0);
|
||||
if (err) {
|
||||
fprintf(stderr, "Couldn't insert allowed prefix\n");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
attach_ingress.prog_fd = bpf_program__fd(obj->progs.nat64_ingress);
|
||||
if (attach_ingress.prog_fd < 0) {
|
||||
fprintf(stderr, "Couldn't find ingress program\n");
|
||||
err = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
attach_egress.prog_fd = bpf_program__fd(obj->progs.nat64_egress);
|
||||
if (attach_egress.prog_fd < 0) {
|
||||
fprintf(stderr, "Couldn't find egress program\n");
|
||||
err = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = bpf_tc_hook_create(&hook);
|
||||
if (err && err != -EEXIST) {
|
||||
fprintf(stderr, "Couldn't create ingress hook for ifindex %d\n", cfg.ifindex);
|
||||
goto out;
|
||||
}
|
||||
|
||||
hook.attach_point = BPF_TC_INGRESS;
|
||||
err = bpf_tc_attach(&hook, &attach_ingress);
|
||||
if (err) {
|
||||
fprintf(stderr, "Couldn't attach ingress program to ifindex %d\n",
|
||||
hook.ifindex);
|
||||
goto out;
|
||||
}
|
||||
|
||||
hook.attach_point = BPF_TC_EGRESS;
|
||||
err = bpf_tc_attach(&hook, &attach_egress);
|
||||
if (err) {
|
||||
fprintf(stderr, "Couldn't attach egress program to ifindex %d\n",
|
||||
hook.ifindex);
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = do_netlink(&cfg, true);
|
||||
if (err) {
|
||||
fprintf(stderr, "Couldn't create route: %s\n", strerror(-err));
|
||||
err = teardown(&cfg);
|
||||
}
|
||||
|
||||
out:
|
||||
nat64_kern__destroy(obj);
|
||||
return err;
|
||||
}
|
25
nat64-bpf/nat64.h
Normal file
25
nat64-bpf/nat64.h
Normal file
@@ -0,0 +1,25 @@
|
||||
#ifndef __NAT64_H__
#define __NAT64_H__

/* Definitions shared between the userspace loader and the BPF programs. */

#include <linux/bpf.h>	/* struct bpf_lpm_trie_key */
#include <linux/in6.h>

/* Translator configuration, filled in by userspace before the BPF object is
 * loaded.
 */
struct nat64_config {
	struct in6_addr v6_prefix;	/* NAT64 prefix; the loader only accepts a /96 */
	__u64 timeout_ns;		/* idle time after which a mapping may be reclaimed */
	__u64 next_addr;		/* offset from v4_prefix of the next address to hand out */
	__u32 v4_prefix;		/* v4 network address, host byte order */
	__u32 v4_mask;			/* v4 netmask, host byte order */
};

/* Per-IPv6-source translation state. */
struct v6_addr_state {
	__u64 last_seen;	/* bpf_ktime_get_ns() timestamp, compared against the timeout */
	__u32 v4_addr;		/* v4 address assigned to this source */
	__u32 static_conf;	/* non-zero: entry is never reclaimed on timeout */
};

/* Key layout for the LPM trie of allowed IPv6 source prefixes: the generic
 * trie header (prefix length) followed by the address.
 */
struct v6_trie_key {
	struct bpf_lpm_trie_key t;
	struct in6_addr addr;
};

#endif
|
671
nat64-bpf/nat64_kern.c
Normal file
671
nat64-bpf/nat64_kern.c
Normal file
@@ -0,0 +1,671 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* Copyright 2021 Toke Høiland-Jørgensen <toke@toke.dk> */
|
||||
|
||||
|
||||
#include <linux/bpf.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/compiler.h>
|
||||
#include <linux/pkt_sched.h>
|
||||
#include <linux/pkt_cls.h>
|
||||
#include <stdbool.h>
|
||||
#include "../include/xdp/parsing_helpers.h"
|
||||
#include "nat64.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
struct nat64_config config;
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, struct in6_addr);
|
||||
__type(value, struct v6_addr_state);
|
||||
__uint(max_entries, 1);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
} v6_state_map SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, __u32);
|
||||
__type(value, struct in6_addr);
|
||||
__uint(max_entries, 1);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
} v4_reversemap SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_LPM_TRIE);
|
||||
__uint(key_size, sizeof(struct in6_addr));
|
||||
__uint(value_size, sizeof(__u32));
|
||||
__uint(max_entries, 1);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
} allowed_v6_src SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_QUEUE);
|
||||
__uint(key_size, 0);
|
||||
__uint(value_size, sizeof(__u32));
|
||||
__uint(max_entries, 1);
|
||||
} reclaimed_addrs SEC(".maps");
|
||||
|
||||
#ifdef DEBUG
|
||||
#define DBG(fmt, ...) \
|
||||
({ \
|
||||
char ____fmt[] = "nat64: " fmt; \
|
||||
bpf_trace_printk(____fmt, sizeof(____fmt), \
|
||||
##__VA_ARGS__); \
|
||||
})
|
||||
#else
|
||||
#define DBG
|
||||
#endif
|
||||
|
||||
struct icmpv6_pseudo {
|
||||
struct in6_addr saddr;
|
||||
struct in6_addr daddr;
|
||||
__u32 len;
|
||||
__u8 padding[3];
|
||||
__u8 nh;
|
||||
} __attribute__((packed));
|
||||
|
||||
static __always_inline void update_icmp_checksum(struct __sk_buff *skb,
|
||||
struct ipv6hdr *ip6h,
|
||||
void *icmp_before,
|
||||
void *icmp_after,
|
||||
bool add)
|
||||
{
|
||||
void *data = (void *)(unsigned long long)skb->data;
|
||||
struct icmpv6_pseudo ph = {
|
||||
.nh = IPPROTO_ICMPV6,
|
||||
.saddr = ip6h->saddr,
|
||||
.daddr = ip6h->daddr,
|
||||
.len = ip6h->payload_len
|
||||
};
|
||||
__u16 h_before, h_after, offset;
|
||||
__u32 csum, u_before, u_after;
|
||||
|
||||
/* Do checksum update in two passes: first compute the incremental
|
||||
* checksum update of the ICMPv6 pseudo header, update the checksum
|
||||
* using bpf_l4_csum_replace(), and then do a separate update for the
|
||||
* ICMP type and code (which is two consecutive bytes, so cast them to
|
||||
* u16). The bpf_csum_diff() helper can be used to compute the
|
||||
* incremental update of the full block, whereas the
|
||||
* bpf_l4_csum_replace() helper can do the two-byte diff and update by
|
||||
* itself.
|
||||
*/
|
||||
csum = bpf_csum_diff((__be32 *)&ph, add ? 0 : sizeof(ph),
|
||||
(__be32 *)&ph, add ? sizeof(ph) : 0,
|
||||
0);
|
||||
|
||||
offset = ((void *)icmp_after - data) + 2;
|
||||
/* first two bytes of ICMP header, type and code */
|
||||
h_before = *(__u16 *)icmp_before;
|
||||
h_after = *(__u16 *)icmp_after;
|
||||
|
||||
/* last four bytes of ICMP header, the data union */
|
||||
u_before = *(__u32 *)(icmp_before + 4);
|
||||
u_after = *(__u32 *)(icmp_after + 4);
|
||||
|
||||
bpf_l4_csum_replace(skb, offset, 0, csum, BPF_F_PSEUDO_HDR);
|
||||
bpf_l4_csum_replace(skb, offset, h_before, h_after, 2);
|
||||
|
||||
if (u_before != u_after)
|
||||
bpf_l4_csum_replace(skb, offset, u_before, u_after, 4);
|
||||
}
|
||||
|
||||
|
||||
static int rewrite_icmp(struct iphdr *iph, struct ipv6hdr *ip6h, struct __sk_buff *skb)
|
||||
{
|
||||
void *data_end = (void *)(unsigned long long)skb->data_end;
|
||||
|
||||
struct icmphdr old_icmp, *icmp = (void *)(iph + 1);
|
||||
struct icmp6hdr icmp6, *new_icmp6;
|
||||
__u32 mtu;
|
||||
|
||||
if (icmp + 1 > data_end)
|
||||
return -1;
|
||||
|
||||
old_icmp = *icmp;
|
||||
new_icmp6 = (void *)icmp;
|
||||
icmp6 = *new_icmp6;
|
||||
|
||||
/* These translations are defined in RFC6145 section 4.2 */
|
||||
switch (icmp->type) {
|
||||
case ICMP_ECHO:
|
||||
icmp6.icmp6_type = ICMPV6_ECHO_REQUEST;
|
||||
break;
|
||||
case ICMP_ECHOREPLY:
|
||||
icmp6.icmp6_type = ICMPV6_ECHO_REPLY;
|
||||
break;
|
||||
case ICMP_DEST_UNREACH:
|
||||
icmp6.icmp6_type = ICMPV6_DEST_UNREACH;
|
||||
switch(icmp->code) {
|
||||
case ICMP_NET_UNREACH:
|
||||
case ICMP_HOST_UNREACH:
|
||||
case ICMP_SR_FAILED:
|
||||
case ICMP_NET_UNKNOWN:
|
||||
case ICMP_HOST_UNKNOWN:
|
||||
case ICMP_HOST_ISOLATED:
|
||||
case ICMP_NET_UNR_TOS:
|
||||
case ICMP_HOST_UNR_TOS:
|
||||
icmp6.icmp6_code = ICMPV6_NOROUTE;
|
||||
break;
|
||||
case ICMP_PROT_UNREACH:
|
||||
icmp6.icmp6_type = ICMPV6_PARAMPROB;
|
||||
icmp6.icmp6_code = ICMPV6_UNK_NEXTHDR;
|
||||
icmp6.icmp6_pointer = bpf_htonl(offsetof(struct ipv6hdr, nexthdr));
|
||||
case ICMP_PORT_UNREACH:
|
||||
icmp6.icmp6_code = ICMPV6_PORT_UNREACH;
|
||||
break;
|
||||
case ICMP_FRAG_NEEDED:
|
||||
icmp6.icmp6_type = ICMPV6_PKT_TOOBIG;
|
||||
icmp6.icmp6_code = 0;
|
||||
mtu = bpf_ntohs(icmp->un.frag.mtu) + 20;
|
||||
/* RFC6145 section 6, "second approach" - should not be
|
||||
* necessary, but might as well do this
|
||||
*/
|
||||
if (mtu < 1280)
|
||||
mtu = 1280;
|
||||
icmp6.icmp6_mtu = bpf_htonl(mtu);
|
||||
case ICMP_NET_ANO:
|
||||
case ICMP_HOST_ANO:
|
||||
case ICMP_PKT_FILTERED:
|
||||
case ICMP_PREC_CUTOFF:
|
||||
icmp6.icmp6_code = ICMPV6_ADM_PROHIBITED;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
case ICMP_PARAMETERPROB:
|
||||
if (icmp->code == 1)
|
||||
return -1;
|
||||
icmp6.icmp6_type = ICMPV6_PARAMPROB;
|
||||
icmp6.icmp6_code = ICMPV6_HDR_FIELD;
|
||||
/* The pointer field not defined in the Linux header. This
|
||||
* translation is from Figure 3 of RFC6145.
|
||||
*/
|
||||
switch (icmp->un.reserved[0]) {
|
||||
case 0: /* version/IHL */
|
||||
icmp6.icmp6_pointer = 0;
|
||||
break;
|
||||
case 1: /* Type of Service */
|
||||
icmp6.icmp6_pointer = bpf_htonl(1);
|
||||
break;
|
||||
case 2: /* Total length */
|
||||
case 3:
|
||||
icmp6.icmp6_pointer = bpf_htonl(4);
|
||||
break;
|
||||
case 8: /* Time to Live */
|
||||
icmp6.icmp6_pointer = bpf_htonl(7);
|
||||
break;
|
||||
case 9: /* Protocol */
|
||||
icmp6.icmp6_pointer = bpf_htonl(6);
|
||||
break;
|
||||
case 12: /* Source address */
|
||||
case 13:
|
||||
case 14:
|
||||
case 15:
|
||||
icmp6.icmp6_pointer = bpf_htonl(8);
|
||||
break;
|
||||
case 16: /* Destination address */
|
||||
case 17:
|
||||
case 18:
|
||||
case 19:
|
||||
icmp6.icmp6_pointer = bpf_htonl(24);
|
||||
break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
||||
*new_icmp6 = icmp6;
|
||||
update_icmp_checksum(skb, ip6h, &old_icmp, new_icmp6, true);
|
||||
|
||||
/* FIXME: also need to rewrite IP header embedded in ICMP error */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nat64_handle_v4(struct __sk_buff *skb, struct hdr_cursor *nh)
|
||||
{
|
||||
void *data_end = (void *)(unsigned long long)skb->data_end;
|
||||
void *data = (void *)(unsigned long long)skb->data;
|
||||
|
||||
int ip_type, iphdr_len, ip_offset;
|
||||
struct in6_addr *dst_v6;
|
||||
struct ipv6hdr *ip6h;
|
||||
int ret = TC_ACT_OK;
|
||||
struct iphdr *iph;
|
||||
struct ethhdr *eth;
|
||||
__u32 dst_v4;
|
||||
|
||||
struct ipv6hdr dst_hdr = {
|
||||
.version = 6,
|
||||
.saddr = config.v6_prefix,
|
||||
};
|
||||
|
||||
ip_offset = (nh->pos - data) & 0x1fff;
|
||||
|
||||
ip_type = parse_iphdr(nh, data_end, &iph);
|
||||
if (ip_type < 0)
|
||||
goto out;
|
||||
|
||||
dst_v4 = bpf_ntohl(iph->daddr);
|
||||
if ((dst_v4 & config.v4_mask) != config.v4_prefix)
|
||||
goto out;
|
||||
|
||||
/* At this point we know the destination IP is within the configured
|
||||
* subnet, so if we can't rewrite the packet it should be dropped (so as
|
||||
* not to leak traffic in that subnet).
|
||||
*/
|
||||
ret = TC_ACT_SHOT;
|
||||
|
||||
/* we don't bother dealing with IP options or fragmented packets. The
|
||||
* latter are identified by the 'frag_off' field having a value (either
|
||||
* the MF bit, or the fragmet offset, or both). However, this field also
|
||||
* contains the "don't fragment" (DF) bit, which we ignore, so mask that
|
||||
* out. The DF is the second-most-significant bit (as bit 0 is
|
||||
* reserved).
|
||||
*/
|
||||
iphdr_len = iph->ihl * 4;
|
||||
if (iphdr_len != sizeof(struct iphdr) ||
|
||||
(iph->frag_off & ~bpf_htons(1<<14))) {
|
||||
DBG("v4: pkt src/dst %pI4/%pI4 has IP options or is fragmented, dropping\n",
|
||||
&iph->daddr, &iph->saddr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
dst_v6 = bpf_map_lookup_elem(&v4_reversemap, &dst_v4);
|
||||
if (!dst_v6) {
|
||||
DBG("v4: no mapping found for dst %pI4\n", &iph->daddr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
DBG("v4: Found mapping for dst %pI4 to %pI6c\n", &iph->daddr, dst_v6);
|
||||
|
||||
// src v4 as last octet of nat64 address
|
||||
dst_hdr.saddr.s6_addr32[3] = iph->saddr;
|
||||
dst_hdr.daddr = *dst_v6;
|
||||
dst_hdr.nexthdr = iph->protocol;
|
||||
dst_hdr.hop_limit = iph->ttl;
|
||||
/* weird definition in ipv6hdr */
|
||||
dst_hdr.priority = (iph->tos & 0x70) >> 4;
|
||||
dst_hdr.flow_lbl[0] = iph->tos << 4;
|
||||
dst_hdr.payload_len = bpf_htons(bpf_ntohs(iph->tot_len) - iphdr_len);
|
||||
|
||||
if (dst_hdr.nexthdr == IPPROTO_ICMP) {
|
||||
if (rewrite_icmp(iph, &dst_hdr, skb))
|
||||
goto out;
|
||||
dst_hdr.nexthdr = IPPROTO_ICMPV6;
|
||||
}
|
||||
|
||||
if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
|
||||
goto out;
|
||||
|
||||
data = (void *)(unsigned long long)skb->data;
|
||||
data_end = (void *)(unsigned long long)skb->data_end;
|
||||
|
||||
eth = data;
|
||||
ip6h = data + ip_offset;
|
||||
if (eth + 1 > data_end || ip6h + 1 > data_end)
|
||||
goto out;
|
||||
|
||||
eth->h_proto = bpf_htons(ETH_P_IPV6);
|
||||
*ip6h = dst_hdr;
|
||||
|
||||
ret = bpf_redirect_neigh(skb->ifindex, NULL, 0, 0);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long check_item(struct bpf_map *map, const void *key, void *value, void *ctx)
|
||||
{
|
||||
struct v6_addr_state *state = value;
|
||||
__u64 timeout = *((__u64 *)ctx);
|
||||
|
||||
if (state->last_seen < timeout && !state->static_conf) {
|
||||
__u32 v4_addr = state->v4_addr;
|
||||
bpf_map_delete_elem(map, key);
|
||||
bpf_map_delete_elem(&v4_reversemap, &v4_addr);
|
||||
bpf_map_push_elem(&reclaimed_addrs, &v4_addr, 0);
|
||||
|
||||
/* only reclaim one address at a time, so mappings don't expire
|
||||
* until they absolutely have to
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static __u32 reclaim_v4_addr(void)
|
||||
{
|
||||
__u64 timeout = bpf_ktime_get_ns() - config.timeout_ns;
|
||||
__u32 src_v4;
|
||||
|
||||
if (bpf_map_pop_elem(&reclaimed_addrs, &src_v4) == 0)
|
||||
return src_v4;
|
||||
|
||||
bpf_for_each_map_elem(&v6_state_map, check_item, &timeout, 0);
|
||||
|
||||
return bpf_map_pop_elem(&reclaimed_addrs, &src_v4) ? 0 : src_v4;
|
||||
}
|
||||
|
||||
static struct v6_addr_state *alloc_new_state(struct in6_addr *src_v6)
|
||||
{
|
||||
struct v6_addr_state new_v6_state = { .last_seen = bpf_ktime_get_ns() };
|
||||
__u32 max_v4 = (config.v4_prefix | ~config.v4_mask) - 1;
|
||||
__u32 src_v4 = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 10; i++) {
|
||||
__u32 next_v4, next_addr;
|
||||
|
||||
next_addr = __sync_fetch_and_add(&config.next_addr, 0);
|
||||
next_v4 = config.v4_prefix + next_addr;
|
||||
|
||||
if (next_v4 >= max_v4) {
|
||||
src_v4 = reclaim_v4_addr();
|
||||
break;
|
||||
}
|
||||
|
||||
if (__sync_val_compare_and_swap(&config.next_addr,
|
||||
next_addr,
|
||||
next_addr + 1) == next_addr) {
|
||||
src_v4 = next_v4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If src_v4 is 0 here, we failed to find an available addr */
|
||||
if (!src_v4)
|
||||
return NULL;
|
||||
|
||||
new_v6_state.v4_addr = src_v4;
|
||||
if (bpf_map_update_elem(&v6_state_map, src_v6, &new_v6_state, BPF_NOEXIST))
|
||||
goto err;
|
||||
if (bpf_map_update_elem(&v4_reversemap, &src_v4, src_v6, BPF_NOEXIST))
|
||||
goto err_v4;
|
||||
|
||||
return bpf_map_lookup_elem(&v6_state_map, src_v6);
|
||||
|
||||
err_v4:
|
||||
bpf_map_delete_elem(&v6_state_map, src_v6);
|
||||
err:
|
||||
/* failed to insert entry in maps, put the address back in the queue for
|
||||
* reclaiming
|
||||
*/
|
||||
bpf_map_push_elem(&reclaimed_addrs, &src_v4, 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Compare two IPv6 addresses one 32-bit word at a time, starting at word 0.
 *
 * The words are compared as stored, without any byte-order conversion.
 *
 * Returns -1 if the first differing word of @a is smaller than that of @b,
 * 1 if it is larger, and 0 if all four words are equal.
 */
static int cmp_v6addr(struct in6_addr *a, struct in6_addr *b)
{
	int word = 0;

	while (word < 4) {
		__u32 lhs = a->s6_addr32[word];
		__u32 rhs = b->s6_addr32[word];

		if (lhs != rhs)
			return lhs < rhs ? -1 : 1;
		word++;
	}

	return 0;
}
|
||||
|
||||
/* Fold a 32-bit checksum accumulator into 16 bits and return its one's
 * complement.
 *
 * The upper half is added to the lower half, the carry produced by that
 * addition is added back in, and the low 16 bits of the inverted result are
 * returned.
 */
static __always_inline __u16 csum_fold_helper(__u32 csum)
{
	__u32 folded = (csum & 0xffff) + (csum >> 16);

	folded += folded >> 16;

	return (__u16)~folded;
}
|
||||
|
||||
static int rewrite_icmpv6(struct ipv6hdr *ip6h, struct __sk_buff *skb)
|
||||
{
|
||||
void *data_end = (void *)(unsigned long long)skb->data_end;
|
||||
|
||||
struct icmp6hdr old_icmp6, *icmp6 = (void *)(ip6h + 1);
|
||||
struct icmphdr icmp, *new_icmp;
|
||||
__u32 mtu, ptr;
|
||||
|
||||
if (icmp6 + 1 > data_end)
|
||||
return -1;
|
||||
|
||||
old_icmp6 = *icmp6;
|
||||
new_icmp = (void *)icmp6;
|
||||
icmp = *new_icmp;
|
||||
|
||||
/* These translations are defined in RFC6145 section 5.2 */
|
||||
switch (icmp6->icmp6_type) {
|
||||
case ICMPV6_ECHO_REQUEST:
|
||||
icmp.type = ICMP_ECHO;
|
||||
break;
|
||||
case ICMPV6_ECHO_REPLY:
|
||||
icmp.type = ICMP_ECHOREPLY;
|
||||
break;
|
||||
case ICMPV6_DEST_UNREACH:
|
||||
icmp.type = ICMP_DEST_UNREACH;
|
||||
switch(icmp6->icmp6_code) {
|
||||
case ICMPV6_NOROUTE:
|
||||
case ICMPV6_NOT_NEIGHBOUR:
|
||||
case ICMPV6_ADDR_UNREACH:
|
||||
icmp.code = ICMP_HOST_UNREACH;
|
||||
break;
|
||||
case ICMPV6_ADM_PROHIBITED:
|
||||
icmp.code = ICMP_HOST_ANO;
|
||||
break;
|
||||
case ICMPV6_PORT_UNREACH:
|
||||
icmp.code = ICMP_PORT_UNREACH;
|
||||
break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
case ICMPV6_PKT_TOOBIG:
|
||||
icmp.type = ICMP_DEST_UNREACH;
|
||||
icmp.code = ICMP_FRAG_NEEDED;
|
||||
|
||||
mtu = bpf_htonl(icmp6->icmp6_mtu) - 20;
|
||||
if (mtu > 0xffff)
|
||||
return -1;
|
||||
icmp.un.frag.mtu = bpf_htons(mtu);
|
||||
break;
|
||||
case ICMPV6_TIME_EXCEED:
|
||||
icmp.type = ICMP_TIME_EXCEEDED;
|
||||
break;
|
||||
case ICMPV6_PARAMPROB:
|
||||
switch (icmp6->icmp6_code) {
|
||||
case 0:
|
||||
icmp.type = ICMP_PARAMETERPROB;
|
||||
icmp.code = 0;
|
||||
break;
|
||||
case 1:
|
||||
icmp.type = ICMP_DEST_UNREACH;
|
||||
icmp.code = ICMP_PROT_UNREACH;
|
||||
ptr = bpf_ntohl(icmp6->icmp6_pointer);
|
||||
/* Figure 6 in RFC6145 - using if statements b/c of
|
||||
* range at the bottom
|
||||
*/
|
||||
if (ptr == 0 || ptr == 1)
|
||||
icmp.un.reserved[0] = ptr;
|
||||
else if (ptr == 4 || ptr == 5)
|
||||
icmp.un.reserved[0] = 2;
|
||||
else if (ptr == 6)
|
||||
icmp.un.reserved[0] = 9;
|
||||
else if (ptr == 7)
|
||||
icmp.un.reserved[0] = 8;
|
||||
else if (ptr >= 8 && ptr <= 23)
|
||||
icmp.un.reserved[0] = 12;
|
||||
else if (ptr >= 24 && ptr <= 39)
|
||||
icmp.un.reserved[0] = 16;
|
||||
else
|
||||
return -1;
|
||||
break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
||||
*new_icmp = icmp;
|
||||
update_icmp_checksum(skb, ip6h, &old_icmp6, new_icmp, false);
|
||||
|
||||
/* FIXME: also need to rewrite IP header embedded in ICMP error */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int nat64_handle_v6(struct __sk_buff *skb, struct hdr_cursor *nh)
|
||||
{
|
||||
void *data_end = (void *)(unsigned long long)skb->data_end;
|
||||
void *data = (void *)(unsigned long long)skb->data;
|
||||
|
||||
struct v6_trie_key saddr_key = { .t.prefixlen = 128 };
|
||||
struct in6_addr *dst_v6, subnet_v6 = {};
|
||||
__u32 *allowval, src_v4, dst_v4;
|
||||
int ip_type, ip_offset;
|
||||
struct ipv6hdr *ip6h;
|
||||
int ret = TC_ACT_OK;
|
||||
struct ethhdr *eth;
|
||||
struct iphdr *iph;
|
||||
|
||||
struct v6_addr_state *v6_state;
|
||||
|
||||
struct iphdr dst_hdr = {
|
||||
.version = 4,
|
||||
.ihl = 5,
|
||||
.frag_off = bpf_htons(1<<14), /* set Don't Fragment bit */
|
||||
};
|
||||
|
||||
ip_offset = (nh->pos - data) & 0x1fff;
|
||||
|
||||
ip_type = parse_ip6hdr(nh, data_end, &ip6h);
|
||||
if (ip_type < 0)
|
||||
goto out;
|
||||
|
||||
dst_v6 = &ip6h->daddr;
|
||||
subnet_v6 = *dst_v6;
|
||||
/* v6 pxlen is always 96 */
|
||||
subnet_v6.s6_addr32[3] = 0;
|
||||
if (cmp_v6addr(&subnet_v6, &config.v6_prefix)) {
|
||||
DBG("v6: dst subnet %pI6c not in configured prefix %pI6c\n",
|
||||
&subnet_v6, &config.v6_prefix);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* At this point we know the destination IP is within the configured
|
||||
* subnet, so if we can't rewrite the packet it should be dropped (so as
|
||||
* not to leak traffic in that subnet).
|
||||
*/
|
||||
ret = TC_ACT_SHOT;
|
||||
|
||||
/* drop packets with IP options - parser skips options */
|
||||
if (ip_type != ip6h->nexthdr) {
|
||||
DBG("v6: dropping packet with IP options from %pI6c\n",
|
||||
&ip6h->saddr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* drop a few special addresses */
|
||||
dst_v4 = ip6h->daddr.s6_addr32[3];
|
||||
if (!dst_v4 || /* 0.0.0.0 */
|
||||
(dst_v4 & bpf_htonl(0xFF000000)) == bpf_htonl(0x7F000000) || /* 127.x.x.x */
|
||||
(dst_v4 & bpf_htonl(0xF0000000)) == bpf_htonl(0xe0000000)) { /* multicast */
|
||||
DBG("v6: dropping invalid v4 dst %pI4 from %pI6c\n",
|
||||
&dst_v4, &ip6h->saddr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
saddr_key.addr = ip6h->saddr;
|
||||
allowval = bpf_map_lookup_elem(&allowed_v6_src, &saddr_key);
|
||||
if (!allowval) {
|
||||
DBG("v6: saddr %pI6c not in allowed src\n", &ip6h->saddr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
v6_state = bpf_map_lookup_elem(&v6_state_map, &ip6h->saddr);
|
||||
if (!v6_state) {
|
||||
v6_state = alloc_new_state(&ip6h->saddr);
|
||||
if (!v6_state) {
|
||||
DBG("v6: failed to allocate state for src %pI6c\n",
|
||||
&ip6h->saddr);
|
||||
goto out;
|
||||
}
|
||||
src_v4 = bpf_htonl(v6_state->v4_addr);
|
||||
DBG("v6: created new state for v6 %pI6c -> %pI4\n",
|
||||
&ip6h->saddr, &src_v4);
|
||||
} else {
|
||||
v6_state->last_seen = bpf_ktime_get_ns();
|
||||
bpf_map_update_elem(&v6_state_map, &ip6h->saddr, v6_state, BPF_EXIST);
|
||||
|
||||
src_v4 = bpf_htonl(v6_state->v4_addr);
|
||||
DBG("v6: updated old state for v6 %pI6c -> %pI4\n",
|
||||
&ip6h->saddr, &src_v4);
|
||||
}
|
||||
|
||||
dst_hdr.daddr = dst_v4;
|
||||
dst_hdr.saddr = bpf_htonl(v6_state->v4_addr);
|
||||
dst_hdr.protocol = ip6h->nexthdr;
|
||||
dst_hdr.ttl = ip6h->hop_limit;
|
||||
dst_hdr.tos = ip6h->priority << 4 | (ip6h->flow_lbl[0] >> 4);
|
||||
dst_hdr.tot_len = bpf_htons(bpf_ntohs(ip6h->payload_len) + sizeof(dst_hdr));
|
||||
|
||||
if (dst_hdr.protocol == IPPROTO_ICMPV6) {
|
||||
if (rewrite_icmpv6(ip6h, skb))
|
||||
goto out;
|
||||
dst_hdr.protocol = IPPROTO_ICMP;
|
||||
}
|
||||
|
||||
dst_hdr.check = csum_fold_helper(bpf_csum_diff((__be32 *)&dst_hdr, 0,
|
||||
(__be32 *)&dst_hdr, sizeof(dst_hdr),
|
||||
0));
|
||||
|
||||
if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
|
||||
goto out;
|
||||
|
||||
data = (void *)(unsigned long long)skb->data;
|
||||
data_end = (void *)(unsigned long long)skb->data_end;
|
||||
|
||||
eth = data;
|
||||
iph = data + ip_offset;
|
||||
if (eth + 1 > data_end || iph + 1 > data_end)
|
||||
goto out;
|
||||
|
||||
eth->h_proto = bpf_htons(ETH_P_IP);
|
||||
*iph = dst_hdr;
|
||||
|
||||
ret = bpf_redirect(skb->ifindex, BPF_F_INGRESS);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Common entry point for both directions.
 *
 * @skb:    packet being processed
 * @egress: true when called from the egress program, false from ingress
 *
 * IPv4 packets on egress are passed to nat64_handle_v4() and IPv6 packets on
 * ingress to nat64_handle_v6(). Everything else, including packets whose
 * Ethernet header fails to parse, is let through with TC_ACT_OK.
 */
static int nat64_handler(struct __sk_buff *skb, bool egress)
{
	void *data_end = (void *)(unsigned long long)skb->data_end;
	void *data = (void *)(unsigned long long)skb->data;
	struct hdr_cursor nh = { .pos = data };
	struct ethhdr *eth;
	int eth_type;

	/* Parse Ethernet and IP/IPv6 headers */
	eth_type = parse_ethhdr(&nh, data_end, &eth);
	if (eth_type == bpf_htons(ETH_P_IP) && egress)
		return nat64_handle_v4(skb, &nh);
	else if (eth_type == bpf_htons(ETH_P_IPV6) && !egress)
		return nat64_handle_v6(skb, &nh);

	return TC_ACT_OK;
}
|
||||
/* Classifier program for the egress direction: runs nat64_handler() with
 * egress set to true and returns its verdict.
 */
SEC("classifier")
int nat64_egress(struct __sk_buff *skb)
{
	const bool is_egress = true;

	return nat64_handler(skb, is_egress);
}
|
||||
|
||||
/* Classifier program for the ingress direction: runs nat64_handler() with
 * egress set to false and returns its verdict.
 */
SEC("classifier")
int nat64_ingress(struct __sk_buff *skb)
{
	const bool is_egress = false;

	return nat64_handler(skb, is_egress);
}
|
Reference in New Issue
Block a user