From 794a4eefa11f3166404d91edddd0f4f19458f652 Mon Sep 17 00:00:00 2001
From: Maria Matejka
Date: Thu, 11 Nov 2021 16:25:59 +0100
Subject: [PATCH 1/2] Keeping un-unmappable pages until they can be reused

On Linux, munmap() may fail with ENOMEM when virtual memory is too
fragmented. Work around this by keeping such blocks for future use.
---
 lib/locking.h       |  1 +
 lib/resource.c      | 23 +++++++++++++++++------
 sysdep/unix/alloc.c | 43 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/lib/locking.h b/lib/locking.h
index 0a69f50f..1a8bdcd4 100644
--- a/lib/locking.h
+++ b/lib/locking.h
@@ -19,6 +19,7 @@ struct lock_order {
   struct domain_generic *attrs;
   struct domain_generic *cork;
   struct domain_generic *event;
+  struct domain_generic *resource;
 };
 
 extern _Thread_local struct lock_order locking_stack;
diff --git a/lib/resource.c b/lib/resource.c
index e80b315b..2d041ad5 100644
--- a/lib/resource.c
+++ b/lib/resource.c
@@ -60,7 +60,7 @@ static struct resclass pool_class = {
 
 pool root_pool;
 
 void *alloc_sys_page(void);
-void free_sys_page(void *);
+int free_sys_page(void *);
 
 static int indent;
@@ -98,8 +98,10 @@ pool_free(resource *P)
   if (p->pages)
   {
     ASSERT_DIE(!p->pages->used);
-    for (uint i=0; i<p->pages->free; i++)
+
+    for (uint i = 0; i < p->pages->free; i++)
       free_sys_page(p->pages->ptr[i]);
+
     free_sys_page(p->pages);
   }
 }
@@ -476,10 +478,19 @@ free_page(pool *p, void *ptr)
   ASSERT_DIE(p->pages);
   p->pages->used--;
 
-  if (p->pages->free >= POOL_PAGES_MAX)
-    return free_sys_page(ptr);
-  else
-    p->pages->ptr[p->pages->free++] = ptr;
+  ASSERT_DIE(p->pages->free <= POOL_PAGES_MAX);
+
+  if (p->pages->free == POOL_PAGES_MAX)
+  {
+    const unsigned long keep = POOL_PAGES_MAX / 4;
+
+    for (uint i = keep; i < p->pages->free; i++)
+      free_sys_page(p->pages->ptr[i]);
+
+    p->pages->free = keep;
+  }
+
+  p->pages->ptr[p->pages->free++] = ptr;
 }
diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c
index 4c9d5eb5..4ae1a9db 100644
--- a/sysdep/unix/alloc.c
+++ b/sysdep/unix/alloc.c
@@ -11,6 +11,8 @@
 
 #include <stdlib.h>
 #include <unistd.h>
+#include <errno.h>
+#include <stdatomic.h>
 
 #ifdef HAVE_MMAP
 #include <sys/mman.h>
@@ -19,6 +21,13 @@
 long page_size = 0;
 _Bool alloc_multipage = 0;
 
+static _Atomic int global_page_list_not_empty;
+static list global_page_list;
+static _Atomic int global_page_spinlock;
+
+#define GLOBAL_PAGE_SPIN_LOCK for (int v = 0; !atomic_compare_exchange_weak_explicit(&global_page_spinlock, &v, 1, memory_order_acq_rel, memory_order_acquire); v = 0)
+#define GLOBAL_PAGE_SPIN_UNLOCK do { int v = 1; ASSERT_DIE(atomic_compare_exchange_strong_explicit(&global_page_spinlock, &v, 0, memory_order_acq_rel, memory_order_acquire)); } while (0)
+
 #ifdef HAVE_MMAP
 static _Bool use_fake = 0;
 #else
@@ -28,12 +37,14 @@ static _Bool use_fake = 1;
 void resource_sys_init(void)
 {
 #ifdef HAVE_MMAP
+  init_list(&global_page_list);
+
   if (!(page_size = sysconf(_SC_PAGESIZE)))
     die("System page size must be non-zero");
 
   if ((u64_popcount(page_size) > 1) || (page_size > 16384))
-  {
 #endif
+  {
     /* Too big or strange page, use the aligned allocator instead */
     page_size = 4096;
     use_fake = 1;
@@ -46,6 +57,22 @@ alloc_sys_page(void)
 {
 #ifdef HAVE_MMAP
   if (!use_fake)
   {
+    if (atomic_load_explicit(&global_page_list_not_empty, memory_order_relaxed))
+    {
+      GLOBAL_PAGE_SPIN_LOCK;
+      if (!EMPTY_LIST(global_page_list))
+      {
+        node *ret = HEAD(global_page_list);
+        rem_node(ret);
+        if (EMPTY_LIST(global_page_list))
+          atomic_store_explicit(&global_page_list_not_empty, 0, memory_order_relaxed);
+        GLOBAL_PAGE_SPIN_UNLOCK;
+        memset(ret, 0, sizeof(node));
+        return (void *) ret;
+      }
+      GLOBAL_PAGE_SPIN_UNLOCK;
+    }
+
     if (alloc_multipage)
     {
       void *big = mmap(NULL, page_size * 2, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
@@ -90,7 +117,19 @@ free_sys_page(void *ptr)
   if (!use_fake)
   {
     if (munmap(ptr, page_size) < 0)
-      bug("munmap(%p) failed: %m", ptr);
+#ifdef ENOMEM
+      if (errno == ENOMEM)
+      {
+        memset(ptr, 0, page_size);
+
+        GLOBAL_PAGE_SPIN_LOCK;
+        add_tail(&global_page_list, (node *) ptr);
+        atomic_store_explicit(&global_page_list_not_empty, 1, memory_order_relaxed);
+        GLOBAL_PAGE_SPIN_UNLOCK;
+      }
+      else
+#endif
+        bug("munmap(%p) failed: %m", ptr);
   }
   else
 #endif
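
As a side note, the core mechanism of the patch above is easy to model
outside the BIRD tree. The sketch below is illustrative only: it uses plain
C11 atomics and an intrusive singly-linked free list in place of BIRD's
list/node machinery, the names (xnode, page_lock, page_alloc, page_free) are
invented for this example, and the patch's lock-free
global_page_list_not_empty fast path is omitted, so every call takes the
spinlock.

/* Standalone sketch of the keep-on-ENOMEM page cache. Illustration only:
 * xnode, page_lock/page_unlock, page_alloc and page_free are invented
 * names, not BIRD API.  Build: cc -std=c11 sketch.c */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

struct xnode { struct xnode *next; };   /* intrusive link stored in the page itself */

static struct xnode *kept_pages;        /* pages munmap() refused to unmap */
static atomic_int page_spinlock;        /* 0 = unlocked, 1 = locked */
static long page_size;

static void page_lock(void)
{
  /* Same CAS loop as GLOBAL_PAGE_SPIN_LOCK: retry until 0 -> 1 succeeds. */
  int v = 0;
  while (!atomic_compare_exchange_weak_explicit(&page_spinlock, &v, 1,
             memory_order_acq_rel, memory_order_acquire))
    v = 0;                              /* a failed CAS overwrites v; reset it */
}

static void page_unlock(void)
{
  atomic_store_explicit(&page_spinlock, 0, memory_order_release);
}

static void *page_alloc(void)
{
  page_lock();
  struct xnode *n = kept_pages;         /* prefer a kept page if there is one */
  if (n)
    kept_pages = n->next;
  page_unlock();

  if (n)
  {
    memset(n, 0, sizeof *n);            /* wipe only the link, as the patch does */
    return n;
  }

  void *p = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  return (p == MAP_FAILED) ? NULL : p;
}

static void page_free(void *ptr)
{
  if (munmap(ptr, page_size) == 0)
    return;

  if (errno != ENOMEM)                  /* only fragmentation is tolerated */
    abort();

  /* The kernel refused to unmap: keep the page for the next page_alloc(). */
  page_lock();
  struct xnode *n = ptr;
  n->next = kept_pages;
  kept_pages = n;
  page_unlock();
}

int main(void)
{
  page_size = sysconf(_SC_PAGESIZE);
  void *p = page_alloc();
  page_free(p);
  printf("allocated and freed one %ld-byte page\n", page_size);
  return 0;
}

The keep-list is only reached when munmap() reports ENOMEM, which on Linux
happens when unmapping would split a mapping while the process is at its
mapping limit; the kept page is handed out again on the next allocation, so
the address space fragment is reused rather than leaked.
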
From dc160e11e1a9e4344bbee6fd0bc8aee229d7c540 Mon Sep 17 00:00:00 2001
From: Maria Matejka
Date: Fri, 12 Nov 2021 15:53:33 +0000
Subject: [PATCH 2/2] Route table import-to-export announcement indirection to
 reduce pipe traffic

---
 nest/proto.c    |  3 +++
 nest/route.h    |  7 ++++---
 nest/rt-table.c | 53 ++++++++++++++++++++++++++++++++++---------------
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/nest/proto.c b/nest/proto.c
index 623585f1..b7dbae5e 100644
--- a/nest/proto.c
+++ b/nest/proto.c
@@ -468,6 +468,7 @@ channel_start_import(struct channel *c)
 
   c->in_req = (struct rt_import_request) {
     .name = rn,
+    .list = proto_work_list(c->proto),
     .trace_routes = c->debug | c->proto->debug,
     .dump_req = channel_dump_import_req,
     .log_state_change = channel_import_log_state_change,
@@ -886,6 +887,7 @@ channel_setup_in_table(struct channel *c, int best)
   c->in_table = &cat->cat;
   c->in_table->push = (struct rt_import_request) {
     .name = cat->name,
+    .list = proto_work_list(c->proto),
     .trace_routes = c->debug | c->proto->debug,
     .dump_req = channel_in_push_dump_req,
     .log_state_change = channel_push_log_state_change,
@@ -928,6 +930,7 @@ channel_setup_out_table(struct channel *c)
   c->out_table = &cat->cat;
   c->out_table->push = (struct rt_import_request) {
     .name = cat->name,
+    .list = proto_work_list(c->proto),
     .trace_routes = c->debug | c->proto->debug,
     .dump_req = channel_out_push_dump_req,
     .log_state_change = channel_push_log_state_change,
diff --git a/nest/route.h b/nest/route.h
index 9417d97d..3f8bf433 100644
--- a/nest/route.h
+++ b/nest/route.h
@@ -170,6 +170,7 @@ typedef struct rtable_private {
   struct hmap id_map;
   struct hostcache *hostcache;
   struct event *prune_event;            /* Event to prune abandoned routes */
+  struct event *announce_event;         /* Event to announce pending exports */
   struct event *ec_event;               /* Event to prune finished exports */
   struct event *hcu_event;              /* Event to update host cache */
   struct event *delete_event;           /* Event to delete the table */
@@ -191,8 +192,6 @@ typedef struct rtable_private {
   struct timer *settle_timer;           /* Settle time for notifications */
 
   list pending_exports;                 /* List of packed struct rt_pending_export */
-  btime base_export_time;               /* When first pending export was announced */
-  struct timer *export_timer;
 
   struct rt_pending_export *first_export;  /* First export to announce */
   u64 next_export_seq;                  /* The next export will have this ID */
@@ -221,7 +220,6 @@ struct rtable_config {
   byte sorted;                          /* Routes of network are sorted according to rte_better() */
   btime min_settle_time;                /* Minimum settle time for notifications */
   btime max_settle_time;                /* Maximum settle time for notifications */
-  btime export_settle_time;             /* Delay before exports are announced */
   uint cork_limit;                      /* Amount of routes to be pending on export to cork imports */
 };
 
@@ -309,6 +307,8 @@ struct rt_import_request {
   char *name;
   u8 trace_routes;
 
+  event_list *list;                     /* Where to schedule import events */
+
   void (*dump_req)(struct rt_import_request *req);
   void (*log_state_change)(struct rt_import_request *req, u8 state);
   /* Preimport is called when the @new route is just-to-be inserted, replacing @old.
@@ -339,6 +339,7 @@ struct rt_import_hook {
   u8 stale_pruned;                      /* Last prune finished when this value was set at stale_valid */
   u8 stale_pruning;                     /* Last prune started when this value was set at stale_valid */
 
+  struct event *export_announce_event;  /* Event to run to announce new exports */
   struct event *stopped;                /* Event to run when import is stopped */
 };
diff --git a/nest/rt-table.c b/nest/rt-table.c
index f304372f..f33b9153 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -1023,8 +1023,6 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_
       rt_notify_hostcache(tab, net);
   }
 
-  rt_schedule_notify(tab);
-
   if (EMPTY_LIST(tab->exports) && EMPTY_LIST(tab->pending_exports))
   {
     /* No export hook and no pending exports to cleanup. We may free the route immediately. */
@@ -1105,10 +1103,7 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_
   {
     ev_cork(&rt_cork);
     tab->cork_active = 1;
-    tm_start_in(tab->export_timer, 0, tab->loop);
   }
-  else if (!tm_active(tab->export_timer))
-    tm_start_in(tab->export_timer, tab->config->export_settle_time, tab->loop);
 }
 
 static struct rt_pending_export *
@@ -1158,11 +1153,13 @@ rt_send_export_event(struct rt_export_hook *hook)
 }
 
 static void
-rt_announce_exports(timer *tm)
+rt_announce_exports(void *data)
 {
-  rtable_private *tab = tm->data;
+  rtable_private *tab = data;
   ASSERT_DIE(birdloop_inside(tab->loop));
 
+  rt_schedule_notify(tab);
+
   struct rt_export_hook *c; node *n;
   WALK_LIST2(c, n, tab->exports, n)
   {
@@ -1173,6 +1170,26 @@
   }
 }
 
+static void
+rt_import_announce_exports(void *data)
+{
+  struct rt_import_hook *hook = data;
+  RT_LOCKED(hook->table, tab)
+  {
+    if (hook->import_state == TIS_CLEARED)
+    {
+      rfree(hook->export_announce_event);
+
+      ev_send(hook->stopped->list, hook->stopped);
+      rem_node(&hook->n);
+      mb_free(hook);
+      rt_unlock_table(tab);
+    }
+    else
+      ev_send_loop(tab->loop, tab->announce_event);
+  }
+}
+
 static struct rt_pending_export *
 rt_last_export(rtable_private *tab)
 {
@@ -1471,6 +1488,8 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte *
   rte_announce(table, net, new_stored, old_stored,
       net->routes, old_best_stored);
 
+  ev_send(req->list, c->export_announce_event);
+
   if (!net->routes &&
       (table->gc_counter++ >= table->config->gc_max_ops) &&
       (table->gc_time + table->config->gc_min_time <= current_time()))
@@ -1709,6 +1728,8 @@ rt_request_import(rtable *t, struct rt_import_request *req)
   hook->req = req;
   hook->table = t;
 
+  hook->export_announce_event = ev_new_init(tab->rp, rt_import_announce_exports, hook);
+
   if (!hook->stale_set)
     hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 1;
@@ -1727,11 +1748,12 @@ rt_stop_import(struct rt_import_request *req, event *stopped)
   struct rt_import_hook *hook = req->hook;
 
   RT_LOCK(hook->table);
+  rt_schedule_prune(RT_PRIV(hook->table));
   rt_set_import_state(hook, TIS_STOP);
-
   hook->stopped = stopped;
+
   RT_UNLOCK(hook->table);
 }
@@ -2158,6 +2180,7 @@ rt_setup(pool *pp, struct rtable_config *cf)
 
   t->loop = birdloop_new(p, DOMAIN_ORDER(rtable), nb);
 
+  t->announce_event = ev_new_init(p, rt_announce_exports, t);
   t->ec_event = ev_new_init(p, rt_export_cleanup, t);
   t->prune_event = ev_new_init(p, rt_prune_table, t);
   t->hcu_event = ev_new_init(p, rt_update_hostcache, t);
@@ -2166,7 +2189,6 @@ rt_setup(pool *pp, struct rtable_config *cf)
   t->nhu_event->cork = &rt_cork;
   t->prune_event->cork = &rt_cork;
 
-  t->export_timer = tm_new_init(p, rt_announce_exports, t, 0, 0);
   t->last_rt_change = t->gc_time = current_time();
   t->next_export_seq = 1;
@@ -2472,15 +2494,11 @@ done:;
   if (!first_export || (first_export->seq >= ih->flush_seq))
   {
     ih->import_state = TIS_CLEARED;
-    ev_send(ih->stopped->list, ih->stopped);
-    rem_node(&ih->n);
-    mb_free(ih);
-    rt_unlock_table(tab);
+    ev_send(ih->req->list, ih->export_announce_event);
   }
-
-  if (EMPTY_LIST(tab->pending_exports) && tm_active(tab->export_timer))
-    tm_stop(tab->export_timer);
+  if (EMPTY_LIST(tab->pending_exports) && ev_active(tab->announce_event))
+    ev_postpone(tab->announce_event);
 
   /* If reduced to at most one export block pending */
   if (tab->cork_active &&
@@ -2753,6 +2771,8 @@ rt_next_hop_update(void *data)
   if (atomic_fetch_and_explicit(&tab->nhu_state, NHU_SCHEDULED, memory_order_acq_rel) != NHU_RUNNING)
     ev_send_loop(tab->loop, tab->nhu_event);
 
+  ev_send_loop(tab->loop, tab->announce_event);
+
   rt_unlock_table(tab);
 }
@@ -2809,6 +2829,7 @@ rt_loop_stopped(void *data)
   r->loop = NULL;
   r->prune_event->list = r->ec_event->list = NULL;
   r->nhu_event->list = r->hcu_event->list = NULL;
+  r->announce_event->list = NULL;
   ev_send(r->delete_event->list, r->delete_event);
 }
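
A note on the shape of the second patch: the export settle timer is gone;
instead every import hook owns a small event (export_announce_event) that
runs in the protocol's own work list and merely re-posts the table's shared
announce_event into the table loop. The toy model below, with invented
queue/event types and names that are not BIRD's event API, shows why this
naturally coalesces many route updates into one announcement pass.

/* Toy model of the announcement indirection. Illustration only: every
 * type and name below is invented, not BIRD API.  Each import owns a
 * tiny "kick" event whose only job is to re-post the table's shared
 * announce event, so any number of route updates collapses into a
 * single announce pass.  Build: cc -std=c11 model.c */
#include <stdbool.h>
#include <stdio.h>

struct event { void (*hook)(void *); void *data; bool queued; };

#define QLEN 16
struct queue { struct event *ev[QLEN]; int n; };

static void queue_push(struct queue *q, struct event *e)
{
  if (e->queued)
    return;                       /* coalesce: at most one copy in the queue */
  e->queued = true;
  q->ev[q->n++] = e;
}

static void queue_run(struct queue *q)
{
  for (int i = 0; i < q->n; i++)
  {
    q->ev[i]->queued = false;
    q->ev[i]->hook(q->ev[i]->data);
  }
  q->n = 0;
}

struct table { struct queue loop; struct event announce; int passes; };
struct import { struct table *tab; struct queue *work; struct event kick; };

static void table_announce(void *data)
{
  struct table *t = data;
  t->passes++;                    /* stands in for rt_announce_exports() */
}

static void import_kick(void *data)
{
  /* The per-import event only re-posts the per-table announce event,
     mirroring the happy path of rt_import_announce_exports(). */
  struct import *i = data;
  queue_push(&i->tab->loop, &i->tab->announce);
}

static void route_updated(struct import *i)
{
  queue_push(i->work, &i->kick);  /* done once per route update */
}

int main(void)
{
  struct table t = { .announce = { table_announce, &t, false } };
  struct queue work = { .n = 0 };
  struct import a = { &t, &work, { import_kick, &a, false } };
  struct import b = { &t, &work, { import_kick, &b, false } };

  route_updated(&a);              /* three updates from two imports... */
  route_updated(&a);
  route_updated(&b);

  queue_run(&work);               /* kicks run in the protocols' work lists */
  queue_run(&t.loop);             /* ...and yield exactly one announce pass */
  printf("announce passes: %d\n", t.passes);
  return 0;
}

In the patch itself the per-import event also covers the TIS_CLEARED
teardown path (sending the stopped event and freeing the hook), and the
table's announce event additionally drives rt_schedule_notify() and kicks
the export hooks.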