diff options
| author | Julian Anastasov <ja@ssi.bg> | 2026-03-03 23:04:07 +0200 |
|---|---|---|
| committer | Florian Westphal <fw@strlen.de> | 2026-03-04 11:45:45 +0100 |
| commit | 2fa7cc9c70254d42a82bf82827d8d20cafe975d2 (patch) | |
| tree | 07f14eb227175bc7dd9cf7377a7a166c7abe277e /include | |
| parent | 840aac3d900d09ec8fb8efe41bd7d09f9eb15538 (diff) | |
| download | linux-2fa7cc9c70254d42a82bf82827d8d20cafe975d2.tar.gz linux-2fa7cc9c70254d42a82bf82827d8d20cafe975d2.zip | |
ipvs: switch to per-net connection table
Use per-net resizable hash table for connections. The global table
is slow to walk when using many namespaces.
The table can be resized in the range of [256 - ip_vs_conn_tab_size].
Table is attached only while services are present. Resizing is done
by delayed work based on load (the number of connections).
Add a hash_key field into the connection to store the table ID in
the highest bit and the entry's hash value in the lowest bits. The
lowest part of the hash value is used as bucket ID, the remaining
part is used to filter the entries in the bucket before matching
the keys and as result, helps the lookup operation to access only
one cache line. By knowing the table ID and bucket ID for entry,
we can unlink it without calculating the hash value and doing
lookup by keys. We need only to validate the saved hash_key under
lock.
For better security switch from jhash to siphash for the default
connection hashing but the persistence engines may use their own
function. Keeping the hash table loaded with entries below the
size (12%) allows to avoid collision for 96+% of the conns.
ip_vs_conn_fill_cport() now will rehash the connection with proper
locking because unhash+hash is not safe for RCU readers.
To invalidate the templates setting just dport to 0xffff is enough,
no need to rehash them. As result, ip_vs_conn_unhash() is now
unused and removed.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
Diffstat (limited to 'include')
| -rw-r--r-- | include/net/ip_vs.h | 34 |
1 files changed, 29 insertions, 5 deletions
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 663ad6ad9518..3d595bd99eb3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -36,6 +36,14 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 +/* conn_tab limits (as per Kconfig) */ +#define IP_VS_CONN_TAB_MIN_BITS 8 +#if BITS_PER_LONG > 32 +#define IP_VS_CONN_TAB_MAX_BITS 27 +#else +#define IP_VS_CONN_TAB_MAX_BITS 20 +#endif + /* svc_table limits */ #define IP_VS_SVC_TAB_MIN_BITS 4 #define IP_VS_SVC_TAB_MAX_BITS 20 @@ -289,6 +297,7 @@ static inline int ip_vs_af_index(int af) enum { IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ + IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */ }; /* The port number of FTP service (in network order). */ @@ -779,18 +788,19 @@ struct ip_vs_conn_param { /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_node c_list; /* hashed list heads */ + struct hlist_bl_node c_list; /* node in conn_tab */ + __u32 hash_key; /* Key for the hash table */ /* Protocol, addresses and port numbers */ __be16 cport; __be16 dport; __be16 vport; u16 af; /* address family */ + __u16 protocol; /* Which protocol (TCP/UDP) */ + __u16 daf; /* Address family of the dest */ union nf_inet_addr caddr; /* client address */ union nf_inet_addr vaddr; /* virtual address */ union nf_inet_addr daddr; /* destination address */ volatile __u32 flags; /* status flags */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ struct netns_ipvs *ipvs; /* counter and timer */ @@ -1009,8 +1019,8 @@ struct ip_vs_pe { int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); - u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, - bool inverse); + u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, + struct ip_vs_rht *t, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, @@ -1150,6 +1160,7 @@ struct netns_ipvs { /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ atomic_t no_cport_conns[IP_VS_AF_MAX]; + struct delayed_work conn_resize_work;/* resize conn_tab */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ @@ -1226,6 +1237,7 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_conn_lfactor; int sysctl_svc_lfactor; /* ip_vs_lblc */ @@ -1269,6 +1281,8 @@ struct netns_ipvs { unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ struct ip_vs_rht __rcu *svc_table; /* Services */ + struct ip_vs_rht __rcu *conn_tab; /* Connections */ + atomic_t conn_tab_changes;/* ++ on new table */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1518,6 +1532,12 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map conn_count/u_thresh to t->size */ +static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_conn_lfactor); +} + /* Get load factor to map num_services/u_thresh to t->size * Smaller value decreases u_thresh to reduce collisions but increases * the table size @@ -1603,6 +1623,10 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor); +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, |
