diff options
| author | Julian Anastasov <ja@ssi.bg> | 2026-03-03 23:04:05 +0200 |
|---|---|---|
| committer | Florian Westphal <fw@strlen.de> | 2026-03-04 11:45:45 +0100 |
| commit | b655388111cf7e43f70e49db64bdaa42bcb8a038 (patch) | |
| tree | a50a74e2d242fcd8e94617386d7f8313f425f564 /include | |
| parent | 1ac252ad036cdb18f5fb7f76bb6061adfed9cedf (diff) | |
| download | linux-b655388111cf7e43f70e49db64bdaa42bcb8a038.tar.gz linux-b655388111cf7e43f70e49db64bdaa42bcb8a038.zip | |
ipvs: add resizable hash tables
Add infrastructure for resizable hash tables based on hlist_bl
which we will use in followup patches.
The tables allow RCU lookups during resizing, bucket modifications
are protected with per-bucket bit lock and additional custom locking,
the tables are resized when load reaches thresholds determined based
on load factor parameter.
Compared to other implementations we rely on:
* fast entry removal by using node unlinking without pre-lookup
* entry rehashing when hash key changes
* entries can contain multiple hash nodes
* custom locking depending on different contexts
* adjustable load factor to customize the grow/shrink process
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
Diffstat (limited to 'include')
| -rw-r--r-- | include/net/ip_vs.h | 198 |
1 files changed, 198 insertions, 0 deletions
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ad8a16146ac5..c373fbdd2d0f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -11,6 +11,7 @@ #include <asm/types.h> /* for __uXX types */ #include <linux/list.h> /* for struct list_head */ +#include <linux/rculist_bl.h> /* for struct hlist_bl_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ #include <linux/atomic.h> /* for struct atomic_t */ #include <linux/refcount.h> /* for struct refcount_t */ @@ -30,6 +31,7 @@ #endif #include <net/net_namespace.h> /* Netw namespace */ #include <linux/sched/isolation.h> +#include <linux/siphash.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -271,6 +273,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +struct ip_vs_aligned_lock { + spinlock_t l; /* Protect buckets */ +} ____cacheline_aligned_in_smp; + /* For arrays per family */ enum { IP_VS_AF_INET, @@ -484,6 +490,198 @@ struct ip_vs_est_kt_data { int est_row; /* estimated row */ }; +/* IPVS resizable hash tables */ +struct ip_vs_rht { + struct hlist_bl_head *buckets; + struct ip_vs_rht __rcu *new_tbl; /* New/Same table */ + seqcount_t *seqc; /* Protects moves */ + struct ip_vs_aligned_lock *lock; /* Protect seqc */ + int mask; /* Buckets mask */ + int size; /* Buckets */ + int seqc_mask; /* seqc mask */ + int lock_mask; /* lock mask */ + u32 table_id; + int u_thresh; /* upper threshold */ + int l_thresh; /* lower threshold */ + int lfactor; /* Load Factor (shift)*/ + int bits; /* size = 1 << bits */ + siphash_key_t hash_key; + struct rcu_head rcu_head; +}; + +/** + * ip_vs_rht_for_each_table() - Walk the hash tables + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * Walk tables assuming others can not change the installed tables + */ +#define ip_vs_rht_for_each_table(table, t, p) \ + for (p = NULL, t = rcu_dereference_protected(table, 1); \ + t != p; \ + p = t, t = rcu_dereference_protected(t->new_tbl, 1)) + +/** + * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * We usually search in one table and also in second table on resizing + */ +#define ip_vs_rht_for_each_table_rcu(table, t, p) \ + for (p = NULL, t = rcu_dereference(table); \ + t != p; \ + p = t, t = rcu_dereference(t->new_tbl)) + +/** + * ip_vs_rht_for_each_bucket() - Walk all table buckets + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: bucket index, used as cursor, u32 var + * @head: bucket address, used as cursor, struct hlist_bl_head *var + */ +#define ip_vs_rht_for_each_bucket(t, bucket, head) \ + for (bucket = 0, head = (t)->buckets; \ + bucket < t->size; bucket++, head++) + +/** + * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @sc: temp seqcount_t *var + * @seq: temp unsigned int var for sequence count + * @retry: temp int var + */ +#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \ + for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \ + retry && ({ seq = read_seqcount_begin(sc); 1; }); \ + retry = read_seqcount_retry(sc, seq)) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \ + struct ip_vs_rht *_t, *_p; \ + unsigned int _seq; \ + seqcount_t *_sc; \ + u32 _bucket; \ + int _retry +/** + * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_buckets_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) \ + ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \ + _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_bucket_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \ + unsigned int _seq; \ + seqcount_t *_sc; \ + int _retry +/** + * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock + * @t: current table, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \ + if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \ + {} \ + else \ + ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_safe_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket +/** + * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete entries but moving is disabled + * Using cond_resched_rcu() should be safe if tables do not change + */ +#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket + +/** + * ip_vs_rht_walk_buckets() - Walk all buckets + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Use if others can not add/delete/move entries + */ +#define ip_vs_rht_walk_buckets(table, head) \ + ip_vs_rht_for_each_table(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/* Entries can be in one of two tables, so we flip bit when new table is + * created and store it as highest bit in hash keys + */ +#define IP_VS_RHT_TABLE_ID_MASK BIT(31) + +/* Check if hash key is from this table */ +static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key) +{ + return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK); +} + +/* Build per-table hash key from hash value */ +static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash) +{ + return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK); +} + +void ip_vs_rht_free(struct ip_vs_rht *t); +void ip_vs_rht_rcu_free(struct rcu_head *head); +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks); +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits); +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits); +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2); + struct dst_entry; struct iphdr; struct ip_vs_conn; |
