summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorJulian Anastasov <ja@ssi.bg>2026-03-03 23:04:05 +0200
committerFlorian Westphal <fw@strlen.de>2026-03-04 11:45:45 +0100
commitb655388111cf7e43f70e49db64bdaa42bcb8a038 (patch)
treea50a74e2d242fcd8e94617386d7f8313f425f564 /include
parent1ac252ad036cdb18f5fb7f76bb6061adfed9cedf (diff)
downloadlinux-b655388111cf7e43f70e49db64bdaa42bcb8a038.tar.gz
linux-b655388111cf7e43f70e49db64bdaa42bcb8a038.zip
ipvs: add resizable hash tables
Add infrastructure for resizable hash tables based on hlist_bl, which we
will use in followup patches. The tables allow RCU lookups during
resizing; bucket modifications are protected with a per-bucket bit lock
and additional custom locking, and the tables are resized when the load
reaches thresholds determined by a load-factor parameter.

Compared to other implementations we rely on:

* fast entry removal by using node unlinking without pre-lookup
* entry rehashing when the hash key changes
* entries can contain multiple hash nodes
* custom locking depending on different contexts
* adjustable load factor to customize the grow/shrink process

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
Diffstat (limited to 'include')
-rw-r--r--include/net/ip_vs.h198
1 file changed, 198 insertions, 0 deletions
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ad8a16146ac5..c373fbdd2d0f 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -11,6 +11,7 @@
#include <asm/types.h> /* for __uXX types */
#include <linux/list.h> /* for struct list_head */
+#include <linux/rculist_bl.h> /* for struct hlist_bl_head */
#include <linux/spinlock.h> /* for struct rwlock_t */
#include <linux/atomic.h> /* for struct atomic_t */
#include <linux/refcount.h> /* for struct refcount_t */
@@ -30,6 +31,7 @@
#endif
#include <net/net_namespace.h> /* Netw namespace */
#include <linux/sched/isolation.h>
+#include <linux/siphash.h>
#define IP_VS_HDR_INVERSE 1
#define IP_VS_HDR_ICMP 2
@@ -271,6 +273,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
pr_err(msg, ##__VA_ARGS__); \
} while (0)
+/* Spinlock padded out to its own cache line (____cacheline_aligned_in_smp)
+ * so that an array of per-bucket locks does not false-share between CPUs.
+ */
+struct ip_vs_aligned_lock {
+ spinlock_t l; /* Protect buckets */
+} ____cacheline_aligned_in_smp;
+
/* For arrays per family */
enum {
IP_VS_AF_INET,
@@ -484,6 +490,198 @@ struct ip_vs_est_kt_data {
int est_row; /* estimated row */
};
+/* IPVS resizable hash tables */
+/* One table instance. While not resizing, new_tbl points back to the table
+ * itself; during a resize it points to the replacement table. The table-walk
+ * macros below rely on this self-pointer to terminate.
+ */
+struct ip_vs_rht {
+ struct hlist_bl_head *buckets;
+ struct ip_vs_rht __rcu *new_tbl; /* New/Same table */
+ seqcount_t *seqc; /* Protects moves */
+ struct ip_vs_aligned_lock *lock; /* Protect seqc */
+ int mask; /* Buckets mask */
+ int size; /* Buckets */
+ int seqc_mask; /* seqc mask */
+ int lock_mask; /* lock mask */
+ u32 table_id; /* Flag in bit 31, see IP_VS_RHT_TABLE_ID_MASK */
+ int u_thresh; /* upper threshold */
+ int l_thresh; /* lower threshold */
+ int lfactor; /* Load Factor (shift)*/
+ int bits; /* size = 1 << bits */
+ siphash_key_t hash_key; /* secret key for siphash-based hashing */
+ struct rcu_head rcu_head; /* delayed free, see ip_vs_rht_rcu_free() */
+};
+
+/**
+ * ip_vs_rht_for_each_table() - Walk the hash tables
+ * @table: struct ip_vs_rht __rcu *table
+ * @t: current table, used as cursor, struct ip_vs_rht *var
+ * @p: previous table, temp struct ip_vs_rht *var
+ *
+ * Walk tables assuming others can not change the installed tables
+ * (rcu_dereference_protected() with condition 1: caller guarantees
+ * exclusion). The loop terminates when following new_tbl yields the
+ * same table again, i.e. when t == p.
+ */
+#define ip_vs_rht_for_each_table(table, t, p) \
+ for (p = NULL, t = rcu_dereference_protected(table, 1); \
+ t != p; \
+ p = t, t = rcu_dereference_protected(t->new_tbl, 1))
+
+/**
+ * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock
+ * @table: struct ip_vs_rht __rcu *table
+ * @t: current table, used as cursor, struct ip_vs_rht *var
+ * @p: previous table, temp struct ip_vs_rht *var
+ *
+ * We usually search in one table and also in second table on resizing
+ * (new_tbl differs from the current table only while a resize is active).
+ */
+#define ip_vs_rht_for_each_table_rcu(table, t, p) \
+ for (p = NULL, t = rcu_dereference(table); \
+ t != p; \
+ p = t, t = rcu_dereference(t->new_tbl))
+
/**
 * ip_vs_rht_for_each_bucket() - Walk all table buckets
 * @t: current table, used as cursor, struct ip_vs_rht *var
 * @bucket: bucket index, used as cursor, u32 var
 * @head: bucket address, used as cursor, struct hlist_bl_head *var
 *
 * Visits every bucket of @t in index order; @bucket and @head advance
 * together so @head always points at (t)->buckets[bucket].
 */
#define ip_vs_rht_for_each_bucket(t, bucket, head)		\
	for (bucket = 0, head = (t)->buckets;			\
	     bucket < (t)->size; bucket++, head++)
+
+/**
+ * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved
+ * @t: current table, used as cursor, struct ip_vs_rht *var
+ * @bucket: index of current bucket or hash key
+ * @sc: temp seqcount_t *var
+ * @seq: temp unsigned int var for sequence count
+ * @retry: temp int var
+ *
+ * The ({ seq = read_seqcount_begin(sc); 1; }) statement expression is
+ * always true, so every pass opens a fresh seqcount read section; after
+ * the body runs, read_seqcount_retry() makes the loop repeat only when
+ * a writer changed the bucket's seqcount in the meantime.
+ */
+#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \
+ for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \
+ retry && ({ seq = read_seqcount_begin(sc); 1; }); \
+ retry = read_seqcount_retry(sc, seq))
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_buckets_rcu
+ * (must appear in the same scope as the walk macro, which references
+ * the underscore-prefixed locals declared here)
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \
+ struct ip_vs_rht *_t, *_p; \
+ unsigned int _seq; \
+ seqcount_t *_sc; \
+ u32 _bucket; \
+ int _retry
+/**
+ * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock
+ * @table: struct ip_vs_rht __rcu *table
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * Nests the table walk, the bucket walk and the seqcount retry loop;
+ * the caller's statement becomes the innermost loop body.
+ * Can be used while others add/delete/move entries
+ * Not suitable if duplicates are not desired
+ * Possible cases for reader that uses cond_resched_rcu() in the loop:
+ * - new table can not be installed, no need to repeat
+ * - new table can be installed => check and repeat if new table is
+ * installed, needed for !PREEMPT_RCU
+ */
+#define ip_vs_rht_walk_buckets_rcu(table, head) \
+ ip_vs_rht_for_each_table_rcu(table, _t, _p) \
+ ip_vs_rht_for_each_bucket(_t, _bucket, head) \
+ ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \
+ _seq, _retry)
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_bucket_rcu
+ * (must appear in the same scope as the walk macro)
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \
+ unsigned int _seq; \
+ seqcount_t *_sc; \
+ int _retry
+/**
+ * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock
+ * @t: current table, struct ip_vs_rht *var
+ * @bucket: index of current bucket or hash key
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * The "if (({ ...; 0; })) {} else" idiom evaluates the @head assignment
+ * exactly once and always falls through to the else branch, so the macro
+ * can prefix a single statement like an ordinary for/if construct.
+ * Can be used while others add/delete/move entries
+ * Not suitable if duplicates are not desired
+ * Possible cases for reader that uses cond_resched_rcu() in the loop:
+ * - new table can not be installed, no need to repeat
+ * - new table can be installed => check and repeat if new table is
+ * installed, needed for !PREEMPT_RCU
+ */
+#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \
+ if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \
+ {} \
+ else \
+ ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry)
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_buckets_safe_rcu
+ * (must appear in the same scope as the walk macro)
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \
+ struct ip_vs_rht *_t, *_p; \
+ u32 _bucket
+/**
+ * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock
+ * @table: struct ip_vs_rht __rcu *table
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * Like ip_vs_rht_walk_buckets_rcu() but without the seqcount retry loop.
+ * Can be used while others add/delete entries but moving is disabled
+ * Using cond_resched_rcu() should be safe if tables do not change
+ */
+#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \
+ ip_vs_rht_for_each_table_rcu(table, _t, _p) \
+ ip_vs_rht_for_each_bucket(_t, _bucket, head)
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_buckets
+ * (must appear in the same scope as the walk macro)
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \
+ struct ip_vs_rht *_t, *_p; \
+ u32 _bucket
+
+/**
+ * ip_vs_rht_walk_buckets() - Walk all buckets
+ * @table: struct ip_vs_rht __rcu *table
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * Lockless variant (rcu_dereference_protected with condition 1).
+ * Use if others can not add/delete/move entries
+ */
+#define ip_vs_rht_walk_buckets(table, head) \
+ ip_vs_rht_for_each_table(table, _t, _p) \
+ ip_vs_rht_for_each_bucket(_t, _bucket, head)
+
+/* Entries can be in one of two tables, so we flip a bit when a new table
+ * is created and store it as the highest bit in hash keys
+ */
+#define IP_VS_RHT_TABLE_ID_MASK BIT(31)
+
+/* Check if hash key is from this table */
+static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key)
+{
+ return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK);
+}
+
+/* Build per-table hash key from hash value */
+static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash)
+{
+ return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK);
+}
+
+/* Free a table and its arrays immediately */
+void ip_vs_rht_free(struct ip_vs_rht *t);
+/* RCU callback; presumably resolves @head to its table and frees it -
+ * confirm against the .c implementation
+ */
+void ip_vs_rht_rcu_free(struct rcu_head *head);
+/* Allocate a table with the given number of buckets, seqcounts and locks */
+struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks);
+/* Compute desired table size for @n entries within [min_bits, max_bits] */
+int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
+ int lfactor, int min_bits, int max_bits);
+/* Set u_thresh/l_thresh grow/shrink thresholds from size and load factor */
+void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
+ int min_bits, int max_bits);
+/* Hash address/values into a table hash; NOTE(review): exact key layout
+ * is defined in the .c file
+ */
+u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
+ const union nf_inet_addr *addr, u32 v1, u32 v2);
+
struct dst_entry;
struct iphdr;
struct ip_vs_conn;