summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorJulian Anastasov <ja@ssi.bg>2026-03-03 23:04:06 +0200
committerFlorian Westphal <fw@strlen.de>2026-03-04 11:45:45 +0100
commit840aac3d900d09ec8fb8efe41bd7d09f9eb15538 (patch)
tree828b1061ed1d7acdab2edb022db2eedcab6a4d5b /include
parentb655388111cf7e43f70e49db64bdaa42bcb8a038 (diff)
downloadlinux-840aac3d900d09ec8fb8efe41bd7d09f9eb15538.tar.gz
linux-840aac3d900d09ec8fb8efe41bd7d09f9eb15538.zip
ipvs: use resizable hash table for services
Make the hash table for services resizable in the bit range of 4-20. Table is attached only while services are present. Resizing is done by delayed work based on load (the number of hashed services). Table grows when load increases 2+ times (above 12.5% with lfactor=-3) and shrinks 8+ times when load decreases 16+ times (below 0.78%). Switch to jhash hashing to reduce the collisions for multiple services. Add a hash_key field into the service to store the table ID in the highest bit and the entry's hash value in the lowest bits. The lowest part of the hash value is used as bucket ID, the remaining part is used to filter the entries in the bucket before matching the keys and as result, helps the lookup operation to access only one cache line. By knowing the table ID and bucket ID for entry, we can unlink it without calculating the hash value and doing lookup by keys. We need only to validate the saved hash_key under lock. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Florian Westphal <fw@strlen.de>
Diffstat (limited to 'include')
-rw-r--r--include/net/ip_vs.h49
1 files changed, 34 insertions, 15 deletions
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index c373fbdd2d0f..663ad6ad9518 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -35,12 +35,10 @@
#define IP_VS_HDR_INVERSE 1
#define IP_VS_HDR_ICMP 2
-/*
- * Hash table: for virtual service lookups
- */
-#define IP_VS_SVC_TAB_BITS 8
-#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS)
-#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* svc_table limits */
+#define IP_VS_SVC_TAB_MIN_BITS 4
+#define IP_VS_SVC_TAB_MAX_BITS 20
/* Generic access of ipvs struct */
static inline struct netns_ipvs *net_ipvs(struct net* net)
@@ -51,8 +49,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net)
/* Connections' size value needed by ip_vs_ctl.c */
extern int ip_vs_conn_tab_size;
-extern struct mutex __ip_vs_mutex;
-
struct ip_vs_iphdr {
int hdr_flags; /* ipvs flags */
__u32 off; /* Where IP or IPv4 header starts */
@@ -289,6 +285,12 @@ static inline int ip_vs_af_index(int af)
return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET;
}
+/* work_flags */
+enum {
+ IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */
+ IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */
+};
+
/* The port number of FTP service (in network order). */
#define FTPPORT cpu_to_be16(21)
#define FTPDATA cpu_to_be16(20)
@@ -889,14 +891,15 @@ struct ip_vs_dest_user_kern {
* forwarding entries.
*/
struct ip_vs_service {
- struct hlist_node s_list; /* node in service table */
- atomic_t refcnt; /* reference counter */
-
+ struct hlist_bl_node s_list; /* node in service table */
+ u32 hash_key; /* Key for the hash table */
u16 af; /* address family */
__u16 protocol; /* which protocol (TCP/UDP) */
+
union nf_inet_addr addr; /* IP address for virtual service */
- __be16 port; /* port number for the service */
__u32 fwmark; /* firewall mark of the service */
+ atomic_t refcnt; /* reference counter */
+ __be16 port; /* port number for the service */
unsigned int flags; /* service status flags */
unsigned int timeout; /* persistent timeout in ticks */
__be32 netmask; /* grouping granularity, mask/plen */
@@ -1155,6 +1158,10 @@ struct netns_ipvs {
struct list_head dest_trash;
spinlock_t dest_trash_lock;
struct timer_list dest_trash_timer; /* expiration timer */
+ struct mutex service_mutex; /* service reconfig */
+ struct rw_semaphore svc_resize_sem; /* svc_table resizing */
+ struct delayed_work svc_resize_work; /* resize svc_table */
+ atomic_t svc_table_changes;/* ++ on new table */
/* Service counters */
atomic_t num_services[IP_VS_AF_MAX]; /* Services */
atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */
@@ -1219,6 +1226,7 @@ struct netns_ipvs {
int sysctl_est_nice; /* kthread nice */
int est_stopped; /* stop tasks */
#endif
+ int sysctl_svc_lfactor;
/* ip_vs_lblc */
int sysctl_lblc_expiration;
@@ -1228,6 +1236,7 @@ struct netns_ipvs {
int sysctl_lblcr_expiration;
struct ctl_table_header *lblcr_ctl_header;
struct ctl_table *lblcr_ctl_table;
+ unsigned long work_flags; /* IP_VS_WORK_* flags */
/* ip_vs_est */
struct delayed_work est_reload_work;/* Reload kthread tasks */
struct mutex est_mutex; /* protect kthread tasks */
@@ -1259,9 +1268,7 @@ struct netns_ipvs {
unsigned int mixed_address_family_dests;
unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */
- /* the service mutex that protect svc_table and svc_fwm_table */
- struct mutex service_mutex;
- struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */
+ struct ip_vs_rht __rcu *svc_table; /* Services */
};
#define DEFAULT_SYNC_THRESHOLD 3
@@ -1511,6 +1518,18 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
#endif
+/* Get load factor to map num_services/u_thresh to t->size
+ * Smaller value decreases u_thresh to reduce collisions but increases
+ * the table size
+ * Returns factor where:
+ * - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load
+ * - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load
+ */
+static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs)
+{
+ return READ_ONCE(ipvs->sysctl_svc_lfactor);
+}
+
/* IPVS core functions
* (from ip_vs_core.c)
*/