From 6e7333d315a768170a59ac771297ee0551bdddbf Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 1 Feb 2016 18:51:05 -0500
Subject: net: add rx_nohandler stat counter

This adds an rx_nohandler stat counter, along with a sysfs statistics
node, and copies the counter out via netlink as well.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@mellanox.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Tom Herbert <tom@herbertland.com>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <gospo@cumulusnetworks.com>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index a30b78090594..d3e90b91e07e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -35,6 +35,8 @@ struct rtnl_link_stats {
 	/* for cslip etc */
 	__u32	rx_compressed;
 	__u32	tx_compressed;
+
+	__u32	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The main device statistics structure */
@@ -68,6 +70,8 @@ struct rtnl_link_stats64 {
 	/* for cslip etc */
 	__u64	rx_compressed;
 	__u64	tx_compressed;
+
+	__u64	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The struct should be in sync with struct ifmap */
-- 
cgit v1.2.3


From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 Feb 2016 22:39:53 -0800
Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map

Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do
accurate counters without need to use BPF_XADD instruction which turned
out to be too costly for high-performance network monitoring.
In the typical use case the 'key' is the flow tuple or other long
living object that sees a lot of events per second.

bpf_map_lookup_elem() returns per-cpu area.
Example:
struct {
  u32 packets;
  u32 bytes;
} * ptr = bpf_map_lookup_elem(&map, &key);
/* ptr points to this_cpu area of the value, so the following
 * increments will not collide with other cpus
 */
ptr->packets ++;
ptr->bytes += skb->len;

bpf_update_elem() atomically creates a new element where all per-cpu
values are zero initialized and this_cpu value is populated with
given 'value'.
Note that non-per-cpu hash map always allocates new element
and then deletes old after rcu grace period to maintain atomicity
of update. Per-cpu hash map updates element values in-place.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/hashtab.c     | 275 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 229 insertions(+), 47 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa6f8571de13..43ae40c8763e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	BPF_MAP_TYPE_PERCPU_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c5b30fd8a315..2be5f6e8bb04 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -31,21 +31,27 @@ struct bpf_htab {
 struct htab_elem {
 	struct hlist_node hash_node;
 	struct rcu_head rcu;
-	u32 hash;
+	union {
+		u32 hash;
+		u32 key_size;
+	};
 	char key[0] __aligned(8);
 };
 
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct bpf_htab *htab;
 	int err, i;
+	u64 cost;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
 
 	/* mandatory map attributes */
+	htab->map.map_type = attr->map_type;
 	htab->map.key_size = attr->key_size;
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
@@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
+	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
+		/* make sure the size for pcpu_alloc() is reasonable */
+		goto free_htab;
+
 	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8) +
-			  htab->map.value_size;
+			  round_up(htab->map.key_size, 8);
+	if (percpu)
+		htab->elem_size += sizeof(void *);
+	else
+		htab->elem_size += htab->map.value_size;
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
 		goto free_htab;
 
-	if ((u64) htab->n_buckets * sizeof(struct bucket) +
-	    (u64) htab->elem_size * htab->map.max_entries >=
-	    U32_MAX - PAGE_SIZE)
+	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+
+	if (percpu)
+		cost += (u64) round_up(htab->map.value_size, 8) *
+			num_possible_cpus() * htab->map.max_entries;
+
+	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
-				   htab->elem_size * htab->map.max_entries,
-				   PAGE_SIZE) >> PAGE_SHIFT;
+	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	err = -ENOMEM;
 	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
@@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
 }
 
 /* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
@@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
+	return l;
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
 	if (l)
 		return l->key + round_up(map->key_size, 8);
 
@@ -230,65 +253,139 @@ find_first_elem:
 	return -ENOENT;
 }
 
+
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+				     void __percpu *pptr)
+{
+	*(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+	return *(void __percpu **)(l->key + key_size);
+}
+
+static void htab_percpu_elem_free(struct htab_elem *l)
+{
+	free_percpu(htab_elem_get_ptr(l, l->key_size));
+	kfree(l);
+}
+
+static void htab_percpu_elem_free_rcu(struct rcu_head *head)
+{
+	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+
+	htab_percpu_elem_free(l);
+}
+
+static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
+{
+	if (percpu) {
+		l->key_size = key_size;
+		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
+	} else {
+		kfree_rcu(l, rcu);
+	}
+}
+
+static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
+					 void *value, u32 key_size, u32 hash,
+					 bool percpu)
+{
+	u32 size = htab->map.value_size;
+	struct htab_elem *l_new;
+	void __percpu *pptr;
+
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!l_new)
+		return NULL;
+
+	memcpy(l_new->key, key, key_size);
+	if (percpu) {
+		/* round up value_size to 8 bytes */
+		size = round_up(size, 8);
+
+		/* alloc_percpu zero-fills */
+		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
+		if (!pptr) {
+			kfree(l_new);
+			return NULL;
+		}
+
+		/* copy true value_size bytes */
+		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+		htab_elem_set_ptr(l_new, key_size, pptr);
+	} else {
+		memcpy(l_new->key + round_up(key_size, 8), value, size);
+	}
+
+	l_new->hash = hash;
+	return l_new;
+}
+
+static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
+		       u64 map_flags)
+{
+	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		return -E2BIG;
+
+	if (l_old && map_flags == BPF_NOEXIST)
+		/* elem already exists */
+		return -EEXIST;
+
+	if (!l_old && map_flags == BPF_EXIST)
+		/* elem doesn't exist, cannot update it */
+		return -ENOENT;
+
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 				u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct htab_elem *l_new, *l_old;
+	struct htab_elem *l_new = NULL, *l_old;
 	struct hlist_head *head;
-	struct bucket *b;
 	unsigned long flags;
-	u32 key_size;
+	struct bucket *b;
+	u32 key_size, hash;
 	int ret;
 
-	if (map_flags > BPF_EXIST)
+	if (unlikely(map_flags > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	/* allocate new element outside of lock */
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-	if (!l_new)
-		return -ENOMEM;
-
 	key_size = map->key_size;
 
-	memcpy(l_new->key, key, key_size);
-	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+	hash = htab_map_hash(key, key_size);
+
+	/* allocate new element outside of the lock, since
+	 * we're most likley going to insert it
+	 */
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
+	if (!l_new)
+		return -ENOMEM;
 
-	l_new->hash = htab_map_hash(l_new->key, key_size);
-	b = __select_bucket(htab, l_new->hash);
+	b = __select_bucket(htab, hash);
 	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+	l_old = lookup_elem_raw(head, hash, key, key_size);
 
-	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		ret = -E2BIG;
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
 		goto err;
-	}
 
-	if (l_old && map_flags == BPF_NOEXIST) {
-		/* elem already exists */
-		ret = -EEXIST;
-		goto err;
-	}
-
-	if (!l_old && map_flags == BPF_EXIST) {
-		/* elem doesn't exist, cannot update it */
-		ret = -ENOENT;
-		goto err;
-	}
-
-	/* add new element to the head of the list, so that concurrent
-	 * search will find it before old elem
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
 	 */
 	hlist_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
@@ -298,7 +395,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		atomic_inc(&htab->count);
 	}
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-
 	return 0;
 err:
 	raw_spin_unlock_irqrestore(&b->lock, flags);
@@ -306,10 +402,64 @@ err:
 	return ret;
 }
 
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new = NULL, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	struct bucket *b;
+	u32 key_size, hash;
+	int ret;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	if (l_old) {
+		/* per-cpu hash map can update value in-place */
+		memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
+		       value, htab->map.value_size);
+	} else {
+		l_new = alloc_htab_elem(htab, key, value, key_size,
+					hash, true);
+		if (!l_new) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		hlist_add_head_rcu(&l_new->hash_node, head);
+		atomic_inc(&htab->count);
+	}
+	ret = 0;
+err:
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	return ret;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct hlist_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
@@ -332,7 +482,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
 		atomic_dec(&htab->count);
-		kfree_rcu(l, rcu);
+		free_htab_elem(l, percpu, key_size);
 		ret = 0;
 	}
 
@@ -352,7 +502,12 @@ static void delete_all_elements(struct bpf_htab *htab)
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
 			atomic_dec(&htab->count);
-			kfree(l);
+			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+				l->key_size = htab->map.key_size;
+				htab_percpu_elem_free(l);
+			} else {
+				kfree(l);
+			}
 		}
 	}
 }
@@ -391,9 +546,35 @@ static struct bpf_map_type_list htab_type __read_mostly = {
 	.type = BPF_MAP_TYPE_HASH,
 };
 
+/* Called from eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+	if (l)
+		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+	else
+		return NULL;
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_update_elem = htab_percpu_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+	.ops = &htab_percpu_ops,
+	.type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
 static int __init register_htab_map(void)
 {
 	bpf_register_map_type(&htab_type);
+	bpf_register_map_type(&htab_percpu_type);
 	return 0;
 }
 late_initcall(register_htab_map);
-- 
cgit v1.2.3


From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 Feb 2016 22:39:54 -0800
Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map

Primary use case is a histogram array of latency
where bpf program computes the latency of block requests or other
events and stores histogram of latency into array of 64 elements.
All cpus are constantly running, so normal increment is not accurate,
bpf_xadd causes cache ping-pong and this per-cpu approach allows
fastest collision-free counters.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   1 +
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/arraymap.c    | 102 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 93 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 83d1926c61e4..141fb0d45731 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -151,6 +151,7 @@ struct bpf_array {
 	union {
 		char value[0] __aligned(8);
 		void *ptrs[0] __aligned(8);
+		void __percpu *pptrs[0] __aligned(8);
 	};
 };
 #define MAX_TAIL_CALL_CNT 32
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ae40c8763e..2ee0fde1bf96 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -82,6 +82,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 	BPF_MAP_TYPE_PERCPU_HASH,
+	BPF_MAP_TYPE_PERCPU_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 89ebbc4d1164..b9bf1d7949ca 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -17,11 +17,39 @@
 #include <linux/filter.h>
 #include <linux/perf_event.h>
 
+static void bpf_array_free_percpu(struct bpf_array *array)
+{
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		free_percpu(array->pptrs[i]);
+}
+
+static int bpf_array_alloc_percpu(struct bpf_array *array)
+{
+	void __percpu *ptr;
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++) {
+		ptr = __alloc_percpu_gfp(array->elem_size, 8,
+					 GFP_USER | __GFP_NOWARN);
+		if (!ptr) {
+			bpf_array_free_percpu(array);
+			return -ENOMEM;
+		}
+		array->pptrs[i] = ptr;
+	}
+
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	struct bpf_array *array;
-	u32 elem_size, array_size;
+	u64 array_size;
+	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	elem_size = round_up(attr->value_size, 8);
 
-	/* check round_up into zero and u32 overflow */
-	if (elem_size == 0 ||
-	    attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+	array_size = sizeof(*array);
+	if (percpu)
+		array_size += (u64) attr->max_entries * sizeof(void *);
+	else
+		array_size += (u64) attr->max_entries * elem_size;
+
+	/* make sure there is no u32 overflow later in round_up() */
+	if (array_size >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-ENOMEM);
 
-	array_size = sizeof(*array) + attr->max_entries * elem_size;
 
 	/* allocate all map elements and zero-initialize them */
 	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	}
 
 	/* copy mandatory map attributes */
+	array->map.map_type = attr->map_type;
 	array->map.key_size = attr->key_size;
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
-	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
 	array->elem_size = elem_size;
 
+	if (!percpu)
+		goto out;
+
+	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
+
+	if (array_size >= U32_MAX - PAGE_SIZE ||
+	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+		kvfree(array);
+		return ERR_PTR(-ENOMEM);
+	}
+out:
+	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
+
 	return &array->map;
 }
 
@@ -67,12 +112,24 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
 
-	if (index >= array->map.max_entries)
+	if (unlikely(index >= array->map.max_entries))
 		return NULL;
 
 	return array->value + array->elem_size * index;
 }
 
+/* Called from eBPF program */
+static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (unlikely(index >= array->map.max_entries))
+		return NULL;
+
+	return this_cpu_ptr(array->pptrs[index]);
+}
+
 /* Called from syscall */
 static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
@@ -99,19 +156,24 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
 
-	if (map_flags > BPF_EXIST)
+	if (unlikely(map_flags > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
-	if (index >= array->map.max_entries)
+	if (unlikely(index >= array->map.max_entries))
 		/* all elements were pre-allocated, cannot insert a new one */
 		return -E2BIG;
 
-	if (map_flags == BPF_NOEXIST)
+	if (unlikely(map_flags == BPF_NOEXIST))
 		/* all elements already exist */
 		return -EEXIST;
 
-	memcpy(array->value + array->elem_size * index, value, map->value_size);
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		memcpy(this_cpu_ptr(array->pptrs[index]),
+		       value, map->value_size);
+	else
+		memcpy(array->value + array->elem_size * index,
+		       value, map->value_size);
 	return 0;
 }
 
@@ -133,6 +195,9 @@ static void array_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();
 
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		bpf_array_free_percpu(array);
+
 	kvfree(array);
 }
 
@@ -150,9 +215,24 @@ static struct bpf_map_type_list array_type __read_mostly = {
 	.type = BPF_MAP_TYPE_ARRAY,
 };
 
+static const struct bpf_map_ops percpu_array_ops = {
+	.map_alloc = array_map_alloc,
+	.map_free = array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = percpu_array_map_lookup_elem,
+	.map_update_elem = array_map_update_elem,
+	.map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list percpu_array_type __read_mostly = {
+	.ops = &percpu_array_ops,
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+};
+
 static int __init register_array_map(void)
 {
 	bpf_register_map_type(&array_type);
+	bpf_register_map_type(&percpu_array_type);
 	return 0;
 }
 late_initcall(register_array_map);
-- 
cgit v1.2.3


From 67eb03318bc5fe170ae832423fda7a23b0d801cf Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 2 Feb 2016 07:43:45 -0800
Subject: net: Add support for fill_slave_info to VRF device

Allows userspace to have direct access to VRF table association
versus looking up master device and its table.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c            | 21 +++++++++++++++++++++
 include/uapi/linux/if_link.h |  8 ++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 66addb7a7911..76e1fc9d8748 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -877,6 +877,24 @@ static int vrf_fillinfo(struct sk_buff *skb,
 	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
 }
 
+static size_t vrf_get_slave_size(const struct net_device *bond_dev,
+				 const struct net_device *slave_dev)
+{
+	return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
+}
+
+static int vrf_fill_slave_info(struct sk_buff *skb,
+			       const struct net_device *vrf_dev,
+			       const struct net_device *slave_dev)
+{
+	struct net_vrf *vrf = netdev_priv(vrf_dev);
+
+	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
 	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
 };
@@ -890,6 +908,9 @@ static struct rtnl_link_ops vrf_link_ops __read_mostly = {
 	.validate	= vrf_validate,
 	.fill_info	= vrf_fillinfo,
 
+	.get_slave_size  = vrf_get_slave_size,
+	.fill_slave_info = vrf_fill_slave_info,
+
 	.newlink	= vrf_newlink,
 	.dellink	= vrf_dellink,
 	.setup		= vrf_setup,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d3e90b91e07e..d452cea59020 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -405,6 +405,14 @@ enum {
 
 #define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
 
+enum {
+	IFLA_VRF_PORT_UNSPEC,
+	IFLA_VRF_PORT_TABLE,
+	__IFLA_VRF_PORT_MAX
+};
+
+#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1)
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
-- 
cgit v1.2.3


From 103a8ad1fa3b261c78dfc842cb315defe9d40be0 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 3 Feb 2016 04:04:36 +0100
Subject: ethtool: add speed/duplex validation functions

Add functions which check if the speed/duplex are defined.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 57fa39005e79..b2e180181629 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1319,11 +1319,45 @@ enum ethtool_sfeatures_retval_bits {
 
 #define SPEED_UNKNOWN		-1
 
+static inline int ethtool_validate_speed(__u32 speed)
+{
+	switch (speed) {
+	case SPEED_10:
+	case SPEED_100:
+	case SPEED_1000:
+	case SPEED_2500:
+	case SPEED_5000:
+	case SPEED_10000:
+	case SPEED_20000:
+	case SPEED_25000:
+	case SPEED_40000:
+	case SPEED_50000:
+	case SPEED_56000:
+	case SPEED_100000:
+	case SPEED_UNKNOWN:
+		return 1;
+	}
+
+	return 0;
+}
+
 /* Duplex, half or full. */
 #define DUPLEX_HALF		0x00
 #define DUPLEX_FULL		0x01
 #define DUPLEX_UNKNOWN		0xff
 
+static inline int ethtool_validate_duplex(__u8 duplex)
+{
+	switch (duplex) {
+	case DUPLEX_HALF:
+	case DUPLEX_FULL:
+	case DUPLEX_UNKNOWN:
+		return 1;
+	}
+
+	return 0;
+}
+
 /* Which connector port. */
 #define PORT_TP			0x00
 #define PORT_AUI		0x01
-- 
cgit v1.2.3


From 157ede6784ba2837c7dc43f195418c75927f8488 Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Wed, 3 Feb 2016 09:57:04 +0100
Subject: bridge: mdb: add support for offloaded mdb entries

Add new bitmask member 'flags' to br_mdb_entry structure. Adding
MDB_FLAGS_OFFLOAD bit which indicates MDB entries is offloaded to hardware.

Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 18db14477bdd..ec3547234998 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -183,6 +183,8 @@ struct br_mdb_entry {
 #define MDB_TEMPORARY 0
 #define MDB_PERMANENT 1
 	__u8 state;
+#define MDB_FLAGS_OFFLOAD	(1 << 0)
+	__u8 flags;
 	__u16 vid;
 	struct {
 		union {
-- 
cgit v1.2.3


From 12b74dfadb5a7a23baf4db941dc9fd9d371f249a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:17 +0100
Subject: ipv4: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv4 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Additionally, enabling this option provides compliance with a SHOULD
clause of RFC 1122.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 include/uapi/linux/ip.h                |  1 +
 net/ipv4/devinet.c                     |  2 ++
 net/ipv4/ip_input.c                    | 25 ++++++++++++++++++++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 73b36d7c7b0d..d5910d63214d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1216,6 +1216,13 @@ promote_secondaries - BOOLEAN
 	promote a corresponding secondary IP address instead of
 	removing all the corresponding secondary IP addresses.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IP packets that are received in link-layer
+	multicast (or broadcast) frames.
+	This behavior (for multicast) is actually a SHOULD in RFC
+	1122, but is disabled by default for compatibility reasons.
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 08f894d2ddbd..584834f7e95c 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -165,6 +165,7 @@ enum
 	IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cebd9d31e65a..dbbab28a52a4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2192,6 +2192,8 @@ static struct devinet_sysctl_table {
 					      "promote_secondaries"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
 					      "route_localnet"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+					      "drop_unicast_in_l2_multicast"),
 	},
 };
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d77eb0c3b684..852002f64c68 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -362,8 +362,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	rt = skb_rtable(skb);
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
-	} else if (rt->rt_type == RTN_BROADCAST)
+	} else if (rt->rt_type == RTN_BROADCAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+	} else if (skb->pkt_type == PACKET_BROADCAST ||
+		   skb->pkt_type == PACKET_MULTICAST) {
+		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+		/* RFC 1122 3.3.6:
+		 *
+		 *   When a host sends a datagram to a link-layer broadcast
+		 *   address, the IP destination address MUST be a legal IP
+		 *   broadcast or IP multicast address.
+		 *
+		 *   A host SHOULD silently discard a datagram that is received
+		 *   via a link-layer broadcast (see Section 2.4) but does not
+		 *   specify an IP multicast or broadcast destination address.
+		 *
+		 * This doesn't explicitly say L2 *broadcast*, but broadcast is
+		 * in a way a form of multicast and the most common use case for
+		 * this is 802.11 protecting against cross-station spoofing (the
+		 * so-called "hole-196" attack) so do it for both.
+		 */
+		if (in_dev &&
+		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+			goto drop;
+	}
 
 	return dst_input(skb);
 
-- 
cgit v1.2.3


From 97daf331455077645ae1f13438bebd3d1a2e94ee Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:18 +0100
Subject: ipv4: add option to drop gratuitous ARP packets

In certain 802.11 wireless deployments, there will be ARP proxies
that use knowledge of the network to correctly answer requests.
To prevent gratuitous ARP frames on the shared medium from being
a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_gratuitous_arp".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 6 ++++++
 include/uapi/linux/ip.h                | 1 +
 net/ipv4/arp.c                         | 8 ++++++++
 net/ipv4/devinet.c                     | 2 ++
 4 files changed, 17 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d5910d63214d..a53bbfaff1c7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1223,6 +1223,12 @@ drop_unicast_in_l2_multicast - BOOLEAN
 	1122, but is disabled by default for compatibility reasons.
 	Default: off (0)
 
+drop_gratuitous_arp - BOOLEAN
+	Drop all gratuitous ARP frames, for example if there's a known
+	good ARP proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 584834f7e95c..f291569768dd 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -166,6 +166,7 @@ enum
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59b3e0e8fd51..c102eb5ac55c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -735,6 +735,14 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
 		goto out;
 
+ /*
+  *	For some 802.11 wireless deployments (and possibly other networks),
+  *	there will be an ARP proxy and gratuitous ARP frames are attacks
+  *	and thus should not be accepted.
+  */
+	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+		goto out;
+
 /*
  *     Special case: We must set Frame Relay source Q.922 address
  */
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index dbbab28a52a4..3d835313575e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2185,6 +2185,8 @@ static struct devinet_sysctl_table {
 					"igmpv3_unsolicited_report_interval"),
 		DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
 					"ignore_routes_with_linkdown"),
+		DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+					"drop_gratuitous_arp"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-- 
cgit v1.2.3


From abbc30436d39dfed8ebfca338d253f211ac7b094 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:19 +0100
Subject: ipv6: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv6 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    |  8 ++++++++
 net/ipv6/ip6_input.c                   | 10 ++++++++++
 5 files changed, 26 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a53bbfaff1c7..e0e7350a4e6a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1674,6 +1674,12 @@ stable_secret - IPv6 address
 
 	By default the stable secret is unset.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IPv6 packets that are received in link-layer
+	multicast (or broadcast) frames.
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 402753bccafa..4a4c1ae826cb 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -50,6 +50,7 @@ struct ipv6_devconf {
 	__s32		mc_forwarding;
 #endif
 	__s32		disable_ipv6;
+	__s32		drop_unicast_in_l2_multicast;
 	__s32		accept_dad;
 	__s32		force_tllao;
 	__s32           ndisc_notify;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 38b4fef20219..4c413570efe8 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -174,6 +174,7 @@ enum {
 	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 38eeddedfc21..23e325f39f8e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4711,6 +4711,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
+	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5784,6 +5785,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= addrconf_sysctl_ignore_routes_with_linkdown,
 		},
+		{
+			.procname	= "drop_unicast_in_l2_multicast",
+			.data		= &ipv6_devconf.drop_unicast_in_l2_multicast,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9075acf081dd..31ac3c56da4b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	    IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
 		goto err;
 
+	/* If enabled, drop unicast packets that were encapsulated in link-layer
+	 * multicast or broadcast to protected against the so-called "hole-196"
+	 * attack in 802.11 wireless.
+	 */
+	if (!ipv6_addr_is_multicast(&hdr->daddr) &&
+	    (skb->pkt_type == PACKET_BROADCAST ||
+	     skb->pkt_type == PACKET_MULTICAST) &&
+	    idev->cnf.drop_unicast_in_l2_multicast)
+		goto err;
+
 	/* RFC4291 2.7
 	 * Nodes must not originate a packet to a multicast address whose scope
 	 * field contains the reserved value 0; if such a packet is received, it
-- 
cgit v1.2.3


From 7a02bf892d8f1e5298af1676f001bee410509d80 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:20 +0100
Subject: ipv6: add option to drop unsolicited neighbor advertisements

In certain 802.11 wireless deployments, there will be NA proxies
that use knowledge of the network to correctly answer requests.
To prevent unsolicitd advertisements on the shared medium from
being a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_unsolicited_na".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/linux/ipv6.h                   | 1 +
 include/uapi/linux/ipv6.h              | 1 +
 net/ipv6/addrconf.c                    | 8 ++++++++
 net/ipv6/ndisc.c                       | 9 +++++++++
 5 files changed, 26 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e0e7350a4e6a..24ce97f42d35 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1680,6 +1680,13 @@ drop_unicast_in_l2_multicast - BOOLEAN
 
 	By default this is turned off.
 
+drop_unsolicited_na - BOOLEAN
+	Drop all unsolicited neighbor advertisements, for example if there's
+	a known good NA proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4a4c1ae826cb..4b2267e1b7c3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -56,6 +56,7 @@ struct ipv6_devconf {
 	__s32           ndisc_notify;
 	__s32		suppress_frag_ndisc;
 	__s32		accept_ra_mtu;
+	__s32		drop_unsolicited_na;
 	struct ipv6_stable_secret {
 		bool initialized;
 		struct in6_addr secret;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 4c413570efe8..ec117b65d5a5 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -175,6 +175,7 @@ enum {
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 23e325f39f8e..ac0ba9e4e06b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4712,6 +4712,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
+	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5792,6 +5793,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "drop_unsolicited_na",
+			.data		= &ipv6_devconf.drop_unsolicited_na,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 84afb9a77278..c245895a3d41 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -883,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 				    offsetof(struct nd_msg, opt));
 	struct ndisc_options ndopts;
 	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = __in6_dev_get(dev);
 	struct inet6_ifaddr *ifp;
 	struct neighbour *neigh;
 
@@ -902,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb)
 		return;
 	}
 
+	/* For some 802.11 wireless deployments (and possibly other networks),
+	 * there will be a NA proxy and unsolicitd packets are attacks
+	 * and thus should not be accepted.
+	 */
+	if (!msg->icmph.icmp6_solicited && idev &&
+	    idev->cnf.drop_unsolicited_na)
+		return;
+
 	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND option\n");
 		return;
-- 
cgit v1.2.3


From 72bb68721f80a1441e871b6afc9ab0b3793d5031 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 5 Feb 2016 11:16:21 +0000
Subject: ethtool: add IPv6 to the NFC API

Signed-off-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 70 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index b2e180181629..5e0940dcbfe8 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -748,6 +748,56 @@ struct ethtool_usrip4_spec {
 	__u8    proto;
 };
 
+/**
+ * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc.
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @psrc: Source port
+ * @pdst: Destination port
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow.
+ */
+struct ethtool_tcpip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be16	psrc;
+	__be16	pdst;
+	__u8    tclass;
+};
+
+/**
+ * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @spi: Security parameters index
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify an IPsec transport or tunnel over IPv6.
+ */
+struct ethtool_ah_espip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be32	spi;
+	__u8    tclass;
+};
+
+/**
+ * struct ethtool_usrip6_spec - general flow specification for IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @l4_4_bytes: First 4 bytes of transport (layer 4) header
+ * @tclass: Traffic Class
+ * @l4_proto: Transport protocol number (nexthdr after any Extension Headers)
+ */
+struct ethtool_usrip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be32	l4_4_bytes;
+	__u8    tclass;
+	__u8    l4_proto;
+};
+
 union ethtool_flow_union {
 	struct ethtool_tcpip4_spec		tcp_ip4_spec;
 	struct ethtool_tcpip4_spec		udp_ip4_spec;
@@ -755,6 +805,12 @@ union ethtool_flow_union {
 	struct ethtool_ah_espip4_spec		ah_ip4_spec;
 	struct ethtool_ah_espip4_spec		esp_ip4_spec;
 	struct ethtool_usrip4_spec		usr_ip4_spec;
+	struct ethtool_tcpip6_spec		tcp_ip6_spec;
+	struct ethtool_tcpip6_spec		udp_ip6_spec;
+	struct ethtool_tcpip6_spec		sctp_ip6_spec;
+	struct ethtool_ah_espip6_spec		ah_ip6_spec;
+	struct ethtool_ah_espip6_spec		esp_ip6_spec;
+	struct ethtool_usrip6_spec		usr_ip6_spec;
 	struct ethhdr				ether_spec;
 	__u8					hdata[52];
 };
@@ -1401,15 +1457,17 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	UDP_V4_FLOW	0x02	/* hash or spec (udp_ip4_spec) */
 #define	SCTP_V4_FLOW	0x03	/* hash or spec (sctp_ip4_spec) */
 #define	AH_ESP_V4_FLOW	0x04	/* hash only */
-#define	TCP_V6_FLOW	0x05	/* hash only */
-#define	UDP_V6_FLOW	0x06	/* hash only */
-#define	SCTP_V6_FLOW	0x07	/* hash only */
+#define	TCP_V6_FLOW	0x05	/* hash or spec (tcp_ip6_spec; nfc only) */
+#define	UDP_V6_FLOW	0x06	/* hash or spec (udp_ip6_spec; nfc only) */
+#define	SCTP_V6_FLOW	0x07	/* hash or spec (sctp_ip6_spec; nfc only) */
 #define	AH_ESP_V6_FLOW	0x08	/* hash only */
 #define	AH_V4_FLOW	0x09	/* hash or spec (ah_ip4_spec) */
 #define	ESP_V4_FLOW	0x0a	/* hash or spec (esp_ip4_spec) */
-#define	AH_V6_FLOW	0x0b	/* hash only */
-#define	ESP_V6_FLOW	0x0c	/* hash only */
-#define	IP_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	AH_V6_FLOW	0x0b	/* hash or spec (ah_ip6_spec; nfc only) */
+#define	ESP_V6_FLOW	0x0c	/* hash or spec (esp_ip6_spec; nfc only) */
+#define	IPV4_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	IP_USER_FLOW	IPV4_USER_FLOW
+#define	IPV6_USER_FLOW	0x0e	/* spec only (usr_ip6_spec; nfc only) */
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
-- 
cgit v1.2.3


From 4456ed04ea44b800d691b18c14a68ec9894d2aca Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 7 Feb 2016 23:27:55 +0200
Subject: ethtool: future-proof interface for speed extensions

Many virtual and not quite virtual devices allow any speed to be set
through ethtool. In particular, this applies to the virtio-net devices.
Document this fact to make sure people don't assume the enum lists all
possible values.  Reserve values greater than INT_MAX for future
extension and to avoid conflict with SPEED_UNKNOWN.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 5e0940dcbfe8..4345f80a2e33 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -31,7 +31,7 @@
  *	physical connectors and other link features that are
  *	advertised through autonegotiation or enabled for
  *	auto-detection.
- * @speed: Low bits of the speed
+ * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
  * @duplex: Duplex mode; one of %DUPLEX_*
  * @port: Physical connector type; one of %PORT_*
  * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
@@ -47,7 +47,7 @@
  *	obsoleted by &struct ethtool_coalesce.  Read-only; deprecated.
  * @maxrxpkt: Historically used to report RX IRQ coalescing; now
  *	obsoleted by &struct ethtool_coalesce.  Read-only; deprecated.
- * @speed_hi: High bits of the speed
+ * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
  * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
  *	%ETH_TP_MDI_*.  If the status is unknown or not applicable, the
  *	value will be %ETH_TP_MDI_INVALID.  Read-only.
@@ -1359,7 +1359,7 @@ enum ethtool_sfeatures_retval_bits {
  * it was forced up into this mode or autonegotiated.
  */
 
-/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|5|10|20|25|40|50|56|100]GbE. */
+/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */
 #define SPEED_10		10
 #define SPEED_100		100
 #define SPEED_1000		1000
-- 
cgit v1.2.3


From 4a92602aa1cd5bbaeedbd9536ff992f7d26fe9d1 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen@canonical.com>
Date: Fri, 5 Feb 2016 09:20:52 -0700
Subject: openvswitch: allow management from inside user namespaces

Operations with the GENL_ADMIN_PERM flag fail permissions checks because
this flag means we call netlink_capable, which uses the init user ns.

Instead, let's introduce a new flag, GENL_UNS_ADMIN_PERM for operations
which should be allowed inside a user namespace.

The motivation for this is to be able to run openvswitch in unprivileged
containers. I've tested this and it seems to work, but I really have no
idea about the security consequences of this patch, so thoughts would be
much appreciated.

v2: use the GENL_UNS_ADMIN_PERM flag instead of a check in each function
v3: use separate ifs for UNS_ADMIN_PERM and ADMIN_PERM, instead of one
    massive one

Reported-by: James Page <james.page@canonical.com>
Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com>
CC: Eric Biederman <ebiederm@xmission.com>
CC: Pravin Shelar <pshelar@ovn.org>
CC: Justin Pettit <jpettit@nicira.com>
CC: "David S. Miller" <davem@davemloft.net>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/genetlink.h |  1 +
 net/netlink/genetlink.c        |  4 ++++
 net/openvswitch/datapath.c     | 20 ++++++++++----------
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index c3363ba1ae05..5512c90af7e3 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -21,6 +21,7 @@ struct genlmsghdr {
 #define GENL_CMD_CAP_DO		0x02
 #define GENL_CMD_CAP_DUMP	0x04
 #define GENL_CMD_CAP_HASPOL	0x08
+#define GENL_UNS_ADMIN_PERM	0x10
 
 /*
  * List of reserved static generic netlink identifiers:
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f830326b3b1d..0ffd721126e7 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -580,6 +580,10 @@ static int genl_family_rcv_msg(struct genl_family *family,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
+	if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
 	if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
 		int rc;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index deadfdab1bc3..d6f7fe92744a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -654,7 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_packet_genl_ops[] = {
 	{ .cmd = OVS_PACKET_CMD_EXECUTE,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = packet_policy,
 	  .doit = ovs_packet_cmd_execute
 	}
@@ -1391,12 +1391,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_flow_genl_ops[] = {
 	{ .cmd = OVS_FLOW_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_new
 	},
 	{ .cmd = OVS_FLOW_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_del
 	},
@@ -1407,7 +1407,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
 	  .dumpit = ovs_flow_cmd_dump
 	},
 	{ .cmd = OVS_FLOW_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_set,
 	},
@@ -1777,12 +1777,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_datapath_genl_ops[] = {
 	{ .cmd = OVS_DP_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_new
 	},
 	{ .cmd = OVS_DP_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_del
 	},
@@ -1793,7 +1793,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
 	  .dumpit = ovs_dp_cmd_dump
 	},
 	{ .cmd = OVS_DP_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_set,
 	},
@@ -2158,12 +2158,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_vport_genl_ops[] = {
 	{ .cmd = OVS_VPORT_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_new
 	},
 	{ .cmd = OVS_VPORT_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_del
 	},
@@ -2174,7 +2174,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
 	  .dumpit = ovs_vport_cmd_dump
 	},
 	{ .cmd = OVS_VPORT_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_set,
 	},
-- 
cgit v1.2.3


From e02564ee334a7ae46b71fc18576391cb9455433e Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Sun, 7 Feb 2016 21:52:23 +0100
Subject: ethtool: make validate_speed accept all speeds between 0 and INT_MAX

Devices these days can have any speed and as was recently pointed out
any speed from 0 to INT_MAX is valid so adjust speed validation to
accept such values.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 4345f80a2e33..190aea0faaf4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1377,24 +1377,7 @@ enum ethtool_sfeatures_retval_bits {
 
 static inline int ethtool_validate_speed(__u32 speed)
 {
-	switch (speed) {
-	case SPEED_10:
-	case SPEED_100:
-	case SPEED_1000:
-	case SPEED_2500:
-	case SPEED_5000:
-	case SPEED_10000:
-	case SPEED_20000:
-	case SPEED_25000:
-	case SPEED_40000:
-	case SPEED_50000:
-	case SPEED_56000:
-	case SPEED_100000:
-	case SPEED_UNKNOWN:
-		return 1;
-	}
-
-	return 0;
+	return speed <= INT_MAX || speed == SPEED_UNKNOWN;
 }
 
 /* Duplex, half or full. */
-- 
cgit v1.2.3


From cd9b266095f422267bddbec88f9098b48ea548fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 11 Feb 2016 22:02:53 -0800
Subject: tcp: add tcpi_min_rtt and tcpi_notsent_bytes to tcp_info

tcpi_min_rtt reports the minimal rtt observed by TCP stack for the flow,
in usec unit. Might be ~0U if not yet known.

tcpi_notsent_bytes reports the amount of bytes in the write queue that
were not yet sent.

This is done in a single patch to not add a temporary 32bit padding hole
in tcp_info.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 3 +++
 net/ipv4/tcp.c           | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 65a77b071e22..fe95446e9abf 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -196,6 +196,9 @@ struct tcp_info {
 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
 	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
 	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
+
+	__u32	tcpi_notsent_bytes;
+	__u32	tcpi_min_rtt;
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 014f18e2f7b3..f93150d15199 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2642,6 +2642,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
 	unsigned int start;
+	int notsent_bytes;
 	u64 rate64;
 	u32 rate;
 
@@ -2722,6 +2723,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
+
+	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+	info->tcpi_notsent_bytes = max(0, notsent_bytes);
+
+	info->tcpi_min_rtt = tcp_min_rtt(tp);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
-- 
cgit v1.2.3


From d1b4c689d4130bcfd3532680b64db562300716b6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:24 +0100
Subject: netlink: remove mmapped netlink support

mmapped netlink has a number of unresolved issues:

- TX zerocopy support had to be disabled more than a year ago via
  commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.")
  because the content of the mmapped area can change after netlink
  attribute validation but before message processing.

- RX support was implemented mainly to speed up nfqueue dumping packet
  payload to userspace.  However, since commit ae08ce0021087a5d812d2
  ("netfilter: nfnetlink_queue: zero copy support") we avoid one copy
  with the socket-based interface too (via the skb_zerocopy helper).

The other problem is that skbs attached to mmaped netlink socket
behave different from normal skbs:

- they don't have a shinfo area, so all functions that use skb_shinfo()
(e.g. skb_clone) cannot be used.

- reserving headroom prevents userspace from seeing the content as
it expects message to start at skb->head.
See for instance
commit aa3a022094fa ("netlink: not trim skb for mmaped socket when dump").

- skbs handed e.g. to netlink_ack must have non-NULL skb->sk, else we
crash because it needs the sk to check if a tx ring is attached.

Also not obvious, leads to non-intuitive bug fixes such as 7c7bdf359
("netfilter: nfnetlink: use original skbuff when acking batches").

mmaped netlink also didn't play nicely with the skb_zerocopy helper
used by nfqueue and openvswitch.  Daniel Borkmann fixed this via
commit 6bb0fef489f6 ("netlink, mmap: fix edge-case leakages in nf queue
zero-copy")' but at the cost of also needing to provide remaining
length to the allocation function.

nfqueue also has problems when used with mmaped rx netlink:
- mmaped netlink doesn't allow use of nfqueue batch verdict messages.
  Problem is that in the mmap case, the allocation time also determines
  the ordering in which the frame will be seen by userspace (A
  allocating before B means that A is located in earlier ring slot,
  but this also means that B might get a lower sequence number then A
  since seqno is decided later.  To fix this we would need to extend the
  spinlocked region to also cover the allocation and message setup which
  isn't desirable.
- nfqueue can now be configured to queue large (GSO) skbs to userspace.
  Queing GSO packets is faster than having to force a software segmentation
  in the kernel, so this is a desirable option.  However, with a mmap based
  ring one has to use 64kb per ring slot element, else mmap has to fall back
  to the socket path (NL_MMAP_STATUS_COPY) for all large packets.

To use the mmap interface, userspace not only has to probe for mmap netlink
support, it also has to implement a recv/socket receive path in order to
handle messages that exceed the size of an rx ring element.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ken-ichirou MATSUZAWA <chamaken@gmail.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netlink_mmap.txt | 332 -------------
 include/uapi/linux/netlink.h              |   4 +
 include/uapi/linux/netlink_diag.h         |   2 +
 net/netlink/Kconfig                       |   9 -
 net/netlink/af_netlink.c                  | 754 +-----------------------------
 net/netlink/af_netlink.h                  |  15 -
 net/netlink/diag.c                        |  39 --
 7 files changed, 15 insertions(+), 1140 deletions(-)
 delete mode 100644 Documentation/networking/netlink_mmap.txt

(limited to 'include/uapi')

diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
deleted file mode 100644
index 54f10478e8e3..000000000000
--- a/Documentation/networking/netlink_mmap.txt
+++ /dev/null
@@ -1,332 +0,0 @@
-This file documents how to use memory mapped I/O with netlink.
-
-Author: Patrick McHardy <kaber@trash.net>
-
-Overview
---------
-
-Memory mapped netlink I/O can be used to increase throughput and decrease
-overhead of unicast receive and transmit operations. Some netlink subsystems
-require high throughput, these are mainly the netfilter subsystems
-nfnetlink_queue and nfnetlink_log, but it can also help speed up large
-dump operations of f.i. the routing database.
-
-Memory mapped netlink I/O used two circular ring buffers for RX and TX which
-are mapped into the processes address space.
-
-The RX ring is used by the kernel to directly construct netlink messages into
-user-space memory without copying them as done with regular socket I/O,
-additionally as long as the ring contains messages no recvmsg() or poll()
-syscalls have to be issued by user-space to get more message.
-
-The TX ring is used to process messages directly from user-space memory, the
-kernel processes all messages contained in the ring using a single sendmsg()
-call.
-
-Usage overview
---------------
-
-In order to use memory mapped netlink I/O, user-space needs three main changes:
-
-- ring setup
-- conversion of the RX path to get messages from the ring instead of recvmsg()
-- conversion of the TX path to construct messages into the ring
-
-Ring setup is done using setsockopt() to provide the ring parameters to the
-kernel, then a call to mmap() to map the ring into the processes address space:
-
-- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
-- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
-- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
-
-Usage of either ring is optional, but even if only the RX ring is used the
-mapping still needs to be writable in order to update the frame status after
-processing.
-
-Conversion of the reception path involves calling poll() on the file
-descriptor, once the socket is readable the frames from the ring are
-processed in order until no more messages are available, as indicated by
-a status word in the frame header.
-
-On kernel side, in order to make use of memory mapped I/O on receive, the
-originating netlink subsystem needs to support memory mapped I/O, otherwise
-it will use an allocated socket buffer as usual and the contents will be
- copied to the ring on transmission, nullifying most of the performance gains.
-Dumps of kernel databases automatically support memory mapped I/O.
-
-Conversion of the transmit path involves changing message construction to
-use memory from the TX ring instead of (usually) a buffer declared on the
-stack and setting up the frame header appropriately. Optionally poll() can
-be used to wait for free frames in the TX ring.
-
-Structured and definitions for using memory mapped I/O are contained in
-<linux/netlink.h>.
-
-RX and TX rings
-----------------
-
-Each ring contains a number of continuous memory blocks, containing frames of
-fixed size dependent on the parameters used for ring setup.
-
-Ring:	[ block 0 ]
-		[ frame 0 ]
-		[ frame 1 ]
-	[ block 1 ]
-		[ frame 2 ]
-		[ frame 3 ]
-	...
-	[ block n ]
-		[ frame 2 * n ]
-		[ frame 2 * n + 1 ]
-
-The blocks are only visible to the kernel, from the point of view of user-space
-the ring just contains the frames in a continuous memory zone.
-
-The ring parameters used for setting up the ring are defined as follows:
-
-struct nl_mmap_req {
-	unsigned int	nm_block_size;
-	unsigned int	nm_block_nr;
-	unsigned int	nm_frame_size;
-	unsigned int	nm_frame_nr;
-};
-
-Frames are grouped into blocks, where each block is a continuous region of memory
-and holds nm_block_size / nm_frame_size frames. The total number of frames in
-the ring is nm_frame_nr. The following invariants hold:
-
-- frames_per_block = nm_block_size / nm_frame_size
-
-- nm_frame_nr = frames_per_block * nm_block_nr
-
-Some parameters are constrained, specifically:
-
-- nm_block_size must be a multiple of the architectures memory page size.
-  The getpagesize() function can be used to get the page size.
-
-- nm_frame_size must be equal or larger to NL_MMAP_HDRLEN, IOW a frame must be
-  able to hold at least the frame header
-
-- nm_frame_size must be smaller or equal to nm_block_size
-
-- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT
-
-- nm_frame_nr must equal the actual number of frames as specified above.
-
-When the kernel can't allocate physically continuous memory for a ring block,
-it will fall back to use physically discontinuous memory. This might affect
-performance negatively, in order to avoid this the nm_frame_size parameter
-should be chosen to be as small as possible for the required frame size and
-the number of blocks should be increased instead.
-
-Ring frames
-------------
-
-Each frames contain a frame header, consisting of a synchronization word and some
-meta-data, and the message itself.
-
-Frame:	[ header message ]
-
-The frame header is defined as follows:
-
-struct nl_mmap_hdr {
-	unsigned int	nm_status;
-	unsigned int	nm_len;
-	__u32		nm_group;
-	/* credentials */
-	__u32		nm_pid;
-	__u32		nm_uid;
-	__u32		nm_gid;
-};
-
-- nm_status is used for synchronizing processing between the kernel and user-
-  space and specifies ownership of the frame as well as the operation to perform
-
-- nm_len contains the length of the message contained in the data area
-
-- nm_group specified the destination multicast group of message
-
-- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
-  process. These values correspond to the data available using SOCK_PASSCRED in
-  the SCM_CREDENTIALS cmsg.
-
-The possible values in the status word are:
-
-- NL_MMAP_STATUS_UNUSED:
-	RX ring:	frame belongs to the kernel and contains no message
-			for user-space. Approriate action is to invoke poll()
-			to wait for new messages.
-
-	TX ring:	frame belongs to user-space and can be used for
-			message construction.
-
-- NL_MMAP_STATUS_RESERVED:
-	RX ring only:	frame is currently used by the kernel for message
-			construction and contains no valid message yet.
-			Appropriate action is to invoke poll() to wait for
-			new messages.
-
-- NL_MMAP_STATUS_VALID:
-	RX ring:	frame contains a valid message. Approriate action is
-			to process the message and release the frame back to
-			the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED or queue the frame by setting the
-			status to NL_MMAP_STATUS_SKIP.
-
-	TX ring:	the frame contains a valid message from user-space to
-			be processed by the kernel. After completing processing
-			the kernel will release the frame back to user-space by
-			setting the status to NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_COPY:
-	RX ring only:	a message is ready to be processed but could not be
-			stored in the ring, either because it exceeded the
-			frame size or because the originating subsystem does
-			not support memory mapped I/O. Appropriate action is
-			to invoke recvmsg() to receive the message and release
-			the frame back to the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_SKIP:
-	RX ring only:	user-space queued the message for later processing, but
-			processed some messages following it in the ring. The
-			kernel should skip this frame when looking for unused
-			frames.
-
-The data area of a frame begins at a offset of NL_MMAP_HDRLEN relative to the
-frame header.
-
-TX limitations
---------------
-
-As of Jan 2015 the message is always copied from the ring frame to an
-allocated buffer due to unresolved security concerns.
-See commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.").
-
-Example
--------
-
-Ring setup:
-
-	unsigned int block_size = 16 * getpagesize();
-	struct nl_mmap_req req = {
-		.nm_block_size		= block_size,
-		.nm_block_nr		= 64,
-		.nm_frame_size		= 16384,
-		.nm_frame_nr		= 64 * block_size / 16384,
-	};
-	unsigned int ring_size;
-	void *rx_ring, *tx_ring;
-
-	/* Configure ring parameters */
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
-		exit(1);
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
-		exit(1)
-
-	/* Calculate size of each individual ring */
-	ring_size = req.nm_block_nr * req.nm_block_size;
-
-	/* Map RX/TX rings. The TX ring is located after the RX ring */
-	rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
-		       MAP_SHARED, fd, 0);
-	if ((long)rx_ring == -1L)
-		exit(1);
-	tx_ring = rx_ring + ring_size:
-
-Message reception:
-
-This example assumes some ring parameters of the ring setup are available.
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	unsigned char buf[16384];
-	ssize_t len;
-
-	while (1) {
-		struct pollfd pfds[1];
-
-		pfds[0].fd	= fd;
-		pfds[0].events	= POLLIN | POLLERR;
-		pfds[0].revents	= 0;
-
-		if (poll(pfds, 1, -1) < 0 && errno != -EINTR)
-			exit(1);
-
-		/* Check for errors. Error handling omitted */
-		if (pfds[0].revents & POLLERR)
-			<handle error>
-
-		/* If no new messages, poll again */
-		if (!(pfds[0].revents & POLLIN))
-			continue;
-
-		/* Process all frames */
-		while (1) {
-			/* Get next frame header */
-			hdr = rx_ring + frame_offset;
-
-			if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
-				/* Regular memory mapped frame */
-				nlh = (void *)hdr + NL_MMAP_HDRLEN;
-				len = hdr->nm_len;
-
-				/* Release empty message immediately. May happen
-				 * on error during message construction.
-				 */
-				if (len == 0)
-					goto release;
-			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
-				/* Frame queued to socket receive queue */
-				len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
-				if (len <= 0)
-					break;
-				nlh = buf;
-			} else
-				/* No more messages to process, continue polling */
-				break;
-
-			process_msg(nlh);
-release:
-			/* Release frame back to the kernel */
-			hdr->nm_status = NL_MMAP_STATUS_UNUSED;
-
-			/* Advance frame offset to next frame */
-			frame_offset = (frame_offset + frame_size) % ring_size;
-		}
-	}
-
-Message transmission:
-
-This example assumes some ring parameters of the ring setup are available.
-A single message is constructed and transmitted, to send multiple messages
-at once they would be constructed in consecutive frames before a final call
-to sendto().
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	struct sockaddr_nl addr = {
-		.nl_family	= AF_NETLINK,
-	};
-
-	hdr = tx_ring + frame_offset;
-	if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
-		/* No frame available. Use poll() to avoid. */
-		exit(1);
-
-	nlh = (void *)hdr + NL_MMAP_HDRLEN;
-
-	/* Build message */
-	build_message(nlh);
-
-	/* Fill frame header: length and status need to be set */
-	hdr->nm_len	= nlh->nlmsg_len;
-	hdr->nm_status	= NL_MMAP_STATUS_VALID;
-
-	if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0)
-		exit(1);
-
-	/* Advance frame offset to next frame */
-	frame_offset = (frame_offset + frame_size) % ring_size;
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f095155d8749..0dba4e4ed2be 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -107,8 +107,10 @@ struct nlmsgerr {
 #define NETLINK_PKTINFO			3
 #define NETLINK_BROADCAST_ERROR		4
 #define NETLINK_NO_ENOBUFS		5
+#ifndef __KERNEL__
 #define NETLINK_RX_RING			6
 #define NETLINK_TX_RING			7
+#endif
 #define NETLINK_LISTEN_ALL_NSID		8
 #define NETLINK_LIST_MEMBERSHIPS	9
 #define NETLINK_CAP_ACK			10
@@ -134,6 +136,7 @@ struct nl_mmap_hdr {
 	__u32		nm_gid;
 };
 
+#ifndef __KERNEL__
 enum nl_mmap_status {
 	NL_MMAP_STATUS_UNUSED,
 	NL_MMAP_STATUS_RESERVED,
@@ -145,6 +148,7 @@ enum nl_mmap_status {
 #define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO
 #define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
 #define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+#endif
 
 #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/
 
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index f2159d30d1f5..d79399394b46 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -48,6 +48,8 @@ enum {
 
 #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */
 #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */
+#ifndef __KERNEL__
 #define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */
+#endif
 
 #endif
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
index 2c5e95e9bfbd..5d6e8c05b3d4 100644
--- a/net/netlink/Kconfig
+++ b/net/netlink/Kconfig
@@ -2,15 +2,6 @@
 # Netlink Sockets
 #
 
-config NETLINK_MMAP
-	bool "NETLINK: mmaped IO"
-	---help---
-	  This option enables support for memory mapped netlink IO. This
-	  reduces overhead by avoiding copying data between kernel- and
-	  userspace.
-
-	  If unsure, say N.
-
 config NETLINK_DIAG
 	tristate "NETLINK: socket monitoring interface"
 	default n
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1ffb34e253f..85aa6ef86dfd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
 	dev_hold(dev);
 
-	if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
+	if (is_vmalloc_addr(skb->head))
 		nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
 	else
 		nskb = skb_clone(skb, GFP_ATOMIC);
@@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk)
 		wake_up_interruptible(&nlk->wait);
 }
 
-#ifdef CONFIG_NETLINK_MMAP
-static bool netlink_rx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
-}
-
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
-static __pure struct page *pgvec_to_page(const void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		return vmalloc_to_page(addr);
-	else
-		return virt_to_page(addr);
-}
-
-static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
-{
-	unsigned int i;
-
-	for (i = 0; i < len; i++) {
-		if (pg_vec[i] != NULL) {
-			if (is_vmalloc_addr(pg_vec[i]))
-				vfree(pg_vec[i]);
-			else
-				free_pages((unsigned long)pg_vec[i], order);
-		}
-	}
-	kfree(pg_vec);
-}
-
-static void *alloc_one_pg_vec_page(unsigned long order)
-{
-	void *buffer;
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
-			  __GFP_NOWARN | __GFP_NORETRY;
-
-	buffer = (void *)__get_free_pages(gfp_flags, order);
-	if (buffer != NULL)
-		return buffer;
-
-	buffer = vzalloc((1 << order) * PAGE_SIZE);
-	if (buffer != NULL)
-		return buffer;
-
-	gfp_flags &= ~__GFP_NORETRY;
-	return (void *)__get_free_pages(gfp_flags, order);
-}
-
-static void **alloc_pg_vec(struct netlink_sock *nlk,
-			   struct nl_mmap_req *req, unsigned int order)
-{
-	unsigned int block_nr = req->nm_block_nr;
-	unsigned int i;
-	void **pg_vec;
-
-	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
-	if (pg_vec == NULL)
-		return NULL;
-
-	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (pg_vec[i] == NULL)
-			goto err1;
-	}
-
-	return pg_vec;
-err1:
-	free_pg_vec(pg_vec, order, block_nr);
-	return NULL;
-}
-
-
-static void
-__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
-		   unsigned int order)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct sk_buff_head *queue;
-	struct netlink_ring *ring;
-
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	spin_lock_bh(&queue->lock);
-
-	ring->frame_max		= req->nm_frame_nr - 1;
-	ring->head		= 0;
-	ring->frame_size	= req->nm_frame_size;
-	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
-
-	swap(ring->pg_vec_len, req->nm_block_nr);
-	swap(ring->pg_vec_order, order);
-	swap(ring->pg_vec, pg_vec);
-
-	__skb_queue_purge(queue);
-	spin_unlock_bh(&queue->lock);
-
-	WARN_ON(atomic_read(&nlk->mapped));
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-}
-
-static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool tx_ring)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	void **pg_vec = NULL;
-	unsigned int order = 0;
-
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	if (atomic_read(&nlk->mapped))
-		return -EBUSY;
-	if (atomic_read(&ring->pending))
-		return -EBUSY;
-
-	if (req->nm_block_nr) {
-		if (ring->pg_vec != NULL)
-			return -EBUSY;
-
-		if ((int)req->nm_block_size <= 0)
-			return -EINVAL;
-		if (!PAGE_ALIGNED(req->nm_block_size))
-			return -EINVAL;
-		if (req->nm_frame_size < NL_MMAP_HDRLEN)
-			return -EINVAL;
-		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
-			return -EINVAL;
-
-		ring->frames_per_block = req->nm_block_size /
-					 req->nm_frame_size;
-		if (ring->frames_per_block == 0)
-			return -EINVAL;
-		if (ring->frames_per_block * req->nm_block_nr !=
-		    req->nm_frame_nr)
-			return -EINVAL;
-
-		order = get_order(req->nm_block_size);
-		pg_vec = alloc_pg_vec(nlk, req, order);
-		if (pg_vec == NULL)
-			return -ENOMEM;
-	} else {
-		if (req->nm_frame_nr)
-			return -EINVAL;
-	}
-
-	mutex_lock(&nlk->pg_vec_lock);
-	if (atomic_read(&nlk->mapped) == 0) {
-		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
-		mutex_unlock(&nlk->pg_vec_lock);
-		return 0;
-	}
-
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-
-	return -EBUSY;
-}
-
-static void netlink_mm_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_inc(&nlk_sk(sk)->mapped);
-}
-
-static void netlink_mm_close(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_dec(&nlk_sk(sk)->mapped);
-}
-
-static const struct vm_operations_struct netlink_mmap_ops = {
-	.open	= netlink_mm_open,
-	.close	= netlink_mm_close,
-};
-
-static int netlink_mmap(struct file *file, struct socket *sock,
-			struct vm_area_struct *vma)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	unsigned long start, size, expected;
-	unsigned int i;
-	int err = -EINVAL;
-
-	if (vma->vm_pgoff)
-		return -EINVAL;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	expected = 0;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
-	}
-
-	if (expected == 0)
-		goto out;
-
-	size = vma->vm_end - vma->vm_start;
-	if (size != expected)
-		goto out;
-
-	start = vma->vm_start;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-
-		for (i = 0; i < ring->pg_vec_len; i++) {
-			struct page *page;
-			void *kaddr = ring->pg_vec[i];
-			unsigned int pg_num;
-
-			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
-				page = pgvec_to_page(kaddr);
-				err = vm_insert_page(vma, start, page);
-				if (err < 0)
-					goto out;
-				start += PAGE_SIZE;
-				kaddr += PAGE_SIZE;
-			}
-		}
-	}
-
-	atomic_inc(&nlk->mapped);
-	vma->vm_ops = &netlink_mmap_ops;
-	err = 0;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
-{
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
-	struct page *p_start, *p_end;
-
-	/* First page is flushed through netlink_{get,set}_status */
-	p_start = pgvec_to_page(hdr + PAGE_SIZE);
-	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
-	while (p_start <= p_end) {
-		flush_dcache_page(p_start);
-		p_start++;
-	}
-#endif
-}
-
-static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
-{
-	smp_rmb();
-	flush_dcache_page(pgvec_to_page(hdr));
-	return hdr->nm_status;
-}
-
-static void netlink_set_status(struct nl_mmap_hdr *hdr,
-			       enum nl_mmap_status status)
-{
-	smp_mb();
-	hdr->nm_status = status;
-	flush_dcache_page(pgvec_to_page(hdr));
-}
-
-static struct nl_mmap_hdr *
-__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
-{
-	unsigned int pg_vec_pos, frame_off;
-
-	pg_vec_pos = pos / ring->frames_per_block;
-	frame_off  = pos % ring->frames_per_block;
-
-	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
-}
-
-static struct nl_mmap_hdr *
-netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
-		     enum nl_mmap_status status)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = __netlink_lookup_frame(ring, pos);
-	if (netlink_get_status(hdr) != status)
-		return NULL;
-
-	return hdr;
-}
-
-static struct nl_mmap_hdr *
-netlink_current_frame(const struct netlink_ring *ring,
-		      enum nl_mmap_status status)
-{
-	return netlink_lookup_frame(ring, ring->head, status);
-}
-
-static void netlink_increment_head(struct netlink_ring *ring)
-{
-	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
-}
-
-static void netlink_forward_ring(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, ring->head);
-		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
-			break;
-		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
-			break;
-		netlink_increment_head(ring);
-	} while (ring->head != head);
-}
-
-static bool netlink_has_valid_frame(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head, pos = head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, pos);
-		if (hdr->nm_status == NL_MMAP_STATUS_VALID)
-			return true;
-		pos = pos != 0 ? pos - 1 : ring->frame_max;
-	} while (pos != head);
-
-	return false;
-}
-
-static bool netlink_dump_space(struct netlink_sock *nlk)
-{
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-	unsigned int n;
-
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		return false;
-
-	n = ring->head + ring->frame_max / 2;
-	if (n > ring->frame_max)
-		n -= ring->frame_max;
-
-	hdr = __netlink_lookup_frame(ring, n);
-
-	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
-}
-
-static unsigned int netlink_poll(struct file *file, struct socket *sock,
-				 poll_table *wait)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	unsigned int mask;
-	int err;
-
-	if (nlk->rx_ring.pg_vec != NULL) {
-		/* Memory mapped sockets don't call recvmsg(), so flow control
-		 * for dumps is performed here. A dump is allowed to continue
-		 * if at least half the ring is unused.
-		 */
-		while (nlk->cb_running && netlink_dump_space(nlk)) {
-			err = netlink_dump(sk);
-			if (err < 0) {
-				sk->sk_err = -err;
-				sk->sk_error_report(sk);
-				break;
-			}
-		}
-		netlink_rcv_wake(sk);
-	}
-
-	mask = datagram_poll(file, sock, wait);
-
-	/* We could already have received frames in the normal receive
-	 * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
-	 * so if mask contains pollin/etc already, there's no point
-	 * walking the ring.
-	 */
-	if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (nlk->rx_ring.pg_vec) {
-			if (netlink_has_valid_frame(&nlk->rx_ring))
-				mask |= POLLIN | POLLRDNORM;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-	}
-
-	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (nlk->tx_ring.pg_vec) {
-		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
-			mask |= POLLOUT | POLLWRNORM;
-	}
-	spin_unlock_bh(&sk->sk_write_queue.lock);
-
-	return mask;
-}
-
-static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
-{
-	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
-}
-
-static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
-				   struct netlink_ring *ring,
-				   struct nl_mmap_hdr *hdr)
-{
-	unsigned int size;
-	void *data;
-
-	size = ring->frame_size - NL_MMAP_HDRLEN;
-	data = (void *)hdr + NL_MMAP_HDRLEN;
-
-	skb->head	= data;
-	skb->data	= data;
-	skb_reset_tail_pointer(skb);
-	skb->end	= skb->tail + size;
-	skb->len	= 0;
-
-	skb->destructor	= netlink_skb_destructor;
-	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
-	NETLINK_CB(skb).sk = sk;
-}
-
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
-				u32 dst_portid, u32 dst_group,
-				struct scm_cookie *scm)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-	struct sk_buff *skb;
-	unsigned int maxlen;
-	int err = 0, len = 0;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	ring   = &nlk->tx_ring;
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
-	do {
-		unsigned int nm_len;
-
-		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
-		if (hdr == NULL) {
-			if (!(msg->msg_flags & MSG_DONTWAIT) &&
-			    atomic_read(&nlk->tx_ring.pending))
-				schedule();
-			continue;
-		}
-
-		nm_len = ACCESS_ONCE(hdr->nm_len);
-		if (nm_len > maxlen) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		netlink_frame_flush_dcache(hdr, nm_len);
-
-		skb = alloc_skb(nm_len, GFP_KERNEL);
-		if (skb == NULL) {
-			err = -ENOBUFS;
-			goto out;
-		}
-		__skb_put(skb, nm_len);
-		memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
-		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-
-		netlink_increment_head(ring);
-
-		NETLINK_CB(skb).portid	  = nlk->portid;
-		NETLINK_CB(skb).dst_group = dst_group;
-		NETLINK_CB(skb).creds	  = scm->creds;
-
-		err = security_netlink_send(sk, skb);
-		if (err) {
-			kfree_skb(skb);
-			goto out;
-		}
-
-		if (unlikely(dst_group)) {
-			atomic_inc(&skb->users);
-			netlink_broadcast(sk, skb, dst_portid, dst_group,
-					  GFP_KERNEL);
-		}
-		err = netlink_unicast(sk, skb, dst_portid,
-				      msg->msg_flags & MSG_DONTWAIT);
-		if (err < 0)
-			goto out;
-		len += err;
-
-	} while (hdr != NULL ||
-		 (!(msg->msg_flags & MSG_DONTWAIT) &&
-		  atomic_read(&nlk->tx_ring.pending)));
-
-	if (len > 0)
-		err = len;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = netlink_mmap_hdr(skb);
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_frame_flush_dcache(hdr, hdr->nm_len);
-	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-
-	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
-	kfree_skb(skb);
-}
-
-static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL) {
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		kfree_skb(skb);
-		netlink_overrun(sk);
-		return;
-	}
-	netlink_increment_head(ring);
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
-}
-
-#else /* CONFIG_NETLINK_MMAP */
-#define netlink_rx_is_mmaped(sk)	false
-#define netlink_tx_is_mmaped(sk)	false
-#define netlink_mmap			sock_no_mmap
-#define netlink_poll			datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
-#endif /* CONFIG_NETLINK_MMAP */
-
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	struct nl_mmap_hdr *hdr;
-	struct netlink_ring *ring;
-	struct sock *sk;
-
-	/* If a packet from the kernel to userspace was freed because of an
-	 * error without being delivered to userspace, the kernel must reset
-	 * the status. In the direction userspace to kernel, the status is
-	 * always reset here after the packet was processed and freed.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		hdr = netlink_mmap_hdr(skb);
-		sk = NETLINK_CB(skb).sk;
-
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
-		}
-
-		WARN_ON(atomic_read(&ring->pending) == 0);
-		atomic_dec(&ring->pending);
-		sock_put(sk);
-
-		skb->head = NULL;
-	}
-#endif
 	if (is_vmalloc_addr(skb->head)) {
 		if (!skb->cloned ||
 		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
@@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
-#ifdef CONFIG_NETLINK_MMAP
-	if (1) {
-		struct nl_mmap_req req;
-
-		memset(&req, 0, sizeof(req));
-		if (nlk->rx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, false, NULL, 0);
-		memset(&req, 0, sizeof(req));
-		if (nlk->tx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, true, NULL, 0);
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
-#ifdef CONFIG_NETLINK_MMAP
-	mutex_init(&nlk->pg_vec_lock);
-#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 	nlk = nlk_sk(sk);
 
 	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
-	    !netlink_skb_is_mmaped(skb)) {
+	     test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 
 	netlink_deliver_tap(skb);
 
-#ifdef CONFIG_NETLINK_MMAP
-	if (netlink_skb_is_mmaped(skb))
-		netlink_queue_mmaped_skb(sk, skb);
-	else if (netlink_rx_is_mmaped(sk))
-		netlink_ring_set_copied(sk, skb);
-	else
-#endif /* CONFIG_NETLINK_MMAP */
-		skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk);
 	return len;
 }
@@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 	int delta;
 
 	WARN_ON(skb->sk != NULL);
-	if (netlink_skb_is_mmaped(skb))
-		return skb;
-
 	delta = skb->end - skb->tail;
 	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
 		return skb;
@@ -1880,71 +1252,6 @@ struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
 				    unsigned int ldiff, u32 dst_portid,
 				    gfp_t gfp_mask)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	unsigned int maxlen, linear_size;
-	struct sock *sk = NULL;
-	struct sk_buff *skb;
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-
-	sk = netlink_getsockbyportid(ssk, dst_portid);
-	if (IS_ERR(sk))
-		goto out;
-
-	ring = &nlk_sk(sk)->rx_ring;
-	/* fast-path without atomic ops for common case: non-mmaped receiver */
-	if (ring->pg_vec == NULL)
-		goto out_put;
-
-	/* We need to account the full linear size needed as a ring
-	 * slot cannot have non-linear parts.
-	 */
-	linear_size = size + ldiff;
-	if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
-		goto out_put;
-
-	skb = alloc_skb_head(gfp_mask);
-	if (skb == NULL)
-		goto err1;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	/* check again under lock */
-	if (ring->pg_vec == NULL)
-		goto out_free;
-
-	/* check again under lock */
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-	if (maxlen < linear_size)
-		goto out_free;
-
-	netlink_forward_ring(ring);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		goto err2;
-
-	netlink_ring_setup_skb(skb, sk, ring, hdr);
-	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
-	atomic_inc(&ring->pending);
-	netlink_increment_head(ring);
-
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	return skb;
-
-err2:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	netlink_overrun(sk);
-err1:
-	sock_put(sk);
-	return NULL;
-
-out_free:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-out_put:
-	sock_put(sk);
-out:
-#endif
 	return alloc_skb(size, gfp_mask);
 }
 EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
@@ -2225,8 +1532,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
-	    optlen >= sizeof(int) &&
+	if (optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -2279,25 +1585,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		}
 		err = 0;
 		break;
-#ifdef CONFIG_NETLINK_MMAP
-	case NETLINK_RX_RING:
-	case NETLINK_TX_RING: {
-		struct nl_mmap_req req;
-
-		/* Rings might consume more memory than queue limits, require
-		 * CAP_NET_ADMIN.
-		 */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		if (optlen < sizeof(req))
-			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
-			return -EFAULT;
-		err = netlink_set_ring(sk, &req,
-				       optname == NETLINK_TX_RING);
-		break;
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 	case NETLINK_LISTEN_ALL_NSID:
 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
 			return -EPERM;
@@ -2467,18 +1754,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		smp_rmb();
 	}
 
-	/* It's a really convoluted way for userland to ask for mmaped
-	 * sendmsg(), but that's what we've got...
-	 */
-	if (netlink_tx_is_mmaped(sk) &&
-	    iter_is_iovec(&msg->msg_iter) &&
-	    msg->msg_iter.nr_segs == 1 &&
-	    msg->msg_iter.iov->iov_base == NULL) {
-		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
-					   &scm);
-		goto out;
-	}
-
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
@@ -2794,8 +2069,7 @@ static int netlink_dump(struct sock *sk)
 		goto errout_skb;
 	}
 
-	if (!netlink_rx_is_mmaped(sk) &&
-	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		goto errout_skb;
 
 	/* NLMSG_GOODSIZE is small to avoid high order allocations being
@@ -2831,8 +2105,7 @@ static int netlink_dump(struct sock *sk)
 	 * reasonable static buffer based on the expected largest dump of a
 	 * single netdev. The outcome is MSG_TRUNC error.
 	 */
-	if (!netlink_rx_is_mmaped(sk))
-		skb_reserve(skb, skb_tailroom(skb) - alloc_size);
+	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
 	netlink_skb_set_owner_r(skb, sk);
 
 	len = cb->dump(skb, cb);
@@ -2884,16 +2157,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	struct netlink_sock *nlk;
 	int ret;
 
-	/* Memory mapped dump requests need to be copied to avoid looping
-	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
-	 * a reference to the skb.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		skb = skb_copy(skb, GFP_KERNEL);
-		if (skb == NULL)
-			return -ENOBUFS;
-	} else
-		atomic_inc(&skb->users);
+	atomic_inc(&skb->users);
 
 	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
 	if (sk == NULL) {
@@ -3241,7 +2505,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll =		netlink_poll,
+	.poll =		datagram_poll,
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -3249,7 +2513,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		netlink_mmap,
+	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 14437d9b1965..e68ef9ccd703 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -44,12 +44,6 @@ struct netlink_sock {
 	int			(*netlink_bind)(struct net *net, int group);
 	void			(*netlink_unbind)(struct net *net, int group);
 	struct module		*module;
-#ifdef CONFIG_NETLINK_MMAP
-	struct mutex		pg_vec_lock;
-	struct netlink_ring	rx_ring;
-	struct netlink_ring	tx_ring;
-	atomic_t		mapped;
-#endif /* CONFIG_NETLINK_MMAP */
 
 	struct rhash_head	node;
 	struct rcu_head		rcu;
@@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
 	return container_of(sk, struct netlink_sock, sk);
 }
 
-static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb)
-{
-#ifdef CONFIG_NETLINK_MMAP
-	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
-#else
-	return false;
-#endif /* CONFIG_NETLINK_MMAP */
-}
-
 struct netlink_table {
 	struct rhashtable	hash;
 	struct hlist_head	mc_list;
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 3ee63a3cff30..8dd836a8dd60 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -8,41 +8,6 @@
 
 #include "af_netlink.h"
 
-#ifdef CONFIG_NETLINK_MMAP
-static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
-			    struct sk_buff *nlskb)
-{
-	struct netlink_diag_ring ndr;
-
-	ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
-	ndr.ndr_block_nr   = ring->pg_vec_len;
-	ndr.ndr_frame_size = ring->frame_size;
-	ndr.ndr_frame_nr   = ring->frame_max + 1;
-
-	return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
-}
-
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	int ret;
-
-	mutex_lock(&nlk->pg_vec_lock);
-	ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
-	if (!ret)
-		ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
-				       nlskb);
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	return ret;
-}
-#else
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	return 0;
-}
-#endif
-
 static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
-	if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
-	    sk_diag_put_rings_cfg(sk, skb))
-		goto out_nlmsg_trim;
-
 	nlmsg_end(skb, nlh);
 	return 0;
 
-- 
cgit v1.2.3


From 2125715635053d4207a756a35aa718f548824e58 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 16 Feb 2016 12:46:54 +0100
Subject: bridge: mdb: add support for more attributes and export timer

Currently mdb entries are exported directly as a structure inside
MDBA_MDB_ENTRY_INFO attribute, we can't really extend it without
breaking user-space. In order to export new mdb fields, I've converted
the MDBA_MDB_ENTRY_INFO into a nested attribute which starts like before
with struct br_mdb_entry (without header, as it's casted directly in
iproute2) and continues with MDBA_MDB_EATTR_ attributes. This way we
keep compatibility with older users and can export new data.
I've tested this with iproute2, both with and without support for the
added attribute and it works fine.
So basically we again have MDBA_MDB_ENTRY_INFO with struct br_mdb_entry
inside but it may contain also some additional MDBA_MDB_EATTR_ attributes
such as MDBA_MDB_EATTR_TIMER which can be parsed by user-space.

So the new structure is:
[MDBA_MDB] = {
     [MDBA_MDB_ENTRY] = {
         [MDBA_MDB_ENTRY_INFO]
         [MDBA_MDB_ENTRY_INFO] { <- Nested attribute
             struct br_mdb_entry <- nla_put_nohdr()
             [MDBA_MDB_ENTRY attributes] <- normal netlink attributes
         }
     }
}

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 13 ++++++++++++-
 net/bridge/br_mdb.c            | 16 +++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index ec3547234998..0890b217580d 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -137,7 +137,10 @@ struct bridge_vlan_info {
 /* Bridge multicast database attributes
  * [MDBA_MDB] = {
  *     [MDBA_MDB_ENTRY] = {
- *         [MDBA_MDB_ENTRY_INFO]
+ *         [MDBA_MDB_ENTRY_INFO] {
+ *		struct br_mdb_entry
+ *		[MDBA_MDB_EATTR attributes]
+ *         }
  *     }
  * }
  * [MDBA_ROUTER] = {
@@ -166,6 +169,14 @@ enum {
 };
 #define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1)
 
+/* per mdb entry additional attributes */
+enum {
+	MDBA_MDB_EATTR_UNSPEC,
+	MDBA_MDB_EATTR_TIMER,
+	__MDBA_MDB_EATTR_MAX
+};
+#define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index e66619171386..cf51b7bcb5d5 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -88,11 +88,13 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			for (pp = &mp->ports;
 			     (p = rcu_dereference(*pp)) != NULL;
 			      pp = &p->next) {
+				struct nlattr *nest_ent;
 				struct br_mdb_entry e;
 
 				port = p->port;
 				if (!port)
 					continue;
+
 				memset(&e, 0, sizeof(e));
 				e.ifindex = port->dev->ifindex;
 				e.vid = p->addr.vid;
@@ -104,11 +106,23 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 					e.addr.u.ip6 = p->addr.u.ip6;
 #endif
 				e.addr.proto = p->addr.proto;
-				if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+				nest_ent = nla_nest_start(skb,
+							  MDBA_MDB_ENTRY_INFO);
+				if (!nest_ent) {
+					nla_nest_cancel(skb, nest2);
+					err = -EMSGSIZE;
+					goto out;
+				}
+				if (nla_put_nohdr(skb, sizeof(e), &e) ||
+				    nla_put_u32(skb,
+						MDBA_MDB_EATTR_TIMER,
+						br_timer_value(&p->timer))) {
+					nla_nest_cancel(skb, nest_ent);
 					nla_nest_cancel(skb, nest2);
 					err = -EMSGSIZE;
 					goto out;
 				}
+				nla_nest_end(skb, nest_ent);
 			}
 			nla_nest_end(skb, nest2);
 		skip:
-- 
cgit v1.2.3


From ac2c7ad0e5d6030452c9af2fafd192e17fd04264 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:01 -0500
Subject: net/ethtool: introduce a new ioctl for per queue setting

Introduce a new ioctl ETHTOOL_PERQUEUE for per queue parameters setting.
The following patches will enable some SUB_COMMANDs for per queue
setting.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 17 +++++++++++++++++
 net/core/ethtool.c           | 27 +++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 190aea0faaf4..f15ae02621a1 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1202,6 +1202,21 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_F_WISH          (1 << ETHTOOL_F_WISH__BIT)
 #define ETHTOOL_F_COMPAT        (1 << ETHTOOL_F_COMPAT__BIT)
 
+#define MAX_NUM_QUEUE		4096
+
+/**
+ * struct ethtool_per_queue_op - apply sub command to the queues in mask.
+ * @cmd: ETHTOOL_PERQUEUE
+ * @sub_command: the sub command which apply to each queues
+ * @queue_mask: Bitmap of the queues which sub command apply to
+ * @data: A complete command structure following for each of the queues addressed
+ */
+struct ethtool_per_queue_op {
+	__u32	cmd;
+	__u32	sub_command;
+	__u32	queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+	char	data[];
+};
 
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* Get settings. */
@@ -1285,6 +1300,8 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_STUNABLE	0x00000049 /* Set tunable configuration */
 #define ETHTOOL_GPHYSTATS	0x0000004a /* get PHY-specific statistics */
 
+#define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index c2d3118b1395..d640ecf71e74 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1888,13 +1888,27 @@ out:
 	return ret;
 }
 
+static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_per_queue_op per_queue_opt;
+
+	if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+		return -EFAULT;
+
+	switch (per_queue_opt.sub_command) {
+
+	default:
+		return -EOPNOTSUPP;
+	};
+}
+
 /* The main entry point in this file.  Called from net/core/dev_ioctl.c */
 
 int dev_ethtool(struct net *net, struct ifreq *ifr)
 {
 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
 	void __user *useraddr = ifr->ifr_data;
-	u32 ethcmd;
+	u32 ethcmd, sub_cmd;
 	int rc;
 	netdev_features_t old_features;
 
@@ -1904,8 +1918,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
+	if (ethcmd == ETHTOOL_PERQUEUE) {
+		if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+			return -EFAULT;
+	} else {
+		sub_cmd = ethcmd;
+	}
 	/* Allow some commands to be done by anyone */
-	switch (ethcmd) {
+	switch (sub_cmd) {
 	case ETHTOOL_GSET:
 	case ETHTOOL_GDRVINFO:
 	case ETHTOOL_GMSGLVL:
@@ -2135,6 +2155,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GPHYSTATS:
 		rc = ethtool_get_phy_stats(dev, useraddr);
 		break;
+	case ETHTOOL_PERQUEUE:
+		rc = ethtool_set_per_queue(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 17 Feb 2016 19:58:58 -0800
Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE

add new map type to store stack traces and corresponding helper
bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
@ctx: struct pt_regs*
@map: pointer to stack_trace map
@flags: bits 0-7 - numer of stack frames to skip
        bit 8 - collect user stack instead of kernel
        bit 9 - compare stacks by hash only
        bit 10 - if two different stacks hash into the same stackid
                 discard old
        other bits - reserved
Return: >= 0 stackid on success or negative error

stackid is a 32-bit integer handle that can be further combined with
other data (including other stackid) and used as a key into maps.

Userspace will access stackmap using standard lookup/delete syscall commands to
retrieve full stack trace for given stackid.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   1 +
 include/uapi/linux/bpf.h |  21 +++++
 kernel/bpf/Makefile      |   3 +
 kernel/bpf/stackmap.c    | 237 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c    |   6 +-
 kernel/trace/bpf_trace.c |   2 +
 6 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/stackmap.c

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 90ee6ab24bc5..0cadbb7456c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ee0fde1bf96..d3e77da8e9e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -83,6 +83,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 	BPF_MAP_TYPE_PERCPU_HASH,
 	BPF_MAP_TYPE_PERCPU_ARRAY,
+	BPF_MAP_TYPE_STACK_TRACE,
 };
 
 enum bpf_prog_type {
@@ -272,6 +273,20 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_perf_event_output,
 	BPF_FUNC_skb_load_bytes,
+
+	/**
+	 * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
+	 * @ctx: struct pt_regs*
+	 * @map: pointer to stack_trace map
+	 * @flags: bits 0-7 - numer of stack frames to skip
+	 *         bit 8 - collect user stack instead of kernel
+	 *         bit 9 - compare stacks by hash only
+	 *         bit 10 - if two different stacks hash into the same stackid
+	 *                  discard old
+	 *         other bits - reserved
+	 * Return: >= 0 stackid on success or negative error
+	 */
+	BPF_FUNC_get_stackid,
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -294,6 +309,12 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
+/* BPF_FUNC_get_stackid flags. */
+#define BPF_F_SKIP_FIELD_MASK		0xffULL
+#define BPF_F_USER_STACK		(1ULL << 8)
+#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
+#define BPF_F_REUSE_STACKID		(1ULL << 10)
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 13272582eee0..8a932d079c24 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,3 +2,6 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
new file mode 100644
index 000000000000..8a60ee14a977
--- /dev/null
+++ b/kernel/bpf/stackmap.c
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
+
+struct stack_map_bucket {
+	struct rcu_head rcu;
+	u32 hash;
+	u32 nr;
+	u64 ip[];
+};
+
+struct bpf_stack_map {
+	struct bpf_map map;
+	u32 n_buckets;
+	struct stack_map_bucket __rcu *buckets[];
+};
+
+/* Called from syscall */
+static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
+{
+	u32 value_size = attr->value_size;
+	struct bpf_stack_map *smap;
+	u64 cost, n_buckets;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    value_size < 8 || value_size % 8 ||
+	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+		return ERR_PTR(-EINVAL);
+
+	/* hash table size must be power of 2 */
+	n_buckets = roundup_pow_of_two(attr->max_entries);
+
+	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
+	if (!smap) {
+		smap = vzalloc(cost);
+		if (!smap)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	err = -E2BIG;
+	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_smap;
+
+	smap->map.map_type = attr->map_type;
+	smap->map.key_size = attr->key_size;
+	smap->map.value_size = value_size;
+	smap->map.max_entries = attr->max_entries;
+	smap->n_buckets = n_buckets;
+	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	err = get_callchain_buffers();
+	if (err)
+		goto free_smap;
+
+	return &smap->map;
+
+free_smap:
+	kvfree(smap);
+	return ERR_PTR(err);
+}
+
+static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+	struct pt_regs *regs = (struct pt_regs *) (long) r1;
+	struct bpf_map *map = (struct bpf_map *) (long) r2;
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct perf_callchain_entry *trace;
+	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+	u32 max_depth = map->value_size / 8;
+	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
+	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	u32 hash, id, trace_nr, trace_len;
+	bool user = flags & BPF_F_USER_STACK;
+	bool kernel = !user;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+
+	if (unlikely(!trace))
+		/* couldn't fetch the stack trace */
+		return -EFAULT;
+
+	/* get_perf_callchain() guarantees that trace->nr >= init_nr
+	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 */
+	trace_nr = trace->nr - init_nr;
+
+	if (trace_nr <= skip)
+		/* skipping more than usable stack trace */
+		return -EFAULT;
+
+	trace_nr -= skip;
+	trace_len = trace_nr * sizeof(u64);
+	ips = trace->ip + skip + init_nr;
+	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
+	id = hash & (smap->n_buckets - 1);
+	bucket = rcu_dereference(smap->buckets[id]);
+
+	if (bucket && bucket->hash == hash) {
+		if (flags & BPF_F_FAST_STACK_CMP)
+			return id;
+		if (bucket->nr == trace_nr &&
+		    memcmp(bucket->ip, ips, trace_len) == 0)
+			return id;
+	}
+
+	/* this call stack is not in the map, try to add it */
+	if (bucket && !(flags & BPF_F_REUSE_STACKID))
+		return -EEXIST;
+
+	new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
+			     GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!new_bucket))
+		return -ENOMEM;
+
+	memcpy(new_bucket->ip, ips, trace_len);
+	memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
+	new_bucket->hash = hash;
+	new_bucket->nr = trace_nr;
+
+	old_bucket = xchg(&smap->buckets[id], new_bucket);
+	if (old_bucket)
+		kfree_rcu(old_bucket, rcu);
+	return id;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto = {
+	.func		= bpf_get_stackid,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+/* Called from syscall or from eBPF program */
+static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct stack_map_bucket *bucket;
+	u32 id = *(u32 *)key;
+
+	if (unlikely(id >= smap->n_buckets))
+		return NULL;
+	bucket = rcu_dereference(smap->buckets[id]);
+	return bucket ? bucket->ip : NULL;
+}
+
+static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	return -EINVAL;
+}
+
+static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct stack_map_bucket *old_bucket;
+	u32 id = *(u32 *)key;
+
+	if (unlikely(id >= smap->n_buckets))
+		return -E2BIG;
+
+	old_bucket = xchg(&smap->buckets[id], NULL);
+	if (old_bucket) {
+		kfree_rcu(old_bucket, rcu);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void stack_map_free(struct bpf_map *map)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	int i;
+
+	synchronize_rcu();
+
+	for (i = 0; i < smap->n_buckets; i++)
+		if (smap->buckets[i])
+			kfree_rcu(smap->buckets[i], rcu);
+	kvfree(smap);
+	put_callchain_buffers();
+}
+
+static const struct bpf_map_ops stack_map_ops = {
+	.map_alloc = stack_map_alloc,
+	.map_free = stack_map_free,
+	.map_get_next_key = stack_map_get_next_key,
+	.map_lookup_elem = stack_map_lookup_elem,
+	.map_update_elem = stack_map_update_elem,
+	.map_delete_elem = stack_map_delete_elem,
+};
+
+static struct bpf_map_type_list stack_map_type __read_mostly = {
+	.ops = &stack_map_ops,
+	.type = BPF_MAP_TYPE_STACK_TRACE,
+};
+
+static int __init register_stack_map(void)
+{
+	bpf_register_map_type(&stack_map_type);
+	return 0;
+}
+late_initcall(register_stack_map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d1d3e8f57de9..42ba4ccc020b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -246,6 +246,7 @@ static const struct {
 	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
+	{BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -911,8 +912,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		 * don't allow any other map type to be passed into
 		 * the special func;
 		 */
-		if (bool_func && bool_map != bool_func)
+		if (bool_func && bool_map != bool_func) {
+			verbose("cannot pass map_type %d into func %d\n",
+				map->map_type, func_id);
 			return -EINVAL;
+		}
 	}
 
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 326a75e884db..4b8caa392b86 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -299,6 +299,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_perf_event_read_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto;
+	case BPF_FUNC_get_stackid:
+		return &bpf_get_stackid_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 7d672345ed295b1356a5d9f7111da1d1d7d65867 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:23 +0100
Subject: bpf: add generic bpf_csum_diff helper

For L4 checksums, we currently have bpf_l4_csum_replace() helper. It's
currently limited to handle 2 and 4 byte changes in a header and feeds the
from/to into inet_proto_csum_replace{2,4}() helpers of the kernel. When
working with IPv6, for example, this makes it rather cumbersome to deal
with, similarly when editing larger parts of a header.

Instead, extend the API in a more generic way: For bpf_l4_csum_replace(),
add a case for header field mask of 0 to change the checksum at a given
offset through inet_proto_csum_replace_by_diff(), and provide a helper
bpf_csum_diff() that can generically calculate a from/to diff for arbitrary
amounts of data.

This can be used in multiple ways: for the bpf_l4_csum_replace() only
part, this even provides us with the option to insert precalculated diffs
from user space f.e. from a map, or from bpf_csum_diff() during runtime.

bpf_csum_diff() has a optional from/to stack buffer input, so we can
calculate a diff by using a scratchbuffer for scenarios where we're
inserting (from is NULL), removing (to is NULL) or diffing (from/to buffers
don't need to be of equal size) data. Also, bpf_csum_diff() allows to
feed a previous csum into csum_partial(), so the function can also be
cascaded.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 11 ++++++++++
 net/core/filter.c        | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d3e77da8e9e8..48d0a6c54609 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,17 @@ enum bpf_func_id {
 	 * Return: >= 0 stackid on success or negative error
 	 */
 	BPF_FUNC_get_stackid,
+
+	/**
+	 * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff
+	 * @from: raw from buffer
+	 * @from_size: length of from buffer
+	 * @to: raw to buffer
+	 * @to_size: length of to buffer
+	 * @seed: optional seed
+	 * Return: csum result
+	 */
+	BPF_FUNC_csum_diff,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 2a6e9562f1ab..bf504f8fbe15 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1491,6 +1491,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EFAULT;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
+	case 0:
+		if (unlikely(from != 0))
+			return -EINVAL;
+
+		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+		break;
 	case 2:
 		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
 		break;
@@ -1519,6 +1525,51 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+struct bpf_csum_scratchpad {
+	__be32 diff[128];
+};
+
+static DEFINE_PER_CPU(struct bpf_csum_scratchpad, bpf_csum_sp);
+
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+	struct bpf_csum_scratchpad *sp = this_cpu_ptr(&bpf_csum_sp);
+	u64 diff_size = from_size + to_size;
+	__be32 *from = (__be32 *) (long) r1;
+	__be32 *to   = (__be32 *) (long) r3;
+	int i, j = 0;
+
+	/* This is quite flexible, some examples:
+	 *
+	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
+	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
+	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
+	 *
+	 * Even for diffing, from_size and to_size don't need to be equal.
+	 */
+	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+		     diff_size > sizeof(sp->diff)))
+		return -EINVAL;
+
+	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = ~from[i];
+	for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = to[i];
+
+	return csum_partial(sp->diff, diff_size, seed);
+}
+
+const struct bpf_func_proto bpf_csum_diff_proto = {
+	.func		= bpf_csum_diff,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_STACK,
+	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
@@ -1849,6 +1900,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
 	case BPF_FUNC_l3_csum_replace:
 		return &bpf_l3_csum_replace_proto;
 	case BPF_FUNC_l4_csum_replace:
-- 
cgit v1.2.3


From 2f72959a9c1260ade234f353ccca91118151af66 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:26 +0100
Subject: bpf: fix csum update in bpf_l4_csum_replace helper for udp

When using this helper for updating UDP checksums, we need to extend
this in order to write CSUM_MANGLED_0 for csum computations that result
into 0 as sum. Reason we need this is because packets with a checksum
could otherwise become incorrectly marked as a packet without a checksum.
Likewise, if the user indicates BPF_F_MARK_MANGLED_0, then we should
not turn packets without a checksum into ones with a checksum.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 48d0a6c54609..6496f98d3d68 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -313,6 +313,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_l4_csum_replace flags. */
 #define BPF_F_PSEUDO_HDR		(1ULL << 4)
+#define BPF_F_MARK_MANGLED_0		(1ULL << 5)
 
 /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
 #define BPF_F_INGRESS			(1ULL << 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index f031b82128f3..8a0b8c3eb189 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1477,10 +1477,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
 	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
-	if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
+			       BPF_F_HDR_FIELD_MASK)))
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
@@ -1490,6 +1492,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
 	if (unlikely(!ptr))
 		return -EFAULT;
+	if (is_mmzero && !*ptr)
+		return 0;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
 	case 0:
@@ -1508,6 +1512,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	}
 
+	if (is_mmzero && !*ptr)
+		*ptr = CSUM_MANGLED_0;
 	if (ptr == &sum)
 		/* skb_store_bits guaranteed to not return -EFAULT here */
 		skb_store_bits(skb, offset, ptr, sizeof(sum));
-- 
cgit v1.2.3


From d4634e8dea13ccc969dd3f33dab3873cfdf3bc51 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 19 Jan 2016 10:42:42 -0500
Subject: rfkill: Update userspace API documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a note to userspace on the effect of RFKILL_OP_CHANGE_ALL also
updating the default state for hotplugged devices.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
[reword a bit]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/rfkill.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h
index 058757f7a733..2e00dcebebd0 100644
--- a/include/uapi/linux/rfkill.h
+++ b/include/uapi/linux/rfkill.h
@@ -59,6 +59,8 @@ enum rfkill_type {
  * @RFKILL_OP_DEL: a device was removed
  * @RFKILL_OP_CHANGE: a device's state changed -- userspace changes one device
  * @RFKILL_OP_CHANGE_ALL: userspace changes all devices (of a type, or all)
+ *	into a state, also updating the default state used for devices that
+ *	are hot-plugged later.
  */
 enum rfkill_operation {
 	RFKILL_OP_ADD = 0,
-- 
cgit v1.2.3


From 34d505193bd10668acf1caba02d2f66bddc23fea Mon Sep 17 00:00:00 2001
From: Lior David <liord@codeaurora.org>
Date: Thu, 28 Jan 2016 10:58:25 +0200
Subject: cfg80211: basic support for PBSS network type

PBSS (Personal Basic Service Set) is a new BSS type for DMG
networks. It is similar to infrastructure BSS, having an AP-like
entity called PCP (PBSS Control Point), but it has few differences.
PBSS support is mandatory for 11ad devices.

Add support for PBSS by introducing a new PBSS flag attribute.
The PBSS flag is used in the START_AP command to request starting
a PCP instead of an AP, and in the CONNECT command to request
connecting to a PCP instead of an AP.

Signed-off-by: Lior David <liord@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/wil6210/cfg80211.c | 10 ++++++++++
 include/net/cfg80211.h                      |  8 ++++++++
 include/uapi/linux/nl80211.h                |  6 ++++++
 net/wireless/nl80211.c                      | 11 +++++++++++
 net/wireless/sme.c                          |  9 ++++++---
 5 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 20d07ef679e8..1f231cd08138 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -422,6 +422,11 @@ static int wil_cfg80211_connect(struct wiphy *wiphy,
 	if (sme->privacy && !rsn_eid)
 		wil_info(wil, "WSC connection\n");
 
+	if (sme->pbss) {
+		wil_err(wil, "connect - PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	bss = cfg80211_get_bss(wiphy, sme->channel, sme->bssid,
 			       sme->ssid, sme->ssid_len,
 			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
@@ -870,6 +875,11 @@ static int wil_cfg80211_start_ap(struct wiphy *wiphy,
 		return -EINVAL;
 	}
 
+	if (info->pbss) {
+		wil_err(wil, "AP: PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	switch (info->hidden_ssid) {
 	case NL80211_HIDDEN_SSID_NOT_IN_USE:
 		hidden_ssid = WMI_HIDDEN_SSID_DISABLED;
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9bcaaf7cd15a..9e1b24c29f0c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -712,6 +712,8 @@ struct cfg80211_acl_data {
  * @p2p_opp_ps: P2P opportunistic PS
  * @acl: ACL configuration used by the drivers which has support for
  *	MAC address based access control
+ * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
+ *	networks.
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -730,6 +732,7 @@ struct cfg80211_ap_settings {
 	u8 p2p_ctwindow;
 	bool p2p_opp_ps;
 	const struct cfg80211_acl_data *acl;
+	bool pbss;
 };
 
 /**
@@ -1888,6 +1891,8 @@ struct cfg80211_ibss_params {
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
  * @vht_capa:  VHT Capability overrides
  * @vht_capa_mask: The bits of vht_capa which are to be used.
+ * @pbss: if set, connect to a PCP instead of AP. Valid for DMG
+ *	networks.
  */
 struct cfg80211_connect_params {
 	struct ieee80211_channel *channel;
@@ -1910,6 +1915,7 @@ struct cfg80211_connect_params {
 	struct ieee80211_ht_cap ht_capa_mask;
 	struct ieee80211_vht_cap vht_capa;
 	struct ieee80211_vht_cap vht_capa_mask;
+	bool pbss;
 };
 
 /**
@@ -3489,6 +3495,7 @@ struct cfg80211_cached_keys;
  *	registered for unexpected class 3 frames (AP mode)
  * @conn: (private) cfg80211 software SME connection state machine data
  * @connect_keys: (private) keys to set after connection is established
+ * @conn_bss_type: connecting/connected BSS type
  * @ibss_fixed: (private) IBSS is using fixed BSSID
  * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
  * @event_list: (private) list for internal event processing
@@ -3519,6 +3526,7 @@ struct wireless_dev {
 	u8 ssid_len, mesh_id_len, mesh_id_up_len;
 	struct cfg80211_conn *conn;
 	struct cfg80211_cached_keys *connect_keys;
+	enum ieee80211_bss_type conn_bss_type;
 
 	struct list_head event_list;
 	spinlock_t event_lock;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5b7b5ebe7ca8..7758969a2a8e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1789,6 +1789,10 @@ enum nl80211_commands {
  *	thus it must not specify the number of iterations, only the interval
  *	between scans. The scan plans are executed sequentially.
  *	Each scan plan is a nested attribute of &enum nl80211_sched_scan_plan.
+ * @NL80211_ATTR_PBSS: flag attribute. If set it means operate
+ *	in a PBSS. Specified in %NL80211_CMD_CONNECT to request
+ *	connecting to a PCP, and in %NL80211_CMD_START_AP to start
+ *	a PCP instead of AP. Relevant for DMG networks only.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2164,6 +2168,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
 	NL80211_ATTR_SCHED_SCAN_PLANS,
 
+	NL80211_ATTR_PBSS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d4786f2802aa..268cb493f6a5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -401,6 +401,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 },
 	[NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
 	[NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
+	[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -3461,6 +3462,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ])
+		return -EOPNOTSUPP;
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
@@ -7980,6 +7985,12 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 		connect.flags |= ASSOC_REQ_USE_RRM;
 	}
 
+	connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) {
+		kzfree(connkeys);
+		return -EOPNOTSUPP;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 	err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL);
 	wdev_unlock(dev->ieee80211_ptr);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 8020b5b094d4..79bd3a171caa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -264,7 +264,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
 			       wdev->conn->params.bssid,
 			       wdev->conn->params.ssid,
 			       wdev->conn->params.ssid_len,
-			       IEEE80211_BSS_TYPE_ESS,
+			       wdev->conn_bss_type,
 			       IEEE80211_PRIVACY(wdev->conn->params.privacy));
 	if (!bss)
 		return NULL;
@@ -687,7 +687,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 		WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
 		bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid,
 				       wdev->ssid, wdev->ssid_len,
-				       IEEE80211_BSS_TYPE_ESS,
+				       wdev->conn_bss_type,
 				       IEEE80211_PRIVACY_ANY);
 		if (bss)
 			cfg80211_hold_bss(bss_from_pub(bss));
@@ -846,7 +846,7 @@ void cfg80211_roamed(struct net_device *dev,
 
 	bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid,
 			       wdev->ssid_len,
-			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
+			       wdev->conn_bss_type, IEEE80211_PRIVACY_ANY);
 	if (WARN_ON(!bss))
 		return;
 
@@ -1017,6 +1017,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 	memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
 	wdev->ssid_len = connect->ssid_len;
 
+	wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS :
+					      IEEE80211_BSS_TYPE_ESS;
+
 	if (!rdev->ops->connect)
 		err = cfg80211_sme_connect(wdev, connect, prev_bssid);
 	else
-- 
cgit v1.2.3


From 0c9ca11b1ae8eb16c1b6bbae91991392d2321372 Mon Sep 17 00:00:00 2001
From: Beni Lev <beni.lev@intel.com>
Date: Wed, 17 Feb 2016 20:30:00 +0200
Subject: cfg80211: Add global RRM capability

Today, the supplicant will add the RRM capabilities
Information Element in the association request only if
Quiet period is supported (NL80211_FEATURE_QUIET).

Quiet is one of many RRM features, and there are other RRM
features that are not related to Quiet (e.g. neighbor
report). Therefore, requiring Quiet to enable RRM is too
restrictive.
Some of the features, like neighbor report, can be
supported by user space without any help from the kernel.
Hence adding the RRM capabilities IE to association request
should be the sole user space's decision.
Removing the RRM dependency on Quiet in the driver solves
this problem, but using an old driver with a user space
tool that would not require Quiet feature would be
problematic: the user space would add NL80211_ATTR_USE_RRM
in the association request even if the kernel doesn't
advertize NL80211_FEATURE_QUIET and the association would
be denied by the kernel.

This solution adds a global RRM capability, that tells user
space that it can request RRM capabilities IE publishment
without any specific feature support in the kernel.

Signed-off-by: Beni Lev <beni.lev@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  8 ++++++++
 net/wireless/nl80211.c       | 18 +++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7758969a2a8e..5a30a7563633 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1727,6 +1727,8 @@ enum nl80211_commands {
  *	underlying device supports these minimal RRM features:
  *		%NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES,
  *		%NL80211_FEATURE_QUIET,
+ *	Or, if global RRM is supported, see:
+ *		%NL80211_EXT_FEATURE_RRM
  *	If this flag is used, driver must add the Power Capabilities IE to the
  *	association request. In addition, it must also set the RRM capability
  *	flag in the association request's Capability Info field.
@@ -4402,12 +4404,18 @@ enum nl80211_feature_flags {
 /**
  * enum nl80211_ext_feature_index - bit index of extended features.
  * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates.
+ * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can
+ *	can request to use RRM (see %NL80211_ATTR_USE_RRM) with
+ *	%NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set
+ *	the ASSOC_REQ_USE_RRM flag in the association request even if
+ *	NL80211_FEATURE_QUIET is not advertized.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
 enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_VHT_IBSS,
+	NL80211_EXT_FEATURE_RRM,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 268cb493f6a5..90890f183c0e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2016	Intel Deutschland GmbH
  */
 
 #include <linux/if.h>
@@ -7286,9 +7286,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM))
 			return -EINVAL;
 		req.flags |= ASSOC_REQ_USE_RRM;
 	}
@@ -7976,9 +7978,11 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) {
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM)) {
 			kzfree(connkeys);
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 24 Feb 2016 09:25:37 -0800
Subject: net: ipv6: Make address flushing on ifdown optional

Currently, all ipv6 addresses are flushed when the interface is configured
down, including global, static addresses:

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    << nothing; all addresses have been flushed>>

Add a new sysctl to make this behavior optional. The new setting defaults to
flush all addresses to maintain backwards compatibility. When the set global
addresses with no expire times are not flushed on an admin down. The sysctl
is per-interface or system-wide for all interfaces

    $ sysctl -w net.ipv6.conf.eth1.keep_addr_on_down=1
or
    $ sysctl -w net.ipv6.conf.all.keep_addr_on_down=1

Will keep addresses on eth1 on an admin down.

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST> mtu 1500 state DOWN qlen 1000
        inet6 2100:1::2/120 scope global tentative
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link tentative
           valid_lft forever preferred_lft forever

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |   9 +++
 include/linux/ipv6.h                   |   1 +
 include/uapi/linux/ipv6.h              |   1 +
 net/ipv6/addrconf.c                    | 136 +++++++++++++++++++++++++++++----
 4 files changed, 132 insertions(+), 15 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 24ce97f42d35..d5df40c75aa4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1563,6 +1563,15 @@ temp_prefered_lft - INTEGER
 	Preferred lifetime (in seconds) for temporary addresses.
 	Default: 86400 (1 day)
 
+keep_addr_on_down - INTEGER
+	Keep all IPv6 addresses on an interface down event. If set static
+	global addresses with no expiration time are not flushed.
+	  >0 : enabled
+	   0 : system default
+	  <0 : disabled
+
+	Default: 0 (addresses are removed)
+
 max_desync_factor - INTEGER
 	Maximum value for DESYNC_FACTOR, which is a random value
 	that ensures that clients don't synchronize with each
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4b2267e1b7c3..7edc14fb66b6 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -62,6 +62,7 @@ struct ipv6_devconf {
 		struct in6_addr secret;
 	} stable_secret;
 	__s32		use_oif_addrs_only;
+	__s32		keep_addr_on_down;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index ec117b65d5a5..395876060f50 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -176,6 +176,7 @@ enum {
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_DROP_UNSOLICITED_NA,
+	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4751f8922362..a2d6f6c242af 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -3168,6 +3170,55 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
+static int fixup_permanent_addr(struct inet6_dev *idev,
+				struct inet6_ifaddr *ifp)
+{
+	if (!ifp->rt) {
+		struct rt6_info *rt;
+
+		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
+		if (unlikely(IS_ERR(rt)))
+			return PTR_ERR(rt);
+
+		ifp->rt = rt;
+	}
+
+	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      idev->dev, 0, 0);
+	}
+
+	addrconf_dad_start(ifp);
+
+	return 0;
+}
+
+static void addrconf_permanent_addr(struct net_device *dev)
+{
+	struct inet6_ifaddr *ifp, *tmp;
+	struct inet6_dev *idev;
+
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return;
+
+	write_lock_bh(&idev->lock);
+
+	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+		if ((ifp->flags & IFA_F_PERMANENT) &&
+		    fixup_permanent_addr(idev, ifp) < 0) {
+			write_unlock_bh(&idev->lock);
+			ipv6_del_addr(ifp);
+			write_lock_bh(&idev->lock);
+
+			net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
+					     idev->dev->name, &ifp->addr);
+		}
+	}
+
+	write_unlock_bh(&idev->lock);
+}
+
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -3253,6 +3304,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			run_pending = 1;
 		}
 
+		/* restore routes for permanent addresses */
+		addrconf_permanent_addr(dev);
+
 		switch (dev->type) {
 #if IS_ENABLED(CONFIG_IPV6_SIT)
 		case ARPHRD_SIT:
@@ -3356,7 +3410,10 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifa;
+	struct inet6_ifaddr *ifa, *tmp;
+	struct list_head del_list;
+	int _keep_addr;
+	bool keep_addr;
 	int state, i;
 
 	ASSERT_RTNL();
@@ -3383,6 +3440,16 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	}
 
+	/* aggregate the system setting and interface setting */
+	_keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+	if (!_keep_addr)
+		_keep_addr = idev->cnf.keep_addr_on_down;
+
+	/* combine the user config with event to determine if permanent
+	 * addresses are to be removed from address hash table
+	 */
+	keep_addr = !(how || _keep_addr <= 0);
+
 	/* Step 2: clear hash table */
 	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
 		struct hlist_head *h = &inet6_addr_lst[i];
@@ -3391,9 +3458,15 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 restart:
 		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
 			if (ifa->idev == idev) {
-				hlist_del_init_rcu(&ifa->addr_lst);
 				addrconf_del_dad_work(ifa);
-				goto restart;
+				/* combined flag + permanent flag decide if
+				 * address is retained on a down event
+				 */
+				if (!keep_addr ||
+				    !(ifa->flags & IFA_F_PERMANENT)) {
+					hlist_del_init_rcu(&ifa->addr_lst);
+					goto restart;
+				}
 			}
 		}
 		spin_unlock_bh(&addrconf_hash_lock);
@@ -3427,31 +3500,53 @@ restart:
 		write_lock_bh(&idev->lock);
 	}
 
-	while (!list_empty(&idev->addr_list)) {
-		ifa = list_first_entry(&idev->addr_list,
-				       struct inet6_ifaddr, if_list);
-		addrconf_del_dad_work(ifa);
+	/* re-combine the user config with event to determine if permanent
+	 * addresses are to be removed from the interface list
+	 */
+	keep_addr = (!how && _keep_addr > 0);
 
-		list_del(&ifa->if_list);
+	INIT_LIST_HEAD(&del_list);
+	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+		addrconf_del_dad_work(ifa);
 
 		write_unlock_bh(&idev->lock);
-
 		spin_lock_bh(&ifa->lock);
-		state = ifa->state;
-		ifa->state = INET6_IFADDR_STATE_DEAD;
+
+		if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) {
+			/* set state to skip the notifier below */
+			state = INET6_IFADDR_STATE_DEAD;
+			ifa->state = 0;
+			if (!(ifa->flags & IFA_F_NODAD))
+				ifa->flags |= IFA_F_TENTATIVE;
+		} else {
+			state = ifa->state;
+			ifa->state = INET6_IFADDR_STATE_DEAD;
+
+			list_del(&ifa->if_list);
+			list_add(&ifa->if_list, &del_list);
+		}
+
 		spin_unlock_bh(&ifa->lock);
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
 		}
-		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);
 	}
 
 	write_unlock_bh(&idev->lock);
 
+	/* now clean up addresses to be removed */
+	while (!list_empty(&del_list)) {
+		ifa = list_first_entry(&del_list,
+				       struct inet6_ifaddr, if_list);
+		list_del(&ifa->if_list);
+
+		in6_ifa_put(ifa);
+	}
+
 	/* Step 5: Discard anycast and multicast list */
 	if (how) {
 		ipv6_ac_destroy_dev(idev);
@@ -4716,6 +4811,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
+	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5197,10 +5293,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			if (rt)
 				ip6_del_rt(rt);
 		}
-		dst_hold(&ifp->rt->dst);
-
-		ip6_del_rt(ifp->rt);
+		if (ifp->rt) {
+			dst_hold(&ifp->rt->dst);
 
+			ip6_del_rt(ifp->rt);
+			ifp->rt = NULL;
+		}
 		rt_genid_bump_ipv6(net);
 		break;
 	}
@@ -5803,6 +5901,14 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname       = "keep_addr_on_down",
+			.data           = &ipv6_devconf.keep_addr_on_down,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3


From a87cb3e48ee86d29868d3f59cfb9ce1a8fa63314 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 24 Feb 2016 10:02:52 -0800
Subject: net: Facility to report route quality of connected sockets

This patch add the SO_CNX_ADVICE socket option (setsockopt only). The
purpose is to allow an application to give feedback to the kernel about
the quality of the network path for a connected socket. The value
argument indicates the type of quality report. For this initial patch
the only supported advice is a value of 1 which indicates "bad path,
please reroute"-- the action taken by the kernel is to call
dst_negative_advice which will attempt to choose a different ECMP route,
reset the TX hash for flow label and UDP source port in encapsulation,
etc.

This facility should be useful for connected UDP sockets where only the
application can provide any feedback about path quality. It could also
be useful for TCP applications that have additional knowledge about the
path outside of the normal TCP control loop.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h   | 2 ++
 arch/avr32/include/uapi/asm/socket.h   | 2 ++
 arch/frv/include/uapi/asm/socket.h     | 2 ++
 arch/ia64/include/uapi/asm/socket.h    | 2 ++
 arch/m32r/include/uapi/asm/socket.h    | 2 ++
 arch/mips/include/uapi/asm/socket.h    | 2 ++
 arch/mn10300/include/uapi/asm/socket.h | 2 ++
 arch/parisc/include/uapi/asm/socket.h  | 2 ++
 arch/powerpc/include/uapi/asm/socket.h | 2 ++
 arch/s390/include/uapi/asm/socket.h    | 2 ++
 arch/sparc/include/uapi/asm/socket.h   | 2 ++
 arch/xtensa/include/uapi/asm/socket.h  | 2 ++
 include/uapi/asm-generic/socket.h      | 2 ++
 net/core/sock.c                        | 4 ++++
 14 files changed, 30 insertions(+)

(limited to 'include/uapi')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index c5fb9e6bc3a5..9e46d6e656d9 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -95,4 +95,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 9de0796240a0..1fd147f09a38 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index f02e4849ae83..afbc98f02d27 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -88,5 +88,7 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index bce29166de1b..0018fad9039f 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 14aa4a6bccf1..5fe42fc7b6c5 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 5910fe294e93..2027240aafbb 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -106,4 +106,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 58b1aa01ab9f..5129f23a9ee1 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -88,4 +88,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index f9cf1223422c..9c935d717df9 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -87,4 +87,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	0x402C
 #define SO_ATTACH_REUSEPORT_EBPF	0x402D
 
+#define SO_CNX_ADVICE		0x402E
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index dd54f28ecdec..1672e3398270 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -95,4 +95,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index d02e89d14fef..41b51c2f4f1b 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index d270ee91968e..31aede3af088 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -84,6 +84,8 @@
 #define SO_ATTACH_REUSEPORT_CBPF	0x0035
 #define SO_ATTACH_REUSEPORT_EBPF	0x0036
 
+#define SO_CNX_ADVICE		0x0037
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index fd3b96d1153f..81435d995e11 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index fb8a41668382..67d632f1743d 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -90,4 +90,6 @@
 #define SO_ATTACH_REUSEPORT_CBPF	51
 #define SO_ATTACH_REUSEPORT_EBPF	52
 
+#define SO_CNX_ADVICE		53
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 46dc8ad7d050..4493ff820c2c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -987,6 +987,10 @@ set_rcvbuf:
 		sk->sk_incoming_cpu = val;
 		break;
 
+	case SO_CNX_ADVICE:
+		if (val == 1)
+			dst_negative_advice(sk);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 3f1ac7a700d039c61d8d8b99f28d605d489a60cf Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:57:59 -0800
Subject: net: ethtool: add new ETHTOOL_xLINKSETTINGS API

This patch defines a new ETHTOOL_GLINKSETTINGS/SLINKSETTINGS API,
handled by the new get_link_ksettings/set_link_ksettings callbacks.
This API provides support for most legacy ethtool_cmd fields, adds
support for larger link mode masks (up to 4064 bits, variable length),
and removes ethtool_cmd deprecated
fields (transceiver/maxrxpkt/maxtxpkt).

This API is deprecating the legacy ETHTOOL_GSET/SSET API and provides
the following backward compatibility properties:
 - legacy ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks.
 - legacy ethtool with new get/set_link_ksettings drivers: the new
   driver callbacks are used, data internally converted to legacy
   ethtool_cmd. ETHTOOL_GSET will return only the 1st 32b of each link
   mode mask. ETHTOOL_SSET will fail if user tries to set the
   ethtool_cmd deprecated fields to
   non-0 (transceiver/maxrxpkt/maxtxpkt). A kernel warning is logged if
   driver sets higher bits.
 - future ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks, internally converted to new data
   structure. Deprecated fields (transceiver/maxrxpkt/maxtxpkt) will be
   ignored and seen as 0 from user space. Note that that "future"
   ethtool tool will not allow changes to these deprecated fields.
 - future ethtool with new drivers: direct call to the new callbacks.

By "future" ethtool, what is meant is:
 - query: first try ETHTOOL_GLINKSETTINGS, and revert to ETHTOOL_GSET if
   fails
 - set: query first and remember which of ETHTOOL_GLINKSETTINGS or
   ETHTOOL_GSET was successful
   + if ETHTOOL_GLINKSETTINGS was successful, then change config with
     ETHTOOL_SLINKSETTINGS. A failure there is final (do not try
     ETHTOOL_SSET).
   + otherwise ETHTOOL_GSET was successful, change config with
     ETHTOOL_SSET. A failure there is final (do not try
     ETHTOOL_SLINKSETTINGS).

The interaction user/kernel via the new API requires a small
ETHTOOL_GLINKSETTINGS handshake first to agree on the length of the link
mode bitmaps. If kernel doesn't agree with user, it returns the bitmap
length it is expecting from user as a negative length (and cmd field is
0). When kernel and user agree, kernel returns valid info in all
fields (ie. link mode length > 0 and cmd is ETHTOOL_GLINKSETTINGS).

Data structure crossing user/kernel boundary is 32/64-bit
agnostic. Converted internally to a legal kernel bitmap.

The internal __ethtool_get_settings kernel helper will gradually be
replaced by __ethtool_get_link_ksettings by the time the first
"link_settings" drivers start to appear. So this patch doesn't change
it, it will be removed before it needs to be changed.

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  91 ++++++++-
 include/uapi/linux/ethtool.h | 322 +++++++++++++++++++++++-------
 net/core/ethtool.c           | 453 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 786 insertions(+), 80 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 472d7d7b01c2..8a400a54c92e 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <uapi/linux/ethtool.h>
 
@@ -40,9 +41,6 @@ struct compat_ethtool_rxnfc {
 
 #include <linux/rculist.h>
 
-extern int __ethtool_get_settings(struct net_device *dev,
-				  struct ethtool_cmd *cmd);
-
 /**
  * enum ethtool_phys_id_state - indicator state for physical identification
  * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated
@@ -97,13 +95,74 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
 	return index % n_rx_rings;
 }
 
+/* number of link mode bits/ulongs handled internally by kernel */
+#define __ETHTOOL_LINK_MODE_MASK_NBITS			\
+	(__ETHTOOL_LINK_MODE_LAST + 1)
+
+/* declare a link mode bitmap */
+#define __ETHTOOL_DECLARE_LINK_MODE_MASK(name)		\
+	DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/* drivers must ignore base.cmd and base.link_mode_masks_nwords
+ * fields, but they are allowed to overwrite them (will be ignored).
+ */
+struct ethtool_link_ksettings {
+	struct ethtool_link_settings base;
+	struct {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
+	} link_modes;
+};
+
+/**
+ * ethtool_link_ksettings_zero_link_mode - clear link_ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ */
+#define ethtool_link_ksettings_zero_link_mode(ptr, name)		\
+	bitmap_zero((ptr)->link_modes.name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/**
+ * ethtool_link_ksettings_add_link_mode - set bit in link_ksettings
+ * link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ */
+#define ethtool_link_ksettings_add_link_mode(ptr, name, mode)		\
+	__set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+/**
+ * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ *
+ * Returns true/false.
+ */
+#define ethtool_link_ksettings_test_link_mode(ptr, name, mode)		\
+	test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+extern int
+__ethtool_get_link_ksettings(struct net_device *dev,
+			     struct ethtool_link_ksettings *link_ksettings);
+
+/* DEPRECATED, use __ethtool_get_link_ksettings */
+extern int __ethtool_get_settings(struct net_device *dev,
+				  struct ethtool_cmd *cmd);
+
 /**
  * struct ethtool_ops - optional netdev operations
- * @get_settings: Get various device settings including Ethernet link
+ * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Get various device settings including Ethernet link
  *	settings. The @cmd parameter is expected to have been cleared
- *	before get_settings is called. Returns a negative error code or
- *	zero.
- * @set_settings: Set various device settings including Ethernet link
+ *	before get_settings is called. Returns a negative error code
+ *	or zero.
+ * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Set various device settings including Ethernet link
  *	settings.  Returns a negative error code or zero.
  * @get_drvinfo: Report driver/device information.  Should only set the
  *	@driver, @version, @fw_version and @bus_info fields.  If not
@@ -211,6 +270,19 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
  *	queue has this number, ignore the inapplicable fields.
  *	Returns a negative error code or zero.
+ * @get_link_ksettings: When defined, takes precedence over the
+ *	%get_settings method. Get various device settings
+ *	including Ethernet link settings. The %cmd and
+ *	%link_mode_masks_nwords fields should be ignored (use
+ *	%__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any
+ *	change to them will be overwritten by kernel. Returns a
+ *	negative error code or zero.
+ * @set_link_ksettings: When defined, takes precedence over the
+ *	%set_settings method. Set various device settings including
+ *	Ethernet link settings. The %cmd and %link_mode_masks_nwords
+ *	fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
+ *	instead of the latter), any change to them will be overwritten
+ *	by kernel. Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -293,6 +365,9 @@ struct ethtool_ops {
 					  struct ethtool_coalesce *);
 	int	(*set_per_queue_coalesce)(struct net_device *, u32,
 					  struct ethtool_coalesce *);
-
+	int	(*get_link_ksettings)(struct net_device *,
+				      struct ethtool_link_ksettings *);
+	int	(*set_link_ksettings)(struct net_device *,
+				      const struct ethtool_link_ksettings *);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f15ae02621a1..37fd6dc33de4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -21,7 +21,8 @@
  */
 
 /**
- * struct ethtool_cmd - link control and status
+ * struct ethtool_cmd - DEPRECATED, link control and status
+ * This structure is DEPRECATED, please use struct ethtool_link_settings.
  * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET
  * @supported: Bitmask of %SUPPORTED_* flags for the link modes,
  *	physical connectors and other link features for which the
@@ -1219,8 +1220,12 @@ struct ethtool_per_queue_op {
 };
 
 /* CMDs currently supported */
-#define ETHTOOL_GSET		0x00000001 /* Get settings. */
-#define ETHTOOL_SSET		0x00000002 /* Set settings. */
+#define ETHTOOL_GSET		0x00000001 /* DEPRECATED, Get settings.
+					    * Please use ETHTOOL_GLINKSETTINGS
+					    */
+#define ETHTOOL_SSET		0x00000002 /* DEPRECATED, Set settings.
+					    * Please use ETHTOOL_SLINKSETTINGS
+					    */
 #define ETHTOOL_GDRVINFO	0x00000003 /* Get driver info. */
 #define ETHTOOL_GREGS		0x00000004 /* Get NIC registers. */
 #define ETHTOOL_GWOL		0x00000005 /* Get wake-on-lan options. */
@@ -1302,73 +1307,139 @@ struct ethtool_per_queue_op {
 
 #define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
 
+#define ETHTOOL_GLINKSETTINGS	0x0000004c /* Get ethtool_link_settings */
+#define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
+
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
 
-#define SUPPORTED_10baseT_Half		(1 << 0)
-#define SUPPORTED_10baseT_Full		(1 << 1)
-#define SUPPORTED_100baseT_Half		(1 << 2)
-#define SUPPORTED_100baseT_Full		(1 << 3)
-#define SUPPORTED_1000baseT_Half	(1 << 4)
-#define SUPPORTED_1000baseT_Full	(1 << 5)
-#define SUPPORTED_Autoneg		(1 << 6)
-#define SUPPORTED_TP			(1 << 7)
-#define SUPPORTED_AUI			(1 << 8)
-#define SUPPORTED_MII			(1 << 9)
-#define SUPPORTED_FIBRE			(1 << 10)
-#define SUPPORTED_BNC			(1 << 11)
-#define SUPPORTED_10000baseT_Full	(1 << 12)
-#define SUPPORTED_Pause			(1 << 13)
-#define SUPPORTED_Asym_Pause		(1 << 14)
-#define SUPPORTED_2500baseX_Full	(1 << 15)
-#define SUPPORTED_Backplane		(1 << 16)
-#define SUPPORTED_1000baseKX_Full	(1 << 17)
-#define SUPPORTED_10000baseKX4_Full	(1 << 18)
-#define SUPPORTED_10000baseKR_Full	(1 << 19)
-#define SUPPORTED_10000baseR_FEC	(1 << 20)
-#define SUPPORTED_20000baseMLD2_Full	(1 << 21)
-#define SUPPORTED_20000baseKR2_Full	(1 << 22)
-#define SUPPORTED_40000baseKR4_Full	(1 << 23)
-#define SUPPORTED_40000baseCR4_Full	(1 << 24)
-#define SUPPORTED_40000baseSR4_Full	(1 << 25)
-#define SUPPORTED_40000baseLR4_Full	(1 << 26)
-#define SUPPORTED_56000baseKR4_Full	(1 << 27)
-#define SUPPORTED_56000baseCR4_Full	(1 << 28)
-#define SUPPORTED_56000baseSR4_Full	(1 << 29)
-#define SUPPORTED_56000baseLR4_Full	(1 << 30)
-
-#define ADVERTISED_10baseT_Half		(1 << 0)
-#define ADVERTISED_10baseT_Full		(1 << 1)
-#define ADVERTISED_100baseT_Half	(1 << 2)
-#define ADVERTISED_100baseT_Full	(1 << 3)
-#define ADVERTISED_1000baseT_Half	(1 << 4)
-#define ADVERTISED_1000baseT_Full	(1 << 5)
-#define ADVERTISED_Autoneg		(1 << 6)
-#define ADVERTISED_TP			(1 << 7)
-#define ADVERTISED_AUI			(1 << 8)
-#define ADVERTISED_MII			(1 << 9)
-#define ADVERTISED_FIBRE		(1 << 10)
-#define ADVERTISED_BNC			(1 << 11)
-#define ADVERTISED_10000baseT_Full	(1 << 12)
-#define ADVERTISED_Pause		(1 << 13)
-#define ADVERTISED_Asym_Pause		(1 << 14)
-#define ADVERTISED_2500baseX_Full	(1 << 15)
-#define ADVERTISED_Backplane		(1 << 16)
-#define ADVERTISED_1000baseKX_Full	(1 << 17)
-#define ADVERTISED_10000baseKX4_Full	(1 << 18)
-#define ADVERTISED_10000baseKR_Full	(1 << 19)
-#define ADVERTISED_10000baseR_FEC	(1 << 20)
-#define ADVERTISED_20000baseMLD2_Full	(1 << 21)
-#define ADVERTISED_20000baseKR2_Full	(1 << 22)
-#define ADVERTISED_40000baseKR4_Full	(1 << 23)
-#define ADVERTISED_40000baseCR4_Full	(1 << 24)
-#define ADVERTISED_40000baseSR4_Full	(1 << 25)
-#define ADVERTISED_40000baseLR4_Full	(1 << 26)
-#define ADVERTISED_56000baseKR4_Full	(1 << 27)
-#define ADVERTISED_56000baseCR4_Full	(1 << 28)
-#define ADVERTISED_56000baseSR4_Full	(1 << 29)
-#define ADVERTISED_56000baseLR4_Full	(1 << 30)
+/* Link mode bit indices */
+enum ethtool_link_mode_bit_indices {
+	ETHTOOL_LINK_MODE_10baseT_Half_BIT	= 0,
+	ETHTOOL_LINK_MODE_10baseT_Full_BIT	= 1,
+	ETHTOOL_LINK_MODE_100baseT_Half_BIT	= 2,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT	= 3,
+	ETHTOOL_LINK_MODE_1000baseT_Half_BIT	= 4,
+	ETHTOOL_LINK_MODE_1000baseT_Full_BIT	= 5,
+	ETHTOOL_LINK_MODE_Autoneg_BIT		= 6,
+	ETHTOOL_LINK_MODE_TP_BIT		= 7,
+	ETHTOOL_LINK_MODE_AUI_BIT		= 8,
+	ETHTOOL_LINK_MODE_MII_BIT		= 9,
+	ETHTOOL_LINK_MODE_FIBRE_BIT		= 10,
+	ETHTOOL_LINK_MODE_BNC_BIT		= 11,
+	ETHTOOL_LINK_MODE_10000baseT_Full_BIT	= 12,
+	ETHTOOL_LINK_MODE_Pause_BIT		= 13,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT	= 14,
+	ETHTOOL_LINK_MODE_2500baseX_Full_BIT	= 15,
+	ETHTOOL_LINK_MODE_Backplane_BIT		= 16,
+	ETHTOOL_LINK_MODE_1000baseKX_Full_BIT	= 17,
+	ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT	= 18,
+	ETHTOOL_LINK_MODE_10000baseKR_Full_BIT	= 19,
+	ETHTOOL_LINK_MODE_10000baseR_FEC_BIT	= 20,
+	ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21,
+	ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT	= 22,
+	ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT	= 23,
+	ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT	= 24,
+	ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT	= 25,
+	ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT	= 26,
+	ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT	= 27,
+	ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT	= 28,
+	ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT	= 29,
+	ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT	= 30,
+
+	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
+	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
+	 * macro for bits > 31. The only way to use indices > 31 is to
+	 * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
+	 */
+
+	__ETHTOOL_LINK_MODE_LAST
+	  = ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+};
+
+#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
+	(1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT))
+
+/* DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new SUPPORTED_* macro for bits > 31.
+ */
+#define SUPPORTED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define SUPPORTED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define SUPPORTED_100baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define SUPPORTED_100baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define SUPPORTED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define SUPPORTED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define SUPPORTED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define SUPPORTED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define SUPPORTED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define SUPPORTED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define SUPPORTED_FIBRE			__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define SUPPORTED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define SUPPORTED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define SUPPORTED_Pause			__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define SUPPORTED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define SUPPORTED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define SUPPORTED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define SUPPORTED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define SUPPORTED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define SUPPORTED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define SUPPORTED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define SUPPORTED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define SUPPORTED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define SUPPORTED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define SUPPORTED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define SUPPORTED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define SUPPORTED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define SUPPORTED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define SUPPORTED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define SUPPORTED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define SUPPORTED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new SUPPORTED_* macro for bits > 31, see
+ * notice above.
+ */
+
+/*
+ * DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new ADERTISE_* macro for bits > 31.
+ */
+#define ADVERTISED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define ADVERTISED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define ADVERTISED_100baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define ADVERTISED_100baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define ADVERTISED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define ADVERTISED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define ADVERTISED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define ADVERTISED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define ADVERTISED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define ADVERTISED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define ADVERTISED_FIBRE		__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define ADVERTISED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define ADVERTISED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define ADVERTISED_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define ADVERTISED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define ADVERTISED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define ADVERTISED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define ADVERTISED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define ADVERTISED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define ADVERTISED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define ADVERTISED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define ADVERTISED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define ADVERTISED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define ADVERTISED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define ADVERTISED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define ADVERTISED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define ADVERTISED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define ADVERTISED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define ADVERTISED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define ADVERTISED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define ADVERTISED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new ADVERTISED_* macro for bits > 31, see
+ * notice above.
+ */
 
 /* The following are all involved in forcing a particular link
  * mode for the device for setting things.  When getting the
@@ -1533,4 +1604,123 @@ enum ethtool_reset_flags {
 };
 #define ETH_RESET_SHARED_SHIFT	16
 
+
+/**
+ * struct ethtool_link_settings - link control and status
+ *
+ * IMPORTANT, Backward compatibility notice: When implementing new
+ *	user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and
+ *	if it succeeds use %ETHTOOL_SLINKSETTINGS to change link
+ *	settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS
+ *	succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in
+ *	that case.  Conversely, if %ETHTOOL_GLINKSETTINGS fails, use
+ *	%ETHTOOL_GSET to query and %ETHTOOL_SSET to change link
+ *	settings; do not use %ETHTOOL_SLINKSETTINGS if
+ *	%ETHTOOL_GLINKSETTINGS failed: stick to
+ *	%ETHTOOL_GSET/%ETHTOOL_SSET in that case.
+ *
+ * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS
+ * @speed: Link speed (Mbps)
+ * @duplex: Duplex mode; one of %DUPLEX_*
+ * @port: Physical connector type; one of %PORT_*
+ * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
+ *	applicable.  For clause 45 PHYs this is the PRTAD.
+ * @autoneg: Enable/disable autonegotiation and auto-detection;
+ *	either %AUTONEG_DISABLE or %AUTONEG_ENABLE
+ * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO
+ *	protocols supported by the interface; 0 if unknown.
+ *	Read-only.
+ * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
+ *	%ETH_TP_MDI_*.  If the status is unknown or not applicable, the
+ *	value will be %ETH_TP_MDI_INVALID.  Read-only.
+ * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of
+ *	%ETH_TP_MDI_*.  If MDI(-X) control is not implemented, reads
+ *	yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected.
+ *	When written successfully, the link should be renegotiated if
+ *	necessary.
+ * @link_mode_masks_nwords: Number of 32-bit words for each of the
+ *	supported, advertising, lp_advertising link mode bitmaps. For
+ *	%ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user
+ *	(>= 0); on return, if handshake in progress, negative if
+ *	request size unsupported by kernel: absolute value indicates
+ *	kernel recommended size and cmd field is 0, as well as all the
+ *	other fields; otherwise (handshake completed), strictly
+ *	positive to indicate size used by kernel and cmd field is
+ *	%ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For
+ *	%ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive
+ *	value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise
+ *	refused. For drivers: ignore this field (use kernel's
+ *	__ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will
+ *	be overwritten by kernel.
+ * @supported: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features for which the interface
+ *	supports autonegotiation or auto-detection.  Read-only.
+ * @advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features that are advertised through
+ *	autonegotiation or enabled for auto-detection.
+ * @lp_advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, and other
+ *	link features that the link partner advertised through
+ *	autonegotiation; 0 if unknown or not applicable.  Read-only.
+ *
+ * If autonegotiation is disabled, the speed and @duplex represent the
+ * fixed link mode and are writable if the driver supports multiple
+ * link modes.  If it is enabled then they are read-only; if the link
+ * is up they represent the negotiated link mode; if the link is down,
+ * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and
+ * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode.
+ *
+ * Some hardware interfaces may have multiple PHYs and/or physical
+ * connectors fitted or do not allow the driver to detect which are
+ * fitted.  For these interfaces @port and/or @phy_address may be
+ * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE.
+ * Otherwise, attempts to write different values may be ignored or
+ * rejected.
+ *
+ * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt
+ * are not available in %ethtool_link_settings. Until all drivers are
+ * converted to ignore them or to the new %ethtool_link_settings API,
+ * for both queries and changes, users should always try
+ * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick
+ * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it
+ * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and
+ * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing
+ * either %ethtool_cmd or %ethtool_link_settings).
+ *
+ * Users should assume that all fields not marked read-only are
+ * writable and subject to validation by the driver.  They should use
+ * %ETHTOOL_GLINKSETTINGS to get the current values before making specific
+ * changes and then applying them with %ETHTOOL_SLINKSETTINGS.
+ *
+ * Drivers that implement %get_link_ksettings and/or
+ * %set_link_ksettings should ignore the @cmd
+ * and @link_mode_masks_nwords fields (any change to them overwritten
+ * by kernel), and rely only on kernel's internal
+ * %__ETHTOOL_LINK_MODE_MASK_NBITS and
+ * %ethtool_link_mode_mask_t. Drivers that implement
+ * %set_link_ksettings() should validate all fields other than @cmd
+ * and @link_mode_masks_nwords that are not described as read-only or
+ * deprecated, and must ignore all fields described as read-only.
+ */
+struct ethtool_link_settings {
+	__u32	cmd;
+	__u32	speed;
+	__u8	duplex;
+	__u8	port;
+	__u8	phy_address;
+	__u8	autoneg;
+	__u8	mdio_support;
+	__u8	eth_tp_mdix;
+	__u8	eth_tp_mdix_ctrl;
+	__s8	link_mode_masks_nwords;
+	__u32	reserved[8];
+	__u32	link_mode_masks[0];
+	/* layout of link_mode_masks fields:
+	 * __u32 map_supported[link_mode_masks_nwords];
+	 * __u32 map_advertising[link_mode_masks_nwords];
+	 * __u32 map_lp_advertising[link_mode_masks_nwords];
+	 */
+};
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2406101002b1..edcec56ed228 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -387,6 +387,359 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }
 
+static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+{
+	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	dst[0] = legacy_u32;
+}
+
+/* return false if src had higher bits set. lower bits always updated. */
+static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+					    const unsigned long *src)
+{
+	bool retval = true;
+
+	/* TODO: following test will soon always be true */
+	if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
+
+		bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		bitmap_fill(ext, 32);
+		bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		if (bitmap_intersects(ext, src,
+				      __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+			/* src mask goes beyond bit 31 */
+			retval = false;
+		}
+	}
+	*legacy_u32 = src[0];
+	return retval;
+}
+
+/* return false if legacy contained non-0 deprecated fields
+ * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+	struct ethtool_link_ksettings *link_ksettings,
+	const struct ethtool_cmd *legacy_settings)
+{
+	bool retval = true;
+
+	memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* This is used to tell users that driver is still using these
+	 * deprecated legacy fields, and they should not use
+	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+	 */
+	if (legacy_settings->transceiver ||
+	    legacy_settings->maxtxpkt ||
+	    legacy_settings->maxrxpkt)
+		retval = false;
+
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.supported,
+		legacy_settings->supported);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.advertising,
+		legacy_settings->advertising);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.lp_advertising,
+		legacy_settings->lp_advertising);
+	link_ksettings->base.speed
+		= ethtool_cmd_speed(legacy_settings);
+	link_ksettings->base.duplex
+		= legacy_settings->duplex;
+	link_ksettings->base.port
+		= legacy_settings->port;
+	link_ksettings->base.phy_address
+		= legacy_settings->phy_address;
+	link_ksettings->base.autoneg
+		= legacy_settings->autoneg;
+	link_ksettings->base.mdio_support
+		= legacy_settings->mdio_support;
+	link_ksettings->base.eth_tp_mdix
+		= legacy_settings->eth_tp_mdix;
+	link_ksettings->base.eth_tp_mdix_ctrl
+		= legacy_settings->eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+	struct ethtool_cmd *legacy_settings,
+	const struct ethtool_link_ksettings *link_ksettings)
+{
+	bool retval = true;
+
+	memset(legacy_settings, 0, sizeof(*legacy_settings));
+	/* this also clears the deprecated fields in legacy structure:
+	 * __u8		transceiver;
+	 * __u32	maxtxpkt;
+	 * __u32	maxrxpkt;
+	 */
+
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->supported,
+		link_ksettings->link_modes.supported);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->advertising,
+		link_ksettings->link_modes.advertising);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->lp_advertising,
+		link_ksettings->link_modes.lp_advertising);
+	ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+	legacy_settings->duplex
+		= link_ksettings->base.duplex;
+	legacy_settings->port
+		= link_ksettings->base.port;
+	legacy_settings->phy_address
+		= link_ksettings->base.phy_address;
+	legacy_settings->autoneg
+		= link_ksettings->base.autoneg;
+	legacy_settings->mdio_support
+		= link_ksettings->base.mdio_support;
+	legacy_settings->eth_tp_mdix
+		= link_ksettings->base.eth_tp_mdix;
+	legacy_settings->eth_tp_mdix_ctrl
+		= link_ksettings->base.eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32			\
+	DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+	struct ethtool_link_settings base;
+	struct {
+		__u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+	} link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings.
+ *
+ * Backward compatibility note: for compatibility with legacy drivers
+ * that implement only the ethtool_cmd API, this has to work with both
+ * drivers implementing get_link_ksettings API and drivers
+ * implementing get_settings API. When drivers implement get_settings
+ * and report ethtool_cmd deprecated fields
+ * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
+ * because the resulting struct ethtool_link_settings does not report them.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+				 struct ethtool_link_ksettings *link_ksettings)
+{
+	int err;
+	struct ethtool_cmd cmd;
+
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		memset(link_ksettings, 0, sizeof(*link_ksettings));
+		return dev->ethtool_ops->get_link_ksettings(dev,
+							    link_ksettings);
+	}
+
+	/* driver doesn't support %ethtool_link_ksettings API. revert to
+	 * legacy %ethtool_cmd API, unless it's not supported either.
+	 * TODO: remove when ethtool_ops::get_settings disappears internally
+	 */
+	err = __ethtool_get_settings(dev, &cmd);
+	if (err < 0)
+		return err;
+
+	/* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
+	 */
+	convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
+	return err;
+}
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
+
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+					 const void __user *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+		return -EFAULT;
+
+	memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+	bitmap_from_u32array(to->link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+
+	return 0;
+}
+
+/* convert a kernel internal ethtool_link_ksettings to
+ * ethtool_link_usettings in user space. return 0 on success, errno on
+ * error.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+			      const struct ethtool_link_ksettings *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	memcpy(&link_usettings.base, &from->base, sizeof(link_usettings));
+	bitmap_to_u32array(link_usettings.link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Query device for its ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err = 0;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->get_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* handle bitmap nbits handshake */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords) {
+		/* wrong link mode nbits requested */
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		/* keep cmd field reset to 0 */
+		/* send back number of words required as negative val */
+		compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+				   "need too many bits for link modes!");
+		link_ksettings.base.link_mode_masks_nwords
+			= -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+		/* copy the base fields back to user, not the link
+		 * mode bitmaps
+		 */
+		if (copy_to_user(useraddr, &link_ksettings.base,
+				 sizeof(link_ksettings.base)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	/* handshake successful: user/kernel agree on
+	 * link_mode_masks_nwords
+	 */
+
+	memset(&link_ksettings, 0, sizeof(link_ksettings));
+	err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+	if (err < 0)
+		return err;
+
+	/* make sure we tell the right values to user */
+	link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+	link_ksettings.base.link_mode_masks_nwords
+		= __ETHTOOL_LINK_MODE_MASK_NU32;
+
+	return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
+
+/* Update device ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->set_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* make sure nbits field has expected value */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	/* copy the whole structure, now that we know it has expected
+	 * format
+	 */
+	err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+	if (err)
+		return err;
+
+	/* re-check nwords field, just in case */
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+/* Internal kernel helper to query a device ethtool_cmd settings.
+ *
+ * Note about transition to ethtool_link_settings API: We do not need
+ * (or want) this function to support "dev" instances that implement
+ * the ethtool_link_settings API as we will update the drivers calling
+ * this function to call __ethtool_get_link_ksettings instead, before
+ * the first drivers implement ethtool_ops::get_link_ksettings.
+ *
+ * TODO 1: at least make this function static when no driver is using it
+ * TODO 2: remove when ethtool_ops::get_settings disappears internally
+ */
 int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	ASSERT_RTNL();
@@ -400,30 +753,112 @@ int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(__ethtool_get_settings);
 
+static void
+warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
+{
+	char name[sizeof(current->comm)];
+
+	pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
+		     get_task_comm(name, current), details);
+}
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When drivers
+ * implement get_link_ksettings and report higher link mode bits, a
+ * kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful
+ * (only the lower link mode bits reported back to user).
+ */
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 {
-	int err;
 	struct ethtool_cmd cmd;
 
-	err = __ethtool_get_settings(dev, &cmd);
-	if (err < 0)
-		return err;
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		/* First, use link_ksettings API if it is supported */
+		int err;
+		struct ethtool_link_ksettings link_ksettings;
+
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		err = dev->ethtool_ops->get_link_ksettings(dev,
+							   &link_ksettings);
+		if (err < 0)
+			return err;
+		if (!convert_link_ksettings_to_legacy_settings(&cmd,
+							       &link_ksettings))
+			warn_incomplete_ethtool_legacy_settings_conversion(
+				"link modes are only partially reported");
+
+		/* send a sensible cmd tag back to user */
+		cmd.cmd = ETHTOOL_GSET;
+	} else {
+		int err;
+		/* TODO: return -EOPNOTSUPP when
+		 * ethtool_ops::get_settings disappears internally
+		 */
+
+		/* driver doesn't support %ethtool_link_ksettings
+		 * API. revert to legacy %ethtool_cmd API, unless it's
+		 * not supported either.
+		 */
+		err = __ethtool_get_settings(dev, &cmd);
+		if (err < 0)
+			return err;
+	}
 
 	if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
 		return -EFAULT;
+
 	return 0;
 }
 
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When drivers
+ * implement set_link_ksettings and user's request updates deprecated
+ * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, and the request is rejected.
+ */
 static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_cmd cmd;
 
-	if (!dev->ethtool_ops->set_settings)
-		return -EOPNOTSUPP;
+	ASSERT_RTNL();
 
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
 
+	/* first, try new %ethtool_link_ksettings API. */
+	if (dev->ethtool_ops->set_link_ksettings) {
+		struct ethtool_link_ksettings link_ksettings;
+
+		if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
+							       &cmd))
+			return -EINVAL;
+
+		link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+		link_ksettings.base.link_mode_masks_nwords
+			= __ETHTOOL_LINK_MODE_MASK_NU32;
+		return dev->ethtool_ops->set_link_ksettings(dev,
+							    &link_ksettings);
+	}
+
+	/* legacy %ethtool_cmd API */
+
+	/* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
+	 * disappears internally
+	 */
+
+	if (!dev->ethtool_ops->set_settings)
+		return -EOPNOTSUPP;
+
 	return dev->ethtool_ops->set_settings(dev, &cmd);
 }
 
@@ -2252,6 +2687,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_PERQUEUE:
 		rc = ethtool_set_per_queue(dev, useraddr);
 		break;
+	case ETHTOOL_GLINKSETTINGS:
+		rc = ethtool_get_link_ksettings(dev, useraddr);
+		break;
+	case ETHTOOL_SLINKSETTINGS:
+		rc = ethtool_set_link_ksettings(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From b07edbe1cf3dae9ba81f24888e2f2a9dbe778918 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Feb 2016 17:24:08 +0100
Subject: netfilter: meta: add PRANDOM support

Can be used to randomly match packets e.g. for statistic traffic sampling.

See commit 3ad0040573b0c00f8848
("bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs")
for more info why this doesn't use prandom_u32 directly.

Unlike bpf nft_meta can be built as a module, so add an EXPORT_SYMBOL
for prandom_seed_full_state too.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 lib/random32.c                           |  1 +
 net/netfilter/nft_meta.c                 | 11 +++++++++++
 3 files changed, 14 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index be41ffc128b8..b19be0a098c0 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -681,6 +681,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_IIFGROUP: packet input interface group
  * @NFT_META_OIFGROUP: packet output interface group
  * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid)
+ * @NFT_META_PRANDOM: a 32bit pseudo-random number
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -707,6 +708,7 @@ enum nft_meta_keys {
 	NFT_META_IIFGROUP,
 	NFT_META_OIFGROUP,
 	NFT_META_CGROUP,
+	NFT_META_PRANDOM,
 };
 
 /**
diff --git a/lib/random32.c b/lib/random32.c
index 12111910ccd0..510d1ce7d4d2 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -255,6 +255,7 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
 		prandom_warmup(state);
 	}
 }
+EXPORT_SYMBOL(prandom_seed_full_state);
 
 /*
  *	Generate better values after random number generator
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index fe885bf271c5..16c50b0dd426 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -28,6 +28,8 @@
 
 #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
 
+static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+
 void nft_meta_get_eval(const struct nft_expr *expr,
 		       struct nft_regs *regs,
 		       const struct nft_pktinfo *pkt)
@@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
 		break;
 #endif
+	case NFT_META_PRANDOM: {
+		struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+		*dest = prandom_u32_state(state);
+		break;
+	}
 	default:
 		WARN_ON(1);
 		goto err;
@@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_OIFNAME:
 		len = IFNAMSIZ;
 		break;
+	case NFT_META_PRANDOM:
+		prandom_init_once(&nft_prandom_state);
+		len = sizeof(u32);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 9e8ce79cd711d4dfe09d8bba6822cd9bb7db96bd Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 26 Feb 2016 07:54:39 -0800
Subject: net: sched: cls_u32 add bit to specify software only rules

In the initial implementation the only way to stop a rule from being
inserted into the hardware table was via the device feature flag.
However this doesn't work well when working on an end host system
where packets are expect to hit both the hardware and software
datapaths.

For example we can imagine a rule that will match an IP address and
increment a field. If we install this rule in both hardware and
software we may increment the field twice. To date we have only
added support for the drop action so we have been able to ignore
these cases. But as we extend the action support we will hit this
example plus more such cases. Arguably these are not even corner
cases in many working systems these cases will be common.

To avoid forcing the driver to always abort (i.e. the above example)
this patch adds a flag to add a rule in software only. A careful
user can use this flag to build software and hardware datapaths
that work together. One example we have found particularly useful
is to use hardware resources to set the skb->mark on the skb when
the match may be expensive to run in software but a mark lookup
in a hash table is cheap. The idea here is hardware can do in one
lookup what the u32 classifier may need to traverse multiple lists
and hash tables to compute. The flag is only passed down on inserts.
On deletion to avoid stale references in hardware we always try
to remove a rule if it exists.

The flags field is part of the classifier specific options. Although
it is tempting to lift this into the generic structure doing this
proves difficult do to how the tc netlink attributes are implemented
along with how the dump/change routines are called. There is also
precedence for putting seemingly generic pieces in the specific
classifier options such as TCA_U32_POLICE, TCA_U32_ACT, etc. So
although not ideal I've left FLAGS in the u32 options as well as it
simplifies the code greatly and user space has already learned how
to manage these bits ala 'tc' tool.

Another thing if trying to update a rule we require the flags to
be unchanged. This is to force user space, software u32 and
the hardware u32 to keep in sync. Thanks to Simon Horman for
catching this case.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h        | 13 +++++++++++--
 include/uapi/linux/pkt_cls.h |  1 +
 net/sched/cls_u32.c          | 37 +++++++++++++++++++++++++++----------
 3 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6096e96fb78b..bea14eee373e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -392,12 +392,21 @@ struct tc_cls_u32_offload {
 	};
 };
 
-static inline bool tc_should_offload(struct net_device *dev)
+/* tca flags definitions */
+#define TCA_CLS_FLAGS_SKIP_HW 1
+
+static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 {
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
 
-	return dev->netdev_ops->ndo_setup_tc;
+	if (flags & TCA_CLS_FLAGS_SKIP_HW)
+		return false;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		return false;
+
+	return true;
 }
 
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 439873775d49..9874f5680926 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -172,6 +172,7 @@ enum {
 	TCA_U32_INDEV,
 	TCA_U32_PCNT,
 	TCA_U32_MARK,
+	TCA_U32_FLAGS,
 	__TCA_U32_MAX
 };
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 24e888b9b728..563cdad76448 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -59,6 +59,7 @@ struct tc_u_knode {
 #ifdef CONFIG_CLS_U32_PERF
 	struct tc_u32_pcnt __percpu *pf;
 #endif
+	u32			flags;
 #ifdef CONFIG_CLS_U32_MARK
 	u32			val;
 	u32			mask;
@@ -434,7 +435,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -442,7 +443,9 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	}
 }
 
-static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+static void u32_replace_hw_hnode(struct tcf_proto *tp,
+				 struct tc_u_hnode *h,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -451,7 +454,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -471,7 +474,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -482,7 +485,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	}
 }
 
-static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+static void u32_replace_hw_knode(struct tcf_proto *tp,
+				 struct tc_u_knode *n,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -491,7 +496,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
 		offload.cls_u32->knode.handle = n->handle;
 		offload.cls_u32->knode.fshift = n->fshift;
@@ -679,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
 	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
 	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
+	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
@@ -786,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
 #endif
 	new->fshift = n->fshift;
 	new->res = n->res;
+	new->flags = n->flags;
 	RCU_INIT_POINTER(new->ht_down, n->ht_down);
 
 	/* bump reference count as long as we hold pointer to structure */
@@ -825,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	struct tc_u32_sel *s;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_U32_MAX + 1];
-	u32 htid;
+	u32 htid, flags = 0;
 	int err;
 #ifdef CONFIG_CLS_U32_PERF
 	size_t size;
@@ -838,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
+	if (tb[TCA_U32_FLAGS])
+		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+
 	n = (struct tc_u_knode *)*arg;
 	if (n) {
 		struct tc_u_knode *new;
@@ -845,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (TC_U32_KEY(n->handle) == 0)
 			return -EINVAL;
 
+		if (n->flags != flags)
+			return -EINVAL;
+
 		new = u32_init_knode(tp, n);
 		if (!new)
 			return -ENOMEM;
@@ -861,7 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
-		u32_replace_hw_knode(tp, new);
+		u32_replace_hw_knode(tp, new, flags);
 		return 0;
 	}
 
@@ -889,7 +902,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
 
-		u32_replace_hw_hnode(tp, ht);
+		u32_replace_hw_hnode(tp, ht, flags);
 		return 0;
 	}
 
@@ -940,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	RCU_INIT_POINTER(n->ht_up, ht);
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+	n->flags = flags;
 	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 	n->tp = tp;
 
@@ -972,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-		u32_replace_hw_knode(tp, n);
+		u32_replace_hw_knode(tp, n, flags);
 		*arg = (unsigned long)n;
 		return 0;
 	}
@@ -1077,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
 			goto nla_put_failure;
 
+		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
+			goto nla_put_failure;
+
 #ifdef CONFIG_CLS_U32_MARK
 		if ((n->val || n->mask)) {
 			struct tc_u32_mark mark = {.val = n->val,
-- 
cgit v1.2.3


From bfcd3a46617209454cfc0947ab093e37fd1e84ef Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 26 Feb 2016 17:32:23 +0100
Subject: Introduce devlink infrastructure

Introduce devlink infrastructure for drivers to register and expose to
userspace via generic Netlink interface.

There are two basic objects defined:
devlink - one instance for every "parent device", for example switch ASIC
devlink port - one instance for every physical port of the device.

This initial portion implements basic get/dump of objects to userspace.
Also, port splitter and port type setting is implemented.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                  |   8 +
 include/net/devlink.h        | 140 ++++++++
 include/uapi/linux/devlink.h |  72 +++++
 net/Kconfig                  |   7 +
 net/core/Makefile            |   1 +
 net/core/devlink.c           | 738 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 966 insertions(+)
 create mode 100644 include/net/devlink.h
 create mode 100644 include/uapi/linux/devlink.h
 create mode 100644 net/core/devlink.c

(limited to 'include/uapi')

diff --git a/MAINTAINERS b/MAINTAINERS
index 12b764f4c93c..e45682745263 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3499,6 +3499,14 @@ F:	include/linux/device-mapper.h
 F:	include/linux/dm-*.h
 F:	include/uapi/linux/dm-*.h
 
+DEVLINK
+M:	Jiri Pirko <jiri@mellanox.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	net/core/devlink.c
+F:	include/net/devlink.h
+F:	include/uapi/linux/devlink.h
+
 DIALOG SEMICONDUCTOR DRIVERS
 M:	Support Opensource <support.opensource@diasemi.com>
 W:	http://www.dialog-semiconductor.com/products
diff --git a/include/net/devlink.h b/include/net/devlink.h
new file mode 100644
index 000000000000..c37d257891d6
--- /dev/null
+++ b/include/net/devlink.h
@@ -0,0 +1,140 @@
+/*
+ * include/net/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef _NET_DEVLINK_H_
+#define _NET_DEVLINK_H_
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <uapi/linux/devlink.h>
+
+struct devlink_ops;
+
+struct devlink {
+	struct list_head list;
+	struct list_head port_list;
+	const struct devlink_ops *ops;
+	struct device *dev;
+	possible_net_t _net;
+	char priv[0] __aligned(NETDEV_ALIGN);
+};
+
+struct devlink_port {
+	struct list_head list;
+	struct devlink *devlink;
+	unsigned index;
+	bool registered;
+	enum devlink_port_type type;
+	enum devlink_port_type desired_type;
+	void *type_dev;
+	bool split;
+	u32 split_group;
+};
+
+struct devlink_ops {
+	size_t priv_size;
+	int (*port_type_set)(struct devlink_port *devlink_port,
+			     enum devlink_port_type port_type);
+	int (*port_split)(struct devlink *devlink, unsigned int port_index,
+			  unsigned int count);
+	int (*port_unsplit)(struct devlink *devlink, unsigned int port_index);
+};
+
+static inline void *devlink_priv(struct devlink *devlink)
+{
+	BUG_ON(!devlink);
+	return &devlink->priv;
+}
+
+static inline struct devlink *priv_to_devlink(void *priv)
+{
+	BUG_ON(!priv);
+	return container_of(priv, struct devlink, priv);
+}
+
+struct ib_device;
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
+int devlink_register(struct devlink *devlink, struct device *dev);
+void devlink_unregister(struct devlink *devlink);
+void devlink_free(struct devlink *devlink);
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index);
+void devlink_port_unregister(struct devlink_port *devlink_port);
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev);
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev);
+void devlink_port_type_clear(struct devlink_port *devlink_port);
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group);
+
+#else
+
+static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
+					    size_t priv_size)
+{
+	return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL);
+}
+
+static inline int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	return 0;
+}
+
+static inline void devlink_unregister(struct devlink *devlink)
+{
+}
+
+static inline void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+
+static inline int devlink_port_register(struct devlink *devlink,
+					struct devlink_port *devlink_port,
+					unsigned int port_index)
+{
+	return 0;
+}
+
+static inline void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+					     struct net_device *netdev)
+{
+}
+
+static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+					    struct ib_device *ibdev)
+{
+}
+
+static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_split_set(struct devlink_port *devlink_port,
+					  u32 split_group)
+{
+}
+
+#endif
+
+#endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
new file mode 100644
index 000000000000..c9fee5781eb1
--- /dev/null
+++ b/include/uapi/linux/devlink.h
@@ -0,0 +1,72 @@
+/*
+ * include/uapi/linux/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_DEVLINK_H_
+#define _UAPI_LINUX_DEVLINK_H_
+
+#define DEVLINK_GENL_NAME "devlink"
+#define DEVLINK_GENL_VERSION 0x1
+#define DEVLINK_GENL_MCGRP_CONFIG_NAME "config"
+
+enum devlink_command {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_CMD_UNSPEC,
+
+	DEVLINK_CMD_GET,		/* can dump */
+	DEVLINK_CMD_SET,
+	DEVLINK_CMD_NEW,
+	DEVLINK_CMD_DEL,
+
+	DEVLINK_CMD_PORT_GET,		/* can dump */
+	DEVLINK_CMD_PORT_SET,
+	DEVLINK_CMD_PORT_NEW,
+	DEVLINK_CMD_PORT_DEL,
+
+	DEVLINK_CMD_PORT_SPLIT,
+	DEVLINK_CMD_PORT_UNSPLIT,
+
+	/* add new commands above here */
+
+	__DEVLINK_CMD_MAX,
+	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
+};
+
+enum devlink_port_type {
+	DEVLINK_PORT_TYPE_NOTSET,
+	DEVLINK_PORT_TYPE_AUTO,
+	DEVLINK_PORT_TYPE_ETH,
+	DEVLINK_PORT_TYPE_IB,
+};
+
+enum devlink_attr {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_ATTR_UNSPEC,
+
+	/* bus name + dev name together are a handle for devlink entity */
+	DEVLINK_ATTR_BUS_NAME,			/* string */
+	DEVLINK_ATTR_DEV_NAME,			/* string */
+
+	DEVLINK_ATTR_PORT_INDEX,		/* u32 */
+	DEVLINK_ATTR_PORT_TYPE,			/* u16 */
+	DEVLINK_ATTR_PORT_DESIRED_TYPE,		/* u16 */
+	DEVLINK_ATTR_PORT_NETDEV_IFINDEX,	/* u32 */
+	DEVLINK_ATTR_PORT_NETDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_IBDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_SPLIT_COUNT,		/* u32 */
+	DEVLINK_ATTR_PORT_SPLIT_GROUP,		/* u32 */
+
+	/* add new attributes above here, update the policy in devlink.c */
+
+	__DEVLINK_ATTR_MAX,
+	DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1
+};
+
+#endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index b80efecfc1a0..6c9cfb0d7639 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -396,6 +396,13 @@ config DST_CACHE
 	bool "dst cache"
 	default n
 
+config NET_DEVLINK
+	tristate "Network physical/parent device Netlink interface"
+	help
+	  Network physical/parent device Netlink interface provides
+	  infrastructure to support access to physical chip-wide config and
+	  monitoring.
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF_JIT
diff --git a/net/core/Makefile b/net/core/Makefile
index 7a8fb8aef992..014422e2561f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000000..590fa561cb7f
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,738 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards devlink devices list and it is taken when
+ * driver registers/unregisters it.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+/* devlink_port_mutex
+ *
+ * Shared lock to guard lists of ports in all devlink devices.
+ */
+static DEFINE_MUTEX(devlink_port_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+	return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+	write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+					      struct nlattr **attrs)
+{
+	struct devlink *devlink;
+	char *busname;
+	char *devname;
+
+	if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+		return ERR_PTR(-EINVAL);
+
+	busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+	devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+		    strcmp(dev_name(devlink->dev), devname) == 0 &&
+		    net_eq(devlink_net(devlink), net))
+			return devlink;
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+	return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+						      int port_index)
+{
+	struct devlink_port *devlink_port;
+
+	list_for_each_entry(devlink_port, &devlink->port_list, list) {
+		if (devlink_port->index == port_index)
+			return devlink_port;
+	}
+	return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+	return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+							struct nlattr **attrs)
+{
+	if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+		struct devlink_port *devlink_port;
+
+		devlink_port = devlink_port_get_by_index(devlink, port_index);
+		if (!devlink_port)
+			return ERR_PTR(-ENODEV);
+		return devlink_port;
+	}
+	return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+						       struct genl_info *info)
+{
+	return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+#define DEVLINK_NL_FLAG_NEED_PORT	BIT(0)
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+			       struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink;
+
+	mutex_lock(&devlink_mutex);
+	devlink = devlink_get_from_info(info);
+	if (IS_ERR(devlink)) {
+		mutex_unlock(&devlink_mutex);
+		return PTR_ERR(devlink);
+	}
+	info->user_ptr[0] = devlink;
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+		struct devlink_port *devlink_port;
+
+		mutex_lock(&devlink_port_mutex);
+		devlink_port = devlink_port_get_from_info(devlink, info);
+		if (IS_ERR(devlink_port)) {
+			mutex_unlock(&devlink_port_mutex);
+			mutex_unlock(&devlink_mutex);
+			return PTR_ERR(devlink_port);
+		}
+		info->user_ptr[1] = devlink_port;
+	}
+	return 0;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+				 struct sk_buff *skb, struct genl_info *info)
+{
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+		mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= DEVLINK_GENL_NAME,
+	.version	= DEVLINK_GENL_VERSION,
+	.maxattr	= DEVLINK_ATTR_MAX,
+	.netnsok	= true,
+	.pre_doit	= devlink_nl_pre_doit,
+	.post_doit	= devlink_nl_post_doit,
+};
+
+enum devlink_multicast_groups {
+	DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+	[DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+	if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+			   enum devlink_command cmd, u32 portid,
+			   u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+	struct sk_buff *msg;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+				struct devlink_port *devlink_port,
+				enum devlink_command cmd, u32 portid,
+				u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+		goto nla_put_failure;
+	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+		goto nla_put_failure;
+	if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+	    nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+			devlink_port->desired_type))
+		goto nla_put_failure;
+	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+		struct net_device *netdev = devlink_port->type_dev;
+
+		if (netdev &&
+		    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+				 netdev->ifindex) ||
+		     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+				    netdev->name)))
+			goto nla_put_failure;
+	}
+	if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+		struct ib_device *ibdev = devlink_port->type_dev;
+
+		if (ibdev &&
+		    nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+				   ibdev->name))
+			goto nla_put_failure;
+	}
+	if (devlink_port->split &&
+	    nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+			devlink_port->split_group))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+				enum devlink_command cmd)
+{
+	struct devlink *devlink = devlink_port->devlink;
+	struct sk_buff *msg;
+	int err;
+
+	if (!devlink_port->registered)
+		return;
+
+	WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+			      info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+				     struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+				      NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			goto out;
+		idx++;
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port,
+				   DEVLINK_CMD_PORT_NEW,
+				   info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	struct devlink_port *devlink_port;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink_port_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		list_for_each_entry(devlink_port, &devlink->port_list, list) {
+			if (idx < start) {
+				idx++;
+				continue;
+			}
+			err = devlink_nl_port_fill(msg, devlink, devlink_port,
+						   DEVLINK_CMD_NEW,
+						   NETLINK_CB(cb->skb).portid,
+						   cb->nlh->nlmsg_seq,
+						   NLM_F_MULTI);
+			if (err)
+				goto out;
+			idx++;
+		}
+	}
+out:
+	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+				 struct devlink_port *devlink_port,
+				 enum devlink_port_type port_type)
+
+{
+	int err;
+
+	if (devlink->ops && devlink->ops->port_type_set) {
+		if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+			return -EINVAL;
+		err = devlink->ops->port_type_set(devlink_port, port_type);
+		if (err)
+			return err;
+		devlink_port->desired_type = port_type;
+		devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	int err;
+
+	if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+		enum devlink_port_type port_type;
+
+		port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+		err = devlink_port_type_set(devlink, devlink_port, port_type);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink,
+			      u32 port_index, u32 count)
+
+{
+	if (devlink->ops && devlink->ops->port_split)
+		return devlink->ops->port_split(devlink, port_index, count);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+	u32 count;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+	    !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+	return devlink_port_split(devlink, port_index, count);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+
+{
+	if (devlink->ops && devlink->ops->port_unsplit)
+		return devlink->ops->port_unsplit(devlink, port_index);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+					    struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	return devlink_port_unsplit(devlink, port_index);
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+	[DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+	{
+		.cmd = DEVLINK_CMD_GET,
+		.doit = devlink_nl_cmd_get_doit,
+		.dumpit = devlink_nl_cmd_get_dumpit,
+		.policy = devlink_nl_policy,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_GET,
+		.doit = devlink_nl_cmd_port_get_doit,
+		.dumpit = devlink_nl_cmd_port_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SET,
+		.doit = devlink_nl_cmd_port_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SPLIT,
+		.doit = devlink_nl_cmd_port_split_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
+		.doit = devlink_nl_cmd_port_unsplit_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
+/**
+ *	devlink_alloc - Allocate new devlink instance resources
+ *
+ *	@ops: ops
+ *	@priv_size: size of user private data
+ *
+ *	Allocate new devlink instance resources, including devlink index
+ *	and name.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+	struct devlink *devlink;
+
+	devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+	if (!devlink)
+		return NULL;
+	devlink->ops = ops;
+	devlink_net_set(devlink, &init_net);
+	INIT_LIST_HEAD(&devlink->port_list);
+	return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ *	devlink_register - Register devlink instance
+ *
+ *	@devlink: devlink
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	mutex_lock(&devlink_mutex);
+	devlink->dev = dev;
+	list_add_tail(&devlink->list, &devlink_list);
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+	mutex_unlock(&devlink_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ *	devlink_unregister - Unregister devlink instance
+ *
+ *	@devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+	mutex_lock(&devlink_mutex);
+	devlink_notify(devlink, DEVLINK_CMD_DEL);
+	list_del(&devlink->list);
+	mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ *	devlink_free - Free devlink instance resources
+ *
+ *	@devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+/**
+ *	devlink_port_register - Register devlink port
+ *
+ *	@devlink: devlink
+ *	@devlink_port: devlink port
+ *	@port_index
+ *
+ *	Register devlink port with provided port index. User can use
+ *	any indexing, even hw-related one. devlink_port structure
+ *	is convenient to be embedded inside user driver private structure.
+ *	Note that the caller should take care of zeroing the devlink_port
+ *	structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index)
+{
+	mutex_lock(&devlink_port_mutex);
+	if (devlink_port_index_exists(devlink, port_index)) {
+		mutex_unlock(&devlink_port_mutex);
+		return -EEXIST;
+	}
+	devlink_port->devlink = devlink;
+	devlink_port->index = port_index;
+	devlink_port->type = DEVLINK_PORT_TYPE_NOTSET;
+	devlink_port->registered = true;
+	list_add_tail(&devlink_port->list, &devlink->port_list);
+	mutex_unlock(&devlink_port_mutex);
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
+
+/**
+ *	devlink_port_unregister - Unregister devlink port
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+	mutex_lock(&devlink_port_mutex);
+	list_del(&devlink_port->list);
+	mutex_unlock(&devlink_port_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+				    enum devlink_port_type type,
+				    void *type_dev)
+{
+	devlink_port->type = type;
+	devlink_port->type_dev = type_dev;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ *	devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ *	@devlink_port: devlink port
+ *	@netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ *	devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ *	@devlink_port: devlink port
+ *	@ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ *	devlink_port_type_clear - Clear port type
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ *	devlink_port_split_set - Set port is split
+ *
+ *	@devlink_port: devlink port
+ *	@split_group: split group - identifies group split port is part of
+ */
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group)
+{
+	devlink_port->split = true;
+	devlink_port->split_group = split_group;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_split_set);
+
+static int __init devlink_module_init(void)
+{
+	return genl_register_family_with_ops_groups(&devlink_nl_family,
+						    devlink_nl_ops,
+						    devlink_nl_mcgrps);
+}
+
+static void __exit devlink_module_exit(void)
+{
+	genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
-- 
cgit v1.2.3


From 7f0aec7a668419bdbff12de6e8016544f874e708 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:01 +0100
Subject: bridge: mcast: use names for the different multicast_router types

Using raw values makes it difficult to extend and also understand the
code, give them names and do explicit per-option manipulation in
br_multicast_set_port_router.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  7 +++++
 net/bridge/br_multicast.c      | 61 +++++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 0890b217580d..e47f3bc7f323 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -177,6 +177,13 @@ enum {
 };
 #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
 
+/* multicast router types */
+enum {
+	MDB_RTR_TYPE_DISABLED,
+	MDB_RTR_TYPE_TEMP_QUERY,
+	MDB_RTR_TYPE_PERM,
+};
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8b6e4249be1b..71c109b0943f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,7 +759,7 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != 1 ||
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
@@ -912,7 +912,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
 
 void br_multicast_add_port(struct net_bridge_port *port)
 {
-	port->multicast_router = 1;
+	port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 	setup_timer(&port->multicast_router_timer, br_multicast_router_expired,
 		    (unsigned long)port);
@@ -959,7 +959,8 @@ void br_multicast_enable_port(struct net_bridge_port *port)
 #if IS_ENABLED(CONFIG_IPV6)
 	br_multicast_enable(&port->ip6_own_query);
 #endif
-	if (port->multicast_router == 2 && hlist_unhashed(&port->rlist))
+	if (port->multicast_router == MDB_RTR_TYPE_PERM &&
+	    hlist_unhashed(&port->rlist))
 		br_multicast_add_router(br, port);
 
 out:
@@ -1227,13 +1228,13 @@ static void br_multicast_mark_router(struct net_bridge *br,
 	unsigned long now = jiffies;
 
 	if (!port) {
-		if (br->multicast_router == 1)
+		if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
 			mod_timer(&br->multicast_router_timer,
 				  now + br->multicast_querier_interval);
 		return;
 	}
 
-	if (port->multicast_router != 1)
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1713,7 +1714,7 @@ void br_multicast_init(struct net_bridge *br)
 	br->hash_elasticity = 4;
 	br->hash_max = 512;
 
-	br->multicast_router = 1;
+	br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	br->multicast_querier = 0;
 	br->multicast_query_use_ifaddr = 0;
 	br->multicast_last_member_count = 2;
@@ -1823,11 +1824,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	spin_lock_bh(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 2:
+	case MDB_RTR_TYPE_DISABLED:
+	case MDB_RTR_TYPE_PERM:
 		del_timer(&br->multicast_router_timer);
 		/* fall through */
-	case 1:
+	case MDB_RTR_TYPE_TEMP_QUERY:
 		br->multicast_router = val;
 		err = 0;
 		break;
@@ -1838,6 +1839,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	return err;
 }
 
+static void __del_port_router(struct net_bridge_port *p)
+{
+	if (hlist_unhashed(&p->rlist))
+		return;
+	hlist_del_init_rcu(&p->rlist);
+	br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+}
+
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
@@ -1846,29 +1855,25 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 	spin_lock(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 1:
-	case 2:
-		p->multicast_router = val;
-		err = 0;
-
-		if (val < 2 && !hlist_unhashed(&p->rlist)) {
-			hlist_del_init_rcu(&p->rlist);
-			br_rtr_notify(br->dev, p, RTM_DELMDB);
-		}
-
-		if (val == 1)
-			break;
-
+	case MDB_RTR_TYPE_DISABLED:
+		p->multicast_router = MDB_RTR_TYPE_DISABLED;
+		__del_port_router(p);
+		del_timer(&p->multicast_router_timer);
+		break;
+	case MDB_RTR_TYPE_TEMP_QUERY:
+		p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+		__del_port_router(p);
+		break;
+	case MDB_RTR_TYPE_PERM:
+		p->multicast_router = MDB_RTR_TYPE_PERM;
 		del_timer(&p->multicast_router_timer);
-
-		if (val == 0)
-			break;
-
 		br_multicast_add_router(br, p);
 		break;
+	default:
+		goto unlock;
 	}
-
+	err = 0;
+unlock:
 	spin_unlock(&br->multicast_lock);
 
 	return err;
-- 
cgit v1.2.3


From a55d8246abcc910346771175b521ee2bce5a69b3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:03 +0100
Subject: bridge: mcast: add support for temporary port router

Add support for a temporary router port which doesn't depend only on the
incoming query. It can be refreshed if set to the same value, which is
a no-op for the rest.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_multicast.c      | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index e47f3bc7f323..74ee03a47e79 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -182,6 +182,7 @@ enum {
 	MDB_RTR_TYPE_DISABLED,
 	MDB_RTR_TYPE_TEMP_QUERY,
 	MDB_RTR_TYPE_PERM,
+	MDB_RTR_TYPE_TEMP
 };
 
 enum {
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f1140cf5168d..a4c15df2b792 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,13 +759,17 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
 
 	hlist_del_init_rcu(&port->rlist);
 	br_rtr_notify(br->dev, port, RTM_DELMDB);
+	/* Don't allow timer refresh if the router expired */
+	if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+		port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 out:
 	spin_unlock(&br->multicast_lock);
@@ -981,6 +985,9 @@ void br_multicast_disable_port(struct net_bridge_port *port)
 	if (!hlist_unhashed(&port->rlist)) {
 		hlist_del_init_rcu(&port->rlist);
 		br_rtr_notify(br->dev, port, RTM_DELMDB);
+		/* Don't allow timer refresh if disabling */
+		if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+			port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	}
 	del_timer(&port->multicast_router_timer);
 	del_timer(&port->ip4_own_query.timer);
@@ -1234,7 +1241,8 @@ static void br_multicast_mark_router(struct net_bridge *br,
 		return;
 	}
 
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1850,10 +1858,15 @@ static void __del_port_router(struct net_bridge_port *p)
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
+	unsigned long now = jiffies;
 	int err = -EINVAL;
 
 	spin_lock(&br->multicast_lock);
 	if (p->multicast_router == val) {
+		/* Refresh the temp router port timer */
+		if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+			mod_timer(&p->multicast_router_timer,
+				  now + br->multicast_querier_interval);
 		err = 0;
 		goto unlock;
 	}
@@ -1872,6 +1885,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 		del_timer(&p->multicast_router_timer);
 		br_multicast_add_router(br, p);
 		break;
+	case MDB_RTR_TYPE_TEMP:
+		p->multicast_router = MDB_RTR_TYPE_TEMP;
+		br_multicast_mark_router(br, p);
+		break;
 	default:
 		goto unlock;
 	}
-- 
cgit v1.2.3


From 59f78f9f6c2e80dcf0f520be85b660f856217b79 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:04 +0100
Subject: bridge: mcast: add support for more router port information dumping

Allow for more multicast router port information to be dumped such as
timer and type attributes. For that that purpose we need to extend the
MDBA_ROUTER_PORT attribute similar to how it was done for the mdb entries
recently. The new format is thus:
[MDBA_ROUTER_PORT] = { <- nested attribute
    u32 ifindex <- router port ifindex for user-space compatibility
    [MDBA_ROUTER_PATTR attributes]
}
This way it remains compatible with older users (they'll simply retrieve
the u32 in the beginning) and new users can parse the remaining
attributes. It would also allow to add future extensions to the router
port without breaking compatibility.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 14 +++++++++++++-
 net/bridge/br_mdb.c            | 16 ++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 74ee03a47e79..0536eefff9bf 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -144,7 +144,10 @@ struct bridge_vlan_info {
  *     }
  * }
  * [MDBA_ROUTER] = {
- *    [MDBA_ROUTER_PORT]
+ *    [MDBA_ROUTER_PORT] = {
+ *        u32 ifindex
+ *        [MDBA_ROUTER_PATTR attributes]
+ *    }
  * }
  */
 enum {
@@ -192,6 +195,15 @@ enum {
 };
 #define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
 
+/* router port attributes */
+enum {
+	MDBA_ROUTER_PATTR_UNSPEC,
+	MDBA_ROUTER_PATTR_TIMER,
+	MDBA_ROUTER_PATTR_TYPE,
+	__MDBA_ROUTER_PATTR_MAX
+};
+#define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1)
+
 struct br_port_msg {
 	__u8  family;
 	__u32 ifindex;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 73786e2fe065..253bc77eda3b 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -20,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_port *p;
-	struct nlattr *nest;
+	struct nlattr *nest, *port_nest;
 
 	if (!br->multicast_router || hlist_empty(&br->router_list))
 		return 0;
@@ -30,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 		return -EMSGSIZE;
 
 	hlist_for_each_entry_rcu(p, &br->router_list, rlist) {
-		if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+		if (!p)
+			continue;
+		port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT);
+		if (!port_nest)
 			goto fail;
+		if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) ||
+		    nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER,
+				br_timer_value(&p->multicast_router_timer)) ||
+		    nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE,
+			       p->multicast_router)) {
+			nla_nest_cancel(skb, port_nest);
+			goto fail;
+		}
+		nla_nest_end(skb, port_nest);
 	}
 
 	nla_nest_end(skb, nest);
-- 
cgit v1.2.3


From ef6980b6becb1afd9d82a4f043749a10ae81bf14 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sat, 27 Feb 2016 08:08:54 -0500
Subject: introduce IFE action

This action allows for a sending side to encapsulate arbitrary metadata
which is decapsulated by the receiving end.
The sender runs in encoding mode and the receiver in decode mode.
Both sender and receiver must specify the same ethertype.
At some point we hope to have a registered ethertype and we'll
then provide a default so the user doesnt have to specify it.
For now we enforce the user specify it.

Lets show example usage where we encode icmp from a sender towards
a receiver with an skbmark of 17; both sender and receiver use
ethertype of 0xdead to interop.

YYYY: Lets start with Receiver-side policy config:
xxx: add an ingress qdisc
sudo tc qdisc add dev $ETH ingress

xxx: any packets with ethertype 0xdead will be subjected to ife decoding
xxx: we then restart the classification so we can match on icmp at prio 3
sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xdead \
u32 match u32 0 0 flowid 1:1 \
action ife decode reclassify

xxx: on restarting the classification from above if it was an icmp
xxx: packet, then match it here and continue to the next rule at prio 4
xxx: which will match based on skb mark of 17
sudo tc filter add dev $ETH parent ffff: prio 3 protocol ip \
u32 match ip protocol 1 0xff flowid 1:1 \
action continue

xxx: match on skbmark of 0x11 (decimal 17) and accept
sudo tc filter add dev $ETH parent ffff: prio 4 protocol ip \
handle 0x11 fw flowid 1:1 \
action ok

xxx: Lets show the decoding policy
sudo tc -s filter ls dev $ETH parent ffff: protocol 0xdead
xxx:
filter pref 2 u32
filter pref 2 u32 fh 800: ht divisor 1
filter pref 2 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1  (rule hit 0 success 0)
  match 00000000/00000000 at 0 (success 0 )
        action order 1: ife decode action reclassify
         index 1 ref 1 bind 1 installed 14 sec used 14 sec
         type: 0x0
         Metadata: allow mark allow hash allow prio allow qmap
        Action statistics:
        Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0
xxx:
Observe that above lists all metadatum it can decode. Typically these
submodules will already be compiled into a monolithic kernel or
loaded as modules

YYYY: Lets show the sender side now ..

xxx: Add an egress qdisc on the sender netdev
sudo tc qdisc add dev $ETH root handle 1: prio
xxx:
xxx: Match all icmp packets to 192.168.122.237/24, then
xxx: tag the packet with skb mark of decimal 17, then
xxx: Encode it with:
xxx:	ethertype 0xdead
xxx:	add skb->mark to whitelist of metadatum to send
xxx:	rewrite target dst MAC address to 02:15:15:15:15:15
xxx:
sudo $TC filter add dev $ETH parent 1: protocol ip prio 10  u32 \
match ip dst 192.168.122.237/24 \
match ip protocol 1 0xff \
flowid 1:2 \
action skbedit mark 17 \
action ife encode \
type 0xDEAD \
allow mark \
dst 02:15:15:15:15:15

xxx: Lets show the encoding policy
sudo tc -s filter ls dev $ETH parent 1: protocol ip
xxx:
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:2  (rule hit 0 success 0)
  match c0a87aed/ffffffff at 16 (success 0 )
  match 00010000/00ff0000 at 8 (success 0 )

	action order 1:  skbedit mark 17
	 index 6 ref 1 bind 1
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

	action order 2: ife encode action pipe
	 index 3 ref 1 bind 1
	 dst MAC: 02:15:15:15:15:15 type: 0xDEAD
 	 Metadata: allow mark
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0
xxx:

test by sending ping from sender to destination

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h        |  61 +++
 include/uapi/linux/tc_act/tc_ife.h |  38 ++
 net/sched/Kconfig                  |  12 +
 net/sched/Makefile                 |   1 +
 net/sched/act_ife.c                | 870 +++++++++++++++++++++++++++++++++++++
 5 files changed, 982 insertions(+)
 create mode 100644 include/net/tc_act/tc_ife.h
 create mode 100644 include/uapi/linux/tc_act/tc_ife.h
 create mode 100644 net/sched/act_ife.c

(limited to 'include/uapi')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
new file mode 100644
index 000000000000..dc9a09aefb33
--- /dev/null
+++ b/include/net/tc_act/tc_ife.h
@@ -0,0 +1,61 @@
+#ifndef __NET_TC_IFE_H
+#define __NET_TC_IFE_H
+
+#include <net/act_api.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+
+#define IFE_METAHDRLEN 2
+struct tcf_ife_info {
+	struct tcf_common common;
+	u8 eth_dst[ETH_ALEN];
+	u8 eth_src[ETH_ALEN];
+	u16 eth_type;
+	u16 flags;
+	/* list of metaids allowed */
+	struct list_head metalist;
+};
+#define to_ife(a) \
+	container_of(a->priv, struct tcf_ife_info, common)
+
+struct tcf_meta_info {
+	const struct tcf_meta_ops *ops;
+	void *metaval;
+	u16 metaid;
+	struct list_head metalist;
+};
+
+struct tcf_meta_ops {
+	u16 metaid; /*Maintainer provided ID */
+	u16 metatype; /*netlink attribute type (look at net/netlink.h) */
+	const char *name;
+	const char *synopsis;
+	struct list_head list;
+	int	(*check_presence)(struct sk_buff *, struct tcf_meta_info *);
+	int	(*encode)(struct sk_buff *, void *, struct tcf_meta_info *);
+	int	(*decode)(struct sk_buff *, void *, u16 len);
+	int	(*get)(struct sk_buff *skb, struct tcf_meta_info *mi);
+	int	(*alloc)(struct tcf_meta_info *, void *);
+	void	(*release)(struct tcf_meta_info *);
+	int	(*validate)(void *val, int len);
+	struct module	*owner;
+};
+
+#define MODULE_ALIAS_IFE_META(metan)   MODULE_ALIAS("ifemeta" __stringify_1(metan))
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval);
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval);
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval);
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi);
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi);
+int ife_validate_meta_u32(void *val, int len);
+int ife_validate_meta_u16(void *val, int len);
+void ife_release_meta_gen(struct tcf_meta_info *mi);
+int register_ife_op(struct tcf_meta_ops *mops);
+int unregister_ife_op(struct tcf_meta_ops *mops);
+
+#endif /* __NET_TC_IFE_H */
diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
new file mode 100644
index 000000000000..d648ff66586f
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -0,0 +1,38 @@
+#ifndef __UAPI_TC_IFE_H
+#define __UAPI_TC_IFE_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_IFE 25
+/* Flag bits for now just encoding/decoding; mutually exclusive */
+#define IFE_ENCODE 1
+#define IFE_DECODE 0
+
+struct tc_ife {
+	tc_gen;
+	__u16 flags;
+};
+
+/*XXX: We need to encode the total number of bytes consumed */
+enum {
+	TCA_IFE_UNSPEC,
+	TCA_IFE_PARMS,
+	TCA_IFE_TM,
+	TCA_IFE_DMAC,
+	TCA_IFE_SMAC,
+	TCA_IFE_TYPE,
+	TCA_IFE_METALST,
+	__TCA_IFE_MAX
+};
+#define TCA_IFE_MAX (__TCA_IFE_MAX - 1)
+
+#define IFE_META_SKBMARK 1
+#define IFE_META_HASHID 2
+#define	IFE_META_PRIO 3
+#define	IFE_META_QMAP 4
+/*Can be overridden at runtime by module option*/
+#define	__IFE_META_MAX 5
+#define IFE_META_MAX (__IFE_META_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 82830824fb1f..4d48ef57e564 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -739,6 +739,18 @@ config NET_ACT_CONNMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called act_connmark.
 
+config NET_ACT_IFE
+        tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to allow for sourcing and terminating metadata
+	  For details refer to netdev01 paper:
+	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
+	   Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ife.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 690c1689e090..3d176671b0e1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
new file mode 100644
index 000000000000..6e7ec257790d
--- /dev/null
+++ b/net/sched/act_ife.c
@@ -0,0 +1,870 @@
+/*
+ * net/sched/ife.c	Inter-FE action based on ForCES WG InterFE LFB
+ *
+ *		Refer to:
+ *		draft-ietf-forces-interfelfb-03
+ *		and
+ *		netdev01 paper:
+ *		"Distributing Linux Traffic Control Classifier-Action
+ *		Subsystem"
+ *		Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/etherdevice.h>
+
+#define IFE_TAB_MASK 15
+
+static int ife_net_id;
+static int max_metacnt = IFE_META_MAX + 1;
+
+static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
+	[TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
+	[TCA_IFE_DMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_SMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_TYPE] = { .type = NLA_U16},
+};
+
+/* Caller takes care of presenting data in network order
+*/
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+	u32 *tlv = (u32 *)(skbdata);
+	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
+	char *dptr = (char *)tlv + NLA_HDRLEN;
+	u32 htlv = attrtype << 16 | totlen;
+
+	*tlv = htonl(htlv);
+	memset(dptr, 0, totlen - NLA_HDRLEN);
+	memcpy(dptr, dval, dlen);
+
+	return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u32);
+
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
+{
+	if (metaval || mi->metaval)
+		return 8; /* T+L+V == 2+2+4 */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+	u32 edata = metaval;
+
+	if (mi->metaval)
+		edata = *(u32 *)mi->metaval;
+	else if (metaval)
+		edata = metaval;
+
+	if (!edata) /* will not encode */
+		return 0;
+
+	edata = htonl(edata);
+	return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u32);
+
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u16);
+
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+{
+	mi->metaval = kmemdup(&metaval, sizeof(u32), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
+
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+{
+	mi->metaval = kmemdup(&metaval, sizeof(u16), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u16);
+
+void ife_release_meta_gen(struct tcf_meta_info *mi)
+{
+	kfree(mi->metaval);
+}
+EXPORT_SYMBOL_GPL(ife_release_meta_gen);
+
+int ife_validate_meta_u32(void *val, int len)
+{
+	if (len == 4)
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u32);
+
+int ife_validate_meta_u16(void *val, int len)
+{
+	/* length will include padding */
+	if (len == NLA_ALIGN(2))
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u16);
+
+static LIST_HEAD(ifeoplist);
+static DEFINE_RWLOCK(ife_mod_lock);
+
+static struct tcf_meta_ops *find_ife_oplist(u16 metaid)
+{
+	struct tcf_meta_ops *o;
+
+	read_lock(&ife_mod_lock);
+	list_for_each_entry(o, &ifeoplist, list) {
+		if (o->metaid == metaid) {
+			if (!try_module_get(o->owner))
+				o = NULL;
+			read_unlock(&ife_mod_lock);
+			return o;
+		}
+	}
+	read_unlock(&ife_mod_lock);
+
+	return NULL;
+}
+
+int register_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+
+	if (!mops->metaid || !mops->metatype || !mops->name ||
+	    !mops->check_presence || !mops->encode || !mops->decode ||
+	    !mops->get || !mops->alloc)
+		return -EINVAL;
+
+	write_lock(&ife_mod_lock);
+
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid ||
+		    (strcmp(mops->name, m->name) == 0)) {
+			write_unlock(&ife_mod_lock);
+			return -EEXIST;
+		}
+	}
+
+	if (!mops->release)
+		mops->release = ife_release_meta_gen;
+
+	list_add_tail(&mops->list, &ifeoplist);
+	write_unlock(&ife_mod_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ife_op);
+
+int unregister_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+	int err = -ENOENT;
+
+	write_lock(&ife_mod_lock);
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid) {
+			list_del(&mops->list);
+			err = 0;
+			break;
+		}
+	}
+	write_unlock(&ife_mod_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(register_ife_op);
+
+static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
+{
+	int ret = 0;
+	/* XXX: unfortunately cant use nla_policy at this point
+	* because a length of 0 is valid in the case of
+	* "allow". "use" semantics do enforce for proper
+	* length and i couldve use nla_policy but it makes it hard
+	* to use it just for that..
+	*/
+	if (ops->validate)
+		return ops->validate(val, len);
+
+	if (ops->metatype == NLA_U32)
+		ret = ife_validate_meta_u32(val, len);
+	else if (ops->metatype == NLA_U16)
+		ret = ife_validate_meta_u16(val, len);
+
+	return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
+				void *val, int len)
+{
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops) {
+		ret = -ENOENT;
+#ifdef CONFIG_MODULES
+		spin_unlock_bh(&ife->tcf_lock);
+		rtnl_unlock();
+		request_module("ifemeta%u", metaid);
+		rtnl_lock();
+		spin_lock_bh(&ife->tcf_lock);
+		ops = find_ife_oplist(metaid);
+#endif
+	}
+
+	if (ops) {
+		ret = 0;
+		if (len)
+			ret = ife_validate_metatype(ops, val, len);
+
+		module_put(ops->owner);
+	}
+
+	return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
+			int len)
+{
+	struct tcf_meta_info *mi = NULL;
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops)
+		return -ENOENT;
+
+	mi = kzalloc(sizeof(*mi), GFP_KERNEL);
+	if (!mi) {
+		/*put back what find_ife_oplist took */
+		module_put(ops->owner);
+		return -ENOMEM;
+	}
+
+	mi->metaid = metaid;
+	mi->ops = ops;
+	if (len > 0) {
+		ret = ops->alloc(mi, metaval);
+		if (ret != 0) {
+			kfree(mi);
+			module_put(ops->owner);
+			return ret;
+		}
+	}
+
+	list_add_tail(&mi->metalist, &ife->metalist);
+
+	return ret;
+}
+
+static int use_all_metadata(struct tcf_ife_info *ife)
+{
+	struct tcf_meta_ops *o;
+	int rc = 0;
+	int installed = 0;
+
+	list_for_each_entry(o, &ifeoplist, list) {
+		rc = add_metainfo(ife, o->metaid, NULL, 0);
+		if (rc == 0)
+			installed += 1;
+	}
+
+	if (installed)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e;
+	struct nlattr *nest;
+	unsigned char *b = skb_tail_pointer(skb);
+	int total_encoded = 0;
+
+	/*can only happen on decode */
+	if (list_empty(&ife->metalist))
+		return 0;
+
+	nest = nla_nest_start(skb, TCA_IFE_METALST);
+	if (!nest)
+		goto out_nlmsg_trim;
+
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (!e->ops->get(skb, e))
+			total_encoded += 1;
+	}
+
+	if (!total_encoded)
+		goto out_nlmsg_trim;
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+out_nlmsg_trim:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* under ife->tcf_lock */
+static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+	struct tcf_meta_info *e, *n;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		module_put(e->ops->owner);
+		list_del(&e->metalist);
+		if (e->metaval) {
+			if (e->ops->release)
+				e->ops->release(e);
+			else
+				kfree(e->metaval);
+		}
+		kfree(e);
+	}
+}
+
+static void tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	spin_lock_bh(&ife->tcf_lock);
+	_tcf_ife_cleanup(a, bind);
+	spin_unlock_bh(&ife->tcf_lock);
+}
+
+/* under ife->tcf_lock */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+{
+	int len = 0;
+	int rc = 0;
+	int i = 0;
+	void *val;
+
+	for (i = 1; i < max_metacnt; i++) {
+		if (tb[i]) {
+			val = nla_data(tb[i]);
+			len = nla_len(tb[i]);
+
+			rc = load_metaops_and_vet(ife, i, val, len);
+			if (rc != 0)
+				return rc;
+
+			rc = add_metainfo(ife, i, val, len);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return rc;
+}
+
+static int tcf_ife_init(struct net *net, struct nlattr *nla,
+			struct nlattr *est, struct tc_action *a,
+			int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+	struct nlattr *tb[TCA_IFE_MAX + 1];
+	struct nlattr *tb2[IFE_META_MAX + 1];
+	struct tcf_ife_info *ife;
+	struct tc_ife *parm;
+	u16 ife_type = 0;
+	u8 *daddr = NULL;
+	u8 *saddr = NULL;
+	int ret = 0;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_IFE_PARMS])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_IFE_PARMS]);
+
+	if (parm->flags & IFE_ENCODE) {
+		/* Until we get issued the ethertype, we cant have
+		 * a default..
+		**/
+		if (!tb[TCA_IFE_TYPE]) {
+			pr_info("You MUST pass etherype for encoding\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
+				      bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
+	} else {
+		if (bind)	/* dont override defaults */
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	ife = to_ife(a);
+	ife->flags = parm->flags;
+
+	if (parm->flags & IFE_ENCODE) {
+		ife_type = nla_get_u16(tb[TCA_IFE_TYPE]);
+		if (tb[TCA_IFE_DMAC])
+			daddr = nla_data(tb[TCA_IFE_DMAC]);
+		if (tb[TCA_IFE_SMAC])
+			saddr = nla_data(tb[TCA_IFE_SMAC]);
+	}
+
+	spin_lock_bh(&ife->tcf_lock);
+	ife->tcf_action = parm->action;
+
+	if (parm->flags & IFE_ENCODE) {
+		if (daddr)
+			ether_addr_copy(ife->eth_dst, daddr);
+		else
+			eth_zero_addr(ife->eth_dst);
+
+		if (saddr)
+			ether_addr_copy(ife->eth_src, saddr);
+		else
+			eth_zero_addr(ife->eth_src);
+
+		ife->eth_type = ife_type;
+	}
+
+	if (ret == ACT_P_CREATED)
+		INIT_LIST_HEAD(&ife->metalist);
+
+	if (tb[TCA_IFE_METALST]) {
+		err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
+				       NULL);
+		if (err) {
+metadata_parse_err:
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+
+		err = populate_metalist(ife, tb2);
+		if (err)
+			goto metadata_parse_err;
+
+	} else {
+		/* if no passed metadata allow list or passed allow-all
+		 * then here we process by adding as many supported metadatum
+		 * as we can. You better have at least one else we are
+		 * going to bail out
+		 */
+		err = use_all_metadata(ife);
+		if (err) {
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+	}
+
+	spin_unlock_bh(&ife->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, a);
+
+	return ret;
+}
+
+static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+			int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ife_info *ife = a->priv;
+	struct tc_ife opt = {
+		.index = ife->tcf_index,
+		.refcnt = ife->tcf_refcnt - ref,
+		.bindcnt = ife->tcf_bindcnt - bind,
+		.action = ife->tcf_action,
+		.flags = ife->flags,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
+	if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t))
+		goto nla_put_failure;
+
+	if (!is_zero_ether_addr(ife->eth_dst)) {
+		if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst))
+			goto nla_put_failure;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src)) {
+		if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src))
+			goto nla_put_failure;
+	}
+
+	if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type))
+		goto nla_put_failure;
+
+	if (dump_metalist(skb, ife)) {
+		/*ignore failure to dump metalist */
+		pr_info("Failed to dump metalist\n");
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+		       u16 metaid, u16 mlen, void *mdata)
+{
+	struct tcf_meta_info *e;
+
+	/* XXX: use hash to speed up */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (metaid == e->metaid) {
+			if (e->ops) {
+				/* We check for decode presence already */
+				return e->ops->decode(skb, mdata, mlen);
+			}
+		}
+	}
+
+	return 0;
+}
+
+struct ifeheadr {
+	__be16 metalen;
+	u8 tlv_data[];
+};
+
+struct meta_tlvhdr {
+	__be16 type;
+	__be16 len;
+};
+
+static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
+	u16 ifehdrln = ifehdr->metalen;
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	spin_unlock(&ife->tcf_lock);
+
+	ifehdrln = ntohs(ifehdrln);
+	if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+		spin_lock(&ife->tcf_lock);
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	skb_set_mac_header(skb, ifehdrln);
+	__skb_pull(skb, ifehdrln);
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	ifehdrln -= IFE_METAHDRLEN;
+
+	while (ifehdrln > 0) {
+		u8 *tlvdata = (u8 *)tlv;
+		u16 mtype = tlv->type;
+		u16 mlen = tlv->len;
+
+		mtype = ntohs(mtype);
+		mlen = ntohs(mlen);
+
+		if (find_decode_metaid(skb, ife, mtype, (mlen - 4),
+				       (void *)(tlvdata + 4))) {
+			/* abuse overlimits to count when we receive metadata
+			 * but dont have an ops for it
+			 */
+			pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
+					    mtype, mlen);
+			ife->tcf_qstats.overlimits++;
+		}
+
+		tlvdata += mlen;
+		ifehdrln -= mlen;
+		tlv = (struct meta_tlvhdr *)tlvdata;
+	}
+
+	skb_reset_network_header(skb);
+	return action;
+}
+
+/*XXX: check if we can do this at install time instead of current
+ * send data path
+**/
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e, *n;
+	int tot_run_sz = 0, run_sz = 0;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		if (e->ops->check_presence) {
+			run_sz = e->ops->check_presence(skb, e);
+			tot_run_sz += run_sz;
+		}
+	}
+
+	return tot_run_sz;
+}
+
+static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ethhdr *oethh;	/* outer ether header */
+	struct ethhdr *iethh;	/* inner eth header */
+	struct tcf_meta_info *e;
+	/*
+	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+	   where ORIGDATA = original ethernet header ...
+	 */
+	u16 metalen = ife_get_sz(skb, ife);
+	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
+	unsigned int skboff = skb->dev->hard_header_len;
+	u32 at = G_TC_AT(skb->tc_verd);
+	int new_len = skb->len + hdrm;
+	bool exceed_mtu = false;
+	int err;
+
+	if (at & AT_EGRESS) {
+		if (new_len > skb->dev->mtu)
+			exceed_mtu = true;
+	}
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+
+	if (!metalen) {		/* no metadata to send */
+		/* abuse overlimits to count when we allow packet
+		 * with no metadata
+		 */
+		ife->tcf_qstats.overlimits++;
+		spin_unlock(&ife->tcf_lock);
+		return action;
+	}
+	/* could be stupid policy setup or mtu config
+	 * so lets be conservative.. */
+	if ((action == TC_ACT_SHOT) || exceed_mtu) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	iethh = eth_hdr(skb);
+
+	err = skb_cow_head(skb, hdrm);
+	if (unlikely(err)) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	if (!(at & AT_EGRESS))
+		skb_push(skb, skb->dev->hard_header_len);
+
+	__skb_push(skb, hdrm);
+	memcpy(skb->data, iethh, skb->mac_len);
+	skb_reset_mac_header(skb);
+	oethh = eth_hdr(skb);
+
+	/*total metadata length */
+	metalen += IFE_METAHDRLEN;
+	metalen = htons(metalen);
+	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
+	skboff += IFE_METAHDRLEN;
+
+	/* XXX: we dont have a clever way of telling encode to
+	 * not repeat some of the computations that are done by
+	 * ops->presence_check...
+	 */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (e->ops->encode) {
+			err = e->ops->encode(skb, (void *)(skb->data + skboff),
+					     e);
+		}
+		if (err < 0) {
+			/* too corrupt to keep around if overwritten */
+			ife->tcf_qstats.drops++;
+			spin_unlock(&ife->tcf_lock);
+			return TC_ACT_SHOT;
+		}
+		skboff += err;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src))
+		ether_addr_copy(oethh->h_source, ife->eth_src);
+	else
+		ether_addr_copy(oethh->h_source, iethh->h_source);
+	if (!is_zero_ether_addr(ife->eth_dst))
+		ether_addr_copy(oethh->h_dest, ife->eth_dst);
+	else
+		ether_addr_copy(oethh->h_dest, iethh->h_dest);
+	oethh->h_proto = htons(ife->eth_type);
+
+	if (!(at & AT_EGRESS))
+		skb_pull(skb, skb->dev->hard_header_len);
+
+	spin_unlock(&ife->tcf_lock);
+
+	return action;
+}
+
+static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
+		       struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	if (ife->flags & IFE_ENCODE)
+		return tcf_ife_encode(skb, a, res);
+
+	if (!(ife->flags & IFE_ENCODE))
+		return tcf_ife_decode(skb, a, res);
+
+	pr_info_ratelimited("unknown failure(policy neither de/encode\n");
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	ife->tcf_qstats.drops++;
+	spin_unlock(&ife->tcf_lock);
+
+	return TC_ACT_SHOT;
+}
+
+static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ife_ops = {
+	.kind = "ife",
+	.type = TCA_ACT_IFE,
+	.owner = THIS_MODULE,
+	.act = tcf_ife_act,
+	.dump = tcf_ife_dump,
+	.cleanup = tcf_ife_cleanup,
+	.init = tcf_ife_init,
+	.walk = tcf_ife_walker,
+	.lookup = tcf_ife_search,
+};
+
+static __net_init int ife_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK);
+}
+
+static void __net_exit ife_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ife_net_ops = {
+	.init = ife_init_net,
+	.exit = ife_exit_net,
+	.id   = &ife_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init ife_init_module(void)
+{
+	return tcf_register_action(&act_ife_ops, &ife_net_ops);
+}
+
+static void __exit ife_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ife_ops, &ife_net_ops);
+}
+
+module_init(ife_init_module);
+module_exit(ife_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 8a6bf5da1aefdafd60b73d9122c7af9fd2d7bb9c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 1 Mar 2016 19:55:14 +0100
Subject: netfilter: nft_masq: support port range

Complete masquerading support by allowing port range selection.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_masq.h         |  4 ++-
 include/uapi/linux/netfilter/nf_tables.h |  4 +++
 net/ipv4/netfilter/nft_masq_ipv4.c       |  7 ++++-
 net/ipv6/netfilter/nft_masq_ipv6.c       |  7 ++++-
 net/netfilter/nft_masq.c                 | 51 +++++++++++++++++++++++++-------
 5 files changed, 59 insertions(+), 14 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h
index e2a518b60e19..a3f3c11b2526 100644
--- a/include/net/netfilter/nft_masq.h
+++ b/include/net/netfilter/nft_masq.h
@@ -2,7 +2,9 @@
 #define _NFT_MASQ_H_
 
 struct nft_masq {
-	u32	flags;
+	u32			flags;
+	enum nft_registers      sreg_proto_min:8;
+	enum nft_registers      sreg_proto_max:8;
 };
 
 extern const struct nla_policy nft_masq_policy[];
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index b19be0a098c0..eeffde196f80 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -951,10 +951,14 @@ enum nft_nat_attributes {
  * enum nft_masq_attributes - nf_tables masquerade expression attributes
  *
  * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32)
+ * @NFTA_MASQ_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_MASQ_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
  */
 enum nft_masq_attributes {
 	NFTA_MASQ_UNSPEC,
 	NFTA_MASQ_FLAGS,
+	NFTA_MASQ_REG_PROTO_MIN,
+	NFTA_MASQ_REG_PROTO_MAX,
 	__NFTA_MASQ_MAX
 };
 #define NFTA_MASQ_MAX		(__NFTA_MASQ_MAX - 1)
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index b72ffc58e255..51ced81b616c 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
 						    &range, pkt->out);
 }
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index cd1ac1637a05..9597ffb74077 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
 }
 
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9aea747b43ea..81b5ad6165ac 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -17,7 +17,9 @@
 #include <net/netfilter/nft_masq.h>
 
 const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
-	[NFTA_MASQ_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_MASQ_FLAGS]		= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MAX]	= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL_GPL(nft_masq_policy);
 
@@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
 		  const struct nft_expr *expr,
 		  const struct nlattr * const tb[])
 {
+	u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
 	struct nft_masq *priv = nft_expr_priv(expr);
 	int err;
 
@@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx,
 	if (err)
 		return err;
 
-	if (tb[NFTA_MASQ_FLAGS] == NULL)
-		return 0;
-
-	priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
-	if (priv->flags & ~NF_NAT_RANGE_MASK)
-		return -EINVAL;
+	if (tb[NFTA_MASQ_FLAGS]) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
+		if (priv->flags & ~NF_NAT_RANGE_MASK)
+			return -EINVAL;
+	}
+
+	if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
+		priv->sreg_proto_min =
+			nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
+
+		err = nft_validate_register_load(priv->sreg_proto_min, plen);
+		if (err < 0)
+			return err;
+
+		if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
+			priv->sreg_proto_max =
+				nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
+
+			err = nft_validate_register_load(priv->sreg_proto_max,
+							 plen);
+			if (err < 0)
+				return err;
+		} else {
+			priv->sreg_proto_max = priv->sreg_proto_min;
+		}
+	}
 
 	return 0;
 }
@@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_masq *priv = nft_expr_priv(expr);
 
-	if (priv->flags == 0)
-		return 0;
-
-	if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
+	if (priv->flags != 0 &&
+	    nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
 		goto nla_put_failure;
 
+	if (priv->sreg_proto_min) {
+		if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN,
+				      priv->sreg_proto_min) ||
+		    nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX,
+				      priv->sreg_proto_max))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3


From b5d3755a22e0cc4c369c0985aef0c52c2477c1e7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 4 Mar 2016 11:52:16 +0100
Subject: uapi: define DIV_ROUND_UP for userland

DIV_ROUND_UP is defined in linux/kernel.h only for the kernel.
When ethtool.h is included by a userland app, we got the following error:

include/linux/ethtool.h:1218:8: error: variably modified 'queue_mask' at file scope
  __u32 queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
        ^

Let's add a common definition in uapi and use it everywhere.

Fixes: ac2c7ad0e5d6 ("net/ethtool: introduce a new ioctl for per queue setting")
CC: Kan Liang <kan.liang@intel.com>
Suggested-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h       | 2 +-
 include/uapi/linux/ethtool.h | 3 ++-
 include/uapi/linux/kernel.h  | 1 +
 include/uapi/linux/mroute6.h | 9 ++-------
 4 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f31638c6e873..ac1923957236 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -64,7 +64,7 @@
 #define round_down(x, y) ((x) & ~__round_mask(x, y))
 
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
 #define DIV_ROUND_UP_ULL(ll,d) \
 	({ unsigned long long _tmp = (ll)+(d)-1; do_div(_tmp, d); _tmp; })
 
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 37fd6dc33de4..9c22249ebf35 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -13,6 +13,7 @@
 #ifndef _UAPI_LINUX_ETHTOOL_H
 #define _UAPI_LINUX_ETHTOOL_H
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
@@ -1215,7 +1216,7 @@ enum ethtool_sfeatures_retval_bits {
 struct ethtool_per_queue_op {
 	__u32	cmd;
 	__u32	sub_command;
-	__u32	queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+	__u32	queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
 	char	data[];
 };
 
diff --git a/include/uapi/linux/kernel.h b/include/uapi/linux/kernel.h
index 321e399457f5..466073f0ce46 100644
--- a/include/uapi/linux/kernel.h
+++ b/include/uapi/linux/kernel.h
@@ -9,5 +9,6 @@
 #define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
 #define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
+#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 
 #endif /* _UAPI_LINUX_KERNEL_H */
diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h
index ce91215cf7e6..5062fb5751e1 100644
--- a/include/uapi/linux/mroute6.h
+++ b/include/uapi/linux/mroute6.h
@@ -1,6 +1,7 @@
 #ifndef _UAPI__LINUX_MROUTE6_H
 #define _UAPI__LINUX_MROUTE6_H
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/sockios.h>
 
@@ -46,14 +47,8 @@ typedef unsigned short mifi_t;
 typedef	__u32		if_mask;
 #define NIFBITS (sizeof(if_mask) * 8)        /* bits per mask */
 
-#if !defined(__KERNEL__)
-#if !defined(DIV_ROUND_UP)
-#define	DIV_ROUND_UP(x,y)	(((x) + ((y) - 1)) / (y))
-#endif
-#endif
-
 typedef struct if_set {
-	if_mask ifs_bits[DIV_ROUND_UP(IF_SETSIZE, NIFBITS)];
+	if_mask ifs_bits[__KERNEL_DIV_ROUND_UP(IF_SETSIZE, NIFBITS)];
 } if_set;
 
 #define IF_SET(n, p)    ((p)->ifs_bits[(n)/NIFBITS] |= (1 << ((n) % NIFBITS)))
-- 
cgit v1.2.3


From 14e2037902d65213842b4e40305ff54a64abbcb6 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 4 Mar 2016 11:52:19 +0100
Subject: ethtool.h: define INT_MAX for userland

INT_MAX needs limits.h in userland.
When ethtool.h is included by a userland app, we got the following error:

.../usr/include/linux/ethtool.h: In function 'ethtool_validate_speed':
.../usr/include/linux/ethtool.h:1471:18: error: 'INT_MAX' undeclared (first use in this function)
  return speed <= INT_MAX || speed == SPEED_UNKNOWN
                  ^

Fixes: e02564ee334a ("ethtool: make validate_speed accept all speeds between 0 and INT_MAX")
CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9c22249ebf35..2835b07416b7 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -17,6 +17,10 @@
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
+#ifndef __KERNEL__
+#include <limits.h> /* for INT_MAX */
+#endif
+
 /* All structures exposed to userland should be defined such that they
  * have the same layout for 32-bit and 64-bit userland.
  */
-- 
cgit v1.2.3


From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:03 +0100
Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash

When overwriting parts of the packet with bpf_skb_store_bytes() that
were fed previously into skb->hash calculation, we should clear the
current hash with skb_clear_hash(), so that a next skb_get_hash() call
can determine the correct hash related to this skb.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ee2193287cbe..2e3e90309904 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -305,6 +305,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_store_bytes flags. */
 #define BPF_F_RECOMPUTE_CSUM		(1ULL << 0)
+#define BPF_F_INVALIDATE_HASH		(1ULL << 1)
 
 /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
  * First 4 bits are for passing the header field size.
diff --git a/net/core/filter.c b/net/core/filter.c
index 356a251657a5..a1fe246a6147 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	unsigned int len = (unsigned int) r4;
 	void *ptr;
 
-	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
 		return -EINVAL;
 
 	/* bpf verifier guarantees that:
@@ -1384,6 +1384,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 
 	if (flags & BPF_F_RECOMPUTE_CSUM)
 		skb_postpush_rcsum(skb, ptr, len);
+	if (flags & BPF_F_INVALIDATE_HASH)
+		skb_clear_hash(skb);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:05 +0100
Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key

Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to
feed df flag into tunneling facilities (currently supported on TX by
vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key()
helper.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e3e90309904..21ee6d52016f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -330,6 +330,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
+#define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
diff --git a/net/core/filter.c b/net/core/filter.c
index ce4e18dd2c89..6c9d15561d04 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1819,7 +1819,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	u8 compat[sizeof(struct bpf_tunnel_key)];
 	struct ip_tunnel_info *info;
 
-	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
+	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+			       BPF_F_DONT_FRAGMENT)))
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
@@ -1844,6 +1845,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	info->mode = IP_TUNNEL_INFO_TX;
 
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+	if (flags & BPF_F_DONT_FRAGMENT)
+		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
 	info->key.ttl = from->tunnel_ttl;
-- 
cgit v1.2.3


From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:06 +0100
Subject: bpf: support for access to tunnel options

After eBPF being able to programmatically access/manage tunnel key meta
data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata")
and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6
for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary
helpers to generically access their auxiliary tunnel options.

Geneve and vxlan support this facility. For geneve, TLVs can be pushed,
and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve
case only makes sense, if we can also read/write TLVs into it. In the GBP
case, it provides the flexibility to easily map the group policy ID in
combination with other helpers or maps.

I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(),
for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather
complex by itself, and there may be cases for tunnel key backends where
tunnel options are not always needed. If we would have integrated this
into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with
remaining helper arguments, so keeping compatibility on structs in case of
passing in a flat buffer gets more cumbersome. Separating both also allows
for more flexibility and future extensibility, f.e. options could be fed
directly from a map, etc.

Moreover, change geneve's xmit path to test only for info->options_len
instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's
xmit path and allows for avoiding to specify a protocol flag in the API on
xmit, so it can be protocol agnostic. Having info->options_len is enough
information that is needed. Tested with vxlan and geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  4 +--
 include/uapi/linux/bpf.h | 11 +++++++
 net/core/filter.c        | 83 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 90 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index bc5da357e16d..36db4cf0579c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -940,7 +940,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
@@ -1027,7 +1027,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 21ee6d52016f..9221f653fee3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -298,6 +298,17 @@ enum bpf_func_id {
 	 * Return: csum result
 	 */
 	BPF_FUNC_csum_diff,
+
+	/**
+	 * bpf_skb_[gs]et_tunnel_opt(skb, opt, size)
+	 * retrieve or populate tunnel options metadata
+	 * @skb: pointer to skb
+	 * @opt: pointer to raw tunnel option data
+	 * @size: size of @opt
+	 * Return: 0 on success for set, option size for get
+	 */
+	BPF_FUNC_skb_get_tunnel_opt,
+	BPF_FUNC_skb_set_tunnel_opt,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6c9d15561d04..012a10c2da94 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1809,6 +1809,32 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *to = (u8 *) (long) r2;
+	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!info ||
+		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
+		return -ENOENT;
+	if (unlikely(size < info->options_len))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_get(to, info);
+
+	return info->options_len;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+	.func		= bpf_skb_get_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
 static struct metadata_dst __percpu *md_dst;
 
 static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1875,17 +1901,58 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+#define BPF_TUNLEN_MAX	255
+
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *from = (u8 *) (long) r2;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+		return -EINVAL;
+	if (unlikely(size > BPF_TUNLEN_MAX))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_set(info, from, size);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+	.func		= bpf_skb_set_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
 	if (!md_dst) {
-		/* race is not possible, since it's called from
-		 * verifier that is holding verifier mutex
+		BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info,
+					  options_len) != 1);
+
+		/* Race is not possible, since it's called from verifier
+		 * that is holding verifier mutex.
 		 */
-		md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+		md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX,
+						   GFP_KERNEL);
 		if (!md_dst)
 			return NULL;
 	}
-	return &bpf_skb_set_tunnel_key_proto;
+
+	switch (which) {
+	case BPF_FUNC_skb_set_tunnel_key:
+		return &bpf_skb_set_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return &bpf_skb_set_tunnel_opt_proto;
+	default:
+		return NULL;
+	}
 }
 
 static const struct bpf_func_proto *
@@ -1939,7 +2006,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_get_tunnel_key:
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
-		return bpf_get_skb_set_tunnel_key_proto();
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
 	case BPF_FUNC_redirect:
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
-- 
cgit v1.2.3


From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 7 Mar 2016 21:57:15 -0800
Subject: bpf: pre-allocate hash map elements

If kprobe is placed on spin_unlock then calling kmalloc/kfree from
bpf programs is not safe, since the following dead lock is possible:
kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe->
bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock)
and deadlocks.

The following solutions were considered and some implemented, but
eventually discarded
- kmem_cache_create for every map
- add recursion check to slow-path of slub
- use reserved memory in bpf_map_update for in_irq or in preempt_disabled
- kmalloc via irq_work

At the end pre-allocation of all map elements turned out to be the simplest
solution and since the user is charged upfront for all the memory, such
pre-allocation doesn't affect the user space visible behavior.

Since it's impossible to tell whether kprobe is triggered in a safe
location from kmalloc point of view, use pre-allocation by default
and introduce new BPF_F_NO_PREALLOC flag.

While testing of per-cpu hash maps it was discovered
that alloc_percpu(GFP_ATOMIC) has odd corner cases and often
fails to allocate memory even when 90% of it is free.
The pre-allocation of per-cpu hash elements solves this problem as well.

Turned out that bpf_map_update() quickly followed by
bpf_map_lookup()+bpf_map_delete() is very common pattern used
in many of iovisor/bcc/tools, so there is additional benefit of
pre-allocation, since such use cases are must faster.

Since all hash map elements are now pre-allocated we can remove
atomic increment of htab->count and save few more cycles.

Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid
large malloc/free done by users who don't have sufficient limits.

Pre-allocation is done with vmalloc and alloc/free is done
via percpu_freelist. Here are performance numbers for different
pre-allocation algorithms that were implemented, but discarded
in favor of percpu_freelist:

1 cpu:
pcpu_ida	2.1M
pcpu_ida nolock	2.3M
bt		2.4M
kmalloc		1.8M
hlist+spinlock	2.3M
pcpu_freelist	2.6M

4 cpu:
pcpu_ida	1.5M
pcpu_ida nolock	1.8M
bt w/smp_align	1.7M
bt no/smp_align	1.1M
kmalloc		0.7M
hlist+spinlock	0.2M
pcpu_freelist	2.0M

8 cpu:
pcpu_ida	0.7M
bt w/smp_align	0.8M
kmalloc		0.4M
pcpu_freelist	1.5M

32 cpu:
kmalloc		0.13M
pcpu_freelist	0.49M

pcpu_ida nolock is a modified percpu_ida algorithm without
percpu_ida_cpu locks and without cross-cpu tag stealing.
It's faster than existing percpu_ida, but not as fast as pcpu_freelist.

bt is a variant of block/blk-mq-tag.c simlified and customized
for bpf use case. bt w/smp_align is using cache line for every 'long'
(similar to blk-mq-tag). bt no/smp_align allocates 'long'
bitmasks continuously to save memory. It's comparable to percpu_ida
and in some cases faster, but slower than percpu_freelist

hlist+spinlock is the simplest free list with single spinlock.
As expeceted it has very bad scaling in SMP.

kmalloc is existing implementation which is still available via
BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and
in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist,
but saves memory, so in cases where map->max_entries can be large
and number of map update/delete per second is low, it may make
sense to use it.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   2 +
 include/uapi/linux/bpf.h |   3 +
 kernel/bpf/hashtab.c     | 240 +++++++++++++++++++++++++++++++++--------------
 kernel/bpf/syscall.c     |  15 ++-
 4 files changed, 186 insertions(+), 74 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4b070827200d..efd1d4ca95c6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,7 @@ struct bpf_map {
 	u32 key_size;
 	u32 value_size;
 	u32 max_entries;
+	u32 map_flags;
 	u32 pages;
 	struct user_struct *user;
 	const struct bpf_map_ops *ops;
@@ -178,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
+int bpf_map_precharge_memlock(u32 pages);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9221f653fee3..0e30b19012a5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,12 +101,15 @@ enum bpf_prog_type {
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
 
+#define BPF_F_NO_PREALLOC	(1U << 0)
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
 		__u32	key_size;	/* size of key in bytes */
 		__u32	value_size;	/* size of value in bytes */
 		__u32	max_entries;	/* max number of entries in a map */
+		__u32	map_flags;	/* prealloc or not */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a68e95133fcd..fff3650d52fc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -13,6 +14,7 @@
 #include <linux/jhash.h>
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
+#include "percpu_freelist.h"
 
 struct bucket {
 	struct hlist_head head;
@@ -22,6 +24,8 @@ struct bucket {
 struct bpf_htab {
 	struct bpf_map map;
 	struct bucket *buckets;
+	void *elems;
+	struct pcpu_freelist freelist;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
@@ -29,15 +33,86 @@ struct bpf_htab {
 
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
-	struct hlist_node hash_node;
-	struct rcu_head rcu;
 	union {
-		u32 hash;
-		u32 key_size;
+		struct hlist_node hash_node;
+		struct bpf_htab *htab;
+		struct pcpu_freelist_node fnode;
 	};
+	struct rcu_head rcu;
+	u32 hash;
 	char key[0] __aligned(8);
 };
 
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+				     void __percpu *pptr)
+{
+	*(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+	return *(void __percpu **)(l->key + key_size);
+}
+
+static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
+{
+	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+}
+
+static void htab_free_elems(struct bpf_htab *htab)
+{
+	int i;
+
+	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+		goto free_elems;
+
+	for (i = 0; i < htab->map.max_entries; i++) {
+		void __percpu *pptr;
+
+		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
+					 htab->map.key_size);
+		free_percpu(pptr);
+	}
+free_elems:
+	vfree(htab->elems);
+}
+
+static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+{
+	int err = -ENOMEM, i;
+
+	htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+	if (!htab->elems)
+		return -ENOMEM;
+
+	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+		goto skip_percpu_elems;
+
+	for (i = 0; i < htab->map.max_entries; i++) {
+		u32 size = round_up(htab->map.value_size, 8);
+		void __percpu *pptr;
+
+		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+		if (!pptr)
+			goto free_elems;
+		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
+				  pptr);
+	}
+
+skip_percpu_elems:
+	err = pcpu_freelist_init(&htab->freelist);
+	if (err)
+		goto free_elems;
+
+	pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
+			       htab->map.max_entries);
+	return 0;
+
+free_elems:
+	htab_free_elems(htab);
+	return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	int err, i;
 	u64 cost;
 
+	if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+		/* reserved bits should not be used */
+		return ERR_PTR(-EINVAL);
+
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
@@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	htab->map.key_size = attr->key_size;
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
+	htab->map.map_flags = attr->map_flags;
 
 	/* check sanity of attributes.
 	 * value_size == 0 may be allowed in the future to use map as a set
@@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		htab->elem_size += sizeof(void *);
 	else
-		htab->elem_size += htab->map.value_size;
+		htab->elem_size += round_up(htab->map.value_size, 8);
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
@@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(htab->map.pages);
+	if (err)
+		goto free_htab;
+
 	err = -ENOMEM;
 	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
 				      GFP_USER | __GFP_NOWARN);
@@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	atomic_set(&htab->count, 0);
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+		err = prealloc_elems_and_freelist(htab);
+		if (err)
+			goto free_buckets;
+	}
 
 	return &htab->map;
 
+free_buckets:
+	kvfree(htab->buckets);
 free_htab:
 	kfree(htab);
 	return ERR_PTR(err);
@@ -249,42 +340,42 @@ find_first_elem:
 		}
 	}
 
-	/* itereated over all buckets and all elements */
+	/* iterated over all buckets and all elements */
 	return -ENOENT;
 }
 
-
-static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
-				     void __percpu *pptr)
-{
-	*(void __percpu **)(l->key + key_size) = pptr;
-}
-
-static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
-{
-	return *(void __percpu **)(l->key + key_size);
-}
-
-static void htab_percpu_elem_free(struct htab_elem *l)
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
-	free_percpu(htab_elem_get_ptr(l, l->key_size));
+	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
+
 }
 
-static void htab_percpu_elem_free_rcu(struct rcu_head *head)
+static void htab_elem_free_rcu(struct rcu_head *head)
 {
 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+	struct bpf_htab *htab = l->htab;
 
-	htab_percpu_elem_free(l);
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+	 * we're calling kfree, otherwise deadlock is possible if kprobes
+	 * are placed somewhere inside of slub
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	htab_elem_free(htab, l);
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
 }
 
-static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
-	if (percpu) {
-		l->key_size = key_size;
-		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
+	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
-		kfree_rcu(l, rcu);
+		atomic_dec(&htab->count);
+		l->htab = htab;
+		call_rcu(&l->rcu, htab_elem_free_rcu);
 	}
 }
 
@@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 bool percpu, bool onallcpus)
 {
 	u32 size = htab->map.value_size;
+	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
 
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-	if (!l_new)
-		return NULL;
+	if (prealloc) {
+		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
+		if (!l_new)
+			return ERR_PTR(-E2BIG);
+	} else {
+		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!l_new)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	memcpy(l_new->key, key, key_size);
 	if (percpu) {
 		/* round up value_size to 8 bytes */
 		size = round_up(size, 8);
 
-		/* alloc_percpu zero-fills */
-		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
-		if (!pptr) {
-			kfree(l_new);
-			return NULL;
+		if (prealloc) {
+			pptr = htab_elem_get_ptr(l_new, key_size);
+		} else {
+			/* alloc_percpu zero-fills */
+			pptr = __alloc_percpu_gfp(size, 8,
+						  GFP_ATOMIC | __GFP_NOWARN);
+			if (!pptr) {
+				kfree(l_new);
+				return ERR_PTR(-ENOMEM);
+			}
 		}
 
 		if (!onallcpus) {
@@ -324,7 +431,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 				off += size;
 			}
 		}
-		htab_elem_set_ptr(l_new, key_size, pptr);
+		if (!prealloc)
+			htab_elem_set_ptr(l_new, key_size, pptr);
 	} else {
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
 	}
@@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 		       u64 map_flags)
 {
-	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		return -E2BIG;
-
 	if (l_old && map_flags == BPF_NOEXIST)
 		/* elem already exists */
 		return -EEXIST;
@@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	hash = htab_map_hash(key, key_size);
 
-	/* allocate new element outside of the lock, since
-	 * we're most likley going to insert it
-	 */
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
-	if (!l_new)
-		return -ENOMEM;
-
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
@@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	if (IS_ERR(l_new)) {
+		/* all pre-allocated elements are in use or memory exhausted */
+		ret = PTR_ERR(l_new);
+		goto err;
+	}
+
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
 	hlist_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		hlist_del_rcu(&l_old->hash_node);
-		kfree_rcu(l_old, rcu);
-	} else {
-		atomic_inc(&htab->count);
+		free_htab_elem(htab, l_old);
 	}
-	raw_spin_unlock_irqrestore(&b->lock, flags);
-	return 0;
+	ret = 0;
 err:
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-	kfree(l_new);
 	return ret;
 }
 
@@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
 					hash, true, onallcpus);
-		if (!l_new) {
-			ret = -ENOMEM;
+		if (IS_ERR(l_new)) {
+			ret = PTR_ERR(l_new);
 			goto err;
 		}
 		hlist_add_head_rcu(&l_new->hash_node, head);
-		atomic_inc(&htab->count);
 	}
 	ret = 0;
 err:
@@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct hlist_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
@@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
-		atomic_dec(&htab->count);
-		free_htab_elem(l, percpu, key_size);
+		free_htab_elem(htab, l);
 		ret = 0;
 	}
 
@@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab)
 
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
-			atomic_dec(&htab->count);
-			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
-				l->key_size = htab->map.key_size;
-				htab_percpu_elem_free(l);
-			} else {
-				kfree(l);
-			}
+			htab_elem_free(htab, l);
 		}
 	}
 }
-
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void htab_map_free(struct bpf_map *map)
 {
@@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();
 
-	/* some of kfree_rcu() callbacks for elements of this map may not have
-	 * executed. It's ok. Proceed to free residual elements and map itself
+	/* some of free_htab_elem() callbacks for elements of this map may
+	 * not have executed. Wait for them.
 	 */
-	delete_all_elements(htab);
+	rcu_barrier();
+	if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+		delete_all_elements(htab);
+	} else {
+		htab_free_elems(htab);
+		pcpu_freelist_destroy(&htab->freelist);
+	}
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dc99f6a000f5..cbd94b2144ff 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -48,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
 	list_add(&tl->list_node, &bpf_map_types);
 }
 
+int bpf_map_precharge_memlock(u32 pages)
+{
+	struct user_struct *user = get_current_user();
+	unsigned long memlock_limit, cur;
+
+	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	cur = atomic_long_read(&user->locked_vm);
+	free_uid(user);
+	if (cur + pages > memlock_limit)
+		return -EPERM;
+	return 0;
+}
+
 static int bpf_map_charge_memlock(struct bpf_map *map)
 {
 	struct user_struct *user = get_current_user();
@@ -153,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
-#define BPF_MAP_CREATE_LAST_FIELD max_entries
+#define BPF_MAP_CREATE_LAST_FIELD map_flags
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
-- 
cgit v1.2.3


From ab7ac4eb9832e32a09f4e8042705484d2fb0aad3 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:06 -0800
Subject: kcm: Kernel Connection Multiplexor module

This module implements the Kernel Connection Multiplexor.

Kernel Connection Multiplexor (KCM) is a facility that provides a
message based interface over TCP for generic application protocols.
With KCM an application can efficiently send and receive application
protocol messages over TCP using datagram sockets.

For more information see the included Documentation/networking/kcm.txt

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h   |    6 +-
 include/net/kcm.h        |  125 +++
 include/uapi/linux/kcm.h |   40 +
 net/Kconfig              |    1 +
 net/Makefile             |    1 +
 net/kcm/Kconfig          |   10 +
 net/kcm/Makefile         |    3 +
 net/kcm/kcmsock.c        | 2016 ++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 2201 insertions(+), 1 deletion(-)
 create mode 100644 include/net/kcm.h
 create mode 100644 include/uapi/linux/kcm.h
 create mode 100644 net/kcm/Kconfig
 create mode 100644 net/kcm/Makefile
 create mode 100644 net/kcm/kcmsock.c

(limited to 'include/uapi')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index d834af22a460..73bf6c6a833b 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -200,7 +200,9 @@ struct ucred {
 #define AF_ALG		38	/* Algorithm sockets		*/
 #define AF_NFC		39	/* NFC sockets			*/
 #define AF_VSOCK	40	/* vSockets			*/
-#define AF_MAX		41	/* For now.. */
+#define AF_KCM		41	/* Kernel Connection Multiplexor*/
+
+#define AF_MAX		42	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -246,6 +248,7 @@ struct ucred {
 #define PF_ALG		AF_ALG
 #define PF_NFC		AF_NFC
 #define PF_VSOCK	AF_VSOCK
+#define PF_KCM		AF_KCM
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -323,6 +326,7 @@ struct ucred {
 #define SOL_CAIF	278
 #define SOL_ALG		279
 #define SOL_NFC		280
+#define SOL_KCM		281
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/net/kcm.h b/include/net/kcm.h
new file mode 100644
index 000000000000..1bcae39070ec
--- /dev/null
+++ b/include/net/kcm.h
@@ -0,0 +1,125 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __NET_KCM_H_
+#define __NET_KCM_H_
+
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <uapi/linux/kcm.h>
+
+extern unsigned int kcm_net_id;
+
+struct kcm_tx_msg {
+	unsigned int sent;
+	unsigned int fragidx;
+	unsigned int frag_offset;
+	unsigned int msg_flags;
+	struct sk_buff *frag_skb;
+	struct sk_buff *last_skb;
+};
+
+struct kcm_rx_msg {
+	int full_len;
+	int accum_len;
+	int offset;
+};
+
+/* Socket structure for KCM client sockets */
+struct kcm_sock {
+	struct sock sk;
+	struct kcm_mux *mux;
+	struct list_head kcm_sock_list;
+	int index;
+	u32 done : 1;
+	struct work_struct done_work;
+
+	/* Transmit */
+	struct kcm_psock *tx_psock;
+	struct work_struct tx_work;
+	struct list_head wait_psock_list;
+	struct sk_buff *seq_skb;
+
+	/* Don't use bit fields here, these are set under different locks */
+	bool tx_wait;
+	bool tx_wait_more;
+
+	/* Receive */
+	struct kcm_psock *rx_psock;
+	struct list_head wait_rx_list; /* KCMs waiting for receiving */
+	bool rx_wait;
+	u32 rx_disabled : 1;
+};
+
+struct bpf_prog;
+
+/* Structure for an attached lower socket */
+struct kcm_psock {
+	struct sock *sk;
+	struct kcm_mux *mux;
+	int index;
+
+	u32 tx_stopped : 1;
+	u32 rx_stopped : 1;
+	u32 done : 1;
+	u32 unattaching : 1;
+
+	void (*save_state_change)(struct sock *sk);
+	void (*save_data_ready)(struct sock *sk);
+	void (*save_write_space)(struct sock *sk);
+
+	struct list_head psock_list;
+
+	/* Receive */
+	struct sk_buff *rx_skb_head;
+	struct sk_buff **rx_skb_nextp;
+	struct sk_buff *ready_rx_msg;
+	struct list_head psock_ready_list;
+	struct work_struct rx_work;
+	struct delayed_work rx_delayed_work;
+	struct bpf_prog *bpf_prog;
+	struct kcm_sock *rx_kcm;
+
+	/* Transmit */
+	struct kcm_sock *tx_kcm;
+	struct list_head psock_avail_list;
+};
+
+/* Per net MUX list */
+struct kcm_net {
+	struct mutex mutex;
+	struct list_head mux_list;
+	int count;
+};
+
+/* Structure for a MUX */
+struct kcm_mux {
+	struct list_head kcm_mux_list;
+	struct rcu_head rcu;
+	struct kcm_net *knet;
+
+	struct list_head kcm_socks;	/* All KCM sockets on MUX */
+	int kcm_socks_cnt;		/* Total KCM socket count for MUX */
+	struct list_head psocks;	/* List of all psocks on MUX */
+	int psocks_cnt;		/* Total attached sockets */
+
+	/* Receive */
+	spinlock_t rx_lock ____cacheline_aligned_in_smp;
+	struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */
+	struct list_head psocks_ready;	/* List of psocks with a msg ready */
+	struct sk_buff_head rx_hold_queue;
+
+	/* Transmit */
+	spinlock_t  lock ____cacheline_aligned_in_smp;	/* TX and mux locking */
+	struct list_head psocks_avail;	/* List of available psocks */
+	struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */
+};
+
+#endif /* __NET_KCM_H_ */
diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h
new file mode 100644
index 000000000000..a5a530940b99
--- /dev/null
+++ b/include/uapi/linux/kcm.h
@@ -0,0 +1,40 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * User API to clone KCM sockets and attach transport socket to a KCM
+ * multiplexor.
+ */
+
+#ifndef KCM_KERNEL_H
+#define KCM_KERNEL_H
+
+struct kcm_attach {
+	int fd;
+	int bpf_fd;
+};
+
+struct kcm_unattach {
+	int fd;
+};
+
+struct kcm_clone {
+	int fd;
+};
+
+#define SIOCKCMATTACH	(SIOCPROTOPRIVATE + 0)
+#define SIOCKCMUNATTACH	(SIOCPROTOPRIVATE + 1)
+#define SIOCKCMCLONE	(SIOCPROTOPRIVATE + 2)
+
+#define KCMPROTO_CONNECTED	0
+
+/* Socket options */
+#define KCM_RECV_DISABLE	1
+
+#endif
+
diff --git a/net/Kconfig b/net/Kconfig
index 2760825e53fa..10640d5f8bee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -360,6 +360,7 @@ source "net/can/Kconfig"
 source "net/irda/Kconfig"
 source "net/bluetooth/Kconfig"
 source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
 
 config FIB_RULES
 	bool
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..81d14119eab5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA)		+= irda/
 obj-$(CONFIG_BT)		+= bluetooth/
 obj-$(CONFIG_SUNRPC)		+= sunrpc/
 obj-$(CONFIG_AF_RXRPC)		+= rxrpc/
+obj-$(CONFIG_AF_KCM)		+= kcm/
 obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_L2TP)		+= l2tp/
 obj-$(CONFIG_DECNET)		+= decnet/
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
new file mode 100644
index 000000000000..5db94d940ecc
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@
+
+config AF_KCM
+	tristate "KCM sockets"
+	depends on INET
+	select BPF_SYSCALL
+	---help---
+	  KCM (Kernel Connection Multiplexor) sockets provide a method
+	  for multiplexing messages of a message based application
+	  protocol over kernel connectons (e.g. TCP connections).
+
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
new file mode 100644
index 000000000000..cb525f7c5a13
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_AF_KCM) += kcm.o
+
+kcm-y := kcmsock.o
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..30ef69ac6b81
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2016 @@
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/kcm.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <uapi/linux/kcm.h>
+
+unsigned int kcm_net_id;
+
+static struct kmem_cache *kcm_psockp __read_mostly;
+static struct kmem_cache *kcm_muxp __read_mostly;
+static struct workqueue_struct *kcm_wq;
+
+static inline struct kcm_sock *kcm_sk(const struct sock *sk)
+{
+	return (struct kcm_sock *)sk;
+}
+
+static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_tx_msg *)skb->cb;
+}
+
+static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_rx_msg *)((void *)skb->cb +
+				     offsetof(struct qdisc_skb_cb, data));
+}
+
+static void report_csk_error(struct sock *csk, int err)
+{
+	csk->sk_err = EPIPE;
+	csk->sk_error_report(csk);
+}
+
+/* Callback lock held */
+static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
+			       struct sk_buff *skb)
+{
+	struct sock *csk = psock->sk;
+
+	/* Unrecoverable error in receive */
+
+	if (psock->rx_stopped)
+		return;
+
+	psock->rx_stopped = 1;
+
+	/* Report an error on the lower socket */
+	report_csk_error(csk, err);
+}
+
+static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
+			       bool wakeup_kcm)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Unrecoverable error in transmit */
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_stopped) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	psock->tx_stopped = 1;
+
+	if (!psock->tx_kcm) {
+		/* Take off psocks_avail list */
+		list_del(&psock->psock_avail_list);
+	} else if (wakeup_kcm) {
+		/* In this case psock is being aborted while outside of
+		 * write_msgs and psock is reserved. Schedule tx_work
+		 * to handle the failure there. Need to commit tx_stopped
+		 * before queuing work.
+		 */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	/* Report error on lower socket */
+	report_csk_error(csk, err);
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* KCM is ready to receive messages on its queue-- either the KCM is new or
+ * has become unblocked after being blocked on full socket buffer. Queue any
+ * pending ready messages on a psock. RX mux lock held.
+ */
+static void kcm_rcv_ready(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+	struct sk_buff *skb;
+
+	if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
+		return;
+
+	while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Assuming buffer limit has been reached */
+			skb_queue_head(&mux->rx_hold_queue, skb);
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+	}
+
+	while (!list_empty(&mux->psocks_ready)) {
+		psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
+					 psock_ready_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
+			/* Assuming buffer limit has been reached */
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+
+		/* Consumed the ready message on the psock. Schedule rx_work to
+		 * get more messages.
+		 */
+		list_del(&psock->psock_ready_list);
+		psock->ready_rx_msg = NULL;
+
+		/* Commit clearing of ready_rx_msg for queuing work */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->rx_work);
+	}
+
+	/* Buffer limit is okay now, add to ready list */
+	list_add_tail(&kcm->wait_rx_list,
+		      &kcm->mux->kcm_rx_waiters);
+	kcm->rx_wait = true;
+}
+
+static void kcm_rfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct kcm_mux *mux = kcm->mux;
+	unsigned int len = skb->truesize;
+
+	sk_mem_uncharge(sk, len);
+	atomic_sub(len, &sk->sk_rmem_alloc);
+
+	/* For reading rx_wait and rx_psock without holding lock */
+	smp_mb__after_atomic();
+
+	if (!kcm->rx_wait && !kcm->rx_psock &&
+	    sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
+		spin_lock_bh(&mux->rx_lock);
+		kcm_rcv_ready(kcm);
+		spin_unlock_bh(&mux->rx_lock);
+	}
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff_head *list = &sk->sk_receive_queue;
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return -ENOBUFS;
+
+	skb->dev = NULL;
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = kcm_rfree;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, skb->truesize);
+
+	skb_queue_tail(list, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk);
+
+	return 0;
+}
+
+/* Requeue received messages for a kcm socket to other kcm sockets. This is
+ * called with a kcm socket is receive disabled.
+ * RX mux lock held.
+ */
+static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
+{
+	struct sk_buff *skb;
+	struct kcm_sock *kcm;
+
+	while ((skb = __skb_dequeue(head))) {
+		/* Reset destructor to avoid calling kcm_rcv_ready */
+		skb->destructor = sock_rfree;
+		skb_orphan(skb);
+try_again:
+		if (list_empty(&mux->kcm_rx_waiters)) {
+			skb_queue_tail(&mux->rx_hold_queue, skb);
+			continue;
+		}
+
+		kcm = list_first_entry(&mux->kcm_rx_waiters,
+				       struct kcm_sock, wait_rx_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Should mean socket buffer full */
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+
+			/* Commit rx_wait to read in kcm_free */
+			smp_wmb();
+
+			goto try_again;
+		}
+	}
+}
+
+/* Lower sock lock held */
+static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
+				       struct sk_buff *head)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	WARN_ON(psock->ready_rx_msg);
+
+	if (psock->rx_kcm)
+		return psock->rx_kcm;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	if (psock->rx_kcm) {
+		spin_unlock_bh(&mux->rx_lock);
+		return psock->rx_kcm;
+	}
+
+	if (list_empty(&mux->kcm_rx_waiters)) {
+		psock->ready_rx_msg = head;
+		list_add_tail(&psock->psock_ready_list,
+			      &mux->psocks_ready);
+		spin_unlock_bh(&mux->rx_lock);
+		return NULL;
+	}
+
+	kcm = list_first_entry(&mux->kcm_rx_waiters,
+			       struct kcm_sock, wait_rx_list);
+	list_del(&kcm->wait_rx_list);
+	kcm->rx_wait = false;
+
+	psock->rx_kcm = kcm;
+	kcm->rx_psock = psock;
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	return kcm;
+}
+
+static void kcm_done(struct kcm_sock *kcm);
+
+static void kcm_done_work(struct work_struct *w)
+{
+	kcm_done(container_of(w, struct kcm_sock, done_work));
+}
+
+/* Lower sock held */
+static void unreserve_rx_kcm(struct kcm_psock *psock,
+			     bool rcv_ready)
+{
+	struct kcm_sock *kcm = psock->rx_kcm;
+	struct kcm_mux *mux = psock->mux;
+
+	if (!kcm)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	psock->rx_kcm = NULL;
+	kcm->rx_psock = NULL;
+
+	/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
+	 * kcm_rfree
+	 */
+	smp_mb();
+
+	if (unlikely(kcm->done)) {
+		spin_unlock_bh(&mux->rx_lock);
+
+		/* Need to run kcm_done in a task since we need to qcquire
+		 * callback locks which may already be held here.
+		 */
+		INIT_WORK(&kcm->done_work, kcm_done_work);
+		schedule_work(&kcm->done_work);
+		return;
+	}
+
+	if (unlikely(kcm->rx_disabled)) {
+		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+	} else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
+		/* Check for degenerative race with rx_wait that all
+		 * data was dequeued (accounted for in kcm_rfree).
+		 */
+		kcm_rcv_ready(kcm);
+	}
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Macro to invoke filter function. */
+#define KCM_RUN_FILTER(prog, ctx) \
+	(*prog->bpf_func)(ctx, prog->insnsi)
+
+/* Lower socket lock held */
+static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+			unsigned int orig_offset, size_t orig_len)
+{
+	struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
+	struct kcm_rx_msg *rxm;
+	struct kcm_sock *kcm;
+	struct sk_buff *head, *skb;
+	size_t eaten = 0, cand_len;
+	ssize_t extra;
+	int err;
+	bool cloned_orig = false;
+
+	if (psock->ready_rx_msg)
+		return 0;
+
+	head = psock->rx_skb_head;
+	if (head) {
+		/* Message already in progress */
+
+		if (unlikely(orig_offset)) {
+			/* Getting data with a non-zero offset when a message is
+			 * in progress is not expected. If it does happen, we
+			 * need to clone and pull since we can't deal with
+			 * offsets in the skbs for a message expect in the head.
+			 */
+			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+			if (!orig_skb) {
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			if (!pskb_pull(orig_skb, orig_offset)) {
+				kfree_skb(orig_skb);
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			cloned_orig = true;
+			orig_offset = 0;
+		}
+
+		if (!psock->rx_skb_nextp) {
+			/* We are going to append to the frags_list of head.
+			 * Need to unshare the frag_list.
+			 */
+			err = skb_unclone(head, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				return 0;
+			}
+
+			if (unlikely(skb_shinfo(head)->frag_list)) {
+				/* We can't append to an sk_buff that already
+				 * has a frag_list. We create a new head, point
+				 * the frag_list of that to the old head, and
+				 * then are able to use the old head->next for
+				 * appending to the message.
+				 */
+				if (WARN_ON(head->next)) {
+					desc->error = -EINVAL;
+					return 0;
+				}
+
+				skb = alloc_skb(0, GFP_ATOMIC);
+				if (!skb) {
+					desc->error = -ENOMEM;
+					return 0;
+				}
+				skb->len = head->len;
+				skb->data_len = head->len;
+				skb->truesize = head->truesize;
+				*kcm_rx_msg(skb) = *kcm_rx_msg(head);
+				psock->rx_skb_nextp = &head->next;
+				skb_shinfo(skb)->frag_list = head;
+				psock->rx_skb_head = skb;
+				head = skb;
+			} else {
+				psock->rx_skb_nextp =
+				    &skb_shinfo(head)->frag_list;
+			}
+		}
+	}
+
+	while (eaten < orig_len) {
+		/* Always clone since we will consume something */
+		skb = skb_clone(orig_skb, GFP_ATOMIC);
+		if (!skb) {
+			desc->error = -ENOMEM;
+			break;
+		}
+
+		cand_len = orig_len - eaten;
+
+		head = psock->rx_skb_head;
+		if (!head) {
+			head = skb;
+			psock->rx_skb_head = head;
+			/* Will set rx_skb_nextp on next packet if needed */
+			psock->rx_skb_nextp = NULL;
+			rxm = kcm_rx_msg(head);
+			memset(rxm, 0, sizeof(*rxm));
+			rxm->offset = orig_offset + eaten;
+		} else {
+			/* Unclone since we may be appending to an skb that we
+			 * already share a frag_list with.
+			 */
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				break;
+			}
+
+			rxm = kcm_rx_msg(head);
+			*psock->rx_skb_nextp = skb;
+			psock->rx_skb_nextp = &skb->next;
+			head->data_len += skb->len;
+			head->len += skb->len;
+			head->truesize += skb->truesize;
+		}
+
+		if (!rxm->full_len) {
+			ssize_t len;
+
+			len = KCM_RUN_FILTER(psock->bpf_prog, head);
+
+			if (!len) {
+				/* Need more header to determine length */
+				rxm->accum_len += cand_len;
+				eaten += cand_len;
+				WARN_ON(eaten != orig_len);
+				break;
+			} else if (len <= (ssize_t)head->len -
+					  skb->len - rxm->offset) {
+				/* Length must be into new skb (and also
+				 * greater than zero)
+				 */
+				desc->error = -EPROTO;
+				psock->rx_skb_head = NULL;
+				kcm_abort_rx_psock(psock, EPROTO, head);
+				break;
+			}
+
+			rxm->full_len = len;
+		}
+
+		extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
+
+		if (extra < 0) {
+			/* Message not complete yet. */
+			rxm->accum_len += cand_len;
+			eaten += cand_len;
+			WARN_ON(eaten != orig_len);
+			break;
+		}
+
+		/* Positive extra indicates ore bytes than needed for the
+		 * message
+		 */
+
+		WARN_ON(extra > cand_len);
+
+		eaten += (cand_len - extra);
+
+		/* Hurray, we have a new message! */
+		psock->rx_skb_head = NULL;
+
+try_queue:
+		kcm = reserve_rx_kcm(psock, head);
+		if (!kcm) {
+			/* Unable to reserve a KCM, message is held in psock. */
+			break;
+		}
+
+		if (kcm_queue_rcv_skb(&kcm->sk, head)) {
+			/* Should mean socket buffer full */
+			unreserve_rx_kcm(psock, false);
+			goto try_queue;
+		}
+	}
+
+	if (cloned_orig)
+		kfree_skb(orig_skb);
+
+	return eaten;
+}
+
+/* Called with lock held on lower socket */
+static int psock_tcp_read_sock(struct kcm_psock *psock)
+{
+	read_descriptor_t desc;
+
+	desc.arg.data = psock;
+	desc.error = 0;
+	desc.count = 1; /* give more than one skb per call */
+
+	/* sk should be locked here, so okay to do tcp_read_sock */
+	tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
+
+	unreserve_rx_kcm(psock, true);
+
+	return desc.error;
+}
+
+/* Lower sock lock held */
+static void psock_tcp_data_ready(struct sock *sk)
+{
+	struct kcm_psock *psock;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (unlikely(!psock || psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void do_psock_rx_work(struct kcm_psock *psock)
+{
+	read_descriptor_t rd_desc;
+	struct sock *csk = psock->sk;
+
+	/* We need the read lock to synchronize with psock_tcp_data_ready. We
+	 * need the socket lock for calling tcp_read_sock.
+	 */
+	lock_sock(csk);
+	read_lock_bh(&csk->sk_callback_lock);
+
+	if (unlikely(csk->sk_user_data != psock))
+		goto out;
+
+	if (unlikely(psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	rd_desc.arg.data = psock;
+
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&csk->sk_callback_lock);
+	release_sock(csk);
+}
+
+static void psock_rx_work(struct work_struct *w)
+{
+	do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
+}
+
+static void psock_rx_delayed_work(struct work_struct *w)
+{
+	do_psock_rx_work(container_of(w, struct kcm_psock,
+				      rx_delayed_work.work));
+}
+
+static void psock_tcp_state_change(struct sock *sk)
+{
+	/* TCP only does a POLLIN for a half close. Do a POLLHUP here
+	 * since application will normally not poll with POLLIN
+	 * on the TCP sockets.
+	 */
+
+	report_csk_error(sk, EPIPE);
+}
+
+static void psock_tcp_write_space(struct sock *sk)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux;
+	struct kcm_sock *kcm;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (unlikely(!psock))
+		goto out;
+
+	mux = psock->mux;
+
+	spin_lock_bh(&mux->lock);
+
+	/* Check if the socket is reserved so someone is waiting for sending. */
+	kcm = psock->tx_kcm;
+	if (kcm)
+		queue_work(kcm_wq, &kcm->tx_work);
+
+	spin_unlock_bh(&mux->lock);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void unreserve_psock(struct kcm_sock *kcm);
+
+/* kcm sock is locked. */
+static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+
+	psock = kcm->tx_psock;
+
+	smp_rmb(); /* Must read tx_psock before tx_wait */
+
+	if (psock) {
+		WARN_ON(kcm->tx_wait);
+		if (unlikely(psock->tx_stopped))
+			unreserve_psock(kcm);
+		else
+			return kcm->tx_psock;
+	}
+
+	spin_lock_bh(&mux->lock);
+
+	/* Check again under lock to see if psock was reserved for this
+	 * psock via psock_unreserve.
+	 */
+	psock = kcm->tx_psock;
+	if (unlikely(psock)) {
+		WARN_ON(kcm->tx_wait);
+		spin_unlock_bh(&mux->lock);
+		return kcm->tx_psock;
+	}
+
+	if (!list_empty(&mux->psocks_avail)) {
+		psock = list_first_entry(&mux->psocks_avail,
+					 struct kcm_psock,
+					 psock_avail_list);
+		list_del(&psock->psock_avail_list);
+		if (kcm->tx_wait) {
+			list_del(&kcm->wait_psock_list);
+			kcm->tx_wait = false;
+		}
+		kcm->tx_psock = psock;
+		psock->tx_kcm = kcm;
+	} else if (!kcm->tx_wait) {
+		list_add_tail(&kcm->wait_psock_list,
+			      &mux->kcm_tx_waiters);
+		kcm->tx_wait = true;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	return psock;
+}
+
+/* mux lock held */
+static void psock_now_avail(struct kcm_psock *psock)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	if (list_empty(&mux->kcm_tx_waiters)) {
+		list_add_tail(&psock->psock_avail_list,
+			      &mux->psocks_avail);
+	} else {
+		kcm = list_first_entry(&mux->kcm_tx_waiters,
+				       struct kcm_sock,
+				       wait_psock_list);
+		list_del(&kcm->wait_psock_list);
+		kcm->tx_wait = false;
+		psock->tx_kcm = kcm;
+
+		/* Commit before changing tx_psock since that is read in
+		 * reserve_psock before queuing work.
+		 */
+		smp_mb();
+
+		kcm->tx_psock = psock;
+		queue_work(kcm_wq, &kcm->tx_work);
+	}
+}
+
+/* kcm sock is locked. */
+static void unreserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux = kcm->mux;
+
+	spin_lock_bh(&mux->lock);
+
+	psock = kcm->tx_psock;
+
+	if (WARN_ON(!psock)) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	smp_rmb(); /* Read tx_psock before tx_wait */
+
+	WARN_ON(kcm->tx_wait);
+
+	kcm->tx_psock = NULL;
+	psock->tx_kcm = NULL;
+
+	if (unlikely(psock->tx_stopped)) {
+		if (psock->done) {
+			/* Deferred free */
+			list_del(&psock->psock_list);
+			mux->psocks_cnt--;
+			sock_put(psock->sk);
+			fput(psock->sk->sk_socket->file);
+			kmem_cache_free(kcm_psockp, psock);
+		}
+
+		/* Don't put back on available list */
+
+		spin_unlock_bh(&mux->lock);
+
+		return;
+	}
+
+	psock_now_avail(psock);
+
+	spin_unlock_bh(&mux->lock);
+}
+
+/* Write any messages ready on the kcm socket.  Called with kcm sock lock
+ * held.  Return bytes actually sent or error.
+ */
+static int kcm_write_msgs(struct kcm_sock *kcm)
+{
+	struct sock *sk = &kcm->sk;
+	struct kcm_psock *psock;
+	struct sk_buff *skb, *head;
+	struct kcm_tx_msg *txm;
+	unsigned short fragidx, frag_offset;
+	unsigned int sent, total_sent = 0;
+	int ret = 0;
+
+	kcm->tx_wait_more = false;
+	psock = kcm->tx_psock;
+	if (unlikely(psock && psock->tx_stopped)) {
+		/* A reserved psock was aborted asynchronously. Unreserve
+		 * it and we'll retry the message.
+		 */
+		unreserve_psock(kcm);
+		if (skb_queue_empty(&sk->sk_write_queue))
+			return 0;
+
+		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
+
+	} else if (skb_queue_empty(&sk->sk_write_queue)) {
+		return 0;
+	}
+
+	head = skb_peek(&sk->sk_write_queue);
+	txm = kcm_tx_msg(head);
+
+	if (txm->sent) {
+		/* Send of first skbuff in queue already in progress */
+		if (WARN_ON(!psock)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		sent = txm->sent;
+		frag_offset = txm->frag_offset;
+		fragidx = txm->fragidx;
+		skb = txm->frag_skb;
+
+		goto do_frag;
+	}
+
+try_again:
+	psock = reserve_psock(kcm);
+	if (!psock)
+		goto out;
+
+	do {
+		skb = head;
+		txm = kcm_tx_msg(head);
+		sent = 0;
+
+do_frag_list:
+		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
+		     fragidx++) {
+			skb_frag_t *frag;
+
+			frag_offset = 0;
+do_frag:
+			frag = &skb_shinfo(skb)->frags[fragidx];
+			if (WARN_ON(!frag->size)) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			ret = kernel_sendpage(psock->sk->sk_socket,
+					      frag->page.p,
+					      frag->page_offset + frag_offset,
+					      frag->size - frag_offset,
+					      MSG_DONTWAIT);
+			if (ret <= 0) {
+				if (ret == -EAGAIN) {
+					/* Save state to try again when there's
+					 * write space on the socket
+					 */
+					txm->sent = sent;
+					txm->frag_offset = frag_offset;
+					txm->fragidx = fragidx;
+					txm->frag_skb = skb;
+
+					ret = 0;
+					goto out;
+				}
+
+				/* Hard failure in sending message, abort this
+				 * psock since it has lost framing
+				 * synchonization and retry sending the
+				 * message from the beginning.
+				 */
+				kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
+						   true);
+				unreserve_psock(kcm);
+
+				txm->sent = 0;
+				ret = 0;
+
+				goto try_again;
+			}
+
+			sent += ret;
+			frag_offset += ret;
+			if (frag_offset < frag->size) {
+				/* Not finished with this frag */
+				goto do_frag;
+			}
+		}
+
+		if (skb == head) {
+			if (skb_has_frag_list(skb)) {
+				skb = skb_shinfo(skb)->frag_list;
+				goto do_frag_list;
+			}
+		} else if (skb->next) {
+			skb = skb->next;
+			goto do_frag_list;
+		}
+
+		/* Successfully sent the whole packet, account for it. */
+		skb_dequeue(&sk->sk_write_queue);
+		kfree_skb(head);
+		sk->sk_wmem_queued -= sent;
+		total_sent += sent;
+	} while ((head = skb_peek(&sk->sk_write_queue)));
+out:
+	if (!head) {
+		/* Done with all queued messages. */
+		WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+		unreserve_psock(kcm);
+	}
+
+	/* Check if write space is available */
+	sk->sk_write_space(sk);
+
+	return total_sent ? : ret;
+}
+
+static void kcm_tx_work(struct work_struct *w)
+{
+	struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
+	struct sock *sk = &kcm->sk;
+	int err;
+
+	lock_sock(sk);
+
+	/* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
+	 * aborts
+	 */
+	err = kcm_write_msgs(kcm);
+	if (err < 0) {
+		/* Hard failure in write, report error on KCM socket */
+		pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
+		report_csk_error(&kcm->sk, -err);
+		goto out;
+	}
+
+	/* Primarily for SOCK_SEQPACKET sockets */
+	if (likely(sk->sk_socket) &&
+	    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_space(sk);
+	}
+
+out:
+	release_sock(sk);
+}
+
+static void kcm_push(struct kcm_sock *kcm)
+{
+	if (kcm->tx_wait_more)
+		kcm_write_msgs(kcm);
+}
+
+static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct sk_buff *skb = NULL, *head = NULL;
+	size_t copy, copied = 0;
+	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	int eor = (sock->type == SOCK_DGRAM) ?
+		  !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+	int err = -EPIPE;
+
+	lock_sock(sk);
+
+	/* Per tcp_sendmsg this should be in poll */
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	if (sk->sk_err)
+		goto out_error;
+
+	if (kcm->seq_skb) {
+		/* Previously opened message */
+		head = kcm->seq_skb;
+		skb = kcm_tx_msg(head)->last_skb;
+		goto start;
+	}
+
+	/* Call the sk_stream functions to manage the sndbuf mem. */
+	if (!sk_stream_memory_free(sk)) {
+		kcm_push(kcm);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	/* New message, alloc head skb */
+	head = alloc_skb(0, sk->sk_allocation);
+	while (!head) {
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+
+		head = alloc_skb(0, sk->sk_allocation);
+	}
+
+	skb = head;
+
+	/* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+	 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+	 */
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+start:
+	while (msg_data_left(msg)) {
+		bool merge = true;
+		int i = skb_shinfo(skb)->nr_frags;
+		struct page_frag *pfrag = sk_page_frag(sk);
+
+		if (!sk_page_frag_refill(sk, pfrag))
+			goto wait_for_memory;
+
+		if (!skb_can_coalesce(skb, i, pfrag->page,
+				      pfrag->offset)) {
+			if (i == MAX_SKB_FRAGS) {
+				struct sk_buff *tskb;
+
+				tskb = alloc_skb(0, sk->sk_allocation);
+				if (!tskb)
+					goto wait_for_memory;
+
+				if (head == skb)
+					skb_shinfo(head)->frag_list = tskb;
+				else
+					skb->next = tskb;
+
+				skb = tskb;
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+				continue;
+			}
+			merge = false;
+		}
+
+		copy = min_t(int, msg_data_left(msg),
+			     pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+
+		err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+					       pfrag->page,
+					       pfrag->offset,
+					       copy);
+		if (err)
+			goto out_error;
+
+		/* Update the skb. */
+		if (merge) {
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+		} else {
+			skb_fill_page_desc(skb, i, pfrag->page,
+					   pfrag->offset, copy);
+			get_page(pfrag->page);
+		}
+
+		pfrag->offset += copy;
+		copied += copy;
+		if (head != skb) {
+			head->len += copy;
+			head->data_len += copy;
+		}
+
+		continue;
+
+wait_for_memory:
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	if (eor) {
+		bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+		/* Message complete, queue it on send buffer */
+		__skb_queue_tail(&sk->sk_write_queue, head);
+		kcm->seq_skb = NULL;
+
+		if (msg->msg_flags & MSG_BATCH) {
+			kcm->tx_wait_more = true;
+		} else if (kcm->tx_wait_more || not_busy) {
+			err = kcm_write_msgs(kcm);
+			if (err < 0) {
+				/* We got a hard error in write_msgs but have
+				 * already queued this message. Report an error
+				 * in the socket, but don't affect return value
+				 * from sendmsg
+				 */
+				pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+				report_csk_error(&kcm->sk, -err);
+			}
+		}
+	} else {
+		/* Message not complete, save state */
+partial_message:
+		kcm->seq_skb = head;
+		kcm_tx_msg(head)->last_skb = skb;
+	}
+
+	release_sock(sk);
+	return copied;
+
+out_error:
+	kcm_push(kcm);
+
+	if (copied && sock->type == SOCK_SEQPACKET) {
+		/* Wrote some bytes before encountering an
+		 * error, return partial success.
+		 */
+		goto partial_message;
+	}
+
+	if (head != kcm->seq_skb)
+		kfree_skb(head);
+
+	err = sk_stream_error(sk, msg->msg_flags, err);
+
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+		sk->sk_write_space(sk);
+
+	release_sock(sk);
+	return err;
+}
+
+static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
+				     long timeo, int *err)
+{
+	struct sk_buff *skb;
+
+	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			return NULL;
+		}
+
+		if (sock_flag(sk, SOCK_DONE))
+			return NULL;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			return NULL;
+		}
+
+		sk_wait_data(sk, &timeo, NULL);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			return NULL;
+		}
+	}
+
+	return skb;
+}
+
+static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
+		       size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+	long timeo;
+	struct kcm_rx_msg *rxm;
+	int copied = 0;
+	struct sk_buff *skb;
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+
+	skb = kcm_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto out;
+
+	/* Okay, have a message on the receive queue */
+
+	rxm = kcm_rx_msg(skb);
+
+	if (len > rxm->full_len)
+		len = rxm->full_len;
+
+	err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+	if (err < 0)
+		goto out;
+
+	copied = len;
+	if (likely(!(flags & MSG_PEEK))) {
+		if (copied < rxm->full_len) {
+			if (sock->type == SOCK_DGRAM) {
+				/* Truncated message */
+				msg->msg_flags |= MSG_TRUNC;
+				goto msg_finished;
+			}
+			rxm->offset += copied;
+			rxm->full_len -= copied;
+		} else {
+msg_finished:
+			/* Finished with message */
+			msg->msg_flags |= MSG_EOR;
+			skb_unlink(skb, &sk->sk_receive_queue);
+			kfree_skb(skb);
+		}
+	}
+
+out:
+	release_sock(sk);
+
+	return copied ? : err;
+}
+
+/* kcm sock lock held */
+static void kcm_recv_disable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	if (kcm->rx_disabled)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 1;
+
+	/* If a psock is reserved we'll do cleanup in unreserve */
+	if (!kcm->rx_psock) {
+		if (kcm->rx_wait) {
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+		}
+
+		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+	}
+
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* kcm sock lock held */
+static void kcm_recv_enable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	if (!kcm->rx_disabled)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 0;
+	kcm_rcv_ready(kcm);
+
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+static int kcm_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, valbool;
+	int err = 0;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EINVAL;
+
+	valbool = val ? 1 : 0;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		lock_sock(&kcm->sk);
+		if (valbool)
+			kcm_recv_disable(kcm);
+		else
+			kcm_recv_enable(kcm);
+		release_sock(&kcm->sk);
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+
+	return err;
+}
+
+static int kcm_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, len;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		val = kcm->rx_disabled;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+{
+	struct kcm_sock *tkcm;
+	struct list_head *head;
+	int index = 0;
+
+	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+	 * we set sk_state, otherwise epoll_wait always returns right away with
+	 * POLLHUP
+	 */
+	kcm->sk.sk_state = TCP_ESTABLISHED;
+
+	/* Add to mux's kcm sockets list */
+	kcm->mux = mux;
+	spin_lock_bh(&mux->lock);
+
+	head = &mux->kcm_socks;
+	list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
+		if (tkcm->index != index)
+			break;
+		head = &tkcm->kcm_sock_list;
+		index++;
+	}
+
+	list_add(&kcm->kcm_sock_list, head);
+	kcm->index = index;
+
+	mux->kcm_socks_cnt++;
+	spin_unlock_bh(&mux->lock);
+
+	INIT_WORK(&kcm->tx_work, kcm_tx_work);
+
+	spin_lock_bh(&mux->rx_lock);
+	kcm_rcv_ready(kcm);
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+static int kcm_attach(struct socket *sock, struct socket *csock,
+		      struct bpf_prog *prog)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct sock *csk;
+	struct kcm_psock *psock = NULL, *tpsock;
+	struct list_head *head;
+	int index = 0;
+
+	if (csock->ops->family != PF_INET &&
+	    csock->ops->family != PF_INET6)
+		return -EINVAL;
+
+	csk = csock->sk;
+	if (!csk)
+		return -EINVAL;
+
+	/* Only support TCP for now */
+	if (csk->sk_protocol != IPPROTO_TCP)
+		return -EINVAL;
+
+	psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
+	if (!psock)
+		return -ENOMEM;
+
+	psock->mux = mux;
+	psock->sk = csk;
+	psock->bpf_prog = prog;
+	INIT_WORK(&psock->rx_work, psock_rx_work);
+	INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+
+	sock_hold(csk);
+
+	write_lock_bh(&csk->sk_callback_lock);
+	psock->save_data_ready = csk->sk_data_ready;
+	psock->save_write_space = csk->sk_write_space;
+	psock->save_state_change = csk->sk_state_change;
+	csk->sk_user_data = psock;
+	csk->sk_data_ready = psock_tcp_data_ready;
+	csk->sk_write_space = psock_tcp_write_space;
+	csk->sk_state_change = psock_tcp_state_change;
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	/* Finished initialization, now add the psock to the MUX. */
+	spin_lock_bh(&mux->lock);
+	head = &mux->psocks;
+	list_for_each_entry(tpsock, &mux->psocks, psock_list) {
+		if (tpsock->index != index)
+			break;
+		head = &tpsock->psock_list;
+		index++;
+	}
+
+	list_add(&psock->psock_list, head);
+	psock->index = index;
+
+	mux->psocks_cnt++;
+	psock_now_avail(psock);
+	spin_unlock_bh(&mux->lock);
+
+	/* Schedule RX work in case there are already bytes queued */
+	queue_work(kcm_wq, &psock->rx_work);
+
+	return 0;
+}
+
+static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
+{
+	struct socket *csock;
+	struct bpf_prog *prog;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	prog = bpf_prog_get(info->bpf_fd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto out;
+	}
+
+	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		bpf_prog_put(prog);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = kcm_attach(sock, csock, prog);
+	if (err) {
+		bpf_prog_put(prog);
+		goto out;
+	}
+
+	/* Keep reference on file also */
+
+	return 0;
+out:
+	fput(csock->file);
+	return err;
+}
+
+static void kcm_unattach(struct kcm_psock *psock)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Stop getting callbacks from TCP socket. After this there should
+	 * be no way to reserve a kcm for this psock.
+	 */
+	write_lock_bh(&csk->sk_callback_lock);
+	csk->sk_user_data = NULL;
+	csk->sk_data_ready = psock->save_data_ready;
+	csk->sk_write_space = psock->save_write_space;
+	csk->sk_state_change = psock->save_state_change;
+	psock->rx_stopped = 1;
+
+	if (WARN_ON(psock->rx_kcm)) {
+		write_unlock_bh(&csk->sk_callback_lock);
+		return;
+	}
+
+	spin_lock_bh(&mux->rx_lock);
+
+	/* Stop receiver activities. After this point psock should not be
+	 * able to get onto ready list either through callbacks or work.
+	 */
+	if (psock->ready_rx_msg) {
+		list_del(&psock->psock_ready_list);
+		kfree_skb(psock->ready_rx_msg);
+		psock->ready_rx_msg = NULL;
+	}
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	cancel_work_sync(&psock->rx_work);
+	cancel_delayed_work_sync(&psock->rx_delayed_work);
+
+	bpf_prog_put(psock->bpf_prog);
+
+	kfree_skb(psock->rx_skb_head);
+	psock->rx_skb_head = NULL;
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_kcm) {
+		/* psock was reserved.  Just mark it finished and we will clean
+		 * up in the kcm paths, we need kcm lock which can not be
+		 * acquired here.
+		 */
+		spin_unlock_bh(&mux->lock);
+
+		/* We are unattaching a socket that is reserved. Abort the
+		 * socket since we may be out of sync in sending on it. We need
+		 * to do this without the mux lock.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+
+		spin_lock_bh(&mux->lock);
+		if (!psock->tx_kcm) {
+			/* psock now unreserved in window mux was unlocked */
+			goto no_reserved;
+		}
+		psock->done = 1;
+
+		/* Commit done before queuing work to process it */
+		smp_mb();
+
+		/* Queue tx work to make sure psock->done is handled */
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+		spin_unlock_bh(&mux->lock);
+	} else {
+no_reserved:
+		if (!psock->tx_stopped)
+			list_del(&psock->psock_avail_list);
+		list_del(&psock->psock_list);
+		mux->psocks_cnt--;
+		spin_unlock_bh(&mux->lock);
+
+		sock_put(csk);
+		fput(csk->sk_socket->file);
+		kmem_cache_free(kcm_psockp, psock);
+	}
+}
+
+static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+	struct socket *csock;
+	struct sock *csk;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	csk = csock->sk;
+	if (!csk) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = -ENOENT;
+
+	spin_lock_bh(&mux->lock);
+
+	list_for_each_entry(psock, &mux->psocks, psock_list) {
+		if (psock->sk != csk)
+			continue;
+
+		/* Found the matching psock */
+
+		if (psock->unattaching || WARN_ON(psock->done)) {
+			err = -EALREADY;
+			break;
+		}
+
+		psock->unattaching = 1;
+
+		spin_unlock_bh(&mux->lock);
+
+		kcm_unattach(psock);
+
+		err = 0;
+		goto out;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+out:
+	fput(csock->file);
+	return err;
+}
+
+static struct proto kcm_proto = {
+	.name	= "KCM",
+	.owner	= THIS_MODULE,
+	.obj_size = sizeof(struct kcm_sock),
+};
+
+/* Clone a kcm socket. */
+static int kcm_clone(struct socket *osock, struct kcm_clone *info,
+		     struct socket **newsockp)
+{
+	struct socket *newsock;
+	struct sock *newsk;
+	struct file *newfile;
+	int err, newfd;
+
+	err = -ENFILE;
+	newsock = sock_alloc();
+	if (!newsock)
+		goto out;
+
+	newsock->type = osock->type;
+	newsock->ops = osock->ops;
+
+	__module_get(newsock->ops->owner);
+
+	newfd = get_unused_fd_flags(0);
+	if (unlikely(newfd < 0)) {
+		err = newfd;
+		goto out_fd_fail;
+	}
+
+	newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+	if (unlikely(IS_ERR(newfile))) {
+		err = PTR_ERR(newfile);
+		goto out_sock_alloc_fail;
+	}
+
+	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
+			 &kcm_proto, true);
+	if (!newsk) {
+		err = -ENOMEM;
+		goto out_sk_alloc_fail;
+	}
+
+	sock_init_data(newsock, newsk);
+	init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
+
+	fd_install(newfd, newfile);
+	*newsockp = newsock;
+	info->fd = newfd;
+
+	return 0;
+
+out_sk_alloc_fail:
+	fput(newfile);
+out_sock_alloc_fail:
+	put_unused_fd(newfd);
+out_fd_fail:
+	sock_release(newsock);
+out:
+	return err;
+}
+
+static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case SIOCKCMATTACH: {
+		struct kcm_attach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_attach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMUNATTACH: {
+		struct kcm_unattach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_unattach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMCLONE: {
+		struct kcm_clone info;
+		struct socket *newsock = NULL;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_clone(sock, &info, &newsock);
+
+		if (!err) {
+			if (copy_to_user((void __user *)arg, &info,
+					 sizeof(info))) {
+				err = -EFAULT;
+				sock_release(newsock);
+			}
+		}
+
+		break;
+	}
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+
+	return err;
+}
+
+static void free_mux(struct rcu_head *rcu)
+{
+	struct kcm_mux *mux = container_of(rcu,
+	    struct kcm_mux, rcu);
+
+	kmem_cache_free(kcm_muxp, mux);
+}
+
+static void release_mux(struct kcm_mux *mux)
+{
+	struct kcm_net *knet = mux->knet;
+	struct kcm_psock *psock, *tmp_psock;
+
+	/* Release psocks */
+	list_for_each_entry_safe(psock, tmp_psock,
+				 &mux->psocks, psock_list) {
+		if (!WARN_ON(psock->unattaching))
+			kcm_unattach(psock);
+	}
+
+	if (WARN_ON(mux->psocks_cnt))
+		return;
+
+	__skb_queue_purge(&mux->rx_hold_queue);
+
+	mutex_lock(&knet->mutex);
+	list_del_rcu(&mux->kcm_mux_list);
+	knet->count--;
+	mutex_unlock(&knet->mutex);
+
+	call_rcu(&mux->rcu, free_mux);
+}
+
+static void kcm_done(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct sock *sk = &kcm->sk;
+	int socks_cnt;
+
+	spin_lock_bh(&mux->rx_lock);
+	if (kcm->rx_psock) {
+		/* Cleanup in unreserve_rx_kcm */
+		WARN_ON(kcm->done);
+		kcm->rx_disabled = 1;
+		kcm->done = 1;
+		spin_unlock_bh(&mux->rx_lock);
+		return;
+	}
+
+	if (kcm->rx_wait) {
+		list_del(&kcm->wait_rx_list);
+		kcm->rx_wait = false;
+	}
+	/* Move any pending receive messages to other kcm sockets */
+	requeue_rx_msgs(mux, &sk->sk_receive_queue);
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	if (WARN_ON(sk_rmem_alloc_get(sk)))
+		return;
+
+	/* Detach from MUX */
+	spin_lock_bh(&mux->lock);
+
+	list_del(&kcm->kcm_sock_list);
+	mux->kcm_socks_cnt--;
+	socks_cnt = mux->kcm_socks_cnt;
+
+	spin_unlock_bh(&mux->lock);
+
+	if (!socks_cnt) {
+		/* We are done with the mux now. */
+		release_mux(mux);
+	}
+
+	WARN_ON(kcm->rx_wait);
+
+	sock_put(&kcm->sk);
+}
+
+/* Called by kcm_release to close a KCM socket.
+ * If this is the last KCM socket on the MUX, destroy the MUX.
+ */
+static int kcm_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm;
+	struct kcm_mux *mux;
+	struct kcm_psock *psock;
+
+	if (!sk)
+		return 0;
+
+	kcm = kcm_sk(sk);
+	mux = kcm->mux;
+
+	sock_orphan(sk);
+	kfree_skb(kcm->seq_skb);
+
+	lock_sock(sk);
+	/* Purge queue under lock to avoid race condition with tx_work trying
+	 * to act when queue is nonempty. If tx_work runs after this point
+	 * it will just return.
+	 */
+	__skb_queue_purge(&sk->sk_write_queue);
+	release_sock(sk);
+
+	spin_lock_bh(&mux->lock);
+	if (kcm->tx_wait) {
+		/* Take of tx_wait list, after this point there should be no way
+		 * that a psock will be assigned to this kcm.
+		 */
+		list_del(&kcm->wait_psock_list);
+		kcm->tx_wait = false;
+	}
+	spin_unlock_bh(&mux->lock);
+
+	/* Cancel work. After this point there should be no outside references
+	 * to the kcm socket.
+	 */
+	cancel_work_sync(&kcm->tx_work);
+
+	lock_sock(sk);
+	psock = kcm->tx_psock;
+	if (psock) {
+		/* A psock was reserved, so we need to kill it since it
+		 * may already have some bytes queued from a message. We
+		 * need to do this after removing kcm from tx_wait list.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+		unreserve_psock(kcm);
+	}
+	release_sock(sk);
+
+	WARN_ON(kcm->tx_wait);
+	WARN_ON(kcm->tx_psock);
+
+	sock->sk = NULL;
+
+	kcm_done(kcm);
+
+	return 0;
+}
+
+static const struct proto_ops kcm_ops = {
+	.family =	PF_KCM,
+	.owner =	THIS_MODULE,
+	.release =	kcm_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		datagram_poll,
+	.ioctl =	kcm_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	kcm_setsockopt,
+	.getsockopt =	kcm_getsockopt,
+	.sendmsg =	kcm_sendmsg,
+	.recvmsg =	kcm_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/* Create proto operation for kcm sockets */
+static int kcm_create(struct net *net, struct socket *sock,
+		      int protocol, int kern)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+	struct sock *sk;
+	struct kcm_mux *mux;
+
+	switch (sock->type) {
+	case SOCK_DGRAM:
+	case SOCK_SEQPACKET:
+		sock->ops = &kcm_ops;
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	if (protocol != KCMPROTO_CONNECTED)
+		return -EPROTONOSUPPORT;
+
+	sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	/* Allocate a kcm mux, shared between KCM sockets */
+	mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
+	if (!mux) {
+		sk_free(sk);
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&mux->lock);
+	spin_lock_init(&mux->rx_lock);
+	INIT_LIST_HEAD(&mux->kcm_socks);
+	INIT_LIST_HEAD(&mux->kcm_rx_waiters);
+	INIT_LIST_HEAD(&mux->kcm_tx_waiters);
+
+	INIT_LIST_HEAD(&mux->psocks);
+	INIT_LIST_HEAD(&mux->psocks_ready);
+	INIT_LIST_HEAD(&mux->psocks_avail);
+
+	mux->knet = knet;
+
+	/* Add new MUX to list */
+	mutex_lock(&knet->mutex);
+	list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
+	knet->count++;
+	mutex_unlock(&knet->mutex);
+
+	skb_queue_head_init(&mux->rx_hold_queue);
+
+	/* Init KCM socket */
+	sock_init_data(sock, sk);
+	init_kcm_sock(kcm_sk(sk), mux);
+
+	return 0;
+}
+
+static struct net_proto_family kcm_family_ops = {
+	.family = PF_KCM,
+	.create = kcm_create,
+	.owner  = THIS_MODULE,
+};
+
+static __net_init int kcm_init_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	INIT_LIST_HEAD_RCU(&knet->mux_list);
+	mutex_init(&knet->mutex);
+
+	return 0;
+}
+
+static __net_exit void kcm_exit_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	/* All KCM sockets should be closed at this point, which should mean
+	 * that all multiplexors and psocks have been destroyed.
+	 */
+	WARN_ON(!list_empty(&knet->mux_list));
+}
+
+static struct pernet_operations kcm_net_ops = {
+	.init = kcm_init_net,
+	.exit = kcm_exit_net,
+	.id   = &kcm_net_id,
+	.size = sizeof(struct kcm_net),
+};
+
+static int __init kcm_init(void)
+{
+	int err = -ENOMEM;
+
+	kcm_muxp = kmem_cache_create("kcm_mux_cache",
+				     sizeof(struct kcm_mux), 0,
+				     SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_muxp)
+		goto fail;
+
+	kcm_psockp = kmem_cache_create("kcm_psock_cache",
+				       sizeof(struct kcm_psock), 0,
+					SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_psockp)
+		goto fail;
+
+	kcm_wq = create_singlethread_workqueue("kkcmd");
+	if (!kcm_wq)
+		goto fail;
+
+	err = proto_register(&kcm_proto, 1);
+	if (err)
+		goto fail;
+
+	err = sock_register(&kcm_family_ops);
+	if (err)
+		goto sock_register_fail;
+
+	err = register_pernet_device(&kcm_net_ops);
+	if (err)
+		goto net_ops_fail;
+
+	return 0;
+
+net_ops_fail:
+	sock_unregister(PF_KCM);
+
+sock_register_fail:
+	proto_unregister(&kcm_proto);
+
+fail:
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+
+	if (kcm_wq)
+		destroy_workqueue(kcm_wq);
+
+	return err;
+}
+
+static void __exit kcm_exit(void)
+{
+	unregister_pernet_device(&kcm_net_ops);
+	sock_unregister(PF_KCM);
+	proto_unregister(&kcm_proto);
+	destroy_workqueue(kcm_wq);
+
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+}
+
+module_init(kcm_init);
+module_exit(kcm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KCM);
+
-- 
cgit v1.2.3


From 5b33f48842fa1e13e9c0ea8cc59c1d0df19042db Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:29 +0200
Subject: net/flower: Introduce hardware offload support

This patch is based on a patch made by John Fastabend.
It adds support for offloading cls_flower.
when NETIF_F_HW_TC is on:
  flags = 0       => Rule will be processed twice - by hardware, and if
                     still relevant, by software.
  flags = SKIP_HW => Rull will be processed by software only

If hardware fail/not capabale to apply the rule, operation will NOT
fail. Filter will be processed by SW only.

Acked-by: Jiri Pirko <jiri@mellanox.com>
Suggested-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  2 ++
 include/net/pkt_cls.h        | 14 ++++++++++
 include/uapi/linux/pkt_cls.h |  2 ++
 net/sched/cls_flower.c       | 64 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 81 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fd30cb545c45..41df0b450757 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -786,6 +786,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 enum {
 	TC_SETUP_MQPRIO,
 	TC_SETUP_CLSU32,
+	TC_SETUP_CLSFLOWER,
 };
 
 struct tc_cls_u32_offload;
@@ -795,6 +796,7 @@ struct tc_to_netdev {
 	union {
 		u8 tc;
 		struct tc_cls_u32_offload *cls_u32;
+		struct tc_cls_flower_offload *cls_flower;
 	};
 };
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index bea14eee373e..5b4e8f08b8f0 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -409,4 +409,18 @@ static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 	return true;
 }
 
+enum tc_fl_command {
+	TC_CLSFLOWER_REPLACE,
+	TC_CLSFLOWER_DESTROY,
+};
+
+struct tc_cls_flower_offload {
+	enum tc_fl_command command;
+	u64 cookie;
+	struct flow_dissector *dissector;
+	struct fl_flow_key *mask;
+	struct fl_flow_key *key;
+	struct tcf_exts *exts;
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 9874f5680926..c43c5f78b9c4 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -417,6 +417,8 @@ enum {
 	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
 	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
+
+	TCA_FLOWER_FLAGS,
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 95b021243233..25d87666bf1e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head)
 	kfree(f);
 }
 
+static void fl_hw_destroy_filter(struct tcf_proto *tp, u64 cookie)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, 0))
+		return;
+
+	offload.command = TC_CLSFLOWER_DESTROY;
+	offload.cookie = cookie;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
+static void fl_hw_replace_filter(struct tcf_proto *tp,
+				 struct flow_dissector *dissector,
+				 struct fl_flow_key *mask,
+				 struct fl_flow_key *key,
+				 struct tcf_exts *actions,
+				 u64 cookie, u32 flags)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, flags))
+		return;
+
+	offload.command = TC_CLSFLOWER_REPLACE;
+	offload.cookie = cookie;
+	offload.dissector = dissector;
+	offload.mask = mask;
+	offload.key = key;
+	offload.exts = actions;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
 static bool fl_destroy(struct tcf_proto *tp, bool force)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
 		return false;
 
 	list_for_each_entry_safe(f, next, &head->filters, list) {
+		fl_hw_destroy_filter(tp, (u64)f);
 		list_del_rcu(&f->list);
 		call_rcu(&f->rcu, fl_destroy_filter);
 	}
@@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_fl_filter *fnew;
 	struct nlattr *tb[TCA_FLOWER_MAX + 1];
 	struct fl_flow_mask mask = {};
+	u32 flags = 0;
 	int err;
 
 	if (!tca[TCA_OPTIONS])
@@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	}
 	fnew->handle = handle;
 
+	if (tb[TCA_FLOWER_FLAGS])
+		flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
 	err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
 	if (err)
 		goto errout;
@@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 				     head->ht_params);
 	if (err)
 		goto errout;
-	if (fold)
+
+	fl_hw_replace_filter(tp,
+			     &head->dissector,
+			     &mask.key,
+			     &fnew->key,
+			     &fnew->exts,
+			     (u64)fnew,
+			     flags);
+
+	if (fold) {
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
+		fl_hw_destroy_filter(tp, (u64)fold);
+	}
 
 	*arg = (unsigned long) fnew;
 
@@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
 	rhashtable_remove_fast(&head->ht, &f->ht_node,
 			       head->ht_params);
 	list_del_rcu(&f->list);
+	fl_hw_destroy_filter(tp, (u64)f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 	return 0;
-- 
cgit v1.2.3


From e7f70af111f086a20800ad2e17f544b2e3e0f375 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:03 +0100
Subject: vxlan: support setting IPv6 flow label

This work adds support for setting the IPv6 flow label for vxlan per
device and through collect metadata (ip_tunnel_key) frontends. The
vxlan dst cache does not need any special considerations here, for
the cases where caches can be used, the label is static per cache.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 26 +++++++++++++++++++++-----
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  1 +
 3 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 8bdcd5ea8424..8eda76f9e474 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1790,6 +1790,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct sk_buff *skb, int oif, u8 tos,
+					  __be32 label,
 					  const struct in6_addr *daddr,
 					  struct in6_addr *saddr,
 					  struct dst_cache *dst_cache,
@@ -1813,6 +1814,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 	fl6.flowi6_tos = RT_TOS(tos);
 	fl6.daddr = *daddr;
 	fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
+	fl6.flowlabel = label;
 	fl6.flowi6_mark = skb->mark;
 	fl6.flowi6_proto = IPPROTO_UDP;
 
@@ -1888,7 +1890,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
 	__be16 src_port = 0, dst_port;
-	__be32 vni;
+	__be32 vni, label;
 	__be16 df = 0;
 	__u8 tos, ttl;
 	int err;
@@ -1939,12 +1941,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	if (tos == 1)
 		tos = ip_tunnel_get_dsfield(old_iph, skb);
 
+	label = vxlan->cfg.label;
 	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
 				     vxlan->cfg.port_max, true);
 
 	if (info) {
 		ttl = info->key.ttl;
 		tos = info->key.tos;
+		label = info->key.label;
 		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
 
 		if (info->options_len)
@@ -2020,7 +2024,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0, tos,
-					&dst->sin6.sin6_addr, &saddr,
+					label, &dst->sin6.sin6_addr, &saddr,
 					dst_cache, info);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
@@ -2066,8 +2070,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			return;
 		}
 		udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
-				     &saddr, &dst->sin6.sin6_addr, tos, ttl, 0,
-				     src_port, dst_port, !udp_sum);
+				     &saddr, &dst->sin6.sin6_addr, tos, ttl,
+				     label, src_port, dst_port, !udp_sum);
 #endif
 	}
 
@@ -2390,7 +2394,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 		if (!vxlan->vn6_sock)
 			return -EINVAL;
 		ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos,
-					&info->key.u.ipv6.dst,
+					info->key.label, &info->key.u.ipv6.dst,
 					&info->key.u.ipv6.src, NULL, info);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
@@ -2505,6 +2509,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
@@ -2739,6 +2744,11 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
 		vxlan->flags |= VXLAN_F_IPV6;
 	}
 
+	if (conf->label && !use_ipv6) {
+		pr_info("label only supported in use with IPv6\n");
+		return -EINVAL;
+	}
+
 	if (conf->remote_ifindex) {
 		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
 		dst->remote_ifindex = conf->remote_ifindex;
@@ -2887,6 +2897,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_TTL])
 		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
 
+	if (data[IFLA_VXLAN_LABEL])
+		conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
+			     IPV6_FLOWLABEL_MASK;
+
 	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
 		conf.flags |= VXLAN_F_LEARN;
 
@@ -2990,6 +3004,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
@@ -3053,6 +3068,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->flags & VXLAN_F_LEARN)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6eda4ed4d78b..a763c96ecde4 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -162,6 +162,7 @@ struct vxlan_config {
 	u16			port_max;
 	u8			tos;
 	u8			ttl;
+	__be32			label;
 	u32			flags;
 	unsigned long		age_interval;
 	unsigned int		addrmax;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d452cea59020..6bebc975031d 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -456,6 +456,7 @@ enum {
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
 	IFLA_VXLAN_COLLECT_METADATA,
+	IFLA_VXLAN_LABEL,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From 8eb3b99554b82da968d1fbc00df9f3156c5e2d63 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:04 +0100
Subject: geneve: support setting IPv6 flow label

This work adds support for setting the IPv6 flow label for geneve per
device and through collect metadata (ip_tunnel_key) frontends. Also here,
the geneve dst cache does not need any special considerations, for the
cases where caches can be used, the label is static per cache.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c         | 35 +++++++++++++++++++++++++++--------
 include/uapi/linux/if_link.h |  1 +
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 89ccff79d76c..33185b9a435e 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -68,6 +68,7 @@ struct geneve_dev {
 	u8                 tos;		/* TOS override */
 	union geneve_addr  remote;	/* IP address for link partner */
 	struct list_head   next;	/* geneve's per namespace list */
+	__be32		   label;	/* IPv6 flowlabel override */
 	__be16		   dst_port;
 	bool		   collect_md;
 	struct gro_cells   gro_cells;
@@ -846,6 +847,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		fl6->daddr = info->key.u.ipv6.dst;
 		fl6->saddr = info->key.u.ipv6.src;
 		fl6->flowi6_tos = RT_TOS(info->key.tos);
+		fl6->flowlabel = info->key.label;
 		dst_cache = &info->dst_cache;
 	} else {
 		prio = geneve->tos;
@@ -857,6 +859,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		}
 
 		fl6->flowi6_tos = RT_TOS(prio);
+		fl6->flowlabel = geneve->label;
 		fl6->daddr = geneve->remote.sin6.sin6_addr;
 		dst_cache = &geneve->dst_cache;
 	}
@@ -998,6 +1001,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct flowi6 fl6;
 	__u8 prio, ttl;
 	__be16 sport;
+	__be32 label;
 	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
 	u32 flags = geneve->flags;
 
@@ -1041,6 +1045,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 
 		prio = ip_tunnel_ecn_encap(key->tos, iip, skb);
 		ttl = key->ttl;
+		label = info->key.label;
 	} else {
 		err = geneve6_build_skb(dst, skb, 0, geneve->vni,
 					0, NULL, flags, xnet);
@@ -1052,9 +1057,11 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		if (!ttl && ipv6_addr_is_multicast(&fl6.daddr))
 			ttl = 1;
 		ttl = ttl ? : ip6_dst_hoplimit(dst);
+		label = geneve->label;
 	}
+
 	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
-			     &fl6.saddr, &fl6.daddr, prio, ttl, 0,
+			     &fl6.saddr, &fl6.daddr, prio, ttl, label,
 			     sport, geneve->dst_port,
 			     !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX));
 	return NETDEV_TX_OK;
@@ -1238,6 +1245,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_REMOTE6]		= { .len = sizeof(struct in6_addr) },
 	[IFLA_GENEVE_TTL]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_TOS]		= { .type = NLA_U8 },
+	[IFLA_GENEVE_LABEL]		= { .type = NLA_U32 },
 	[IFLA_GENEVE_PORT]		= { .type = NLA_U16 },
 	[IFLA_GENEVE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 	[IFLA_GENEVE_UDP_CSUM]		= { .type = NLA_U8 },
@@ -1295,8 +1303,8 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
 
 static int geneve_configure(struct net *net, struct net_device *dev,
 			    union geneve_addr *remote,
-			    __u32 vni, __u8 ttl, __u8 tos, __be16 dst_port,
-			    bool metadata, u32 flags)
+			    __u32 vni, __u8 ttl, __u8 tos, __be32 label,
+			    __be16 dst_port, bool metadata, u32 flags)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1306,7 +1314,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	if (!remote)
 		return -EINVAL;
 	if (metadata &&
-	    (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl))
+	    (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label))
 		return -EINVAL;
 
 	geneve->net = net;
@@ -1321,10 +1329,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	    (remote->sa.sa_family == AF_INET6 &&
 	     ipv6_addr_is_multicast(&remote->sin6.sin6_addr)))
 		return -EINVAL;
+	if (label && remote->sa.sa_family != AF_INET6)
+		return -EINVAL;
+
 	geneve->remote = *remote;
 
 	geneve->ttl = ttl;
 	geneve->tos = tos;
+	geneve->label = label;
 	geneve->dst_port = dst_port;
 	geneve->collect_md = metadata;
 	geneve->flags = flags;
@@ -1367,6 +1379,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	__u8 ttl = 0, tos = 0;
 	bool metadata = false;
 	union geneve_addr remote = geneve_remote_unspec;
+	__be32 label = 0;
 	__u32 vni = 0;
 	u32 flags = 0;
 
@@ -1403,6 +1416,10 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	if (data[IFLA_GENEVE_TOS])
 		tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
+	if (data[IFLA_GENEVE_LABEL])
+		label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
+			IPV6_FLOWLABEL_MASK;
+
 	if (data[IFLA_GENEVE_PORT])
 		dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]);
 
@@ -1421,8 +1438,8 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
 		flags |= GENEVE_F_UDP_ZERO_CSUM6_RX;
 
-	return geneve_configure(net, dev, &remote, vni, ttl, tos, dst_port,
-				metadata, flags);
+	return geneve_configure(net, dev, &remote, vni, ttl, tos, label,
+				dst_port, metadata, flags);
 }
 
 static void geneve_dellink(struct net_device *dev, struct list_head *head)
@@ -1439,6 +1456,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
+		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
 		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
 		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
@@ -1469,7 +1487,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	}
 
 	if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) ||
-	    nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos))
+	    nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) ||
+	    nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label))
 		goto nla_put_failure;
 
 	if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port))
@@ -1521,7 +1540,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	err = geneve_configure(net, dev, &geneve_remote_unspec,
-			       0, 0, 0, htons(dst_port), true,
+			       0, 0, 0, 0, htons(dst_port), true,
 			       GENEVE_F_UDP_ZERO_CSUM6_RX);
 	if (err)
 		goto err;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6bebc975031d..249eef9a21bd 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -479,6 +479,7 @@ enum {
 	IFLA_GENEVE_UDP_CSUM,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
+	IFLA_GENEVE_LABEL,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 4018ab1875e0d00b84ac61bc15427136ad55849e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:05 +0100
Subject: bpf: support flow label for bpf_skb_{set, get}_tunnel_key

This patch extends bpf_tunnel_key with a tunnel_label member, that maps
to ip_tunnel_key's label so underlying backends like vxlan and geneve
can propagate the label to udp_tunnel6_xmit_skb(), where it's being set
in the IPv6 header. It allows for having 20 more bits to encode/decode
flow related meta information programmatically. Tested with vxlan and
geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0e30b19012a5..924f537183fd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
+	__u32 tunnel_label;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index a66dc03c261f..6fc3893a6170 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1770,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EPROTO;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
+			goto set_compat;
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
 			 */
 			if (ip_tunnel_info_af(info) != AF_INET)
 				return -EINVAL;
+set_compat:
 			to = (struct bpf_tunnel_key *)compat;
 			break;
 		default:
@@ -1787,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
 
-	if (flags & BPF_F_TUNINFO_IPV6)
+	if (flags & BPF_F_TUNINFO_IPV6) {
 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
 		       sizeof(to->remote_ipv6));
-	else
+		to->tunnel_label = be32_to_cpu(info->key.label);
+	} else {
 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
 		memcpy((void *)(long) r2, to, size);
@@ -1850,6 +1855,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
@@ -1862,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 			return -EINVAL;
 		}
 	}
+	if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label))
+		return -EINVAL;
 
 	skb_dst_drop(skb);
 	dst_hold((struct dst_entry *) md);
@@ -1882,6 +1890,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		info->mode |= IP_TUNNEL_INFO_IPV6;
 		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
 		       sizeof(from->remote_ipv6));
+		info->key.label = cpu_to_be32(from->tunnel_label) &
+				  IPV6_FLOWLABEL_MASK;
 	} else {
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
 		if (flags & BPF_F_ZERO_CSUM_TX)
-- 
cgit v1.2.3


From 136ba622de49a6bf1f6e5eab3391ed5d5dbe30e3 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 10 Mar 2016 08:55:50 +0000
Subject: netconf: add macro to represent all attributes

This patch adds macro NETCONFA_ALL to represent all type of netconf
attributes for IPv4 and IPv6.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c           | 40 +++++++++++++++++++++++-----------------
 net/ipv6/addrconf.c          | 36 +++++++++++++++++++++---------------
 3 files changed, 45 insertions(+), 32 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index 23cbd34e4ac7..45dfad509c4d 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -19,6 +19,7 @@ enum {
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
+#define NETCONFA_ALL	-1
 
 #define NETCONFA_IFINDEX_ALL		-1
 #define NETCONFA_IFINDEX_DEFAULT	-2
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 8c3df2ccba45..65e76a48382c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1753,17 +1753,20 @@ static int inet_netconf_msgsize_devconf(int type)
 {
 	int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
 		   + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_RP_FILTER)
+	if (all || type == NETCONFA_RP_FILTER)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -1776,36 +1779,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING,
 			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+	if ((all || type == NETCONFA_RP_FILTER) &&
 	    nla_put_s32(skb, NETCONFA_RP_FILTER,
 			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
 		goto nla_put_failure;
@@ -1893,14 +1899,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet_netconf_fill_devconf(skb, ifindex, devconf,
 					NETLINK_CB(in_skb).portid,
 					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					-1);
+					NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -1944,7 +1950,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
 						      cb->nlh->nlmsg_seq,
 						      RTM_NEWNETCONF,
 						      NLM_F_MULTI,
-						      -1) < 0) {
+						      NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -1960,7 +1966,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -1971,7 +1977,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8c0dab2de5c9..27aed1afcf81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -473,18 +473,21 @@ static int inet6_netconf_msgsize_devconf(int type)
 {
 	int size =  NLMSG_ALIGN(sizeof(struct netconfmsg))
 		    + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
 #ifdef CONFIG_IPV6_MROUTE
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
 #endif
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -497,33 +500,36 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET6;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
 		goto nla_put_failure;
 #ifdef CONFIG_IPV6_MROUTE
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			devconf->mc_forwarding) < 0)
 		goto nla_put_failure;
 #endif
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
 		goto nla_put_failure;
 
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			devconf->ignore_routes_with_linkdown) < 0)
 		goto nla_put_failure;
@@ -609,14 +615,14 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
 					 NETLINK_CB(in_skb).portid,
 					 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					 -1);
+					 NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -660,7 +666,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
 						       cb->nlh->nlmsg_seq,
 						       RTM_NEWNETCONF,
 						       NLM_F_MULTI,
-						       -1) < 0) {
+						       NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -676,7 +682,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -687,7 +693,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
-- 
cgit v1.2.3


From dece8d2b78d19df7fe5e4e965f1f0d1a3e188d1b Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 11 Mar 2016 18:07:31 +0100
Subject: uapi: add MACsec bits

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/Kbuild      |   1 +
 include/uapi/linux/if_ether.h  |   1 +
 include/uapi/linux/if_link.h   |  29 ++++++++
 include/uapi/linux/if_macsec.h | 161 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 192 insertions(+)
 create mode 100644 include/uapi/linux/if_macsec.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ebd10e624598..e25ebcfbcb48 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -173,6 +173,7 @@ header-y += if_hippi.h
 header-y += if_infiniband.h
 header-y += if_link.h
 header-y += if_ltalk.h
+header-y += if_macsec.h
 header-y += if_packet.h
 header-y += if_phonet.h
 header-y += if_plip.h
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index ea9221b0331a..4a93051c578c 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -83,6 +83,7 @@
 #define ETH_P_8021AD	0x88A8          /* 802.1ad Service VLAN		*/
 #define ETH_P_802_EX1	0x88B5		/* 802.1 Local Experimental 1.  */
 #define ETH_P_TIPC	0x88CA		/* TIPC 			*/
+#define ETH_P_MACSEC	0x88E5		/* 802.1ae MACsec */
 #define ETH_P_8021AH	0x88E7          /* 802.1ah Backbone Service Tag */
 #define ETH_P_MVRP	0x88F5          /* 802.1Q MVRP                  */
 #define ETH_P_1588	0x88F7		/* IEEE 1588 Timesync */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 249eef9a21bd..8e3f88fa5b59 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -413,6 +413,35 @@ enum {
 
 #define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1)
 
+/* MACSEC section */
+enum {
+	IFLA_MACSEC_UNSPEC,
+	IFLA_MACSEC_SCI,
+	IFLA_MACSEC_PORT,
+	IFLA_MACSEC_ICV_LEN,
+	IFLA_MACSEC_CIPHER_SUITE,
+	IFLA_MACSEC_WINDOW,
+	IFLA_MACSEC_ENCODING_SA,
+	IFLA_MACSEC_ENCRYPT,
+	IFLA_MACSEC_PROTECT,
+	IFLA_MACSEC_INC_SCI,
+	IFLA_MACSEC_ES,
+	IFLA_MACSEC_SCB,
+	IFLA_MACSEC_REPLAY_PROTECT,
+	IFLA_MACSEC_VALIDATION,
+	__IFLA_MACSEC_MAX,
+};
+
+#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1)
+
+enum macsec_validation_type {
+	MACSEC_VALIDATE_DISABLED = 0,
+	MACSEC_VALIDATE_CHECK = 1,
+	MACSEC_VALIDATE_STRICT = 2,
+	__MACSEC_VALIDATE_END,
+	MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1,
+};
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h
new file mode 100644
index 000000000000..26b0d1e3e3e7
--- /dev/null
+++ b/include/uapi/linux/if_macsec.h
@@ -0,0 +1,161 @@
+/*
+ * include/uapi/linux/if_macsec.h - MACsec device
+ *
+ * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI_MACSEC_H
+#define _UAPI_MACSEC_H
+
+#include <linux/types.h>
+
+#define MACSEC_GENL_NAME "macsec"
+#define MACSEC_GENL_VERSION 1
+
+#define MACSEC_MAX_KEY_LEN 128
+
+#define DEFAULT_CIPHER_ID   0x0080020001000001ULL
+#define DEFAULT_CIPHER_ALT  0x0080C20001000001ULL
+
+#define MACSEC_MIN_ICV_LEN 8
+#define MACSEC_MAX_ICV_LEN 32
+
+enum macsec_attrs {
+	MACSEC_ATTR_UNSPEC,
+	MACSEC_ATTR_IFINDEX,     /* u32, ifindex of the MACsec netdevice */
+	MACSEC_ATTR_RXSC_CONFIG, /* config, nested macsec_rxsc_attrs */
+	MACSEC_ATTR_SA_CONFIG,   /* config, nested macsec_sa_attrs */
+	MACSEC_ATTR_SECY,        /* dump, nested macsec_secy_attrs */
+	MACSEC_ATTR_TXSA_LIST,   /* dump, nested, macsec_sa_attrs for each TXSA */
+	MACSEC_ATTR_RXSC_LIST,   /* dump, nested, macsec_rxsc_attrs for each RXSC */
+	MACSEC_ATTR_TXSC_STATS,  /* dump, nested, macsec_txsc_stats_attr */
+	MACSEC_ATTR_SECY_STATS,  /* dump, nested, macsec_secy_stats_attr */
+	__MACSEC_ATTR_END,
+	NUM_MACSEC_ATTR = __MACSEC_ATTR_END,
+	MACSEC_ATTR_MAX = __MACSEC_ATTR_END - 1,
+};
+
+enum macsec_secy_attrs {
+	MACSEC_SECY_ATTR_UNSPEC,
+	MACSEC_SECY_ATTR_SCI,
+	MACSEC_SECY_ATTR_ENCODING_SA,
+	MACSEC_SECY_ATTR_WINDOW,
+	MACSEC_SECY_ATTR_CIPHER_SUITE,
+	MACSEC_SECY_ATTR_ICV_LEN,
+	MACSEC_SECY_ATTR_PROTECT,
+	MACSEC_SECY_ATTR_REPLAY,
+	MACSEC_SECY_ATTR_OPER,
+	MACSEC_SECY_ATTR_VALIDATE,
+	MACSEC_SECY_ATTR_ENCRYPT,
+	MACSEC_SECY_ATTR_INC_SCI,
+	MACSEC_SECY_ATTR_ES,
+	MACSEC_SECY_ATTR_SCB,
+	__MACSEC_SECY_ATTR_END,
+	NUM_MACSEC_SECY_ATTR = __MACSEC_SECY_ATTR_END,
+	MACSEC_SECY_ATTR_MAX = __MACSEC_SECY_ATTR_END - 1,
+};
+
+enum macsec_rxsc_attrs {
+	MACSEC_RXSC_ATTR_UNSPEC,
+	MACSEC_RXSC_ATTR_SCI,     /* config/dump, u64 */
+	MACSEC_RXSC_ATTR_ACTIVE,  /* config/dump, u8 0..1 */
+	MACSEC_RXSC_ATTR_SA_LIST, /* dump, nested */
+	MACSEC_RXSC_ATTR_STATS,   /* dump, nested, macsec_rxsc_stats_attr */
+	__MACSEC_RXSC_ATTR_END,
+	NUM_MACSEC_RXSC_ATTR = __MACSEC_RXSC_ATTR_END,
+	MACSEC_RXSC_ATTR_MAX = __MACSEC_RXSC_ATTR_END - 1,
+};
+
+enum macsec_sa_attrs {
+	MACSEC_SA_ATTR_UNSPEC,
+	MACSEC_SA_ATTR_AN,     /* config/dump, u8 0..3 */
+	MACSEC_SA_ATTR_ACTIVE, /* config/dump, u8 0..1 */
+	MACSEC_SA_ATTR_PN,     /* config/dump, u32 */
+	MACSEC_SA_ATTR_KEY,    /* config, data */
+	MACSEC_SA_ATTR_KEYID,  /* config/dump, u64 */
+	MACSEC_SA_ATTR_STATS,  /* dump, nested, macsec_sa_stats_attr */
+	__MACSEC_SA_ATTR_END,
+	NUM_MACSEC_SA_ATTR = __MACSEC_SA_ATTR_END,
+	MACSEC_SA_ATTR_MAX = __MACSEC_SA_ATTR_END - 1,
+};
+
+enum macsec_nl_commands {
+	MACSEC_CMD_GET_TXSC,
+	MACSEC_CMD_ADD_RXSC,
+	MACSEC_CMD_DEL_RXSC,
+	MACSEC_CMD_UPD_RXSC,
+	MACSEC_CMD_ADD_TXSA,
+	MACSEC_CMD_DEL_TXSA,
+	MACSEC_CMD_UPD_TXSA,
+	MACSEC_CMD_ADD_RXSA,
+	MACSEC_CMD_DEL_RXSA,
+	MACSEC_CMD_UPD_RXSA,
+};
+
+/* u64 per-RXSC stats */
+enum macsec_rxsc_stats_attr {
+	MACSEC_RXSC_STATS_ATTR_UNSPEC,
+	MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED,
+	MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA,
+	__MACSEC_RXSC_STATS_ATTR_END,
+	NUM_MACSEC_RXSC_STATS_ATTR = __MACSEC_RXSC_STATS_ATTR_END,
+	MACSEC_RXSC_STATS_ATTR_MAX = __MACSEC_RXSC_STATS_ATTR_END - 1,
+};
+
+/* u32 per-{RX,TX}SA stats */
+enum macsec_sa_stats_attr {
+	MACSEC_SA_STATS_ATTR_UNSPEC,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_OK,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA,
+	MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED,
+	MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED,
+	__MACSEC_SA_STATS_ATTR_END,
+	NUM_MACSEC_SA_STATS_ATTR = __MACSEC_SA_STATS_ATTR_END,
+	MACSEC_SA_STATS_ATTR_MAX = __MACSEC_SA_STATS_ATTR_END - 1,
+};
+
+/* u64 per-TXSC stats */
+enum macsec_txsc_stats_attr {
+	MACSEC_TXSC_STATS_ATTR_UNSPEC,
+	MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED,
+	__MACSEC_TXSC_STATS_ATTR_END,
+	NUM_MACSEC_TXSC_STATS_ATTR = __MACSEC_TXSC_STATS_ATTR_END,
+	MACSEC_TXSC_STATS_ATTR_MAX = __MACSEC_TXSC_STATS_ATTR_END - 1,
+};
+
+/* u64 per-SecY stats */
+enum macsec_secy_stats_attr {
+	MACSEC_SECY_STATS_ATTR_UNSPEC,
+	MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED,
+	MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN,
+	__MACSEC_SECY_STATS_ATTR_END,
+	NUM_MACSEC_SECY_STATS_ATTR = __MACSEC_SECY_STATS_ATTR_END,
+	MACSEC_SECY_STATS_ATTR_MAX = __MACSEC_SECY_STATS_ATTR_END - 1,
+};
+
+#endif /* _UAPI_MACSEC_H */
-- 
cgit v1.2.3


From a44d6eacdaf56f74fad699af7f4925a5f5ac0e7f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 14 Mar 2016 10:52:15 -0700
Subject: tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In

Per RFC4898, they count segments sent/received
containing a positive length data segment (that includes
retransmission segments carrying data).  Unlike
tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments
carrying no data (e.g. pure ack).

The patch also updates the segs_in in tcp_fastopen_add_skb()
so that segs_in >= data_segs_in property is kept.

Together with retransmission data, tcpi_data_segs_out
gives a better signal on the rxmit rate.

v6: Rebase on the latest net-next

v5: Eric pointed out that checking skb->len is still needed in
tcp_fastopen_add_skb() because skb can carry a FIN without data.
Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in()
helper is used.  Comment is added to the fastopen case to explain why
segs_in has to be reset and tcp_segs_in() has to be called before
__skb_pull().

v4: Add comment to the changes in tcp_fastopen_add_skb()
and also add remark on this case in the commit message.

v3: Add const modifier to the skb parameter in tcp_segs_in()

v2: Rework based on recent fix by Eric:
commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment")

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Chris Rapier <rapier@psc.edu>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Marcelo Ricardo Leitner <mleitner@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  6 ++++++
 include/net/tcp.h        | 10 ++++++++++
 include/uapi/linux/tcp.h |  2 ++
 net/ipv4/tcp.c           |  2 ++
 net/ipv4/tcp_fastopen.c  |  8 ++++++++
 net/ipv4/tcp_ipv4.c      |  2 +-
 net/ipv4/tcp_minisocks.c |  2 +-
 net/ipv4/tcp_output.c    |  4 +++-
 net/ipv6/tcp_ipv6.c      |  2 +-
 9 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index bcbf51da4e1e..7be9b1242354 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -158,6 +158,9 @@ struct tcp_sock {
 	u32	segs_in;	/* RFC4898 tcpEStatsPerfSegsIn
 				 * total number of segments in.
 				 */
+	u32	data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
+				 * total number of data segments in.
+				 */
  	u32	rcv_nxt;	/* What we want to receive next 	*/
 	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
@@ -165,6 +168,9 @@ struct tcp_sock {
 	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
 				 * The total number of segments sent.
 				 */
+	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
+				 * total number of data segments sent.
+				 */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0302636af98c..c8dbd293daae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1840,4 +1840,14 @@ static inline int tcp_inq(struct sock *sk)
 	return answ;
 }
 
+static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+	u16 segs_in;
+
+	segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tp->segs_in += segs_in;
+	if (skb->len > tcp_hdrlen(skb))
+		tp->data_segs_in += segs_in;
+}
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index fe95446e9abf..53e8e3fe6b1b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -199,6 +199,8 @@ struct tcp_info {
 
 	__u32	tcpi_notsent_bytes;
 	__u32	tcpi_min_rtt;
+	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
+	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a265f00b9df9..992b3103ec3e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2715,6 +2715,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_notsent_bytes = max(0, notsent_bytes);
 
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
+	info->tcpi_data_segs_in = tp->data_segs_in;
+	info->tcpi_data_segs_out = tp->data_segs_out;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index fdb286ddba04..4fc0061bebf4 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -140,6 +140,14 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 		return;
 
 	skb_dst_drop(skb);
+	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
+	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
+	 * to avoid double counting.  Also, tcp_segs_in() expects
+	 * skb->len to include the tcp_hdrlen.  Hence, it should
+	 * be called before __skb_pull().
+	 */
+	tp->segs_in = 0;
+	tcp_segs_in(tp, skb);
 	__skb_pull(skb, tcp_hdrlen(skb));
 	skb_set_owner_r(skb, sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4c8d58dfac9b..0b02ef773705 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1650,7 +1650,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ae90e4b34bd3..acb366dd61e6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -812,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	int ret = 0;
 	int state = child->sk_state;
 
-	tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
 		ret = tcp_rcv_state_process(child, skb);
 		/* Wakeup parent, send SIGIO */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d2c7a400456..7d2dc015cd19 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1003,8 +1003,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
-	if (skb->len != tcp_header_size)
+	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
+		tp->data_segs_out += tcp_skb_pcount(skb);
+	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33f2820181f9..9c16565b70cc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1443,7 +1443,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
-- 
cgit v1.2.3


From bfa3f9d7f3b349acea8982d2248e33a0ed84c687 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:16 -0800
Subject: netfilter: Remove IP_CT_NEW_REPLY definition.

Remove the definition of IP_CT_NEW_REPLY from the kernel as it does
not make sense.  This allows the definition of IP_CT_NUMBER to be
simplified as well.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++++++++---
 net/openvswitch/conntrack.c                        |  2 --
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 319f47128db8..6d074d14ee27 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -20,9 +20,15 @@ enum ip_conntrack_info {
 
 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
-	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
-	/* Number of distinct IP_CT types (no NEW in reply dirn). */
-	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
+	/* No NEW in reply direction. */
+
+	/* Number of distinct IP_CT types. */
+	IP_CT_NUMBER,
+
+	/* only for userspace compatibility */
+#ifndef __KERNEL__
+	IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#endif
 };
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ee6ff8ffc12d..304529015744 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -75,7 +75,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 	switch (ctinfo) {
 	case IP_CT_ESTABLISHED_REPLY:
 	case IP_CT_RELATED_REPLY:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_REPLY_DIR;
 		break;
 	default:
@@ -92,7 +91,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 		ct_state |= OVS_CS_F_RELATED;
 		break;
 	case IP_CT_NEW:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_NEW;
 		break;
 	default:
-- 
cgit v1.2.3


From 05752523e56502cd9975aec0a2ded465d51a71f3 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:23 -0800
Subject: openvswitch: Interface with NAT.

Extend OVS conntrack interface to cover NAT.  New nested
OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action.
A bare OVS_CT_ATTR_NAT only mangles existing and expected connections.
If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
attributes, new (non-committed/non-confirmed) connections are mangled
according to the rest of the nested attributes.

The corresponding OVS userspace patch series includes test cases (in
tests/system-traffic.at) that also serve as example uses.

This work extends on a branch by Thomas Graf at
https://github.com/tgraf/ovs/tree/nat.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/openvswitch.h |  49 ++++
 net/openvswitch/Kconfig          |   3 +-
 net/openvswitch/conntrack.c      | 524 +++++++++++++++++++++++++++++++++++++--
 net/openvswitch/conntrack.h      |   3 +-
 4 files changed, 551 insertions(+), 28 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index a27222d5b413..616d04761730 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
 #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
 #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
 #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
+					 * mangled by NAT.
+					 */
+#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
+					 * was mangled by NAT.
+					 */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -632,6 +640,8 @@ struct ovs_action_hash {
  * mask. For each bit set in the mask, the corresponding bit in the value is
  * copied to the connection tracking label field in the connection.
  * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -641,11 +651,50 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_LABELS,     /* labels to associate with this connection. */
 	OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
 				   related connections. */
+	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
 	__OVS_CT_ATTR_MAX
 };
 
 #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
 
+/**
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ *
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port).  Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified.  Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified.  As a corollary, a
+ * NAT action without a NAT type flag will only mangle packets of committed
+ * connections.  The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+	OVS_NAT_ATTR_UNSPEC,
+	OVS_NAT_ATTR_SRC,
+	OVS_NAT_ATTR_DST,
+	OVS_NAT_ATTR_IP_MIN,
+	OVS_NAT_ATTR_IP_MAX,
+	OVS_NAT_ATTR_PROTO_MIN,
+	OVS_NAT_ATTR_PROTO_MAX,
+	OVS_NAT_ATTR_PERSISTENT,
+	OVS_NAT_ATTR_PROTO_HASH,
+	OVS_NAT_ATTR_PROTO_RANDOM,
+	__OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
 /**
  * enum ovs_action_attr - Action types.
  *
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index cd5fd9d728a7..234a73344c6e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,7 +6,8 @@ config OPENVSWITCH
 	tristate "Open vSwitch"
 	depends on INET
 	depends on !NF_CONNTRACK || \
-		   (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+				     (!NF_NAT || NF_NAT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index f718b724e650..dc5eb29fe7d6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
 
 #include <linux/module.h>
 #include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
 #include <net/ip.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#endif
+
 #include "datapath.h"
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_netlink.h"
 
 struct ovs_ct_len_tbl {
-	size_t maxlen;
-	size_t minlen;
+	int maxlen;
+	int minlen;
 };
 
 /* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
 	struct ovs_key_ct_labels mask;
 };
 
+enum ovs_ct_nat {
+	OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
+	OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
+	OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
+};
+
 /* Conntrack action context for execution. */
 struct ovs_conntrack_info {
 	struct nf_conntrack_helper *helper;
 	struct nf_conntrack_zone zone;
 	struct nf_conn *ct;
 	u8 commit : 1;
+	u8 nat : 3;                 /* enum ovs_ct_nat */
 	u16 family;
 	struct md_mark mark;
 	struct md_labels labels;
+#ifdef CONFIG_NF_NAT_NEEDED
+	struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */
+#endif
 };
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -137,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 	ovs_ct_get_labels(ct, &key->ct.labels);
 }
 
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action.  If
+ * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
+ * initialized from the connection status.
  */
 static void ovs_ct_update_key(const struct sk_buff *skb,
 			      const struct ovs_conntrack_info *info,
-			      struct sw_flow_key *key, bool post_ct)
+			      struct sw_flow_key *key, bool post_ct,
+			      bool keep_nat_flags)
 {
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
@@ -160,6 +183,14 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 		 */
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
+		if (keep_nat_flags) {
+			state |= key->ct.state & OVS_CS_F_NAT_MASK;
+		} else {
+			if (ct->status & IPS_SRC_NAT)
+				state |= OVS_CS_F_SRC_NAT;
+			if (ct->status & IPS_DST_NAT)
+				state |= OVS_CS_F_DST_NAT;
+		}
 		zone = nf_ct_zone(ct);
 	} else if (post_ct) {
 		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -174,7 +205,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
  */
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 {
-	ovs_ct_update_key(skb, NULL, key, false);
+	ovs_ct_update_key(skb, NULL, key, false, false);
 }
 
 int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -263,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 	enum ip_conntrack_info ctinfo;
 	unsigned int protoff;
 	struct nf_conn *ct;
+	int err;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -299,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 		return NF_DROP;
 	}
 
-	return helper->help(skb, protoff, ct, ctinfo);
+	err = helper->help(skb, protoff, ct, ctinfo);
+	if (err != NF_ACCEPT)
+		return err;
+
+	/* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
+	 * FTP with NAT) adusting the TCP payload size when mangling IP
+	 * addresses and/or port numbers in the text-based control connection.
+	 */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+		return NF_DROP;
+	return NF_ACCEPT;
 }
 
 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -468,6 +511,200 @@ static bool skb_nfct_cached(struct net *net,
 	return true;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo,
+			      const struct nf_nat_range *range,
+			      enum nf_nat_manip_type maniptype)
+{
+	int hooknum, nh_off, err = NF_ACCEPT;
+
+	nh_off = skb_network_offset(skb);
+	skb_pull(skb, nh_off);
+
+	/* See HOOK2MANIP(). */
+	if (maniptype == NF_NAT_MANIP_SRC)
+		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+	else
+		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (skb->protocol == htons(ETH_P_IP) &&
+		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				err = NF_DROP;
+			goto push;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			__be16 frag_off;
+			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+			int hdrlen = ipv6_skip_exthdr(skb,
+						      sizeof(struct ipv6hdr),
+						      &nexthdr, &frag_off);
+
+			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+				if (!nf_nat_icmpv6_reply_translation(skb, ct,
+								     ctinfo,
+								     hooknum,
+								     hdrlen))
+					err = NF_DROP;
+				goto push;
+			}
+#endif
+		}
+		/* Non-ICMP, fall thru to initialize if needed. */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			/* Initialize according to the NAT action. */
+			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+				/* Action is set up to establish a new
+				 * mapping.
+				 */
+				? nf_nat_setup_info(ct, range, maniptype)
+				: nf_nat_alloc_null_binding(ct, hooknum);
+			if (err != NF_ACCEPT)
+				goto push;
+		}
+		break;
+
+	case IP_CT_ESTABLISHED:
+	case IP_CT_ESTABLISHED_REPLY:
+		break;
+
+	default:
+		err = NF_DROP;
+		goto push;
+	}
+
+	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+	skb_push(skb, nh_off);
+
+	return err;
+}
+
+static void ovs_nat_update_key(struct sw_flow_key *key,
+			       const struct sk_buff *skb,
+			       enum nf_nat_manip_type maniptype)
+{
+	if (maniptype == NF_NAT_MANIP_SRC) {
+		__be16 src;
+
+		key->ct.state |= OVS_CS_F_SRC_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.src = ip_hdr(skb)->saddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+			       sizeof(key->ipv6.addr.src));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			src = udp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_TCP)
+			src = tcp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			src = sctp_hdr(skb)->source;
+		else
+			return;
+
+		key->tp.src = src;
+	} else {
+		__be16 dst;
+
+		key->ct.state |= OVS_CS_F_DST_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+			       sizeof(key->ipv6.addr.dst));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			dst = udp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_TCP)
+			dst = tcp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			dst = sctp_hdr(skb)->dest;
+		else
+			return;
+
+		key->tp.dst = dst;
+	}
+}
+
+/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	enum nf_nat_manip_type maniptype;
+	int err;
+
+	if (nf_ct_is_untracked(ct)) {
+		/* A NAT action may only be performed on tracked packets. */
+		return NF_ACCEPT;
+	}
+
+	/* Add NAT extension if not confirmed yet. */
+	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+		return NF_ACCEPT;   /* Can't NAT. */
+
+	/* Determine NAT type.
+	 * Check if the NAT type can be deduced from the tracked connection.
+	 * Make sure expected traffic is NATted only when committing.
+	 */
+	if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
+	    ct->status & IPS_NAT_MASK &&
+	    (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) {
+		/* NAT an established or related connection like before. */
+		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+			/* This is the REPLY direction for a connection
+			 * for which NAT was applied in the forward
+			 * direction.  Do the reverse NAT.
+			 */
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+		else
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+	} else if (info->nat & OVS_CT_SRC_NAT) {
+		maniptype = NF_NAT_MANIP_SRC;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		maniptype = NF_NAT_MANIP_DST;
+	} else {
+		return NF_ACCEPT; /* Connection is not NATed. */
+	}
+	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+	/* Mark NAT done if successful and update the flow key. */
+	if (err == NF_ACCEPT)
+		ovs_nat_update_key(key, skb, maniptype);
+
+	return err;
+}
+#else /* !CONFIG_NF_NAT_NEEDED */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	return NF_ACCEPT;
+}
+#endif
+
 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
  * not done already.  Update key with new CT state after passing the packet
  * through conntrack.
@@ -509,19 +746,43 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		if (err != NF_ACCEPT)
 			return -ENOENT;
 
-		ovs_ct_update_key(skb, info, key, true);
+		/* Clear CT state NAT flags to mark that we have not yet done
+		 * NAT after the nf_conntrack_in() call.  We can actually clear
+		 * the whole state, as it will be re-initialized below.
+		 */
+		key->ct.state = 0;
+
+		/* Update the key, but keep the NAT flags. */
+		ovs_ct_update_key(skb, info, key, true, true);
 	}
 
-	/* Call the helper only if:
-	 * - nf_conntrack_in() was executed above ("!cached") for a confirmed
-	 *   connection, or
-	 * - When committing an unconfirmed connection.
-	 */
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
-	    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
-		WARN_ONCE(1, "helper rejected packet");
-		return -EINVAL;
+	if (ct) {
+		/* Packets starting a new connection must be NATted before the
+		 * helper, so that the helper knows about the NAT.  We enforce
+		 * this by delaying both NAT and helper calls for unconfirmed
+		 * connections until the committing CT action.  For later
+		 * packets NAT and Helper may be called in either order.
+		 *
+		 * NAT will be done only if the CT action has NAT, and only
+		 * once per packet (per zone), as guarded by the NAT bits in
+		 * the key->ct.state.
+		 */
+		if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+		    (nf_ct_is_confirmed(ct) || info->commit) &&
+		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
+			return -EINVAL;
+		}
+
+		/* Call the helper only if:
+		 * - nf_conntrack_in() was executed above ("!cached") for a
+		 *   confirmed connection, or
+		 * - When committing an unconfirmed connection.
+		 */
+		if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+		    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -545,15 +806,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	if (exp) {
 		u8 state;
 
+		/* NOTE: New connections are NATted and Helped only when
+		 * committed, so we are not calling into NAT here.
+		 */
 		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
 		__ovs_ct_update_key(key, state, &info->zone, exp->master);
-	} else {
-		int err;
-
-		err = __ovs_ct_lookup(net, key, info, skb);
-		if (err)
-			return err;
-	}
+	} else
+		return __ovs_ct_lookup(net, key, info, skb);
 
 	return 0;
 }
@@ -653,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 	return 0;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static int parse_nat(const struct nlattr *attr,
+		     struct ovs_conntrack_info *info, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool have_ip_max = false;
+	bool have_proto_max = false;
+	bool ip_vers = (info->family == NFPROTO_IPV6);
+
+	nla_for_each_nested(a, attr, rem) {
+		static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+			[OVS_NAT_ATTR_SRC] = {0, 0},
+			[OVS_NAT_ATTR_DST] = {0, 0},
+			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+		};
+		int type = nla_type(a);
+
+		if (type > OVS_NAT_ATTR_MAX) {
+			OVS_NLERR(log,
+				  "Unknown NAT attribute (type=%d, max=%d).\n",
+				  type, OVS_NAT_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
+			OVS_NLERR(log,
+				  "NAT attribute type %d has unexpected length (%d != %d).\n",
+				  type, nla_len(a),
+				  ovs_nat_attr_lens[type][ip_vers]);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NAT_ATTR_SRC:
+		case OVS_NAT_ATTR_DST:
+			if (info->nat) {
+				OVS_NLERR(log,
+					  "Only one type of NAT may be specified.\n"
+					  );
+				return -ERANGE;
+			}
+			info->nat |= OVS_CT_NAT;
+			info->nat |= ((type == OVS_NAT_ATTR_SRC)
+					? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
+			break;
+
+		case OVS_NAT_ATTR_IP_MIN:
+			nla_memcpy(&info->range.min_addr, a, nla_len(a));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_IP_MAX:
+			have_ip_max = true;
+			nla_memcpy(&info->range.max_addr, a,
+				   sizeof(info->range.max_addr));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MIN:
+			info->range.min_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MAX:
+			have_proto_max = true;
+			info->range.max_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PERSISTENT:
+			info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_HASH:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_RANDOM:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+			break;
+
+		default:
+			OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+		return -EINVAL;
+	}
+	if (!info->nat) {
+		/* Do not allow flags if no type is given. */
+		if (info->range.flags) {
+			OVS_NLERR(log,
+				  "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
+				  );
+			return -EINVAL;
+		}
+		info->nat = OVS_CT_NAT;   /* NAT existing connections. */
+	} else if (!info->commit) {
+		OVS_NLERR(log,
+			  "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
+			  );
+		return -EINVAL;
+	}
+	/* Allow missing IP_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+		memcpy(&info->range.max_addr, &info->range.min_addr,
+		       sizeof(info->range.max_addr));
+	}
+	/* Allow missing PROTO_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    !have_proto_max) {
+		info->range.max_proto.all = info->range.min_proto.all;
+	}
+	return 0;
+}
+#endif
+
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
 	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
@@ -662,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_LABELS]	= { .minlen = sizeof(struct md_labels),
 				    .maxlen = sizeof(struct md_labels) },
 	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
-				    .maxlen = NF_CT_HELPER_NAME_LEN }
+				    .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+	/* NAT length is checked when parsing the nested attributes. */
+	[OVS_CT_ATTR_NAT]	= { .minlen = 0, .maxlen = INT_MAX },
+#endif
 };
 
 static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -729,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 				return -EINVAL;
 			}
 			break;
+#ifdef CONFIG_NF_NAT_NEEDED
+		case OVS_CT_ATTR_NAT: {
+			int err = parse_nat(a, info, log);
+
+			if (err)
+				return err;
+			break;
+		}
+#endif
 		default:
 			OVS_NLERR(log, "Unknown conntrack attr (%d)",
 				  type);
@@ -816,6 +1217,74 @@ err_free_ct:
 	return err;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+			       struct sk_buff *skb)
+{
+	struct nlattr *start;
+
+	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+	if (!start)
+		return false;
+
+	if (info->nat & OVS_CT_SRC_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+			return false;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+			return false;
+	} else {
+		goto out;
+	}
+
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+		if (info->family == NFPROTO_IPV4) {
+			if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					    info->range.min_addr.ip) ||
+			    (info->range.max_addr.ip
+			     != info->range.min_addr.ip &&
+			     (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					      info->range.max_addr.ip))))
+				return false;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (info->family == NFPROTO_IPV6) {
+			if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					     &info->range.min_addr.in6) ||
+			    (memcmp(&info->range.max_addr.in6,
+				    &info->range.min_addr.in6,
+				    sizeof(info->range.max_addr.in6)) &&
+			     (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					       &info->range.max_addr.in6))))
+				return false;
+#endif
+		} else {
+			return false;
+		}
+	}
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+			 ntohs(info->range.min_proto.all)) ||
+	     (info->range.max_proto.all != info->range.min_proto.all &&
+	      nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+			  ntohs(info->range.max_proto.all)))))
+		return false;
+
+	if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+		return false;
+out:
+	nla_nest_end(skb, start);
+
+	return true;
+}
+#endif
+
 int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 			  struct sk_buff *skb)
 {
@@ -844,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 				   ct_info->helper->name))
 			return -EMSGSIZE;
 	}
-
+#ifdef CONFIG_NF_NAT_NEEDED
+	if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
+		return -EMSGSIZE;
+#endif
 	nla_nest_end(skb, start);
 
 	return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
 
 #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
 			   OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
-			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+			   OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 #else
 #include <linux/errno.h>
 
-- 
cgit v1.2.3


From 93e68cd6115f67d8363c94dae8206af36f6d3b00 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Wed, 16 Mar 2016 09:12:46 +0000
Subject: net: fix a comment typo

Fix a comment typo.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 9cf2394f0bcf..f80277569f24 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -37,7 +37,7 @@
  * are shared for all types of net_devices. The sysfs entries are available
  * via /sys/class/net/<dev>/flags. Flags which can be toggled through sysfs
  * are annotated below, note that only a few flags can be toggled and some
- * other flags are always always preserved from the original net_device flags
+ * other flags are always preserved from the original net_device flags
  * even if you try to set them via sysfs. Flags which are always preserved
  * are kept under the flag grouping @IFF_VOLATILE. Flags which are volatile
  * are annotated below as such.
-- 
cgit v1.2.3