aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2024-12-05 11:48:58 -0800
committerJakub Kicinski <kuba@kernel.org>2024-12-12 14:19:05 -0800
commit5098462fbac60cbec76171a8b4998a36b85891a1 (patch)
treee9d6b62251ba4a670216f12390cc1ccad8148569 /kernel
parentMerge branch 'net-smc-two-features-for-smc-r' (diff)
parentMerge tag 'net-6.13-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/net... (diff)
downloadlinux-5098462fbac60cbec76171a8b4998a36b85891a1.tar.gz
linux-5098462fbac60cbec76171a8b4998a36b85891a1.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.13-rc3). No conflicts or adjacent changes. Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/auditsc.c4
-rw-r--r--kernel/bpf/bpf_lsm.c2
-rw-r--r--kernel/bpf/devmap.c6
-rw-r--r--kernel/bpf/lpm_trie.c133
-rw-r--r--kernel/bpf/verifier.c27
-rw-r--r--kernel/futex/futex.h4
-rw-r--r--kernel/irq/proc.c7
-rw-r--r--kernel/locking/rtmutex.c3
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/fair.c14
-rw-r--r--kernel/sched/syscalls.c2
-rw-r--r--kernel/softirq.c15
-rw-r--r--kernel/time/clocksource.c11
-rw-r--r--kernel/time/timekeeping.c6
-rw-r--r--kernel/time/timekeeping_internal.h8
-rw-r--r--kernel/trace/trace_eprobe.c5
17 files changed, 163 insertions, 91 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 279ba5c420a4..561d96affe9f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2728,8 +2728,8 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getlsmprop_obj(t, &context->target_ref);
strscpy(context->target_comm, t->comm);
+ security_task_getlsmprop_obj(t, &context->target_ref);
}
/**
@@ -2755,8 +2755,8 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getlsmprop_obj(t, &ctx->target_ref);
strscpy(ctx->target_comm, t->comm);
+ security_task_getlsmprop_obj(t, &ctx->target_ref);
return 0;
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 3bc61628ab25..967492b65185 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -375,8 +375,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_current_getsecid_subj)
-BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index effde52bc857..482d284a1553 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -184,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i;
+ u32 i;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
@@ -821,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
@@ -838,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
- int k = *(u32 *)key;
+ u32 k = *(u32 *)key;
unsigned long flags;
int ret = -ENOENT;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9b60eda0f727..f8bc1e096182 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -15,6 +15,7 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -22,7 +23,6 @@
struct lpm_trie_node;
struct lpm_trie_node {
- struct rcu_head rcu;
struct lpm_trie_node __rcu *child[2];
u32 prefixlen;
u32 flags;
@@ -32,10 +32,11 @@ struct lpm_trie_node {
struct lpm_trie {
struct bpf_map map;
struct lpm_trie_node __rcu *root;
+ struct bpf_mem_alloc ma;
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
- spinlock_t lock;
+ raw_spinlock_t lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
@@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
return found->data + trie->data_size;
}
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
- const void *value)
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
+ const void *value,
+ bool disable_migration)
{
struct lpm_trie_node *node;
- size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
- if (value)
- size += trie->map.value_size;
+ if (disable_migration)
+ migrate_disable();
+ node = bpf_mem_cache_alloc(&trie->ma);
+ if (disable_migration)
+ migrate_enable();
- node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
- trie->map.numa_node);
if (!node)
return NULL;
@@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
return node;
}
+static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
+{
+ if (flags == BPF_EXIST)
+ return -ENOENT;
+ if (trie->n_entries == trie->map.max_entries)
+ return -ENOSPC;
+ trie->n_entries++;
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
- struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+ struct lpm_trie_node *node, *im_node, *new_node;
struct lpm_trie_node *free_node = NULL;
struct lpm_trie_node __rcu **slot;
struct bpf_lpm_trie_key_u8 *key = _key;
@@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
-
- /* Allocate and fill a new node */
-
- if (trie->n_entries == trie->map.max_entries) {
- ret = -ENOSPC;
- goto out;
- }
-
- new_node = lpm_trie_node_alloc(trie, value);
- if (!new_node) {
- ret = -ENOMEM;
- goto out;
- }
+ /* Allocate and fill a new node. Need to disable migration before
+ * invoking bpf_mem_cache_alloc().
+ */
+ new_node = lpm_trie_node_alloc(trie, value, true);
+ if (!new_node)
+ return -ENOMEM;
- trie->n_entries++;
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
new_node->prefixlen = key->prefixlen;
RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map,
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
- node->prefixlen == key->prefixlen ||
- node->prefixlen == trie->max_prefixlen)
+ node->prefixlen == key->prefixlen)
break;
next_bit = extract_bit(key->data, node->prefixlen);
@@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map,
* simply assign the @new_node to that slot and be done.
*/
if (!node) {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
rcu_assign_pointer(*slot, new_node);
goto out;
}
@@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map,
* which already has the correct data array set.
*/
if (node->prefixlen == matchlen) {
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ if (flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out;
+ }
+ } else {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+ }
+
new_node->child[0] = node->child[0];
new_node->child[1] = node->child[1];
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
- trie->n_entries--;
-
rcu_assign_pointer(*slot, new_node);
free_node = node;
goto out;
}
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
/* If the new node matches the prefix completely, it must be inserted
* as an ancestor. Simply insert it between @node and *@slot.
*/
@@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map,
goto out;
}
- im_node = lpm_trie_node_alloc(trie, NULL);
+ /* migration is disabled within the locked scope */
+ im_node = lpm_trie_node_alloc(trie, NULL, false);
if (!im_node) {
+ trie->n_entries--;
ret = -ENOMEM;
goto out;
}
@@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(*slot, im_node);
out:
- if (ret) {
- if (new_node)
- trie->n_entries--;
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
- kfree(new_node);
- kfree(im_node);
- }
-
- spin_unlock_irqrestore(&trie->lock, irq_flags);
- kfree_rcu(free_node, rcu);
+ migrate_disable();
+ if (ret)
+ bpf_mem_cache_free(&trie->ma, new_node);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
+ migrate_enable();
return ret;
}
@@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- spin_lock_irqsave(&trie->lock, irq_flags);
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
@@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
free_node = node;
out:
- spin_unlock_irqrestore(&trie->lock, irq_flags);
- kfree_rcu(free_parent, rcu);
- kfree_rcu(free_node, rcu);
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ migrate_disable();
+ bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
+ migrate_enable();
return ret;
}
@@ -559,6 +580,8 @@ out:
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
+ size_t leaf_size;
+ int err;
/* check sanity of attributes */
if (attr->max_entries == 0 ||
@@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;
- spin_lock_init(&trie->lock);
+ raw_spin_lock_init(&trie->lock);
+ /* Allocate intermediate and leaf nodes from the same allocator */
+ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+ if (err)
+ goto free_out;
return &trie->map;
+
+free_out:
+ bpf_map_area_free(trie);
+ return ERR_PTR(err);
}
static void trie_free(struct bpf_map *map)
@@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map)
continue;
}
- kfree(node);
+ /* No bpf program may access the map, so freeing the
+ * node without waiting for the extra RCU GP.
+ */
+ bpf_mem_cache_raw_free(node);
RCU_INIT_POINTER(*slot, NULL);
break;
}
}
out:
+ bpf_mem_alloc_destroy(&trie->ma);
bpf_map_area_free(trie);
}
@@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
struct lpm_trie_node **node_stack = NULL;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
- size_t matchlen;
+ size_t matchlen = 0;
/* The get_next_key follows postorder. For the 4 node example in
* the top of this file, the trie_get_next_key() returns the following
@@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
next_bit = extract_bit(key->data, node->prefixlen);
node = rcu_dereference(node->child[next_bit]);
}
- if (!node || node->prefixlen != key->prefixlen ||
+ if (!node || node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM))
goto find_leftmost;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1c4ebb326785..01fbef9576e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1202,14 +1202,17 @@ static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
* case they are equivalent, or it's STACK_ZERO, in which case we preserve
* more precise STACK_ZERO.
- * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take
- * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
+ * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
+ * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
+ * unnecessary as both are considered equivalent when loading data and pruning,
+ * in case of unprivileged mode it will be incorrect to allow reads of invalid
+ * slots.
*/
static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
if (*stype == STACK_ZERO)
return;
- if (env->allow_ptr_leaks && *stype == STACK_INVALID)
+ if (*stype == STACK_INVALID)
return;
*stype = STACK_MISC;
}
@@ -4700,6 +4703,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
*/
if (!env->allow_ptr_leaks &&
is_spilled_reg(&state->stack[spi]) &&
+ !is_spilled_scalar_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -8071,7 +8075,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
verbose(env,
"arg#%d expected pointer to stack or const struct bpf_dynptr\n",
- regno);
+ regno - 1);
return -EINVAL;
}
@@ -8125,7 +8129,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
if (!is_dynptr_reg_valid_init(env, reg)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
- regno);
+ regno - 1);
return -EINVAL;
}
@@ -8133,7 +8137,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
verbose(env,
"Expected a dynptr of type %s as arg #%d\n",
- dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
return -EINVAL;
}
@@ -8189,6 +8193,11 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
const struct btf_type *t;
int spi, err, i, nr_slots, btf_id;
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+ return -EINVAL;
+ }
+
/* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
* ensures struct convention, so we wouldn't need to do any BTF
* validation here. But given iter state can be passed as a parameter
@@ -8197,7 +8206,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
*/
btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
if (btf_id < 0) {
- verbose(env, "expected valid iter pointer as arg #%d\n", regno);
+ verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
return -EINVAL;
}
t = btf_type_by_id(meta->btf, btf_id);
@@ -8207,7 +8216,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
/* bpf_iter_<type>_new() expects pointer to uninit iter state */
if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
verbose(env, "expected uninitialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno);
+ iter_type_str(meta->btf, btf_id), regno - 1);
return -EINVAL;
}
@@ -8231,7 +8240,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
break;
case -EINVAL:
verbose(env, "expected an initialized iter_%s as arg #%d\n",
- iter_type_str(meta->btf, btf_id), regno);
+ iter_type_str(meta->btf, btf_id), regno - 1);
return err;
case -EPROTO:
verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 618ce1fe870e..99b32e728c4a 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -265,11 +265,11 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
else if (!user_read_access_begin(from, sizeof(*from)))
return -EFAULT;
unsafe_get_user(val, from, Efault);
- user_access_end();
+ user_read_access_end();
*dest = val;
return 0;
Efault:
- user_access_end();
+ user_read_access_end();
return -EFAULT;
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f36c33bd2da4..8e29809de38d 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -501,17 +501,18 @@ int show_interrupts(struct seq_file *p, void *v)
seq_put_decimal_ull_width(p, " ", cnt, 10);
}
+ seq_putc(p, ' ');
raw_spin_lock_irqsave(&desc->lock, flags);
if (desc->irq_data.chip) {
if (desc->irq_data.chip->irq_print_chip)
desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
else if (desc->irq_data.chip->name)
- seq_printf(p, " %8s", desc->irq_data.chip->name);
+ seq_printf(p, "%8s", desc->irq_data.chip->name);
else
- seq_printf(p, " %8s", "-");
+ seq_printf(p, "%8s", "-");
} else {
- seq_printf(p, " %8s", "None");
+ seq_printf(p, "%8s", "None");
}
if (desc->irq_data.domain)
seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ac1365afcc4a..e858de203eb6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1248,10 +1248,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
/* Check whether the waiter should back out immediately */
rtm = container_of(lock, struct rt_mutex, rtmutex);
- preempt_disable();
res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
- wake_up_q(wake_q);
- preempt_enable();
if (res) {
raw_spin_lock(&task->pi_lock);
rt_mutex_dequeue(lock, waiter);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 95e40895a519..c6d8232ad9ee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1283,9 +1283,9 @@ static void nohz_csd_func(void *info)
WARN_ON(!(flags & NOHZ_KICK_MASK));
rq->idle_balance = idle_cpu(cpu);
- if (rq->idle_balance && !need_resched()) {
+ if (rq->idle_balance) {
rq->nohz_idle_balance = flags;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
+ __raise_softirq_irqoff(SCHED_SOFTIRQ);
}
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d9d5a702f1a6..db47f33cb7d2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -781,7 +781,7 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
* If it is a deferred reservation, and the server
* is not handling an starvation case, defer it.
*/
- if (dl_se->dl_defer & !dl_se->dl_defer_running) {
+ if (dl_se->dl_defer && !dl_se->dl_defer_running) {
dl_se->dl_throttled = 1;
dl_se->dl_defer_armed = 1;
}
@@ -2042,6 +2042,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) &&
+ !is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
setup_new_dl_entity(dl_se);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fbdca89c677f..aa0238ee4857 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3399,10 +3399,16 @@ retry_pids:
/* Initialise new per-VMA NUMAB state. */
if (!vma->numab_state) {
- vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
- GFP_KERNEL);
- if (!vma->numab_state)
+ struct vma_numab_state *ptr;
+
+ ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ continue;
+
+ if (cmpxchg(&vma->numab_state, NULL, ptr)) {
+ kfree(ptr);
continue;
+ }
vma->numab_state->start_scan_seq = mm->numa_scan_seq;
@@ -12568,7 +12574,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* work being done for other CPUs. Next load
* balancing owner will pick it up.
*/
- if (need_resched()) {
+ if (!idle_cpu(this_cpu) && need_resched()) {
if (flags & NOHZ_STATS_KICK)
has_blocked_load = true;
if (flags & NOHZ_NEXT_KICK)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 0d71fcbaf1e3..ff0e5ab4e37c 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1200,7 +1200,7 @@ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
bool empty = !cpumask_and(new_mask, new_mask,
ctx->user_mask);
- if (WARN_ON_ONCE(empty))
+ if (empty)
cpumask_copy(new_mask, cpus_allowed);
}
__set_cpus_allowed_ptr(p, ctx);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b41bd13cc3d..4dae6ac2e83f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -280,17 +280,24 @@ static inline void invoke_softirq(void)
wakeup_softirqd();
}
+#define SCHED_SOFTIRQ_MASK BIT(SCHED_SOFTIRQ)
+
/*
* flush_smp_call_function_queue() can raise a soft interrupt in a function
- * call. On RT kernels this is undesired and the only known functionality
- * in the block layer which does this is disabled on RT. If soft interrupts
- * get raised which haven't been raised before the flush, warn so it can be
+ * call. On RT kernels this is undesired and the only known functionalities
+ * are in the block layer which is disabled on RT, and in the scheduler for
+ * idle load balancing. If soft interrupts get raised which haven't been
+ * raised before the flush, warn if it is not a SCHED_SOFTIRQ so it can be
* investigated.
*/
void do_softirq_post_smp_call_flush(unsigned int was_pending)
{
- if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
+ unsigned int is_pending = local_softirq_pending();
+
+ if (unlikely(was_pending != is_pending)) {
+ WARN_ON_ONCE(was_pending != (is_pending & ~SCHED_SOFTIRQ_MASK));
invoke_softirq();
+ }
}
#else /* CONFIG_PREEMPT_RT */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index aab6472853fa..7304d7cf47f2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -24,7 +24,7 @@ static void clocksource_enqueue(struct clocksource *cs);
static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end)
{
- u64 delta = clocksource_delta(end, start, cs->mask);
+ u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta);
if (likely(delta < cs->max_cycles))
return clocksource_cyc2ns(delta, cs->mult, cs->shift);
@@ -993,6 +993,15 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs)
cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
cs->maxadj, cs->mask,
&cs->max_cycles);
+
+ /*
+ * Threshold for detecting negative motion in clocksource_delta().
+ *
+ * Allow for 0.875 of the counter width so that overly long idle
+ * sleeps, which go slightly over mask/2, do not trigger the
+ * negative motion detection.
+ */
+ cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3);
}
static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0ca85ff4fbb4..3d128825d343 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -755,7 +755,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
u64 cycle_now, delta;
cycle_now = tk_clock_read(&tk->tkr_mono);
- delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+ tk->tkr_mono.clock->max_raw_delta);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -2230,7 +2231,8 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
return false;
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
- tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+ tk->tkr_mono.clock->max_raw_delta);
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 63e600e943a7..8c9079108ffb 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -30,15 +30,15 @@ static inline void timekeeping_inc_mg_floor_swaps(void)
#endif
-static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
+static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
{
u64 ret = (now - last) & mask;
/*
- * Prevent time going backwards by checking the MSB of mask in
- * the result. If set, return 0.
+ * Prevent time going backwards by checking the result against
+ * @max_delta. If greater, return 0.
*/
- return ret & ~(mask >> 1) ? 0 : ret;
+ return ret > max_delta ? 0 : ret;
}
/* Semi public for serialization of non timekeeper VDSO updates. */
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index ebda68ee9abf..be8be0c1aaf0 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -963,6 +963,11 @@ static int __trace_eprobe_create(int argc, const char *argv[])
goto error;
}
ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ if (ret < 0) {
+ trace_probe_unregister_event_call(&ep->tp);
+ mutex_unlock(&event_mutex);
+ goto error;
+ }
mutex_unlock(&event_mutex);
return ret;
parse_error: