diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2024-12-05 11:48:58 -0800 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2024-12-12 14:19:05 -0800 |
| commit | 5098462fbac60cbec76171a8b4998a36b85891a1 (patch) | |
| tree | e9d6b62251ba4a670216f12390cc1ccad8148569 /kernel | |
| parent | Merge branch 'net-smc-two-features-for-smc-r' (diff) | |
| parent | Merge tag 'net-6.13-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/net... (diff) | |
| download | linux-5098462fbac60cbec76171a8b4998a36b85891a1.tar.gz linux-5098462fbac60cbec76171a8b4998a36b85891a1.zip | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.13-rc3).
No conflicts or adjacent changes.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/auditsc.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/bpf_lsm.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/devmap.c | 6 | ||||
| -rw-r--r-- | kernel/bpf/lpm_trie.c | 133 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 27 | ||||
| -rw-r--r-- | kernel/futex/futex.h | 4 | ||||
| -rw-r--r-- | kernel/irq/proc.c | 7 | ||||
| -rw-r--r-- | kernel/locking/rtmutex.c | 3 | ||||
| -rw-r--r-- | kernel/sched/core.c | 4 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 3 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 14 | ||||
| -rw-r--r-- | kernel/sched/syscalls.c | 2 | ||||
| -rw-r--r-- | kernel/softirq.c | 15 | ||||
| -rw-r--r-- | kernel/time/clocksource.c | 11 | ||||
| -rw-r--r-- | kernel/time/timekeeping.c | 6 | ||||
| -rw-r--r-- | kernel/time/timekeeping_internal.h | 8 | ||||
| -rw-r--r-- | kernel/trace/trace_eprobe.c | 5 |
17 files changed, 163 insertions, 91 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 279ba5c420a4..561d96affe9f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2728,8 +2728,8 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); - security_task_getlsmprop_obj(t, &context->target_ref); strscpy(context->target_comm, t->comm); + security_task_getlsmprop_obj(t, &context->target_ref); } /** @@ -2755,8 +2755,8 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); - security_task_getlsmprop_obj(t, &ctx->target_ref); strscpy(ctx->target_comm, t->comm); + security_task_getlsmprop_obj(t, &ctx->target_ref); return 0; } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 3bc61628ab25..967492b65185 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -375,8 +375,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_current_getsecid_subj) -BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index effde52bc857..482d284a1553 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -184,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i; + u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -821,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; @@ -838,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9b60eda0f727..f8bc1e096182 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,6 +15,7 @@ #include <net/ipv6.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf_mem_alloc.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -22,7 +23,6 @@ struct lpm_trie_node; struct lpm_trie_node { - struct rcu_head rcu; struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; @@ -32,10 +32,11 @@ struct lpm_trie_node { struct lpm_trie { struct bpf_map map; struct lpm_trie_node __rcu *root; + struct bpf_mem_alloc ma; size_t n_entries; size_t max_prefixlen; size_t data_size; - spinlock_t lock; + raw_spinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) return found->data + trie->data_size; } -static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, - const void *value) +static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie, + const void *value, + bool disable_migration) { struct lpm_trie_node *node; - size_t size = sizeof(struct lpm_trie_node) + trie->data_size; - if (value) - size += trie->map.value_size; + if (disable_migration) + migrate_disable(); + node = bpf_mem_cache_alloc(&trie->ma); + if (disable_migration) + migrate_enable(); - node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN, - trie->map.numa_node); if (!node) return NULL; @@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, return node; } +static int trie_check_add_elem(struct lpm_trie *trie, u64 flags) +{ + if (flags == BPF_EXIST) + return -ENOENT; + if (trie->n_entries == trie->map.max_entries) + return -ENOSPC; + trie->n_entries++; + return 0; +} + /* Called from syscall or from eBPF program */ static long trie_update_elem(struct bpf_map *map, void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *node, *im_node, *new_node; struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; @@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map, if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - spin_lock_irqsave(&trie->lock, irq_flags); - - /* Allocate and fill a new node */ - - if (trie->n_entries == trie->map.max_entries) { - ret = -ENOSPC; - goto out; - } - - new_node = lpm_trie_node_alloc(trie, value); - if (!new_node) { - ret = -ENOMEM; - goto out; - } + /* Allocate and fill a new node. Need to disable migration before + * invoking bpf_mem_cache_alloc(). + */ + new_node = lpm_trie_node_alloc(trie, value, true); + if (!new_node) + return -ENOMEM; - trie->n_entries++; + raw_spin_lock_irqsave(&trie->lock, irq_flags); new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); @@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map, matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || - node->prefixlen == key->prefixlen || - node->prefixlen == trie->max_prefixlen) + node->prefixlen == key->prefixlen) break; next_bit = extract_bit(key->data, node->prefixlen); @@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map, * simply assign the @new_node to that slot and be done. */ if (!node) { + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + rcu_assign_pointer(*slot, new_node); goto out; } @@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map, * which already has the correct data array set. */ if (node->prefixlen == matchlen) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) { + if (flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out; + } + } else { + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + } + new_node->child[0] = node->child[0]; new_node->child[1] = node->child[1]; - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) - trie->n_entries--; - rcu_assign_pointer(*slot, new_node); free_node = node; goto out; } + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + /* If the new node matches the prefix completely, it must be inserted * as an ancestor. Simply insert it between @node and *@slot. */ @@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map, goto out; } - im_node = lpm_trie_node_alloc(trie, NULL); + /* migration is disabled within the locked scope */ + im_node = lpm_trie_node_alloc(trie, NULL, false); if (!im_node) { + trie->n_entries--; ret = -ENOMEM; goto out; } @@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map, rcu_assign_pointer(*slot, im_node); out: - if (ret) { - if (new_node) - trie->n_entries--; + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree(new_node); - kfree(im_node); - } - - spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree_rcu(free_node, rcu); + migrate_disable(); + if (ret) + bpf_mem_cache_free(&trie->ma, new_node); + bpf_mem_cache_free_rcu(&trie->ma, free_node); + migrate_enable(); return ret; } @@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - spin_lock_irqsave(&trie->lock, irq_flags); + raw_spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) free_node = node; out: - spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree_rcu(free_parent, rcu); - kfree_rcu(free_node, rcu); + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + migrate_disable(); + bpf_mem_cache_free_rcu(&trie->ma, free_parent); + bpf_mem_cache_free_rcu(&trie->ma, free_node); + migrate_enable(); return ret; } @@ -559,6 +580,8 @@ out: static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; + size_t leaf_size; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || @@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; - spin_lock_init(&trie->lock); + raw_spin_lock_init(&trie->lock); + /* Allocate intermediate and leaf nodes from the same allocator */ + leaf_size = sizeof(struct lpm_trie_node) + trie->data_size + + trie->map.value_size; + err = bpf_mem_alloc_init(&trie->ma, leaf_size, false); + if (err) + goto free_out; return &trie->map; + +free_out: + bpf_map_area_free(trie); + return ERR_PTR(err); } static void trie_free(struct bpf_map *map) @@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map) continue; } - kfree(node); + /* No bpf program may access the map, so freeing the + * node without waiting for the extra RCU GP. + */ + bpf_mem_cache_raw_free(node); RCU_INIT_POINTER(*slot, NULL); break; } } out: + bpf_mem_alloc_destroy(&trie->ma); bpf_map_area_free(trie); } @@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) struct lpm_trie_node **node_stack = NULL; int err = 0, stack_ptr = -1; unsigned int next_bit; - size_t matchlen; + size_t matchlen = 0; /* The get_next_key follows postorder. For the 4 node example in * the top of this file, the trie_get_next_key() returns the following @@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference(node->child[next_bit]); } - if (!node || node->prefixlen != key->prefixlen || + if (!node || node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1c4ebb326785..01fbef9576e0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1202,14 +1202,17 @@ static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack) /* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which * case they are equivalent, or it's STACK_ZERO, in which case we preserve * more precise STACK_ZERO. - * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take - * env->allow_ptr_leaks into account and force STACK_MISC, if necessary. + * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged + * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is + * unnecessary as both are considered equivalent when loading data and pruning, + * in case of unprivileged mode it will be incorrect to allow reads of invalid + * slots. */ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) { if (*stype == STACK_ZERO) return; - if (env->allow_ptr_leaks && *stype == STACK_INVALID) + if (*stype == STACK_INVALID) return; *stype = STACK_MISC; } @@ -4700,6 +4703,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, */ if (!env->allow_ptr_leaks && is_spilled_reg(&state->stack[spi]) && + !is_spilled_scalar_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; @@ -8071,7 +8075,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or const struct bpf_dynptr\n", - regno); + regno - 1); return -EINVAL; } @@ -8125,7 +8129,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", - regno); + regno - 1); return -EINVAL; } @@ -8133,7 +8137,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { verbose(env, "Expected a dynptr of type %s as arg #%d\n", - dynptr_type_str(arg_to_dynptr_type(arg_type)), regno); + dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); return -EINVAL; } @@ -8189,6 +8193,11 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id const struct btf_type *t; int spi, err, i, nr_slots, btf_id; + if (reg->type != PTR_TO_STACK) { + verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); + return -EINVAL; + } + /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs() * ensures struct convention, so we wouldn't need to do any BTF * validation here. But given iter state can be passed as a parameter @@ -8197,7 +8206,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id */ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); if (btf_id < 0) { - verbose(env, "expected valid iter pointer as arg #%d\n", regno); + verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); return -EINVAL; } t = btf_type_by_id(meta->btf, btf_id); @@ -8207,7 +8216,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id /* bpf_iter_<type>_new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { verbose(env, "expected uninitialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno); + iter_type_str(meta->btf, btf_id), regno - 1); return -EINVAL; } @@ -8231,7 +8240,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id break; case -EINVAL: verbose(env, "expected an initialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno); + iter_type_str(meta->btf, btf_id), regno - 1); return err; case -EPROTO: verbose(env, "expected an RCU CS when using %s\n", meta->func_name); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 618ce1fe870e..99b32e728c4a 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -265,11 +265,11 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); - user_access_end(); + user_read_access_end(); *dest = val; return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f36c33bd2da4..8e29809de38d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -501,17 +501,18 @@ int show_interrupts(struct seq_file *p, void *v) seq_put_decimal_ull_width(p, " ", cnt, 10); } + seq_putc(p, ' '); raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); else if (desc->irq_data.chip->name) - seq_printf(p, " %8s", desc->irq_data.chip->name); + seq_printf(p, "%8s", desc->irq_data.chip->name); else - seq_printf(p, " %8s", "-"); + seq_printf(p, "%8s", "-"); } else { - seq_printf(p, " %8s", "None"); + seq_printf(p, "%8s", "None"); } if (desc->irq_data.domain) seq_printf(p, " %*lu", prec, desc->irq_data.hwirq); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ac1365afcc4a..e858de203eb6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1248,10 +1248,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); - preempt_disable(); res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); - wake_up_q(wake_q); - preempt_enable(); if (res) { raw_spin_lock(&task->pi_lock); rt_mutex_dequeue(lock, waiter); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95e40895a519..c6d8232ad9ee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1283,9 +1283,9 @@ static void nohz_csd_func(void *info) WARN_ON(!(flags & NOHZ_KICK_MASK)); rq->idle_balance = idle_cpu(cpu); - if (rq->idle_balance && !need_resched()) { + if (rq->idle_balance) { rq->nohz_idle_balance = flags; - raise_softirq_irqoff(SCHED_SOFTIRQ); + __raise_softirq_irqoff(SCHED_SOFTIRQ); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d9d5a702f1a6..db47f33cb7d2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -781,7 +781,7 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, * If it is a deferred reservation, and the server * is not handling an starvation case, defer it. */ - if (dl_se->dl_defer & !dl_se->dl_defer_running) { + if (dl_se->dl_defer && !dl_se->dl_defer_running) { dl_se->dl_throttled = 1; dl_se->dl_defer_armed = 1; } @@ -2042,6 +2042,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && + !is_dl_boosted(dl_se) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fbdca89c677f..aa0238ee4857 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3399,10 +3399,16 @@ retry_pids: /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { - vma->numab_state = kzalloc(sizeof(struct vma_numab_state), - GFP_KERNEL); - if (!vma->numab_state) + struct vma_numab_state *ptr; + + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + if (!ptr) + continue; + + if (cmpxchg(&vma->numab_state, NULL, ptr)) { + kfree(ptr); continue; + } vma->numab_state->start_scan_seq = mm->numa_scan_seq; @@ -12568,7 +12574,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * work being done for other CPUs. Next load * balancing owner will pick it up. */ - if (need_resched()) { + if (!idle_cpu(this_cpu) && need_resched()) { if (flags & NOHZ_STATS_KICK) has_blocked_load = true; if (flags & NOHZ_NEXT_KICK) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 0d71fcbaf1e3..ff0e5ab4e37c 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -1200,7 +1200,7 @@ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) bool empty = !cpumask_and(new_mask, new_mask, ctx->user_mask); - if (WARN_ON_ONCE(empty)) + if (empty) cpumask_copy(new_mask, cpus_allowed); } __set_cpus_allowed_ptr(p, ctx); diff --git a/kernel/softirq.c b/kernel/softirq.c index 8b41bd13cc3d..4dae6ac2e83f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -280,17 +280,24 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +#define SCHED_SOFTIRQ_MASK BIT(SCHED_SOFTIRQ) + /* * flush_smp_call_function_queue() can raise a soft interrupt in a function - * call. On RT kernels this is undesired and the only known functionality - * in the block layer which does this is disabled on RT. If soft interrupts - * get raised which haven't been raised before the flush, warn so it can be + * call. On RT kernels this is undesired and the only known functionalities + * are in the block layer which is disabled on RT, and in the scheduler for + * idle load balancing. If soft interrupts get raised which haven't been + * raised before the flush, warn if it is not a SCHED_SOFTIRQ so it can be * investigated. */ void do_softirq_post_smp_call_flush(unsigned int was_pending) { - if (WARN_ON_ONCE(was_pending != local_softirq_pending())) + unsigned int is_pending = local_softirq_pending(); + + if (unlikely(was_pending != is_pending)) { + WARN_ON_ONCE(was_pending != (is_pending & ~SCHED_SOFTIRQ_MASK)); invoke_softirq(); + } } #else /* CONFIG_PREEMPT_RT */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index aab6472853fa..7304d7cf47f2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -24,7 +24,7 @@ static void clocksource_enqueue(struct clocksource *cs); static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) { - u64 delta = clocksource_delta(end, start, cs->mask); + u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta); if (likely(delta < cs->max_cycles)) return clocksource_cyc2ns(delta, cs->mult, cs->shift); @@ -993,6 +993,15 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, cs->mask, &cs->max_cycles); + + /* + * Threshold for detecting negative motion in clocksource_delta(). + * + * Allow for 0.875 of the counter width so that overly long idle + * sleeps, which go slightly over mask/2, do not trigger the + * negative motion detection. + */ + cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3); } static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0ca85ff4fbb4..3d128825d343 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -755,7 +755,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -2230,7 +2231,8 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), - tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 63e600e943a7..8c9079108ffb 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -30,15 +30,15 @@ static inline void timekeeping_inc_mg_floor_swaps(void) #endif -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) { u64 ret = (now - last) & mask; /* - * Prevent time going backwards by checking the MSB of mask in - * the result. If set, return 0. + * Prevent time going backwards by checking the result against + * @max_delta. If greater, return 0. */ - return ret & ~(mask >> 1) ? 0 : ret; + return ret > max_delta ? 0 : ret; } /* Semi public for serialization of non timekeeper VDSO updates. */ diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index ebda68ee9abf..be8be0c1aaf0 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -963,6 +963,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + if (ret < 0) { + trace_probe_unregister_event_call(&ep->tp); + mutex_unlock(&event_mutex); + goto error; + } mutex_unlock(&event_mutex); return ret; parse_error: |
