diff options
| author | Thomas Gleixner <tglx@linutronix.de> | 2018-09-15 20:50:42 +0200 |
|---|---|---|
| committer | Thomas Gleixner <tglx@linutronix.de> | 2018-09-15 20:50:42 +0200 |
| commit | 9ac669fc01dbfef707ecaa6b618c0d03294cca16 (patch) | |
| tree | 3207995766ab5664c60026daae5da268806a3262 /kernel | |
| parent | clockevents: Warn if cpu_all_mask is used as cpumask (diff) | |
| parent | RISC-V: Request newstat syscalls (diff) | |
| download | linux-9ac669fc01dbfef707ecaa6b618c0d03294cca16.tar.gz linux-9ac669fc01dbfef707ecaa6b618c0d03294cca16.zip | |
Merge tag 'y2038' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/playground into timers/core
Pull more y2038 work from Arnd Bergman:
y2038: convert more syscalls
Here is another set of system call changes to prepare the change over to
64-bit time_t. As before, the strategy is to change system calls that
take a 'struct timespec' argument over to 'struct __kernel_timespec',
which for now is defined to be the same but will get redefined to use a
64-bit time_t argument once we are ready to modify the system call tables.
The major change from previous patches is that the plan is no longer
to directly use the 'compat' system calls for providing compatibility
with the existing 32-bit time_t based entry points. Instead, we rename
the compat code to something that makes more sense on 32-bit architectures,
e.g. compat_timespec becomes old_timespec32.
With the renamed types in place, change over the 'stat' and 'utimes'
families of system calls, sched_rr_get_interval, recvmmsg and
rt_sigtimedwait. Another series for poll, select and io_pgetevents is
currently being tested.
Diffstat (limited to 'kernel')
168 files changed, 6693 insertions, 4104 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 3f9c97419f02..cd1655122ec0 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -18,6 +18,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_NO_PREEMPT help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -35,6 +36,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + depends on !ARCH_NO_PREEMPT select PREEMPT_COUNT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help diff --git a/kernel/Makefile b/kernel/Makefile index 04bc07c2b42a..7a63d567fdb5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -123,7 +123,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/kernel/audit.c b/kernel/audit.c index e7478cb58079..2a8058764aa6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -83,9 +83,6 @@ #define AUDIT_INITIALIZED 1 static int audit_initialized; -#define AUDIT_OFF 0 -#define AUDIT_ON 1 -#define AUDIT_LOCKED 2 u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; @@ -1724,7 +1721,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - *t = current_kernel_time64(); + ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } @@ -1754,7 +1751,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (audit_initialized != AUDIT_INITIALIZED) return NULL; - if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) + if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c99ebaae5abc..ea43181cde4a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -168,7 +168,8 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); /* Function to return search key in our hash from inode. */ static unsigned long inode_to_key(const struct inode *inode) { - return (unsigned long)inode; + /* Use address pointed to by connector->obj as the key */ + return (unsigned long)&inode->i_fsnotify_marks; } /* @@ -183,7 +184,7 @@ static unsigned long chunk_to_key(struct audit_chunk *chunk) */ if (WARN_ON_ONCE(!chunk->mark.connector)) return 0; - return (unsigned long)chunk->mark.connector->inode; + return (unsigned long)chunk->mark.connector->obj; } static inline struct list_head *chunk_hash(unsigned long key) @@ -258,7 +259,7 @@ static void untag_chunk(struct node *p) spin_lock(&entry->lock); /* * mark_mutex protects mark from getting detached and thus also from - * mark->connector->inode getting NULL. + * mark->connector->obj getting NULL. */ if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&entry->lock); @@ -288,8 +289,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if (fsnotify_add_inode_mark_locked(&new->mark, entry->connector->inode, - 1)) { + if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj, + FSNOTIFY_OBJ_TYPE_INODE, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -423,7 +424,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_lock(&old_entry->lock); /* * mark_mutex protects mark from getting detached and thus also from - * mark->connector->inode getting NULL. + * mark->connector->obj getting NULL. */ if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { /* old_entry is being shot, lets just lie */ @@ -434,8 +435,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return -ENOENT; } - if (fsnotify_add_inode_mark_locked(chunk_entry, - old_entry->connector->inode, 1)) { + if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj, + FSNOTIFY_OBJ_TYPE_INODE, 1)) { spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); @@ -497,6 +498,8 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) { struct audit_buffer *ab; + if (!audit_enabled) + return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index c17c0c268436..787c7afdf829 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -238,20 +238,21 @@ out: static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) { - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); - if (unlikely(!ab)) - return; - audit_log_format(ab, "auid=%u ses=%u op=%s", - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current), op); - audit_log_format(ab, " path="); - audit_log_untrustedstring(ab, w->path); - audit_log_key(ab, r->filterkey); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } + struct audit_buffer *ab; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (!ab) + return; + audit_log_format(ab, "auid=%u ses=%u op=%s", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current), op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + audit_log_key(ab, r->filterkey); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); } /* Update inode info in audit rules based on filesystem event. */ @@ -419,6 +420,13 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) struct path parent_path; int h, ret = 0; + /* + * When we will be calling audit_add_to_parent, krule->watch might have + * been updated and watch might have been freed. + * So we need to keep a reference of watch. + */ + audit_get_watch(watch); + mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. */ @@ -427,8 +435,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret) + if (ret) { + audit_put_watch(watch); return ret; + } /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); @@ -446,6 +456,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) *list = &audit_inode_hash[h]; error: path_put(&parent_path); + audit_put_watch(watch); return ret; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eaa320148d97..bf309f2592c4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -264,7 +264,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * case AUDIT_FILTER_TASK: #endif case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: ; } @@ -337,7 +337,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { switch(f->type) { case AUDIT_MSGTYPE: - if (entry->rule.listnr != AUDIT_FILTER_TYPE && + if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; @@ -428,8 +428,6 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_EXE: if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; - if (entry->rule.listnr != AUDIT_FILTER_EXIT) - return -EINVAL; break; } return 0; @@ -931,7 +929,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* If any of these, don't count towards total */ switch(entry->rule.listnr) { case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } @@ -1013,7 +1011,7 @@ int audit_del_rule(struct audit_entry *entry) /* If any of these, don't count towards total */ switch(entry->rule.listnr) { case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } @@ -1360,6 +1358,11 @@ int audit_filter(int msgtype, unsigned int listtype) f->type, f->op, f->lsm_rule, NULL); } break; + case AUDIT_EXE: + result = audit_exe_compare(current, e->rule.exe); + if (f->op == Audit_not_equal) + result = !result; + break; default: goto unlock_and_return; } @@ -1369,7 +1372,7 @@ int audit_filter(int msgtype, unsigned int listtype) break; } if (result > 0) { - if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) + if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE) ret = 0; break; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ceb1c4596c51..b2d1f043f17f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -494,20 +494,20 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_gid_comparator(cred->gid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) - result = in_group_p(f->gid); + result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) - result = !in_group_p(f->gid); + result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) - result = in_egroup_p(f->gid); + result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) - result = !in_egroup_p(f->gid); + result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_SGID: @@ -1279,8 +1279,12 @@ static void show_special(struct audit_context *context, int *call_panic) break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); - audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); + if (context->module.name) { + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + } else + audit_log_format(ab, "(null)"); + break; } audit_log_end(ab); @@ -1540,10 +1544,10 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->argv[2] = a3; context->argv[3] = a4; context->serial = 0; - context->ctime = current_kernel_time64(); context->in_syscall = 1; context->current_state = state; context->ppid = 0; + ktime_get_coarse_real_ts64(&context->ctime); } /** @@ -2411,8 +2415,9 @@ void __audit_log_kern_module(char *name) { struct audit_context *context = audit_context(); - context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); - strcpy(context->module.name, name); + context->module.name = kstrdup(name, GFP_KERNEL); + if (!context->module.name) + audit_log_lost("out of memory in __audit_log_kern_module"); context->type = AUDIT_KERN_MODULE; } @@ -2461,7 +2466,7 @@ void audit_core_dumps(long signr) if (signr == SIGQUIT) /* don't care for those */ return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND); if (unlikely(!ab)) return; audit_log_task(ab); @@ -2485,7 +2490,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP); if (unlikely(!ab)) return; audit_log_task(ab); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f27f5496d6fe..0488b8258321 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,6 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) @@ -22,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 544e58f5f642..0c17aab3ce5f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); @@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int array_map_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) { - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; u32 int_data; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. + /* bpf array can only take a u32 key. This check makes sure + * that the btf matches the attr used during map_create. */ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size > map->value_size) + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; @@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_check_btf = array_map_check_btf, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, + .map_check_btf = map_check_no_btf, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, + .map_check_btf = map_check_no_btf, }; #ifdef CONFIG_CGROUPS @@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, + .map_check_btf = map_check_no_btf, }; #endif @@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2d49d18b793a..2590700237c1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) */ static bool btf_type_int_is_regular(const struct btf_type *t) { - u16 nr_bits, nr_bytes; + u8 nr_bits, nr_bytes; u32 int_data; int_data = btf_type_int(t); @@ -991,38 +991,38 @@ static void btf_int_bits_seq_show(const struct btf *btf, void *data, u8 bits_offset, struct seq_file *m) { + u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); - u16 nr_bits = BTF_INT_BITS(int_data); - u16 total_bits_offset; - u16 nr_copy_bytes; - u16 nr_copy_bits; - u8 nr_upper_bits; - union { - u64 u64_num; - u8 u8_nums[8]; - } print_num; + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + u8 nr_copy_bytes; + u8 nr_copy_bits; + u64 print_num; + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. + */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num.u64_num = 0; - memcpy(&print_num.u64_num, data, nr_copy_bytes); - - /* Ditch the higher order bits */ - nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); - if (nr_upper_bits) { - /* We need to mask out some bits of the upper byte. */ - u8 mask = (1 << nr_upper_bits) - 1; + print_num = 0; + memcpy(&print_num, data, nr_copy_bytes); - print_num.u8_nums[nr_copy_bytes - 1] &= mask; - } +#ifdef __BIG_ENDIAN_BITFIELD + left_shift_bits = bits_offset; +#else + left_shift_bits = BITS_PER_U64 - nr_copy_bits; +#endif + right_shift_bits = BITS_PER_U64 - nr_bits; - print_num.u64_num >>= bits_offset; + print_num <<= left_shift_bits; + print_num >>= right_shift_bits; - seq_printf(m, "0x%llx", print_num.u64_num); + seq_printf(m, "0x%llx", print_num); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -1032,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; - u32 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bits = BTF_INT_BITS(int_data); if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { @@ -1519,9 +1519,9 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; + u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; - u32 meta_needed; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1534,6 +1534,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); + last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, @@ -1555,6 +1556,16 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * ">" instead of ">=" because the last member could be + * "char a[0];" + */ + if (last_offset > member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { btf_verifier_log_member(env, t, member, "Memmber bits_offset exceeds its struct size"); @@ -1562,6 +1573,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } btf_verifier_log_member(env, t, member, NULL); + last_offset = member->offset; } return meta_needed; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f7c00bd6f8e4..6a7d931bbc55 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -95,7 +97,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { - struct bpf_prog_array __rcu *progs; + struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; int cnt = 0; @@ -115,18 +117,20 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - list_for_each_entry(pl, - &p->bpf.progs[type], node) { - if (!pl->prog) - continue; - rcu_dereference_protected(progs, 1)-> - progs[cnt++] = pl->prog; - } - p = cgroup_parent(p); - } while (p); + if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + list_for_each_entry(pl, &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + + progs->items[cnt].prog = pl->prog; + progs->items[cnt].cgroup_storage = pl->storage; + cnt++; + } + } while ((p = cgroup_parent(p))); - *array = progs; + rcu_assign_pointer(*array, progs); return 0; } @@ -173,6 +177,45 @@ cleanup: return -ENOMEM; } +static int update_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type) +{ + struct cgroup_subsys_state *css; + int err; + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return err; +} + #define BPF_CGROUP_MAX_PROGS 64 /** @@ -189,7 +232,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct cgroup_subsys_state *css; + struct bpf_cgroup_storage *storage, *old_storage = NULL; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -211,72 +254,71 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return -ENOMEM; + if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) - if (pl->prog == prog) + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) { /* disallow attaching the same prog twice */ + bpf_cgroup_storage_free(storage); return -EINVAL; + } + } pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } + pl_was_allocated = true; pl->prog = prog; + pl->storage = storage; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } pl_was_allocated = true; list_add_tail(&pl->node, progs); } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; + old_storage = pl->storage; + bpf_cgroup_storage_unlink(old_storage); pl_was_allocated = false; } pl->prog = prog; + pl->storage = storage; } cgrp->bpf.flags[type] = flags; - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); + if (old_storage) + bpf_cgroup_storage_free(old_storage); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_cgroup_storage_link(storage, cgrp, type); return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and cleanup the prog list */ pl->prog = old_prog; + bpf_cgroup_storage_free(pl->storage); + pl->storage = old_storage; + bpf_cgroup_storage_link(old_storage, cgrp, type); if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -299,7 +341,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; int err; @@ -338,25 +379,14 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; } - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; /* now can actually delete it from this cgroup list */ list_del(&pl->node); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ @@ -367,16 +397,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and restore back old_prog */ pl->prog = old_prog; return err; @@ -428,6 +448,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return ret; } +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_query(cgrp, attr, uattr); + + cgroup_put(cgrp); + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic @@ -601,6 +675,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a9e6c04d0f4a..3f5bf1af0826 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_fill_ill_insns(hdr, size); hdr->pages = size / PAGE_SIZE; - hdr->locked = 0; - hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } -static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) -{ -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - int i, err; - - for (i = 0; i < fp->aux->func_cnt; i++) { - err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); - if (err) - return err; - } - - return bpf_prog_check_pages_ro_single(fp); -#endif - return 0; -} - static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -1524,17 +1506,7 @@ finalize: * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - if (*err) - return fp; - - /* Checkpoint: at this point onwards any cBPF -> eBPF or - * native eBPF program is read-only. If we failed to change - * the page attributes (e.g. allocation failure from - * splitting large pages), then reject the whole program - * in order to guarantee not ending up with any W+X pages - * from BPF side in kernel. - */ - *err = bpf_prog_check_pages_ro_locked(fp); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -1566,11 +1538,12 @@ static struct { .null_prog = NULL, }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog *) * (prog_cnt + 1), + sizeof(struct bpf_prog_array_item) * + (prog_cnt + 1), flags); return &empty_prog_array.hdr; @@ -1584,43 +1557,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +int bpf_prog_array_length(struct bpf_prog_array __rcu *array) { - struct bpf_prog **prog; + struct bpf_prog_array_item *item; u32 cnt = 0; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) - if (*prog != &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) cnt++; rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + +static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt) { + struct bpf_prog_array_item *item; int i = 0; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) + item = rcu_dereference_check(array, 1)->items; + for (; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) continue; - prog_ids[i] = (*prog)->aux->id; + prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { - prog++; + item++; break; } } - return !!(*prog); + return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, __u32 __user *prog_ids, u32 cnt) { - struct bpf_prog **prog; unsigned long err = 0; bool nospc; u32 *ids; @@ -1639,8 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, if (!ids) return -ENOMEM; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - nospc = bpf_prog_array_copy_core(prog, ids, cnt); + nospc = bpf_prog_array_copy_core(array, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1651,14 +1625,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, struct bpf_prog *old_prog) { - struct bpf_prog **prog = progs->progs; + struct bpf_prog_array_item *item = array->items; - for (; *prog; prog++) - if (*prog == old_prog) { - WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + for (; item->prog; item++) + if (item->prog == old_prog) { + WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } @@ -1669,7 +1643,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog **existing_prog; + struct bpf_prog_array_item *existing; struct bpf_prog_array *array; bool found_exclude = false; int new_prog_idx = 0; @@ -1678,15 +1652,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, * the new array. */ if (old_array) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) { - if (*existing_prog == exclude_prog) { + existing = old_array->items; + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog) { found_exclude = true; continue; } - if (*existing_prog != &dummy_bpf_prog.prog) + if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; - if (*existing_prog == include_prog) + if (existing->prog == include_prog) return -EEXIST; } } @@ -1712,15 +1686,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, /* Fill in the new prog array */ if (carry_prog_cnt) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) - array->progs[new_prog_idx++] = *existing_prog; + existing = old_array->items; + for (; existing->prog; existing++) + if (existing->prog != exclude_prog && + existing->prog != &dummy_bpf_prog.prog) { + array->items[new_prog_idx++].prog = + existing->prog; + } } if (include_prog) - array->progs[new_prog_idx++] = include_prog; - array->progs[new_prog_idx] = NULL; + array->items[new_prog_idx++].prog = include_prog; + array->items[new_prog_idx].prog = NULL; *new_array = array; return 0; } @@ -1729,7 +1705,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { - struct bpf_prog **prog; u32 cnt = 0; if (array) @@ -1742,8 +1717,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ - prog = rcu_dereference_check(array, 1)->progs; - return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } @@ -1821,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e0918d180f08..24aac0d0f412 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -69,7 +69,7 @@ struct bpf_cpu_map { }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq); + struct xdp_bulk_queue *bq, bool in_napi_ctx); static u64 cpu_map_bitmap_size(const union bpf_attr *attr) { @@ -375,7 +375,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map) * It does __not__ ensure pending flush operations (if any) are * complete. */ + + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush @@ -555,10 +557,11 @@ const struct bpf_map_ops cpu_map_ops = { .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, + .map_check_btf = map_check_no_btf, }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq) + struct xdp_bulk_queue *bq, bool in_napi_ctx) { unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; @@ -578,7 +581,10 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame_rx_napi(xdpf); + if (likely(in_napi_ctx)) + xdp_return_frame_rx_napi(xdpf); + else + xdp_return_frame(xdpf); } processed++; } @@ -598,7 +604,7 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -661,7 +667,7 @@ void __cpu_map_flush(struct bpf_map *map) /* Flush all frames in bulkq to real queue */ bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(rcpu->kthread); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..141710b82a6c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush @@ -217,7 +218,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) } static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags) + struct xdp_bulk_queue *bq, u32 flags, + bool in_napi_ctx) { struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; @@ -254,7 +256,10 @@ error: struct xdp_frame *xdpf = bq->q[i]; /* RX path under NAPI protection, can return frames faster */ - xdp_return_frame_rx_napi(xdpf); + if (likely(in_napi_ctx)) + xdp_return_frame_rx_napi(xdpf); + else + xdp_return_frame(xdpf); drops++; } goto out; @@ -286,7 +291,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); } } @@ -316,7 +321,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0); + bq_xmit_all(obj, bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -334,10 +339,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +360,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; @@ -380,7 +390,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) __clear_bit(dev->bit, bitmap); bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } } } @@ -479,6 +489,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ca2198a6d22..04b8eda94e7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -11,9 +11,11 @@ * General Public License for more details. */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> +#include <uapi/linux/btf.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -747,13 +749,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, * old element will be freed immediately. * Otherwise return an error */ - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + l_new = ERR_PTR(-E2BIG); + goto dec_count; } l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); - if (!l_new) - return ERR_PTR(-ENOMEM); + if (!l_new) { + l_new = ERR_PTR(-ENOMEM); + goto dec_count; + } } memcpy(l_new->key, key, key_size); @@ -766,7 +770,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); - return ERR_PTR(-ENOMEM); + l_new = ERR_PTR(-ENOMEM); + goto dec_count; } } @@ -780,6 +785,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; +dec_count: + atomic_dec(&htab->count); + return l_new; } static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, @@ -1156,6 +1164,27 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } +static void htab_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = htab_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1165,6 +1194,7 @@ const struct bpf_map_ops htab_map_ops = { .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1176,6 +1206,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, }; /* Called from eBPF program */ @@ -1402,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 73065e2d23c2..1991466b8327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* map and flags arguments are not used now, + * but provide an ability to extend the API + * for other types of local storages. + * verifier checks that their values are correct. + */ + return (unsigned long) this_cpu_read(bpf_cgroup_storage); +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; #endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 76efe9a183f5..2ada5e21dfa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,19 +196,21 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; + void *prev_key; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) - goto done; + prev_key = NULL; + else + prev_key = key; - if (map->ops->map_get_next_key(map, key, key)) { + if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; return NULL; } -done: ++(*pos); return key; } @@ -332,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); + bpf_map_support_seq_show(map) ? + &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c new file mode 100644 index 000000000000..22ad967d1e5f --- /dev/null +++ b/kernel/bpf/local_storage.c @@ -0,0 +1,379 @@ +//SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf-cgroup.h> +#include <linux/bpf.h> +#include <linux/bug.h> +#include <linux/filter.h> +#include <linux/mm.h> +#include <linux/rbtree.h> +#include <linux/slab.h> + +DEFINE_PER_CPU(void*, bpf_cgroup_storage); + +#ifdef CONFIG_CGROUP_BPF + +#define LOCAL_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +struct bpf_cgroup_storage_map { + struct bpf_map map; + + spinlock_t lock; + struct bpf_prog *prog; + struct rb_root root; + struct list_head list; +}; + +static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) +{ + return container_of(map, struct bpf_cgroup_storage_map, map); +} + +static int bpf_cgroup_storage_key_cmp( + const struct bpf_cgroup_storage_key *key1, + const struct bpf_cgroup_storage_key *key2) +{ + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + return 0; +} + +static struct bpf_cgroup_storage *cgroup_storage_lookup( + struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, + bool locked) +{ + struct rb_root *root = &map->root; + struct rb_node *node; + + if (!locked) + spin_lock_bh(&map->lock); + + node = root->rb_node; + while (node) { + struct bpf_cgroup_storage *storage; + + storage = container_of(node, struct bpf_cgroup_storage, node); + + switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + case -1: + node = node->rb_left; + break; + case 1: + node = node->rb_right; + break; + default: + if (!locked) + spin_unlock_bh(&map->lock); + return storage; + } + } + + if (!locked) + spin_unlock_bh(&map->lock); + + return NULL; +} + +static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, + struct bpf_cgroup_storage *storage) +{ + struct rb_root *root = &map->root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct bpf_cgroup_storage *this; + + this = container_of(*new, struct bpf_cgroup_storage, node); + + parent = *new; + switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + case -1: + new = &((*new)->rb_left); + break; + case 1: + new = &((*new)->rb_right); + break; + default: + return -EEXIST; + } + } + + rb_link_node(&storage->node, parent, new); + rb_insert_color(&storage->node, root); + + return 0; +} + +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + + storage = cgroup_storage_lookup(map, key, false); + if (!storage) + return NULL; + + return &READ_ONCE(storage->buf)->data[0]; +} + +static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, + void *value, u64 flags) +{ + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + struct bpf_storage_buffer *new; + + if (flags & BPF_NOEXIST) + return -EINVAL; + + storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, + key, false); + if (!storage) + return -ENOENT; + + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!new) + return -ENOMEM; + + memcpy(&new->data[0], value, map->value_size); + + new = xchg(&storage->buf, new); + kfree_rcu(new, rcu); + + return 0; +} + +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, + void *_next_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage_key *next = _next_key; + struct bpf_cgroup_storage *storage; + + spin_lock_bh(&map->lock); + + if (list_empty(&map->list)) + goto enoent; + + if (key) { + storage = cgroup_storage_lookup(map, key, true); + if (!storage) + goto enoent; + + storage = list_next_entry(storage, list); + if (!storage) + goto enoent; + } else { + storage = list_first_entry(&map->list, + struct bpf_cgroup_storage, list); + } + + spin_unlock_bh(&map->lock); + next->attach_type = storage->key.attach_type; + next->cgroup_inode_id = storage->key.cgroup_inode_id; + return 0; + +enoent: + spin_unlock_bh(&map->lock); + return -ENOENT; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_cgroup_storage_map *map; + + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + return ERR_PTR(-EINVAL); + + if (attr->value_size > PAGE_SIZE) + return ERR_PTR(-E2BIG); + + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + + if (attr->max_entries) + /* max_entries is not used and enforced to be 0 */ + return ERR_PTR(-EINVAL); + + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), + __GFP_ZERO | GFP_USER, numa_node); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), + PAGE_SIZE) >> PAGE_SHIFT; + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&map->map, attr); + + spin_lock_init(&map->lock); + map->root = RB_ROOT; + INIT_LIST_HEAD(&map->list); + + return &map->map; +} + +static void cgroup_storage_map_free(struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + WARN_ON(!RB_EMPTY_ROOT(&map->root)); + WARN_ON(!list_empty(&map->list)); + + kfree(map); +} + +static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +const struct bpf_map_ops cgroup_storage_map_ops = { + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = cgroup_storage_get_next_key, + .map_lookup_elem = cgroup_storage_lookup_elem, + .map_update_elem = cgroup_storage_update_elem, + .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = map_check_no_btf, +}; + +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + int ret = -EBUSY; + + spin_lock_bh(&map->lock); + + if (map->prog && map->prog != prog) + goto unlock; + if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + goto unlock; + + map->prog = prog; + prog->aux->cgroup_storage = _map; + ret = 0; +unlock: + spin_unlock_bh(&map->lock); + + return ret; +} + +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + spin_lock_bh(&map->lock); + if (map->prog == prog) { + WARN_ON(prog->aux->cgroup_storage != _map); + map->prog = NULL; + prog->aux->cgroup_storage = NULL; + } + spin_unlock_bh(&map->lock); +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +{ + struct bpf_cgroup_storage *storage; + struct bpf_map *map; + u32 pages; + + map = prog->aux->cgroup_storage; + if (!map) + return NULL; + + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + if (bpf_map_charge_memlock(map, pages)) + return ERR_PTR(-EPERM); + + storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), + __GFP_ZERO | GFP_USER, map->numa_node); + if (!storage) { + bpf_map_uncharge_memlock(map, pages); + return ERR_PTR(-ENOMEM); + } + + storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!storage->buf) { + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); + } + + storage->map = (struct bpf_cgroup_storage_map *)map; + + return storage; +} + +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) +{ + u32 pages; + struct bpf_map *map; + + if (!storage) + return; + + map = &storage->map->map; + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_uncharge_memlock(map, pages); + + kfree_rcu(storage->buf, rcu); + kfree_rcu(storage, rcu); +} + +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type) +{ + struct bpf_cgroup_storage_map *map; + + if (!storage) + return; + + storage->key.attach_type = type; + storage->key.cgroup_inode_id = cgroup->kn->id.id; + + map = storage->map; + + spin_lock_bh(&map->lock); + WARN_ON(cgroup_storage_insert(map, storage)); + list_add(&storage->list, &map->list); + spin_unlock_bh(&map->lock); +} + +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) +{ + struct bpf_cgroup_storage_map *map; + struct rb_root *root; + + if (!storage) + return; + + map = storage->map; + + spin_lock_bh(&map->lock); + root = &map->root; + rb_erase(&storage->node, root); + + list_del(&storage->list); + spin_unlock_bh(&map->lock); +} + +#endif diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1603492c9cc7..9058317ba9de 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -10,11 +10,13 @@ */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <net/ipv6.h> +#include <uapi/linux/btf.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -686,6 +688,15 @@ free_stack: return err; } +static int trie_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Keys must have struct bpf_lpm_trie_key embedded. */ + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? + -EINVAL : 0; +} + const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1da574612bea..3bfbf4464416 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -23,7 +23,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * is a runtime binding. Doing static check alone * in the verifier is not enough. */ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac747d5cf7c6..177a52436394 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -18,19 +18,43 @@ #include <linux/bug.h> #include <linux/kdev_t.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/proc_ns.h> +#include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> -/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members +/* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); -static LIST_HEAD(bpf_prog_offload_devs); -static LIST_HEAD(bpf_map_offload_devs); + +struct bpf_offload_dev { + struct list_head netdevs; +}; + +struct bpf_offload_netdev { + struct rhash_head l; + struct net_device *netdev; + struct bpf_offload_dev *offdev; + struct list_head progs; + struct list_head maps; + struct list_head offdev_netdevs; +}; + +static const struct rhashtable_params offdevs_params = { + .nelem_hint = 4, + .key_len = sizeof(struct net_device *), + .key_offset = offsetof(struct bpf_offload_netdev, netdev), + .head_offset = offsetof(struct bpf_offload_netdev, l), + .automatic_shrinking = true, +}; + +static struct rhashtable offdevs; +static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -41,8 +65,19 @@ static int bpf_dev_offload_check(struct net_device *netdev) return 0; } +static struct bpf_offload_netdev * +bpf_offload_find_netdev(struct net_device *netdev) +{ + lockdep_assert_held(&bpf_devs_lock); + + if (!offdevs_inited) + return NULL; + return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { + struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; @@ -66,12 +101,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) { + ondev = bpf_offload_find_netdev(offload->netdev); + if (!ondev) { err = -EINVAL; goto err_unlock; } prog->aux->offload = offload; - list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); up_write(&bpf_devs_lock); @@ -294,6 +330,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; + struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; @@ -316,11 +353,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (err) goto err_unlock; + ondev = bpf_offload_find_netdev(offmap->netdev); + if (!ondev) { + err = -EINVAL; + goto err_unlock; + } + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; - list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); @@ -468,77 +511,159 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +static bool __bpf_offload_dev_match(struct bpf_prog *prog, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap; + struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; - bool ret; if (!bpf_prog_is_dev_bound(prog->aux)) return false; - if (!bpf_map_is_dev_bound(map)) - return bpf_map_offload_neutral(map); - down_read(&bpf_devs_lock); offload = prog->aux->offload; - offmap = map_to_offmap(map); + if (!offload) + return false; + if (offload->netdev == netdev) + return true; - ret = offload && offload->netdev == offmap->netdev; + ondev1 = bpf_offload_find_netdev(offload->netdev); + ondev2 = bpf_offload_find_netdev(netdev); + + return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) +{ + bool ret; + + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, netdev); up_read(&bpf_devs_lock); return ret; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_match); -static void bpf_offload_orphan_all_progs(struct net_device *netdev) +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { - struct bpf_prog_offload *offload, *tmp; + struct bpf_offloaded_map *offmap; + bool ret; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); + offmap = map_to_offmap(map); + + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, offmap->netdev); + up_read(&bpf_devs_lock); + + return ret; } -static void bpf_offload_orphan_all_maps(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap, *tmp; + struct bpf_offload_netdev *ondev; + int err; - list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) - if (offmap->netdev == netdev) - __bpf_map_offload_destroy(offmap); + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; + + ondev->netdev = netdev; + ondev->offdev = offdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + down_write(&bpf_devs_lock); + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_unlock_free; + } + + list_add(&ondev->offdev_netdevs, &offdev->netdevs); + up_write(&bpf_devs_lock); + return 0; + +err_unlock_free: + up_write(&bpf_devs_lock); + kfree(ondev); + return err; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -static int bpf_offload_notification(struct notifier_block *notifier, - ulong event, void *ptr) +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) { - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_offload_netdev *ondev, *altdev; + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; ASSERT_RTNL(); - switch (event) { - case NETDEV_UNREGISTER: - /* ignore namespace changes */ - if (netdev->reg_state != NETREG_UNREGISTERING) - break; - - down_write(&bpf_devs_lock); - bpf_offload_orphan_all_progs(netdev); - bpf_offload_orphan_all_maps(netdev); - up_write(&bpf_devs_lock); - break; - default: - break; + down_write(&bpf_devs_lock); + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + goto unlock; + + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + list_del(&ondev->offdev_netdevs); + + /* Try to move the objects to another netdev of the device */ + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); } - return NOTIFY_OK; -} -static struct notifier_block bpf_offload_notifier = { - .notifier_call = bpf_offload_notification, -}; + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +unlock: + up_write(&bpf_devs_lock); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -static int __init bpf_offload_init(void) +struct bpf_offload_dev *bpf_offload_dev_create(void) { - register_netdevice_notifier(&bpf_offload_notifier); - return 0; + struct bpf_offload_dev *offdev; + int err; + + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return ERR_PTR(err); + offdevs_inited = true; + } + up_write(&bpf_devs_lock); + + offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); + if (!offdev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&offdev->netdevs); + + return offdev; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_create); -subsys_initcall(bpf_offload_init); +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) +{ + WARN_ON(!list_empty(&offdev->netdevs)); + kfree(offdev); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/sock_diag.h> +#include <net/sock_reuseport.h> + +struct reuseport_array { + struct bpf_map map; + struct sock __rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. + */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggerred by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() access this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock. + */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once reaching here, all sk->sk_user_data is not + * referenceing this "array". "array" can be freed now. + */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. listening in the TCP case or binded + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" in the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock. + */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. + */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 52a91d816c0e..98e621a29e8e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -58,6 +58,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; struct bpf_sock_progs progs; + raw_spinlock_t lock; }; struct bucket { @@ -72,6 +73,7 @@ struct bpf_htab { u32 n_buckets; u32 elem_size; struct bpf_sock_progs progs; + struct rcu_head rcu; }; struct htab_elem { @@ -88,9 +90,9 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; + struct bpf_map *map; struct sock **entry; - struct htab_elem *hash_link; - struct bpf_htab *htab; + struct htab_elem __rcu *hash_link; }; struct smap_psock { @@ -120,6 +122,7 @@ struct smap_psock { struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; + spinlock_t maps_lock; /* Back reference used when sock callback trigger sockmap operations */ struct sock *sock; @@ -140,6 +143,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { @@ -161,7 +165,42 @@ out: return !empty; } -static struct proto tcp_bpf_proto; +enum { + SOCKMAP_IPV4, + SOCKMAP_IPV6, + SOCKMAP_NUM_PROTS, +}; + +enum { + SOCKMAP_BASE, + SOCKMAP_TX, + SOCKMAP_NUM_CONFIGS, +}; + +static struct proto *saved_tcpv6_prot __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], + struct proto *base) +{ + prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].close = bpf_tcp_close; + prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; + prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; + + prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; + prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; + prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; +} + +static void update_sk_prot(struct sock *sk, struct smap_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; + int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; + + sk->sk_prot = &bpf_tcp_prots[family][conf]; +} + static int bpf_tcp_init(struct sock *sk) { struct smap_psock *psock; @@ -181,14 +220,17 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; - if (psock->bpf_tx_msg) { - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; - tcp_bpf_proto.sendpage = bpf_tcp_sendpage; - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ + if (sk->sk_family == AF_INET6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + spin_lock_bh(&tcpv6_prot_lock); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + spin_unlock_bh(&tcpv6_prot_lock); } - - sk->sk_prot = &tcp_bpf_proto; + update_sk_prot(sk, psock); rcu_read_unlock(); return 0; } @@ -219,24 +261,64 @@ out: rcu_read_unlock(); } +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { atomic_dec(&htab->count); kfree_rcu(l, rcu); } +static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, + struct smap_psock *psock) +{ + struct smap_psock_map_entry *e; + + spin_lock_bh(&psock->maps_lock); + e = list_first_entry_or_null(&psock->maps, + struct smap_psock_map_entry, + list); + if (e) + list_del(&e->list); + spin_unlock_bh(&psock->maps_lock); + return e; +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock_map_entry *e, *tmp; + struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; + lock_sock(sk); rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) { rcu_read_unlock(); + release_sock(sk); return sk->sk_prot->close(sk, timeout); } @@ -247,7 +329,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout) */ close_fun = psock->save_close; - write_lock_bh(&sk->sk_callback_lock); if (psock->cork) { free_start_sg(psock->sock, psock->cork); kfree(psock->cork); @@ -260,21 +341,46 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(md); } - list_for_each_entry_safe(e, tmp, &psock->maps, list) { + e = psock_map_pop(sk, psock); + while (e) { if (e->entry) { - osk = cmpxchg(e->entry, sk, NULL); + struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); + + raw_spin_lock_bh(&stab->lock); + osk = *e->entry; if (osk == sk) { - list_del(&e->list); + *e->entry = NULL; smap_release_sock(psock, sk); } + raw_spin_unlock_bh(&stab->lock); } else { - hlist_del_rcu(&e->hash_link->hash_node); - smap_release_sock(psock, e->hash_link->sk); - free_htab_elem(e->htab, e->hash_link); + struct htab_elem *link = rcu_dereference(e->hash_link); + struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + struct bucket *b; + + b = __select_bucket(htab, link->hash); + head = &b->head; + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, + link->hash, link->key, + htab->map.key_size); + /* If another thread deleted this object skip deletion. + * The refcnt on psock may or may not be zero. + */ + if (l) { + hlist_del_rcu(&link->hash_node); + smap_release_sock(psock, link->sk); + free_htab_elem(htab, link); + } + raw_spin_unlock_bh(&b->lock); } + kfree(e); + e = psock_map_pop(sk, psock); } - write_unlock_bh(&sk->sk_callback_lock); rcu_read_unlock(); + release_sock(sk); close_fun(sk, timeout); } @@ -472,7 +578,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) while (sg[i].length) { free += sg[i].length; sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); + if (!md->skb) + put_page(sg_page(&sg[i])); sg[i].length = 0; sg[i].page_link = 0; sg[i].offset = 0; @@ -481,6 +588,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + if (md->skb) + consume_skb(md->skb); return free; } @@ -623,11 +732,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, { bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; - struct scatterlist *sg; int err = 0; - sg = md->sg_data; - rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) @@ -946,12 +1052,12 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); while (msg_data_left(msg)) { - struct sk_msg_buff *m; + struct sk_msg_buff *m = NULL; bool enospc = false; int copy; if (sk->sk_err) { - err = sk->sk_err; + err = -sk->sk_err; goto out_err; } @@ -1014,8 +1120,11 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); - if (err) + if (err) { + if (m && m != psock->cork) + free_start_sg(sk, m); goto out_err; + } } out_err: if (err < 0) @@ -1111,8 +1220,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, static int bpf_tcp_ulp_register(void) { - tcp_bpf_proto = tcp_prot; - tcp_bpf_proto.close = bpf_tcp_close; + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); /* Once BPF TX ULP is registered it is never unregistered. It * will be in the ULP list for the lifetime of the system. Doing * duplicate registers is not a problem. @@ -1135,7 +1243,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1357,7 +1465,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { tcp_cleanup_ulp(sock); + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); call_rcu_sched(&psock->rcu, smap_destroy_psock); @@ -1388,7 +1498,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); @@ -1508,6 +1618,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node) INIT_LIST_HEAD(&psock->maps); INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); + spin_lock_init(&psock->maps_lock); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -1537,6 +1648,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -1564,18 +1676,36 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, - struct sock **entry, - struct htab_elem *hash_link) +static void smap_list_map_remove(struct smap_psock *psock, + struct sock **entry) { struct smap_psock_map_entry *e, *tmp; + spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry || e->hash_link == hash_link) { + if (e->entry == entry) { list_del(&e->list); - break; + kfree(e); } } + spin_unlock_bh(&psock->maps_lock); +} + +static void smap_list_hash_remove(struct smap_psock *psock, + struct htab_elem *hash_link) +{ + struct smap_psock_map_entry *e, *tmp; + + spin_lock_bh(&psock->maps_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + struct htab_elem *c = rcu_dereference(e->hash_link); + + if (c == hash_link) { + list_del(&e->list); + kfree(e); + } + } + spin_unlock_bh(&psock->maps_lock); } static void sock_map_free(struct bpf_map *map) @@ -1593,15 +1723,15 @@ static void sock_map_free(struct bpf_map *map) * and a grace period expire to ensure psock is really safe to remove. */ rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct smap_psock *psock; struct sock *sock; - sock = xchg(&stab->sock_map[i], NULL); + sock = stab->sock_map[i]; if (!sock) continue; - - write_lock_bh(&sock->sk_callback_lock); + stab->sock_map[i] = NULL; psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1609,11 +1739,11 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i], NULL); + smap_list_map_remove(psock, &stab->sock_map[i]); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); } + raw_spin_unlock_bh(&stab->lock); rcu_read_unlock(); sock_map_remove_complete(stab); @@ -1657,21 +1787,23 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - sock = xchg(&stab->sock_map[k], NULL); + raw_spin_lock_bh(&stab->lock); + sock = stab->sock_map[k]; + stab->sock_map[k] = NULL; + raw_spin_unlock_bh(&stab->lock); if (!sock) return -EINVAL; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); if (!psock) - goto out; - - if (psock->bpf_parse) + return 0; + if (psock->bpf_parse) { + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k], NULL); + write_unlock_bh(&sock->sk_callback_lock); + } + smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); -out: - write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -1707,11 +1839,9 @@ out: static int __sock_map_ctx_update_elem(struct bpf_map *map, struct bpf_sock_progs *progs, struct sock *sock, - struct sock **map_link, void *key) { struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; int err = 0; @@ -1752,7 +1882,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, } } - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* 2. Do not allow inheriting programs if psock exists and has @@ -1785,14 +1914,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - if (map_link) { - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_progs; - } - } - /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ @@ -1809,19 +1930,11 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (err) goto out_free; smap_init_progs(psock, verdict, parse); + write_lock_bh(&sock->sk_callback_lock); smap_start_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } - /* 4. Place psock in sockmap for use and stop any programs on - * the old sock assuming its not the same sock we are replacing - * it with. Because we can only have a single set of programs if - * old_sock has a strp we can stop it. - */ - if (map_link) { - e->entry = map_link; - list_add_tail(&e->list, &psock->maps); - } - write_unlock_bh(&sock->sk_callback_lock); return err; out_free: smap_release_sock(psock, sock); @@ -1832,8 +1945,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - write_unlock_bh(&sock->sk_callback_lock); - kfree(e); return err; } @@ -1843,38 +1954,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock; + struct sock *osock, *sock = skops->sk; + struct smap_psock_map_entry *e; + struct smap_psock *psock; u32 i = *(u32 *)key; int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; - sock = skops->sk; - err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], - key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto out; - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); + /* psock guaranteed to be present. */ + psock = smap_psock_sk(sock); + raw_spin_lock_bh(&stab->lock); + osock = stab->sock_map[i]; + if (osock && flags == BPF_NOEXIST) { + err = -EEXIST; + goto out_unlock; + } + if (!osock && flags == BPF_EXIST) { + err = -ENOENT; + goto out_unlock; + } - write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i], NULL); - smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); + e->entry = &stab->sock_map[i]; + e->map = map; + spin_lock_bh(&psock->maps_lock); + list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); + + stab->sock_map[i] = sock; + if (osock) { + psock = smap_psock_sk(osock); + smap_list_map_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, osock); } + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + smap_release_sock(psock, sock); + raw_spin_unlock_bh(&stab->lock); out: + kfree(e); return err; } @@ -1915,6 +2045,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return 0; } +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = sock_map_prog(map, prog, attr->attach_type); + fdput(f); + return err; +} + static void *sock_map_lookup(struct bpf_map *map, void *key) { return NULL; @@ -1944,7 +2092,13 @@ static int sock_map_update_elem(struct bpf_map *map, return -EOPNOTSUPP; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_map_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2043,14 +2197,13 @@ free_htab: return ERR_PTR(err); } -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +static void __bpf_htab_free(struct rcu_head *rcu) { - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} + struct bpf_htab *htab; -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; + htab = container_of(rcu, struct bpf_htab, rcu); + bpf_map_area_free(htab->buckets); + kfree(htab); } static void sock_hash_free(struct bpf_map *map) @@ -2069,16 +2222,18 @@ static void sock_hash_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); + struct bucket *b = __select_bucket(htab, i); + struct hlist_head *head; struct hlist_node *n; struct htab_elem *l; + raw_spin_lock_bh(&b->lock); + head = &b->head; hlist_for_each_entry_safe(l, n, head, hash_node) { struct sock *sock = l->sk; struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get * the sk_callback_lock before this case but after xchg @@ -2086,16 +2241,15 @@ static void sock_hash_free(struct bpf_map *map) * (psock) to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); - kfree(l); + free_htab_elem(htab, l); } + raw_spin_unlock_bh(&b->lock); } rcu_read_unlock(); - bpf_map_area_free(htab->buckets); - kfree(htab); + call_rcu(&htab->rcu, __bpf_htab_free); } static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, @@ -2122,19 +2276,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, return l_new; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - static inline u32 htab_map_hash(const void *key, u32 key_len) { return jhash(key, key_len, 0); @@ -2226,11 +2367,14 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, b = __select_bucket(htab, hash); head = &b->head; - err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto err; - /* bpf_map_update_elem() can be called in_irq() */ + /* psock is valid here because otherwise above *ctx_update_elem would + * have thrown an error. It is safe to skip error check. + */ + psock = smap_psock_sk(sock); raw_spin_lock_bh(&b->lock); l_old = lookup_elem_raw(head, hash, key, key_size); if (l_old && map_flags == BPF_NOEXIST) { @@ -2248,15 +2392,11 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - psock = smap_psock_sk(sock); - if (unlikely(!psock)) { - err = -EINVAL; - goto bucket_err; - } - - e->hash_link = l_new; - e->htab = container_of(map, struct bpf_htab, map); + rcu_assign_pointer(e->hash_link, l_new); + e->map = map; + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); /* add new element to the head of the list, so that * concurrent search will find it before old elem @@ -2266,19 +2406,17 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, psock = smap_psock_sk(l_old->sk); hlist_del_rcu(&l_old->hash_node); - smap_list_remove(psock, NULL, l_old); + smap_list_hash_remove(psock, l_old); smap_release_sock(psock, l_old->sk); free_htab_elem(htab, l_old); } raw_spin_unlock_bh(&b->lock); return 0; bucket_err: + smap_release_sock(psock, sock); raw_spin_unlock_bh(&b->lock); err: kfree(e); - psock = smap_psock_sk(sock); - if (psock) - smap_release_sock(psock, sock); return err; } @@ -2300,7 +2438,13 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_hash_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2326,7 +2470,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -2334,10 +2477,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); free_htab_elem(htab, l); ret = 0; } @@ -2359,10 +2501,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_bh(&b->lock); l = lookup_elem_raw(head, hash, key, key_size); sk = l ? l->sk : NULL; - raw_spin_unlock_bh(&b->lock); return sk; } @@ -2374,6 +2514,7 @@ const struct bpf_map_ops sock_map_ops = { .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; const struct bpf_map_ops sock_hash_ops = { @@ -2383,6 +2524,8 @@ const struct bpf_map_ops sock_hash_ops = { .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, + .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b675a3f3d141..8061a439ef18 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35dc466641f2..8339d81cba1d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, + .map_check_btf = map_check_no_btf, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -181,32 +182,60 @@ int bpf_map_precharge_memlock(u32 pages) return 0; } -static int bpf_map_charge_memlock(struct bpf_map *map) +static int bpf_charge_memlock(struct user_struct *user, u32 pages) { - struct user_struct *user = get_current_user(); - unsigned long memlock_limit; + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + return 0; +} - atomic_long_add(map->pages, &user->locked_vm); +static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) +{ + atomic_long_sub(pages, &user->locked_vm); +} - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(map->pages, &user->locked_vm); +static int bpf_map_init_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + int ret; + + ret = bpf_charge_memlock(user, map->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } map->user = user; - return 0; + return ret; } -static void bpf_map_uncharge_memlock(struct bpf_map *map) +static void bpf_map_release_memlock(struct bpf_map *map) { struct user_struct *user = map->user; - - atomic_long_sub(map->pages, &user->locked_vm); + bpf_uncharge_memlock(user, map->pages); free_uid(user); } +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) +{ + int ret; + + ret = bpf_charge_memlock(map->user, pages); + if (ret) + return ret; + map->pages += pages; + return ret; +} + +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) +{ + bpf_uncharge_memlock(map->user, pages); + map->pages -= pages; +} + static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -256,7 +285,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -427,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + return -ENOTSUPP; +} + +static int map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + int ret = 0; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + if (map->ops->map_check_btf) + ret = map->ops->map_check_btf(map, key_type, value_type); + + return ret; +} + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -461,8 +518,7 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { + if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; if (!attr->btf_key_type_id || !attr->btf_value_type_id) { @@ -476,8 +532,8 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; @@ -492,7 +548,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; - err = bpf_map_charge_memlock(map); + err = bpf_map_init_memlock(map); if (err) goto free_map_sec; @@ -515,7 +571,7 @@ static int map_create(union bpf_attr *attr) return err; free_map: - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); free_map_nouncharge: @@ -575,7 +631,7 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, { int refold; - refold = __atomic_add_unless(&map->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); if (refold >= BPF_MAX_REFCNT) { __bpf_map_put(map, false); @@ -656,6 +712,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -735,7 +793,9 @@ static int map_update_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_update_elem(map, key, value, attr->flags); goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -760,6 +820,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -927,6 +991,9 @@ static void free_used_maps(struct bpf_prog_aux *aux) { int i; + if (aux->cgroup_storage) + bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); @@ -1142,7 +1209,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (refold >= BPF_MAX_REFCNT) { __bpf_prog_put(prog, false); @@ -1483,8 +1550,6 @@ out_free_tp: return err; } -#ifdef CONFIG_CGROUP_BPF - static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -1499,40 +1564,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, - int type, bool attach) -{ - struct bpf_prog *prog = NULL; - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, type); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); - } - } - - err = sock_map_prog(map, prog, attr->attach_type); - if (err) { - fdput(f); - if (prog) - bpf_prog_put(prog); - return err; - } - - fdput(f); - return 0; -} - #define BPF_F_ATTACH_MASK \ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) @@ -1540,7 +1571,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - struct cgroup *cgrp; int ret; if (!capable(CAP_NET_ADMIN)) @@ -1577,12 +1607,15 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); + ptype = BPF_PROG_TYPE_SK_MSG; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + ptype = BPF_PROG_TYPE_SK_SKB; + break; case BPF_LIRC_MODE2: - return lirc_prog_attach(attr); + ptype = BPF_PROG_TYPE_LIRC_MODE2; + break; default: return -EINVAL; } @@ -1596,18 +1629,20 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); + switch (ptype) { + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + ret = sockmap_get_from_fd(attr, ptype, prog); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + ret = lirc_prog_attach(attr, prog); + break; + default: + ret = cgroup_bpf_prog_attach(attr, ptype, prog); } - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, - attr->attach_flags); if (ret) bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; } @@ -1616,9 +1651,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - struct bpf_prog *prog; - struct cgroup *cgrp; - int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1651,29 +1683,17 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog)) - prog = NULL; - - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); - if (prog) - bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; + return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -1681,9 +1701,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - struct cgroup *cgrp; - int ret; - if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) @@ -1711,14 +1728,9 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->query.target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); - cgroup_put(cgrp); - return ret; + + return cgroup_bpf_prog_query(attr, uattr); } -#endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -2365,7 +2377,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; -#ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; @@ -2375,7 +2386,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr); break; -#endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..92246117d2b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2127,6 +2128,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + if (func_id != BPF_FUNC_get_local_storage) + goto error; + break; /* devmap returns a pointer to a live net_device ifindex that we cannot * allow to be modified from bpf side. So do not allow lookup elements * for now. @@ -2162,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2209,6 +2218,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; + case BPF_FUNC_get_local_storage: + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + goto error; + break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } @@ -2533,6 +2550,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs = cur_regs(env); + + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. + */ + if (func_id == BPF_FUNC_get_local_storage && + !register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -2545,8 +2572,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || + fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + else + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; @@ -3238,8 +3269,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } - /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + /* check dest operand, mark as required later */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -3265,6 +3296,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R = imm * remember the value we stored into this reg */ + /* clear any state __mark_reg_known doesn't set */ + mark_reg_unknown(env, regs, insn->dst_reg); regs[insn->dst_reg].type = SCALAR_VALUE; if (BPF_CLASS(insn->code) == BPF_ALU64) { __mark_reg_known(regs + insn->dst_reg, @@ -5054,7 +5087,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && - !bpf_offload_dev_match(prog, map)) { + !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; } @@ -5152,6 +5185,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + bpf_cgroup_storage_assign(env->prog, map)) { + verbose(env, + "only one cgroup storage is allowed\n"); + fdput(f); + return -EBUSY; + } + fdput(f); next_insn: insn++; @@ -5178,6 +5219,10 @@ static void release_maps(struct bpf_verifier_env *env) { int i; + if (env->prog->aux->cgroup_storage) + bpf_cgroup_storage_release(env->prog, + env->prog->aux->cgroup_storage); + for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); } @@ -5430,6 +5475,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; + /* Upon error here we cannot fall back to interpreter but + * need a hard reject of the program. Thus -EFAULT is + * propagated in any case. + */ subprog = find_subprog(env, i + insn->imm + 1); if (subprog < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", @@ -5450,7 +5499,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) - return -ENOMEM; + goto out_undo_insn; for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; @@ -5515,7 +5564,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -EFAULT; + err = -ENOTSUPP; goto out_free; } cond_resched(); @@ -5552,6 +5601,7 @@ out_free: if (func[i]) bpf_jit_free(func[i]); kfree(func); +out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5578,6 +5628,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) err = jit_subprogs(env); if (err == 0) return 0; + if (err == -EFAULT) + return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON for (i = 0; i < prog->len; i++, insn++) { @@ -5792,27 +5844,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_redirect_map) { - /* Note, we cannot use prog directly as imm as subsequent - * rewrites would still change the prog pointer. The only - * stable address we can use is aux, which also works with - * prog clones during blinding. - */ - u64 addr = (unsigned long)prog->aux; - struct bpf_insn r4_ld[] = { - BPF_LD_IMM64(BPF_REG_4, addr), - *insn, - }; - cnt = ARRAY_SIZE(r4_ld); - - new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index b3c557476a8d..9f8463afda9c 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map) struct xsk_map *m = container_of(map, struct xsk_map, map); int i; + bpf_clear_redirect_map(map); synchronize_net(); for (i = 0; i < map->max_entries; i++) { @@ -227,6 +228,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_lookup_elem = xsk_map_lookup_elem, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, }; - - diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 77ff1cd6a252..75568fcf2180 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -8,6 +8,32 @@ #include <linux/list.h> #include <linux/refcount.h> +#define TRACE_CGROUP_PATH_LEN 1024 +extern spinlock_t trace_cgroup_path_lock; +extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; + +/* + * cgroup_path() takes a spin lock. It is good practice not to take + * spin locks within trace point handlers, as they are mostly hidden + * from normal view. As cgroup_path() can take the kernfs_rename_lock + * spin lock, it is best to not call that function from the trace event + * handler. + * + * Note: trace_cgroup_##type##_enabled() is a static branch that will only + * be set when the trace event is enabled. + */ +#define TRACE_CGROUP_PATH(type, cgrp, ...) \ + do { \ + if (trace_cgroup_##type##_enabled()) { \ + spin_lock(&trace_cgroup_path_lock); \ + cgroup_path(cgrp, trace_cgroup_path, \ + TRACE_CGROUP_PATH_LEN); \ + trace_cgroup_##type(cgrp, trace_cgroup_path, \ + ##__VA_ARGS__); \ + spin_unlock(&trace_cgroup_path_lock); \ + } \ + } while (0) + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 8b4f0768efd6..51063e7a93c2 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -135,7 +135,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) - trace_cgroup_transfer_tasks(to, task, false); + TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); @@ -865,7 +865,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) - trace_cgroup_rename(cgrp); + TRACE_CGROUP_PATH(rename, cgrp); mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 077370bf8964..aae10baf1902 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -83,6 +83,9 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif +DEFINE_SPINLOCK(trace_cgroup_path_lock); +char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; + /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. @@ -2638,7 +2641,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, cgroup_migrate_finish(&mgctx); if (!ret) - trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } @@ -3557,7 +3560,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), - cgroup_file_mode(cft), 0, cft->kf_ops, cft, + cgroup_file_mode(cft), + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -4634,7 +4639,7 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup *tcgrp; /* cgroup release path */ - trace_cgroup_release(cgrp); + TRACE_CGROUP_PATH(release, cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); @@ -4977,7 +4982,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (ret) goto out_destroy; - trace_cgroup_mkdir(cgrp); + TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(kn); @@ -5165,9 +5170,8 @@ int cgroup_rmdir(struct kernfs_node *kn) return 0; ret = cgroup_destroy_locked(cgrp); - if (!ret) - trace_cgroup_rmdir(cgrp); + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; diff --git a/kernel/compat.c b/kernel/compat.c index 8e40efc2928a..089d00d0da9c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -93,28 +93,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) return 0; } -static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..ed44d7d34c2d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -60,6 +60,7 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; + bool booted_once; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -290,6 +291,12 @@ void cpus_read_lock(void) } EXPORT_SYMBOL_GPL(cpus_read_lock); +int cpus_read_trylock(void) +{ + return percpu_down_read_trylock(&cpu_hotplug_lock); +} +EXPORT_SYMBOL_GPL(cpus_read_trylock); + void cpus_read_unlock(void) { percpu_up_read(&cpu_hotplug_lock); @@ -342,6 +349,85 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_HOTPLUG_SMT +enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; +EXPORT_SYMBOL_GPL(cpu_smt_control); + +static bool cpu_smt_available __read_mostly; + +void __init cpu_smt_disable(bool force) +{ + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return; + + if (force) { + pr_info("SMT: Force disabled\n"); + cpu_smt_control = CPU_SMT_FORCE_DISABLED; + } else { + cpu_smt_control = CPU_SMT_DISABLED; + } +} + +/* + * The decision whether SMT is supported can only be done after the full + * CPU identification. Called from architecture code before non boot CPUs + * are brought up. + */ +void __init cpu_smt_check_topology_early(void) +{ + if (!topology_smt_supported()) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; +} + +/* + * If SMT was disabled by BIOS, detect it here, after the CPUs have been + * brought online. This ensures the smt/l1tf sysfs entries are consistent + * with reality. cpu_smt_available is set to true during the bringup of non + * boot CPUs when a SMT sibling is detected. Note, this may overwrite + * cpu_smt_control's previous setting. + */ +void __init cpu_smt_check_topology(void) +{ + if (!cpu_smt_available) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; +} + +static int __init smt_cmdline_disable(char *str) +{ + cpu_smt_disable(str && !strcmp(str, "force")); + return 0; +} +early_param("nosmt", smt_cmdline_disable); + +static inline bool cpu_smt_allowed(unsigned int cpu) +{ + if (topology_is_primary_thread(cpu)) + return true; + + /* + * If the CPU is not a 'primary' thread and the booted_once bit is + * set then the processor has SMT support. Store this information + * for the late check of SMT support in cpu_smt_check_topology(). + */ + if (per_cpu(cpuhp_state, cpu).booted_once) + cpu_smt_available = true; + + if (cpu_smt_control == CPU_SMT_ENABLED) + return true; + + /* + * On x86 it's required to boot all logical CPUs at least once so + * that the init code can get a chance to set CR4.MCE on each + * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * core will shutdown the machine. + */ + return !per_cpu(cpuhp_state, cpu).booted_once; +} +#else +static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +#endif + static inline enum cpuhp_state cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) { @@ -422,6 +508,16 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); + /* + * SMT soft disabling on X86 requires to bring the CPU out of the + * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The + * CPU marked itself as booted_once in cpu_notify_starting() so the + * cpu_smt_allowed() check will now return false if this is not the + * primary sibling. + */ + if (!cpu_smt_allowed(cpu)) + return -ECANCELED; + if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; @@ -754,7 +850,6 @@ static int takedown_cpu(unsigned int cpu) /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); - smpboot_park_threads(cpu); /* * Prevent irq alloc/free while the dying cpu reorganizes the @@ -907,20 +1002,19 @@ out: return ret; } +static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) +{ + if (cpu_hotplug_disabled) + return -EBUSY; + return _cpu_down(cpu, 0, target); +} + static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; cpu_maps_update_begin(); - - if (cpu_hotplug_disabled) { - err = -EBUSY; - goto out; - } - - err = _cpu_down(cpu, 0, target); - -out: + err = cpu_down_maps_locked(cpu, target); cpu_maps_update_done(); return err; } @@ -949,6 +1043,7 @@ void notify_cpu_starting(unsigned int cpu) int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ + st->booted_once = true; while (st->state < target) { st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); @@ -1058,6 +1153,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) err = -EBUSY; goto out; } + if (!cpu_smt_allowed(cpu)) { + err = -EPERM; + goto out; + } err = _cpu_up(cpu, 0, target); out: @@ -1274,7 +1373,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_PREPARE] = { - .name = "timers:dead", + .name = "timers:prepare", .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, @@ -1332,7 +1431,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, - .teardown.single = NULL, + .teardown.single = smpboot_park_threads, }, [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { .name = "irq/affinity:online", @@ -1344,6 +1443,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_AP_WATCHDOG_ONLINE] = { + .name = "lockup_detector:online", + .startup.single = lockup_detector_online_cpu, + .teardown.single = lockup_detector_offline_cpu, + }, [CPUHP_AP_WORKQUEUE_ONLINE] = { .name = "workqueue:online", .startup.single = workqueue_online_cpu, @@ -1906,10 +2010,172 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { NULL }; +#ifdef CONFIG_HOTPLUG_SMT + +static const char *smt_states[] = { + [CPU_SMT_ENABLED] = "on", + [CPU_SMT_DISABLED] = "off", + [CPU_SMT_FORCE_DISABLED] = "forceoff", + [CPU_SMT_NOT_SUPPORTED] = "notsupported", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); +} + +static void cpuhp_offline_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = true; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); +} + +static void cpuhp_online_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = false; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_ONLINE); +} + +static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; + /* + * As this needs to hold the cpu maps lock it's impossible + * to call device_offline() because that ends up calling + * cpu_down() which takes cpu maps lock. cpu maps lock + * needs to be held as this might race against in kernel + * abusers of the hotplug machinery (thermal management). + * + * So nothing would update device:offline state. That would + * leave the sysfs entry stale and prevent onlining after + * smt control has been changed to 'off' again. This is + * called under the sysfs hotplug lock, so it is properly + * serialized against the regular offline usage. + */ + cpuhp_offline_cpu_device(cpu); + } + if (!ret) + cpu_smt_control = ctrlval; + cpu_maps_update_done(); + return ret; +} + +static int cpuhp_smt_enable(void) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; + /* See comment in cpuhp_smt_disable() */ + cpuhp_online_cpu_device(cpu); + } + cpu_maps_update_done(); + return ret; +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int ctrlval, ret; + + if (sysfs_streq(buf, "on")) + ctrlval = CPU_SMT_ENABLED; + else if (sysfs_streq(buf, "off")) + ctrlval = CPU_SMT_DISABLED; + else if (sysfs_streq(buf, "forceoff")) + ctrlval = CPU_SMT_FORCE_DISABLED; + else + return -EINVAL; + + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) + return -EPERM; + + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return -ENODEV; + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + if (ctrlval != cpu_smt_control) { + switch (ctrlval) { + case CPU_SMT_ENABLED: + ret = cpuhp_smt_enable(); + break; + case CPU_SMT_DISABLED: + case CPU_SMT_FORCE_DISABLED: + ret = cpuhp_smt_disable(ctrlval); + break; + } + } + + unlock_device_hotplug(); + return ret ? ret : count; +} +static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); + +static ssize_t +show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) +{ + bool active = topology_max_smt_threads() > 1; + + return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); +} +static DEVICE_ATTR(active, 0444, show_smt_active, NULL); + +static struct attribute *cpuhp_smt_attrs[] = { + &dev_attr_control.attr, + &dev_attr_active.attr, + NULL +}; + +static const struct attribute_group cpuhp_smt_attr_group = { + .attrs = cpuhp_smt_attrs, + .name = "smt", + NULL +}; + +static int __init cpu_smt_state_init(void) +{ + return sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_smt_attr_group); +} + +#else +static inline int cpu_smt_state_init(void) { return 0; } +#endif + static int __init cpuhp_sysfs_init(void) { int cpu, ret; + ret = cpu_smt_state_init(); + if (ret) + return ret; + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_cpu_root_attr_group); if (ret) @@ -2010,7 +2276,10 @@ void __init boot_cpu_init(void) /* * Must be called _AFTER_ setting up the per_cpu areas */ -void __init boot_cpu_state_init(void) +void __init boot_cpu_hotplug_init(void) { - per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; +#ifdef CONFIG_SMP + this_cpu_write(cpuhp_state.booted_once, true); +#endif + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b66aced5e8c2..933cb3e45b98 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,8 +14,8 @@ #include <asm/sections.h> /* vmcoreinfo stuff */ -static unsigned char *vmcoreinfo_data; -static size_t vmcoreinfo_size; +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ @@ -344,7 +344,7 @@ void crash_save_vmcoreinfo(void) if (vmcoreinfo_data_safecopy) vmcoreinfo_data = vmcoreinfo_data_safecopy; - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); update_vmcoreinfo_note(); } @@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); #ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmap_area_list); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d987dcd1bd56..286d82329eb0 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * @dev: Pointer to device for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP flags to use for this allocation. + * @no_warn: Avoid printing message about failed allocation. * * This function allocates memory buffer for specified device. It uses * device specific contiguous memory area if available or the default @@ -186,12 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * function. */ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align, gfp_t gfp_mask) + unsigned int align, bool no_warn) { if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); + return cma_alloc(dev_get_cma_area(dev), count, align, no_warn); } /** diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8be8106270c2..1c35b7b945d0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + page = dma_alloc_from_contiguous(dev, count, page_order, + gfp & __GFP_NOWARN); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_release_from_contiguous(dev, page, count); page = NULL; @@ -180,10 +181,10 @@ int dma_direct_supported(struct device *dev, u64 mask) return 0; #endif /* - * Various PCI/PCIe bridges have broken support for > 32bit DMA even - * if the device itself might support it. + * Upstream PCI/PCIe bridges or SoC interconnects may not carry + * as many DMA address bits as the device itself supports. */ - if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) + if (dev->bus_dma_mask && mask > dev->bus_dma_mask) return 0; return 1; } diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c index 79e9a757387f..031fe235d958 100644 --- a/kernel/dma/noncoherent.c +++ b/kernel/dma/noncoherent.c @@ -49,11 +49,13 @@ static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, return nents; } -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) static void dma_noncoherent_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); + arch_sync_dma_for_cpu_all(dev); } static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, @@ -64,6 +66,7 @@ static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, for_each_sg(sgl, sg, nents, i) arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu_all(dev); } static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, @@ -89,7 +92,8 @@ const struct dma_map_ops dma_noncoherent_ops = { .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, .map_page = dma_noncoherent_map_page, .map_sg = dma_noncoherent_map_sg, -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, .unmap_page = dma_noncoherent_unmap_page, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 904541055792..4f8a6dbf0b60 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -17,6 +17,8 @@ * 08/12/11 beckyb Add highmem support */ +#define pr_fmt(fmt) "software IO TLB: " fmt + #include <linux/cache.h> #include <linux/dma-direct.h> #include <linux/mm.h> @@ -162,20 +164,16 @@ static bool no_iotlb_memory; void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - unsigned char *vstart, *vend; if (no_iotlb_memory) { - pr_warn("software IO TLB: No low mem\n"); + pr_warn("No low mem\n"); return; } - vstart = phys_to_virt(io_tlb_start); - vend = phys_to_virt(io_tlb_end); - - printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", + pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", (unsigned long long)io_tlb_start, (unsigned long long)io_tlb_end, - bytes >> 20, vstart, vend - 1); + bytes >> 20); } /* @@ -275,7 +273,7 @@ swiotlb_init(int verbose) if (io_tlb_start) memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - pr_warn("Cannot allocate SWIOTLB buffer"); + pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -317,8 +315,8 @@ swiotlb_late_init_with_default_size(size_t default_size) return -ENOMEM; } if (order != get_order(bytes)) { - printk(KERN_WARNING "Warning: only able to allocate %ld MB " - "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + pr_warn("only able to allocate %ld MB\n", + (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; } rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c187aa3df3c8..24a77c34e9ad 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f0434a9951a..2a62b96600ad 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1334,7 +1334,7 @@ static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - return perf_event_pid_type(event, p, __PIDTYPE_TGID); + return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) @@ -1656,7 +1656,7 @@ perf_event_groups_next(struct perf_event *event) typeof(*event), group_node)) /* - * Add a event from the lists for its context. + * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -1844,7 +1844,7 @@ static void perf_group_attach(struct perf_event *event) } /* - * Remove a event from the lists for its context. + * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -2148,7 +2148,7 @@ static void __perf_event_disable(struct perf_event *event, } /* - * Disable a event. + * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to @@ -2677,7 +2677,7 @@ static void __perf_event_enable(struct perf_event *event, } /* - * Enable a event. + * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to @@ -2755,7 +2755,7 @@ static int __perf_event_stop(void *info) * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * - * Since this is happening on a event-local CPU, no trace is lost + * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) @@ -4827,7 +4827,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count) int ret; /* - * Return end-of-file for a read on a event that is in + * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ @@ -5246,8 +5246,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg = rb->user_page; /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. + * Disable preemption to guarantee consistent time stamps are stored to + * the user page. */ preempt_disable(); ++userpg->lock; @@ -5273,11 +5273,11 @@ unlock: } EXPORT_SYMBOL_GPL(perf_event_update_userpage); -static int perf_mmap_fault(struct vm_fault *vmf) +static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; - int ret = VM_FAULT_SIGBUS; + vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { if (vmf->pgoff == 0) @@ -6343,7 +6343,7 @@ static u64 perf_virt_to_phys(u64 virt) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; -static struct perf_callchain_entry * +struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; @@ -6382,7 +6382,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(event, regs); + if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + data->callchain = perf_callchain(event, regs); + size += data->callchain->nr; header->size += size * sizeof(u64); @@ -7335,6 +7337,10 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { + /* d_inode(NULL) won't be equal to any mapped user-space file */ + if (!filter->path.dentry) + return false; + if (d_inode(filter->path.dentry) != file_inode(file)) return false; @@ -9898,7 +9904,7 @@ enabled: } /* - * Allocate and initialize a event structure + * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, @@ -11229,7 +11235,7 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) } /* - * Inherit a event from parent task to child task. + * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6e28d2866be5..b3814fce5ecb 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -345,13 +345,13 @@ void release_bp_slot(struct perf_event *bp) mutex_unlock(&nr_bp_mutex); } -static int __modify_bp_slot(struct perf_event *bp, u64 old_type) +static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int err; __release_bp_slot(bp, old_type); - err = __reserve_bp_slot(bp, bp->attr.bp_type); + err = __reserve_bp_slot(bp, new_type); if (err) { /* * Reserve the old_type slot back in case @@ -367,12 +367,12 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type) return err; } -static int modify_bp_slot(struct perf_event *bp, u64 old_type) +static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int ret; mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type); + ret = __modify_bp_slot(bp, old_type, new_type); mutex_unlock(&nr_bp_mutex); return ret; } @@ -400,16 +400,18 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } -static int validate_hw_breakpoint(struct perf_event *bp) +static int hw_breakpoint_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - int ret; + int err; - ret = arch_validate_hwbkpt_settings(bp); - if (ret) - return ret; + err = hw_breakpoint_arch_parse(bp, attr, hw); + if (err) + return err; - if (arch_check_bp_in_kernelspace(bp)) { - if (bp->attr.exclude_kernel) + if (arch_check_bp_in_kernelspace(hw)) { + if (attr->exclude_kernel) return -EINVAL; /* * Don't let unprivileged users set a breakpoint in the trap @@ -424,19 +426,22 @@ static int validate_hw_breakpoint(struct perf_event *bp) int register_perf_hw_breakpoint(struct perf_event *bp) { - int ret; - - ret = reserve_bp_slot(bp); - if (ret) - return ret; + struct arch_hw_breakpoint hw; + int err; - ret = validate_hw_breakpoint(bp); + err = reserve_bp_slot(bp); + if (err) + return err; - /* if arch_validate_hwbkpt_settings() fails then release bp slot */ - if (ret) + err = hw_breakpoint_parse(bp, &bp->attr, &hw); + if (err) { release_bp_slot(bp); + return err; + } - return ret; + bp->hw.info = hw; + + return 0; } /** @@ -456,35 +461,44 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); +static void hw_breakpoint_copy_attr(struct perf_event_attr *to, + struct perf_event_attr *from) +{ + to->bp_addr = from->bp_addr; + to->bp_type = from->bp_type; + to->bp_len = from->bp_len; + to->disabled = from->disabled; +} + int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - bool modify = attr->bp_type != old_type; - int err = 0; + struct arch_hw_breakpoint hw; + int err; - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; + err = hw_breakpoint_parse(bp, attr, &hw); + if (err) + return err; - if (check && memcmp(&bp->attr, attr, sizeof(*attr))) - return -EINVAL; + if (check) { + struct perf_event_attr old_attr; - err = validate_hw_breakpoint(bp); - if (!err && modify) - err = modify_bp_slot(bp, old_type); + old_attr = bp->attr; + hw_breakpoint_copy_attr(&old_attr, attr); + if (memcmp(&old_attr, attr, sizeof(*attr))) + return -EINVAL; + } - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - return err; + if (bp->attr.bp_type != attr->bp_type) { + err = modify_bp_slot(bp, bp->attr.bp_type, attr->bp_type); + if (err) + return err; } - bp->attr.disabled = attr->disabled; + hw_breakpoint_copy_attr(&bp->attr, attr); + bp->hw.info = hw; + return 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ccc579a7d32e..3207a4d26849 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -299,8 +299,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. */ -int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, - uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; struct vm_area_struct *vma; @@ -351,7 +351,7 @@ put_old: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -366,7 +366,8 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); + return uprobe_write_opcode(auprobe, mm, vaddr, + *(uprobe_opcode_t *)&auprobe->insn); } static struct uprobe *get_uprobe(struct uprobe *uprobe) @@ -840,13 +841,8 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) -{ - consumer_add(uprobe, uc); - return register_for_each_vma(uprobe, uc); -} - -static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void +__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; @@ -860,24 +856,46 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u } /* - * uprobe_register - register a probe + * uprobe_unregister - unregister an already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + struct uprobe *uprobe; + + uprobe = find_uprobe(inode, offset); + if (WARN_ON(!uprobe)) + return; + + down_write(&uprobe->register_rwsem); + __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister); + +/* + * __uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @uc: information on howto handle the probe.. * - * Apart from the access refcount, uprobe_register() takes a creation + * Apart from the access refcount, __uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of uprobe_register() is required to keep @inode + * unregisters. Caller of __uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return errno if it cannot successully install probes * else return 0 (success) */ -int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +static int __uprobe_register(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -904,7 +922,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * down_write(&uprobe->register_rwsem); ret = -EAGAIN; if (likely(uprobe_is_active(uprobe))) { - ret = __uprobe_register(uprobe, uc); + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); } @@ -915,10 +934,16 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * goto retry; return ret; } + +int uprobe_register(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc) +{ + return __uprobe_register(inode, offset, uc); +} EXPORT_SYMBOL_GPL(uprobe_register); /* - * uprobe_apply - unregister a already registered probe. + * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. * @uc: consumer which wants to add more or remove some breakpoints @@ -946,27 +971,6 @@ int uprobe_apply(struct inode *inode, loff_t offset, return ret; } -/* - * uprobe_unregister - unregister a already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. - */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - struct uprobe *uprobe; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return; - - down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); -} -EXPORT_SYMBOL_GPL(uprobe_unregister); - static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { struct vm_area_struct *vma; @@ -1403,7 +1407,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri) /* * Called with no locks held. - * Called in context of a exiting or a exec-ing thread. + * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { diff --git a/kernel/exit.c b/kernel/exit.c index c3c7ac560114..0e21e6d21f35 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -73,6 +73,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { + detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -680,7 +681,8 @@ static void forget_original_parent(struct task_struct *father, t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, - SEND_SIG_NOINFO, t); + SEND_SIG_NOINFO, t, + PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to @@ -1001,14 +1003,6 @@ struct wait_opts { int notask_error; }; -static inline -struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; -} - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 5349c91c2298..bc80a4e268c0 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -184,9 +184,6 @@ static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) if (should_fail(&fei_fault_attr, 1)) { regs_set_return_value(regs, attr->retval); override_function_with_return(regs); - /* Kprobe specific fixup */ - reset_current_kprobe(); - preempt_enable_no_resched(); return 1; } diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..d896e9ca38b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -303,11 +303,37 @@ struct kmem_cache *files_cachep; struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; +static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (vma) + vma_init(vma, mm); + return vma; +} + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + + if (new) { + *new = *orig; + INIT_LIST_HEAD(&new->anon_vma_chain); + } + return new; +} + +void vm_area_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + static void account_kernel_stack(struct task_struct *tsk, int account) { void *stack = task_stack_page(tsk); @@ -455,11 +481,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; @@ -539,7 +563,7 @@ fail_uprobe_end: fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); + vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); @@ -843,6 +867,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->fail_nth = 0; #endif +#ifdef CONFIG_BLK_CGROUP + tsk->throttle_queue = NULL; + tsk->use_memdelay = 0; +#endif + +#ifdef CONFIG_MEMCG + tsk->active_memcg = NULL; +#endif return tsk; free_stack: @@ -1270,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; + tsk->last_switch_time = 0; #endif tsk->mm = NULL; @@ -1394,7 +1427,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; atomic_set(&sig->count, 1); + spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + spin_unlock_irq(¤t->sighand->siglock); return 0; } @@ -1456,6 +1491,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); + INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); @@ -1549,10 +1585,22 @@ static void posix_cpu_timers_init(struct task_struct *tsk) static inline void posix_cpu_timers_init(struct task_struct *tsk) { } #endif +static inline void init_task_pid_links(struct task_struct *task) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&task->pid_links[type]); + } +} + static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { - task->pids[type].pid = pid; + if (type == PIDTYPE_PID) + task->thread_pid = pid; + else + task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) @@ -1590,6 +1638,7 @@ static __latent_entropy struct task_struct *copy_process( { int retval; struct task_struct *p; + struct multiprocess_signals delayed; /* * Don't allow sharing the root directory with processes in a different @@ -1637,6 +1686,24 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + /* + * Force any signals received before this point to be delivered + * before the fork happens. Collect up signals sent to multiple + * processes that happen during the fork and delay them so that + * they appear to happen after the fork. + */ + sigemptyset(&delayed.signal); + INIT_HLIST_NODE(&delayed.node); + + spin_lock_irq(¤t->sighand->siglock); + if (!(clone_flags & CLONE_THREAD)) + hlist_add_head(&delayed.node, ¤t->signal->multiprocess); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + retval = -ERESTARTNOINTR; + if (signal_pending(current)) + goto fork_out; + retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) @@ -1910,29 +1977,26 @@ static __latent_entropy struct task_struct *copy_process( rseq_fork(p, clone_flags); - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL). - */ - recalc_sigpending(); - if (signal_pending(current)) { - retval = -ERESTARTNOINTR; - goto bad_fork_cancel_cgroup; - } + /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } + /* Let kill terminate clone/fork in the middle */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto bad_fork_cancel_cgroup; + } + + + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { + init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); @@ -1940,8 +2004,7 @@ static __latent_entropy struct task_struct *copy_process( ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } - - p->signal->leader_pid = pid; + p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same @@ -1952,6 +2015,7 @@ static __latent_entropy struct task_struct *copy_process( p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); @@ -1959,6 +2023,7 @@ static __latent_entropy struct task_struct *copy_process( current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); + task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, @@ -1967,8 +2032,8 @@ static __latent_entropy struct task_struct *copy_process( attach_pid(p, PIDTYPE_PID); nr_threads++; } - total_forks++; + hlist_del_init(&delayed.node); spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); @@ -2033,16 +2098,19 @@ bad_fork_free: put_task_stack(p); free_task(p); fork_out: + spin_lock_irq(¤t->sighand->siglock); + hlist_del_init(&delayed.node); + spin_unlock_irq(¤t->sighand->siglock); return ERR_PTR(retval); } -static inline void init_idle_pids(struct pid_link *links) +static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { - INIT_HLIST_NODE(&links[type].node); /* not really needed */ - links[type].pid = &init_struct_pid; + INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ + init_task_pid(idle, type, &init_struct_pid); } } @@ -2052,7 +2120,7 @@ struct task_struct *fork_idle(int cpu) task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { - init_idle_pids(task->pids); + init_idle_pids(task); init_idle(task, cpu); } @@ -2253,6 +2321,8 @@ static void sighand_ctor(void *data) void __init proc_caches_init(void) { + unsigned int mm_size; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -2269,15 +2339,16 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); + /* - * FIXME! The "sizeof(struct mm_struct)" currently includes the - * whole struct cpumask for the OFFSTACK case. We could change - * this to *only* allocate as much of it as required by the - * maximum number of CPU's we can ever have. The cpumask_allocation - * is at the end of the structure, exactly for that reason. + * The mm_cpumask is located at the end of mm_struct, and is + * dynamically sized based on the maximum CPU number this system + * can have, taking hotplug into account (nr_cpu_ids). */ + mm_size = sizeof(struct mm_struct) + cpumask_size(); + mm_cachep = kmem_cache_create_usercopy("mm_struct", - sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct mm_struct, saved_auxv), sizeof_field(struct mm_struct, saved_auxv), diff --git a/kernel/freezer.c b/kernel/freezer.c index 6f56a9e219fa..b162b74611e4 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -15,7 +15,9 @@ atomic_t system_freezing_cnt = ATOMIC_INIT(0); EXPORT_SYMBOL(system_freezing_cnt); -/* indicate whether PM freezing is in effect, protected by pm_mutex */ +/* indicate whether PM freezing is in effect, protected by + * system_transition_mutex + */ bool pm_freezing; bool pm_nosig_freezing; diff --git a/kernel/futex.c b/kernel/futex.c index 1f450e092c74..11fc3bb456d6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3523,10 +3523,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; + /* fall through */ case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; + /* fall through */ case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83f830acbb5f..410a77a8f6e2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -173,7 +173,7 @@ err_unlock: } COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct compat_timespec __user *, utime, u32 __user *, uaddr2, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { struct timespec ts; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 32b479468e4d..b9132d1269ef 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +/* + * Zero (default value) means use sysctl_hung_task_timeout_secs: + */ +unsigned long __read_mostly sysctl_hung_task_check_interval_secs; + int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; @@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; + t->last_switch_time = jiffies; return; } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return; trace_sched_process_hang(t); @@ -245,8 +253,13 @@ static int watchdog(void *dummy) for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; - long t = hung_timeout_jiffies(hung_last_checked, timeout); + unsigned long interval = sysctl_hung_task_check_interval_secs; + long t; + if (interval == 0) + interval = timeout; + interval = min_t(unsigned long, interval, timeout); + t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0)) check_hung_uninterruptible_tasks(timeout); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index c6766f326072..5f3e2baefca9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -134,7 +134,6 @@ config GENERIC_IRQ_DEBUGFS endmenu config GENERIC_IRQ_MULTI_HANDLER - depends on !MULTI_IRQ_HANDLER bool help Allow to specify the low level IRQ handler at run time. diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index afc7f902d74a..578d0e5f1b5b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -443,6 +443,7 @@ static void free_desc(unsigned int irq) * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. + * This also allows us to use rcu in kstat_irqs_usr(). */ call_rcu(&desc->rcu, delayed_free_desc); } @@ -928,17 +929,17 @@ unsigned int kstat_irqs(unsigned int irq) * kstat_irqs_usr - Get the statistics for an interrupt * @irq: The interrupt number * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. Contrary to kstat_irqs() this can be called from any - * preemptible context. It's protected against concurrent removal of - * an interrupt descriptor when sparse irqs are enabled. + * Returns the sum of interrupt counts on all cpus since boot for @irq. + * Contrary to kstat_irqs() this can be called from any context. + * It uses rcu since a concurrent removal of an interrupt descriptor is + * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; - irq_lock_sparse(); + rcu_read_lock(); sum = kstat_irqs(irq); - irq_unlock_sparse(); + rcu_read_unlock(); return sum; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index daeabd791d58..fb86146037a7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -790,9 +790,19 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) static int irq_wait_for_interrupt(struct irqaction *action) { - set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { + if (kthread_should_stop()) { + /* may need to run one last time */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { + __set_current_state(TASK_RUNNING); + return 0; + } + __set_current_state(TASK_RUNNING); + return -1; + } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { @@ -800,10 +810,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) return 0; } schedule(); - set_current_state(TASK_INTERRUPTIBLE); } - __set_current_state(TASK_RUNNING); - return -1; } /* @@ -1024,11 +1031,8 @@ static int irq_thread(void *data) /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling - * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the - * oneshot mask bit can be set. We cannot verify that as we - * cannot touch the oneshot mask at this point anymore as - * __setup_irq() might have given out currents thread_mask - * again. + * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the + * oneshot mask bit can be set. */ task_work_cancel(current, irq_thread_dtor); return 0; @@ -1068,6 +1072,13 @@ static int irq_setup_forced_threading(struct irqaction *new) if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; + /* + * No further action required for interrupts which are requested as + * threaded interrupts already + */ + if (new->handler == irq_default_primary_handler) + return 0; + new->flags |= IRQF_ONESHOT; /* @@ -1075,7 +1086,7 @@ static int irq_setup_forced_threading(struct irqaction *new) * thread handler. We force thread them as well by creating a * secondary action. */ - if (new->handler != irq_default_primary_handler && new->thread_fn) { + if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) @@ -1244,8 +1255,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * Protects against a concurrent __free_irq() call which might wait - * for synchronize_irq() to complete without holding the optional - * chip bus lock and desc->lock. + * for synchronize_hardirq() to complete without holding the optional + * chip bus lock and desc->lock. Also protects against handing out + * a recycled oneshot thread_mask bit while it's still in use by + * its previous owner. */ mutex_lock(&desc->request_mutex); @@ -1564,9 +1577,6 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); - if (!desc) - return NULL; - mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); @@ -1613,11 +1623,11 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang - * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock - * because synchronize_irq() would wait forever for the thread to + * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a @@ -1629,7 +1639,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) unregister_handler_proc(irq, action); /* Make sure it's not being used on another CPU: */ - synchronize_irq(irq); + synchronize_hardirq(irq); #ifdef CONFIG_DEBUG_SHIRQ /* @@ -1638,7 +1648,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a - * 'real' IRQ doesn't run in * parallel with our fake. ) + * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); @@ -1647,6 +1657,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } #endif + /* + * The action has already been removed above, but the thread writes + * its oneshot mask bit when it completes. Though request_mutex is + * held across this which prevents __setup_irq() from handing out + * the same bit to a newly requested action. + */ if (action->thread) { kthread_stop(action->thread); put_task_struct(action->thread); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 37eda10f5c36..da9addb8d655 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -475,22 +475,24 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(i); if (!desc) goto outsparse; - raw_spin_lock_irqsave(&desc->lock, flags); - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); - action = desc->action; - if ((!action || irq_desc_is_chained(desc)) && !any_count) - goto out; + if (desc->kstat_irqs) + for_each_online_cpu(j) + any_count |= *per_cpu_ptr(desc->kstat_irqs, j); + + if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) + goto outsparse; seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, "%10u ", desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, j) : 0); + raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); @@ -511,6 +513,7 @@ int show_interrupts(struct seq_file *p, void *v) if (desc->name) seq_printf(p, "-%-8s", desc->name); + action = desc->action; if (action) { seq_printf(p, " %s", action->name); while ((action = action->next) != NULL) @@ -518,10 +521,9 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); -out: raw_spin_unlock_irqrestore(&desc->lock, flags); outsparse: - irq_unlock_sparse(); + rcu_read_unlock(); return 0; } #endif diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a23e21ada81b..02a0b01380d8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -432,6 +432,7 @@ int sprint_backtrace(char *buffer, unsigned long address) /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; + loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; unsigned long value; @@ -443,9 +444,29 @@ struct kallsym_iter { int show_value; }; +int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name) +{ + return -EINVAL; +} + +static int get_ksymbol_arch(struct kallsym_iter *iter) +{ + int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, &iter->type, + iter->name); + + if (ret < 0) { + iter->pos_arch_end = iter->pos; + return 0; + } + + return 1; +} + static int get_ksymbol_mod(struct kallsym_iter *iter) { - int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, + int ret = module_get_kallsym(iter->pos - iter->pos_arch_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); @@ -501,32 +522,34 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { + iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; } } +/* + * The end position (last + 1) of each additional kallsyms section is recorded + * in iter->pos_..._end as each section is added, and so can be used to + * determine which get_ksymbol_...() function to call next. + */ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; - if (iter->pos_ftrace_mod_end > 0 && - iter->pos_ftrace_mod_end < iter->pos) - return get_ksymbol_bpf(iter); + if ((!iter->pos_arch_end || iter->pos_arch_end > pos) && + get_ksymbol_arch(iter)) + return 1; - if (iter->pos_mod_end > 0 && - iter->pos_mod_end < iter->pos) { - if (!get_ksymbol_ftrace_mod(iter)) - return get_ksymbol_bpf(iter); + if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && + get_ksymbol_mod(iter)) return 1; - } - if (!get_ksymbol_mod(iter)) { - if (!get_ksymbol_ftrace_mod(iter)) - return get_ksymbol_bpf(iter); - } + if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) && + get_ksymbol_ftrace_mod(iter)) + return 1; - return 1; + return get_ksymbol_bpf(iter); } /* Returns false if pos at or past end of file. */ diff --git a/kernel/kexec.c b/kernel/kexec.c index aed8fb2564b3..68559808fdfa 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -11,6 +11,7 @@ #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> +#include <linux/security.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> @@ -195,10 +196,17 @@ out: static inline int kexec_load_check(unsigned long nr_segments, unsigned long flags) { + int result; + /* We only trust the superuser with rebooting the system. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; + /* Permit LSMs and IMA to fail the kexec */ + result = security_kernel_load_data(LOADING_KEXEC_IMAGE); + if (result < 0) + return result; + /* * Verify we have a legal set of flags * This leaves us room for future extensions. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ea619021d901..ab257be4d924 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -627,8 +627,8 @@ static void optimize_kprobe(struct kprobe *p) (kprobe_disabled(p) || kprobes_all_disarmed)) return; - /* Both of break_handler and post_handler are not supported. */ - if (p->break_handler || p->post_handler) + /* kprobes with post_handler can not be optimized */ + if (p->post_handler) return; op = container_of(p, struct optimized_kprobe, kp); @@ -710,9 +710,7 @@ static void reuse_unused_kprobe(struct kprobe *ap) * there is still a relative jump) and disabled. */ op = container_of(ap, struct optimized_kprobe, kp); - if (unlikely(list_empty(&op->list))) - printk(KERN_WARNING "Warning: found a stray unused " - "aggrprobe@%p\n", ap->addr); + WARN_ON_ONCE(list_empty(&op->list)); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ @@ -985,7 +983,8 @@ static int arm_kprobe_ftrace(struct kprobe *p) ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); if (ret) { - pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", + p->addr, ret); return ret; } @@ -1025,7 +1024,8 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); - WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", + p->addr, ret); return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ @@ -1116,20 +1116,6 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(aggr_fault_handler); -static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *cur = __this_cpu_read(kprobe_instance); - int ret = 0; - - if (cur && cur->break_handler) { - if (cur->break_handler(cur, regs)) - ret = 1; - } - reset_kprobe_instance(); - return ret; -} -NOKPROBE_SYMBOL(aggr_break_handler); - /* Walks the list and increments nmissed count for multiprobe case */ void kprobes_inc_nmissed_count(struct kprobe *p) { @@ -1270,24 +1256,15 @@ static void cleanup_rp_inst(struct kretprobe *rp) } NOKPROBE_SYMBOL(cleanup_rp_inst); -/* -* Add the new probe to ap->list. Fail if this is the -* second jprobe at the address - two jprobes can't coexist -*/ +/* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); - if (p->break_handler || p->post_handler) + if (p->post_handler) unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ - if (p->break_handler) { - if (ap->break_handler) - return -EEXIST; - list_add_tail_rcu(&p->list, &ap->list); - ap->break_handler = aggr_break_handler; - } else - list_add_rcu(&p->list, &ap->list); + list_add_rcu(&p->list, &ap->list); if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; @@ -1310,8 +1287,6 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) /* We don't care the kprobe which has gone. */ if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); INIT_HLIST_NODE(&ap->hlist); @@ -1706,8 +1681,6 @@ static int __unregister_kprobe_top(struct kprobe *p) goto disarmed; else { /* If disabling probe has special handlers, update aggrprobe */ - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = NULL; if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) @@ -1812,77 +1785,6 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -#if 0 -int register_jprobes(struct jprobe **jps, int num) -{ - int ret = 0, i; - - if (num <= 0) - return -EINVAL; - - for (i = 0; i < num; i++) { - ret = register_jprobe(jps[i]); - - if (ret < 0) { - if (i > 0) - unregister_jprobes(jps, i); - break; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(register_jprobes); - -int register_jprobe(struct jprobe *jp) -{ - unsigned long addr, offset; - struct kprobe *kp = &jp->kp; - - /* - * Verify probepoint as well as the jprobe handler are - * valid function entry points. - */ - addr = arch_deref_entry_point(jp->entry); - - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 && - kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) { - kp->pre_handler = setjmp_pre_handler; - kp->break_handler = longjmp_break_handler; - return register_kprobe(kp); - } - - return -EINVAL; -} -EXPORT_SYMBOL_GPL(register_jprobe); - -void unregister_jprobe(struct jprobe *jp) -{ - unregister_jprobes(&jp, 1); -} -EXPORT_SYMBOL_GPL(unregister_jprobe); - -void unregister_jprobes(struct jprobe **jps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(&jps[i]->kp) < 0) - jps[i]->kp.addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) { - if (jps[i]->kp.addr) - __unregister_kprobe_bottom(&jps[i]->kp); - } -} -EXPORT_SYMBOL_GPL(unregister_jprobes); -#endif - #ifdef CONFIG_KRETPROBES /* * This kprobe pre_handler is registered with every kretprobe. When probe @@ -1982,7 +1884,6 @@ int register_kretprobe(struct kretprobe *rp) rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; rp->kp.fault_handler = NULL; - rp->kp.break_handler = NULL; /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { @@ -2105,7 +2006,6 @@ static void kill_kprobe(struct kprobe *p) list_for_each_entry_rcu(kp, &p->list, list) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; - p->break_handler = NULL; kill_optimized_kprobe(p); } /* @@ -2169,11 +2069,12 @@ out: } EXPORT_SYMBOL_GPL(enable_kprobe); +/* Caller must NOT call this in usual path. This is only for critical case */ void dump_kprobe(struct kprobe *kp) { - printk(KERN_WARNING "Dumping kprobe:\n"); - printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", - kp->symbol_name, kp->addr, kp->offset); + pr_err("Dumping kprobe:\n"); + pr_err("Name: %s\nOffset: %x\nAddress: %pS\n", + kp->symbol_name, kp->offset, kp->addr); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2196,11 +2097,8 @@ static int __init populate_kprobe_blacklist(unsigned long *start, entry = arch_deref_entry_point((void *)*iter); if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) { - pr_err("Failed to find blacklist at %p\n", - (void *)entry); + !kallsyms_lookup_size_offset(entry, &size, &offset)) continue; - } ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) @@ -2326,21 +2224,23 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; + void *addr = p->addr; if (p->pre_handler == pre_handler_kretprobe) kprobe_type = "r"; - else if (p->pre_handler == setjmp_pre_handler) - kprobe_type = "j"; else kprobe_type = "k"; + if (!kallsyms_show_value()) + addr = NULL; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s ", - p->addr, kprobe_type, sym, offset, + seq_printf(pi, "%px %s %s+0x%x %s ", + addr, kprobe_type, sym, offset, (modname ? modname : " ")); - else - seq_printf(pi, "%p %s %p ", - p->addr, kprobe_type, p->addr); + else /* try to use %pS */ + seq_printf(pi, "%px %s %pS ", + addr, kprobe_type, p->addr); if (!pp) pp = p; @@ -2428,8 +2328,16 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); - seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, - (void *)ent->end_addr, (void *)ent->start_addr); + /* + * If /proc/kallsyms is not showing kernel address, we won't + * show them here either. + */ + if (!kallsyms_show_value()) + seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, + (void *)ent->start_addr); + else + seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, + (void *)ent->end_addr, (void *)ent->start_addr); return 0; } @@ -2611,7 +2519,7 @@ static int __init debugfs_kprobe_init(void) if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir, NULL, + file = debugfs_create_file("list", 0400, dir, NULL, &debugfs_kprobes_operations); if (!file) goto error; @@ -2621,7 +2529,7 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; - file = debugfs_create_file("blacklist", 0444, dir, NULL, + file = debugfs_create_file("blacklist", 0400, dir, NULL, &debugfs_kprobe_blacklist_ops); if (!file) goto error; @@ -2637,6 +2545,3 @@ late_initcall(debugfs_kprobe_init); #endif /* CONFIG_DEBUG_FS */ module_init(init_kprobes); - -/* defined in arch/.../kernel/kprobes.c */ -EXPORT_SYMBOL_GPL(jprobe_return); diff --git a/kernel/kthread.c b/kernel/kthread.c index 481951bf091d..087d18d771b5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { for (;;) { - set_current_state(TASK_PARKED); + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; + + complete(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -191,11 +202,6 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -void kthread_park_complete(struct task_struct *k) -{ - complete_all(&to_kthread(k)->parked); -} - static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -319,8 +325,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; - vsnprintf(task->comm, sizeof(task->comm), namefmt, args); + /* + * task is already visible to other tasks, so updating + * COMM must be protected. + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. @@ -459,8 +471,10 @@ void kthread_unpark(struct task_struct *k) if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); - reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. + */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -484,10 +498,22 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; + if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) + return -EBUSY; + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); + /* + * Wait for __kthread_parkme() to complete(), this means we + * _will_ have TASK_PARKED and are about to call schedule(). + */ wait_for_completion(&kthread->parked); + /* + * Now wait for that schedule() to complete and the task to + * get scheduled out. + */ + WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3a4656fb7047..5b77a7314e01 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -678,6 +678,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) if (!func->old_name || !func->new_func) return -EINVAL; + if (strlen(func->old_name) >= KSYM_NAME_LEN) + return -EINVAL; + INIT_LIST_HEAD(&func->stack_node); func->patched = false; func->transition = false; @@ -751,6 +754,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) if (!obj->funcs) return -EINVAL; + if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) + return -EINVAL; + obj->patched = false; obj->mod = NULL; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 7c6631e693bc..5bc349805e03 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -310,13 +310,6 @@ static bool klp_try_switch_task(struct task_struct *task) return true; /* - * For arches which don't have reliable stack traces, we have to rely - * on other methods (e.g., switching tasks at kernel exit). - */ - if (!klp_have_reliable_stack()) - return false; - - /* * Now try to check the stack for any to-be-patched or to-be-unpatched * functions. If all goes well, switch the task to the target patch * state. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5fa4d3138bf1..e406c5fdb41e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include "lockdep_internals.h" +#include <trace/events/preemptirq.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -248,12 +249,7 @@ void clear_lock_stats(struct lock_class *class) static struct lock_class_stats *get_lock_stats(struct lock_class *class) { - return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; -} - -static void put_lock_stats(struct lock_class_stats *stats) -{ - put_cpu_var(cpu_lock_stats); + return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes]; } static void lock_release_holdtime(struct held_lock *hlock) @@ -271,7 +267,6 @@ static void lock_release_holdtime(struct held_lock *hlock) lock_time_inc(&stats->read_holdtime, holdtime); else lock_time_inc(&stats->write_holdtime, holdtime); - put_lock_stats(stats); } #else static inline void lock_release_holdtime(struct held_lock *hlock) @@ -2845,10 +2840,8 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -__visible void trace_hardirqs_on_caller(unsigned long ip) +void lockdep_hardirqs_on(unsigned long ip) { - time_hardirqs_on(CALLER_ADDR0, ip); - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2887,23 +2880,14 @@ __visible void trace_hardirqs_on_caller(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_on(void) -{ - trace_hardirqs_on_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -__visible void trace_hardirqs_off_caller(unsigned long ip) +void lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - time_hardirqs_off(CALLER_ADDR0, ip); - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2925,13 +2909,6 @@ __visible void trace_hardirqs_off_caller(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -void trace_hardirqs_off(void) -{ - trace_hardirqs_off_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_off); /* * Softirqs will be enabled: @@ -4090,7 +4067,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; - put_lock_stats(stats); } static void @@ -4138,7 +4114,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) } if (lock->cpu != cpu) stats->bounces[bounce_acquired + !!hlock->read]++; - put_lock_stats(stats); lock->cpu = cpu; lock->ip = ip; @@ -4338,7 +4313,7 @@ out_restore: raw_local_irq_restore(flags); } -void __init lockdep_info(void) +void __init lockdep_init(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8402b3349dca..7d0b0ed74404 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -21,6 +21,9 @@ * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ + +#define pr_fmt(fmt) fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/kthread.h> @@ -57,7 +60,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "spin_lock"; @@ -365,7 +368,7 @@ static struct lock_torture_ops mutex_lock_ops = { }; #include <linux/ww_mutex.h> -static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WD_CLASS(torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index f44f658ae629..1a81a1257b3f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -174,6 +174,21 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait } /* + * Add @waiter to a given location in the lock wait_list and set the + * FLAG_WAITERS flag if it's the first waiter. + */ +static void __sched +__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct list_head *list) +{ + debug_mutex_add_waiter(lock, waiter, current); + + list_add_tail(&waiter->list, list); + if (__mutex_waiter_is_first(lock, waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); +} + +/* * Give up ownership to a specific task, when @task = NULL, this is equivalent * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves * WAITERS. Provides RELEASE semantics like a regular unlock, the @@ -244,6 +259,22 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +/* + * Wait-Die: + * The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { @@ -282,26 +313,108 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); #endif ww_ctx->acquired++; + ww->ctx = ww_ctx; } +/* + * Determine if context @a is 'after' context @b. IOW, @a is a younger + * transaction than @b and depending on algorithm either needs to wait for + * @b or die. + */ static inline bool __sched __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) { - return a->stamp - b->stamp <= LONG_MAX && - (a->stamp != b->stamp || a > b); + + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a younger waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool __sched +__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx->is_wait_die) + return false; + + if (waiter->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, waiter); + wake_up_process(waiter->task); + } + + return true; +} + +/* + * Wound-Wait; wound a younger @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with older transactions than + * the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx) +{ + struct task_struct *owner = __mutex_owner(lock); + + lockdep_assert_held(&lock->wait_lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). + */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_lock_check_stamp() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) + wake_up_process(owner); + + return true; + } + + return false; } /* - * Wake up any waiters that may have to back off when the lock is held by the - * given context. + * We just acquired @lock under @ww_ctx, if there are later contexts waiting + * behind us on the wait-list, check if they need to die, or wound us. * - * Due to the invariants on the wait list, this can only affect the first - * waiter with a context. + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. + * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. * * The current task must not be on the wait list. */ static void __sched -__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) { struct mutex_waiter *cur; @@ -311,66 +424,51 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (cur->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } - - break; + if (__ww_mutex_die(lock, cur, ww_ctx) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) + break; } } /* - * After acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; - /* * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be + * the WAITERS check is done, otherwise contended waiters might be * missed. The contended waiters will either see ww_ctx == NULL * and keep spinning, or it will acquire wait_lock, add itself * to waiter list and sleep. */ - smp_mb(); /* ^^^ */ + smp_mb(); /* See comments above and below. */ /* - * Check if lock is contended, if not there is nobody to wake up + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. */ if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die or wound us. */ spin_lock(&lock->base.wait_lock); - __ww_mutex_wakeup_for_backoff(&lock->base, ctx); + __ww_mutex_check_waiters(&lock->base, ctx); spin_unlock(&lock->base.wait_lock); } -/* - * After acquiring lock in the slowpath set ctx. - * - * Unlike for the fast path, the caller ensures that waiters are woken up where - * necessary. - * - * Callers must hold the mutex wait_lock. - */ -static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; -} - #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline @@ -646,37 +744,83 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) } EXPORT_SYMBOL(ww_mutex_unlock); + +static __always_inline int __sched +__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + + +/* + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. + */ static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) +__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); struct mutex_waiter *cur; + if (ctx->acquired == 0) + return 0; + + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - goto deadlock; + return __ww_mutex_kill(lock, ctx); /* * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must back off. + * stamp is earlier than ours and we must kill ourself. */ cur = waiter; list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (cur->ww_ctx) - goto deadlock; + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); } return 0; - -deadlock: -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; } +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. + */ static inline int __sched __ww_mutex_add_waiter(struct mutex_waiter *waiter, struct mutex *lock, @@ -684,16 +828,21 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, { struct mutex_waiter *cur; struct list_head *pos; + bool is_wait_die; if (!ww_ctx) { - list_add_tail(&waiter->list, &lock->wait_list); + __mutex_add_waiter(lock, waiter, &lock->wait_list); return 0; } + is_wait_die = ww_ctx->is_wait_die; + /* * Add the waiter before the first waiter with a higher stamp. * Waiters without a context are skipped to avoid starving - * them. + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. */ pos = &lock->wait_list; list_for_each_entry_reverse(cur, &lock->wait_list, list) { @@ -701,16 +850,16 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, continue; if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* Back off immediately if necessary. */ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); + + if (ret) + return ret; } break; @@ -718,17 +867,28 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, pos = &cur->list; + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx); + } + + __mutex_add_waiter(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + /* - * Wake up the waiter so that it gets a chance to back - * off. + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. */ - if (cur->ww_ctx->acquired > 0) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx); } - list_add_tail(&waiter->list, pos); return 0; } @@ -751,6 +911,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; } preempt_disable(); @@ -772,7 +940,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ if (__mutex_trylock(lock)) { if (use_ww_ctx && ww_ctx) - __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx); goto skip_wait; } @@ -784,25 +952,26 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, &lock->wait_list); + #ifdef CONFIG_DEBUG_MUTEXES waiter.ww_ctx = MUTEX_POISON_WW_CTX; #endif } else { - /* Add in stamp order, waking up waiters that must back off. */ + /* + * Add in stamp order, waking up waiters that must kill + * themselves. + */ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); if (ret) - goto err_early_backoff; + goto err_early_kill; waiter.ww_ctx = ww_ctx; } waiter.task = current; - if (__mutex_waiter_is_first(lock, &waiter)) - __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - set_current_state(state); for (;;) { /* @@ -815,7 +984,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto acquired; /* - * Check for signals and wound conditions while holding + * Check for signals and kill conditions while holding * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ @@ -824,8 +993,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); + if (use_ww_ctx && ww_ctx) { + ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx); if (ret) goto err; } @@ -859,6 +1028,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, acquired: __set_current_state(TASK_RUNNING); + if (use_ww_ctx && ww_ctx) { + /* + * Wound-Wait; we stole the lock (!first_waiter), check the + * waiters as anyone might want to wound us. + */ + if (!ww_ctx->is_wait_die && + !__mutex_waiter_is_first(lock, &waiter)) + __ww_mutex_check_waiters(lock, ww_ctx); + } + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -870,7 +1049,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); if (use_ww_ctx && ww_ctx) - ww_mutex_set_context_slowpath(ww, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); spin_unlock(&lock->wait_lock); preempt_enable(); @@ -879,7 +1058,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); -err_early_backoff: +err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4f014be7a4b8..2823d4163a37 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1465,6 +1465,29 @@ rt_mutex_fastunlock(struct rt_mutex *lock, rt_mutex_postunlock(&wake_q); } +static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) +{ + might_sleep(); + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock(lock, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +#endif + +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * rt_mutex_lock - lock a rt_mutex * @@ -1472,12 +1495,10 @@ rt_mutex_fastunlock(struct rt_mutex *lock, */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); + __rt_mutex_lock(lock, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 0e4cd64ad2c0..5b915b370d5a 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -26,7 +26,7 @@ #include <linux/slab.h> #include <linux/ww_mutex.h> -static DEFINE_WW_CLASS(ww_class); +static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; struct test_mutex { diff --git a/kernel/memremap.c b/kernel/memremap.c index 5857267a4af5..5b8600d39931 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/pfn_t.h> #include <linux/io.h> +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> #include <linux/swap.h> @@ -42,7 +43,7 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) pgoff += 1UL << order, order = order_at((res), pgoff)) #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -int device_private_entry_fault(struct vm_area_struct *vma, +vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, swp_entry_t entry, unsigned int flags, @@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_begin(); arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? &pgmap->altmap : NULL); + kasan_remove_zero_shadow(__va(align_start), align_size); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); @@ -176,10 +178,27 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; + align_end = align_start + align_size - 1; + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + is_ram = region_intersects(align_start, align_size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); @@ -199,7 +218,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) mutex_lock(&pgmap_lock); error = 0; - align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { error = __radix_tree_insert(&pgmap_radix, @@ -223,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_pfn_remap; mem_hotplug_begin(); + error = kasan_add_zero_shadow(__va(align_start), align_size); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], @@ -251,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return __va(res->start); err_add_memory: + kasan_remove_zero_shadow(__va(align_start), align_size); + err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: @@ -305,7 +331,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL_GPL(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); static atomic_t devmap_enable; /* @@ -339,12 +365,11 @@ void __put_devmap_managed_page(struct page *page) __ClearPageActive(page); __ClearPageWaiters(page); - page->mapping = NULL; mem_cgroup_uncharge(page); page->pgmap->page_free(page, page->pgmap->data); } else if (!count) __put_page(page); } -EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 915e123a430f..79c9be2dbbe9 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,4 +9,27 @@ * 2 of the Licence, or (at your option) any later version. */ -extern int mod_verify_sig(const void *mod, unsigned long *_modlen); +#include <linux/elf.h> +#include <asm/module.h> + +struct load_info { + const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +extern int mod_verify_sig(const void *mod, struct load_info *info); diff --git a/kernel/module.c b/kernel/module.c index f475f30eed8c..6746c85511fe 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -307,24 +307,6 @@ int unregister_module_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_module_notifier); -struct load_info { - const char *name; - Elf_Ehdr *hdr; - unsigned long len; - Elf_Shdr *sechdrs; - char *secstrings, *strtab; - unsigned long symoffs, stroffs; - struct _ddebug *debug; - unsigned int num_debug; - bool sig_ok; -#ifdef CONFIG_KALLSYMS - unsigned long mod_kallsyms_init_off; -#endif - struct { - unsigned int sym, str, mod, vers, info, pcpu; - } index; -}; - /* * We require a truly strong try_module_get(): 0 means success. * Otherwise an error is returned due to ongoing or failed @@ -547,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms, return true; } +static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +static const char *kernel_symbol_name(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return offset_to_ptr(&sym->name_offset); +#else + return sym->name; +#endif +} + static int cmp_name(const void *va, const void *vb) { const char *a; const struct kernel_symbol *b; a = va; b = vb; - return strcmp(a, b->name); + return strcmp(a, kernel_symbol_name(b)); } static bool find_symbol_in_section(const struct symsearch *syms, @@ -1339,14 +1339,12 @@ static inline int check_modstruct_version(const struct load_info *info, * locking is necessary -- use preempt_disable() to placate lockdep. */ preempt_disable(); - if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, - &crc, true, false)) { + if (!find_symbol("module_layout", NULL, &crc, true, false)) { preempt_enable(); BUG(); } preempt_enable(); - return check_version(info, VMLINUX_SYMBOL_STR(module_layout), - mod, crc); + return check_version(info, "module_layout", mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -2059,21 +2057,19 @@ static int copy_module_elf(struct module *mod, struct load_info *info) /* Elf section header table */ size = sizeof(*info->sechdrs) * info->hdr->e_shnum; - mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL); + mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); if (mod->klp_info->sechdrs == NULL) { ret = -ENOMEM; goto free_info; } - memcpy(mod->klp_info->sechdrs, info->sechdrs, size); /* Elf section name string table */ size = info->sechdrs[info->hdr->e_shstrndx].sh_size; - mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL); + mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); if (mod->klp_info->secstrings == NULL) { ret = -ENOMEM; goto free_sechdrs; } - memcpy(mod->klp_info->secstrings, info->secstrings, size); /* Elf symbol section index */ symndx = info->index.sym; @@ -2192,7 +2188,7 @@ void *__symbol_get(const char *symbol) sym = NULL; preempt_enable(); - return sym ? (void *)sym->value : NULL; + return sym ? (void *)kernel_symbol_value(sym) : NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2222,10 +2218,12 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + if (find_symbol(kernel_symbol_name(s), &owner, NULL, + true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", - mod->name, s->name, module_name(owner)); + mod->name, kernel_symbol_name(s), + module_name(owner)); return -ENOEXEC; } } @@ -2274,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { - sym[i].st_value = ksym->value; + sym[i].st_value = kernel_symbol_value(ksym); break; } @@ -2282,9 +2280,9 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - pr_warn("%s: Unknown symbol %s (err %li)\n", - mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; + pr_warn("%s: Unknown symbol %s (err %d)\n", + mod->name, name, ret); break; default: @@ -2486,7 +2484,11 @@ static char *get_modinfo(struct load_info *info, const char *tag) Elf_Shdr *infosec = &info->sechdrs[info->index.info]; unsigned long size = infosec->sh_size; - for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { + /* + * get_modinfo() calls made before rewrite_section_headers() + * must use sh_offset, as sh_addr isn't set! + */ + for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } @@ -2534,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value, ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); - return ks != NULL && ks->value == value; + return ks != NULL && kernel_symbol_value(ks) == value; } /* As per nm */ @@ -2774,7 +2776,7 @@ static int module_sig_check(struct load_info *info, int flags) memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; - err = mod_verify_sig(mod, &info->len); + err = mod_verify_sig(mod, info); } if (!err) { @@ -2876,7 +2878,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_read_file(NULL, READING_MODULE); + err = security_kernel_load_data(LOADING_MODULE); if (err) return err; @@ -2926,17 +2928,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) } /* Track but don't keep modinfo and version sections. */ - if (flags & MODULE_INIT_IGNORE_MODVERSIONS) - info->index.vers = 0; /* Pretend no __versions section! */ - else - info->index.vers = find_sec(info, "__versions"); info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; - - info->index.info = find_sec(info, ".modinfo"); - if (!info->index.info) - info->name = "(missing .modinfo section)"; - else - info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; return 0; @@ -2947,23 +2939,24 @@ static int rewrite_section_headers(struct load_info *info, int flags) * search for module section index etc), and do some basic section * verification. * - * Return the temporary module pointer (we'll replace it with the final - * one when we move the module sections around). + * Set info->mod to the temporary copy of the module in info->hdr. The final one + * will be allocated in move_module(). */ -static struct module *setup_load_info(struct load_info *info, int flags) +static int setup_load_info(struct load_info *info, int flags) { unsigned int i; - int err; - struct module *mod; /* Set up the convenience variables */ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info, flags); - if (err) - return ERR_PTR(err); + /* Try to find a name early so we can log errors with a module name */ + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); /* Find internal symbols and strings. */ for (i = 1; i < info->hdr->e_shnum; i++) { @@ -2976,34 +2969,35 @@ static struct module *setup_load_info(struct load_info *info, int flags) } } + if (info->index.sym == 0) { + pr_warn("%s: module has no symbols (stripped?)\n", info->name); + return -ENOEXEC; + } + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { pr_warn("%s: No module found in object\n", info->name ?: "(missing .modinfo name field)"); - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; } /* This is temporary: point mod into copy of data. */ - mod = (void *)info->sechdrs[info->index.mod].sh_addr; + info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset; /* - * If we didn't load the .modinfo 'name' field, fall back to + * If we didn't load the .modinfo 'name' field earlier, fall back to * on-disk struct mod 'name' field. */ if (!info->name) - info->name = mod->name; + info->name = info->mod->name; - if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", info->name); - return ERR_PTR(-ENOEXEC); - } + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.pcpu = find_pcpusec(info); - /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info, mod)) - return ERR_PTR(-ENOEXEC); - - return mod; + return 0; } static int check_modinfo(struct module *mod, struct load_info *info, int flags) @@ -3298,25 +3292,17 @@ core_param(module_blacklist, module_blacklist, charp, 0400); static struct module *layout_and_allocate(struct load_info *info, int flags) { - /* Module within temporary copy. */ struct module *mod; unsigned int ndx; int err; - mod = setup_load_info(info, flags); - if (IS_ERR(mod)) - return mod; - - if (blacklisted(info->name)) - return ERR_PTR(-EPERM); - - err = check_modinfo(mod, info, flags); + err = check_modinfo(info->mod, info, flags); if (err) return ERR_PTR(err); /* Allow arches to frob section contents and sizes. */ err = module_frob_arch_sections(info->hdr, info->sechdrs, - info->secstrings, mod); + info->secstrings, info->mod); if (err < 0) return ERR_PTR(err); @@ -3335,11 +3321,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ - layout_sections(mod, info); - layout_symtab(mod, info); + layout_sections(info->mod, info); + layout_symtab(info->mod, info); /* Allocate and move to the final place */ - err = move_module(mod, info); + err = move_module(info->mod, info); if (err) return ERR_PTR(err); @@ -3657,17 +3643,36 @@ static int load_module(struct load_info *info, const char __user *uargs, int flags) { struct module *mod; - long err; + long err = 0; char *after_dashes; + err = elf_header_check(info); + if (err) + goto free_copy; + + err = setup_load_info(info, flags); + if (err) + goto free_copy; + + if (blacklisted(info->name)) { + err = -EPERM; + goto free_copy; + } + err = module_sig_check(info, flags); if (err) goto free_copy; - err = elf_header_check(info); + err = rewrite_section_headers(info, flags); if (err) goto free_copy; + /* Check module struct version now, before we try to use module. */ + if (!check_modstruct_version(info, info->mod)) { + err = -ENOEXEC; + goto free_copy; + } + /* Figure out module layout, and allocate all the memory. */ mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { @@ -4067,7 +4072,7 @@ static unsigned long mod_find_symname(struct module *mod, const char *name) for (i = 0; i < kallsyms->num_symtab; i++) if (strcmp(name, symname(kallsyms, i)) == 0 && - kallsyms->symtab[i].st_info != 'U') + kallsyms->symtab[i].st_shndx != SHN_UNDEF) return kallsyms->symtab[i].st_value; return 0; } @@ -4113,6 +4118,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (mod->state == MODULE_STATE_UNFORMED) continue; for (i = 0; i < kallsyms->num_symtab; i++) { + + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + continue; + ret = fn(data, symname(kallsyms, i), mod, kallsyms->symtab[i].st_value); if (ret != 0) diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 937c844bee4a..f2075ce8e4b3 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -45,10 +45,10 @@ struct module_signature { /* * Verify the signature on a module. */ -int mod_verify_sig(const void *mod, unsigned long *_modlen) +int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; - size_t modlen = *_modlen, sig_len; + size_t sig_len, modlen = info->len; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -62,10 +62,11 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - *_modlen = modlen; + info->len = modlen; if (ms.id_type != PKEY_ID_PKCS7) { - pr_err("Module is not signed with expected PKCS#7 message\n"); + pr_err("%s: Module is not signed with expected PKCS#7 message\n", + info->name); return -ENOPKG; } @@ -76,7 +77,8 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) ms.__pad[0] != 0 || ms.__pad[1] != 0 || ms.__pad[2] != 0) { - pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", + info->name); return -EBADMSG; } diff --git a/kernel/pid.c b/kernel/pid.c index 157fe4b19971..de1cfc4f75a2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -265,27 +265,33 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); +static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) +{ + return (type == PIDTYPE_PID) ? + &task->thread_pid : + &task->signal->pids[type]; +} + /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid_link *link = &task->pids[type]; - hlist_add_head_rcu(&link->node, &link->pid->tasks[type]); + struct pid *pid = *task_pid_ptr(task, type); + hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { - struct pid_link *link; + struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; - link = &task->pids[type]; - pid = link->pid; + pid = *pid_ptr; - hlist_del_rcu(&link->node); - link->pid = new; + hlist_del_rcu(&task->pid_links[type]); + *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (!hlist_empty(&pid->tasks[tmp])) @@ -310,8 +316,9 @@ void change_pid(struct task_struct *task, enum pid_type type, void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - new->pids[type].pid = old->pids[type].pid; - hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); + if (type == PIDTYPE_PID) + new->thread_pid = old->thread_pid; + hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) @@ -322,7 +329,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) - result = hlist_entry(first, struct task_struct, pids[(type)].node); + result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } @@ -360,9 +367,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); - if (type != PIDTYPE_PID) - task = task->group_leader; - pid = get_pid(rcu_dereference(task->pids[type].pid)); + pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } @@ -420,15 +425,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) { - if (type == __PIDTYPE_TGID) - type = PIDTYPE_PID; - - task = task->group_leader; - } - nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); - } + if (likely(pid_alive(task))) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e880ca22c5a5..3a6c2f87699e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -105,6 +105,7 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM + select SRCU config PM_SLEEP_SMP def_bool y diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 9c85c7822383..abef759de7c8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -338,7 +338,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. + * This routine must be called with system_transition_mutex held. */ int hibernation_snapshot(int platform_mode) { @@ -500,8 +500,9 @@ static int resume_target_kernel(bool platform_mode) * hibernation_restore - Quiesce devices and restore from a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snapshot(). + * This routine must be called with system_transition_mutex held. If it is + * successful, control reappears in the restored target kernel in + * hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { @@ -638,6 +639,7 @@ static void power_down(void) break; case HIBERNATION_PLATFORM: hibernation_platform_enter(); + /* Fall through */ case HIBERNATION_SHUTDOWN: if (pm_power_off) kernel_power_off(); @@ -805,13 +807,13 @@ static int software_resume(void) * name_to_dev_t() below takes a sysfs buffer mutex when sysfs * is configured into the kernel. Since the regular hibernate * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock + * calling hibernate functions (which take system_transition_mutex) + * this can cause lockdep to complain about a possible ABBA deadlock * which cannot happen since we're in the boot code here and * sysfs can't be invoked yet. Therefore, we use a subclass * here to avoid lockdep complaining. */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); if (swsusp_resume_device) goto Check_image; @@ -899,7 +901,7 @@ static int software_resume(void) atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ Unlock: - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: diff --git a/kernel/power/main.c b/kernel/power/main.c index d9706da10930..35b50823d83b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,17 +15,16 @@ #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/suspend.h> #include "power.h" -DEFINE_MUTEX(pm_mutex); - #ifdef CONFIG_PM_SLEEP void lock_system_sleep(void) { current->flags |= PF_FREEZER_SKIP; - mutex_lock(&pm_mutex); + mutex_lock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(lock_system_sleep); @@ -37,8 +36,9 @@ void unlock_system_sleep(void) * * Reason: * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the pm_mutex lock, - * since the freezer always works with pm_mutex held. + * doesn't come into effect until we release the + * system_transition_mutex lock, since the freezer always works with + * system_transition_mutex held. * * More importantly, in the case of hibernation, * unlock_system_sleep() gets called in snapshot_read() and @@ -47,7 +47,7 @@ void unlock_system_sleep(void) * enter the refrigerator, thus causing hibernation to lockup. */ current->flags &= ~PF_FREEZER_SKIP; - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 87331565e505..5342f6fc022e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -92,7 +92,7 @@ static void s2idle_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - swait_event(s2idle_wait_head, + swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); @@ -160,7 +160,7 @@ void s2idle_wake(void) raw_spin_lock_irqsave(&s2idle_lock, flags); if (s2idle_state > S2IDLE_STATE_NONE) { s2idle_state = S2IDLE_STATE_WAKE; - swake_up(&s2idle_wait_head); + swake_up_one(&s2idle_wait_head); } raw_spin_unlock_irqrestore(&s2idle_lock, flags); } @@ -556,7 +556,7 @@ static int enter_state(suspend_state_t state) } else if (!valid_state(state)) { return -EINVAL; } - if (!mutex_trylock(&pm_mutex)) + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; if (state == PM_SUSPEND_TO_IDLE) @@ -590,7 +590,7 @@ static int enter_state(suspend_state_t state) pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index c2bcf97d24c8..d7f6c1a288d3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -923,7 +923,7 @@ int swsusp_write(unsigned int flags) } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); - if (error < PAGE_SIZE) { + if (error < (int)PAGE_SIZE) { if (error >= 0) error = -EFAULT; @@ -1483,7 +1483,7 @@ int swsusp_read(unsigned int *flags_p) memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot); - if (error < PAGE_SIZE) + if (error < (int)PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); error = get_swap_reader(&handle, flags_p); diff --git a/kernel/power/user.c b/kernel/power/user.c index abd225550271..2d8b60a3c86b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -216,7 +216,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!mutex_trylock(&pm_mutex)) + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; lock_device_hotplug(); @@ -394,7 +394,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } unlock_device_hotplug(); - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2a7d04049af4..0f1898820cba 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -19,11 +19,16 @@ #ifdef CONFIG_PRINTK #define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 #define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; +__printf(5, 0) +int vprintk_store(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args); + __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); __printf(1, 0) int vprintk_func(const char *fmt, va_list args); @@ -54,6 +59,8 @@ void __printk_safe_exit(void); local_irq_enable(); \ } while (0) +void defer_console_output(void); + #else __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..924e37fb1620 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -66,6 +66,9 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); +EXPORT_SYMBOL(ignore_console_lock_warning); + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -349,7 +352,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; */ enum log_flags { - LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NOCONS = 1, /* suppress print, do not print to console */ LOG_NEWLINE = 2, /* text ended with a newline */ LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ @@ -1352,71 +1355,68 @@ static int syslog_print_all(char __user *buf, int size, bool clear) { char *text; int len = 0; + u64 next_seq; + u64 seq; + u32 idx; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; logbuf_lock_irq(); - if (buf) { - u64 next_seq; - u64 seq; - u32 idx; + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. + */ + seq = clear_seq; + idx = clear_idx; + while (seq < log_next_seq) { + struct printk_log *msg = log_from_idx(idx); - /* - * Find first record that fits, including all following records, - * into the user-provided buffer for this dump. - */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, NULL, 0); - idx = log_next(idx); - seq++; - } + len += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } - /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); + /* move first record forward until length fits into the buffer */ + seq = clear_seq; + idx = clear_idx; + while (len > size && seq < log_next_seq) { + struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); - idx = log_next(idx); - seq++; - } + len -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } - /* last message fitting into this dump */ - next_seq = log_next_seq; + /* last message fitting into this dump */ + next_seq = log_next_seq; - len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen; + len = 0; + while (len >= 0 && seq < next_seq) { + struct printk_log *msg = log_from_idx(idx); + int textlen; - textlen = msg_print_text(msg, true, text, - LOG_LINE_MAX + PREFIX_MAX); - if (textlen < 0) { - len = textlen; - break; - } - idx = log_next(idx); - seq++; + textlen = msg_print_text(msg, true, text, + LOG_LINE_MAX + PREFIX_MAX); + if (textlen < 0) { + len = textlen; + break; + } + idx = log_next(idx); + seq++; - logbuf_unlock_irq(); - if (copy_to_user(buf + len, text, textlen)) - len = -EFAULT; - else - len += textlen; - logbuf_lock_irq(); - - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + logbuf_unlock_irq(); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; + logbuf_lock_irq(); + + if (seq < log_first_seq) { + /* messages are gone, move to next one */ + seq = log_first_seq; + idx = log_first_idx; } } @@ -1430,6 +1430,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return len; } +static void syslog_clear(void) +{ + logbuf_lock_irq(); + clear_seq = log_next_seq; + clear_idx = log_next_idx; + logbuf_unlock_irq(); +} + int do_syslog(int type, char __user *buf, int len, int source) { bool clear = false; @@ -1474,7 +1482,7 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: - syslog_print_all(NULL, 0, true); + syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: @@ -1824,28 +1832,16 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); } -asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, va_list args) +/* Must be called under logbuf_lock. */ +int vprintk_store(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; enum log_flags lflags = 0; - unsigned long flags; - int printed_len; - bool in_sched = false; - - if (level == LOGLEVEL_SCHED) { - level = LOGLEVEL_DEFAULT; - in_sched = true; - } - boot_delay_msec(level); - printk_delay(); - - /* This stops the holder of console_sem just where we want him */ - logbuf_lock_irqsave(flags); /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1886,8 +1882,32 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); + if (suppress_message_printing(level)) + lflags |= LOG_NOCONS; + return log_output(facility, level, lflags, + dict, dictlen, text, text_len); +} + +asmlinkage int vprintk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) +{ + int printed_len; + bool in_sched = false; + unsigned long flags; + + if (level == LOGLEVEL_SCHED) { + level = LOGLEVEL_DEFAULT; + in_sched = true; + } + + boot_delay_msec(level); + printk_delay(); + + /* This stops the holder of console_sem just where we want him */ + logbuf_lock_irqsave(flags); + printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ @@ -2013,7 +2033,6 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } -static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2243,6 +2262,7 @@ int is_console_locked(void) { return console_locked; } +EXPORT_SYMBOL(is_console_locked); /* * Check if we have any console that is capable of printing while cpu is @@ -2349,11 +2369,10 @@ skip: break; msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (msg->flags & LOG_NOCONS) { /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. + * Skip record if !ignore_loglevel, and + * record has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; @@ -2772,7 +2791,8 @@ EXPORT_SYMBOL(unregister_console); void __init console_init(void) { int ret; - initcall_t *call; + initcall_t call; + initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); @@ -2781,13 +2801,14 @@ void __init console_init(void) * set up the console device so that later boot sequences can * inform about problems etc.. */ - call = __con_initcall_start; + ce = __con_initcall_start; trace_initcall_level("console"); - while (call < __con_initcall_end) { - trace_initcall_start((*call)); - ret = (*call)(); - trace_initcall_finish((*call), ret); - call++; + while (ce < __con_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } @@ -2878,16 +2899,20 @@ void wake_up_klogd(void) preempt_enable(); } -int vprintk_deferred(const char *fmt, va_list args) +void defer_console_output(void) { - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); - preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); +} + +int vprintk_deferred(const char *fmt, va_list args) +{ + int r; + + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + defer_console_output(); return r; } diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d7d091309054..a0a74c533e4b 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -308,24 +308,33 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) void printk_nmi_enter(void) { - /* - * The size of the extra per-CPU buffer is limited. Use it only when - * the main one is locked. If this CPU is not in the safe context, - * the lock must be taken on another CPU and we could wait for it. - */ - if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && - raw_spin_is_locked(&logbuf_lock)) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); - } else { - this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); - } + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); } void printk_nmi_exit(void) { - this_cpu_and(printk_context, - ~(PRINTK_NMI_CONTEXT_MASK | - PRINTK_NMI_DEFERRED_CONTEXT_MASK)); + this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +} + +/* + * Marks a code that might produce many messages in NMI context + * and the risk of losing them is more critical than eventual + * reordering. + * + * It has effect only when called in NMI context. Then printk() + * will try to store the messages into the main logbuf directly + * and use the per-CPU buffers only as a fallback when the lock + * is not available. + */ +void printk_nmi_direct_enter(void) +{ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); +} + +void printk_nmi_direct_exit(void) +{ + this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); } #else @@ -363,6 +372,20 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { + /* + * Try to use the main logbuf even in NMI. But avoid calling console + * drivers that might have their own locks. + */ + if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && + raw_spin_trylock(&logbuf_lock)) { + int len; + + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + raw_spin_unlock(&logbuf_lock); + defer_console_output(); + return len; + } + /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) return vprintk_nmi(fmt, args); @@ -371,13 +394,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) return vprintk_safe(fmt, args); - /* - * Use the main logbuf when logbuf_lock is available in NMI. - * But avoid calling console drivers that might have their own locks. - */ - if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) - return vprintk_deferred(fmt, args); - /* No obstacles. */ return vprintk_default(fmt, args); } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 40cea6735c2d..4d04683c31b2 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -91,7 +91,17 @@ static inline void rcu_seq_end(unsigned long *sp) WRITE_ONCE(*sp, rcu_seq_endval(sp)); } -/* Take a snapshot of the update side's sequence number. */ +/* + * rcu_seq_snap - Take a snapshot of the update side's sequence number. + * + * This function returns the earliest value of the grace-period sequence number + * that will indicate that a full grace period has elapsed since the current + * time. Once the grace-period sequence number has reached this value, it will + * be safe to invoke all callbacks that have been registered prior to the + * current time. This value is the current grace-period number plus two to the + * power of the number of low-order bits reserved for state, then rounded up to + * the next value in which the state bits are all zero. + */ static inline unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; @@ -108,6 +118,15 @@ static inline unsigned long rcu_seq_current(unsigned long *sp) } /* + * Given a snapshot from rcu_seq_snap(), determine whether or not the + * corresponding update-side operation has started. + */ +static inline bool rcu_seq_started(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_LT((s - 1) & ~RCU_SEQ_STATE_MASK, READ_ONCE(*sp)); +} + +/* * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred. */ @@ -117,6 +136,45 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) } /* + * Has a grace period completed since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT(old, new & ~RCU_SEQ_STATE_MASK); +} + +/* + * Has a grace period started since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK, + new); +} + +/* + * Roughly how many full grace periods have elapsed between the collection + * of the two specified grace periods? + */ +static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) +{ + unsigned long rnd_diff; + + if (old == new) + return 0; + /* + * Compute the number of grace periods (still shifted up), plus + * one if either of new and old is not an exact grace period. + */ + rnd_diff = (new & ~RCU_SEQ_STATE_MASK) - + ((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK) + + ((new & RCU_SEQ_STATE_MASK) || (old & RCU_SEQ_STATE_MASK)); + if (ULONG_CMP_GE(RCU_SEQ_STATE_MASK, rnd_diff)) + return 1; /* Definitely no grace period has elapsed. */ + return ((rnd_diff - RCU_SEQ_STATE_MASK - 1) >> RCU_SEQ_CTR_SHIFT) + 2; +} + +/* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. @@ -276,6 +334,9 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) /* Is this rcu_node a leaf? */ #define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) +/* Is this rcu_node the last leaf? */ +#define rcu_is_last_leaf_node(rsp, rnp) ((rnp) == &(rsp)->node[rcu_num_nodes - 1]) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -405,8 +466,7 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed); -void rcutorture_record_test_transition(void); + unsigned long *gp_seq); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -415,15 +475,11 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, - unsigned long *gpnum, - unsigned long *completed) + int *flags, unsigned long *gp_seq) { *flags = 0; - *gpnum = 0; - *completed = 0; + *gp_seq = 0; } -static inline void rcutorture_record_test_transition(void) { } static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, @@ -441,31 +497,26 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = sp->srcu_idx; - *gpnum = *completed; + *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed); + unsigned long *gp_seq); #endif #ifdef CONFIG_TINY_RCU -static inline unsigned long rcu_batches_started(void) { return 0; } -static inline unsigned long rcu_batches_started_bh(void) { return 0; } -static inline unsigned long rcu_batches_started_sched(void) { return 0; } -static inline unsigned long rcu_batches_completed(void) { return 0; } -static inline unsigned long rcu_batches_completed_bh(void) { return 0; } -static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_bh_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_sched_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } static inline unsigned long @@ -474,19 +525,16 @@ static inline void rcu_force_quiescent_state(void) { } static inline void rcu_bh_force_quiescent_state(void) { } static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } +static inline int rcu_get_gp_kthreads_prio(void) { return 0; } #else /* #ifdef CONFIG_TINY_RCU */ -extern unsigned long rcutorture_testseq; -extern unsigned long rcutorture_vernum; -unsigned long rcu_batches_started(void); -unsigned long rcu_batches_started_bh(void); -unsigned long rcu_batches_started_sched(void); -unsigned long rcu_batches_completed(void); -unsigned long rcu_batches_completed_bh(void); -unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_get_gp_seq(void); +unsigned long rcu_bh_get_gp_seq(void); +unsigned long rcu_sched_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); +int rcu_get_gp_kthreads_prio(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e232846516b3..34244523550e 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -19,6 +19,9 @@ * * Authors: Paul E. McKenney <paulmck@us.ibm.com> */ + +#define pr_fmt(fmt) fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> @@ -88,7 +91,7 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); -torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; @@ -135,8 +138,8 @@ struct rcu_perf_ops { void (*cleanup)(void); int (*readlock)(void); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); unsigned long (*exp_completed)(void); void (*async)(struct rcu_head *head, rcu_callback_t func); void (*gp_barrier)(void); @@ -176,8 +179,8 @@ static struct rcu_perf_ops rcu_ops = { .init = rcu_sync_perf_init, .readlock = rcu_perf_read_lock, .readunlock = rcu_perf_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, .async = call_rcu, .gp_barrier = rcu_barrier, @@ -206,8 +209,8 @@ static struct rcu_perf_ops rcu_bh_ops = { .init = rcu_sync_perf_init, .readlock = rcu_bh_perf_read_lock, .readunlock = rcu_bh_perf_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_bh, .gp_barrier = rcu_barrier_bh, @@ -263,8 +266,8 @@ static struct rcu_perf_ops srcu_ops = { .init = rcu_sync_perf_init, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -292,8 +295,8 @@ static struct rcu_perf_ops srcud_ops = { .cleanup = srcu_sync_perf_cleanup, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -322,8 +325,8 @@ static struct rcu_perf_ops sched_ops = { .init = rcu_sync_perf_init, .readlock = sched_perf_read_lock, .readunlock = sched_perf_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_sched, .gp_barrier = rcu_barrier_sched, @@ -350,8 +353,8 @@ static struct rcu_perf_ops tasks_ops = { .init = rcu_sync_perf_init, .readlock = tasks_perf_read_lock, .readunlock = tasks_perf_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, @@ -359,9 +362,11 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -static bool __maybe_unused torturing_tasks(void) +static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) { - return cur_ops == &tasks_ops; + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); } /* @@ -444,8 +449,7 @@ rcu_perf_writer(void *arg) b_rcu_perf_writer_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = - cur_ops->completed(); + b_rcu_perf_writer_started = cur_ops->get_gp_seq(); } } @@ -502,7 +506,7 @@ retry: cur_ops->exp_completed() / 2; } else { b_rcu_perf_writer_finished = - cur_ops->completed(); + cur_ops->get_gp_seq(); } if (shutdown) { smp_mb(); /* Assign before wake. */ @@ -527,7 +531,7 @@ retry: return 0; } -static inline void +static void rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG @@ -582,8 +586,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - b_rcu_perf_writer_finished - - b_rcu_perf_writer_started); + rcuperf_seq_diff(b_rcu_perf_writer_finished, + b_rcu_perf_writer_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -671,12 +675,11 @@ rcu_perf_init(void) break; } if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", - perf_type); + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); pr_alert("rcu-perf types:"); for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_alert(" %s", perf_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", perf_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 42fcb7f05fac..c596c6f1e457 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -22,6 +22,9 @@ * * See also: Documentation/RCU/torture.txt */ + +#define pr_fmt(fmt) fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> @@ -52,6 +55,7 @@ #include <linux/torture.h> #include <linux/vmalloc.h> #include <linux/sched/debug.h> +#include <linux/sched/sysctl.h> #include "rcu.h" @@ -59,6 +63,19 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +/* Bits for ->extendables field, extendables param, and related definitions. */ +#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_BH 0x1 /* Extend readers by disabling bh. */ +#define RCUTORTURE_RDR_IRQ 0x2 /* ... disabling interrupts. */ +#define RCUTORTURE_RDR_PREEMPT 0x4 /* ... disabling preemption. */ +#define RCUTORTURE_RDR_RCU 0x8 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 4 /* Number of bits defined above. */ +#define RCUTORTURE_MAX_EXTEND (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | \ + RCUTORTURE_RDR_PREEMPT) +#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ + /* Must be power of two minus one. */ + torture_param(int, cbflood_inter_holdoff, HZ, "Holdoff between floods (jiffies)"); torture_param(int, cbflood_intra_holdoff, 1, @@ -66,6 +83,8 @@ torture_param(int, cbflood_intra_holdoff, 1, torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); torture_param(int, cbflood_n_per_burst, 20000, "# callbacks per burst in flood"); +torture_param(int, extendables, RCUTORTURE_MAX_EXTEND, + "Extend readers by disabling bh (1), irqs (2), or preempt (4)"); torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); @@ -84,7 +103,7 @@ torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); + "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -101,7 +120,7 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; @@ -148,9 +167,9 @@ static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; +static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; -static long n_barrier_successes; +static long n_barrier_successes; /* did rcu_barrier test succeed? */ static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; @@ -261,8 +280,8 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -274,6 +293,8 @@ struct rcu_torture_ops { void (*stats)(void); int irq_capable; int can_boost; + int extendables; + int ext_irq_conflict; const char *name; }; @@ -302,10 +323,10 @@ static void rcu_read_delay(struct torture_random_state *rrsp) * force_quiescent_state. */ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); mdelay(longdelay_ms); - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); do_trace_rcu_torture_read(cur_ops->name, NULL, ts, started, completed); } @@ -397,8 +418,8 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -439,8 +460,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -449,6 +470,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ), + .ext_irq_conflict = RCUTORTURE_RDR_RCU, .name = "rcu_bh" }; @@ -483,8 +506,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, @@ -572,8 +594,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -610,8 +631,7 @@ static struct rcu_torture_ops srcud_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -622,6 +642,26 @@ static struct rcu_torture_ops srcud_ops = { .name = "srcud" }; +/* As above, but broken due to inappropriate reader extension. */ +static struct rcu_torture_ops busted_srcud_ops = { + .ttype = SRCU_FLAVOR, + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .get_gp_seq = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, + .name = "busted_srcud" +}; + /* * Definitions for sched torture testing. */ @@ -648,8 +688,8 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -660,6 +700,7 @@ static struct rcu_torture_ops sched_ops = { .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "sched" }; @@ -687,8 +728,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = tasks_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, @@ -700,6 +740,13 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; @@ -726,6 +773,44 @@ static void rcu_torture_boost_cb(struct rcu_head *head) smp_store_release(&rbip->inflight, 0); } +static int old_rt_runtime = -1; + +static void rcu_torture_disable_rt_throttle(void) +{ + /* + * Disable RT throttling so that rcutorture's boost threads don't get + * throttled. Only possible if rcutorture is built-in otherwise the + * user should manually do this by setting the sched_rt_period_us and + * sched_rt_runtime sysctls. + */ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime != -1) + return; + + old_rt_runtime = sysctl_sched_rt_runtime; + sysctl_sched_rt_runtime = -1; +} + +static void rcu_torture_enable_rt_throttle(void) +{ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime == -1) + return; + + sysctl_sched_rt_runtime = old_rt_runtime; + old_rt_runtime = -1; +} + +static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) +{ + if (end - start > test_boost_duration * HZ - HZ / 2) { + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + + return true; /* failed */ + } + + return false; /* passed */ +} + static int rcu_torture_boost(void *arg) { unsigned long call_rcu_time; @@ -746,6 +831,21 @@ static int rcu_torture_boost(void *arg) init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { + /* Track if the test failed already in this test interval? */ + bool failed = false; + + /* Increment n_rcu_torture_boosts once per boost-test */ + while (!kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + if (kthread_should_stop()) + goto checkwait; + /* Wait for the next test interval. */ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { @@ -764,11 +864,10 @@ static int rcu_torture_boost(void *arg) /* RCU core before ->inflight = 1. */ smp_store_release(&rbi.inflight, 1); call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } + /* Check if the boost test failed */ + failed = failed || + rcu_torture_boost_failed(call_rcu_time, + jiffies); call_rcu_time = jiffies; } stutter_wait("rcu_torture_boost"); @@ -777,6 +876,14 @@ static int rcu_torture_boost(void *arg) } /* + * If boost never happened, then inflight will always be 1, in + * this case the boost check would never happen in the above + * loop so do another one here. + */ + if (!failed && smp_load_acquire(&rbi.inflight)) + rcu_torture_boost_failed(call_rcu_time, jiffies); + + /* * Set the start time of the next test interval. * Yes, this is vulnerable to long delays, but such * delays simply cause a false negative for the next @@ -788,7 +895,6 @@ static int rcu_torture_boost(void *arg) if (mutex_trylock(&boost_mutex)) { boost_starttime = jiffies + test_boost_interval * HZ; - n_rcu_torture_boosts++; mutex_unlock(&boost_mutex); break; } @@ -1010,7 +1116,7 @@ rcu_torture_writer(void *arg) break; } } - rcutorture_record_progress(++rcu_torture_current_version); + rcu_torture_current_version++; /* Cycle through nesting levels of rcu_expedite_gp() calls. */ if (can_expedite && !(torture_random(&rand) & 0xff & (!!expediting - 1))) { @@ -1084,27 +1190,133 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) } /* - * RCU torture reader from timer handler. Dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. + * Do one extension of an RCU read-side critical section using the + * current reader state in readstate (set to zero for initial entry + * to extended critical section), set the new state as specified by + * newstate (set to zero for final exit from extended critical section), + * and random-number-generator state in trsp. If this is neither the + * beginning or end of the critical section and if there was actually a + * change, do a ->read_delay(). */ -static void rcu_torture_timer(struct timer_list *unused) +static void rcutorture_one_extend(int *readstate, int newstate, + struct torture_random_state *trsp) +{ + int idxnew = -1; + int idxold = *readstate; + int statesnew = ~*readstate & newstate; + int statesold = *readstate & ~newstate; + + WARN_ON_ONCE(idxold < 0); + WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + + /* First, put new protection in place to avoid critical-section gap. */ + if (statesnew & RCUTORTURE_RDR_BH) + local_bh_disable(); + if (statesnew & RCUTORTURE_RDR_IRQ) + local_irq_disable(); + if (statesnew & RCUTORTURE_RDR_PREEMPT) + preempt_disable(); + if (statesnew & RCUTORTURE_RDR_RCU) + idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + + /* Next, remove old protection, irq first due to bh conflict. */ + if (statesold & RCUTORTURE_RDR_IRQ) + local_irq_enable(); + if (statesold & RCUTORTURE_RDR_BH) + local_bh_enable(); + if (statesold & RCUTORTURE_RDR_PREEMPT) + preempt_enable(); + if (statesold & RCUTORTURE_RDR_RCU) + cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + + /* Delay if neither beginning nor end and there was a change. */ + if ((statesnew || statesold) && *readstate && newstate) + cur_ops->read_delay(trsp); + + /* Update the reader state. */ + if (idxnew == -1) + idxnew = idxold & ~RCUTORTURE_RDR_MASK; + WARN_ON_ONCE(idxnew < 0); + WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); + *readstate = idxnew | newstate; + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); +} + +/* Return the biggest extendables mask given current RCU and boot parameters. */ +static int rcutorture_extend_mask_max(void) +{ + int mask; + + WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); + mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; + mask = mask | RCUTORTURE_RDR_RCU; + return mask; +} + +/* Return a random protection state mask, but with at least one bit set. */ +static int +rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) +{ + int mask = rcutorture_extend_mask_max(); + unsigned long randmask1 = torture_random(trsp) >> 8; + unsigned long randmask2 = randmask1 >> 1; + + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + /* Half the time lots of bits, half the time only one bit. */ + if (randmask1 & 0x1) + mask = mask & randmask2; + else + mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & RCUTORTURE_RDR_BH) && + (oldmask & RCUTORTURE_RDR_BH)) + mask |= RCUTORTURE_RDR_BH; /* Can't enable bh w/irq disabled. */ + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & cur_ops->ext_irq_conflict) && + (oldmask & cur_ops->ext_irq_conflict)) + mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ + return mask ?: RCUTORTURE_RDR_RCU; +} + +/* + * Do a randomly selected number of extensions of an existing RCU read-side + * critical section. + */ +static void rcutorture_loop_extend(int *readstate, + struct torture_random_state *trsp) +{ + int i; + int mask = rcutorture_extend_mask_max(); + + WARN_ON_ONCE(!*readstate); /* -Existing- RCU read-side critsect! */ + if (!((mask - 1) & mask)) + return; /* Current RCU flavor not extendable. */ + i = (torture_random(trsp) >> 3) & RCUTORTURE_RDR_MAX_LOOPS; + while (i--) { + mask = rcutorture_extend_mask(*readstate, trsp); + rcutorture_one_extend(readstate, mask, trsp); + } +} + +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp) { - int idx; unsigned long started; unsigned long completed; - static DEFINE_TORTURE_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); + int newstate; struct rcu_torture *p; int pipe_count; + int readstate = 0; unsigned long long ts; - idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); + newstate = rcutorture_extend_mask(readstate, trsp); + rcutorture_one_extend(&readstate, newstate, trsp); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1112,39 +1324,50 @@ static void rcu_torture_timer(struct timer_list *unused) srcu_read_lock_held(srcu_ctlp) || torturing_tasks()); if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; + /* Wait for rcu_torture_writer to get underway */ + rcutorture_one_extend(&readstate, 0, trsp); + return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - n_rcu_torture_timers++; - spin_unlock(&rand_lock); + rcutorture_loop_extend(&readstate, trsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - started, completed); + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, started, completed); rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (cur_ops->started) - completed++; + completed = rcutorture_seq_diff(completed, started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - cur_ops->readunlock(idx); + rcutorture_one_extend(&readstate, 0, trsp); + WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + return true; +} + +static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); + +/* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static void rcu_torture_timer(struct timer_list *unused) +{ + atomic_long_inc(&n_rcu_torture_timers); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { @@ -1164,14 +1387,8 @@ static void rcu_torture_timer(struct timer_list *unused) static int rcu_torture_reader(void *arg) { - unsigned long started; - unsigned long completed; - int idx; DEFINE_TORTURE_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; struct timer_list t; - unsigned long long ts; VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -1183,49 +1400,8 @@ rcu_torture_reader(void *arg) if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); + if (!rcu_torture_one_read(&rand)) schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); - rcu_ftrace_dump(DUMP_ALL); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (cur_ops->started) - completed++; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1282,7 +1458,7 @@ rcu_torture_stats_print(void) pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers); + atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", n_barrier_successes, @@ -1324,18 +1500,16 @@ rcu_torture_stats_print(void) if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags = 0; - unsigned long __maybe_unused gpnum = 0; - unsigned long __maybe_unused completed = 0; + unsigned long __maybe_unused gp_seq = 0; rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gpnum, &completed); + &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); + &flags, &gp_seq); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", + pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - gpnum, completed, flags, + rcu_torture_writer_state, gp_seq, flags, wtp == NULL ? ~0UL : wtp->state, wtp == NULL ? -1 : (int)task_cpu(wtp)); if (!splatted && wtp) { @@ -1365,7 +1539,7 @@ rcu_torture_stats(void *arg) return 0; } -static inline void +static void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG @@ -1397,6 +1571,7 @@ static int rcutorture_booster_cleanup(unsigned int cpu) mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; + rcu_torture_enable_rt_throttle(); mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ @@ -1413,6 +1588,7 @@ static int rcutorture_booster_init(unsigned int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); + rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), @@ -1446,7 +1622,7 @@ static int rcu_torture_stall(void *args) VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; + stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ rcu_read_lock(); if (stall_cpu_irqsoff) @@ -1455,7 +1631,8 @@ static int rcu_torture_stall(void *args) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", smp_processor_id()); - while (ULONG_CMP_LT(get_seconds(), stop_at)) + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), + stop_at)) continue; /* Induce RCU CPU stall warning. */ if (stall_cpu_irqsoff) local_irq_enable(); @@ -1546,8 +1723,9 @@ static int rcu_torture_barrier(void *arg) atomic_read(&barrier_cbs_invoked), n_barrier_cbs); WARN_ON_ONCE(1); + } else { + n_barrier_successes++; } - n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_barrier"); @@ -1610,17 +1788,39 @@ static void rcu_torture_barrier_cleanup(void) } } +static bool rcu_torture_can_boost(void) +{ + static int boost_warn_once; + int prio; + + if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) + return false; + + prio = rcu_get_gp_kthreads_prio(); + if (!prio) + return false; + + if (prio < 2) { + if (boost_warn_once == 1) + return false; + + pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME); + boost_warn_once = 1; + return false; + } + + return true; +} + static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { int flags = 0; - unsigned long gpnum = 0; - unsigned long completed = 0; + unsigned long gp_seq = 0; int i; - rcutorture_record_test_transition(); if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); @@ -1648,17 +1848,15 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); - pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", - cur_ops->name, gpnum, completed, flags); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + pr_alert("%s: End-test grace-period state: g%lu f%#x\n", + cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) + if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); /* @@ -1746,7 +1944,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, &tasks_ops, + &busted_srcud_ops, &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -1763,8 +1961,8 @@ rcu_torture_init(void) torture_type); pr_alert("rcu-torture types:"); for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", torture_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } @@ -1882,8 +2080,7 @@ rcu_torture_init(void) test_boost_interval = 1; if (test_boost_duration < 2) test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { + if (rcu_torture_can_boost()) { boost_starttime = jiffies + test_boost_interval * HZ; @@ -1897,7 +2094,7 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); @@ -1926,7 +2123,6 @@ rcu_torture_init(void) goto unwind; } } - rcutorture_record_test_transition(); torture_init_end(); return 0; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 622792abe41a..04fc2ed71af8 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(sp->srcu_gp_waiting)) - swake_up(&sp->srcu_wq); + swake_up_one(&sp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp) idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ /* Invoke the callbacks we removed above. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b4123d7a2cec..6c9866a854b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "rcu: " fmt + #include <linux/export.h> #include <linux/mutex.h> #include <linux/percpu.h> @@ -390,7 +392,8 @@ void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", + __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); @@ -641,6 +644,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, * period s. Losers must either ensure that their desired grace-period * number is recorded on at least their leaf srcu_node structure, or they * must take steps to invoke their own callbacks. + * + * Note that this function also does the work of srcu_funnel_exp_start(), + * in some cases by directly invoking it. */ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, unsigned long s, bool do_norm) @@ -823,17 +829,17 @@ static void srcu_leak_callback(struct rcu_head *rhp) * more than one CPU, this means that when "func()" is invoked, each CPU * is guaranteed to have executed a full memory barrier since the end of * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing + * preceded the call to call_srcu(). It also means that each CPU executing * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() + * "func()" must have executed a memory barrier after the call_srcu() * but before the beginning of that SRCU read-side critical section. * Note that these guarantees include CPUs that are offline, idle, or * executing in user mode, as well as CPUs that are executing in the kernel. * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the * resulting SRCU callback function "func()", then both CPU A and CPU * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". + * interval between the call to call_srcu() and the invocation of "func()". * This guarantee applies even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). * @@ -1246,13 +1252,12 @@ static void process_srcu(struct work_struct *work) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = rcu_seq_ctr(sp->srcu_gp_seq); - *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); + *gp_seq = rcu_seq_current(&sp->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); @@ -1263,16 +1268,17 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) unsigned long s0 = 0, s1 = 0; idx = sp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", + tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_data *counts; + struct srcu_data *sdp; - counts = per_cpu_ptr(sp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; + sdp = per_cpu_ptr(sp->sda, cpu); + u0 = sdp->srcu_unlock_count[!idx]; + u1 = sdp->srcu_unlock_count[idx]; /* * Make sure that a lock is always counted if the corresponding @@ -1280,12 +1286,13 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) */ smp_rmb(); - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; + l0 = sdp->srcu_lock_count[!idx]; + l1 = sdp->srcu_lock_count[idx]; c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld %1p)", + cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); s0 += c0; s1 += c1; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index a64eee0db39e..befc9321a89c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -122,10 +122,8 @@ void rcu_check_callbacks(int user) { if (user) rcu_sched_qs(); - else if (!in_softirq()) + if (user || !in_softirq()) rcu_bh_qs(); - if (user) - rcu_note_voluntary_context_switch(current); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aa7cade1b9f3..0b760c1369f7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -27,6 +27,9 @@ * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ + +#define pr_fmt(fmt) "rcu: " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> @@ -95,13 +98,13 @@ struct rcu_state sname##_state = { \ .rda = &sname##_data, \ .call = cr, \ .gp_state = RCU_GP_IDLE, \ - .gpnum = 0UL - 300UL, \ - .completed = 0UL - 300UL, \ + .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ + .ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -155,6 +158,9 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +static void +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); @@ -177,6 +183,13 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* Retreive RCU kthreads priority for rcutorture */ +int rcu_get_gp_kthreads_prio(void) +{ + return kthread_prio; +} +EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); + /* * Number of grace periods between delays, normalized by the duration of * the delay. The longer the delay, the more the grace periods between @@ -189,18 +202,6 @@ module_param(gp_cleanup_delay, int, 0444); #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ /* - * Track the rcutorture test sequence number and the update version - * number within a given test. The rcutorture_testseq is incremented - * on every rcutorture module load and unload, so has an odd value - * when a test is running. The rcutorture_vernum is set to zero - * when rcutorture starts and is incremented on each rcutorture update. - * These variables enable correlating rcutorture output with the - * RCU tracing information. - */ -unsigned long rcutorture_testseq; -unsigned long rcutorture_vernum; - -/* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is * held, but the bit corresponding to the current CPU will be stable @@ -218,7 +219,7 @@ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) */ static int rcu_gp_in_progress(struct rcu_state *rsp) { - return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum); + return rcu_seq_state(rcu_seq_current(&rsp->gp_seq)); } /* @@ -233,7 +234,7 @@ void rcu_sched_qs(void) if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), + __this_cpu_read(rcu_sched_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) @@ -248,7 +249,7 @@ void rcu_bh_qs(void) RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), - __this_cpu_read(rcu_bh_data.gpnum), + __this_cpu_read(rcu_bh_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } @@ -380,20 +381,6 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) } /* - * Do a double-increment of the ->dynticks counter to emulate a - * momentary idle-CPU quiescent state. - */ -static void rcu_dynticks_momentary_idle(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &rdtp->dynticks); - - /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); -} - -/* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the * next exit from an extended quiescent state. Returns true if @@ -424,12 +411,17 @@ bool rcu_eqs_special_set(int cpu) * * We inform the RCU core by emulating a zero-duration dyntick-idle period. * - * The caller must have disabled interrupts. + * The caller must have disabled interrupts and must not be idle. */ static void rcu_momentary_dyntick_idle(void) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); - rcu_dynticks_momentary_idle(); + special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); } /* @@ -451,7 +443,7 @@ void rcu_note_context_switch(bool preempt) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_dynticks.rcu_qs_ctr); if (!preempt) - rcu_note_voluntary_context_switch_lite(current); + rcu_tasks_qs(current); out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -513,8 +505,38 @@ static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; -module_param(jiffies_till_first_fqs, ulong, 0644); -module_param(jiffies_till_next_fqs, ulong, 0644); +static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j); + return ret; +} + +static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); + return ret; +} + +static struct kernel_param_ops first_fqs_jiffies_ops = { + .set = param_set_first_fqs_jiffies, + .get = param_get_ulong, +}; + +static struct kernel_param_ops next_fqs_jiffies_ops = { + .set = param_set_next_fqs_jiffies, + .get = param_get_ulong, +}; + +module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644); +module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); module_param(rcu_kick_kthreads, bool, 0644); /* @@ -529,58 +551,31 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU batches started thus far for debug & stats. + * Return the number of RCU GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started(void) +unsigned long rcu_get_gp_seq(void) { - return rcu_state_p->gpnum; + return READ_ONCE(rcu_state_p->gp_seq); } -EXPORT_SYMBOL_GPL(rcu_batches_started); +EXPORT_SYMBOL_GPL(rcu_get_gp_seq); /* - * Return the number of RCU-sched batches started thus far for debug & stats. + * Return the number of RCU-sched GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_sched(void) +unsigned long rcu_sched_get_gp_seq(void) { - return rcu_sched_state.gpnum; + return READ_ONCE(rcu_sched_state.gp_seq); } -EXPORT_SYMBOL_GPL(rcu_batches_started_sched); +EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); /* - * Return the number of RCU BH batches started thus far for debug & stats. + * Return the number of RCU-bh GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_bh(void) +unsigned long rcu_bh_get_gp_seq(void) { - return rcu_bh_state.gpnum; + return READ_ONCE(rcu_bh_state.gp_seq); } -EXPORT_SYMBOL_GPL(rcu_batches_started_bh); - -/* - * Return the number of RCU batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_state_p->completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU-sched batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_sched(void) -{ - return rcu_sched_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); - -/* - * Return the number of RCU BH batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_bh(void) -{ - return rcu_bh_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); /* * Return the number of RCU expedited batches completed thus far for @@ -636,35 +631,42 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); */ void show_rcu_gp_kthreads(void) { + int cpu; + struct rcu_data *rdp; + struct rcu_node *rnp; struct rcu_state *rsp; for_each_rcu_flavor(rsp) { pr_info("%s: wait state: %d ->state: %#lx\n", rsp->name, rsp->gp_state, rsp->gp_kthread->state); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_GE(rsp->gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", + rnp->grplo, rnp->grphi, rnp->gp_seq, + rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rsp->gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %lu\n", + cpu, rdp->gp_seq_needed); + } + } /* sched_show_task(rsp->gp_kthread); */ } } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); /* - * Record the number of times rcutorture tests have been initiated and - * terminated. This information allows the debugfs tracing stats to be - * correlated to the rcutorture messages, even when the rcutorture module - * is being repeatedly loaded and unloaded. In other words, we cannot - * store this state in rcutorture itself. - */ -void rcutorture_record_test_transition(void) -{ - rcutorture_testseq++; - rcutorture_vernum = 0; -} -EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); - -/* * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { struct rcu_state *rsp = NULL; @@ -684,23 +686,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, if (rsp == NULL) return; *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + *gp_seq = rcu_seq_current(&rsp->gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* - * Record the number of writer passes through the current rcutorture test. - * This is also used to correlate debugfs tracing stats with the rcutorture - * messages. - */ -void rcutorture_record_progress(unsigned long vernum) -{ - rcutorture_vernum++; -} -EXPORT_SYMBOL_GPL(rcutorture_record_progress); - -/* * Return the root node of the specified rcu_state structure. */ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) @@ -1059,41 +1049,41 @@ void rcu_request_urgent_qs_task(struct task_struct *t) #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* - * Is the current CPU online? Disable preemption to avoid false positives - * that could otherwise happen due to the current CPU number being sampled, - * this task being preempted, its old CPU being taken offline, resuming - * on some other CPU, then determining that its old CPU is now offline. - * It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. Note also that it is OK - * for a CPU coming online to use RCU for one jiffy prior to marking itself - * online in the cpu_online_mask. Similarly, it is OK for a CPU going - * offline to continue to use RCU for one jiffy after marking itself - * offline in the cpu_online_mask. This leniency is necessary given the - * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the teardown - * of the CPU. + * Is the current CPU online as far as RCU is concerned? * - * This is also why RCU internally marks CPUs online during in the - * preparation phase and offline after the CPU has been taken down. + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. Because there are + * multiple flavors of RCU, and because this function can be called in the + * midst of updating the flavors while a given CPU coming online or going + * offline, it is necessary to check all flavors. If any of the flavors + * believe that given CPU is online, it is considered to be online. * - * Disable checking if in an NMI handler because we cannot safely report - * errors from NMI handlers anyway. + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway. In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. */ bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; struct rcu_node *rnp; - bool ret; + struct rcu_state *rsp; - if (in_nmi()) + if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable(); - rdp = this_cpu_ptr(&rcu_sched_data); - rnp = rdp->mynode; - ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || - !rcu_scheduler_fully_active; + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) { + preempt_enable(); + return true; + } + } preempt_enable(); - return ret; + return false; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); @@ -1115,17 +1105,18 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* * We are reporting a quiescent state on behalf of some other CPU, so * it is our responsibility to check for and handle potential overflow - * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. * After all, the CPU might be in deep idle state, and thus executing no * code whatsoever. */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, + rnp->gp_seq)) WRITE_ONCE(rdp->gpwrap, true); - if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) - rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; + if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) + rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } /* @@ -1137,7 +1128,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } @@ -1159,7 +1150,7 @@ static void rcu_iw_handler(struct irq_work *iwp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; rdp->rcu_iw_pending = false; } raw_spin_unlock_rcu_node(rnp); @@ -1187,7 +1178,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * of the current RCU grace period. */ if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; rcu_gpnum_ovf(rnp, rdp); return 1; @@ -1203,8 +1194,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && - READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("rqc")); rcu_gpnum_ovf(rnp, rdp); return 1; } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { @@ -1212,12 +1203,25 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) smp_store_release(ruqp, true); } - /* Check for the CPU being offline. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); - rdp->offline_fqs++; - rcu_gpnum_ovf(rnp, rdp); - return 1; + /* If waiting too long on an offline CPU, complain. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && + time_after(jiffies, rdp->rsp->gp_start + HZ)) { + bool onl; + struct rcu_node *rnp1; + + WARN_ON(1); /* Offline CPUs are supposed to report QS! */ + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", + __func__, rdp->cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + return 1; /* Break things loose after complaining. */ } /* @@ -1256,11 +1260,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); if (IS_ENABLED(CONFIG_IRQ_WORK) && - !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); rdp->rcu_iw_pending = true; - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } } @@ -1274,9 +1278,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) unsigned long j1; rsp->gp_start = j; - smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - WRITE_ONCE(rsp->jiffies_stall, j + j1); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rsp->jiffies_stall, j + j1); /* ^^^ */ rsp->jiffies_resched = j + j1 / 2; rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } @@ -1302,9 +1306,9 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, + (long)rcu_seq_current(&rsp->gp_seq), rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0, @@ -1359,16 +1363,15 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } -static inline void panic_on_rcu_stall(void) +static void panic_on_rcu_stall(void) { if (sysctl_panic_on_rcu_stall) panic("RCU Stall\n"); } -static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) { int cpu; - long delta; unsigned long flags; unsigned long gpa; unsigned long j; @@ -1381,25 +1384,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) if (rcu_cpu_stall_suppress) return; - /* Only let one CPU complain about others per time interval. */ - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - delta = jiffies - READ_ONCE(rsp->jiffies_stall); - if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - WRITE_ONCE(rsp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", - rsp->name); + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1418,17 +1408,16 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); } else { - if (READ_ONCE(rsp->gpnum) != gpnum || - READ_ONCE(rsp->completed) == gpnum) { + if (rcu_seq_current(&rsp->gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; @@ -1441,6 +1430,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) sched_show_task(current); } } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); rcu_check_gp_kthread_starvation(rsp); @@ -1476,15 +1469,16 @@ static void print_cpu_stall(struct rcu_state *rsp) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", jiffies - rsp->gp_start, - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); rcu_check_gp_kthread_starvation(rsp); rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. */ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1504,10 +1498,11 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - unsigned long completed; - unsigned long gpnum; + unsigned long gs1; + unsigned long gs2; unsigned long gps; unsigned long j; + unsigned long jn; unsigned long js; struct rcu_node *rnp; @@ -1520,43 +1515,46 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* * Lots of memory barriers to reject false positives. * - * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, - * then rsp->gp_start, and finally rsp->completed. These values - * are updated in the opposite order with memory barriers (or - * equivalent) during grace-period initialization and cleanup. - * Now, a false positive can occur if we get an new value of - * rsp->gp_start and a old value of rsp->jiffies_stall. But given - * the memory barriers, the only way that this can happen is if one - * grace period ends and another starts between these two fetches. - * Detect this by comparing rsp->completed with the previous fetch - * from rsp->gpnum. + * The idea is to pick up rsp->gp_seq, then rsp->jiffies_stall, + * then rsp->gp_start, and finally another copy of rsp->gp_seq. + * These values are updated in the opposite order with memory + * barriers (or equivalent) during grace-period initialization + * and cleanup. Now, a false positive can occur if we get an new + * value of rsp->gp_start and a old value of rsp->jiffies_stall. + * But given the memory barriers, the only way that this can happen + * is if one grace period ends and another starts between these + * two fetches. This is detected by comparing the second fetch + * of rsp->gp_seq with the previous fetch from rsp->gp_seq. * * Given this check, comparisons of jiffies, rsp->jiffies_stall, * and rsp->gp_start suffice to forestall false positives. */ - gpnum = READ_ONCE(rsp->gpnum); - smp_rmb(); /* Pick up ->gpnum first... */ + gs1 = READ_ONCE(rsp->gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ js = READ_ONCE(rsp->jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ gps = READ_ONCE(rsp->gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->completed. */ - completed = READ_ONCE(rsp->completed); - if (ULONG_CMP_GE(completed, gpnum) || + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rsp->gp_seq); + if (gs1 != gs2 || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; if (rcu_gp_in_progress(rsp) && - (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); } else if (rcu_gp_in_progress(rsp) && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp, gpnum); + print_other_cpu_stall(rsp, gs2); } } @@ -1577,123 +1575,99 @@ void rcu_cpu_stall_reset(void) WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); } -/* - * Determine the value that ->completed will have at the end of the - * next subsequent grace period. This is used to tag callbacks so that - * a CPU can invoke callbacks in a timely fashion even if that CPU has - * been dyntick-idle for an extended period with callbacks under the - * influence of RCU_FAST_NO_HZ. - * - * The caller must hold rnp->lock with interrupts disabled. - */ -static unsigned long rcu_cbs_completed(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * If RCU is idle, we just wait for the next grace period. - * But we can only be sure that RCU is idle if we are looking - * at the root rcu_node structure -- otherwise, a new grace - * period might have started, but just not yet gotten around - * to initializing the current non-root rcu_node structure. - */ - if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) - return rnp->completed + 1; - - /* - * If the current rcu_node structure believes that RCU is - * idle, and if the rcu_state structure does not yet reflect - * the start of a new grace period, then the next grace period - * will suffice. The memory barrier is needed to accurately - * sample the rsp->gpnum, and pairs with the second lock - * acquisition in rcu_gp_init(), which is augmented with - * smp_mb__after_unlock_lock() for this purpose. - */ - if (rnp->gpnum == rnp->completed) { - smp_mb(); /* See above block comment. */ - if (READ_ONCE(rsp->gpnum) == rnp->completed) - return rnp->completed + 1; - } - - /* - * Otherwise, wait for a possible partial grace period and - * then the subsequent full grace period. - */ - return rnp->completed + 2; -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) + unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req, + rnp->level, rnp->grplo, rnp->grphi, s); } /* + * rcu_start_this_gp - Request the start of a particular grace period + * @rnp_start: The leaf node of the CPU from which to start. + * @rdp: The rcu_data corresponding to the CPU from which to start. + * @gp_seq_req: The gp_seq of the grace period to start. + * * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp[] field. Returns true if there + * rcu_node structure's ->gp_seq_needed field. Returns true if there * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock, which * is why the caller is responsible for waking the grace-period kthread. + * + * Returns true if the GP thread needs to be awakened else false. */ -static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c) +static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, + unsigned long gp_seq_req) { bool ret = false; struct rcu_state *rsp = rdp->rsp; - struct rcu_node *rnp_root; + struct rcu_node *rnp; /* * Use funnel locking to either acquire the root rcu_node * structure's lock or bail out if the need for this grace period - * has already been recorded -- or has already started. If there - * is already a grace period in progress in a non-leaf node, no - * recording is needed because the end of the grace period will - * scan the leaf rcu_node structures. Note that rnp->lock must - * not be released. + * has already been recorded -- or if that grace period has in + * fact already started. If there is already a grace period in + * progress in a non-leaf node, no recording is needed because the + * end of the grace period will scan the leaf rcu_node structures. + * Note that rnp_start->lock must not be released. */ - raw_lockdep_assert_held_rcu_node(rnp); - trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); - for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); - WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + - need_future_gp_mask(), c)); - if (need_future_gp_element(rnp_root, c) || - ULONG_CMP_GE(rnp_root->gpnum, c) || - (rnp != rnp_root && - rnp_root->gpnum != rnp_root->completed)) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + raw_lockdep_assert_held_rcu_node(rnp_start); + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf")); + for (rnp = rnp_start; 1; rnp = rnp->parent) { + if (rnp != rnp_start) + raw_spin_lock_rcu_node(rnp); + if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) || + rcu_seq_started(&rnp->gp_seq, gp_seq_req) || + (rnp != rnp_start && + rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) { + trace_rcu_this_gp(rnp, rdp, gp_seq_req, + TPS("Prestarted")); goto unlock_out; } - need_future_gp_element(rnp_root, c) = true; - if (rnp_root != rnp && rnp_root->parent != NULL) - raw_spin_unlock_rcu_node(rnp_root); - if (!rnp_root->parent) + rnp->gp_seq_needed = gp_seq_req; + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { + /* + * We just marked the leaf or internal node, and a + * grace period is in progress, which means that + * rcu_gp_cleanup() will see the marking. Bail to + * reduce contention. + */ + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, + TPS("Startedleaf")); + goto unlock_out; + } + if (rnp != rnp_start && rnp->parent != NULL) + raw_spin_unlock_rcu_node(rnp); + if (!rnp->parent) break; /* At root, and perhaps also leaf. */ } /* If GP already in progress, just leave, otherwise start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); + if (rcu_gp_in_progress(rsp)) { + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: - if (rnp != rnp_root) - raw_spin_unlock_rcu_node(rnp_root); + /* Push furthest requested GP to leaf node and rcu_data structure. */ + if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { + rnp_start->gp_seq_needed = rnp->gp_seq_needed; + rdp->gp_seq_needed = rnp->gp_seq_needed; + } + if (rnp != rnp_start) + raw_spin_unlock_rcu_node(rnp); return ret; } @@ -1703,13 +1677,13 @@ unlock_out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - unsigned long c = rnp->completed; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - need_future_gp_element(rnp, c) = false; - needmore = need_any_future_gp(rnp); - trace_rcu_this_gp(rnp, rdp, c, + needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); + if (!needmore) + rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1727,25 +1701,25 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - swake_up(&rsp->gp_wq); + swake_up_one(&rsp->gp_wq); } /* - * If there is room, assign a ->completed number to any callbacks on - * this CPU that have not already been assigned. Also accelerate any - * callbacks that were previously assigned a ->completed number that has - * since proven to be too conservative, which can happen if callbacks get - * assigned a ->completed number while RCU is idle, but with reference to - * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. Returns an flag saying that we should - * awaken the RCU grace-period kthread. + * If there is room, assign a ->gp_seq number to any callbacks on this + * CPU that have not already been assigned. Also accelerate any callbacks + * that were previously assigned a ->gp_seq number that has since proven + * to be too conservative, which can happen if callbacks get assigned a + * ->gp_seq number while RCU is idle, but with reference to a non-root + * rcu_node structure. This function is idempotent, so it does not hurt + * to call it repeatedly. Returns an flag saying that we should awaken + * the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; + unsigned long gp_seq_req; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1764,22 +1738,50 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - c = rcu_cbs_completed(rsp, rnp); - if (rcu_segcblist_accelerate(&rdp->cblist, c)) - ret = rcu_start_this_gp(rnp, rdp, c); + gp_seq_req = rcu_seq_snap(&rsp->gp_seq); + if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) + ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccReadyCB")); return ret; } /* + * Similar to rcu_accelerate_cbs(), but does not require that the leaf + * rcu_node structure's ->lock be held. It consults the cached value + * of ->gp_seq_needed in the rcu_data structure, and if that indicates + * that a new grace-period request be made, invokes rcu_accelerate_cbs() + * while holding the leaf rcu_node structure's ->lock. + */ +static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + bool needwake; + + lockdep_assert_irqs_disabled(); + c = rcu_seq_snap(&rsp->gp_seq); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + /* Old request still live, so mark recent callbacks. */ + (void)rcu_segcblist_accelerate(&rdp->cblist, c); + return; + } + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + if (needwake) + rcu_gp_kthread_wake(rsp); +} + +/* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and - * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... * Returns true if the RCU grace-period kthread needs to be awakened. @@ -1796,10 +1798,10 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, return false; /* - * Find all callbacks whose ->completed numbers indicate that they + * Find all callbacks whose ->gp_seq numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - rcu_segcblist_advance(&rdp->cblist, rnp->completed); + rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1819,39 +1821,38 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, raw_lockdep_assert_held_rcu_node(rnp); - /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed && - !unlikely(READ_ONCE(rdp->gpwrap))) { - - /* No grace period end, so just accelerate recent callbacks. */ - ret = rcu_accelerate_cbs(rsp, rnp, rdp); + if (rdp->gp_seq == rnp->gp_seq) + return false; /* Nothing to do. */ + /* Handle the ends of any preceding grace periods first. */ + if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { + ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); } else { - - /* Advance callbacks. */ - ret = rcu_advance_cbs(rsp, rnp, rdp); - - /* Remember that we saw this grace-period completion. */ - rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); + ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ } - if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { + /* Now handle the beginnings of any new-to-this-CPU grace periods. */ + if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); + trace_rcu_grace_period(rsp->name, rnp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); - WRITE_ONCE(rdp->gpwrap, false); - rcu_gpnum_ovf(rnp, rdp); } + rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ + if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); return ret; } @@ -1863,8 +1864,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; - if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && - rdp->completed == READ_ONCE(rnp->completed) && + if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); @@ -1879,7 +1879,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_gp_slow(struct rcu_state *rsp, int delay) { if (delay > 0 && - !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + !(rcu_seq_ctr(rsp->gp_seq) % + (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_uninterruptible(delay); } @@ -1888,7 +1889,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) */ static bool rcu_gp_init(struct rcu_state *rsp) { + unsigned long flags; unsigned long oldmask; + unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1912,9 +1915,9 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - /* Record GP times before starting GP, hence smp_store_release(). */ - smp_store_release(&rsp->gpnum, rsp->gpnum + 1); - trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); + /* Record GP times before starting GP, hence rcu_seq_start(). */ + rcu_seq_start(&rsp->gp_seq); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1923,13 +1926,15 @@ static bool rcu_gp_init(struct rcu_state *rsp) * for subsequent online CPUs, and that quiescent-state forcing * will handle subsequent offline CPUs. */ + rsp->gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rsp, rnp) { - rcu_gp_slow(rsp, gp_preinit_delay); + spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); continue; } @@ -1939,12 +1944,14 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ if (!oldmask != !rnp->qsmaskinit) { - if (!oldmask) /* First online CPU for this rcu_node. */ - rcu_init_new_rnp(rnp); - else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ - rnp->wait_blkd_tasks = true; - else /* Last offline CPU and can propagate. */ + if (!oldmask) { /* First online CPU for rcu_node. */ + if (!rnp->wait_blkd_tasks) /* Ever offline? */ + rcu_init_new_rnp(rnp); + } else if (rcu_preempt_has_tasks(rnp)) { + rnp->wait_blkd_tasks = true; /* blocked tasks */ + } else { /* Last offline CPU and can propagate. */ rcu_cleanup_dead_rnp(rnp); + } } /* @@ -1953,18 +1960,19 @@ static bool rcu_gp_init(struct rcu_state *rsp) * still offline, propagate up the rcu_node tree and * clear ->wait_blkd_tasks. Otherwise, if one of this * rcu_node structure's CPUs has since come back online, - * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() - * checks for this, so just call it unconditionally). + * simply clear ->wait_blkd_tasks. */ if (rnp->wait_blkd_tasks && - (!rcu_preempt_has_tasks(rnp) || - rnp->qsmaskinit)) { + (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) { rnp->wait_blkd_tasks = false; - rcu_cleanup_dead_rnp(rnp); + if (!rnp->qsmaskinit) + rcu_cleanup_dead_rnp(rnp); } raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); } + rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1978,22 +1986,27 @@ static bool rcu_gp_init(struct rcu_state *rsp) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ + rsp->gp_state = RCU_GP_INIT; rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq_rcu_node(rnp); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp = this_cpu_ptr(rsp->rda); - rcu_preempt_check_blocked_tasks(rnp); + rcu_preempt_check_blocked_tasks(rsp, rnp); rnp->qsmask = rnp->qsmaskinit; - WRITE_ONCE(rnp->gpnum, rsp->gpnum); - if (WARN_ON_ONCE(rnp->completed != rsp->completed)) - WRITE_ONCE(rnp->completed, rsp->completed); + WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq_rcu_node(rnp); + /* Quiescent states for tasks on any now-offline CPUs. */ + mask = rnp->qsmask & ~rnp->qsmaskinitnext; + rnp->rcu_gp_init_mask = mask; + if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + else + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2002,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for swait_event_idle() wakeup at force-quiescent-state + * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state * time. */ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) @@ -2053,6 +2066,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; + unsigned long new_gp_seq; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2074,19 +2088,22 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) raw_spin_unlock_irq_rcu_node(rnp); /* - * Propagate new ->completed value to rcu_node structures so - * that other CPUs don't have to wait until the start of the next - * grace period to process their callbacks. This also avoids - * some nasty RCU grace-period initialization races by forcing - * the end of the current grace period to be completely recorded in - * all of the rcu_node structures before the beginning of the next - * grace period is recorded in any of the rcu_node structures. + * Propagate new ->gp_seq value to rcu_node structures so that + * other CPUs don't have to wait until the start of the next grace + * period to process their callbacks. This also avoids some nasty + * RCU grace-period initialization races by forcing the end of + * the current grace period to be completely recorded in all of + * the rcu_node structures before the beginning of the next grace + * period is recorded in any of the rcu_node structures. */ + new_gp_seq = rsp->gp_seq; + rcu_seq_end(&new_gp_seq); rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rsp, rnp, 10); WARN_ON_ONCE(rnp->qsmask); - WRITE_ONCE(rnp->completed, rsp->gpnum); + WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; @@ -2100,26 +2117,28 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ /* Declare grace period done. */ - WRITE_ONCE(rsp->completed, rsp->gpnum); - trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); + rcu_seq_end(&rsp->gp_seq); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); - if (need_any_future_gp(rnp)) { - trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; } /* Advance CBs to reduce false positives below. */ if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), + rsp->gp_req_activity = jiffies; + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); + } else { + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); } - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } @@ -2141,10 +2160,10 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ @@ -2154,17 +2173,13 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwaitsig")); } /* Handle quiescent-state forcing. */ first_gp_fqs = true; j = jiffies_till_first_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_first_fqs = HZ; - } ret = 0; for (;;) { if (!ret) { @@ -2173,10 +2188,10 @@ static int __noreturn rcu_gp_kthread(void *arg) jiffies + 3 * j); } trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_idle_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2188,31 +2203,24 @@ static int __noreturn rcu_gp_kthread(void *arg) if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsstart")); rcu_gp_fqs(rsp, first_gp_fqs); first_gp_fqs = false; trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; - } } else { /* Deal with stray signal. */ cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. */ j = jiffies; @@ -2256,8 +2264,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * must be represented by the same rcu_node structure (which need not be a * leaf rcu_node structure, though it often will be). The gps parameter * is the grace-period snapshot, which means that the quiescent states - * are valid only if rnp->gpnum is equal to gps. That structure's lock + * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return. + * + * As a special case, if mask is zero, the bit-already-cleared check is + * disabled. This allows propagating quiescent state due to resumed tasks + * during grace-period initialization. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, @@ -2271,7 +2283,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { + if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) { /* * Our bit has already been cleared, or the @@ -2284,7 +2296,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, !!rnp->gp_tasks); @@ -2294,6 +2306,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + rnp->completedqs = rnp->gp_seq; mask = rnp->grpmask; if (rnp->parent == NULL) { @@ -2323,8 +2336,9 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ -static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) +static void __maybe_unused +rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { unsigned long gps; @@ -2332,12 +2346,15 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || - rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) || + WARN_ON_ONCE(rsp != rcu_state_p) || + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || + rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } + rnp->completedqs = rnp->gp_seq; rnp_p = rnp->parent; if (rnp_p == NULL) { /* @@ -2348,8 +2365,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; } - /* Report up the rest of the hierarchy, tracking current ->gpnum. */ - gps = rnp->gpnum; + /* Report up the rest of the hierarchy, tracking current ->gp_seq. */ + gps = rnp->gp_seq; mask = rnp->grpmask; raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ @@ -2370,8 +2387,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2396,7 +2413,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); @@ -2441,17 +2458,16 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask;) + RCU_TRACE(bool blkd;) RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask;) - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - !!(rnp->qsmask & mask), - TPS("cpuofl")); + RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + trace_rcu_grace_period(rsp->name, rnp->gp_seq, + blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); } /* @@ -2463,7 +2479,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * This function therefore goes up the tree of rcu_node structures, * clearing the corresponding bits in the ->qsmaskinit fields. Note that * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated + * updated. * * This function does check that the specified rcu_node structure has * all CPUs offline and no blocked tasks, so it is OK to invoke it @@ -2476,9 +2492,10 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + WARN_ON_ONCE(rnp_leaf->qsmaskinit) || + WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) return; for (;;) { mask = rnp->grpmask; @@ -2487,7 +2504,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) break; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; - rnp->qsmask &= ~mask; + /* Between grace periods, so better already be zero! */ + WARN_ON_ONCE(rnp->qsmask); if (rnp->qsmaskinit) { raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ @@ -2630,6 +2648,7 @@ void rcu_check_callbacks(int user) rcu_sched_qs(); rcu_bh_qs(); + rcu_note_voluntary_context_switch(current); } else if (!in_softirq()) { @@ -2645,8 +2664,7 @@ void rcu_check_callbacks(int user) rcu_preempt_check_callbacks(); if (rcu_pending()) invoke_rcu_core(); - if (user) - rcu_note_voluntary_context_switch(current); + trace_rcu_utilization(TPS("End scheduler-tick")); } @@ -2681,17 +2699,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) /* rcu_initiate_boost() releases rnp->lock */ continue; } - if (rnp->parent && - (rnp->parent->qsmask & rnp->grpmask)) { - /* - * Race between grace-period - * initialization and task exiting RCU - * read-side critical section: Report. - */ - rcu_report_unblock_qs_rnp(rsp, rnp, flags); - /* rcu_report_unblock_qs_rnp() rlses ->lock */ - continue; - } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; } for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); @@ -2701,8 +2710,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) } } if (mask != 0) { - /* Idle/offline CPUs, report (releases rnp->lock. */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* Idle/offline CPUs, report (releases rnp->lock). */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2747,6 +2756,65 @@ static void force_quiescent_state(struct rcu_state *rsp) } /* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void +rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(rsp); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, rsp->gp_req_activity + gpssdelay) || + time_before(j, rsp->gp_activity + gpssdelay) || + atomic_xchg(&warned, 1)) { + raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", + __func__, (long)READ_ONCE(rsp->gp_seq), + (long)READ_ONCE(rnp_root->gp_seq_needed), + j - rsp->gp_req_activity, j - rsp->gp_activity, + rsp->gp_flags, rsp->gp_state, rsp->name, + rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +/* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to * whom the rdp belongs. @@ -2755,9 +2823,8 @@ static void __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; - bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; WARN_ON_ONCE(!rdp->beenonline); @@ -2768,18 +2835,13 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (!rcu_gp_in_progress(rsp) && rcu_segcblist_is_enabled(&rdp->cblist)) { local_irq_save(flags); - if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { - local_irq_restore(flags); - } else { - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) + rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); + local_irq_restore(flags); } + rcu_check_gp_start_stall(rsp, rnp, rdp); + /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); @@ -2833,8 +2895,6 @@ static void invoke_rcu_core(void) static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_head *head, unsigned long flags) { - bool needwake; - /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2861,13 +2921,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp = rdp->mynode; - - raw_spin_lock_rcu_node(rnp); - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_rcu_node(rnp); - if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; @@ -3037,7 +3091,7 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. */ -static inline int rcu_blocking_is_gp(void) +static int rcu_blocking_is_gp(void) { int ret; @@ -3136,16 +3190,10 @@ unsigned long get_state_synchronize_rcu(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_rcu() - * and cond_synchronize_rcu(). - */ - return smp_load_acquire(&rcu_state_p->gpnum); + return rcu_seq_snap(&rcu_state_p->gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3165,15 +3213,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_state_p->completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_state_p->gp_seq, oldstate)) synchronize_rcu(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); @@ -3188,16 +3231,10 @@ unsigned long get_state_synchronize_sched(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_sched() - * and cond_synchronize_sched(). - */ - return smp_load_acquire(&rcu_sched_state.gpnum); + return rcu_seq_snap(&rcu_sched_state.gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_sched); @@ -3217,15 +3254,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_sched); */ void cond_synchronize_sched(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_sched_state.completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate)) synchronize_sched(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_sched); @@ -3261,12 +3293,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; - /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ - return 1; - - /* Has a new RCU grace period started? */ - if (READ_ONCE(rnp->gpnum) != rdp->gpnum || + /* Have RCU grace period completed or started? */ + if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq || unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; @@ -3298,7 +3326,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; @@ -3484,17 +3512,22 @@ EXPORT_SYMBOL_GPL(rcu_barrier_sched); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) { long mask; + long oldmask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); + WARN_ON_ONCE(rnp->wait_blkd_tasks); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; if (rnp == NULL) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ + oldmask = rnp->qsmaskinit; rnp->qsmaskinit |= mask; raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ + if (oldmask) + return; } } @@ -3511,6 +3544,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); + rdp->rcu_ofl_gp_seq = rsp->gp_seq; + rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_onl_gp_seq = rsp->gp_seq; + rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -3518,9 +3555,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* * Initialize a CPU's per-CPU RCU data. Note that only one online or - * offline event can be happening at a given time. Note also that we - * can accept some slop in the rsp->completed access due to the fact - * that this CPU cannot possibly have any RCU callbacks in flight yet. + * offline event can be happening at a given time. Note also that we can + * accept some slop in the rsp->gp_seq access due to the fact that this + * CPU cannot possibly have any RCU callbacks in flight yet. */ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) @@ -3549,14 +3586,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. */ - rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ - rdp->completed = rnp->completed; + rdp->gp_seq = rnp->gp_seq; + rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gpnum = rnp->gpnum - 1; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -3705,7 +3742,15 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ + rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags); + if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + /* Report QS -after- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + } else { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } } smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } @@ -3713,7 +3758,7 @@ void rcu_cpu_starting(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit + * function. We now remove it from the rcu_node tree's ->qsmaskinitnext * bit masks. */ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) @@ -3725,9 +3770,18 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + spin_lock(&rsp->ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags); + if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ + /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + spin_unlock(&rsp->ofl_lock); } /* @@ -3839,12 +3893,16 @@ static int __init rcu_spawn_gp_kthread(void) struct task_struct *t; /* Force priority into range. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) kthread_prio = 1; else if (kthread_prio < 0) kthread_prio = 0; else if (kthread_prio > 99) kthread_prio = 99; + if (kthread_prio != kthread_prio_in) pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", kthread_prio, kthread_prio_in); @@ -3928,8 +3986,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; + rnp->gp_seq = rsp->gp_seq; + rnp->gp_seq_needed = rsp->gp_seq; + rnp->completedqs = rsp->gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -3997,7 +4056,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", + pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 78e051dffc5b..4e74df768c57 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -81,18 +81,16 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gpnum; /* Current grace period for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ - unsigned long completed; /* Last GP completed for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ + unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ /* In leaf rcu_node, each bit corresponds to */ /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ + unsigned long rcu_gp_init_mask; /* Mask of offline CPUs at GP init. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ @@ -158,7 +156,6 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; @@ -168,22 +165,6 @@ struct rcu_node { bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; -/* Accessors for ->need_future_gp[] array. */ -#define need_future_gp_mask() \ - (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) -#define need_future_gp_element(rnp, c) \ - ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) -#define need_any_future_gp(rnp) \ -({ \ - int __i; \ - bool __nonzero = false; \ - \ - for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ - __nonzero = __nonzero || \ - READ_ONCE((rnp)->need_future_gp[__i]); \ - __nonzero; \ -}) - /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -206,16 +187,14 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long completed; /* Track rsp->completed gp number */ - /* in order to detect GP end. */ - unsigned long gpnum; /* Highest gp number that this CPU */ - /* is aware of having started. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool gpwrap; /* Possible gpnum/completed wrap. */ + bool gpwrap; /* Possible ->gp_seq wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -239,7 +218,6 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ - unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long cond_resched_completed; /* Grace period that needs help */ /* from cond_resched(). */ @@ -278,12 +256,16 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 7) RCU CPU stall data. */ + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? */ - unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */ + unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ + unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */ + short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ + unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ + short rcu_onl_gp_flags; /* ->gp_flags at last online. */ int cpu; struct rcu_state *rsp; @@ -340,8 +322,7 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ - unsigned long gpnum; /* Current gp number. */ - unsigned long completed; /* # of last completed gp. */ + unsigned long gp_seq; /* Grace-period sequence #. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ @@ -373,6 +354,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ /* activity in jiffies. */ + unsigned long gp_req_activity; /* Time of last GP request */ + /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ @@ -384,6 +367,10 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + + spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + /* Synchronize offline with */ + /* GP pre-initialization. */ }; /* Values for rcu_state structure's gp_flags field. */ @@ -394,16 +381,20 @@ struct rcu_state { #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ -#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ -#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ -#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ -#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#define RCU_GP_ONOFF 3 /* Grace-period initialization hotplug. */ +#define RCU_GP_INIT 4 /* Grace-period initialization. */ +#define RCU_GP_WAIT_FQS 5 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 6 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ #ifndef RCU_TREE_NONCORE static const char * const gp_state_names[] = { "RCU_GP_IDLE", "RCU_GP_WAIT_GPS", "RCU_GP_DONE_GPS", + "RCU_GP_ONOFF", + "RCU_GP_INIT", "RCU_GP_WAIT_FQS", "RCU_GP_DOING_FQS", "RCU_GP_CLEANUP", @@ -449,10 +440,13 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +static void rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, + struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); +static void dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, + int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); @@ -489,7 +483,6 @@ static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d40708e8c5d6..0b2c2ad69629 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); + swake_up_one(&rsp->expedited_wq); } break; } @@ -472,6 +472,7 @@ retry_ipi: static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, smp_call_func_t func) { + int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); @@ -486,13 +487,20 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, rnp->rew.rew_func = func; rnp->rew.rew_rsp = rsp; if (!READ_ONCE(rcu_par_gp_wq) || - rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { - /* No workqueues yet. */ + rcu_scheduler_active != RCU_SCHEDULER_RUNNING || + rcu_is_last_leaf_node(rsp, rnp)) { + /* No workqueues yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_disable(); + cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi)) + cpu = WORK_CPU_UNBOUND; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_enable(); rnp->exp_need_flush = true; } @@ -518,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = swait_event_timeout( + ret = swait_event_timeout_exclusive( rsp->expedited_wq, sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fd12039e512..a97c20ea9bce 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -74,8 +74,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - RCU_FANOUT); + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n", + RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) @@ -88,11 +88,13 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); if (rcu_fanout_leaf != RCU_FANOUT_LEAF) - pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", + rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST - pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", + kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif if (blimit != DEFAULT_RCU_BLIMIT) pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); @@ -127,6 +129,7 @@ static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); +static void rcu_read_unlock_special(struct task_struct *t); /* * Tell them what RCU they are running. @@ -183,6 +186,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); + /* RCU better not be waiting on newly onlined CPUs! */ + WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask & + rdp->grpmask); /* * Decide where to queue the newly blocked task. In theory, @@ -260,8 +266,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * ->exp_tasks pointers, respectively, to reference the newly * blocked tasks. */ - if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { rnp->gp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); + } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != @@ -286,20 +294,24 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) } /* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * not in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. + * Record a preemptible-RCU quiescent state for the specified CPU. + * Note that this does not necessarily mean that the task currently running + * on the CPU is in a quiescent state: Instead, it means that the current + * grace period need not wait on any RCU read-side critical section that + * starts later on this CPU. It also means that if the current task is + * in an RCU read-side critical section, it has already added itself to + * some leaf rcu_node structure's ->blkd_tasks list. In addition to the + * current task, there might be any number of other tasks blocked while + * in an RCU read-side critical section. * - * As with the other rcu_*_qs() functions, callers to this function - * must disable preemption. + * Callers to this function must disable preemption. */ static void rcu_preempt_qs(void) { RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_data_p->gpnum), + __this_cpu_read(rcu_data_p->gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ @@ -348,8 +360,8 @@ static void rcu_preempt_note_context_switch(bool preempt) trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) - ? rnp->gpnum - : rnp->gpnum + 1); + ? rnp->gp_seq + : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -456,7 +468,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -void rcu_read_unlock_special(struct task_struct *t) +static void rcu_read_unlock_special(struct task_struct *t) { bool empty_exp; bool empty_norm; @@ -535,13 +547,15 @@ void rcu_read_unlock_special(struct task_struct *t) WARN_ON_ONCE(rnp != t->rcu_blocked_node); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && + (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) @@ -562,7 +576,7 @@ void rcu_read_unlock_special(struct task_struct *t) empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), - rnp->gpnum, + rnp->gp_seq, 0, rnp->qsmask, rnp->level, rnp->grplo, @@ -686,24 +700,27 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock * must be held by the caller. * * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (rcu_preempt_has_tasks(rnp)) { + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rsp, rnp, 10); + if (rcu_preempt_has_tasks(rnp) && + (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); } WARN_ON_ONCE(rnp->qsmask); } @@ -717,6 +734,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_preempt_check_callbacks(void) { + struct rcu_state *rsp = &rcu_preempt_state; struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { @@ -725,7 +743,9 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm) && + !t->rcu_read_unlock_special.b.need_qs && + time_after(jiffies, rsp->gp_start + HZ)) t->rcu_read_unlock_special.b.need_qs = true; } @@ -841,6 +861,47 @@ void exit_rcu(void) __rcu_read_unlock(); } +/* + * Dump the blocked-tasks state, but limit the list dump to the + * specified number of elements. + */ +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) +{ + int cpu; + int i; + struct list_head *lhp; + bool onl; + struct rcu_data *rdp; + struct rcu_node *rnp1; + + raw_lockdep_assert_held_rcu_node(rnp); + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); + pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", + __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + pr_info("%s: ->blkd_tasks", __func__); + i = 0; + list_for_each(lhp, &rnp->blkd_tasks) { + pr_cont(" %p", lhp); + if (++i >= 10) + break; + } + pr_cont("\n"); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + rdp = per_cpu_ptr(rsp->rda, cpu); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", + cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + } +} + #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; @@ -911,7 +972,8 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { WARN_ON_ONCE(rnp->qsmask); } @@ -949,6 +1011,15 @@ void exit_rcu(void) { } +/* + * Dump the guaranteed-empty blocked-tasks state. Trust but verify. + */ +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) +{ + WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST @@ -1433,7 +1504,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if ((rdp->completed != rnp->completed || + if ((rcu_seq_completed_gp(rdp->gp_seq, + rcu_seq_current(&rnp->gp_seq)) || unlikely(READ_ONCE(rdp->gpwrap))) && rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); @@ -1720,16 +1792,16 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) */ touch_nmi_watchdog(); - if (rsp->gpnum == rdp->gpnum) { + ticks_value = rcu_seq_ctr(rsp->gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; - } else { - ticks_title = "GPs behind"; - ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -1817,7 +1889,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { - return &rnp->nocb_gp_wq[rnp->completed & 0x1]; + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; } static void rcu_init_one_nocb(struct rcu_node *rnp) @@ -1854,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ - swake_up(&rdp_leader->nocb_wq); + smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ + swake_up_one(&rdp_leader->nocb_wq); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } @@ -2069,12 +2141,17 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - c = rcu_cbs_completed(rdp->rsp, rnp); - needwake = rcu_start_this_gp(rnp, rdp, c); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rdp->rsp); + local_irq_save(flags); + c = rcu_seq_snap(&rdp->rsp->gp_seq); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + local_irq_restore(flags); + } else { + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake = rcu_start_this_gp(rnp, rdp, c); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rdp->rsp); + } /* * Wait for the grace period. Do so interruptibly to avoid messing @@ -2082,9 +2159,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - swait_event_interruptible( - rnp->nocb_gp_wq[c & 0x1], - (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); + swait_event_interruptible_exclusive( + rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], + (d = rcu_seq_done(&rnp->gp_seq, c))); if (likely(d)) break; WARN_ON(signal_pending(current)); @@ -2111,7 +2188,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible_exclusive(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = true; @@ -2176,7 +2253,7 @@ wait_again: raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* List was empty, so wake up the follower. */ - swake_up(&rdp->nocb_wq); + swake_up_one(&rdp->nocb_wq); } } @@ -2193,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); - swait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible_exclusive(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ @@ -2569,23 +2646,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* - * An adaptive-ticks CPU can potentially execute in kernel mode for an - * arbitrarily long period of time with the scheduling-clock tick turned - * off. RCU will be paying attention to this CPU because it is in the - * kernel, but the CPU cannot be guaranteed to be executing the RCU state - * machine because the scheduling-clock tick has been disabled. Therefore, - * if an adaptive-ticks CPU is failing to respond to the current grace - * period and has not be idle from an RCU perspective, kick it. - */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(cpu)) - smp_send_reschedule(cpu); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ -} - -/* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? * The idea is to avoid waking up RCU core processing on such a @@ -2610,8 +2670,6 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { - int __maybe_unused cpu; - if (!tick_nohz_full_enabled()) return; housekeeping_affine(current, HK_FLAG_RCU); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c230a60ece4..39cb23d22109 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -507,14 +507,15 @@ early_initcall(check_cpu_stall_init); #ifdef CONFIG_TASKS_RCU /* - * Simple variant of RCU whose quiescent states are voluntary context switch, - * user-space execution, and idle. As such, grace periods can take one good - * long time. There are no read-side primitives similar to rcu_read_lock() - * and rcu_read_unlock() because this implementation is intended to get - * the system into a safe state for some of the manipulations involved in - * tracing and the like. Finally, this implementation does not support - * high call_rcu_tasks() rates from multiple CPUs. If this is required, - * per-CPU callback lists will be needed. + * Simple variant of RCU whose quiescent states are voluntary context + * switch, cond_resched_rcu_qs(), user-space execution, and idle. + * As such, grace periods can take one good long time. There are no + * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() + * because this implementation is intended to get the system into a safe + * state for some of the manipulations involved in tracing and the like. + * Finally, this implementation does not support high call_rcu_tasks() + * rates from multiple CPUs. If this is required, per-CPU callback lists + * will be needed. */ /* Global list of callbacks and associated lock. */ @@ -542,11 +543,11 @@ static struct task_struct *rcu_tasks_kthread_ptr; * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. call_rcu_tasks() assumes * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), entry into idle, or transition to usermode - * execution. As such, there are no read-side primitives analogous to - * rcu_read_lock() and rcu_read_unlock() because this primitive is intended - * to determine that all tasks have passed through a safe state, not so - * much for data-strcuture synchronization. + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. @@ -667,6 +668,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -748,13 +750,25 @@ static int __noreturn rcu_tasks_kthread(void *arg) * holdouts. When the list is empty, we are done. */ lastreport = jiffies; - while (!list_empty(&rcu_tasks_holdouts)) { + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { bool firstreport; bool needreport; int rtst; struct task_struct *t1; - schedule_timeout_interruptible(HZ); + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); @@ -800,6 +814,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) list = next; cond_resched(); } + /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_uninterruptible(HZ/10); } } diff --git a/kernel/reboot.c b/kernel/reboot.c index e4ced883d8de..8fb44dec9ad7 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -294,7 +294,7 @@ void kernel_power_off(void) } EXPORT_SYMBOL_GPL(kernel_power_off); -static DEFINE_MUTEX(reboot_mutex); +DEFINE_MUTEX(system_transition_mutex); /* * Reboot system call: for obvious reasons only root may call it, @@ -338,7 +338,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) cmd = LINUX_REBOOT_CMD_HALT; - mutex_lock(&reboot_mutex); + mutex_lock(&system_transition_mutex); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: kernel_restart(NULL); @@ -389,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, ret = -EINVAL; break; } - mutex_unlock(&reboot_mutex); + mutex_unlock(&system_transition_mutex); return ret; } diff --git a/kernel/rseq.c b/kernel/rseq.c index 22b6acf1ad63..c6242d8594dc 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t) { u32 cpu_id = raw_smp_processor_id(); - if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + if (put_user(cpu_id, &t->rseq->cpu_id_start)) return -EFAULT; - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; trace_rseq_update(t); return 0; @@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) /* * Reset cpu_id_start to its initial state (0). */ - if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; return 0; } @@ -115,29 +115,36 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; - unsigned long ptr; + u64 ptr; u32 __user *usig; u32 sig; int ret; - ret = __get_user(ptr, &t->rseq->rseq_cs); - if (ret) - return ret; + if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + return -EFAULT; if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } - urseq_cs = (struct rseq_cs __user *)ptr; + if (ptr >= TASK_SIZE) + return -EINVAL; + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; - if (rseq_cs->version > 0) - return -EINVAL; + if (rseq_cs->start_ip >= TASK_SIZE || + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || + rseq_cs->abort_ip >= TASK_SIZE || + rseq_cs->version > 0) + return -EINVAL; + /* Check for overflow. */ + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) + return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; - usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; @@ -146,7 +153,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); - return -EPERM; + return -EINVAL; } return 0; } @@ -157,7 +164,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) int ret; /* Get thread flags. */ - ret = __get_user(flags, &t->rseq->flags); + ret = get_user(flags, &t->rseq->flags); if (ret) return ret; @@ -195,9 +202,11 @@ static int clear_rseq_cs(struct task_struct *t) * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * - * Set rseq_cs to NULL with single-copy atomicity. + * Set rseq_cs to NULL. */ - return __put_user(0UL, &t->rseq->rseq_cs); + if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + return -EFAULT; + return 0; } /* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index d9a02b318108..7fe183404c38 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 10c83e73837a..e3e3b979f9bd 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,6 +53,7 @@ * */ #include "sched.h" +#include <linux/sched_clock.h> /* * Scheduler clock - returns current time in nanosec units. @@ -66,12 +67,7 @@ unsigned long long __weak sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -__read_mostly int sched_clock_running; - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} +static DEFINE_STATIC_KEY_FALSE(sched_clock_running); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* @@ -195,17 +191,40 @@ void clear_sched_clock_stable(void) smp_mb(); /* matches sched_clock_init_late() */ - if (sched_clock_running == 2) + if (static_key_count(&sched_clock_running.key) == 2) __clear_sched_clock_stable(); } +static void __sched_clock_gtod_offset(void) +{ + struct sched_clock_data *scd = this_scd(); + + __scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; +} + +void __init sched_clock_init(void) +{ + /* + * Set __gtod_offset such that once we mark sched_clock_running, + * sched_clock_tick() continues where sched_clock() left off. + * + * Even if TSC is buggered, we're still UP at this point so it + * can't really be out of sync. + */ + local_irq_disable(); + __sched_clock_gtod_offset(); + local_irq_enable(); + + static_branch_inc(&sched_clock_running); +} /* * We run this as late_initcall() such that it runs after all built-in drivers, * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. */ static int __init sched_clock_init_late(void) { - sched_clock_running = 2; + static_branch_inc(&sched_clock_running); /* * Ensure that it is impossible to not do a static_key update. * @@ -350,8 +369,8 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (unlikely(!sched_clock_running)) - return 0ull; + if (!static_branch_unlikely(&sched_clock_running)) + return sched_clock(); preempt_disable_notrace(); scd = cpu_sdc(cpu); @@ -373,7 +392,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -385,8 +404,6 @@ void sched_clock_tick(void) void sched_clock_tick_stable(void) { - u64 gtod, clock; - if (!sched_clock_stable()) return; @@ -398,9 +415,7 @@ void sched_clock_tick_stable(void) * TSC to be unstable, any computation will be computing crap. */ local_irq_disable(); - gtod = ktime_get_ns(); - clock = sched_clock(); - __gtod_offset = (clock + __sched_clock_offset) - gtod; + __sched_clock_gtod_offset(); local_irq_enable(); } @@ -434,9 +449,17 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +void __init sched_clock_init(void) +{ + static_branch_inc(&sched_clock_running); + local_irq_disable(); + generic_sched_clock_init(); + local_irq_enable(); +} + u64 sched_clock_cpu(int cpu) { - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return 0; return sched_clock(); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index e426b0cb9ac6..a1ad5b7d5521 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -22,8 +22,8 @@ * * See also complete_all(), wait_for_completion() and related routines. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void complete(struct completion *x) { @@ -44,8 +44,8 @@ EXPORT_SYMBOL(complete); * * This will wake up all threads waiting on this particular completion event. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. * * Since complete_all() sets the completion of @x permanently to done * to allow multiple waiters to finish, a call to reinit_completion() diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..39af2bec2b39 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,7 +7,6 @@ */ #include "sched.h" -#include <linux/kthread.h> #include <linux/nospec.h> #include <linux/kcov.h> @@ -18,6 +17,8 @@ #include "../workqueue_internal.h" #include "../smpboot.h" +#include "pelt.h" + #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -46,14 +47,6 @@ const_debug unsigned int sysctl_sched_features = const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we average the RT time consumption, measured - * in ms. - * - * default: 1s - */ -const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; - -/* * period over which we measure -rt task CPU usage in us. * default: 1s */ @@ -184,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) - sched_rt_avg_update(rq, irq_delta + steal); + update_irq_load_avg(rq, irq_delta + steal); #endif } @@ -413,8 +406,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * its already queued (either by us or someone else) and will get the * wakeup due to that. * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). + * This cmpxchg() executes a full barrier, which pairs with the full + * barrier executed by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -442,8 +435,8 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing - * in wake_q_add() so as not to miss wakeups. + * wake_up_process() executes a full barrier, which pairs with + * the queueing in wake_q_add() so as not to miss wakeups. */ wake_up_process(task); put_task_struct(task); @@ -650,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ - -void sched_avg_update(struct rq *rq) -{ - s64 period = sched_avg_period(); - - while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (rq->age_stamp)); - rq->age_stamp += period; - rq->rt_avg /= 2; - } -} - #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -1200,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { @@ -1281,16 +1258,17 @@ unlock: /* * Cross migrate two tasks */ -int migrate_swap(struct task_struct *cur, struct task_struct *p) +int migrate_swap(struct task_struct *cur, struct task_struct *p, + int target_cpu, int curr_cpu) { struct migration_swap_arg arg; int ret = -EINVAL; arg = (struct migration_swap_arg){ .src_task = cur, - .src_cpu = task_cpu(cur), + .src_cpu = curr_cpu, .dst_task = p, - .dst_cpu = task_cpu(p), + .dst_cpu = target_cpu, }; if (arg.src_cpu == arg.dst_cpu) @@ -1315,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) out: return ret; } +#endif /* CONFIG_NUMA_BALANCING */ /* * wait_task_inactive - wait for a thread to unschedule. @@ -1880,8 +1859,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * rq(c1)->lock (if not at the same time, then in that order). * C) LOCK of the rq(c1)->lock scheduling in task * - * Transitivity guarantees that B happens after A and C after B. - * Note: we only require RCpc transitivity. + * Release/acquire chaining guarantees that B happens after A and C after B. * Note: the CPU doing B need not be c0 or c1 * * Example: @@ -1943,16 +1921,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * UNLOCK rq(0)->lock * * - * However; for wakeups there is a second guarantee we must provide, namely we - * must observe the state that lead to our wakeup. That is, not only must our - * task observe its own prior state, it must also observe the stores prior to - * its wakeup. - * - * This means that any means of doing remote wakeups must order the CPU doing - * the wakeup against the CPU the task is going to end up running on. This, - * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). - * + * However, for wakeups there is a second guarantee we must provide, namely we + * must ensure that CONDITION=1 done by the caller can not be reordered with + * accesses to the task state; see try_to_wake_up() and set_current_state(). */ /** @@ -1968,6 +1939,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * Atomic against schedule() which would dequeue a task, also see * set_current_state(). * + * This function executes a full memory barrier before accessing the task + * state; see set_current_state(). + * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ @@ -1999,21 +1973,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * be possible to, falsely, observe p->on_rq == 0 and get stuck * in smp_cond_load_acquire() below. * - * sched_ttwu_pending() try_to_wake_up() - * [S] p->on_rq = 1; [L] P->state - * UNLOCK rq->lock -----. - * \ - * +--- RMB - * schedule() / - * LOCK rq->lock -----' - * UNLOCK rq->lock + * sched_ttwu_pending() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * + * __schedule() (switch to task 'p') + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * UNLOCK rq->lock * * [task p] - * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq * - * Pairs with the UNLOCK+LOCK on rq->lock from the - * last wakeup of our task and the schedule that got our task - * current. + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2027,15 +2000,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * One must be running (->on_cpu == 1) in order to remove oneself * from the runqueue. * - * [S] ->on_cpu = 1; [L] ->on_rq - * UNLOCK rq->lock - * RMB - * LOCK rq->lock - * [S] ->on_rq = 0; [L] ->on_cpu + * __schedule() (switch to task 'p') try_to_wake_up() + * STORE p->on_cpu = 1 LOAD p->on_rq + * UNLOCK rq->lock + * + * __schedule() (put 'p' to sleep) + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * STORE p->on_rq = 0 LOAD p->on_cpu * - * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock - * from the consecutive calls to schedule(); the first switching to our - * task, the second putting it to sleep. + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). */ smp_rmb(); @@ -2141,8 +2116,7 @@ out: * * Return: 1 if the process was woken up, 0 if it was already running. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * This function executes a full memory barrier before accessing the task state. */ int wake_up_process(struct task_struct *p) { @@ -2318,7 +2292,6 @@ static inline void init_schedstats(void) {} int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); __sched_fork(clone_flags, p); /* @@ -2354,14 +2327,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) { - put_cpu(); + if (dl_prio(p->prio)) return -EAGAIN; - } else if (rt_prio(p->prio)) { + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; - } else { + else p->sched_class = &fair_sched_class; - } init_entity_runnable_average(&p->se); @@ -2377,7 +2348,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). */ - __set_task_cpu(p, cpu); + __set_task_cpu(p, smp_processor_id()); if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2394,8 +2365,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif - - put_cpu(); return 0; } @@ -2724,28 +2693,20 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { - switch (prev_state) { - case TASK_DEAD: - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - put_task_struct(prev); - break; + /* Task is done with its stack. */ + put_task_stack(prev); - case TASK_PARKED: - kthread_park_complete(prev); - break; - } + put_task_struct(prev); } tick_nohz_task_switch(); @@ -2813,6 +2774,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); + + calculate_sigpending(); } /* @@ -3113,7 +3076,9 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); + struct task_struct *curr; struct rq_flags rf; + u64 delta; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3122,25 +3087,29 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { - struct task_struct *curr; - u64 delta; + if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + goto out_requeue; - rq_lock_irq(rq, &rf); - update_rq_clock(rq); - curr = rq->curr; - delta = rq_clock_task(rq) - curr->se.exec_start; + rq_lock_irq(rq, &rf); + curr = rq->curr; + if (is_idle_task(curr)) + goto out_unlock; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); - curr->sched_class->task_tick(rq, curr, 0); - rq_unlock_irq(rq, &rf); - } + update_rq_clock(rq); + delta = rq_clock_task(rq) - curr->se.exec_start; /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + +out_unlock: + rq_unlock_irq(rq, &rf); + +out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough * to keep scheduler internal stats reasonably up to date. @@ -3192,7 +3161,7 @@ static inline void sched_tick_stop(int cpu) { } #endif #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) + defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count * then we just disabled preemption. Start timing the latency. @@ -5274,7 +5243,7 @@ out_unlock: * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) + struct __kernel_timespec __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5285,16 +5254,16 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return retval; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, - struct compat_timespec __user *, interval) + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); if (retval == 0) - retval = compat_put_timespec64(&t, interval); + retval = put_old_timespec32(&t, interval); return retval; } #endif @@ -5717,13 +5686,6 @@ void set_rq_offline(struct rq *rq) } } -static void set_cpu_rq_start_time(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - rq->age_stamp = sched_clock_cpu(cpu); -} - /* * used to mark begin/end of suspend/resume: */ @@ -5777,6 +5739,18 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; +#ifdef CONFIG_SCHED_SMT + /* + * The sched_smt_present static key needs to be evaluated on every + * hotplug event because at boot time SMT might be disabled when + * the number of booted CPUs is limited. + * + * If then later a sibling gets hotplugged, then the key would stay + * off and SMT scheduling would never be functional. + */ + if (cpumask_weight(cpu_smt_mask(cpu)) > 1) + static_branch_enable_cpuslocked(&sched_smt_present); +#endif set_cpu_active(cpu, true); if (sched_smp_initialized) { @@ -5841,7 +5815,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { - set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; @@ -5874,22 +5847,6 @@ int sched_cpu_dying(unsigned int cpu) } #endif -#ifdef CONFIG_SCHED_SMT -DEFINE_STATIC_KEY_FALSE(sched_smt_present); - -static void sched_init_smt(void) -{ - /* - * We've enumerated all CPUs and will assume that if any CPU - * has SMT siblings, CPU0 will too. - */ - if (cpumask_weight(cpu_smt_mask(0)) > 1) - static_branch_enable(&sched_smt_present); -} -#else -static inline void sched_init_smt(void) { } -#endif - void __init sched_init_smp(void) { sched_init_numa(); @@ -5911,8 +5868,6 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); - sched_init_smt(); - sched_smp_initialized = true; } @@ -5957,7 +5912,6 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; - sched_clock_init(); wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6109,7 +6063,6 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); @@ -6788,6 +6741,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); + if (schedstat_enabled() && tg != &root_task_group) { + u64 ws = 0; + int i; + + for_each_possible_cpu(i) + ws += schedstat_val(tg->se[i]->statistics.wait_sum); + + seq_printf(sf, "wait_sum %llu\n", ws); + } + return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3cde46483f0a..3fffad3bc8a8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -53,9 +53,7 @@ struct sugov_cpu { unsigned int iowait_boost_max; u64 last_update; - /* The fields below are only needed when sharing a policy: */ - unsigned long util_cfs; - unsigned long util_dl; + unsigned long bw_dl; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util, irq, max; - sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->util_cfs = cpu_util_cfs(rq); - sg_cpu->util_dl = cpu_util_dl(rq); -} + sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); -static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) -{ - struct rq *rq = cpu_rq(sg_cpu->cpu); + if (rt_rq_is_runnable(&rq->rt)) + return max; - if (rq->rt.rt_nr_running) - return sg_cpu->max; + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= max)) + return max; + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ + util = cpu_util_cfs(rq); + util += cpu_util_rt(rq); + + /* + * We do not make cpu_util_dl() a permanent part of this sum because we + * want to use cpu_bw_dl() later on, but we need to check if the + * CFS+RT+DL sum is saturated (ie. no idle time) such that we select + * f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. + */ + if ((util + cpu_util_dl(rq)) >= max) + return max; + + /* + * There is still idle time; further improve the number by using the + * irq metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * 1 - irq + * U' = irq + ------- * U + * max + */ + util = scale_irq_capacity(util, irq, max); + util += irq; /* - * Utilization required by DEADLINE must always be granted while, for - * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to - * gracefully reduce the frequency when no tasks show up for longer + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * - * Ideally we would like to set util_dl as min/guaranteed freq and - * util_cfs + util_dl as requested freq. However, cpufreq is not yet - * ready for such an interface. So, we only do the latter for now. + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. */ - return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); + return min(max, util + sg_cpu->bw_dl); } /** @@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) { - if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) sg_policy->need_freq_update = true; } @@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - sugov_get_util(sg_cpu); + util = sugov_get_util(sg_cpu); max = sg_cpu->max; - util = sugov_aggregate_util(sg_cpu); sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - sugov_get_util(j_sg_cpu); + j_util = sugov_get_util(j_sg_cpu); j_max = j_sg_cpu->max; - j_util = sugov_aggregate_util(j_sg_cpu); sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); if (j_util * max > j_max * util) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fbfc3f1d368a..997ea7b839fa 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ * Fabio Checconi <fchecconi@gmail.com> */ #include "sched.h" +#include "pelt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (dl_entity_is_special(dl_se)) return; @@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. In that case NEED_RESCHED will @@ -2090,8 +2094,14 @@ retry: sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); add_rq_bw(&next_task->dl, &later_rq->dl); + + /* + * Update the later_rq clock here, because the clock is used + * by the cpufreq_update_util() inside __add_running_bw(). + */ + update_rq_clock(later_rq); add_running_bw(&next_task->dl, &later_rq->dl); - activate_task(later_rq, next_task, 0); + activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); ret = 1; resched_curr(later_rq); @@ -2290,8 +2300,17 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + /* + * Inactive timer is armed. However, p is leaving DEADLINE and + * might migrate away from this rq while continuing to run on + * some other class. We need to remove its contribution from + * this rq running_bw now, or sub_rq_bw (below) will complain. + */ + if (p->dl.dl_non_contending) + sub_running_bw(&p->dl, &rq->dl); sub_rq_bw(&p->dl, &rq->dl); + } /* * We cannot use inactive_task_timer() to invoke sub_running_bw() diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e593b4118578..60caf1fb94e0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp) cmp += 3; } - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } + i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp); + if (i < 0) + return i; + + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); } - return i; + return 0; } static ssize_t @@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, { char buf[64]; char *cmp; - int i; + int ret; struct inode *inode; if (cnt > 63) @@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); inode_lock(inode); - i = sched_feat_set(cmp); + ret = sched_feat_set(cmp); inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; + if (ret < 0) + return ret; *ppos += cnt; @@ -623,8 +622,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) #undef PU } -extern __read_mostly int sched_clock_running; - static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); @@ -843,8 +840,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf) { SEQ_printf(m, "numa_faults node=%d ", node); - SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); - SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); + SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf); + SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf); } #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..b39fb596f6c1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return container_of(cfs_rq, struct rq, cfs); } -#define entity_is_task(se) 1 #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP - +#include "pelt.h" #include "sched-pelt.h" static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); @@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * - * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * - * where n denotes the nth task. + * where n denotes the nth task and cpu_scale the CPU capacity. * - * For example, a simplest series from the beginning would be like: + * For example, for a CPU with 1024 of capacity, a simplest series from + * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... @@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * of each group. Skip other nodes. */ if (sched_numa_topology_type == NUMA_BACKPLANE && - dist > maxdist) + dist >= maxdist) continue; /* Add up the faults from nearby nodes. */ @@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { - unsigned long nr_running; unsigned long load; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long task_capacity; - int has_free_capacity; + unsigned int nr_running; }; /* @@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_free_capacity, or we'll detect a huge - * imbalance and bail there. + * We'll detect a huge imbalance and bail there. */ if (!cpus) return; @@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); capacity = cpus / smt; /* cores */ - ns->task_capacity = min_t(unsigned, capacity, + capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); - ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { @@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load, src_capacity = env->src_stats.compute_capacity; dst_capacity = env->dst_stats.compute_capacity; - /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + imb = abs(dst_load * src_capacity - src_load * dst_capacity); - /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; - if (imb <= 0) - return false; - - /* - * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. - */ orig_src_load = env->src_stats.load; orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); /* Would this change make things worse? */ return (imb > old_imb); @@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load, * be exchanged with the source task */ static void task_numa_compare(struct task_numa_env *env, - long taskimp, long groupimp) + long taskimp, long groupimp, bool maymove) { - struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; long src_load, dst_load; @@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env, if (cur == env->p) goto unlock; + if (!cur) { + if (maymove || imp > env->best_imp) + goto assign; + else + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative - * the value is, the more rmeote accesses that would be expected to + * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. */ - if (cur) { - /* Skip this swap candidate if cannot move to the source CPU: */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) - goto unlock; + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) + goto unlock; + /* + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. + */ + if (cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* - * If dst and source tasks are in the same NUMA group, or not - * in any group then look only at task weights. + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. */ - if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - /* - * Add some hysteresis to prevent swapping the - * tasks within a group over tiny differences. - */ - if (cur->numa_group) - imp -= imp/16; - } else { - /* - * Compare the group weights. If a task is all by - * itself (not part of a group), use the task weight - * instead. - */ - if (cur->numa_group) - imp += group_weight(cur, env->src_nid, dist) - - group_weight(cur, env->dst_nid, dist); - else - imp += task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - } + if (cur->numa_group) + imp -= imp / 16; + } else { + /* + * Compare the group weights. If a task is all by itself + * (not part of a group), use the task weight instead. + */ + if (cur->numa_group && env->p->numa_group) + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); + else + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp && moveimp <= env->best_imp) + if (imp <= env->best_imp) goto unlock; - if (!cur) { - /* Is there capacity at our destination? */ - if (env->src_stats.nr_running <= env->src_stats.task_capacity && - !env->dst_stats.has_free_capacity) - goto unlock; - - goto balance; - } - - /* Balance doesn't matter much if we're running a task per CPU: */ - if (imp > env->best_imp && src_rq->nr_running == 1 && - dst_rq->nr_running == 1) + if (maymove && moveimp > imp && moveimp > env->best_imp) { + imp = moveimp - 1; + cur = NULL; goto assign; + } /* * In the overloaded case, try and keep the load balanced. */ -balance: - load = task_h_load(env->p); + load = task_h_load(env->p) - task_h_load(cur); + if (!load) + goto assign; + dst_load = env->dst_stats.load + load; src_load = env->src_stats.load - load; - if (moveimp > imp && moveimp > env->best_imp) { - /* - * If the improvement from just moving env->p direction is - * better than swapping tasks around, check if a move is - * possible. Store a slightly smaller score than moveimp, - * so an actually idle CPU will win. - */ - if (!load_too_imbalanced(src_load, dst_load, env)) { - imp = moveimp - 1; - cur = NULL; - goto assign; - } - } - - if (imp <= env->best_imp) - goto unlock; - - if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; - } - if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; +assign: /* * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. @@ -1711,7 +1663,6 @@ balance: local_irq_enable(); } -assign: task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); @@ -1720,43 +1671,30 @@ unlock: static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { + long src_load, dst_load, load; + bool maymove = false; int cpu; + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. + */ + maymove = !load_too_imbalanced(src_load, dst_load, env); + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp); + task_numa_compare(env, taskimp, groupimp, maymove); } } -/* Only move tasks to a NUMA node less busy than the current node. */ -static bool numa_has_capacity(struct task_numa_env *env) -{ - struct numa_stats *src = &env->src_stats; - struct numa_stats *dst = &env->dst_stats; - - if (src->has_free_capacity && !dst->has_free_capacity) - return false; - - /* - * Only consider a task move if the source has a higher load - * than the destination, corrected for CPU capacity on each node. - * - * src->load dst->load - * --------------------- vs --------------------- - * src->compute_capacity dst->compute_capacity - */ - if (src->load * dst->compute_capacity * env->imbalance_pct > - - dst->load * src->compute_capacity * 100) - return true; - - return false; -} - static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p) * elsewhere, so there is no point in (re)trying. */ if (unlikely(!sd)) { - p->numa_preferred_nid = task_node(p); + sched_setnuma(p, task_node(p)); return -EINVAL; } @@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p) update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); /* * Look at other nodes in these cases: @@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { - struct numa_group *ng = p->numa_group; - if (env.best_cpu == -1) nid = env.src_nid; else - nid = env.dst_nid; + nid = cpu_to_node(env.best_cpu); - if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) - sched_setnuma(p, env.dst_nid); + if (nid != p->numa_preferred_nid) + sched_setnuma(p, nid); } /* No better CPU than the current one was found. */ @@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p) return ret; } - ret = migrate_swap(p, env.best_task); + ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); @@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1, max_group_nid = -1; - unsigned long max_faults = 0, max_group_faults = 0; + int seq, nid, max_nid = -1; + unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; u64 runtime, period; @@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p) } } - if (faults > max_faults) { - max_faults = faults; + if (!p->numa_group) { + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + } else if (group_faults > max_faults) { + max_faults = group_faults; max_nid = nid; } - - if (group_faults > max_group_faults) { - max_group_faults = group_faults; - max_group_nid = nid; - } } - update_task_scan_period(p, fault_types[0], fault_types[1]); - if (p->numa_group) { numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); - max_nid = preferred_group_nid(p, max_group_nid); + max_nid = preferred_group_nid(p, max_nid); } if (max_faults) { /* Set the new preferred node */ if (max_nid != p->numa_preferred_nid) sched_setnuma(p, max_nid); - - if (task_node(p) != p->numa_preferred_nid) - numa_migrate_preferred(p); } + + update_task_scan_period(p, fault_types[0], fault_types[1]); } static inline int get_numa_group(struct numa_group *grp) @@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) numa_is_active_node(mem_node, ng)) local = 1; - task_numa_placement(p); - /* * Retry task to preferred node migration periodically, in case it * case it previously failed, or the scheduler moved us. */ - if (time_after(jiffies, p->numa_migrate_retry)) + if (time_after(jiffies, p->numa_migrate_retry)) { + task_numa_placement(p); numa_migrate_preferred(p); + } if (migrated) p->numa_pages_migrated += pages; @@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) #ifdef CONFIG_SMP -/* - * XXX we want to get rid of these helpers and use the full load resolution. - */ -static inline long se_weight(struct sched_entity *se) -{ - return scale_load_down(se->load.weight); -} - -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} - static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } #ifdef CONFIG_SMP -/* - * Approximate: - * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) - */ -static u64 decay_load(u64 val, u64 n) -{ - unsigned int local_n; - - if (unlikely(n > LOAD_AVG_PERIOD * 63)) - return 0; - - /* after bounds checking we can collapse to 32-bit */ - local_n = n; - - /* - * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) - * With a look-up table which covers y^n (n<PERIOD) - * - * To achieve constant time decay_load. - */ - if (unlikely(local_n >= LOAD_AVG_PERIOD)) { - val >>= local_n / LOAD_AVG_PERIOD; - local_n %= LOAD_AVG_PERIOD; - } - - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); - return val; -} - -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) -{ - u32 c1, c2, c3 = d3; /* y^0 == 1 */ - - /* - * c1 = d1 y^p - */ - c1 = decay_load((u64)d1, periods); - - /* - * p-1 - * c2 = 1024 \Sum y^n - * n=1 - * - * inf inf - * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) - * n=0 n=p - */ - c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; - - return c1 + c2 + c3; -} - -/* - * Accumulate the three separate parts of the sum; d1 the remainder - * of the last (incomplete) period, d2 the span of full periods and d3 - * the remainder of the (incomplete) current period. - * - * d1 d2 d3 - * ^ ^ ^ - * | | | - * |<->|<----------------->|<--->| - * ... |---x---|------| ... |------|-----x (now) - * - * p-1 - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 - * n=1 - * - * = u y^p + (Step 1) - * - * p-1 - * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) - * n=1 - */ -static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - unsigned long scale_freq, scale_cpu; - u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ - u64 periods; - - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - delta += sa->period_contrib; - periods = delta / 1024; /* A period is 1024us (~1ms) */ - - /* - * Step 1: decay old *_sum if we crossed period boundaries. - */ - if (periods) { - sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); - sa->util_sum = decay_load((u64)(sa->util_sum), periods); - - /* - * Step 2 - */ - delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); - } - sa->period_contrib = delta; - - contrib = cap_scale(contrib, scale_freq); - if (load) - sa->load_sum += load * contrib; - if (runnable) - sa->runnable_load_sum += runnable * contrib; - if (running) - sa->util_sum += contrib * scale_cpu; - - return periods; -} - -/* - * We can represent the historical contribution to runnable average as the - * coefficients of a geometric series. To do this we sub-divide our runnable - * history into segments of approximately 1ms (1024us); label the segment that - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. - * - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... - * p0 p1 p2 - * (now) (~1ms ago) (~2ms ago) - * - * Let u_i denote the fraction of p_i that the entity was runnable. - * - * We then designate the fractions u_i as our co-efficients, yielding the - * following representation of historical load: - * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... - * - * We choose y based on the with of a reasonably scheduling period, fixing: - * y^32 = 0.5 - * - * This means that the contribution to load ~32ms ago (u_32) will be weighted - * approximately half as much as the contribution to load within the last ms - * (u_0). - * - * When a period "rolls over" and we have new u_0`, multiplying the previous - * sum again by y is sufficient to update: - * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) - * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] - */ -static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - u64 delta; - - delta = now - sa->last_update_time; - /* - * This should only happen when time goes backwards, which it - * unfortunately does during sched clock init when we swap over to TSC. - */ - if ((s64)delta < 0) { - sa->last_update_time = now; - return 0; - } - - /* - * Use 1024ns as the unit of measurement since it's a reasonable - * approximation of 1us and fast to compute. - */ - delta >>= 10; - if (!delta) - return 0; - - sa->last_update_time += delta << 10; - - /* - * running is a subset of runnable (weight) so running can't be set if - * runnable is clear. But there are some corner cases where the current - * se has been already dequeued but cfs_rq->curr still points to it. - * This means that weight will be 0 but not running for a sched_entity - * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls - * update_blocked_averages() - */ - if (!load) - runnable = running = 0; - - /* - * Now we know we crossed measurement unit boundaries. The *_avg - * accrues by two steps: - * - * Step 1: accumulate *_sum since last_update_time. If we haven't - * crossed period boundaries, finish. - */ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) - return 0; - - return 1; -} - -static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) -{ - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; - - /* - * Step 2: update *_avg. - */ - sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); - sa->util_avg = sa->util_sum / divider; -} - -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 - -static inline void cfs_se_util_change(struct sched_avg *avg) -{ - unsigned int enqueued; - - if (!sched_feat(UTIL_EST)) - return; - - /* Avoid store if the flag has been already set */ - enqueued = avg->util_est.enqueued; - if (!(enqueued & UTIL_AVG_UNCHANGED)) - return; - - /* Reset flag to report util_avg has been updated */ - enqueued &= ~UTIL_AVG_UNCHANGED; - WRITE_ONCE(avg->util_est.enqueued, enqueued); -} - -/* - * sched_entity: - * - * task: - * se_runnable() == se_weight() - * - * group: [ see update_cfs_group() ] - * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg - * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg - * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum - * - * cfq_rs: - * - * load_sum = \Sum se_weight(se) * se->avg.load_sum - * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg - */ - -static int -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - return 1; - } - - return 0; -} - -static int -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, - cfs_rq->curr == se)) { - - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - cfs_se_util_change(&se->avg); - return 1; - } - - return 0; -} - -static int -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) -{ - if (___update_load_sum(now, cpu, &cfs_rq->avg, - scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), - cfs_rq->curr != NULL)) { - - ___update_load_avg(&cfs_rq->avg, 1, 1); - return 1; - } - - return 0; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -3982,18 +3593,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!sched_feat(UTIL_EST)) return; - /* - * Update root cfs_rq's estimated utilization - * - * If *p is the last task then the root cfs_rq's estimated utilization - * of a CPU is 0 by definition. - */ - ue.enqueued = 0; - if (cfs_rq->nr_running) { - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - } + /* Update root cfs_rq's estimated utilization */ + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); /* @@ -4045,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) #else /* CONFIG_SMP */ -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 @@ -4590,6 +4187,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4612,6 +4210,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); u64 amount = 0, min_amount, expires; + int expires_seq; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4628,6 +4227,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires_seq = cfs_b->expires_seq; expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -4637,8 +4237,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) * spread between our sched_clock and the one on which runtime was * issued. */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -4664,12 +4266,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -4732,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } -/* updated child weight may affect parent so we have to do this bottom up */ static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; @@ -5202,13 +4800,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + u64 overrun; + lockdep_assert_held(&cfs_b->lock); - if (!cfs_b->period_active) { - cfs_b->period_active = 1; - hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); - } + if (cfs_b->period_active) + return; + + cfs_b->period_active = 1; + overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5654,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } - - sched_avg_update(this_rq); } /* Used instead of source_load when we know the type == 0 */ @@ -6238,6 +5839,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p } #ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); static inline void set_idle_cores(int cpu, int val) { @@ -7295,8 +6897,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env) static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) return -1; @@ -7323,18 +6925,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return 0; /* Leaving a core idle is often worse than degrading locality. */ - if (env->idle != CPU_NOT_IDLE) + if (env->idle == CPU_IDLE) return -1; + dist = node_distance(src_nid, dst_nid); if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); } - return dst_faults < src_faults; + return dst_weight < src_weight; } #else @@ -7621,6 +7224,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } +static inline bool others_have_blocked(struct rq *rq) +{ + if (READ_ONCE(rq->avg_rt.util_avg)) + return true; + + if (READ_ONCE(rq->avg_dl.util_avg)) + return true; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if (READ_ONCE(rq->avg_irq.util_avg)) + return true; +#endif + + return false; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7680,6 +7299,12 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); + /* Don't need periodic decay once load/util_avg are null */ + if (others_have_blocked(rq)) + done = false; #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7745,9 +7370,12 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); @@ -7857,39 +7485,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, used, age_stamp, avg; - s64 delta; + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long used, free; + unsigned long irq; - /* - * Since we're reading these variables without serialization make sure - * we read them once before doing sanity checks on them. - */ - age_stamp = READ_ONCE(rq->age_stamp); - avg = READ_ONCE(rq->rt_avg); - delta = __rq_clock_broken(rq) - age_stamp; + irq = cpu_util_irq(rq); - if (unlikely(delta < 0)) - delta = 0; + if (unlikely(irq >= max)) + return 1; - total = sched_avg_period() + delta; + used = READ_ONCE(rq->avg_rt.util_avg); + used += READ_ONCE(rq->avg_dl.util_avg); - used = div_u64(avg, total); + if (unlikely(used >= max)) + return 1; - if (likely(used < SCHED_CAPACITY_SCALE)) - return SCHED_CAPACITY_SCALE - used; + free = max - used; - return 1; + return scale_irq_capacity(free, irq, max); } static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = capacity; - - capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); if (!capacity) capacity = 1; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1a3e9bddd17b..16f84142f2f4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -190,7 +190,7 @@ static void cpuidle_idle_call(void) */ next_state = cpuidle_select(drv, dev, &stop_tick); - if (stop_tick) + if (stop_tick || tick_nohz_tick_stopped()) tick_nohz_idle_stop_tick(); else tick_nohz_idle_retain_tick(); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c new file mode 100644 index 000000000000..35475c0c5419 --- /dev/null +++ b/kernel/sched/pelt.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Per Entity Load Tracking + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith <efault@gmx.de> + * + * Various enhancements by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Move PELT related code from fair.c into this pelt.c file + * Author: Vincent Guittot <vincent.guittot@linaro.org> + */ + +#include <linux/sched.h> +#include "sched.h" +#include "sched-pelt.h" +#include "pelt.h" + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static u64 decay_load(u64 val, u64 n) +{ + unsigned int local_n; + + if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; + } + + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; +} + +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 + * + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; + + scale_freq = arch_scale_freq_capacity(cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int +___update_load_sum(u64 now, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + u64 delta; + + delta = now - sa->last_update_time; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_update_time = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + + sa->last_update_time += delta << 10; + + /* + * running is a subset of runnable (weight) so running can't be set if + * runnable is clear. But there are some corner cases where the current + * se has been already dequeued but cfs_rq->curr still points to it. + * This means that weight will be 0 but not running for a sched_entity + * but also for a cfs_rq if the latter becomes idle. As an example, + * this happens during idle_balance() which calls + * update_blocked_averages() + */ + if (!load) + runnable = running = 0; + + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + return 0; + + return 1; +} + +static __always_inline void +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +{ + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); +} + +/* + * sched_entity: + * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * + * load_sum := runnable_sum + * load_avg = se_weight(se) * runnable_avg + * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * + * cfq_rq: + * + * load_sum = \Sum se_weight(se) * se->avg.load_sum + * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg + */ + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + return 1; + } + + return 0; +} + +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { + + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + cfs_se_util_change(&se->avg); + return 1; + } + + return 0; +} + +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + if (___update_load_sum(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); + return 1; + } + + return 0; +} + +/* + * rt_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + * load_avg and runnable_load_avg are not supported and meaningless. + * + */ + +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_rt, 1, 1); + return 1; + } + + return 0; +} + +/* + * dl_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + */ + +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_dl, 1, 1); + return 1; + } + + return 0; +} + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +/* + * irq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + */ + +int update_irq_load_avg(struct rq *rq, u64 running) +{ + int ret = 0; + /* + * We know the time that has been used by interrupt since last update + * but we don't when. Let be pessimistic and assume that interrupt has + * happened just before the update. This is not so far from reality + * because interrupt will most probably wake up task and trig an update + * of rq clock during which the metric si updated. + * We start to decay with normal context time and then we add the + * interrupt context time. + * We can safely remove running from rq->clock because + * rq->clock += delta with delta >= running + */ + ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + 0, + 0, + 0); + ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + 1, + 1, + 1); + + if (ret) + ___update_load_avg(&rq->avg_irq, 1, 1); + + return ret; +} +#endif diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h new file mode 100644 index 000000000000..d2894db28955 --- /dev/null +++ b/kernel/sched/pelt.h @@ -0,0 +1,72 @@ +#ifdef CONFIG_SMP + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +int update_irq_load_avg(struct rq *rq, u64 running); +#else +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} +#endif + +/* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. + * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.dequeued). + */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + +#else + +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int +update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} + +static inline int +update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} + +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} +#endif + + diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 47556b0c9a95..2e2955a8cf8f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,6 +5,8 @@ */ #include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -508,8 +510,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (!rt_se) + if (!rt_se) { dequeue_top_rt_rq(rt_rq); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); + } else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se, 0); } @@ -833,6 +838,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * can be time-consuming. Try to avoid it when possible. */ raw_spin_lock(&rt_rq->rt_runtime_lock); + if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) + rt_rq->rt_runtime = rt_b->rt_runtime; skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; raw_spin_unlock(&rt_rq->rt_runtime_lock); if (skip) @@ -968,8 +975,6 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (!rt_bandwidth_enabled()) return; @@ -1001,8 +1006,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, 0); } static void @@ -1014,11 +1017,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq->rt_queued) return; - if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + + if (rt_rq_throttled(rt_rq)) return; - add_nr_running(rq, rt_rq->rt_nr_running); - rt_rq->rt_queued = 1; + if (rt_rq->rt_nr_running) { + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; + } /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq, 0); @@ -1572,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) rt_queue_push_tasks(rq); + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1579,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + /* * The previous task needs to be made eligible for pushing * if it is still active @@ -2308,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6601baf2361c..4a2e8cae63c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -334,9 +334,10 @@ struct cfs_bandwidth { u64 runtime; s64 hierarchical_quota; u64 runtime_expires; + int expires_seq; - int idle; - int period_active; + short idle; + short period_active; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -551,6 +552,7 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + int expires_seq; u64 runtime_expires; s64 runtime_remaining; @@ -592,6 +594,7 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; + #endif /* CONFIG_SMP */ int rt_queued; @@ -609,6 +612,11 @@ struct rt_rq { #endif }; +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ @@ -666,7 +674,26 @@ struct dl_rq { u64 bw_ratio; }; +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) +#else +#define entity_is_task(se) 1 +#endif + #ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} static inline bool sched_asym_prefer(int a, int b) { @@ -826,8 +853,12 @@ struct rq { struct list_head cfs_tasks; - u64 rt_avg; - u64 age_stamp; + struct sched_avg avg_rt; + struct sched_avg avg_dl; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#define HAVE_SCHED_AVG_IRQ + struct sched_avg avg_irq; +#endif u64 idle_stamp; u64 avg_idle; @@ -1068,7 +1099,8 @@ enum numa_faults_stats { }; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -extern int migrate_swap(struct task_struct *, struct task_struct *); +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); #else static inline void @@ -1683,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); -extern const_debug unsigned int sysctl_sched_time_avg; extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; -static inline u64 sched_avg_period(void) -{ - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; -} - #ifdef CONFIG_SCHED_HRTICK /* @@ -1728,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1740,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif - -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); - sched_avg_update(rq); -} #else #ifndef arch_scale_cpu_capacity static __always_inline @@ -1754,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } -static inline void sched_avg_update(struct rq *rq) { } #endif struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -2170,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -static inline unsigned long cpu_util_dl(struct rq *rq) +static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; } +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + static inline unsigned long cpu_util_cfs(struct rq *rq) { unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); @@ -2186,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) return util; } + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#endif + +#ifdef HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} #endif diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b6fb2c3b3ff7..66b59ac77c22 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); -void swake_up(struct swait_queue_head *q) +void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q) swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(swake_up); +EXPORT_SYMBOL(swake_up_one); /* * Does not allow usage from IRQ disabled, since we must be able to @@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) - list_add(&wait->task_list, &q->task_list); + list_add_tail(&wait->task_list, &q->task_list); } -void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state) { unsigned long flags; @@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int set_current_state(state); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(prepare_to_swait); +EXPORT_SYMBOL(prepare_to_swait_exclusive); long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) { - if (signal_pending_state(state, current)) - return -ERESTARTSYS; + unsigned long flags; + long ret = 0; - prepare_to_swait(q, wait, state); + raw_spin_lock_irqsave(&q->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() + * must not see us. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + __prepare_to_swait(q, wait); + set_current_state(state); + } + raw_spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_swait_event); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a831427bc7..56a0fed30c0a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -47,7 +47,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_span(group))) { + if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 928be527477e..5dd47f1103d1 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -69,6 +69,8 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, wait_queue_entry_t *curr, *next; int cnt = 0; + lockdep_assert_held(&wq_head->lock); + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { curr = list_next_entry(bookmark, entry); @@ -134,8 +136,8 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) @@ -180,8 +182,8 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * * On UP it can prevent extra preemption. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) @@ -392,35 +394,36 @@ static inline bool is_kthread_should_stop(void) * if (condition) * break; * - * p->state = mode; condition = true; - * smp_mb(); // A smp_wmb(); // C - * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; - * schedule() try_to_wake_up(); - * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ - * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; - * smp_mb() // B smp_wmb(); // C - * wq_entry->flags |= WQ_FLAG_WOKEN; - * } - * remove_wait_queue(&wq_head, &wait); + * // in wait_woken() // in woken_wake_function() * + * p->state = mode; wq_entry->flags |= WQ_FLAG_WOKEN; + * smp_mb(); // A try_to_wake_up(): + * if (!(wq_entry->flags & WQ_FLAG_WOKEN)) <full barrier> + * schedule() if (p->state & mode) + * p->state = TASK_RUNNING; p->state = TASK_RUNNING; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; ~~~~~~~~~~~~~~~~~~ + * smp_mb(); // B condition = true; + * } smp_mb(); // C + * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN; */ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) { - set_current_state(mode); /* A */ /* - * The above implies an smp_mb(), which matches with the smp_wmb() from - * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must - * also observe all state before the wakeup. + * The below executes an smp_mb(), which matches with the full barrier + * executed by the try_to_wake_up() in woken_wake_function() such that + * either we see the store to wq_entry->flags in woken_wake_function() + * or woken_wake_function() sees our store to current->state. */ + set_current_state(mode); /* A */ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); /* - * The below implies an smp_mb(), it too pairs with the smp_wmb() from - * woken_wake_function() such that we must either observe the wait - * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss - * an event. + * The below executes an smp_mb(), which matches with the smp_mb() (C) + * in woken_wake_function() such that either we see the wait condition + * being true or the store to wq_entry->flags in woken_wake_function() + * follows ours in the coherence order. */ smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ @@ -430,14 +433,8 @@ EXPORT_SYMBOL(wait_woken); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { - /* - * Although this function is called under waitqueue lock, LOCK - * doesn't imply write barrier and the users expects write - * barrier semantics on wakeup functions. The following - * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with smp_store_mb() in wait_woken(). - */ - smp_wmb(); /* C */ + /* Pairs with the smp_store_mb() in wait_woken(). */ + smp_mb(); /* C */ wq_entry->flags |= WQ_FLAG_WOKEN; return default_wake_function(wq_entry, mode, sync, key); diff --git a/kernel/signal.c b/kernel/signal.c index 8d8a940422a8..0831d56a731a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -65,14 +65,14 @@ static void __user *sig_handler(struct task_struct *t, int sig) return t->sighand->action[sig - 1].sa.sa_handler; } -static int sig_handler_ignored(void __user *handler, int sig) +static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || - (handler == SIG_DFL && sig_kernel_ignore(sig)); + (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, bool force) +static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; @@ -80,12 +80,12 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) - return 1; + return true; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, bool force) +static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -93,7 +93,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) - return 0; + return false; /* * Tracers may want to know about even ignored signal unless it @@ -101,7 +101,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) - return 0; + return false; return sig_task_ignored(t, sig, force); } @@ -110,7 +110,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; @@ -138,20 +138,21 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -static int recalc_sigpending_tsk(struct task_struct *t) +static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); - return 1; + return true; } + /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ - return 0; + return false; } /* @@ -172,6 +173,17 @@ void recalc_sigpending(void) } +void calculate_sigpending(void) +{ + /* Have any signals or users of TIF_SIGPENDING been delayed + * until after fork? + */ + spin_lock_irq(¤t->sighand->siglock); + set_tsk_thread_flag(current, TIF_SIGPENDING); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ @@ -362,6 +374,20 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } +void task_join_group_stop(struct task_struct *task) +{ + /* Have the new thread join an on-going signal group stop */ + unsigned long jobctl = current->jobctl; + if (jobctl & JOBCTL_STOP_PENDING) { + struct signal_struct *sig = current->signal; + unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; + unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; + if (task_set_jobctl_pending(task, signr | gstop)) { + sig->group_stop_count++; + } + } +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an @@ -504,13 +530,15 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } -int unhandled_signal(struct task_struct *tsk, int sig) +bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) - return 1; + return true; + if (handler != SIG_IGN && handler != SIG_DFL) - return 0; + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } @@ -684,14 +712,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * * All callers must be holding the siglock. */ -static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) +static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) - return 0; + return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { @@ -700,7 +728,6 @@ static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) __sigqueue_free(q); } } - return 1; } static inline int is_si_special(const struct siginfo *info) @@ -717,21 +744,16 @@ static inline bool si_fromuser(const struct siginfo *info) /* * called with RCU read lock from check_kill_permission() */ -static int kill_ok_by_cred(struct task_struct *t) +static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); - if (uid_eq(cred->euid, tcred->suid) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->uid, tcred->suid) || - uid_eq(cred->uid, tcred->uid)) - return 1; - - if (ns_capable(tcred->user_ns, CAP_KILL)) - return 1; - - return 0; + return uid_eq(cred->euid, tcred->suid) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->uid, tcred->suid) || + uid_eq(cred->uid, tcred->uid) || + ns_capable(tcred->user_ns, CAP_KILL); } /* @@ -882,20 +904,24 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ -static inline int wants_signal(int sig, struct task_struct *p) +static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) - return 0; + return false; + if (p->flags & PF_EXITING) - return 0; + return false; + if (sig == SIGKILL) - return 1; + return true; + if (task_is_stopped_or_traced(p)) - return 0; + return false; + return task_curr(p) || !signal_pending(p); } -static void complete_signal(int sig, struct task_struct *p, int group) +static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -908,7 +934,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) */ if (wants_signal(sig, p)) t = p; - else if (!group || thread_group_empty(p)) + else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. @@ -971,7 +997,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) return; } -static inline int legacy_queue(struct sigpending *signals, int sig) +static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } @@ -998,7 +1024,7 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str #endif static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group, int from_ancestor_ns) + enum pid_type type, int from_ancestor_ns) { struct sigpending *pending; struct sigqueue *q; @@ -1012,7 +1038,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more @@ -1096,14 +1122,29 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + + /* Let multiprocess signals appear after on-going forks */ + if (type > PIDTYPE_TGID) { + struct multiprocess_signals *delayed; + hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { + sigset_t *signal = &delayed->signal; + /* Can't queue both a stop and a continue signal */ + if (sig == SIGCONT) + sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); + else if (sig_kernel_stop(sig)) + sigdelset(signal, SIGCONT); + sigaddset(signal, sig); + } + } + + complete_signal(sig, t, type); ret: - trace_signal_generate(sig, info, t, group, result); + trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group) + enum pid_type type) { int from_ancestor_ns = 0; @@ -1112,7 +1153,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif - return __send_signal(sig, info, t, group, from_ancestor_ns); + return __send_signal(sig, info, t, type, from_ancestor_ns); } static void print_fatal_signal(int signr) @@ -1151,23 +1192,23 @@ __setup("print-fatal-signals=", setup_print_fatal_signals); int __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - return send_signal(sig, info, p, 1); + return send_signal(sig, info, p, PIDTYPE_TGID); } static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) { - return send_signal(sig, info, t, 0); + return send_signal(sig, info, t, PIDTYPE_PID); } int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, - bool group) + enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, group); + ret = send_signal(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -1274,7 +1315,8 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, /* * send signal info to all the members of a group */ -int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + enum pid_type type) { int ret; @@ -1283,7 +1325,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) rcu_read_unlock(); if (!ret && sig) - ret = do_send_sig_info(sig, info, p, true); + ret = do_send_sig_info(sig, info, p, type); return ret; } @@ -1301,7 +1343,7 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) success = 0; retval = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); success |= !err; retval = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); @@ -1317,7 +1359,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) - error = group_send_sig_info(sig, info, p); + error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; @@ -1339,14 +1381,15 @@ static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -static int kill_as_cred_perm(const struct cred *cred, - struct task_struct *target) +static inline bool kill_as_cred_perm(const struct cred *cred, + struct task_struct *target) { const struct cred *pcred = __task_cred(target); - if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && - !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) - return 0; - return 1; + + return uid_eq(cred->euid, pcred->suid) || + uid_eq(cred->euid, pcred->uid) || + uid_eq(cred->uid, pcred->suid) || + uid_eq(cred->uid, pcred->uid); } /* like kill_pid_info(), but doesn't use uid/euid of "current" */ @@ -1376,7 +1419,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, 1, 0); + ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -1420,7 +1463,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, + PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; @@ -1446,7 +1490,7 @@ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - return do_send_sig_info(sig, info, p, false); + return do_send_sig_info(sig, info, p, PIDTYPE_PID); } #define __si_special(priv) \ @@ -1458,8 +1502,7 @@ send_sig(int sig, struct task_struct *p, int priv) return send_sig_info(sig, __si_special(priv), p); } -void -force_sig(int sig, struct task_struct *p) +void force_sig(int sig, struct task_struct *p) { force_sig_info(sig, SEND_SIG_PRIV, p); } @@ -1470,8 +1513,7 @@ force_sig(int sig, struct task_struct *p) * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ -int -force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig, struct task_struct *p) { if (sig == SIGSEGV) { unsigned long flags; @@ -1480,7 +1522,6 @@ force_sigsegv(int sig, struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } force_sig(SIGSEGV, p); - return 0; } int force_sig_fault(int sig, int code, void __user *addr @@ -1664,17 +1705,20 @@ void sigqueue_free(struct sigqueue *q) __sigqueue_free(q); } -int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) +int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) { int sig = q->info.si_signo; struct sigpending *pending; + struct task_struct *t; unsigned long flags; int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); ret = -1; - if (!likely(lock_task_sighand(t, &flags))) + rcu_read_lock(); + t = pid_task(pid, type); + if (!t || !likely(lock_task_sighand(t, &flags))) goto ret; ret = 1; /* the signal is ignored */ @@ -1696,15 +1740,16 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) q->info.si_overrun = 0; signalfd_notify(t, sig); - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + complete_signal(sig, t, type); result = TRACE_SIGNAL_DELIVERED; out: - trace_signal_generate(sig, &q->info, t, group, result); + trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); ret: + rcu_read_unlock(); return ret; } @@ -1877,10 +1922,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_unlock_irqrestore(&sighand->siglock, flags); } -static inline int may_ptrace_stop(void) +static inline bool may_ptrace_stop(void) { if (!likely(current->ptrace)) - return 0; + return false; /* * Are we in the middle of do_coredump? * If so and our tracer is also part of the coredump stopping @@ -1896,19 +1941,19 @@ static inline int may_ptrace_stop(void) */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) - return 0; + return false; - return 1; + return true; } /* * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ -static int sigkill_pending(struct task_struct *tsk) +static bool sigkill_pending(struct task_struct *tsk) { - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); + return sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } /* @@ -2288,7 +2333,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal(struct ksignal *ksig) +bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2298,7 +2343,7 @@ int get_signal(struct ksignal *ksig) task_work_run(); if (unlikely(uprobe_deny_signal())) - return 0; + return false; /* * Do this once, we can't return to user-mode if freezing() == T. @@ -2755,7 +2800,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, } #endif -static int do_sigpending(sigset_t *set) +static void do_sigpending(sigset_t *set) { spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, @@ -2764,7 +2809,6 @@ static int do_sigpending(sigset_t *set) /* Outside the lock because only this thread touches it. */ sigandsets(set, ¤t->blocked, set); - return 0; } /** @@ -2776,15 +2820,16 @@ static int do_sigpending(sigset_t *set) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sigsetsize)) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sigsetsize)) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT @@ -2792,15 +2837,13 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err) - err = put_compat_sigset(uset, &set, sigsetsize); - return err; + do_sigpending(&set); + + return put_compat_sigset(uset, &set, sigsetsize); } #endif @@ -3039,7 +3082,7 @@ int copy_siginfo_from_user32(struct siginfo *to, * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, - const struct timespec *ts) + const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; @@ -3047,9 +3090,9 @@ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, int sig, ret = 0; if (ts) { - if (!timespec_valid(ts)) + if (!timespec64_valid(ts)) return -EINVAL; - timeout = timespec_to_ktime(*ts); + timeout = timespec64_to_ktime(*ts); to = &timeout; } @@ -3097,11 +3140,12 @@ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, - siginfo_t __user *, uinfo, const struct timespec __user *, uts, + siginfo_t __user *, uinfo, + const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; - struct timespec ts; + struct timespec64 ts; siginfo_t info; int ret; @@ -3113,7 +3157,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (copy_from_user(&ts, uts, sizeof(ts))) + if (get_timespec64(&ts, uts)) return -EFAULT; } @@ -3130,10 +3174,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) + struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; - struct timespec t; + struct timespec64 t; siginfo_t info; long ret; @@ -3144,7 +3188,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (compat_get_timespec(&t, uts)) + if (get_old_timespec32(&t, uts)) return -EFAULT; } @@ -3193,7 +3237,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) * probe. No signal is actually delivered. */ if (!error && sig) { - error = do_send_sig_info(sig, info, p, false); + error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, @@ -3562,25 +3606,26 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; - int err; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sizeof(old_sigset_t))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; - int err = do_sigpending(&set); - if (!err) - err = put_user(set.sig[0], set32); - return err; + + do_sigpending(&set); + + return put_user(set.sig[0], set32); } #endif @@ -3651,25 +3696,23 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; - int ret = -EINVAL; + int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) - goto out; + return -EINVAL; - if (act) { - if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) - return -EFAULT; - } + if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) + return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); + if (ret) + return ret; - if (!ret && oact) { - if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) - return -EFAULT; - } -out: - return ret; + if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, @@ -3960,7 +4003,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, false); + ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", diff --git a/kernel/smp.c b/kernel/smp.c index 084c8b3a2681..d86eec5f51c1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -584,6 +584,8 @@ void __init smp_init(void) num_nodes, (num_nodes > 1 ? "s" : ""), num_cpus, (num_cpus > 1 ? "s" : "")); + /* Final decision about SMT support */ + cpu_smt_check_topology(); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 5043e7433f4b..c230c2dd48e1 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - if (cpumask_test_cpu(cpu, cur->cpumask)) - smpboot_unpark_thread(cur, cpu); + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); return 0; } @@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * smpboot_register_percpu_thread - Register a per_cpu thread related * to hotplug * @plug_thread: Hotplug thread descriptor - * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, - const struct cpumask *cpumask) +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) { unsigned int cpu; int ret = 0; - if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) - return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpumask); - get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); - free_cpumask_var(plug_thread->cpumask); goto out; } - if (cpumask_test_cpu(cpu, cpumask)) - smpboot_unpark_thread(plug_thread, cpu); + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -315,7 +306,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug @@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); - free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); -/** - * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked - * @plug_thread: Hotplug thread descriptor - * @new: Revised mask to use - * - * The cpumask field in the smp_hotplug_thread must not be updated directly - * by the client, but only by calling this function. - * This function can only be called on a registered smp_hotplug_thread. - */ -void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) -{ - struct cpumask *old = plug_thread->cpumask; - static struct cpumask tmp; - unsigned int cpu; - - lockdep_assert_cpus_held(); - mutex_lock(&smpboot_threads_lock); - - /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(&tmp, old, new); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_park_thread(plug_thread, cpu); - - /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(&tmp, new, old); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_unpark_thread(plug_thread, cpu); - - cpumask_copy(old, new); - - mutex_unlock(&smpboot_threads_lock); -} - static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 900dcfee542c..6f584861d329 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -79,12 +79,16 @@ static void wakeup_softirqd(void) /* * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness. + * right now. Let ksoftirqd handle this at its own rate, to get fairness, + * unless we're doing some of the synchronous softirqs. */ -static bool ksoftirqd_running(void) +#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) +static bool ksoftirqd_running(unsigned long pending) { struct task_struct *tsk = __this_cpu_read(ksoftirqd); + if (pending & SOFTIRQ_NOW_MASK) + return false; return tsk && (tsk->state == TASK_RUNNING); } @@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running()) + if (pending && !ksoftirqd_running(pending)) do_softirq_own_stack(); local_irq_restore(flags); @@ -355,7 +359,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running()) + if (ksoftirqd_running(local_softirq_pending())) return; if (!force_irqthreads) { @@ -386,7 +390,7 @@ static inline void tick_irq_exit(void) /* Make sure that timer wheel updates are propagated */ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_interrupt()) + if (!in_irq()) tick_nohz_irq_exit(); } #endif diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index f89014a2c238..067cb83f37ea 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -81,6 +81,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) unsigned long flags; bool enabled; + preempt_disable(); raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) @@ -90,6 +91,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) raw_spin_unlock_irqrestore(&stopper->lock, flags); wake_up_q(&wakeq); + preempt_enable(); return enabled; } @@ -236,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* + * The waking up of stopper threads has to happen in the same + * scheduling context as the queueing. Otherwise, there is a + * possibility of one of the above stoppers being woken up by another + * CPU, and preempting us. This will cause us to not wake up the other + * stopper forever. + */ + preempt_disable(); raw_spin_lock_irq(&stopper1->lock); raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -253,24 +266,30 @@ retry: * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); + unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } wake_up_q(&wakeq); + preempt_enable(); return err; } diff --git a/kernel/sys.c b/kernel/sys.c index e27b51d3facd..cf5c67533ff1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1237,18 +1237,19 @@ static int override_release(char __user *release, size_t len) SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { - int errno = 0; + struct new_utsname tmp; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!errno && override_release(name->release, sizeof(name->release))) - errno = -EFAULT; - if (!errno && override_architecture(name)) - errno = -EFAULT; - return errno; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME @@ -1257,55 +1258,46 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { - int error = 0; + struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof(*name))) - error = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - if (!error && override_architecture(name)) - error = -EFAULT; - return error; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { - int error; + struct oldold_utsname tmp = {}; if (!name) return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); + memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); + memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); + memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); + memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); + memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_architecture(name)) - error = -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - return error ? -EFAULT : 0; + if (override_architecture(name)) + return -EFAULT; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + return 0; } #endif @@ -1319,17 +1311,18 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } @@ -1337,8 +1330,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { - int i, errno; + int i; struct new_utsname *u; + char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; @@ -1347,11 +1341,11 @@ SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) i = 1 + strlen(u->nodename); if (i > len) i = len; - errno = 0; - if (copy_to_user(name, u->nodename, i)) - errno = -EFAULT; + memcpy(tmp, u->nodename, i); up_read(&uts_sem); - return errno; + if (copy_to_user(name, tmp, i)) + return -EFAULT; + return 0; } #endif @@ -1370,17 +1364,18 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d9837c0aff4..cc02050fd0c4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -145,7 +145,10 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ #ifdef CONFIG_DETECT_HUNG_TASK static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #endif @@ -222,7 +225,7 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ -/* Note: sysrq code uses it's own private copy */ +/* Note: sysrq code uses its own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(struct ctl_table *table, int write, @@ -368,14 +371,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_time_avg_ms", - .data = &sysctl_sched_time_avg, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", @@ -1099,6 +1094,14 @@ static struct ctl_table kern_table[] = { .extra2 = &hung_task_timeout_max, }, { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(int), @@ -1805,6 +1808,24 @@ static struct ctl_table fs_table[] = { .extra2 = &one, }, { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), @@ -1973,13 +1994,13 @@ static void warn_sysctl_write(struct ctl_table *table) } /** - * proc_first_pos_non_zero_ignore - check if firs position is allowed + * proc_first_pos_non_zero_ignore - check if first position is allowed * @ppos: file position * @table: the sysctl table * * Returns true if the first position is non-zero and the sysctl_writes_strict * mode indicates this is not allowed for numeric input types. String proc - * hadlers can ignore the return value. + * handlers can ignore the return value. */ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, struct ctl_table *table) diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index dd53e354f630..7bca480151b0 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -162,90 +162,6 @@ static int test_kprobes(void) } -#if 0 -static u32 jph_val; - -static u32 j_kprobe_target(u32 value) -{ - if (preemptible()) { - handler_errors++; - pr_err("jprobe-handler is preemptible\n"); - } - if (value != rand1) { - handler_errors++; - pr_err("incorrect value in jprobe handler\n"); - } - - jph_val = rand1; - jprobe_return(); - return 0; -} - -static struct jprobe jp = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target" -}; - -static int test_jprobe(void) -{ - int ret; - - ret = register_jprobe(&jp); - if (ret < 0) { - pr_err("register_jprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); - unregister_jprobe(&jp); - if (jph_val == 0) { - pr_err("jprobe handler not called\n"); - handler_errors++; - } - - return 0; -} - -static struct jprobe jp2 = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target2" -}; - -static int test_jprobes(void) -{ - int ret; - struct jprobe *jps[2] = {&jp, &jp2}; - - /* addr and flags should be cleard for reusing kprobe. */ - jp.kp.addr = NULL; - jp.kp.flags = 0; - ret = register_jprobes(jps, 2); - if (ret < 0) { - pr_err("register_jprobes returned %d\n", ret); - return ret; - } - - jph_val = 0; - ret = target(rand1); - if (jph_val == 0) { - pr_err("jprobe handler not called\n"); - handler_errors++; - } - - jph_val = 0; - ret = target2(rand1); - if (jph_val == 0) { - pr_err("jprobe handler2 not called\n"); - handler_errors++; - } - unregister_jprobes(jps, 2); - - return 0; -} -#else -#define test_jprobe() (0) -#define test_jprobes() (0) -#endif #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -383,16 +299,6 @@ int init_test_probes(void) if (ret < 0) errors++; - num_tests++; - ret = test_jprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_jprobes(); - if (ret < 0) - errors++; - #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e1a549c9e399..9cdd74bd2d27 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1660,7 +1660,7 @@ int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: - if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) + if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif @@ -1780,12 +1780,12 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; - if (compat_get_timespec64(&tu, rqtp)) + if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index f26acef5d7b4..9a65713c8309 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -139,9 +139,10 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; - trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); - kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); + trace_itimer_expire(ITIMER_REAL, leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); return HRTIMER_NORESTART; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 294d7b65af33..ce32cf741b25 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -894,7 +894,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, - tsk->signal->leader_pid, cur_time); + task_tgid(tsk), cur_time); __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 2c6847d5d69b..989ccf028bde 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -162,20 +162,20 @@ COMPAT_SYS_NI(setitimer); #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec64(&new_tp, tp)) + if (get_old_timespec32(&new_tp, tp)) return -EFAULT; return do_sys_settimeofday64(&new_tp, NULL); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -184,13 +184,13 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, if (ret) return ret; - if (compat_put_timespec64(&kernel_tp, tp)) + if (put_old_timespec32(&kernel_tp, tp)) return -EFAULT; return 0; } COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -201,7 +201,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec64(&rtn_tp, tp)) + if (put_old_timespec32(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -210,8 +210,8 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; @@ -224,7 +224,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 3ac7295306dc..3e71921668ba 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -86,15 +86,6 @@ static const struct k_clock clock_realtime, clock_monotonic; #endif /* - * parisc wants ENOTSUP instead of EOPNOTSUPP - */ -#ifndef ENOTSUP -# define ENANOSLEEP_NOTSUP EOPNOTSUPP -#else -# define ENANOSLEEP_NOTSUP ENOTSUP -#endif - -/* * The timer ID is turned into a timer address by idr_find(). * Verifying a valid ID consists of: * @@ -342,8 +333,8 @@ void posixtimer_rearm(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { - struct task_struct *task; - int shared, ret = -1; + enum pid_type type; + int ret = -1; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). @@ -357,13 +348,8 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - rcu_read_lock(); - task = pid_task(timr->it_pid, PIDTYPE_PID); - if (task) { - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, task, shared); - } - rcu_read_unlock(); + type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; + ret = send_sigqueue(timr->sigq, timr->it_pid, type); /* If we failed to send the signal the timer stops. */ return ret > 0; } @@ -442,11 +428,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { - struct task_struct *rtn = current->group_leader; + struct pid *pid = task_tgid(current); + struct task_struct *rtn; switch (event->sigev_notify) { case SIGEV_SIGNAL | SIGEV_THREAD_ID: - rtn = find_task_by_vpid(event->sigev_notify_thread_id); + pid = find_vpid(event->sigev_notify_thread_id); + rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; /* FALLTHRU */ @@ -456,7 +444,7 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; /* FALLTHRU */ case SIGEV_NONE: - return task_pid(rtn); + return pid; default: return NULL; } @@ -767,13 +755,13 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - if (put_compat_itimerspec64(&cur_setting, setting)) + if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -940,8 +928,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -949,12 +937,12 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!new) return -EINVAL; - if (get_compat_itimerspec64(&new_spec, new)) + if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - if (put_compat_itimerspec64(&old_spec, old)) + if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; @@ -1127,7 +1115,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1135,14 +1123,14 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec64(&ts, tp)) + if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1153,7 +1141,7 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, err = kc->clock_get(which_clock, &ts); - if (!err && compat_put_timespec64(&ts, tp)) + if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; @@ -1192,7 +1180,7 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1202,7 +1190,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, return -EINVAL; err = kc->clock_getres(which_clock, &ts); - if (!err && tp && compat_put_timespec64(&ts, tp)) + if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; @@ -1231,7 +1219,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!kc) return -EINVAL; if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; + return -EOPNOTSUPP; if (get_timespec64(&t, rqtp)) return -EFAULT; @@ -1249,8 +1237,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; @@ -1258,9 +1246,9 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, if (!kc) return -EINVAL; if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; + return -EOPNOTSUPP; - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 2d8f05aad442..cbc72c2c1fca 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -237,7 +237,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -void __init sched_clock_postinit(void) +void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b7005dd21ec1..14de3727b18e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, */ return !curdev || newdev->rating > curdev->rating || - (!cpumask_equal(curdev->cpumask, newdev->cpumask) && - !tick_check_percpu(curdev, newdev, smp_processor_id())); + !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da9455a6b42b..5b33e2f5c0ed 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -642,7 +642,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & TIMER_SOFTIRQ; + return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) diff --git a/kernel/time/time.c b/kernel/time/time.c index ccdb351277ee..e3a7f7fd3abc 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -104,12 +104,12 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_TIME -/* compat_time_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +/* old_time32_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) { - compat_time_t i; + old_time32_t i; - i = (compat_time_t)ktime_get_real_seconds(); + i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -119,7 +119,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -144,9 +144,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { @@ -223,14 +225,15 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { - struct timeval ktv; + struct timespec64 ts; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { @@ -241,7 +244,7 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, return 0; } -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; @@ -342,30 +345,6 @@ unsigned int jiffies_to_usecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_usecs); -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. Always rounds down. gran must - * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* Avoid division in the common cases 1 ns and 1 s. */ - if (gran == 1) { - /* nothing */ - } else if (gran == NSEC_PER_SEC) { - t.tv_nsec = 0; - } else if (gran > 1 && gran < NSEC_PER_SEC) { - t.tv_nsec -= t.tv_nsec % gran; - } else { - WARN(1, "illegal file time granularity: %u", gran); - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - /* * mktime64 - Converts date to seconds. * Converts Gregorian date to seconds since 1970-01-01 00:00:00. @@ -884,10 +863,10 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); -int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) +static int __get_old_timespec32(struct timespec64 *ts64, + const struct old_timespec32 __user *cts) { - struct compat_timespec ts; + struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); @@ -900,33 +879,33 @@ int __compat_get_timespec64(struct timespec64 *ts64, return 0; } -int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) +static int __put_old_timespec32(const struct timespec64 *ts64, + struct old_timespec32 __user *cts) { - struct compat_timespec ts = { + struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_get_timespec64(ts, uts); + return __get_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_get_timespec64); +EXPORT_SYMBOL_GPL(get_old_timespec32); -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_put_timespec64(ts, uts); + return __put_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_put_timespec64); +EXPORT_SYMBOL_GPL(put_old_timespec32); int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -958,23 +937,23 @@ int put_itimerspec64(const struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(put_itimerspec64); -int get_compat_itimerspec64(struct itimerspec64 *its, - const struct compat_itimerspec __user *uits) +int get_old_itimerspec32(struct itimerspec64 *its, + const struct old_itimerspec32 __user *uits) { - if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || - __compat_get_timespec64(&its->it_value, &uits->it_value)) + if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || + __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(get_compat_itimerspec64); +EXPORT_SYMBOL_GPL(get_old_itimerspec32); -int put_compat_itimerspec64(const struct itimerspec64 *its, - struct compat_itimerspec __user *uits) +int put_old_itimerspec32(const struct itimerspec64 *its, + struct old_itimerspec32 __user *uits) { - if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || - __compat_put_timespec64(&its->it_value, &uits->it_value)) + if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || + __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(put_compat_itimerspec64); +EXPORT_SYMBOL_GPL(put_old_itimerspec32); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d9e659a12c76..2d110c948805 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,6 +17,7 @@ #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> +#include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> @@ -1211,22 +1212,6 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using getnstimeofday() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec64 now; - - getnstimeofday64(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} -EXPORT_SYMBOL(do_gettimeofday); - -/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -1505,18 +1490,23 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock64 - Return time of the system start. + * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset + * from the boot. * * Weak dummy function for arches that do not yet support it. - * Function to read the exact time the system has been started. - * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. + * wall_time - current time as returned by persistent clock + * boot_offset - offset that is defined as wall_time - boot_time + * The default function calculates offset based on the current value of + * local_clock(). This way architectures that support sched_clock() but don't + * support dedicated boot time clock will provide the best estimate of the + * boot time. */ -void __weak read_boot_clock64(struct timespec64 *ts) +void __weak __init +read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) { - ts->tv_sec = 0; - ts->tv_nsec = 0; + read_persistent_clock64(wall_time); + *boot_offset = ns_to_timespec64(local_clock()); } /* @@ -1542,28 +1532,29 @@ static bool persistent_clock_exists; */ void __init timekeeping_init(void) { + struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec64 now, boot, tmp; - - read_persistent_clock64(&now); - if (!timespec64_valid_strict(&now)) { - pr_warn("WARNING: Persistent clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - now.tv_sec = 0; - now.tv_nsec = 0; - } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exists = true; - read_boot_clock64(&boot); - if (!timespec64_valid_strict(&boot)) { - pr_warn("WARNING: Boot clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - boot.tv_sec = 0; - boot.tv_nsec = 0; + read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); + if (timespec64_valid_strict(&wall_time) && + timespec64_to_ns(&wall_time) > 0) { + persistent_clock_exists = true; + } else if (timespec64_to_ns(&wall_time) != 0) { + pr_warn("Persistent clock returned invalid value"); + wall_time = (struct timespec64){0}; } + if (timespec64_compare(&wall_time, &boot_offset) < 0) + boot_offset = (struct timespec64){0}; + + /* + * We want set wall_to_mono, so the following is true: + * wall time + wall_to_mono = boot time + */ + wall_to_mono = timespec64_sub(boot_offset, wall_time); + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); ntp_init(); @@ -1573,13 +1564,10 @@ void __init timekeeping_init(void) clock->enable(clock); tk_setup_internals(tk, clock); - tk_set_xtime(tk, &now); + tk_set_xtime(tk, &wall_time); tk->raw_sec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(tk); - set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); - tk_set_wall_to_mono(tk, tmp); + tk_set_wall_to_mono(tk, wall_to_mono); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -2170,14 +2158,6 @@ void getboottime64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(getboottime64); -unsigned long get_seconds(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return tk->xtime_sec; -} -EXPORT_SYMBOL(get_seconds); - void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; diff --git a/kernel/torture.c b/kernel/torture.c index 3de1efbecd6a..1ac24a826589 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -20,6 +20,9 @@ * Author: Paul E. McKenney <paulmck@us.ibm.com> * Based on kernel/rcu/torture.c. */ + +#define pr_fmt(fmt) fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> @@ -53,7 +56,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); static char *torture_type; -static bool verbose; +static int verbose; /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ @@ -98,7 +101,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlining %d\n", torture_type, cpu); @@ -111,7 +114,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, "torture_onoff task: offline %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); @@ -147,7 +150,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlining %d\n", torture_type, cpu); @@ -160,7 +163,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, "torture_onoff task: online %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlined %d\n", torture_type, cpu); @@ -647,7 +650,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v) +bool torture_init_begin(char *ttype, int v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index dcc0166d1997..5e3de28c7677 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -47,6 +47,11 @@ config HAVE_FENTRY help Arch supports the gcc options -pg with -mfentry +config HAVE_NOP_MCOUNT + bool + help + Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount + config HAVE_C_RECORDMCOUNT bool help @@ -82,6 +87,15 @@ config RING_BUFFER_ALLOW_SWAP Allow the use of ring_buffer_swap_cpu. Adds a very slight overhead to tracing when enabled. +config PREEMPTIRQ_TRACEPOINTS + bool + depends on TRACE_PREEMPT_TOGGLE || TRACE_IRQFLAGS + select TRACING + default y + help + Create preempt/irq toggle tracepoints if needed, so that other parts + of the kernel can use them to generate or add hooks to them. + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the @@ -155,18 +169,20 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config TRACE_PREEMPT_TOGGLE + bool + help + Enables hooks which will be called when preemption is first disabled, + and last enabled. config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS - depends on DEBUG_PREEMPT || !PROVE_LOCKING - depends on TRACING + select TRACE_PREEMPT_TOGGLE if PREEMPT + select GENERIC_TRACER default n help Enable tracing of disable and enable events for preemption and irqs. - For tracing preempt disable/enable events, DEBUG_PREEMPT must be - enabled. For tracing irq disable/enable events, PROVE_LOCKING must - be disabled. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" @@ -203,6 +219,7 @@ config PREEMPT_TRACER select RING_BUFFER_ALLOW_SWAP select TRACER_SNAPSHOT select TRACER_SNAPSHOT_PER_CPU_SWAP + select TRACE_PREEMPT_TOGGLE help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. @@ -456,6 +473,26 @@ config KPROBE_EVENTS This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. +config KPROBE_EVENTS_ON_NOTRACE + bool "Do NOT protect notrace function from kprobe events" + depends on KPROBE_EVENTS + depends on KPROBES_ON_FTRACE + default n + help + This is only for the developers who want to debug ftrace itself + using kprobe events. + + If kprobes can use ftrace instead of breakpoint, ftrace related + functions are protected from kprobe-events to prevent an infinit + recursion or any unexpected execution path which leads to a kernel + crash. + + This option disables such protection and allows you to put kprobe + events on ftrace functions for debugging ftrace by itself. + Note that this might let you shoot yourself in the foot. + + If unsure, say N. + config UPROBE_EVENTS bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES @@ -521,7 +558,7 @@ config FUNCTION_PROFILER in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A "functions" file is created in - the trace_stats directory; this file shows the list of functions that + the trace_stat directory; this file shows the list of functions that have been hit and their counters. If in doubt, say N. @@ -605,7 +642,7 @@ config HIST_TRIGGERS Inter-event tracing of quantities such as latencies is also supported using hist triggers under this option. - See Documentation/trace/histogram.txt. + See Documentation/trace/histogram.rst. If in doubt, say N. config MMIOTRACE_TEST @@ -687,6 +724,21 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config PREEMPTIRQ_DELAY_TEST + tristate "Preempt / IRQ disable delay thread to test latency tracers" + depends on m + help + Select this option to build a test module that can help test latency + tracers by executing a preempt or irq disable section with a user + configurable delay. The module busy waits for the duration of the + critical section. + + For example, the following invocation forces a one-time irq-disabled + critical section for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500000 + + If unsure, say N + config TRACE_EVAL_MAP_FILE bool "Show eval mappings for trace events" depends on TRACING @@ -722,6 +774,18 @@ config TRACING_EVENTS_GPIO help Enable tracing events for gpio subsystem +config GCOV_PROFILE_FTRACE + bool "Enable GCOV profiling on ftrace subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on ftrace subsystem for checking + which functions/lines are tested. + + If unsure, say N. + + Note that on a kernel compiled with this config, ftrace will + run significantly slower. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e2538c7638d4..f81dadbc7c4a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -13,11 +13,21 @@ obj-y += trace_selftest_dynamic.o endif endif +ifdef CONFIG_FTRACE_STARTUP_TEST +CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE) +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o +endif + # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# for GCOV coverage profiling +ifdef CONFIG_GCOV_PROFILE_FTRACE +GCOV_PROFILE := y +endif + CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) @@ -33,9 +43,10 @@ obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o +obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o -obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o +obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 987d9a9ae283..2868d85f1fb1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include <linux/kernel.h> #include <linux/blkdev.h> @@ -494,6 +482,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; + if (!blk_debugfs_root) + return -ENOENT; + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -518,9 +509,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - if (!blk_debugfs_root) - goto err; - dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); @@ -1841,6 +1829,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { + if (!!value == !!q->blk_trace) { + ret = 0; + goto out_unlock_bdev; + } if (value) ret = blk_trace_setup_queue(q, bdev); else diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ae6829804bc..08fcfe440c63 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <linux/kernel.h> #include <linux/types.h> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index efed9c1cfb7e..f536f601bd46 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Infrastructure for profiling code inserted by 'gcc -pg'. * @@ -157,30 +158,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } -/** - * ftrace_nr_registered_ops - return number of ops registered - * - * Returns the number of ftrace_ops registered and tracing functions - */ -int ftrace_nr_registered_ops(void) -{ - struct ftrace_ops *ops; - int cnt = 0; - - mutex_lock(&ftrace_lock); - - for (ops = rcu_dereference_protected(ftrace_ops_list, - lockdep_is_held(&ftrace_lock)); - ops != &ftrace_list_end; - ops = rcu_dereference_protected(ops->next, - lockdep_is_held(&ftrace_lock))) - cnt++; - - mutex_unlock(&ftrace_lock); - - return cnt; -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -192,17 +169,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -/** - * clear_ftrace_function - reset the ftrace function - * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag - */ -void clear_ftrace_function(void) -{ - ftrace_trace_function = ftrace_stub; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -324,11 +290,6 @@ static void update_ftrace_function(void) ftrace_trace_function = func; } -int using_ftrace_ops_list_func(void) -{ - return ftrace_trace_function == ftrace_ops_list_func; -} - static void add_ftrace_ops(struct ftrace_ops __rcu **list, struct ftrace_ops *ops) { @@ -1060,8 +1021,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; #else @@ -2938,22 +2897,22 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) { /* If ops isn't enabled, ignore it */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return 0; + return false; /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 1; + return true; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) - return 0; + return false; /* If in notrace hash, we ignore it too */ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) - return 0; + return false; - return 1; + return true; } static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) @@ -2992,12 +2951,14 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) p = &pg->records[i]; p->flags = rec_flags; +#ifndef CC_USING_NOP_MCOUNT /* * Do the initial record conversion from mcount jump * to the NOP instructions. */ if (!ftrace_code_disable(mod, p)) break; +#endif update_cnt++; } @@ -6689,7 +6650,7 @@ void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; - clear_ftrace_function(); + ftrace_trace_function = ftrace_stub; } /** diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c new file mode 100644 index 000000000000..f704390db9fc --- /dev/null +++ b/kernel/trace/preemptirq_delay_test.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Preempt / IRQ disable delay thread to test latency tracers + * + * Copyright (C) 2018 Joel Fernandes (Google) <joel@joelfernandes.org> + */ + +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/string.h> + +static ulong delay = 100; +static char test_mode[10] = "irq"; + +module_param_named(delay, delay, ulong, S_IRUGO); +module_param_string(test_mode, test_mode, 10, S_IRUGO); +MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); + +static void busy_wait(ulong time) +{ + ktime_t start, end; + start = ktime_get(); + do { + end = ktime_get(); + if (kthread_should_stop()) + break; + } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000)); +} + +static int preemptirq_delay_run(void *data) +{ + unsigned long flags; + + if (!strcmp(test_mode, "irq")) { + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); + } else if (!strcmp(test_mode, "preempt")) { + preempt_disable(); + busy_wait(delay); + preempt_enable(); + } + + return 0; +} + +static int __init preemptirq_delay_init(void) +{ + char task_name[50]; + struct task_struct *test_task; + + snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + + test_task = kthread_run(preemptirq_delay_run, NULL, task_name); + return PTR_ERR_OR_ZERO(test_task); +} + +static void __exit preemptirq_delay_exit(void) +{ + return; +} + +module_init(preemptirq_delay_init) +module_exit(preemptirq_delay_exit) +MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6a46af21765c..1d92d4a982fd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic ring buffer * @@ -3221,12 +3222,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_on); * * Returns true if the ring buffer is in a state that it accepts writes. */ -int ring_buffer_record_is_on(struct ring_buffer *buffer) +bool ring_buffer_record_is_on(struct ring_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } /** + * ring_buffer_record_is_set_on - return true if the ring buffer is set writable + * @buffer: The ring buffer to see if write is set enabled + * + * Returns true if the ring buffer is set writable by ring_buffer_record_on(). + * Note that this does NOT mean it is in a writable state. + * + * It may return true when the ring buffer has been disabled by + * ring_buffer_record_disable(), as that is a temporary disabling of + * the ring buffer. + */ +bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) +{ + return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 68ee79afe31c..ffba6789c0e2 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer tester and benchmark * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0079b4c7a49..bf6f1d70484d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer based function tracer * @@ -1087,7 +1088,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. */ -int tracer_tracing_is_on(struct trace_array *tr) +bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); @@ -1373,6 +1374,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); + /* Inherit the recordable setting from trace_buffer */ + if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); @@ -2953,6 +2960,7 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); +__printf(3, 0) static int __trace_array_vprintk(struct ring_buffer *buffer, unsigned long ip, const char *fmt, va_list args) @@ -3007,12 +3015,14 @@ out_nobuffer: return len; } +__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); } +__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3028,6 +3038,7 @@ int trace_array_printk(struct trace_array *tr, return ret; } +__printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3043,6 +3054,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, return ret; } +__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -3360,8 +3372,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, @@ -3381,9 +3393,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file tgid ? tgid_space : space); seq_printf(m, "# %s||| / delay\n", tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : space); - seq_printf(m, "# | | | %s|||| | |\n", + seq_printf(m, "# | | %s | |||| | |\n", tgid ? " | " : space); } @@ -7617,7 +7629,9 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (buffer) { mutex_lock(&trace_types_lock); - if (val) { + if (!!val == tracer_tracing_is_on(tr)) { + val = 0; /* do nothing */ + } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); @@ -8277,6 +8291,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) tracing_off(); local_irq_save(flags); + printk_nmi_direct_enter(); /* Simulate the iterator */ trace_init_global_iter(&iter); @@ -8356,7 +8371,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - atomic_dec(&dump_running); + atomic_dec(&dump_running); + printk_nmi_direct_exit(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ftrace_dump); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 630c5a24b2b2..3b8c0e24ab30 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H @@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit) static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { - if (iter->buffer_iter && iter->buffer_iter[cpu]) - return iter->buffer_iter[cpu]; - return NULL; + return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; } int tracer_init(struct tracer *t, struct trace_array *tr); @@ -596,7 +594,7 @@ void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); -int tracer_tracing_is_on(struct trace_array *tr); +bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, @@ -939,7 +937,6 @@ void ftrace_destroy_function_files(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); -int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); @@ -1535,9 +1532,6 @@ extern int event_trigger_init(struct event_trigger_ops *ops, extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); -extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file); extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); @@ -1833,6 +1827,21 @@ static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) } #endif +#ifdef CONFIG_PREEMPT_TRACER +void tracer_preempt_on(unsigned long a0, unsigned long a1); +void tracer_preempt_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif +#ifdef CONFIG_IRQSOFF_TRACER +void tracer_hardirqs_on(unsigned long a0, unsigned long a1); +void tracer_hardirqs_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } +#endif + extern struct trace_iterator *tracepoint_print_iter; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index be1d86ff753d..79e6fbe5b365 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #undef TRACE_SYSTEM #define TRACE_SYSTEM benchmark diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index d8a188e0418a..aaf6793ededa 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1d67464ed95e..06bb2fd9a56c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * This file defines the trace event structures that go into the ring * buffer directly. They are created via macros so that changes for them diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index c79193e598f5..69a3fe926e8c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace event based perf event profiling/tracing * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 14ff4ff3caab..f94be0c2827b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * event tracer * @@ -239,7 +240,7 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) struct trace_array_cpu *data; struct trace_pid_list *pid_list; - pid_list = rcu_dereference_sched(tr->filtered_pids); + pid_list = rcu_dereference_raw(tr->filtered_pids); if (!pid_list) return false; @@ -512,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; - pid_list = rcu_dereference_sched(tr->filtered_pids); + pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); } @@ -636,7 +637,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) rcu_assign_pointer(tr->filtered_pids, NULL); /* Wait till all users are no longer using pid filtering */ - synchronize_sched(); + tracepoint_synchronize_unregister(); trace_free_pid_list(pid_list); } @@ -1622,7 +1623,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, } if (filtered_pids) { - synchronize_sched(); + tracepoint_synchronize_unregister(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* @@ -3036,8 +3037,8 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); - /* Access to events are within rcu_read_lock_sched() */ - synchronize_sched(); + /* Make sure no more events are being executed */ + tracepoint_synchronize_unregister(); down_write(&trace_event_sem); __trace_remove_event_dirs(tr); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0dceb77d1d42..84a65173b1e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_filter - generic event filtering * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ @@ -899,7 +886,8 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!filter) return 1; - prog = rcu_dereference_sched(filter->prog); + /* Protected by either SRCU(tracepoint_srcu) or preempt_disable */ + prog = rcu_dereference_raw(filter->prog); if (!prog) return 1; @@ -1626,10 +1614,10 @@ static int process_system_preds(struct trace_subsystem_dir *dir, /* * The calls can still be using the old filters. - * Do a synchronize_sched() to ensure all calls are + * Do a synchronize_sched() and to ensure all calls are * done with them before we free them. */ - synchronize_sched(); + tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); @@ -1648,7 +1636,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, kfree(filter); /* If any call succeeded, we still need to sync */ if (!fail) - synchronize_sched(); + tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); @@ -1701,6 +1689,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) + * Must be a pointer that references a NULL pointer. * * Creates a filter for @call with @filter_str. If @set_str is %true, * @filter_str is copied and recorded in the new filter. @@ -1718,6 +1707,10 @@ static int create_filter(struct trace_event_call *call, struct filter_parse_error *pe = NULL; int err; + /* filterp must point to NULL */ + if (WARN_ON(*filterp)) + *filterp = NULL; + err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; @@ -1785,7 +1778,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); /* Make sure the filter is not being used */ - synchronize_sched(); + tracepoint_synchronize_unregister(); __free_filter(filter); return 0; @@ -1812,7 +1805,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) if (tmp) { /* Make sure the call is done with the filter */ - synchronize_sched(); + tracepoint_synchronize_unregister(); __free_filter(tmp); } } @@ -1842,7 +1835,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - synchronize_sched(); + tracepoint_synchronize_unregister(); filter_free_subsystem_filters(dir, tr); __free_filter(filter); goto out_unlock; diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h index 39d7ef4f57cb..e651dfbd345e 100644 --- a/kernel/trace/trace_events_filter_test.h +++ b/kernel/trace/trace_events_filter_test.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #undef TRACE_SYSTEM #define TRACE_SYSTEM test diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 046c716a6536..85f6b01431c7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_hist - trace event hist triggers * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> */ @@ -393,7 +384,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var) else if (system) snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); else - strncpy(err, var, MAX_FILTER_STR_VAL); + strscpy(err, var, MAX_FILTER_STR_VAL); hist_err(str, err); } @@ -5141,7 +5132,7 @@ static void hist_clear(struct event_trigger_data *data) if (data->name) pause_named_trigger(data); - synchronize_sched(); + tracepoint_synchronize_unregister(); tracing_map_clear(hist_data->map); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d18249683682..2152d1e530cb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_trigger - trace event triggers * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> */ @@ -34,7 +21,9 @@ void trigger_data_free(struct event_trigger_data *data) if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); - synchronize_sched(); /* make sure current triggers exit before free */ + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + kfree(data); } @@ -579,9 +568,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -679,6 +668,8 @@ event_trigger_callback(struct event_command *cmd_ops, goto out_free; out_reg: + /* Up the trigger_data count to make sure reg doesn't free it on failure */ + event_trigger_init(trigger_ops, trigger_data); ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); /* * The above returns on success the # of functions enabled, @@ -686,11 +677,13 @@ event_trigger_callback(struct event_command *cmd_ops, * Consider no functions a failure too. */ if (!ret) { + cmd_ops->unreg(glob, trigger_ops, trigger_data, file); ret = -ENOENT; - goto out_free; - } else if (ret < 0) - goto out_free; - ret = 0; + } else if (ret > 0) + ret = 0; + + /* Down the counter of trigger_data or free it if not used anymore */ + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -748,7 +741,7 @@ int set_trigger_filter(char *filter_str, if (tmp) { /* Make sure the call is done with the filter */ - synchronize_sched(); + tracepoint_synchronize_unregister(); free_event_filter(tmp); } @@ -1416,6 +1409,9 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out; } + /* Up the trigger_data count to make sure nothing frees it on failure */ + event_trigger_init(trigger_ops, trigger_data); + if (trigger) { number = strsep(&trigger, ":"); @@ -1466,6 +1462,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out_disable; /* Just return zero, not the number of enabled functions */ ret = 0; + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -1476,7 +1473,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); - kfree(trigger_data); + event_trigger_free(trigger_ops, trigger_data); kfree(enable_data); goto out; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23c0b0cb5fb9..169b3c44ee97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; + int cpu = iter->cpu; int i; graph_ret = &ret_entry->ret; @@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { struct fgraph_cpu_data *cpu_data; - int cpu = iter->cpu; cpu_data = per_cpu_ptr(data->cpu_data, cpu); @@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_printf(s, "%ps();\n", (void *)call->func); + print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, + cpu, iter->ent->pid, flags); + return trace_handle_return(s); } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d7c8e4ec3d9d..1e6db9cbe4dc 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_hwlatdetect.c - A simple Hardware Latency detector. * @@ -35,9 +36,6 @@ * * Includes useful feedback from Clark Williams <clark@redhat.com> * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. */ #include <linux/kthread.h> #include <linux/tracefs.h> @@ -354,6 +352,9 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; + if (WARN_ON(hwlat_kthread)) + return 0; + /* Just pick the first CPU on first iteration */ current_mask = &save_cpumask; get_online_cpus(); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03ecb4465ee4..b7357f9f82a3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace irqs off critical timings * @@ -16,7 +17,6 @@ #include "trace.h" -#define CREATE_TRACE_POINTS #include <trace/events/preemptirq.h> #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) @@ -41,12 +41,12 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph); #ifdef CONFIG_PREEMPT_TRACER static inline int -preempt_trace(void) +preempt_trace(int pc) { - return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); + return ((trace_type & TRACER_PREEMPT_OFF) && pc); } #else -# define preempt_trace() (0) +# define preempt_trace(pc) (0) #endif #ifdef CONFIG_IRQSOFF_TRACER @@ -367,7 +367,7 @@ out: } static inline void -start_critical_timing(unsigned long ip, unsigned long parent_ip) +start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; struct trace_array *tr = irqsoff_trace; @@ -395,7 +395,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, pc); per_cpu(tracing_cpu, cpu) = 1; @@ -403,7 +403,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) } static inline void -stop_critical_timing(unsigned long ip, unsigned long parent_ip) +stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; struct trace_array *tr = irqsoff_trace; @@ -429,7 +429,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, pc); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -438,77 +438,21 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) /* start and stop critical timings used to for stoppage (in idle) */ void start_critical_timings(void) { - if (preempt_trace() || irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + int pc = preempt_count(); + + if (preempt_trace(pc) || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { - if (preempt_trace() || irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL_GPL(stop_critical_timings); - -#ifdef CONFIG_IRQSOFF_TRACER -#ifdef CONFIG_PROVE_LOCKING -void time_hardirqs_on(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(a0, a1); -} - -void time_hardirqs_off(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(a0, a1); -} - -#else /* !CONFIG_PROVE_LOCKING */ - -/* - * We are only interested in hardirq on/off events: - */ -static inline void tracer_hardirqs_on(void) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} + int pc = preempt_count(); -static inline void tracer_hardirqs_off(void) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} - -static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, caller_addr); -} - -static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, caller_addr); -} - -#endif /* CONFIG_PROVE_LOCKING */ -#endif /* CONFIG_IRQSOFF_TRACER */ - -#ifdef CONFIG_PREEMPT_TRACER -static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - stop_critical_timing(a0, a1); -} - -static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - start_critical_timing(a0, a1); + if (preempt_trace(pc) || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } -#endif /* CONFIG_PREEMPT_TRACER */ +EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_FUNCTION_TRACER static bool function_enabled; @@ -634,7 +578,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) return 0; } -static void irqsoff_tracer_reset(struct trace_array *tr) +static void __irqsoff_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; @@ -659,12 +603,37 @@ static void irqsoff_tracer_stop(struct trace_array *tr) } #ifdef CONFIG_IRQSOFF_TRACER +/* + * We are only interested in hardirq on/off events: + */ +void tracer_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned int pc = preempt_count(); + + if (!preempt_trace(pc) && irq_trace()) + stop_critical_timing(a0, a1, pc); +} + +void tracer_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned int pc = preempt_count(); + + if (!preempt_trace(pc) && irq_trace()) + start_critical_timing(a0, a1, pc); +} + static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; return __irqsoff_tracer_init(tr); } + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -684,12 +653,25 @@ static struct tracer irqsoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; -# define register_irqsoff(trace) register_tracer(&trace) -#else -# define register_irqsoff(trace) do { } while (0) -#endif +#endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER +void tracer_preempt_on(unsigned long a0, unsigned long a1) +{ + int pc = preempt_count(); + + if (preempt_trace(pc) && !irq_trace()) + stop_critical_timing(a0, a1, pc); +} + +void tracer_preempt_off(unsigned long a0, unsigned long a1) +{ + int pc = preempt_count(); + + if (preempt_trace(pc) && !irq_trace()) + start_critical_timing(a0, a1, pc); +} + static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; @@ -697,11 +679,16 @@ static int preemptoff_tracer_init(struct trace_array *tr) return __irqsoff_tracer_init(tr); } +static void preemptoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer preemptoff_tracer __read_mostly = { .name = "preemptoff", .init = preemptoff_tracer_init, - .reset = irqsoff_tracer_reset, + .reset = preemptoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = true, @@ -716,13 +703,9 @@ static struct tracer preemptoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; -# define register_preemptoff(trace) register_tracer(&trace) -#else -# define register_preemptoff(trace) do { } while (0) -#endif +#endif /* CONFIG_PREEMPT_TRACER */ -#if defined(CONFIG_IRQSOFF_TRACER) && \ - defined(CONFIG_PREEMPT_TRACER) +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) static int preemptirqsoff_tracer_init(struct trace_array *tr) { @@ -731,11 +714,16 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) return __irqsoff_tracer_init(tr); } +static void preemptirqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer preemptirqsoff_tracer __read_mostly = { .name = "preemptirqsoff", .init = preemptirqsoff_tracer_init, - .reset = irqsoff_tracer_reset, + .reset = preemptirqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = true, @@ -750,115 +738,21 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; - -# define register_preemptirqsoff(trace) register_tracer(&trace) -#else -# define register_preemptirqsoff(trace) do { } while (0) #endif __init static int init_irqsoff_tracer(void) { - register_irqsoff(irqsoff_tracer); - register_preemptoff(preemptoff_tracer); - register_preemptirqsoff(preemptirqsoff_tracer); - - return 0; -} -core_initcall(init_irqsoff_tracer); -#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ - -#ifndef CONFIG_IRQSOFF_TRACER -static inline void tracer_hardirqs_on(void) { } -static inline void tracer_hardirqs_off(void) { } -static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } -static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#ifdef CONFIG_IRQSOFF_TRACER + register_tracer(&irqsoff_tracer); #endif - -#ifndef CONFIG_PREEMPT_TRACER -static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } -static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#ifdef CONFIG_PREEMPT_TRACER + register_tracer(&preemptoff_tracer); #endif - -#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) -/* Per-cpu variable to prevent redundant calls when IRQs already off */ -static DEFINE_PER_CPU(int, tracing_irq_cpu); - -void trace_hardirqs_on(void) -{ - if (!this_cpu_read(tracing_irq_cpu)) - return; - - trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); - tracer_hardirqs_on(); - - this_cpu_write(tracing_irq_cpu, 0); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -void trace_hardirqs_off(void) -{ - if (this_cpu_read(tracing_irq_cpu)) - return; - - this_cpu_write(tracing_irq_cpu, 1); - - trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); - tracer_hardirqs_off(); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!this_cpu_read(tracing_irq_cpu)) - return; - - trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); - tracer_hardirqs_on_caller(caller_addr); - - this_cpu_write(tracing_irq_cpu, 0); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) -{ - if (this_cpu_read(tracing_irq_cpu)) - return; - - this_cpu_write(tracing_irq_cpu, 1); - - trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); - tracer_hardirqs_off_caller(caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) + register_tracer(&preemptirqsoff_tracer); #endif -#if defined(CONFIG_PREEMPT_TRACER) || \ - (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) -void trace_preempt_on(unsigned long a0, unsigned long a1) -{ - trace_preempt_enable_rcuidle(a0, a1); - tracer_preempt_on(a0, a1); -} - -void trace_preempt_off(unsigned long a0, unsigned long a1) -{ - trace_preempt_disable_rcuidle(a0, a1); - tracer_preempt_off(a0, a1); + return 0; } -#endif +core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index daa81571b22a..c30032367aab 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Kprobes-based tracing events * * Created by Masami Hiramatsu <mhiramat@redhat.com> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "trace_kprobe: " fmt @@ -23,6 +12,7 @@ #include <linux/rculist.h> #include <linux/error-injection.h> +#include "trace_kprobe_selftest.h" #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" @@ -87,6 +77,23 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +/* Return 0 if it fails to find the symbol address */ +static nokprobe_inline +unsigned long trace_kprobe_address(struct trace_kprobe *tk) +{ + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + if (addr) + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return addr; +} + bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; @@ -99,16 +106,8 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call) bool trace_kprobe_error_injectable(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - unsigned long addr; - if (tk->symbol) { - addr = (unsigned long) - kallsyms_lookup_name(trace_kprobe_symbol(tk)); - addr += tk->rp.kp.offset; - } else { - addr = (unsigned long)tk->rp.kp.addr; - } - return within_error_injection_list(addr); + return within_error_injection_list(trace_kprobe_address(tk)); } static int register_kprobe_event(struct trace_kprobe *tk); @@ -393,6 +392,20 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, return NULL; } +static inline int __enable_trace_kprobe(struct trace_kprobe *tk) +{ + int ret = 0; + + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); + else + ret = enable_kprobe(&tk->rp.kp); + } + + return ret; +} + /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. @@ -400,11 +413,10 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { + struct event_file_link *link; int ret = 0; if (file) { - struct event_file_link *link; - link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { ret = -ENOMEM; @@ -415,14 +427,18 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) list_add_tail_rcu(&link->list, &tk->tp.files); tk->tp.flags |= TP_FLAG_TRACE; - } else - tk->tp.flags |= TP_FLAG_PROFILE; + ret = __enable_trace_kprobe(tk); + if (ret) { + list_del_rcu(&link->list); + kfree(link); + tk->tp.flags &= ~TP_FLAG_TRACE; + } - if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { - if (trace_kprobe_is_return(tk)) - ret = enable_kretprobe(&tk->rp); - else - ret = enable_kprobe(&tk->rp.kp); + } else { + tk->tp.flags |= TP_FLAG_PROFILE; + ret = __enable_trace_kprobe(tk); + if (ret) + tk->tp.flags &= ~TP_FLAG_PROFILE; } out: return ret; @@ -487,6 +503,29 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) return ret; } +#if defined(CONFIG_KPROBES_ON_FTRACE) && \ + !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long offset, size, addr; + + addr = trace_kprobe_address(tk); + if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) + return false; + + /* Get the entry address of the target function */ + addr -= offset; + + /* + * Since ftrace_location_range() does inclusive range check, we need + * to subtract 1 byte from the end address. + */ + return !ftrace_location_range(addr, addr + size - 1); +} +#else +#define within_notrace_func(tk) (false) +#endif + /* Internal register function - just handle k*probes and flags */ static int __register_trace_kprobe(struct trace_kprobe *tk) { @@ -495,6 +534,12 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (trace_probe_is_registered(&tk->tp)) return -EINVAL; + if (within_notrace_func(tk)) { + pr_warn("Could not probe notrace function %s\n", + trace_kprobe_symbol(tk)); + return -EINVAL; + } + for (i = 0; i < tk->tp.nr_args; i++) traceprobe_update_arg(&tk->tp.args[i]); @@ -1217,16 +1262,11 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) /* * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the single stepping. - * The ftrace kprobe handler leaves it up to us to re-enable - * preemption here before returning if we've modified the ip. + * pt_regs, and if so return 1 so that we don't do the + * single stepping. */ - if (orig_ip != instruction_pointer(regs)) { - reset_current_kprobe(); - preempt_enable_no_resched(); + if (orig_ip != instruction_pointer(regs)) return 1; - } if (!ret) return 0; } @@ -1480,8 +1520,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, } ret = __register_trace_kprobe(tk); - if (ret < 0) + if (ret < 0) { + kfree(tk->tp.call.print_fmt); goto error; + } return &tk->tp.call; error: @@ -1501,6 +1543,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) } __unregister_trace_kprobe(tk); + + kfree(tk->tp.call.print_fmt); free_trace_kprobe(tk); } #endif /* CONFIG_PERF_EVENTS */ @@ -1537,17 +1581,6 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST -/* - * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. 'noinline' makes sure that there - * isn't an inlined version used by the test method below - */ -static __used __init noinline int -kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) -{ - return a1 + a2 + a3 + a4 + a5 + a6; -} - static __init struct trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c new file mode 100644 index 000000000000..16548ee4c8c6 --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. + */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} diff --git a/kernel/trace/trace_kprobe_selftest.h b/kernel/trace/trace_kprobe_selftest.h new file mode 100644 index 000000000000..c4fc7268ba7c --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. + */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 90db994ac900..6e6cc64faa38 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_output.c * @@ -594,8 +595,7 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); + trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); @@ -606,6 +606,8 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "(%5d) ", tgid); } + trace_seq_printf(s, "[%03d] ", iter->cpu); + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dbba03ed96de..2f742b74e7e6 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c new file mode 100644 index 000000000000..71f553cceb3c --- /dev/null +++ b/kernel/trace/trace_preemptirq.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * preemptoff and irqoff tracepoints + * + * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org> + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include "trace.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/preemptirq.h> + +#ifdef CONFIG_TRACE_IRQFLAGS +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + +void trace_hardirqs_on(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } + + lockdep_hardirqs_on(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + } + + lockdep_hardirqs_off(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + tracer_hardirqs_on(CALLER_ADDR0, caller_addr); + this_cpu_write(tracing_irq_cpu, 0); + } + + lockdep_hardirqs_on(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, caller_addr); + if (!in_nmi()) + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + } + + lockdep_hardirqs_off(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); +#endif /* CONFIG_TRACE_IRQFLAGS */ + +#ifdef CONFIG_TRACE_PREEMPT_TOGGLE + +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + if (!in_nmi()) + trace_preempt_enable_rcuidle(a0, a1); + tracer_preempt_on(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + if (!in_nmi()) + trace_preempt_disable_rcuidle(a0, a1); + tracer_preempt_off(a0, a1); +} +#endif diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 50f44b7b2b32..b0875b327f5c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace binary printk * diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daf54bda4dc8..e99c3ce7aa65 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common code for probe-based Dynamic events. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * This code was copied from kernel/trace/trace_kprobe.c written by * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> * diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 75daff22ccea..5f52668e165d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common header file for probe-based Dynamic events. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * This code was copied from kernel/trace/trace_kprobe.h written by * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> * diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index e694c9f9efa4..6b1c562ffdaf 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_seq.c * diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 76d30b4ebe83..8786d17caf49 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACE_STAT_H #define __TRACE_STAT_H diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index bf89a51e740d..e696667da29a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uprobes-based tracing events * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> */ @@ -952,7 +940,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) list_del_rcu(&link->list); /* synchronize with u{,ret}probe_trace_func */ - synchronize_sched(); + synchronize_rcu(); kfree(link); if (!list_empty(&tu->tp.files)) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 752d8042bad4..9a1c22310323 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * tracing_map - lock-free map for tracing * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> * * tracing_map implementation inspired by lock-free map algorithms diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 053eb92b2d31..a6de61fc22de 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACING_MAP_H #define __TRACING_MAP_H diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6dc6356c3327..bf2c06ef9afc 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -31,6 +31,9 @@ extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; +DEFINE_SRCU(tracepoint_srcu); +EXPORT_SYMBOL_GPL(tracepoint_srcu); + /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; @@ -50,6 +53,9 @@ static LIST_HEAD(tracepoint_module_list); */ static DEFINE_MUTEX(tracepoints_mutex); +static struct rcu_head *early_probes; +static bool ok_to_free_tracepoints; + /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent @@ -67,16 +73,56 @@ static inline void *allocate_probes(int count) return p == NULL ? NULL : p->probes; } -static void rcu_free_old_probes(struct rcu_head *head) +static void srcu_free_old_probes(struct rcu_head *head) { kfree(container_of(head, struct tp_probes, rcu)); } +static void rcu_free_old_probes(struct rcu_head *head) +{ + call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); +} + +static __init int release_early_probes(void) +{ + struct rcu_head *tmp; + + ok_to_free_tracepoints = true; + + while (early_probes) { + tmp = early_probes; + early_probes = tmp->next; + call_rcu_sched(tmp, rcu_free_old_probes); + } + + return 0; +} + +/* SRCU is initialized at core_initcall */ +postcore_initcall(release_early_probes); + static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); + + /* + * We can't free probes if SRCU is not initialized yet. + * Postpone the freeing till after SRCU is initialized. + */ + if (unlikely(!ok_to_free_tracepoints)) { + tp_probes->rcu.next = early_probes; + early_probes = &tp_probes->rcu; + return; + } + + /* + * Tracepoint probes are protected by both sched RCU and SRCU, + * by calling the SRCU callback in the sched RCU callback we + * cover both cases. So let us chain the SRCU and sched RCU + * callbacks to wait for both grace periods. + */ call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); } } @@ -325,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + if (!begin) + return; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { + const int *iter; + + for (iter = (const int *)begin; iter < (const int *)end; iter++) + fct(offset_to_ptr(iter), priv); + } else { + struct tracepoint * const *iter; + + for (iter = begin; iter < end; iter++) + fct(*iter, priv); + } +} + #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { @@ -389,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); * Ensure the tracer unregistered the module's probes before the module * teardown is performed. Prevents leaks of probe and data pointers. */ -static void tp_module_going_check_quiescent(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - WARN_ON_ONCE((*iter)->funcs); + WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) @@ -448,8 +509,9 @@ static void tracepoint_module_going(struct module *mod) * Called the going notifier before checking for * quiescence. */ - tp_module_going_check_quiescent(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + for_each_tracepoint_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints, + tp_module_going_check_quiescent, NULL); break; } } @@ -501,19 +563,6 @@ static __init int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, - void (*fct)(struct tracepoint *tp, void *priv), - void *priv) -{ - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - fct(*iter, priv); -} - /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback diff --git a/kernel/user.c b/kernel/user.c index 36288d840675..0df9b1640b2a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = REFCOUNT_INIT(1), .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, @@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { - atomic_inc(&user->__count); + refcount_inc(&user->__count); return user; } } @@ -169,11 +169,8 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); - else - local_irq_restore(flags); } struct user_struct *alloc_uid(kuid_t uid) @@ -191,7 +188,7 @@ struct user_struct *alloc_uid(kuid_t uid) goto out_unlock; new->uid = uid; - atomic_set(&new->__count, 1); + refcount_set(&new->__count, 1); ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c3d7583fcd21..e5222b5fb4fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -859,7 +859,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; - ssize_t ret = -EINVAL; + ssize_t ret; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. @@ -895,19 +904,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Only allow < page size writes at the beginning of the file */ - ret = -EINVAL; - if ((*ppos != 0) || (count >= PAGE_SIZE)) - goto out; - - /* Slurp in the user data */ - kbuf = memdup_user_nul(buf, count); - if (IS_ERR(kbuf)) { - ret = PTR_ERR(kbuf); - kbuf = NULL; - goto out; - } - /* Parse the user data */ ret = -EINVAL; pos = kbuf; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 233cd8fc6910..258033d62cb3 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -18,7 +18,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(struct ctl_table *table, int write) +static void *get_uts(struct ctl_table *table) { char *which = table->data; struct uts_namespace *uts_ns; @@ -26,21 +26,9 @@ static void *get_uts(struct ctl_table *table, int write) uts_ns = current->nsproxy->uts_ns; which = (which - (char *)&init_uts_ns) + (char *)uts_ns; - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); return which; } -static void put_uts(struct ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? @@ -50,13 +38,34 @@ static int proc_do_uts_string(struct ctl_table *table, int write, { struct ctl_table uts_table; int r; + char tmp_data[__NEW_UTS_LEN + 1]; + memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); + uts_table.data = tmp_data; + + /* + * Buffer the value in tmp_data so that proc_dostring() can be called + * without holding any locks. + * We also need to read the original value in the write==1 case to + * support partial writes. + */ + down_read(&uts_sem); + memcpy(tmp_data, get_uts(table), sizeof(tmp_data)); + up_read(&uts_sem); r = proc_dostring(&uts_table, write, buffer, lenp, ppos); - put_uts(table, write, uts_table.data); - if (write) + if (write) { + /* + * Write back the new value. + * Note that, since we dropped uts_sem, the result can + * theoretically be incorrect if there are two parallel writes + * at non-zero offsets to the same sysctl. + */ + down_write(&uts_sem); + memcpy(get_uts(table), tmp_data, sizeof(tmp_data)); + up_write(&uts_sem); proc_sys_poll_notify(table->poll); + } return r; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 576d18045811..5470dce212c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -18,18 +18,14 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/sysctl.h> -#include <linux/smpboot.h> -#include <linux/sched/rt.h> -#include <uapi/linux/sched/types.h> #include <linux/tick.h> -#include <linux/workqueue.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> +#include <linux/stop_machine.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> -#include <linux/kthread.h> static DEFINE_MUTEX(watchdog_mutex); @@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void) unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -static bool softlockup_threads_initialized __read_mostly; +static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); @@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } +static DEFINE_PER_CPU(struct completion, softlockup_completion); +static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); + +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every sample_period seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static int softlockup_fn(void *data) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); + complete(this_cpu_ptr(&softlockup_completion)); + + return 0; +} + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) watchdog_interrupt_count(); /* kick the softlockup detector */ - wake_up_process(__this_cpu_read(softlockup_watchdog)); + if (completion_done(this_cpu_ptr(&softlockup_completion))) { + reinit_completion(this_cpu_ptr(&softlockup_completion)); + stop_one_cpu_nowait(smp_processor_id(), + softlockup_fn, NULL, + this_cpu_ptr(&softlockup_stop_work)); + } /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_set_prio(unsigned int policy, unsigned int prio) -{ - struct sched_param param = { .sched_priority = prio }; - - sched_setscheduler(current, policy, ¶m); -} - static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + struct completion *done = this_cpu_ptr(&softlockup_completion); + + WARN_ON_ONCE(cpu != smp_processor_id()); + + init_completion(done); + complete(done); /* * Start the timer first to prevent the NMI watchdog triggering @@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); - - watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - watchdog_set_prio(SCHED_NORMAL, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + /* * Disable the perf event first. That prevents that a large delay * between disabling the timer and disabling the perf event causes @@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu) */ watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); + wait_for_completion(this_cpu_ptr(&softlockup_completion)); } -static void watchdog_cleanup(unsigned int cpu, bool online) +static int softlockup_stop_fn(void *data) { - watchdog_disable(cpu); + watchdog_disable(smp_processor_id()); + return 0; } -static int watchdog_should_run(unsigned int cpu) +static void softlockup_stop_all(void) { - return __this_cpu_read(hrtimer_interrupts) != - __this_cpu_read(soft_lockup_hrtimer_cnt); + int cpu; + + if (!softlockup_initialized) + return; + + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); + + cpumask_clear(&watchdog_allowed_mask); } -/* - * The watchdog thread function - touches the timestamp. - * - * It only runs once every sample_period seconds (4 seconds by - * default) to reset the softlockup timestamp. If this gets delayed - * for more than 2*watchdog_thresh seconds then the debug-printout - * triggers in watchdog_timer_fn(). - */ -static void watchdog(unsigned int cpu) +static int softlockup_start_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); - __touch_watchdog(); + watchdog_enable(smp_processor_id()); + return 0; } -static struct smp_hotplug_thread watchdog_threads = { - .store = &softlockup_watchdog, - .thread_should_run = watchdog_should_run, - .thread_fn = watchdog, - .thread_comm = "watchdog/%u", - .setup = watchdog_enable, - .cleanup = watchdog_cleanup, - .park = watchdog_disable, - .unpark = watchdog_enable, -}; - -static void softlockup_update_smpboot_threads(void) +static void softlockup_start_all(void) { - lockdep_assert_held(&watchdog_mutex); - - if (!softlockup_threads_initialized) - return; + int cpu; - smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_allowed_mask); + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); } -/* Temporarily park all watchdog threads */ -static void softlockup_park_all_threads(void) +int lockup_detector_online_cpu(unsigned int cpu) { - cpumask_clear(&watchdog_allowed_mask); - softlockup_update_smpboot_threads(); + watchdog_enable(cpu); + return 0; } -/* Unpark enabled threads */ -static void softlockup_unpark_threads(void) +int lockup_detector_offline_cpu(unsigned int cpu) { - cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); - softlockup_update_smpboot_threads(); + watchdog_disable(cpu); + return 0; } static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); - softlockup_park_all_threads(); + + softlockup_stop_all(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) - softlockup_unpark_threads(); + softlockup_start_all(); + watchdog_nmi_start(); cpus_read_unlock(); /* @@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void) */ static __init void lockup_detector_setup(void) { - int ret; - /* * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. @@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void) !(watchdog_enabled && watchdog_thresh)) return; - ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_allowed_mask); - if (ret) { - pr_err("Failed to initialize soft lockup detector threads\n"); - return; - } - mutex_lock(&watchdog_mutex); - softlockup_threads_initialized = true; lockup_detector_reconfigure(); + softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static inline int watchdog_park_threads(void) { return 0; } -static inline void watchdog_unpark_threads(void) { } -static inline int watchdog_enable_all_cpus(void) { return 0; } -static inline void watchdog_disable_all_cpus(void) { } static void lockup_detector_reconfigure(void) { cpus_read_lock(); diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index e449a23e9d59..1f7020d65d0a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void) evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (IS_ERR(evt)) { - pr_info("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); return PTR_ERR(evt); } this_cpu_write(watchdog_ev, evt); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 78b192071ef7..60e80198c3df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2652,6 +2652,9 @@ void flush_workqueue(struct workqueue_struct *wq) if (WARN_ON(!wq_online)) return; + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); + mutex_lock(&wq->mutex); /* @@ -2843,7 +2846,8 @@ reflush: } EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, + bool from_cancel) { struct worker *worker = NULL; struct worker_pool *pool; @@ -2885,7 +2889,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) { + if (!from_cancel && + (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) { lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } @@ -2896,6 +2901,27 @@ already_gone: return false; } +static bool __flush_work(struct work_struct *work, bool from_cancel) +{ + struct wq_barrier barr; + + if (WARN_ON(!wq_online)) + return false; + + if (!from_cancel) { + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + } + + if (start_flush_work(work, &barr, from_cancel)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } +} + /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2909,18 +2935,7 @@ already_gone: */ bool flush_work(struct work_struct *work) { - struct wq_barrier barr; - - if (WARN_ON(!wq_online)) - return false; - - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } + return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); @@ -2986,7 +3001,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) * isn't executing. */ if (wq_online) - flush_work(work); + __flush_work(work, true); clear_work_data(work); |
