From db8dd9697238be70a6b4f9d0284cd89f59c0e070 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 30 Jan 2020 13:34:49 +0300 Subject: cgroup-v1: cgroup_pidlist_next should update position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. # mount | grep cgroup # dd if=/mnt/cgroup.procs bs=1 # normal output ... 1294 1295 1296 1304 1382 584+0 records in 584+0 records out 584 bytes copied dd: /mnt/cgroup.procs: cannot skip to specified offset 83 <<< generates end of last line 1383 <<< ... and whole last line once again 0+1 records in 0+1 records out 8 bytes copied dd: /mnt/cgroup.procs: cannot skip to specified offset 1386 <<< generates last line anyway 0+1 records in 0+1 records out 5 bytes copied https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index be1a1c83cdd1..9a6f060cbf51 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -471,6 +471,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) */ p++; if (p >= end) { + (*pos)++; return NULL; } else { *pos = *p; -- cgit v1.2.3 From 2d4ecb030dcc90fb725ecbfc82ce5d6c37906e0e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 30 Jan 2020 13:34:59 +0300 Subject: cgroup: cgroup_procs_next should increase position index If seq_file .next fuction does not change position index, read after some lseek can generate unexpected output: 1) dd bs=1 skip output of each 2nd elements $ dd if=/sys/fs/cgroup/cgroup.procs bs=8 count=1 2 3 4 5 1+0 records in 1+0 records out 8 bytes copied, 0,000267297 s, 29,9 kB/s [test@localhost ~]$ dd if=/sys/fs/cgroup/cgroup.procs bs=1 count=8 2 4 <<< NB! 3 was skipped 6 <<< ... and 5 too 8 <<< ... and 7 8+0 records in 8+0 records out 8 bytes copied, 5,2123e-05 s, 153 kB/s This happen because __cgroup_procs_start() makes an extra extra cgroup_procs_next() call 2) read after lseek beyond end of file generates whole last line. 3) read after lseek into middle of last line generates expected rest of last line and unexpected whole line once again. Additionally patch removes an extra position index changes in __cgroup_procs_start() Cc: stable@vger.kernel.org https://bugzilla.kernel.org/show_bug.cgi?id=206283 Signed-off-by: Vasily Averin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..927f7b82e5c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4595,6 +4595,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; + if (pos) + (*pos)++; + return css_task_iter_next(it); } @@ -4610,7 +4613,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, * from position 0, so we can simply keep iterating on !0 *pos. */ if (!it) { - if (WARN_ON_ONCE((*pos)++)) + if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); it = kzalloc(sizeof(*it), GFP_KERNEL); @@ -4618,10 +4621,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, return ERR_PTR(-ENOMEM); of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); - } else if (!(*pos)++) { + } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); - } + } else + return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } -- cgit v1.2.3 From 9c974c77246460fa6a92c18554c3311c8c83c160 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 24 Jan 2020 12:40:15 +0100 Subject: cgroup: Iterate tasks that did not finish do_exit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PF_EXITING is set earlier than actual removal from css_set when a task is exitting. This can confuse cgroup.procs readers who see no PF_EXITING tasks, however, rmdir is checking against css_set membership so it can transitionally fail with EBUSY. Fix this by listing tasks that weren't unlinked from css_set active lists. It may happen that other users of the task iterator (without CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This is equal to the state before commit c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") but it may be reviewed later. Reported-by: Suren Baghdasaryan Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d7ddebd0cdec..e75d2191226b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,6 +62,7 @@ struct css_task_iter { struct list_head *mg_tasks_head; struct list_head *dying_tasks_head; + struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 927f7b82e5c1..c719a4154d6d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4400,12 +4400,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) } } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) + if (!list_empty(&cset->tasks)) { it->task_pos = cset->tasks.next; - else if (!list_empty(&cset->mg_tasks)) + it->cur_tasks_head = &cset->tasks; + } else if (!list_empty(&cset->mg_tasks)) { it->task_pos = cset->mg_tasks.next; - else + it->cur_tasks_head = &cset->mg_tasks; + } else { it->task_pos = cset->dying_tasks.next; + it->cur_tasks_head = &cset->dying_tasks; + } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; @@ -4463,10 +4467,14 @@ repeat: else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) + if (it->task_pos == it->tasks_head) { it->task_pos = it->mg_tasks_head->next; - if (it->task_pos == it->mg_tasks_head) + it->cur_tasks_head = it->mg_tasks_head; + } + if (it->task_pos == it->mg_tasks_head) { it->task_pos = it->dying_tasks_head->next; + it->cur_tasks_head = it->dying_tasks_head; + } if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { @@ -4485,11 +4493,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (!atomic_read(&task->signal->live)) + if (it->cur_tasks_head == it->dying_tasks_head && + !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (task->flags & PF_EXITING) + if (it->cur_tasks_head == it->dying_tasks_head) goto repeat; } } -- cgit v1.2.3 From b0c609ab2057d0953fa05e7566f0c0e8a28fa9e1 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 14 Feb 2020 15:06:21 +0100 Subject: PM / hibernate: fix typo "reserverd_size" -> "reserved_size" Fix a mistake in a variable name in a comment. Signed-off-by: Alexandre Belloni Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ddade80ad276..d82b7b88d616 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1681,7 +1681,7 @@ static unsigned long minimum_image_size(unsigned long saveable) * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * -- cgit v1.2.3 From 279eef0531928a6669230879a6eed081513ad5a3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:38 -0600 Subject: tracing: Make sure synth_event_trace() example always uses u64 synth_event_trace() is the varargs version of synth_event_trace_array(), which takes an array of u64, as do synth_event_add_val() et al. To not only be consistent with those, but also to address the fact that synth_event_trace() expects every arg to be of the same type since it doesn't also pass in e.g. a format string, the caller needs to make sure all args are of the same type, u64. u64 is used because it needs to accomodate the largest type available in synthetic events, which is u64. This fixes the bug reported by the kernel test robot/Rong Chen. Link: https://lore.kernel.org/lkml/20200212113444.GS12867@shao2-debian/ Link: http://lkml.kernel.org/r/894c4e955558b521210ee0642ba194a9e603354c.1581720155.git.zanussi@kernel.org Fixes: 9fe41efaca084 ("tracing: Add synth event generation test module") Reported-by: kernel test robot Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 4aefe003cb7c..6866280a9b10 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"hula hoops"; /* next_comm_field */ + vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ /* Now generate a gen_synth_test event */ @@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed_2.0"; /* my_string_field */ + vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ /* Now trace an empty_synth_test event */ @@ -290,11 +290,11 @@ static int __init test_create_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ @@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void) goto out; /* next_comm_field */ - ret = synth_event_add_next_val((u64)"slinky", &trace_state); + ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state); if (ret) goto out; @@ -350,7 +350,7 @@ static int __init test_add_next_synth_val(void) goto out; /* my_string_field */ - ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state); + ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state); if (ret) goto out; @@ -396,12 +396,12 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("next_comm_field", (u64)"silly putty", + ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty", &trace_state); if (ret) goto out; - ret = synth_event_add_val("my_string_field", (u64)"thneed_9", + ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9", &trace_state); if (ret) goto out; @@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void) /* Trace some bogus values just for testing */ ret = synth_event_trace(create_synth_test, 7, /* number of values */ - 444, /* next_pid_field */ - (u64)"clackers", /* next_comm_field */ - 1000000, /* ts_ns */ - 1000, /* ts_ms */ - smp_processor_id(), /* cpu */ - (u64)"Thneed", /* my_string_field */ - 999); /* my_int_field */ + (u64)444, /* next_pid_field */ + (u64)(long)"clackers", /* next_comm_field */ + (u64)1000000, /* ts_ns */ + (u64)1000, /* ts_ms */ + (u64)smp_processor_id(),/* cpu */ + (u64)(long)"Thneed", /* my_string_field */ + (u64)999); /* my_int_field */ return ret; } -- cgit v1.2.3 From 1d9d4c90194a8c3b2f7da9f4bf3f8ba2ed810656 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:39 -0600 Subject: tracing: Make synth_event trace functions endian-correct synth_event_trace(), synth_event_trace_array() and __synth_event_add_val() write directly into the trace buffer and need to take endianness into account, like trace_event_raw_event_synth() does. Link: http://lkml.kernel.org/r/2011354355e405af9c9d28abba430d1f5ff7771a.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 62 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65b54d6a1422..6a380fb83864 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1891,7 +1891,25 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = val; + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -1943,7 +1961,26 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = vals[i]; + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } @@ -2062,8 +2099,25 @@ static int __synth_event_add_val(const char *field_name, u64 val, str_field = (char *)&entry->fields[field->offset]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else - entry->fields[field->offset] = val; + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } out: return ret; } -- cgit v1.2.3 From 3843083772dc2afde790a6d7160658b00a808da1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:40 -0600 Subject: tracing: Check that number of vals matches number of synth event fields Commit 7276531d4036('tracing: Consolidate trace() functions') inadvertently dropped the synth_event_trace() and synth_event_trace_array() checks that verify the number of values passed in matches the number of fields in the synthetic event being traced, so add them back. Link: http://lkml.kernel.org/r/32819cac708714693669e0dfe10fe9d935e94a16.1581720155.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6a380fb83864..45622194a34d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1878,6 +1878,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1914,7 +1919,7 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) } } va_end(args); - +out: __synth_event_trace_end(&state); return ret; @@ -1953,6 +1958,11 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; @@ -1984,7 +1994,7 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, n_u64++; } } - +out: __synth_event_trace_end(&state); return ret; -- cgit v1.2.3 From 784bd0847eda032ed2f3522f87250655a18c0190 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 14 Feb 2020 16:56:41 -0600 Subject: tracing: Fix number printing bug in print_synth_event() Fix a varargs-related bug in print_synth_event() which resulted in strange output and oopses on 32-bit x86 systems. The problem is that trace_seq_printf() expects the varargs to match the format string, but print_synth_event() was always passing u64 values regardless. This results in unspecified behavior when unpacking with va_arg() in trace_seq_printf(). Add a function that takes the size into account when calling trace_seq_printf(). Before: modprobe-1731 [003] .... 919.039758: gen_synth_test: next_pid_field=777(null)next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=3(null)my_string_field=thneed my_int_field=598(null) After: insmod-1136 [001] .... 36.634590: gen_synth_test: next_pid_field=777 next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=1 my_string_field=thneed my_int_field=598 Link: http://lkml.kernel.org/r/a9b59eb515dbbd7d4abe53b347dccf7a8e285657.1581720155.git.zanussi@kernel.org Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 45622194a34d..f068d55bd37f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -820,6 +820,29 @@ static const char *synth_field_fmt(char *type) return fmt; } +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + static enum print_line_t print_synth_event(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -858,10 +881,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); - trace_seq_printf(s, print_fmt, se->fields[i]->name, - entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); -- cgit v1.2.3 From 3c18a9be7c9d4f53239795282c5d927f73f534b3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 16:29:50 -0500 Subject: tracing: Have synthetic event test use raw_smp_processor_id() The test code that tests synthetic event creation pushes in as one of its test fields the current CPU using "smp_processor_id()". As this is just something to see if the value is correctly passed in, and the actual CPU used does not matter, use raw_smp_processor_id(), otherwise with debug preemption enabled, a warning happens as the smp_processor_id() is called without preemption enabled. Link: http://lkml.kernel.org/r/20200220162950.35162579@gandalf.local.home Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 6866280a9b10..7d56d621ffea 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -114,7 +114,7 @@ static int __init test_gen_synth_cmd(void) vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ @@ -221,7 +221,7 @@ static int __init test_empty_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ @@ -293,7 +293,7 @@ static int __init test_create_synth_event(void) vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ + vals[4] = raw_smp_processor_id(); /* cpu */ vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ @@ -345,7 +345,7 @@ static int __init test_add_next_synth_val(void) goto out; /* cpu */ - ret = synth_event_add_next_val(smp_processor_id(), &trace_state); + ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -388,7 +388,7 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state); + ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -427,7 +427,7 @@ static int __init test_trace_synth_event(void) (u64)(long)"clackers", /* next_comm_field */ (u64)1000000, /* ts_ns */ (u64)1000, /* ts_ms */ - (u64)smp_processor_id(),/* cpu */ + (u64)raw_smp_processor_id(), /* cpu */ (u64)(long)"Thneed", /* my_string_field */ (u64)999); /* my_int_field */ return ret; -- cgit v1.2.3 From 78041c0c9e935d9ce4086feeff6c569ed88ddfd4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Feb 2020 15:38:01 -0500 Subject: tracing: Disable trace_printk() on post poned tests The tracing seftests checks various aspects of the tracing infrastructure, and one is filtering. If trace_printk() is active during a self test, it can cause the filtering to fail, which will disable that part of the trace. To keep the selftests from failing because of trace_printk() calls, trace_printk() checks the variable tracing_selftest_running, and if set, it does not write to the tracing buffer. As some tracers were registered earlier in boot, the selftest they triggered would fail because not all the infrastructure was set up for the full selftest. Thus, some of the tests were post poned to when their infrastructure was ready (namely file system code). The postpone code did not set the tracing_seftest_running variable, and could fail if a trace_printk() was added and executed during their run. Cc: stable@vger.kernel.org Fixes: 9afecfbb95198 ("tracing: Postpone tracer start-up tests till the system is more robust") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183b031a3828..a89c562ffb8f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); + tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. @@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void) list_del(&p->list); kfree(p); } + tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 7ab215f22d04067094de8c81c20ba4c565ff8dd4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 17 Feb 2020 18:52:39 +0900 Subject: tracing: Clear trace_state when starting trace Clear trace_state data structure when starting trace in __synth_event_trace_start() internal function. Currently trace_state is initialized only in the synth_event_trace_start() API, but the trace_state in synth_event_trace() and synth_event_trace_array() are on the stack without initialization. This means those APIs will see wrong parameters and wil skip closing process in __synth_event_trace_end() because trace_state->disabled may be !0. Link: http://lkml.kernel.org/r/158193315899.8868.1781259176894639952.stgit@devnote2 Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f068d55bd37f..9d87aa1f0b79 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1824,6 +1824,8 @@ __synth_event_trace_start(struct trace_event_file *file, int entry_size, fields_size = 0; int ret = 0; + memset(trace_state, '\0', sizeof(*trace_state)); + /* * Normal event tracing doesn't get called at all unless the * ENABLED bit is set (which attaches the probe thus allowing @@ -2063,8 +2065,6 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - memset(trace_state, '\0', sizeof(*trace_state)); - ret = __synth_event_trace_start(file, trace_state); if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ -- cgit v1.2.3 From d8a953ddde5ec30a36810d0a892c3949b50849e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Feb 2020 21:18:33 +0900 Subject: bootconfig: Set CONFIG_BOOT_CONFIG=n by default Set CONFIG_BOOT_CONFIG=n by default. This also warns user if CONFIG_BOOT_CONFIG=n but "bootconfig" is given in the kernel command line. Link: http://lkml.kernel.org/r/158220111291.26565.9036889083940367969.stgit@devnote2 Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/Kconfig | 1 - init/main.c | 8 ++++++++ kernel/trace/Kconfig | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 4a672c6629d0..f586878410d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1218,7 +1218,6 @@ endif config BOOT_CONFIG bool "Boot config support" depends on BLK_DEV_INITRD - default y help Extra boot config allows system admin to pass a config file as complemental extension of kernel cmdline when booting. diff --git a/init/main.c b/init/main.c index 48c87f47a444..d96cc5f65022 100644 --- a/init/main.c +++ b/init/main.c @@ -418,6 +418,14 @@ not_found: } #else #define setup_boot_config(cmdline) do { } while (0) + +static int __init warn_bootconfig(char *str) +{ + pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOTCONFIG is not set.\n"); + return 0; +} +early_param("bootconfig", warn_bootconfig); + #endif /* Change NUL term back to "=", to make "param" the whole string. */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 91e885194dbc..795c3e02d3f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -143,7 +143,8 @@ if FTRACE config BOOTTIME_TRACING bool "Boot-time Tracing support" - depends on BOOT_CONFIG && TRACING + depends on TRACING + select BOOT_CONFIG default y help Enable developer to setup ftrace subsystem via supplemental -- cgit v1.2.3 From 2ad3e17ebf94b7b7f3f64c050ff168f9915345eb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 22 Feb 2020 20:36:47 -0500 Subject: audit: fix error handling in audit_data_to_entry() Commit 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") combined a number of separate fields in the audit_field struct into a single union. Generally this worked just fine because they are generally mutually exclusive. Unfortunately in audit_data_to_entry() the overlap can be a problem when a specific error case is triggered that causes the error path code to attempt to cleanup an audit_field struct and the cleanup involves attempting to free a stored LSM string (the lsm_str field). Currently the code always has a non-NULL value in the audit_field.lsm_str field as the top of the for-loop transfers a value into audit_field.val (both .lsm_str and .val are part of the same union); if audit_data_to_entry() fails and the audit_field struct is specified to contain a LSM string, but the audit_field.lsm_str has not yet been properly set, the error handling code will attempt to free the bogus audit_field.lsm_str value that was set with audit_field.val at the top of the for-loop. This patch corrects this by ensuring that the audit_field.val is only set when needed (it is cleared when the audit_field struct is allocated with kcalloc()). It also corrects a few other issues to ensure that in case of error the proper error code is returned. Cc: stable@vger.kernel.org Fixes: 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive") Reported-by: syzbot+1f4d90ead370d72e450b@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/auditfilter.c | 71 +++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b0126e9c0743..026e34da4ace 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 f_val; err = -EINVAL; @@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; f->type = data->fields[i]; - f->val = data->values[i]; + f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; - f->val = 0; + f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } @@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: - f->uid = make_kuid(current_user_ns(), f->val); + f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; @@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - f->gid = make_kgid(current_user_ns(), f->val); + f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: + f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: @@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } + entry->rule.buflen += f_val; + f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they @@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; - } - if (err) { - kfree(str); + } else if (err) goto exit_free; - } else - f->lsm_str = str; break; case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); + } + err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } + entry->rule.buflen += f_val; break; case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; + entry->rule.buflen += f_val; break; case AUDIT_INODE: + f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; + } + entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: - if (entry->rule.exe || f->val > PATH_MAX) + if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); + str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } - entry->rule.buflen += f->val; - - audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } + entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; + default: + f->val = f_val; + break; } } -- cgit v1.2.3 From 756125289285f6e55a03861bf4b6257aa3d19a93 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 24 Feb 2020 16:38:57 -0500 Subject: audit: always check the netlink payload length in audit_receive_msg() This patch ensures that we always check the netlink payload length in audit_receive_msg() before we take any action on the payload itself. Cc: stable@vger.kernel.org Reported-by: syzbot+399c44bf1f43b8747403@syzkaller.appspotmail.com Reported-by: syzbot+e4b12d8d202701f08b6d@syzkaller.appspotmail.com Signed-off-by: Paul Moore --- kernel/audit.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 17b0d523afb3..9ddfe2aa6671 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature audit_log_end(ab); } -static int audit_set_feature(struct sk_buff *skb) +static int audit_set_feature(struct audit_features *uaf) { - struct audit_features *uaf; int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); - uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; + int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); + data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { @@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) @@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; break; case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); + if (data_len < sizeof(struct audit_features)) + return -EINVAL; + err = audit_set_feature(data); if (err) return err; break; @@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ + char *str = data; + err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); @@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } audit_log_user_recv_msg(&ab, msg_type); - if (msg_type != AUDIT_USER_TTY) + if (msg_type != AUDIT_USER_TTY) { + /* ensure NULL termination */ + str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, - (char *)data); - else { - int size; - + str); + } else { audit_log_format(ab, " data="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); + if (data_len > 0 && str[data_len - 1] == '\0') + data_len--; + audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, @@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); + size_t msglen = data_len; char *old, *new; err = -EINVAL; @@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) -- cgit v1.2.3 From c780e86dd48ef6467a1146cf7d0fe1e05a635039 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Feb 2020 15:28:12 +0100 Subject: blktrace: Protect q->blk_trace with RCU KASAN is reporting that __blk_add_trace() has a use-after-free issue when accessing q->blk_trace. Indeed the switching of block tracing (and thus eventual freeing of q->blk_trace) is completely unsynchronized with the currently running tracing and thus it can happen that the blk_trace structure is being freed just while __blk_add_trace() works on it. Protect accesses to q->blk_trace by RCU during tracing and make sure we wait for the end of RCU grace period when shutting down tracing. Luckily that is rare enough event that we can afford that. Note that postponing the freeing of blk_trace to an RCU callback should better be avoided as it could have unexpected user visible side-effects as debugfs files would be still existing for a short while block tracing has been shut down. Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711 CC: stable@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Bart Van Assche Reported-by: Tristan Madani Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- include/linux/blktrace_api.h | 18 +++++-- kernel/trace/blktrace.c | 114 +++++++++++++++++++++++++++++++------------ 3 files changed, 97 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 053ea4b51988..10455b2bbbb4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -524,7 +524,7 @@ struct request_queue { unsigned int sg_reserved_size; int node; #ifdef CONFIG_BLK_DEV_IO_TRACE - struct blk_trace *blk_trace; + struct blk_trace __rcu *blk_trace; struct mutex blk_trace_mutex; #endif /* diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 7bb2d8de9f30..3b6ff5902edc 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -51,9 +51,13 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f **/ #define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ do { \ - struct blk_trace *bt = (q)->blk_trace; \ + struct blk_trace *bt; \ + \ + rcu_read_lock(); \ + bt = rcu_dereference((q)->blk_trace); \ if (unlikely(bt)) \ __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ + rcu_read_unlock(); \ } while (0) #define blk_add_trace_msg(q, fmt, ...) \ blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__) @@ -61,10 +65,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f static inline bool blk_trace_note_message_enabled(struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; - if (likely(!bt)) - return false; - return bt->act_mask & BLK_TC_NOTIFY; + struct blk_trace *bt; + bool ret; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + ret = bt && (bt->act_mask & BLK_TC_NOTIFY); + rcu_read_unlock(); + return ret; } extern void blk_add_driver_data(struct request_queue *q, struct request *rq, diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0735ae8545d8..4560878f0bac 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -335,6 +335,7 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } @@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { mutex_lock(&q->blk_trace_mutex); - - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } @@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; @@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what, u64 cgid) { - struct blk_trace *bt = rq->q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); @@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error, __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, error, 0, NULL, cgid); + rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, @@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } @@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } + rcu_read_unlock(); } /** @@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); @@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } /** @@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } /** @@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->blk_trace_mutex); @@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: -- cgit v1.2.3 From 2910b5aa6f545c044173a5cab3dbb7f43e23916d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 25 Feb 2020 23:36:41 +0900 Subject: bootconfig: Fix CONFIG_BOOTTIME_TRACING dependency issue Since commit d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") also changed the CONFIG_BOOTTIME_TRACING to select CONFIG_BOOT_CONFIG to show the boot-time tracing on the menu, it introduced wrong dependencies with BLK_DEV_INITRD as below. WARNING: unmet direct dependencies detected for BOOT_CONFIG Depends on [n]: BLK_DEV_INITRD [=n] Selected by [y]: - BOOTTIME_TRACING [=y] && TRACING_SUPPORT [=y] && FTRACE [=y] && TRACING [=y] This makes the CONFIG_BOOT_CONFIG selects CONFIG_BLK_DEV_INITRD to fix this error and make CONFIG_BOOTTIME_TRACING=n by default, so that both boot-time tracing and boot configuration off but those appear on the menu list. Link: http://lkml.kernel.org/r/158264140162.23842.11237423518607465535.stgit@devnote2 Fixes: d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default") Reported-by: Randy Dunlap Compiled-tested-by: Randy Dunlap Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/Kconfig | 2 +- kernel/trace/Kconfig | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index a84e7aa89a29..8b4c3e8c05ea 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1217,7 +1217,7 @@ endif config BOOT_CONFIG bool "Boot config support" - depends on BLK_DEV_INITRD + select BLK_DEV_INITRD help Extra boot config allows system admin to pass a config file as complemental extension of kernel cmdline when booting. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 795c3e02d3f1..402eef84c859 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -145,7 +145,6 @@ config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING select BOOT_CONFIG - default y help Enable developer to setup ftrace subsystem via supplemental kernel cmdline at boot time for debugging (tracing) driver -- cgit v1.2.3 From fda31c50292a5062332fa0343c084bd9f46604d9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 24 Feb 2020 12:47:14 -0800 Subject: signal: avoid double atomic counter increments for user accounting When queueing a signal, we increment both the users count of pending signals (for RLIMIT_SIGPENDING tracking) and we increment the refcount of the user struct itself (because we keep a reference to the user in the signal structure in order to correctly account for it when freeing). That turns out to be fairly expensive, because both of them are atomic updates, and particularly under extreme signal handling pressure on big machines, you can get a lot of cache contention on the user struct. That can then cause horrid cacheline ping-pong when you do these multiple accesses. So change the reference counting to only pin the user for the _first_ pending signal, and to unpin it when the last pending signal is dequeued. That means that when a user sees a lot of concurrent signal queuing - which is the only situation when this matters - the only atomic access needed is generally the 'sigpending' count update. This was noticed because of a particularly odd timing artifact on a dual-socket 96C/192T Cascade Lake platform: when you get into bad contention, on that machine for some reason seems to be much worse when the contention happens in the upper 32-byte half of the cacheline. As a result, the kernel test robot will-it-scale 'signal1' benchmark had an odd performance regression simply due to random alignment of the 'struct user_struct' (and pointed to a completely unrelated and apparently nonsensical commit for the regression). Avoiding the double increments (and decrements on the dequeueing side, of course) makes for much less contention and hugely improved performance on that will-it-scale microbenchmark. Quoting Feng Tang: "It makes a big difference, that the performance score is tripled! bump from original 17000 to 54000. Also the gap between 5.0-rc6 and 5.0-rc6+Jiri's patch is reduced to around 2%" [ The "2% gap" is the odd cacheline placement difference on that platform: under the extreme contention case, the effect of which half of the cacheline was hot was 5%, so with the reduced contention the odd timing artifact is reduced too ] It does help in the non-contended case too, but is not nearly as noticeable. Reported-and-tested-by: Feng Tang Cc: Eric W. Biederman Cc: Huang, Ying Cc: Philip Li Cc: Andi Kleen Cc: Jiri Olsa Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/signal.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9ad8dea93dbb..5b2396350dd1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi { struct sigqueue *q = NULL; struct user_struct *user; + int sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. + * + * NOTE! A pending signal will hold on to the user refcount, + * and we get/put the refcount only when the sigpending count + * changes from/to zero. */ rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); + user = __task_cred(t)->user; + sigpending = atomic_inc_return(&user->sigpending); + if (sigpending == 1) + get_uid(user); rcu_read_unlock(); - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); + if (atomic_dec_and_test(&user->sigpending)) + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; @@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); + if (atomic_dec_and_test(&q->user->sigpending)) + free_uid(q->user); kmem_cache_free(sigqueue_cachep, q); } -- cgit v1.2.3 From 289de35984815576793f579ec27248609e75976e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 18 Feb 2020 15:45:34 +0100 Subject: sched/fair: Fix statistics for find_idlest_group() sgs->group_weight is not set while gathering statistics in update_sg_wakeup_stats(). This means that a group can be classified as fully busy with 0 running tasks if utilization is high enough. This path is mainly used for fork and exec. Fixes: 57abff067a08 ("sched/fair: Rework find_idlest_group()") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20200218144534.4564-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c8a379c357e..c1217bfe5e81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8337,6 +8337,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_capacity = group->sgc->capacity; + sgs->group_weight = group->group_weight; + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); /* -- cgit v1.2.3 From 0c282b068eb26db0e85e2ab4ec6d1e932acda841 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Mon, 27 Jan 2020 23:28:21 +0530 Subject: fork: Use RCU_INIT_POINTER() instead of rcu_access_pointer() Use RCU_INIT_POINTER() instead of rcu_access_pointer() in copy_sighand(). Suggested-by: Oleg Nesterov Signed-off-by: Madhuparna Bhowmik Acked-by: Oleg Nesterov Acked-by: Christian Brauner [christian.brauner@ubuntu.com: edit commit message] Link: https://lore.kernel.org/r/20200127175821.10833-1-madhuparnabhowmik10@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..86425305cd4a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1508,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); + RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; -- cgit v1.2.3 From 22a34c6fe0ffc1d92ee26a25913fadf347258fd6 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 30 Jan 2020 11:50:28 +0530 Subject: exit: Fix Sparse errors and warnings This patch fixes the following sparse error: kernel/exit.c:627:25: error: incompatible types in comparison expression And the following warning: kernel/exit.c:626:40: warning: incorrect type in assignment Signed-off-by: Madhuparna Bhowmik Acked-by: Oleg Nesterov Acked-by: Christian Brauner [christian.brauner@ubuntu.com: edit commit message] Link: https://lore.kernel.org/r/20200130062028.4870-1-madhuparnabhowmik10@gmail.com Signed-off-by: Christian Brauner --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2833ffb0c211..0b81b26a872a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -619,8 +619,8 @@ static void forget_original_parent(struct task_struct *father, reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { - t->real_parent = reaper; - BUG_ON((!t->ptrace) != (t->parent == father)); + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) -- cgit v1.2.3 From ecc421e05bab97cf3ff4fe456ade47ef84dba8c2 Mon Sep 17 00:00:00 2001 From: Cyril Hrubis Date: Tue, 3 Mar 2020 16:06:38 +0100 Subject: sys/sysinfo: Respect boottime inside time namespace The sysinfo() syscall includes uptime in seconds but has no correction for time namespaces which makes it inconsistent with the /proc/uptime inside of a time namespace. Add the missing time namespace adjustment call. Signed-off-by: Cyril Hrubis Signed-off-by: Thomas Gleixner Reviewed-by: Dmitry Safonov Link: https://lkml.kernel.org/r/20200303150638.7329-1-chrubis@suse.cz --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f9bc5c303e3f..d325f3ab624a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -2546,6 +2547,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); + timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); -- cgit v1.2.3 From 190ecb190a9cd8c0599d8499b901e3c32e87966a Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sun, 23 Feb 2020 22:00:07 -0500 Subject: cgroup: fix psi_show() crash on 32bit ino archs Similar to the commit d7495343228f ("cgroup: fix incorrect WARN_ON_ONCE() in cgroup_setup_root()"), cgroup_id(root_cgrp) does not equal to 1 on 32bit ino archs which triggers all sorts of issues with psi_show() on s390x. For example, BUG: KASAN: slab-out-of-bounds in collect_percpu_times+0x2d0/ Read of size 4 at addr 000000001e0ce000 by task read_all/3667 collect_percpu_times+0x2d0/0x798 psi_show+0x7c/0x2a8 seq_read+0x2ac/0x830 vfs_read+0x92/0x150 ksys_read+0xe2/0x188 system_call+0xd8/0x2b4 Fix it by using cgroup_ino(). Fixes: 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") Signed-off-by: Qian Cai Acked-by: Johannes Weiner Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v5.5 --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c719a4154d6d..7a39dc882095 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3542,21 +3542,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } -- cgit v1.2.3 From 2e5383d7904e60529136727e49629a82058a5607 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 19 Feb 2020 12:01:29 -0700 Subject: cgroup1: don't call release_agent when it is "" Older (and maybe current) versions of systemd set release_agent to "" when shutting down, but do not set notify_on_release to 0. Since 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()"), we filter out such calls when the user mode helper path is "". However, when used in conjunction with an actual (i.e. non "") STATIC_USERMODEHELPER, the path is never "", so the real usermode helper will be called with argv[0] == "". Let's avoid this by not invoking the release_agent when it is "". Signed-off-by: Tycho Andersen Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 9a6f060cbf51..f2d7cea86ffe 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -783,7 +783,7 @@ void cgroup1_release_agent(struct work_struct *work) pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) + if (!pathbuf || !agentbuf || !strlen(agentbuf)) goto out; spin_lock_irq(&css_set_lock); -- cgit v1.2.3 From 153031a301bb07194e9c37466cfce8eacb977621 Mon Sep 17 00:00:00 2001 From: Cengiz Can Date: Wed, 4 Mar 2020 13:58:19 +0300 Subject: blktrace: fix dereference after null check There was a recent change in blktrace.c that added a RCU protection to `q->blk_trace` in order to fix a use-after-free issue during access. However the change missed an edge case that can lead to dereferencing of `bt` pointer even when it's NULL: Coverity static analyzer marked this as a FORWARD_NULL issue with CID 1460458. ``` /kernel/trace/blktrace.c: 1904 in sysfs_blk_trace_attr_store() 1898 ret = 0; 1899 if (bt == NULL) 1900 ret = blk_trace_setup_queue(q, bdev); 1901 1902 if (ret == 0) { 1903 if (attr == &dev_attr_act_mask) >>> CID 1460458: Null pointer dereferences (FORWARD_NULL) >>> Dereferencing null pointer "bt". 1904 bt->act_mask = value; 1905 else if (attr == &dev_attr_pid) 1906 bt->pid = value; 1907 else if (attr == &dev_attr_start_lba) 1908 bt->start_lba = value; 1909 else if (attr == &dev_attr_end_lba) ``` Added a reassignment with RCU annotation to fix the issue. Fixes: c780e86dd48 ("blktrace: Protect q->blk_trace with RCU") Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Reviewed-by: Bob Liu Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Cengiz Can Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4560878f0bac..ca39dc3230cb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1896,8 +1896,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (bt == NULL) + if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); + } if (ret == 0) { if (attr == &dev_attr_act_mask) -- cgit v1.2.3 From 8019ad13ef7f64be44d4f892af9c840179009254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2020 11:28:31 +0100 Subject: futex: Fix inode life-time issue As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier. This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue. Reported-by: Jann Horn Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) --- fs/inode.c | 1 + include/linux/fs.h | 1 + include/linux/futex.h | 17 ++++++---- kernel/futex.c | 89 ++++++++++++++++++++++++++++++--------------------- 4 files changed, 65 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/fs/inode.c b/fs/inode.c index 7d57068b6b7a..93d9252a00ab 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; + atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3cd4fe6b845e..abedbffe2c9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -698,6 +698,7 @@ struct inode { struct rcu_head i_rcu; }; atomic64_t i_version; + atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; diff --git a/include/linux/futex.h b/include/linux/futex.h index 5cc3fed27d4c..b70df27d7e85 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -31,23 +31,26 @@ struct task_struct; union futex_key { struct { + u64 i_seq; unsigned long pgoff; - struct inode *inode; - int offset; + unsigned int offset; } shared; struct { + union { + struct mm_struct *mm; + u64 __tmp; + }; unsigned long address; - struct mm_struct *mm; - int offset; + unsigned int offset; } private; struct { + u64 ptr; unsigned long word; - void *ptr; - int offset; + unsigned int offset; } both; }; -#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } } #ifdef CONFIG_FUTEX enum { diff --git a/kernel/futex.c b/kernel/futex.c index 0cf84c8664f2..e14f7cd45dbd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,7 +429,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies smp_mb(); (B) */ @@ -463,7 +463,6 @@ static void drop_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -505,6 +504,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +556,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -659,8 +704,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,40 +735,14 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page); return err; -- cgit v1.2.3 From b26ebfe12f34f372cf041c6f801fa49c3fb382c5 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 6 Mar 2020 11:23:14 -0600 Subject: pid: Fix error return value in some cases Recent changes to alloc_pid() allow the pid number to be specified on the command line. If set_tid_size is set, then the code scanning the levels will hard-set retval to -EPERM, overriding it's previous -ENOMEM value. After the code scanning the levels, there are error returns that do not set retval, assuming it is still set to -ENOMEM. So set retval back to -ENOMEM after scanning the levels. Fixes: 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") Signed-off-by: Corey Minyard Acked-by: Christian Brauner Cc: Andrei Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Oleg Nesterov Cc: Adrian Reber Cc: # 5.5 Link: https://lore.kernel.org/r/20200306172314.12232-1-minyard@acm.org [christian.brauner@ubuntu.com: fixup commit message] Signed-off-by: Christian Brauner --- kernel/pid.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 0f4ecb57214c..19645b25b77c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + retval = -ENOMEM; + if (unlikely(is_child_reaper(pid))) { if (pid_ns_prepare_proc(ns)) goto out_free; -- cgit v1.2.3 From 8d67743653dce5a0e7aa500fcccb237cde7ad88e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 8 Mar 2020 19:07:17 +0100 Subject: futex: Unbreak futex hashing The recent futex inode life time fix changed the ordering of the futex key union struct members, but forgot to adjust the hash function accordingly, As a result the hashing omits the leading 64bit and even hashes beyond the futex key causing a bad hash distribution which led to a ~100% performance regression. Hand in the futex key pointer instead of a random struct member and make the size calculation based of the struct offset. Fixes: 8019ad13ef7f ("futex: Fix inode life-time issue") Reported-by: Rong Chen Decoded-by: Linus Torvalds Signed-off-by: Thomas Gleixner Tested-by: Rong Chen Link: https://lkml.kernel.org/r/87h7yy90ve.fsf@nanos.tec.linutronix.de --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e14f7cd45dbd..82dfacb3250e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -385,9 +385,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } -- cgit v1.2.3 From 10dab84caf400f2f5f8b010ebb0c7c4272ec5093 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 8 Mar 2020 14:29:17 +0100 Subject: pid: make ENOMEM return value more obvious The alloc_pid() codepath used to be simpler. With the introducation of the ability to choose specific pids in 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") it got more complex. It hasn't been super obvious that ENOMEM is returned when the pid namespace init process/child subreaper of the pid namespace has died. As can be seen from multiple attempts to improve this see e.g. [1] and most recently [2]. We regressed returning ENOMEM in [3] and [2] restored it. Let's add a comment on top explaining that this is historic and documented behavior and cannot easily be changed. [1]: 35f71bc0a09a ("fork: report pid reservation failure properly") [2]: b26ebfe12f34 ("pid: Fix error return value in some cases") [3]: 49cb2fc42ce4 ("fork: extend clone3() to support setting a PID") Signed-off-by: Christian Brauner --- kernel/pid.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 19645b25b77c..647b4bb457b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,14 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + /* + * ENOMEM is not the most obvious choice especially for the case + * where the child subreaper has already exited and the pid + * namespace denies the creation of any new processes. But ENOMEM + * is what we have exposed to userspace for a long time and it is + * documented behavior for pid namespaces. So we can't easily + * change it even if there were an error code better suited. + */ retval = -ENOMEM; if (unlikely(is_child_reaper(pid))) { -- cgit v1.2.3 From aa202f1f56960c60e7befaa0f49c72b8fa11b0a8 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Fri, 24 Jan 2020 20:14:45 -0500 Subject: workqueue: don't use wq_select_unbound_cpu() for bound works wq_select_unbound_cpu() is designed for unbound workqueues only, but it's wrongly called when using a bound workqueue too. Fixing this ensures work queued to a bound workqueue with cpu=WORK_CPU_UNBOUND always runs on the local CPU. Before, that would happen only if wq_unbound_cpumask happened to include it (likely almost always the case), or was empty, or we got lucky with forced round-robin placement. So restricting /sys/devices/virtual/workqueue/cpumask to a small subset of a machine's CPUs would cause some bound work items to run unexpectedly there. Fixes: ef557180447f ("workqueue: schedule WORK_CPU_UNBOUND work on wq_unbound_cpumask CPUs") Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Hillf Danton [dj: massage changelog] Signed-off-by: Daniel Jordan Cc: Tejun Heo Cc: Lai Jiangshan Cc: linux-kernel@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 301db4406bc3..4e01c448b4b4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1411,14 +1411,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; rcu_read_lock(); retry: - if (req_cpu == WORK_CPU_UNBOUND) - cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - /* pwq which will be used unless @work is executing elsewhere */ - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else + if (wq->flags & WQ_UNBOUND) { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + } else { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + } /* * If @work was previously on a different pool, it might still be -- cgit v1.2.3 From e876ecc67db80dfdb8e237f71e5b43bb88ae549c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 9 Mar 2020 22:16:05 -0700 Subject: cgroup: memcg: net: do not associate sock with unrelated cgroup We are testing network memory accounting in our setup and noticed inconsistent network memory usage and often unrelated cgroups network usage correlates with testing workload. On further inspection, it seems like mem_cgroup_sk_alloc() and cgroup_sk_alloc() are broken in irq context specially for cgroup v1. mem_cgroup_sk_alloc() and cgroup_sk_alloc() can be called in irq context and kind of assumes that this can only happen from sk_clone_lock() and the source sock object has already associated cgroup. However in cgroup v1, where network memory accounting is opt-in, the source sock can be unassociated with any cgroup and the new cloned sock can get associated with unrelated interrupted cgroup. Cgroup v2 can also suffer if the source sock object was created by process in the root cgroup or if sk_alloc() is called in irq context. The fix is to just do nothing in interrupt. WARNING: Please note that about half of the TCP sockets are allocated from the IRQ context, so, memory used by such sockets will not be accouted by the memcg. The stack trace of mem_cgroup_sk_alloc() from IRQ-context: CPU: 70 PID: 12720 Comm: ssh Tainted: 5.6.0-smp-DEV #1 Hardware name: ... Call Trace: dump_stack+0x57/0x75 mem_cgroup_sk_alloc+0xe9/0xf0 sk_clone_lock+0x2a7/0x420 inet_csk_clone_lock+0x1b/0x110 tcp_create_openreq_child+0x23/0x3b0 tcp_v6_syn_recv_sock+0x88/0x730 tcp_check_req+0x429/0x560 tcp_v6_rcv+0x72d/0xa40 ip6_protocol_deliver_rcu+0xc9/0x400 ip6_input+0x44/0xd0 ? ip6_protocol_deliver_rcu+0x400/0x400 ip6_rcv_finish+0x71/0x80 ipv6_rcv+0x5b/0xe0 ? ip6_sublist_rcv+0x2e0/0x2e0 process_backlog+0x108/0x1e0 net_rx_action+0x26b/0x460 __do_softirq+0x104/0x2a6 do_softirq_own_stack+0x2a/0x40 do_softirq.part.19+0x40/0x50 __local_bh_enable_ip+0x51/0x60 ip6_finish_output2+0x23d/0x520 ? ip6table_mangle_hook+0x55/0x160 __ip6_finish_output+0xa1/0x100 ip6_finish_output+0x30/0xd0 ip6_output+0x73/0x120 ? __ip6_finish_output+0x100/0x100 ip6_xmit+0x2e3/0x600 ? ipv6_anycast_cleanup+0x50/0x50 ? inet6_csk_route_socket+0x136/0x1e0 ? skb_free_head+0x1e/0x30 inet6_csk_xmit+0x95/0xf0 __tcp_transmit_skb+0x5b4/0xb20 __tcp_send_ack.part.60+0xa3/0x110 tcp_send_ack+0x1d/0x20 tcp_rcv_state_process+0xe64/0xe80 ? tcp_v6_connect+0x5d1/0x5f0 tcp_v6_do_rcv+0x1b1/0x3f0 ? tcp_v6_do_rcv+0x1b1/0x3f0 __release_sock+0x7f/0xd0 release_sock+0x30/0xa0 __inet_stream_connect+0x1c3/0x3b0 ? prepare_to_wait+0xb0/0xb0 inet_stream_connect+0x3b/0x60 __sys_connect+0x101/0x120 ? __sys_getsockopt+0x11b/0x140 __x64_sys_connect+0x1a/0x20 do_syscall_64+0x51/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The stack trace of mem_cgroup_sk_alloc() from IRQ-context: Fixes: 2d7580738345 ("mm: memcontrol: consolidate cgroup socket tracking") Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Signed-off-by: David S. Miller --- kernel/cgroup/cgroup.c | 4 ++++ mm/memcontrol.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..6b2fc56b2201 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6258,6 +6258,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) return; } + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) + return; + rcu_read_lock(); while (true) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d09776cd6e10..1cf41ee22258 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6696,6 +6696,10 @@ void mem_cgroup_sk_alloc(struct sock *sk) return; } + /* Do not associate the sock with unrelated interrupted task's memcg. */ + if (in_interrupt()) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) -- cgit v1.2.3 From d9815bff6b379ff46981bea9dfeb146081eab314 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Fri, 6 Mar 2020 18:43:17 +0100 Subject: ftrace: Return the first found result in lookup_rec() It appears that ip ranges can overlap so. In that case lookup_rec() returns whatever results it got last even if it found nothing in last searched page. This breaks an obscure livepatch late module patching usecase: - load livepatch - load the patched module - unload livepatch - try to load livepatch again To fix this return from lookup_rec() as soon as it found the record containing searched-for ip. This used to be this way prior lookup_rec() introduction. Link: http://lkml.kernel.org/r/20200306174317.21699-1-asavkov@redhat.com Cc: stable@vger.kernel.org Fixes: 7e16f581a817 ("ftrace: Separate out functionality from ftrace_location_range()") Signed-off-by: Artem Savkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f7ee102868a..fd81c7de77a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1547,6 +1547,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); + if (rec) + break; } return rec; } -- cgit v1.2.3