aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/scripts/python/netdev-times.py
blob: 9aa0a32972e80fa7f49b1226751f04970e299882 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
# Display the processing flow of packets and the time spent at each stage.
# It helps to investigate networking or network device behavior.
#
# options
# tx: show only the tx chart
# rx: show only the rx chart
# dev=: show only entries related to the specified device
# debug: run in debug mode; it shows the buffer status.

import os
import sys

sys.path.append(os.environ['PERF_EXEC_PATH'] + \
	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')

from perf_trace_context import *
from Core import *
from Util import *

# every tracepoint event this script cares about, in arrival order
all_event_list = []

# key is cpu, value is a list used as a stack of the irq records seen
# on that cpu
irq_dic = {}

# key is cpu, value holds the NET_RX softirq-entry time and the list of
# receive events collected during that softirq
net_rx_dic = {}

# completed receive sequences, one entry per NET_RX softirq
receive_hunk_list = []

# received packets waiting to be matched with skb_copy_datagram_iovec
rx_skb_list = []

# the budget of rx_skb_list, tx_queue_list and tx_xmit_list
buffer_budget = 65536

# overflow count of rx_skb_list
of_count_rx_skb_list = 0

# packets which passed through dev_queue_xmit, and the overflow count
tx_queue_list = []
of_count_tx_queue_list = 0

# packets which passed through dev_hard_start_xmit, and the overflow count
tx_xmit_list = []
of_count_tx_xmit_list = 0

# packets which have been freed
tx_free_list = []

# options
show_tx = 0
show_rx = 0
# name of the device given by option "dev="; 0 means no device filter
dev = 0
debug = 0

# positions of the fields that start every event_info tuple
(EINFO_IDX_NAME, EINFO_IDX_CONTEXT, EINFO_IDX_CPU,
 EINFO_IDX_TIME, EINFO_IDX_PID, EINFO_IDX_COMM) = range(6)

# Return the interval from src to dst in milliseconds; both arguments
# are timestamps in nanoseconds.  The result is negative when dst
# precedes src.
def diff_msec(src, dst):
	elapsed_nsec = dst - src
	return elapsed_nsec / 1000000.0

# Display a process of transmitting a packet
#
# hunk is a dict with the keys 'dev', 'len', 'queue_t', 'xmit_t' and
# 'free_t' (the three times are nsec timestamps).  Nothing is printed
# when a device filter is set (dev != 0) and the filter string does not
# occur in hunk['dev'].
def print_transmit(hunk):
	if dev != 0 and dev not in hunk['dev']:
		return
	# print(...) with one parenthesized argument behaves the same under
	# Python 2 and Python 3; '//' keeps the usec part an integer on both.
	print("%7s %5d %6d.%06dsec %12.3fmsec      %12.3fmsec" %
		(hunk['dev'], hunk['len'],
		nsecs_secs(hunk['queue_t']),
		nsecs_nsecs(hunk['queue_t']) // 1000,
		diff_msec(hunk['queue_t'], hunk['xmit_t']),
		diff_msec(hunk['xmit_t'], hunk['free_t'])))

# printf-style templates for the lines of the rx chart.  The two JOINT
# templates take no arguments; every other template takes a millisecond
# offset first, followed by the details named in the template.
PF_IRQ_ENTRY = "  irq_entry(+%.3fmsec irq=%d:%s)"
PF_SOFT_ENTRY = "  softirq_entry(+%.3fmsec)"
PF_NAPI_POLL = "  napi_poll_exit(+%.3fmsec %s)"
PF_JOINT = "         |"
PF_WJOINT = "         |            |"
PF_NET_RECV = "         |---netif_receive_skb(+%.3fmsec skb=%x len=%d)"
PF_NET_RX = "         |---netif_rx(+%.3fmsec skb=%x)"
PF_CPY_DGRAM = "         |      skb_copy_datagram_iovec(+%.3fmsec %d:%s)"
PF_KFREE_SKB = "         |      kfree_skb(+%.3fmsec location=%x)"
PF_CONS_SKB = "         |      consume_skb(+%.3fmsec)"

# Display a process of received packets and interrupts associated with
# a NET_RX softirq
#
# hunk is a dict with the keys 'irq_list' (irq records, each with 'cpu',
# 'irq', 'name', 'irq_ent_t' and optionally 'event_list'), 'sirq_ent_t'
# and 'event_list' (napi_poll / netif_receive_skb records).  All offsets
# are printed relative to the entry time of the first irq.  When a device
# filter is set (dev != 0) the hunk is shown only if the filter string
# occurs in the name of at least one irq.
def print_receive(hunk):
	irq_list = hunk['irq_list']
	cpu = irq_list[0]['cpu']
	base_t = irq_list[0]['irq_ent_t']
	# check if this hunk should be showed
	if dev != 0 and not any(dev in irq['name'] for irq in irq_list):
		return

	# print(...) with one parenthesized argument behaves the same under
	# Python 2 and Python 3; '//' keeps the usec part an integer on both.
	print("%d.%06dsec cpu=%d" %
		(nsecs_secs(base_t), nsecs_nsecs(base_t) // 1000, cpu))
	for irq in irq_list:
		print(PF_IRQ_ENTRY %
			(diff_msec(base_t, irq['irq_ent_t']),
			irq['irq'], irq['name']))
		print(PF_JOINT)
		# an irq record that collected no events has no 'event_list' key
		for irq_event in irq.get('event_list', []):
			# only netif_rx events are charted; others are skipped
			if irq_event['event'] == 'netif_rx':
				print(PF_NET_RX %
					(diff_msec(base_t, irq_event['time']),
					irq_event['skbaddr']))
				print(PF_JOINT)
	print(PF_SOFT_ENTRY % diff_msec(base_t, hunk['sirq_ent_t']))
	print(PF_JOINT)
	event_list = hunk['event_list']
	last = len(event_list) - 1
	for i, event in enumerate(event_list):
		if event['event_name'] == 'napi_poll':
			print(PF_NAPI_POLL %
				(diff_msec(base_t, event['event_t']), event['dev']))
			# a napi_poll in final position closes the chart with a
			# blank line instead of a joint
			if i == last:
				print("")
			else:
				print(PF_JOINT)
		else:
			print(PF_NET_RECV %
				(diff_msec(base_t, event['event_t']), event['skbaddr'],
				event['len']))
			# a record carrying 'comm' is shown as a copy to user space;
			# one carrying only 'handle' is shown as the named free
			if 'comm' in event:
				print(PF_WJOINT)
				print(PF_CPY_DGRAM %
					(diff_msec(base_t, event['comm_t']),
					event['pid'], event['comm']))
			elif 'handle' in event:
				print(PF_WJOINT)
				if event['handle'] == "kfree_skb":
					print(PF_KFREE_SKB %
						(diff_msec(base_t, event['comm_t']),
						event['location']))
				elif event['handle'] == "consume_skb":
					print(PF_CONS_SKB %
						diff_msec(base_t, event['comm_t']))
			print(PF_JOINT)

# Parse the script arguments (everything after sys.argv[0]) into the
# show_tx, show_rx, dev and debug globals.  Unrecognized arguments are
# ignored.  When neither 'tx' nor 'rx' was given, both charts are enabled.
def trace_begin():
	global show_tx
	global show_rx
	global dev
	global debug

	for option in sys.argv[1:]:
		if option == 'tx':
			show_tx = 1
		elif option == 'rx':
			show_rx = 1
		elif option.startswith('dev='):
			# keep whatever follows the "dev=" prefix as the filter
			dev = option[4:]
		elif option == 'debug':
			debug = 1
	if show_tx == 0 and show_rx == 0:
		show_tx = show_rx = 1

# Process every queued event in time order, then print the charts
# selected by the show_rx / show_tx / debug options.
def trace_end():
	# order all events in time.  A key function works on both Python 2
	# and Python 3 (a positional cmp function is Python 2 only), and the
	# sort is stable, so events with equal timestamps keep their order.
	all_event_list.sort(key=lambda event_info: event_info[EINFO_IDX_TIME])
	# process all events; names without a handler are ignored
	handlers = {
		'irq__softirq_exit': handle_irq_softirq_exit,
		'irq__softirq_entry': handle_irq_softirq_entry,
		'irq__softirq_raise': handle_irq_softirq_raise,
		'irq__irq_handler_entry': handle_irq_handler_entry,
		'irq__irq_handler_exit': handle_irq_handler_exit,
		'napi__napi_poll': handle_napi_poll,
		'net__netif_receive_skb': handle_netif_receive_skb,
		'net__netif_rx': handle_netif_rx,
		'skb__skb_copy_datagram_iovec': handle_skb_copy_datagram_iovec,
		'net__net_dev_queue': handle_net_dev_queue,
		'net__net_dev_xmit': handle_net_dev_xmit,
		'skb__kfree_skb': handle_kfree_skb,
		'skb__consume_skb': handle_consume_skb,
	}
	for event_info in all_event_list:
		handler = handlers.get(event_info[EINFO_IDX_NAME])
		if handler is not None:
			handler(event_info)
	# display receive hunks
	if show_rx:
		for hunk in receive_hunk_list:
			print_receive(hunk)
	# display transmit hunks
	if show_tx:
		print("   dev    len      Qdisc        "
			"       netdevice             free")
		for hunk in tx_free_list:
			print_transmit(hunk)
	if debug:
		print("debug buffer status")
		print("----------------------------")
		print("xmit Qdisc:remain:%d overflow:%d" %
			(len(tx_queue_list), of_count_tx_queue_list))
		print("xmit netdevice:remain:%d overflow:%d" %
			(len(tx_xmit_list), of_count_tx_xmit_list))
		print("receive:remain:%d overflow:%d" %
			(len(rx_skb_list), of_count_rx_skb_list))

# called from perf, when it finds a corresponding event
# Queue a softirq_entry event in all_event_list with its time converted
# by nsecs(sec, nsec).  Events whose vec does not translate to "NET_RX"
# are ignored.
def irq__softirq_entry(name, context, cpu, sec, nsec, pid, comm, vec):
	if symbol_str("irq__softirq_entry", "vec", vec) == "NET_RX":
		all_event_list.append(
			(name, context, cpu, nsecs(sec, nsec), pid, comm, vec))

# Queue a softirq_exit event in all_event_list with its time converted
# by nsecs(sec, nsec).  Events whose vec does not translate to "NET_RX"
# are ignored.
# NOTE(review): the symbol lookup uses the name "irq__softirq_entry"
# rather than this event's own name - confirm this is intentional.
def irq__softirq_exit(name, context, cpu, sec, nsec, pid, comm, vec):
	if symbol_str("irq__softirq_entry", "vec", vec) == "NET_RX":
		all_event_list.append(
			(name, context, cpu, nsecs(sec, nsec), pid, comm, vec))

# Queue a softirq_raise event in all_event_list with its time converted
# by nsecs(sec, nsec).  Events whose vec does not translate to "NET_RX"
# are ignored.
# NOTE(review): the symbol lookup uses the name "irq__softirq_entry"
# rather than this event's own name - confirm this is intentional.
def irq__softirq_raise(name, context, cpu, sec, nsec, pid, comm, vec):
	if symbol_str("irq__softirq_entry", "vec", vec) == "NET_RX":
		all_event_list.append(
			(name, context, cpu, nsecs(sec, nsec), pid, comm, vec))

# Queue an irq_handler_entry event in all_event_list; sec/nsec are
# folded into one timestamp by nsecs().
def irq__irq_handler_entry(name, context, cpu, sec, nsec, pid, comm,
			irq, irq_name):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm, irq, irq_name))

# Queue an irq_handler_exit event in all_event_list; sec/nsec are
# folded into one timestamp by nsecs().
def irq__irq_handler_exit(name, context, cpu, sec, nsec, pid, comm, irq, ret):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm, irq, ret))

# Queue a napi_poll event in all_event_list; sec/nsec are folded into
# one timestamp by nsecs().
def napi__napi_poll(name, context, cpu, sec, nsec, pid, comm, napi, dev_name):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm, napi, dev_name))

# Queue a netif_receive_skb event in all_event_list; sec/nsec are
# folded into one timestamp by nsecs().
def net__netif_receive_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr,
			skblen, dev_name):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm,
		skbaddr, skblen, dev_name))

# Queue a netif_rx event in all_event_list; sec/nsec are folded into
# one timestamp by nsecs().
def net__netif_rx(name, context, cpu, sec, nsec, pid, comm, skbaddr,
			skblen, dev_name):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm,
		skbaddr, skblen, dev_name))

# Queue a net_dev_queue event in all_event_list; sec/nsec are folded
# into one timestamp by nsecs().
def net__net_dev_queue(name, context, cpu, sec, nsec, pid, comm,
			skbaddr, skblen, dev_name):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm,
		skbaddr, skblen, dev_name))

# Queue a net_dev_xmit event in all_event_list; sec/nsec are folded
# into one timestamp by nsecs().
def net__net_dev_xmit(name, context, cpu, sec, nsec, pid, comm,
			skbaddr, skblen, rc, dev_name):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm,
		skbaddr, skblen, rc, dev_name))

# Queue a kfree_skb event in all_event_list; sec/nsec are folded into
# one timestamp by nsecs().
def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm,
			skbaddr, protocol, location):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm,
		skbaddr, protocol, location))

# Queue a consume_skb event in all_event_list; sec/nsec are folded into
# one timestamp by nsecs().
def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm, skbaddr))

# Queue a skb_copy_datagram_iovec event in all_event_list; sec/nsec are
# folded into one timestamp by nsecs().
def skb__skb_copy_datagram_iovec(name, context, cpu, sec, nsec, pid, comm,
	skbaddr, skblen):
	timestamp = nsecs(sec, nsec)
	all_event_list.append(
		(name, context, cpu, timestamp, pid, comm, skbaddr, skblen))

# Push a record for the irq being entered onto the stack kept for its
# cpu in irq_dic, creating the stack on first use.
def handle_irq_handler_entry(event_info):
	(name, context, cpu, time, pid, comm, irq, irq_name) = event_info
	irq_dic.setdefault(cpu, []).append(
		{'irq':irq, 'name':irq_name, 'cpu':cpu, 'irq_ent_t':time})

# Close the irq record on top of the cpu's stack in irq_dic.
#
# The record is popped; it is pushed back, stamped with 'irq_ext_t',
# only when its irq number matches the exiting irq and it collected an
# 'event_list'.  An exit seen on a cpu with no stack, or with an empty
# stack, is ignored.
def handle_irq_handler_exit(event_info):
	(name, context, cpu, time, pid, comm, irq, ret) = event_info
	# the stack can be empty although the cpu key exists (earlier records
	# were dropped); pop() would raise IndexError, so bail out like the
	# softirq_raise and netif_rx handlers do
	if cpu not in irq_dic or len(irq_dic[cpu]) == 0:
		return
	irq_record = irq_dic[cpu].pop()
	if irq != irq_record['irq']:
		return
	irq_record['irq_ext_t'] = time
	# if an irq doesn't include NET_RX softirq, drop.
	if 'event_list' in irq_record:
		irq_dic[cpu].append(irq_record)

# Record a 'sirq_raise' event on the irq record on top of the cpu's
# stack in irq_dic, creating the record's 'event_list' if needed.
# Ignored when the cpu has no stack or the stack is empty.
def handle_irq_softirq_raise(event_info):
	(name, context, cpu, time, pid, comm, vec) = event_info
	irq_stack = irq_dic.get(cpu)
	if not irq_stack:
		return
	current_irq = irq_stack[-1]
	current_irq.setdefault('event_list', []).append(
		{'time':time, 'event':'sirq_raise'})

# Start a fresh collection for the cpu in net_rx_dic: the softirq entry
# time plus an empty event list.  Any previous entry for the cpu is
# replaced.
def handle_irq_softirq_entry(event_info):
	(name, context, cpu, time, pid, comm, vec) = event_info
	softirq_state = {'sirq_ent_t':time, 'event_list':[]}
	net_rx_dic[cpu] = softirq_state

# Finish the NET_RX softirq running on the cpu.
#
# The cpu's entries are removed from both irq_dic and net_rx_dic.  A
# hunk merging them is appended to receive_hunk_list only when the cpu
# had a non-empty irq list and a softirq entry had been recorded.
def handle_irq_softirq_exit(event_info):
	(name, context, cpu, time, pid, comm, vec) = event_info
	irq_list = irq_dic.pop(cpu, [])
	softirq_state = net_rx_dic.pop(cpu, None)
	if softirq_state is None or irq_list == []:
		return
	# merge information related to a NET_RX softirq
	receive_hunk_list.append({
		'sirq_ent_t':softirq_state['sirq_ent_t'],
		'sirq_ext_t':time,
		'irq_list':irq_list,
		'event_list':softirq_state['event_list'],
	})

# Append a 'napi_poll' record to the event list of the softirq being
# collected for the cpu.  Ignored when net_rx_dic has no entry for the
# cpu.
def handle_napi_poll(event_info):
	(name, context, cpu, time, pid, comm, napi, dev_name) = event_info
	if cpu not in net_rx_dic:
		return
	net_rx_dic[cpu]['event_list'].append(
		{'event_name':'napi_poll', 'dev':dev_name, 'event_t':time})

# Record a 'netif_rx' event on the irq record on top of the cpu's stack
# in irq_dic, creating the record's 'event_list' if needed.  Ignored
# when the cpu has no stack or the stack is empty.
def handle_netif_rx(event_info):
	(name, context, cpu, time, pid, comm,
		skbaddr, skblen, dev_name) = event_info
	irq_stack = irq_dic.get(cpu)
	if not irq_stack:
		return
	current_irq = irq_stack[-1]
	current_irq.setdefault('event_list', []).append(
		{'time':time, 'event':'netif_rx',
		'skbaddr':skbaddr, 'skblen':skblen, 'dev_name':dev_name})

# Record a received packet for the softirq being collected on the cpu.
#
# The same record is appended to the softirq's event list and put at
# the front of rx_skb_list.  When rx_skb_list grows beyond
# buffer_budget its last entry is dropped and of_count_rx_skb_list is
# incremented.  Ignored when net_rx_dic has no entry for the cpu.
def handle_netif_receive_skb(event_info):
	global of_count_rx_skb_list

	(name, context, cpu, time, pid, comm,
		skbaddr, skblen, dev_name) = event_info
	if cpu not in net_rx_dic:
		return
	packet = {'event_name':'netif_receive_skb',
		  'event_t':time, 'skbaddr':skbaddr, 'len':skblen}
	net_rx_dic[cpu]['event_list'].append(packet)
	rx_skb_list.insert(0, packet)
	if len(rx_skb_list) > buffer_budget:
		rx_skb_list.pop()
		of_count_rx_skb_list += 1

# Put a record for a queued packet at the front of tx_queue_list.  When
# the list grows beyond buffer_budget its last entry is dropped and
# of_count_tx_queue_list is incremented.
def handle_net_dev_queue(event_info):
	global of_count_tx_queue_list

	(name, context, cpu, time, pid, comm,
		skbaddr, skblen, dev_name) = event_info
	tx_queue_list.insert(0, {'dev':dev_name, 'skbaddr':skbaddr,
				 'len':skblen, 'queue_t':time})
	if len(tx_queue_list) <= buffer_budget:
		return
	tx_queue_list.pop()
	of_count_tx_queue_list += 1

# Move the first packet in tx_queue_list whose skbaddr matches to the
# front of tx_xmit_list, stamping it with 'xmit_t'.  Events with a
# non-zero rc are ignored.  When tx_xmit_list grows beyond
# buffer_budget its last entry is dropped and of_count_tx_xmit_list is
# incremented.
def handle_net_dev_xmit(event_info):
	global of_count_tx_xmit_list

	(name, context, cpu, time, pid, comm,
		skbaddr, skblen, rc, dev_name) = event_info
	if rc != 0: # anything but NETDEV_TX_OK
		return
	for position, skb in enumerate(tx_queue_list):
		if skb['skbaddr'] != skbaddr:
			continue
		skb['xmit_t'] = time
		tx_xmit_list.insert(0, skb)
		del tx_queue_list[position]
		if len(tx_xmit_list) > buffer_budget:
			tx_xmit_list.pop()
			of_count_tx_xmit_list += 1
		return

# Account for a freed skb.  The lists are searched in order and only
# the first match is handled:
#  - tx_queue_list: the packet is removed without being reported
#  - tx_xmit_list:  the packet gets 'free_t' and moves to tx_free_list
#  - rx_skb_list:   the record is tagged as handled by kfree_skb and
#                   removed from the list
def handle_kfree_skb(event_info):
	(name, context, cpu, time, pid, comm,
		skbaddr, protocol, location) = event_info
	for i, skb in enumerate(tx_queue_list):
		if skb['skbaddr'] == skbaddr:
			del tx_queue_list[i]
			return
	for i, skb in enumerate(tx_xmit_list):
		if skb['skbaddr'] == skbaddr:
			skb['free_t'] = time
			tx_free_list.append(skb)
			del tx_xmit_list[i]
			return
	for i, rec_data in enumerate(rx_skb_list):
		if rec_data['skbaddr'] == skbaddr:
			# print_receive() treats a record carrying 'comm' as a
			# skb_copy_datagram_iovec and reads 'location' for a
			# kfree_skb, so store the location and leave 'comm' out;
			# otherwise the free is reported as a copy.
			rec_data.update({'handle':"kfree_skb", 'pid':pid,
					'comm_t':time, 'location':location})
			del rx_skb_list[i]
			return

# Account for a consumed skb: the first packet in tx_xmit_list with a
# matching skbaddr gets 'free_t' and moves to tx_free_list.  Nothing
# happens when no packet matches.
def handle_consume_skb(event_info):
	(name, context, cpu, time, pid, comm, skbaddr) = event_info
	for position, skb in enumerate(tx_xmit_list):
		if skb['skbaddr'] != skbaddr:
			continue
		skb['free_t'] = time
		tx_free_list.append(skb)
		del tx_xmit_list[position]
		return

def handle_skb_copy_datagram_iovec(event_info):
	"""Tag a received skb with the task that copied its data.

	event_info is the 8-tuple (name, context, cpu, time, pid, comm,
	skbaddr, skblen).  The first rx_skb_list entry whose 'skbaddr'
	matches is updated with 'handle', 'comm', 'pid' and 'comm_t' and
	is then removed from rx_skb_list.  Nothing happens when no entry
	matches.
	"""
	(_name, _context, _cpu, copy_time, pid, comm,
		addr, _length) = event_info
	for idx, received in enumerate(rx_skb_list):
		if not addr == received['skbaddr']:
			continue
		received['handle'] = "skb_copy_datagram_iovec"
		received['comm'] = comm
		received['pid'] = pid
		received['comm_t'] = copy_time
		del rx_skb_list[idx]
		return
r CONFIG_KEXEC_FILE, instead of doing select CRYPTO=y, I changed it to "depends on CRYPTO=y". This should be safer as "select" is not recursive. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Eric Biederman <ebiederm@xmission.com> Cc: H. Peter Anvin <hpa@zytor.com> Tested-by: Shaun Ruffell <sruffell@digium.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29x86,mm: fix pte_special versus pte_numaHugh Dickins2-6/+10 Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008 at mm/memory.c:1132, running Trinity on different 3.16-rc-next kernels: where zap_pte_range() checks page->mapping to see if PageAnon(page). Those addresses fit struct pages for pfns d2001 and d2000, and in each dump a register or a stack slot showed d2001730 or d2000730: pte flags 0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has a hole between cfffffff and 100000000, which would need special access. Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL pte no longer passes the pte_special() test, so zap_pte_range() goes on to try to access a non-existent struct page. Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE) to complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE). A hint that this was a problem was that c46a7c817e66 added pte_numa() test to vm_normal_page(), and moved its is_zero_pfn() test from slow to fast path: This was papering over a pte_special() snag when the zero page was encountered during zap. This patch reverts vm_normal_page() to how it was before, relying on pte_special(). It still appears that this patch may be incomplete: aren't there other places which need to be handling PROTNONE along with PRESENT? For example, pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA, but on a PROT_NONE area, that would make it pte_special(). 
This is side-stepped by the fact that NUMA hinting faults skipped PROT_NONE VMAs and there are no grounds where a NUMA hinting fault on a PROT_NONE VMA would be interesting. Fixes: c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels") Reported-by: Sasha Levin <sasha.levin@oracle.com> Tested-by: Sasha Levin <sasha.levin@oracle.com> Signed-off-by: Hugh Dickins <hughd@google.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Cc: Matthew Wilcox <matthew.r.wilcox@intel.com> Cc: <stable@vger.kernel.org> [3.16] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29hugetlb_cgroup: use lockdep_assert_held rather than spin_is_lockedMichal Hocko1-1/+1 spin_lock may be an empty struct for !SMP configurations and so arch_spin_is_locked may return unconditional 0 and trigger the VM_BUG_ON even when the lock is held. Replace spin_is_locked by lockdep_assert_held. We will not BUG anymore but it is questionable whether crashing makes a lot of sense in the uncharge path. Uncharge happens after the last page reference was released so nobody should touch the page and the function doesn't update any shared state except for res counter which uses synchronization of its own. Signed-off-by: Michal Hocko <mhocko@suse.cz> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29mm/zpool: use prefixed module loadingKees Cook3-1/+3 To avoid potential format string expansion via module parameters, do not use the zpool type directly in request_module() without a format string. 
Additionally, to avoid arbitrary modules being loaded via zpool API (e.g. via the zswap_zpool_type module parameter) add a "zpool-" prefix to the requested module, as well as module aliases for the existing zpool types (zbud and zsmalloc). Signed-off-by: Kees Cook <keescook@chromium.org> Cc: Seth Jennings <sjennings@variantweb.net> Cc: Minchan Kim <minchan@kernel.org> Cc: Nitin Gupta <ngupta@vflare.org> Acked-by: Dan Streetman <ddstreet@ieee.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29zram: fix incorrect stat with failed_readsChao Yu2-4/+8 Since we allocate a temporary buffer in zram_bvec_read to handle partial page operations in commit 924bd88d703e ("Staging: zram: allow partial page operations"), our ->failed_reads value may be incorrect as we do not increase its value when failing to allocate the temporary buffer. Let's fix this issue and correct the annotation of failed_reads. Signed-off-by: Chao Yu <chao2.yu@samsung.com> Acked-by: Minchan Kim <minchan@kernel.org> Cc: Nitin Gupta <ngupta@vflare.org> Acked-by: Jerome Marchand <jmarchan@redhat.com> Acked-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29lib: turn CONFIG_STACKTRACE into an actual option.Dave Jones1-1/+6 I was puzzled why /proc/$$/stack had disappeared, until I figured out I had disabled the last debug option that did a 'select STACKTRACE'. This patch makes the option show up at config time, so it can be enabled without enabling any of the more heavyweight debug options. 
Signed-off-by: Dave Jones <davej@redhat.com> Acked-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29mm: actually clear pmd_numa before invalidatingMatthew Wilcox1-1/+1 Commit 67f87463d3a3 ("mm: clear pmd_numa before invalidating") cleared the NUMA bit in a copy of the PMD entry, but then wrote back the original Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com> Acked-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29memblock, memhotplug: fix wrong type in memblock_find_in_range_node().Tang Chen1-2/+1 In memblock_find_in_range_node(), we defined ret as int. But it should be phys_addr_t because it is used to store the return value from __memblock_find_range_bottom_up(). The bug has not been triggered because when allocating low memory near the kernel end, the "int ret" won't turn out to be negative. When we started to allocate memory on other nodes, and the "int ret" could be minus. Then the kernel will panic. A simple way to reproduce this: comment out the following code in numa_init(), memblock_set_bottom_up(false); and the kernel won't boot. Reported-by: Xishi Qiu <qiuxishi@huawei.com> Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com> Tested-by: Xishi Qiu <qiuxishi@huawei.com> Cc: <stable@vger.kernel.org> [3.13+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29resource: fix the case of null pointer accessVivek Goyal1-7/+4 Richard and Daniel reported that UML is broken due to changes to resource traversal functions. Problem is that iomem_resource.child can be null and new code does not consider that possibility. Old code used a for loop and that loop will not even execute if p was null. 
Revert back to for() loop logic and bail out if p is null. I also moved sibling_only check out of resource_lock. There is no reason to keep it inside the lock. Following is backtrace of the UML crash. RIP: 0033:[<0000000060039b9f>] RSP: 0000000081459da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000219b3fff RCX: 000000006010d1d9 RDX: 0000000000000001 RSI: 00000000602dfb94 RDI: 0000000081459df8 RBP: 0000000081459de0 R08: 00000000601b59f4 R09: ffffffff0000ff00 R10: ffffffff0000ff00 R11: 0000000081459e88 R12: 0000000081459df8 R13: 00000000219b3fff R14: 00000000602dfb94 R15: 0000000000000000 Kernel panic - not syncing: Segfault with no mm CPU: 0 PID: 1 Comm: swapper Not tainted 3.16.0-10454-g58d08e3 #13 Stack: 00000000 000080d0 81459df0 219b3fff 81459e70 6010d1d9 ffffffff 6033e010 81459e50 6003a269 81459e30 00000000 Call Trace: [<6010d1d9>] ? kclist_add_private+0x0/0xe7 [<6003a269>] walk_system_ram_range+0x61/0xb7 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6010d574>] kcore_update_ram+0x4c/0x168 [<6010d72e>] ? kclist_add+0x0/0x2e [<6000e943>] proc_kcore_init+0xea/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<600189f0>] do_one_initcall+0x13c/0x204 [<6004ca46>] ? parse_args+0x1df/0x2e0 [<6004c82d>] ? parameq+0x0/0x3a [<601b5990>] ? strcpy+0x0/0x18 [<60001e1a>] kernel_init_freeable+0x240/0x31e [<6026f1c0>] kernel_init+0x12/0x148 [<60019fad>] new_thread_handler+0x81/0xa3 Fixes 8c86e70acead629aacb4a ("resource: provide new functions to walk through resources"). 
Reported-by: Daniel Walter <sahne@0x90.at> Tested-by: Richard Weinberger <richard@nod.at> Tested-by: Toralf Förster <toralf.foerster@gmx.de> Tested-by: Daniel Walter <sahne@0x90.at> Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29checkpatch: relax check for length of git commit IDsJoe Perches1-2/+2 Checkpatch currently warns if a git commit ID (in the changelog, usually) is less than 12 characters or more than 16. The "more than 16" is excessive. Change the check so we accept IDs from 12 to 40 chars in length. Cc: Geert Uytterhoeven <geert@linux-m68k.org Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29alpha: io: implement relaxed accessor macros for writesWill Deacon1-4/+8 write{b,w,l,q}_relaxed are implemented by some architectures in order to permit memory-mapped I/O writes with weaker barrier semantics than the non-relaxed variants. This patch implements these write macros for Alpha, in the same vein as the relaxed read macros, which are already implemented. Acked-by: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Signed-off-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Matt Turner <mattst88@gmail.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29alpha: Wire up sched_setattr, sched_getattr, and renameat2 syscalls.Michael Cree3-1/+7 Signed-off-by: Michael Cree <mcree@orcon.net.nz> Signed-off-by: Matt Turner <mattst88@gmail.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-29x86, irq, PCI: Keep IRQ assignment for runtime power managementJiang Liu5-2/+20 Now IOAPIC driver dynamically allocates IRQ numbers for IOAPIC pins. We need to keep IRQ assignment for PCI devices during runtime power management, otherwise it may cause failure of device wakeups. 
Commit 3eec595235c17a7 "x86, irq, PCI: Keep IRQ assignment for PCI devices during suspend/hibernation" has fixed the issue for suspend/ hibernation, we also need the same fix for runtime device sleep too. Fix: https://bugzilla.kernel.org/show_bug.cgi?id=83271 Reported-and-Tested-by: EmanueL Czirai <amanual@openmailbox.org> Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Tony Luck <tony.luck@intel.com> Cc: Joerg Roedel <joro@8bytes.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: EmanueL Czirai <amanual@openmailbox.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Rafael J. Wysocki <rjw@rjwysocki.net> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Grant Likely <grant.likely@linaro.org> Link: http://lkml.kernel.org/r/1409304383-18806-1-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> 2014-08-29spi/rockchip: Avoid accidentally turning off the clockDoug Anderson1-1/+1 If our client is requesting a clock that is above the maximum clock then the following division will result in 0: rs->max_freq / rs->speed We'll then program 0 into the SPI_BAUDR register. The Rockchip TRM says: "If the value is 0, the serial output clock (sclk_out) is disabled." It's much better to end up with the fastest possible clock rather than a clock that is off, so enforce a minimum value. Signed-off-by: Doug Anderson <dianders@chromium.org> Signed-off-by: Mark Brown <broonie@kernel.org> 2014-08-28ext4: fix same-dir rename when inline data directory overflowsDarrick J. Wong1-3/+18 When performing a same-directory rename, it's possible that adding or setting the new directory entry will cause the directory to overflow the inline data area, which causes the directory to be converted to an extent-based directory. 
Under this circumstance it is necessary to re-read the directory when deleting the old dirent because the "old directory" context still points to i_block in the inode table, which is now an extent tree root! The delete fails with an FS error, and the subsequent fsck complains about incorrect link counts and hardlinked directories. Test case (originally found with flat_dir_test in the metadata_csum test program): # mkfs.ext4 -O inline_data /dev/sda # mount /dev/sda /mnt # mkdir /mnt/x # touch /mnt/x/changelog.gz /mnt/x/copyright /mnt/x/README.Debian # sync # for i in /mnt/x/*; do mv $i $i.longer; done # ls -la /mnt/x/ total 0 -rw-r--r-- 1 root root 0 Aug 25 12:03 changelog.gz.longer -rw-r--r-- 1 root root 0 Aug 25 12:03 copyright -rw-r--r-- 1 root root 0 Aug 25 12:03 copyright.longer -rw-r--r-- 1 root root 0 Aug 25 12:03 README.Debian.longer (Hey! Why are there four files now??) Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@vger.kernel.org 2014-08-28jbd2: fix descriptor block size handling errors with journal_csumDarrick J. Wong6-49/+95 It turns out that there are some serious problems with the on-disk format of journal checksum v2. The foremost is that the function to calculate descriptor tag size returns sizes that are too big. This causes alignment issues on some architectures and is compounded by the fact that some parts of jbd2 use the structure size (incorrectly) to determine the presence of a 64bit journal instead of checking the feature flags. Therefore, introduce journal checksum v3, which enlarges the descriptor block tag format to allow for full 32-bit checksums of journal blocks, fix the journal tag function to return the correct sizes, and fix the jbd2 recovery code to use feature flags to determine 64bitness. Add a few function helpers so we don't have to open-code quite so many pieces. 
Switching to a 16-byte block size was found to increase journal size overhead by a maximum of 0.1%, to convert a 32-bit journal with no checksumming to a 32-bit journal with checksum v3 enabled. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reported-by: TR Reardon <thomas_reardon@hotmail.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@vger.kernel.org 2014-08-28jbd2: fix infinite loop when recovering corrupt journal blocksDarrick J. Wong1-2/+5 When recovering the journal, don't fall into an infinite loop if we encounter a corrupt journal block. Instead, just skip the block and return an error, which fails the mount and thus forces the user to run a full filesystem fsck. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@vger.kernel.org 2014-08-28ext4: update i_disksize coherently with block allocation on error pathDmitry Monakhov1-2/+8 In case of delalloc block i_disksize may be less than i_size. So we have to update i_disksize each time we allocated and submitted some blocks beyond i_disksize. We weren't doing this on the error paths, so fix this. testcase: xfstest generic/019 Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@vger.kernel.org 2014-08-28dm crypt: fix access beyond the end of allocated spaceMikulas Patocka1-6/+19 The DM crypt target accesses memory beyond allocated space resulting in a crash on 32 bit x86 systems. This bug is very old (it dates back to 2.6.25 commit 3a7f6c990ad04 "dm crypt: use async crypto"). However, this bug was masked by the fact that kmalloc rounds the size up to the next power of two. This bug wasn't exposed until 3.17-rc1 commit 298a9fa08a ("dm crypt: use per-bio data"). By switching to using per-bio data there was no longer any padding beyond the end of a dm-crypt allocated memory block. 
To minimize allocation overhead dm-crypt puts several structures into one block allocated with kmalloc. The block holds struct ablkcipher_request, cipher-specific scratch pad (crypto_ablkcipher_reqsize(any_tfm(cc))), struct dm_crypt_request and an initialization vector. The variable dmreq_start is set to offset of struct dm_crypt_request within this memory block. dm-crypt allocates the block with this size: cc->dmreq_start + sizeof(struct dm_crypt_request) + cc->iv_size. When accessing the initialization vector, dm-crypt uses the function iv_of_dmreq, which performs this calculation: ALIGN((unsigned long)(dmreq + 1), crypto_ablkcipher_alignmask(any_tfm(cc)) + 1). dm-crypt allocated "cc->iv_size" bytes beyond the end of dm_crypt_request structure. However, when dm-crypt accesses the initialization vector, it takes a pointer to the end of dm_crypt_request, aligns it, and then uses it as the initialization vector. If the end of dm_crypt_request is not aligned on a crypto_ablkcipher_alignmask(any_tfm(cc)) boundary the alignment causes the initialization vector to point beyond the allocated space. Fix this bug by calculating the variable iv_size_padding and adding it to the allocated size. Also correct the alignment of dm_crypt_request. struct dm_crypt_request is specific to dm-crypt (it isn't used by the crypto subsystem at all), so it is aligned on __alignof__(struct dm_crypt_request). Also align per_bio_data_size on ARCH_KMALLOC_MINALIGN, so that it is aligned as if the block was allocated with kmalloc. 
Reported-by: Krzysztof Kolasa <kkolasa@winsoft.pl> Tested-by: Milan Broz <gmazyland@gmail.com> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> 2014-08-28mfd: twl4030-power: Fix PM idle pin configuration to not conflict with ↵Tony Lindgren2-11/+11 regulators Commit 43fef47f94a1 (mfd: twl4030-power: Add a configuration to turn off oscillator during off-idle) added support for configuring the PMIC to cut off resources during deeper idle states to save power. This however caused regression for n900 display power that needed the PMIC configuration to be disabled with commit d937678ab625 (ARM: dts: Revert enabling of twl configuration for n900). Turns out the root cause of the problem is that we must use TWL4030_RESCONFIG_UNDEF instead of DEV_GRP_NULL to avoid disabling regulators that may have been enabled before the init function for twl4030-power.c runs. With TWL4030_RESCONFIG_UNDEF we let the regulator framework control the regulators like it should. Here we need to only configure the sys_clken and sys_off_mode triggers for the regulators that cannot be done by the regulator framework as it's not running at that point. This allows us to enable the PMIC configuration for n900. Fixes: 43fef47f94a1 (mfd: twl4030-power: Add a configuration to turn off oscillator during off-idle) Cc: stable@vger.kernel.org # v3.16 Signed-off-by: Tony Lindgren <tony@atomide.com> Tested-by: Aaro Koskinen <aaro.koskinen@iki.fi> Signed-off-by: Lee Jones <lee.jones@linaro.org>