	Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - a few misc bits
 - ocfs2 updates
 - almost all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (131 commits)
  memory hotplug: fix comments when adding section
  mm: make alloc_node_mem_map a void call if we don't have CONFIG_FLAT_NODE_MEM_MAP
  mm: simplify nodemask printing
  mm,oom_reaper: remove pointless kthread_run() error check
  mm/page_ext.c: check if page_ext is not prepared
  writeback: remove unused function parameter
  mm: do not rely on preempt_count in print_vma_addr
  mm, sparse: do not swamp log with huge vmemmap allocation failures
  mm/hmm: remove redundant variable align_end
  mm/list_lru.c: mark expected switch fall-through
  mm/shmem.c: mark expected switch fall-through
  mm/page_alloc.c: broken deferred calculation
  mm: don't warn about allocations which stall for too long
  fs: fuse: account fuse_inode slab memory as reclaimable
  mm, page_alloc: fix potential false positive in __zone_watermark_ok
  mm: mlock: remove lru_add_drain_all()
  mm, sysctl: make NUMA stats configurable
  shmem: convert shmem_init_inodecache() to void
  Unify migrate_pages and move_pages access checks
  mm, pagevec: rename pagevec drained field
  ...
commit 7c225c69f8
250 changed files with 2278 additions and 4086 deletions
		|  | @ -1864,13 +1864,6 @@ | ||||||
| 			Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, | 			Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, | ||||||
| 			the default is off. | 			the default is off. | ||||||
| 
 | 
 | ||||||
| 	kmemcheck=	[X86] Boot-time kmemcheck enable/disable/one-shot mode |  | ||||||
| 			Valid arguments: 0, 1, 2 |  | ||||||
| 			kmemcheck=0 (disabled) |  | ||||||
| 			kmemcheck=1 (enabled) |  | ||||||
| 			kmemcheck=2 (one-shot mode) |  | ||||||
| 			Default: 2 (one-shot mode) |  | ||||||
| 
 |  | ||||||
| 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. | 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. | ||||||
| 			Default is 0 (don't ignore, but inject #GP) | 			Default is 0 (don't ignore, but inject #GP) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -21,7 +21,6 @@ whole; patches welcome! | ||||||
|    kasan |    kasan | ||||||
|    ubsan |    ubsan | ||||||
|    kmemleak |    kmemleak | ||||||
|    kmemcheck |  | ||||||
|    gdb-kernel-debugging |    gdb-kernel-debugging | ||||||
|    kgdb |    kgdb | ||||||
|    kselftest |    kselftest | ||||||
|  |  | ||||||
|  | @ -1,733 +0,0 @@ | ||||||
| Getting started with kmemcheck |  | ||||||
| ============================== |  | ||||||
| 
 |  | ||||||
| Vegard Nossum <vegardno@ifi.uio.no> |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Introduction |  | ||||||
| ------------ |  | ||||||
| 
 |  | ||||||
| kmemcheck is a debugging feature for the Linux Kernel. More specifically, it |  | ||||||
| is a dynamic checker that detects and warns about some uses of uninitialized |  | ||||||
| memory. |  | ||||||
| 
 |  | ||||||
| Userspace programmers might be familiar with Valgrind's memcheck. The main |  | ||||||
| difference between memcheck and kmemcheck is that memcheck works for userspace |  | ||||||
| programs only, and kmemcheck works for the kernel only. The implementations |  | ||||||
| are of course vastly different. Because of this, kmemcheck is not as accurate |  | ||||||
| as memcheck, but it turns out to be good enough in practice to discover real |  | ||||||
| programmer errors that the compiler is not able to find through static |  | ||||||
| analysis. |  | ||||||
| 
 |  | ||||||
| Enabling kmemcheck on a kernel will probably slow it down to the extent that |  | ||||||
| the machine will not be usable for normal workloads such as e.g. an |  | ||||||
| interactive desktop. kmemcheck will also cause the kernel to use about twice |  | ||||||
| as much memory as normal. For this reason, kmemcheck is strictly a debugging |  | ||||||
| feature. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Downloading |  | ||||||
| ----------- |  | ||||||
| 
 |  | ||||||
| As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Configuring and compiling |  | ||||||
| ------------------------- |  | ||||||
| 
 |  | ||||||
| kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of |  | ||||||
| configuration variables must have specific settings in order for the kmemcheck |  | ||||||
| menu to even appear in "menuconfig". These are: |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_CC_OPTIMIZE_FOR_SIZE=n`` |  | ||||||
| 	This option is located under "General setup" / "Optimize for size". |  | ||||||
| 
 |  | ||||||
| 	Without this, gcc will use certain optimizations that usually lead to |  | ||||||
| 	false positive warnings from kmemcheck. An example of this is a 16-bit |  | ||||||
| 	field in a struct, where gcc may load 32 bits, then discard the upper |  | ||||||
| 	16 bits. kmemcheck sees only the 32-bit load, and may trigger a |  | ||||||
| 	warning for the upper 16 bits (if they're uninitialized). A sketch of |  | ||||||
| 	this situation follows this list. |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_SLAB=y`` or ``CONFIG_SLUB=y`` |  | ||||||
| 	This option is located under "General setup" / "Choose SLAB |  | ||||||
| 	allocator". |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_FUNCTION_TRACER=n`` |  | ||||||
| 	This option is located under "Kernel hacking" / "Tracers" / "Kernel |  | ||||||
| 	Function Tracer" |  | ||||||
| 
 |  | ||||||
| 	When function tracing is compiled in, gcc emits a call to another |  | ||||||
| 	function at the beginning of every function. This means that when the |  | ||||||
| 	page fault handler is called, the ftrace framework will be called |  | ||||||
| 	before kmemcheck has had a chance to handle the fault. If ftrace then |  | ||||||
| 	modifies memory that was tracked by kmemcheck, the result is an |  | ||||||
| 	endless recursive page fault. |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_DEBUG_PAGEALLOC=n`` |  | ||||||
| 	This option is located under "Kernel hacking" / "Memory Debugging" |  | ||||||
| 	/ "Debug page memory allocations". |  | ||||||
| 
 |  | ||||||
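
The 16-bit-field case described under ``CONFIG_CC_OPTIMIZE_FOR_SIZE`` above can
be sketched as follows (the struct and function are made up for this document,
not taken from the kernel)::

    #include <linux/types.h>

    /* Hypothetical example only. */
    struct sample {
            u16 flags;          /* written by the code under test */
            u16 spare;          /* never written */
    };

    static u16 read_flags(const struct sample *s)
    {
            /*
             * gcc may implement this as a 32-bit load covering both
             * 'flags' and 'spare' and then discard the upper 16 bits.
             * kmemcheck only sees the 32-bit access, so the uninitialized
             * 'spare' bytes can trigger a false positive warning.
             */
            return s->flags;
    }
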
| In addition, I highly recommend turning on ``CONFIG_DEBUG_INFO=y``. This is also |  | ||||||
| located under "Kernel hacking". With this, you will be able to get line number |  | ||||||
| information from the kmemcheck warnings, which is extremely valuable in |  | ||||||
| debugging a problem. This option is not mandatory, however, because it slows |  | ||||||
| down the compilation process and produces a much bigger kernel image. |  | ||||||
| 
 |  | ||||||
| Now the kmemcheck menu should be visible (under "Kernel hacking" / "Memory |  | ||||||
| Debugging" / "kmemcheck: trap use of uninitialized memory"). Here follows |  | ||||||
| a description of the kmemcheck configuration variables: |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_KMEMCHECK`` |  | ||||||
| 	This must be enabled in order to use kmemcheck at all... |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_KMEMCHECK_``[``DISABLED`` | ``ENABLED`` | ``ONESHOT``]``_BY_DEFAULT`` |  | ||||||
| 	This option controls the status of kmemcheck at boot-time. "Enabled" |  | ||||||
| 	will enable kmemcheck right from the start, "disabled" will boot the |  | ||||||
| 	kernel as normal (but with the kmemcheck code compiled in, so it can |  | ||||||
| 	be enabled at run-time after the kernel has booted), and "one-shot" is |  | ||||||
| 	a special mode which will turn kmemcheck off automatically after |  | ||||||
| 	detecting the first use of uninitialized memory. |  | ||||||
| 
 |  | ||||||
| 	If you are using kmemcheck to actively debug a problem, then you |  | ||||||
| 	probably want to choose "enabled" here. |  | ||||||
| 
 |  | ||||||
| 	The one-shot mode is mostly useful in automated test setups because it |  | ||||||
| 	can prevent floods of warnings and increase the chances of the machine |  | ||||||
| 	surviving in case something is really wrong. In other cases, the one- |  | ||||||
| 	shot mode could actually be counter-productive because it would turn |  | ||||||
| 	itself off at the very first error -- in the case of a false positive |  | ||||||
| 	too -- and this would get in the way of debugging the specific |  | ||||||
| 	problem you were interested in. |  | ||||||
| 
 |  | ||||||
| 	If you would like to use your kernel as normal, but with a chance to |  | ||||||
| 	enable kmemcheck in case of some problem, it might be a good idea to |  | ||||||
| 	choose "disabled" here. When kmemcheck is disabled, most of the run- |  | ||||||
| 	time overhead is not incurred, and the kernel will be almost as fast |  | ||||||
| 	as normal. |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_KMEMCHECK_QUEUE_SIZE`` |  | ||||||
| 	Select the maximum number of error reports to store in an internal |  | ||||||
| 	(fixed-size) buffer. Since errors can occur virtually anywhere and in |  | ||||||
| 	any context, we need a temporary storage area which is guaranteed not |  | ||||||
| 	to generate any other page faults when accessed. The queue will be |  | ||||||
| 	emptied as soon as a tasklet may be scheduled. If the queue is full, |  | ||||||
| 	new error reports will be lost. |  | ||||||
| 
 |  | ||||||
| 	The default value of 64 is probably fine. If some code produces more |  | ||||||
| 	than 64 errors within an irqs-off section, then the code is likely to |  | ||||||
| 	produce many, many more, too, and these additional reports seldom give |  | ||||||
| 	any more information (the first report is usually the most valuable |  | ||||||
| 	anyway). |  | ||||||
| 
 |  | ||||||
| 	This number might have to be adjusted if you are not using serial |  | ||||||
| 	console or similar to capture the kernel log. If you are using the |  | ||||||
| 	"dmesg" command to save the log, then getting a lot of kmemcheck |  | ||||||
| 	warnings might overflow the kernel log itself, and the earlier reports |  | ||||||
| 	will get lost in that way instead. Try setting this to 10 or so on |  | ||||||
| 	such a setup. |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT`` |  | ||||||
| 	Select the number of shadow bytes to save along with each entry of the |  | ||||||
| 	error-report queue. These bytes indicate what parts of an allocation |  | ||||||
| 	are initialized, uninitialized, etc. and will be displayed when an |  | ||||||
| 	error is detected to help the debugging of a particular problem. |  | ||||||
| 
 |  | ||||||
| 	The number entered here is actually the logarithm of the number of |  | ||||||
| 	bytes that will be saved. So if you pick for example 5 here, kmemcheck |  | ||||||
| 	will save 2^5 = 32 bytes. |  | ||||||
| 
 |  | ||||||
| 	The default value should be fine for debugging most problems. It also |  | ||||||
| 	fits nicely within 80 columns. |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_KMEMCHECK_PARTIAL_OK`` |  | ||||||
| 	This option (when enabled) works around certain GCC optimizations that |  | ||||||
| 	produce 32-bit reads from 16-bit variables where the upper 16 bits are |  | ||||||
| 	thrown away afterwards. |  | ||||||
| 
 |  | ||||||
| 	The default value (enabled) is recommended. This may of course hide |  | ||||||
| 	some real errors, but disabling it would probably produce a lot of |  | ||||||
| 	false positives. |  | ||||||
| 
 |  | ||||||
| - ``CONFIG_KMEMCHECK_BITOPS_OK`` |  | ||||||
| 	This option silences warnings that would be generated for bit-field |  | ||||||
| 	accesses where not all the bits are initialized at the same time. This |  | ||||||
| 	may also hide some real bugs. |  | ||||||
| 
 |  | ||||||
| 	This option is probably obsolete, or it should be replaced with |  | ||||||
| 	the kmemcheck-/bitfield-annotations for the code in question. The |  | ||||||
| 	default value is therefore fine. |  | ||||||
| 
 |  | ||||||
| Now compile the kernel as usual. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| How to use |  | ||||||
| ---------- |  | ||||||
| 
 |  | ||||||
| Booting |  | ||||||
| ~~~~~~~ |  | ||||||
| 
 |  | ||||||
| First some information about the command-line options. There is only one |  | ||||||
| option specific to kmemcheck, and this is called "kmemcheck". It can be used |  | ||||||
| to override the default mode as chosen by the ``CONFIG_KMEMCHECK_*_BY_DEFAULT`` |  | ||||||
| option. Its possible settings are: |  | ||||||
| 
 |  | ||||||
| - ``kmemcheck=0`` (disabled) |  | ||||||
| - ``kmemcheck=1`` (enabled) |  | ||||||
| - ``kmemcheck=2`` (one-shot mode) |  | ||||||
| 
 |  | ||||||
| If SLUB debugging has been enabled in the kernel, it may take precedence over |  | ||||||
| kmemcheck in such a way that the slab caches which are under SLUB debugging |  | ||||||
| will not be tracked by kmemcheck. In order to ensure that this doesn't happen |  | ||||||
| (even though it shouldn't by default), use SLUB's boot option ``slub_debug``, |  | ||||||
| like this: ``slub_debug=-`` |  | ||||||
| 
 |  | ||||||
| In fact, this option may also be used for fine-grained control over SLUB vs. |  | ||||||
| kmemcheck. For example, if the command line includes |  | ||||||
| ``kmemcheck=1 slub_debug=,dentry``, then SLUB debugging will be used only |  | ||||||
| for the "dentry" slab cache, and with kmemcheck tracking all the other |  | ||||||
| caches. This is advanced usage, however, and is not generally recommended. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Run-time enable/disable |  | ||||||
| ~~~~~~~~~~~~~~~~~~~~~~~ |  | ||||||
| 
 |  | ||||||
| When the kernel has booted, it is possible to enable or disable kmemcheck at |  | ||||||
| run-time. WARNING: This feature is still experimental and may cause false |  | ||||||
| positive warnings to appear. Therefore, try not to use this. If you find that |  | ||||||
| it doesn't work properly (e.g. you see an unreasonable amount of warnings), I |  | ||||||
| will be happy to take bug reports. |  | ||||||
| 
 |  | ||||||
| Use the file ``/proc/sys/kernel/kmemcheck`` for this purpose, e.g.:: |  | ||||||
| 
 |  | ||||||
| 	$ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck |  | ||||||
| 
 |  | ||||||
| The numbers are the same as for the ``kmemcheck=`` command-line option. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Debugging |  | ||||||
| ~~~~~~~~~ |  | ||||||
| 
 |  | ||||||
| A typical report will look something like this:: |  | ||||||
| 
 |  | ||||||
|     WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) |  | ||||||
|     80000000000000000000000000000000000000000088ffff0000000000000000 |  | ||||||
|      i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u |  | ||||||
|              ^ |  | ||||||
| 
 |  | ||||||
|     Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A |  | ||||||
|     RIP: 0010:[<ffffffff8104ede8>]  [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190 |  | ||||||
|     RSP: 0018:ffff88003cdf7d98  EFLAGS: 00210002 |  | ||||||
|     RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 |  | ||||||
|     RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84 |  | ||||||
|     RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000 |  | ||||||
|     R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e |  | ||||||
|     R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8 |  | ||||||
|     FS:  0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000 |  | ||||||
|     CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033 |  | ||||||
|     CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0 |  | ||||||
|     DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 |  | ||||||
|     DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400 |  | ||||||
|      [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170 |  | ||||||
|      [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390 |  | ||||||
|      [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0 |  | ||||||
|      [<ffffffff8100c7b5>] int_signal+0x12/0x17 |  | ||||||
|      [<ffffffffffffffff>] 0xffffffffffffffff |  | ||||||
| 
 |  | ||||||
| The single most valuable piece of information in this report is the RIP (or |  | ||||||
| EIP on 32-bit) value. This will help us pinpoint exactly which instruction |  | ||||||
| caused the warning. |  | ||||||
| 
 |  | ||||||
| If your kernel was compiled with ``CONFIG_DEBUG_INFO=y``, then all we have to do |  | ||||||
| is give this address to the addr2line program, like this:: |  | ||||||
| 
 |  | ||||||
| 	$ addr2line -e vmlinux -i ffffffff8104ede8 |  | ||||||
| 	arch/x86/include/asm/string_64.h:12 |  | ||||||
| 	include/asm-generic/siginfo.h:287 |  | ||||||
| 	kernel/signal.c:380 |  | ||||||
| 	kernel/signal.c:410 |  | ||||||
| 
 |  | ||||||
| The "``-e vmlinux``" tells addr2line which file to look in. **IMPORTANT:** |  | ||||||
| This must be the vmlinux of the kernel that produced the warning in the |  | ||||||
| first place! If not, the line number information will almost certainly be |  | ||||||
| wrong. |  | ||||||
| 
 |  | ||||||
| The "``-i``" tells addr2line to also print the line numbers of inlined |  | ||||||
| functions.  In this case, the flag was very important, because otherwise, |  | ||||||
| it would only have printed the first line, which is just a call to |  | ||||||
| ``memcpy()``, which could be called from a thousand places in the kernel, and |  | ||||||
| is therefore not very useful.  These inlined functions would not show up in |  | ||||||
| the stack trace above, simply because the kernel doesn't load the extra |  | ||||||
| debugging information. This technique can of course be used with ordinary |  | ||||||
| kernel oopses as well. |  | ||||||
| 
 |  | ||||||
| In this case, it's the caller of ``memcpy()`` that is interesting, and it can be |  | ||||||
| found in ``include/asm-generic/siginfo.h``, line 287:: |  | ||||||
| 
 |  | ||||||
|     281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) |  | ||||||
|     282 { |  | ||||||
|     283         if (from->si_code < 0) |  | ||||||
|     284                 memcpy(to, from, sizeof(*to)); |  | ||||||
|     285         else |  | ||||||
|     286                 /* _sigchld is currently the largest know union member */ |  | ||||||
|     287                 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); |  | ||||||
|     288 } |  | ||||||
| 
 |  | ||||||
| Since this was a read (kmemcheck usually warns about reads only, though it can |  | ||||||
| warn about writes to unallocated or freed memory as well), it was probably the |  | ||||||
| "from" argument which contained some uninitialized bytes. Following the chain |  | ||||||
| of calls, we move upwards to see where "from" was allocated or initialized, |  | ||||||
| ``kernel/signal.c``, line 380:: |  | ||||||
| 
 |  | ||||||
|     359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) |  | ||||||
|     360 { |  | ||||||
|     ... |  | ||||||
|     367         list_for_each_entry(q, &list->list, list) { |  | ||||||
|     368                 if (q->info.si_signo == sig) { |  | ||||||
|     369                         if (first) |  | ||||||
|     370                                 goto still_pending; |  | ||||||
|     371                         first = q; |  | ||||||
|     ... |  | ||||||
|     377         if (first) { |  | ||||||
|     378 still_pending: |  | ||||||
|     379                 list_del_init(&first->list); |  | ||||||
|     380                 copy_siginfo(info, &first->info); |  | ||||||
|     381                 __sigqueue_free(first); |  | ||||||
|     ... |  | ||||||
|     392         } |  | ||||||
|     393 } |  | ||||||
| 
 |  | ||||||
| Here, it is ``&first->info`` that is being passed on to ``copy_siginfo()``. The |  | ||||||
| variable ``first`` was found on a list -- passed in as the second argument to |  | ||||||
| ``collect_signal()``. We continue our journey through the stack, to figure out |  | ||||||
| where the item on "list" was allocated or initialized. We move to line 410:: |  | ||||||
| 
 |  | ||||||
|     395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, |  | ||||||
|     396                         siginfo_t *info) |  | ||||||
|     397 { |  | ||||||
|     ... |  | ||||||
|     410                 collect_signal(sig, pending, info); |  | ||||||
|     ... |  | ||||||
|     414 } |  | ||||||
| 
 |  | ||||||
| Now we need to follow the ``pending`` pointer, since that is being passed on to |  | ||||||
| ``collect_signal()`` as ``list``. At this point, we've run out of lines from the |  | ||||||
| "addr2line" output. Not to worry, we just paste the next addresses from the |  | ||||||
| kmemcheck stack dump, i.e.:: |  | ||||||
| 
 |  | ||||||
|      [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170 |  | ||||||
|      [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390 |  | ||||||
|      [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0 |  | ||||||
|      [<ffffffff8100c7b5>] int_signal+0x12/0x17 |  | ||||||
| 
 |  | ||||||
| 	$ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \ |  | ||||||
| 		ffffffff8100b87d ffffffff8100c7b5 |  | ||||||
| 	kernel/signal.c:446 |  | ||||||
| 	kernel/signal.c:1806 |  | ||||||
| 	arch/x86/kernel/signal.c:805 |  | ||||||
| 	arch/x86/kernel/signal.c:871 |  | ||||||
| 	arch/x86/kernel/entry_64.S:694 |  | ||||||
| 
 |  | ||||||
| Remember that since these addresses were found on the stack and not as the |  | ||||||
| RIP value, they actually point to the _next_ instruction (they are return |  | ||||||
| addresses). This becomes obvious when we look at the code for line 446:: |  | ||||||
| 
 |  | ||||||
|     422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |  | ||||||
|     423 { |  | ||||||
|     ... |  | ||||||
|     431                 signr = __dequeue_signal(&tsk->signal->shared_pending, |  | ||||||
|     432						 mask, info); |  | ||||||
|     433			/* |  | ||||||
|     434			 * itimer signal ? |  | ||||||
|     435			 * |  | ||||||
|     436			 * itimers are process shared and we restart periodic |  | ||||||
|     437			 * itimers in the signal delivery path to prevent DoS |  | ||||||
|     438			 * attacks in the high resolution timer case. This is |  | ||||||
|     439			 * compliant with the old way of self restarting |  | ||||||
|     440			 * itimers, as the SIGALRM is a legacy signal and only |  | ||||||
|     441			 * queued once. Changing the restart behaviour to |  | ||||||
|     442			 * restart the timer in the signal dequeue path is |  | ||||||
|     443			 * reducing the timer noise on heavy loaded !highres |  | ||||||
|     444			 * systems too. |  | ||||||
|     445			 */ |  | ||||||
|     446			if (unlikely(signr == SIGALRM)) { |  | ||||||
|     ... |  | ||||||
|     489 } |  | ||||||
| 
 |  | ||||||
| So instead of looking at 446, we should be looking at 431, which is the line |  | ||||||
| that executes just before 446. Here we see that what we are looking for is |  | ||||||
| ``&tsk->signal->shared_pending``. |  | ||||||
| 
 |  | ||||||
| Our next task is to figure out which function puts items on this |  | ||||||
| ``shared_pending`` list. A crude but efficient tool is ``git grep``:: |  | ||||||
| 
 |  | ||||||
| 	$ git grep -n 'shared_pending' kernel/ |  | ||||||
| 	... |  | ||||||
| 	kernel/signal.c:828:	pending = group ? &t->signal->shared_pending : &t->pending; |  | ||||||
| 	kernel/signal.c:1339:	pending = group ? &t->signal->shared_pending : &t->pending; |  | ||||||
| 	... |  | ||||||
| 
 |  | ||||||
| There were more results, but none of them were related to list operations, |  | ||||||
| and these were the only assignments. We inspect the line numbers more closely |  | ||||||
| and find that this is indeed where items are being added to the list:: |  | ||||||
| 
 |  | ||||||
|     816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, |  | ||||||
|     817				int group) |  | ||||||
|     818 { |  | ||||||
|     ... |  | ||||||
|     828		pending = group ? &t->signal->shared_pending : &t->pending; |  | ||||||
|     ... |  | ||||||
|     851		q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && |  | ||||||
|     852						     (is_si_special(info) || |  | ||||||
|     853						      info->si_code >= 0))); |  | ||||||
|     854		if (q) { |  | ||||||
|     855			list_add_tail(&q->list, &pending->list); |  | ||||||
|     ... |  | ||||||
|     890 } |  | ||||||
| 
 |  | ||||||
| and:: |  | ||||||
| 
 |  | ||||||
|     1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) |  | ||||||
|     1310 { |  | ||||||
|     .... |  | ||||||
|     1339	 pending = group ? &t->signal->shared_pending : &t->pending; |  | ||||||
|     1340	 list_add_tail(&q->list, &pending->list); |  | ||||||
|     .... |  | ||||||
|     1347 } |  | ||||||
| 
 |  | ||||||
| In the first case, the list element we are looking for, ``q``, is being |  | ||||||
| returned from the function ``__sigqueue_alloc()``, which looks like an |  | ||||||
| allocation function.  Let's take a look at it:: |  | ||||||
| 
 |  | ||||||
|     187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, |  | ||||||
|     188						 int override_rlimit) |  | ||||||
|     189 { |  | ||||||
|     190		struct sigqueue *q = NULL; |  | ||||||
|     191		struct user_struct *user; |  | ||||||
|     192 |  | ||||||
|     193		/* |  | ||||||
|     194		 * We won't get problems with the target's UID changing under us |  | ||||||
|     195		 * because changing it requires RCU be used, and if t != current, the |  | ||||||
|     196		 * caller must be holding the RCU readlock (by way of a spinlock) and |  | ||||||
|     197		 * we use RCU protection here |  | ||||||
|     198		 */ |  | ||||||
|     199		user = get_uid(__task_cred(t)->user); |  | ||||||
|     200		atomic_inc(&user->sigpending); |  | ||||||
|     201		if (override_rlimit || |  | ||||||
|     202		    atomic_read(&user->sigpending) <= |  | ||||||
|     203				t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) |  | ||||||
|     204			q = kmem_cache_alloc(sigqueue_cachep, flags); |  | ||||||
|     205		if (unlikely(q == NULL)) { |  | ||||||
|     206			atomic_dec(&user->sigpending); |  | ||||||
|     207			free_uid(user); |  | ||||||
|     208		} else { |  | ||||||
|     209			INIT_LIST_HEAD(&q->list); |  | ||||||
|     210			q->flags = 0; |  | ||||||
|     211			q->user = user; |  | ||||||
|     212		} |  | ||||||
|     213 |  | ||||||
|     214		return q; |  | ||||||
|     215 } |  | ||||||
| 
 |  | ||||||
| We see that this function initializes ``q->list``, ``q->flags``, and |  | ||||||
| ``q->user``. It seems that now is the time to look at the definition of |  | ||||||
| ``struct sigqueue``, e.g.:: |  | ||||||
| 
 |  | ||||||
|     14 struct sigqueue { |  | ||||||
|     15	       struct list_head list; |  | ||||||
|     16	       int flags; |  | ||||||
|     17	       siginfo_t info; |  | ||||||
|     18	       struct user_struct *user; |  | ||||||
|     19 }; |  | ||||||
| 
 |  | ||||||
| And, you might remember, it was a ``memcpy()`` on ``&first->info`` that |  | ||||||
| caused the warning, so this makes perfect sense. It also seems reasonable |  | ||||||
| to assume that it is the caller of ``__sigqueue_alloc()`` that has the |  | ||||||
| responsibility of filling out (initializing) this member. |  | ||||||
| 
 |  | ||||||
| But just which fields of the struct were uninitialized? Let's look at |  | ||||||
| kmemcheck's report again:: |  | ||||||
| 
 |  | ||||||
|     WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) |  | ||||||
|     80000000000000000000000000000000000000000088ffff0000000000000000 |  | ||||||
|      i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u |  | ||||||
| 	     ^ |  | ||||||
| 
 |  | ||||||
| These first two lines are the memory dump of the memory object itself, and |  | ||||||
| the shadow bytemap, respectively. The memory object itself is in this case |  | ||||||
| ``&first->info``. Just beware that the start of this dump is NOT the start |  | ||||||
| of the object itself! The position of the caret (^) corresponds with the |  | ||||||
| address of the read (ffff88003e4a2024). |  | ||||||
| 
 |  | ||||||
| The shadow bytemap dump legend is as follows: |  | ||||||
| 
 |  | ||||||
| - i: initialized |  | ||||||
| - u: uninitialized |  | ||||||
| - a: unallocated (memory has been allocated by the slab layer, but has not |  | ||||||
|   yet been handed off to anybody) |  | ||||||
| - f: freed (memory has been allocated by the slab layer, but has been freed |  | ||||||
|   by the previous owner) |  | ||||||
| 
 |  | ||||||
| In order to figure out where (relative to the start of the object) the |  | ||||||
| uninitialized memory was located, we have to look at the disassembly. For |  | ||||||
| that, we'll need the RIP address again:: |  | ||||||
| 
 |  | ||||||
|     RIP: 0010:[<ffffffff8104ede8>]  [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190 |  | ||||||
| 
 |  | ||||||
| 	$ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8: |  | ||||||
| 	ffffffff8104edc8:	mov    %r8,0x8(%r8) |  | ||||||
| 	ffffffff8104edcc:	test   %r10d,%r10d |  | ||||||
| 	ffffffff8104edcf:	js     ffffffff8104ee88 <__dequeue_signal+0x168> |  | ||||||
| 	ffffffff8104edd5:	mov    %rax,%rdx |  | ||||||
| 	ffffffff8104edd8:	mov    $0xc,%ecx |  | ||||||
| 	ffffffff8104eddd:	mov    %r13,%rdi |  | ||||||
| 	ffffffff8104ede0:	mov    $0x30,%eax |  | ||||||
| 	ffffffff8104ede5:	mov    %rdx,%rsi |  | ||||||
| 	ffffffff8104ede8:	rep movsl %ds:(%rsi),%es:(%rdi) |  | ||||||
| 	ffffffff8104edea:	test   $0x2,%al |  | ||||||
| 	ffffffff8104edec:	je     ffffffff8104edf0 <__dequeue_signal+0xd0> |  | ||||||
| 	ffffffff8104edee:	movsw  %ds:(%rsi),%es:(%rdi) |  | ||||||
| 	ffffffff8104edf0:	test   $0x1,%al |  | ||||||
| 	ffffffff8104edf2:	je     ffffffff8104edf5 <__dequeue_signal+0xd5> |  | ||||||
| 	ffffffff8104edf4:	movsb  %ds:(%rsi),%es:(%rdi) |  | ||||||
| 	ffffffff8104edf5:	mov    %r8,%rdi |  | ||||||
| 	ffffffff8104edf8:	callq  ffffffff8104de60 <__sigqueue_free> |  | ||||||
| 
 |  | ||||||
| As expected, it's the "``rep movsl``" instruction from the ``memcpy()`` |  | ||||||
| that causes the warning. We know that ``REP MOVSL`` uses the ``RCX`` register |  | ||||||
| to count the number of remaining iterations. By taking a look at the |  | ||||||
| register dump again (from the kmemcheck report), we can figure out how many |  | ||||||
| bytes were left to copy:: |  | ||||||
| 
 |  | ||||||
|     RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 |  | ||||||
| 
 |  | ||||||
| By looking at the disassembly, we also see that ``%ecx`` is being loaded |  | ||||||
| with the value ``$0xc`` just before (ffffffff8104edd8), so we are very |  | ||||||
| lucky. Keep in mind that this is the number of iterations, not bytes. And |  | ||||||
| since this is a "long" operation, we need to multiply by 4 to get the |  | ||||||
| number of bytes. So this means that the uninitialized value was encountered |  | ||||||
| at 4 * (0xc - 0x9) = 12 bytes from the start of the object. |  | ||||||
| 
 |  | ||||||
| We can now try to figure out which field of the "``struct siginfo``" was |  | ||||||
| not initialized. This is the beginning of the struct:: |  | ||||||
| 
 |  | ||||||
|     40 typedef struct siginfo { |  | ||||||
|     41	       int si_signo; |  | ||||||
|     42	       int si_errno; |  | ||||||
|     43	       int si_code; |  | ||||||
|     44 |  | ||||||
|     45	       union { |  | ||||||
|     .. |  | ||||||
|     92	       } _sifields; |  | ||||||
|     93 } siginfo_t; |  | ||||||
| 
 |  | ||||||
| On 64-bit, an int is 4 bytes long, so it must be the union member that has |  | ||||||
| not been initialized. We can verify this using gdb:: |  | ||||||
| 
 |  | ||||||
| 	$ gdb vmlinux |  | ||||||
| 	... |  | ||||||
| 	(gdb) p &((struct siginfo *) 0)->_sifields |  | ||||||
| 	$1 = (union {...} *) 0x10 |  | ||||||
| 
 |  | ||||||
| Actually, it seems that the union member is located at offset 0x10 -- which |  | ||||||
| means that gcc has inserted 4 bytes of padding between the members ``si_code`` |  | ||||||
| and ``_sifields``. We can now get a fuller picture of the memory dump:: |  | ||||||
| 
 |  | ||||||
| 		 _----------------------------=> si_code |  | ||||||
| 		/	 _--------------------=> (padding) |  | ||||||
| 	       |	/	 _------------=> _sifields(._kill._pid) |  | ||||||
| 	       |       |	/	 _----=> _sifields(._kill._uid) |  | ||||||
| 	       |       |       |	/ |  | ||||||
| 	-------|-------|-------|-------| |  | ||||||
| 	80000000000000000000000000000000000000000088ffff0000000000000000 |  | ||||||
| 	 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u |  | ||||||
| 
 |  | ||||||
| This allows us to realize another important fact: ``si_code`` contains the |  | ||||||
| value 0x80. Remember that x86 is little endian, so the first 4 bytes |  | ||||||
| "80000000" are really the number 0x00000080. With a bit of research, we |  | ||||||
| find that this is actually the constant ``SI_KERNEL`` defined in |  | ||||||
| ``include/asm-generic/siginfo.h``:: |  | ||||||
| 
 |  | ||||||
|     144 #define SI_KERNEL	0x80		/* sent by the kernel from somewhere	 */ |  | ||||||
| 
 |  | ||||||
| This macro is used in exactly one place in the x86 kernel: In ``send_signal()`` |  | ||||||
| in ``kernel/signal.c``:: |  | ||||||
| 
 |  | ||||||
|     816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, |  | ||||||
|     817				int group) |  | ||||||
|     818 { |  | ||||||
|     ... |  | ||||||
|     828		pending = group ? &t->signal->shared_pending : &t->pending; |  | ||||||
|     ... |  | ||||||
|     851		q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && |  | ||||||
|     852						     (is_si_special(info) || |  | ||||||
|     853						      info->si_code >= 0))); |  | ||||||
|     854		if (q) { |  | ||||||
|     855			list_add_tail(&q->list, &pending->list); |  | ||||||
|     856			switch ((unsigned long) info) { |  | ||||||
|     ... |  | ||||||
|     865			case (unsigned long) SEND_SIG_PRIV: |  | ||||||
|     866				q->info.si_signo = sig; |  | ||||||
|     867				q->info.si_errno = 0; |  | ||||||
|     868				q->info.si_code = SI_KERNEL; |  | ||||||
|     869				q->info.si_pid = 0; |  | ||||||
|     870				q->info.si_uid = 0; |  | ||||||
|     871				break; |  | ||||||
|     ... |  | ||||||
|     890 } |  | ||||||
| 
 |  | ||||||
| Not only does this match with the ``.si_code`` member, it also matches the place |  | ||||||
| we found earlier when looking for where siginfo_t objects are enqueued on the |  | ||||||
| ``shared_pending`` list. |  | ||||||
| 
 |  | ||||||
| So to sum up: It seems that it is the padding introduced by the compiler |  | ||||||
| between two struct fields that is uninitialized, and this gets reported when |  | ||||||
| we do a ``memcpy()`` on the struct. This means that we have identified a false |  | ||||||
| positive warning. |  | ||||||
| 
 |  | ||||||
| Normally, kmemcheck will not report uninitialized accesses in ``memcpy()`` calls |  | ||||||
| when both the source and destination addresses are tracked. (Instead, we copy |  | ||||||
| the shadow bytemap as well). In this case, the destination address clearly |  | ||||||
| was not tracked. We can dig a little deeper into the stack trace from above:: |  | ||||||
| 
 |  | ||||||
| 	arch/x86/kernel/signal.c:805 |  | ||||||
| 	arch/x86/kernel/signal.c:871 |  | ||||||
| 	arch/x86/kernel/entry_64.S:694 |  | ||||||
| 
 |  | ||||||
| And we clearly see that the destination siginfo object is located on the |  | ||||||
| stack:: |  | ||||||
| 
 |  | ||||||
|     782 static void do_signal(struct pt_regs *regs) |  | ||||||
|     783 { |  | ||||||
|     784		struct k_sigaction ka; |  | ||||||
|     785		siginfo_t info; |  | ||||||
|     ... |  | ||||||
|     804		signr = get_signal_to_deliver(&info, &ka, regs, NULL); |  | ||||||
|     ... |  | ||||||
|     854 } |  | ||||||
| 
 |  | ||||||
| And this ``&info`` is what eventually gets passed to ``copy_siginfo()`` as the |  | ||||||
| destination argument. |  | ||||||
| 
 |  | ||||||
| Now, even though we didn't find an actual error here, the example is still a |  | ||||||
| good one, because it shows how one would go about finding out what the report |  | ||||||
| was all about. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Annotating false positives |  | ||||||
| ~~~~~~~~~~~~~~~~~~~~~~~~~~ |  | ||||||
| 
 |  | ||||||
| There are a few different ways to make annotations in the source code that |  | ||||||
| will keep kmemcheck from checking and reporting certain allocations. Here |  | ||||||
| they are: |  | ||||||
| 
 |  | ||||||
| - ``__GFP_NOTRACK_FALSE_POSITIVE`` |  | ||||||
| 	This flag can be passed to ``kmalloc()`` or ``kmem_cache_alloc()`` |  | ||||||
| 	(therefore also to other functions that end up calling one of |  | ||||||
| 	these) to indicate that the allocation should not be tracked |  | ||||||
| 	because it would lead to a false positive report. This is a "big |  | ||||||
| 	positive pertains to a particular field in a struct, for example, we |  | ||||||
| 	positive pertains to particular field in a struct, for example, we |  | ||||||
| 	will now lose the ability to find (real) errors in other parts of |  | ||||||
| 	the same struct. |  | ||||||
| 
 |  | ||||||
| 	Example:: |  | ||||||
| 
 |  | ||||||
| 	    /* No warnings will ever trigger on accessing any part of x */ |  | ||||||
| 	    x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE); |  | ||||||
| 
 |  | ||||||
| - ``kmemcheck_bitfield_begin(name)``/``kmemcheck_bitfield_end(name)`` and |  | ||||||
| 	``kmemcheck_annotate_bitfield(ptr, name)`` |  | ||||||
| 	The first two of these three macros can be used inside struct |  | ||||||
| 	definitions to signal, respectively, the beginning and end of a |  | ||||||
| 	bitfield. Additionally, this will assign the bitfield a name, which |  | ||||||
| 	is given as an argument to the macros. |  | ||||||
| 
 |  | ||||||
| 	Having used these markers, one can later use |  | ||||||
| 	kmemcheck_annotate_bitfield() at the point of allocation, to indicate |  | ||||||
| 	which parts of the allocation are part of a bitfield. |  | ||||||
| 
 |  | ||||||
| 	Example:: |  | ||||||
| 
 |  | ||||||
| 	    struct foo { |  | ||||||
| 		int x; |  | ||||||
| 
 |  | ||||||
| 		kmemcheck_bitfield_begin(flags); |  | ||||||
| 		int flag_a:1; |  | ||||||
| 		int flag_b:1; |  | ||||||
| 		kmemcheck_bitfield_end(flags); |  | ||||||
| 
 |  | ||||||
| 		int y; |  | ||||||
| 	    }; |  | ||||||
| 
 |  | ||||||
| 	    struct foo *x = kmalloc(sizeof *x, GFP_KERNEL); |  | ||||||
| 
 |  | ||||||
| 	    /* No warnings will trigger on accessing the bitfield of x */ |  | ||||||
| 	    kmemcheck_annotate_bitfield(x, flags); |  | ||||||
| 
 |  | ||||||
| 	Note that ``kmemcheck_annotate_bitfield()`` can be used even before the |  | ||||||
| 	return value of ``kmalloc()`` is checked -- in other words, passing NULL |  | ||||||
| 	as the first argument is legal (and will do nothing). |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Reporting errors |  | ||||||
| ---------------- |  | ||||||
| 
 |  | ||||||
| As we have seen, kmemcheck will produce false positive reports. Therefore, it |  | ||||||
| is not very wise to blindly post kmemcheck warnings to mailing lists and |  | ||||||
| maintainers. Instead, I encourage maintainers and developers to find errors |  | ||||||
| in their own code. If you get a warning, you can try to work around it, try |  | ||||||
| to figure out if it's a real error or not, or simply ignore it. Most |  | ||||||
| developers know their own code and will quickly and efficiently determine the |  | ||||||
| root cause of a kmemcheck report. This is therefore also the most efficient |  | ||||||
| way to work with kmemcheck. |  | ||||||
| 
 |  | ||||||
| That said, we (the kmemcheck maintainers) will always be on the lookout for |  | ||||||
| false positives that we can annotate and silence. So whatever you find, |  | ||||||
| please drop us a note privately! Kernel configs and steps to reproduce (if |  | ||||||
| available) are of course a great help too. |  | ||||||
| 
 |  | ||||||
| Happy hacking! |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Technical description |  | ||||||
| --------------------- |  | ||||||
| 
 |  | ||||||
| kmemcheck works by marking memory pages non-present. This means that whenever |  | ||||||
| somebody attempts to access the page, a page fault is generated. The page |  | ||||||
| fault handler notices that the page was in fact only hidden, and so it calls |  | ||||||
| on the kmemcheck code to make further investigations. |  | ||||||
| 
 |  | ||||||
| When the investigations are completed, kmemcheck "shows" the page by marking |  | ||||||
| it present (as it would be under normal circumstances). This way, the |  | ||||||
| interrupted code can continue as usual. |  | ||||||
| 
 |  | ||||||
| But after the instruction has been executed, we should hide the page again, so |  | ||||||
| that we can catch the next access too! Now kmemcheck makes use of a debugging |  | ||||||
| feature of the processor, namely single-stepping. When the processor has |  | ||||||
| finished the one instruction that generated the memory access, a debug |  | ||||||
| exception is raised. From here, we simply hide the page again and continue |  | ||||||
| execution, this time with the single-stepping feature turned off. |  | ||||||
| 
 |  | ||||||
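
The trap-and-single-step cycle described above can be summarized in the
following sketch (simplified pseudo-code; the ``sketch_*`` helper names are
illustrative and are not the real kmemcheck internals)::

    /* Illustrative helpers -- not the real kmemcheck API. */
    static bool sketch_page_is_hidden(unsigned long address);
    static void sketch_check_shadow(unsigned long address);
    static void sketch_show_page(unsigned long address);
    static void sketch_hide_page(unsigned long address);

    static unsigned long sketch_last_address;

    /* #PF handler path for a page that kmemcheck has hidden (non-present). */
    static bool sketch_handle_page_fault(unsigned long address,
                                         struct pt_regs *regs)
    {
            if (!sketch_page_is_hidden(address))
                    return false;       /* not ours; normal fault handling */

            sketch_check_shadow(address);   /* warn if shadow says "uninitialized" */
            sketch_show_page(address);      /* mark present so the access can retry */
            sketch_last_address = address;
            regs->flags |= X86_EFLAGS_TF;   /* single-step the faulting instruction */
            return true;
    }

    /* #DB handler: the instruction has now executed exactly once. */
    static void sketch_handle_debug_exception(struct pt_regs *regs)
    {
            sketch_hide_page(sketch_last_address);  /* catch the next access too */
            regs->flags &= ~X86_EFLAGS_TF;          /* stop single-stepping */
    }
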
| kmemcheck requires some assistance from the memory allocator in order to work. |  | ||||||
| The memory allocator needs to |  | ||||||
| 
 |  | ||||||
|   1. Tell kmemcheck about newly allocated pages and pages that are about to |  | ||||||
|      be freed. This allows kmemcheck to set up and tear down the shadow memory |  | ||||||
|      for the pages in question. The shadow memory stores the status of each |  | ||||||
|      byte in the allocation proper, e.g. whether it is initialized or |  | ||||||
|      uninitialized. |  | ||||||
| 
 |  | ||||||
|   2. Tell kmemcheck which parts of memory should be marked uninitialized. |  | ||||||
|      There are actually a few more states, such as "not yet allocated" and |  | ||||||
|      "recently freed". |  | ||||||
| 
 |  | ||||||
| If a slab cache is set up using the SLAB_NOTRACK flag, it will never return |  | ||||||
| memory that can take page faults because of kmemcheck. |  | ||||||
| 
 |  | ||||||
| If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still |  | ||||||
| request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags. |  | ||||||
| This does not prevent the page faults from occurring, however, but marks the |  | ||||||
| object in question as being initialized so that no warnings will ever be |  | ||||||
| produced for this object. |  | ||||||
| 
 |  | ||||||
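
A rough sketch of how a caller could use these flags (the cache and object
names here are made up for illustration; only the flags themselves come from
the text above)::

    #include <linux/init.h>
    #include <linux/slab.h>
    #include <linux/gfp.h>

    struct example_obj {
            int a;
            int b;
    };

    static struct kmem_cache *example_cachep;

    static int __init example_init(void)
    {
            /* Objects from this cache are never tracked by kmemcheck. */
            example_cachep = kmem_cache_create("example_cache",
                                               sizeof(struct example_obj),
                                               0, SLAB_NOTRACK, NULL);
            return example_cachep ? 0 : -ENOMEM;
    }

    static void *example_alloc(size_t size)
    {
            /*
             * Alternatively, a single allocation can be exempted:
             * __GFP_NOTRACK marks the object as initialized, so kmemcheck
             * never warns about it.
             */
            return kmalloc(size, GFP_KERNEL | __GFP_NOTRACK);
    }
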
| Currently, the SLAB and SLUB allocators are supported by kmemcheck. |  | ||||||
|  | @ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8) | ||||||
|  VmExe                       size of text segment |  VmExe                       size of text segment | ||||||
|  VmLib                       size of shared library code |  VmLib                       size of shared library code | ||||||
|  VmPTE                       size of page table entries |  VmPTE                       size of page table entries | ||||||
|  VmPMD                       size of second level page tables |  | ||||||
|  VmSwap                      amount of swap used by anonymous private data |  VmSwap                      amount of swap used by anonymous private data | ||||||
|                              (shmem swap usage is not included) |                              (shmem swap usage is not included) | ||||||
|  HugetlbPages                size of hugetlb memory portions |  HugetlbPages                size of hugetlb memory portions | ||||||
|  |  | ||||||
|  | @ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: | ||||||
| - percpu_pagelist_fraction | - percpu_pagelist_fraction | ||||||
| - stat_interval | - stat_interval | ||||||
| - stat_refresh | - stat_refresh | ||||||
|  | - numa_stat | ||||||
| - swappiness | - swappiness | ||||||
| - user_reserve_kbytes | - user_reserve_kbytes | ||||||
| - vfs_cache_pressure | - vfs_cache_pressure | ||||||
|  | @ -157,6 +158,10 @@ Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any | ||||||
| value lower than this limit will be ignored and the old configuration will be | value lower than this limit will be ignored and the old configuration will be | ||||||
| retained. | retained. | ||||||
| 
 | 
 | ||||||
|  | Note: the value of dirty_bytes also must be set greater than | ||||||
|  | dirty_background_bytes or the amount of memory corresponding to | ||||||
|  | dirty_background_ratio. | ||||||
|  | 
 | ||||||
| ============================================================== | ============================================================== | ||||||
| 
 | 
 | ||||||
| dirty_expire_centisecs | dirty_expire_centisecs | ||||||
|  | @ -176,6 +181,9 @@ generating disk writes will itself start writing out dirty data. | ||||||
| 
 | 
 | ||||||
| The total available memory is not equal to total system memory. | The total available memory is not equal to total system memory. | ||||||
| 
 | 
 | ||||||
|  | Note: dirty_ratio must be set greater than dirty_background_ratio or | ||||||
|  | ratio corresponding to dirty_background_bytes. | ||||||
|  | 
 | ||||||
| ============================================================== | ============================================================== | ||||||
| 
 | 
 | ||||||
| dirty_writeback_centisecs | dirty_writeback_centisecs | ||||||
|  | @ -622,7 +630,7 @@ oom_dump_tasks | ||||||
| 
 | 
 | ||||||
| Enables a system-wide task dump (excluding kernel threads) to be produced | Enables a system-wide task dump (excluding kernel threads) to be produced | ||||||
| when the kernel performs an OOM-killing and includes such information as | when the kernel performs an OOM-killing and includes such information as | ||||||
| pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj | pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj | ||||||
| score, and name.  This is helpful to determine why the OOM killer was | score, and name.  This is helpful to determine why the OOM killer was | ||||||
| invoked, to identify the rogue task that caused it, and to determine why | invoked, to identify the rogue task that caused it, and to determine why | ||||||
| the OOM killer chose the task it did to kill. | the OOM killer chose the task it did to kill. | ||||||
|  | @ -792,6 +800,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) | ||||||
| 
 | 
 | ||||||
| ============================================================== | ============================================================== | ||||||
| 
 | 
 | ||||||
|  | numa_stat | ||||||
|  | 
 | ||||||
|  | This interface allows runtime configuration of numa statistics. | ||||||
|  | 
 | ||||||
|  | When page allocation performance becomes a bottleneck and you can tolerate | ||||||
|  | some possible tool breakage and decreased numa counter precision, you can | ||||||
|  | do: | ||||||
|  | 	echo 0 > /proc/sys/vm/numa_stat | ||||||
|  | 
 | ||||||
|  | When page allocation performance is not a bottleneck and you want all | ||||||
|  | tooling to work, you can do: | ||||||
|  | 	echo 1 > /proc/sys/vm/numa_stat | ||||||
|  | 
 | ||||||
|  | ============================================================== | ||||||
|  | 
 | ||||||
| swappiness | swappiness | ||||||
| 
 | 
 | ||||||
| This control is used to define how aggressive the kernel will swap | This control is used to define how aggressive the kernel will swap | ||||||

Documentation/vm/mmu_notifier.txt (new file, 93 lines)
							|  | @ -0,0 +1,93 @@ | ||||||
|  | When do you need to notify inside the page table lock? | ||||||
|  | 
 | ||||||
|  | When clearing a pte/pmd, we are given a choice to notify the event under the | ||||||
|  | page table lock (the notify versions of *_clear_flush call | ||||||
|  | mmu_notifier_invalidate_range()). But that notification is not necessary in | ||||||
|  | all cases. | ||||||
|  | 
 | ||||||
|  | For a secondary TLB (non-CPU TLB), such as an IOMMU TLB or a device TLB (where | ||||||
|  | the device uses something like ATS/PASID to have the IOMMU walk the CPU page | ||||||
|  | table and access a process's virtual address space), there are only two cases | ||||||
|  | in which you need to notify those secondary TLBs while holding the page table | ||||||
|  | lock when clearing a pte/pmd: | ||||||
|  | 
 | ||||||
|  |   A) the page backing the address is freed before mmu_notifier_invalidate_range_end() | ||||||
|  |   B) a page table entry is updated to point to a new page (COW, write fault | ||||||
|  |      on zero page, __replace_page(), ...) | ||||||
|  | 
 | ||||||
|  | Case A is obvious: you do not want to take the risk of the device writing to | ||||||
|  | a page that might now be used by some completely different task. | ||||||
|  | 
 | ||||||
|  | Case B is more subtle. For correctness it requires the following sequence to | ||||||
|  | happen: | ||||||
|  |   - take page table lock | ||||||
|  |   - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) | ||||||
|  |   - set page table entry to point to new page | ||||||
|  | 
 | ||||||
|  | If clearing the page table entry is not followed by a notify before setting | ||||||
|  | the new pte/pmd value, then you can break the memory model (such as C11 or | ||||||
|  | C++11) for the device. | ||||||
|  | 
 | ||||||
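
As a sketch in C of the correct case-B sequence (illustrative only; real call
sites in mm/ differ in their locking and fault plumbing):

    #include <linux/mm.h>
    #include <linux/mmu_notifier.h>

    /* Illustrative sketch of case B (a COW-style page replacement). */
    static void sketch_replace_page(struct vm_area_struct *vma,
                                    unsigned long addr, pte_t *ptep,
                                    spinlock_t *ptl, struct page *new_page)
    {
            struct mm_struct *mm = vma->vm_mm;

            mmu_notifier_invalidate_range_start(mm, addr, addr + PAGE_SIZE);

            spin_lock(ptl);
            /*
             * Clear the entry AND notify while still holding the page table
             * lock, so the secondary (device) TLB is invalidated before the
             * entry can be observed pointing at the new page.
             */
            ptep_clear_flush_notify(vma, addr, ptep);
            set_pte_at(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
            spin_unlock(ptl);

            mmu_notifier_invalidate_range_end(mm, addr, addr + PAGE_SIZE);
    }
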
|  | Consider the following scenario (the device uses a feature similar to ATS/PASID): | ||||||
|  | 
 | ||||||
|  | Take two addresses, addrA and addrB, such that |addrA - addrB| >= PAGE_SIZE; | ||||||
|  | we assume they are write protected for COW (the other cases of B apply too). | ||||||
|  | 
 | ||||||
|  | [Time N] -------------------------------------------------------------------- | ||||||
|  | CPU-thread-0  {try to write to addrA} | ||||||
|  | CPU-thread-1  {try to write to addrB} | ||||||
|  | CPU-thread-2  {} | ||||||
|  | CPU-thread-3  {} | ||||||
|  | DEV-thread-0  {read addrA and populate device TLB} | ||||||
|  | DEV-thread-2  {read addrB and populate device TLB} | ||||||
|  | [Time N+1] ------------------------------------------------------------------ | ||||||
|  | CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} | ||||||
|  | CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} | ||||||
|  | CPU-thread-2  {} | ||||||
|  | CPU-thread-3  {} | ||||||
|  | DEV-thread-0  {} | ||||||
|  | DEV-thread-2  {} | ||||||
|  | [Time N+2] ------------------------------------------------------------------ | ||||||
|  | CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}} | ||||||
|  | CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}} | ||||||
|  | CPU-thread-2  {} | ||||||
|  | CPU-thread-3  {} | ||||||
|  | DEV-thread-0  {} | ||||||
|  | DEV-thread-2  {} | ||||||
|  | [Time N+3] ------------------------------------------------------------------ | ||||||
|  | CPU-thread-0  {preempted} | ||||||
|  | CPU-thread-1  {preempted} | ||||||
|  | CPU-thread-2  {write to addrA which is a write to new page} | ||||||
|  | CPU-thread-3  {} | ||||||
|  | DEV-thread-0  {} | ||||||
|  | DEV-thread-2  {} | ||||||
|  | [Time N+3] ------------------------------------------------------------------ | ||||||
|  | CPU-thread-0  {preempted} | ||||||
|  | CPU-thread-1  {preempted} | ||||||
|  | CPU-thread-2  {} | ||||||
|  | CPU-thread-3  {write to addrB which is a write to new page} | ||||||
|  | DEV-thread-0  {} | ||||||
|  | DEV-thread-2  {} | ||||||
|  | [Time N+4] ------------------------------------------------------------------ | ||||||
|  | CPU-thread-0  {preempted} | ||||||
|  | CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} | ||||||
|  | CPU-thread-2  {} | ||||||
|  | CPU-thread-3  {} | ||||||
|  | DEV-thread-0  {} | ||||||
|  | DEV-thread-2  {} | ||||||
|  | [Time N+5] ------------------------------------------------------------------ | ||||||
|  | CPU-thread-0  {preempted} | ||||||
|  | CPU-thread-1  {} | ||||||
|  | CPU-thread-2  {} | ||||||
|  | CPU-thread-3  {} | ||||||
|  | DEV-thread-0  {read addrA from old page} | ||||||
|  | DEV-thread-2  {read addrB from new page} | ||||||
|  | 
 | ||||||
|  | So here, because at time N+2 the page table entry clear was not paired with a | ||||||
|  | notification to invalidate the secondary TLB, the device sees the new value | ||||||
|  | for addrB before seeing the new value for addrA. This breaks total memory | ||||||
|  | ordering for the device. | ||||||
|  | 
 | ||||||
|  | When changing a pte to write-protect it, or to point to a new write-protected | ||||||
|  | page with the same content (KSM), it is fine to delay the | ||||||
|  | mmu_notifier_invalidate_range call to mmu_notifier_invalidate_range_end() | ||||||
|  | outside the page table lock. This is true even if the thread doing the page | ||||||
|  | table update is preempted right after releasing the page table lock but before | ||||||
|  | calling mmu_notifier_invalidate_range_end(). | ||||||
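
A rough sketch of that second pattern, for contrast (again illustrative only,
not an actual mm/ call site):

    /*
     * Write-protecting (or a KSM-style replacement with an identical
     * read-only page) may defer the secondary-TLB invalidation to
     * mmu_notifier_invalidate_range_end(), outside the page table lock.
     */
    static void sketch_write_protect(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep,
                                     spinlock_t *ptl)
    {
            struct mm_struct *mm = vma->vm_mm;
            pte_t entry;

            mmu_notifier_invalidate_range_start(mm, addr, addr + PAGE_SIZE);

            spin_lock(ptl);
            entry = ptep_clear_flush(vma, addr, ptep);  /* no _notify here */
            entry = pte_wrprotect(entry);
            set_pte_at(mm, addr, ptep, entry);
            spin_unlock(ptl);

            /* The device TLB flush can safely happen after the lock is dropped. */
            mmu_notifier_invalidate_range_end(mm, addr, addr + PAGE_SIZE);
    }
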

MAINTAINERS (10 lines changed)
							|  | @ -7692,16 +7692,6 @@ F:	include/linux/kdb.h | ||||||
| F:	include/linux/kgdb.h | F:	include/linux/kgdb.h | ||||||
| F:	kernel/debug/ | F:	kernel/debug/ | ||||||
| 
 | 
 | ||||||
| KMEMCHECK |  | ||||||
| M:	Vegard Nossum <vegardno@ifi.uio.no> |  | ||||||
| M:	Pekka Enberg <penberg@kernel.org> |  | ||||||
| S:	Maintained |  | ||||||
| F:	Documentation/dev-tools/kmemcheck.rst |  | ||||||
| F:	arch/x86/include/asm/kmemcheck.h |  | ||||||
| F:	arch/x86/mm/kmemcheck/ |  | ||||||
| F:	include/linux/kmemcheck.h |  | ||||||
| F:	mm/kmemcheck.c |  | ||||||
| 
 |  | ||||||
| KMEMLEAK | KMEMLEAK | ||||||
| M:	Catalin Marinas <catalin.marinas@arm.com> | M:	Catalin Marinas <catalin.marinas@arm.com> | ||||||
| S:	Maintained | S:	Maintained | ||||||
|  |  | ||||||
|  | @ -7,7 +7,6 @@ | ||||||
| #include <linux/mm_types.h> | #include <linux/mm_types.h> | ||||||
| #include <linux/scatterlist.h> | #include <linux/scatterlist.h> | ||||||
| #include <linux/dma-debug.h> | #include <linux/dma-debug.h> | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/kref.h> | #include <linux/kref.h> | ||||||
| 
 | 
 | ||||||
| #define ARM_MAPPING_ERROR		(~(dma_addr_t)0x0) | #define ARM_MAPPING_ERROR		(~(dma_addr_t)0x0) | ||||||
|  |  | ||||||
|  | @ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) | ||||||
| extern pgd_t *pgd_alloc(struct mm_struct *mm); | extern pgd_t *pgd_alloc(struct mm_struct *mm); | ||||||
| extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); | extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); | ||||||
| 
 | 
 | ||||||
| #define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) | #define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO) | ||||||
| 
 | 
 | ||||||
| static inline void clean_pte_table(pte_t *pte) | static inline void clean_pte_table(pte_t *pte) | ||||||
| { | { | ||||||
|  |  | ||||||
|  | @ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) | ||||||
| 	pte = pmd_pgtable(*pmd); | 	pte = pmd_pgtable(*pmd); | ||||||
| 	pmd_clear(pmd); | 	pmd_clear(pmd); | ||||||
| 	pte_free(mm, pte); | 	pte_free(mm, pte); | ||||||
| 	atomic_long_dec(&mm->nr_ptes); | 	mm_dec_nr_ptes(mm); | ||||||
| no_pmd: | no_pmd: | ||||||
| 	pud_clear(pud); | 	pud_clear(pud); | ||||||
| 	pmd_free(mm, pmd); | 	pmd_free(mm, pmd); | ||||||
|  |  | ||||||
|  | @ -85,7 +85,7 @@ config ARM64 | ||||||
| 	select HAVE_ARCH_BITREVERSE | 	select HAVE_ARCH_BITREVERSE | ||||||
| 	select HAVE_ARCH_HUGE_VMAP | 	select HAVE_ARCH_HUGE_VMAP | ||||||
| 	select HAVE_ARCH_JUMP_LABEL | 	select HAVE_ARCH_JUMP_LABEL | ||||||
| 	select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) | 	select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) | ||||||
| 	select HAVE_ARCH_KGDB | 	select HAVE_ARCH_KGDB | ||||||
| 	select HAVE_ARCH_MMAP_RND_BITS | 	select HAVE_ARCH_MMAP_RND_BITS | ||||||
| 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT | 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT | ||||||
|  |  | ||||||
|  | @ -26,7 +26,7 @@ | ||||||
| 
 | 
 | ||||||
| #define check_pgt_cache()		do { } while (0) | #define check_pgt_cache()		do { } while (0) | ||||||
| 
 | 
 | ||||||
| #define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) | #define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO) | ||||||
| #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t)) | #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t)) | ||||||
| 
 | 
 | ||||||
| #if CONFIG_PGTABLE_LEVELS > 2 | #if CONFIG_PGTABLE_LEVELS > 2 | ||||||
|  |  | ||||||
|  | @ -11,6 +11,7 @@ | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #define pr_fmt(fmt) "kasan: " fmt | #define pr_fmt(fmt) "kasan: " fmt | ||||||
|  | #include <linux/bootmem.h> | ||||||
| #include <linux/kasan.h> | #include <linux/kasan.h> | ||||||
| #include <linux/kernel.h> | #include <linux/kernel.h> | ||||||
| #include <linux/sched/task.h> | #include <linux/sched/task.h> | ||||||
|  | @ -35,77 +36,117 @@ static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); | ||||||
|  * with the physical address from __pa_symbol. |  * with the physical address from __pa_symbol. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, | static phys_addr_t __init kasan_alloc_zeroed_page(int node) | ||||||
| 					unsigned long end) | { | ||||||
|  | 	void *p = memblock_virt_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, | ||||||
|  | 					      __pa(MAX_DMA_ADDRESS), | ||||||
|  | 					      MEMBLOCK_ALLOC_ACCESSIBLE, node); | ||||||
|  | 	return __pa(p); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static pte_t *__init kasan_pte_offset(pmd_t *pmd, unsigned long addr, int node, | ||||||
|  | 				      bool early) | ||||||
|  | { | ||||||
|  | 	if (pmd_none(*pmd)) { | ||||||
|  | 		phys_addr_t pte_phys = early ? __pa_symbol(kasan_zero_pte) | ||||||
|  | 					     : kasan_alloc_zeroed_page(node); | ||||||
|  | 		__pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return early ? pte_offset_kimg(pmd, addr) | ||||||
|  | 		     : pte_offset_kernel(pmd, addr); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static pmd_t *__init kasan_pmd_offset(pud_t *pud, unsigned long addr, int node, | ||||||
|  | 				      bool early) | ||||||
|  | { | ||||||
|  | 	if (pud_none(*pud)) { | ||||||
|  | 		phys_addr_t pmd_phys = early ? __pa_symbol(kasan_zero_pmd) | ||||||
|  | 					     : kasan_alloc_zeroed_page(node); | ||||||
|  | 		__pud_populate(pud, pmd_phys, PMD_TYPE_TABLE); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return early ? pmd_offset_kimg(pud, addr) : pmd_offset(pud, addr); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static pud_t *__init kasan_pud_offset(pgd_t *pgd, unsigned long addr, int node, | ||||||
|  | 				      bool early) | ||||||
|  | { | ||||||
|  | 	if (pgd_none(*pgd)) { | ||||||
|  | 		phys_addr_t pud_phys = early ? __pa_symbol(kasan_zero_pud) | ||||||
|  | 					     : kasan_alloc_zeroed_page(node); | ||||||
|  | 		__pgd_populate(pgd, pud_phys, PMD_TYPE_TABLE); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return early ? pud_offset_kimg(pgd, addr) : pud_offset(pgd, addr); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __init kasan_pte_populate(pmd_t *pmd, unsigned long addr, | ||||||
|  | 				      unsigned long end, int node, bool early) | ||||||
| { | { | ||||||
| 	pte_t *pte; |  | ||||||
| 	unsigned long next; | 	unsigned long next; | ||||||
|  | 	pte_t *pte = kasan_pte_offset(pmd, addr, node, early); | ||||||
| 
 | 
 | ||||||
| 	if (pmd_none(*pmd)) |  | ||||||
| 		__pmd_populate(pmd, __pa_symbol(kasan_zero_pte), PMD_TYPE_TABLE); |  | ||||||
| 
 |  | ||||||
| 	pte = pte_offset_kimg(pmd, addr); |  | ||||||
| 	do { | 	do { | ||||||
|  | 		phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page) | ||||||
|  | 					      : kasan_alloc_zeroed_page(node); | ||||||
| 		next = addr + PAGE_SIZE; | 		next = addr + PAGE_SIZE; | ||||||
| 		set_pte(pte, pfn_pte(sym_to_pfn(kasan_zero_page), | 		set_pte(pte, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); | ||||||
| 					PAGE_KERNEL)); |  | ||||||
| 	} while (pte++, addr = next, addr != end && pte_none(*pte)); | 	} while (pte++, addr = next, addr != end && pte_none(*pte)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void __init kasan_early_pmd_populate(pud_t *pud, | static void __init kasan_pmd_populate(pud_t *pud, unsigned long addr, | ||||||
| 					unsigned long addr, | 				      unsigned long end, int node, bool early) | ||||||
| 					unsigned long end) |  | ||||||
| { | { | ||||||
| 	pmd_t *pmd; |  | ||||||
| 	unsigned long next; | 	unsigned long next; | ||||||
|  | 	pmd_t *pmd = kasan_pmd_offset(pud, addr, node, early); | ||||||
| 
 | 
 | ||||||
| 	if (pud_none(*pud)) |  | ||||||
| 		__pud_populate(pud, __pa_symbol(kasan_zero_pmd), PMD_TYPE_TABLE); |  | ||||||
| 
 |  | ||||||
| 	pmd = pmd_offset_kimg(pud, addr); |  | ||||||
| 	do { | 	do { | ||||||
| 		next = pmd_addr_end(addr, end); | 		next = pmd_addr_end(addr, end); | ||||||
| 		kasan_early_pte_populate(pmd, addr, next); | 		kasan_pte_populate(pmd, addr, next, node, early); | ||||||
| 	} while (pmd++, addr = next, addr != end && pmd_none(*pmd)); | 	} while (pmd++, addr = next, addr != end && pmd_none(*pmd)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void __init kasan_early_pud_populate(pgd_t *pgd, | static void __init kasan_pud_populate(pgd_t *pgd, unsigned long addr, | ||||||
| 					unsigned long addr, | 				      unsigned long end, int node, bool early) | ||||||
| 					unsigned long end) |  | ||||||
| { | { | ||||||
| 	pud_t *pud; |  | ||||||
| 	unsigned long next; | 	unsigned long next; | ||||||
|  | 	pud_t *pud = kasan_pud_offset(pgd, addr, node, early); | ||||||
| 
 | 
 | ||||||
| 	if (pgd_none(*pgd)) |  | ||||||
| 		__pgd_populate(pgd, __pa_symbol(kasan_zero_pud), PUD_TYPE_TABLE); |  | ||||||
| 
 |  | ||||||
| 	pud = pud_offset_kimg(pgd, addr); |  | ||||||
| 	do { | 	do { | ||||||
| 		next = pud_addr_end(addr, end); | 		next = pud_addr_end(addr, end); | ||||||
| 		kasan_early_pmd_populate(pud, addr, next); | 		kasan_pmd_populate(pud, addr, next, node, early); | ||||||
| 	} while (pud++, addr = next, addr != end && pud_none(*pud)); | 	} while (pud++, addr = next, addr != end && pud_none(*pud)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void __init kasan_map_early_shadow(void) | static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, | ||||||
|  | 				      int node, bool early) | ||||||
| { | { | ||||||
| 	unsigned long addr = KASAN_SHADOW_START; |  | ||||||
| 	unsigned long end = KASAN_SHADOW_END; |  | ||||||
| 	unsigned long next; | 	unsigned long next; | ||||||
| 	pgd_t *pgd; | 	pgd_t *pgd; | ||||||
| 
 | 
 | ||||||
| 	pgd = pgd_offset_k(addr); | 	pgd = pgd_offset_k(addr); | ||||||
| 	do { | 	do { | ||||||
| 		next = pgd_addr_end(addr, end); | 		next = pgd_addr_end(addr, end); | ||||||
| 		kasan_early_pud_populate(pgd, addr, next); | 		kasan_pud_populate(pgd, addr, next, node, early); | ||||||
| 	} while (pgd++, addr = next, addr != end); | 	} while (pgd++, addr = next, addr != end); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /* The early shadow maps everything to a single page of zeroes */ | ||||||
| asmlinkage void __init kasan_early_init(void) | asmlinkage void __init kasan_early_init(void) | ||||||
| { | { | ||||||
| 	BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61)); | 	BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61)); | ||||||
| 	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); | 	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); | ||||||
| 	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); | 	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); | ||||||
| 	kasan_map_early_shadow(); | 	kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, | ||||||
|  | 			   true); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Set up full kasan mappings, ensuring that the mapped pages are zeroed */ | ||||||
|  | static void __init kasan_map_populate(unsigned long start, unsigned long end, | ||||||
|  | 				      int node) | ||||||
|  | { | ||||||
|  | 	kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | @ -142,8 +183,8 @@ void __init kasan_init(void) | ||||||
| 	struct memblock_region *reg; | 	struct memblock_region *reg; | ||||||
| 	int i; | 	int i; | ||||||
| 
 | 
 | ||||||
| 	kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); | 	kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK; | ||||||
| 	kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); | 	kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end)); | ||||||
| 
 | 
 | ||||||
| 	mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); | 	mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); | ||||||
| 	mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); | 	mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); | ||||||
|  | @ -161,20 +202,9 @@ void __init kasan_init(void) | ||||||
| 
 | 
 | ||||||
| 	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); | 	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); | ||||||
| 
 | 
 | ||||||
| 	vmemmap_populate(kimg_shadow_start, kimg_shadow_end, | 	kasan_map_populate(kimg_shadow_start, kimg_shadow_end, | ||||||
| 			   pfn_to_nid(virt_to_pfn(lm_alias(_text)))); | 			   pfn_to_nid(virt_to_pfn(lm_alias(_text)))); | ||||||
| 
 | 
 | ||||||
| 	/*
 |  | ||||||
| 	 * vmemmap_populate() has populated the shadow region that covers the |  | ||||||
| 	 * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round |  | ||||||
| 	 * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent |  | ||||||
| 	 * kasan_populate_zero_shadow() from replacing the page table entries |  | ||||||
| 	 * (PMD or PTE) at the edges of the shadow region for the kernel |  | ||||||
| 	 * image. |  | ||||||
| 	 */ |  | ||||||
| 	kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE); |  | ||||||
| 	kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); |  | ||||||
| 
 |  | ||||||
| 	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, | 	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, | ||||||
| 				   (void *)mod_shadow_start); | 				   (void *)mod_shadow_start); | ||||||
| 	kasan_populate_zero_shadow((void *)kimg_shadow_end, | 	kasan_populate_zero_shadow((void *)kimg_shadow_end, | ||||||
|  | @ -191,7 +221,7 @@ void __init kasan_init(void) | ||||||
| 		if (start >= end) | 		if (start >= end) | ||||||
| 			break; | 			break; | ||||||
| 
 | 
 | ||||||
| 		vmemmap_populate((unsigned long)kasan_mem_to_shadow(start), | 		kasan_map_populate((unsigned long)kasan_mem_to_shadow(start), | ||||||
| 				   (unsigned long)kasan_mem_to_shadow(end), | 				   (unsigned long)kasan_mem_to_shadow(end), | ||||||
| 				   pfn_to_nid(virt_to_pfn(start))); | 				   pfn_to_nid(virt_to_pfn(start))); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | @ -42,21 +42,9 @@ | ||||||
| #undef DEBUG | #undef DEBUG | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * BAD_PAGE is the page that is used for page faults when linux |  | ||||||
|  * is out-of-memory. Older versions of linux just did a |  | ||||||
|  * do_exit(), but using this instead means there is less risk |  | ||||||
|  * for a process dying in kernel mode, possibly leaving a inode |  | ||||||
|  * unused etc.. |  | ||||||
|  * |  | ||||||
|  * BAD_PAGETABLE is the accompanying page-table: it is initialized |  | ||||||
|  * to point to BAD_PAGE entries. |  | ||||||
|  * |  | ||||||
|  * ZERO_PAGE is a special page that is used for zero-initialized |  * ZERO_PAGE is a special page that is used for zero-initialized | ||||||
|  * data and COW. |  * data and COW. | ||||||
|  */ |  */ | ||||||
| static unsigned long empty_bad_page_table; |  | ||||||
| static unsigned long empty_bad_page; |  | ||||||
| 
 |  | ||||||
| unsigned long empty_zero_page; | unsigned long empty_zero_page; | ||||||
| EXPORT_SYMBOL(empty_zero_page); | EXPORT_SYMBOL(empty_zero_page); | ||||||
| 
 | 
 | ||||||
|  | @ -72,8 +60,6 @@ void __init paging_init(void) | ||||||
| 	unsigned long zones_size[MAX_NR_ZONES] = {0, }; | 	unsigned long zones_size[MAX_NR_ZONES] = {0, }; | ||||||
| 
 | 
 | ||||||
| 	/* allocate some pages for kernel housekeeping tasks */ | 	/* allocate some pages for kernel housekeeping tasks */ | ||||||
| 	empty_bad_page_table	= (unsigned long) alloc_bootmem_pages(PAGE_SIZE); |  | ||||||
| 	empty_bad_page		= (unsigned long) alloc_bootmem_pages(PAGE_SIZE); |  | ||||||
| 	empty_zero_page		= (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | 	empty_zero_page		= (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||||||
| 
 | 
 | ||||||
| 	memset((void *) empty_zero_page, 0, PAGE_SIZE); | 	memset((void *) empty_zero_page, 0, PAGE_SIZE); | ||||||
|  |  | ||||||
|  | @ -40,20 +40,9 @@ | ||||||
| #include <asm/sections.h> | #include <asm/sections.h> | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * BAD_PAGE is the page that is used for page faults when linux |  | ||||||
|  * is out-of-memory. Older versions of linux just did a |  | ||||||
|  * do_exit(), but using this instead means there is less risk |  | ||||||
|  * for a process dying in kernel mode, possibly leaving a inode |  | ||||||
|  * unused etc.. |  | ||||||
|  * |  | ||||||
|  * BAD_PAGETABLE is the accompanying page-table: it is initialized |  | ||||||
|  * to point to BAD_PAGE entries. |  | ||||||
|  * |  | ||||||
|  * ZERO_PAGE is a special page that is used for zero-initialized |  * ZERO_PAGE is a special page that is used for zero-initialized | ||||||
|  * data and COW. |  * data and COW. | ||||||
|  */ |  */ | ||||||
| static unsigned long empty_bad_page_table; |  | ||||||
| static unsigned long empty_bad_page; |  | ||||||
| unsigned long empty_zero_page; | unsigned long empty_zero_page; | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | @ -78,8 +67,6 @@ void __init paging_init(void) | ||||||
| 	 * Initialize the bad page table and bad page to point | 	 * Initialize the bad page table and bad page to point | ||||||
| 	 * to a couple of allocated pages. | 	 * to a couple of allocated pages. | ||||||
| 	 */ | 	 */ | ||||||
| 	empty_bad_page_table = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); |  | ||||||
| 	empty_bad_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); |  | ||||||
| 	empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); | 	empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); | ||||||
| 	memset((void *)empty_zero_page, 0, PAGE_SIZE); | 	memset((void *)empty_zero_page, 0, PAGE_SIZE); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -196,8 +196,8 @@ config TIMER_DIVIDE | ||||||
| 	default "128" | 	default "128" | ||||||
| 
 | 
 | ||||||
| config CPU_BIG_ENDIAN | config CPU_BIG_ENDIAN | ||||||
|         bool "Generate big endian code" | 	bool | ||||||
| 	default n | 	default !CPU_LITTLE_ENDIAN | ||||||
| 
 | 
 | ||||||
| config CPU_LITTLE_ENDIAN | config CPU_LITTLE_ENDIAN | ||||||
|         bool "Generate little endian code" |         bool "Generate little endian code" | ||||||
|  |  | ||||||
|  | @ -31,12 +31,7 @@ | ||||||
|  * tables. Each page table is also a single 4K page, giving 512 (== |  * tables. Each page table is also a single 4K page, giving 512 (== | ||||||
|  * PTRS_PER_PTE) 8 byte ptes. Each pud entry is initialized to point to |  * PTRS_PER_PTE) 8 byte ptes. Each pud entry is initialized to point to | ||||||
|  * invalid_pmd_table, each pmd entry is initialized to point to |  * invalid_pmd_table, each pmd entry is initialized to point to | ||||||
|  * invalid_pte_table, each pte is initialized to 0. When memory is low, |  * invalid_pte_table, each pte is initialized to 0. | ||||||
|  * and a pmd table or a page table allocation fails, empty_bad_pmd_table |  | ||||||
|  * and empty_bad_page_table is returned back to higher layer code, so |  | ||||||
|  * that the failure is recognized later on. Linux does not seem to |  | ||||||
|  * handle these failures very well though. The empty_bad_page_table has |  | ||||||
|  * invalid pte entries in it, to force page faults. |  | ||||||
|  * |  * | ||||||
|  * Kernel mappings: kernel mappings are held in the swapper_pg_table. |  * Kernel mappings: kernel mappings are held in the swapper_pg_table. | ||||||
|  * The layout is identical to userspace except it's indexed with the |  * The layout is identical to userspace except it's indexed with the | ||||||
|  | @ -175,7 +170,6 @@ | ||||||
| 	printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) | 	printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) | ||||||
| 
 | 
 | ||||||
| extern pte_t invalid_pte_table[PTRS_PER_PTE]; | extern pte_t invalid_pte_table[PTRS_PER_PTE]; | ||||||
| extern pte_t empty_bad_page_table[PTRS_PER_PTE]; |  | ||||||
| 
 | 
 | ||||||
| #ifndef __PAGETABLE_PUD_FOLDED | #ifndef __PAGETABLE_PUD_FOLDED | ||||||
| /*
 | /*
 | ||||||
|  |  | ||||||
|  | @ -433,14 +433,6 @@ ENTRY(swapper_pg_dir) | ||||||
| ENTRY(empty_zero_page) | ENTRY(empty_zero_page) | ||||||
| 	.space PAGE_SIZE
 | 	.space PAGE_SIZE
 | ||||||
| 
 | 
 | ||||||
| 	.balign PAGE_SIZE
 |  | ||||||
| ENTRY(empty_bad_page) |  | ||||||
| 	.space PAGE_SIZE
 |  | ||||||
| 
 |  | ||||||
| 	.balign PAGE_SIZE
 |  | ||||||
| ENTRY(empty_bad_pte_table) |  | ||||||
| 	.space PAGE_SIZE
 |  | ||||||
| 
 |  | ||||||
| 	.balign PAGE_SIZE
 | 	.balign PAGE_SIZE
 | ||||||
| ENTRY(large_page_table) | ENTRY(large_page_table) | ||||||
| 	.space PAGE_SIZE
 | 	.space PAGE_SIZE
 | ||||||
|  |  | ||||||
|  | @ -23,7 +23,6 @@ | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include <linux/dma-debug.h> | #include <linux/dma-debug.h> | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/dma-mapping.h> | #include <linux/dma-mapping.h> | ||||||
| 
 | 
 | ||||||
| extern const struct dma_map_ops or1k_dma_map_ops; | extern const struct dma_map_ops or1k_dma_map_ops; | ||||||
|  |  | ||||||
|  | @ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) | ||||||
| } | } | ||||||
| #endif /* MODULE */ | #endif /* MODULE */ | ||||||
| 
 | 
 | ||||||
| #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) | #define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_PPC_BOOK3S | #ifdef CONFIG_PPC_BOOK3S | ||||||
| #include <asm/book3s/pgalloc.h> | #include <asm/book3s/pgalloc.h> | ||||||
|  |  | ||||||
|  | @ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | ||||||
| 	pud = pud_offset(pgd, start); | 	pud = pud_offset(pgd, start); | ||||||
| 	pgd_clear(pgd); | 	pgd_clear(pgd); | ||||||
| 	pud_free_tlb(tlb, pud, start); | 	pud_free_tlb(tlb, pud, start); | ||||||
|  | 	mm_dec_nr_puds(tlb->mm); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  |  | ||||||
|  | @ -200,7 +200,7 @@ static void destroy_pagetable_page(struct mm_struct *mm) | ||||||
| 	/* We allow PTE_FRAG_NR fragments from a PTE page */ | 	/* We allow PTE_FRAG_NR fragments from a PTE page */ | ||||||
| 	if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { | 	if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { | ||||||
| 		pgtable_page_dtor(page); | 		pgtable_page_dtor(page); | ||||||
| 		free_hot_cold_page(page, 0); | 		free_unref_page(page); | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -404,7 +404,7 @@ void pte_fragment_free(unsigned long *table, int kernel) | ||||||
| 	if (put_page_testzero(page)) { | 	if (put_page_testzero(page)) { | ||||||
| 		if (!kernel) | 		if (!kernel) | ||||||
| 			pgtable_page_dtor(page); | 			pgtable_page_dtor(page); | ||||||
| 		free_hot_cold_page(page, 0); | 		free_unref_page(page); | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk, | ||||||
| 		mm->context.asce_limit = STACK_TOP_MAX; | 		mm->context.asce_limit = STACK_TOP_MAX; | ||||||
| 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | | 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | | ||||||
| 				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3; | 				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3; | ||||||
|  | 		/* pgd_alloc() did not account this pud */ | ||||||
|  | 		mm_inc_nr_puds(mm); | ||||||
| 		break; | 		break; | ||||||
| 	case -PAGE_SIZE: | 	case -PAGE_SIZE: | ||||||
| 		/* forked 5-level task, set new asce with new_mm->pgd */ | 		/* forked 5-level task, set new asce with new_mm->pgd */ | ||||||
|  | @ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk, | ||||||
| 		/* forked 2-level compat task, set new asce with new mm->pgd */ | 		/* forked 2-level compat task, set new asce with new mm->pgd */ | ||||||
| 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | | 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | | ||||||
| 				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; | 				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; | ||||||
| 		/* pgd_alloc() did not increase mm->nr_pmds */ | 		/* pgd_alloc() did not account this pmd */ | ||||||
| 		mm_inc_nr_pmds(mm); | 		mm_inc_nr_pmds(mm); | ||||||
| 	} | 	} | ||||||
| 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); | 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); | ||||||
|  |  | ||||||
|  | @ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void) | ||||||
| 
 | 
 | ||||||
| 	dwarf_frame_cachep = kmem_cache_create("dwarf_frames", | 	dwarf_frame_cachep = kmem_cache_create("dwarf_frames", | ||||||
| 			sizeof(struct dwarf_frame), 0, | 			sizeof(struct dwarf_frame), 0, | ||||||
| 			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); | 			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); | ||||||
| 
 | 
 | ||||||
| 	dwarf_reg_cachep = kmem_cache_create("dwarf_regs", | 	dwarf_reg_cachep = kmem_cache_create("dwarf_regs", | ||||||
| 			sizeof(struct dwarf_reg), 0, | 			sizeof(struct dwarf_reg), 0, | ||||||
| 			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); | 			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); | ||||||
| 
 | 
 | ||||||
| 	dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, | 	dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, | ||||||
| 						    dwarf_frame_cachep); | 						    dwarf_frame_cachep); | ||||||
|  |  | ||||||
|  | @ -101,14 +101,6 @@ empty_zero_page: | ||||||
| mmu_pdtp_cache: | mmu_pdtp_cache: | ||||||
| 	.space PAGE_SIZE, 0 | 	.space PAGE_SIZE, 0 | ||||||
| 
 | 
 | ||||||
| 	.global empty_bad_page
 |  | ||||||
| empty_bad_page: |  | ||||||
| 	.space PAGE_SIZE, 0 |  | ||||||
| 
 |  | ||||||
| 	.global empty_bad_pte_table
 |  | ||||||
| empty_bad_pte_table: |  | ||||||
| 	.space PAGE_SIZE, 0 |  | ||||||
| 
 |  | ||||||
| 	.global	fpu_in_use
 | 	.global	fpu_in_use
 | ||||||
| fpu_in_use:	.quad	0 | fpu_in_use:	.quad	0 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -59,7 +59,7 @@ void arch_task_cache_init(void) | ||||||
| 
 | 
 | ||||||
| 	task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, | 	task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, | ||||||
| 					       __alignof__(union thread_xstate), | 					       __alignof__(union thread_xstate), | ||||||
| 					       SLAB_PANIC | SLAB_NOTRACK, NULL); | 					       SLAB_PANIC, NULL); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SH_FPU_EMU | #ifdef CONFIG_SH_FPU_EMU | ||||||
|  |  | ||||||
|  | @ -231,6 +231,36 @@ extern unsigned long _PAGE_ALL_SZ_BITS; | ||||||
| extern struct page *mem_map_zero; | extern struct page *mem_map_zero; | ||||||
| #define ZERO_PAGE(vaddr)	(mem_map_zero) | #define ZERO_PAGE(vaddr)	(mem_map_zero) | ||||||
| 
 | 
 | ||||||
|  | /* This macro must be updated when the size of struct page grows above 80
 | ||||||
|  |  * or reduces below 64. | ||||||
|  |  * The idea is that the compiler optimizes out the switch() statement and | ||||||
|  |  * only leaves clrx instructions. | ||||||
|  |  */ | ||||||
|  | #define	mm_zero_struct_page(pp) do {					\ | ||||||
|  | 	unsigned long *_pp = (void *)(pp);				\ | ||||||
|  | 									\ | ||||||
|  | 	 /* Check that struct page is either 64, 72, or 80 bytes */	\ | ||||||
|  | 	BUILD_BUG_ON(sizeof(struct page) & 7);				\ | ||||||
|  | 	BUILD_BUG_ON(sizeof(struct page) < 64);				\ | ||||||
|  | 	BUILD_BUG_ON(sizeof(struct page) > 80);				\ | ||||||
|  | 									\ | ||||||
|  | 	switch (sizeof(struct page)) {					\ | ||||||
|  | 	case 80:							\ | ||||||
|  | 		_pp[9] = 0;	/* fallthrough */			\ | ||||||
|  | 	case 72:							\ | ||||||
|  | 		_pp[8] = 0;	/* fallthrough */			\ | ||||||
|  | 	default:							\ | ||||||
|  | 		_pp[7] = 0;						\ | ||||||
|  | 		_pp[6] = 0;						\ | ||||||
|  | 		_pp[5] = 0;						\ | ||||||
|  | 		_pp[4] = 0;						\ | ||||||
|  | 		_pp[3] = 0;						\ | ||||||
|  | 		_pp[2] = 0;						\ | ||||||
|  | 		_pp[1] = 0;						\ | ||||||
|  | 		_pp[0] = 0;						\ | ||||||
|  | 	}								\ | ||||||
|  | } while (0) | ||||||
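As a worked illustration (not from the patch), with an 80-byte struct page the switch above falls through so that all ten 8-byte words are cleared by straight stores, which the comment relies on the compiler emitting as clrx instructions:

	/* Illustrative expansion only, assuming sizeof(struct page) == 80
	 * and pp pointing at the struct page to clear. */
	unsigned long *_pp = (void *)(pp);

	_pp[9] = 0;	/* case 80 */
	_pp[8] = 0;	/* fallthrough into case 72 */
	_pp[7] = 0;	/* default: the remaining 64 bytes */
	_pp[6] = 0;
	_pp[5] = 0;
	_pp[4] = 0;
	_pp[3] = 0;
	_pp[2] = 0;
	_pp[1] = 0;
	_pp[0] = 0;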
|  | 
 | ||||||
| /* PFNs are real physical page numbers.  However, mem_map only begins to record
 | /* PFNs are real physical page numbers.  However, mem_map only begins to record
 | ||||||
|  * per-page information starting at pfn_base.  This is to handle systems where |  * per-page information starting at pfn_base.  This is to handle systems where | ||||||
|  * the first physical page in the machine is at some huge physical address, |  * the first physical page in the machine is at some huge physical address, | ||||||
|  |  | ||||||
|  | @ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | ||||||
| 
 | 
 | ||||||
| 	pmd_clear(pmd); | 	pmd_clear(pmd); | ||||||
| 	pte_free_tlb(tlb, token, addr); | 	pte_free_tlb(tlb, token, addr); | ||||||
| 	atomic_long_dec(&tlb->mm->nr_ptes); | 	mm_dec_nr_ptes(tlb->mm); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | ||||||
|  | @ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | ||||||
| 	pud = pud_offset(pgd, start); | 	pud = pud_offset(pgd, start); | ||||||
| 	pgd_clear(pgd); | 	pgd_clear(pgd); | ||||||
| 	pud_free_tlb(tlb, pud, start); | 	pud_free_tlb(tlb, pud, start); | ||||||
|  | 	mm_dec_nr_puds(tlb->mm); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void hugetlb_free_pgd_range(struct mmu_gather *tlb, | void hugetlb_free_pgd_range(struct mmu_gather *tlb, | ||||||
|  |  | ||||||
|  | @ -2540,9 +2540,16 @@ void __init mem_init(void) | ||||||
| { | { | ||||||
| 	high_memory = __va(last_valid_pfn << PAGE_SHIFT); | 	high_memory = __va(last_valid_pfn << PAGE_SHIFT); | ||||||
| 
 | 
 | ||||||
| 	register_page_bootmem_info(); |  | ||||||
| 	free_all_bootmem(); | 	free_all_bootmem(); | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Must be done after boot memory is put on freelist, because here we | ||||||
|  | 	 * might set fields in deferred struct pages that have not yet been | ||||||
|  | 	 * initialized, and free_all_bootmem() initializes all the reserved | ||||||
|  | 	 * deferred pages for us. | ||||||
|  | 	 */ | ||||||
|  | 	register_page_bootmem_info(); | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Set up the zero page, mark it reserved, so that page count | 	 * Set up the zero page, mark it reserved, so that page count | ||||||
| 	 * is not manipulated when freeing the page from user ptes. | 	 * is not manipulated when freeing the page from user ptes. | ||||||
|  | @ -2637,30 +2644,19 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, | ||||||
| 	vstart = vstart & PMD_MASK; | 	vstart = vstart & PMD_MASK; | ||||||
| 	vend = ALIGN(vend, PMD_SIZE); | 	vend = ALIGN(vend, PMD_SIZE); | ||||||
| 	for (; vstart < vend; vstart += PMD_SIZE) { | 	for (; vstart < vend; vstart += PMD_SIZE) { | ||||||
| 		pgd_t *pgd = pgd_offset_k(vstart); | 		pgd_t *pgd = vmemmap_pgd_populate(vstart, node); | ||||||
| 		unsigned long pte; | 		unsigned long pte; | ||||||
| 		pud_t *pud; | 		pud_t *pud; | ||||||
| 		pmd_t *pmd; | 		pmd_t *pmd; | ||||||
| 
 | 
 | ||||||
| 		if (pgd_none(*pgd)) { | 		if (!pgd) | ||||||
| 			pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node); |  | ||||||
| 
 |  | ||||||
| 			if (!new) |  | ||||||
| 			return -ENOMEM; | 			return -ENOMEM; | ||||||
| 			pgd_populate(&init_mm, pgd, new); |  | ||||||
| 		} |  | ||||||
| 
 | 
 | ||||||
| 		pud = pud_offset(pgd, vstart); | 		pud = vmemmap_pud_populate(pgd, vstart, node); | ||||||
| 		if (pud_none(*pud)) { | 		if (!pud) | ||||||
| 			pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node); |  | ||||||
| 
 |  | ||||||
| 			if (!new) |  | ||||||
| 			return -ENOMEM; | 			return -ENOMEM; | ||||||
| 			pud_populate(&init_mm, pud, new); |  | ||||||
| 		} |  | ||||||
| 
 | 
 | ||||||
| 		pmd = pmd_offset(pud, vstart); | 		pmd = pmd_offset(pud, vstart); | ||||||
| 
 |  | ||||||
| 		pte = pmd_val(*pmd); | 		pte = pmd_val(*pmd); | ||||||
| 		if (!(pte & _PAGE_VALID)) { | 		if (!(pte & _PAGE_VALID)) { | ||||||
| 			void *block = vmemmap_alloc_block(PMD_SIZE, node); | 			void *block = vmemmap_alloc_block(PMD_SIZE, node); | ||||||
|  | @ -2927,7 +2923,7 @@ void __flush_tlb_all(void) | ||||||
| pte_t *pte_alloc_one_kernel(struct mm_struct *mm, | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, | ||||||
| 			    unsigned long address) | 			    unsigned long address) | ||||||
| { | { | ||||||
| 	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); | 	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||||||
| 	pte_t *pte = NULL; | 	pte_t *pte = NULL; | ||||||
| 
 | 
 | ||||||
| 	if (page) | 	if (page) | ||||||
|  | @ -2939,11 +2935,11 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, | ||||||
| pgtable_t pte_alloc_one(struct mm_struct *mm, | pgtable_t pte_alloc_one(struct mm_struct *mm, | ||||||
| 			unsigned long address) | 			unsigned long address) | ||||||
| { | { | ||||||
| 	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); | 	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||||||
| 	if (!page) | 	if (!page) | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 	if (!pgtable_page_ctor(page)) { | 	if (!pgtable_page_ctor(page)) { | ||||||
| 		free_hot_cold_page(page, 0); | 		free_unref_page(page); | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 	} | 	} | ||||||
| 	return (pte_t *) page_address(page); | 	return (pte_t *) page_address(page); | ||||||
|  |  | ||||||
|  | @ -409,7 +409,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) | ||||||
| 	if (put_page_testzero(page)) { | 	if (put_page_testzero(page)) { | ||||||
| 		homecache_change_page_home(page, order, PAGE_HOME_HASH); | 		homecache_change_page_home(page, order, PAGE_HOME_HASH); | ||||||
| 		if (order == 0) { | 		if (order == 0) { | ||||||
| 			free_hot_cold_page(page, false); | 			free_unref_page(page); | ||||||
| 		} else { | 		} else { | ||||||
| 			init_page_count(page); | 			init_page_count(page); | ||||||
| 			__free_pages(page, order); | 			__free_pages(page, order); | ||||||
|  |  | ||||||
|  | @ -22,8 +22,6 @@ | ||||||
| /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ | /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ | ||||||
| unsigned long *empty_zero_page = NULL; | unsigned long *empty_zero_page = NULL; | ||||||
| EXPORT_SYMBOL(empty_zero_page); | EXPORT_SYMBOL(empty_zero_page); | ||||||
| /* allocated in paging_init and unchanged thereafter */ |  | ||||||
| static unsigned long *empty_bad_page = NULL; |  | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Initialized during boot, and readonly for initializing page tables |  * Initialized during boot, and readonly for initializing page tables | ||||||
|  | @ -146,7 +144,6 @@ void __init paging_init(void) | ||||||
| 	int i; | 	int i; | ||||||
| 
 | 
 | ||||||
| 	empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); | 	empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); | ||||||
| 	empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); |  | ||||||
| 	for (i = 0; i < ARRAY_SIZE(zones_size); i++) | 	for (i = 0; i < ARRAY_SIZE(zones_size); i++) | ||||||
| 		zones_size[i] = 0; | 		zones_size[i] = 0; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); | ||||||
| #define pgd_alloc(mm)			get_pgd_slow(mm) | #define pgd_alloc(mm)			get_pgd_slow(mm) | ||||||
| #define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd) | #define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd) | ||||||
| 
 | 
 | ||||||
| #define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) | #define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO) | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Allocate one PTE table. |  * Allocate one PTE table. | ||||||
|  |  | ||||||
|  | @ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) | ||||||
| 	pte = pmd_pgtable(*pmd); | 	pte = pmd_pgtable(*pmd); | ||||||
| 	pmd_clear(pmd); | 	pmd_clear(pmd); | ||||||
| 	pte_free(mm, pte); | 	pte_free(mm, pte); | ||||||
| 	atomic_long_dec(&mm->nr_ptes); | 	mm_dec_nr_ptes(mm); | ||||||
| 	pmd_free(mm, pmd); | 	pmd_free(mm, pmd); | ||||||
| 	mm_dec_nr_pmds(mm); | 	mm_dec_nr_pmds(mm); | ||||||
| free: | free: | ||||||
|  |  | ||||||
|  | @ -110,9 +110,8 @@ config X86 | ||||||
| 	select HAVE_ARCH_AUDITSYSCALL | 	select HAVE_ARCH_AUDITSYSCALL | ||||||
| 	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE | 	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE | ||||||
| 	select HAVE_ARCH_JUMP_LABEL | 	select HAVE_ARCH_JUMP_LABEL | ||||||
| 	select HAVE_ARCH_KASAN			if X86_64 && SPARSEMEM_VMEMMAP | 	select HAVE_ARCH_KASAN			if X86_64 | ||||||
| 	select HAVE_ARCH_KGDB | 	select HAVE_ARCH_KGDB | ||||||
| 	select HAVE_ARCH_KMEMCHECK |  | ||||||
| 	select HAVE_ARCH_MMAP_RND_BITS		if MMU | 	select HAVE_ARCH_MMAP_RND_BITS		if MMU | ||||||
| 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT | 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT | ||||||
| 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT | 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT | ||||||
|  | @ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT | ||||||
| 
 | 
 | ||||||
| config X86_DIRECT_GBPAGES | config X86_DIRECT_GBPAGES | ||||||
| 	def_bool y | 	def_bool y | ||||||
| 	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK | 	depends on X86_64 && !DEBUG_PAGEALLOC | ||||||
| 	---help--- | 	---help--- | ||||||
| 	  Certain kernel features effectively disable kernel | 	  Certain kernel features effectively disable kernel | ||||||
| 	  linear 1 GB mappings (even if the CPU otherwise | 	  linear 1 GB mappings (even if the CPU otherwise | ||||||
|  |  | ||||||
|  | @ -158,11 +158,6 @@ ifdef CONFIG_X86_X32 | ||||||
| endif | endif | ||||||
| export CONFIG_X86_X32_ABI | export CONFIG_X86_X32_ABI | ||||||
| 
 | 
 | ||||||
| # Don't unroll struct assignments with kmemcheck enabled
 |  | ||||||
| ifeq ($(CONFIG_KMEMCHECK),y) |  | ||||||
| 	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) |  | ||||||
| endif |  | ||||||
| 
 |  | ||||||
| #
 | #
 | ||||||
| # If the function graph tracer is used with mcount instead of fentry,
 | # If the function graph tracer is used with mcount instead of fentry,
 | ||||||
| # '-maccumulate-outgoing-args' is needed to prevent a GCC bug
 | # '-maccumulate-outgoing-args' is needed to prevent a GCC bug
 | ||||||
|  |  | ||||||
|  | @ -7,7 +7,6 @@ | ||||||
|  * Documentation/DMA-API.txt for documentation. |  * Documentation/DMA-API.txt for documentation. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/scatterlist.h> | #include <linux/scatterlist.h> | ||||||
| #include <linux/dma-debug.h> | #include <linux/dma-debug.h> | ||||||
| #include <asm/io.h> | #include <asm/io.h> | ||||||
|  |  | ||||||
|  | @ -1,43 +1 @@ | ||||||
| /* SPDX-License-Identifier: GPL-2.0 */ | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
| #ifndef ASM_X86_KMEMCHECK_H |  | ||||||
| #define ASM_X86_KMEMCHECK_H |  | ||||||
| 
 |  | ||||||
| #include <linux/types.h> |  | ||||||
| #include <asm/ptrace.h> |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_KMEMCHECK |  | ||||||
| bool kmemcheck_active(struct pt_regs *regs); |  | ||||||
| 
 |  | ||||||
| void kmemcheck_show(struct pt_regs *regs); |  | ||||||
| void kmemcheck_hide(struct pt_regs *regs); |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_fault(struct pt_regs *regs, |  | ||||||
| 	unsigned long address, unsigned long error_code); |  | ||||||
| bool kmemcheck_trap(struct pt_regs *regs); |  | ||||||
| #else |  | ||||||
| static inline bool kmemcheck_active(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	return false; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static inline void kmemcheck_show(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static inline void kmemcheck_hide(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static inline bool kmemcheck_fault(struct pt_regs *regs, |  | ||||||
| 	unsigned long address, unsigned long error_code) |  | ||||||
| { |  | ||||||
| 	return false; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static inline bool kmemcheck_trap(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	return false; |  | ||||||
| } |  | ||||||
| #endif /* CONFIG_KMEMCHECK */ |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | @ -667,11 +667,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) | ||||||
| 	return false; | 	return false; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline int pte_hidden(pte_t pte) |  | ||||||
| { |  | ||||||
| 	return pte_flags(pte) & _PAGE_HIDDEN; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static inline int pmd_present(pmd_t pmd) | static inline int pmd_present(pmd_t pmd) | ||||||
| { | { | ||||||
| 	/*
 | 	/*
 | ||||||
|  |  | ||||||
|  | @ -32,7 +32,6 @@ | ||||||
| 
 | 
 | ||||||
| #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1 | #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1 | ||||||
| #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1 | #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1 | ||||||
| #define _PAGE_BIT_HIDDEN	_PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ |  | ||||||
| #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */ | #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */ | ||||||
| #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4 | #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4 | ||||||
| 
 | 
 | ||||||
|  | @ -79,18 +78,6 @@ | ||||||
| #define _PAGE_KNL_ERRATUM_MASK 0 | #define _PAGE_KNL_ERRATUM_MASK 0 | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_KMEMCHECK |  | ||||||
| #define _PAGE_HIDDEN	(_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) |  | ||||||
| #else |  | ||||||
| #define _PAGE_HIDDEN	(_AT(pteval_t, 0)) |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * The same hidden bit is used by kmemcheck, but since kmemcheck |  | ||||||
|  * works on kernel pages while soft-dirty engine on user space, |  | ||||||
|  * they do not conflict with each other. |  | ||||||
|  */ |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_MEM_SOFT_DIRTY | #ifdef CONFIG_MEM_SOFT_DIRTY | ||||||
| #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) | #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) | ||||||
| #else | #else | ||||||
|  |  | ||||||
|  | @ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) | ||||||
|  *	No 3D Now! |  *	No 3D Now! | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #ifndef CONFIG_KMEMCHECK |  | ||||||
| 
 |  | ||||||
| #if (__GNUC__ >= 4) | #if (__GNUC__ >= 4) | ||||||
| #define memcpy(t, f, n) __builtin_memcpy(t, f, n) | #define memcpy(t, f, n) __builtin_memcpy(t, f, n) | ||||||
| #else | #else | ||||||
|  | @ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) | ||||||
| 	 ? __constant_memcpy((t), (f), (n))	\ | 	 ? __constant_memcpy((t), (f), (n))	\ | ||||||
| 	 : __memcpy((t), (f), (n))) | 	 : __memcpy((t), (f), (n))) | ||||||
| #endif | #endif | ||||||
| #else |  | ||||||
| /*
 |  | ||||||
|  * kmemcheck becomes very happy if we use the REP instructions unconditionally, |  | ||||||
|  * because it means that we know both memory operands in advance. |  | ||||||
|  */ |  | ||||||
| #define memcpy(t, f, n) __memcpy((t), (f), (n)) |  | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
| #endif | #endif | ||||||
| #endif /* !CONFIG_FORTIFY_SOURCE */ | #endif /* !CONFIG_FORTIFY_SOURCE */ | ||||||
|  |  | ||||||
|  | @ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len); | ||||||
| extern void *__memcpy(void *to, const void *from, size_t len); | extern void *__memcpy(void *to, const void *from, size_t len); | ||||||
| 
 | 
 | ||||||
| #ifndef CONFIG_FORTIFY_SOURCE | #ifndef CONFIG_FORTIFY_SOURCE | ||||||
| #ifndef CONFIG_KMEMCHECK |  | ||||||
| #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 | #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 | ||||||
| #define memcpy(dst, src, len)					\ | #define memcpy(dst, src, len)					\ | ||||||
| ({								\ | ({								\ | ||||||
|  | @ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len); | ||||||
| 	__ret;							\ | 	__ret;							\ | ||||||
| }) | }) | ||||||
| #endif | #endif | ||||||
| #else |  | ||||||
| /*
 |  | ||||||
|  * kmemcheck becomes very happy if we use the REP instructions unconditionally, |  | ||||||
|  * because it means that we know both memory operands in advance. |  | ||||||
|  */ |  | ||||||
| #define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) |  | ||||||
| #endif |  | ||||||
| #endif /* !CONFIG_FORTIFY_SOURCE */ | #endif /* !CONFIG_FORTIFY_SOURCE */ | ||||||
| 
 | 
 | ||||||
| #define __HAVE_ARCH_MEMSET | #define __HAVE_ARCH_MEMSET | ||||||
|  |  | ||||||
|  | @ -1,7 +1,4 @@ | ||||||
| #ifdef CONFIG_KMEMCHECK | #ifndef _ASM_X86_XOR_H | ||||||
| /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ |  | ||||||
| # include <asm-generic/xor.h> |  | ||||||
| #elif !defined(_ASM_X86_XOR_H) |  | ||||||
| #define _ASM_X86_XOR_H | #define _ASM_X86_XOR_H | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  |  | ||||||
|  | @ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) | ||||||
| 	if (c->x86 == 6 && c->x86_model < 15) | 	if (c->x86 == 6 && c->x86_model < 15) | ||||||
| 		clear_cpu_cap(c, X86_FEATURE_PAT); | 		clear_cpu_cap(c, X86_FEATURE_PAT); | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_KMEMCHECK |  | ||||||
| 	/*
 |  | ||||||
| 	 * P4s have a "fast strings" feature which causes single- |  | ||||||
| 	 * stepping REP instructions to only generate a #DB on |  | ||||||
| 	 * cache-line boundaries. |  | ||||||
| 	 * |  | ||||||
| 	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon |  | ||||||
| 	 * (model 2) with the same problem. |  | ||||||
| 	 */ |  | ||||||
| 	if (c->x86 == 15) |  | ||||||
| 		if (msr_clear_bit(MSR_IA32_MISC_ENABLE, |  | ||||||
| 				  MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) |  | ||||||
| 			pr_info("kmemcheck: Disabling fast string operations\n"); |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason, | 	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason, | ||||||
| 	 * clear the fast string and enhanced fast string CPU capabilities. | 	 * clear the fast string and enhanced fast string CPU capabilities. | ||||||
|  |  | ||||||
|  | @ -57,7 +57,7 @@ | ||||||
| # error "Need more virtual address space for the ESPFIX hack" | # error "Need more virtual address space for the ESPFIX hack" | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) | #define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) | ||||||
| 
 | 
 | ||||||
| /* This contains the *bottom* address of the espfix stack */ | /* This contains the *bottom* address of the espfix stack */ | ||||||
| DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); | DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); | ||||||
|  |  | ||||||
|  | @ -42,7 +42,6 @@ | ||||||
| #include <linux/edac.h> | #include <linux/edac.h> | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #include <asm/kmemcheck.h> |  | ||||||
| #include <asm/stacktrace.h> | #include <asm/stacktrace.h> | ||||||
| #include <asm/processor.h> | #include <asm/processor.h> | ||||||
| #include <asm/debugreg.h> | #include <asm/debugreg.h> | ||||||
|  | @ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | ||||||
| 	if (!dr6 && user_mode(regs)) | 	if (!dr6 && user_mode(regs)) | ||||||
| 		user_icebp = 1; | 		user_icebp = 1; | ||||||
| 
 | 
 | ||||||
| 	/* Catch kmemcheck conditions! */ |  | ||||||
| 	if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) |  | ||||||
| 		goto exit; |  | ||||||
| 
 |  | ||||||
| 	/* Store the virtualized DR6 value */ | 	/* Store the virtualized DR6 value */ | ||||||
| 	tsk->thread.debugreg6 = dr6; | 	tsk->thread.debugreg6 = dr6; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP)	+= debug_pagetables.o | ||||||
| 
 | 
 | ||||||
| obj-$(CONFIG_HIGHMEM)		+= highmem_32.o | obj-$(CONFIG_HIGHMEM)		+= highmem_32.o | ||||||
| 
 | 
 | ||||||
| obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/ |  | ||||||
| 
 |  | ||||||
| KASAN_SANITIZE_kasan_init_$(BITS).o := n | KASAN_SANITIZE_kasan_init_$(BITS).o := n | ||||||
| obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o | obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -20,7 +20,6 @@ | ||||||
| #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/ | #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/ | ||||||
| #include <asm/traps.h>			/* dotraplinkage, ...		*/ | #include <asm/traps.h>			/* dotraplinkage, ...		*/ | ||||||
| #include <asm/pgalloc.h>		/* pgd_*(), ...			*/ | #include <asm/pgalloc.h>		/* pgd_*(), ...			*/ | ||||||
| #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/ |  | ||||||
| #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/ | #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/ | ||||||
| #include <asm/vsyscall.h>		/* emulate_vsyscall		*/ | #include <asm/vsyscall.h>		/* emulate_vsyscall		*/ | ||||||
| #include <asm/vm86.h>			/* struct vm86			*/ | #include <asm/vm86.h>			/* struct vm86			*/ | ||||||
|  | @ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, | ||||||
| 	 * Detect and handle instructions that would cause a page fault for | 	 * Detect and handle instructions that would cause a page fault for | ||||||
| 	 * both a tracked kernel page and a userspace page. | 	 * both a tracked kernel page and a userspace page. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (kmemcheck_active(regs)) |  | ||||||
| 		kmemcheck_hide(regs); |  | ||||||
| 	prefetchw(&mm->mmap_sem); | 	prefetchw(&mm->mmap_sem); | ||||||
| 
 | 
 | ||||||
| 	if (unlikely(kmmio_fault(regs, address))) | 	if (unlikely(kmmio_fault(regs, address))) | ||||||
|  | @ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, | ||||||
| 		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { | 		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { | ||||||
| 			if (vmalloc_fault(address) >= 0) | 			if (vmalloc_fault(address) >= 0) | ||||||
| 				return; | 				return; | ||||||
| 
 |  | ||||||
| 			if (kmemcheck_fault(regs, address, error_code)) |  | ||||||
| 				return; |  | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		/* Can handle a stale RO->RW TLB: */ | 		/* Can handle a stale RO->RW TLB: */ | ||||||
|  |  | ||||||
|  | @ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num) | ||||||
| 		unsigned int order; | 		unsigned int order; | ||||||
| 
 | 
 | ||||||
| 		order = get_order((unsigned long)num << PAGE_SHIFT); | 		order = get_order((unsigned long)num << PAGE_SHIFT); | ||||||
| 		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | | 		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); | ||||||
| 						__GFP_ZERO, order); |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { | 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { | ||||||
|  | @ -164,12 +163,11 @@ static int page_size_mask; | ||||||
| static void __init probe_page_size_mask(void) | static void __init probe_page_size_mask(void) | ||||||
| { | { | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will | 	 * For pagealloc debugging, identity mapping will use small pages. | ||||||
| 	 * use small pages. |  | ||||||
| 	 * This will simplify cpa(), which otherwise needs to support splitting | 	 * This will simplify cpa(), which otherwise needs to support splitting | ||||||
| 	 * large pages into small in interrupt context, etc. | 	 * large pages into small in interrupt context, etc. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK)) | 	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) | ||||||
| 		page_size_mask |= 1 << PG_LEVEL_2M; | 		page_size_mask |= 1 << PG_LEVEL_2M; | ||||||
| 	else | 	else | ||||||
| 		direct_gbpages = 0; | 		direct_gbpages = 0; | ||||||
|  |  | ||||||
|  | @ -184,7 +184,7 @@ static __ref void *spp_getpage(void) | ||||||
| 	void *ptr; | 	void *ptr; | ||||||
| 
 | 
 | ||||||
| 	if (after_bootmem) | 	if (after_bootmem) | ||||||
| 		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); | 		ptr = (void *) get_zeroed_page(GFP_ATOMIC); | ||||||
| 	else | 	else | ||||||
| 		ptr = alloc_bootmem_pages(PAGE_SIZE); | 		ptr = alloc_bootmem_pages(PAGE_SIZE); | ||||||
| 
 | 
 | ||||||
|  | @ -1173,12 +1173,18 @@ void __init mem_init(void) | ||||||
| 
 | 
 | ||||||
| 	/* clear_bss() already clear the empty_zero_page */ | 	/* clear_bss() already clear the empty_zero_page */ | ||||||
| 
 | 
 | ||||||
| 	register_page_bootmem_info(); |  | ||||||
| 
 |  | ||||||
| 	/* this will put all memory onto the freelists */ | 	/* this will put all memory onto the freelists */ | ||||||
| 	free_all_bootmem(); | 	free_all_bootmem(); | ||||||
| 	after_bootmem = 1; | 	after_bootmem = 1; | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Must be done after boot memory is put on freelist, because here we | ||||||
|  | 	 * might set fields in deferred struct pages that have not yet been | ||||||
|  | 	 * initialized, and free_all_bootmem() initializes all the reserved | ||||||
|  | 	 * deferred pages for us. | ||||||
|  | 	 */ | ||||||
|  | 	register_page_bootmem_info(); | ||||||
|  | 
 | ||||||
| 	/* Register memory areas for /proc/kcore */ | 	/* Register memory areas for /proc/kcore */ | ||||||
| 	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, | 	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, | ||||||
| 			 PAGE_SIZE, KCORE_OTHER); | 			 PAGE_SIZE, KCORE_OTHER); | ||||||
|  | @ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, | ||||||
| 			vmemmap_verify((pte_t *)pmd, node, addr, next); | 			vmemmap_verify((pte_t *)pmd, node, addr, next); | ||||||
| 			continue; | 			continue; | ||||||
| 		} | 		} | ||||||
| 		pr_warn_once("vmemmap: falling back to regular page backing\n"); |  | ||||||
| 		if (vmemmap_populate_basepages(addr, next, node)) | 		if (vmemmap_populate_basepages(addr, next, node)) | ||||||
| 			return -ENOMEM; | 			return -ENOMEM; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | @ -4,12 +4,14 @@ | ||||||
| #include <linux/bootmem.h> | #include <linux/bootmem.h> | ||||||
| #include <linux/kasan.h> | #include <linux/kasan.h> | ||||||
| #include <linux/kdebug.h> | #include <linux/kdebug.h> | ||||||
|  | #include <linux/memblock.h> | ||||||
| #include <linux/mm.h> | #include <linux/mm.h> | ||||||
| #include <linux/sched.h> | #include <linux/sched.h> | ||||||
| #include <linux/sched/task.h> | #include <linux/sched/task.h> | ||||||
| #include <linux/vmalloc.h> | #include <linux/vmalloc.h> | ||||||
| 
 | 
 | ||||||
| #include <asm/e820/types.h> | #include <asm/e820/types.h> | ||||||
|  | #include <asm/pgalloc.h> | ||||||
| #include <asm/tlbflush.h> | #include <asm/tlbflush.h> | ||||||
| #include <asm/sections.h> | #include <asm/sections.h> | ||||||
| #include <asm/pgtable.h> | #include <asm/pgtable.h> | ||||||
|  | @ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; | ||||||
| 
 | 
 | ||||||
| static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); | static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); | ||||||
| 
 | 
 | ||||||
| static int __init map_range(struct range *range) | static __init void *early_alloc(size_t size, int nid) | ||||||
|  | { | ||||||
|  | 	return memblock_virt_alloc_try_nid_nopanic(size, size, | ||||||
|  | 		__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, | ||||||
|  | 				      unsigned long end, int nid) | ||||||
|  | { | ||||||
|  | 	pte_t *pte; | ||||||
|  | 
 | ||||||
|  | 	if (pmd_none(*pmd)) { | ||||||
|  | 		void *p; | ||||||
|  | 
 | ||||||
|  | 		if (boot_cpu_has(X86_FEATURE_PSE) && | ||||||
|  | 		    ((end - addr) == PMD_SIZE) && | ||||||
|  | 		    IS_ALIGNED(addr, PMD_SIZE)) { | ||||||
|  | 			p = early_alloc(PMD_SIZE, nid); | ||||||
|  | 			if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) | ||||||
|  | 				return; | ||||||
|  | 			else if (p) | ||||||
|  | 				memblock_free(__pa(p), PMD_SIZE); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		p = early_alloc(PAGE_SIZE, nid); | ||||||
|  | 		pmd_populate_kernel(&init_mm, pmd, p); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	pte = pte_offset_kernel(pmd, addr); | ||||||
|  | 	do { | ||||||
|  | 		pte_t entry; | ||||||
|  | 		void *p; | ||||||
|  | 
 | ||||||
|  | 		if (!pte_none(*pte)) | ||||||
|  | 			continue; | ||||||
|  | 
 | ||||||
|  | 		p = early_alloc(PAGE_SIZE, nid); | ||||||
|  | 		entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); | ||||||
|  | 		set_pte_at(&init_mm, addr, pte, entry); | ||||||
|  | 	} while (pte++, addr += PAGE_SIZE, addr != end); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, | ||||||
|  | 				      unsigned long end, int nid) | ||||||
|  | { | ||||||
|  | 	pmd_t *pmd; | ||||||
|  | 	unsigned long next; | ||||||
|  | 
 | ||||||
|  | 	if (pud_none(*pud)) { | ||||||
|  | 		void *p; | ||||||
|  | 
 | ||||||
|  | 		if (boot_cpu_has(X86_FEATURE_GBPAGES) && | ||||||
|  | 		    ((end - addr) == PUD_SIZE) && | ||||||
|  | 		    IS_ALIGNED(addr, PUD_SIZE)) { | ||||||
|  | 			p = early_alloc(PUD_SIZE, nid); | ||||||
|  | 			if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) | ||||||
|  | 				return; | ||||||
|  | 			else if (p) | ||||||
|  | 				memblock_free(__pa(p), PUD_SIZE); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		p = early_alloc(PAGE_SIZE, nid); | ||||||
|  | 		pud_populate(&init_mm, pud, p); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	pmd = pmd_offset(pud, addr); | ||||||
|  | 	do { | ||||||
|  | 		next = pmd_addr_end(addr, end); | ||||||
|  | 		if (!pmd_large(*pmd)) | ||||||
|  | 			kasan_populate_pmd(pmd, addr, next, nid); | ||||||
|  | 	} while (pmd++, addr = next, addr != end); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, | ||||||
|  | 				      unsigned long end, int nid) | ||||||
|  | { | ||||||
|  | 	pud_t *pud; | ||||||
|  | 	unsigned long next; | ||||||
|  | 
 | ||||||
|  | 	if (p4d_none(*p4d)) { | ||||||
|  | 		void *p = early_alloc(PAGE_SIZE, nid); | ||||||
|  | 
 | ||||||
|  | 		p4d_populate(&init_mm, p4d, p); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	pud = pud_offset(p4d, addr); | ||||||
|  | 	do { | ||||||
|  | 		next = pud_addr_end(addr, end); | ||||||
|  | 		if (!pud_large(*pud)) | ||||||
|  | 			kasan_populate_pud(pud, addr, next, nid); | ||||||
|  | 	} while (pud++, addr = next, addr != end); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, | ||||||
|  | 				      unsigned long end, int nid) | ||||||
|  | { | ||||||
|  | 	void *p; | ||||||
|  | 	p4d_t *p4d; | ||||||
|  | 	unsigned long next; | ||||||
|  | 
 | ||||||
|  | 	if (pgd_none(*pgd)) { | ||||||
|  | 		p = early_alloc(PAGE_SIZE, nid); | ||||||
|  | 		pgd_populate(&init_mm, pgd, p); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	p4d = p4d_offset(pgd, addr); | ||||||
|  | 	do { | ||||||
|  | 		next = p4d_addr_end(addr, end); | ||||||
|  | 		kasan_populate_p4d(p4d, addr, next, nid); | ||||||
|  | 	} while (p4d++, addr = next, addr != end); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, | ||||||
|  | 					 int nid) | ||||||
|  | { | ||||||
|  | 	pgd_t *pgd; | ||||||
|  | 	unsigned long next; | ||||||
|  | 
 | ||||||
|  | 	addr = addr & PAGE_MASK; | ||||||
|  | 	end = round_up(end, PAGE_SIZE); | ||||||
|  | 	pgd = pgd_offset_k(addr); | ||||||
|  | 	do { | ||||||
|  | 		next = pgd_addr_end(addr, end); | ||||||
|  | 		kasan_populate_pgd(pgd, addr, next, nid); | ||||||
|  | 	} while (pgd++, addr = next, addr != end); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __init map_range(struct range *range) | ||||||
| { | { | ||||||
| 	unsigned long start; | 	unsigned long start; | ||||||
| 	unsigned long end; | 	unsigned long end; | ||||||
|  | @ -26,7 +155,7 @@ static int __init map_range(struct range *range) | ||||||
| 	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); | 	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); | ||||||
| 	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); | 	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); | ||||||
| 
 | 
 | ||||||
| 	return vmemmap_populate(start, end, NUMA_NO_NODE); | 	kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void __init clear_pgds(unsigned long start, | static void __init clear_pgds(unsigned long start, | ||||||
|  | @ -189,16 +318,16 @@ void __init kasan_init(void) | ||||||
| 		if (pfn_mapped[i].end == 0) | 		if (pfn_mapped[i].end == 0) | ||||||
| 			break; | 			break; | ||||||
| 
 | 
 | ||||||
| 		if (map_range(&pfn_mapped[i])) | 		map_range(&pfn_mapped[i]); | ||||||
| 			panic("kasan: unable to allocate shadow!"); |  | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
| 	kasan_populate_zero_shadow( | 	kasan_populate_zero_shadow( | ||||||
| 		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), | 		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), | ||||||
| 		kasan_mem_to_shadow((void *)__START_KERNEL_map)); | 		kasan_mem_to_shadow((void *)__START_KERNEL_map)); | ||||||
| 
 | 
 | ||||||
| 	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), | 	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), | ||||||
| 			      (unsigned long)kasan_mem_to_shadow(_end), | 			      (unsigned long)kasan_mem_to_shadow(_end), | ||||||
| 			NUMA_NO_NODE); | 			      early_pfn_to_nid(__pa(_stext))); | ||||||
| 
 | 
 | ||||||
| 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), | 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), | ||||||
| 			(void *)KASAN_SHADOW_END); | 			(void *)KASAN_SHADOW_END); | ||||||
|  |  | ||||||
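The kasan_populate_*() helpers added above replace the earlier vmemmap_populate() call so the shadow for the physical mapping can be allocated on the matching NUMA node and, where the CPU supports PSE/GBPAGES, backed by 2 MB or 1 GB pages. For orientation, a minimal sketch of the address-to-shadow translation this walk is backing with pages; the names and the offset value are only assumed examples, the real offset comes from the kernel configuration.

/* Sketch only: each shadow byte covers 8 bytes of kernel address space. */
#define EXAMPLE_SHADOW_SCALE_SHIFT	3
#define EXAMPLE_SHADOW_OFFSET		0xdffffc0000000000UL	/* assumed, config-dependent */

static inline void *example_mem_to_shadow(const void *addr)
{
	return (void *)(((unsigned long)addr >> EXAMPLE_SHADOW_SCALE_SHIFT)
			+ EXAMPLE_SHADOW_OFFSET);
}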
|  | @ -1 +0,0 @@ | ||||||
| obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o |  | ||||||
|  | @ -1,228 +1 @@ | ||||||
| // SPDX-License-Identifier: GPL-2.0 | // SPDX-License-Identifier: GPL-2.0 | ||||||
| #include <linux/interrupt.h> |  | ||||||
| #include <linux/kdebug.h> |  | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/kernel.h> |  | ||||||
| #include <linux/types.h> |  | ||||||
| #include <linux/ptrace.h> |  | ||||||
| #include <linux/stacktrace.h> |  | ||||||
| #include <linux/string.h> |  | ||||||
| 
 |  | ||||||
| #include "error.h" |  | ||||||
| #include "shadow.h" |  | ||||||
| 
 |  | ||||||
| enum kmemcheck_error_type { |  | ||||||
| 	KMEMCHECK_ERROR_INVALID_ACCESS, |  | ||||||
| 	KMEMCHECK_ERROR_BUG, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| #define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) |  | ||||||
| 
 |  | ||||||
| struct kmemcheck_error { |  | ||||||
| 	enum kmemcheck_error_type type; |  | ||||||
| 
 |  | ||||||
| 	union { |  | ||||||
| 		/* KMEMCHECK_ERROR_INVALID_ACCESS */ |  | ||||||
| 		struct { |  | ||||||
| 			/* Kind of access that caused the error */ |  | ||||||
| 			enum kmemcheck_shadow state; |  | ||||||
| 			/* Address and size of the erroneous read */ |  | ||||||
| 			unsigned long	address; |  | ||||||
| 			unsigned int	size; |  | ||||||
| 		}; |  | ||||||
| 	}; |  | ||||||
| 
 |  | ||||||
| 	struct pt_regs		regs; |  | ||||||
| 	struct stack_trace	trace; |  | ||||||
| 	unsigned long		trace_entries[32]; |  | ||||||
| 
 |  | ||||||
| 	/* We compress it to a char. */ |  | ||||||
| 	unsigned char		shadow_copy[SHADOW_COPY_SIZE]; |  | ||||||
| 	unsigned char		memory_copy[SHADOW_COPY_SIZE]; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Create a ring queue of errors to output. We can't call printk() directly |  | ||||||
|  * from the kmemcheck traps, since this may call the console drivers and |  | ||||||
|  * result in a recursive fault. |  | ||||||
|  */ |  | ||||||
| static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; |  | ||||||
| static unsigned int error_count; |  | ||||||
| static unsigned int error_rd; |  | ||||||
| static unsigned int error_wr; |  | ||||||
| static unsigned int error_missed_count; |  | ||||||
| 
 |  | ||||||
| static struct kmemcheck_error *error_next_wr(void) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_error *e; |  | ||||||
| 
 |  | ||||||
| 	if (error_count == ARRAY_SIZE(error_fifo)) { |  | ||||||
| 		++error_missed_count; |  | ||||||
| 		return NULL; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	e = &error_fifo[error_wr]; |  | ||||||
| 	if (++error_wr == ARRAY_SIZE(error_fifo)) |  | ||||||
| 		error_wr = 0; |  | ||||||
| 	++error_count; |  | ||||||
| 	return e; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static struct kmemcheck_error *error_next_rd(void) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_error *e; |  | ||||||
| 
 |  | ||||||
| 	if (error_count == 0) |  | ||||||
| 		return NULL; |  | ||||||
| 
 |  | ||||||
| 	e = &error_fifo[error_rd]; |  | ||||||
| 	if (++error_rd == ARRAY_SIZE(error_fifo)) |  | ||||||
| 		error_rd = 0; |  | ||||||
| 	--error_count; |  | ||||||
| 	return e; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_error_recall(void) |  | ||||||
| { |  | ||||||
| 	static const char *desc[] = { |  | ||||||
| 		[KMEMCHECK_SHADOW_UNALLOCATED]		= "unallocated", |  | ||||||
| 		[KMEMCHECK_SHADOW_UNINITIALIZED]	= "uninitialized", |  | ||||||
| 		[KMEMCHECK_SHADOW_INITIALIZED]		= "initialized", |  | ||||||
| 		[KMEMCHECK_SHADOW_FREED]		= "freed", |  | ||||||
| 	}; |  | ||||||
| 
 |  | ||||||
| 	static const char short_desc[] = { |  | ||||||
| 		[KMEMCHECK_SHADOW_UNALLOCATED]		= 'a', |  | ||||||
| 		[KMEMCHECK_SHADOW_UNINITIALIZED]	= 'u', |  | ||||||
| 		[KMEMCHECK_SHADOW_INITIALIZED]		= 'i', |  | ||||||
| 		[KMEMCHECK_SHADOW_FREED]		= 'f', |  | ||||||
| 	}; |  | ||||||
| 
 |  | ||||||
| 	struct kmemcheck_error *e; |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	e = error_next_rd(); |  | ||||||
| 	if (!e) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	switch (e->type) { |  | ||||||
| 	case KMEMCHECK_ERROR_INVALID_ACCESS: |  | ||||||
| 		printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", |  | ||||||
| 			8 * e->size, e->state < ARRAY_SIZE(desc) ? |  | ||||||
| 				desc[e->state] : "(invalid shadow state)", |  | ||||||
| 			(void *) e->address); |  | ||||||
| 
 |  | ||||||
| 		printk(KERN_WARNING); |  | ||||||
| 		for (i = 0; i < SHADOW_COPY_SIZE; ++i) |  | ||||||
| 			printk(KERN_CONT "%02x", e->memory_copy[i]); |  | ||||||
| 		printk(KERN_CONT "\n"); |  | ||||||
| 
 |  | ||||||
| 		printk(KERN_WARNING); |  | ||||||
| 		for (i = 0; i < SHADOW_COPY_SIZE; ++i) { |  | ||||||
| 			if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) |  | ||||||
| 				printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); |  | ||||||
| 			else |  | ||||||
| 				printk(KERN_CONT " ?"); |  | ||||||
| 		} |  | ||||||
| 		printk(KERN_CONT "\n"); |  | ||||||
| 		printk(KERN_WARNING "%*c\n", 2 + 2 |  | ||||||
| 			* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); |  | ||||||
| 		break; |  | ||||||
| 	case KMEMCHECK_ERROR_BUG: |  | ||||||
| 		printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); |  | ||||||
| 		break; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	__show_regs(&e->regs, 1); |  | ||||||
| 	print_stack_trace(&e->trace, 0); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void do_wakeup(unsigned long data) |  | ||||||
| { |  | ||||||
| 	while (error_count > 0) |  | ||||||
| 		kmemcheck_error_recall(); |  | ||||||
| 
 |  | ||||||
| 	if (error_missed_count > 0) { |  | ||||||
| 		printk(KERN_WARNING "kmemcheck: Lost %d error reports because " |  | ||||||
| 			"the queue was too small\n", error_missed_count); |  | ||||||
| 		error_missed_count = 0; |  | ||||||
| 	} |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Save the context of an error report. |  | ||||||
|  */ |  | ||||||
| void kmemcheck_error_save(enum kmemcheck_shadow state, |  | ||||||
| 	unsigned long address, unsigned int size, struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	static unsigned long prev_ip; |  | ||||||
| 
 |  | ||||||
| 	struct kmemcheck_error *e; |  | ||||||
| 	void *shadow_copy; |  | ||||||
| 	void *memory_copy; |  | ||||||
| 
 |  | ||||||
| 	/* Don't report several adjacent errors from the same EIP. */ |  | ||||||
| 	if (regs->ip == prev_ip) |  | ||||||
| 		return; |  | ||||||
| 	prev_ip = regs->ip; |  | ||||||
| 
 |  | ||||||
| 	e = error_next_wr(); |  | ||||||
| 	if (!e) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	e->type = KMEMCHECK_ERROR_INVALID_ACCESS; |  | ||||||
| 
 |  | ||||||
| 	e->state = state; |  | ||||||
| 	e->address = address; |  | ||||||
| 	e->size = size; |  | ||||||
| 
 |  | ||||||
| 	/* Save regs */ |  | ||||||
| 	memcpy(&e->regs, regs, sizeof(*regs)); |  | ||||||
| 
 |  | ||||||
| 	/* Save stack trace */ |  | ||||||
| 	e->trace.nr_entries = 0; |  | ||||||
| 	e->trace.entries = e->trace_entries; |  | ||||||
| 	e->trace.max_entries = ARRAY_SIZE(e->trace_entries); |  | ||||||
| 	e->trace.skip = 0; |  | ||||||
| 	save_stack_trace_regs(regs, &e->trace); |  | ||||||
| 
 |  | ||||||
| 	/* Round address down to nearest 16 bytes */ |  | ||||||
| 	shadow_copy = kmemcheck_shadow_lookup(address |  | ||||||
| 		& ~(SHADOW_COPY_SIZE - 1)); |  | ||||||
| 	BUG_ON(!shadow_copy); |  | ||||||
| 
 |  | ||||||
| 	memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); |  | ||||||
| 
 |  | ||||||
| 	kmemcheck_show_addr(address); |  | ||||||
| 	memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); |  | ||||||
| 	memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); |  | ||||||
| 	kmemcheck_hide_addr(address); |  | ||||||
| 
 |  | ||||||
| 	tasklet_hi_schedule_first(&kmemcheck_tasklet); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Save the context of a kmemcheck bug. |  | ||||||
|  */ |  | ||||||
| void kmemcheck_error_save_bug(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_error *e; |  | ||||||
| 
 |  | ||||||
| 	e = error_next_wr(); |  | ||||||
| 	if (!e) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	e->type = KMEMCHECK_ERROR_BUG; |  | ||||||
| 
 |  | ||||||
| 	memcpy(&e->regs, regs, sizeof(*regs)); |  | ||||||
| 
 |  | ||||||
| 	e->trace.nr_entries = 0; |  | ||||||
| 	e->trace.entries = e->trace_entries; |  | ||||||
| 	e->trace.max_entries = ARRAY_SIZE(e->trace_entries); |  | ||||||
| 	e->trace.skip = 1; |  | ||||||
| 	save_stack_trace(&e->trace); |  | ||||||
| 
 |  | ||||||
| 	tasklet_hi_schedule_first(&kmemcheck_tasklet); |  | ||||||
| } |  | ||||||
|  |  | ||||||
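The error.c removed above deferred its reports through a fixed-size ring drained from a tasklet, because calling printk() directly out of the kmemcheck traps could re-enter the console drivers and fault recursively. A stripped-down sketch of that producer/consumer ring, with illustrative names and sizes rather than the kernel's Kconfig-derived ones:

#define REPORT_QUEUE_SIZE 64			/* illustrative capacity */

struct report {
	unsigned long	address;		/* payload trimmed for the sketch */
	unsigned int	size;
};

static struct report fifo[REPORT_QUEUE_SIZE];
static unsigned int count, rd, wr, missed;

/* Producer side: called from the trap path, never prints. */
static struct report *next_wr(void)
{
	struct report *r;

	if (count == REPORT_QUEUE_SIZE) {
		missed++;			/* queue full: count the drop */
		return NULL;
	}
	r = &fifo[wr];
	if (++wr == REPORT_QUEUE_SIZE)
		wr = 0;
	count++;
	return r;
}

/* Consumer side: drained later from a safe context and printed there. */
static struct report *next_rd(void)
{
	struct report *r;

	if (count == 0)
		return NULL;
	r = &fifo[rd];
	if (++rd == REPORT_QUEUE_SIZE)
		rd = 0;
	count--;
	return r;
}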
|  | @ -1,16 +1 @@ | ||||||
| /* SPDX-License-Identifier: GPL-2.0 */ | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
| #ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H |  | ||||||
| #define ARCH__X86__MM__KMEMCHECK__ERROR_H |  | ||||||
| 
 |  | ||||||
| #include <linux/ptrace.h> |  | ||||||
| 
 |  | ||||||
| #include "shadow.h" |  | ||||||
| 
 |  | ||||||
| void kmemcheck_error_save(enum kmemcheck_shadow state, |  | ||||||
| 	unsigned long address, unsigned int size, struct pt_regs *regs); |  | ||||||
| 
 |  | ||||||
| void kmemcheck_error_save_bug(struct pt_regs *regs); |  | ||||||
| 
 |  | ||||||
| void kmemcheck_error_recall(void); |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | @ -1,658 +0,0 @@ | ||||||
| /**
 |  | ||||||
|  * kmemcheck - a heavyweight memory checker for the linux kernel |  | ||||||
|  * Copyright (C) 2007, 2008  Vegard Nossum <vegardno@ifi.uio.no> |  | ||||||
|  * (With a lot of help from Ingo Molnar and Pekka Enberg.) |  | ||||||
|  * |  | ||||||
|  * This program is free software; you can redistribute it and/or modify |  | ||||||
|  * it under the terms of the GNU General Public License (version 2) as |  | ||||||
|  * published by the Free Software Foundation. |  | ||||||
|  */ |  | ||||||
| 
 |  | ||||||
| #include <linux/init.h> |  | ||||||
| #include <linux/interrupt.h> |  | ||||||
| #include <linux/kallsyms.h> |  | ||||||
| #include <linux/kernel.h> |  | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/mm.h> |  | ||||||
| #include <linux/page-flags.h> |  | ||||||
| #include <linux/percpu.h> |  | ||||||
| #include <linux/ptrace.h> |  | ||||||
| #include <linux/string.h> |  | ||||||
| #include <linux/types.h> |  | ||||||
| 
 |  | ||||||
| #include <asm/cacheflush.h> |  | ||||||
| #include <asm/kmemcheck.h> |  | ||||||
| #include <asm/pgtable.h> |  | ||||||
| #include <asm/tlbflush.h> |  | ||||||
| 
 |  | ||||||
| #include "error.h" |  | ||||||
| #include "opcode.h" |  | ||||||
| #include "pte.h" |  | ||||||
| #include "selftest.h" |  | ||||||
| #include "shadow.h" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT |  | ||||||
| #  define KMEMCHECK_ENABLED 0 |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT |  | ||||||
| #  define KMEMCHECK_ENABLED 1 |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT |  | ||||||
| #  define KMEMCHECK_ENABLED 2 |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| int kmemcheck_enabled = KMEMCHECK_ENABLED; |  | ||||||
| 
 |  | ||||||
| int __init kmemcheck_init(void) |  | ||||||
| { |  | ||||||
| #ifdef CONFIG_SMP |  | ||||||
| 	/*
 |  | ||||||
| 	 * Limit SMP to use a single CPU. We rely on the fact that this code |  | ||||||
| 	 * runs before SMP is set up. |  | ||||||
| 	 */ |  | ||||||
| 	if (setup_max_cpus > 1) { |  | ||||||
| 		printk(KERN_INFO |  | ||||||
| 			"kmemcheck: Limiting number of CPUs to 1.\n"); |  | ||||||
| 		setup_max_cpus = 1; |  | ||||||
| 	} |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| 	if (!kmemcheck_selftest()) { |  | ||||||
| 		printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); |  | ||||||
| 		kmemcheck_enabled = 0; |  | ||||||
| 		return -EINVAL; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	printk(KERN_INFO "kmemcheck: Initialized\n"); |  | ||||||
| 	return 0; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| early_initcall(kmemcheck_init); |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * We need to parse the kmemcheck= option before any memory is allocated. |  | ||||||
|  */ |  | ||||||
| static int __init param_kmemcheck(char *str) |  | ||||||
| { |  | ||||||
| 	int val; |  | ||||||
| 	int ret; |  | ||||||
| 
 |  | ||||||
| 	if (!str) |  | ||||||
| 		return -EINVAL; |  | ||||||
| 
 |  | ||||||
| 	ret = kstrtoint(str, 0, &val); |  | ||||||
| 	if (ret) |  | ||||||
| 		return ret; |  | ||||||
| 	kmemcheck_enabled = val; |  | ||||||
| 	return 0; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| early_param("kmemcheck", param_kmemcheck); |  | ||||||
| 
 |  | ||||||
| int kmemcheck_show_addr(unsigned long address) |  | ||||||
| { |  | ||||||
| 	pte_t *pte; |  | ||||||
| 
 |  | ||||||
| 	pte = kmemcheck_pte_lookup(address); |  | ||||||
| 	if (!pte) |  | ||||||
| 		return 0; |  | ||||||
| 
 |  | ||||||
| 	set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); |  | ||||||
| 	__flush_tlb_one(address); |  | ||||||
| 	return 1; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| int kmemcheck_hide_addr(unsigned long address) |  | ||||||
| { |  | ||||||
| 	pte_t *pte; |  | ||||||
| 
 |  | ||||||
| 	pte = kmemcheck_pte_lookup(address); |  | ||||||
| 	if (!pte) |  | ||||||
| 		return 0; |  | ||||||
| 
 |  | ||||||
| 	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); |  | ||||||
| 	__flush_tlb_one(address); |  | ||||||
| 	return 1; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| struct kmemcheck_context { |  | ||||||
| 	bool busy; |  | ||||||
| 	int balance; |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * There can be at most two memory operands to an instruction, but |  | ||||||
| 	 * each address can cross a page boundary -- so we may need up to |  | ||||||
| 	 * four addresses that must be hidden/revealed for each fault. |  | ||||||
| 	 */ |  | ||||||
| 	unsigned long addr[4]; |  | ||||||
| 	unsigned long n_addrs; |  | ||||||
| 	unsigned long flags; |  | ||||||
| 
 |  | ||||||
| 	/* Data size of the instruction that caused a fault. */ |  | ||||||
| 	unsigned int size; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_active(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); |  | ||||||
| 
 |  | ||||||
| 	return data->balance > 0; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /* Save an address that needs to be shown/hidden */ |  | ||||||
| static void kmemcheck_save_addr(unsigned long addr) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); |  | ||||||
| 
 |  | ||||||
| 	BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); |  | ||||||
| 	data->addr[data->n_addrs++] = addr; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static unsigned int kmemcheck_show_all(void) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); |  | ||||||
| 	unsigned int i; |  | ||||||
| 	unsigned int n; |  | ||||||
| 
 |  | ||||||
| 	n = 0; |  | ||||||
| 	for (i = 0; i < data->n_addrs; ++i) |  | ||||||
| 		n += kmemcheck_show_addr(data->addr[i]); |  | ||||||
| 
 |  | ||||||
| 	return n; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static unsigned int kmemcheck_hide_all(void) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); |  | ||||||
| 	unsigned int i; |  | ||||||
| 	unsigned int n; |  | ||||||
| 
 |  | ||||||
| 	n = 0; |  | ||||||
| 	for (i = 0; i < data->n_addrs; ++i) |  | ||||||
| 		n += kmemcheck_hide_addr(data->addr[i]); |  | ||||||
| 
 |  | ||||||
| 	return n; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Called from the #PF handler. |  | ||||||
|  */ |  | ||||||
| void kmemcheck_show(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); |  | ||||||
| 
 |  | ||||||
| 	BUG_ON(!irqs_disabled()); |  | ||||||
| 
 |  | ||||||
| 	if (unlikely(data->balance != 0)) { |  | ||||||
| 		kmemcheck_show_all(); |  | ||||||
| 		kmemcheck_error_save_bug(regs); |  | ||||||
| 		data->balance = 0; |  | ||||||
| 		return; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * None of the addresses actually belonged to kmemcheck. Note that |  | ||||||
| 	 * this is not an error. |  | ||||||
| 	 */ |  | ||||||
| 	if (kmemcheck_show_all() == 0) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	++data->balance; |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * The IF needs to be cleared as well, so that the faulting |  | ||||||
| 	 * instruction can run "uninterrupted". Otherwise, we might take |  | ||||||
| 	 * an interrupt and start executing that before we've had a chance |  | ||||||
| 	 * to hide the page again. |  | ||||||
| 	 * |  | ||||||
| 	 * NOTE: In the rare case of multiple faults, we must not override |  | ||||||
| 	 * the original flags: |  | ||||||
| 	 */ |  | ||||||
| 	if (!(regs->flags & X86_EFLAGS_TF)) |  | ||||||
| 		data->flags = regs->flags; |  | ||||||
| 
 |  | ||||||
| 	regs->flags |= X86_EFLAGS_TF; |  | ||||||
| 	regs->flags &= ~X86_EFLAGS_IF; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Called from the #DB handler. |  | ||||||
|  */ |  | ||||||
| void kmemcheck_hide(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); |  | ||||||
| 	int n; |  | ||||||
| 
 |  | ||||||
| 	BUG_ON(!irqs_disabled()); |  | ||||||
| 
 |  | ||||||
| 	if (unlikely(data->balance != 1)) { |  | ||||||
| 		kmemcheck_show_all(); |  | ||||||
| 		kmemcheck_error_save_bug(regs); |  | ||||||
| 		data->n_addrs = 0; |  | ||||||
| 		data->balance = 0; |  | ||||||
| 
 |  | ||||||
| 		if (!(data->flags & X86_EFLAGS_TF)) |  | ||||||
| 			regs->flags &= ~X86_EFLAGS_TF; |  | ||||||
| 		if (data->flags & X86_EFLAGS_IF) |  | ||||||
| 			regs->flags |= X86_EFLAGS_IF; |  | ||||||
| 		return; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if (kmemcheck_enabled) |  | ||||||
| 		n = kmemcheck_hide_all(); |  | ||||||
| 	else |  | ||||||
| 		n = kmemcheck_show_all(); |  | ||||||
| 
 |  | ||||||
| 	if (n == 0) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	--data->balance; |  | ||||||
| 
 |  | ||||||
| 	data->n_addrs = 0; |  | ||||||
| 
 |  | ||||||
| 	if (!(data->flags & X86_EFLAGS_TF)) |  | ||||||
| 		regs->flags &= ~X86_EFLAGS_TF; |  | ||||||
| 	if (data->flags & X86_EFLAGS_IF) |  | ||||||
| 		regs->flags |= X86_EFLAGS_IF; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_show_pages(struct page *p, unsigned int n) |  | ||||||
| { |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	for (i = 0; i < n; ++i) { |  | ||||||
| 		unsigned long address; |  | ||||||
| 		pte_t *pte; |  | ||||||
| 		unsigned int level; |  | ||||||
| 
 |  | ||||||
| 		address = (unsigned long) page_address(&p[i]); |  | ||||||
| 		pte = lookup_address(address, &level); |  | ||||||
| 		BUG_ON(!pte); |  | ||||||
| 		BUG_ON(level != PG_LEVEL_4K); |  | ||||||
| 
 |  | ||||||
| 		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); |  | ||||||
| 		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); |  | ||||||
| 		__flush_tlb_one(address); |  | ||||||
| 	} |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_page_is_tracked(struct page *p) |  | ||||||
| { |  | ||||||
| 	/* This will also check the "hidden" flag of the PTE. */ |  | ||||||
| 	return kmemcheck_pte_lookup((unsigned long) page_address(p)); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_hide_pages(struct page *p, unsigned int n) |  | ||||||
| { |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	for (i = 0; i < n; ++i) { |  | ||||||
| 		unsigned long address; |  | ||||||
| 		pte_t *pte; |  | ||||||
| 		unsigned int level; |  | ||||||
| 
 |  | ||||||
| 		address = (unsigned long) page_address(&p[i]); |  | ||||||
| 		pte = lookup_address(address, &level); |  | ||||||
| 		BUG_ON(!pte); |  | ||||||
| 		BUG_ON(level != PG_LEVEL_4K); |  | ||||||
| 
 |  | ||||||
| 		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); |  | ||||||
| 		set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); |  | ||||||
| 		__flush_tlb_one(address); |  | ||||||
| 	} |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /* Access may NOT cross page boundary */ |  | ||||||
| static void kmemcheck_read_strict(struct pt_regs *regs, |  | ||||||
| 	unsigned long addr, unsigned int size) |  | ||||||
| { |  | ||||||
| 	void *shadow; |  | ||||||
| 	enum kmemcheck_shadow status; |  | ||||||
| 
 |  | ||||||
| 	shadow = kmemcheck_shadow_lookup(addr); |  | ||||||
| 	if (!shadow) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	kmemcheck_save_addr(addr); |  | ||||||
| 	status = kmemcheck_shadow_test(shadow, size); |  | ||||||
| 	if (status == KMEMCHECK_SHADOW_INITIALIZED) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	if (kmemcheck_enabled) |  | ||||||
| 		kmemcheck_error_save(status, addr, size, regs); |  | ||||||
| 
 |  | ||||||
| 	if (kmemcheck_enabled == 2) |  | ||||||
| 		kmemcheck_enabled = 0; |  | ||||||
| 
 |  | ||||||
| 	/* Don't warn about it again. */ |  | ||||||
| 	kmemcheck_shadow_set(shadow, size); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) |  | ||||||
| { |  | ||||||
| 	enum kmemcheck_shadow status; |  | ||||||
| 	void *shadow; |  | ||||||
| 
 |  | ||||||
| 	shadow = kmemcheck_shadow_lookup(addr); |  | ||||||
| 	if (!shadow) |  | ||||||
| 		return true; |  | ||||||
| 
 |  | ||||||
| 	status = kmemcheck_shadow_test_all(shadow, size); |  | ||||||
| 
 |  | ||||||
| 	return status == KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /* Access may cross page boundary */ |  | ||||||
| static void kmemcheck_read(struct pt_regs *regs, |  | ||||||
| 	unsigned long addr, unsigned int size) |  | ||||||
| { |  | ||||||
| 	unsigned long page = addr & PAGE_MASK; |  | ||||||
| 	unsigned long next_addr = addr + size - 1; |  | ||||||
| 	unsigned long next_page = next_addr & PAGE_MASK; |  | ||||||
| 
 |  | ||||||
| 	if (likely(page == next_page)) { |  | ||||||
| 		kmemcheck_read_strict(regs, addr, size); |  | ||||||
| 		return; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * What we do is basically to split the access across the |  | ||||||
| 	 * two pages and handle each part separately. Yes, this means |  | ||||||
| 	 * that we may now see reads that are 3 + 5 bytes, for |  | ||||||
| 	 * example (and if both are uninitialized, there will be two |  | ||||||
| 	 * reports), but it makes the code a lot simpler. |  | ||||||
| 	 */ |  | ||||||
| 	kmemcheck_read_strict(regs, addr, next_page - addr); |  | ||||||
| 	kmemcheck_read_strict(regs, next_page, next_addr - next_page); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void kmemcheck_write_strict(struct pt_regs *regs, |  | ||||||
| 	unsigned long addr, unsigned int size) |  | ||||||
| { |  | ||||||
| 	void *shadow; |  | ||||||
| 
 |  | ||||||
| 	shadow = kmemcheck_shadow_lookup(addr); |  | ||||||
| 	if (!shadow) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	kmemcheck_save_addr(addr); |  | ||||||
| 	kmemcheck_shadow_set(shadow, size); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void kmemcheck_write(struct pt_regs *regs, |  | ||||||
| 	unsigned long addr, unsigned int size) |  | ||||||
| { |  | ||||||
| 	unsigned long page = addr & PAGE_MASK; |  | ||||||
| 	unsigned long next_addr = addr + size - 1; |  | ||||||
| 	unsigned long next_page = next_addr & PAGE_MASK; |  | ||||||
| 
 |  | ||||||
| 	if (likely(page == next_page)) { |  | ||||||
| 		kmemcheck_write_strict(regs, addr, size); |  | ||||||
| 		return; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	/* See comment in kmemcheck_read(). */ |  | ||||||
| 	kmemcheck_write_strict(regs, addr, next_page - addr); |  | ||||||
| 	kmemcheck_write_strict(regs, next_page, next_addr - next_page); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Copying is hard. We have two addresses, each of which may be split across |  | ||||||
|  * a page (and each page will have different shadow addresses). |  | ||||||
|  */ |  | ||||||
| static void kmemcheck_copy(struct pt_regs *regs, |  | ||||||
| 	unsigned long src_addr, unsigned long dst_addr, unsigned int size) |  | ||||||
| { |  | ||||||
| 	uint8_t shadow[8]; |  | ||||||
| 	enum kmemcheck_shadow status; |  | ||||||
| 
 |  | ||||||
| 	unsigned long page; |  | ||||||
| 	unsigned long next_addr; |  | ||||||
| 	unsigned long next_page; |  | ||||||
| 
 |  | ||||||
| 	uint8_t *x; |  | ||||||
| 	unsigned int i; |  | ||||||
| 	unsigned int n; |  | ||||||
| 
 |  | ||||||
| 	BUG_ON(size > sizeof(shadow)); |  | ||||||
| 
 |  | ||||||
| 	page = src_addr & PAGE_MASK; |  | ||||||
| 	next_addr = src_addr + size - 1; |  | ||||||
| 	next_page = next_addr & PAGE_MASK; |  | ||||||
| 
 |  | ||||||
| 	if (likely(page == next_page)) { |  | ||||||
| 		/* Same page */ |  | ||||||
| 		x = kmemcheck_shadow_lookup(src_addr); |  | ||||||
| 		if (x) { |  | ||||||
| 			kmemcheck_save_addr(src_addr); |  | ||||||
| 			for (i = 0; i < size; ++i) |  | ||||||
| 				shadow[i] = x[i]; |  | ||||||
| 		} else { |  | ||||||
| 			for (i = 0; i < size; ++i) |  | ||||||
| 				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| 		} |  | ||||||
| 	} else { |  | ||||||
| 		n = next_page - src_addr; |  | ||||||
| 		BUG_ON(n > sizeof(shadow)); |  | ||||||
| 
 |  | ||||||
| 		/* First page */ |  | ||||||
| 		x = kmemcheck_shadow_lookup(src_addr); |  | ||||||
| 		if (x) { |  | ||||||
| 			kmemcheck_save_addr(src_addr); |  | ||||||
| 			for (i = 0; i < n; ++i) |  | ||||||
| 				shadow[i] = x[i]; |  | ||||||
| 		} else { |  | ||||||
| 			/* Not tracked */ |  | ||||||
| 			for (i = 0; i < n; ++i) |  | ||||||
| 				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		/* Second page */ |  | ||||||
| 		x = kmemcheck_shadow_lookup(next_page); |  | ||||||
| 		if (x) { |  | ||||||
| 			kmemcheck_save_addr(next_page); |  | ||||||
| 			for (i = n; i < size; ++i) |  | ||||||
| 				shadow[i] = x[i - n]; |  | ||||||
| 		} else { |  | ||||||
| 			/* Not tracked */ |  | ||||||
| 			for (i = n; i < size; ++i) |  | ||||||
| 				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	page = dst_addr & PAGE_MASK; |  | ||||||
| 	next_addr = dst_addr + size - 1; |  | ||||||
| 	next_page = next_addr & PAGE_MASK; |  | ||||||
| 
 |  | ||||||
| 	if (likely(page == next_page)) { |  | ||||||
| 		/* Same page */ |  | ||||||
| 		x = kmemcheck_shadow_lookup(dst_addr); |  | ||||||
| 		if (x) { |  | ||||||
| 			kmemcheck_save_addr(dst_addr); |  | ||||||
| 			for (i = 0; i < size; ++i) { |  | ||||||
| 				x[i] = shadow[i]; |  | ||||||
| 				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 	} else { |  | ||||||
| 		n = next_page - dst_addr; |  | ||||||
| 		BUG_ON(n > sizeof(shadow)); |  | ||||||
| 
 |  | ||||||
| 		/* First page */ |  | ||||||
| 		x = kmemcheck_shadow_lookup(dst_addr); |  | ||||||
| 		if (x) { |  | ||||||
| 			kmemcheck_save_addr(dst_addr); |  | ||||||
| 			for (i = 0; i < n; ++i) { |  | ||||||
| 				x[i] = shadow[i]; |  | ||||||
| 				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		/* Second page */ |  | ||||||
| 		x = kmemcheck_shadow_lookup(next_page); |  | ||||||
| 		if (x) { |  | ||||||
| 			kmemcheck_save_addr(next_page); |  | ||||||
| 			for (i = n; i < size; ++i) { |  | ||||||
| 				x[i - n] = shadow[i]; |  | ||||||
| 				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	status = kmemcheck_shadow_test(shadow, size); |  | ||||||
| 	if (status == KMEMCHECK_SHADOW_INITIALIZED) |  | ||||||
| 		return; |  | ||||||
| 
 |  | ||||||
| 	if (kmemcheck_enabled) |  | ||||||
| 		kmemcheck_error_save(status, src_addr, size, regs); |  | ||||||
| 
 |  | ||||||
| 	if (kmemcheck_enabled == 2) |  | ||||||
| 		kmemcheck_enabled = 0; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| enum kmemcheck_method { |  | ||||||
| 	KMEMCHECK_READ, |  | ||||||
| 	KMEMCHECK_WRITE, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static void kmemcheck_access(struct pt_regs *regs, |  | ||||||
| 	unsigned long fallback_address, enum kmemcheck_method fallback_method) |  | ||||||
| { |  | ||||||
| 	const uint8_t *insn; |  | ||||||
| 	const uint8_t *insn_primary; |  | ||||||
| 	unsigned int size; |  | ||||||
| 
 |  | ||||||
| 	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); |  | ||||||
| 
 |  | ||||||
| 	/* Recursive fault -- ouch. */ |  | ||||||
| 	if (data->busy) { |  | ||||||
| 		kmemcheck_show_addr(fallback_address); |  | ||||||
| 		kmemcheck_error_save_bug(regs); |  | ||||||
| 		return; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	data->busy = true; |  | ||||||
| 
 |  | ||||||
| 	insn = (const uint8_t *) regs->ip; |  | ||||||
| 	insn_primary = kmemcheck_opcode_get_primary(insn); |  | ||||||
| 
 |  | ||||||
| 	kmemcheck_opcode_decode(insn, &size); |  | ||||||
| 
 |  | ||||||
| 	switch (insn_primary[0]) { |  | ||||||
| #ifdef CONFIG_KMEMCHECK_BITOPS_OK |  | ||||||
| 		/* AND, OR, XOR */ |  | ||||||
| 		/*
 |  | ||||||
| 		 * Unfortunately, these instructions have to be excluded from |  | ||||||
| 		 * our regular checking since they access only some (and not |  | ||||||
| 		 * all) bits. This clears out "bogus" bitfield-access warnings. |  | ||||||
| 		 */ |  | ||||||
| 	case 0x80: |  | ||||||
| 	case 0x81: |  | ||||||
| 	case 0x82: |  | ||||||
| 	case 0x83: |  | ||||||
| 		switch ((insn_primary[1] >> 3) & 7) { |  | ||||||
| 			/* OR */ |  | ||||||
| 		case 1: |  | ||||||
| 			/* AND */ |  | ||||||
| 		case 4: |  | ||||||
| 			/* XOR */ |  | ||||||
| 		case 6: |  | ||||||
| 			kmemcheck_write(regs, fallback_address, size); |  | ||||||
| 			goto out; |  | ||||||
| 
 |  | ||||||
| 			/* ADD */ |  | ||||||
| 		case 0: |  | ||||||
| 			/* ADC */ |  | ||||||
| 		case 2: |  | ||||||
| 			/* SBB */ |  | ||||||
| 		case 3: |  | ||||||
| 			/* SUB */ |  | ||||||
| 		case 5: |  | ||||||
| 			/* CMP */ |  | ||||||
| 		case 7: |  | ||||||
| 			break; |  | ||||||
| 		} |  | ||||||
| 		break; |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| 		/* MOVS, MOVSB, MOVSW, MOVSD */ |  | ||||||
| 	case 0xa4: |  | ||||||
| 	case 0xa5: |  | ||||||
| 		/*
 |  | ||||||
| 		 * These instructions are special because they take two |  | ||||||
| 		 * addresses, but we only get one page fault. |  | ||||||
| 		 */ |  | ||||||
| 		kmemcheck_copy(regs, regs->si, regs->di, size); |  | ||||||
| 		goto out; |  | ||||||
| 
 |  | ||||||
| 		/* CMPS, CMPSB, CMPSW, CMPSD */ |  | ||||||
| 	case 0xa6: |  | ||||||
| 	case 0xa7: |  | ||||||
| 		kmemcheck_read(regs, regs->si, size); |  | ||||||
| 		kmemcheck_read(regs, regs->di, size); |  | ||||||
| 		goto out; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * If the opcode isn't special in any way, we use the data from the |  | ||||||
| 	 * page fault handler to determine the address and type of memory |  | ||||||
| 	 * access. |  | ||||||
| 	 */ |  | ||||||
| 	switch (fallback_method) { |  | ||||||
| 	case KMEMCHECK_READ: |  | ||||||
| 		kmemcheck_read(regs, fallback_address, size); |  | ||||||
| 		goto out; |  | ||||||
| 	case KMEMCHECK_WRITE: |  | ||||||
| 		kmemcheck_write(regs, fallback_address, size); |  | ||||||
| 		goto out; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| out: |  | ||||||
| 	data->busy = false; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, |  | ||||||
| 	unsigned long error_code) |  | ||||||
| { |  | ||||||
| 	pte_t *pte; |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * XXX: Is it safe to assume that memory accesses from virtual 86 |  | ||||||
| 	 * mode or non-kernel code segments will _never_ access kernel |  | ||||||
| 	 * memory (e.g. tracked pages)? For now, we need this to avoid |  | ||||||
| 	 * invoking kmemcheck for PnP BIOS calls. |  | ||||||
| 	 */ |  | ||||||
| 	if (regs->flags & X86_VM_MASK) |  | ||||||
| 		return false; |  | ||||||
| 	if (regs->cs != __KERNEL_CS) |  | ||||||
| 		return false; |  | ||||||
| 
 |  | ||||||
| 	pte = kmemcheck_pte_lookup(address); |  | ||||||
| 	if (!pte) |  | ||||||
| 		return false; |  | ||||||
| 
 |  | ||||||
| 	WARN_ON_ONCE(in_nmi()); |  | ||||||
| 
 |  | ||||||
| 	if (error_code & 2) |  | ||||||
| 		kmemcheck_access(regs, address, KMEMCHECK_WRITE); |  | ||||||
| 	else |  | ||||||
| 		kmemcheck_access(regs, address, KMEMCHECK_READ); |  | ||||||
| 
 |  | ||||||
| 	kmemcheck_show(regs); |  | ||||||
| 	return true; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_trap(struct pt_regs *regs) |  | ||||||
| { |  | ||||||
| 	if (!kmemcheck_active(regs)) |  | ||||||
| 		return false; |  | ||||||
| 
 |  | ||||||
| 	/* We're done. */ |  | ||||||
| 	kmemcheck_hide(regs); |  | ||||||
| 	return true; |  | ||||||
| } |  | ||||||
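The core trick of the file removed above: tracked pages are kept non-present, so every access faults; the #PF path decodes the access, checks or updates the shadow, makes the page present again and single-steps exactly one instruction with interrupts masked, and the #DB path then hides the page again. A sketch of the single-step arming done in the #PF path, mirroring kmemcheck_show() above; illustrative only, not a general-purpose API:

static void arm_single_step(struct pt_regs *regs, unsigned long *saved_flags)
{
	/* Keep the caller's flags unless a nested fault already saved them. */
	if (!(regs->flags & X86_EFLAGS_TF))
		*saved_flags = regs->flags;

	regs->flags |= X86_EFLAGS_TF;	/* #DB right after the faulting instruction */
	regs->flags &= ~X86_EFLAGS_IF;	/* no interrupts while the page is visible */
}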
|  | @ -1,107 +1 @@ | ||||||
| // SPDX-License-Identifier: GPL-2.0 | // SPDX-License-Identifier: GPL-2.0 | ||||||
| #include <linux/types.h> |  | ||||||
| 
 |  | ||||||
| #include "opcode.h" |  | ||||||
| 
 |  | ||||||
| static bool opcode_is_prefix(uint8_t b) |  | ||||||
| { |  | ||||||
| 	return |  | ||||||
| 		/* Group 1 */ |  | ||||||
| 		b == 0xf0 || b == 0xf2 || b == 0xf3 |  | ||||||
| 		/* Group 2 */ |  | ||||||
| 		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 |  | ||||||
| 		|| b == 0x64 || b == 0x65 |  | ||||||
| 		/* Group 3 */ |  | ||||||
| 		|| b == 0x66 |  | ||||||
| 		/* Group 4 */ |  | ||||||
| 		|| b == 0x67; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_X86_64 |  | ||||||
| static bool opcode_is_rex_prefix(uint8_t b) |  | ||||||
| { |  | ||||||
| 	return (b & 0xf0) == 0x40; |  | ||||||
| } |  | ||||||
| #else |  | ||||||
| static bool opcode_is_rex_prefix(uint8_t b) |  | ||||||
| { |  | ||||||
| 	return false; |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| #define REX_W (1 << 3) |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * This is a VERY crude opcode decoder. We only need to find the size of the |  | ||||||
|  * load/store that caused our #PF and this should work for all the opcodes |  | ||||||
|  * that we care about. Moreover, the ones who invented this instruction set |  | ||||||
|  * should be shot. |  | ||||||
|  */ |  | ||||||
| void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) |  | ||||||
| { |  | ||||||
| 	/* Default operand size */ |  | ||||||
| 	int operand_size_override = 4; |  | ||||||
| 
 |  | ||||||
| 	/* prefixes */ |  | ||||||
| 	for (; opcode_is_prefix(*op); ++op) { |  | ||||||
| 		if (*op == 0x66) |  | ||||||
| 			operand_size_override = 2; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	/* REX prefix */ |  | ||||||
| 	if (opcode_is_rex_prefix(*op)) { |  | ||||||
| 		uint8_t rex = *op; |  | ||||||
| 
 |  | ||||||
| 		++op; |  | ||||||
| 		if (rex & REX_W) { |  | ||||||
| 			switch (*op) { |  | ||||||
| 			case 0x63: |  | ||||||
| 				*size = 4; |  | ||||||
| 				return; |  | ||||||
| 			case 0x0f: |  | ||||||
| 				++op; |  | ||||||
| 
 |  | ||||||
| 				switch (*op) { |  | ||||||
| 				case 0xb6: |  | ||||||
| 				case 0xbe: |  | ||||||
| 					*size = 1; |  | ||||||
| 					return; |  | ||||||
| 				case 0xb7: |  | ||||||
| 				case 0xbf: |  | ||||||
| 					*size = 2; |  | ||||||
| 					return; |  | ||||||
| 				} |  | ||||||
| 
 |  | ||||||
| 				break; |  | ||||||
| 			} |  | ||||||
| 
 |  | ||||||
| 			*size = 8; |  | ||||||
| 			return; |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	/* escape opcode */ |  | ||||||
| 	if (*op == 0x0f) { |  | ||||||
| 		++op; |  | ||||||
| 
 |  | ||||||
| 		/*
 |  | ||||||
| 		 * This is move with zero-extend and sign-extend, respectively; |  | ||||||
| 		 * we don't have to think about 0xb6/0xbe, because this is |  | ||||||
| 		 * already handled in the conditional below. |  | ||||||
| 		 */ |  | ||||||
| 		if (*op == 0xb7 || *op == 0xbf) |  | ||||||
| 			operand_size_override = 2; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	*size = (*op & 1) ? operand_size_override : 1; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) |  | ||||||
| { |  | ||||||
| 	/* skip prefixes */ |  | ||||||
| 	while (opcode_is_prefix(*op)) |  | ||||||
| 		++op; |  | ||||||
| 	if (opcode_is_rex_prefix(*op)) |  | ||||||
| 		++op; |  | ||||||
| 	return op; |  | ||||||
| } |  | ||||||
|  |  | ||||||
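The decoder removed above only needs the operand size of the faulting load or store, so it reduces x86 decoding to prefix handling plus the width bit of the primary opcode. A hedged sketch running a few vectors from the selftest file removed further down through that rule; it assumes a 64-bit build for the REX case:

#include <assert.h>
#include <stdint.h>

void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);	/* as removed above */

static void decode_examples(void)
{
	unsigned int size;

	kmemcheck_opcode_decode((const uint8_t *)"\xf3\xa4", &size);	/* rep movsb */
	assert(size == 1);		/* width bit clear -> byte access */

	kmemcheck_opcode_decode((const uint8_t *)"\xf3\xa5", &size);	/* rep movsl */
	assert(size == 4);		/* width bit set -> default 32-bit size */

	kmemcheck_opcode_decode((const uint8_t *)"\x49\x63\x51\xf8", &size);	/* movslq */
	assert(size == 4);		/* REX.W + 0x63 is special-cased to 4 */
}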
|  | @ -1,10 +1 @@ | ||||||
| /* SPDX-License-Identifier: GPL-2.0 */ | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
| #ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H |  | ||||||
| #define ARCH__X86__MM__KMEMCHECK__OPCODE_H |  | ||||||
| 
 |  | ||||||
| #include <linux/types.h> |  | ||||||
| 
 |  | ||||||
| void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); |  | ||||||
| const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | @ -1,23 +1 @@ | ||||||
| // SPDX-License-Identifier: GPL-2.0 | // SPDX-License-Identifier: GPL-2.0 | ||||||
| #include <linux/mm.h> |  | ||||||
| 
 |  | ||||||
| #include <asm/pgtable.h> |  | ||||||
| 
 |  | ||||||
| #include "pte.h" |  | ||||||
| 
 |  | ||||||
| pte_t *kmemcheck_pte_lookup(unsigned long address) |  | ||||||
| { |  | ||||||
| 	pte_t *pte; |  | ||||||
| 	unsigned int level; |  | ||||||
| 
 |  | ||||||
| 	pte = lookup_address(address, &level); |  | ||||||
| 	if (!pte) |  | ||||||
| 		return NULL; |  | ||||||
| 	if (level != PG_LEVEL_4K) |  | ||||||
| 		return NULL; |  | ||||||
| 	if (!pte_hidden(*pte)) |  | ||||||
| 		return NULL; |  | ||||||
| 
 |  | ||||||
| 	return pte; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
|  |  | ||||||
|  | @ -1,11 +1 @@ | ||||||
| /* SPDX-License-Identifier: GPL-2.0 */ | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
| #ifndef ARCH__X86__MM__KMEMCHECK__PTE_H |  | ||||||
| #define ARCH__X86__MM__KMEMCHECK__PTE_H |  | ||||||
| 
 |  | ||||||
| #include <linux/mm.h> |  | ||||||
| 
 |  | ||||||
| #include <asm/pgtable.h> |  | ||||||
| 
 |  | ||||||
| pte_t *kmemcheck_pte_lookup(unsigned long address); |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | @ -1,71 +1 @@ | ||||||
| // SPDX-License-Identifier: GPL-2.0 | // SPDX-License-Identifier: GPL-2.0 | ||||||
| #include <linux/bug.h> |  | ||||||
| #include <linux/kernel.h> |  | ||||||
| 
 |  | ||||||
| #include "opcode.h" |  | ||||||
| #include "selftest.h" |  | ||||||
| 
 |  | ||||||
| struct selftest_opcode { |  | ||||||
| 	unsigned int expected_size; |  | ||||||
| 	const uint8_t *insn; |  | ||||||
| 	const char *desc; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static const struct selftest_opcode selftest_opcodes[] = { |  | ||||||
| 	/* REP MOVS */ |  | ||||||
| 	{1, "\xf3\xa4", 		"rep movsb <mem8>, <mem8>"}, |  | ||||||
| 	{4, "\xf3\xa5",			"rep movsl <mem32>, <mem32>"}, |  | ||||||
| 
 |  | ||||||
| 	/* MOVZX / MOVZXD */ |  | ||||||
| 	{1, "\x66\x0f\xb6\x51\xf8",	"movzwq <mem8>, <reg16>"}, |  | ||||||
| 	{1, "\x0f\xb6\x51\xf8",		"movzwq <mem8>, <reg32>"}, |  | ||||||
| 
 |  | ||||||
| 	/* MOVSX / MOVSXD */ |  | ||||||
| 	{1, "\x66\x0f\xbe\x51\xf8",	"movswq <mem8>, <reg16>"}, |  | ||||||
| 	{1, "\x0f\xbe\x51\xf8",		"movswq <mem8>, <reg32>"}, |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_X86_64 |  | ||||||
| 	/* MOVZX / MOVZXD */ |  | ||||||
| 	{1, "\x49\x0f\xb6\x51\xf8",	"movzbq <mem8>, <reg64>"}, |  | ||||||
| 	{2, "\x49\x0f\xb7\x51\xf8",	"movzbq <mem16>, <reg64>"}, |  | ||||||
| 
 |  | ||||||
| 	/* MOVSX / MOVSXD */ |  | ||||||
| 	{1, "\x49\x0f\xbe\x51\xf8",	"movsbq <mem8>, <reg64>"}, |  | ||||||
| 	{2, "\x49\x0f\xbf\x51\xf8",	"movsbq <mem16>, <reg64>"}, |  | ||||||
| 	{4, "\x49\x63\x51\xf8",		"movslq <mem32>, <reg64>"}, |  | ||||||
| #endif |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static bool selftest_opcode_one(const struct selftest_opcode *op) |  | ||||||
| { |  | ||||||
| 	unsigned size; |  | ||||||
| 
 |  | ||||||
| 	kmemcheck_opcode_decode(op->insn, &size); |  | ||||||
| 
 |  | ||||||
| 	if (size == op->expected_size) |  | ||||||
| 		return true; |  | ||||||
| 
 |  | ||||||
| 	printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", |  | ||||||
| 		op->desc, op->expected_size, size); |  | ||||||
| 	return false; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static bool selftest_opcodes_all(void) |  | ||||||
| { |  | ||||||
| 	bool pass = true; |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) |  | ||||||
| 		pass = pass && selftest_opcode_one(&selftest_opcodes[i]); |  | ||||||
| 
 |  | ||||||
| 	return pass; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_selftest(void) |  | ||||||
| { |  | ||||||
| 	bool pass = true; |  | ||||||
| 
 |  | ||||||
| 	pass = pass && selftest_opcodes_all(); |  | ||||||
| 
 |  | ||||||
| 	return pass; |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  | @ -1,7 +1 @@ | ||||||
| /* SPDX-License-Identifier: GPL-2.0 */ | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
| #ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H |  | ||||||
| #define ARCH_X86_MM_KMEMCHECK_SELFTEST_H |  | ||||||
| 
 |  | ||||||
| bool kmemcheck_selftest(void); |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | @ -1,173 +0,0 @@ | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/export.h> |  | ||||||
| #include <linux/mm.h> |  | ||||||
| 
 |  | ||||||
| #include <asm/page.h> |  | ||||||
| #include <asm/pgtable.h> |  | ||||||
| 
 |  | ||||||
| #include "pte.h" |  | ||||||
| #include "shadow.h" |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Return the shadow address for the given address. Returns NULL if the |  | ||||||
|  * address is not tracked. |  | ||||||
|  * |  | ||||||
|  * We need to be extremely careful not to follow any invalid pointers, |  | ||||||
|  * because this function can be called for *any* possible address. |  | ||||||
|  */ |  | ||||||
| void *kmemcheck_shadow_lookup(unsigned long address) |  | ||||||
| { |  | ||||||
| 	pte_t *pte; |  | ||||||
| 	struct page *page; |  | ||||||
| 
 |  | ||||||
| 	if (!virt_addr_valid(address)) |  | ||||||
| 		return NULL; |  | ||||||
| 
 |  | ||||||
| 	pte = kmemcheck_pte_lookup(address); |  | ||||||
| 	if (!pte) |  | ||||||
| 		return NULL; |  | ||||||
| 
 |  | ||||||
| 	page = virt_to_page(address); |  | ||||||
| 	if (!page->shadow) |  | ||||||
| 		return NULL; |  | ||||||
| 	return page->shadow + (address & (PAGE_SIZE - 1)); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static void mark_shadow(void *address, unsigned int n, |  | ||||||
| 	enum kmemcheck_shadow status) |  | ||||||
| { |  | ||||||
| 	unsigned long addr = (unsigned long) address; |  | ||||||
| 	unsigned long last_addr = addr + n - 1; |  | ||||||
| 	unsigned long page = addr & PAGE_MASK; |  | ||||||
| 	unsigned long last_page = last_addr & PAGE_MASK; |  | ||||||
| 	unsigned int first_n; |  | ||||||
| 	void *shadow; |  | ||||||
| 
 |  | ||||||
| 	/* If the memory range crosses a page boundary, stop there. */ |  | ||||||
| 	if (page == last_page) |  | ||||||
| 		first_n = n; |  | ||||||
| 	else |  | ||||||
| 		first_n = page + PAGE_SIZE - addr; |  | ||||||
| 
 |  | ||||||
| 	shadow = kmemcheck_shadow_lookup(addr); |  | ||||||
| 	if (shadow) |  | ||||||
| 		memset(shadow, status, first_n); |  | ||||||
| 
 |  | ||||||
| 	addr += first_n; |  | ||||||
| 	n -= first_n; |  | ||||||
| 
 |  | ||||||
| 	/* Do full-page memset()s. */ |  | ||||||
| 	while (n >= PAGE_SIZE) { |  | ||||||
| 		shadow = kmemcheck_shadow_lookup(addr); |  | ||||||
| 		if (shadow) |  | ||||||
| 			memset(shadow, status, PAGE_SIZE); |  | ||||||
| 
 |  | ||||||
| 		addr += PAGE_SIZE; |  | ||||||
| 		n -= PAGE_SIZE; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	/* Do the remaining page, if any. */ |  | ||||||
| 	if (n > 0) { |  | ||||||
| 		shadow = kmemcheck_shadow_lookup(addr); |  | ||||||
| 		if (shadow) |  | ||||||
| 			memset(shadow, status, n); |  | ||||||
| 	} |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_mark_unallocated(void *address, unsigned int n) |  | ||||||
| { |  | ||||||
| 	mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_mark_uninitialized(void *address, unsigned int n) |  | ||||||
| { |  | ||||||
| 	mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Fill the shadow memory of the given address such that the memory at that |  | ||||||
|  * address is marked as being initialized. |  | ||||||
|  */ |  | ||||||
| void kmemcheck_mark_initialized(void *address, unsigned int n) |  | ||||||
| { |  | ||||||
| 	mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); |  | ||||||
| } |  | ||||||
| EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); |  | ||||||
| 
 |  | ||||||
| void kmemcheck_mark_freed(void *address, unsigned int n) |  | ||||||
| { |  | ||||||
| 	mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) |  | ||||||
| { |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	for (i = 0; i < n; ++i) |  | ||||||
| 		kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) |  | ||||||
| { |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	for (i = 0; i < n; ++i) |  | ||||||
| 		kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) |  | ||||||
| { |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	for (i = 0; i < n; ++i) |  | ||||||
| 		kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) |  | ||||||
| { |  | ||||||
| #ifdef CONFIG_KMEMCHECK_PARTIAL_OK |  | ||||||
| 	uint8_t *x; |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	x = shadow; |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * Make sure _some_ bytes are initialized. Gcc frequently generates |  | ||||||
| 	 * code to access neighboring bytes. |  | ||||||
| 	 */ |  | ||||||
| 	for (i = 0; i < size; ++i) { |  | ||||||
| 		if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) |  | ||||||
| 			return x[i]; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	return x[0]; |  | ||||||
| #else |  | ||||||
| 	return kmemcheck_shadow_test_all(shadow, size); |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) |  | ||||||
| { |  | ||||||
| 	uint8_t *x; |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	x = shadow; |  | ||||||
| 
 |  | ||||||
| 	/* All bytes must be initialized. */ |  | ||||||
| 	for (i = 0; i < size; ++i) { |  | ||||||
| 		if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) |  | ||||||
| 			return x[i]; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	return x[0]; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void kmemcheck_shadow_set(void *shadow, unsigned int size) |  | ||||||
| { |  | ||||||
| 	uint8_t *x; |  | ||||||
| 	unsigned int i; |  | ||||||
| 
 |  | ||||||
| 	x = shadow; |  | ||||||
| 	for (i = 0; i < size; ++i) |  | ||||||
| 		x[i] = KMEMCHECK_SHADOW_INITIALIZED; |  | ||||||
| } |  | ||||||
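mark_shadow() in the file removed above memsets shadow ranges that can span several pages, and each page's shadow is looked up separately (via page->shadow), so the range is split at page boundaries: a leading partial page, whole pages, then the trailing remainder. A generic sketch of that split with illustrative names and a fixed 4 KB page size assumed:

#include <stddef.h>

#define PAGE_SZ 4096UL

static void for_each_page_chunk(unsigned long addr, size_t n,
				void (*apply)(unsigned long addr, size_t len))
{
	while (n) {
		size_t room = PAGE_SZ - (addr & (PAGE_SZ - 1));	/* bytes left in this page */
		size_t len = n < room ? n : room;

		apply(addr, len);	/* e.g. memset() the per-page shadow */
		addr += len;
		n -= len;
	}
}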
|  | @ -1,19 +1 @@ | ||||||
| /* SPDX-License-Identifier: GPL-2.0 */ | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
| #ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H |  | ||||||
| #define ARCH__X86__MM__KMEMCHECK__SHADOW_H |  | ||||||
| 
 |  | ||||||
| enum kmemcheck_shadow { |  | ||||||
| 	KMEMCHECK_SHADOW_UNALLOCATED, |  | ||||||
| 	KMEMCHECK_SHADOW_UNINITIALIZED, |  | ||||||
| 	KMEMCHECK_SHADOW_INITIALIZED, |  | ||||||
| 	KMEMCHECK_SHADOW_FREED, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| void *kmemcheck_shadow_lookup(unsigned long address); |  | ||||||
| 
 |  | ||||||
| enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); |  | ||||||
| enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, |  | ||||||
| 						unsigned int size); |  | ||||||
| void kmemcheck_shadow_set(void *shadow, unsigned int size); |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  | @ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, | ||||||
| 
 | 
 | ||||||
| 	if (!debug_pagealloc_enabled()) | 	if (!debug_pagealloc_enabled()) | ||||||
| 		spin_unlock(&cpa_lock); | 		spin_unlock(&cpa_lock); | ||||||
| 	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); | 	base = alloc_pages(GFP_KERNEL, 0); | ||||||
| 	if (!debug_pagealloc_enabled()) | 	if (!debug_pagealloc_enabled()) | ||||||
| 		spin_lock(&cpa_lock); | 		spin_lock(&cpa_lock); | ||||||
| 	if (!base) | 	if (!base) | ||||||
|  | @ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) | ||||||
| 
 | 
 | ||||||
| static int alloc_pte_page(pmd_t *pmd) | static int alloc_pte_page(pmd_t *pmd) | ||||||
| { | { | ||||||
| 	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); | ||||||
| 	if (!pte) | 	if (!pte) | ||||||
| 		return -1; | 		return -1; | ||||||
| 
 | 
 | ||||||
|  | @ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd) | ||||||
| 
 | 
 | ||||||
| static int alloc_pmd_page(pud_t *pud) | static int alloc_pmd_page(pud_t *pud) | ||||||
| { | { | ||||||
| 	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||||||
| 	if (!pmd) | 	if (!pmd) | ||||||
| 		return -1; | 		return -1; | ||||||
| 
 | 
 | ||||||
|  | @ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) | ||||||
| 	pgd_entry = cpa->pgd + pgd_index(addr); | 	pgd_entry = cpa->pgd + pgd_index(addr); | ||||||
| 
 | 
 | ||||||
| 	if (pgd_none(*pgd_entry)) { | 	if (pgd_none(*pgd_entry)) { | ||||||
| 		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); | ||||||
| 		if (!p4d) | 		if (!p4d) | ||||||
| 			return -1; | 			return -1; | ||||||
| 
 | 
 | ||||||
|  | @ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) | ||||||
| 	 */ | 	 */ | ||||||
| 	p4d = p4d_offset(pgd_entry, addr); | 	p4d = p4d_offset(pgd_entry, addr); | ||||||
| 	if (p4d_none(*p4d)) { | 	if (p4d_none(*p4d)) { | ||||||
| 		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 		pud = (pud_t *)get_zeroed_page(GFP_KERNEL); | ||||||
| 		if (!pud) | 		if (!pud) | ||||||
| 			return -1; | 			return -1; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| #include <asm/fixmap.h> | #include <asm/fixmap.h> | ||||||
| #include <asm/mtrr.h> | #include <asm/mtrr.h> | ||||||
| 
 | 
 | ||||||
| #define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) | #define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_HIGHPTE | #ifdef CONFIG_HIGHPTE | ||||||
| #define PGALLOC_USER_GFP __GFP_HIGHMEM | #define PGALLOC_USER_GFP __GFP_HIGHMEM | ||||||
|  |  | ||||||
|  | @ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void) | ||||||
| 	if (efi_enabled(EFI_OLD_MEMMAP)) | 	if (efi_enabled(EFI_OLD_MEMMAP)) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| 	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; | 	gfp_mask = GFP_KERNEL | __GFP_ZERO; | ||||||
| 	efi_pgd = (pgd_t *)__get_free_page(gfp_mask); | 	efi_pgd = (pgd_t *)__get_free_page(gfp_mask); | ||||||
| 	if (!efi_pgd) | 	if (!efi_pgd) | ||||||
| 		return -ENOMEM; | 		return -ENOMEM; | ||||||
|  |  | ||||||
|  | @ -2047,7 +2047,7 @@ static int blk_mq_init_hctx(struct request_queue *q, | ||||||
| 	 * Allocate space for all possible cpus to avoid allocation at | 	 * Allocate space for all possible cpus to avoid allocation at | ||||||
| 	 * runtime | 	 * runtime | ||||||
| 	 */ | 	 */ | ||||||
| 	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), | 	hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), | ||||||
| 					GFP_KERNEL, node); | 					GFP_KERNEL, node); | ||||||
| 	if (!hctx->ctxs) | 	if (!hctx->ctxs) | ||||||
| 		goto unregister_cpu_notifier; | 		goto unregister_cpu_notifier; | ||||||
|  |  | ||||||
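The blk-mq hunk above swaps an open-coded nr_cpu_ids * sizeof(void *) multiplication for kmalloc_array_node(), which rejects requests whose count and element size would overflow instead of silently allocating a smaller buffer. A sketch of the check such a helper performs; illustrative only, the real helper is declared in <linux/slab.h>:

static inline void *example_alloc_array_node(size_t n, size_t size,
					     gfp_t flags, int node)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;			/* n * size would overflow */
	return kmalloc_node(n * size, flags, node);
}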
|  | @ -122,12 +122,7 @@ calibrate_xor_blocks(void) | ||||||
| 		goto out; | 		goto out; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	b1 = (void *) __get_free_pages(GFP_KERNEL, 2); | ||||||
| 	 * Note: Since the memory is not actually used for _anything_ but to |  | ||||||
| 	 * test the XOR speed, we don't really want kmemcheck to warn about |  | ||||||
| 	 * reading uninitialized bytes here. |  | ||||||
| 	 */ |  | ||||||
| 	b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2); |  | ||||||
| 	if (!b1) { | 	if (!b1) { | ||||||
| 		printk(KERN_WARNING "xor: Yikes!  No memory available.\n"); | 		printk(KERN_WARNING "xor: Yikes!  No memory available.\n"); | ||||||
| 		return -ENOMEM; | 		return -ENOMEM; | ||||||
|  |  | ||||||
|  | @ -20,6 +20,7 @@ | ||||||
| #include <linux/radix-tree.h> | #include <linux/radix-tree.h> | ||||||
| #include <linux/fs.h> | #include <linux/fs.h> | ||||||
| #include <linux/slab.h> | #include <linux/slab.h> | ||||||
|  | #include <linux/backing-dev.h> | ||||||
| #ifdef CONFIG_BLK_DEV_RAM_DAX | #ifdef CONFIG_BLK_DEV_RAM_DAX | ||||||
| #include <linux/pfn_t.h> | #include <linux/pfn_t.h> | ||||||
| #include <linux/dax.h> | #include <linux/dax.h> | ||||||
|  | @ -448,6 +449,7 @@ static struct brd_device *brd_alloc(int i) | ||||||
| 	disk->flags		= GENHD_FL_EXT_DEVT; | 	disk->flags		= GENHD_FL_EXT_DEVT; | ||||||
| 	sprintf(disk->disk_name, "ram%d", i); | 	sprintf(disk->disk_name, "ram%d", i); | ||||||
| 	set_capacity(disk, rd_size * 2); | 	set_capacity(disk, rd_size * 2); | ||||||
|  | 	disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_BLK_DEV_RAM_DAX | #ifdef CONFIG_BLK_DEV_RAM_DAX | ||||||
| 	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); | 	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); | ||||||
|  |  | ||||||
|  | @ -23,14 +23,14 @@ static const char * const backends[] = { | ||||||
| #if IS_ENABLED(CONFIG_CRYPTO_LZ4) | #if IS_ENABLED(CONFIG_CRYPTO_LZ4) | ||||||
| 	"lz4", | 	"lz4", | ||||||
| #endif | #endif | ||||||
| #if IS_ENABLED(CONFIG_CRYPTO_DEFLATE) |  | ||||||
| 	"deflate", |  | ||||||
| #endif |  | ||||||
| #if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) | #if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) | ||||||
| 	"lz4hc", | 	"lz4hc", | ||||||
| #endif | #endif | ||||||
| #if IS_ENABLED(CONFIG_CRYPTO_842) | #if IS_ENABLED(CONFIG_CRYPTO_842) | ||||||
| 	"842", | 	"842", | ||||||
|  | #endif | ||||||
|  | #if IS_ENABLED(CONFIG_CRYPTO_ZSTD) | ||||||
|  | 	"zstd", | ||||||
| #endif | #endif | ||||||
| 	NULL | 	NULL | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -122,14 +122,6 @@ static inline bool is_partial_io(struct bio_vec *bvec) | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| static void zram_revalidate_disk(struct zram *zram) |  | ||||||
| { |  | ||||||
| 	revalidate_disk(zram->disk); |  | ||||||
| 	/* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ |  | ||||||
| 	zram->disk->queue->backing_dev_info->capabilities |= |  | ||||||
| 		BDI_CAP_STABLE_WRITES; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*
 | /*
 | ||||||
|  * Check if request is within bounds and aligned on zram logical blocks. |  * Check if request is within bounds and aligned on zram logical blocks. | ||||||
|  */ |  */ | ||||||
|  | @ -436,7 +428,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) | ||||||
| 	WARN_ON_ONCE(!was_set); | 	WARN_ON_ONCE(!was_set); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void zram_page_end_io(struct bio *bio) | static void zram_page_end_io(struct bio *bio) | ||||||
| { | { | ||||||
| 	struct page *page = bio->bi_io_vec[0].bv_page; | 	struct page *page = bio->bi_io_vec[0].bv_page; | ||||||
| 
 | 
 | ||||||
|  | @ -1373,7 +1365,8 @@ static ssize_t disksize_store(struct device *dev, | ||||||
| 	zram->comp = comp; | 	zram->comp = comp; | ||||||
| 	zram->disksize = disksize; | 	zram->disksize = disksize; | ||||||
| 	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); | 	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); | ||||||
| 	zram_revalidate_disk(zram); | 
 | ||||||
|  | 	revalidate_disk(zram->disk); | ||||||
| 	up_write(&zram->init_lock); | 	up_write(&zram->init_lock); | ||||||
| 
 | 
 | ||||||
| 	return len; | 	return len; | ||||||
|  | @ -1420,7 +1413,7 @@ static ssize_t reset_store(struct device *dev, | ||||||
| 	/* Make sure all the pending I/O are finished */ | 	/* Make sure all the pending I/O are finished */ | ||||||
| 	fsync_bdev(bdev); | 	fsync_bdev(bdev); | ||||||
| 	zram_reset_device(zram); | 	zram_reset_device(zram); | ||||||
| 	zram_revalidate_disk(zram); | 	revalidate_disk(zram->disk); | ||||||
| 	bdput(bdev); | 	bdput(bdev); | ||||||
| 
 | 
 | ||||||
| 	mutex_lock(&bdev->bd_mutex); | 	mutex_lock(&bdev->bd_mutex); | ||||||
|  | @ -1539,6 +1532,7 @@ static int zram_add(void) | ||||||
| 	/* zram devices sort of resembles non-rotational disks */ | 	/* zram devices sort of resembles non-rotational disks */ | ||||||
| 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); | 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); | ||||||
| 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); | 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * To ensure that we always get PAGE_SIZE aligned | 	 * To ensure that we always get PAGE_SIZE aligned | ||||||
| 	 * and n*PAGE_SIZED sized I/O requests. | 	 * and n*PAGE_SIZED sized I/O requests. | ||||||
|  | @ -1563,6 +1557,8 @@ static int zram_add(void) | ||||||
| 	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) | 	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) | ||||||
| 		blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); | 		blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); | ||||||
| 
 | 
 | ||||||
|  | 	zram->disk->queue->backing_dev_info->capabilities |= | ||||||
|  | 			(BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO); | ||||||
| 	add_disk(zram->disk); | 	add_disk(zram->disk); | ||||||
| 
 | 
 | ||||||
| 	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, | 	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, | ||||||
|  |  | ||||||
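The brd, zram, pmem, and btt hunks in this series all set BDI_CAP_SYNCHRONOUS_IO on the disk's backing_dev_info, marking these RAM-backed devices as synchronous so the MM side can take faster paths (e.g. skipping the swap cache on swap-in). A minimal sketch of that step, assuming a fully set-up gendisk with a valid queue, is:

	#include <linux/blkdev.h>
	#include <linux/genhd.h>
	#include <linux/backing-dev.h>

	/*
	 * Sketch (hypothetical helper; disk assumed to have a valid queue):
	 * flag a RAM-backed disk as capable of synchronous I/O.
	 */
	static void mark_disk_synchronous(struct gendisk *disk)
	{
		disk->queue->backing_dev_info->capabilities |=
				BDI_CAP_SYNCHRONOUS_IO;
	}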
|  | @ -259,7 +259,6 @@ | ||||||
| #include <linux/cryptohash.h> | #include <linux/cryptohash.h> | ||||||
| #include <linux/fips.h> | #include <linux/fips.h> | ||||||
| #include <linux/ptrace.h> | #include <linux/ptrace.h> | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/workqueue.h> | #include <linux/workqueue.h> | ||||||
| #include <linux/irq.h> | #include <linux/irq.h> | ||||||
| #include <linux/syscalls.h> | #include <linux/syscalls.h> | ||||||
|  |  | ||||||
|  | @ -553,8 +553,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, | ||||||
| 				 * invalidated it. Free it and try again | 				 * invalidated it. Free it and try again | ||||||
| 				 */ | 				 */ | ||||||
| 				release_pages(e->user_pages, | 				release_pages(e->user_pages, | ||||||
| 					      e->robj->tbo.ttm->num_pages, | 					      e->robj->tbo.ttm->num_pages); | ||||||
| 					      false); |  | ||||||
| 				kvfree(e->user_pages); | 				kvfree(e->user_pages); | ||||||
| 				e->user_pages = NULL; | 				e->user_pages = NULL; | ||||||
| 			} | 			} | ||||||
|  | @ -691,8 +690,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, | ||||||
| 				continue; | 				continue; | ||||||
| 
 | 
 | ||||||
| 			release_pages(e->user_pages, | 			release_pages(e->user_pages, | ||||||
| 				      e->robj->tbo.ttm->num_pages, | 				      e->robj->tbo.ttm->num_pages); | ||||||
| 				      false); |  | ||||||
| 			kvfree(e->user_pages); | 			kvfree(e->user_pages); | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | @ -347,7 +347,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, | ||||||
| 	return 0; | 	return 0; | ||||||
| 
 | 
 | ||||||
| free_pages: | free_pages: | ||||||
| 	release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages, false); | 	release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages); | ||||||
| 
 | 
 | ||||||
| unlock_mmap_sem: | unlock_mmap_sem: | ||||||
| 	up_read(¤t->mm->mmap_sem); | 	up_read(¤t->mm->mmap_sem); | ||||||
|  |  | ||||||
|  | @ -659,7 +659,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) | ||||||
| 	return 0; | 	return 0; | ||||||
| 
 | 
 | ||||||
| release_pages: | release_pages: | ||||||
| 	release_pages(pages, pinned, 0); | 	release_pages(pages, pinned); | ||||||
| 	return r; | 	return r; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -779,7 +779,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( | ||||||
| 	up_read(&mm->mmap_sem); | 	up_read(&mm->mmap_sem); | ||||||
| 
 | 
 | ||||||
| 	if (ret < 0) { | 	if (ret < 0) { | ||||||
| 		release_pages(pvec, pinned, 0); | 		release_pages(pvec, pinned); | ||||||
| 		kvfree(pvec); | 		kvfree(pvec); | ||||||
| 		return ERR_PTR(ret); | 		return ERR_PTR(ret); | ||||||
| 	} | 	} | ||||||
|  | @ -852,7 +852,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	release_pages(pvec, pinned, 0); | 	release_pages(pvec, pinned); | ||||||
| 	kvfree(pvec); | 	kvfree(pvec); | ||||||
| 
 | 
 | ||||||
| 	work = kmalloc(sizeof(*work), GFP_KERNEL); | 	work = kmalloc(sizeof(*work), GFP_KERNEL); | ||||||
|  | @ -886,7 +886,7 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj) | ||||||
| 	if (etnaviv_obj->pages) { | 	if (etnaviv_obj->pages) { | ||||||
| 		int npages = etnaviv_obj->base.size >> PAGE_SHIFT; | 		int npages = etnaviv_obj->base.size >> PAGE_SHIFT; | ||||||
| 
 | 
 | ||||||
| 		release_pages(etnaviv_obj->pages, npages, 0); | 		release_pages(etnaviv_obj->pages, npages); | ||||||
| 		kvfree(etnaviv_obj->pages); | 		kvfree(etnaviv_obj->pages); | ||||||
| 	} | 	} | ||||||
| 	put_task_struct(etnaviv_obj->userptr.task); | 	put_task_struct(etnaviv_obj->userptr.task); | ||||||
|  |  | ||||||
|  | @ -1859,7 +1859,7 @@ static void i915_address_space_init(struct i915_address_space *vm, | ||||||
| 	INIT_LIST_HEAD(&vm->unbound_list); | 	INIT_LIST_HEAD(&vm->unbound_list); | ||||||
| 
 | 
 | ||||||
| 	list_add_tail(&vm->global_link, &dev_priv->vm_list); | 	list_add_tail(&vm->global_link, &dev_priv->vm_list); | ||||||
| 	pagevec_init(&vm->free_pages, false); | 	pagevec_init(&vm->free_pages); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void i915_address_space_fini(struct i915_address_space *vm) | static void i915_address_space_fini(struct i915_address_space *vm) | ||||||
|  |  | ||||||
|  | @ -554,7 +554,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) | ||||||
| 	} | 	} | ||||||
| 	mutex_unlock(&obj->mm.lock); | 	mutex_unlock(&obj->mm.lock); | ||||||
| 
 | 
 | ||||||
| 	release_pages(pvec, pinned, 0); | 	release_pages(pvec, pinned); | ||||||
| 	kvfree(pvec); | 	kvfree(pvec); | ||||||
| 
 | 
 | ||||||
| 	i915_gem_object_put(obj); | 	i915_gem_object_put(obj); | ||||||
|  | @ -668,7 +668,7 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) | ||||||
| 		__i915_gem_userptr_set_active(obj, true); | 		__i915_gem_userptr_set_active(obj, true); | ||||||
| 
 | 
 | ||||||
| 	if (IS_ERR(pages)) | 	if (IS_ERR(pages)) | ||||||
| 		release_pages(pvec, pinned, 0); | 		release_pages(pvec, pinned); | ||||||
| 	kvfree(pvec); | 	kvfree(pvec); | ||||||
| 
 | 
 | ||||||
| 	return pages; | 	return pages; | ||||||
|  |  | ||||||
|  | @ -597,7 +597,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) | ||||||
| 	kfree(ttm->sg); | 	kfree(ttm->sg); | ||||||
| 
 | 
 | ||||||
| release_pages: | release_pages: | ||||||
| 	release_pages(ttm->pages, pinned, 0); | 	release_pages(ttm->pages, pinned); | ||||||
| 	return r; | 	return r; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
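The amdgpu, etnaviv, i915, and radeon hunks above all drop the trailing "cold" argument from release_pages(); the call now takes only the page array and the count. A minimal sketch of the post-conversion error path, with hypothetical names (the exact header providing release_pages() is assumed), is:

	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/*
	 * Sketch (hypothetical helper): drop a partially pinned set of user
	 * pages on an error path. Previously this was
	 * release_pages(pages, pinned, 0); the cold hint no longer exists.
	 */
	static void put_pinned_pages(struct page **pages, int pinned)
	{
		release_pages(pages, pinned);
	}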
|  | @ -1667,7 +1667,8 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) | ||||||
| 	} | 	} | ||||||
| 	if (!rcd->rcvegrbuf_phys) { | 	if (!rcd->rcvegrbuf_phys) { | ||||||
| 		rcd->rcvegrbuf_phys = | 		rcd->rcvegrbuf_phys = | ||||||
| 			kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), | 			kmalloc_array_node(chunk, | ||||||
|  | 					   sizeof(rcd->rcvegrbuf_phys[0]), | ||||||
| 					   GFP_KERNEL, rcd->node_id); | 					   GFP_KERNEL, rcd->node_id); | ||||||
| 		if (!rcd->rcvegrbuf_phys) | 		if (!rcd->rcvegrbuf_phys) | ||||||
| 			goto bail_rcvegrbuf; | 			goto bail_rcvegrbuf; | ||||||
|  |  | ||||||
|  | @ -238,7 +238,7 @@ int rvt_driver_qp_init(struct rvt_dev_info *rdi) | ||||||
| 	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size; | 	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size; | ||||||
| 	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size); | 	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size); | ||||||
| 	rdi->qp_dev->qp_table = | 	rdi->qp_dev->qp_table = | ||||||
| 		kmalloc_node(rdi->qp_dev->qp_table_size * | 		kmalloc_array_node(rdi->qp_dev->qp_table_size, | ||||||
| 			     sizeof(*rdi->qp_dev->qp_table), | 			     sizeof(*rdi->qp_dev->qp_table), | ||||||
| 			     GFP_KERNEL, rdi->dparms.node); | 			     GFP_KERNEL, rdi->dparms.node); | ||||||
| 	if (!rdi->qp_dev->qp_table) | 	if (!rdi->qp_dev->qp_table) | ||||||
|  |  | ||||||
|  | @ -15,7 +15,6 @@ | ||||||
| #include <linux/errno.h> | #include <linux/errno.h> | ||||||
| #include <linux/err.h> | #include <linux/err.h> | ||||||
| #include <linux/kernel.h> | #include <linux/kernel.h> | ||||||
| #include <linux/kmemcheck.h> |  | ||||||
| #include <linux/ctype.h> | #include <linux/ctype.h> | ||||||
| #include <linux/delay.h> | #include <linux/delay.h> | ||||||
| #include <linux/idr.h> | #include <linux/idr.h> | ||||||
|  | @ -904,7 +903,6 @@ struct c2port_device *c2port_device_register(char *name, | ||||||
| 		return ERR_PTR(-EINVAL); | 		return ERR_PTR(-EINVAL); | ||||||
| 
 | 
 | ||||||
| 	c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); | 	c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); | ||||||
| 	kmemcheck_annotate_bitfield(c2dev, flags); |  | ||||||
| 	if (unlikely(!c2dev)) | 	if (unlikely(!c2dev)) | ||||||
| 		return ERR_PTR(-ENOMEM); | 		return ERR_PTR(-ENOMEM); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 		rc = ena_alloc_rx_page(rx_ring, rx_info, | 		rc = ena_alloc_rx_page(rx_ring, rx_info, | ||||||
| 				       __GFP_COLD | GFP_ATOMIC | __GFP_COMP); | 				       GFP_ATOMIC | __GFP_COMP); | ||||||
| 		if (unlikely(rc < 0)) { | 		if (unlikely(rc < 0)) { | ||||||
| 			netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, | 			netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, | ||||||
| 				   "failed to alloc buffer for rx queue %d\n", | 				   "failed to alloc buffer for rx queue %d\n", | ||||||
|  |  | ||||||
|  | @ -295,7 +295,7 @@ static int xgbe_alloc_pages(struct xgbe_prv_data *pdata, | ||||||
| 	order = alloc_order; | 	order = alloc_order; | ||||||
| 
 | 
 | ||||||
| 	/* Try to obtain pages, decreasing order if necessary */ | 	/* Try to obtain pages, decreasing order if necessary */ | ||||||
| 	gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN; | 	gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; | ||||||
| 	while (order >= 0) { | 	while (order >= 0) { | ||||||
| 		pages = alloc_pages_node(node, gfp, order); | 		pages = alloc_pages_node(node, gfp, order); | ||||||
| 		if (pages) | 		if (pages) | ||||||
|  |  | ||||||
|  | @ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self) | ||||||
| 		buff->flags = 0U; | 		buff->flags = 0U; | ||||||
| 		buff->len = AQ_CFG_RX_FRAME_MAX; | 		buff->len = AQ_CFG_RX_FRAME_MAX; | ||||||
| 
 | 
 | ||||||
| 		buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD | | 		buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order); | ||||||
| 					 __GFP_COMP, pages_order); |  | ||||||
| 		if (!buff->page) { | 		if (!buff->page) { | ||||||
| 			err = -ENOMEM; | 			err = -ENOMEM; | ||||||
| 			goto err_exit; | 			goto err_exit; | ||||||
|  |  | ||||||
|  | @ -198,7 +198,7 @@ static inline void | ||||||
| 	struct sk_buff *skb; | 	struct sk_buff *skb; | ||||||
| 	struct octeon_skb_page_info *skb_pg_info; | 	struct octeon_skb_page_info *skb_pg_info; | ||||||
| 
 | 
 | ||||||
| 	page = alloc_page(GFP_ATOMIC | __GFP_COLD); | 	page = alloc_page(GFP_ATOMIC); | ||||||
| 	if (unlikely(!page)) | 	if (unlikely(!page)) | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) | ||||||
| 
 | 
 | ||||||
| 			if (mlx4_en_prepare_rx_desc(priv, ring, | 			if (mlx4_en_prepare_rx_desc(priv, ring, | ||||||
| 						    ring->actual_size, | 						    ring->actual_size, | ||||||
| 						    GFP_KERNEL | __GFP_COLD)) { | 						    GFP_KERNEL)) { | ||||||
| 				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { | 				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { | ||||||
| 					en_err(priv, "Failed to allocate enough rx buffers\n"); | 					en_err(priv, "Failed to allocate enough rx buffers\n"); | ||||||
| 					return -ENOMEM; | 					return -ENOMEM; | ||||||
|  | @ -551,8 +551,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, | ||||||
| 	do { | 	do { | ||||||
| 		if (mlx4_en_prepare_rx_desc(priv, ring, | 		if (mlx4_en_prepare_rx_desc(priv, ring, | ||||||
| 					    ring->prod & ring->size_mask, | 					    ring->prod & ring->size_mask, | ||||||
| 					    GFP_ATOMIC | __GFP_COLD | | 					    GFP_ATOMIC | __GFP_MEMALLOC)) | ||||||
| 					    __GFP_MEMALLOC)) |  | ||||||
| 			break; | 			break; | ||||||
| 		ring->prod++; | 		ring->prod++; | ||||||
| 	} while (likely(--missing)); | 	} while (likely(--missing)); | ||||||
|  |  | ||||||
|  | @ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) | ||||||
| 	} else { | 	} else { | ||||||
| 		struct page *page; | 		struct page *page; | ||||||
| 
 | 
 | ||||||
| 		page = alloc_page(GFP_KERNEL | __GFP_COLD); | 		page = alloc_page(GFP_KERNEL); | ||||||
| 		frag = page ? page_address(page) : NULL; | 		frag = page ? page_address(page) : NULL; | ||||||
| 	} | 	} | ||||||
| 	if (!frag) { | 	if (!frag) { | ||||||
|  |  | ||||||
|  | @ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring, | ||||||
| { | { | ||||||
| 	if (!rx_ring->pg_chunk.page) { | 	if (!rx_ring->pg_chunk.page) { | ||||||
| 		u64 map; | 		u64 map; | ||||||
| 		rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP | | 		rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC, | ||||||
| 						GFP_ATOMIC, |  | ||||||
| 						qdev->lbq_buf_order); | 						qdev->lbq_buf_order); | ||||||
| 		if (unlikely(!rx_ring->pg_chunk.page)) { | 		if (unlikely(!rx_ring->pg_chunk.page)) { | ||||||
| 			netif_err(qdev, drv, qdev->ndev, | 			netif_err(qdev, drv, qdev->ndev, | ||||||
|  |  | ||||||
|  | @ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic) | ||||||
| 	do { | 	do { | ||||||
| 		page = ef4_reuse_page(rx_queue); | 		page = ef4_reuse_page(rx_queue); | ||||||
| 		if (page == NULL) { | 		if (page == NULL) { | ||||||
| 			page = alloc_pages(__GFP_COLD | __GFP_COMP | | 			page = alloc_pages(__GFP_COMP | | ||||||
| 					   (atomic ? GFP_ATOMIC : GFP_KERNEL), | 					   (atomic ? GFP_ATOMIC : GFP_KERNEL), | ||||||
| 					   efx->rx_buffer_order); | 					   efx->rx_buffer_order); | ||||||
| 			if (unlikely(page == NULL)) | 			if (unlikely(page == NULL)) | ||||||
|  |  | ||||||
|  | @ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic) | ||||||
| 	do { | 	do { | ||||||
| 		page = efx_reuse_page(rx_queue); | 		page = efx_reuse_page(rx_queue); | ||||||
| 		if (page == NULL) { | 		if (page == NULL) { | ||||||
| 			page = alloc_pages(__GFP_COLD | __GFP_COMP | | 			page = alloc_pages(__GFP_COMP | | ||||||
| 					   (atomic ? GFP_ATOMIC : GFP_KERNEL), | 					   (atomic ? GFP_ATOMIC : GFP_KERNEL), | ||||||
| 					   efx->rx_buffer_order); | 					   efx->rx_buffer_order); | ||||||
| 			if (unlikely(page == NULL)) | 			if (unlikely(page == NULL)) | ||||||
|  |  | ||||||
|  | @ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata, | ||||||
| 	dma_addr_t pages_dma; | 	dma_addr_t pages_dma; | ||||||
| 
 | 
 | ||||||
| 	/* Try to obtain pages, decreasing order if necessary */ | 	/* Try to obtain pages, decreasing order if necessary */ | ||||||
| 	gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN; | 	gfp |= __GFP_COMP | __GFP_NOWARN; | ||||||
| 	while (order >= 0) { | 	while (order >= 0) { | ||||||
| 		pages = alloc_pages(gfp, order); | 		pages = alloc_pages(gfp, order); | ||||||
| 		if (pages) | 		if (pages) | ||||||
|  |  | ||||||
|  | @ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) | ||||||
| 		sw_data[0] = (u32)bufptr; | 		sw_data[0] = (u32)bufptr; | ||||||
| 	} else { | 	} else { | ||||||
| 		/* Allocate a secondary receive queue entry */ | 		/* Allocate a secondary receive queue entry */ | ||||||
| 		page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD); | 		page = alloc_page(GFP_ATOMIC | GFP_DMA); | ||||||
| 		if (unlikely(!page)) { | 		if (unlikely(!page)) { | ||||||
| 			dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n"); | 			dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n"); | ||||||
| 			goto fail; | 			goto fail; | ||||||
|  |  | ||||||
|  | @ -1030,7 +1030,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, | ||||||
| 	int err; | 	int err; | ||||||
| 	bool oom; | 	bool oom; | ||||||
| 
 | 
 | ||||||
| 	gfp |= __GFP_COLD; |  | ||||||
| 	do { | 	do { | ||||||
| 		if (vi->mergeable_rx_bufs) | 		if (vi->mergeable_rx_bufs) | ||||||
| 			err = add_recvbuf_mergeable(vi, rq, gfp); | 			err = add_recvbuf_mergeable(vi, rq, gfp); | ||||||
|  |  | ||||||
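The network-driver hunks above simply delete __GFP_COLD from RX buffer allocations; the flag has no replacement, so the rest of the GFP mask is unchanged. For example (hypothetical helper):

	#include <linux/gfp.h>

	/*
	 * Sketch: an atomic RX page allocation after the __GFP_COLD removal.
	 * The cache-temperature hint is simply gone from the mask.
	 */
	static struct page *rx_alloc_page(void)
	{
		return alloc_page(GFP_ATOMIC | __GFP_NOWARN);
	}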
|  | @ -23,6 +23,7 @@ | ||||||
| #include <linux/ndctl.h> | #include <linux/ndctl.h> | ||||||
| #include <linux/fs.h> | #include <linux/fs.h> | ||||||
| #include <linux/nd.h> | #include <linux/nd.h> | ||||||
|  | #include <linux/backing-dev.h> | ||||||
| #include "btt.h" | #include "btt.h" | ||||||
| #include "nd.h" | #include "nd.h" | ||||||
| 
 | 
 | ||||||
|  | @ -1402,6 +1403,8 @@ static int btt_blk_init(struct btt *btt) | ||||||
| 	btt->btt_disk->private_data = btt; | 	btt->btt_disk->private_data = btt; | ||||||
| 	btt->btt_disk->queue = btt->btt_queue; | 	btt->btt_disk->queue = btt->btt_queue; | ||||||
| 	btt->btt_disk->flags = GENHD_FL_EXT_DEVT; | 	btt->btt_disk->flags = GENHD_FL_EXT_DEVT; | ||||||
|  | 	btt->btt_disk->queue->backing_dev_info->capabilities |= | ||||||
|  | 			BDI_CAP_SYNCHRONOUS_IO; | ||||||
| 
 | 
 | ||||||
| 	blk_queue_make_request(btt->btt_queue, btt_make_request); | 	blk_queue_make_request(btt->btt_queue, btt_make_request); | ||||||
| 	blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); | 	blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); | ||||||
|  |  | ||||||
|  | @ -31,6 +31,7 @@ | ||||||
| #include <linux/uio.h> | #include <linux/uio.h> | ||||||
| #include <linux/dax.h> | #include <linux/dax.h> | ||||||
| #include <linux/nd.h> | #include <linux/nd.h> | ||||||
|  | #include <linux/backing-dev.h> | ||||||
| #include "pmem.h" | #include "pmem.h" | ||||||
| #include "pfn.h" | #include "pfn.h" | ||||||
| #include "nd.h" | #include "nd.h" | ||||||
|  | @ -394,6 +395,7 @@ static int pmem_attach_disk(struct device *dev, | ||||||
| 	disk->fops		= &pmem_fops; | 	disk->fops		= &pmem_fops; | ||||||
| 	disk->queue		= q; | 	disk->queue		= q; | ||||||
| 	disk->flags		= GENHD_FL_EXT_DEVT; | 	disk->flags		= GENHD_FL_EXT_DEVT; | ||||||
|  | 	disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; | ||||||
| 	nvdimm_namespace_disk_name(ndns, disk->disk_name); | 	nvdimm_namespace_disk_name(ndns, disk->disk_name); | ||||||
| 	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) | 	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) | ||||||
| 			/ 512); | 			/ 512); | ||||||
|  |  | ||||||
|  | @ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0) | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	for (npages = 1; npages < max_pages; npages++) { | 	for (npages = 1; npages < max_pages; npages++) { | ||||||
| 		page = page_cache_alloc_cold(inode->i_mapping); | 		page = page_cache_alloc(inode->i_mapping); | ||||||
| 		if (!page) | 		if (!page) | ||||||
| 			break; | 			break; | ||||||
| 		page_pool[npages] = page; | 		page_pool[npages] = page; | ||||||
|  |  | ||||||
|  | @ -308,7 +308,7 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, | ||||||
| 	_enter("{%x:%u},%lx-%lx", | 	_enter("{%x:%u},%lx-%lx", | ||||||
| 	       vnode->fid.vid, vnode->fid.vnode, first, last); | 	       vnode->fid.vid, vnode->fid.vnode, first, last); | ||||||
| 
 | 
 | ||||||
| 	pagevec_init(&pv, 0); | 	pagevec_init(&pv); | ||||||
| 
 | 
 | ||||||
| 	do { | 	do { | ||||||
| 		_debug("kill %lx-%lx", first, last); | 		_debug("kill %lx-%lx", first, last); | ||||||
|  | @ -497,20 +497,13 @@ static int afs_writepages_region(struct address_space *mapping, | ||||||
| 	_enter(",,%lx,%lx,", index, end); | 	_enter(",,%lx,%lx,", index, end); | ||||||
| 
 | 
 | ||||||
| 	do { | 	do { | ||||||
| 		n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, | 		n = find_get_pages_range_tag(mapping, &index, end, | ||||||
| 				       1, &page); | 					PAGECACHE_TAG_DIRTY, 1, &page); | ||||||
| 		if (!n) | 		if (!n) | ||||||
| 			break; | 			break; | ||||||
| 
 | 
 | ||||||
| 		_debug("wback %lx", page->index); | 		_debug("wback %lx", page->index); | ||||||
| 
 | 
 | ||||||
| 		if (page->index > end) { |  | ||||||
| 			*_next = index; |  | ||||||
| 			put_page(page); |  | ||||||
| 			_leave(" = 0 [%lx]", *_next); |  | ||||||
| 			return 0; |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		/* at this point we hold neither mapping->tree_lock nor lock on
 | 		/* at this point we hold neither mapping->tree_lock nor lock on
 | ||||||
| 		 * the page itself: the page may be truncated or invalidated | 		 * the page itself: the page may be truncated or invalidated | ||||||
| 		 * (changing page->mapping to NULL), or even swizzled back from | 		 * (changing page->mapping to NULL), or even swizzled back from | ||||||
|  | @ -609,7 +602,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) | ||||||
| 
 | 
 | ||||||
| 	ASSERT(wb != NULL); | 	ASSERT(wb != NULL); | ||||||
| 
 | 
 | ||||||
| 	pagevec_init(&pv, 0); | 	pagevec_init(&pv); | ||||||
| 
 | 
 | ||||||
| 	do { | 	do { | ||||||
| 		_debug("done %lx-%lx", first, last); | 		_debug("done %lx-%lx", first, last); | ||||||
|  |  | ||||||
|  | @ -3797,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping, | ||||||
| 	int scanned = 0; | 	int scanned = 0; | ||||||
| 	int tag; | 	int tag; | ||||||
| 
 | 
 | ||||||
| 	pagevec_init(&pvec, 0); | 	pagevec_init(&pvec); | ||||||
| 	if (wbc->range_cyclic) { | 	if (wbc->range_cyclic) { | ||||||
| 		index = mapping->writeback_index; /* Start from prev offset */ | 		index = mapping->writeback_index; /* Start from prev offset */ | ||||||
| 		end = -1; | 		end = -1; | ||||||
|  | @ -3814,8 +3814,8 @@ int btree_write_cache_pages(struct address_space *mapping, | ||||||
| 	if (wbc->sync_mode == WB_SYNC_ALL) | 	if (wbc->sync_mode == WB_SYNC_ALL) | ||||||
| 		tag_pages_for_writeback(mapping, index, end); | 		tag_pages_for_writeback(mapping, index, end); | ||||||
| 	while (!done && !nr_to_write_done && (index <= end) && | 	while (!done && !nr_to_write_done && (index <= end) && | ||||||
| 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 	       (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, | ||||||
| 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | 			tag))) { | ||||||
| 		unsigned i; | 		unsigned i; | ||||||
| 
 | 
 | ||||||
| 		scanned = 1; | 		scanned = 1; | ||||||
|  | @ -3825,11 +3825,6 @@ int btree_write_cache_pages(struct address_space *mapping, | ||||||
| 			if (!PagePrivate(page)) | 			if (!PagePrivate(page)) | ||||||
| 				continue; | 				continue; | ||||||
| 
 | 
 | ||||||
| 			if (!wbc->range_cyclic && page->index > end) { |  | ||||||
| 				done = 1; |  | ||||||
| 				break; |  | ||||||
| 			} |  | ||||||
| 
 |  | ||||||
| 			spin_lock(&mapping->private_lock); | 			spin_lock(&mapping->private_lock); | ||||||
| 			if (!PagePrivate(page)) { | 			if (!PagePrivate(page)) { | ||||||
| 				spin_unlock(&mapping->private_lock); | 				spin_unlock(&mapping->private_lock); | ||||||
|  | @ -3941,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping, | ||||||
| 	if (!igrab(inode)) | 	if (!igrab(inode)) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| 	pagevec_init(&pvec, 0); | 	pagevec_init(&pvec); | ||||||
| 	if (wbc->range_cyclic) { | 	if (wbc->range_cyclic) { | ||||||
| 		index = mapping->writeback_index; /* Start from prev offset */ | 		index = mapping->writeback_index; /* Start from prev offset */ | ||||||
| 		end = -1; | 		end = -1; | ||||||
|  | @ -3961,8 +3956,8 @@ static int extent_write_cache_pages(struct address_space *mapping, | ||||||
| 		tag_pages_for_writeback(mapping, index, end); | 		tag_pages_for_writeback(mapping, index, end); | ||||||
| 	done_index = index; | 	done_index = index; | ||||||
| 	while (!done && !nr_to_write_done && (index <= end) && | 	while (!done && !nr_to_write_done && (index <= end) && | ||||||
| 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, | 			(nr_pages = pagevec_lookup_range_tag(&pvec, mapping, | ||||||
| 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { | 						&index, end, tag))) { | ||||||
| 		unsigned i; | 		unsigned i; | ||||||
| 
 | 
 | ||||||
| 		scanned = 1; | 		scanned = 1; | ||||||
|  | @ -3987,12 +3982,6 @@ static int extent_write_cache_pages(struct address_space *mapping, | ||||||
| 				continue; | 				continue; | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			if (!wbc->range_cyclic && page->index > end) { |  | ||||||
| 				done = 1; |  | ||||||
| 				unlock_page(page); |  | ||||||
| 				continue; |  | ||||||
| 			} |  | ||||||
| 
 |  | ||||||
| 			if (wbc->sync_mode != WB_SYNC_NONE) { | 			if (wbc->sync_mode != WB_SYNC_NONE) { | ||||||
| 				if (PageWriteback(page)) | 				if (PageWriteback(page)) | ||||||
| 					flush_fn(data); | 					flush_fn(data); | ||||||
|  |  | ||||||
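The afs and btrfs hunks above switch to the range-aware tagged lookups and the argument-free pagevec_init(), which is why the per-page "page->index > end" checks could be deleted. A minimal sketch of the post-conversion loop, with hypothetical helper and caller-supplied mapping/end values, is:

	#include <linux/pagevec.h>
	#include <linux/pagemap.h>
	#include <linux/sched.h>

	/*
	 * Sketch (hypothetical helper): pagevec_init() no longer takes a
	 * "cold" flag, and pagevec_lookup_range_tag() stops at "end", so
	 * callers no longer compare page->index against end themselves.
	 */
	static void walk_dirty_range(struct address_space *mapping,
				     pgoff_t index, pgoff_t end)
	{
		struct pagevec pvec;
		unsigned int nr;

		pagevec_init(&pvec);
		while ((nr = pagevec_lookup_range_tag(&pvec, mapping, &index,
						      end, PAGECACHE_TAG_DIRTY))) {
			unsigned int i;

			for (i = 0; i < nr; i++) {
				/* lock, write back and unlock pvec.pages[i] here */
			}
			pagevec_release(&pvec);
			cond_resched();
		}
	}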
|  | @ -1592,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) | ||||||
| 	struct buffer_head *head; | 	struct buffer_head *head; | ||||||
| 
 | 
 | ||||||
| 	end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); | 	end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); | ||||||
| 	pagevec_init(&pvec, 0); | 	pagevec_init(&pvec); | ||||||
| 	while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { | 	while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { | ||||||
| 		count = pagevec_count(&pvec); | 		count = pagevec_count(&pvec); | ||||||
| 		for (i = 0; i < count; i++) { | 		for (i = 0; i < count; i++) { | ||||||
|  | @ -3514,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, | ||||||
| 	if (length <= 0) | 	if (length <= 0) | ||||||
| 		return -ENOENT; | 		return -ENOENT; | ||||||
| 
 | 
 | ||||||
| 	pagevec_init(&pvec, 0); | 	pagevec_init(&pvec); | ||||||
| 
 | 
 | ||||||
| 	do { | 	do { | ||||||
| 		unsigned nr_pages, i; | 		unsigned nr_pages, i; | ||||||
|  |  | ||||||
Some files were not shown because too many files have changed in this diff.