Mirror of https://github.com/torvalds/linux.git, synced 2025-11-04 02:30:34 +02:00
This commit is contained in:

commit 2d338201d5

149 changed files with 5357 additions and 946 deletions

Documentation/admin-guide/mm/damon/index.rst (new file, 15 lines)

@@ -0,0 +1,15 @@
.. SPDX-License-Identifier: GPL-2.0

========================
Monitoring Data Accesses
========================

:doc:`DAMON </vm/damon/index>` allows light-weight data access monitoring.
Using DAMON, users can analyze the memory access patterns of their systems and
optimize them.

.. toctree::
   :maxdepth: 2

   start
   usage

Documentation/admin-guide/mm/damon/start.rst (new file, 114 lines)

@@ -0,0 +1,114 @@
.. SPDX-License-Identifier: GPL-2.0

===============
Getting Started
===============

This document briefly describes how you can use DAMON by demonstrating its
default user space tool.  Please note that this document describes only a part
of its features for brevity.  Please refer to :doc:`usage` for more details.


TL; DR
======

Follow the commands below to monitor and visualize the memory access pattern of
your workload. ::

    # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot
    # mount -t debugfs none /sys/kernel/debug/
    # git clone https://github.com/awslabs/damo
    # ./damo/damo record $(pidof <your workload>)
    # ./damo/damo report heat --plot_ascii

The final command draws the access heatmap of ``<your workload>``.  The heatmap
shows which memory region (x-axis) is accessed when (y-axis) and how frequently
(number; the higher the more accesses have been observed). ::

    111111111111111111111111111111111111111111111111111111110000
    111121111111111111111111111111211111111111111111111111110000
    000000000000000000000000000000000000000000000000001555552000
    000000000000000000000000000000000000000000000222223555552000
    000000000000000000000000000000000000000011111677775000000000
    000000000000000000000000000000000000000488888000000000000000
    000000000000000000000000000000000177888400000000000000000000
    000000000000000000000000000046666522222100000000000000000000
    000000000000000000000014444344444300000000000000000000000000
    000000000000000002222245555510000000000000000000000000000000
    # access_frequency:  0  1  2  3  4  5  6  7  8  9
    # x-axis: space (140286319947776-140286426374096: 101.496 MiB)
    # y-axis: time (605442256436361-605479951866441: 37.695430s)
    # resolution: 60x10 (1.692 MiB and 3.770s for each character)

Prerequisites
=============

Kernel
------

You should first ensure your system is running on a kernel built with
``CONFIG_DAMON_*=y``.


User Space Tool
---------------

For the demonstration, we will use the default user space tool for DAMON,
called DAMON Operator (DAMO).  It is available at
https://github.com/awslabs/damo.  The examples below assume that ``damo`` is on
your ``$PATH``.  It's not mandatory, though.

Because DAMO is using the debugfs interface (refer to :doc:`usage` for the
detail) of DAMON, you should ensure debugfs is mounted.  Mount it manually as
below::

    # mount -t debugfs none /sys/kernel/debug/

or append the following line to your ``/etc/fstab`` file so that your system
can automatically mount debugfs upon booting::

    debugfs /sys/kernel/debug debugfs defaults 0 0

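If you script this step, note that mounting fails when debugfs is already
mounted.  A minimal sketch that mounts it only when needed (``mountpoint`` is
provided by util-linux; the mount point is the standard one used above)::

    # mount debugfs only if it is not mounted yet
    mountpoint -q /sys/kernel/debug || mount -t debugfs none /sys/kernel/debug/
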
Recording Data Access Patterns
==============================

The commands below record the memory access patterns of a program and save the
monitoring results to a file. ::

    $ git clone https://github.com/sjp38/masim
    $ cd masim; make; ./masim ./configs/zigzag.cfg &
    $ sudo damo record -o damon.data $(pidof masim)

The first two lines of the commands download an artificial memory access
generator program and run it in the background.  The generator will repeatedly
access two 100 MiB sized memory regions one by one.  You can substitute this
with your real workload.  The last line asks ``damo`` to record the access
pattern in the ``damon.data`` file.

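For a real workload, the same flow can be wrapped in a small script.  This is
only a sketch under this document's assumptions (``damo`` on your ``$PATH``,
root access for debugfs); ``./my_workload`` is a placeholder for your program::

    #!/bin/sh
    # start the workload in the background, then record it until it exits;
    # DAMON stops by itself once every target process has terminated
    ./my_workload &
    sudo damo record -o damon.data $!
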
Visualizing Recorded Patterns
=============================

The following three commands visualize the recorded access patterns and save
the results as separate image files. ::

    $ damo report heats --heatmap access_pattern_heatmap.png
    $ damo report wss --range 0 101 1 --plot wss_dist.png
    $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png

- ``access_pattern_heatmap.png`` will visualize the data access pattern in a
  heatmap, showing which memory region (y-axis) got accessed when (x-axis)
  and how frequently (color).
- ``wss_dist.png`` will show the distribution of the working set size.
- ``wss_chron_change.png`` will show how the working set size has
  chronologically changed.

You can view the visualizations of this example workload at [1]_.
Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_.

.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns
.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html
.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html
.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html
							
								
								
									
										112
									
								
								Documentation/admin-guide/mm/damon/usage.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										112
									
								
								Documentation/admin-guide/mm/damon/usage.rst
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,112 @@
.. SPDX-License-Identifier: GPL-2.0

===============
Detailed Usages
===============

DAMON provides the three interfaces below for different users.

- *DAMON user space tool.*
  This is for privileged people such as system administrators who want a
  just-working human-friendly interface.  Using this, users can use DAMON's
  major features in a human-friendly way.  It may not be highly tuned for
  special cases, though.  It supports only virtual address spaces monitoring.
- *debugfs interface.*
  This is for privileged user space programmers who want more optimized use of
  DAMON.  Using this, users can use DAMON's major features by reading
  from and writing to special debugfs files.  Therefore, you can write and use
  your personalized DAMON debugfs wrapper programs that read/write the
  debugfs files for you.  The DAMON user space tool is also a reference
  implementation of such programs.  It supports only virtual address spaces
  monitoring.
- *Kernel Space Programming Interface.*
  This is for kernel space programmers.  Using this, users can utilize every
  feature of DAMON most flexibly and efficiently by writing kernel space
  DAMON application programs.  You can even extend DAMON for various
  address spaces.

Nevertheless, you could write your own user space tool using the debugfs
interface.  A reference implementation is available at
https://github.com/awslabs/damo.  If you are a kernel programmer, you could
refer to :doc:`/vm/damon/api` for the kernel space programming interface.  For
that reason, this document describes only the debugfs interface.

debugfs Interface
=================

DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under
its debugfs directory, ``<debugfs>/damon/``.


Attributes
----------

Users can get and set the ``sampling interval``, ``aggregation interval``,
``regions update interval``, and min/max number of monitoring target regions by
reading from and writing to the ``attrs`` file.  To know about the monitoring
attributes in detail, please refer to the :doc:`/vm/damon/design`.  For
example, the commands below set those values to 5 ms, 100 ms, 1,000 ms, 10 and
1000, and then check them again::

    # cd <debugfs>/damon
    # echo 5000 100000 1000000 10 1000 > attrs
    # cat attrs
    5000 100000 1000000 10 1000

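As the example shows, the three interval values are written in microseconds.
A helper that takes milliseconds instead could look like the sketch below (the
function name is made up for illustration, and debugfs is assumed to be
mounted at ``/sys/kernel/debug``)::

    # usage: damon_set_attrs <sampling_ms> <aggregation_ms> <update_ms> <min> <max>
    damon_set_attrs() {
        echo "$(($1 * 1000)) $(($2 * 1000)) $(($3 * 1000)) $4 $5" \
            > /sys/kernel/debug/damon/attrs
    }
    damon_set_attrs 5 100 1000 10 1000    # same values as the example above
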
Target IDs
----------

Some types of address spaces support multiple monitoring targets.  For example,
the virtual memory address spaces monitoring can have multiple processes as the
monitoring targets.  Users can set the targets by writing relevant id values of
the targets to, and get the ids of the current targets by reading from, the
``target_ids`` file.  In case of the virtual address spaces monitoring, the
values should be pids of the monitoring target processes.  For example, the
commands below set processes having pids 42 and 4242 as the monitoring targets
and check the result::

    # cd <debugfs>/damon
    # echo 42 4242 > target_ids
    # cat target_ids
    42 4242

Note that setting the target ids doesn't start the monitoring.

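Because the file takes a space-separated pid list, every process of a given
name can be registered with a single write.  A sketch, again assuming debugfs
at ``/sys/kernel/debug`` and a placeholder workload name::

    # register all processes named "my_workload" as monitoring targets
    echo $(pidof my_workload) > /sys/kernel/debug/damon/target_ids
    cat /sys/kernel/debug/damon/target_ids
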
Turning On/Off
--------------

Setting the files as described above doesn't take effect unless you explicitly
start the monitoring.  You can start, stop, and check the current status of the
monitoring by writing to and reading from the ``monitor_on`` file.  Writing
``on`` to the file starts the monitoring of the targets with the attributes.
Writing ``off`` to the file stops the monitoring.  DAMON also stops if every
target process is terminated.  The example commands below turn DAMON on, turn
it off, and check its status::

    # cd <debugfs>/damon
    # echo on > monitor_on
    # echo off > monitor_on
    # cat monitor_on
    off

Please note that you cannot write to the above-mentioned debugfs files while
the monitoring is turned on.  If you write to the files while DAMON is running,
an error code such as ``-EBUSY`` will be returned.

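The three files above are all that a personalized wrapper program, as
mentioned in the interface overview, needs in order to drive DAMON.  A minimal
sketch that reconfigures and restarts the monitoring while respecting the
``-EBUSY`` rule (stop first, then write; debugfs is assumed to be mounted at
``/sys/kernel/debug``, and the target pid is passed as the first argument)::

    #!/bin/sh
    # (re)start DAMON monitoring of one process with example attributes
    d=/sys/kernel/debug/damon
    [ "$(cat $d/monitor_on)" = "on" ] && echo off > $d/monitor_on  # avoid -EBUSY
    echo 5000 100000 1000000 10 1000 > $d/attrs
    echo "$1" > $d/target_ids
    echo on > $d/monitor_on
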
Tracepoint for Monitoring Results
=================================

DAMON provides the monitoring results via a tracepoint,
``damon:damon_aggregated``.  While the monitoring is turned on, you could
record the tracepoint events and show results using tracepoint supporting tools
like ``perf``.  For example::

    # echo on > monitor_on
    # perf record -e damon:damon_aggregated &
    # sleep 5
    # kill -9 $(pidof perf)
    # echo off > monitor_on
    # perf script
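The same events can also be inspected without ``perf`` through the generic
tracefs interface.  A sketch, assuming the usual ``/sys/kernel/debug/tracing``
location (the event directory name follows from the ``damon:damon_aggregated``
tracepoint)::

    # enable the event, then stream it while the monitoring is turned on
    echo 1 > /sys/kernel/debug/tracing/events/damon/damon_aggregated/enable
    cat /sys/kernel/debug/tracing/trace_pipe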
Documentation/admin-guide/mm/index.rst

@@ -27,6 +27,7 @@ the Linux memory management.
 
    concepts
    cma_debugfs
+   damon/index
    hugetlbpage
    idle_page_tracking
    ksm
Documentation/admin-guide/mm/memory-hotplug.rst

@@ -1,140 +1,321 @@
 .. _admin_guide_memory_hotplug:
 
-==============
-Memory Hotplug
-==============
-
-:Created:							Jul 28 2007
-:Updated: Add some details about locking internals:		Aug 20 2018
-
-This document is about memory hotplug including how-to-use and current status.
-Because Memory Hotplug is still under development, contents of this text will
-be changed often.
+==================
+Memory Hot(Un)Plug
+==================
+
+This document describes generic Linux support for memory hot(un)plug with
+a focus on System RAM, including ZONE_MOVABLE support.
 
 .. contents:: :local:
 
-.. note::
-
-    (1) x86_64's has special implementation for memory hotplug.
-        This text does not describe it.
-    (2) This text assumes that sysfs is mounted at ``/sys``.
-
-
 Introduction
 ============
 
-Purpose of memory hotplug
--------------------------
-
-Memory Hotplug allows users to increase/decrease the amount of memory.
-Generally, there are two purposes.
-
-(A) For changing the amount of memory.
-    This is to allow a feature like capacity on demand.
-(B) For installing/removing DIMMs or NUMA-nodes physically.
-    This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc.
-
-(A) is required by highly virtualized environments and (B) is required by
-hardware which supports memory power management.
-
-Linux memory hotplug is designed for both purpose.
-
-Phases of memory hotplug
-------------------------
-
-There are 2 phases in Memory Hotplug:
-
-  1) Physical Memory Hotplug phase
-  2) Logical Memory Hotplug phase.
-
-The First phase is to communicate hardware/firmware and make/erase
-environment for hotplugged memory. Basically, this phase is necessary
-for the purpose (B), but this is good phase for communication between
-highly virtualized environments too.
-
-When memory is hotplugged, the kernel recognizes new memory, makes new memory
-management tables, and makes sysfs files for new memory's operation.
-
-If firmware supports notification of connection of new memory to OS,
-this phase is triggered automatically. ACPI can notify this event. If not,
-"probe" operation by system administration is used instead.
-(see :ref:`memory_hotplug_physical_mem`).
-
-Logical Memory Hotplug phase is to change memory state into
-available/unavailable for users. Amount of memory from user's view is
-changed by this phase. The kernel makes all memory in it as free pages
-when a memory range is available.
-
-In this document, this phase is described as online/offline.
-
-Logical Memory Hotplug phase is triggered by write of sysfs file by system
-administrator. For the hot-add case, it must be executed after Physical Hotplug
-phase by hand.
-(However, if you writes udev's hotplug scripts for memory hotplug, these
-phases can be execute in seamless way.)
-
-Unit of Memory online/offline operation
----------------------------------------
-
-Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
-into chunks of the same size. These chunks are called "sections". The size of
-a memory section is architecture dependent. For example, power uses 16MiB, ia64
-uses 1GiB.
+Memory hot(un)plug allows for increasing and decreasing the size of physical
+memory available to a machine at runtime. In the simplest case, it consists of
+physically plugging or unplugging a DIMM at runtime, coordinated with the
+operating system.
+
+Memory hot(un)plug is used for various purposes:
+
+- The physical memory available to a machine can be adjusted at runtime, up- or
+  downgrading the memory capacity. This dynamic memory resizing, sometimes
+  referred to as "capacity on demand", is frequently used with virtual machines
+  and logical partitions.
+
+- Replacing hardware, such as DIMMs or whole NUMA nodes, without downtime. One
+  example is replacing failing memory modules.
+
+- Reducing energy consumption either by physically unplugging memory modules or
+  by logically unplugging (parts of) memory modules from Linux.
+
+Further, the basic memory hot(un)plug infrastructure in Linux is nowadays also
+used to expose persistent memory, other performance-differentiated memory and
+reserved memory regions as ordinary system RAM to Linux.
+
+Linux only supports memory hot(un)plug on selected 64 bit architectures, such as
+x86_64, arm64, ppc64, s390x and ia64.
+
+Memory Hot(Un)Plug Granularity
+------------------------------
+
+Memory hot(un)plug in Linux uses the SPARSEMEM memory model, which divides the
+physical memory address space into chunks of the same size: memory sections. The
+size of a memory section is architecture dependent. For example, x86_64 uses
+128 MiB and ppc64 uses 16 MiB.
 
 Memory sections are combined into chunks referred to as "memory blocks". The
-size of a memory block is architecture dependent and represents the logical
-unit upon which memory online/offline operations are to be performed. The
-default size of a memory block is the same as memory section size unless an
-architecture specifies otherwise. (see :ref:`memory_hotplug_sysfs_files`.)
+size of a memory block is architecture dependent and corresponds to the smallest
+granularity that can be hot(un)plugged. The default size of a memory block is
+the same as memory section size, unless an architecture specifies otherwise.
 
-To determine the size (in bytes) of a memory block please read this file::
-
-  /sys/devices/system/memory/block_size_bytes
-
-Kernel Configuration
-====================
-
-To use memory hotplug feature, kernel must be compiled with following
-config options.
-
-- For all memory hotplug:
-    - Memory model -> Sparse Memory  (``CONFIG_SPARSEMEM``)
-    - Allow for memory hot-add       (``CONFIG_MEMORY_HOTPLUG``)
-
-- To enable memory removal, the following are also necessary:
-    - Allow for memory hot remove    (``CONFIG_MEMORY_HOTREMOVE``)
-    - Page Migration                 (``CONFIG_MIGRATION``)
-
-- For ACPI memory hotplug, the following are also necessary:
-    - Memory hotplug (under ACPI Support menu) (``CONFIG_ACPI_HOTPLUG_MEMORY``)
-    - This option can be kernel module.
-
-- As a related configuration, if your box has a feature of NUMA-node hotplug
-  via ACPI, then this option is necessary too.
-
-    - ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
-      (``CONFIG_ACPI_CONTAINER``).
-
-     This option can be kernel module too.
-
-
-.. _memory_hotplug_sysfs_files:
-
-sysfs files for memory hotplug
+All memory blocks have the same size.
+
+Phases of Memory Hotplug
+------------------------
+
+Memory hotplug consists of two phases:
+
+(1) Adding the memory to Linux
+(2) Onlining memory blocks
+
+In the first phase, metadata, such as the memory map ("memmap") and page tables
+for the direct mapping, is allocated and initialized, and memory blocks are
+created; the latter also creates sysfs files for managing newly created memory
+blocks.
+
+In the second phase, added memory is exposed to the page allocator. After this
+phase, the memory is visible in memory statistics, such as free and total
+memory, of the system.
+
+Phases of Memory Hotunplug
+--------------------------
+
+Memory hotunplug consists of two phases:
+
+(1) Offlining memory blocks
+(2) Removing the memory from Linux
+
+In the first phase, memory is "hidden" from the page allocator again, for
+example, by migrating busy memory to other memory locations and removing all
+relevant free pages from the page allocator. After this phase, the memory is no
+longer visible in memory statistics of the system.
+
+In the second phase, the memory blocks are removed and metadata is freed.
+
+Memory Hotplug Notifications
+============================
+
+There are various ways how Linux is notified about memory hotplug events such
+that it can start adding hotplugged memory. This description is limited to
+systems that support ACPI; mechanisms specific to other firmware interfaces or
+virtual machines are not described.
+
+ACPI Notifications
+------------------
+
+Platforms that support ACPI, such as x86_64, can support memory hotplug
+notifications via ACPI.
+
+In general, a firmware supporting memory hotplug defines a memory class object
+HID "PNP0C80". When notified about hotplug of a new memory device, the ACPI
+driver will hotplug the memory to Linux.
+
+If the firmware supports hotplug of NUMA nodes, it defines an object _HID
+"ACPI0004", "PNP0A05", or "PNP0A06". When notified about a hotplug event, all
+assigned memory devices are added to Linux by the ACPI driver.
+
+Similarly, Linux can be notified about requests to hotunplug a memory device or
+a NUMA node via ACPI. The ACPI driver will try offlining all relevant memory
+blocks, and, if successful, hotunplug the memory from Linux.
+
+Manual Probing
+--------------
+
+On some architectures, the firmware may not be able to notify the operating
+system about a memory hotplug event. Instead, the memory has to be manually
+probed from user space.
+
+The probe interface is located at::
+
+	/sys/devices/system/memory/probe
+
+Only complete memory blocks can be probed. Individual memory blocks are probed
+by providing the physical start address of the memory block::
+
+	% echo addr > /sys/devices/system/memory/probe
+
+Which results in a memory block for the range [addr, addr + memory_block_size)
+being created.
+
+.. note::
+
+  Using the probe interface is discouraged as it is easy to crash the kernel,
+  because Linux cannot validate user input; this interface might be removed in
+  the future.
+
+Onlining and Offlining Memory Blocks
+====================================
+
+After a memory block has been created, Linux has to be instructed to actually
+make use of that memory: the memory block has to be "online".
+
+Before a memory block can be removed, Linux has to stop using any memory part of
+the memory block: the memory block has to be "offlined".
+
+The Linux kernel can be configured to automatically online added memory blocks.
+Memory blocks can only be removed once offlining succeeded, and drivers may
+trigger offlining of memory blocks when attempting hotunplug of memory.
+
+Onlining Memory Blocks Manually
+-------------------------------
+
+If auto-onlining of memory blocks isn't enabled, user-space has to manually
+trigger onlining of memory blocks. Often, udev rules are used to automate this
+task in user space.
+
+Onlining of a memory block can be triggered via::
+
+	% echo online > /sys/devices/system/memory/memoryXXX/state
+
+Or alternatively::
+
+	% echo 1 > /sys/devices/system/memory/memoryXXX/online
+
+The kernel will select the target zone automatically, usually defaulting to
+``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel
+command line or if the memory block would intersect the ZONE_MOVABLE already.
+
+One can explicitly request to associate an offline memory block with
+ZONE_MOVABLE by::
+
+	% echo online_movable > /sys/devices/system/memory/memoryXXX/state
+
+Or one can explicitly request a kernel zone (usually ZONE_NORMAL) by::
+
+	% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
+
+In any case, if onlining succeeds, the state of the memory block is changed to
+be "online". If it fails, the state of the memory block will remain unchanged
+and the above commands will fail.
+
+Onlining Memory Blocks Automatically
+------------------------------------
+
+The kernel can be configured to try auto-onlining of newly added memory blocks.
+If this feature is disabled, the memory blocks will stay offline until
+explicitly onlined from user space.
+
+The configured auto-online behavior can be observed via::
+
+	% cat /sys/devices/system/memory/auto_online_blocks
+
+Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or
+``online_movable`` to that file, like::
+
+	% echo online > /sys/devices/system/memory/auto_online_blocks
+
+Modifying the auto-online behavior will only affect subsequently added memory
+blocks.
+
+.. note::
+
+  In corner cases, auto-onlining can fail. The kernel won't retry. Note that
+  auto-onlining is not expected to fail in default configurations.
+
+.. note::
+
+  DLPAR on ppc64 ignores the ``offline`` setting and will still online added
+  memory blocks; if onlining fails, memory blocks are removed again.
+
+Offlining Memory Blocks
+-----------------------
+
+In the current implementation, Linux's memory offlining will try migrating all
+movable pages off the affected memory block. As most kernel allocations, such as
+page tables, are unmovable, page migration can fail and, therefore, inhibit
+memory offlining from succeeding.
+
+Having the memory provided by the memory block managed by ZONE_MOVABLE
+significantly increases memory offlining reliability; still, memory offlining
+can fail in some corner cases.
+
+Further, memory offlining might retry for a long time (or even forever), until
+aborted by the user.
+
+Offlining of a memory block can be triggered via::
+
+	% echo offline > /sys/devices/system/memory/memoryXXX/state
+
+Or alternatively::
+
+	% echo 0 > /sys/devices/system/memory/memoryXXX/online
+
+If offlining succeeds, the state of the memory block is changed to be "offline".
+If it fails, the state of the memory block will remain unchanged and the above
+commands will fail, for example, via::
+
+	bash: echo: write error: Device or resource busy
+
+or via::
+
+	bash: echo: write error: Invalid argument
+
+Observing the State of Memory Blocks
+------------------------------------
+
+The state (online/offline/going-offline) of a memory block can be observed
+either via::
+
+	% cat /sys/devices/system/memory/memoryXXX/state
+
+Or alternatively (1/0) via::
+
+	% cat /sys/devices/system/memory/memoryXXX/online
+
+For an online memory block, the managing zone can be observed via::
+
+	% cat /sys/devices/system/memory/memoryXXX/valid_zones
+
+Configuring Memory Hot(Un)Plug
 ==============================
 
-All memory blocks have their device information in sysfs.  Each memory block
-is described under ``/sys/devices/system/memory`` as::
+There are various ways how system administrators can configure memory
+hot(un)plug and interact with memory blocks, especially, to online them.
+
+Memory Hot(Un)Plug Configuration via Sysfs
+------------------------------------------
+
+Some memory hot(un)plug properties can be configured or inspected via sysfs in::
+
+	/sys/devices/system/memory/
+
+The following files are currently defined:
+
+====================== =========================================================
+``auto_online_blocks`` read-write: set or get the default state of new memory
+		       blocks; configure auto-onlining.
+
+		       The default value depends on the
+		       CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel configuration
+		       option.
+
+		       See the ``state`` property of memory blocks for details.
+``block_size_bytes``   read-only: the size in bytes of a memory block.
+``probe``	       write-only: add (probe) selected memory blocks manually
+		       from user space by supplying the physical start address.
+
+		       Availability depends on the CONFIG_ARCH_MEMORY_PROBE
+		       kernel configuration option.
+``uevent``	       read-write: generic udev file for device subsystems.
+====================== =========================================================
+
+.. note::
+
+  When the CONFIG_MEMORY_FAILURE kernel configuration option is enabled, two
+  additional files ``hard_offline_page`` and ``soft_offline_page`` are available
+  to trigger hwpoisoning of pages, for example, for testing purposes. Note that
+  this functionality is not really related to memory hot(un)plug or actual
+  offlining of memory blocks.
+
+Memory Block Configuration via Sysfs
+------------------------------------
+
+Each memory block is represented as a memory block device that can be
+onlined or offlined. All memory blocks have their device information located in
+sysfs. Each present memory block is listed under
+``/sys/devices/system/memory`` as::
 
 	/sys/devices/system/memory/memoryXXX
 
-where XXX is the memory block id.
+where XXX is the memory block id; the number of digits is variable.
 
-For the memory block covered by the sysfs directory.  It is expected that all
-memory sections in this range are present and no memory holes exist in the
-range. Currently there is no way to determine if there is a memory hole, but
-the existence of one should not affect the hotplug capabilities of the memory
-block.
+A present memory block indicates that some memory in the range is present;
+however, a memory block might span memory holes. A memory block spanning memory
+holes cannot be offlined.
 
 For example, assume 1 GiB memory block size. A device for a memory starting at
 0x100000000 is ``/sys/device/system/memory/memory4``::
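As a concrete illustration of the sysfs layout added above, a short sketch
that prints the memory block size and the state of every block; it assumes
sysfs at ``/sys``, and note that ``block_size_bytes`` contains a hexadecimal
value without a ``0x`` prefix::

    #!/bin/sh
    # summarize memory blocks: the block size once, then one line per block
    size_hex=$(cat /sys/devices/system/memory/block_size_bytes)
    echo "memory block size: $((0x$size_hex)) bytes"
    for state in /sys/devices/system/memory/memory*/state; do
        block=$(basename "$(dirname "$state")")
        echo "$block: $(cat "$state")"
    done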
@@ -143,51 +324,57 @@ For example, assume 1GiB memory block size. A device for a memory starting at
 
 This device covers address range [0x100000000 ... 0x140000000)
 
-Under each memory block, you can see 5 files:
-
-- ``/sys/devices/system/memory/memoryXXX/phys_index``
-- ``/sys/devices/system/memory/memoryXXX/phys_device``
-- ``/sys/devices/system/memory/memoryXXX/state``
-- ``/sys/devices/system/memory/memoryXXX/removable``
-- ``/sys/devices/system/memory/memoryXXX/valid_zones``
+The following files are currently defined:
 
 =================== ============================================================
-``phys_index``      read-only and contains memory block id, same as XXX.
-``state``           read-write
-
-                    - at read:  contains online/offline state of memory.
-                    - at write: user can specify "online_kernel",
-
-                    "online_movable", "online", "offline" command
-                    which will be performed on all sections in the block.
+``online``	    read-write: simplified interface to trigger onlining /
+		    offlining and to observe the state of a memory block.
+		    When onlining, the zone is selected automatically.
 ``phys_device``	    read-only: legacy interface only ever used on s390x to
 		    expose the covered storage increment.
+``phys_index``	    read-only: the memory block id (XXX).
 ``removable``	    read-only: legacy interface that indicated whether a memory
-		    block was likely to be offlineable or not.  Newer kernel
-		    versions return "1" if and only if the kernel supports
-		    memory offlining.
-``valid_zones``     read-only: designed to show by which zone memory provided by
-		    a memory block is managed, and to show by which zone memory
-		    provided by an offline memory block could be managed when
-		    onlining.
-
-		    The first column shows it`s default zone.
-
-		    "memory6/valid_zones: Normal Movable" shows this memoryblock
-		    can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
-		    by online_movable.
-
-		    "memory7/valid_zones: Movable Normal" shows this memoryblock
-		    can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
-		    by online_kernel.
+		    block was likely to be offlineable or not. Nowadays, the
+		    kernel returns ``1`` if and only if it supports memory
+		    offlining.
+``state``	    read-write: advanced interface to trigger onlining /
+		    offlining and to observe the state of a memory block.
+
+		    When writing, ``online``, ``offline``, ``online_kernel`` and
+		    ``online_movable`` are supported.
+
+		    ``online_movable`` specifies onlining to ZONE_MOVABLE.
+		    ``online_kernel`` specifies onlining to the default kernel
+		    zone for the memory block, such as ZONE_NORMAL.
+		    ``online`` lets the kernel select the zone automatically.
+
+		    When reading, ``online``, ``offline`` and ``going-offline``
+		    may be returned.
+``uevent``	    read-write: generic uevent file for devices.
+``valid_zones``     read-only: when a block is online, shows the zone it
+		    belongs to; when a block is offline, shows what zone will
+		    manage it when the block is onlined.
+
+		    For online memory blocks, ``DMA``, ``DMA32``, ``Normal``,
+		    ``Movable`` and ``none`` may be returned. ``none`` indicates
+		    that memory provided by a memory block is managed by
+		    multiple zones or spans multiple nodes; such memory blocks
+		    cannot be offlined. ``Movable`` indicates ZONE_MOVABLE.
+		    Other values indicate a kernel zone.
+
+		    For offline memory blocks, the first column shows the
+		    zone the kernel would select when onlining the memory block
+		    right now without further specifying a zone.
+
+		    Availability depends on the CONFIG_MEMORY_HOTREMOVE
+		    kernel configuration option.
 =================== ============================================================
 
 .. note::
 
-  These directories/files appear after physical memory hotplug phase.
+  If the CONFIG_NUMA kernel configuration option is enabled, the memoryXXX/
+  directories can also be accessed via symbolic links located in the
+  ``/sys/devices/system/node/node*`` directories.
 
-If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed
-via symbolic links located in the ``/sys/devices/system/node/node*`` directories.
-
   For example::
 
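Manual onlining via the ``state`` and ``online`` files described above is
often automated in user space.  A sketch of such a loop, onlining every
offline block and preferring ZONE_MOVABLE where possible (root is required;
errors for blocks that cannot be onlined are suppressed)::

    #!/bin/sh
    # online all offline memory blocks, trying ZONE_MOVABLE first
    for state in /sys/devices/system/memory/memory*/state; do
        if [ "$(cat "$state")" = "offline" ]; then
            echo online_movable > "$state" 2>/dev/null ||
                echo online > "$state" 2>/dev/null
        fi
    done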
@@ -197,270 +384,193 @@ A backlink will also be created::
 | 
					
 | 
				
			||||||
	/sys/devices/system/memory/memory9/node0 -> ../../node/node0
 | 
						/sys/devices/system/memory/memory9/node0 -> ../../node/node0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.. _memory_hotplug_physical_mem:
 | 
					Command Line Parameters
 | 
				
			||||||
 | 
					-----------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Physical memory hot-add phase
 | 
					Some command line parameters affect memory hot(un)plug handling. The following
 | 
				
			||||||
=============================
 | 
					command line parameters are relevant:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Hardware(Firmware) Support
 | 
					======================== =======================================================
 | 
				
			||||||
--------------------------
 | 
					``memhp_default_state``	 configure auto-onlining by essentially setting
 | 
				
			||||||
 | 
					                         ``/sys/devices/system/memory/auto_online_blocks``.
 | 
				
			||||||
 | 
					``movablecore``		 configure automatic zone selection of the kernel. When
 | 
				
			||||||
 | 
								 set, the kernel will default to ZONE_MOVABLE, unless
 | 
				
			||||||
 | 
								 other zones can be kept contiguous.
 | 
				
			||||||
 | 
					======================== =======================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Module Parameters
-----------------

Instead of additional command line parameters or sysfs files, the
``memory_hotplug`` subsystem now provides a dedicated namespace for module
parameters. Module parameters can be set via the command line by prefixing
them with ``memory_hotplug.``, such as::

	memory_hotplug.memmap_on_memory=1

and they can be observed (and some even modified at runtime) via::

	/sys/module/memory_hotplug/parameters/

The following module parameters are currently defined:

======================== =======================================================
``memmap_on_memory``     read-write: Allocate memory for the memmap from the
                         added memory block itself. Even if enabled, actual
                         support depends on various other system properties and
                         should only be regarded as a hint whether the behavior
                         would be desired.

                         While allocating the memmap from the memory block
                         itself makes memory hotplug less likely to fail and
                         keeps the memmap on the same NUMA node in any case, it
                         can fragment physical memory in a way that huge pages
                         in bigger granularity cannot be formed on hotplugged
                         memory.
======================== =======================================================
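As a quick sanity check (an illustrative session, not an example shipped with
this document), the parameter can be read back at runtime and, being
read-write, also changed there::

	% cat /sys/module/memory_hotplug/parameters/memmap_on_memory
	% echo Y > /sys/module/memory_hotplug/parameters/memmap_on_memory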
ZONE_MOVABLE
============

ZONE_MOVABLE is an important mechanism for more reliable memory offlining.
Further, having system RAM managed by ZONE_MOVABLE instead of one of the
kernel zones can increase the number of possible transparent huge pages and
dynamically allocated huge pages.

Most kernel allocations are unmovable. Important examples include the memory
map (usually 1/64th of memory), page tables, and kmalloc(). Such allocations
can only be served from the kernel zones.

Most user space pages, such as anonymous memory and page cache pages, are
movable. Such allocations can be served from ZONE_MOVABLE and the kernel zones.

Only movable allocations are served from ZONE_MOVABLE, resulting in unmovable
allocations being limited to the kernel zones. Without ZONE_MOVABLE, there is
absolutely no guarantee whether a memory block can be offlined successfully.

Zone Imbalances
---------------

Having too much system RAM managed by ZONE_MOVABLE is called a zone imbalance,
which can harm the system or degrade performance. As one example, the kernel
might crash because it runs out of free memory for unmovable allocations,
although there is still plenty of free memory left in ZONE_MOVABLE.

Usually, MOVABLE:KERNEL ratios of up to 3:1 or even 4:1 are fine. Ratios of 63:1
are definitely impossible due to the overhead for the memory map.

Actual safe zone ratios depend on the workload. Extreme cases, like excessive
long-term pinning of pages, might not be able to deal with ZONE_MOVABLE at all.
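The current MOVABLE:KERNEL ratio of a running system can be estimated from the
per-zone ``managed`` page counts; an illustrative check (not part of this
document's examples) is::

	% grep -E "Node|managed" /proc/zoneinfo

Summing the managed pages of ZONE_MOVABLE against those of the kernel zones
yields the ratio discussed above.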
.. note::

  CMA memory part of a kernel zone essentially behaves like memory in
  ZONE_MOVABLE and similar considerations apply, especially when combining
  CMA with ZONE_MOVABLE.

ZONE_MOVABLE Sizing Considerations
----------------------------------

We usually expect that a large portion of available system RAM will actually
be consumed by user space, either directly or indirectly via the page cache. In
the normal case, ZONE_MOVABLE can be used when allocating such pages just fine.

With that in mind, it makes sense that we can have a big portion of system RAM
managed by ZONE_MOVABLE. However, there are some things to consider when using
ZONE_MOVABLE, especially when fine-tuning zone ratios:
- Having a lot of offline memory blocks. Even offline memory blocks consume
  memory for metadata and page tables in the direct map; having a lot of offline
  memory blocks is not a typical case, though.

- Memory ballooning without balloon compaction is incompatible with
  ZONE_MOVABLE. Only some implementations, such as virtio-balloon and
  pseries CMM, fully support balloon compaction.

  Further, the CONFIG_BALLOON_COMPACTION kernel configuration option might be
  disabled. In that case, balloon inflation will only perform unmovable
  allocations and silently create a zone imbalance, usually triggered by
  inflation requests from the hypervisor.

- Gigantic pages are unmovable, resulting in user space consuming a
  lot of unmovable memory.

- Huge pages are unmovable when an architecture does not support huge
  page migration, resulting in a similar issue as with gigantic pages.

- Page tables are unmovable. Excessive swapping, mapping extremely large
  files or ZONE_DEVICE memory can be problematic, although only really relevant
  in corner cases. When we manage a lot of user space memory that has been
  swapped out or is served from a file/persistent memory/... we still need a lot
  of page tables to manage that memory once user space accessed that memory.

- In certain DAX configurations the memory map for the device memory will be
  allocated from the kernel zones.

- KASAN can have a significant memory overhead, for example, consuming 1/8th of
  the total system memory size as (unmovable) tracking metadata.

- Long-term pinning of pages. Techniques that rely on long-term pinnings
  (especially, RDMA and vfio/mdev) are fundamentally problematic with
  ZONE_MOVABLE, and therefore, memory offlining. Pinned pages cannot reside
  on ZONE_MOVABLE as that would turn these pages unmovable. Therefore, they
  have to be migrated off that zone while pinning. Pinning a page can fail
  even if there is plenty of free memory in ZONE_MOVABLE.

  In addition, using ZONE_MOVABLE might make page pinning more expensive,
  because of the page migration overhead.

By default, all the memory configured at boot time is managed by the kernel
zones and ZONE_MOVABLE is not used.

To enable ZONE_MOVABLE to include the memory present at boot and to control the
ratio between movable and kernel zones there are two command line options:
``kernelcore=`` and ``movablecore=``. See
Documentation/admin-guide/kernel-parameters.rst for their description; a short
sketch follows below.
Memory Offlining and ZONE_MOVABLE
---------------------------------

Even with ZONE_MOVABLE, there are some corner cases where offlining a memory
block might fail:

- Memory blocks with memory holes; this applies to memory blocks present during
  boot and can apply to memory blocks hotplugged via the XEN balloon and the
  Hyper-V balloon.

- Mixed NUMA nodes and mixed zones within a single memory block prevent memory
  offlining; this applies to memory blocks present during boot only.

- Special memory blocks prevented by the system from getting offlined. Examples
  include any memory available during boot on arm64 or memory blocks spanning
  the crashkernel area on s390x; this usually applies to memory blocks present
  during boot only.

- Memory blocks overlapping with CMA areas cannot be offlined; this applies to
  memory blocks present during boot only.

- Concurrent activity that operates on the same physical memory area, such as
  allocating gigantic pages, can result in temporary offlining failures.

- Out of memory when dissolving huge pages, especially when freeing unused
  vmemmap pages associated with each hugetlb page is enabled.

  Offlining code may be able to migrate huge page contents, but may not be able
  to dissolve the source huge page because it fails allocating (unmovable) pages
  for the vmemmap, because the system might not have free memory in the kernel
  zones left.

  Users that depend on memory offlining to succeed for movable zones should
  carefully consider whether the memory savings gained from this feature are
  worth the risk of possibly not being able to offline memory in certain
  situations.

Further, when running into out of memory situations while migrating pages, or
when still encountering permanently unmovable pages within ZONE_MOVABLE
(-> BUG), memory offlining will keep retrying until it eventually succeeds.

When offlining is triggered from user space, the offlining context can be
terminated by sending a fatal signal. A timeout based offlining can easily be
implemented via::

	% timeout $TIMEOUT offline_block | failure_handling
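Note that ``offline_block`` above is a placeholder for a user-supplied helper,
not a tool shipped with the kernel. A minimal sketch of such a helper, assuming
memory block 9 is the block to be offlined, could be::

	#!/bin/sh
	# Ask the kernel to offline memory block 9; the write only returns
	# once offlining succeeded, or fails with an error code otherwise.
	echo offline > /sys/devices/system/memory/memory9/state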
@ -65,25 +65,27 @@ Error reports

A typical out-of-bounds access looks like this::

    ==================================================================
    BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa6/0x234

    Out-of-bounds read at 0xffff8c3f2e291fff (1B left of kfence-#72):
     test_out_of_bounds_read+0xa6/0x234
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    kfence-#72: 0xffff8c3f2e292000-0xffff8c3f2e29201f, size=32, cache=kmalloc-32

    allocated by task 484 on cpu 0 at 32.919330s:
     test_alloc+0xfe/0x738
     test_out_of_bounds_read+0x9b/0x234
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    CPU: 0 PID: 484 Comm: kunit_try_catch Not tainted 5.13.0-rc3+ #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
    ==================================================================

The header of the report provides a short summary of the function involved in
@ -96,30 +98,32 @@ Use-after-free accesses are reported as::

    ==================================================================
    BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143

    Use-after-free read at 0xffff8c3f2e2a0000 (in kfence-#79):
     test_use_after_free_read+0xb3/0x143
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    kfence-#79: 0xffff8c3f2e2a0000-0xffff8c3f2e2a001f, size=32, cache=kmalloc-32

    allocated by task 488 on cpu 2 at 33.871326s:
     test_alloc+0xfe/0x738
     test_use_after_free_read+0x76/0x143
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    freed by task 488 on cpu 2 at 33.871358s:
     test_use_after_free_read+0xa8/0x143
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    CPU: 2 PID: 488 Comm: kunit_try_catch Tainted: G    B             5.13.0-rc3+ #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
    ==================================================================

KFENCE also reports on invalid frees, such as double-frees::

@ -127,30 +131,32 @@ KFENCE also reports on invalid frees, such as double-frees::

    ==================================================================
    BUG: KFENCE: invalid free in test_double_free+0xdc/0x171

    Invalid free of 0xffff8c3f2e2a4000 (in kfence-#81):
     test_double_free+0xdc/0x171
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    kfence-#81: 0xffff8c3f2e2a4000-0xffff8c3f2e2a401f, size=32, cache=kmalloc-32

    allocated by task 490 on cpu 1 at 34.175321s:
     test_alloc+0xfe/0x738
     test_double_free+0x76/0x171
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    freed by task 490 on cpu 1 at 34.175348s:
     test_double_free+0xa8/0x171
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    CPU: 1 PID: 490 Comm: kunit_try_catch Tainted: G    B             5.13.0-rc3+ #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
    ==================================================================

KFENCE also uses pattern-based redzones on the other side of an object's guard

@ -160,23 +166,25 @@ These are reported on frees::

    ==================================================================
    BUG: KFENCE: memory corruption in test_kmalloc_aligned_oob_write+0xef/0x184

    Corrupted memory at 0xffff8c3f2e33aff9 [ 0xac . . . . . . ] (in kfence-#156):
     test_kmalloc_aligned_oob_write+0xef/0x184
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    kfence-#156: 0xffff8c3f2e33afb0-0xffff8c3f2e33aff8, size=73, cache=kmalloc-96

    allocated by task 502 on cpu 7 at 42.159302s:
     test_alloc+0xfe/0x738
     test_kmalloc_aligned_oob_write+0x57/0x184
     kunit_try_run_case+0x61/0xa0
     kunit_generic_run_threadfn_adapter+0x16/0x30
     kthread+0x176/0x1b0
     ret_from_fork+0x22/0x30

    CPU: 7 PID: 502 Comm: kunit_try_catch Tainted: G    B             5.13.0-rc3+ #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
    ==================================================================

For such errors, the address where the corruption occurred as well as the
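Judging by the stack traces, the sample reports above come from the KFENCE
KUnit test suite. As a hedged reminder, obtaining such reports requires a
kernel built with ``CONFIG_KFENCE=y`` and a non-zero sample interval, for
example on the kernel command line (an illustrative value)::

	kfence.sample_interval=100

The interval is also exposed as a module parameter under
``/sys/module/kfence/parameters/``.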
@ -130,9 +130,10 @@ Getting Help
------------

- `Website <https://clangbuiltlinux.github.io/>`_
- `Mailing List <https://lore.kernel.org/llvm/>`_: <llvm@lists.linux.dev>
- `Old Mailing List Archives <https://groups.google.com/g/clang-built-linux>`_
- `Issue Tracker <https://github.com/ClangBuiltLinux/linux/issues>`_
- IRC: #clangbuiltlinux on irc.libera.chat
- `Telegram <https://t.me/ClangBuiltLinux>`_: @ClangBuiltLinux
- `Wiki <https://github.com/ClangBuiltLinux/linux/wiki>`_
- `Beginner Bugs <https://github.com/ClangBuiltLinux/linux/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22>`_
20  Documentation/vm/damon/api.rst  Normal file
@ -0,0 +1,20 @@
.. SPDX-License-Identifier: GPL-2.0

=============
API Reference
=============

Kernel space programs can use every feature of DAMON via the APIs below.  All
you need to do is include ``damon.h``, which is located in ``include/linux/``
of the source tree.

Structures
==========

.. kernel-doc:: include/linux/damon.h


Functions
=========

.. kernel-doc:: mm/damon/core.c
166  Documentation/vm/damon/design.rst  Normal file
@ -0,0 +1,166 @@
.. SPDX-License-Identifier: GPL-2.0

======
Design
======

Configurable Layers
===================

DAMON provides data access monitoring functionality while keeping the accuracy
and the overhead controllable.  The fundamental access monitoring requires
primitives that depend on, and are optimized for, the target address space.  On
the other hand, the accuracy and overhead tradeoff mechanism, which is the core
of DAMON, is in the pure logic space.  DAMON separates the two parts in
different layers and defines its interface to allow various low level
primitive implementations to be configured with the core logic.

Due to this separated design and the configurable interface, users can extend
DAMON for any address space by configuring the core logic with appropriate low
level primitive implementations.  If an appropriate one is not provided, users
can implement the primitives on their own.

For example, physical memory, virtual memory, swap space, those for specific
processes, NUMA nodes, files, and backing memory devices would be supportable.
Also, if some architectures or devices support special optimized access check
primitives, those will be easily configurable.


Reference Implementations of Address Space Specific Primitives
==============================================================

The low level primitives for the fundamental access monitoring are defined in
two parts:

1. Identification of the monitoring target address range for the address space.
2. Access check of a specific address range in the target space.

DAMON currently provides the implementation of the primitives for only the
virtual address spaces.  The below two subsections describe how they work.


VMA-based Target Address Range Construction
-------------------------------------------

Only small parts of the super-huge virtual address space of a process are
mapped to physical memory and accessed.  Thus, tracking the unmapped address
regions is just wasteful.  However, because DAMON can deal with some level of
noise using the adaptive regions adjustment mechanism, tracking every mapping
is not strictly required, but could even incur a high overhead in some cases.
That said, too huge unmapped areas inside the monitoring target should be
removed so that they do not take up time in the adaptive mechanism.

For this reason, this implementation converts the complex mappings to three
distinct regions that cover every mapped area of the address space.  The two
gaps between the three regions are the two biggest unmapped areas in the given
address space.  The two biggest unmapped areas would be the gap between the
heap and the uppermost mmap()-ed region, and the gap between the lowermost
mmap()-ed region and the stack in most of the cases.  Because these gaps are
exceptionally huge in usual address spaces, excluding these will be sufficient
to make a reasonable trade-off.  Below shows this in detail::

    <heap>
    <BIG UNMAPPED REGION 1>
    <uppermost mmap()-ed region>
    (small mmap()-ed regions and munmap()-ed regions)
    <lowermost mmap()-ed region>
    <BIG UNMAPPED REGION 2>
    <stack>


PTE Accessed-bit Based Access Check
-----------------------------------

The implementation for the virtual address space uses the PTE Accessed-bit for
basic access checks.  It finds the relevant PTE Accessed bit for an address by
walking the page table of the target task.  In this way, the implementation
finds and clears the bit for the next sampling target address and checks
whether the bit is set again after one sampling period.  This could disturb
other kernel subsystems using the Accessed bits, namely Idle page tracking and
the reclaim logic.  To avoid such disturbances, DAMON makes itself mutually
exclusive with Idle page tracking and uses the ``PG_idle`` and ``PG_young``
page flags to solve the conflict with the reclaim logic, as Idle page tracking
does.


Address Space Independent Core Mechanisms
=========================================

The below four sections describe each of the DAMON core mechanisms and the five
monitoring attributes, ``sampling interval``, ``aggregation interval``,
``regions update interval``, ``minimum number of regions``, and ``maximum
number of regions``.


Access Frequency Monitoring
---------------------------

The output of DAMON says which pages are how frequently accessed for a given
duration.  The resolution of the access frequency is controlled by setting the
``sampling interval`` and the ``aggregation interval``.  In detail, DAMON
checks access to each page per ``sampling interval`` and aggregates the
results; in other words, it counts the number of the accesses to each page.
After each ``aggregation interval`` passes, DAMON calls callback functions
that were previously registered by users so that users can read the aggregated
results, and then clears the results.  This can be described by the simple
pseudo-code below::

    while monitoring_on:
        for page in monitoring_target:
            if accessed(page):
                nr_accesses[page] += 1
        if time() % aggregation_interval == 0:
            for callback in user_registered_callbacks:
                callback(monitoring_target, nr_accesses)
            for page in monitoring_target:
                nr_accesses[page] = 0
        sleep(sampling interval)

The monitoring overhead of this mechanism will arbitrarily increase as the
size of the target workload grows.


Region Based Sampling
---------------------

To avoid the unbounded increase of the overhead, DAMON groups adjacent pages
that are assumed to have the same access frequencies into a region.  As long as
the assumption (pages in a region have the same access frequencies) is kept,
only one page in the region needs to be checked.  Thus, for each ``sampling
interval``, DAMON randomly picks one page in each region, waits for one
``sampling interval``, checks whether the page was accessed in the meantime,
and increases the access frequency of the region if so.  Therefore, the
monitoring overhead is controllable by setting the number of regions.  DAMON
allows users to set the minimum and the maximum number of regions for the
trade-off.

This scheme, however, cannot preserve the quality of the output if the
assumption is not guaranteed.


Adaptive Regions Adjustment
---------------------------

Even if the initial monitoring target regions are somehow well constructed to
fulfill the assumption (pages in the same region have similar access
frequencies), the data access pattern can change dynamically.  This will result
in low monitoring quality.  To keep the assumption as much as possible, DAMON
adaptively merges and splits each region based on their access frequency.

For each ``aggregation interval``, it compares the access frequencies of
adjacent regions and merges those if the frequency difference is small.  Then,
after it reports and clears the aggregated access frequency of each region, it
splits each region into two or three regions if the total number of regions
will not exceed the user-specified maximum number of regions after the split.

In this way, DAMON provides its best-effort quality and minimal overhead while
keeping the bounds users set for their trade-off.


Dynamic Target Space Updates Handling
-------------------------------------

The monitoring target address range could change dynamically.  For example,
virtual memory could be dynamically mapped and unmapped.  Physical memory could
be hot-plugged.

As the changes could be quite frequent in some cases, DAMON checks the dynamic
memory mapping changes and applies them to the abstracted target area only
once per a user-specified time interval (``regions update interval``).
51  Documentation/vm/damon/faq.rst  Normal file
					@ -0,0 +1,51 @@
 | 
				
			||||||
 | 
.. SPDX-License-Identifier: GPL-2.0

==========================
Frequently Asked Questions
==========================

Why a new subsystem, instead of extending perf or other user space tools?
==========================================================================

First, because it needs to be as lightweight as possible so that it can be
used online, any unnecessary overhead, such as the kernel/user space context
switching cost, should be avoided.  Second, DAMON aims to be used by other
programs including the kernel.  Therefore, having a dependency on specific
tools like perf is not desirable.  These are the two biggest reasons why DAMON
is implemented in the kernel space.


Can 'idle pages tracking' or 'perf mem' substitute DAMON?
=========================================================

Idle page tracking is a low-level primitive for access checks of the physical
address space.  'perf mem' is similar, though it can use sampling to minimize
the overhead.  On the other hand, DAMON is a higher-level framework for the
monitoring of various address spaces.  It is focused on memory management
optimization and provides sophisticated accuracy/overhead handling mechanisms.
Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of
DAMON's output, but cannot substitute DAMON.


Does DAMON support virtual memory only?
=======================================

No.  The core of DAMON is address space independent.  The address-space
specific low-level primitive parts, including the construction of monitoring
target regions and the actual access checks, can be implemented and configured
on the DAMON core by users.  In this way, DAMON users can monitor any address
space with any access check technique.

Nonetheless, DAMON provides implementations of the address-space-dependent
functions for virtual memory by default, based on VMA tracking and PTE
Accessed bit checks, for reference and convenient use.  Implementations for
the physical memory address space will be provided in the near future.
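To make the pluggable-primitives idea above concrete, here is a hedged
user-space C sketch of address-space specific callbacks configured on a
generic core; the struct and function names are illustrative, not DAMON's
actual API::

	#include <stdio.h>

	/* Address-space specific hooks the generic core calls into. */
	struct monitor_ops {
		void (*init_targets)(void);	/* build target regions */
		int (*check_access)(unsigned long addr);	/* 1 if accessed */
	};

	static void vma_init_targets(void)
	{
		puts("construct regions from VMAs");
	}

	static int pte_check_access(unsigned long addr)
	{
		(void)addr;
		return 0;	/* would test the PTE Accessed bit */
	}

	int main(void)
	{
		/* Plug the virtual-memory implementation into the core. */
		struct monitor_ops ops = {
			.init_targets = vma_init_targets,
			.check_access = pte_check_access,
		};

		ops.init_targets();
		printf("addr 0x1000 accessed: %d\n", ops.check_access(0x1000));
		return 0;
	}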
Can I simply monitor page granularity?
======================================

Yes.  You can do so by setting the ``min_nr_regions`` attribute higher than
the working set size divided by the page size.  Because the size of every
monitoring target region is forced to be ``>=page size``, region splitting
will have no effect.
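For instance, a minimal C sketch of that sizing rule, assuming an
illustrative 1 GiB working set and 4 KiB pages (both values are ours, not
taken from the document)::

	#include <stdio.h>

	int main(void)
	{
		unsigned long working_set = 1UL << 30;	/* 1 GiB, assumed */
		unsigned long page_size = 4096;		/* 4 KiB pages */

		/* One region per page makes further splitting a no-op. */
		printf("min_nr_regions >= %lu\n", working_set / page_size);
		return 0;
	}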
30	Documentation/vm/damon/index.rst	Normal file
@@ -0,0 +1,30 @@
.. SPDX-License-Identifier: GPL-2.0

==========================
DAMON: Data Access MONitor
==========================

DAMON is a data access monitoring framework subsystem for the Linux kernel.
The core mechanisms of DAMON (refer to :doc:`design` for details) make it

 - *accurate* (the monitoring output is useful enough for DRAM-level memory
   management; it might not be appropriate for CPU cache levels, though),
 - *light-weight* (the monitoring overhead is low enough to be applied online),
   and
 - *scalable* (the upper bound of the overhead remains constant regardless of
   the size of target workloads).

Using this framework, therefore, the kernel's memory management mechanisms can
make advanced decisions.  Experimental memory management optimizations that
previously incurred too high a data access monitoring overhead can be
implemented again.  In user space, meanwhile, users who have some special
workloads can write personalized applications for a better understanding and
optimization of their workloads and systems.

.. toctree::
   :maxdepth: 2

   faq
   design
   api
   plans
@@ -32,6 +32,7 @@ descriptions of data structures and algorithms.
    arch_pgtable_helpers
    balance
    cleancache
+   damon/index
    free_page_reporting
    frontswap
    highmem
							
								
								
									
15	MAINTAINERS
@@ -4526,7 +4526,7 @@ F:	.clang-format
 CLANG/LLVM BUILD SUPPORT
 M:	Nathan Chancellor <nathan@kernel.org>
 M:	Nick Desaulniers <ndesaulniers@google.com>
-L:	clang-built-linux@googlegroups.com
+L:	llvm@lists.linux.dev
 S:	Supported
 W:	https://clangbuiltlinux.github.io/
 B:	https://github.com/ClangBuiltLinux/linux/issues

@@ -4542,7 +4542,7 @@ M:	Sami Tolvanen <samitolvanen@google.com>
 M:	Kees Cook <keescook@chromium.org>
 R:	Nathan Chancellor <nathan@kernel.org>
 R:	Nick Desaulniers <ndesaulniers@google.com>
-L:	clang-built-linux@googlegroups.com
+L:	llvm@lists.linux.dev
 S:	Supported
 B:	https://github.com/ClangBuiltLinux/linux/issues
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/clang/features

@@ -5149,6 +5149,17 @@ F:	net/ax25/ax25_out.c
 F:	net/ax25/ax25_timer.c
 F:	net/ax25/sysctl_net_ax25.c
 
+DATA ACCESS MONITOR
+M:	SeongJae Park <sjpark@amazon.de>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	Documentation/admin-guide/mm/damon/
+F:	Documentation/vm/damon/
+F:	include/linux/damon.h
+F:	include/trace/events/damon.h
+F:	mm/damon/
+F:	tools/testing/selftests/damon/
+
 DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
 L:	netdev@vger.kernel.org
 S:	Orphan
@@ -889,7 +889,7 @@ config HAVE_SOFTIRQ_ON_OWN_STACK
 	bool
 	help
 	  Architecture provides a function to run __do_softirq() on a
-	  seperate stack.
+	  separate stack.
 
 config PGTABLE_LEVELS
 	int
@@ -6,8 +6,8 @@
 
 /* dummy for now */
 
-#define map_page_into_agp(page)
-#define unmap_page_from_agp(page)
+#define map_page_into_agp(page)		do { } while (0)
+#define unmap_page_from_agp(page)	do { } while (0)
 #define flush_agp_cache() mb()
 
 /* GATT allocation. Returns/accepts GATT kernel virtual address. */
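The empty-macro fix above uses the standard C idiom for statement-like
macros: with an empty expansion, "if (cond) unmap_page_from_agp(p);"
becomes "if (cond) ;", an empty if-body that compilers warn about (e.g.
-Wempty-body), while "do { } while (0)" keeps the expansion a real
statement.  A small sketch of the idiom (ours, not from the patch):

	#include <stdio.h>

	/* Expands to a single statement even though it does nothing. */
	#define unmap_page_from_agp_model(page)	do { } while (0)

	int main(void)
	{
		int mapped = 1;

		if (mapped)
			unmap_page_from_agp_model(NULL);	/* demands ';' like a call */
		else
			puts("not mapped");

		return 0;
	}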
@@ -60,6 +60,8 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num,
  * @sparse: address space type
  *
  * Use the bus mapping routines to map a PCI resource into userspace.
+ *
+ * Return: %0 on success, negative error code otherwise
  */
 static int pci_mmap_resource(struct kobject *kobj,
 			     struct bin_attribute *attr,

@@ -106,7 +108,7 @@ static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj,
 
 /**
  * pci_remove_resource_files - cleanup resource files
- * @dev: dev to cleanup
+ * @pdev: pci_dev to cleanup
  *
  * If we created resource files for @dev, remove them from sysfs and
  * free their resources.

@@ -221,10 +223,12 @@ static int pci_create_attr(struct pci_dev *pdev, int num)
 }
 
 /**
- * pci_create_resource_files - create resource files in sysfs for @dev
- * @dev: dev in question
+ * pci_create_resource_files - create resource files in sysfs for @pdev
+ * @pdev: pci_dev in question
  *
  * Walk the resources in @dev creating files for each resource available.
+ *
+ * Return: %0 on success, or negative error code
  */
 int pci_create_resource_files(struct pci_dev *pdev)
 {

@@ -296,7 +300,7 @@ int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma,
 
 /**
  * pci_adjust_legacy_attr - adjustment of legacy file attributes
- * @b: bus to create files under
+ * @bus: bus to create files under
  * @mmap_type: I/O port or memory
  *
  * Adjust file name and size for sparse mappings.
@@ -20,11 +20,6 @@
 #include <asm/unaligned.h>
 #include <asm/kprobes.h>
 
-void __init trap_init(void)
-{
-	return;
-}
-
 void die(const char *str, struct pt_regs *regs, unsigned long address)
 {
 	show_kernel_fault_diag(str, regs, address);
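This trap_init() stub deletion (and the matching deletions in the
architecture hunks below) relies on a weak default definition in generic
init code, so only architectures with real work to do need their own
version.  A hedged sketch of the linker mechanism, using plain GCC
attribute syntax rather than the kernel's __weak macro:

	#include <stdio.h>

	/* Generic code: weak, empty default. */
	void __attribute__((weak)) trap_init(void)
	{
	}

	/* An architecture needing real setup would define a non-weak
	 * trap_init(), and the linker would prefer that definition. */
	int main(void)
	{
		trap_init();	/* resolves to the weak stub here */
		puts("default trap_init() used");
		return 0;
	}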
@@ -56,7 +56,6 @@ CONFIG_ATA=y
 CONFIG_SATA_MV=y
 CONFIG_NETDEVICES=y
 CONFIG_MV643XX_ETH=y
-CONFIG_INPUT_POLLDEV=y
 # CONFIG_INPUT_MOUSEDEV is not set
 CONFIG_INPUT_EVDEV=y
 # CONFIG_KEYBOARD_ATKBD is not set

@@ -284,7 +284,6 @@ CONFIG_RT2800USB=m
 CONFIG_MWIFIEX=m
 CONFIG_MWIFIEX_SDIO=m
 CONFIG_INPUT_FF_MEMLESS=m
-CONFIG_INPUT_POLLDEV=y
 CONFIG_INPUT_MATRIXKMAP=y
 CONFIG_INPUT_MOUSEDEV=m
 CONFIG_INPUT_MOUSEDEV_SCREEN_X=640

@@ -781,11 +781,6 @@ void abort(void)
 	panic("Oops failed to kill thread");
 }
 
-void __init trap_init(void)
-{
-	return;
-}
-
 #ifdef CONFIG_KUSER_HELPERS
 static void __init kuser_init(void *vectors)
 {
@@ -1502,8 +1502,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return ret;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-			struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;

@@ -39,10 +39,6 @@ void __init base_trap_init(void)
 {
 }
 
-void __init trap_init(void)
-{
-}
-
 asmlinkage void set_esp0(unsigned long ssp)
 {
 	current->thread.esp0 = ssp;

@@ -28,10 +28,6 @@
 #define TRAP_SYSCALL	1
 #define TRAP_DEBUG	0xdb
 
-void __init trap_init(void)
-{
-}
-
 #ifdef CONFIG_GENERIC_BUG
 /* Maybe should resemble arch/sh/kernel/traps.c ?? */
 int is_valid_bugaddr(unsigned long addr)
@@ -484,8 +484,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return ret;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-			struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;

@@ -116,7 +116,6 @@ CONFIG_8139TOO=y
 CONFIG_R8169=y
 CONFIG_USB_USBNET=m
 CONFIG_USB_NET_CDC_EEM=m
-CONFIG_INPUT_POLLDEV=m
 CONFIG_INPUT_EVDEV=y
 # CONFIG_MOUSE_PS2_ALPS is not set
 # CONFIG_MOUSE_PS2_LOGIPS2PP is not set
@@ -34,7 +34,6 @@ CONFIG_SCSI_CONSTANTS=y
 CONFIG_SCSI_SCAN_ASYNC=y
 # CONFIG_SCSI_LOWLEVEL is not set
 CONFIG_INPUT_LEDS=m
-CONFIG_INPUT_POLLDEV=y
 CONFIG_INPUT_MOUSEDEV=m
 CONFIG_INPUT_EVDEV=y
 CONFIG_INPUT_EVBUG=m

@@ -90,7 +90,6 @@ CONFIG_PPPOE=m
 CONFIG_PPP_ASYNC=m
 CONFIG_ISDN=y
 CONFIG_INPUT=m
-CONFIG_INPUT_POLLDEV=m
 # CONFIG_KEYBOARD_ATKBD is not set
 # CONFIG_INPUT_MOUSE is not set
 CONFIG_INPUT_MISC=y

@@ -96,7 +96,6 @@ CONFIG_PPPOE=m
 CONFIG_PPP_ASYNC=m
 CONFIG_ISDN=y
 CONFIG_INPUT=m
-CONFIG_INPUT_POLLDEV=m
 # CONFIG_KEYBOARD_ATKBD is not set
 # CONFIG_INPUT_MOUSE is not set
 CONFIG_INPUT_MISC=y
@@ -183,11 +183,6 @@ void __pgd_error(const char *file, int line, unsigned long val)
 }
 
 extern char *exception_vector, *exception_vector_end;
-void __init trap_init(void)
-{
-	return;
-}
-
 void __init early_trap_init(void)
 {
 	unsigned long ivb = 0;

@@ -105,11 +105,6 @@ void show_stack(struct task_struct *task, unsigned long *stack,
 	printk("%s\n", loglvl);
 }
 
-void __init trap_init(void)
-{
-	/* Nothing to do here */
-}
-
 /* Breakpoint handler */
 asmlinkage void breakpoint_c(struct pt_regs *fp)
 {

@@ -231,11 +231,6 @@ void unhandled_exception(struct pt_regs *regs, int ea, int vector)
 	die("Oops", regs, 9);
 }
 
-void __init trap_init(void)
-{
-	/* Nothing needs to be done */
-}
-
 asmlinkage void do_trap(struct pt_regs *regs, unsigned long address)
 {
 	force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->pc);
@@ -111,7 +111,6 @@ CONFIG_PPP_BSDCOMP=m
 CONFIG_PPP_DEFLATE=m
 CONFIG_PPPOE=m
 # CONFIG_WLAN is not set
-CONFIG_INPUT_POLLDEV=y
 CONFIG_KEYBOARD_HIL_OLD=m
 CONFIG_KEYBOARD_HIL=m
 CONFIG_MOUSE_SERIAL=y

@@ -859,7 +859,3 @@ void  __init early_trap_init(void)
 
 	initialize_ivt(&fault_vector_20);
 }
-
-void __init trap_init(void)
-{
-}
@@ -2219,11 +2219,6 @@ DEFINE_INTERRUPT_HANDLER(kernel_bad_stack)
 	die("Bad kernel stack pointer", regs, SIGABRT);
 }
 
-void __init trap_init(void)
-{
-}
-
-
 #ifdef CONFIG_PPC_EMULATED_STATS
 
 #define WARN_EMULATED_SETUP(type)	.type = { .name = #type }

@@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
 	return rc;
 }
 
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
-			      struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -286,7 +286,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si
 {
 	unsigned long block_sz, start_pfn;
 	int sections_per_block;
-	int i, nid;
+	int i;
 
 	start_pfn = base >> PAGE_SHIFT;
 

@@ -297,10 +297,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si
 
 	block_sz = pseries_memory_block_size();
 	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
-	nid = memory_add_physaddr_to_nid(base);
 
 	for (i = 0; i < sections_per_block; i++) {
-		__remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+		__remove_memory(base, MIN_MEMORY_BLOCK_SIZE);
 		base += MIN_MEMORY_BLOCK_SIZE;
 	}
 

@@ -387,7 +386,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 
 	block_sz = pseries_memory_block_size();
 
-	__remove_memory(mem_block->nid, lmb->base_addr, block_sz);
+	__remove_memory(lmb->base_addr, block_sz);
 	put_device(&mem_block->dev);
 
 	/* Update memory regions for memory remove */

@@ -660,7 +659,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 
 	rc = dlpar_online_lmb(lmb);
 	if (rc) {
-		__remove_memory(nid, lmb->base_addr, block_sz);
+		__remove_memory(lmb->base_addr, block_sz);
 		invalidate_lmb_associativity_index(lmb);
 	} else {
 		lmb->flags |= DRCONF_MEM_ASSIGNED;
@@ -51,7 +51,7 @@ config RISCV
 	select GENERIC_EARLY_IOREMAP
 	select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO
 	select GENERIC_IDLE_POLL_SETUP
-	select GENERIC_IOREMAP
+	select GENERIC_IOREMAP if MMU
 	select GENERIC_IRQ_MULTI_HANDLER
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL

@@ -199,11 +199,6 @@ int is_valid_bugaddr(unsigned long pc)
 }
 #endif /* CONFIG_GENERIC_BUG */
 
-/* stvec & scratch is already set from head.S */
-void __init trap_init(void)
-{
-}
-
 #ifdef CONFIG_VMAP_STACK
 static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)],
 		overflow_stack)__aligned(16);
@@ -307,8 +307,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return rc;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-			struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;

@@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return ret;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-			struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -311,7 +311,3 @@ void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 {
 	do_IRQ(WINCH_IRQ, regs);
 }
-
-void trap_init(void)
-{
-}
@@ -156,7 +156,6 @@ CONFIG_FORCEDETH=y
 CONFIG_8139TOO=y
 # CONFIG_8139TOO_PIO is not set
 CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
 CONFIG_INPUT_EVDEV=y
 CONFIG_INPUT_JOYSTICK=y
 CONFIG_INPUT_TABLET=y

@@ -148,7 +148,6 @@ CONFIG_SKY2=y
 CONFIG_FORCEDETH=y
 CONFIG_8139TOO=y
 CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
 CONFIG_INPUT_EVDEV=y
 CONFIG_INPUT_JOYSTICK=y
 CONFIG_INPUT_TABLET=y
@@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return __add_pages(nid, start_pfn, nr_pages, params);
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-			struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;

@@ -1255,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 	remove_pagetable(start, end, true, NULL);
 }
 
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
-			      struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -54,6 +54,7 @@ struct acpi_memory_info {
 struct acpi_memory_device {
 	struct acpi_device *device;
 	struct list_head res_list;
+	int mgid;
 };
 
 static acpi_status

@@ -169,12 +170,33 @@ static void acpi_unbind_memory_blocks(struct acpi_memory_info *info)
 static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
 	acpi_handle handle = mem_device->device->handle;
+	mhp_t mhp_flags = MHP_NID_IS_MGID;
 	int result, num_enabled = 0;
 	struct acpi_memory_info *info;
-	mhp_t mhp_flags = MHP_NONE;
-	int node;
+	u64 total_length = 0;
+	int node, mgid;
 
 	node = acpi_get_node(handle);
+
+	list_for_each_entry(info, &mem_device->res_list, list) {
+		if (!info->length)
+			continue;
+		/* We want a single node for the whole memory group */
+		if (node < 0)
+			node = memory_add_physaddr_to_nid(info->start_addr);
+		total_length += info->length;
+	}
+
+	if (!total_length) {
+		dev_err(&mem_device->device->dev, "device is empty\n");
+		return -EINVAL;
+	}
+
+	mgid = memory_group_register_static(node, PFN_UP(total_length));
+	if (mgid < 0)
+		return mgid;
+	mem_device->mgid = mgid;
+
 	/*
 	 * Tell the VM there is more memory here...
 	 * Note: Assume that this function returns zero on success
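The hunk above changes the ACPI driver to register one static memory group
covering all of a device's ranges and then add each range under the
returned group id.  A standalone C model of that flow, with simplified
stand-ins for the kernel APIs (all names and values here are ours):

	#include <stdio.h>

	struct range { unsigned long start, length; };

	static int next_mgid;

	/* Stand-in for memory_group_register_static(); the kernel
	 * allocates the id from an xarray rather than a counter. */
	static int register_static_group(int nid, unsigned long max_pages)
	{
		(void)nid;
		(void)max_pages;
		return next_mgid++;
	}

	int main(void)
	{
		struct range res[] = {
			{ 0x40000000UL, 1UL << 28 },	/* illustrative ranges */
			{ 0x50000000UL, 1UL << 28 },
		};
		unsigned long total = 0;
		int i, mgid;

		for (i = 0; i < 2; i++)
			total += res[i].length;

		/* One group for the whole device, then add each range. */
		mgid = register_static_group(0, total / 4096);
		for (i = 0; i < 2; i++)
			printf("add range %#lx+%#lx under mgid %d\n",
			       res[i].start, res[i].length, mgid);
		return 0;
	}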
@@ -182,22 +204,16 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 	 * (i.e. memory-hot-remove function)
 	 */
 	list_for_each_entry(info, &mem_device->res_list, list) {
-		if (info->enabled) { /* just sanity check...*/
-			num_enabled++;
-			continue;
-		}
 		/*
 		 * If the memory block size is zero, please ignore it.
 		 * Don't try to do the following memory hotplug flowchart.
 		 */
 		if (!info->length)
 			continue;
-		if (node < 0)
-			node = memory_add_physaddr_to_nid(info->start_addr);
 
 		if (mhp_supports_memmap_on_memory(info->length))
 			mhp_flags |= MHP_MEMMAP_ON_MEMORY;
-		result = __add_memory(node, info->start_addr, info->length,
+		result = __add_memory(mgid, info->start_addr, info->length,
 				      mhp_flags);
 
 		/*

@@ -239,19 +255,14 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 
 static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
-	acpi_handle handle = mem_device->device->handle;
 	struct acpi_memory_info *info, *n;
-	int nid = acpi_get_node(handle);
 
 	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 		if (!info->enabled)
 			continue;
 
-		if (nid == NUMA_NO_NODE)
-			nid = memory_add_physaddr_to_nid(info->start_addr);
-
 		acpi_unbind_memory_blocks(info);
-		__remove_memory(nid, info->start_addr, info->length);
+		__remove_memory(info->start_addr, info->length);
 		list_del(&info->list);
 		kfree(info);
 	}

@@ -262,6 +273,10 @@ static void acpi_memory_device_free(struct acpi_memory_device *mem_device)
 	if (!mem_device)
 		return;
 
+	/* In case we succeeded adding *some* memory, unregistering fails. */
+	if (mem_device->mgid >= 0)
+		memory_group_unregister(mem_device->mgid);
+
 	acpi_memory_free_device_resources(mem_device);
 	mem_device->device->driver_data = NULL;
 	kfree(mem_device);

@@ -282,6 +297,7 @@ static int acpi_memory_device_add(struct acpi_device *device,
 
 	INIT_LIST_HEAD(&mem_device->res_list);
 	mem_device->device = device;
+	mem_device->mgid = -1;
 	sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
 	sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
 	device->driver_data = mem_device;
@@ -82,6 +82,12 @@ static struct bus_type memory_subsys = {
  */
 static DEFINE_XARRAY(memory_blocks);
 
+/*
+ * Memory groups, indexed by memory group id (mgid).
+ */
+static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
+#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1
+
 static BLOCKING_NOTIFIER_HEAD(memory_chain);
 
 int register_memory_notifier(struct notifier_block *nb)

@@ -177,7 +183,8 @@ static int memory_block_online(struct memory_block *mem)
 	struct zone *zone;
 	int ret;
 
-	zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);
+	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
+				  start_pfn, nr_pages);
 
 	/*
 	 * Although vmemmap pages have a different lifecycle than the pages

@@ -193,7 +200,7 @@ static int memory_block_online(struct memory_block *mem)
 	}
 
 	ret = online_pages(start_pfn + nr_vmemmap_pages,
-			   nr_pages - nr_vmemmap_pages, zone);
+			   nr_pages - nr_vmemmap_pages, zone, mem->group);
 	if (ret) {
 		if (nr_vmemmap_pages)
 			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

@@ -205,7 +212,8 @@ static int memory_block_online(struct memory_block *mem)
 	 * now already properly populated.
 	 */
 	if (nr_vmemmap_pages)
-		adjust_present_page_count(zone, nr_vmemmap_pages);
+		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
+					  nr_vmemmap_pages);
 
 	return ret;
 }

@@ -215,24 +223,23 @@ static int memory_block_offline(struct memory_block *mem)
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
-	struct zone *zone;
 	int ret;
 
 	/*
 	 * Unaccount before offlining, such that unpopulated zone and kthreads
 	 * can properly be torn down in offline_pages().
 	 */
-	if (nr_vmemmap_pages) {
-		zone = page_zone(pfn_to_page(start_pfn));
-		adjust_present_page_count(zone, -nr_vmemmap_pages);
-	}
+	if (nr_vmemmap_pages)
+		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
+					  -nr_vmemmap_pages);
 
 	ret = offline_pages(start_pfn + nr_vmemmap_pages,
-			    nr_pages - nr_vmemmap_pages);
+			    nr_pages - nr_vmemmap_pages, mem->group);
 	if (ret) {
 		/* offline_pages() failed. Account back. */
 		if (nr_vmemmap_pages)
-			adjust_present_page_count(zone, nr_vmemmap_pages);
+			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
 		return ret;
 	}
 
@@ -374,12 +381,13 @@ static ssize_t phys_device_show(struct device *dev,
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static int print_allowed_zone(char *buf, int len, int nid,
+			      struct memory_group *group,
 			      unsigned long start_pfn, unsigned long nr_pages,
 			      int online_type, struct zone *default_zone)
 {
 	struct zone *zone;
 
-	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
+	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
 	if (zone == default_zone)
 		return 0;
 

@@ -392,9 +400,10 @@ static ssize_t valid_zones_show(struct device *dev,
 	struct memory_block *mem = to_memory_block(dev);
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+	struct memory_group *group = mem->group;
 	struct zone *default_zone;
+	int nid = mem->nid;
 	int len = 0;
-	int nid;
 
 	/*
 	 * Check the existing zone. Make sure that we do that only on the

@@ -413,14 +422,13 @@ static ssize_t valid_zones_show(struct device *dev,
 		goto out;
 	}
 
-	nid = mem->nid;
-	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
-					  nr_pages);
+	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
+					  start_pfn, nr_pages);
 
 	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
-	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
+	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 				  MMOP_ONLINE_KERNEL, default_zone);
-	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
+	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 				  MMOP_ONLINE_MOVABLE, default_zone);
 out:
 	len += sysfs_emit_at(buf, len, "\n");
@@ -634,7 +642,8 @@ int register_memory(struct memory_block *memory)
 }
 
 static int init_memory_block(unsigned long block_id, unsigned long state,
-			     unsigned long nr_vmemmap_pages)
+			     unsigned long nr_vmemmap_pages,
+			     struct memory_group *group)
 {
 	struct memory_block *mem;
 	int ret = 0;

@@ -652,6 +661,12 @@ static int init_memory_block(unsigned long block_id, unsigned long state,
 	mem->state = state;
 	mem->nid = NUMA_NO_NODE;
 	mem->nr_vmemmap_pages = nr_vmemmap_pages;
+	INIT_LIST_HEAD(&mem->group_next);
+
+	if (group) {
+		mem->group = group;
+		list_add(&mem->group_next, &group->memory_blocks);
+	}
 
 	ret = register_memory(mem);
 

@@ -671,7 +686,7 @@ static int add_memory_block(unsigned long base_section_nr)
 	if (section_count == 0)
 		return 0;
 	return init_memory_block(memory_block_id(base_section_nr),
-				 MEM_ONLINE, 0);
+				 MEM_ONLINE, 0,  NULL);
 }
 
 static void unregister_memory(struct memory_block *memory)

@@ -681,6 +696,11 @@ static void unregister_memory(struct memory_block *memory)
 
 	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
 
+	if (memory->group) {
+		list_del(&memory->group_next);
+		memory->group = NULL;
+	}
+
 	/* drop the ref. we got via find_memory_block() */
 	put_device(&memory->dev);
 	device_unregister(&memory->dev);

@@ -694,7 +714,8 @@ static void unregister_memory(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-				unsigned long vmemmap_pages)
+				unsigned long vmemmap_pages,
+				struct memory_group *group)
 {
 	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));

@@ -707,7 +728,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
 		return -EINVAL;
 
 	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
+		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
+					group);
 		if (ret)
 			break;
 	}
@@ -891,3 +913,164 @@ int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
 	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
 				for_each_memory_block_cb);
 }
+
+/*
+ * This is an internal helper to unify allocation and initialization of
+ * memory groups. Note that the passed memory group will be copied to a
+ * dynamically allocated memory group. After this call, the passed
+ * memory group should no longer be used.
+ */
+static int memory_group_register(struct memory_group group)
+{
+	struct memory_group *new_group;
+	uint32_t mgid;
+	int ret;
+
+	if (!node_possible(group.nid))
+		return -EINVAL;
+
+	new_group = kzalloc(sizeof(group), GFP_KERNEL);
+	if (!new_group)
+		return -ENOMEM;
+	*new_group = group;
+	INIT_LIST_HEAD(&new_group->memory_blocks);
+
+	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
+		       GFP_KERNEL);
+	if (ret) {
+		kfree(new_group);
+		return ret;
+	} else if (group.is_dynamic) {
+		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
+	}
+	return mgid;
+}
+
+/**
+ * memory_group_register_static() - Register a static memory group.
+ * @nid: The node id.
+ * @max_pages: The maximum number of pages we'll have in this static memory
+ *	       group.
+ *
+ * Register a new static memory group and return the memory group id.
+ * All memory in the group belongs to a single unit, such as a DIMM. All
+ * memory belonging to a static memory group is added in one go to be removed
+ * in one go -- it's static.
+ *
+ * Returns an error if out of memory, if the node id is invalid, if no new
+ * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
+ * returns the new memory group id.
+ */
+int memory_group_register_static(int nid, unsigned long max_pages)
+{
+	struct memory_group group = {
+		.nid = nid,
+		.s = {
+			.max_pages = max_pages,
+		},
+	};
+
+	if (!max_pages)
+		return -EINVAL;
+	return memory_group_register(group);
+}
+EXPORT_SYMBOL_GPL(memory_group_register_static);
+
+/**
+ * memory_group_register_dynamic() - Register a dynamic memory group.
+ * @nid: The node id.
+ * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
+ *		memory group.
+ *
+ * Register a new dynamic memory group and return the memory group id.
+ * Memory within a dynamic memory group is added/removed dynamically
+ * in unit_pages.
+ *
+ * Returns an error if out of memory, if the node id is invalid, if no new
+ * memory groups can be registered, or if unit_pages is invalid (0, not a
+ * power of two, smaller than a single memory block). Otherwise, returns the
+ * new memory group id.
+ */
+int memory_group_register_dynamic(int nid, unsigned long unit_pages)
+{
+	struct memory_group group = {
+		.nid = nid,
+		.is_dynamic = true,
+		.d = {
+			.unit_pages = unit_pages,
+		},
+	};
+
+	if (!unit_pages || !is_power_of_2(unit_pages) ||
+	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
+		return -EINVAL;
+	return memory_group_register(group);
+}
+EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
+
+/**
+ * memory_group_unregister() - Unregister a memory group.
+ * @mgid: the memory group id
+ *
+ * Unregister a memory group. If any memory block still belongs to this
+ * memory group, unregistering will fail.
+ *
+ * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
+ * memory blocks still belong to this memory group and returns 0 if
+ * unregistering succeeded.
+ */
+int memory_group_unregister(int mgid)
+{
+	struct memory_group *group;
+
+	if (mgid < 0)
+		return -EINVAL;
+
+	group = xa_load(&memory_groups, mgid);
+	if (!group)
+		return -EINVAL;
+	if (!list_empty(&group->memory_blocks))
+		return -EBUSY;
+	xa_erase(&memory_groups, mgid);
+	kfree(group);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(memory_group_unregister);
+
+/*
+ * This is an internal helper only to be used in core memory hotplug code to
+ * lookup a memory group. We don't care about locking, as we don't expect a
+ * memory group to get unregistered while adding memory to it -- because
+ * the group and the memory is managed by the same driver.
+ */
+struct memory_group *memory_group_find_by_id(int mgid)
+{
+	return xa_load(&memory_groups, mgid);
+}
+
+/*
+ * This is an internal helper only to be used in core memory hotplug code to
+ * walk all dynamic memory groups excluding a given memory group, either
+ * belonging to a specific node, or belonging to any node.
+ */
+int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
+			       struct memory_group *excluded, void *arg)
+{
+	struct memory_group *group;
+	unsigned long index;
+	int ret = 0;
+
+	xa_for_each_marked(&memory_groups, index, group,
+			   MEMORY_GROUP_MARK_DYNAMIC) {
+		if (group == excluded)
+			continue;
+#ifdef CONFIG_NUMA
+		if (nid != NUMA_NO_NODE && group->nid != nid)
+			continue;
+#endif /* CONFIG_NUMA */
+		ret = func(group, arg);
+		if (ret)
+			break;
+	}
+	return ret;
+}
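The hunk above is the whole of the new memory-group API. A minimal sketch of how a hotplug driver is meant to consume it; the device and the my_dev_* naming are hypothetical, while the memory_group_*() and add_memory_driver_managed() calls are the ones this series adds or reworks (the dax/kmem hunks further below follow the same shape):

/* Hypothetical driver glue around the new API (not from this patch set). */
static int my_dev_add_memory(int nid, u64 start, u64 size)
{
	int mgid, rc;

	/* One static group for the whole device (e.g. a DIMM). */
	mgid = memory_group_register_static(nid, PHYS_PFN(size));
	if (mgid < 0)
		return mgid;

	/* With MHP_NID_IS_MGID, the first argument is a group id, not a nid. */
	rc = add_memory_driver_managed(mgid, start, size,
				       "System RAM (mydev)", MHP_NID_IS_MGID);
	if (rc)
		memory_group_unregister(mgid);
	return rc;
}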
@@ -785,8 +785,6 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static int __ref get_nid_for_pfn(unsigned long pfn)
 {
-	if (!pfn_valid_within(pfn))
-		return -1;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 	if (system_state < SYSTEM_RUNNING)
 		return early_pfn_to_nid(pfn);
@@ -37,15 +37,16 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
 
 struct dax_kmem_data {
 	const char *res_name;
+	int mgid;
 	struct resource *res[];
 };
 
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 {
 	struct device *dev = &dev_dax->dev;
+	unsigned long total_len = 0;
 	struct dax_kmem_data *data;
-	int rc = -ENOMEM;
-	int i, mapped = 0;
+	int i, rc, mapped = 0;
 	int numa_node;
 
 	/*
@@ -61,16 +62,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 		return -EINVAL;
 	}
 
-	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
-	if (!data->res_name)
-		goto err_res_name;
-
 	for (i = 0; i < dev_dax->nr_range; i++) {
-		struct resource *res;
 		struct range range;
 
 		rc = dax_kmem_range(dev_dax, i, &range);
@@ -79,6 +71,35 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 					i, range.start, range.end);
 			continue;
 		}
+		total_len += range_len(&range);
+	}
+
+	if (!total_len) {
+		dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
+		return -EINVAL;
+	}
+
+	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	rc = -ENOMEM;
+	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
+	if (!data->res_name)
+		goto err_res_name;
+
+	rc = memory_group_register_static(numa_node, total_len);
+	if (rc < 0)
+		goto err_reg_mgid;
+	data->mgid = rc;
+
+	for (i = 0; i < dev_dax->nr_range; i++) {
+		struct resource *res;
+		struct range range;
+
+		rc = dax_kmem_range(dev_dax, i, &range);
+		if (rc)
+			continue;
+
 		/* Region is permanently reserved if hotremove fails. */
 		res = request_mem_region(range.start, range_len(&range), data->res_name);
@@ -108,8 +129,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 		 * Ensure that future kexec'd kernels will not treat
 		 * this as RAM automatically.
 		 */
-		rc = add_memory_driver_managed(numa_node, range.start,
-				range_len(&range), kmem_name, MHP_NONE);
+		rc = add_memory_driver_managed(data->mgid, range.start,
+				range_len(&range), kmem_name, MHP_NID_IS_MGID);
 
 		if (rc) {
 			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
@@ -129,6 +150,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 	return 0;
 
 err_request_mem:
+	memory_group_unregister(data->mgid);
+err_reg_mgid:
 	kfree(data->res_name);
 err_res_name:
 	kfree(data);
@@ -156,8 +179,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 		if (rc)
 			continue;
 
-		rc = remove_memory(dev_dax->target_node, range.start,
-				range_len(&range));
+		rc = remove_memory(range.start, range_len(&range));
 		if (rc == 0) {
 			release_resource(data->res[i]);
 			kfree(data->res[i]);
@@ -172,6 +194,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 	}
 
 	if (success >= dev_dax->nr_range) {
+		memory_group_unregister(data->mgid);
 		kfree(data->res_name);
 		kfree(data);
 		dev_set_drvdata(dev, NULL);
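One detail of the probe above worth calling out: the dax_kmem_data allocation sizes its trailing flexible res[] array with struct_size(). The idiom, isolated with hypothetical names (struct_size() only inspects the pointer's type, so data may be uninitialized at that point):

#include <linux/overflow.h>
#include <linux/slab.h>

struct my_data {
	const char *name;
	int mgid;
	struct resource *res[];		/* one slot per probed range */
};

static struct my_data *my_data_alloc(unsigned int nr_ranges)
{
	struct my_data *data;

	/* struct_size(data, res, nr_ranges) == sizeof(*data) +
	 * nr_ranges * sizeof(data->res[0]), with overflow checking. */
	data = kzalloc(struct_size(data, res, nr_ranges), GFP_KERNEL);
	return data;
}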
@@ -27,6 +27,7 @@
 #include <linux/hrtimer.h>
 #include <linux/of.h>
 #include <linux/pm_qos.h>
+#include <linux/units.h>
 #include "governor.h"
 
 #define CREATE_TRACE_POINTS
@@ -34,7 +35,6 @@
 
 #define IS_SUPPORTED_FLAG(f, name) ((f & DEVFREQ_GOV_FLAG_##name) ? true : false)
 #define IS_SUPPORTED_ATTR(f, name) ((f & DEVFREQ_GOV_ATTR_##name) ? true : false)
-#define HZ_PER_KHZ	1000
 
 static struct class *devfreq_class;
 static struct dentry *devfreq_debugfs;
@@ -17,6 +17,7 @@
 #include <linux/property.h>
 #include <linux/regmap.h>
 #include <linux/reset.h>
+#include <linux/units.h>
 
 /* PVT Common register */
 #define PVT_IP_CONFIG	0x04
@@ -37,7 +38,6 @@
 #define CLK_SYNTH_EN		BIT(24)
 #define CLK_SYS_CYCLES_MAX	514
 #define CLK_SYS_CYCLES_MIN	2
-#define HZ_PER_MHZ		1000000L
 
 #define SDIF_DISABLE	0x04
 
@@ -6,12 +6,11 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/time.h>
+#include <linux/units.h>
 
 #include <linux/hid-sensor-hub.h>
 #include <linux/iio/iio.h>
 
-#define HZ_PER_MHZ	1000000L
-
 static struct {
 	u32 usage_id;
 	int unit; /* 0 for default others from HID sensor spec */
@@ -24,8 +24,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/pm.h>
+#include <linux/units.h>
 
-#define HZ_PER_KHZ 1000
 
 #define AS73211_DRV_NAME "as73211"
 
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/pm_runtime.h>
 #include <linux/regulator/consumer.h>
+#include <linux/units.h>
 #include <media/media-entity.h>
 #include <media/v4l2-async.h>
 #include <media/v4l2-ctrls.h>
@@ -64,7 +65,6 @@
 /* Test pattern control */
 #define OV02A10_REG_TEST_PATTERN			0xb6
 
-#define HZ_PER_MHZ					1000000L
 #define OV02A10_LINK_FREQ_390MHZ			(390 * HZ_PER_MHZ)
 #define OV02A10_ECLK_FREQ				(24 * HZ_PER_MHZ)
 
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/units.h>
 #include <asm/unaligned.h>
 
 #define EBU_CLC			0x000
@@ -102,7 +103,6 @@
 
 #define MAX_CS	2
 
-#define HZ_PER_MHZ	1000000L
 #define USEC_PER_SEC	1000000L
 
 struct ebu_nand_cs {
@@ -15,6 +15,7 @@
 #include <linux/of_platform.h>
 #include <linux/phy/phy.h>
 #include <linux/reset.h>
+#include <linux/units.h>
 
 #define STM32_USBPHYC_PLL	0x0
 #define STM32_USBPHYC_MISC	0x8
@@ -47,7 +48,6 @@
 #define PLL_FVCO_MHZ		2880
 #define PLL_INFF_MIN_RATE_HZ	19200000
 #define PLL_INFF_MAX_RATE_HZ	38400000
-#define HZ_PER_MHZ		1000000L
 
 struct pll_params {
 	u8 ndiv;
@@ -18,10 +18,10 @@
 #include <linux/pm_opp.h>
 #include <linux/pm_qos.h>
 #include <linux/thermal.h>
+#include <linux/units.h>
 
 #include <trace/events/thermal.h>
 
-#define HZ_PER_KHZ		1000
 #define SCALE_ERROR_MITIGATION	100
 
 /**
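The eight hunks above are one mechanical cleanup: each driver's private HZ_PER_KHZ/HZ_PER_MHZ copy is dropped in favour of shared constants that this series consolidates in <linux/units.h>. Approximately (the authoritative spelling lives in that header):

/* As consolidated in include/linux/units.h (approximate): */
#define HZ_PER_KHZ	1000UL
#define HZ_PER_MHZ	1000000UL

/* so e.g. the ov02a10 definitions above keep working unchanged: */
#define OV02A10_LINK_FREQ_390MHZ	(390 * HZ_PER_MHZ)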
@@ -143,6 +143,8 @@ struct virtio_mem {
 	 * add_memory_driver_managed().
 	 */
 	const char *resource_name;
+	/* Memory group identification. */
+	int mgid;
 
 	/*
 	 * We don't want to add too much memory if it's not getting onlined,
@@ -626,8 +628,8 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
 		addr + size - 1);
 	/* Memory might get onlined immediately. */
 	atomic64_add(size, &vm->offline_size);
-	rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
-				       MHP_MERGE_RESOURCE);
+	rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
+				       MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
 	if (rc) {
 		atomic64_sub(size, &vm->offline_size);
 		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
@@ -677,7 +679,7 @@ static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
 
 	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
 		addr + size - 1);
-	rc = remove_memory(vm->nid, addr, size);
+	rc = remove_memory(addr, size);
 	if (!rc) {
 		atomic64_sub(size, &vm->offline_size);
 		/*
@@ -720,7 +722,7 @@ static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
 		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
 		addr + size - 1);
 
-	rc = offline_and_remove_memory(vm->nid, addr, size);
+	rc = offline_and_remove_memory(addr, size);
 	if (!rc) {
 		atomic64_sub(size, &vm->offline_size);
 		/*
@@ -2569,6 +2571,7 @@ static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
 static int virtio_mem_probe(struct virtio_device *vdev)
 {
 	struct virtio_mem *vm;
+	uint64_t unit_pages;
 	int rc;
 
 	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
@@ -2603,6 +2606,16 @@ static int virtio_mem_probe(struct virtio_device *vdev)
 	if (rc)
 		goto out_del_vq;
 
+	/* use a single dynamic memory group to cover the whole memory device */
+	if (vm->in_sbm)
+		unit_pages = PHYS_PFN(memory_block_size_bytes());
+	else
+		unit_pages = PHYS_PFN(vm->bbm.bb_size);
+	rc = memory_group_register_dynamic(vm->nid, unit_pages);
+	if (rc < 0)
+		goto out_del_resource;
+	vm->mgid = rc;
+
 	/*
 	 * If we still have memory plugged, we have to unplug all memory first.
 	 * Registering our parent resource makes sure that this memory isn't
@@ -2617,7 +2630,7 @@ static int virtio_mem_probe(struct virtio_device *vdev)
 	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
 	rc = register_memory_notifier(&vm->memory_notifier);
 	if (rc)
-		goto out_del_resource;
+		goto out_unreg_group;
 	rc = register_virtio_mem_device(vm);
 	if (rc)
 		goto out_unreg_mem;
@@ -2631,6 +2644,8 @@ static int virtio_mem_probe(struct virtio_device *vdev)
 	return 0;
 out_unreg_mem:
 	unregister_memory_notifier(&vm->memory_notifier);
+out_unreg_group:
+	memory_group_unregister(vm->mgid);
 out_del_resource:
 	virtio_mem_delete_resource(vm);
 out_del_vq:
@@ -2695,6 +2710,7 @@ static void virtio_mem_remove(struct virtio_device *vdev)
 	} else {
 		virtio_mem_delete_resource(vm);
 		kfree_const(vm->resource_name);
+		memory_group_unregister(vm->mgid);
 	}
 
 	/* remove all tracking data - no locking needed */
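Where dax/kmem registers a static group, virtio-mem above registers one dynamic group per device, with the hot(un)plug granularity as the unit. The probe-side flow, condensed from the hunk (names as in the driver):

/* Sub-block mode plugs memory-block-sized units, big-block mode larger ones. */
if (vm->in_sbm)
	unit_pages = PHYS_PFN(memory_block_size_bytes());
else
	unit_pages = PHYS_PFN(vm->bbm.bb_size);

rc = memory_group_register_dynamic(vm->nid, unit_pages);
if (rc < 0)
	goto out_del_resource;
vm->mgid = rc;	/* later passed in place of a nid, with MHP_NID_IS_MGID */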
@@ -782,10 +782,17 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		 * filesystem.
 		 */
 		mnt_userns = file_mnt_user_ns(cprm.file);
-		if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid()))
+		if (!uid_eq(i_uid_into_mnt(mnt_userns, inode),
+			    current_fsuid())) {
+			pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
+					    cn.corename);
 			goto close_fail;
-		if ((inode->i_mode & 0677) != 0600)
+		}
+		if ((inode->i_mode & 0677) != 0600) {
+			pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n",
+					    cn.corename);
 			goto close_fail;
+		}
 		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
 			goto close_fail;
 		if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
@@ -1127,8 +1134,10 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
 
 	mmap_write_unlock(mm);
 
-	if (WARN_ON(i != *vma_count))
+	if (WARN_ON(i != *vma_count)) {
+		kvfree(*vma_meta);
 		return -EFAULT;
+	}
 
 	*vma_data_size_ptr = vma_data_size;
 	return 0;
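The mode test in the first hunk is easy to misread: masking with 0677 ignores only the owner-execute bit, so only 0600 and 0700 core files pass. A self-contained userspace check of that logic (plain C, not kernel code):

#include <stdio.h>

/* Mirrors do_coredump()'s test: (mode & 0677) == 0600. */
static int coredump_mode_ok(unsigned int mode)
{
	return (mode & 0677) == 0600;
}

int main(void)
{
	printf("0600 -> %d\n", coredump_mode_ok(0600));	/* 1: rw, owner only */
	printf("0700 -> %d\n", coredump_mode_ok(0700));	/* 1: x bit ignored */
	printf("0644 -> %d\n", coredump_mode_ok(0644));	/* 0: group/other read */
	return 0;
}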
@@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	 */
 	call_rcu(&epi->rcu, epi_rcu_free);
 
-	atomic_long_dec(&ep->user->epoll_watches);
+	percpu_counter_dec(&ep->user->epoll_watches);
 
 	return 0;
 }
@@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 {
 	int error, pwake = 0;
 	__poll_t revents;
-	long user_watches;
 	struct epitem *epi;
 	struct ep_pqueue epq;
 	struct eventpoll *tep = NULL;
@@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 
 	lockdep_assert_irqs_enabled();
 
-	user_watches = atomic_long_read(&ep->user->epoll_watches);
-	if (unlikely(user_watches >= max_user_watches))
+	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
+					    max_user_watches) >= 0))
 		return -ENOSPC;
-	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
+	percpu_counter_inc(&ep->user->epoll_watches);
+
+	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+		percpu_counter_dec(&ep->user->epoll_watches);
 		return -ENOMEM;
+	}
 
 	/* Item initialization follow here ... */
 	INIT_LIST_HEAD(&epi->rdllink);
@@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		mutex_lock_nested(&tep->mtx, 1);
 	/* Add the current item to the list of active epoll hook for this file */
 	if (unlikely(attach_epitem(tfile, epi) < 0)) {
-		kmem_cache_free(epi_cache, epi);
 		if (tep)
 			mutex_unlock(&tep->mtx);
+		kmem_cache_free(epi_cache, epi);
+		percpu_counter_dec(&ep->user->epoll_watches);
 		return -ENOMEM;
 	}
 
 	if (full_check && !tep)
 		list_file(tfile);
 
-	atomic_long_inc(&ep->user->epoll_watches);
-
 	/*
 	 * Add the current item to the RB tree. All RB tree operations are
 	 * protected by "mtx", and ep_insert() is called with "mtx" held.
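The epoll hunks swap a globally shared atomic_long_t for a percpu_counter, keeping the hot increment/decrement CPU-local; percpu_counter_compare() only folds the per-CPU deltas into a precise sum when the approximate value is too close to the limit to decide. The resulting pattern, isolated from the hunk (every failure path must undo the speculative charge):

if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
				    max_user_watches) >= 0))
	return -ENOSPC;
percpu_counter_inc(&ep->user->epoll_watches);	/* charge first */

epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL);
if (!epi) {
	percpu_counter_dec(&ep->user->epoll_watches);	/* roll back */
	return -ENOMEM;
}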
@@ -51,11 +51,9 @@ static const struct sysfs_ops nilfs_##name##_attr_ops = { \
 #define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
 static void nilfs_##name##_attr_release(struct kobject *kobj) \
 { \
-	struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
-	struct the_nilfs *nilfs = container_of(kobj->parent, \
-						struct the_nilfs, \
-						ns_##parent_name##_kobj); \
-	subgroups = nilfs->ns_##parent_name##_subgroups; \
+	struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \
+						struct nilfs_sysfs_##parent_name##_subgroups, \
+						sg_##name##_kobj); \
 	complete(&subgroups->sg_##name##_kobj_unregister); \
 } \
 static struct kobj_type nilfs_##name##_ktype = { \
@@ -81,12 +79,12 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
 	err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
 				    #name); \
 	if (err) \
-		return err; \
-	return 0; \
+		kobject_put(kobj); \
+	return err; \
 } \
 static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
 { \
-	kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
+	kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
 }
 
 /************************************************************************
@@ -197,14 +195,14 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
 	}
 
 	if (err)
-		return err;
+		kobject_put(&root->snapshot_kobj);
 
-	return 0;
+	return err;
 }
 
 void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
 {
-	kobject_del(&root->snapshot_kobj);
+	kobject_put(&root->snapshot_kobj);
 }
 
 /************************************************************************
@@ -986,7 +984,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
 	err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
 				    "%s", sb->s_id);
 	if (err)
-		goto free_dev_subgroups;
+		goto cleanup_dev_kobject;
 
 	err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
 	if (err)
@@ -1023,9 +1021,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
 	nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
 
 cleanup_dev_kobject:
-	kobject_del(&nilfs->ns_dev_kobj);
-
-free_dev_subgroups:
+	kobject_put(&nilfs->ns_dev_kobj);
 	kfree(nilfs->ns_dev_subgroups);
 
 failed_create_device_group:
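All of the nilfs2 sysfs fixes above reduce to one kobject rule: once kobject_init_and_add() has run, the kobject owns a reference even when the call returns an error, so both the error path and normal teardown must go through kobject_put() -- which ends in the ktype's ->release() -- never a bare kfree() or a lone kobject_del(). The generic shape (my_ktype and the group name are hypothetical):

err = kobject_init_and_add(kobj, &my_ktype, parent, "my_group");
if (err)
	kobject_put(kobj);	/* freed via my_ktype.release, not kfree() */
return err;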
@@ -792,14 +792,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 
 void nilfs_put_root(struct nilfs_root *root)
 {
-	if (refcount_dec_and_test(&root->count)) {
-		struct the_nilfs *nilfs = root->nilfs;
-
-		nilfs_sysfs_delete_snapshot_group(root);
-
-		spin_lock(&nilfs->ns_cptree_lock);
+	struct the_nilfs *nilfs = root->nilfs;
+
+	if (refcount_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) {
 		rb_erase(&root->rb_node, &nilfs->ns_cptree);
 		spin_unlock(&nilfs->ns_cptree_lock);
 
+		nilfs_sysfs_delete_snapshot_group(root);
 		iput(root->ifile);
 
 		kfree(root);
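The use-after-free fixed above comes from dropping the last reference before taking the tree lock, leaving a window in which a concurrent lookup can revive the dying root. refcount_dec_and_lock() closes it by acquiring the lock atomically with the 1->0 transition. The pattern in isolation (obj/tree names hypothetical):

if (refcount_dec_and_lock(&obj->count, &tree_lock)) {
	/* Lock held: no lookup can take a new reference anymore. */
	rb_erase(&obj->node, &tree_root);
	spin_unlock(&tree_lock);
	kfree(obj);
}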
@@ -98,27 +98,17 @@
 
 void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
 {
-	char *buf;
-	size_t size;
 	char tcomm[64];
-	int ret;
 
 	if (p->flags & PF_WQ_WORKER)
 		wq_worker_comm(tcomm, sizeof(tcomm), p);
 	else
 		__get_task_comm(tcomm, sizeof(tcomm), p);
 
-	size = seq_get_buf(m, &buf);
-	if (escape) {
-		ret = string_escape_str(tcomm, buf, size,
-				ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
-		if (ret >= size)
-			ret = -1;
-	} else {
-		ret = strscpy(buf, tcomm, size);
-	}
-
-	seq_commit(m, ret);
+	if (escape)
+		seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+	else
+		seq_printf(m, "%.64s", tcomm);
 }
 
 /*
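The rewrite above works because seq_escape_str() escapes straight into the seq_file and lets the seq_file machinery track overflow, removing the manual seq_get_buf()/seq_commit() dance. Its definition is, to my understanding, approximately this thin wrapper:

/* Approximate shape of the helper in <linux/seq_file.h>: */
static inline void seq_escape_str(struct seq_file *m, const char *src,
				  unsigned int flags, const char *esc)
{
	seq_escape_mem(m, src, strlen(src), flags, esc);
}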
@@ -95,6 +95,7 @@
 #include <linux/posix-timers.h>
 #include <linux/time_namespace.h>
 #include <linux/resctrl.h>
+#include <linux/cn_proc.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -1674,8 +1675,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
 	if (!p)
 		return -ESRCH;
 
-	if (same_thread_group(current, p))
+	if (same_thread_group(current, p)) {
 		set_task_comm(p, buffer);
+		proc_comm_connector(p);
+	}
 	else
 		count = -EINVAL;
 
@@ -19,12 +19,6 @@ extern void *early_memremap_prot(resource_size_t phys_addr,
 extern void early_iounmap(void __iomem *addr, unsigned long size);
 extern void early_memunmap(void *addr, unsigned long size);
 
-/*
- * Weak function called by early_ioremap_reset(). It does nothing, but
- * architectures may provide their own version to do any needed cleanups.
- */
-extern void early_ioremap_shutdown(void);
-
 #if defined(CONFIG_GENERIC_EARLY_IOREMAP) && defined(CONFIG_MMU)
 /* Arch-specific initialization */
 extern void early_ioremap_init(void);
268	include/linux/damon.h	Normal file
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DAMON api
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#ifndef _DAMON_H_
+#define _DAMON_H_
+
+#include <linux/mutex.h>
+#include <linux/time64.h>
+#include <linux/types.h>
+
+/* Minimal region size.  Every damon_region is aligned by this. */
+#define DAMON_MIN_REGION	PAGE_SIZE
+
+/**
+ * struct damon_addr_range - Represents an address region of [@start, @end).
+ * @start:	Start address of the region (inclusive).
+ * @end:	End address of the region (exclusive).
+ */
+struct damon_addr_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+/**
+ * struct damon_region - Represents a monitoring target region.
+ * @ar:			The address range of the region.
+ * @sampling_addr:	Address of the sample for the next access check.
+ * @nr_accesses:	Access frequency of this region.
+ * @list:		List head for siblings.
+ */
+struct damon_region {
+	struct damon_addr_range ar;
+	unsigned long sampling_addr;
+	unsigned int nr_accesses;
+	struct list_head list;
+};
+
+/**
+ * struct damon_target - Represents a monitoring target.
+ * @id:			Unique identifier for this target.
+ * @nr_regions:		Number of monitoring target regions of this target.
+ * @regions_list:	Head of the monitoring target regions of this target.
+ * @list:		List head for siblings.
+ *
+ * Each monitoring context could have multiple targets.  For example, a context
+ * for virtual memory address spaces could have multiple target processes.  The
+ * @id of each target should be unique among the targets of the context.  For
+ * example, in the virtual address monitoring context, it could be a pidfd or
+ * an address of an mm_struct.
+ */
+struct damon_target {
+	unsigned long id;
+	unsigned int nr_regions;
+	struct list_head regions_list;
+	struct list_head list;
+};
+
+struct damon_ctx;
+
+/**
+ * struct damon_primitive	Monitoring primitives for given use cases.
+ *
+ * @init:			Initialize primitive-internal data structures.
+ * @update:			Update primitive-internal data structures.
+ * @prepare_access_checks:	Prepare next access check of target regions.
+ * @check_accesses:		Check the accesses to target regions.
+ * @reset_aggregated:		Reset aggregated accesses monitoring results.
+ * @target_valid:		Determine if the target is valid.
+ * @cleanup:			Clean up the context.
+ *
+ * DAMON can be extended for various address spaces and usages.  For this,
+ * users should register the low level primitives for their target address
+ * space and usecase via the &damon_ctx.primitive.  Then, the monitoring thread
+ * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting
+ * the monitoring, @update after each &damon_ctx.primitive_update_interval, and
+ * @check_accesses, @target_valid and @prepare_access_checks after each
+ * &damon_ctx.sample_interval.  Finally, @reset_aggregated is called after each
+ * &damon_ctx.aggr_interval.
+ *
+ * @init should initialize primitive-internal data structures.  For example,
+ * this could be used to construct proper monitoring target regions and link
+ * those to @damon_ctx.adaptive_targets.
+ * @update should update the primitive-internal data structures.  For example,
+ * this could be used to update monitoring target regions for current status.
+ * @prepare_access_checks should manipulate the monitoring regions to be
+ * prepared for the next access check.
+ * @check_accesses should check the accesses to each region that made after the
+ * last preparation and update the number of observed accesses of each region.
+ * It should also return max number of observed accesses that made as a result
+ * of its update.  The value will be used for regions adjustment threshold.
+ * @reset_aggregated should reset the access monitoring results that aggregated
+ * by @check_accesses.
+ * @target_valid should check whether the target is still valid for the
+ * monitoring.
+ * @cleanup is called from @kdamond just before its termination.
+ */
+struct damon_primitive {
+	void (*init)(struct damon_ctx *context);
+	void (*update)(struct damon_ctx *context);
+	void (*prepare_access_checks)(struct damon_ctx *context);
+	unsigned int (*check_accesses)(struct damon_ctx *context);
+	void (*reset_aggregated)(struct damon_ctx *context);
+	bool (*target_valid)(void *target);
+	void (*cleanup)(struct damon_ctx *context);
+};
+
+/*
+ * struct damon_callback	Monitoring events notification callbacks.
+ *
+ * @before_start:	Called before starting the monitoring.
+ * @after_sampling:	Called after each sampling.
+ * @after_aggregation:	Called after each aggregation.
+ * @before_terminate:	Called before terminating the monitoring.
+ * @private:		User private data.
+ *
+ * The monitoring thread (&damon_ctx.kdamond) calls @before_start and
+ * @before_terminate just before starting and finishing the monitoring,
+ * respectively.  Therefore, those are good places for installing and cleaning
+ * @private.
+ *
+ * The monitoring thread calls @after_sampling and @after_aggregation for each
+ * of the sampling intervals and aggregation intervals, respectively.
+ * Therefore, users can safely access the monitoring results without additional
+ * protection.  For the reason, users are recommended to use these callback for
+ * the accesses to the results.
+ *
+ * If any callback returns non-zero, monitoring stops.
+ */
+struct damon_callback {
+	void *private;
+
+	int (*before_start)(struct damon_ctx *context);
+	int (*after_sampling)(struct damon_ctx *context);
+	int (*after_aggregation)(struct damon_ctx *context);
+	int (*before_terminate)(struct damon_ctx *context);
+};
+
+/**
+ * struct damon_ctx - Represents a context for each monitoring.  This is the
+ * main interface that allows users to set the attributes and get the results
+ * of the monitoring.
+ *
+ * @sample_interval:		The time between access samplings.
+ * @aggr_interval:		The time between monitor results aggregations.
+ * @primitive_update_interval:	The time between monitoring primitive updates.
+ *
+ * For each @sample_interval, DAMON checks whether each region is accessed or
+ * not.  It aggregates and keeps the access information (number of accesses to
+ * each region) for @aggr_interval time.  DAMON also checks whether the target
+ * memory regions need update (e.g., by ``mmap()`` calls from the application,
+ * in case of virtual memory monitoring) and applies the changes for each
+ * @primitive_update_interval.  All time intervals are in micro-seconds.
+ * Please refer to &struct damon_primitive and &struct damon_callback for more
+ * detail.
+ *
+ * @kdamond:		Kernel thread who does the monitoring.
+ * @kdamond_stop:	Notifies whether kdamond should stop.
+ * @kdamond_lock:	Mutex for the synchronizations with @kdamond.
+ *
+ * For each monitoring context, one kernel thread for the monitoring is
+ * created.  The pointer to the thread is stored in @kdamond.
+ *
+ * Once started, the monitoring thread runs until explicitly required to be
+ * terminated or every monitoring target is invalid.  The validity of the
+ * targets is checked via the &damon_primitive.target_valid of @primitive.  The
+ * termination can also be explicitly requested by writing non-zero to
+ * @kdamond_stop.  The thread sets @kdamond to NULL when it terminates.
+ * Therefore, users can know whether the monitoring is ongoing or terminated by
+ * reading @kdamond.  Reads and writes to @kdamond and @kdamond_stop from
+ * outside of the monitoring thread must be protected by @kdamond_lock.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Note that the monitoring thread protects only @kdamond and @kdamond_stop via
 | 
				
			||||||
 | 
					 * @kdamond_lock.  Accesses to other fields must be protected by themselves.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * @primitive:	Set of monitoring primitives for given use cases.
 | 
				
			||||||
 | 
					 * @callback:	Set of callbacks for monitoring events notifications.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * @min_nr_regions:	The minimum number of adaptive monitoring regions.
 | 
				
			||||||
 | 
					 * @max_nr_regions:	The maximum number of adaptive monitoring regions.
 | 
				
			||||||
 | 
					 * @adaptive_targets:	Head of monitoring targets (&damon_target) list.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					struct damon_ctx {
 | 
				
			||||||
 | 
						unsigned long sample_interval;
 | 
				
			||||||
 | 
						unsigned long aggr_interval;
 | 
				
			||||||
 | 
						unsigned long primitive_update_interval;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* private: internal use only */
 | 
				
			||||||
 | 
						struct timespec64 last_aggregation;
 | 
				
			||||||
 | 
						struct timespec64 last_primitive_update;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* public: */
 | 
				
			||||||
 | 
						struct task_struct *kdamond;
 | 
				
			||||||
 | 
						bool kdamond_stop;
 | 
				
			||||||
 | 
						struct mutex kdamond_lock;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						struct damon_primitive primitive;
 | 
				
			||||||
 | 
						struct damon_callback callback;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						unsigned long min_nr_regions;
 | 
				
			||||||
 | 
						unsigned long max_nr_regions;
 | 
				
			||||||
 | 
						struct list_head adaptive_targets;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define damon_next_region(r) \
 | 
				
			||||||
 | 
						(container_of(r->list.next, struct damon_region, list))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define damon_prev_region(r) \
 | 
				
			||||||
 | 
						(container_of(r->list.prev, struct damon_region, list))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define damon_for_each_region(r, t) \
 | 
				
			||||||
 | 
						list_for_each_entry(r, &t->regions_list, list)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define damon_for_each_region_safe(r, next, t) \
 | 
				
			||||||
 | 
						list_for_each_entry_safe(r, next, &t->regions_list, list)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define damon_for_each_target(t, ctx) \
 | 
				
			||||||
 | 
						list_for_each_entry(t, &(ctx)->adaptive_targets, list)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define damon_for_each_target_safe(t, next, ctx)	\
 | 
				
			||||||
 | 
						list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CONFIG_DAMON
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct damon_region *damon_new_region(unsigned long start, unsigned long end);
 | 
				
			||||||
 | 
					inline void damon_insert_region(struct damon_region *r,
 | 
				
			||||||
 | 
							struct damon_region *prev, struct damon_region *next,
 | 
				
			||||||
 | 
							struct damon_target *t);
 | 
				
			||||||
 | 
					void damon_add_region(struct damon_region *r, struct damon_target *t);
 | 
				
			||||||
 | 
					void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct damon_target *damon_new_target(unsigned long id);
 | 
				
			||||||
 | 
					void damon_add_target(struct damon_ctx *ctx, struct damon_target *t);
 | 
				
			||||||
 | 
					void damon_free_target(struct damon_target *t);
 | 
				
			||||||
 | 
					void damon_destroy_target(struct damon_target *t);
 | 
				
			||||||
 | 
					unsigned int damon_nr_regions(struct damon_target *t);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct damon_ctx *damon_new_ctx(void);
 | 
				
			||||||
 | 
					void damon_destroy_ctx(struct damon_ctx *ctx);
 | 
				
			||||||
 | 
					int damon_set_targets(struct damon_ctx *ctx,
 | 
				
			||||||
 | 
							unsigned long *ids, ssize_t nr_ids);
 | 
				
			||||||
 | 
					int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
 | 
				
			||||||
 | 
							unsigned long aggr_int, unsigned long primitive_upd_int,
 | 
				
			||||||
 | 
							unsigned long min_nr_reg, unsigned long max_nr_reg);
 | 
				
			||||||
 | 
					int damon_nr_running_ctxs(void);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int damon_start(struct damon_ctx **ctxs, int nr_ctxs);
 | 
				
			||||||
 | 
					int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif	/* CONFIG_DAMON */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CONFIG_DAMON_VADDR
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Monitoring primitives for virtual memory address spaces */
 | 
				
			||||||
 | 
					void damon_va_init(struct damon_ctx *ctx);
 | 
				
			||||||
 | 
					void damon_va_update(struct damon_ctx *ctx);
 | 
				
			||||||
 | 
					void damon_va_prepare_access_checks(struct damon_ctx *ctx);
 | 
				
			||||||
 | 
					unsigned int damon_va_check_accesses(struct damon_ctx *ctx);
 | 
				
			||||||
 | 
					bool damon_va_target_valid(void *t);
 | 
				
			||||||
 | 
					void damon_va_cleanup(struct damon_ctx *ctx);
 | 
				
			||||||
 | 
					void damon_va_set_primitives(struct damon_ctx *ctx);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif	/* CONFIG_DAMON_VADDR */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif	/* _DAMON_H */
 | 
				
			||||||
| 
						 | 
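The declarations above are enough to sketch how an in-kernel user could drive DAMON. The following is a minimal, illustrative sketch only: the interval values, the my_* function names, and the error handling are made up, and the region field names (ar.start, ar.end, nr_accesses) are taken from the damon_aggregated tracepoint added later in this commit.

#include <linux/damon.h>

/* Safe place to read results: kdamond calls this between aggregations. */
static int my_after_aggregation(struct damon_ctx *c)
{
	struct damon_target *t;
	struct damon_region *r;

	damon_for_each_target(t, c) {
		damon_for_each_region(r, t)
			pr_info("region %lu-%lu: %u accesses\n",
				r->ar.start, r->ar.end, r->nr_accesses);
	}
	return 0;	/* non-zero would stop the monitoring */
}

static int my_monitor_pid(unsigned long pid)
{
	struct damon_ctx *ctx = damon_new_ctx();
	int err;

	if (!ctx)
		return -ENOMEM;
	damon_va_set_primitives(ctx);	/* monitor virtual address spaces */
	ctx->callback.after_aggregation = my_after_aggregation;
	/* sample every 5ms, aggregate every 100ms, update targets every 1s */
	err = damon_set_attrs(ctx, 5000, 100000, 1000000, 10, 1000);
	if (!err)
		err = damon_set_targets(ctx, &pid, 1);
	if (!err)
		err = damon_start(&ctx, 1);
	if (err)
		damon_destroy_ctx(ctx);
	return err;
}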
include/linux/highmem-internal.h

@@ -90,7 +90,11 @@ static inline void __kunmap_local(void *vaddr)
 
 static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 {
-	preempt_disable();
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		migrate_disable();
+	else
+		preempt_disable();
+
 	pagefault_disable();
 	return __kmap_local_page_prot(page, prot);
 }
@@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct page *page)
 
 static inline void *kmap_atomic_pfn(unsigned long pfn)
 {
-	preempt_disable();
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		migrate_disable();
+	else
+		preempt_disable();
+
 	pagefault_disable();
 	return __kmap_local_pfn_prot(pfn, kmap_prot);
 }
@@ -111,6 +119,9 @@ static inline void __kunmap_atomic(void *addr)
 {
 	kunmap_local_indexed(addr);
 	pagefault_enable();
-	preempt_enable();
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		migrate_enable();
+	else
+		preempt_enable();
 }
 
@@ -179,6 +190,9 @@ static inline void __kunmap_local(void *addr)
 
 static inline void *kmap_atomic(struct page *page)
 {
-	preempt_disable();
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		migrate_disable();
+	else
+		preempt_disable();
 	pagefault_disable();
 	return page_address(page);
@@ -200,6 +214,9 @@ static inline void __kunmap_atomic(void *addr)
 	kunmap_flush_on_unmap(addr);
 #endif
 	pagefault_enable();
-	preempt_enable();
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		migrate_enable();
+	else
+		preempt_enable();
 }
 
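On PREEMPT_RT, the section between kmap_atomic() and kunmap_atomic() now disables migration rather than preemption, so the returned mapping stays CPU-local while the task remains schedulable. The calling pattern is unchanged; a minimal sketch (the function name and buffer are illustrative):

#include <linux/highmem.h>
#include <linux/string.h>

static void copy_buf_to_page(struct page *page, const void *buf, size_t len)
{
	/* disables migration on RT, preemption otherwise */
	void *addr = kmap_atomic(page);

	memcpy(addr, buf, len);
	kunmap_atomic(addr);	/* re-enables accordingly */
}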
include/linux/memory.h

@@ -23,6 +23,48 @@
 
 #define MIN_MEMORY_BLOCK_SIZE     (1UL << SECTION_SIZE_BITS)
 
+/**
+ * struct memory_group - a logical group of memory blocks
+ * @nid: The node id for all memory blocks inside the memory group.
+ * @blocks: List of all memory blocks belonging to this memory group.
+ * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this
+ *			  memory group.
+ * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this
+ *			   memory group.
+ * @is_dynamic: The memory group type: static vs. dynamic.
+ * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum
+ *		 number of pages we'll have in this static memory group.
+ * @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages
+ *		  in which memory is added/removed in this dynamic memory group.
+ *		  This granularity defines the alignment of a unit in physical
+ *		  address space; it has to be at least as big as a single
+ *		  memory block.
+ *
+ * A memory group logically groups memory blocks; each memory block
+ * belongs to at most one memory group. A memory group corresponds to
+ * a memory device, such as a DIMM or a NUMA node, which spans multiple
+ * memory blocks and might even span multiple non-contiguous physical memory
+ * ranges.
+ *
+ * Modification of members after registration is serialized by memory
+ * hot(un)plug code.
+ */
+struct memory_group {
+	int nid;
+	struct list_head memory_blocks;
+	unsigned long present_kernel_pages;
+	unsigned long present_movable_pages;
+	bool is_dynamic;
+	union {
+		struct {
+			unsigned long max_pages;
+		} s;
+		struct {
+			unsigned long unit_pages;
+		} d;
+	};
+};
+
 struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long state;		/* serialized by the dev->lock */
@@ -34,6 +76,8 @@ struct memory_block {
 	 * lay at the beginning of the memory block.
 	 */
 	unsigned long nr_vmemmap_pages;
+	struct memory_group *group;	/* group (if any) for this block */
+	struct list_head group_next;	/* next block inside memory group */
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -86,7 +130,8 @@ static inline int memory_notify(unsigned long val, void *v)
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size,
-				unsigned long vmemmap_pages);
+				unsigned long vmemmap_pages,
+				struct memory_group *group);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
@@ -96,6 +141,14 @@ extern int walk_memory_blocks(unsigned long start, unsigned long size,
 			      void *arg, walk_memory_blocks_func_t func);
 extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func);
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
+
+extern int memory_group_register_static(int nid, unsigned long max_pages);
+extern int memory_group_register_dynamic(int nid, unsigned long unit_pages);
+extern int memory_group_unregister(int mgid);
+struct memory_group *memory_group_find_by_id(int mgid);
+typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
+int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg);
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
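A hypothetical sketch of driver-side registration using only the API declared above; the nid and page-count values are illustrative, and the returned mgid is what would later be handed to the hotplug core (see the MHP_NID_IS_MGID flag in the next file):

#include <linux/memory.h>
#include <linux/err.h>

/* register all memory of one DIMM-like device as a static group */
static struct memory_group *dimm_group(int nid, unsigned long dimm_pages)
{
	int mgid = memory_group_register_static(nid, dimm_pages);

	if (mgid < 0)
		return ERR_PTR(mgid);	/* negative errno on failure */
	return memory_group_find_by_id(mgid);
}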
include/linux/memory_hotplug.h

@@ -12,6 +12,7 @@ struct zone;
 struct pglist_data;
 struct mem_section;
 struct memory_block;
+struct memory_group;
 struct resource;
 struct vmem_altmap;
 
@@ -50,6 +51,11 @@ typedef int __bitwise mhp_t;
  * Only selected architectures support it with SPARSE_VMEMMAP.
  */
 #define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
+/*
+ * The nid field specifies a memory group id (mgid) instead. The memory group
+ * implies the node id (nid).
+ */
+#define MHP_NID_IS_MGID		((__force mhp_t)BIT(2))
 
 /*
  * Extended parameters for memory hotplug:
@@ -95,13 +101,15 @@ static inline void zone_seqlock_init(struct zone *zone)
 extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
 extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
-extern void adjust_present_page_count(struct zone *zone, long nr_pages);
+extern void adjust_present_page_count(struct page *page,
+				      struct memory_group *group,
+				      long nr_pages);
 /* VM interface that may be used by firmware interface */
 extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 				     struct zone *zone);
 extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-			struct zone *zone);
+			struct zone *zone, struct memory_group *group);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -130,8 +138,7 @@ static inline bool movable_node_is_enabled(void)
 	return movable_node_enabled;
 }
 
-extern void arch_remove_memory(int nid, u64 start, u64 size,
-			       struct vmem_altmap *altmap);
+extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap);
 extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
 			   struct vmem_altmap *altmap);
 
@@ -292,25 +299,27 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
 extern void try_offline_node(int nid);
-extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
-extern int remove_memory(int nid, u64 start, u64 size);
-extern void __remove_memory(int nid, u64 start, u64 size);
-extern int offline_and_remove_memory(int nid, u64 start, u64 size);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+			 struct memory_group *group);
+extern int remove_memory(u64 start, u64 size);
+extern void __remove_memory(u64 start, u64 size);
+extern int offline_and_remove_memory(u64 start, u64 size);
 
 #else
 static inline void try_offline_node(int nid) {}
 
-static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+				struct memory_group *group)
 {
 	return -EINVAL;
 }
 
-static inline int remove_memory(int nid, u64 start, u64 size)
+static inline int remove_memory(u64 start, u64 size)
 {
 	return -EBUSY;
 }
 
-static inline void __remove_memory(int nid, u64 start, u64 size) {}
+static inline void __remove_memory(u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern void set_zone_contiguous(struct zone *zone);
@@ -339,7 +348,8 @@ extern void sparse_remove_section(struct mem_section *ms,
 		unsigned long map_offset, struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
-extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+extern struct zone *zone_for_pfn_range(int online_type, int nid,
+		struct memory_group *group, unsigned long start_pfn,
 		unsigned long nr_pages);
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
 				      struct mhp_params *params);
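A sketch of the reworked hot-unplug calling convention, assuming start and size are memory-block aligned; this is illustrative only, and offline_and_remove_memory() above already wraps the same sequence:

#include <linux/memory_hotplug.h>

static int sketch_unplug(u64 start, u64 size, struct memory_group *group)
{
	/* group may be NULL for memory that belongs to no group */
	int rc = offline_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, group);

	if (rc)
		return rc;
	return remove_memory(start, size);	/* no nid argument anymore */
}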
include/linux/mmzone.h

@@ -540,6 +540,10 @@ struct zone {
 	 * is calculated as:
 	 *	present_pages = spanned_pages - absent_pages(pages in holes);
 	 *
+	 * present_early_pages is present pages existing within the zone
+	 * located on memory available since early boot, excluding hotplugged
+	 * memory.
+	 *
 	 * managed_pages is present pages managed by the buddy system, which
 	 * is calculated as (reserved_pages includes pages allocated by the
 	 * bootmem allocator):
@@ -572,6 +576,9 @@ struct zone {
 	atomic_long_t		managed_pages;
 	unsigned long		spanned_pages;
 	unsigned long		present_pages;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+	unsigned long		present_early_pages;
+#endif
 #ifdef CONFIG_CMA
 	unsigned long		cma_pages;
 #endif
@@ -1525,18 +1532,6 @@ void sparse_init(void);
 #define subsection_map_init(_pfn, _nr_pages) do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
-/*
- * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
- * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
- * pfn_valid_within() should be used in this case; we optimise this away
- * when we have no holes within a MAX_ORDER_NR_PAGES block.
- */
-#ifdef CONFIG_HOLES_IN_ZONE
-#define pfn_valid_within(pfn) pfn_valid(pfn)
-#else
-#define pfn_valid_within(pfn) (1)
-#endif
-
 #endif /* !__GENERATING_BOUNDS.H */
 #endif /* !__ASSEMBLY__ */
 #endif /* _LINUX_MMZONE_H */
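For reference, the counters documented above nest as spanned_pages >= present_pages >= managed_pages, with present_early_pages the boot-time subset of present_pages. A trivial sketch that only reads the public counters (the function name is illustrative):

#include <linux/mmzone.h>
#include <linux/printk.h>

static void zone_report(struct zone *z)
{
	pr_info("%s: spanned=%lu present=%lu managed=%lu\n",
		z->name, z->spanned_pages, z->present_pages,
		zone_managed_pages(z));
}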
include/linux/once.h

@@ -16,7 +16,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
  * out the condition into a nop. DO_ONCE() guarantees type safety of
  * arguments!
  *
- * Not that the following is not equivalent ...
+ * Note that the following is not equivalent ...
  *
  *   DO_ONCE(func, arg);
  *   DO_ONCE(func, arg);
include/linux/page-flags.h

@@ -131,7 +131,7 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
 	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
 	PG_young,
 	PG_idle,
 #endif
@@ -178,6 +178,8 @@ enum pageflags {
 	PG_reported = PG_uptodate,
 };
 
+#define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
+
 #ifndef __GENERATING_BOUNDS_H
 
 static inline unsigned long _compound_head(const struct page *page)
@@ -439,7 +441,7 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
 TESTPAGEFLAG(Young, young, PF_ANY)
 SETPAGEFLAG(Young, young, PF_ANY)
 TESTCLEARFLAG(Young, young, PF_ANY)
@@ -831,7 +833,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
  * alloc-free cycle to prevent from reusing the page.
  */
 #define PAGE_FLAGS_CHECK_AT_PREP	\
-	(((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
+	(PAGEFLAGS_MASK & ~__PG_HWPOISON)
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1UL << PG_private | 1UL << PG_private_2)
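PAGEFLAGS_MASK names the previously open-coded (1UL << NR_PAGEFLAGS) - 1. The point of the mask is that page->flags also encodes node/zone/section information in its upper bits, so only the low NR_PAGEFLAGS bits are actual flags. A one-line illustration (the function name is made up):

#include <linux/page-flags.h>
#include <linux/printk.h>

static void dump_true_flags(struct page *page)
{
	/* upper bits of page->flags encode node/zone/section, not flags */
	pr_info("flags=%#lx\n", page->flags & PAGEFLAGS_MASK);
}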
include/linux/page_ext.h

@@ -19,7 +19,7 @@ struct page_ext_operations {
 enum page_ext_flags {
 	PAGE_EXT_OWNER,
 	PAGE_EXT_OWNER_ALLOCATED,
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 	PAGE_EXT_YOUNG,
 	PAGE_EXT_IDLE,
 #endif
include/linux/page_idle.h

@@ -6,7 +6,7 @@
 #include <linux/page-flags.h>
 #include <linux/page_ext.h>
 
-#ifdef CONFIG_IDLE_PAGE_TRACKING
+#ifdef CONFIG_PAGE_IDLE_FLAG
 
 #ifdef CONFIG_64BIT
 static inline bool page_is_young(struct page *page)
@@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page)
 }
 #endif /* CONFIG_64BIT */
 
-#else /* !CONFIG_IDLE_PAGE_TRACKING */
+#else /* !CONFIG_PAGE_IDLE_FLAG */
 
 static inline bool page_is_young(struct page *page)
 {
@@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page)
 {
 }
 
-#endif /* CONFIG_IDLE_PAGE_TRACKING */
+#endif /* CONFIG_PAGE_IDLE_FLAG */
 
 #endif /* _LINUX_MM_PAGE_IDLE_H */
include/linux/pagemap.h

@@ -521,18 +521,17 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
  */
 static inline pgoff_t page_to_index(struct page *page)
 {
-	pgoff_t pgoff;
+	struct page *head;
 
 	if (likely(!PageTransTail(page)))
 		return page->index;
 
+	head = compound_head(page);
 	/*
 	 *  We don't initialize ->index for tail pages: calculate based on
 	 *  head page
 	 */
-	pgoff = compound_head(page)->index;
-	pgoff += page - compound_head(page);
-	return pgoff;
+	return head->index + page - head;
 }
 
 extern pgoff_t hugetlb_basepage_index(struct page *page);
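The rewrite works because the tail pages of a compound page sit contiguously after the head page in the memmap, so the pointer difference `page - head` is exactly the tail's offset in pages; adding it to `head->index` yields the index in one expression, and compound_head() is evaluated once instead of twice.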
include/linux/sched/user.h

@@ -4,6 +4,7 @@
 
 #include <linux/uidgid.h>
 #include <linux/atomic.h>
+#include <linux/percpu_counter.h>
 #include <linux/refcount.h>
 #include <linux/ratelimit.h>
 
@@ -13,7 +14,7 @@
 struct user_struct {
 	refcount_t __count;	/* reference count */
 #ifdef CONFIG_EPOLL
-	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
+	struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
 #endif
 	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
 	atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
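Switching epoll_watches from atomic_long_t to a percpu_counter trades one contended, exactly-coherent cacheline for per-CPU batched updates. A generic sketch of the pattern; the function name and the limit check are illustrative, not the actual fs/eventpoll.c code:

#include <linux/percpu_counter.h>

static int watch_charge(struct percpu_counter *watches, long limit)
{
	percpu_counter_inc(watches);	/* cheap per-CPU fast path */
	if (percpu_counter_read_positive(watches) > limit) {	/* approximate read */
		percpu_counter_dec(watches);
		return -ENOSPC;
	}
	return 0;
}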
include/linux/threads.h

@@ -38,7 +38,7 @@
  * Define a minimum number of pids per cpu.  Heuristically based
  * on original pid max of 32k for 32 cpus.  Also, increase the
  * minimum settable value for pid_max on the running system based
- * on similar defaults.  See kernel/pid.c:pidmap_init() for details.
+ * on similar defaults.  See kernel/pid.c:pid_idr_init() for details.
  */
 #define PIDS_PER_CPU_DEFAULT	1024
 #define PIDS_PER_CPU_MIN	8
include/linux/units.h

@@ -20,9 +20,13 @@
 #define PICO	1000000000000ULL
 #define FEMTO	1000000000000000ULL
 
-#define MILLIWATT_PER_WATT	1000L
-#define MICROWATT_PER_MILLIWATT	1000L
-#define MICROWATT_PER_WATT	1000000L
+#define HZ_PER_KHZ		1000UL
+#define KHZ_PER_MHZ		1000UL
+#define HZ_PER_MHZ		1000000UL
+
+#define MILLIWATT_PER_WATT	1000UL
+#define MICROWATT_PER_MILLIWATT	1000UL
+#define MICROWATT_PER_WATT	1000000UL
 
 #define ABSOLUTE_ZERO_MILLICELSIUS -273150
 
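With the frequency constants centralized here, ad-hoc `* 1000` conversions can be spelled out. A trivial example (the helper name is made up):

#include <linux/units.h>

static inline unsigned long mhz_to_hz(unsigned long mhz)
{
	return mhz * HZ_PER_MHZ;	/* == mhz * KHZ_PER_MHZ * HZ_PER_KHZ */
}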
include/linux/vmalloc.h

@@ -225,9 +225,6 @@ static inline bool is_vm_area_hugepages(const void *addr)
 }
 
 #ifdef CONFIG_MMU
-int vmap_range(unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift);
 void vunmap_range(unsigned long addr, unsigned long end);
 static inline void set_vm_flush_reset_perms(void *addr)
 {
include/trace/events/damon.h (new file)

@@ -0,0 +1,43 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM damon

#if !defined(_TRACE_DAMON_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_DAMON_H

#include <linux/damon.h>
#include <linux/types.h>
#include <linux/tracepoint.h>

TRACE_EVENT(damon_aggregated,

	TP_PROTO(struct damon_target *t, struct damon_region *r,
		unsigned int nr_regions),

	TP_ARGS(t, r, nr_regions),

	TP_STRUCT__entry(
		__field(unsigned long, target_id)
		__field(unsigned int, nr_regions)
		__field(unsigned long, start)
		__field(unsigned long, end)
		__field(unsigned int, nr_accesses)
	),

	TP_fast_assign(
		__entry->target_id = t->id;
		__entry->nr_regions = nr_regions;
		__entry->start = r->ar.start;
		__entry->end = r->ar.end;
		__entry->nr_accesses = r->nr_accesses;
	),

	TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u",
			__entry->target_id, __entry->nr_regions,
			__entry->start, __entry->end, __entry->nr_accesses)
);

#endif /* _TRACE_DAMON_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
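With this header compiled in, the event behaves like any other tracepoint, e.g. it appears under /sys/kernel/tracing/events/damon/damon_aggregated; once monitoring runs, each aggregation interval emits one record per region in the TP_printk format above (a hypothetical record: target_id=42 nr_regions=10 4096-20480: 3).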
include/trace/events/mmflags.h

@@ -75,7 +75,7 @@
 #define IF_HAVE_PG_HWPOISON(flag,string)
 #endif
 
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
 #define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string}
 #else
 #define IF_HAVE_PG_IDLE(flag,string)
include/trace/events/page_ref.h

@@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template,
 
 	TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d",
 		__entry->pfn,
-		show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+		show_page_flags(__entry->flags & PAGEFLAGS_MASK),
 		__entry->count,
 		__entry->mapcount, __entry->mapping, __entry->mt,
 		__entry->val)
@@ -88,7 +88,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template,
 
 	TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d ret=%d",
 		__entry->pfn,
-		show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+		show_page_flags(__entry->flags & PAGEFLAGS_MASK),
 		__entry->count,
 		__entry->mapcount, __entry->mapping, __entry->mt,
 		__entry->val, __entry->ret)
init/initramfs.c

@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/namei.h>
 #include <linux/init_syscalls.h>
+#include <linux/umh.h>
 
 static ssize_t __init xwrite(struct file *file, const char *p, size_t count,
 		loff_t *pos)
@@ -727,6 +728,7 @@ static int __init populate_rootfs(void)
 {
 	initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL,
 						 &initramfs_domain);
+	usermodehelper_enable();
 	if (!initramfs_async)
 		wait_for_initramfs();
 	return 0;
init/main.c

@@ -777,6 +777,8 @@ void __init __weak poking_init(void) { }
 
 void __init __weak pgtable_cache_init(void) { }
 
+void __init __weak trap_init(void) { }
+
 bool initcall_debug;
 core_param(initcall_debug, initcall_debug, bool, 0644);
 
@@ -1392,7 +1394,6 @@ static void __init do_basic_setup(void)
 	driver_init();
 	init_irq_proc();
 	do_ctors();
-	usermodehelper_enable();
 	do_initcalls();
 }
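Taken together with the initramfs.c hunks above and the noinitramfs.c hunks that follow, this moves usermodehelper_enable() out of the generic boot path: instead of being switched on in do_basic_setup() right before the initcalls, usermode helpers are now enabled by whichever code actually populates the rootfs they would execute from.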
init/noinitramfs.c

@@ -10,6 +10,7 @@
 #include <linux/kdev_t.h>
 #include <linux/syscalls.h>
 #include <linux/init_syscalls.h>
+#include <linux/umh.h>
 
 /*
  * Create a simple rootfs that is similar to the default initramfs
@@ -18,6 +19,7 @@ static int __init default_rootfs(void)
 {
 	int err;
 
+	usermodehelper_enable();
 	err = init_mkdir("/dev", 0755);
 	if (err < 0)
 		goto out;
ipc/util.c

@@ -788,21 +788,13 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s)
 static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
 					      loff_t *new_pos)
 {
-	struct kern_ipc_perm *ipc;
-	int total, id;
+	struct kern_ipc_perm *ipc = NULL;
+	int max_idx = ipc_get_maxidx(ids);
 
-	total = 0;
-	for (id = 0; id < pos && total < ids->in_use; id++) {
-		ipc = idr_find(&ids->ipcs_idr, id);
-		if (ipc != NULL)
-			total++;
-	}
-
-	ipc = NULL;
-	if (total >= ids->in_use)
+	if (max_idx == -1 || pos > max_idx)
 		goto out;
 
-	for (; pos < ipc_mni; pos++) {
+	for (; pos <= max_idx; pos++) {
 		ipc = idr_find(&ids->ipcs_idr, pos);
 		if (ipc != NULL) {
 			rcu_read_lock();
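The old code re-walked the idr from index 0 on every iteration step just to decide whether pos was already past the last in-use entry, making a full walk of /proc/sysvipc/* quadratic in the number of objects. Asking the idr for its highest allocated index via ipc_get_maxidx() replaces that with a constant-time bound plus a single forward scan.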
kernel/acct.c

@@ -478,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
 	/*
 	 * Accounting records are not subject to resource limits.
 	 */
-	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	flim = rlimit(RLIMIT_FSIZE);
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 	/* Perform file operations on behalf of whoever enabled accounting */
 	orig_cred = override_creds(file->f_cred);
kernel/fork.c

@@ -1262,7 +1262,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 	rcu_read_unlock();
 	return exe_file;
 }
-EXPORT_SYMBOL(get_mm_exe_file);
 
 /**
  * get_task_exe_file - acquire a reference to the task's executable file
@@ -1285,7 +1284,6 @@ struct file *get_task_exe_file(struct task_struct *task)
 	task_unlock(task);
 	return exe_file;
 }
-EXPORT_SYMBOL(get_task_exe_file);
 
 /**
  * get_task_mm - acquire a reference to the task's mm
kernel/profile.c

@@ -41,7 +41,8 @@ struct profile_hit {
 #define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ)
 
 static atomic_t *prof_buffer;
-static unsigned long prof_len, prof_shift;
+static unsigned long prof_len;
+static unsigned short int prof_shift;
 
 int prof_on __read_mostly;
 EXPORT_SYMBOL_GPL(prof_on);

@@ -67,8 +68,8 @@ int profile_setup(char *str)
 		if (str[strlen(sleepstr)] == ',')
 			str += strlen(sleepstr) + 1;
 		if (get_option(&str, &par))
-			prof_shift = par;
-		pr_info("kernel sleep profiling enabled (shift: %ld)\n",
+			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+		pr_info("kernel sleep profiling enabled (shift: %u)\n",
 			prof_shift);
 #else
 		pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");

@@ -78,21 +79,21 @@ int profile_setup(char *str)
 		if (str[strlen(schedstr)] == ',')
 			str += strlen(schedstr) + 1;
 		if (get_option(&str, &par))
-			prof_shift = par;
-		pr_info("kernel schedule profiling enabled (shift: %ld)\n",
+			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+		pr_info("kernel schedule profiling enabled (shift: %u)\n",
 			prof_shift);
 	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
 		prof_on = KVM_PROFILING;
 		if (str[strlen(kvmstr)] == ',')
 			str += strlen(kvmstr) + 1;
 		if (get_option(&str, &par))
-			prof_shift = par;
-		pr_info("kernel KVM profiling enabled (shift: %ld)\n",
+			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+		pr_info("kernel KVM profiling enabled (shift: %u)\n",
 			prof_shift);
 	} else if (get_option(&str, &par)) {
-		prof_shift = par;
+		prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
 		prof_on = CPU_PROFILING;
-		pr_info("kernel profiling enabled (shift: %ld)\n",
+		pr_info("kernel profiling enabled (shift: %u)\n",
 			prof_shift);
 	}
 	return 1;

@@ -468,7 +469,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	unsigned long p = *ppos;
 	ssize_t read;
 	char *pnt;
-	unsigned int sample_step = 1 << prof_shift;
+	unsigned long sample_step = 1UL << prof_shift;
 
 	profile_flip_buffers();
 	if (p >= (prof_len+1)*sizeof(unsigned int))
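These profile.c hunks close off a shift hazard: prof_shift comes straight from the profile= boot parameter, and read_profile() later computes 1UL << prof_shift; shifting by BITS_PER_LONG or more is undefined behavior in C, so the value is clamped to [0, BITS_PER_LONG - 1] at parse time. A small user-space sketch of the same guard (hypothetical names, not the kernel code):

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG ((int)(sizeof(long) * CHAR_BIT))

/* Clamp a caller-supplied shift so that 1UL << shift is always defined. */
static unsigned int sanitize_shift(int requested)
{
	if (requested < 0)
		return 0;
	if (requested > BITS_PER_LONG - 1)
		return BITS_PER_LONG - 1;
	return (unsigned int)requested;
}

int main(void)
{
	unsigned int shift = sanitize_shift(99);	/* 1UL << 99 would be UB */
	unsigned long sample_step = 1UL << shift;

	printf("shift=%u sample_step=%lu\n", shift, sample_step);
	return 0;
}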
kernel/sys.c

@@ -1929,13 +1929,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
 
 	error = -EINVAL;
 
-	/*
-	 * @brk should be after @end_data in traditional maps.
-	 */
-	if (prctl_map->start_brk <= prctl_map->end_data ||
-	    prctl_map->brk <= prctl_map->end_data)
-		goto out;
-
 	/*
 	 * Neither we should allow to override limits if they set.
 	 */
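For an ET_DYN (PIE) executable the whole image, .data included, can be mapped above the brk area, so the deleted invariant (brk strictly above end_data) rejected perfectly valid maps; it only ever held for the traditional ET_EXEC layout. The dropped check, restated as a standalone predicate over a reduced stand-in struct (illustration only, not the kernel's struct prctl_mm_map):

#include <stdbool.h>
#include <stdio.h>

/* Reduced stand-in for the kernel's struct prctl_mm_map (illustration only). */
struct mm_map_sketch {
	unsigned long end_data;
	unsigned long start_brk;
	unsigned long brk;
};

/* The invariant the hunk removes: the brk area sits strictly above .data. */
static bool brk_after_data(const struct mm_map_sketch *map)
{
	return map->start_brk > map->end_data && map->brk > map->end_data;
}

int main(void)
{
	/* A PIE-style layout: image (and .data) mapped above the brk area. */
	struct mm_map_sketch pie = {
		.end_data = 0x700000UL, .start_brk = 0x400000UL, .brk = 0x410000UL
	};

	printf("old check %s this map\n",
	       brk_after_data(&pie) ? "accepts" : "rejects");
	return 0;
}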
kernel/user.c

@@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
 	return NULL;
 }
 
+static int user_epoll_alloc(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+	return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
+#else
+	return 0;
+#endif
+}
+
+static void user_epoll_free(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+	percpu_counter_destroy(&up->epoll_watches);
+#endif
+}
+
 /* IRQs are disabled and uidhash_lock is held upon function entry.
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.

@@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
+	user_epoll_free(up);
 	kmem_cache_free(uid_cachep, up);
 }
 

@@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid)
 
 		new->uid = uid;
 		refcount_set(&new->__count, 1);
+		if (user_epoll_alloc(new)) {
+			kmem_cache_free(uid_cachep, new);
+			return NULL;
+		}
 		ratelimit_state_init(&new->ratelimit, HZ, 100);
 		ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
 

@@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
+			user_epoll_free(new);
 			kmem_cache_free(uid_cachep, new);
 		} else {
 			uid_hash_insert(new, hashent);

@@ -216,6 +238,9 @@ static int __init uid_cache_init(void)
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_HLIST_HEAD(uidhash_table + n);
 
+	if (user_epoll_alloc(&root_user))
+		panic("root_user epoll percpu counter alloc failed");
+
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
 	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
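The user.c hunks attach a percpu_counter for epoll watches to each user_struct, with a strict ownership discipline: percpu_counter_init() can fail and must be unwound, every successful init needs a matching percpu_counter_destroy(), and the loser of the uid-hash lookup race tears down its counter before freeing the half-built struct. A user-space sketch of that discipline with a stand-in counter type (hypothetical names, not kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for percpu_counter: init can fail, destroy must pair with init. */
struct counter {
	long *buckets;	/* imagine one slot per CPU */
	int nbuckets;
};

static int counter_init(struct counter *c, int nbuckets)
{
	c->buckets = calloc(nbuckets, sizeof(*c->buckets));
	c->nbuckets = nbuckets;
	return c->buckets ? 0 : -1;
}

static void counter_destroy(struct counter *c)
{
	free(c->buckets);
	c->buckets = NULL;
}

struct user {
	struct counter epoll_watches;
};

/* Mirrors alloc_uid(): allocate, init the counter, unwind on failure. */
static struct user *user_alloc(void)
{
	struct user *up = malloc(sizeof(*up));

	if (!up)
		return NULL;
	if (counter_init(&up->epoll_watches, 8)) {
		free(up);	/* counter init failed: free the struct too */
		return NULL;
	}
	return up;
}

static void user_free(struct user *up)
{
	counter_destroy(&up->epoll_watches);	/* pairs with counter_init() */
	free(up);
}

int main(void)
{
	struct user *up = user_alloc();

	if (up)
		user_free(up);
	return 0;
}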
lib/Kconfig.debug

@@ -1064,7 +1064,6 @@ config HARDLOCKUP_DETECTOR
 	depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH
 	select LOCKUP_DETECTOR
 	select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF
-	select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH
 	help
 	  Say Y here to enable the kernel to act as a watchdog to detect
 	  hard lockups.

@@ -2061,8 +2060,9 @@ config TEST_MIN_HEAP
 	  If unsure, say N.
 
 config TEST_SORT
-	tristate "Array-based sort test"
-	depends on DEBUG_KERNEL || m
+	tristate "Array-based sort test" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
 	help
 	  This option enables the self-test function of 'sort()' at boot,
 	  or at module load time.

@@ -2443,8 +2443,7 @@ config SLUB_KUNIT_TEST
 
 config RATIONAL_KUNIT_TEST
 	tristate "KUnit test for rational.c" if !KUNIT_ALL_TESTS
-	depends on KUNIT
-	select RATIONAL
+	depends on KUNIT && RATIONAL
 	default KUNIT_ALL_TESTS
 	help
 	  This builds the rational math unit test.
lib/dump_stack.c

@@ -89,7 +89,8 @@ static void __dump_stack(const char *log_lvl)
 }
 
 /**
- * dump_stack - dump the current task information and its stack trace
+ * dump_stack_lvl - dump the current task information and its stack trace
+ * @log_lvl: log level
  *
  * Architectures can override this implementation by implementing its own.
  */

lib/iov_iter.c

@@ -672,7 +672,7 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
  * _copy_mc_to_iter - copy to iter with source memory error exception handling
  * @addr: source kernel address
  * @bytes: total transfer length
- * @iter: destination iterator
+ * @i: destination iterator
  *
  * The pmem driver deploys this for the dax operation
  * (dax_copy_to_iter()) for dax reads (bypass page-cache and the

@@ -690,6 +690,8 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
  *   a short copy.
+ *
+ * Return: number of bytes copied (may be %0)
  */
 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {

@@ -744,7 +746,7 @@ EXPORT_SYMBOL(_copy_from_iter_nocache);
  * _copy_from_iter_flushcache - write destination through cpu cache
  * @addr: destination kernel address
  * @bytes: total transfer length
- * @iter: source iterator
+ * @i: source iterator
  *
  * The pmem driver arranges for filesystem-dax to use this facility via
  * dax_copy_from_iter() for ensuring that writes to persistent memory

@@ -753,6 +755,8 @@ EXPORT_SYMBOL(_copy_from_iter_nocache);
  * all iterator types. The _copy_from_iter_nocache() only attempts to
  * bypass the cache for the ITER_IOVEC case, and on some archs may use
  * instructions that strand dirty-data in the cache.
+ *
+ * Return: number of bytes copied (may be %0)
  */
 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 {
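All five hunks above are kernel-doc repairs, not functional changes: the identifier before the dash must match the function name, every @tag must name a real parameter (hence @iter becoming @i), and a Return: section documents the result; scripts/kernel-doc warns when any of these drift. A minimal well-formed comment on a hypothetical helper:

#include <stddef.h>
#include <string.h>

/**
 * buf_copy - copy @len bytes between buffers
 * @dst: destination address
 * @src: source address
 * @len: total transfer length
 *
 * The tag names (@dst, @src, @len) must match the parameter list exactly,
 * and the name before the dash must match the function, or scripts/kernel-doc
 * emits warnings like the ones these hunks silence.
 *
 * Return: number of bytes copied (may be 0)
 */
size_t buf_copy(void *dst, const void *src, size_t len)
{
	if (!dst || !src)
		return 0;
	memcpy(dst, src, len);
	return len;
}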
lib/math/Kconfig

@@ -14,4 +14,4 @@ config PRIME_NUMBERS
 	  If unsure, say N.
 
 config RATIONAL
-	bool
+	tristate

lib/math/rational.c

@@ -13,6 +13,7 @@
 #include <linux/export.h>
 #include <linux/minmax.h>
 #include <linux/limits.h>
+#include <linux/module.h>
 
 /*
  * calculate best rational approximation for a given fraction

@@ -106,3 +107,5 @@ void rational_best_approximation(
 }
 
 EXPORT_SYMBOL(rational_best_approximation);
+
+MODULE_LICENSE("GPL v2");
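Taken together, these three hunks are the standard recipe for letting a built-in library also build as a module: the Kconfig symbol goes from bool to tristate, the source pulls in linux/module.h, and it declares a license, since a loadable object without MODULE_LICENSE taints the kernel and is cut off from GPL-only symbols. The same tail on a hypothetical lib/foo.c (kernel-tree sketch, not standalone code):

/* Tail of a hypothetical lib/foo.c after a bool -> tristate conversion. */
#include <linux/export.h>
#include <linux/module.h>

int foo_double(int x)
{
	return 2 * x;
}
EXPORT_SYMBOL(foo_double);	/* still visible to other modules when =m */

MODULE_LICENSE("GPL v2");	/* mandatory once the object can be a .ko */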
lib/test_printf.c

@@ -614,7 +614,7 @@ page_flags_test(int section, int node, int zone, int last_cpupid,
 	bool append = false;
 	int i;
 
-	flags &= BIT(NR_PAGEFLAGS) - 1;
+	flags &= PAGEFLAGS_MASK;
 	if (flags) {
 		page_flags |= flags;
 		snprintf(cmp_buf + size, BUF_SIZE - size, "%s", name);
lib/test_sort.c

@@ -1,4 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
+
+#include <kunit/test.h>
+
 #include <linux/sort.h>
 #include <linux/slab.h>
 #include <linux/module.h>

@@ -7,18 +10,17 @@
 
 #define TEST_LEN 1000
 
-static int __init cmpint(const void *a, const void *b)
+static int cmpint(const void *a, const void *b)
 {
 	return *(int *)a - *(int *)b;
 }
 
-static int __init test_sort_init(void)
+static void test_sort(struct kunit *test)
 {
-	int *a, i, r = 1, err = -ENOMEM;
+	int *a, i, r = 1;
 
-	a = kmalloc_array(TEST_LEN, sizeof(*a), GFP_KERNEL);
-	if (!a)
-		return err;
+	a = kunit_kmalloc_array(test, TEST_LEN, sizeof(*a), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a);
 
 	for (i = 0; i < TEST_LEN; i++) {
 		r = (r * 725861) % 6599;

@@ -27,24 +29,20 @@ static int __init test_sort_init(void)
 
 	sort(a, TEST_LEN, sizeof(*a), cmpint, NULL);
 
-	err = -EINVAL;
 	for (i = 0; i < TEST_LEN-1; i++)
-		if (a[i] > a[i+1]) {
-			pr_err("test has failed\n");
-			goto exit;
-		}
-	err = 0;
-	pr_info("test passed\n");
-exit:
-	kfree(a);
-	return err;
+		KUNIT_ASSERT_LE(test, a[i], a[i + 1]);
 }
 
-static void __exit test_sort_exit(void)
-{
-}
+static struct kunit_case sort_test_cases[] = {
+	KUNIT_CASE(test_sort),
+	{}
+};
+
+static struct kunit_suite sort_test_suite = {
+	.name = "lib_sort",
+	.test_cases = sort_test_cases,
+};
+
+kunit_test_suites(&sort_test_suite);
 
-module_init(test_sort_init);
-module_exit(test_sort_exit);
 MODULE_LICENSE("GPL");
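The conversion follows the usual KUnit shape: the test body takes a struct kunit pointer, KUNIT_ASSERT_* aborts the case on failure (replacing the err/goto bookkeeping), kunit_kmalloc_array() ties the allocation to the test lifetime so no kfree() is needed, and kunit_test_suites() stands in for module_init()/module_exit(). A bare skeleton of that shape (hypothetical suite name, kernel-tree sketch):

#include <kunit/test.h>

static void example_test(struct kunit *test)
{
	int *p = kunit_kmalloc(test, sizeof(*p), GFP_KERNEL);

	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p);	/* aborts this case on failure */
	*p = 41;
	KUNIT_EXPECT_EQ(test, *p + 1, 42);	/* records failure, keeps going */
	/* no kfree(): test-managed memory is released when the case ends */
}

static struct kunit_case example_cases[] = {
	KUNIT_CASE(example_test),
	{}
};

static struct kunit_suite example_suite = {
	.name = "example",
	.test_cases = example_cases,
};

kunit_test_suites(&example_suite);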
lib/vsprintf.c

@@ -2019,7 +2019,7 @@ static const struct page_flags_fields pff[] = {
 static
 char *format_page_flags(char *buf, char *end, unsigned long flags)
 {
-	unsigned long main_flags = flags & (BIT(NR_PAGEFLAGS) - 1);
+	unsigned long main_flags = flags & PAGEFLAGS_MASK;
 	bool append = false;
 	int i;
 
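Both printf-side hunks swap the open-coded BIT(NR_PAGEFLAGS) - 1 for the PAGEFLAGS_MASK macro introduced earlier in this series, so the set of genuine page-flag bits is defined in one place instead of two. A user-space sketch of the masking idiom (stand-in names and widths, not the kernel's definitions):

#include <stdio.h>

#define NR_FLAGS	24				/* stand-in for NR_PAGEFLAGS */
#define FLAGS_MASK	((1UL << NR_FLAGS) - 1)		/* stand-in for PAGEFLAGS_MASK */

int main(void)
{
	unsigned long word = 0x12345678UL;

	/* Keep the low flag bits; drop the section/node/zone fields above them. */
	printf("flags=%#lx\n", word & FLAGS_MASK);
	return 0;
}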
mm/Kconfig (15 changed lines)
@@ -96,9 +96,6 @@ config HAVE_FAST_GUP
 	depends on MMU
 	bool
 
-config HOLES_IN_ZONE
-	bool
-
 # Don't discard allocated memory used to track "memory" and "reserved" memblocks
 # after early boot, so it can still be used to test for validity of memory.
 # Also, memblocks are updated with memory hot(un)plug.

@@ -742,10 +739,18 @@ config DEFERRED_STRUCT_PAGE_INIT
 	  lifetime of the system until these kthreads finish the
 	  initialisation.
 
+config PAGE_IDLE_FLAG
+	bool
+	select PAGE_EXTENSION if !64BIT
+	help
+	  This adds PG_idle and PG_young flags to 'struct page'.  PTE Accessed
+	  bit writers can set the state of the bit in the flags so that PTE
+	  Accessed bit readers may avoid disturbance.
+
 config IDLE_PAGE_TRACKING
 	bool "Enable idle page tracking"
 	depends on SYSFS && MMU
-	select PAGE_EXTENSION if !64BIT
+	select PAGE_IDLE_FLAG
 	help
 	  This feature allows to estimate the amount of user pages that have
 	  not been touched during a given period of time. This information can

@@ -889,4 +894,6 @@ config IO_MAPPING
 config SECRETMEM
 	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
 
+source "mm/damon/Kconfig"
+
 endmenu
Some files were not shown because too many files have changed in this diff.