Mirror of https://github.com/torvalds/linux.git, synced 2025-10-31 00:28:52 +02:00
	Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
 "146 patches.

  Subsystems affected by this patch series: kthread, ia64, scripts, ntfs,
  squashfs, ocfs2, vfs, and mm (slab-generic, slab, kmemleak, dax, kasan,
  debug, pagecache, gup, shmem, frontswap, memremap, memcg, selftests,
  pagemap, dma, vmalloc, memory-failure, hugetlb, userfaultfd, vmscan,
  mempolicy, oom-kill, hugetlbfs, migration, thp, ksm, page-poison,
  percpu, rmap, zswap, zram, cleanups, hmm, and damon)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (146 commits)
  mm/damon: hide kernel pointer from tracepoint event
  mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log
  mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging
  mm/damon/dbgfs: remove an unnecessary variable
  mm/damon: move the implementation of damon_insert_region to damon.h
  mm/damon: add access checking for hugetlb pages
  Docs/admin-guide/mm/damon/usage: update for schemes statistics
  mm/damon/dbgfs: support all DAMOS stats
  Docs/admin-guide/mm/damon/reclaim: document statistics parameters
  mm/damon/reclaim: provide reclamation statistics
  mm/damon/schemes: account how many times quota limit has exceeded
  mm/damon/schemes: account scheme actions that successfully applied
  mm/damon: remove a mistakenly added comment for a future feature
  Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts
  Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning
  Docs/admin-guide/mm/damon/usage: remove redundant information
  Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks
  mm/damon: convert macro functions to static inline functions
  mm/damon: modify damon_rand() macro to static inline function
  mm/damon: move damon_rand() definition into damon.h
  ...
This commit is contained in commit f56caedaf9.
211 changed files with 3825 additions and 1604 deletions
		|  | @ -29,12 +29,14 @@ Brief summary of control files:: | |||
|  hugetlb.<hugepagesize>.max_usage_in_bytes             # show max "hugepagesize" hugetlb  usage recorded | ||||
|  hugetlb.<hugepagesize>.usage_in_bytes                 # show current usage for "hugepagesize" hugetlb | ||||
|  hugetlb.<hugepagesize>.failcnt                        # show the number of allocation failure due to HugeTLB usage limit | ||||
|  hugetlb.<hugepagesize>.numa_stat                      # show the numa information of the hugetlb memory charged to this cgroup | ||||
| 
 | ||||
| For a system supporting three hugepage sizes (64k, 32M and 1G), the control | ||||
| files include:: | ||||
| 
 | ||||
|   hugetlb.1GB.limit_in_bytes | ||||
|   hugetlb.1GB.max_usage_in_bytes | ||||
|   hugetlb.1GB.numa_stat | ||||
|   hugetlb.1GB.usage_in_bytes | ||||
|   hugetlb.1GB.failcnt | ||||
|   hugetlb.1GB.rsvd.limit_in_bytes | ||||
|  | @ -43,6 +45,7 @@ files include:: | |||
|   hugetlb.1GB.rsvd.failcnt | ||||
|   hugetlb.64KB.limit_in_bytes | ||||
|   hugetlb.64KB.max_usage_in_bytes | ||||
|   hugetlb.64KB.numa_stat | ||||
|   hugetlb.64KB.usage_in_bytes | ||||
|   hugetlb.64KB.failcnt | ||||
|   hugetlb.64KB.rsvd.limit_in_bytes | ||||
|  | @ -51,6 +54,7 @@ files include:: | |||
|   hugetlb.64KB.rsvd.failcnt | ||||
|   hugetlb.32MB.limit_in_bytes | ||||
|   hugetlb.32MB.max_usage_in_bytes | ||||
|   hugetlb.32MB.numa_stat | ||||
|   hugetlb.32MB.usage_in_bytes | ||||
|   hugetlb.32MB.failcnt | ||||
|   hugetlb.32MB.rsvd.limit_in_bytes | ||||
|  |  | |||
|  | @ -1268,6 +1268,9 @@ PAGE_SIZE multiple when read back. | |||
| 		The number of processes belonging to this cgroup | ||||
| 		killed by any kind of OOM killer. | ||||
| 
 | ||||
|           oom_group_kill | ||||
|                 The number of times a group OOM has occurred. | ||||
| 
 | ||||
|   memory.events.local | ||||
| 	Similar to memory.events but the fields in the file are local | ||||
| 	to the cgroup i.e. not hierarchical. The file modified event | ||||
|  | @ -1311,6 +1314,9 @@ PAGE_SIZE multiple when read back. | |||
| 	  sock (npn) | ||||
| 		Amount of memory used in network transmission buffers | ||||
| 
 | ||||
| 	  vmalloc (npn) | ||||
| 		Amount of memory used for vmap backed memory. | ||||
| 
 | ||||
| 	  shmem | ||||
| 		Amount of cached filesystem data that is swap-backed, | ||||
| 		such as tmpfs, shm segments, shared anonymous mmap()s | ||||
|  | @ -2260,6 +2266,11 @@ HugeTLB Interface Files | |||
| 	are local to the cgroup i.e. not hierarchical. The file modified event | ||||
| 	generated on this file reflects only the local events. | ||||
| 
 | ||||
|   hugetlb.<hugepagesize>.numa_stat | ||||
| 	Similar to memory.numa_stat, it shows the numa information of the | ||||
|         hugetlb pages of <hugepagesize> in this cgroup.  Only actively in-use | ||||
|         hugetlb pages are included.  The per-node values are in bytes. | ||||
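As an illustration only (the exact output format is an assumption based on the
description above; the ``2MB`` page size and the node values are made up),
reading the file on a two-node system might look like::

    # cat hugetlb.2MB.numa_stat
    total=20971520 N0=20971520 N1=0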
| 
 | ||||
| Misc | ||||
| ---- | ||||
| 
 | ||||
|  |  | |||
|  | @ -208,6 +208,31 @@ PID of the DAMON thread. | |||
| If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.  Else, | ||||
| -1. | ||||
| 
 | ||||
| nr_reclaim_tried_regions | ||||
| ------------------------ | ||||
| 
 | ||||
| Number of memory regions that DAMON_RECLAIM tried to reclaim. | ||||
| 
 | ||||
| bytes_reclaim_tried_regions | ||||
| --------------------------- | ||||
| 
 | ||||
| Total bytes of the memory regions that DAMON_RECLAIM tried to reclaim. | ||||
| 
 | ||||
| nr_reclaimed_regions | ||||
| -------------------- | ||||
| 
 | ||||
| Number of memory regions that DAMON_RECLAIM successfully reclaimed. | ||||
| 
 | ||||
| bytes_reclaimed_regions | ||||
| ----------------------- | ||||
| 
 | ||||
| Total bytes of the memory regions that DAMON_RECLAIM successfully reclaimed. | ||||
| 
 | ||||
| nr_quota_exceeds | ||||
| ---------------- | ||||
| 
 | ||||
| Number of times that the time/space quota limits have been exceeded. | ||||
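Assuming the counters are exposed alongside the other DAMON_RECLAIM module
parameters (the path below is an assumption, not taken from the patch), they
could be read with::

    # cd /sys/module/damon_reclaim/parameters
    # cat nr_reclaim_tried_regions bytes_reclaim_tried_regions
    # cat nr_reclaimed_regions bytes_reclaimed_regions nr_quota_exceeds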
| 
 | ||||
| Example | ||||
| ======= | ||||
| 
 | ||||
|  |  | |||
|  | @ -7,37 +7,40 @@ Detailed Usages | |||
| DAMON provides the three interfaces below for different users. | ||||
| 
 | ||||
| - *DAMON user space tool.* | ||||
|   This is for privileged people such as system administrators who want a | ||||
|   just-working human-friendly interface.  Using this, users can use the DAMON’s | ||||
|   major features in a human-friendly way.  It may not be highly tuned for | ||||
|   special cases, though.  It supports both virtual and physical address spaces | ||||
|   monitoring. | ||||
|   `This <https://github.com/awslabs/damo>`_ is for privileged people such as | ||||
|   system administrators who want a just-working human-friendly interface. | ||||
|   Using this, users can use DAMON’s major features in a human-friendly way. | ||||
|   It may not be highly tuned for special cases, though.  It supports both | ||||
|   virtual and physical address spaces monitoring.  For more detail, please | ||||
|   refer to its `usage document | ||||
|   <https://github.com/awslabs/damo/blob/next/USAGE.md>`_. | ||||
| - *debugfs interface.* | ||||
|   This is for privileged user space programmers who want more optimized use of | ||||
|   DAMON.  Using this, users can use DAMON’s major features by reading | ||||
|   from and writing to special debugfs files.  Therefore, you can write and use | ||||
|   your personalized DAMON debugfs wrapper programs that reads/writes the | ||||
|   debugfs files instead of you.  The DAMON user space tool is also a reference | ||||
|   implementation of such programs.  It supports both virtual and physical | ||||
|   address spaces monitoring. | ||||
|   :ref:`This <debugfs_interface>` is for privileged user space programmers who | ||||
|   want more optimized use of DAMON.  Using this, users can use DAMON’s major | ||||
|   features by reading from and writing to special debugfs files.  Therefore, | ||||
|   you can write and use your personalized DAMON debugfs wrapper programs that | ||||
|   read/write the debugfs files for you.  The `DAMON user space tool | ||||
|   <https://github.com/awslabs/damo>`_ is one example of such programs.  It | ||||
|   supports both virtual and physical address spaces monitoring.  Note that this | ||||
|   interface provides only simple :ref:`statistics <damos_stats>` for the | ||||
|   monitoring results.  For detailed monitoring results, DAMON provides a | ||||
|   :ref:`tracepoint <tracepoint>`. | ||||
| - *Kernel Space Programming Interface.* | ||||
|   This is for kernel space programmers.  Using this, users can utilize every | ||||
|   feature of DAMON most flexibly and efficiently by writing kernel space | ||||
|   DAMON application programs for you.  You can even extend DAMON for various | ||||
|   address spaces. | ||||
|   :doc:`This </vm/damon/api>` is for kernel space programmers.  Using this, | ||||
|   users can utilize every feature of DAMON most flexibly and efficiently by | ||||
|   writing kernel space DAMON application programs.  You can even extend | ||||
|   DAMON for various address spaces.  For details, please refer to the interface | ||||
|   :doc:`document </vm/damon/api>`. | ||||
| 
 | ||||
| Nevertheless, you could write your own user space tool using the debugfs | ||||
| interface.  A reference implementation is available at | ||||
| https://github.com/awslabs/damo.  If you are a kernel programmer, you could | ||||
| refer to :doc:`/vm/damon/api` for the kernel space programming interface.  For | ||||
| that reason, this document describes only the debugfs interface. | ||||
| 
 | ||||
| .. _debugfs_interface: | ||||
| 
 | ||||
| debugfs Interface | ||||
| ================= | ||||
| 
 | ||||
| DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, | ||||
| ``schemes`` and ``monitor_on`` under its debugfs directory, | ||||
| ``<debugfs>/damon/``. | ||||
| DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, | ||||
| ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and | ||||
| ``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``. | ||||
| 
 | ||||
| 
 | ||||
| Attributes | ||||
|  | @ -131,24 +134,38 @@ Schemes | |||
| 
 | ||||
| For usual DAMON-based data access aware memory management optimizations, users | ||||
| would simply want the system to apply a memory management action to a memory | ||||
| region of a specific size having a specific access frequency for a specific | ||||
| time.  DAMON receives such formalized operation schemes from the user and | ||||
| applies those to the target processes.  It also counts the total number and | ||||
| size of regions that each scheme is applied.  This statistics can be used for | ||||
| online analysis or tuning of the schemes. | ||||
| region of a specific access pattern.  DAMON receives such formalized operation | ||||
| schemes from the user and applies those to the target processes. | ||||
| 
 | ||||
| Users can get and set the schemes by reading from and writing to ``schemes`` | ||||
| debugfs file.  Reading the file also shows the statistics of each scheme.  To | ||||
| the file, each of the schemes should be represented in each line in below form: | ||||
| the file, each of the schemes should be represented on its own line, in the | ||||
| below form:: | ||||
| 
 | ||||
|     min-size max-size min-acc max-acc min-age max-age action | ||||
|     <target access pattern> <action> <quota> <watermarks> | ||||
| 
 | ||||
| Note that the ranges are closed interval.  Bytes for the size of regions | ||||
| (``min-size`` and ``max-size``), number of monitored accesses per aggregate | ||||
| interval for access frequency (``min-acc`` and ``max-acc``), number of | ||||
| aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a | ||||
| predefined integer for memory management actions should be used.  The supported | ||||
| numbers and their meanings are as below. | ||||
| You can disable schemes by simply writing an empty string to the file. | ||||
| 
 | ||||
| Target Access Pattern | ||||
| ~~~~~~~~~~~~~~~~~~~~~ | ||||
| 
 | ||||
| The ``<target access pattern>`` is constructed with three ranges in below | ||||
| form:: | ||||
| 
 | ||||
|     min-size max-size min-acc max-acc min-age max-age | ||||
| 
 | ||||
| Specifically, bytes for the size of regions (``min-size`` and ``max-size``), | ||||
| number of monitored accesses per aggregate interval for access frequency | ||||
| (``min-acc`` and ``max-acc``), number of aggregate intervals for the age of | ||||
| regions (``min-age`` and ``max-age``) are specified.  Note that the ranges are | ||||
| closed intervals. | ||||
| 
 | ||||
| Action | ||||
| ~~~~~~ | ||||
| 
 | ||||
| The ``<action>`` is a predefined integer for memory management actions, which | ||||
| DAMON will apply to the regions having the target access pattern.  The | ||||
| supported numbers and their meanings are as below. | ||||
| 
 | ||||
|  - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` | ||||
|  - 1: Call ``madvise()`` for the region with ``MADV_COLD`` | ||||
|  | @ -157,20 +174,82 @@ numbers and their meanings are as below. | |||
|  - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` | ||||
|  - 5: Do nothing but count the statistics | ||||
| 
 | ||||
| You can disable schemes by simply writing an empty string to the file.  For | ||||
| example, below commands applies a scheme saying "If a memory region of size in | ||||
| [4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate | ||||
| interval in [10, 20], page out the region", check the entered scheme again, and | ||||
| finally remove the scheme. :: | ||||
| Quota | ||||
| ~~~~~ | ||||
| 
 | ||||
| The optimal ``target access pattern`` for each ``action`` is workload dependent, | ||||
| so it is not easy to find.  Worse yet, setting a scheme's action too aggressively | ||||
| can cause severe overhead.  To avoid such overhead, users can limit the time and | ||||
| size quotas for the scheme via the ``<quota>`` in the below form:: | ||||
| 
 | ||||
|     <ms> <sz> <reset interval> <priority weights> | ||||
| 
 | ||||
| This makes DAMON try to use only up to ``<ms>`` milliseconds for applying | ||||
| the action to memory regions of the ``target access pattern`` within the | ||||
| ``<reset interval>`` milliseconds, and to apply the action to only up to | ||||
| ``<sz>`` bytes of memory regions within the ``<reset interval>``.  Setting both | ||||
| ``<ms>`` and ``<sz>`` to zero disables the quota limits. | ||||
| 
 | ||||
| When the quota limit is expected to be exceeded, DAMON prioritizes found memory | ||||
| regions of the ``target access pattern`` based on their size, access frequency, | ||||
| and age.  For personalized prioritization, users can set the weights for the | ||||
| three properties in ``<priority weights>`` in below form:: | ||||
| 
 | ||||
|     <size weight> <access frequency weight> <age weight> | ||||
| 
 | ||||
| Watermarks | ||||
| ~~~~~~~~~~ | ||||
| 
 | ||||
| Some schemes would need to run based on the current value of specific system | ||||
| metrics such as the free memory ratio.  For such cases, users can specify | ||||
| watermarks for the condition:: | ||||
| 
 | ||||
|     <metric> <check interval> <high mark> <middle mark> <low mark> | ||||
| 
 | ||||
| ``<metric>`` is a predefined integer for the metric to be checked.  The | ||||
| supported numbers and their meanings are as below. | ||||
| 
 | ||||
|  - 0: Ignore the watermarks | ||||
|  - 1: System's free memory rate (per thousand) | ||||
| 
 | ||||
| The value of the metric is checked every ``<check interval>`` microseconds. | ||||
| 
 | ||||
| If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the | ||||
| scheme is deactivated.  If the value is lower than ``<middle mark>``, the scheme | ||||
| is activated. | ||||
| 
 | ||||
| .. _damos_stats: | ||||
| 
 | ||||
| Statistics | ||||
| ~~~~~~~~~~ | ||||
| 
 | ||||
| DAMON counts the total number and total bytes of the regions that each scheme | ||||
| is tried to be applied to, the same two numbers for the regions that each | ||||
| scheme is successfully applied to, and the total number of times the quota | ||||
| limits have been exceeded.  These statistics can be used for online analysis | ||||
| or tuning of the schemes. | ||||
| 
 | ||||
| The statistics can be shown by reading the ``schemes`` file.  Reading the file | ||||
| will show each scheme you entered, one per line, and the five numbers for the | ||||
| statistics will be added at the end of each line. | ||||
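Schematically (the bracketed names below are placeholders for the values
described above, not literal file contents), a line read back from ``schemes``
therefore looks like::

    <target access pattern> <action> <quota> <watermarks> <nr_tried> <bytes_tried> <nr_applied> <bytes_applied> <nr_quota_exceeds>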
| 
 | ||||
| Example | ||||
| ~~~~~~~ | ||||
| 
 | ||||
| The below commands apply a scheme saying "If a memory region of size in [4KiB, | ||||
| 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate | ||||
| interval in [10, 20], page out the region.  For the paging out, use only up to | ||||
| 10ms per second, and also don't page out more than 1GiB per second.  Under the | ||||
| limitation, page out memory regions having longer age first.  Also, check the | ||||
| free memory rate of the system every 5 seconds, start the monitoring and paging | ||||
| out when the free memory rate becomes lower than 50%, but stop it if the free | ||||
| memory rate becomes larger than 60%, or lower than 30%":: | ||||
| 
 | ||||
|     # cd <debugfs>/damon | ||||
|     # echo "4096 8192    0 5    10 20    2" > schemes | ||||
|     # cat schemes | ||||
|     4096 8192 0 5 10 20 2 0 0 | ||||
|     # echo > schemes | ||||
| 
 | ||||
| The last two integers in the 4th line of above example is the total number and | ||||
| the total size of the regions that the scheme is applied. | ||||
|     # scheme="4096 8192  0 5    10 20    2"  # target access pattern and action | ||||
|     # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas | ||||
|     # scheme+=" 0 0 100"                     # prioritization weights | ||||
|     # scheme+=" 1 5000000 600 500 300"       # watermarks | ||||
|     # echo "$scheme" > schemes | ||||
| 
 | ||||
| 
 | ||||
| Turning On/Off | ||||
|  | @ -195,6 +274,54 @@ the monitoring is turned on.  If you write to the files while DAMON is running, | |||
| an error code such as ``-EBUSY`` will be returned. | ||||
| 
 | ||||
| 
 | ||||
| Monitoring Thread PID | ||||
| --------------------- | ||||
| 
 | ||||
| DAMON performs the requested monitoring with a kernel thread called ``kdamond``.  You | ||||
| can get the pid of the thread by reading the ``kdamond_pid`` file.  When the | ||||
| monitoring is turned off, reading the file returns ``none``. :: | ||||
| 
 | ||||
|     # cd <debugfs>/damon | ||||
|     # cat monitor_on | ||||
|     off | ||||
|     # cat kdamond_pid | ||||
|     none | ||||
|     # echo on > monitor_on | ||||
|     # cat kdamond_pid | ||||
|     18594 | ||||
| 
 | ||||
| 
 | ||||
| Using Multiple Monitoring Threads | ||||
| --------------------------------- | ||||
| 
 | ||||
| One ``kdamond`` thread is created for each monitoring context.  You can create | ||||
| and remove monitoring contexts for use cases that require multiple ``kdamond`` | ||||
| threads, using the ``mk_contexts`` and ``rm_contexts`` files. | ||||
| 
 | ||||
| Writing the name of the new context to the ``mk_contexts`` file creates a | ||||
| directory of that name under the DAMON debugfs directory.  The directory will have | ||||
| DAMON debugfs files for the context. :: | ||||
| 
 | ||||
|     # cd <debugfs>/damon | ||||
|     # ls foo | ||||
|     ls: cannot access 'foo': No such file or directory | ||||
|     # echo foo > mk_contexts | ||||
|     # ls foo | ||||
|     attrs  init_regions  kdamond_pid  schemes  target_ids | ||||
| 
 | ||||
| If the context is not needed anymore, you can remove it and the corresponding | ||||
| directory by writing the name of the context to the ``rm_contexts`` file. :: | ||||
| 
 | ||||
|     # echo foo > rm_contexts | ||||
|     # ls foo | ||||
|     ls: cannot access 'foo': No such file or directory | ||||
| 
 | ||||
| Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the | ||||
| root directory only. | ||||
| 
 | ||||
| 
 | ||||
| .. _tracepoint: | ||||
| 
 | ||||
| Tracepoint for Monitoring Results | ||||
| ================================= | ||||
| 
 | ||||
|  |  | |||
|  | @ -408,7 +408,7 @@ follows: | |||
| Memory Policy APIs | ||||
| ================== | ||||
| 
 | ||||
| Linux supports 3 system calls for controlling memory policy.  These APIS | ||||
| Linux supports 4 system calls for controlling memory policy.  These APIs | ||||
| always affect only the calling task, the calling task's address space, or | ||||
| some shared object mapped into the calling task's address space. | ||||
| 
 | ||||
|  | @ -460,6 +460,20 @@ requested via the 'flags' argument. | |||
| 
 | ||||
| See the mbind(2) man page for more details. | ||||
| 
 | ||||
| Set home node for a Range of Task's Address Space:: | ||||
| 
 | ||||
| 	long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, | ||||
| 					 unsigned long home_node, | ||||
| 					 unsigned long flags); | ||||
| 
 | ||||
| sys_set_mempolicy_home_node sets the home node for a VMA policy present in the | ||||
| task's address range. The system call updates the home node only for the existing | ||||
| mempolicy range; other address ranges are ignored. The home node is the NUMA node | ||||
| from which page allocations will preferentially be served. Specifying the home | ||||
| node overrides the default allocation policy of allocating memory close to the | ||||
| local node of the executing CPU. | ||||
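A minimal user-space sketch of the call follows. It assumes a two-node system,
assumes no libc wrapper is available, takes the syscall number 450 from the
x86_64 table added in this series, and omits error handling::

    #define _GNU_SOURCE
    #include <numaif.h>           /* mbind(), MPOL_BIND; link with -lnuma */
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_set_mempolicy_home_node
    #define __NR_set_mempolicy_home_node 450   /* x86_64 number from this series */
    #endif

    int main(void)
    {
            size_t len = 2 * 1024 * 1024;
            void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            unsigned long nodes = (1UL << 0) | (1UL << 1);   /* nodes 0 and 1, assumed to exist */

            /* The range must already carry a VMA policy. */
            mbind(buf, len, MPOL_BIND, &nodes, sizeof(nodes) * 8, 0);

            /* Ask for node 1 to be the home node of [buf, buf + len). */
            syscall(__NR_set_mempolicy_home_node,
                    (unsigned long)buf, len, 1UL, 0UL);
            return 0;
    }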
| 
 | ||||
| 
 | ||||
| Memory Policy Command Line Interface | ||||
| ==================================== | ||||
| 
 | ||||
|  |  | |||
|  | @ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep. | |||
| 
 | ||||
| The unit is in fractions of 10,000. The default value of 10 means the | ||||
| distances between watermarks are 0.1% of the available memory in the | ||||
| node/system. The maximum value is 1000, or 10% of memory. | ||||
| node/system. The maximum value is 3000, or 30% of memory. | ||||
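For example, the current value can be checked and changed at run time with
sysctl (the value written below is only illustrative)::

    # sysctl vm.watermark_scale_factor
    vm.watermark_scale_factor = 10
    # sysctl -w vm.watermark_scale_factor=1000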
| 
 | ||||
| A high rate of threads entering direct reclaim (allocstall) or kswapd | ||||
| going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate | ||||
|  |  | |||
|  | @ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data). | |||
| The "pathname" shows the name of the file associated with this mapping.  If the mapping | ||||
| is not associated with a file: | ||||
| 
 | ||||
|  =======                    ==================================== | ||||
|  =============              ==================================== | ||||
|  [heap]                     the heap of the program | ||||
|  [stack]                    the stack of the main process | ||||
|  [vdso]                     the "virtual dynamic shared object", | ||||
|                             the kernel system call handler | ||||
|  =======                    ==================================== | ||||
|  [anon:<name>]              an anonymous mapping that has been | ||||
|                             named by userspace | ||||
|  =============              ==================================== | ||||
| 
 | ||||
|  or if empty, the mapping is anonymous. | ||||
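The ``[anon:<name>]`` names are set by user space itself. A hedged sketch
follows, assuming a kernel built with anonymous VMA naming support and the
``PR_SET_VMA`` / ``PR_SET_VMA_ANON_NAME`` prctl; the fallback constant values
below are assumptions for older headers::

    #include <stddef.h>
    #include <sys/mman.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_VMA
    #define PR_SET_VMA           0x53564d41   /* assumed value */
    #define PR_SET_VMA_ANON_NAME 0            /* assumed value */
    #endif

    /* The region should then show up in /proc/<pid>/maps as [anon:my buffer]. */
    static void *named_anon_alloc(size_t len)
    {
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, (unsigned long)"my buffer");
            return p;
    }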
| 
 | ||||
|  |  | |||
|  | @ -66,9 +66,11 @@ PTE Page Table Helpers | |||
| +---------------------------+--------------------------------------------------+ | ||||
| | pte_mknotpresent          | Invalidates a mapped PTE                         | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | ptep_get_and_clear        | Clears a PTE                                     | | ||||
| | ptep_clear                | Clears a PTE                                     | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | ptep_get_and_clear_full   | Clears a PTE                                     | | ||||
| | ptep_get_and_clear        | Clears and returns PTE                           | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | ptep_get_and_clear_full   | Clears and returns PTE (batched PTE unmap)       | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | ptep_test_and_clear_young | Clears young from a PTE                          | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
|  | @ -247,12 +249,12 @@ SWAP Page Table Helpers | |||
| | __swp_to_pmd_entry        | Creates a mapped PMD from a swapped entry (arch) | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | is_migration_entry        | Tests a migration (read or write) swapped entry  | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | is_write_migration_entry  | Tests a write migration swapped entry            | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | make_migration_entry_read | Converts into read migration swapped entry       | | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| | make_migration_entry      | Creates a migration swapped entry (read or write)| | ||||
| +---------------------------+--------------------------------------------------+ | ||||
| +-------------------------------+----------------------------------------------+ | ||||
| | is_writable_migration_entry   | Tests a write migration swapped entry        | | ||||
| +-------------------------------+----------------------------------------------+ | ||||
| | make_readable_migration_entry | Creates a read migration swapped entry       | | ||||
| +-------------------------------+----------------------------------------------+ | ||||
| | make_writable_migration_entry | Creates a write migration swapped entry      | | ||||
| +-------------------------------+----------------------------------------------+ | ||||
| 
 | ||||
| [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ | ||||
|  |  | |||
|  | @ -31,10 +31,12 @@ algorithms.  If you are looking for advice on simply allocating memory, see the | |||
|    page_migration | ||||
|    page_frags | ||||
|    page_owner | ||||
|    page_table_check | ||||
|    remap_file_pages | ||||
|    slub | ||||
|    split_page_table_lock | ||||
|    transhuge | ||||
|    unevictable-lru | ||||
|    vmalloced-kernel-stacks | ||||
|    z3fold | ||||
|    zsmalloc | ||||
|  |  | |||
|  | @ -263,15 +263,15 @@ Monitoring Migration | |||
| The following events (counters) can be used to monitor page migration. | ||||
| 
 | ||||
| 1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a | ||||
|    page was migrated. If the page was a non-THP page, then this counter is | ||||
|    increased by one. If the page was a THP, then this counter is increased by | ||||
|    the number of THP subpages. For example, migration of a single 2MB THP that | ||||
|    has 4KB-size base pages (subpages) will cause this counter to increase by | ||||
|    512. | ||||
|    page was migrated. If the page was a non-THP and non-hugetlb page, then | ||||
|    this counter is increased by one. If the page was a THP or hugetlb, then | ||||
|    this counter is increased by the number of THP or hugetlb subpages. | ||||
|    For example, migration of a single 2MB THP that has 4KB-size base pages | ||||
|    (subpages) will cause this counter to increase by 512. | ||||
| 
 | ||||
| 2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for | ||||
|    PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, | ||||
|    if it was a THP. | ||||
|    if it was a THP or hugetlb. | ||||
| 
 | ||||
| 3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
Documentation/vm/page_table_check.rst (new file, 56 lines)
							|  | @ -0,0 +1,56 @@ | |||
| .. SPDX-License-Identifier: GPL-2.0 | ||||
| 
 | ||||
| .. _page_table_check: | ||||
| 
 | ||||
| ================ | ||||
| Page Table Check | ||||
| ================ | ||||
| 
 | ||||
| Introduction | ||||
| ============ | ||||
| 
 | ||||
| Page table check allows hardening the kernel by ensuring that some types of | ||||
| memory corruption are prevented. | ||||
| 
 | ||||
| Page table check performs extra verifications at the time when new pages become | ||||
| accessible from userspace by getting their page table entries (PTEs, PMDs, | ||||
| etc.) added into the table. | ||||
| 
 | ||||
| In case of detected corruption, the kernel is crashed. There is a small | ||||
| performance and memory overhead associated with the page table check. Therefore, | ||||
| it is disabled by default, but can be optionally enabled on systems where the | ||||
| extra hardening outweighs the performance costs. Also, because page table check | ||||
| is synchronous, it can help with debugging double map memory corruption issues, | ||||
| by crashing the kernel at the time the wrong mapping occurs, instead of later, | ||||
| which is often the case with memory corruption bugs. | ||||
| 
 | ||||
| Double mapping detection logic | ||||
| ============================== | ||||
| 
 | ||||
| +-------------------+-------------------+-------------------+------------------+ | ||||
| | Current Mapping   | New mapping       | Permissions       | Rule             | | ||||
| +===================+===================+===================+==================+ | ||||
| | Anonymous         | Anonymous         | Read              | Allow            | | ||||
| +-------------------+-------------------+-------------------+------------------+ | ||||
| | Anonymous         | Anonymous         | Read / Write      | Prohibit         | | ||||
| +-------------------+-------------------+-------------------+------------------+ | ||||
| | Anonymous         | Named             | Any               | Prohibit         | | ||||
| +-------------------+-------------------+-------------------+------------------+ | ||||
| | Named             | Anonymous         | Any               | Prohibit         | | ||||
| +-------------------+-------------------+-------------------+------------------+ | ||||
| | Named             | Named             | Any               | Allow            | | ||||
| +-------------------+-------------------+-------------------+------------------+ | ||||
| 
 | ||||
| Enabling Page Table Check | ||||
| ========================= | ||||
| 
 | ||||
| Build kernel with: | ||||
| 
 | ||||
| - PAGE_TABLE_CHECK=y | ||||
|   Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK | ||||
|   is available. | ||||
| 
 | ||||
| - Boot with 'page_table_check=on' kernel parameter. | ||||
| 
 | ||||
| Optionally, build the kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page | ||||
| table check support without the extra kernel parameter. | ||||
							
								
								
									
Documentation/vm/vmalloced-kernel-stacks.rst (new file, 153 lines)
							|  | @ -0,0 +1,153 @@ | |||
| .. SPDX-License-Identifier: GPL-2.0 | ||||
| 
 | ||||
| ===================================== | ||||
| Virtually Mapped Kernel Stack Support | ||||
| ===================================== | ||||
| 
 | ||||
| :Author: Shuah Khan <skhan@linuxfoundation.org> | ||||
| 
 | ||||
| .. contents:: :local: | ||||
| 
 | ||||
| Overview | ||||
| -------- | ||||
| 
 | ||||
| This is a compilation of information from the code and original patch | ||||
| series that introduced the `Virtually Mapped Kernel Stacks feature | ||||
| <https://lwn.net/Articles/694348/>`_. | ||||
| 
 | ||||
| Introduction | ||||
| ------------ | ||||
| 
 | ||||
| Kernel stack overflows are often hard to debug and make the kernel | ||||
| susceptible to exploits. Problems could show up at a later time making | ||||
| it difficult to isolate and root-cause. | ||||
| 
 | ||||
| Virtually-mapped kernel stacks with guard pages cause kernel stack | ||||
| overflows to be caught immediately rather than causing difficult to | ||||
| diagnose corruption. | ||||
| 
 | ||||
| HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable | ||||
| support for virtually mapped stacks with guard pages. This feature | ||||
| causes reliable faults when the stack overflows. The usability of | ||||
| the stack trace after overflow and response to the overflow itself | ||||
| is architecture dependent. | ||||
| 
 | ||||
| .. note:: | ||||
|         As of this writing, arm64, powerpc, riscv, s390, um, and x86 have | ||||
|         support for VMAP_STACK. | ||||
| 
 | ||||
| HAVE_ARCH_VMAP_STACK | ||||
| -------------------- | ||||
| 
 | ||||
| Architectures that can support Virtually Mapped Kernel Stacks should | ||||
| enable this bool configuration option. The requirements are: | ||||
| 
 | ||||
| - vmalloc space must be large enough to hold many kernel stacks. This | ||||
|   may rule out many 32-bit architectures. | ||||
| - Stacks in vmalloc space need to work reliably.  For example, if | ||||
|   vmap page tables are created on demand, either this mechanism | ||||
|   needs to work while the stack points to a virtual address with | ||||
|   unpopulated page tables or arch code (switch_to() and switch_mm(), | ||||
|   most likely) needs to ensure that the stack's page table entries | ||||
|   are populated before running on a possibly unpopulated stack. | ||||
| - If the stack overflows into a guard page, something reasonable | ||||
|   should happen. The definition of "reasonable" is flexible, but | ||||
|   instantly rebooting without logging anything would be unfriendly. | ||||
| 
 | ||||
| VMAP_STACK | ||||
| ---------- | ||||
| 
 | ||||
| The VMAP_STACK bool configuration option, when enabled, allocates virtually | ||||
| mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK. | ||||
| 
 | ||||
| - Enable this if you want to use virtually-mapped kernel stacks | ||||
|   with guard pages. This causes kernel stack overflows to be caught | ||||
|   immediately rather than causing difficult-to-diagnose corruption. | ||||
| 
 | ||||
| .. note:: | ||||
| 
 | ||||
|         Using this feature with KASAN requires architecture support | ||||
|         for backing virtual mappings with real shadow memory, and | ||||
|         KASAN_VMALLOC must be enabled. | ||||
| 
 | ||||
| .. note:: | ||||
| 
 | ||||
|         When VMAP_STACK is enabled, it is not possible to run DMA on stack | ||||
|         allocated data. | ||||
| 
 | ||||
| Kernel configuration options and dependencies keep changing. Refer to | ||||
| the latest code base: | ||||
| 
 | ||||
| `Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`_ | ||||
| 
 | ||||
| Allocation | ||||
| ----------- | ||||
| 
 | ||||
| When a new kernel thread is created, thread stack is allocated from | ||||
| virtually contiguous memory pages from the page level allocator. These | ||||
| pages are mapped into contiguous kernel virtual space with PAGE_KERNEL | ||||
| protections. | ||||
| 
 | ||||
| alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack | ||||
| with PAGE_KERNEL protections. | ||||
| 
 | ||||
| - Allocated stacks are cached and later reused by new threads, so memcg | ||||
|   accounting is performed manually on assigning/releasing stacks to tasks. | ||||
|   Hence, __vmalloc_node_range is called without __GFP_ACCOUNT. | ||||
| - vm_struct is cached to be able to find when thread free is initiated | ||||
|   in interrupt context. free_thread_stack() can be called in interrupt | ||||
|   context. | ||||
| - On arm64, all VMAP'd stacks need to have the same alignment to ensure | ||||
|   that VMAP'd stack overflow detection works correctly. Arch specific | ||||
|   vmap stack allocator takes care of this detail. | ||||
| - This does not address interrupt stacks, according to the original patch. | ||||
| 
 | ||||
| Thread stack allocation is initiated from clone(), fork(), vfork(), | ||||
| kernel_thread() via kernel_clone(). A few hints are left here for searching | ||||
| the code base to understand when and how the thread stack is allocated. | ||||
| 
 | ||||
| The bulk of the code is in | ||||
| `kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`_. | ||||
| 
 | ||||
| The stack_vm_area pointer in task_struct keeps track of the virtually allocated | ||||
| stack, and a non-null stack_vm_area pointer serves as an indication that | ||||
| virtually mapped kernel stacks are enabled. | ||||
| 
 | ||||
| :: | ||||
| 
 | ||||
|         struct vm_struct *stack_vm_area; | ||||
| 
 | ||||
| Stack overflow handling | ||||
| ----------------------- | ||||
| 
 | ||||
| Leading and trailing guard pages help detect stack overflows. When the stack | ||||
| overflows into the guard pages, handlers have to be careful not to overflow | ||||
| the stack again. When handlers are called, it is likely that very little | ||||
| stack space is left. | ||||
| 
 | ||||
| On x86, this is done by handling the page fault indicating the kernel | ||||
| stack overflow on the double-fault stack. | ||||
| 
 | ||||
| Testing VMAP allocation with guard pages | ||||
| ---------------------------------------- | ||||
| 
 | ||||
| How do we ensure that VMAP_STACK is actually allocating with a leading | ||||
| and trailing guard page? The following lkdtm tests can help detect any | ||||
| regressions. | ||||
| 
 | ||||
| :: | ||||
| 
 | ||||
|         void lkdtm_STACK_GUARD_PAGE_LEADING() | ||||
|         void lkdtm_STACK_GUARD_PAGE_TRAILING() | ||||
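These tests can be triggered through lkdtm's debugfs interface; a hedged
example, assuming CONFIG_LKDTM is enabled and debugfs is mounted (note that
the write intentionally crashes the kernel)::

    # echo STACK_GUARD_PAGE_LEADING > /sys/kernel/debug/provoke-crash/DIRECT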
| 
 | ||||
| Conclusions | ||||
| ----------- | ||||
| 
 | ||||
| - A percpu cache of vmalloced stacks appears to be a bit faster than a | ||||
|   high-order stack allocation, at least when the cache hits. | ||||
| - THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and | ||||
|   simply embeds the thread_info (containing only flags) and 'int cpu' into | ||||
|   task_struct. | ||||
| - The thread stack can be freed as soon as the task is dead (without | ||||
|   waiting for RCU) and then, if vmapped stacks are in use, the entire | ||||
|   stack is cached for reuse on the same cpu. | ||||
|  | @ -14541,6 +14541,15 @@ F:	include/net/page_pool.h | |||
| F:	include/trace/events/page_pool.h | ||||
| F:	net/core/page_pool.c | ||||
| 
 | ||||
| PAGE TABLE CHECK | ||||
| M:	Pasha Tatashin <pasha.tatashin@soleen.com> | ||||
| M:	Andrew Morton <akpm@linux-foundation.org> | ||||
| L:	linux-mm@kvack.org | ||||
| S:	Maintained | ||||
| F:	Documentation/vm/page_table_check.rst | ||||
| F:	include/linux/page_table_check.h | ||||
| F:	mm/page_table_check.c | ||||
| 
 | ||||
| PANASONIC LAPTOP ACPI EXTRAS DRIVER | ||||
| M:	Kenneth Chan <kenneth.t.chan@gmail.com> | ||||
| L:	platform-driver-x86@vger.kernel.org | ||||
|  |  | |||
|  | @ -1297,6 +1297,9 @@ config HAVE_ARCH_PFN_VALID | |||
| config ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||||
| 	bool | ||||
| 
 | ||||
| config ARCH_SUPPORTS_PAGE_TABLE_CHECK | ||||
| 	bool | ||||
| 
 | ||||
| config ARCH_SPLIT_ARG64 | ||||
| 	bool | ||||
| 	help | ||||
|  |  | |||
|  | @ -489,3 +489,4 @@ | |||
| # 557 reserved for memfd_secret | ||||
| 558	common	process_mrelease		sys_process_mrelease | ||||
| 559	common  futex_waitv                     sys_futex_waitv | ||||
| 560	common	set_mempolicy_home_node		sys_ni_syscall | ||||
|  |  | |||
|  | @ -165,7 +165,6 @@ do_page_fault(unsigned long address, unsigned long mmcsr, | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -176,7 +175,6 @@ do_page_fault(unsigned long address, unsigned long mmcsr, | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 
 | ||||
|  |  | |||
|  | @ -149,8 +149,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) | |||
| 	/*
 | ||||
| 	 * Fault retry nuances, mmap_lock already relinquished by core mm | ||||
| 	 */ | ||||
| 	if (unlikely((fault & VM_FAULT_RETRY) && | ||||
| 		     (flags & FAULT_FLAG_ALLOW_RETRY))) { | ||||
| 	if (unlikely(fault & VM_FAULT_RETRY)) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 		goto retry; | ||||
| 	} | ||||
|  |  | |||
|  | @ -322,7 +322,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |||
| 		return 0; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (!(fault & VM_FAULT_ERROR)) { | ||||
| 		if (fault & VM_FAULT_RETRY) { | ||||
| 			flags |= FAULT_FLAG_TRIED; | ||||
| 			goto retry; | ||||
|  |  | |||
|  | @ -463,3 +463,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common	futex_waitv			sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -38,7 +38,7 @@ | |||
| #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5) | ||||
| #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800) | ||||
| 
 | ||||
| #define __NR_compat_syscalls		450 | ||||
| #define __NR_compat_syscalls		451 | ||||
| #endif | ||||
| 
 | ||||
| #define __ARCH_WANT_SYS_CLONE | ||||
|  |  | |||
|  | @ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) | |||
| __SYSCALL(__NR_process_mrelease, sys_process_mrelease) | ||||
| #define __NR_futex_waitv 449 | ||||
| __SYSCALL(__NR_futex_waitv, sys_futex_waitv) | ||||
| #define __NR_set_mempolicy_home_node 450 | ||||
| __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) | ||||
| 
 | ||||
| /*
 | ||||
|  * Please add new compat syscalls above this comment and update | ||||
|  |  | |||
|  | @ -36,7 +36,7 @@ void *module_alloc(unsigned long size) | |||
| 		module_alloc_end = MODULES_END; | ||||
| 
 | ||||
| 	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, | ||||
| 				module_alloc_end, gfp_mask, PAGE_KERNEL, 0, | ||||
| 				module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, | ||||
| 				NUMA_NO_NODE, __builtin_return_address(0)); | ||||
| 
 | ||||
| 	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && | ||||
|  | @ -58,7 +58,7 @@ void *module_alloc(unsigned long size) | |||
| 				PAGE_KERNEL, 0, NUMA_NO_NODE, | ||||
| 				__builtin_return_address(0)); | ||||
| 
 | ||||
| 	if (p && (kasan_module_alloc(p, size) < 0)) { | ||||
| 	if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { | ||||
| 		vfree(p); | ||||
| 		return NULL; | ||||
| 	} | ||||
|  |  | |||
|  | @ -608,11 +608,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr, | |||
| 	} | ||||
| 
 | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 		mm_flags |= FAULT_FLAG_TRIED; | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 	mmap_read_unlock(mm); | ||||
| 
 | ||||
| 	/*
 | ||||
|  |  | |||
|  | @ -98,12 +98,10 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) | |||
| 
 | ||||
| 	/* The most common case -- we are done. */ | ||||
| 	if (likely(!(fault & VM_FAULT_ERROR))) { | ||||
| 		if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 		if (fault & VM_FAULT_RETRY) { | ||||
| 			flags |= FAULT_FLAG_TRIED; | ||||
| 			goto retry; | ||||
| 		} | ||||
| 		} | ||||
| 
 | ||||
| 		mmap_read_unlock(mm); | ||||
| 		return; | ||||
|  |  | |||
|  | @ -848,7 +848,7 @@ register_unwind_table (struct module *mod) | |||
| { | ||||
| 	struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr; | ||||
| 	struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start); | ||||
| 	struct unw_table_entry tmp, *e1, *e2, *core, *init; | ||||
| 	struct unw_table_entry *e1, *e2, *core, *init; | ||||
| 	unsigned long num_init = 0, num_core = 0; | ||||
| 
 | ||||
| 	/* First, count how many init and core unwind-table entries there are.  */ | ||||
|  | @ -865,9 +865,7 @@ register_unwind_table (struct module *mod) | |||
| 	for (e1 = start; e1 < end; ++e1) { | ||||
| 		for (e2 = e1 + 1; e2 < end; ++e2) { | ||||
| 			if (e2->start_offset < e1->start_offset) { | ||||
| 				tmp = *e1; | ||||
| 				*e1 = *e2; | ||||
| 				*e2 = tmp; | ||||
| 				swap(*e1, *e2); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
|  |  | |||
|  | @ -208,10 +208,7 @@ sort_regions (struct rsvd_region *rsvd_region, int max) | |||
| 	while (max--) { | ||||
| 		for (j = 0; j < max; ++j) { | ||||
| 			if (rsvd_region[j].start > rsvd_region[j+1].start) { | ||||
| 				struct rsvd_region tmp; | ||||
| 				tmp = rsvd_region[j]; | ||||
| 				rsvd_region[j] = rsvd_region[j + 1]; | ||||
| 				rsvd_region[j + 1] = tmp; | ||||
| 				swap(rsvd_region[j], rsvd_region[j + 1]); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
|  |  | |||
|  | @ -370,3 +370,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common  futex_waitv                     sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -264,6 +264,7 @@ static struct attribute * cache_default_attrs[] = { | |||
| 	&shared_cpu_map.attr, | ||||
| 	NULL | ||||
| }; | ||||
| ATTRIBUTE_GROUPS(cache_default); | ||||
| 
 | ||||
| #define to_object(k) container_of(k, struct cache_info, kobj) | ||||
| #define to_attr(a) container_of(a, struct cache_attr, attr) | ||||
|  | @ -284,7 +285,7 @@ static const struct sysfs_ops cache_sysfs_ops = { | |||
| 
 | ||||
| static struct kobj_type cache_ktype = { | ||||
| 	.sysfs_ops	= &cache_sysfs_ops, | ||||
| 	.default_attrs	= cache_default_attrs, | ||||
| 	.default_groups	= cache_default_groups, | ||||
| }; | ||||
| 
 | ||||
| static struct kobj_type cache_ktype_percpu_entry = { | ||||
|  |  | |||
|  | @ -171,7 +171,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) | |||
|  * @n_pages: number of contiguous pages to allocate | ||||
|  * | ||||
|  * Allocate the specified number of contiguous uncached pages on the | ||||
|  * the requested node. If not enough contiguous uncached pages are available | ||||
|  * requested node. If not enough contiguous uncached pages are available | ||||
|  * on the requested node, roundrobin starting with the next higher node. | ||||
|  */ | ||||
| unsigned long uncached_alloc_page(int starting_nid, int n_pages) | ||||
|  |  | |||
|  | @ -156,7 +156,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -167,7 +166,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
|  |  | |||
|  | @ -449,3 +449,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common  futex_waitv                     sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -153,7 +153,6 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -165,7 +164,6 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return 0; | ||||
|  |  | |||
|  | @ -455,3 +455,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common  futex_waitv                     sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -232,7 +232,6 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -244,7 +243,6 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 
 | ||||
|  |  | |||
|  | @ -388,3 +388,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	n32	process_mrelease		sys_process_mrelease | ||||
| 449	n32	futex_waitv			sys_futex_waitv | ||||
| 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -364,3 +364,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	n64	process_mrelease		sys_process_mrelease | ||||
| 449	n64	futex_waitv			sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -437,3 +437,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	o32	process_mrelease		sys_process_mrelease | ||||
| 449	o32	futex_waitv			sys_futex_waitv | ||||
| 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -171,7 +171,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, | |||
| 			goto do_sigbus; | ||||
| 		BUG(); | ||||
| 	} | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 
 | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -183,7 +183,6 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
|  |  | |||
|  | @ -230,7 +230,6 @@ void do_page_fault(unsigned long entry, unsigned long addr, | |||
| 			goto bad_area; | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -240,7 +239,6 @@ void do_page_fault(unsigned long entry, unsigned long addr, | |||
| 		 */ | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
|  |  | |||
|  | @ -149,7 +149,6 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -161,7 +160,6 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
|  |  | |||
|  | @ -177,7 +177,6 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	/*RGD modeled on Cris */ | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
|  | @ -189,7 +188,6 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
|  |  | |||
|  | @ -447,3 +447,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common	futex_waitv			sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -324,7 +324,6 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, | |||
| 			goto bad_area; | ||||
| 		BUG(); | ||||
| 	} | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		/*
 | ||||
| 		 * No need to mmap_read_unlock(mm) as we would | ||||
|  | @ -334,7 +333,6 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, | |||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
| 
 | ||||
|  |  | |||
|  | @ -529,3 +529,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common  futex_waitv                     sys_futex_waitv | ||||
| 450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -517,11 +517,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, | |||
| 	 * case. | ||||
| 	 */ | ||||
| 	if (unlikely(fault & VM_FAULT_RETRY)) { | ||||
| 		if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(current->mm); | ||||
| 
 | ||||
|  |  | |||
|  | @ -330,7 +330,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) | |||
| 	if (fault_signal_pending(fault, regs)) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { | ||||
| 	if (unlikely(fault & VM_FAULT_RETRY)) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
| 		/*
 | ||||
|  |  | |||
|  | @ -37,14 +37,15 @@ | |||
| 
 | ||||
| void *module_alloc(unsigned long size) | ||||
| { | ||||
| 	gfp_t gfp_mask = GFP_KERNEL; | ||||
| 	void *p; | ||||
| 
 | ||||
| 	if (PAGE_ALIGN(size) > MODULES_LEN) | ||||
| 		return NULL; | ||||
| 	p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, | ||||
| 				 GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, | ||||
| 				 gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, | ||||
| 				 __builtin_return_address(0)); | ||||
| 	if (p && (kasan_module_alloc(p, size) < 0)) { | ||||
| 	if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { | ||||
| 		vfree(p); | ||||
| 		return NULL; | ||||
| 	} | ||||
|  |  | |||
|  | @ -452,3 +452,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448  common	process_mrelease	sys_process_mrelease		sys_process_mrelease | ||||
| 449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv | ||||
| 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -452,12 +452,13 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) | |||
| 	if (unlikely(fault & VM_FAULT_ERROR)) | ||||
| 		goto out_up; | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		if (IS_ENABLED(CONFIG_PGSTE) && gmap && | ||||
| 			(flags & FAULT_FLAG_RETRY_NOWAIT)) { | ||||
| 				/* FAULT_FLAG_RETRY_NOWAIT has been set,
 | ||||
| 				 * mmap_lock has not been released */ | ||||
| 			/*
 | ||||
| 			 * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has | ||||
| 			 * not been released | ||||
| 			 */ | ||||
| 			current->thread.gmap_pfault = 1; | ||||
| 			fault = VM_FAULT_PFAULT; | ||||
| 			goto out_up; | ||||
|  | @ -467,7 +468,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) | |||
| 		mmap_read_lock(mm); | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 	if (IS_ENABLED(CONFIG_PGSTE) && gmap) { | ||||
| 		address =  __gmap_link(gmap, current->thread.gmap_addr, | ||||
| 				       address); | ||||
|  |  | |||
|  | @ -452,3 +452,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common  futex_waitv                     sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -485,7 +485,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | |||
| 		if (mm_fault_error(regs, error_code, address, fault)) | ||||
| 			return; | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -496,7 +495,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, | |||
| 		 */ | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| } | ||||
|  |  | |||
|  | @ -495,3 +495,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common  futex_waitv                     sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -200,7 +200,6 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -211,7 +210,6 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
|  |  | |||
|  | @ -437,7 +437,6 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) | |||
| 		BUG(); | ||||
| 	} | ||||
| 
 | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -448,7 +447,6 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 	mmap_read_unlock(mm); | ||||
| 
 | ||||
| 	mm_rss = get_mm_rss(mm); | ||||
|  |  | |||
|  | @ -87,13 +87,11 @@ int handle_page_fault(unsigned long address, unsigned long ip, | |||
| 			} | ||||
| 			BUG(); | ||||
| 		} | ||||
| 		if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 		if (fault & VM_FAULT_RETRY) { | ||||
| 			flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
| 			goto retry; | ||||
| 		} | ||||
| 		} | ||||
| 
 | ||||
| 		pmd = pmd_off(mm, address); | ||||
| 		pte = pte_offset_kernel(pmd, address); | ||||
|  |  | |||
|  | @ -104,6 +104,7 @@ config X86 | |||
| 	select ARCH_SUPPORTS_ACPI | ||||
| 	select ARCH_SUPPORTS_ATOMIC_RMW | ||||
| 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||||
| 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK	if X86_64 | ||||
| 	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64 | ||||
| 	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096 | ||||
| 	select ARCH_SUPPORTS_LTO_CLANG | ||||
|  |  | |||
|  | @ -454,3 +454,4 @@ | |||
| 447	i386	memfd_secret		sys_memfd_secret | ||||
| 448	i386	process_mrelease	sys_process_mrelease | ||||
| 449	i386	futex_waitv		sys_futex_waitv | ||||
| 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -371,6 +371,7 @@ | |||
| 447	common	memfd_secret		sys_memfd_secret | ||||
| 448	common	process_mrelease	sys_process_mrelease | ||||
| 449	common	futex_waitv		sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node | ||||
| 
 | ||||
| # | ||||
| # Due to a historical design error, certain syscalls are numbered differently | ||||
|  |  | |||
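
The syscall-table hunks above (and the matching ones elsewhere in this diff) wire the new set_mempolicy_home_node() call up as number 450. A minimal user-space sketch of invoking it; the raw syscall number and the (start, len, home_node, flags) argument order are taken from this series, while the wrapper name below is purely illustrative since libc support may not exist yet:

#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_set_mempolicy_home_node
#define __NR_set_mempolicy_home_node 450	/* number added by this series */
#endif

/* Hint that the memory-policy range [start, start + len) should be
 * allocated from 'home_node' first; flags must currently be 0. */
static long set_home_node(void *start, unsigned long len,
			  unsigned long home_node, unsigned long flags)
{
	return syscall(__NR_set_mempolicy_home_node,
		       (unsigned long)start, len, home_node, flags);
}
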
|  | @ -27,6 +27,7 @@ | |||
| #include <asm/pkru.h> | ||||
| #include <asm/fpu/api.h> | ||||
| #include <asm-generic/pgtable_uffd.h> | ||||
| #include <linux/page_table_check.h> | ||||
| 
 | ||||
| extern pgd_t early_top_pgt[PTRS_PER_PGD]; | ||||
| bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); | ||||
|  | @ -753,7 +754,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) | |||
| 		return true; | ||||
| 
 | ||||
| 	if ((pte_flags(a) & _PAGE_PROTNONE) && | ||||
| 			mm_tlb_flush_pending(mm)) | ||||
| 			atomic_read(&mm->tlb_flush_pending)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	return false; | ||||
|  | @ -1007,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) | |||
| static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, | ||||
| 			      pte_t *ptep, pte_t pte) | ||||
| { | ||||
| 	page_table_check_pte_set(mm, addr, ptep, pte); | ||||
| 	set_pte(ptep, pte); | ||||
| } | ||||
| 
 | ||||
| static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, | ||||
| 			      pmd_t *pmdp, pmd_t pmd) | ||||
| { | ||||
| 	page_table_check_pmd_set(mm, addr, pmdp, pmd); | ||||
| 	set_pmd(pmdp, pmd); | ||||
| } | ||||
| 
 | ||||
| static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, | ||||
| 			      pud_t *pudp, pud_t pud) | ||||
| { | ||||
| 	page_table_check_pud_set(mm, addr, pudp, pud); | ||||
| 	native_set_pud(pudp, pud); | ||||
| } | ||||
| 
 | ||||
|  | @ -1049,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, | |||
| 				       pte_t *ptep) | ||||
| { | ||||
| 	pte_t pte = native_ptep_get_and_clear(ptep); | ||||
| 	page_table_check_pte_clear(mm, addr, pte); | ||||
| 	return pte; | ||||
| } | ||||
| 
 | ||||
|  | @ -1064,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, | |||
| 		 * care about updates and native needs no locking | ||||
| 		 */ | ||||
| 		pte = native_local_ptep_get_and_clear(ptep); | ||||
| 		page_table_check_pte_clear(mm, addr, pte); | ||||
| 	} else { | ||||
| 		pte = ptep_get_and_clear(mm, addr, ptep); | ||||
| 	} | ||||
| 	return pte; | ||||
| } | ||||
| 
 | ||||
| #define __HAVE_ARCH_PTEP_CLEAR | ||||
| static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, | ||||
| 			      pte_t *ptep) | ||||
| { | ||||
| 	if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) | ||||
| 		ptep_get_and_clear(mm, addr, ptep); | ||||
| 	else | ||||
| 		pte_clear(mm, addr, ptep); | ||||
| } | ||||
| 
 | ||||
| #define __HAVE_ARCH_PTEP_SET_WRPROTECT | ||||
| static inline void ptep_set_wrprotect(struct mm_struct *mm, | ||||
| 				      unsigned long addr, pte_t *ptep) | ||||
|  | @ -1110,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd) | |||
| static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, | ||||
| 				       pmd_t *pmdp) | ||||
| { | ||||
| 	return native_pmdp_get_and_clear(pmdp); | ||||
| 	pmd_t pmd = native_pmdp_get_and_clear(pmdp); | ||||
| 
 | ||||
| 	page_table_check_pmd_clear(mm, addr, pmd); | ||||
| 
 | ||||
| 	return pmd; | ||||
| } | ||||
| 
 | ||||
| #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR | ||||
| static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, | ||||
| 					unsigned long addr, pud_t *pudp) | ||||
| { | ||||
| 	return native_pudp_get_and_clear(pudp); | ||||
| 	pud_t pud = native_pudp_get_and_clear(pudp); | ||||
| 
 | ||||
| 	page_table_check_pud_clear(mm, addr, pud); | ||||
| 
 | ||||
| 	return pud; | ||||
| } | ||||
| 
 | ||||
| #define __HAVE_ARCH_PMDP_SET_WRPROTECT | ||||
|  | @ -1138,6 +1162,7 @@ static inline int pud_write(pud_t pud) | |||
| static inline pmd_t pmdp_establish(struct vm_area_struct *vma, | ||||
| 		unsigned long address, pmd_t *pmdp, pmd_t pmd) | ||||
| { | ||||
| 	page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); | ||||
| 	if (IS_ENABLED(CONFIG_SMP)) { | ||||
| 		return xchg(pmdp, pmd); | ||||
| 	} else { | ||||
|  |  | |||
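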
|  | @ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void) | |||
| 
 | ||||
| void *module_alloc(unsigned long size) | ||||
| { | ||||
| 	gfp_t gfp_mask = GFP_KERNEL; | ||||
| 	void *p; | ||||
| 
 | ||||
| 	if (PAGE_ALIGN(size) > MODULES_LEN) | ||||
|  | @ -74,10 +75,10 @@ void *module_alloc(unsigned long size) | |||
| 
 | ||||
| 	p = __vmalloc_node_range(size, MODULE_ALIGN, | ||||
| 				    MODULES_VADDR + get_module_load_offset(), | ||||
| 				    MODULES_END, GFP_KERNEL, | ||||
| 				    PAGE_KERNEL, 0, NUMA_NO_NODE, | ||||
| 				    MODULES_END, gfp_mask, | ||||
| 				    PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, | ||||
| 				    __builtin_return_address(0)); | ||||
| 	if (p && (kasan_module_alloc(p, size) < 0)) { | ||||
| 	if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { | ||||
| 		vfree(p); | ||||
| 		return NULL; | ||||
| 	} | ||||
|  |  | |||
|  | @ -1413,8 +1413,7 @@ void do_user_addr_fault(struct pt_regs *regs, | |||
| 	 * and if there is a fatal signal pending there is no guarantee | ||||
| 	 * that we made any progress. Handle this case first. | ||||
| 	 */ | ||||
| 	if (unlikely((fault & VM_FAULT_RETRY) && | ||||
| 		     (flags & FAULT_FLAG_ALLOW_RETRY))) { | ||||
| 	if (unlikely(fault & VM_FAULT_RETRY)) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 		goto retry; | ||||
| 	} | ||||
|  |  | |||
|  | @ -420,3 +420,4 @@ | |||
| # 447 reserved for memfd_secret | ||||
| 448	common	process_mrelease		sys_process_mrelease | ||||
| 449	common  futex_waitv                     sys_futex_waitv | ||||
| 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node | ||||
|  |  | |||
|  | @ -127,7 +127,7 @@ void do_page_fault(struct pt_regs *regs) | |||
| 			goto do_sigbus; | ||||
| 		BUG(); | ||||
| 	} | ||||
| 	if (flags & FAULT_FLAG_ALLOW_RETRY) { | ||||
| 
 | ||||
| 	if (fault & VM_FAULT_RETRY) { | ||||
| 		flags |= FAULT_FLAG_TRIED; | ||||
| 
 | ||||
|  | @ -138,7 +138,6 @@ void do_page_fault(struct pt_regs *regs) | |||
| 
 | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	} | ||||
| 
 | ||||
| 	mmap_read_unlock(mm); | ||||
| 	return; | ||||
|  |  | |||
|  | @ -1903,14 +1903,7 @@ static struct attribute *zram_disk_attrs[] = { | |||
| 	NULL, | ||||
| }; | ||||
| 
 | ||||
| static const struct attribute_group zram_disk_attr_group = { | ||||
| 	.attrs = zram_disk_attrs, | ||||
| }; | ||||
| 
 | ||||
| static const struct attribute_group *zram_disk_attr_groups[] = { | ||||
| 	&zram_disk_attr_group, | ||||
| 	NULL, | ||||
| }; | ||||
| ATTRIBUTE_GROUPS(zram_disk); | ||||
| 
 | ||||
| /*
 | ||||
|  * Allocate and initialize new zram device. the function returns | ||||
|  | @ -1983,7 +1976,7 @@ static int zram_add(void) | |||
| 		blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); | ||||
| 
 | ||||
| 	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); | ||||
| 	ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); | ||||
| 	ret = device_add_disk(NULL, zram->disk, zram_disk_groups); | ||||
| 	if (ret) | ||||
| 		goto out_cleanup_disk; | ||||
| 
 | ||||
|  |  | |||
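
The zram hunk above (and the ocfs2 masklog/filecheck hunks later in this diff) swap an open-coded attribute_group plus pointer array for the ATTRIBUTE_GROUPS() helper. Roughly, ATTRIBUTE_GROUPS(zram_disk) expands to the same two objects that were deleted, so device_add_disk() still receives a NULL-terminated array; the exact definition lives in <linux/sysfs.h>:

/* Approximate expansion of ATTRIBUTE_GROUPS(zram_disk) */
static const struct attribute_group zram_disk_group = {
	.attrs = zram_disk_attrs,
};

static const struct attribute_group *zram_disk_groups[] = {
	&zram_disk_group,
	NULL,
};
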
|  | @ -127,11 +127,35 @@ ATTRIBUTE_GROUPS(dax_drv); | |||
| 
 | ||||
| static int dax_bus_match(struct device *dev, struct device_driver *drv); | ||||
| 
 | ||||
| /*
 | ||||
|  * Static dax regions are regions created by an external subsystem such as | ||||
|  * nvdimm, where a single range is assigned. Their boundaries are defined by the | ||||
|  * external subsystem and are usually limited to one physical memory range. For example, | ||||
|  * for PMEM it is usually defined by NVDIMM Namespace boundaries (i.e. a | ||||
|  * single contiguous range) | ||||
|  * | ||||
|  * On dynamic dax regions, the assigned region can be partitioned by dax core | ||||
|  * into multiple subdivisions. A subdivision is represented into one | ||||
|  * /dev/daxN.M device composed by one or more potentially discontiguous ranges. | ||||
|  * | ||||
|  * When allocating a dax region, drivers must set whether it's static | ||||
|  * (IORESOURCE_DAX_STATIC).  On static dax devices, the @pgmap is pre-assigned | ||||
|  * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax | ||||
|  * devices it is NULL but afterwards allocated by dax core on device ->probe(). | ||||
|  * Care is needed to make sure that dynamic dax devices are torn down with a | ||||
|  * cleared @pgmap field (see kill_dev_dax()). | ||||
|  */ | ||||
| static bool is_static(struct dax_region *dax_region) | ||||
| { | ||||
| 	return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; | ||||
| } | ||||
| 
 | ||||
| bool static_dev_dax(struct dev_dax *dev_dax) | ||||
| { | ||||
| 	return is_static(dev_dax->region); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(static_dev_dax); | ||||
| 
 | ||||
| static u64 dev_dax_size(struct dev_dax *dev_dax) | ||||
| { | ||||
| 	u64 size = 0; | ||||
|  | @ -361,6 +385,14 @@ void kill_dev_dax(struct dev_dax *dev_dax) | |||
| 
 | ||||
| 	kill_dax(dax_dev); | ||||
| 	unmap_mapping_range(inode->i_mapping, 0, 0, 1); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Dynamic dax regions have the pgmap allocated via devm_kzalloc() | ||||
| 	 * and thus freed by devm. Clear the pgmap to not have stale pgmap | ||||
| 	 * ranges on probe() from previous reconfigurations of region devices. | ||||
| 	 */ | ||||
| 	if (!static_dev_dax(dev_dax)) | ||||
| 		dev_dax->pgmap = NULL; | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(kill_dev_dax); | ||||
| 
 | ||||
|  |  | |||
|  | @ -39,6 +39,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, | |||
| 	__dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) | ||||
| void dax_driver_unregister(struct dax_device_driver *dax_drv); | ||||
| void kill_dev_dax(struct dev_dax *dev_dax); | ||||
| bool static_dev_dax(struct dev_dax *dev_dax); | ||||
| 
 | ||||
| /*
 | ||||
|  * While run_dax() is potentially a generic operation that could be | ||||
|  |  | |||
|  | @ -73,11 +73,39 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, | |||
| 	return -1; | ||||
| } | ||||
| 
 | ||||
| static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, | ||||
| 			      unsigned long fault_size) | ||||
| { | ||||
| 	unsigned long i, nr_pages = fault_size / PAGE_SIZE; | ||||
| 	struct file *filp = vmf->vma->vm_file; | ||||
| 	struct dev_dax *dev_dax = filp->private_data; | ||||
| 	pgoff_t pgoff; | ||||
| 
 | ||||
| 	/* mapping is only set on the head */ | ||||
| 	if (dev_dax->pgmap->vmemmap_shift) | ||||
| 		nr_pages = 1; | ||||
| 
 | ||||
| 	pgoff = linear_page_index(vmf->vma, | ||||
| 			ALIGN(vmf->address, fault_size)); | ||||
| 
 | ||||
| 	for (i = 0; i < nr_pages; i++) { | ||||
| 		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); | ||||
| 
 | ||||
| 		page = compound_head(page); | ||||
| 		if (page->mapping) | ||||
| 			continue; | ||||
| 
 | ||||
| 		page->mapping = filp->f_mapping; | ||||
| 		page->index = pgoff + i; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, | ||||
| 				struct vm_fault *vmf, pfn_t *pfn) | ||||
| 				struct vm_fault *vmf) | ||||
| { | ||||
| 	struct device *dev = &dev_dax->dev; | ||||
| 	phys_addr_t phys; | ||||
| 	pfn_t pfn; | ||||
| 	unsigned int fault_size = PAGE_SIZE; | ||||
| 
 | ||||
| 	if (check_vma(dev_dax, vmf->vma, __func__)) | ||||
|  | @ -98,18 +126,21 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, | |||
| 		return VM_FAULT_SIGBUS; | ||||
| 	} | ||||
| 
 | ||||
| 	*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); | ||||
| 	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); | ||||
| 
 | ||||
| 	return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); | ||||
| 	dax_set_mapping(vmf, pfn, fault_size); | ||||
| 
 | ||||
| 	return vmf_insert_mixed(vmf->vma, vmf->address, pfn); | ||||
| } | ||||
| 
 | ||||
| static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, | ||||
| 				struct vm_fault *vmf, pfn_t *pfn) | ||||
| 				struct vm_fault *vmf) | ||||
| { | ||||
| 	unsigned long pmd_addr = vmf->address & PMD_MASK; | ||||
| 	struct device *dev = &dev_dax->dev; | ||||
| 	phys_addr_t phys; | ||||
| 	pgoff_t pgoff; | ||||
| 	pfn_t pfn; | ||||
| 	unsigned int fault_size = PMD_SIZE; | ||||
| 
 | ||||
| 	if (check_vma(dev_dax, vmf->vma, __func__)) | ||||
|  | @ -138,19 +169,22 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, | |||
| 		return VM_FAULT_SIGBUS; | ||||
| 	} | ||||
| 
 | ||||
| 	*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); | ||||
| 	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); | ||||
| 
 | ||||
| 	return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); | ||||
| 	dax_set_mapping(vmf, pfn, fault_size); | ||||
| 
 | ||||
| 	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD | ||||
| static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, | ||||
| 				struct vm_fault *vmf, pfn_t *pfn) | ||||
| 				struct vm_fault *vmf) | ||||
| { | ||||
| 	unsigned long pud_addr = vmf->address & PUD_MASK; | ||||
| 	struct device *dev = &dev_dax->dev; | ||||
| 	phys_addr_t phys; | ||||
| 	pgoff_t pgoff; | ||||
| 	pfn_t pfn; | ||||
| 	unsigned int fault_size = PUD_SIZE; | ||||
| 
 | ||||
| 
 | ||||
|  | @ -180,13 +214,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, | |||
| 		return VM_FAULT_SIGBUS; | ||||
| 	} | ||||
| 
 | ||||
| 	*pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); | ||||
| 	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); | ||||
| 
 | ||||
| 	return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); | ||||
| 	dax_set_mapping(vmf, pfn, fault_size); | ||||
| 
 | ||||
| 	return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); | ||||
| } | ||||
| #else | ||||
| static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, | ||||
| 				struct vm_fault *vmf, pfn_t *pfn) | ||||
| 				struct vm_fault *vmf) | ||||
| { | ||||
| 	return VM_FAULT_FALLBACK; | ||||
| } | ||||
|  | @ -196,10 +232,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, | |||
| 		enum page_entry_size pe_size) | ||||
| { | ||||
| 	struct file *filp = vmf->vma->vm_file; | ||||
| 	unsigned long fault_size; | ||||
| 	vm_fault_t rc = VM_FAULT_SIGBUS; | ||||
| 	int id; | ||||
| 	pfn_t pfn; | ||||
| 	struct dev_dax *dev_dax = filp->private_data; | ||||
| 
 | ||||
| 	dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, | ||||
|  | @ -209,43 +243,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, | |||
| 	id = dax_read_lock(); | ||||
| 	switch (pe_size) { | ||||
| 	case PE_SIZE_PTE: | ||||
| 		fault_size = PAGE_SIZE; | ||||
| 		rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); | ||||
| 		rc = __dev_dax_pte_fault(dev_dax, vmf); | ||||
| 		break; | ||||
| 	case PE_SIZE_PMD: | ||||
| 		fault_size = PMD_SIZE; | ||||
| 		rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); | ||||
| 		rc = __dev_dax_pmd_fault(dev_dax, vmf); | ||||
| 		break; | ||||
| 	case PE_SIZE_PUD: | ||||
| 		fault_size = PUD_SIZE; | ||||
| 		rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); | ||||
| 		rc = __dev_dax_pud_fault(dev_dax, vmf); | ||||
| 		break; | ||||
| 	default: | ||||
| 		rc = VM_FAULT_SIGBUS; | ||||
| 	} | ||||
| 
 | ||||
| 	if (rc == VM_FAULT_NOPAGE) { | ||||
| 		unsigned long i; | ||||
| 		pgoff_t pgoff; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * In the device-dax case the only possibility for a | ||||
| 		 * VM_FAULT_NOPAGE result is when device-dax capacity is | ||||
| 		 * mapped. No need to consider the zero page, or racing | ||||
| 		 * conflicting mappings. | ||||
| 		 */ | ||||
| 		pgoff = linear_page_index(vmf->vma, vmf->address | ||||
| 				& ~(fault_size - 1)); | ||||
| 		for (i = 0; i < fault_size / PAGE_SIZE; i++) { | ||||
| 			struct page *page; | ||||
| 
 | ||||
| 			page = pfn_to_page(pfn_t_to_pfn(pfn) + i); | ||||
| 			if (page->mapping) | ||||
| 				continue; | ||||
| 			page->mapping = filp->f_mapping; | ||||
| 			page->index = pgoff + i; | ||||
| 		} | ||||
| 	} | ||||
| 	dax_read_unlock(id); | ||||
| 
 | ||||
| 	return rc; | ||||
|  | @ -398,17 +407,34 @@ int dev_dax_probe(struct dev_dax *dev_dax) | |||
| 	void *addr; | ||||
| 	int rc, i; | ||||
| 
 | ||||
| 	pgmap = dev_dax->pgmap; | ||||
| 	if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, | ||||
| 			"static pgmap / multi-range device conflict\n")) | ||||
| 	if (static_dev_dax(dev_dax))  { | ||||
| 		if (dev_dax->nr_range > 1) { | ||||
| 			dev_warn(dev, | ||||
| 				"static pgmap / multi-range device conflict\n"); | ||||
| 			return -EINVAL; | ||||
| 		} | ||||
| 
 | ||||
| 	if (!pgmap) { | ||||
| 		pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) | ||||
| 				* (dev_dax->nr_range - 1), GFP_KERNEL); | ||||
| 		pgmap = dev_dax->pgmap; | ||||
| 	} else { | ||||
| 		if (dev_dax->pgmap) { | ||||
| 			dev_warn(dev, | ||||
| 				 "dynamic-dax with pre-populated page map\n"); | ||||
| 			return -EINVAL; | ||||
| 		} | ||||
| 
 | ||||
| 		pgmap = devm_kzalloc(dev, | ||||
|                        struct_size(pgmap, ranges, dev_dax->nr_range - 1), | ||||
|                        GFP_KERNEL); | ||||
| 		if (!pgmap) | ||||
| 			return -ENOMEM; | ||||
| 
 | ||||
| 		pgmap->nr_range = dev_dax->nr_range; | ||||
| 		dev_dax->pgmap = pgmap; | ||||
| 
 | ||||
| 		for (i = 0; i < dev_dax->nr_range; i++) { | ||||
| 			struct range *range = &dev_dax->ranges[i].range; | ||||
| 			pgmap->ranges[i] = *range; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < dev_dax->nr_range; i++) { | ||||
|  | @ -420,12 +446,12 @@ int dev_dax_probe(struct dev_dax *dev_dax) | |||
| 					i, range->start, range->end); | ||||
| 			return -EBUSY; | ||||
| 		} | ||||
| 		/* don't update the range for static pgmap */ | ||||
| 		if (!dev_dax->pgmap) | ||||
| 			pgmap->ranges[i] = *range; | ||||
| 	} | ||||
| 
 | ||||
| 	pgmap->type = MEMORY_DEVICE_GENERIC; | ||||
| 	if (dev_dax->align > PAGE_SIZE) | ||||
| 		pgmap->vmemmap_shift = | ||||
| 			order_base_2(dev_dax->align >> PAGE_SHIFT); | ||||
| 	addr = devm_memremap_pages(dev, pgmap); | ||||
| 	if (IS_ERR(addr)) | ||||
| 		return PTR_ERR(addr); | ||||
|  |  | |||
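
dev_dax_probe() above also switches the pgmap allocation to struct_size(), the overflow-checked way to size a structure that ends in a flexible array. A standalone sketch of the idiom; the type and helper below are illustrative, not taken from the dax code:

#include <linux/overflow.h>
#include <linux/range.h>
#include <linux/slab.h>

struct range_table {
	unsigned int nr;
	struct range ranges[];			/* flexible array member */
};

static struct range_table *range_table_alloc(unsigned int nr)
{
	/* sizeof(*t) + nr * sizeof(t->ranges[0]), with overflow checking;
	 * struct_size() only inspects the type, so using 't' here is fine */
	struct range_table *t = kzalloc(struct_size(t, ranges, nr), GFP_KERNEL);

	if (t)
		t->nr = nr;
	return t;
}
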
|  | @ -98,15 +98,14 @@ static int siw_create_tx_threads(void) | |||
| 			continue; | ||||
| 
 | ||||
| 		siw_tx_thread[cpu] = | ||||
| 			kthread_create(siw_run_sq, (unsigned long *)(long)cpu, | ||||
| 				       "siw_tx/%d", cpu); | ||||
| 			kthread_run_on_cpu(siw_run_sq, | ||||
| 					   (unsigned long *)(long)cpu, | ||||
| 					   cpu, "siw_tx/%u"); | ||||
| 		if (IS_ERR(siw_tx_thread[cpu])) { | ||||
| 			siw_tx_thread[cpu] = NULL; | ||||
| 			continue; | ||||
| 		} | ||||
| 		kthread_bind(siw_tx_thread[cpu], cpu); | ||||
| 
 | ||||
| 		wake_up_process(siw_tx_thread[cpu]); | ||||
| 		assigned++; | ||||
| 	} | ||||
| 	return assigned; | ||||
|  |  | |||
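
The siw change uses kthread_run_on_cpu(), added by the kthread patches in this same series, which collapses the old create/bind/wake sequence into one call and formats the CPU number into the thread name. As a sketch, the helper behaves roughly like this open-coded equivalent:

/* Rough equivalent of kthread_run_on_cpu(fn, data, cpu, "siw_tx/%u") */
struct task_struct *task;

task = kthread_create_on_cpu(fn, data, cpu, "siw_tx/%u");	/* binds to 'cpu' */
if (!IS_ERR(task))
	wake_up_process(task);
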
|  | @ -26,6 +26,7 @@ | |||
| #include <linux/serial_core.h> | ||||
| #include <linux/sysfs.h> | ||||
| #include <linux/random.h> | ||||
| #include <linux/kmemleak.h> | ||||
| 
 | ||||
| #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */ | ||||
| #include <asm/page.h> | ||||
|  | @ -524,9 +525,12 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, | |||
| 		size = dt_mem_next_cell(dt_root_size_cells, &prop); | ||||
| 
 | ||||
| 		if (size && | ||||
| 		    early_init_dt_reserve_memory_arch(base, size, nomap) == 0) | ||||
| 		    early_init_dt_reserve_memory_arch(base, size, nomap) == 0) { | ||||
| 			pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", | ||||
| 				uname, &base, (unsigned long)(size / SZ_1M)); | ||||
| 			if (!nomap) | ||||
| 				kmemleak_alloc_phys(base, size, 0, 0); | ||||
| 		} | ||||
| 		else | ||||
| 			pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", | ||||
| 				uname, &base, (unsigned long)(size / SZ_1M)); | ||||
|  |  | |||
|  | @ -27,8 +27,8 @@ | |||
| #include <linux/slab.h> | ||||
| #include <linux/uaccess.h> | ||||
| #include <linux/fiemap.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/iomap.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include "ext4_jbd2.h" | ||||
| #include "ext4_extents.h" | ||||
| #include "xattr.h" | ||||
|  | @ -4404,8 +4404,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode) | |||
| 	err = ext4_es_remove_extent(inode, last_block, | ||||
| 				    EXT_MAX_BLOCKS - last_block); | ||||
| 	if (err == -ENOMEM) { | ||||
| 		cond_resched(); | ||||
| 		congestion_wait(BLK_RW_ASYNC, HZ/50); | ||||
| 		memalloc_retry_wait(GFP_ATOMIC); | ||||
| 		goto retry; | ||||
| 	} | ||||
| 	if (err) | ||||
|  | @ -4413,8 +4412,7 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode) | |||
| retry_remove_space: | ||||
| 	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); | ||||
| 	if (err == -ENOMEM) { | ||||
| 		cond_resched(); | ||||
| 		congestion_wait(BLK_RW_ASYNC, HZ/50); | ||||
| 		memalloc_retry_wait(GFP_ATOMIC); | ||||
| 		goto retry_remove_space; | ||||
| 	} | ||||
| 	return err; | ||||
|  |  | |||
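
From here on, the filesystem hunks replace congestion_wait(BLK_RW_ASYNC, ...) back-offs in -ENOMEM retry loops with memalloc_retry_wait(), introduced earlier in this series, which sleeps for a short interval appropriate to the passed gfp mask. The resulting retry shape, sketched with an illustrative callee:

#include <linux/sched/mm.h>

static int retry_with_backoff(void)
{
	int err;

retry:
	err = do_allocation_heavy_work();	/* illustrative, not a real API */
	if (err == -ENOMEM) {
		memalloc_retry_wait(GFP_NOFS);	/* brief, gfp-aware back-off */
		goto retry;
	}
	return err;
}
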
|  | @ -7,7 +7,7 @@ | |||
| #include <linux/iomap.h> | ||||
| #include <linux/fiemap.h> | ||||
| #include <linux/iversion.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/sched/mm.h> | ||||
| 
 | ||||
| #include "ext4_jbd2.h" | ||||
| #include "ext4.h" | ||||
|  | @ -1929,8 +1929,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) | |||
| retry: | ||||
| 			err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); | ||||
| 			if (err == -ENOMEM) { | ||||
| 				cond_resched(); | ||||
| 				congestion_wait(BLK_RW_ASYNC, HZ/50); | ||||
| 				memalloc_retry_wait(GFP_ATOMIC); | ||||
| 				goto retry; | ||||
| 			} | ||||
| 			if (err) | ||||
|  |  | |||
|  | @ -24,7 +24,7 @@ | |||
| #include <linux/kernel.h> | ||||
| #include <linux/slab.h> | ||||
| #include <linux/mm.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/sched/mm.h> | ||||
| 
 | ||||
| #include "ext4_jbd2.h" | ||||
| #include "xattr.h" | ||||
|  | @ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
| 			ret = PTR_ERR(bounce_page); | ||||
| 			if (ret == -ENOMEM && | ||||
| 			    (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { | ||||
| 				gfp_flags = GFP_NOFS; | ||||
| 				gfp_t new_gfp_flags = GFP_NOFS; | ||||
| 				if (io->io_bio) | ||||
| 					ext4_io_submit(io); | ||||
| 				else | ||||
| 					gfp_flags |= __GFP_NOFAIL; | ||||
| 				congestion_wait(BLK_RW_ASYNC, HZ/50); | ||||
| 					new_gfp_flags |= __GFP_NOFAIL; | ||||
| 				memalloc_retry_wait(gfp_flags); | ||||
| 				gfp_flags = new_gfp_flags; | ||||
| 				goto retry_encrypt; | ||||
| 			} | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,9 +8,9 @@ | |||
| #include <linux/fs.h> | ||||
| #include <linux/f2fs_fs.h> | ||||
| #include <linux/buffer_head.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include <linux/mpage.h> | ||||
| #include <linux/writeback.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/pagevec.h> | ||||
| #include <linux/blkdev.h> | ||||
| #include <linux/bio.h> | ||||
|  | @ -2542,7 +2542,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) | |||
| 		/* flush pending IOs and wait for a while in the ENOMEM case */ | ||||
| 		if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { | ||||
| 			f2fs_flush_merged_writes(fio->sbi); | ||||
| 			congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); | ||||
| 			memalloc_retry_wait(GFP_NOFS); | ||||
| 			gfp_flags |= __GFP_NOFAIL; | ||||
| 			goto retry_encrypt; | ||||
| 		} | ||||
|  |  | |||
|  | @ -7,7 +7,6 @@ | |||
|  */ | ||||
| #include <linux/fs.h> | ||||
| #include <linux/module.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/init.h> | ||||
| #include <linux/f2fs_fs.h> | ||||
| #include <linux/kthread.h> | ||||
|  | @ -15,6 +14,7 @@ | |||
| #include <linux/freezer.h> | ||||
| #include <linux/sched/signal.h> | ||||
| #include <linux/random.h> | ||||
| #include <linux/sched/mm.h> | ||||
| 
 | ||||
| #include "f2fs.h" | ||||
| #include "node.h" | ||||
|  | @ -1375,8 +1375,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, | |||
| 		if (err) { | ||||
| 			clear_page_private_gcing(page); | ||||
| 			if (err == -ENOMEM) { | ||||
| 				congestion_wait(BLK_RW_ASYNC, | ||||
| 						DEFAULT_IO_TIMEOUT); | ||||
| 				memalloc_retry_wait(GFP_NOFS); | ||||
| 				goto retry; | ||||
| 			} | ||||
| 			if (is_dirty) | ||||
|  |  | |||
|  | @ -8,8 +8,8 @@ | |||
| #include <linux/fs.h> | ||||
| #include <linux/f2fs_fs.h> | ||||
| #include <linux/buffer_head.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/writeback.h> | ||||
| #include <linux/sched/mm.h> | ||||
| 
 | ||||
| #include "f2fs.h" | ||||
| #include "node.h" | ||||
|  | @ -562,7 +562,7 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) | |||
| 	inode = f2fs_iget(sb, ino); | ||||
| 	if (IS_ERR(inode)) { | ||||
| 		if (PTR_ERR(inode) == -ENOMEM) { | ||||
| 			congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); | ||||
| 			memalloc_retry_wait(GFP_NOFS); | ||||
| 			goto retry; | ||||
| 		} | ||||
| 	} | ||||
|  |  | |||
|  | @ -8,7 +8,7 @@ | |||
| #include <linux/fs.h> | ||||
| #include <linux/f2fs_fs.h> | ||||
| #include <linux/mpage.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include <linux/blkdev.h> | ||||
| #include <linux/pagevec.h> | ||||
| #include <linux/swap.h> | ||||
|  | @ -2750,7 +2750,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) | |||
| retry: | ||||
| 	ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); | ||||
| 	if (!ipage) { | ||||
| 		congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); | ||||
| 		memalloc_retry_wait(GFP_NOFS); | ||||
| 		goto retry; | ||||
| 	} | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,6 +8,7 @@ | |||
| #include <asm/unaligned.h> | ||||
| #include <linux/fs.h> | ||||
| #include <linux/f2fs_fs.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include "f2fs.h" | ||||
| #include "node.h" | ||||
| #include "segment.h" | ||||
|  | @ -587,7 +588,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 	err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); | ||||
| 	if (err) { | ||||
| 		if (err == -ENOMEM) { | ||||
| 			congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); | ||||
| 			memalloc_retry_wait(GFP_NOFS); | ||||
| 			goto retry_dn; | ||||
| 		} | ||||
| 		goto out; | ||||
|  | @ -670,8 +671,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, | |||
| 			err = check_index_in_prev_nodes(sbi, dest, &dn); | ||||
| 			if (err) { | ||||
| 				if (err == -ENOMEM) { | ||||
| 					congestion_wait(BLK_RW_ASYNC, | ||||
| 							DEFAULT_IO_TIMEOUT); | ||||
| 					memalloc_retry_wait(GFP_NOFS); | ||||
| 					goto retry_prev; | ||||
| 				} | ||||
| 				goto err; | ||||
|  |  | |||
|  | @ -9,6 +9,7 @@ | |||
| #include <linux/f2fs_fs.h> | ||||
| #include <linux/bio.h> | ||||
| #include <linux/blkdev.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include <linux/prefetch.h> | ||||
| #include <linux/kthread.h> | ||||
| #include <linux/swap.h> | ||||
|  | @ -245,9 +246,7 @@ static int __revoke_inmem_pages(struct inode *inode, | |||
| 								LOOKUP_NODE); | ||||
| 			if (err) { | ||||
| 				if (err == -ENOMEM) { | ||||
| 					congestion_wait(BLK_RW_ASYNC, | ||||
| 							DEFAULT_IO_TIMEOUT); | ||||
| 					cond_resched(); | ||||
| 					memalloc_retry_wait(GFP_NOFS); | ||||
| 					goto retry; | ||||
| 				} | ||||
| 				err = -EAGAIN; | ||||
|  | @ -424,9 +423,7 @@ static int __f2fs_commit_inmem_pages(struct inode *inode) | |||
| 			err = f2fs_do_write_data_page(&fio); | ||||
| 			if (err) { | ||||
| 				if (err == -ENOMEM) { | ||||
| 					congestion_wait(BLK_RW_ASYNC, | ||||
| 							DEFAULT_IO_TIMEOUT); | ||||
| 					cond_resched(); | ||||
| 					memalloc_retry_wait(GFP_NOFS); | ||||
| 					goto retry; | ||||
| 				} | ||||
| 				unlock_page(page); | ||||
|  |  | |||
|  | @ -8,9 +8,9 @@ | |||
| #include <linux/module.h> | ||||
| #include <linux/init.h> | ||||
| #include <linux/fs.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include <linux/statfs.h> | ||||
| #include <linux/buffer_head.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include <linux/kthread.h> | ||||
| #include <linux/parser.h> | ||||
| #include <linux/mount.h> | ||||
|  | @ -2415,8 +2415,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, | |||
| 		page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); | ||||
| 		if (IS_ERR(page)) { | ||||
| 			if (PTR_ERR(page) == -ENOMEM) { | ||||
| 				congestion_wait(BLK_RW_ASYNC, | ||||
| 						DEFAULT_IO_TIMEOUT); | ||||
| 				memalloc_retry_wait(GFP_NOFS); | ||||
| 				goto repeat; | ||||
| 			} | ||||
| 			set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); | ||||
|  |  | |||
|  | @ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) | |||
| 	struct vm_area_struct *vma; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * end == 0 indicates that the entire range after | ||||
| 	 * start should be unmapped. | ||||
| 	 * end == 0 indicates that the entire range after start should be | ||||
| 	 * unmapped.  Note, end is exclusive, whereas the interval tree takes | ||||
| 	 * an inclusive "last". | ||||
| 	 */ | ||||
| 	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { | ||||
| 	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { | ||||
| 		unsigned long v_offset; | ||||
| 		unsigned long v_end; | ||||
| 
 | ||||
|  |  | |||
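
The hugetlbfs change above is an off-by-one fix: hugetlb_vmdelete_list() takes an exclusive 'end' page offset, while vma_interval_tree_foreach() expects an inclusive 'last' index, so passing 'end' directly matched one page too many. The conversion, in isolation:

/* end == 0 means "to the end of the file"; otherwise turn the exclusive
 * end offset into the inclusive last index the interval tree expects. */
pgoff_t last = end ? end - 1 : ULONG_MAX;

vma_interval_tree_foreach(vma, root, start, last) {
	/* ... unmap the overlapping part of this vma ... */
}
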
fs/inode.c (49 lines changed)
							|  | @ -526,6 +526,55 @@ void __remove_inode_hash(struct inode *inode) | |||
| } | ||||
| EXPORT_SYMBOL(__remove_inode_hash); | ||||
| 
 | ||||
| void dump_mapping(const struct address_space *mapping) | ||||
| { | ||||
| 	struct inode *host; | ||||
| 	const struct address_space_operations *a_ops; | ||||
| 	struct hlist_node *dentry_first; | ||||
| 	struct dentry *dentry_ptr; | ||||
| 	struct dentry dentry; | ||||
| 	unsigned long ino; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If mapping is an invalid pointer, we don't want to crash | ||||
| 	 * accessing it, so probe everything depending on it carefully. | ||||
| 	 */ | ||||
| 	if (get_kernel_nofault(host, &mapping->host) || | ||||
| 	    get_kernel_nofault(a_ops, &mapping->a_ops)) { | ||||
| 		pr_warn("invalid mapping:%px\n", mapping); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!host) { | ||||
| 		pr_warn("aops:%ps\n", a_ops); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || | ||||
| 	    get_kernel_nofault(ino, &host->i_ino)) { | ||||
| 		pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!dentry_first) { | ||||
| 		pr_warn("aops:%ps ino:%lx\n", a_ops, ino); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); | ||||
| 	if (get_kernel_nofault(dentry, dentry_ptr)) { | ||||
| 		pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", | ||||
| 				a_ops, ino, dentry_ptr); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * if dentry is corrupted, the %pd handler may still crash, | ||||
| 	 * but it's unlikely that we reach here with a corrupt mapping | ||||
| 	 */ | ||||
| 	pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); | ||||
| } | ||||
| 
 | ||||
| void clear_inode(struct inode *inode) | ||||
| { | ||||
| 	/*
 | ||||
|  |  | |||
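
dump_mapping() is a new debugging helper: it prints whatever it can recover from a possibly-corrupt address_space, probing every field with get_kernel_nofault() instead of dereferencing blindly. A hypothetical call site, for instance from a bad-page report:

/* Hypothetical diagnostic; dump_mapping() tolerates a garbage pointer,
 * so it can be called even when page->mapping is suspect. */
static void report_bad_page(struct page *page)
{
	pr_warn("bad page state, page %px\n", page);
	dump_mapping(page->mapping);
}
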
|  | @ -430,7 +430,7 @@ static int ioctl_file_dedupe_range(struct file *file, | |||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	size = offsetof(struct file_dedupe_range __user, info[count]); | ||||
| 	size = offsetof(struct file_dedupe_range, info[count]); | ||||
| 	if (size > PAGE_SIZE) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| // SPDX-License-Identifier: GPL-2.0-or-later
 | ||||
| /**
 | ||||
| /*
 | ||||
|  * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project. | ||||
|  * | ||||
|  * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. | ||||
|  |  | |||
|  | @ -2040,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, | |||
| 	int i, idx; | ||||
| 	struct ocfs2_extent_list *el, *left_el, *right_el; | ||||
| 	struct ocfs2_extent_rec *left_rec, *right_rec; | ||||
| 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; | ||||
| 	struct buffer_head *root_bh; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Update the counts and position values within all the | ||||
|  |  | |||
|  | @ -1799,23 +1799,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
| 	 */ | ||||
| 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, | ||||
| 					 cluster_of_pages, mmap_page); | ||||
| 	if (ret && ret != -EAGAIN) { | ||||
| 		mlog_errno(ret); | ||||
| 		goto out_quota; | ||||
| 	} | ||||
| 
 | ||||
| 	if (ret) { | ||||
| 		/*
 | ||||
| 		 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock | ||||
| 		 * the target page. In this case, we exit with no error and no target | ||||
| 		 * page. This will trigger the caller, page_mkwrite(), to re-try | ||||
| 		 * the operation. | ||||
| 		 */ | ||||
| 	if (ret == -EAGAIN) { | ||||
| 		if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { | ||||
| 			BUG_ON(wc->w_target_page); | ||||
| 			ret = 0; | ||||
| 			goto out_quota; | ||||
| 		} | ||||
| 
 | ||||
| 		mlog_errno(ret); | ||||
| 		goto out_quota; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, | ||||
| 					  len); | ||||
| 	if (ret) { | ||||
|  |  | |||
|  | @ -120,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { | |||
| 	define_mask(KTHREAD), | ||||
| }; | ||||
| 
 | ||||
| static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; | ||||
| static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, }; | ||||
| ATTRIBUTE_GROUPS(mlog_default); | ||||
| 
 | ||||
| static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, | ||||
| 			 char *buf) | ||||
|  | @ -144,7 +145,7 @@ static const struct sysfs_ops mlog_attr_ops = { | |||
| }; | ||||
| 
 | ||||
| static struct kobj_type mlog_ktype = { | ||||
| 	.default_attrs = mlog_attr_ptrs, | ||||
| 	.default_groups = mlog_default_groups, | ||||
| 	.sysfs_ops      = &mlog_attr_ops, | ||||
| }; | ||||
| 
 | ||||
|  | @ -157,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset) | |||
| 	int i = 0; | ||||
| 
 | ||||
| 	while (mlog_attrs[i].attr.mode) { | ||||
| 		mlog_attr_ptrs[i] = &mlog_attrs[i].attr; | ||||
| 		mlog_default_attrs[i] = &mlog_attrs[i].attr; | ||||
| 		i++; | ||||
| 	} | ||||
| 	mlog_attr_ptrs[i] = NULL; | ||||
| 	mlog_default_attrs[i] = NULL; | ||||
| 
 | ||||
| 	kobject_set_name(&mlog_kset.kobj, "logmask"); | ||||
| 	mlog_kset.kobj.kset = o2cb_kset; | ||||
|  |  | |||
|  | @ -3343,7 +3343,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, | |||
| 	struct ocfs2_dir_entry *de, *last_de = NULL; | ||||
| 	char *de_buf, *limit; | ||||
| 	unsigned long offset = 0; | ||||
| 	unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; | ||||
| 	unsigned int rec_len, new_rec_len, free_space; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * This calculates how many free bytes we'd have in block zero, should | ||||
|  |  | |||
|  | @ -94,6 +94,7 @@ static struct attribute *ocfs2_filecheck_attrs[] = { | |||
| 	&ocfs2_filecheck_attr_set.attr, | ||||
| 	NULL | ||||
| }; | ||||
| ATTRIBUTE_GROUPS(ocfs2_filecheck); | ||||
| 
 | ||||
| static void ocfs2_filecheck_release(struct kobject *kobj) | ||||
| { | ||||
|  | @ -138,7 +139,7 @@ static const struct sysfs_ops ocfs2_filecheck_ops = { | |||
| }; | ||||
| 
 | ||||
| static struct kobj_type ocfs2_ktype_filecheck = { | ||||
| 	.default_attrs = ocfs2_filecheck_attrs, | ||||
| 	.default_groups = ocfs2_filecheck_groups, | ||||
| 	.sysfs_ops = &ocfs2_filecheck_ops, | ||||
| 	.release = ocfs2_filecheck_release, | ||||
| }; | ||||
|  |  | |||
|  | @ -1669,8 +1669,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
| 	status = jbd2_journal_load(journal); | ||||
| 	if (status < 0) { | ||||
| 		mlog_errno(status); | ||||
| 		if (!igrab(inode)) | ||||
| 			BUG(); | ||||
| 		BUG_ON(!igrab(inode)); | ||||
| 		jbd2_journal_destroy(journal); | ||||
| 		goto done; | ||||
| 	} | ||||
|  | @ -1699,8 +1698,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
| 	if (status < 0) | ||||
| 		mlog_errno(status); | ||||
| 
 | ||||
| 	if (!igrab(inode)) | ||||
| 		BUG(); | ||||
| 	BUG_ON(!igrab(inode)); | ||||
| 
 | ||||
| 	jbd2_journal_destroy(journal); | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| #include <linux/pagewalk.h> | ||||
| #include <linux/vmacache.h> | ||||
| #include <linux/mm_inline.h> | ||||
| #include <linux/hugetlb.h> | ||||
| #include <linux/huge_mm.h> | ||||
| #include <linux/mount.h> | ||||
|  | @ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) | |||
| 
 | ||||
| 	name = arch_vma_name(vma); | ||||
| 	if (!name) { | ||||
| 		const char *anon_name; | ||||
| 
 | ||||
| 		if (!mm) { | ||||
| 			name = "[vdso]"; | ||||
| 			goto done; | ||||
|  | @ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) | |||
| 			goto done; | ||||
| 		} | ||||
| 
 | ||||
| 		if (is_stack(vma)) | ||||
| 		if (is_stack(vma)) { | ||||
| 			name = "[stack]"; | ||||
| 			goto done; | ||||
| 		} | ||||
| 
 | ||||
| 		anon_name = vma_anon_name(vma); | ||||
| 		if (anon_name) { | ||||
| 			seq_pad(m, ' '); | ||||
| 			seq_printf(m, "[anon:%s]", anon_name); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| done: | ||||
|  |  | |||
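
The show_map_vma() hunk is part of this series' anonymous-VMA naming support: a mapping labelled from user space shows up in /proc/<pid>/maps as "[anon:<name>]". A small user-space sketch, assuming the prctl constants added by this series are available and CONFIG_ANON_VMA_NAME is enabled:

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41	/* values from this series' uapi */
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 1 << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* the region should now appear as "[anon:my-heap]" in /proc/self/maps */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, "my-heap"))
		perror("prctl");

	getchar();	/* pause so the maps file can be inspected */
	return 0;
}
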
|  | @ -29,6 +29,7 @@ | |||
| #include <linux/module.h> | ||||
| #include <linux/magic.h> | ||||
| #include <linux/xattr.h> | ||||
| #include <linux/backing-dev.h> | ||||
| 
 | ||||
| #include "squashfs_fs.h" | ||||
| #include "squashfs_fs_sb.h" | ||||
|  | @ -112,6 +113,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( | |||
| 	return decompressor; | ||||
| } | ||||
| 
 | ||||
| static int squashfs_bdi_init(struct super_block *sb) | ||||
| { | ||||
| 	int err; | ||||
| 	unsigned int major = MAJOR(sb->s_dev); | ||||
| 	unsigned int minor = MINOR(sb->s_dev); | ||||
| 
 | ||||
| 	bdi_put(sb->s_bdi); | ||||
| 	sb->s_bdi = &noop_backing_dev_info; | ||||
| 
 | ||||
| 	err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); | ||||
| 	if (err) | ||||
| 		return err; | ||||
| 
 | ||||
| 	sb->s_bdi->ra_pages = 0; | ||||
| 	sb->s_bdi->io_pages = 0; | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) | ||||
| { | ||||
|  | @ -127,6 +146,20 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) | |||
| 
 | ||||
| 	TRACE("Entered squashfs_fill_superblock\n"); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * squashfs provides its own 'backing_dev_info' in order to disable read-ahead. | ||||
| 	 * For squashfs, I/O is not deferred; it is done immediately in readpage, | ||||
| 	 * which means the user always has to wait for their own I/O anyway, so the | ||||
| 	 * benefit of readahead would be very small. squashfs_bdi_init sets | ||||
| 	 * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 to disable readahead for | ||||
| 	 * squashfs. | ||||
| 	 */ | ||||
| 	err = squashfs_bdi_init(sb); | ||||
| 	if (err) { | ||||
| 		errorf(fc, "squashfs init bdi failed"); | ||||
| 		return err; | ||||
| 	} | ||||
| 
 | ||||
| 	sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); | ||||
| 	if (sb->s_fs_info == NULL) { | ||||
| 		ERROR("Failed to allocate squashfs_sb_info\n"); | ||||
|  |  | |||
|  | @ -15,6 +15,7 @@ | |||
| #include <linux/sched/signal.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include <linux/mm.h> | ||||
| #include <linux/mm_inline.h> | ||||
| #include <linux/mmu_notifier.h> | ||||
| #include <linux/poll.h> | ||||
| #include <linux/slab.h> | ||||
|  | @ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) | |||
| 				 new_flags, vma->anon_vma, | ||||
| 				 vma->vm_file, vma->vm_pgoff, | ||||
| 				 vma_policy(vma), | ||||
| 				 NULL_VM_UFFD_CTX); | ||||
| 				 NULL_VM_UFFD_CTX, vma_anon_name(vma)); | ||||
| 		if (prev) | ||||
| 			vma = prev; | ||||
| 		else | ||||
|  | @ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, | |||
| 		prev = vma_merge(mm, prev, start, vma_end, new_flags, | ||||
| 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff, | ||||
| 				 vma_policy(vma), | ||||
| 				 ((struct vm_userfaultfd_ctx){ ctx })); | ||||
| 				 ((struct vm_userfaultfd_ctx){ ctx }), | ||||
| 				 vma_anon_name(vma)); | ||||
| 		if (prev) { | ||||
| 			vma = prev; | ||||
| 			goto next; | ||||
|  | @ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, | |||
| 		prev = vma_merge(mm, prev, start, vma_end, new_flags, | ||||
| 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff, | ||||
| 				 vma_policy(vma), | ||||
| 				 NULL_VM_UFFD_CTX); | ||||
| 				 NULL_VM_UFFD_CTX, vma_anon_name(vma)); | ||||
| 		if (prev) { | ||||
| 			vma = prev; | ||||
| 			goto next; | ||||
|  |  | |||
|  | @ -4,7 +4,6 @@ | |||
|  * All Rights Reserved. | ||||
|  */ | ||||
| #include "xfs.h" | ||||
| #include <linux/backing-dev.h> | ||||
| #include "xfs_message.h" | ||||
| #include "xfs_trace.h" | ||||
| 
 | ||||
|  | @ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) | |||
| 	"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", | ||||
| 				current->comm, current->pid, | ||||
| 				(unsigned int)size, __func__, lflags); | ||||
| 		congestion_wait(BLK_RW_ASYNC, HZ/50); | ||||
| 		memalloc_retry_wait(lflags); | ||||
| 	} while (1); | ||||
| } | ||||
|  |  | |||
|  | @ -394,7 +394,7 @@ xfs_buf_alloc_pages( | |||
| 		} | ||||
| 
 | ||||
| 		XFS_STATS_INC(bp->b_mount, xb_page_retries); | ||||
| 		congestion_wait(BLK_RW_ASYNC, HZ / 50); | ||||
| 		memalloc_retry_wait(gfp_mask); | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
|  |  | |||
|  | @ -295,7 +295,6 @@ extern bool libceph_compatible(void *data); | |||
| 
 | ||||
| extern const char *ceph_msg_type_name(int type); | ||||
| extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); | ||||
| extern void *ceph_kvmalloc(size_t size, gfp_t flags); | ||||
| 
 | ||||
| struct fs_parameter; | ||||
| struct fc_log; | ||||
|  |  | |||
|  | @ -11,12 +11,19 @@ | |||
| #include <linux/mutex.h> | ||||
| #include <linux/time64.h> | ||||
| #include <linux/types.h> | ||||
| #include <linux/random.h> | ||||
| 
 | ||||
| /* Minimal region size.  Every damon_region is aligned by this. */ | ||||
| #define DAMON_MIN_REGION	PAGE_SIZE | ||||
| /* Max priority score for DAMON-based operation schemes */ | ||||
| #define DAMOS_MAX_SCORE		(99) | ||||
| 
 | ||||
| /* Get a random number in [l, r) */ | ||||
| static inline unsigned long damon_rand(unsigned long l, unsigned long r) | ||||
| { | ||||
| 	return l + prandom_u32_max(r - l); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * struct damon_addr_range - Represents an address region of [@start, @end). | ||||
|  * @start:	Start address of the region (inclusive). | ||||
|  | @ -185,6 +192,22 @@ struct damos_watermarks { | |||
| 	bool activated; | ||||
| }; | ||||
| 
 | ||||
| /**
 | ||||
|  * struct damos_stat - Statistics on a given scheme. | ||||
|  * @nr_tried:	Total number of regions that the scheme was tried to be applied to. | ||||
|  * @sz_tried:	Total size of regions that the scheme was tried to be applied to. | ||||
|  * @nr_applied:	Total number of regions that the scheme was successfully applied to. | ||||
|  * @sz_applied:	Total size of regions that the scheme was successfully applied to. | ||||
|  * @qt_exceeds: Total number of times the quota of the scheme has been exceeded. | ||||
|  */ | ||||
| struct damos_stat { | ||||
| 	unsigned long nr_tried; | ||||
| 	unsigned long sz_tried; | ||||
| 	unsigned long nr_applied; | ||||
| 	unsigned long sz_applied; | ||||
| 	unsigned long qt_exceeds; | ||||
| }; | ||||
| 
 | ||||
| /**
 | ||||
|  * struct damos - Represents a Data Access Monitoring-based Operation Scheme. | ||||
|  * @min_sz_region:	Minimum size of target regions. | ||||
|  | @ -196,8 +219,7 @@ struct damos_watermarks { | |||
|  * @action:		&damo_action to be applied to the target regions. | ||||
|  * @quota:		Control the aggressiveness of this scheme. | ||||
|  * @wmarks:		Watermarks for automated (in)activation of this scheme. | ||||
|  * @stat_count:		Total number of regions that this scheme is applied. | ||||
|  * @stat_sz:		Total size of regions that this scheme is applied. | ||||
|  * @stat:		Statistics of this scheme. | ||||
|  * @list:		List head for siblings. | ||||
|  * | ||||
|  * For each aggregation interval, DAMON finds regions which fit in the | ||||
|  | @ -228,8 +250,7 @@ struct damos { | |||
| 	enum damos_action action; | ||||
| 	struct damos_quota quota; | ||||
| 	struct damos_watermarks wmarks; | ||||
| 	unsigned long stat_count; | ||||
| 	unsigned long stat_sz; | ||||
| 	struct damos_stat stat; | ||||
| 	struct list_head list; | ||||
| }; | ||||
| 
 | ||||
|  | @ -274,7 +295,8 @@ struct damon_ctx; | |||
|  * as an integer in [0, &DAMOS_MAX_SCORE]. | ||||
|  * @apply_scheme is called from @kdamond when a region for user provided | ||||
|  * DAMON-based operation scheme is found.  It should apply the scheme's action | ||||
|  * to the region.  This is not used for &DAMON_ARBITRARY_TARGET case. | ||||
|  * to the region and return the number of bytes of the region to which the | ||||
|  * action was successfully applied. | ||||
|  * @target_valid should check whether the target is still valid for the | ||||
|  * monitoring. | ||||
|  * @cleanup is called from @kdamond just before its termination. | ||||
|  | @ -288,8 +310,9 @@ struct damon_primitive { | |||
| 	int (*get_scheme_score)(struct damon_ctx *context, | ||||
| 			struct damon_target *t, struct damon_region *r, | ||||
| 			struct damos *scheme); | ||||
| 	int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, | ||||
| 			struct damon_region *r, struct damos *scheme); | ||||
| 	unsigned long (*apply_scheme)(struct damon_ctx *context, | ||||
| 			struct damon_target *t, struct damon_region *r, | ||||
| 			struct damos *scheme); | ||||
| 	bool (*target_valid)(void *target); | ||||
| 	void (*cleanup)(struct damon_ctx *context); | ||||
| }; | ||||
|  | @ -392,14 +415,20 @@ struct damon_ctx { | |||
| 	struct list_head schemes; | ||||
| }; | ||||
| 
 | ||||
| #define damon_next_region(r) \ | ||||
| 	(container_of(r->list.next, struct damon_region, list)) | ||||
| static inline struct damon_region *damon_next_region(struct damon_region *r) | ||||
| { | ||||
| 	return container_of(r->list.next, struct damon_region, list); | ||||
| } | ||||
| 
 | ||||
| #define damon_prev_region(r) \ | ||||
| 	(container_of(r->list.prev, struct damon_region, list)) | ||||
| static inline struct damon_region *damon_prev_region(struct damon_region *r) | ||||
| { | ||||
| 	return container_of(r->list.prev, struct damon_region, list); | ||||
| } | ||||
| 
 | ||||
| #define damon_last_region(t) \ | ||||
| 	(list_last_entry(&t->regions_list, struct damon_region, list)) | ||||
| static inline struct damon_region *damon_last_region(struct damon_target *t) | ||||
| { | ||||
| 	return list_last_entry(&t->regions_list, struct damon_region, list); | ||||
| } | ||||
| 
 | ||||
| #define damon_for_each_region(r, t) \ | ||||
| 	list_for_each_entry(r, &t->regions_list, list) | ||||
|  | @ -422,9 +451,18 @@ struct damon_ctx { | |||
| #ifdef CONFIG_DAMON | ||||
| 
 | ||||
| struct damon_region *damon_new_region(unsigned long start, unsigned long end); | ||||
| inline void damon_insert_region(struct damon_region *r, | ||||
| 
 | ||||
| /*
 | ||||
|  * Add a region between two other regions | ||||
|  */ | ||||
| static inline void damon_insert_region(struct damon_region *r, | ||||
| 		struct damon_region *prev, struct damon_region *next, | ||||
| 		struct damon_target *t); | ||||
| 		struct damon_target *t) | ||||
| { | ||||
| 	__list_add(&r->list, &prev->list, &next->list); | ||||
| 	t->nr_regions++; | ||||
| } | ||||
| 
 | ||||
| void damon_add_region(struct damon_region *r, struct damon_target *t); | ||||
| void damon_destroy_region(struct damon_region *r, struct damon_target *t); | ||||
| 
 | ||||
|  | @ -461,34 +499,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); | |||
| #endif	/* CONFIG_DAMON */ | ||||
| 
 | ||||
| #ifdef CONFIG_DAMON_VADDR | ||||
| 
 | ||||
| /* Monitoring primitives for virtual memory address spaces */ | ||||
| void damon_va_init(struct damon_ctx *ctx); | ||||
| void damon_va_update(struct damon_ctx *ctx); | ||||
| void damon_va_prepare_access_checks(struct damon_ctx *ctx); | ||||
| unsigned int damon_va_check_accesses(struct damon_ctx *ctx); | ||||
| bool damon_va_target_valid(void *t); | ||||
| void damon_va_cleanup(struct damon_ctx *ctx); | ||||
| int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, | ||||
| 		struct damon_region *r, struct damos *scheme); | ||||
| int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, | ||||
| 		struct damon_region *r, struct damos *scheme); | ||||
| void damon_va_set_primitives(struct damon_ctx *ctx); | ||||
| 
 | ||||
| #endif	/* CONFIG_DAMON_VADDR */ | ||||
| 
 | ||||
| #ifdef CONFIG_DAMON_PADDR | ||||
| 
 | ||||
| /* Monitoring primitives for the physical memory address space */ | ||||
| void damon_pa_prepare_access_checks(struct damon_ctx *ctx); | ||||
| unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); | ||||
| bool damon_pa_target_valid(void *t); | ||||
| int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, | ||||
| 		struct damon_region *r, struct damos *scheme); | ||||
| int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, | ||||
| 		struct damon_region *r, struct damos *scheme); | ||||
| void damon_pa_set_primitives(struct damon_ctx *ctx); | ||||
| 
 | ||||
| #endif	/* CONFIG_DAMON_PADDR */ | ||||
| 
 | ||||
| #endif	/* _DAMON_H */ | ||||
|  |  | |||
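
Two things stand out in the damon.h changes above: the macro helpers become static inline functions (type-checked, with no behaviour change), and the primitive's apply_scheme callback now returns how many bytes it actually operated on, which feeds the new damos_stat counters. A hypothetical primitive illustrating the new contract:

/* Illustrative callback, not one of the vaddr/paddr primitives: the return
 * value is the number of bytes the action was applied to, which kdamond
 * accumulates into scheme->stat.sz_applied. */
static unsigned long my_apply_scheme(struct damon_ctx *ctx,
				     struct damon_target *t,
				     struct damon_region *r,
				     struct damos *scheme)
{
	unsigned long applied = r->ar.end - r->ar.start;

	/* ... perform scheme->action on [r->ar.start, r->ar.end) ... */

	return applied;		/* 0 means the action could not be applied */
}
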
|  | @ -3093,6 +3093,7 @@ extern void unlock_new_inode(struct inode *); | |||
| extern void discard_new_inode(struct inode *); | ||||
| extern unsigned int get_next_ino(void); | ||||
| extern void evict_inodes(struct super_block *sb); | ||||
| void dump_mapping(const struct address_space *); | ||||
| 
 | ||||
| /*
 | ||||
|  * Userspace may rely on the the inode number being non-zero. For example, glibc | ||||
|  |  | |||
|  | @ -302,7 +302,9 @@ struct vm_area_struct; | |||
|  * lowest zone as a type of emergency reserve. | ||||
|  * | ||||
|  * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit | ||||
|  * address. | ||||
|  * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory | ||||
|  * because the DMA32 kmalloc cache array is not implemented. | ||||
|  * (Reason: there are no such users in the kernel.) | ||||
|  * | ||||
|  * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, | ||||
|  * do not need to be directly accessible by the kernel but that cannot | ||||
|  | @ -598,9 +600,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order); | |||
| struct folio *folio_alloc(gfp_t gfp, unsigned order); | ||||
| extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, | ||||
| 			struct vm_area_struct *vma, unsigned long addr, | ||||
| 			int node, bool hugepage); | ||||
| 			bool hugepage); | ||||
| #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ | ||||
| 	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) | ||||
| 	alloc_pages_vma(gfp_mask, order, vma, addr, true) | ||||
| #else | ||||
| static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) | ||||
| { | ||||
|  | @ -610,14 +612,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) | |||
| { | ||||
| 	return __folio_alloc_node(gfp, order, numa_node_id()); | ||||
| } | ||||
| #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ | ||||
| #define alloc_pages_vma(gfp_mask, order, vma, addr, false)\ | ||||
| 	alloc_pages(gfp_mask, order) | ||||
| #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ | ||||
| 	alloc_pages(gfp_mask, order) | ||||
| #endif | ||||
| #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) | ||||
| #define alloc_page_vma(gfp_mask, vma, addr)			\ | ||||
| 	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) | ||||
| 	alloc_pages_vma(gfp_mask, 0, vma, addr, false) | ||||
| 
 | ||||
| extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); | ||||
| extern unsigned long get_zeroed_page(gfp_t gfp_mask); | ||||
|  |  | |||
|  | @ -622,8 +622,8 @@ struct hstate { | |||
| #endif | ||||
| #ifdef CONFIG_CGROUP_HUGETLB | ||||
| 	/* cgroup control files */ | ||||
| 	struct cftype cgroup_files_dfl[7]; | ||||
| 	struct cftype cgroup_files_legacy[9]; | ||||
| 	struct cftype cgroup_files_dfl[8]; | ||||
| 	struct cftype cgroup_files_legacy[10]; | ||||
| #endif | ||||
| 	char name[HSTATE_NAME_LEN]; | ||||
| }; | ||||
|  |  | |||
|  | @ -36,6 +36,11 @@ enum hugetlb_memory_event { | |||
| 	HUGETLB_NR_MEMORY_EVENTS, | ||||
| }; | ||||
| 
 | ||||
| struct hugetlb_cgroup_per_node { | ||||
| 	/* hugetlb usage in pages over all hstates. */ | ||||
| 	unsigned long usage[HUGE_MAX_HSTATE]; | ||||
| }; | ||||
| 
 | ||||
| struct hugetlb_cgroup { | ||||
| 	struct cgroup_subsys_state css; | ||||
| 
 | ||||
|  | @ -57,6 +62,8 @@ struct hugetlb_cgroup { | |||
| 
 | ||||
| 	/* Handle for "hugetlb.events.local" */ | ||||
| 	struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; | ||||
| 
 | ||||
| 	struct hugetlb_cgroup_per_node *nodeinfo[]; | ||||
| }; | ||||
| 
 | ||||
| static inline struct hugetlb_cgroup * | ||||
|  |  | |||
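
hugetlb_cgroup now carries a flexible nodeinfo[] array with one hugetlb_cgroup_per_node pointer per node, which is where the new per-node usage counters live (and why the cftype arrays in hugetlb.h above grew by one entry). An illustrative accessor, not a function from the patch:

/* Sum one hstate's per-node usage; 'idx' is the hstate index. */
static unsigned long hugetlb_cgroup_total_usage(struct hugetlb_cgroup *h_cg,
						int idx)
{
	unsigned long pages = 0;
	int nid;

	for_each_node(nid)
		if (h_cg->nodeinfo[nid])
			pages += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);

	return pages;
}
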
Some files were not shown because too many files have changed in this diff.