mirror of https://github.com/torvalds/linux.git
synced 2025-10-31 00:28:52 +02:00

commit f35ab95ca0
rw_semaphore is a sizable structure of 40 bytes and consumes considerable
space for each vm_area_struct. However vma_lock has two important specifics
which can be used to replace rw_semaphore with a simpler structure:

1. Readers never wait. They try to take the vma_lock and fall back to
   mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock because
   writers first take mmap_lock in write mode.

Because of these requirements, full rw_semaphore functionality is not needed
and we can replace rw_semaphore and the vma->detached flag with a refcount
(vm_refcnt).

When vma is in detached state, vm_refcnt is 0 and only a call to
vma_mark_attached() can take it out of this state. Note that unlike before,
now we enforce both vma_mark_attached() and vma_mark_detached() to be done
only after vma has been write-locked. vma_mark_attached() changes vm_refcnt
to 1 to indicate that it has been attached to the vma tree.

When a reader takes read lock, it increments vm_refcnt, unless the top
usable bit of vm_refcnt (0x40000000) is set, indicating presence of a
writer. When writer takes write lock, it sets the top usable bit to indicate
its presence. If there are readers, writer will wait using newly introduced
mm->vma_writer_wait. Since all writers take mmap_lock in write mode first,
there can be only one writer at a time. The last reader to release the lock
will signal the writer to wake up. refcount might overflow if there are many
competing readers, in which case read-locking will fail. Readers are
expected to handle such failures.

In summary:
1. all readers increment the vm_refcnt;
2. writer sets top usable (writer) bit of vm_refcnt;
3. readers cannot increment the vm_refcnt if the writer bit is set;
4. in the presence of readers, writer must wait for the vm_refcnt to drop
   to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma
   with no readers;
5. vm_refcnt overflow is handled by the readers.

While this vm_lock replacement does not yet result in a smaller
vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows
for further size optimization by structure member regrouping to bring the
size of vm_area_struct below 192 bytes.

[surenb@google.com: fix a crash due to vma_end_read() that should have been removed]
Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com
Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Tested-by: Shivank Garg <shivankg@amd.com>
Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Klara Modin <klarasmodin@gmail.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
		
			
				
	
	
		
58 lines · 1.8 KiB · C
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm_types.h>
#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/cpumask.h>
#include <linux/mman.h>
#include <linux/pgtable.h>

#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/iommu.h>
#include <asm/mmu.h>

#ifndef INIT_MM_CONTEXT
#define INIT_MM_CONTEXT(name)
#endif

const struct vm_operations_struct vma_dummy_vm_ops;

/*
 * For dynamically allocated mm_structs, there is a dynamically sized cpumask
 * at the end of the structure, the size of which depends on the maximum CPU
 * number the system can see. That way we allocate only as much memory for
 * mm_cpumask() as needed for the hundreds, or thousands of processes that
 * a system typically runs.
 *
 * Since there is only one init_mm in the entire system, keep it simple
 * and size this cpu_bitmask to NR_CPUS.
 */
struct mm_struct init_mm = {
	.mm_mt		= MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
	.pgd		= swapper_pg_dir,
	.mm_users	= ATOMIC_INIT(2),
	.mm_count	= ATOMIC_INIT(1),
	.write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq),
	MMAP_LOCK_INITIALIZER(init_mm)
	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
	.vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
	.user_ns	= &init_user_ns,
	.cpu_bitmap	= CPU_BITS_NONE,
	INIT_MM_CONTEXT(init_mm)
};

void setup_initial_init_mm(void *start_code, void *end_code,
			   void *end_data, void *brk)
{
	init_mm.start_code = (unsigned long)start_code;
	init_mm.end_code = (unsigned long)end_code;
	init_mm.end_data = (unsigned long)end_data;
	init_mm.brk = (unsigned long)brk;
}
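
For context on the helper at the end of the file: architectures call
setup_initial_init_mm() early in boot to record the kernel image layout in
init_mm. A hedged illustration of the calling pattern follows; the wrapper
function name is hypothetical, and the section symbols are the ones an
architecture's linker script typically provides.

	/* Hypothetical early-boot caller; several architectures pass their
	 * linker-script section symbols to setup_initial_init_mm() like this. */
	extern char _stext[], _etext[], _edata[], _end[];

	static void record_kernel_layout(void)
	{
		setup_initial_init_mm(_stext, _etext, _edata, _end);
	}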