forked from mirrors/linux
		
	mm: add per-VMA lock and helper functions to control it
Introduce per-VMA locking.  The lock implementation relies on per-vma
and per-mm sequence counters to note exclusive locking:
  - read lock - (implemented by vma_start_read) requires the vma
    (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ.
    If they match then there must be a vma exclusive lock held somewhere.
  - read unlock - (implemented by vma_end_read) is a trivial vma->lock
    unlock.
  - write lock - (vma_start_write) requires the mmap_lock to be held
    exclusively and the current mm counter is assigned to the vma counter.
    This will allow multiple vmas to be locked under a single mmap_lock
    write lock (e.g. during vma merging). The vma counter is modified
    under exclusive vma lock.
  - write unlock - (vma_end_write_all) is a batch release of all vma
    locks held. It doesn't pair with a specific vma_start_write! It is
    done before exclusive mmap_lock is released by incrementing mm
    sequence counter (mm_lock_seq).
  - write downgrade - if the mmap_lock is downgraded to the read lock, all
    vma write locks are released as well (effectively the same as write
    unlock).
Link: https://lkml.kernel.org/r/20230227173632.3292573-13-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
			
			
This commit is contained in:
		
							parent
							
								
									438b6e12cd
								
							
						
					
					
						commit
						5e31275cc9
					
				
					 5 changed files with 110 additions and 0 deletions
				
			
		| 
						 | 
					@ -624,6 +624,87 @@ struct vm_operations_struct {
 | 
				
			||||||
					  unsigned long addr);
 | 
										  unsigned long addr);
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CONFIG_PER_VMA_LOCK
 | 
				
			||||||
 | 
					static inline void vma_init_lock(struct vm_area_struct *vma)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						init_rwsem(&vma->lock);
 | 
				
			||||||
 | 
						vma->vm_lock_seq = -1;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * Try to read-lock a vma. The function is allowed to occasionally yield false
 | 
				
			||||||
 | 
					 * locked result to avoid performance overhead, in which case we fall back to
 | 
				
			||||||
 | 
					 * using mmap_lock. The function should never yield false unlocked result.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static inline bool vma_start_read(struct vm_area_struct *vma)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						/* Check before locking. A race might cause false locked result. */
 | 
				
			||||||
 | 
						if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
 | 
				
			||||||
 | 
							return false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (unlikely(down_read_trylock(&vma->lock) == 0))
 | 
				
			||||||
 | 
							return false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * Overflow might produce false locked result.
 | 
				
			||||||
 | 
						 * False unlocked result is impossible because we modify and check
 | 
				
			||||||
 | 
						 * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
 | 
				
			||||||
 | 
						 * modification invalidates all existing locks.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
 | 
				
			||||||
 | 
							up_read(&vma->lock);
 | 
				
			||||||
 | 
							return false;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						return true;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static inline void vma_end_read(struct vm_area_struct *vma)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						rcu_read_lock(); /* keeps vma alive till the end of up_read */
 | 
				
			||||||
 | 
						up_read(&vma->lock);
 | 
				
			||||||
 | 
						rcu_read_unlock();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static inline void vma_start_write(struct vm_area_struct *vma)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						int mm_lock_seq;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						mmap_assert_write_locked(vma->vm_mm);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
 | 
				
			||||||
 | 
						 * mm->mm_lock_seq can't be concurrently modified.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
 | 
				
			||||||
 | 
						if (vma->vm_lock_seq == mm_lock_seq)
 | 
				
			||||||
 | 
							return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						down_write(&vma->lock);
 | 
				
			||||||
 | 
						vma->vm_lock_seq = mm_lock_seq;
 | 
				
			||||||
 | 
						up_write(&vma->lock);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						mmap_assert_write_locked(vma->vm_mm);
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
 | 
				
			||||||
 | 
						 * mm->mm_lock_seq can't be concurrently modified.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#else /* CONFIG_PER_VMA_LOCK */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static inline void vma_init_lock(struct vm_area_struct *vma) {}
 | 
				
			||||||
 | 
					static inline bool vma_start_read(struct vm_area_struct *vma)
 | 
				
			||||||
 | 
							{ return false; }
 | 
				
			||||||
 | 
					static inline void vma_end_read(struct vm_area_struct *vma) {}
 | 
				
			||||||
 | 
					static inline void vma_start_write(struct vm_area_struct *vma) {}
 | 
				
			||||||
 | 
					static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif /* CONFIG_PER_VMA_LOCK */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 | 
					static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	static const struct vm_operations_struct dummy_vm_ops = {};
 | 
						static const struct vm_operations_struct dummy_vm_ops = {};
 | 
				
			||||||
| 
						 | 
					@ -632,6 +713,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 | 
				
			||||||
	vma->vm_mm = mm;
 | 
						vma->vm_mm = mm;
 | 
				
			||||||
	vma->vm_ops = &dummy_vm_ops;
 | 
						vma->vm_ops = &dummy_vm_ops;
 | 
				
			||||||
	INIT_LIST_HEAD(&vma->anon_vma_chain);
 | 
						INIT_LIST_HEAD(&vma->anon_vma_chain);
 | 
				
			||||||
 | 
						vma_init_lock(vma);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Use when VMA is not part of the VMA tree and needs no locking */
 | 
					/* Use when VMA is not part of the VMA tree and needs no locking */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -503,6 +503,11 @@ struct vm_area_struct {
 | 
				
			||||||
		vm_flags_t __private __vm_flags;
 | 
							vm_flags_t __private __vm_flags;
 | 
				
			||||||
	};
 | 
						};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CONFIG_PER_VMA_LOCK
 | 
				
			||||||
 | 
						int vm_lock_seq;
 | 
				
			||||||
 | 
						struct rw_semaphore lock;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * For areas with an address space and backing store,
 | 
						 * For areas with an address space and backing store,
 | 
				
			||||||
	 * linkage into the address_space->i_mmap interval tree.
 | 
						 * linkage into the address_space->i_mmap interval tree.
 | 
				
			||||||
| 
						 | 
					@ -639,6 +644,9 @@ struct mm_struct {
 | 
				
			||||||
					  * init_mm.mmlist, and are protected
 | 
										  * init_mm.mmlist, and are protected
 | 
				
			||||||
					  * by mmlist_lock
 | 
										  * by mmlist_lock
 | 
				
			||||||
					  */
 | 
										  */
 | 
				
			||||||
 | 
					#ifdef CONFIG_PER_VMA_LOCK
 | 
				
			||||||
 | 
							int mm_lock_seq;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		unsigned long hiwater_rss; /* High-watermark of RSS usage */
 | 
							unsigned long hiwater_rss; /* High-watermark of RSS usage */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
 | 
				
			||||||
	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 | 
						VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CONFIG_PER_VMA_LOCK
					/*
					 * Release every vma write lock taken under the current mmap_lock write
					 * section in one go, by advancing mm->mm_lock_seq so that it no longer
					 * matches the vm_lock_seq of any vma marked during this section. The
					 * caller must hold mmap_lock for writing.
					 */
					static inline void vma_end_write_all(struct mm_struct *mm)
					{
						mmap_assert_write_locked(mm);
						/*
						 * No other writer can race with the update because the exclusive
						 * mmap_lock is held. Lockless readers, however, compare against
						 * mm_lock_seq without taking mmap_lock, so publish the new value
						 * with RELEASE semantics: a reader that observes the incremented
						 * counter (with an ACQUIRE load) then also observes all vma
						 * modifications made before this point.
						 */
						smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
					}
					#else
					static inline void vma_end_write_all(struct mm_struct *mm) {}
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline void mmap_init_lock(struct mm_struct *mm)
 | 
					static inline void mmap_init_lock(struct mm_struct *mm)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	init_rwsem(&mm->mmap_lock);
 | 
						init_rwsem(&mm->mmap_lock);
 | 
				
			||||||
| 
						 | 
					@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 | 
				
			||||||
static inline void mmap_write_unlock(struct mm_struct *mm)
 | 
					static inline void mmap_write_unlock(struct mm_struct *mm)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	__mmap_lock_trace_released(mm, true);
 | 
						__mmap_lock_trace_released(mm, true);
 | 
				
			||||||
 | 
						vma_end_write_all(mm);
 | 
				
			||||||
	up_write(&mm->mmap_lock);
 | 
						up_write(&mm->mmap_lock);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline void mmap_write_downgrade(struct mm_struct *mm)
 | 
					static inline void mmap_write_downgrade(struct mm_struct *mm)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	__mmap_lock_trace_acquire_returned(mm, false, true);
 | 
						__mmap_lock_trace_acquire_returned(mm, false, true);
 | 
				
			||||||
 | 
						vma_end_write_all(mm);
 | 
				
			||||||
	downgrade_write(&mm->mmap_lock);
 | 
						downgrade_write(&mm->mmap_lock);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 | 
				
			||||||
		 */
 | 
							 */
 | 
				
			||||||
		data_race(memcpy(new, orig, sizeof(*new)));
 | 
							data_race(memcpy(new, orig, sizeof(*new)));
 | 
				
			||||||
		INIT_LIST_HEAD(&new->anon_vma_chain);
 | 
							INIT_LIST_HEAD(&new->anon_vma_chain);
 | 
				
			||||||
 | 
							vma_init_lock(new);
 | 
				
			||||||
		dup_anon_vma_name(orig, new);
 | 
							dup_anon_vma_name(orig, new);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return new;
 | 
						return new;
 | 
				
			||||||
| 
						 | 
					@ -1208,6 +1209,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 | 
				
			||||||
	seqcount_init(&mm->write_protect_seq);
 | 
						seqcount_init(&mm->write_protect_seq);
 | 
				
			||||||
	mmap_init_lock(mm);
 | 
						mmap_init_lock(mm);
 | 
				
			||||||
	INIT_LIST_HEAD(&mm->mmlist);
 | 
						INIT_LIST_HEAD(&mm->mmlist);
 | 
				
			||||||
 | 
					#ifdef CONFIG_PER_VMA_LOCK
 | 
				
			||||||
 | 
						mm->mm_lock_seq = 0;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
	mm_pgtables_bytes_init(mm);
 | 
						mm_pgtables_bytes_init(mm);
 | 
				
			||||||
	mm->map_count = 0;
 | 
						mm->map_count = 0;
 | 
				
			||||||
	mm->locked_vm = 0;
 | 
						mm->locked_vm = 0;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -37,6 +37,9 @@ struct mm_struct init_mm = {
 | 
				
			||||||
	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 | 
						.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 | 
				
			||||||
	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 | 
						.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 | 
				
			||||||
	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 | 
						.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 | 
				
			||||||
 | 
					#ifdef CONFIG_PER_VMA_LOCK
 | 
				
			||||||
 | 
						.mm_lock_seq	= 0,
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
	.user_ns	= &init_user_ns,
 | 
						.user_ns	= &init_user_ns,
 | 
				
			||||||
	.cpu_bitmap	= CPU_BITS_NONE,
 | 
						.cpu_bitmap	= CPU_BITS_NONE,
 | 
				
			||||||
#ifdef CONFIG_IOMMU_SVA
 | 
					#ifdef CONFIG_IOMMU_SVA
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue