forked from mirrors/linux
		
	mm/vmalloc: rework the drain logic
The current "lazy drain" model suffers from at least two issues.
The first is related to the unsorted list of vmap areas: in order to
identify the [min:max] range of areas to be drained, it requires a full
list scan, which is time consuming if the list is too long.
The second, as a next step, is about merging all fragments with the free
space.  This is also time consuming because it has to iterate over the
entire list which holds outstanding lazy areas.
See below the "preemptirqsoff" tracer output that illustrates a high latency.
It is ~24676us.  Our workloads like audio and video are affected by such long
latency:
<snip>
  tracer: preemptirqsoff
  preemptirqsoff latency trace v1.1.5 on 4.9.186-perf+
  --------------------------------------------------------------------
  latency: 24676 us, #4/4, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 P:8)
     -----------------
     | task: crtc_commit:112-261 (uid:0 nice:0 policy:1 rt_prio:16)
     -----------------
   => started at: __purge_vmap_area_lazy
   => ended at:   __purge_vmap_area_lazy
                   _------=> CPU#
                  / _-----=> irqs-off
                 | / _----=> need-resched
                 || / _---=> hardirq/softirq
                 ||| / _--=> preempt-depth
                 |||| /     delay
   cmd     pid   ||||| time  |   caller
      \   /      |||||  \    |   /
crtc_com-261     1...1    1us*: _raw_spin_lock <-__purge_vmap_area_lazy
[...]
crtc_com-261     1...1 24675us : _raw_spin_unlock <-__purge_vmap_area_lazy
crtc_com-261     1...1 24677us : trace_preempt_on <-__purge_vmap_area_lazy
crtc_com-261     1...1 24683us : <stack trace>
 => free_vmap_area_noflush
 => remove_vm_area
 => __vunmap
 => vfree
 => drm_property_free_blob
 => drm_mode_object_unreference
 => drm_property_unreference_blob
 => __drm_atomic_helper_crtc_destroy_state
 => sde_crtc_destroy_state
 => drm_atomic_state_default_clear
 => drm_atomic_state_clear
 => drm_atomic_state_free
 => complete_commit
 => _msm_drm_commit_work_cb
 => kthread_worker_fn
 => kthread
 => ret_from_fork
<snip>
To address those two issues we can redesign the purging of the outstanding
lazy areas.  Instead of queuing vmap areas to a list, we replace it by
a separate rb-tree.  In that case an area is located in the tree/list in
ascending order.  It will give us the below advantages:
a) Outstanding vmap areas are merged creating bigger coalesced blocks,
   thus it becomes less fragmented.
b) It is possible to calculate a flush range [min:max] without scanning
   all elements, i.e. with O(1) access time;
c) The final merge of areas with the rb-tree that represents a free
   space is faster because of (a).  As a result the lock contention is
   also reduced.
Link: https://lkml.kernel.org/r/20201116220033.1837-2-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: huang ying <huang.ying.caritas@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
			
			
This commit is contained in:
		
							parent
							
								
									8945a72306
								
							
						
					
					
						commit
						96e2db4561
					
				
					 2 changed files with 53 additions and 45 deletions
				
			
		|  | @ -72,16 +72,14 @@ struct vmap_area { | |||
| 	struct list_head list;          /* address sorted list */ | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The following three variables can be packed, because | ||||
| 	 * a vmap_area object is always one of the three states: | ||||
| 	 * The following two variables can be packed, because | ||||
| 	 * a vmap_area object can be either: | ||||
| 	 *    1) in "free" tree (root is vmap_area_root) | ||||
| 	 *    2) in "busy" tree (root is free_vmap_area_root) | ||||
| 	 *    3) in purge list  (head is vmap_purge_list) | ||||
| 	 *    2) or "busy" tree (root is free_vmap_area_root) | ||||
| 	 */ | ||||
| 	union { | ||||
| 		unsigned long subtree_max_size; /* in "free" tree */ | ||||
| 		struct vm_struct *vm;           /* in "busy" tree */ | ||||
| 		struct llist_node purge_list;   /* in purge list */ | ||||
| 	}; | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										90
									
								
								mm/vmalloc.c
									
									
									
									
									
								
							
							
						
						
									
										90
									
								
								mm/vmalloc.c
									
									
									
									
									
								
							|  | @ -413,10 +413,13 @@ static DEFINE_SPINLOCK(vmap_area_lock); | |||
| static DEFINE_SPINLOCK(free_vmap_area_lock); | ||||
| /* Export for kexec only */ | ||||
| LIST_HEAD(vmap_area_list); | ||||
| static LLIST_HEAD(vmap_purge_list); | ||||
| static struct rb_root vmap_area_root = RB_ROOT; | ||||
| static bool vmap_initialized __read_mostly; | ||||
| 
 | ||||
| static struct rb_root purge_vmap_area_root = RB_ROOT; | ||||
| static LIST_HEAD(purge_vmap_area_list); | ||||
| static DEFINE_SPINLOCK(purge_vmap_area_lock); | ||||
| 
 | ||||
| /*
 | ||||
|  * This kmem_cache is used for vmap_area objects. Instead of | ||||
|  * allocating from slab we reuse an object from this cache to | ||||
|  | @ -820,10 +823,17 @@ merge_or_add_vmap_area(struct vmap_area *va, | |||
| 	if (!merged) | ||||
| 		link_va(va, root, parent, link, head); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Last step is to check and update the tree. | ||||
| 	 */ | ||||
| 	augment_tree_propagate_from(va); | ||||
| 	return va; | ||||
| } | ||||
| 
 | ||||
| static __always_inline struct vmap_area * | ||||
| merge_or_add_vmap_area_augment(struct vmap_area *va, | ||||
| 	struct rb_root *root, struct list_head *head) | ||||
| { | ||||
| 	va = merge_or_add_vmap_area(va, root, head); | ||||
| 	if (va) | ||||
| 		augment_tree_propagate_from(va); | ||||
| 
 | ||||
| 	return va; | ||||
| } | ||||
| 
 | ||||
|  | @ -1138,7 +1148,7 @@ static void free_vmap_area(struct vmap_area *va) | |||
| 	 * Insert/Merge it back to the free tree/list. | ||||
| 	 */ | ||||
| 	spin_lock(&free_vmap_area_lock); | ||||
| 	merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); | ||||
| 	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); | ||||
| 	spin_unlock(&free_vmap_area_lock); | ||||
| } | ||||
| 
 | ||||
|  | @ -1326,32 +1336,32 @@ void set_iounmap_nonlazy(void) | |||
| static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) | ||||
| { | ||||
| 	unsigned long resched_threshold; | ||||
| 	struct llist_node *valist; | ||||
| 	struct vmap_area *va; | ||||
| 	struct vmap_area *n_va; | ||||
| 	struct list_head local_pure_list; | ||||
| 	struct vmap_area *va, *n_va; | ||||
| 
 | ||||
| 	lockdep_assert_held(&vmap_purge_lock); | ||||
| 
 | ||||
| 	valist = llist_del_all(&vmap_purge_list); | ||||
| 	if (unlikely(valist == NULL)) | ||||
| 	spin_lock(&purge_vmap_area_lock); | ||||
| 	purge_vmap_area_root = RB_ROOT; | ||||
| 	list_replace_init(&purge_vmap_area_list, &local_pure_list); | ||||
| 	spin_unlock(&purge_vmap_area_lock); | ||||
| 
 | ||||
| 	if (unlikely(list_empty(&local_pure_list))) | ||||
| 		return false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * TODO: to calculate a flush range without looping. | ||||
| 	 * The list can be up to lazy_max_pages() elements. | ||||
| 	 */ | ||||
| 	llist_for_each_entry(va, valist, purge_list) { | ||||
| 		if (va->va_start < start) | ||||
| 			start = va->va_start; | ||||
| 		if (va->va_end > end) | ||||
| 			end = va->va_end; | ||||
| 	} | ||||
| 	start = min(start, | ||||
| 		list_first_entry(&local_pure_list, | ||||
| 			struct vmap_area, list)->va_start); | ||||
| 
 | ||||
| 	end = max(end, | ||||
| 		list_last_entry(&local_pure_list, | ||||
| 			struct vmap_area, list)->va_end); | ||||
| 
 | ||||
| 	flush_tlb_kernel_range(start, end); | ||||
| 	resched_threshold = lazy_max_pages() << 1; | ||||
| 
 | ||||
| 	spin_lock(&free_vmap_area_lock); | ||||
| 	llist_for_each_entry_safe(va, n_va, valist, purge_list) { | ||||
| 	list_for_each_entry_safe(va, n_va, &local_pure_list, list) { | ||||
| 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; | ||||
| 		unsigned long orig_start = va->va_start; | ||||
| 		unsigned long orig_end = va->va_end; | ||||
|  | @ -1361,8 +1371,8 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) | |||
| 		 * detached and there is no need to "unlink" it from | ||||
| 		 * anything. | ||||
| 		 */ | ||||
| 		va = merge_or_add_vmap_area(va, &free_vmap_area_root, | ||||
| 					    &free_vmap_area_list); | ||||
| 		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, | ||||
| 				&free_vmap_area_list); | ||||
| 
 | ||||
| 		if (!va) | ||||
| 			continue; | ||||
|  | @ -1419,9 +1429,15 @@ static void free_vmap_area_noflush(struct vmap_area *va) | |||
| 	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> | ||||
| 				PAGE_SHIFT, &vmap_lazy_nr); | ||||
| 
 | ||||
| 	/* After this point, we may free va at any time */ | ||||
| 	llist_add(&va->purge_list, &vmap_purge_list); | ||||
| 	/*
 | ||||
| 	 * Merge or place it to the purge tree/list. | ||||
| 	 */ | ||||
| 	spin_lock(&purge_vmap_area_lock); | ||||
| 	merge_or_add_vmap_area(va, | ||||
| 		&purge_vmap_area_root, &purge_vmap_area_list); | ||||
| 	spin_unlock(&purge_vmap_area_lock); | ||||
| 
 | ||||
| 	/* After this point, we may free va at any time */ | ||||
| 	if (unlikely(nr_lazy > lazy_max_pages())) | ||||
| 		try_purge_vmap_area_lazy(); | ||||
| } | ||||
|  | @ -3351,8 +3367,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
| 	while (area--) { | ||||
| 		orig_start = vas[area]->va_start; | ||||
| 		orig_end = vas[area]->va_end; | ||||
| 		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, | ||||
| 					    &free_vmap_area_list); | ||||
| 		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, | ||||
| 				&free_vmap_area_list); | ||||
| 		if (va) | ||||
| 			kasan_release_vmalloc(orig_start, orig_end, | ||||
| 				va->va_start, va->va_end); | ||||
|  | @ -3401,8 +3417,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
| 	for (area = 0; area < nr_vms; area++) { | ||||
| 		orig_start = vas[area]->va_start; | ||||
| 		orig_end = vas[area]->va_end; | ||||
| 		va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, | ||||
| 					    &free_vmap_area_list); | ||||
| 		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, | ||||
| 				&free_vmap_area_list); | ||||
| 		if (va) | ||||
| 			kasan_release_vmalloc(orig_start, orig_end, | ||||
| 				va->va_start, va->va_end); | ||||
|  | @ -3482,18 +3498,15 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
| 
 | ||||
| static void show_purge_info(struct seq_file *m) | ||||
| { | ||||
| 	struct llist_node *head; | ||||
| 	struct vmap_area *va; | ||||
| 
 | ||||
| 	head = READ_ONCE(vmap_purge_list.first); | ||||
| 	if (head == NULL) | ||||
| 		return; | ||||
| 
 | ||||
| 	llist_for_each_entry(va, head, purge_list) { | ||||
| 	spin_lock(&purge_vmap_area_lock); | ||||
| 	list_for_each_entry(va, &purge_vmap_area_list, list) { | ||||
| 		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", | ||||
| 			(void *)va->va_start, (void *)va->va_end, | ||||
| 			va->va_end - va->va_start); | ||||
| 	} | ||||
| 	spin_unlock(&purge_vmap_area_lock); | ||||
| } | ||||
| 
 | ||||
| static int s_show(struct seq_file *m, void *p) | ||||
|  | @ -3551,10 +3564,7 @@ static int s_show(struct seq_file *m, void *p) | |||
| 	seq_putc(m, '\n'); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * As a final step, dump "unpurged" areas. Note, | ||||
| 	 * that entire "/proc/vmallocinfo" output will not | ||||
| 	 * be address sorted, because the purge list is not | ||||
| 	 * sorted. | ||||
| 	 * As a final step, dump "unpurged" areas. | ||||
| 	 */ | ||||
| 	if (list_is_last(&va->list, &vmap_area_list)) | ||||
| 		show_purge_info(m); | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Uladzislau Rezki (Sony)
						Uladzislau Rezki (Sony)