forked from mirrors/linux
		
	mm, memory-hotplug: dynamic configure movable memory and portion memory
Add online_movable and online_kernel for logical memory hotplug. This is the dynamic version of "movablecore" & "kernelcore". We have the same reason to introduce it as to introduce "movablecore" & "kernelcore". It has the same motive as "movablecore" & "kernelcore", but it is dynamic/run-time: o We can configure memory as kernelcore or movablecore after boot. When userspace workload increases and we need more hugepages, we can use "online_movable" to add memory and allow the system to use more THP (transparent huge pages), and vice versa when kernel workload increases. It also helps virtualization to dynamically configure host/guest memory, to save (reduce wasted) memory. Memory capacity on demand. o When a new node is physically onlined after boot, we need to use "online_movable" or "online_kernel" to configure/portion it as we expected when we logically online it. This configuration also helps physical memory migration. o All the same benefits as the existing "movablecore" & "kernelcore". o Preparing for movable-node, which is very important for power saving, hardware partitioning and high-availability systems (hardware fault management). (Note, we don't introduce movable-node here.) Action behavior: When a memoryblock/memorysection is onlined by "online_movable", the kernel will not have direct references to the pages of the memoryblock, thus we can remove that memory any time when needed. When it is onlined by "online_kernel", the kernel can use it. When it is onlined by "online", the zone type is not changed. Current constraints: Only a memoryblock which is adjacent to ZONE_MOVABLE can be onlined from ZONE_NORMAL to ZONE_MOVABLE.
[akpm@linux-foundation.org: use min_t, cleanups] Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Greg KH <greg@kroah.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									fcf07d22f0
								
							
						
					
					
						commit
						511c2aba8f
					
				
					 4 changed files with 146 additions and 14 deletions
				
			
		|  | @ -161,7 +161,8 @@ a recent addition and not present on older kernels. | |||
| 		    in the memory block. | ||||
| 'state'           : read-write | ||||
|                     at read:  contains online/offline state of memory. | ||||
|                     at write: user can specify "online", "offline" command | ||||
|                     at write: user can specify "online_kernel", | ||||
|                     "online_movable", "online", "offline" command | ||||
|                     which will be performed on all sections in the block. | ||||
| 'phys_device'     : read-only: designed to show the name of physical memory | ||||
|                     device.  This is not well implemented now. | ||||
|  | @ -255,6 +256,17 @@ For onlining, you have to write "online" to the section's state file as: | |||
| 
 | ||||
| % echo online > /sys/devices/system/memory/memoryXXX/state | ||||
| 
 | ||||
| This onlining will not change the ZONE type of the target memory section. | ||||
| If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: | ||||
| 
 | ||||
| % echo online_movable > /sys/devices/system/memory/memoryXXX/state | ||||
| (NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE) | ||||
| 
 | ||||
| And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: | ||||
| 
 | ||||
| % echo online_kernel > /sys/devices/system/memory/memoryXXX/state | ||||
| (NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL) | ||||
| 
 | ||||
| After this, section memoryXXX's state will be 'online' and the amount of | ||||
| available memory will be increased. | ||||
| 
 | ||||
|  |  | |||
|  | @ -254,7 +254,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn, | |||
|  * OK to have direct references to sparsemem variables in here. | ||||
|  */ | ||||
| static int | ||||
| memory_block_action(unsigned long phys_index, unsigned long action) | ||||
| memory_block_action(unsigned long phys_index, unsigned long action, int online_type) | ||||
| { | ||||
| 	unsigned long start_pfn; | ||||
| 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | ||||
|  | @ -269,7 +269,7 @@ memory_block_action(unsigned long phys_index, unsigned long action) | |||
| 			if (!pages_correctly_reserved(start_pfn, nr_pages)) | ||||
| 				return -EBUSY; | ||||
| 
 | ||||
| 			ret = online_pages(start_pfn, nr_pages); | ||||
| 			ret = online_pages(start_pfn, nr_pages, online_type); | ||||
| 			break; | ||||
| 		case MEM_OFFLINE: | ||||
| 			ret = offline_pages(start_pfn, nr_pages); | ||||
|  | @ -284,7 +284,8 @@ memory_block_action(unsigned long phys_index, unsigned long action) | |||
| } | ||||
| 
 | ||||
| static int __memory_block_change_state(struct memory_block *mem, | ||||
| 		unsigned long to_state, unsigned long from_state_req) | ||||
| 		unsigned long to_state, unsigned long from_state_req, | ||||
| 		int online_type) | ||||
| { | ||||
| 	int ret = 0; | ||||
| 
 | ||||
|  | @ -296,7 +297,7 @@ static int __memory_block_change_state(struct memory_block *mem, | |||
| 	if (to_state == MEM_OFFLINE) | ||||
| 		mem->state = MEM_GOING_OFFLINE; | ||||
| 
 | ||||
| 	ret = memory_block_action(mem->start_section_nr, to_state); | ||||
| 	ret = memory_block_action(mem->start_section_nr, to_state, online_type); | ||||
| 
 | ||||
| 	if (ret) { | ||||
| 		mem->state = from_state_req; | ||||
|  | @ -319,12 +320,14 @@ static int __memory_block_change_state(struct memory_block *mem, | |||
| } | ||||
| 
 | ||||
| static int memory_block_change_state(struct memory_block *mem, | ||||
| 		unsigned long to_state, unsigned long from_state_req) | ||||
| 		unsigned long to_state, unsigned long from_state_req, | ||||
| 		int online_type) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	mutex_lock(&mem->state_mutex); | ||||
| 	ret = __memory_block_change_state(mem, to_state, from_state_req); | ||||
| 	ret = __memory_block_change_state(mem, to_state, from_state_req, | ||||
| 					  online_type); | ||||
| 	mutex_unlock(&mem->state_mutex); | ||||
| 
 | ||||
| 	return ret; | ||||
|  | @ -338,10 +341,18 @@ store_mem_state(struct device *dev, | |||
| 
 | ||||
| 	mem = container_of(dev, struct memory_block, dev); | ||||
| 
 | ||||
| 	if (!strncmp(buf, "online", min((int)count, 6))) | ||||
| 		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); | ||||
| 	else if(!strncmp(buf, "offline", min((int)count, 7))) | ||||
| 		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | ||||
| 	if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) | ||||
| 		ret = memory_block_change_state(mem, MEM_ONLINE, | ||||
| 						MEM_OFFLINE, ONLINE_KERNEL); | ||||
| 	else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) | ||||
| 		ret = memory_block_change_state(mem, MEM_ONLINE, | ||||
| 						MEM_OFFLINE, ONLINE_MOVABLE); | ||||
| 	else if (!strncmp(buf, "online", min_t(int, count, 6))) | ||||
| 		ret = memory_block_change_state(mem, MEM_ONLINE, | ||||
| 						MEM_OFFLINE, ONLINE_KEEP); | ||||
| 	else if(!strncmp(buf, "offline", min_t(int, count, 7))) | ||||
| 		ret = memory_block_change_state(mem, MEM_OFFLINE, | ||||
| 						MEM_ONLINE, -1); | ||||
| 
 | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
|  | @ -676,7 +687,7 @@ int offline_memory_block(struct memory_block *mem) | |||
| 
 | ||||
| 	mutex_lock(&mem->state_mutex); | ||||
| 	if (mem->state != MEM_OFFLINE) | ||||
| 		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | ||||
| 		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1); | ||||
| 	mutex_unlock(&mem->state_mutex); | ||||
| 
 | ||||
| 	return ret; | ||||
|  |  | |||
|  | @ -26,6 +26,13 @@ enum { | |||
| 	MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, | ||||
| }; | ||||
| 
 | ||||
/*
 * Selects the zone type that memory is placed in when it is onlined.
 */
enum {
	ONLINE_KEEP = 0,	/* used for a plain "online" request */
	ONLINE_KERNEL = 1,	/* used for an "online_kernel" request */
	ONLINE_MOVABLE = 2,	/* used for an "online_movable" request */
};
| 
 | ||||
| /*
 | ||||
|  * pgdat resizing functions | ||||
|  */ | ||||
|  | @ -46,6 +53,10 @@ void pgdat_resize_init(struct pglist_data *pgdat) | |||
| } | ||||
| /*
 | ||||
|  * Zone resizing functions | ||||
|  * | ||||
|  * Note: any attempt to resize a zone must hold both pgdat_resize_lock() | ||||
|  * and zone_span_writelock(). This ensures the size of a zone cannot be | ||||
|  * changed while pgdat_resize_lock() is held. | ||||
|  */ | ||||
| static inline unsigned zone_span_seqbegin(struct zone *zone) | ||||
| { | ||||
|  | @ -71,7 +82,7 @@ extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); | |||
| extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); | ||||
| extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); | ||||
| /* VM interface that may be used by firmware interface */ | ||||
| extern int online_pages(unsigned long, unsigned long); | ||||
| extern int online_pages(unsigned long, unsigned long, int); | ||||
| extern void __offline_isolated_pages(unsigned long, unsigned long); | ||||
| 
 | ||||
| typedef void (*online_page_callback_t)(struct page *page); | ||||
|  |  | |||
|  | @ -214,6 +214,88 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
| 	zone_span_writeunlock(zone); | ||||
| } | ||||
| 
 | ||||
| static void resize_zone(struct zone *zone, unsigned long start_pfn, | ||||
| 		unsigned long end_pfn) | ||||
| { | ||||
| 	zone_span_writelock(zone); | ||||
| 
 | ||||
| 	zone->zone_start_pfn = start_pfn; | ||||
| 	zone->spanned_pages = end_pfn - start_pfn; | ||||
| 
 | ||||
| 	zone_span_writeunlock(zone); | ||||
| } | ||||
| 
 | ||||
| static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | ||||
| 		unsigned long end_pfn) | ||||
| { | ||||
| 	enum zone_type zid = zone_idx(zone); | ||||
| 	int nid = zone->zone_pgdat->node_id; | ||||
| 	unsigned long pfn; | ||||
| 
 | ||||
| 	for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||||
| 		set_page_links(pfn_to_page(pfn), zid, nid, pfn); | ||||
| } | ||||
| 
 | ||||
| static int move_pfn_range_left(struct zone *z1, struct zone *z2, | ||||
| 		unsigned long start_pfn, unsigned long end_pfn) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	pgdat_resize_lock(z1->zone_pgdat, &flags); | ||||
| 
 | ||||
| 	/* can't move pfns which are higher than @z2 */ | ||||
| 	if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | ||||
| 		goto out_fail; | ||||
| 	/* the move out part mast at the left most of @z2 */ | ||||
| 	if (start_pfn > z2->zone_start_pfn) | ||||
| 		goto out_fail; | ||||
| 	/* must included/overlap */ | ||||
| 	if (end_pfn <= z2->zone_start_pfn) | ||||
| 		goto out_fail; | ||||
| 
 | ||||
| 	resize_zone(z1, z1->zone_start_pfn, end_pfn); | ||||
| 	resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | ||||
| 
 | ||||
| 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||
| 
 | ||||
| 	fix_zone_id(z1, start_pfn, end_pfn); | ||||
| 
 | ||||
| 	return 0; | ||||
| out_fail: | ||||
| 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||
| 	return -1; | ||||
| } | ||||
| 
 | ||||
| static int move_pfn_range_right(struct zone *z1, struct zone *z2, | ||||
| 		unsigned long start_pfn, unsigned long end_pfn) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	pgdat_resize_lock(z1->zone_pgdat, &flags); | ||||
| 
 | ||||
| 	/* can't move pfns which are lower than @z1 */ | ||||
| 	if (z1->zone_start_pfn > start_pfn) | ||||
| 		goto out_fail; | ||||
| 	/* the move out part mast at the right most of @z1 */ | ||||
| 	if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn) | ||||
| 		goto out_fail; | ||||
| 	/* must included/overlap */ | ||||
| 	if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | ||||
| 		goto out_fail; | ||||
| 
 | ||||
| 	resize_zone(z1, z1->zone_start_pfn, start_pfn); | ||||
| 	resize_zone(z2, start_pfn, z2->zone_start_pfn + z2->spanned_pages); | ||||
| 
 | ||||
| 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||
| 
 | ||||
| 	fix_zone_id(z2, start_pfn, end_pfn); | ||||
| 
 | ||||
| 	return 0; | ||||
| out_fail: | ||||
| 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||
| 	return -1; | ||||
| } | ||||
| 
 | ||||
| static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | ||||
| 			    unsigned long end_pfn) | ||||
| { | ||||
|  | @ -508,7 +590,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | ||||
| int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | ||||
| { | ||||
| 	unsigned long onlined_pages = 0; | ||||
| 	struct zone *zone; | ||||
|  | @ -525,6 +607,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 	 */ | ||||
| 	zone = page_zone(pfn_to_page(pfn)); | ||||
| 
 | ||||
| 	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | ||||
| 		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | ||||
| 			unlock_memory_hotplug(); | ||||
| 			return -1; | ||||
| 		} | ||||
| 	} | ||||
| 	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | ||||
| 		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | ||||
| 			unlock_memory_hotplug(); | ||||
| 			return -1; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* Previous code may changed the zone of the pfn range */ | ||||
| 	zone = page_zone(pfn_to_page(pfn)); | ||||
| 
 | ||||
| 	arg.start_pfn = pfn; | ||||
| 	arg.nr_pages = nr_pages; | ||||
| 	node_states_check_changes_online(nr_pages, zone, &arg); | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Lai Jiangshan
						Lai Jiangshan