forked from mirrors/linux
		
	mm, memory-hotplug: dynamic configure movable memory and portion memory
Add online_movable and online_kernel for logical memory hotplug. This is the dynamic version of "movablecore" & "kernelcore". We have the same reason to introduce it as we had for introducing "movablecore" & "kernelcore". It has the same motive as "movablecore" & "kernelcore", but it is dynamic/run-time: o We can configure memory as kernelcore or movablecore after boot. When the userspace workload increases and we need more hugepages, we can use "online_movable" to add memory and allow the system to use more THP (transparent huge pages), and vice versa when the kernel workload increases. It also helps virtualization to dynamically configure host/guest memory, to save memory (reduce waste). Memory capacity on demand. o When a new node is physically onlined after boot, we need to use "online_movable" or "online_kernel" to configure/partition it as expected when we logically online it. This configuration also helps with physical memory migration. o All the same benefits as the existing "movablecore" & "kernelcore". o Preparation for movable-node, which is very important for power saving, hardware partitioning and high-availability systems (hardware fault management). (Note, we don't introduce movable-node here.) Action behavior: When a memory block/memory section is onlined by "online_movable", the kernel will not hold direct references to the pages of the memory block, thus we can remove that memory at any time when needed. When it is onlined by "online_kernel", the kernel can use it. When it is onlined by "online", the zone type is not changed. Current constraints: Only a memory block which is adjacent to ZONE_MOVABLE can be onlined from ZONE_NORMAL to ZONE_MOVABLE. 
[akpm@linux-foundation.org: use min_t, cleanups] Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Greg KH <greg@kroah.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									fcf07d22f0
								
							
						
					
					
						commit
						511c2aba8f
					
				
					 4 changed files with 146 additions and 14 deletions
				
			
		|  | @ -161,7 +161,8 @@ a recent addition and not present on older kernels. | ||||||
| 		    in the memory block. | 		    in the memory block. | ||||||
| 'state'           : read-write | 'state'           : read-write | ||||||
|                     at read:  contains online/offline state of memory. |                     at read:  contains online/offline state of memory. | ||||||
|                     at write: user can specify "online", "offline" command |                     at write: user can specify "online_kernel", | ||||||
|  |                     "online_movable", "online", "offline" command | ||||||
|                     which will be performed on all sections in the block. |                     which will be performed on all sections in the block. | ||||||
| 'phys_device'     : read-only: designed to show the name of physical memory | 'phys_device'     : read-only: designed to show the name of physical memory | ||||||
|                     device.  This is not well implemented now. |                     device.  This is not well implemented now. | ||||||
|  | @ -255,6 +256,17 @@ For onlining, you have to write "online" to the section's state file as: | ||||||
| 
 | 
 | ||||||
| % echo online > /sys/devices/system/memory/memoryXXX/state | % echo online > /sys/devices/system/memory/memoryXXX/state | ||||||
| 
 | 
 | ||||||
|  | This onlining will not change the ZONE type of the target memory section. | ||||||
|  | If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: | ||||||
|  | 
 | ||||||
|  | % echo online_movable > /sys/devices/system/memory/memoryXXX/state | ||||||
|  | (NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE) | ||||||
|  | 
 | ||||||
|  | And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: | ||||||
|  | 
 | ||||||
|  | % echo online_kernel > /sys/devices/system/memory/memoryXXX/state | ||||||
|  | (NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL) | ||||||
|  | 
 | ||||||
| After this, section memoryXXX's state will be 'online' and the amount of | After this, section memoryXXX's state will be 'online' and the amount of | ||||||
| available memory will be increased. | available memory will be increased. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -254,7 +254,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn, | ||||||
|  * OK to have direct references to sparsemem variables in here. |  * OK to have direct references to sparsemem variables in here. | ||||||
|  */ |  */ | ||||||
| static int | static int | ||||||
| memory_block_action(unsigned long phys_index, unsigned long action) | memory_block_action(unsigned long phys_index, unsigned long action, int online_type) | ||||||
| { | { | ||||||
| 	unsigned long start_pfn; | 	unsigned long start_pfn; | ||||||
| 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | ||||||
|  | @ -269,7 +269,7 @@ memory_block_action(unsigned long phys_index, unsigned long action) | ||||||
| 			if (!pages_correctly_reserved(start_pfn, nr_pages)) | 			if (!pages_correctly_reserved(start_pfn, nr_pages)) | ||||||
| 				return -EBUSY; | 				return -EBUSY; | ||||||
| 
 | 
 | ||||||
| 			ret = online_pages(start_pfn, nr_pages); | 			ret = online_pages(start_pfn, nr_pages, online_type); | ||||||
| 			break; | 			break; | ||||||
| 		case MEM_OFFLINE: | 		case MEM_OFFLINE: | ||||||
| 			ret = offline_pages(start_pfn, nr_pages); | 			ret = offline_pages(start_pfn, nr_pages); | ||||||
|  | @ -284,7 +284,8 @@ memory_block_action(unsigned long phys_index, unsigned long action) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int __memory_block_change_state(struct memory_block *mem, | static int __memory_block_change_state(struct memory_block *mem, | ||||||
| 		unsigned long to_state, unsigned long from_state_req) | 		unsigned long to_state, unsigned long from_state_req, | ||||||
|  | 		int online_type) | ||||||
| { | { | ||||||
| 	int ret = 0; | 	int ret = 0; | ||||||
| 
 | 
 | ||||||
|  | @ -296,7 +297,7 @@ static int __memory_block_change_state(struct memory_block *mem, | ||||||
| 	if (to_state == MEM_OFFLINE) | 	if (to_state == MEM_OFFLINE) | ||||||
| 		mem->state = MEM_GOING_OFFLINE; | 		mem->state = MEM_GOING_OFFLINE; | ||||||
| 
 | 
 | ||||||
| 	ret = memory_block_action(mem->start_section_nr, to_state); | 	ret = memory_block_action(mem->start_section_nr, to_state, online_type); | ||||||
| 
 | 
 | ||||||
| 	if (ret) { | 	if (ret) { | ||||||
| 		mem->state = from_state_req; | 		mem->state = from_state_req; | ||||||
|  | @ -319,12 +320,14 @@ static int __memory_block_change_state(struct memory_block *mem, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int memory_block_change_state(struct memory_block *mem, | static int memory_block_change_state(struct memory_block *mem, | ||||||
| 		unsigned long to_state, unsigned long from_state_req) | 		unsigned long to_state, unsigned long from_state_req, | ||||||
|  | 		int online_type) | ||||||
| { | { | ||||||
| 	int ret; | 	int ret; | ||||||
| 
 | 
 | ||||||
| 	mutex_lock(&mem->state_mutex); | 	mutex_lock(&mem->state_mutex); | ||||||
| 	ret = __memory_block_change_state(mem, to_state, from_state_req); | 	ret = __memory_block_change_state(mem, to_state, from_state_req, | ||||||
|  | 					  online_type); | ||||||
| 	mutex_unlock(&mem->state_mutex); | 	mutex_unlock(&mem->state_mutex); | ||||||
| 
 | 
 | ||||||
| 	return ret; | 	return ret; | ||||||
|  | @ -338,10 +341,18 @@ store_mem_state(struct device *dev, | ||||||
| 
 | 
 | ||||||
| 	mem = container_of(dev, struct memory_block, dev); | 	mem = container_of(dev, struct memory_block, dev); | ||||||
| 
 | 
 | ||||||
| 	if (!strncmp(buf, "online", min((int)count, 6))) | 	if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) | ||||||
| 		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); | 		ret = memory_block_change_state(mem, MEM_ONLINE, | ||||||
| 	else if(!strncmp(buf, "offline", min((int)count, 7))) | 						MEM_OFFLINE, ONLINE_KERNEL); | ||||||
| 		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | 	else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) | ||||||
|  | 		ret = memory_block_change_state(mem, MEM_ONLINE, | ||||||
|  | 						MEM_OFFLINE, ONLINE_MOVABLE); | ||||||
|  | 	else if (!strncmp(buf, "online", min_t(int, count, 6))) | ||||||
|  | 		ret = memory_block_change_state(mem, MEM_ONLINE, | ||||||
|  | 						MEM_OFFLINE, ONLINE_KEEP); | ||||||
|  | 	else if(!strncmp(buf, "offline", min_t(int, count, 7))) | ||||||
|  | 		ret = memory_block_change_state(mem, MEM_OFFLINE, | ||||||
|  | 						MEM_ONLINE, -1); | ||||||
| 
 | 
 | ||||||
| 	if (ret) | 	if (ret) | ||||||
| 		return ret; | 		return ret; | ||||||
|  | @ -676,7 +687,7 @@ int offline_memory_block(struct memory_block *mem) | ||||||
| 
 | 
 | ||||||
| 	mutex_lock(&mem->state_mutex); | 	mutex_lock(&mem->state_mutex); | ||||||
| 	if (mem->state != MEM_OFFLINE) | 	if (mem->state != MEM_OFFLINE) | ||||||
| 		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); | 		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1); | ||||||
| 	mutex_unlock(&mem->state_mutex); | 	mutex_unlock(&mem->state_mutex); | ||||||
| 
 | 
 | ||||||
| 	return ret; | 	return ret; | ||||||
|  |  | ||||||
|  | @ -26,6 +26,13 @@ enum { | ||||||
| 	MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, | 	MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | /* Types for controlling the zone type of onlined memory */ | ||||||
|  | enum { | ||||||
|  | 	ONLINE_KEEP, | ||||||
|  | 	ONLINE_KERNEL, | ||||||
|  | 	ONLINE_MOVABLE, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * pgdat resizing functions |  * pgdat resizing functions | ||||||
|  */ |  */ | ||||||
|  | @ -46,6 +53,10 @@ void pgdat_resize_init(struct pglist_data *pgdat) | ||||||
| } | } | ||||||
| /*
 | /*
 | ||||||
|  * Zone resizing functions |  * Zone resizing functions | ||||||
|  |  * | ||||||
|  |  * Note: any attempt to resize a zone should have pgdat_resize_lock() and | ||||||
|  |  * zone_span_writelock() both held. This ensures the size of a zone | ||||||
|  |  * can't be changed while pgdat_resize_lock() is held. | ||||||
|  */ |  */ | ||||||
| static inline unsigned zone_span_seqbegin(struct zone *zone) | static inline unsigned zone_span_seqbegin(struct zone *zone) | ||||||
| { | { | ||||||
|  | @ -71,7 +82,7 @@ extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); | ||||||
| extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); | extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); | ||||||
| extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); | extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); | ||||||
| /* VM interface that may be used by firmware interface */ | /* VM interface that may be used by firmware interface */ | ||||||
| extern int online_pages(unsigned long, unsigned long); | extern int online_pages(unsigned long, unsigned long, int); | ||||||
| extern void __offline_isolated_pages(unsigned long, unsigned long); | extern void __offline_isolated_pages(unsigned long, unsigned long); | ||||||
| 
 | 
 | ||||||
| typedef void (*online_page_callback_t)(struct page *page); | typedef void (*online_page_callback_t)(struct page *page); | ||||||
|  |  | ||||||
|  | @ -214,6 +214,88 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | ||||||
| 	zone_span_writeunlock(zone); | 	zone_span_writeunlock(zone); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void resize_zone(struct zone *zone, unsigned long start_pfn, | ||||||
|  | 		unsigned long end_pfn) | ||||||
|  | { | ||||||
|  | 	zone_span_writelock(zone); | ||||||
|  | 
 | ||||||
|  | 	zone->zone_start_pfn = start_pfn; | ||||||
|  | 	zone->spanned_pages = end_pfn - start_pfn; | ||||||
|  | 
 | ||||||
|  | 	zone_span_writeunlock(zone); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | ||||||
|  | 		unsigned long end_pfn) | ||||||
|  | { | ||||||
|  | 	enum zone_type zid = zone_idx(zone); | ||||||
|  | 	int nid = zone->zone_pgdat->node_id; | ||||||
|  | 	unsigned long pfn; | ||||||
|  | 
 | ||||||
|  | 	for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||||||
|  | 		set_page_links(pfn_to_page(pfn), zid, nid, pfn); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int move_pfn_range_left(struct zone *z1, struct zone *z2, | ||||||
|  | 		unsigned long start_pfn, unsigned long end_pfn) | ||||||
|  | { | ||||||
|  | 	unsigned long flags; | ||||||
|  | 
 | ||||||
|  | 	pgdat_resize_lock(z1->zone_pgdat, &flags); | ||||||
|  | 
 | ||||||
|  | 	/* can't move pfns which are higher than @z2 */ | ||||||
|  | 	if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | ||||||
|  | 		goto out_fail; | ||||||
|  | 	/* the moved-out part must be at the leftmost end of @z2 */ | ||||||
|  | 	if (start_pfn > z2->zone_start_pfn) | ||||||
|  | 		goto out_fail; | ||||||
|  | 	/* the range must be included in or overlap @z2 */ | ||||||
|  | 	if (end_pfn <= z2->zone_start_pfn) | ||||||
|  | 		goto out_fail; | ||||||
|  | 
 | ||||||
|  | 	resize_zone(z1, z1->zone_start_pfn, end_pfn); | ||||||
|  | 	resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | ||||||
|  | 
 | ||||||
|  | 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||||
|  | 
 | ||||||
|  | 	fix_zone_id(z1, start_pfn, end_pfn); | ||||||
|  | 
 | ||||||
|  | 	return 0; | ||||||
|  | out_fail: | ||||||
|  | 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||||
|  | 	return -1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int move_pfn_range_right(struct zone *z1, struct zone *z2, | ||||||
|  | 		unsigned long start_pfn, unsigned long end_pfn) | ||||||
|  | { | ||||||
|  | 	unsigned long flags; | ||||||
|  | 
 | ||||||
|  | 	pgdat_resize_lock(z1->zone_pgdat, &flags); | ||||||
|  | 
 | ||||||
|  | 	/* can't move pfns which are lower than @z1 */ | ||||||
|  | 	if (z1->zone_start_pfn > start_pfn) | ||||||
|  | 		goto out_fail; | ||||||
|  | 	/* the moved-out part must be at the rightmost end of @z1 */ | ||||||
|  | 	if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn) | ||||||
|  | 		goto out_fail; | ||||||
|  | 	/* the range must be included in or overlap @z1 */ | ||||||
|  | 	if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | ||||||
|  | 		goto out_fail; | ||||||
|  | 
 | ||||||
|  | 	resize_zone(z1, z1->zone_start_pfn, start_pfn); | ||||||
|  | 	resize_zone(z2, start_pfn, z2->zone_start_pfn + z2->spanned_pages); | ||||||
|  | 
 | ||||||
|  | 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||||
|  | 
 | ||||||
|  | 	fix_zone_id(z2, start_pfn, end_pfn); | ||||||
|  | 
 | ||||||
|  | 	return 0; | ||||||
|  | out_fail: | ||||||
|  | 	pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||||||
|  | 	return -1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | ||||||
| 			    unsigned long end_pfn) | 			    unsigned long end_pfn) | ||||||
| { | { | ||||||
|  | @ -508,7 +590,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | ||||||
| { | { | ||||||
| 	unsigned long onlined_pages = 0; | 	unsigned long onlined_pages = 0; | ||||||
| 	struct zone *zone; | 	struct zone *zone; | ||||||
|  | @ -525,6 +607,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | ||||||
| 	 */ | 	 */ | ||||||
| 	zone = page_zone(pfn_to_page(pfn)); | 	zone = page_zone(pfn_to_page(pfn)); | ||||||
| 
 | 
 | ||||||
|  | 	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | ||||||
|  | 		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | ||||||
|  | 			unlock_memory_hotplug(); | ||||||
|  | 			return -1; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | ||||||
|  | 		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | ||||||
|  | 			unlock_memory_hotplug(); | ||||||
|  | 			return -1; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	/* The code above may have changed the zone of the pfn range */ | ||||||
|  | 	zone = page_zone(pfn_to_page(pfn)); | ||||||
|  | 
 | ||||||
| 	arg.start_pfn = pfn; | 	arg.start_pfn = pfn; | ||||||
| 	arg.nr_pages = nr_pages; | 	arg.nr_pages = nr_pages; | ||||||
| 	node_states_check_changes_online(nr_pages, zone, &arg); | 	node_states_check_changes_online(nr_pages, zone, &arg); | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Lai Jiangshan
						Lai Jiangshan