	mm, swap: use rbtree for swap_extent
swap_extent is used to map a swap page offset to the backing device's block offset. For a continuous block range, one swap_extent is used, and all these swap_extents are managed in a linked list.

These swap_extents are used by map_swap_entry() during swap's read and write path. To find the backing device's block offset for a page offset, the swap_extent list is traversed linearly, with curr_swap_extent used as a cache to speed up the search.

This works well as long as swap_extents are not huge or the number of processes that access the swap device is small, but when the swap device has many extents and there are a number of processes accessing the swap device concurrently, it can be a problem. On one of our servers, the disk's remaining size is tight:

$ df -h
Filesystem      Size  Used Avail Use% Mounted on
...
/dev/nvme0n1p1  1.8T  1.3T  504G  72% /home/t4

When creating an 80G swapfile there, there are as many as 84656 swap extents. The end result is that the kernel spends about 30% of its time in map_swap_entry() and swap throughput is only 70MB/s. As a comparison, when I used a smaller swapfile, e.g. a 4G one whose swap_extent count dropped to 2000, swap throughput is back to 400-500MB/s and map_swap_entry() is about 3%.

One downside of using an rbtree for swap_extent is that 'struct rb_node' takes 24 bytes while 'struct list_head' takes 16 bytes, i.e. 8 bytes more per swap_extent. For a swapfile that has 80k swap_extents, that means 625KiB more memory consumed.

Test:

Since it's not possible to reboot that server, I cannot test this patch directly there. Instead, I tested it on another server with an NVMe disk. I created a 20G swapfile on an NVMe-backed XFS fs. By default, the filesystem is quite clean and the created swapfile has only 2 extents. Testing vanilla and this patch shows no obvious performance difference when the swapfile is not fragmented.

To see the patch's effects, I used some tweaks to manually fragment the swapfile by breaking the extent at 1M boundaries. This made the swapfile have 20K extents.

nr_task=4
kernel   swapout(KB/s)    map_swap_entry(perf)   swapin(KB/s)     map_swap_entry(perf)
vanilla  165191           90.77%                 171798           90.21%
patched  858993 (+420%)   2.16%                  715827 (+317%)   0.77%

nr_task=8
kernel   swapout(KB/s)    map_swap_entry(perf)   swapin(KB/s)     map_swap_entry(perf)
vanilla  306783           92.19%                 318145           87.76%
patched  954437 (+211%)   2.35%                  1073741 (+237%)  1.57%

swapout: the throughput of swap out, in KB/s, higher is better
1st map_swap_entry: cpu cycles percent sampled by perf
swapin: the throughput of swap in, in KB/s, higher is better
2nd map_swap_entry: cpu cycles percent sampled by perf

nr_task=1 doesn't show any difference; this is because curr_swap_extent can effectively cache the correct swap extent for a single-task workload.

[akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/]
Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu
Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
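For illustration, here is a minimal userspace sketch of the lookup this patch speeds up. It is not kernel code: it stands in a binary search over a sorted extent array for the kernel rbtree, but the walk has the same shape (and the same O(log n) cost) as offset_to_swap_extent() in the patched mm/swapfile.c below. The names struct extent and lookup_extent are made up for this sketch.

/* Minimal userspace sketch, not kernel code. */
#include <assert.h>
#include <stdio.h>

struct extent {
	unsigned long start_page;	/* first swap page offset covered */
	unsigned long nr_pages;		/* number of contiguous pages */
	unsigned long long start_block;	/* backing device block of start_page */
};

/* O(log n) lookup, same shape as offset_to_swap_extent() in the patch. */
static const struct extent *lookup_extent(const struct extent *ex, int n,
					  unsigned long offset)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (offset < ex[mid].start_page)
			hi = mid - 1;		/* descend left */
		else if (offset >= ex[mid].start_page + ex[mid].nr_pages)
			lo = mid + 1;		/* descend right */
		else
			return &ex[mid];	/* offset falls inside extent */
	}
	return NULL;	/* the kernel would BUG() here: it *must* be present */
}

int main(void)
{
	/* Three fragmented extents, sorted by start_page. */
	const struct extent map[] = {
		{ 0,   256, 1000 },
		{ 256, 128, 5000 },
		{ 384, 512, 9000 },
	};
	unsigned long offset = 300;
	const struct extent *se = lookup_extent(map, 3, offset);

	assert(se);
	/* Same arithmetic as map_swap_entry(): block = start + delta. */
	printf("page %lu -> block %llu\n", offset,
	       se->start_block + (offset - se->start_page));
	return 0;
}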
This commit is contained in:

parent 054f1d1faa
commit 4efaceb1c5

3 changed files with 75 additions and 63 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
@@ -148,7 +148,7 @@ struct zone;
  * We always assume that blocks are of size PAGE_SIZE.
  */
 struct swap_extent {
-	struct list_head list;
+	struct rb_node rb_node;
 	pgoff_t start_page;
 	pgoff_t nr_pages;
 	sector_t start_block;
@@ -248,8 +248,7 @@ struct swap_info_struct {
 	unsigned int cluster_next;	/* likely index for next allocation */
 	unsigned int cluster_nr;	/* countdown to next cluster search */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
-	struct swap_extent *curr_swap_extent;
-	struct swap_extent first_swap_extent;
+	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
diff --git a/mm/page_io.c b/mm/page_io.c
@@ -163,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
 	blocks_per_page = PAGE_SIZE >> blkbits;
 
 	/*
-	 * Map all the blocks into the extent list.  This code doesn't try
+	 * Map all the blocks into the extent tree.  This code doesn't try
 	 * to be very smart.
 	 */
 	probe_block = 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	return ret;
 }
 
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+	struct rb_node *rb = rb_first(&sis->swap_extent_root);
+	return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+	struct rb_node *rb = rb_next(&se->rb_node);
+	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
 /*
  * swapon tell device that all the old swap contents can be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
 	int err = 0;
 
 	/* Do not discard the swap header page! */
-	se = &si->first_swap_extent;
+	se = first_se(si);
 	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
 		cond_resched();
 	}
 
-	list_for_each_entry(se, &si->first_swap_extent.list, list) {
+	for (se = next_se(se); se; se = next_se(se)) {
 		start_block = se->start_block << (PAGE_SHIFT - 9);
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
 	return err;		/* That will often be -EOPNOTSUPP */
 }
 
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+	struct swap_extent *se;
+	struct rb_node *rb;
+
+	rb = sis->swap_extent_root.rb_node;
+	while (rb) {
+		se = rb_entry(rb, struct swap_extent, rb_node);
+		if (offset < se->start_page)
+			rb = rb->rb_left;
+		else if (offset >= se->start_page + se->nr_pages)
+			rb = rb->rb_right;
+		else
+			return se;
+	}
+	/* It *must* be present */
+	BUG();
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
 static void discard_swap_cluster(struct swap_info_struct *si,
 				 pgoff_t start_page, pgoff_t nr_pages)
 {
-	struct swap_extent *se = si->curr_swap_extent;
-	int found_extent = 0;
+	struct swap_extent *se = offset_to_swap_extent(si, start_page);
 
 	while (nr_pages) {
-		if (se->start_page <= start_page &&
-		    start_page < se->start_page + se->nr_pages) {
-			pgoff_t offset = start_page - se->start_page;
-			sector_t start_block = se->start_block + offset;
-			sector_t nr_blocks = se->nr_pages - offset;
-
-			if (nr_blocks > nr_pages)
-				nr_blocks = nr_pages;
-			start_page += nr_blocks;
-			nr_pages -= nr_blocks;
-
-			if (!found_extent++)
-				si->curr_swap_extent = se;
-
-			start_block <<= PAGE_SHIFT - 9;
-			nr_blocks <<= PAGE_SHIFT - 9;
-			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, 0))
-				break;
-		}
+		pgoff_t offset = start_page - se->start_page;
+		sector_t start_block = se->start_block + offset;
+		sector_t nr_blocks = se->nr_pages - offset;
+
+		if (nr_blocks > nr_pages)
+			nr_blocks = nr_pages;
+		start_page += nr_blocks;
+		nr_pages -= nr_blocks;
+
+		start_block <<= PAGE_SHIFT - 9;
+		nr_blocks <<= PAGE_SHIFT - 9;
+		if (blkdev_issue_discard(si->bdev, start_block,
+					nr_blocks, GFP_NOIO, 0))
+			break;
 
-		se = list_next_entry(se, list);
+		se = next_se(se);
 	}
 }
@@ -1755,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 			return type;
 		}
 		if (bdev == sis->bdev) {
-			struct swap_extent *se = &sis->first_swap_extent;
+			struct swap_extent *se = first_se(sis);
 
 			if (se->start_block == offset) {
 				if (bdev_p)
@@ -2232,7 +2257,6 @@ static void drain_mmlist(void)
 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
 	struct swap_info_struct *sis;
-	struct swap_extent *start_se;
 	struct swap_extent *se;
 	pgoff_t offset;
 
@@ -2240,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 	*bdev = sis->bdev;
 
 	offset = swp_offset(entry);
-	start_se = sis->curr_swap_extent;
-	se = start_se;
-
-	for ( ; ; ) {
-		if (se->start_page <= offset &&
-				offset < (se->start_page + se->nr_pages)) {
-			return se->start_block + (offset - se->start_page);
-		}
-		se = list_next_entry(se, list);
-		sis->curr_swap_extent = se;
-		BUG_ON(se == start_se);		/* It *must* be present */
-	}
+	se = offset_to_swap_extent(sis, offset);
+	return se->start_block + (offset - se->start_page);
 }
 
 /*
@@ -2269,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
 {
-	while (!list_empty(&sis->first_swap_extent.list)) {
-		struct swap_extent *se;
+	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+		struct rb_node *rb = sis->swap_extent_root.rb_node;
+		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
 
-		se = list_first_entry(&sis->first_swap_extent.list,
-				struct swap_extent, list);
-		list_del(&se->list);
+		rb_erase(rb, &sis->swap_extent_root);
 		kfree(se);
 	}
 
@@ -2290,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 
 /*
  * Add a block range (and the corresponding page range) into this swapdev's
- * extent list.  The extent list is kept sorted in page order.
+ * extent tree.
  *
  * This function rather assumes that it is called in ascending page order.
  */
@@ -2298,20 +2311,21 @@ int
 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		unsigned long nr_pages, sector_t start_block)
 {
+	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
 	struct swap_extent *se;
 	struct swap_extent *new_se;
-	struct list_head *lh;
 
-	if (start_page == 0) {
-		se = &sis->first_swap_extent;
-		sis->curr_swap_extent = se;
-		se->start_page = 0;
-		se->nr_pages = nr_pages;
-		se->start_block = start_block;
-		return 1;
-	} else {
-		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
-		se = list_entry(lh, struct swap_extent, list);
+	/*
+	 * place the new node at the right most since the
+	 * function is called in ascending page order.
+	 */
+	while (*link) {
+		parent = *link;
+		link = &parent->rb_right;
+	}
+
+	if (parent) {
+		se = rb_entry(parent, struct swap_extent, rb_node);
 		BUG_ON(se->start_page + se->nr_pages != start_page);
 		if (se->start_block + se->nr_pages == start_block) {
 			/* Merge it */
@@ -2320,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		}
 	}
 
-	/*
-	 * No merge.  Insert a new extent, preserving ordering.
-	 */
+	/* No merge, insert a new extent. */
 	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
 	if (new_se == NULL)
 		return -ENOMEM;
@@ -2330,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	new_se->nr_pages = nr_pages;
 	new_se->start_block = start_block;
 
-	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+	rb_link_node(&new_se->rb_node, parent, link);
+	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
 	return 1;
 }
 EXPORT_SYMBOL_GPL(add_swap_extent);
@@ -2846,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 		 * would be relying on p->type to remain valid.
 		 */
 	}
-	INIT_LIST_HEAD(&p->first_swap_extent.list);
+	p->swap_extent_root = RB_ROOT;
 	plist_node_init(&p->list, 0);
 	for_each_node(i)
 		plist_node_init(&p->avail_lists[i], 0);
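The insertion path in add_swap_extent() above relies on extents being created in ascending page order: the rbtree insert only ever descends right children, and a new range that is block-contiguous with the rightmost extent is merged into it instead of allocating a new node. Below is a hedged userspace sketch of that merge-or-append logic, using a plain array in place of the rbtree; extent_map and add_extent are illustrative names, not kernel API.

/* Userspace sketch of add_swap_extent()'s merge-or-append logic. */
#include <stdio.h>
#include <stdlib.h>

struct extent {
	unsigned long start_page, nr_pages;
	unsigned long long start_block;
};

struct extent_map {
	struct extent *ex;
	int nr, cap;
};

/* Append a run; merge into the last extent when block-contiguous. */
static int add_extent(struct extent_map *m, unsigned long start_page,
		      unsigned long nr_pages, unsigned long long start_block)
{
	if (m->nr) {
		struct extent *last = &m->ex[m->nr - 1];

		/* Caller must pass ascending, gap-free page ranges. */
		if (last->start_page + last->nr_pages != start_page)
			return -1;
		if (last->start_block + last->nr_pages == start_block) {
			last->nr_pages += nr_pages;	/* merge */
			return 0;
		}
	}
	if (m->nr == m->cap) {
		int cap = m->cap ? m->cap * 2 : 4;
		struct extent *tmp = realloc(m->ex, cap * sizeof(*tmp));

		if (!tmp)
			return -1;
		m->ex = tmp;
		m->cap = cap;
	}
	m->ex[m->nr++] = (struct extent){ start_page, nr_pages, start_block };
	return 0;
}

int main(void)
{
	struct extent_map m = { 0 };

	add_extent(&m, 0, 16, 100);
	add_extent(&m, 16, 16, 116);	/* contiguous: merges into extent 0 */
	add_extent(&m, 32, 16, 500);	/* discontiguous: new extent */
	printf("%d extents, first covers %lu pages\n", m.nr, m.ex[0].nr_pages);
	free(m.ex);
	return 0;
}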