	memcg: add per cgroup dirty page accounting
When modifying PG_Dirty on cached file pages, update the new
MEM_CGROUP_STAT_DIRTY counter.  This is done in the same places where
global NR_FILE_DIRTY is managed.  The new memcg stat is visible in the
per memcg memory.stat cgroupfs file.  The most recent past attempt at
this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632
The new accounting supports future efforts to add per cgroup dirty
page throttling and writeback.  It also helps an administrator break
down a container's memory usage and provides evidence to understand
memcg oom kills (the new dirty count is included in memcg oom kill
messages).
The ability to move page accounting between memcg
(memory.move_charge_at_immigrate) makes this accounting more
complicated than the global counter.  The existing
mem_cgroup_{begin,end}_page_stat() lock is used to serialize move
accounting with stat updates.
Typical update operation:
	memcg = mem_cgroup_begin_page_stat(page)
	if (!TestSetPageDirty(page)) {
		[...]
		mem_cgroup_update_page_stat(memcg)
	}
	mem_cgroup_end_page_stat(memcg)
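As a concrete instance of this pattern, the __set_page_dirty_buffers()
change in the fs/buffer.c hunk below reduces to roughly the following
(a condensed sketch with the buffer-list walk elided, not a substitute
for the actual diff):
	int __set_page_dirty_buffers(struct page *page)
	{
		int newly_dirty;
		struct mem_cgroup *memcg;
		struct address_space *mapping = page_mapping(page);

		/* ... mark buffers dirty under mapping->private_lock ... */

		/* pin page's memcg so the dirty counter update cannot race
		   with memory.move_charge_at_immigrate charge moving */
		memcg = mem_cgroup_begin_page_stat(page);
		newly_dirty = !TestSetPageDirty(page);
		spin_unlock(&mapping->private_lock);

		if (newly_dirty)	/* bumps MEM_CGROUP_STAT_DIRTY */
			__set_page_dirty(page, mapping, memcg, 1);

		mem_cgroup_end_page_stat(memcg);

		/* __mark_inode_dirty() wants interrupts enabled, so it runs
		   only after the page_stat transaction has ended */
		if (newly_dirty)
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

		return newly_dirty;
	}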
Summary of mem_cgroup_begin_page_stat() overhead:
- Without CONFIG_MEMCG it's a no-op
- With CONFIG_MEMCG and no inter memcg task movement, it's just
  rcu_read_lock()
- With CONFIG_MEMCG and inter memcg task movement, it's
  rcu_read_lock() + spin_lock_irqsave()
A memcg parameter is added to several routines because their callers
now call mem_cgroup_begin_page_stat(), which returns the memcg later
needed by mem_cgroup_update_page_stat().
Because mem_cgroup_begin_page_stat() may disable interrupts, some
adjustments are needed:
- move __mark_inode_dirty() from __set_page_dirty() to its caller.
  __mark_inode_dirty() locking does not want interrupts disabled.
- use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in
  __delete_from_page_cache(), replace_page_cache_page(),
  invalidate_complete_page2(), and __remove_mapping(), as sketched below.
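For illustration, the conversion in delete_from_page_cache() (mm/filemap.c
hunk below) looks like this sketch; mem_cgroup_begin_page_stat() may itself
have disabled interrupts, so the tree_lock section must restore the saved
state rather than unconditionally re-enable interrupts:
	before:
		spin_lock_irq(&mapping->tree_lock);
		__delete_from_page_cache(page, NULL);
		spin_unlock_irq(&mapping->tree_lock);
	after:
		memcg = mem_cgroup_begin_page_stat(page);
		spin_lock_irqsave(&mapping->tree_lock, flags);
		__delete_from_page_cache(page, NULL, memcg);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		mem_cgroup_end_page_stat(memcg);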
   text    data     bss      dec    hex filename
8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before
8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after
                            +192 text bytes
8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before
8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after
                            +773 text bytes
Performance tests run on v4.0-rc1-36-g4f671fe2f952.  Lower is better for
all metrics; they are all wall clock times or cycle counts.  The read and
write fault benchmarks measure only fault time; they do not include I/O time.
* CONFIG_MEMCG not set:
                            baseline                              patched
  kbuild                 1m25.030000(+-0.088% 3 samples)       1m25.426667(+-0.120% 3 samples)
  dd write 100 MiB          0.859211561 +-15.10%                  0.874162885 +-15.03%
  dd write 200 MiB          1.670653105 +-17.87%                  1.669384764 +-11.99%
  dd write 1000 MiB         8.434691190 +-14.15%                  8.474733215 +-14.77%
  read fault cycles       254.0(+-0.000% 10 samples)            253.0(+-0.000% 10 samples)
  write fault cycles     2021.2(+-3.070% 10 samples)           1984.5(+-1.036% 10 samples)
* CONFIG_MEMCG=y root_memcg:
                            baseline                              patched
  kbuild                 1m25.716667(+-0.105% 3 samples)       1m25.686667(+-0.153% 3 samples)
  dd write 100 MiB          0.855650830 +-14.90%                  0.887557919 +-14.90%
  dd write 200 MiB          1.688322953 +-12.72%                  1.667682724 +-13.33%
  dd write 1000 MiB         8.418601605 +-14.30%                  8.673532299 +-15.00%
  read fault cycles       266.0(+-0.000% 10 samples)            266.0(+-0.000% 10 samples)
  write fault cycles     2051.7(+-1.349% 10 samples)           2049.6(+-1.686% 10 samples)
* CONFIG_MEMCG=y non-root_memcg:
                            baseline                              patched
  kbuild                 1m26.120000(+-0.273% 3 samples)       1m25.763333(+-0.127% 3 samples)
  dd write 100 MiB          0.861723964 +-15.25%                  0.818129350 +-14.82%
  dd write 200 MiB          1.669887569 +-13.30%                  1.698645885 +-13.27%
  dd write 1000 MiB         8.383191730 +-14.65%                  8.351742280 +-14.52%
  read fault cycles       265.7(+-0.172% 10 samples)            267.0(+-0.000% 10 samples)
  write fault cycles     2070.6(+-1.512% 10 samples)           2084.4(+-2.148% 10 samples)
As expected, anon page faults are not affected by this patch.
tj: Updated to apply on top of the recent cancel_dirty_page() changes.
Signed-off-by: Sha Zhengju <handai.szj@gmail.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
			
			
commit c4843a7593 (parent 11f81becca)
12 changed files with 156 additions and 39 deletions
Documentation/cgroups/memory.txt
@@ -493,6 +493,7 @@ pgpgin		- # of charging events to the memory cgroup. The charging
 pgpgout		- # of uncharging events to the memory cgroup. The uncharging
 		event happens each time a page is unaccounted from the cgroup.
 swap		- # of bytes of swap usage
+dirty		- # of bytes that are waiting to get written back to the disk.
 writeback	- # of bytes of file/anon cache that are queued for syncing to
 		disk.
 inactive_anon	- # of bytes of anonymous and swap cache memory on inactive
							
								
								
									
fs/buffer.c
@@ -623,21 +623,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
+ *
+ * The caller must hold mem_cgroup_begin_page_stat() lock.
  */
-static void __set_page_dirty(struct page *page,
-		struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+			     struct mem_cgroup *memcg, int warn)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
+		account_page_dirtied(page, mapping, memcg);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
 
 /*
@@ -668,6 +669,7 @@ static void __set_page_dirty(struct page *page,
 int __set_page_dirty_buffers(struct page *page)
 {
 	int newly_dirty;
+	struct mem_cgroup *memcg;
 	struct address_space *mapping = page_mapping(page);
 
 	if (unlikely(!mapping))
@@ -683,11 +685,22 @@ int __set_page_dirty_buffers(struct page *page)
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
 	if (newly_dirty)
-		__set_page_dirty(page, mapping, 1);
+		__set_page_dirty(page, mapping, memcg, 1);
+
+	mem_cgroup_end_page_stat(memcg);
+
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
 	return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1158,11 +1171,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
 
 	if (!test_set_buffer_dirty(bh)) {
 		struct page *page = bh->b_page;
+		struct address_space *mapping = NULL;
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_begin_page_stat(page);
 		if (!TestSetPageDirty(page)) {
-			struct address_space *mapping = page_mapping(page);
+			mapping = page_mapping(page);
 			if (mapping)
-				__set_page_dirty(page, mapping, 0);
+				__set_page_dirty(page, mapping, memcg, 0);
 		}
+		mem_cgroup_end_page_stat(memcg);
+		if (mapping)
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 }
 EXPORT_SYMBOL(mark_buffer_dirty);
fs/xfs/xfs_aops.c
@@ -1873,6 +1873,7 @@ xfs_vm_set_page_dirty(
 	loff_t			end_offset;
 	loff_t			offset;
 	int			newly_dirty;
+	struct mem_cgroup	*memcg;
 
 	if (unlikely(!mapping))
 		return !TestSetPageDirty(page);
@@ -1892,6 +1893,11 @@ xfs_vm_set_page_dirty(
 			offset += 1 << inode->i_blkbits;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
@@ -1902,13 +1908,15 @@ xfs_vm_set_page_dirty(
 		spin_lock_irqsave(&mapping->tree_lock, flags);
 		if (page->mapping) {	/* Race with truncate? */
 			WARN_ON_ONCE(!PageUptodate(page));
-			account_page_dirtied(page, mapping);
+			account_page_dirtied(page, mapping, memcg);
 			radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
+	mem_cgroup_end_page_stat(memcg);
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	return newly_dirty;
 }
 
include/linux/memcontrol.h
@@ -41,6 +41,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
 	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
 	MEM_CGROUP_STAT_WRITEBACK,	/* # of pages under writeback */
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
include/linux/mm.h
@@ -1211,8 +1211,10 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_cleaned(struct page *page, struct address_space *mapping);
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg);
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 void cancel_dirty_page(struct page *page);
include/linux/pagemap.h
@@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern void __delete_from_page_cache(struct page *page, void *shadow,
+				     struct mem_cgroup *memcg);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
							
								
								
									
mm/filemap.c
@@ -100,6 +100,7 @@
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    ->memcg->move_lock	(page_remove_rmap->mem_cgroup_begin_page_stat)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
@@ -174,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
  */
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+			      struct mem_cgroup *memcg)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -210,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 	 * anyway will be cleared before returning page into buddy allocator.
 	 */
 	if (WARN_ON_ONCE(PageDirty(page)))
-		account_page_cleaned(page, mapping);
+		account_page_cleaned(page, mapping, memcg);
 }
 
 /**
@@ -224,14 +227,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
 	void (*freepage)(struct page *);
 
 	BUG_ON(!PageLocked(page));
 
 	freepage = mapping->a_ops->freepage;
-	spin_lock_irq(&mapping->tree_lock);
-	__delete_from_page_cache(page, NULL);
-	spin_unlock_irq(&mapping->tree_lock);
+
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	__delete_from_page_cache(page, NULL, memcg);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 
 	if (freepage)
 		freepage(page);
@@ -470,6 +479,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	if (!error) {
 		struct address_space *mapping = old->mapping;
 		void (*freepage)(struct page *);
+		struct mem_cgroup *memcg;
+		unsigned long flags;
 
 		pgoff_t offset = old->index;
 		freepage = mapping->a_ops->freepage;
@@ -478,15 +489,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		new->mapping = mapping;
 		new->index = offset;
 
-		spin_lock_irq(&mapping->tree_lock);
-		__delete_from_page_cache(old, NULL);
+		memcg = mem_cgroup_begin_page_stat(old);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		__delete_from_page_cache(old, NULL, memcg);
 		error = radix_tree_insert(&mapping->page_tree, offset, new);
 		BUG_ON(error);
 		mapping->nrpages++;
 		__inc_zone_page_state(new, NR_FILE_PAGES);
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		mem_cgroup_migrate(old, new, true);
 		radix_tree_preload_end();
 		if (freepage)
mm/memcontrol.c
@@ -90,6 +90,7 @@ static const char * const mem_cgroup_stat_names[] = {
 	"rss",
 	"rss_huge",
 	"mapped_file",
+	"dirty",
 	"writeback",
 	"swap",
 };
@@ -2011,6 +2012,7 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
 
 	return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2029,6 +2031,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
 /**
  * mem_cgroup_update_page_stat - update page state statistics
@@ -4746,6 +4749,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
 	unsigned long flags;
 	int ret;
+	bool anon;
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4771,15 +4775,33 @@ static int mem_cgroup_move_account(struct page *page,
 	if (page->mem_cgroup != from)
 		goto out_unlock;
 
+	anon = PageAnon(page);
+
 	spin_lock_irqsave(&from->move_lock, flags);
 
-	if (!PageAnon(page) && page_mapped(page)) {
+	if (!anon && page_mapped(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 	}
 
+	/*
+	 * move_lock grabbed above and caller set from->moving_account, so
+	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+	 * So mapping should be stable for dirty pages.
+	 */
+	if (!anon && PageDirty(page)) {
+		struct address_space *mapping = page_mapping(page);
+
+		if (mapping_cap_account_dirty(mapping)) {
+			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+		}
+	}
+
 	if (PageWriteback(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
 			       nr_pages);
mm/page-writeback.c
@@ -2090,15 +2090,20 @@ int __set_page_dirty_no_writeback(struct page *page)
 
 /*
  * Helper function for set_page_dirty family.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
+ *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg)
 {
 	trace_writeback_dirty_page(page, mapping);
 
 	if (mapping_cap_account_dirty(mapping)) {
 		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 
+		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);
 		__inc_bdi_stat(bdi, BDI_RECLAIMABLE);
@@ -2112,10 +2117,14 @@ EXPORT_SYMBOL(account_page_dirtied);
 
 /*
  * Helper function for deaccounting dirty page without writeback.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
  */
-void account_page_cleaned(struct page *page, struct address_space *mapping)
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg)
 {
 	if (mapping_cap_account_dirty(mapping)) {
+		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
 		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2136,26 +2145,34 @@ void account_page_cleaned(struct page *page, struct address_space *mapping)
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_begin_page_stat(page);
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 		unsigned long flags;
 
-		if (!mapping)
+		if (!mapping) {
+			mem_cgroup_end_page_stat(memcg);
 			return 1;
+		}
 
 		spin_lock_irqsave(&mapping->tree_lock, flags);
 		BUG_ON(page_mapping(page) != mapping);
 		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
+		account_page_dirtied(page, mapping, memcg);
 		radix_tree_tag_set(&mapping->page_tree, page_index(page),
 				   PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
+
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		}
 		return 1;
 	}
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2273,8 +2290,20 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  */
 void cancel_dirty_page(struct page *page)
 {
-	if (TestClearPageDirty(page))
-		account_page_cleaned(page, page_mapping(page));
+	struct address_space *mapping = page_mapping(page);
+
+	if (mapping_cap_account_dirty(mapping)) {
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_begin_page_stat(page);
+
+		if (TestClearPageDirty(page))
+			account_page_cleaned(page, mapping, memcg);
+
+		mem_cgroup_end_page_stat(memcg);
+	} else {
+		ClearPageDirty(page);
+	}
 }
 EXPORT_SYMBOL(cancel_dirty_page);
 
@@ -2295,6 +2324,8 @@ EXPORT_SYMBOL(cancel_dirty_page);
 int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *memcg;
+	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
 
@@ -2334,13 +2365,16 @@ int clear_page_dirty_for_io(struct page *page)
 		 * always locked coming in here, so we get the desired
 		 * exclusion.
 		 */
+		memcg = mem_cgroup_begin_page_stat(page);
 		if (TestClearPageDirty(page)) {
+			mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(inode_to_bdi(mapping->host),
 					BDI_RECLAIMABLE);
-			return 1;
+			ret = 1;
 		}
-		return 0;
+		mem_cgroup_end_page_stat(memcg);
+		return ret;
 	}
 	return TestClearPageDirty(page);
 }
mm/rmap.c
@@ -30,6 +30,8 @@
  *             swap_lock (in swap_duplicate, swap_info_get)
  *               mmlist_lock (in mmput, drain_mmlist and others)
  *               mapping->private_lock (in __set_page_dirty_buffers)
+ *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                   mapping->tree_lock (widely used)
  *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                 sb_lock (within inode_lock in fs/fs-writeback.c)
mm/truncate.c
@@ -510,19 +510,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
 static int
 invalidate_complete_page2(struct address_space *mapping, struct page *page)
 {
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
 	if (page->mapping != mapping)
 		return 0;
 
 	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (PageDirty(page))
 		goto failed;
 
 	BUG_ON(page_has_private(page));
-	__delete_from_page_cache(page, NULL);
-	spin_unlock_irq(&mapping->tree_lock);
+	__delete_from_page_cache(page, NULL, memcg);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 
 	if (mapping->a_ops->freepage)
 		mapping->a_ops->freepage(page);
@@ -530,7 +535,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 failed:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
							
								
								
									
mm/vmscan.c
@@ -579,10 +579,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
+	unsigned long flags;
+	struct mem_cgroup *memcg;
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -620,7 +624,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
@@ -640,8 +645,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		if (reclaimed && page_is_file_cache(page) &&
 		    !mapping_exiting(mapping))
 			shadow = workingset_eviction(mapping, page);
-		__delete_from_page_cache(page, shadow);
-		spin_unlock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(page, shadow, memcg);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -650,7 +656,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 