	mempolicy: alloc_pages_mpol() for NUMA policy without vma
Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
allocation.  alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
principal actor for passing mempolicy choice down to __alloc_pages(),
rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).
vma_alloc_folio() and alloc_pages() remain, but as wrappers around
alloc_pages_mpol().  alloc_pages_bulk_*() is untouched, except to provide
the additional args to policy_nodemask(), which subsumes policy_node().
Cleanup throughout, cutting out some unhelpful "helpers".
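
To make the new layering concrete, here is a simplified sketch of the two
wrappers, condensed from the diff below (comments and blank lines trimmed):

	struct folio *vma_alloc_folio(gfp_t gfp, int order,
			struct vm_area_struct *vma, unsigned long addr, bool hugepage)
	{
		struct mempolicy *pol;
		pgoff_t ilx;
		struct page *page;

		pol = get_vma_policy(vma, addr, order, &ilx);
		page = alloc_pages_mpol(gfp | __GFP_COMP, order,
					pol, ilx, numa_node_id());
		mpol_cond_put(pol);
		return page_rmappable_folio(page);
	}

	struct page *alloc_pages(gfp_t gfp, unsigned int order)
	{
		struct mempolicy *pol = &default_policy;

		/* no refcount needed for current->mempolicy or default_policy */
		if (!in_interrupt() && !(gfp & __GFP_THISNODE))
			pol = get_task_policy(current);
		return alloc_pages_mpol(gfp, order,
					pol, NO_INTERLEAVE_INDEX, numa_node_id());
	}
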
It would all be much simpler without MPOL_INTERLEAVE, but that adds a
dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8b
("tmpfs: distribute interleave better across nodes"), which added ino bias
to the interleave, hidden from mm/mempolicy.c until this commit.
Hence "ilx" throughout, the "interleave index".  Originally I thought it
could be done just with nid, but that's wrong: the nodemask may come from
the shared policy layer below a shmem vma, or it may come from the task
layer above a shmem vma; and until the final nodemask is known, the node
id cannot be decided.  And how ilx is applied also depends on page order.
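
For example, get_vma_policy() in the diff below first resolves the policy
(the shared policy layer may already have set *ilx, e.g. shmem's ino bias),
and only then folds in the vma offset, scaled by the page order - a
simplified excerpt:

	pol = __get_vma_policy(vma, addr, ilx);
	if (!pol)
		pol = get_task_policy(current);
	if (pol->mode == MPOL_INTERLEAVE) {
		*ilx += vma->vm_pgoff >> order;
		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
	}
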
The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
passed down from vma-less alloc_pages() is also used as a hint not to use
THP-style hugepage allocation - to avoid the overhead of a hugepage arg
(though I don't understand why we never just added a GFP bit for THP - if
it actually needs a different allocation strategy from other pages of the
same order).  vma_alloc_folio() still carries its hugepage arg here, but
it is not used, and should be removed when agreed.
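
In alloc_pages_mpol(), that hint reduces to one extra condition guarding
the THP-style local-node-first attempt (excerpt from the diff below):

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    /* filter "hugepage" allocation, unless from alloc_pages() */
	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
		/* try __GFP_THISNODE | __GFP_NORETRY on the preferred node first */
		...
	}
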
get_vma_policy() no longer allows a NULL vma: over time I believe we've
eradicated all the places which used to need it: e.g. swapoff and madvise
used to pass a NULL vma to read_swap_cache_async(), but now know the vma.
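
The simplified __get_vma_policy() in the diff below dereferences the vma
unconditionally, so every caller must now supply one:

	struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
					   unsigned long addr, pgoff_t *ilx)
	{
		*ilx = 0;
		return (vma->vm_ops && vma->vm_ops->get_policy) ?
			vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
	}
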
[hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()]
  Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com
Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
			
			
parent 23e4883248
commit ddc1a5cbc0

 10 changed files with 309 additions and 324 deletions
		|  | @ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v) | ||||||
| 	struct numa_maps *md = &numa_priv->md; | 	struct numa_maps *md = &numa_priv->md; | ||||||
| 	struct file *file = vma->vm_file; | 	struct file *file = vma->vm_file; | ||||||
| 	struct mm_struct *mm = vma->vm_mm; | 	struct mm_struct *mm = vma->vm_mm; | ||||||
| 	struct mempolicy *pol; |  | ||||||
| 	char buffer[64]; | 	char buffer[64]; | ||||||
|  | 	struct mempolicy *pol; | ||||||
|  | 	pgoff_t ilx; | ||||||
| 	int nid; | 	int nid; | ||||||
| 
 | 
 | ||||||
| 	if (!mm) | 	if (!mm) | ||||||
|  | @ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v) | ||||||
| 	/* Ensure we start with an empty set of numa_maps statistics. */ | 	/* Ensure we start with an empty set of numa_maps statistics. */ | ||||||
| 	memset(md, 0, sizeof(*md)); | 	memset(md, 0, sizeof(*md)); | ||||||
| 
 | 
 | ||||||
| 	pol = __get_vma_policy(vma, vma->vm_start); | 	pol = __get_vma_policy(vma, vma->vm_start, &ilx); | ||||||
| 	if (pol) { | 	if (pol) { | ||||||
| 		mpol_to_str(buffer, sizeof(buffer), pol); | 		mpol_to_str(buffer, sizeof(buffer), pol); | ||||||
| 		mpol_cond_put(pol); | 		mpol_cond_put(pol); | ||||||
|  |  | ||||||
|  | @ -8,6 +8,7 @@ | ||||||
| #include <linux/topology.h> | #include <linux/topology.h> | ||||||
| 
 | 
 | ||||||
| struct vm_area_struct; | struct vm_area_struct; | ||||||
|  | struct mempolicy; | ||||||
| 
 | 
 | ||||||
| /* Convert GFP flags to their corresponding migrate type */ | /* Convert GFP flags to their corresponding migrate type */ | ||||||
| #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) | #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) | ||||||
|  | @ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_NUMA | #ifdef CONFIG_NUMA | ||||||
| struct page *alloc_pages(gfp_t gfp, unsigned int order); | struct page *alloc_pages(gfp_t gfp, unsigned int order); | ||||||
| struct folio *folio_alloc(gfp_t gfp, unsigned order); | struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, | ||||||
|  | 		struct mempolicy *mpol, pgoff_t ilx, int nid); | ||||||
|  | struct folio *folio_alloc(gfp_t gfp, unsigned int order); | ||||||
| struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, | struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, | ||||||
| 		unsigned long addr, bool hugepage); | 		unsigned long addr, bool hugepage); | ||||||
| #else | #else | ||||||
|  | @ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) | ||||||
| { | { | ||||||
| 	return alloc_pages_node(numa_node_id(), gfp_mask, order); | 	return alloc_pages_node(numa_node_id(), gfp_mask, order); | ||||||
| } | } | ||||||
|  | static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, | ||||||
|  | 		struct mempolicy *mpol, pgoff_t ilx, int nid) | ||||||
|  | { | ||||||
|  | 	return alloc_pages(gfp, order); | ||||||
|  | } | ||||||
| static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) | static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) | ||||||
| { | { | ||||||
| 	return __folio_alloc_node(gfp, order, numa_node_id()); | 	return __folio_alloc_node(gfp, order, numa_node_id()); | ||||||
|  |  | ||||||
|  | @ -17,6 +17,8 @@ | ||||||
| 
 | 
 | ||||||
| struct mm_struct; | struct mm_struct; | ||||||
| 
 | 
 | ||||||
|  | #define NO_INTERLEAVE_INDEX (-1UL)	/* use task il_prev for interleaving */ | ||||||
|  | 
 | ||||||
| #ifdef CONFIG_NUMA | #ifdef CONFIG_NUMA | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | @ -126,7 +128,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, | ||||||
| 
 | 
 | ||||||
| struct mempolicy *get_task_policy(struct task_struct *p); | struct mempolicy *get_task_policy(struct task_struct *p); | ||||||
| struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, | struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, | ||||||
| 		unsigned long addr); | 		unsigned long addr, pgoff_t *ilx); | ||||||
|  | struct mempolicy *get_vma_policy(struct vm_area_struct *vma, | ||||||
|  | 		unsigned long addr, int order, pgoff_t *ilx); | ||||||
| bool vma_policy_mof(struct vm_area_struct *vma); | bool vma_policy_mof(struct vm_area_struct *vma); | ||||||
| 
 | 
 | ||||||
| extern void numa_default_policy(void); | extern void numa_default_policy(void); | ||||||
|  | @ -140,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma, | ||||||
| extern bool init_nodemask_of_mempolicy(nodemask_t *mask); | extern bool init_nodemask_of_mempolicy(nodemask_t *mask); | ||||||
| extern bool mempolicy_in_oom_domain(struct task_struct *tsk, | extern bool mempolicy_in_oom_domain(struct task_struct *tsk, | ||||||
| 				const nodemask_t *mask); | 				const nodemask_t *mask); | ||||||
| extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); |  | ||||||
| 
 |  | ||||||
| extern unsigned int mempolicy_slab_node(void); | extern unsigned int mempolicy_slab_node(void); | ||||||
| 
 | 
 | ||||||
| extern enum zone_type policy_zone; | extern enum zone_type policy_zone; | ||||||
|  | @ -179,6 +181,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); | ||||||
| 
 | 
 | ||||||
| struct mempolicy {}; | struct mempolicy {}; | ||||||
| 
 | 
 | ||||||
|  | static inline struct mempolicy *get_task_policy(struct task_struct *p) | ||||||
|  | { | ||||||
|  | 	return NULL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) | static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) | ||||||
| { | { | ||||||
| 	return true; | 	return true; | ||||||
|  | @ -213,6 +220,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) | ||||||
| 	return NULL; | 	return NULL; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, | ||||||
|  | 				unsigned long addr, int order, pgoff_t *ilx) | ||||||
|  | { | ||||||
|  | 	*ilx = 0; | ||||||
|  | 	return NULL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static inline int | static inline int | ||||||
| vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) | vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) | ||||||
| { | { | ||||||
|  |  | ||||||
|  | @ -619,7 +619,7 @@ struct vm_operations_struct { | ||||||
| 	 * policy. | 	 * policy. | ||||||
| 	 */ | 	 */ | ||||||
| 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma, | 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma, | ||||||
| 					unsigned long addr); | 					unsigned long addr, pgoff_t *ilx); | ||||||
| #endif | #endif | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Called by vm_normal_page() for special PTEs to find the | 	 * Called by vm_normal_page() for special PTEs to find the | ||||||
|  |  | ||||||
							
								
								
									
 ipc/shm.c | 21

|  | @ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_NUMA | #ifdef CONFIG_NUMA | ||||||
| static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) | ||||||
| { | { | ||||||
| 	struct file *file = vma->vm_file; | 	struct shm_file_data *sfd = shm_file_data(vma->vm_file); | ||||||
| 	struct shm_file_data *sfd = shm_file_data(file); |  | ||||||
| 	int err = 0; | 	int err = 0; | ||||||
| 
 | 
 | ||||||
| 	if (sfd->vm_ops->set_policy) | 	if (sfd->vm_ops->set_policy) | ||||||
| 		err = sfd->vm_ops->set_policy(vma, new); | 		err = sfd->vm_ops->set_policy(vma, mpol); | ||||||
| 	return err; | 	return err; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, | static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, | ||||||
| 					unsigned long addr) | 					unsigned long addr, pgoff_t *ilx) | ||||||
| { | { | ||||||
| 	struct file *file = vma->vm_file; | 	struct shm_file_data *sfd = shm_file_data(vma->vm_file); | ||||||
| 	struct shm_file_data *sfd = shm_file_data(file); | 	struct mempolicy *mpol = vma->vm_policy; | ||||||
| 	struct mempolicy *pol = NULL; |  | ||||||
| 
 | 
 | ||||||
| 	if (sfd->vm_ops->get_policy) | 	if (sfd->vm_ops->get_policy) | ||||||
| 		pol = sfd->vm_ops->get_policy(vma, addr); | 		mpol = sfd->vm_ops->get_policy(vma, addr, ilx); | ||||||
| 	else if (vma->vm_policy) | 	return mpol; | ||||||
| 		pol = vma->vm_policy; |  | ||||||
| 
 |  | ||||||
| 	return pol; |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
 mm/mempolicy.c | 381

|  | @ -898,6 +898,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (flags & MPOL_F_ADDR) { | 	if (flags & MPOL_F_ADDR) { | ||||||
|  | 		pgoff_t ilx;		/* ignored here */ | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * Do NOT fall back to task policy if the | 		 * Do NOT fall back to task policy if the | ||||||
| 		 * vma/shared policy at addr is NULL.  We | 		 * vma/shared policy at addr is NULL.  We | ||||||
|  | @ -909,10 +910,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | ||||||
| 			mmap_read_unlock(mm); | 			mmap_read_unlock(mm); | ||||||
| 			return -EFAULT; | 			return -EFAULT; | ||||||
| 		} | 		} | ||||||
| 		if (vma->vm_ops && vma->vm_ops->get_policy) | 		pol = __get_vma_policy(vma, addr, &ilx); | ||||||
| 			pol = vma->vm_ops->get_policy(vma, addr); |  | ||||||
| 		else |  | ||||||
| 			pol = vma->vm_policy; |  | ||||||
| 	} else if (addr) | 	} else if (addr) | ||||||
| 		return -EINVAL; | 		return -EINVAL; | ||||||
| 
 | 
 | ||||||
|  | @ -1170,6 +1168,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start) | ||||||
| 			break; | 			break; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL | ||||||
|  | 	 * when the page can no longer be located in a vma: that is not ideal | ||||||
|  | 	 * (migrate_pages() will give up early, presuming ENOMEM), but good | ||||||
|  | 	 * enough to avoid a crash by syzkaller or concurrent holepunch. | ||||||
|  | 	 */ | ||||||
|  | 	if (!vma) | ||||||
|  | 		return NULL; | ||||||
|  | 
 | ||||||
| 	if (folio_test_hugetlb(src)) { | 	if (folio_test_hugetlb(src)) { | ||||||
| 		return alloc_hugetlb_folio_vma(folio_hstate(src), | 		return alloc_hugetlb_folio_vma(folio_hstate(src), | ||||||
| 				vma, address); | 				vma, address); | ||||||
|  | @ -1178,9 +1185,6 @@ static struct folio *new_folio(struct folio *src, unsigned long start) | ||||||
| 	if (folio_test_large(src)) | 	if (folio_test_large(src)) | ||||||
| 		gfp = GFP_TRANSHUGE; | 		gfp = GFP_TRANSHUGE; | ||||||
| 
 | 
 | ||||||
| 	/*
 |  | ||||||
| 	 * if !vma, vma_alloc_folio() will use task or system default policy |  | ||||||
| 	 */ |  | ||||||
| 	return vma_alloc_folio(gfp, folio_order(src), vma, address, | 	return vma_alloc_folio(gfp, folio_order(src), vma, address, | ||||||
| 			folio_test_large(src)); | 			folio_test_large(src)); | ||||||
| } | } | ||||||
|  | @ -1690,34 +1694,19 @@ bool vma_migratable(struct vm_area_struct *vma) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, | struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, | ||||||
| 						unsigned long addr) | 				   unsigned long addr, pgoff_t *ilx) | ||||||
| { | { | ||||||
| 	struct mempolicy *pol = NULL; | 	*ilx = 0; | ||||||
| 
 | 	return (vma->vm_ops && vma->vm_ops->get_policy) ? | ||||||
| 	if (vma) { | 		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; | ||||||
| 		if (vma->vm_ops && vma->vm_ops->get_policy) { |  | ||||||
| 			pol = vma->vm_ops->get_policy(vma, addr); |  | ||||||
| 		} else if (vma->vm_policy) { |  | ||||||
| 			pol = vma->vm_policy; |  | ||||||
| 
 |  | ||||||
| 			/*
 |  | ||||||
| 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with |  | ||||||
| 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference |  | ||||||
| 			 * count on these policies which will be dropped by |  | ||||||
| 			 * mpol_cond_put() later |  | ||||||
| 			 */ |  | ||||||
| 			if (mpol_needs_cond_ref(pol)) |  | ||||||
| 				mpol_get(pol); |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	return pol; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * get_vma_policy(@vma, @addr) |  * get_vma_policy(@vma, @addr, @order, @ilx) | ||||||
|  * @vma: virtual memory area whose policy is sought |  * @vma: virtual memory area whose policy is sought | ||||||
|  * @addr: address in @vma for shared policy lookup |  * @addr: address in @vma for shared policy lookup | ||||||
|  |  * @order: 0, or appropriate huge_page_order for interleaving | ||||||
|  |  * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE | ||||||
|  * |  * | ||||||
|  * Returns effective policy for a VMA at specified address. |  * Returns effective policy for a VMA at specified address. | ||||||
|  * Falls back to current->mempolicy or system default policy, as necessary. |  * Falls back to current->mempolicy or system default policy, as necessary. | ||||||
|  | @ -1726,14 +1715,18 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, | ||||||
|  * freeing by another task.  It is the caller's responsibility to free the |  * freeing by another task.  It is the caller's responsibility to free the | ||||||
|  * extra reference for shared policies. |  * extra reference for shared policies. | ||||||
|  */ |  */ | ||||||
| static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, | struct mempolicy *get_vma_policy(struct vm_area_struct *vma, | ||||||
| 						unsigned long addr) | 				 unsigned long addr, int order, pgoff_t *ilx) | ||||||
| { | { | ||||||
| 	struct mempolicy *pol = __get_vma_policy(vma, addr); | 	struct mempolicy *pol; | ||||||
| 
 | 
 | ||||||
|  | 	pol = __get_vma_policy(vma, addr, ilx); | ||||||
| 	if (!pol) | 	if (!pol) | ||||||
| 		pol = get_task_policy(current); | 		pol = get_task_policy(current); | ||||||
| 
 | 	if (pol->mode == MPOL_INTERLEAVE) { | ||||||
|  | 		*ilx += vma->vm_pgoff >> order; | ||||||
|  | 		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); | ||||||
|  | 	} | ||||||
| 	return pol; | 	return pol; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -1743,8 +1736,9 @@ bool vma_policy_mof(struct vm_area_struct *vma) | ||||||
| 
 | 
 | ||||||
| 	if (vma->vm_ops && vma->vm_ops->get_policy) { | 	if (vma->vm_ops && vma->vm_ops->get_policy) { | ||||||
| 		bool ret = false; | 		bool ret = false; | ||||||
|  | 		pgoff_t ilx;		/* ignored here */ | ||||||
| 
 | 
 | ||||||
| 		pol = vma->vm_ops->get_policy(vma, vma->vm_start); | 		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); | ||||||
| 		if (pol && (pol->flags & MPOL_F_MOF)) | 		if (pol && (pol->flags & MPOL_F_MOF)) | ||||||
| 			ret = true; | 			ret = true; | ||||||
| 		mpol_cond_put(pol); | 		mpol_cond_put(pol); | ||||||
|  | @ -1779,54 +1773,6 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) | ||||||
| 	return zone >= dynamic_policy_zone; | 	return zone >= dynamic_policy_zone; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 |  | ||||||
|  * Return a nodemask representing a mempolicy for filtering nodes for |  | ||||||
|  * page allocation |  | ||||||
|  */ |  | ||||||
| nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) |  | ||||||
| { |  | ||||||
| 	int mode = policy->mode; |  | ||||||
| 
 |  | ||||||
| 	/* Lower zones don't get a nodemask applied for MPOL_BIND */ |  | ||||||
| 	if (unlikely(mode == MPOL_BIND) && |  | ||||||
| 		apply_policy_zone(policy, gfp_zone(gfp)) && |  | ||||||
| 		cpuset_nodemask_valid_mems_allowed(&policy->nodes)) |  | ||||||
| 		return &policy->nodes; |  | ||||||
| 
 |  | ||||||
| 	if (mode == MPOL_PREFERRED_MANY) |  | ||||||
| 		return &policy->nodes; |  | ||||||
| 
 |  | ||||||
| 	return NULL; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*
 |  | ||||||
|  * Return the  preferred node id for 'prefer' mempolicy, and return |  | ||||||
|  * the given id for all other policies. |  | ||||||
|  * |  | ||||||
|  * policy_node() is always coupled with policy_nodemask(), which |  | ||||||
|  * secures the nodemask limit for 'bind' and 'prefer-many' policy. |  | ||||||
|  */ |  | ||||||
| static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) |  | ||||||
| { |  | ||||||
| 	if (policy->mode == MPOL_PREFERRED) { |  | ||||||
| 		nid = first_node(policy->nodes); |  | ||||||
| 	} else { |  | ||||||
| 		/*
 |  | ||||||
| 		 * __GFP_THISNODE shouldn't even be used with the bind policy |  | ||||||
| 		 * because we might easily break the expectation to stay on the |  | ||||||
| 		 * requested node and not break the policy. |  | ||||||
| 		 */ |  | ||||||
| 		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if ((policy->mode == MPOL_BIND || |  | ||||||
| 	     policy->mode == MPOL_PREFERRED_MANY) && |  | ||||||
| 	    policy->home_node != NUMA_NO_NODE) |  | ||||||
| 		return policy->home_node; |  | ||||||
| 
 |  | ||||||
| 	return nid; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /* Do dynamic interleaving for a process */ | /* Do dynamic interleaving for a process */ | ||||||
| static unsigned int interleave_nodes(struct mempolicy *policy) | static unsigned int interleave_nodes(struct mempolicy *policy) | ||||||
| { | { | ||||||
|  | @ -1886,11 +1832,11 @@ unsigned int mempolicy_slab_node(void) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Do static interleaving for a VMA with known offset @n.  Returns the n'th |  * Do static interleaving for interleave index @ilx.  Returns the ilx'th | ||||||
|  * node in pol->nodes (starting from n=0), wrapping around if n exceeds the |  * node in pol->nodes (starting from ilx=0), wrapping around if ilx | ||||||
|  * number of present nodes. |  * exceeds the number of present nodes. | ||||||
|  */ |  */ | ||||||
| static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) | static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) | ||||||
| { | { | ||||||
| 	nodemask_t nodemask = pol->nodes; | 	nodemask_t nodemask = pol->nodes; | ||||||
| 	unsigned int target, nnodes; | 	unsigned int target, nnodes; | ||||||
|  | @ -1908,33 +1854,54 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) | ||||||
| 	nnodes = nodes_weight(nodemask); | 	nnodes = nodes_weight(nodemask); | ||||||
| 	if (!nnodes) | 	if (!nnodes) | ||||||
| 		return numa_node_id(); | 		return numa_node_id(); | ||||||
| 	target = (unsigned int)n % nnodes; | 	target = ilx % nnodes; | ||||||
| 	nid = first_node(nodemask); | 	nid = first_node(nodemask); | ||||||
| 	for (i = 0; i < target; i++) | 	for (i = 0; i < target; i++) | ||||||
| 		nid = next_node(nid, nodemask); | 		nid = next_node(nid, nodemask); | ||||||
| 	return nid; | 	return nid; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Determine a node number for interleave */ | /*
 | ||||||
| static inline unsigned interleave_nid(struct mempolicy *pol, |  * Return a nodemask representing a mempolicy for filtering nodes for | ||||||
| 		 struct vm_area_struct *vma, unsigned long addr, int shift) |  * page allocation, together with preferred node id (or the input node id). | ||||||
|  |  */ | ||||||
|  | static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, | ||||||
|  | 				   pgoff_t ilx, int *nid) | ||||||
| { | { | ||||||
| 	if (vma) { | 	nodemask_t *nodemask = NULL; | ||||||
| 		unsigned long off; |  | ||||||
| 
 | 
 | ||||||
|  | 	switch (pol->mode) { | ||||||
|  | 	case MPOL_PREFERRED: | ||||||
|  | 		/* Override input node id */ | ||||||
|  | 		*nid = first_node(pol->nodes); | ||||||
|  | 		break; | ||||||
|  | 	case MPOL_PREFERRED_MANY: | ||||||
|  | 		nodemask = &pol->nodes; | ||||||
|  | 		if (pol->home_node != NUMA_NO_NODE) | ||||||
|  | 			*nid = pol->home_node; | ||||||
|  | 		break; | ||||||
|  | 	case MPOL_BIND: | ||||||
|  | 		/* Restrict to nodemask (but not on lower zones) */ | ||||||
|  | 		if (apply_policy_zone(pol, gfp_zone(gfp)) && | ||||||
|  | 		    cpuset_nodemask_valid_mems_allowed(&pol->nodes)) | ||||||
|  | 			nodemask = &pol->nodes; | ||||||
|  | 		if (pol->home_node != NUMA_NO_NODE) | ||||||
|  | 			*nid = pol->home_node; | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * for small pages, there is no difference between | 		 * __GFP_THISNODE shouldn't even be used with the bind policy | ||||||
| 		 * shift and PAGE_SHIFT, so the bit-shift is safe. | 		 * because we might easily break the expectation to stay on the | ||||||
| 		 * for huge pages, since vm_pgoff is in units of small | 		 * requested node and not break the policy. | ||||||
| 		 * pages, we need to shift off the always 0 bits to get |  | ||||||
| 		 * a useful offset. |  | ||||||
| 		 */ | 		 */ | ||||||
| 		BUG_ON(shift < PAGE_SHIFT); | 		WARN_ON_ONCE(gfp & __GFP_THISNODE); | ||||||
| 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT); | 		break; | ||||||
| 		off += (addr - vma->vm_start) >> shift; | 	case MPOL_INTERLEAVE: | ||||||
| 		return offset_il_node(pol, off); | 		/* Override input node id */ | ||||||
| 	} else | 		*nid = (ilx == NO_INTERLEAVE_INDEX) ? | ||||||
| 		return interleave_nodes(pol); | 			interleave_nodes(pol) : interleave_nid(pol, ilx); | ||||||
|  | 		break; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return nodemask; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_HUGETLBFS | #ifdef CONFIG_HUGETLBFS | ||||||
|  | @ -1950,27 +1917,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | ||||||
|  * to the struct mempolicy for conditional unref after allocation. |  * to the struct mempolicy for conditional unref after allocation. | ||||||
|  * If the effective policy is 'bind' or 'prefer-many', returns a pointer |  * If the effective policy is 'bind' or 'prefer-many', returns a pointer | ||||||
|  * to the mempolicy's @nodemask for filtering the zonelist. |  * to the mempolicy's @nodemask for filtering the zonelist. | ||||||
|  * |  | ||||||
|  * Must be protected by read_mems_allowed_begin() |  | ||||||
|  */ |  */ | ||||||
| int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, | int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, | ||||||
| 				struct mempolicy **mpol, nodemask_t **nodemask) | 		struct mempolicy **mpol, nodemask_t **nodemask) | ||||||
| { | { | ||||||
|  | 	pgoff_t ilx; | ||||||
| 	int nid; | 	int nid; | ||||||
| 	int mode; |  | ||||||
| 
 | 
 | ||||||
| 	*mpol = get_vma_policy(vma, addr); | 	nid = numa_node_id(); | ||||||
| 	*nodemask = NULL; | 	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); | ||||||
| 	mode = (*mpol)->mode; | 	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); | ||||||
| 
 |  | ||||||
| 	if (unlikely(mode == MPOL_INTERLEAVE)) { |  | ||||||
| 		nid = interleave_nid(*mpol, vma, addr, |  | ||||||
| 					huge_page_shift(hstate_vma(vma))); |  | ||||||
| 	} else { |  | ||||||
| 		nid = policy_node(gfp_flags, *mpol, numa_node_id()); |  | ||||||
| 		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) |  | ||||||
| 			*nodemask = &(*mpol)->nodes; |  | ||||||
| 	} |  | ||||||
| 	return nid; | 	return nid; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -2048,27 +2004,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk, | ||||||
| 	return ret; | 	return ret; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Allocate a page in interleaved policy.
 |  | ||||||
|    Own path because it needs to do special accounting. */ |  | ||||||
| static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |  | ||||||
| 					unsigned nid) |  | ||||||
| { |  | ||||||
| 	struct page *page; |  | ||||||
| 
 |  | ||||||
| 	page = __alloc_pages(gfp, order, nid, NULL); |  | ||||||
| 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ |  | ||||||
| 	if (!static_branch_likely(&vm_numa_stat_key)) |  | ||||||
| 		return page; |  | ||||||
| 	if (page && page_to_nid(page) == nid) { |  | ||||||
| 		preempt_disable(); |  | ||||||
| 		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); |  | ||||||
| 		preempt_enable(); |  | ||||||
| 	} |  | ||||||
| 	return page; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, | static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, | ||||||
| 						int nid, struct mempolicy *pol) | 						int nid, nodemask_t *nodemask) | ||||||
| { | { | ||||||
| 	struct page *page; | 	struct page *page; | ||||||
| 	gfp_t preferred_gfp; | 	gfp_t preferred_gfp; | ||||||
|  | @ -2081,7 +2018,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, | ||||||
| 	 */ | 	 */ | ||||||
| 	preferred_gfp = gfp | __GFP_NOWARN; | 	preferred_gfp = gfp | __GFP_NOWARN; | ||||||
| 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); | 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); | ||||||
| 	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); | 	page = __alloc_pages(preferred_gfp, order, nid, nodemask); | ||||||
| 	if (!page) | 	if (!page) | ||||||
| 		page = __alloc_pages(gfp, order, nid, NULL); | 		page = __alloc_pages(gfp, order, nid, NULL); | ||||||
| 
 | 
 | ||||||
|  | @ -2089,55 +2026,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  * vma_alloc_folio - Allocate a folio for a VMA. |  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. | ||||||
|  * @gfp: GFP flags. |  * @gfp: GFP flags. | ||||||
|  * @order: Order of the folio. |  * @order: Order of the page allocation. | ||||||
|  * @vma: Pointer to VMA or NULL if not available. |  * @pol: Pointer to the NUMA mempolicy. | ||||||
|  * @addr: Virtual address of the allocation.  Must be inside @vma. |  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). | ||||||
|  * @hugepage: For hugepages try only the preferred node if possible. |  * @nid: Preferred node (usually numa_node_id() but @mpol may override it). | ||||||
|  * |  * | ||||||
|  * Allocate a folio for a specific address in @vma, using the appropriate |  * Return: The page on success or NULL if allocation fails. | ||||||
|  * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock |  | ||||||
|  * of the mm_struct of the VMA to prevent it from going away.  Should be |  | ||||||
|  * used for all allocations for folios that will be mapped into user space. |  | ||||||
|  * |  | ||||||
|  * Return: The folio on success or NULL if allocation fails. |  | ||||||
|  */ |  */ | ||||||
| struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, | struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, | ||||||
| 		unsigned long addr, bool hugepage) | 		struct mempolicy *pol, pgoff_t ilx, int nid) | ||||||
| { | { | ||||||
| 	struct mempolicy *pol; | 	nodemask_t *nodemask; | ||||||
| 	int node = numa_node_id(); | 	struct page *page; | ||||||
| 	struct folio *folio; |  | ||||||
| 	int preferred_nid; |  | ||||||
| 	nodemask_t *nmask; |  | ||||||
| 
 | 
 | ||||||
| 	pol = get_vma_policy(vma, addr); | 	nodemask = policy_nodemask(gfp, pol, ilx, &nid); | ||||||
| 
 | 
 | ||||||
| 	if (pol->mode == MPOL_INTERLEAVE) { | 	if (pol->mode == MPOL_PREFERRED_MANY) | ||||||
| 		struct page *page; | 		return alloc_pages_preferred_many(gfp, order, nid, nodemask); | ||||||
| 		unsigned nid; |  | ||||||
| 
 |  | ||||||
| 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |  | ||||||
| 		mpol_cond_put(pol); |  | ||||||
| 		gfp |= __GFP_COMP; |  | ||||||
| 		page = alloc_page_interleave(gfp, order, nid); |  | ||||||
| 		return page_rmappable_folio(page); |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if (pol->mode == MPOL_PREFERRED_MANY) { |  | ||||||
| 		struct page *page; |  | ||||||
| 
 |  | ||||||
| 		node = policy_node(gfp, pol, node); |  | ||||||
| 		gfp |= __GFP_COMP; |  | ||||||
| 		page = alloc_pages_preferred_many(gfp, order, node, pol); |  | ||||||
| 		mpol_cond_put(pol); |  | ||||||
| 		return page_rmappable_folio(page); |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { |  | ||||||
| 		int hpage_node = node; |  | ||||||
| 
 | 
 | ||||||
|  | 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && | ||||||
|  | 	    /* filter "hugepage" allocation, unless from alloc_pages() */ | ||||||
|  | 	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * For hugepage allocation and non-interleave policy which | 		 * For hugepage allocation and non-interleave policy which | ||||||
| 		 * allows the current node (or other explicitly preferred | 		 * allows the current node (or other explicitly preferred | ||||||
|  | @ -2148,39 +2059,68 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, | ||||||
| 		 * If the policy is interleave or does not allow the current | 		 * If the policy is interleave or does not allow the current | ||||||
| 		 * node in its nodemask, we allocate the standard way. | 		 * node in its nodemask, we allocate the standard way. | ||||||
| 		 */ | 		 */ | ||||||
| 		if (pol->mode == MPOL_PREFERRED) | 		if (pol->mode != MPOL_INTERLEAVE && | ||||||
| 			hpage_node = first_node(pol->nodes); | 		    (!nodemask || node_isset(nid, *nodemask))) { | ||||||
| 
 |  | ||||||
| 		nmask = policy_nodemask(gfp, pol); |  | ||||||
| 		if (!nmask || node_isset(hpage_node, *nmask)) { |  | ||||||
| 			mpol_cond_put(pol); |  | ||||||
| 			/*
 | 			/*
 | ||||||
| 			 * First, try to allocate THP only on local node, but | 			 * First, try to allocate THP only on local node, but | ||||||
| 			 * don't reclaim unnecessarily, just compact. | 			 * don't reclaim unnecessarily, just compact. | ||||||
| 			 */ | 			 */ | ||||||
| 			folio = __folio_alloc_node(gfp | __GFP_THISNODE | | 			page = __alloc_pages_node(nid, | ||||||
| 					__GFP_NORETRY, order, hpage_node); | 				gfp | __GFP_THISNODE | __GFP_NORETRY, order); | ||||||
| 
 | 			if (page || !(gfp & __GFP_DIRECT_RECLAIM)) | ||||||
|  | 				return page; | ||||||
| 			/*
 | 			/*
 | ||||||
| 			 * If hugepage allocations are configured to always | 			 * If hugepage allocations are configured to always | ||||||
| 			 * synchronous compact or the vma has been madvised | 			 * synchronous compact or the vma has been madvised | ||||||
| 			 * to prefer hugepage backing, retry allowing remote | 			 * to prefer hugepage backing, retry allowing remote | ||||||
| 			 * memory with both reclaim and compact as well. | 			 * memory with both reclaim and compact as well. | ||||||
| 			 */ | 			 */ | ||||||
| 			if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) |  | ||||||
| 				folio = __folio_alloc(gfp, order, hpage_node, |  | ||||||
| 						      nmask); |  | ||||||
| 
 |  | ||||||
| 			goto out; |  | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	nmask = policy_nodemask(gfp, pol); | 	page = __alloc_pages(gfp, order, nid, nodemask); | ||||||
| 	preferred_nid = policy_node(gfp, pol, node); | 
 | ||||||
| 	folio = __folio_alloc(gfp, order, preferred_nid, nmask); | 	if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { | ||||||
|  | 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ | ||||||
|  | 		if (static_branch_likely(&vm_numa_stat_key) && | ||||||
|  | 		    page_to_nid(page) == nid) { | ||||||
|  | 			preempt_disable(); | ||||||
|  | 			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); | ||||||
|  | 			preempt_enable(); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return page; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * vma_alloc_folio - Allocate a folio for a VMA. | ||||||
|  |  * @gfp: GFP flags. | ||||||
|  |  * @order: Order of the folio. | ||||||
|  |  * @vma: Pointer to VMA. | ||||||
|  |  * @addr: Virtual address of the allocation.  Must be inside @vma. | ||||||
|  |  * @hugepage: Unused (was: For hugepages try only preferred node if possible). | ||||||
|  |  * | ||||||
|  |  * Allocate a folio for a specific address in @vma, using the appropriate | ||||||
|  |  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the | ||||||
|  |  * VMA to prevent it from going away.  Should be used for all allocations | ||||||
|  |  * for folios that will be mapped into user space, excepting hugetlbfs, and | ||||||
|  |  * excepting where direct use of alloc_pages_mpol() is more appropriate. | ||||||
|  |  * | ||||||
|  |  * Return: The folio on success or NULL if allocation fails. | ||||||
|  |  */ | ||||||
|  | struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, | ||||||
|  | 		unsigned long addr, bool hugepage) | ||||||
|  | { | ||||||
|  | 	struct mempolicy *pol; | ||||||
|  | 	pgoff_t ilx; | ||||||
|  | 	struct page *page; | ||||||
|  | 
 | ||||||
|  | 	pol = get_vma_policy(vma, addr, order, &ilx); | ||||||
|  | 	page = alloc_pages_mpol(gfp | __GFP_COMP, order, | ||||||
|  | 				pol, ilx, numa_node_id()); | ||||||
| 	mpol_cond_put(pol); | 	mpol_cond_put(pol); | ||||||
| out: | 	return page_rmappable_folio(page); | ||||||
| 	return folio; |  | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(vma_alloc_folio); | EXPORT_SYMBOL(vma_alloc_folio); | ||||||
| 
 | 
 | ||||||
|  | @ -2198,33 +2138,23 @@ EXPORT_SYMBOL(vma_alloc_folio); | ||||||
|  * flags are used. |  * flags are used. | ||||||
|  * Return: The page on success or NULL if allocation fails. |  * Return: The page on success or NULL if allocation fails. | ||||||
|  */ |  */ | ||||||
| struct page *alloc_pages(gfp_t gfp, unsigned order) | struct page *alloc_pages(gfp_t gfp, unsigned int order) | ||||||
| { | { | ||||||
| 	struct mempolicy *pol = &default_policy; | 	struct mempolicy *pol = &default_policy; | ||||||
| 	struct page *page; |  | ||||||
| 
 |  | ||||||
| 	if (!in_interrupt() && !(gfp & __GFP_THISNODE)) |  | ||||||
| 		pol = get_task_policy(current); |  | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * No reference counting needed for current->mempolicy | 	 * No reference counting needed for current->mempolicy | ||||||
| 	 * nor system default_policy | 	 * nor system default_policy | ||||||
| 	 */ | 	 */ | ||||||
| 	if (pol->mode == MPOL_INTERLEAVE) | 	if (!in_interrupt() && !(gfp & __GFP_THISNODE)) | ||||||
| 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 		pol = get_task_policy(current); | ||||||
| 	else if (pol->mode == MPOL_PREFERRED_MANY) |  | ||||||
| 		page = alloc_pages_preferred_many(gfp, order, |  | ||||||
| 				  policy_node(gfp, pol, numa_node_id()), pol); |  | ||||||
| 	else |  | ||||||
| 		page = __alloc_pages(gfp, order, |  | ||||||
| 				policy_node(gfp, pol, numa_node_id()), |  | ||||||
| 				policy_nodemask(gfp, pol)); |  | ||||||
| 
 | 
 | ||||||
| 	return page; | 	return alloc_pages_mpol(gfp, order, | ||||||
|  | 				pol, NO_INTERLEAVE_INDEX, numa_node_id()); | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(alloc_pages); | EXPORT_SYMBOL(alloc_pages); | ||||||
| 
 | 
 | ||||||
| struct folio *folio_alloc(gfp_t gfp, unsigned order) | struct folio *folio_alloc(gfp_t gfp, unsigned int order) | ||||||
| { | { | ||||||
| 	return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); | 	return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); | ||||||
| } | } | ||||||
|  | @ -2295,6 +2225,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, | ||||||
| 		unsigned long nr_pages, struct page **page_array) | 		unsigned long nr_pages, struct page **page_array) | ||||||
| { | { | ||||||
| 	struct mempolicy *pol = &default_policy; | 	struct mempolicy *pol = &default_policy; | ||||||
|  | 	nodemask_t *nodemask; | ||||||
|  | 	int nid; | ||||||
| 
 | 
 | ||||||
| 	if (!in_interrupt() && !(gfp & __GFP_THISNODE)) | 	if (!in_interrupt() && !(gfp & __GFP_THISNODE)) | ||||||
| 		pol = get_task_policy(current); | 		pol = get_task_policy(current); | ||||||
|  | @ -2307,9 +2239,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, | ||||||
| 		return alloc_pages_bulk_array_preferred_many(gfp, | 		return alloc_pages_bulk_array_preferred_many(gfp, | ||||||
| 				numa_node_id(), pol, nr_pages, page_array); | 				numa_node_id(), pol, nr_pages, page_array); | ||||||
| 
 | 
 | ||||||
| 	return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), | 	nid = numa_node_id(); | ||||||
| 				  policy_nodemask(gfp, pol), nr_pages, NULL, | 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); | ||||||
| 				  page_array); | 	return __alloc_pages_bulk(gfp, nid, nodemask, | ||||||
|  | 				  nr_pages, NULL, page_array); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) | int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) | ||||||
|  | @ -2496,23 +2429,21 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, | ||||||
| 		   unsigned long addr) | 		   unsigned long addr) | ||||||
| { | { | ||||||
| 	struct mempolicy *pol; | 	struct mempolicy *pol; | ||||||
|  | 	pgoff_t ilx; | ||||||
| 	struct zoneref *z; | 	struct zoneref *z; | ||||||
| 	int curnid = folio_nid(folio); | 	int curnid = folio_nid(folio); | ||||||
| 	unsigned long pgoff; |  | ||||||
| 	int thiscpu = raw_smp_processor_id(); | 	int thiscpu = raw_smp_processor_id(); | ||||||
| 	int thisnid = cpu_to_node(thiscpu); | 	int thisnid = cpu_to_node(thiscpu); | ||||||
| 	int polnid = NUMA_NO_NODE; | 	int polnid = NUMA_NO_NODE; | ||||||
| 	int ret = NUMA_NO_NODE; | 	int ret = NUMA_NO_NODE; | ||||||
| 
 | 
 | ||||||
| 	pol = get_vma_policy(vma, addr); | 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); | ||||||
| 	if (!(pol->flags & MPOL_F_MOF)) | 	if (!(pol->flags & MPOL_F_MOF)) | ||||||
| 		goto out; | 		goto out; | ||||||
| 
 | 
 | ||||||
| 	switch (pol->mode) { | 	switch (pol->mode) { | ||||||
| 	case MPOL_INTERLEAVE: | 	case MPOL_INTERLEAVE: | ||||||
| 		pgoff = vma->vm_pgoff; | 		polnid = interleave_nid(pol, ilx); | ||||||
| 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; |  | ||||||
| 		polnid = offset_il_node(pol, pgoff); |  | ||||||
| 		break; | 		break; | ||||||
| 
 | 
 | ||||||
| 	case MPOL_PREFERRED: | 	case MPOL_PREFERRED: | ||||||
|  |  | ||||||
							
								
								
									
 mm/shmem.c | 92

|  | @ -1544,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | ||||||
| 	return NULL; | 	return NULL; | ||||||
| } | } | ||||||
| #endif /* CONFIG_NUMA && CONFIG_TMPFS */ | #endif /* CONFIG_NUMA && CONFIG_TMPFS */ | ||||||
| #ifndef CONFIG_NUMA |  | ||||||
| #define vm_policy vm_private_data |  | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
| static void shmem_pseudo_vma_init(struct vm_area_struct *vma, | static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, | ||||||
| 		struct shmem_inode_info *info, pgoff_t index) | 			pgoff_t index, unsigned int order, pgoff_t *ilx); | ||||||
| { |  | ||||||
| 	/* Create a pseudo vma that just contains the policy */ |  | ||||||
| 	vma_init(vma, NULL); |  | ||||||
| 	/* Bias interleave by inode number to distribute better across nodes */ |  | ||||||
| 	vma->vm_pgoff = index + info->vfs_inode.i_ino; |  | ||||||
| 	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); |  | ||||||
| } |  | ||||||
| 
 | 
 | ||||||
| static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) | static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, | ||||||
| { |  | ||||||
| 	/* Drop reference taken by mpol_shared_policy_lookup() */ |  | ||||||
| 	mpol_cond_put(vma->vm_policy); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, |  | ||||||
| 			struct shmem_inode_info *info, pgoff_t index) | 			struct shmem_inode_info *info, pgoff_t index) | ||||||
| { | { | ||||||
| 	struct vm_area_struct pvma; | 	struct mempolicy *mpol; | ||||||
|  | 	pgoff_t ilx; | ||||||
| 	struct page *page; | 	struct page *page; | ||||||
| 	struct vm_fault vmf = { |  | ||||||
| 		.vma = &pvma, |  | ||||||
| 	}; |  | ||||||
| 
 | 
 | ||||||
| 	shmem_pseudo_vma_init(&pvma, info, index); | 	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); | ||||||
| 	page = swap_cluster_readahead(swap, gfp, &vmf); | 	page = swap_cluster_readahead(swap, gfp, mpol, ilx); | ||||||
| 	shmem_pseudo_vma_destroy(&pvma); | 	mpol_cond_put(mpol); | ||||||
| 
 | 
 | ||||||
| 	if (!page) | 	if (!page) | ||||||
| 		return NULL; | 		return NULL; | ||||||
|  | @ -1609,27 +1591,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) | ||||||
| static struct folio *shmem_alloc_hugefolio(gfp_t gfp, | static struct folio *shmem_alloc_hugefolio(gfp_t gfp, | ||||||
| 		struct shmem_inode_info *info, pgoff_t index) | 		struct shmem_inode_info *info, pgoff_t index) | ||||||
| { | { | ||||||
| 	struct vm_area_struct pvma; | 	struct mempolicy *mpol; | ||||||
| 	struct folio *folio; | 	pgoff_t ilx; | ||||||
|  | 	struct page *page; | ||||||
| 
 | 
 | ||||||
| 	shmem_pseudo_vma_init(&pvma, info, index); | 	mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); | ||||||
| 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); | 	page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); | ||||||
| 	shmem_pseudo_vma_destroy(&pvma); | 	mpol_cond_put(mpol); | ||||||
| 
 | 
 | ||||||
| 	return folio; | 	return page_rmappable_folio(page); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static struct folio *shmem_alloc_folio(gfp_t gfp, | static struct folio *shmem_alloc_folio(gfp_t gfp, | ||||||
| 		struct shmem_inode_info *info, pgoff_t index) | 		struct shmem_inode_info *info, pgoff_t index) | ||||||
| { | { | ||||||
| 	struct vm_area_struct pvma; | 	struct mempolicy *mpol; | ||||||
| 	struct folio *folio; | 	pgoff_t ilx; | ||||||
|  | 	struct page *page; | ||||||
| 
 | 
 | ||||||
| 	shmem_pseudo_vma_init(&pvma, info, index); | 	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); | ||||||
| 	folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); | 	page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); | ||||||
| 	shmem_pseudo_vma_destroy(&pvma); | 	mpol_cond_put(mpol); | ||||||
| 
 | 
 | ||||||
| 	return folio; | 	return (struct folio *)page; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, | static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, | ||||||
|  | @ -1883,7 +1867,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, | ||||||
| 			count_memcg_event_mm(fault_mm, PGMAJFAULT); | 			count_memcg_event_mm(fault_mm, PGMAJFAULT); | ||||||
| 		} | 		} | ||||||
| 		/* Here we actually start the io */ | 		/* Here we actually start the io */ | ||||||
| 		folio = shmem_swapin(swap, gfp, info, index); | 		folio = shmem_swapin_cluster(swap, gfp, info, index); | ||||||
| 		if (!folio) { | 		if (!folio) { | ||||||
| 			error = -ENOMEM; | 			error = -ENOMEM; | ||||||
| 			goto failed; | 			goto failed; | ||||||
|  | @ -2334,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | ||||||
| 					  unsigned long addr) | 					  unsigned long addr, pgoff_t *ilx) | ||||||
| { | { | ||||||
| 	struct inode *inode = file_inode(vma->vm_file); | 	struct inode *inode = file_inode(vma->vm_file); | ||||||
| 	pgoff_t index; | 	pgoff_t index; | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Bias interleave by inode number to distribute better across nodes; | ||||||
|  | 	 * but this interface is independent of which page order is used, so | ||||||
|  | 	 * supplies only that bias, letting caller apply the offset (adjusted | ||||||
|  | 	 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). | ||||||
|  | 	 */ | ||||||
|  | 	*ilx = inode->i_ino; | ||||||
| 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||||||
| 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); | 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); | ||||||
| } | } | ||||||
| #endif | 
 | ||||||
|  | static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, | ||||||
|  | 			pgoff_t index, unsigned int order, pgoff_t *ilx) | ||||||
|  | { | ||||||
|  | 	struct mempolicy *mpol; | ||||||
|  | 
 | ||||||
|  | 	/* Bias interleave by inode number to distribute better across nodes */ | ||||||
|  | 	*ilx = info->vfs_inode.i_ino + (index >> order); | ||||||
|  | 
 | ||||||
|  | 	mpol = mpol_shared_policy_lookup(&info->policy, index); | ||||||
|  | 	return mpol ? mpol : get_task_policy(current); | ||||||
|  | } | ||||||
|  | #else | ||||||
|  | static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, | ||||||
|  | 			pgoff_t index, unsigned int order, pgoff_t *ilx) | ||||||
|  | { | ||||||
|  | 	*ilx = 0; | ||||||
|  | 	return NULL; | ||||||
|  | } | ||||||
|  | #endif /* CONFIG_NUMA */ | ||||||
| 
 | 
 | ||||||
| int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) | int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) | ||||||
| { | { | ||||||
|  |  | ||||||
|  | @ -2,6 +2,8 @@ | ||||||
| #ifndef _MM_SWAP_H | #ifndef _MM_SWAP_H | ||||||
| #define _MM_SWAP_H | #define _MM_SWAP_H | ||||||
| 
 | 
 | ||||||
|  | struct mempolicy; | ||||||
|  | 
 | ||||||
| #ifdef CONFIG_SWAP | #ifdef CONFIG_SWAP | ||||||
| #include <linux/blk_types.h> /* for bio_end_io_t */ | #include <linux/blk_types.h> /* for bio_end_io_t */ | ||||||
| 
 | 
 | ||||||
|  | @ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | ||||||
| 				   unsigned long addr, | 				   unsigned long addr, | ||||||
| 				   struct swap_iocb **plug); | 				   struct swap_iocb **plug); | ||||||
| struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | ||||||
| 				     struct vm_area_struct *vma, | 				     struct mempolicy *mpol, pgoff_t ilx, | ||||||
| 				     unsigned long addr, |  | ||||||
| 				     bool *new_page_allocated); | 				     bool *new_page_allocated); | ||||||
| struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, | struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, | ||||||
| 				    struct vm_fault *vmf); | 				    struct mempolicy *mpol, pgoff_t ilx); | ||||||
| struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, | struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, | ||||||
| 			      struct vm_fault *vmf); | 			      struct vm_fault *vmf); | ||||||
| 
 | 
 | ||||||
|  | @ -80,7 +81,7 @@ static inline void show_swap_cache_info(void) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline struct page *swap_cluster_readahead(swp_entry_t entry, | static inline struct page *swap_cluster_readahead(swp_entry_t entry, | ||||||
| 				gfp_t gfp_mask, struct vm_fault *vmf) | 			gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) | ||||||
| { | { | ||||||
| 	return NULL; | 	return NULL; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -10,6 +10,7 @@ | ||||||
| #include <linux/mm.h> | #include <linux/mm.h> | ||||||
| #include <linux/gfp.h> | #include <linux/gfp.h> | ||||||
| #include <linux/kernel_stat.h> | #include <linux/kernel_stat.h> | ||||||
|  | #include <linux/mempolicy.h> | ||||||
| #include <linux/swap.h> | #include <linux/swap.h> | ||||||
| #include <linux/swapops.h> | #include <linux/swapops.h> | ||||||
| #include <linux/init.h> | #include <linux/init.h> | ||||||
|  | @ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | ||||||
| 			struct vm_area_struct *vma, unsigned long addr, | 				     struct mempolicy *mpol, pgoff_t ilx, | ||||||
| 			bool *new_page_allocated) | 				     bool *new_page_allocated) | ||||||
| { | { | ||||||
| 	struct swap_info_struct *si; | 	struct swap_info_struct *si; | ||||||
| 	struct folio *folio; | 	struct folio *folio; | ||||||
|  | @ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | ||||||
| 		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will | 		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will | ||||||
 		 * cause any racers to loop around until we add it to cache.
 		 */
-		folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+		folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
+						mpol, ilx, numa_node_id());
 		if (!folio)
 			goto fail_put_swap;
 
@@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct vm_area_struct *vma,
 				   unsigned long addr, struct swap_iocb **plug)
 {
-	bool page_was_allocated;
-	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
-			vma, addr, &page_was_allocated);
+	bool page_allocated;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	if (page_was_allocated)
-		swap_readpage(retpage, false, plug);
+	mpol = get_vma_policy(vma, addr, 0, &ilx);
+	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					&page_allocated);
+	mpol_cond_put(mpol);
 
-	return retpage;
+	if (page_allocated)
+		swap_readpage(page, false, plug);
+	return page;
 }
 
 static unsigned int __swapin_nr_pages(unsigned long prev_offset,
@@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * swap_cluster_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
- * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  *
  * Returns the struct page for entry and addr, after queueing swapin.
  *
@@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * because it doesn't cost us any seek time.  We also make sure to queue
  * the 'original' request together with the readahead ones...
  *
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ * Note: it is intentional that the same NUMA policy and interleave index
+ * are used for every page of the readahead: neighbouring pages on swap
+ * are fairly likely to have been swapped out from the same node.
  */
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
-				struct vm_fault *vmf)
+				    struct mempolicy *mpol, pgoff_t ilx)
 {
 	struct page *page;
 	unsigned long entry_offset = swp_offset(entry);
@@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
 	bool page_allocated;
-	struct vm_area_struct *vma = vmf->vma;
-	unsigned long addr = vmf->address;
 
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
@@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	for (offset = start_offset; offset <= end_offset ; offset++) {
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
-			swp_entry(swp_type(entry), offset),
-			gfp_mask, vma, addr, &page_allocated);
+				swp_entry(swp_type(entry), offset),
+				gfp_mask, mpol, ilx, &page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	}
 	blk_finish_plug(&plug);
 	swap_read_unplug(splug);
-
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
 	/* The page was likely read above, so no need for plugging here */
-	return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
+	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					&page_allocated);
+	if (unlikely(page_allocated))
+		swap_readpage(page, false, NULL);
+	return page;
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf,
 
 /**
  * swap_vma_readahead - swap in pages in hope we need them soon
- * @fentry: swap entry of this memory
+ * @targ_entry: swap entry of the targeted memory
  * @gfp_mask: memory allocation flags
+ * @mpol: NUMA memory allocation policy to be applied
+ * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  * @vmf: fault information
  *
  * Returns the struct page for entry and addr, after queueing swapin.
@@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf,
  * Caller must hold read mmap_lock if vmf->vma is not NULL.
  *
  */
-static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
+				       struct mempolicy *mpol, pgoff_t targ_ilx,
 				       struct vm_fault *vmf)
 {
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
-	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	pte_t *pte = NULL, pentry;
 	unsigned long addr;
 	swp_entry_t entry;
+	pgoff_t ilx;
 	unsigned int i;
 	bool page_allocated;
 	struct vma_swap_readahead ra_info = {
@@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		goto skip;
 
 	addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+	ilx = targ_ilx - ra_info.offset;
 
 	blk_start_plug(&plug);
-	for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+	for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
 		if (!pte++) {
 			pte = pte_offset_map(vmf->pmd, addr);
 			if (!pte)
@@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 			continue;
 		pte_unmap(pte);
 		pte = NULL;
-		page = __read_swap_cache_async(entry, gfp_mask, vma,
-					       addr, &page_allocated);
+		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+						&page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 	lru_add_drain();
 skip:
 	/* The page was likely read above, so no need for plugging here */
-	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
-				     NULL);
+	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
+					&page_allocated);
+	if (unlikely(page_allocated))
+		swap_readpage(page, false, NULL);
+	return page;
 }
 
 /**
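
Aside, illustrative only and not part of the patch: the arithmetic in swap_vma_readahead() above keeps the interleave index in step with the readahead window. With the names from the hunks (targ_ilx, ra_info.offset, ra_info.nr_pte), page i of the window is read with index targ_ilx - ra_info.offset + i, so the faulting page itself keeps exactly targ_ilx and each neighbour gets the index it would have had if faulted at its own address. A tiny standalone C toy, with made-up numbers, just to show the progression:

#include <stdio.h>

/*
 * Toy model of the index arithmetic (values are invented): a readahead
 * window of nr_pte pages starts "offset" pages before the target, so
 * page i gets interleave index targ_ilx - offset + i, and the target
 * page (i == offset) keeps exactly targ_ilx.
 */
int main(void)
{
	unsigned long targ_ilx = 1000, offset = 3, nr_pte = 8;
	unsigned long ilx = targ_ilx - offset;
	unsigned long i;

	for (i = 0; i < nr_pte; i++, ilx++)
		printf("window page %lu -> ilx %lu%s\n",
		       i, ilx, i == offset ? "  (target)" : "");
	return 0;
}
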
@@ -853,9 +868,16 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 				struct vm_fault *vmf)
 {
-	return swap_use_vma_readahead() ?
-			swap_vma_readahead(entry, gfp_mask, vmf) :
-			swap_cluster_readahead(entry, gfp_mask, vmf);
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
+
+	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
+	page = swap_use_vma_readahead() ?
+		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
+		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+	mpol_cond_put(mpol);
+	return page;
 }
 
 #ifdef CONFIG_SYSFS

@@ -24,6 +24,7 @@
 #include <linux/swap.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/mempolicy.h>
 #include <linux/mempool.h>
 #include <linux/zpool.h>
 #include <crypto/acompress.h>
@@ -1057,6 +1058,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 {
 	swp_entry_t swpentry = entry->swpentry;
 	struct page *page;
+	struct mempolicy *mpol;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct zpool *pool = zswap_find_zpool(entry);
@@ -1075,8 +1077,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	}
 
 	/* try to allocate swap cache page */
-	page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
-				       &page_was_allocated);
+	mpol = get_task_policy(current);
+	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
+				NO_INTERLEAVE_INDEX, &page_was_allocated);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
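
For reference, a minimal sketch, not from the patch (the helper name and its NULL-vma branching are this illustration's own), of the two ways callers now supply a policy to __read_swap_cache_async(): vma-backed swapin resolves it with get_vma_policy(), which may return a reference-counted shared policy and is therefore paired with mpol_cond_put(), while the vma-less zswap writeback path above uses the current task's policy with NO_INTERLEAVE_INDEX, since there is no vma offset from which to derive an interleave index.

/*
 * Hypothetical helper, illustration only; assumes the usual mm/ context
 * (mm-internal "swap.h", <linux/mempolicy.h>) for the declarations used.
 */
static struct page *swapin_page_sketch(swp_entry_t entry, gfp_t gfp_mask,
				       struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct page *page;
	bool page_allocated;

	if (vma) {
		/* vma-backed fault: policy may be a refcounted shmem shared policy */
		mpol = get_vma_policy(vma, addr, 0, &ilx);
	} else {
		/* vma-less path (as in zswap writeback): task policy, no interleave hint */
		mpol = get_task_policy(current);
		ilx = NO_INTERLEAVE_INDEX;
	}

	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
				       &page_allocated);
	if (vma)
		mpol_cond_put(mpol);	/* only the vma lookup may hold a reference */
	if (page && page_allocated)
		swap_readpage(page, false, NULL);
	return page;
}
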