mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	mm/gup: track FOLL_PIN pages
Add tracking of pages that were pinned via FOLL_PIN.  This tracking is
implemented via overloading of page->_refcount: pins are added by adding
GUP_PIN_COUNTING_BIAS (1024) to the refcount.  This provides a fuzzy
indication of pinning, and it can have false positives (and that's OK).
Please see the pre-existing Documentation/core-api/pin_user_pages.rst for
details.
As mentioned in pin_user_pages.rst, callers who effectively set FOLL_PIN
(typically via pin_user_pages*()) are required to ultimately free such
pages via unpin_user_page().
Please also note the limitation, discussed in pin_user_pages.rst under the
"TODO: for 1GB and larger huge pages" section.  (That limitation will be
removed in a following patch.)
The effect of a FOLL_PIN flag is similar to that of FOLL_GET, and may be
thought of as "FOLL_GET for DIO and/or RDMA use".
Pages that have been pinned via FOLL_PIN are identifiable via a new
function call:
   bool page_maybe_dma_pinned(struct page *page);
What to do in response to encountering such a page is left to later
patchsets. There is discussion about this in [1], [2], [3], and [4].
This also changes a BUG_ON() to a WARN_ON_ONCE() in follow_page_mask().
[1] Some slow progress on get_user_pages() (Apr 2, 2019):
    https://lwn.net/Articles/784574/
[2] DMA and get_user_pages() (LPC: Dec 12, 2018):
    https://lwn.net/Articles/774411/
[3] The trouble with get_user_pages() (Apr 30, 2018):
    https://lwn.net/Articles/753027/
[4] LWN kernel index: get_user_pages():
    https://lwn.net/Kernel/Index/#Memory_management-get_user_pages
[jhubbard@nvidia.com: add kerneldoc]
  Link: http://lkml.kernel.org/r/20200307021157.235726-1-jhubbard@nvidia.com
[imbrenda@linux.ibm.com: if pin fails, we need to unpin, a simple put_page will not be enough]
  Link: http://lkml.kernel.org/r/20200306132537.783769-2-imbrenda@linux.ibm.com
[akpm@linux-foundation.org: fix put_compound_head defined but not used]
Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Link: http://lkml.kernel.org/r/20200211001536.1027652-7-jhubbard@nvidia.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
			
			
This commit is contained in:
		
							parent
							
								
									94202f126f
								
							
						
					
					
						commit
						3faa52c03f
					
				
					 5 changed files with 379 additions and 104 deletions
				
			
		| 
						 | 
					@ -173,8 +173,8 @@ CASE 4: Pinning for struct page manipulation only
 | 
				
			||||||
-------------------------------------------------
 | 
					-------------------------------------------------
 | 
				
			||||||
Here, normal GUP calls are sufficient, so neither flag needs to be set.
 | 
					Here, normal GUP calls are sufficient, so neither flag needs to be set.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
page_dma_pinned(): the whole point of pinning
 | 
					page_maybe_dma_pinned(): the whole point of pinning
 | 
				
			||||||
=============================================
 | 
					===================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
 | 
					The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
 | 
				
			||||||
to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
 | 
					to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
 | 
				
			||||||
| 
						 | 
					@ -186,7 +186,7 @@ and debates (see the References at the end of this document). It's a TODO item
 | 
				
			||||||
here: fill in the details once that's worked out. Meanwhile, it's safe to say
 | 
					here: fill in the details once that's worked out. Meanwhile, it's safe to say
 | 
				
			||||||
that having this available: ::
 | 
					that having this available: ::
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        static inline bool page_dma_pinned(struct page *page)
 | 
					        static inline bool page_maybe_dma_pinned(struct page *page)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
...is a prerequisite to solving the long-running gup+DMA problem.
 | 
					...is a prerequisite to solving the long-running gup+DMA problem.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1001,6 +1001,8 @@ static inline void get_page(struct page *page)
 | 
				
			||||||
	page_ref_inc(page);
 | 
						page_ref_inc(page);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					bool __must_check try_grab_page(struct page *page, unsigned int flags);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline __must_check bool try_get_page(struct page *page)
 | 
					static inline __must_check bool try_get_page(struct page *page)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	page = compound_head(page);
 | 
						page = compound_head(page);
 | 
				
			||||||
| 
						 | 
					@ -1029,29 +1031,79 @@ static inline void put_page(struct page *page)
 | 
				
			||||||
		__put_page(page);
 | 
							__put_page(page);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/*
 | 
				
			||||||
 * unpin_user_page() - release a gup-pinned page
 | 
					 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 | 
				
			||||||
 * @page:            pointer to page to be released
 | 
					 * the page's refcount so that two separate items are tracked: the original page
 | 
				
			||||||
 | 
					 * reference count, and also a new count of how many pin_user_pages() calls were
 | 
				
			||||||
 | 
					 * made against the page. ("gup-pinned" is another term for the latter).
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * Pages that were pinned via pin_user_pages*() must be released via either
 | 
					 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 | 
				
			||||||
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 | 
					 * distinct from normal pages. As such, the unpin_user_page() call (and its
 | 
				
			||||||
 * that eventually such pages can be separately tracked and uniquely handled. In
 | 
					 * variants) must be used in order to release gup-pinned pages.
 | 
				
			||||||
 * particular, interactions with RDMA and filesystems need special handling.
 | 
					 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * unpin_user_page() and put_page() are not interchangeable, despite this early
 | 
					 * Choice of value:
 | 
				
			||||||
 * implementation that makes them look the same. unpin_user_page() calls must
 | 
					 *
 | 
				
			||||||
 * be perfectly matched up with pin*() calls.
 | 
					 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 | 
				
			||||||
 | 
					 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 | 
				
			||||||
 | 
					 * simpler, due to the fact that adding an even power of two to the page
 | 
				
			||||||
 | 
					 * refcount has the effect of using only the upper N bits, for the code that
 | 
				
			||||||
 | 
					 * counts up using the bias value. This means that the lower bits are left for
 | 
				
			||||||
 | 
					 * the exclusive use of the original code that increments and decrements by one
 | 
				
			||||||
 | 
					 * (or at least, by much smaller values than the bias value).
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Of course, once the lower bits overflow into the upper bits (and this is
 | 
				
			||||||
 | 
					 * OK, because subtraction recovers the original values), then visual inspection
 | 
				
			||||||
 | 
					 * no longer suffices to directly view the separate counts. However, for normal
 | 
				
			||||||
 | 
					 * applications that don't have huge page reference counts, this won't be an
 | 
				
			||||||
 | 
					 * issue.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Locking: the lockless algorithm described in page_cache_get_speculative()
 | 
				
			||||||
 | 
					 * and page_cache_gup_pin_speculative() provides safe operation for
 | 
				
			||||||
 | 
					 * get_user_pages and page_mkclean and other calls that race to set up page
 | 
				
			||||||
 | 
					 * table entries.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
static inline void unpin_user_page(struct page *page)
 | 
					#define GUP_PIN_COUNTING_BIAS (1U << 10)
 | 
				
			||||||
{
 | 
					 | 
				
			||||||
	put_page(page);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void unpin_user_page(struct page *page);
 | 
				
			||||||
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 | 
					void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 | 
				
			||||||
				 bool make_dirty);
 | 
									 bool make_dirty);
 | 
				
			||||||
 | 
					 | 
				
			||||||
void unpin_user_pages(struct page **pages, unsigned long npages);
 | 
					void unpin_user_pages(struct page **pages, unsigned long npages);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * page_maybe_dma_pinned() - report if a page is pinned for DMA.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * This function checks if a page has been pinned via a call to
 | 
				
			||||||
 | 
					 * pin_user_pages*().
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
 | 
				
			||||||
 | 
					 * because it means "definitely not pinned for DMA", but true means "probably
 | 
				
			||||||
 | 
					 * pinned for DMA, but possibly a false positive due to having at least
 | 
				
			||||||
 | 
					 * GUP_PIN_COUNTING_BIAS worth of normal page references".
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * False positives are OK, because: a) it's unlikely for a page to get that many
 | 
				
			||||||
 | 
					 * refcounts, and b) all the callers of this routine are expected to be able to
 | 
				
			||||||
 | 
					 * deal gracefully with a false positive.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * @page:	pointer to page to be queried.
 | 
				
			||||||
 | 
					 * Return:	True, if it is likely that the page has been "dma-pinned".
 | 
				
			||||||
 | 
					 *		False, if the page is definitely not dma-pinned.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static inline bool page_maybe_dma_pinned(struct page *page)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * page_ref_count() is signed. If that refcount overflows, then
 | 
				
			||||||
 | 
						 * page_ref_count() returns a negative value, and callers will avoid
 | 
				
			||||||
 | 
						 * further incrementing the refcount.
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 * Here, for that overflow case, use the signed bit to count a little
 | 
				
			||||||
 | 
						 * bit higher via unsigned math, and thus still get an accurate result.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						return ((unsigned int)page_ref_count(compound_head(page))) >=
 | 
				
			||||||
 | 
							GUP_PIN_COUNTING_BIAS;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 | 
					#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 | 
				
			||||||
#define SECTION_IN_PAGE_FLAGS
 | 
					#define SECTION_IN_PAGE_FLAGS
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										312
									
								
								mm/gup.c
									
									
									
									
									
								
							
							
						
						
									
										312
									
								
								mm/gup.c
									
									
									
									
									
								
							| 
						 | 
					@ -44,6 +44,135 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
 | 
				
			||||||
	return head;
 | 
						return head;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * try_grab_compound_head() - attempt to elevate a page's refcount, by a
 | 
				
			||||||
 | 
					 * flags-dependent amount.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * "grab" names in this file mean, "look at flags to decide whether to use
 | 
				
			||||||
 | 
					 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount."
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 | 
				
			||||||
 | 
					 * same time. (That's true throughout the get_user_pages*() and
 | 
				
			||||||
 | 
					 * pin_user_pages*() APIs.) Cases:
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *    FOLL_GET: page's refcount will be incremented by 1.
 | 
				
			||||||
 | 
					 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Return: head page (with refcount appropriately incremented) for success, or
 | 
				
			||||||
 | 
					 * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
 | 
				
			||||||
 | 
					 * considered failure, and furthermore, a likely bug in the caller, so a warning
 | 
				
			||||||
 | 
					 * is also emitted.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static __maybe_unused struct page *try_grab_compound_head(struct page *page,
 | 
				
			||||||
 | 
												  int refs,
 | 
				
			||||||
 | 
												  unsigned int flags)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						if (flags & FOLL_GET)
 | 
				
			||||||
 | 
							return try_get_compound_head(page, refs);
 | 
				
			||||||
 | 
						else if (flags & FOLL_PIN) {
 | 
				
			||||||
 | 
							refs *= GUP_PIN_COUNTING_BIAS;
 | 
				
			||||||
 | 
							return try_get_compound_head(page, refs);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						WARN_ON_ONCE(1);
 | 
				
			||||||
 | 
						return NULL;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * This might not do anything at all, depending on the flags argument.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * "grab" names in this file mean, "look at flags to decide whether to use
 | 
				
			||||||
 | 
					 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount."
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * @page:    pointer to page to be grabbed
 | 
				
			||||||
 | 
					 * @flags:   gup flags: these are the FOLL_* flag values.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 | 
				
			||||||
 | 
					 * time. Cases:
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *    FOLL_GET: page's refcount will be incremented by 1.
 | 
				
			||||||
 | 
					 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Return: true for success, or if no action was required (if neither FOLL_PIN
 | 
				
			||||||
 | 
					 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 | 
				
			||||||
 | 
					 * FOLL_PIN was set, but the page could not be grabbed.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					bool __must_check try_grab_page(struct page *page, unsigned int flags)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (flags & FOLL_GET)
 | 
				
			||||||
 | 
							return try_get_page(page);
 | 
				
			||||||
 | 
						else if (flags & FOLL_PIN) {
 | 
				
			||||||
 | 
							page = compound_head(page);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							if (WARN_ON_ONCE(page_ref_count(page) <= 0))
 | 
				
			||||||
 | 
								return false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							page_ref_add(page, GUP_PIN_COUNTING_BIAS);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return true;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CONFIG_DEV_PAGEMAP_OPS
 | 
				
			||||||
 | 
					static bool __unpin_devmap_managed_user_page(struct page *page)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						int count;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (!page_is_devmap_managed(page))
 | 
				
			||||||
 | 
							return false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						count = page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * devmap page refcounts are 1-based, rather than 0-based: if
 | 
				
			||||||
 | 
						 * refcount is 1, then the page is free and the refcount is
 | 
				
			||||||
 | 
						 * stable because nobody holds a reference on the page.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (count == 1)
 | 
				
			||||||
 | 
							free_devmap_managed_page(page);
 | 
				
			||||||
 | 
						else if (!count)
 | 
				
			||||||
 | 
							__put_page(page);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return true;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					static bool __unpin_devmap_managed_user_page(struct page *page)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						return false;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif /* CONFIG_DEV_PAGEMAP_OPS */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * unpin_user_page() - release a dma-pinned page
 | 
				
			||||||
 | 
					 * @page:            pointer to page to be released
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Pages that were pinned via pin_user_pages*() must be released via either
 | 
				
			||||||
 | 
					 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 | 
				
			||||||
 | 
					 * that such pages can be separately tracked and uniquely handled. In
 | 
				
			||||||
 | 
					 * particular, interactions with RDMA and filesystems need special handling.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					void unpin_user_page(struct page *page)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						page = compound_head(page);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * For devmap managed pages we need to catch refcount transition from
 | 
				
			||||||
 | 
						 * GUP_PIN_COUNTING_BIAS to 1: when the refcount reaches one, it means the
 | 
				
			||||||
 | 
						 * page is free and we need to inform the device driver through
 | 
				
			||||||
 | 
						 * callback. See include/linux/memremap.h and HMM for details.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (__unpin_devmap_managed_user_page(page))
 | 
				
			||||||
 | 
							return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (page_ref_sub_and_test(page, GUP_PIN_COUNTING_BIAS))
 | 
				
			||||||
 | 
							__put_page(page);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					EXPORT_SYMBOL(unpin_user_page);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 | 
					 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 | 
				
			||||||
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 | 
					 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 | 
				
			||||||
| 
						 | 
					@ -230,10 +359,11 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	page = vm_normal_page(vma, address, pte);
 | 
						page = vm_normal_page(vma, address, pte);
 | 
				
			||||||
	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
 | 
						if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
		 * Only return device mapping pages in the FOLL_GET case since
 | 
							 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
 | 
				
			||||||
		 * they are only valid while holding the pgmap reference.
 | 
							 * case since they are only valid while holding the pgmap
 | 
				
			||||||
 | 
							 * reference.
 | 
				
			||||||
		 */
 | 
							 */
 | 
				
			||||||
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
 | 
							*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
 | 
				
			||||||
		if (*pgmap)
 | 
							if (*pgmap)
 | 
				
			||||||
| 
						 | 
					@ -271,11 +401,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 | 
				
			||||||
		goto retry;
 | 
							goto retry;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (flags & FOLL_GET) {
 | 
						/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
 | 
				
			||||||
		if (unlikely(!try_get_page(page))) {
 | 
						if (unlikely(!try_grab_page(page, flags))) {
 | 
				
			||||||
			page = ERR_PTR(-ENOMEM);
 | 
							page = ERR_PTR(-ENOMEM);
 | 
				
			||||||
			goto out;
 | 
							goto out;
 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	if (flags & FOLL_TOUCH) {
 | 
						if (flags & FOLL_TOUCH) {
 | 
				
			||||||
		if ((flags & FOLL_WRITE) &&
 | 
							if ((flags & FOLL_WRITE) &&
 | 
				
			||||||
| 
						 | 
					@ -537,7 +666,7 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 | 
				
			||||||
	/* make this handle hugepd */
 | 
						/* make this handle hugepd */
 | 
				
			||||||
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 | 
						page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 | 
				
			||||||
	if (!IS_ERR(page)) {
 | 
						if (!IS_ERR(page)) {
 | 
				
			||||||
		BUG_ON(flags & FOLL_GET);
 | 
							WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
 | 
				
			||||||
		return page;
 | 
							return page;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1675,6 +1804,15 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	return 0;
 | 
						return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static long __get_user_pages_remote(struct task_struct *tsk,
 | 
				
			||||||
 | 
									    struct mm_struct *mm,
 | 
				
			||||||
 | 
									    unsigned long start, unsigned long nr_pages,
 | 
				
			||||||
 | 
									    unsigned int gup_flags, struct page **pages,
 | 
				
			||||||
 | 
									    struct vm_area_struct **vmas, int *locked)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
#endif /* !CONFIG_MMU */
 | 
					#endif /* !CONFIG_MMU */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
| 
						 | 
					@ -1814,7 +1952,24 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
 | 
				
			||||||
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 | 
					 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
#ifdef CONFIG_HAVE_FAST_GUP
 | 
					#ifdef CONFIG_HAVE_FAST_GUP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void put_compound_head(struct page *page, int refs, unsigned int flags)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						if (flags & FOLL_PIN)
 | 
				
			||||||
 | 
							refs *= GUP_PIN_COUNTING_BIAS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * Calling put_page() for each ref is unnecessarily slow. Only the last
 | 
				
			||||||
 | 
						 * ref needs a put_page().
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (refs > 1)
 | 
				
			||||||
 | 
							page_ref_sub(page, refs - 1);
 | 
				
			||||||
 | 
						put_page(page);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
 | 
					#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 * WARNING: only to be used in the get_user_pages_fast() implementation.
 | 
					 * WARNING: only to be used in the get_user_pages_fast() implementation.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
| 
						 | 
					@ -1877,7 +2032,10 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 | 
				
			||||||
		struct page *page = pages[--(*nr)];
 | 
							struct page *page = pages[--(*nr)];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		ClearPageReferenced(page);
 | 
							ClearPageReferenced(page);
 | 
				
			||||||
		put_page(page);
 | 
							if (flags & FOLL_PIN)
 | 
				
			||||||
 | 
								unpin_user_page(page);
 | 
				
			||||||
 | 
							else
 | 
				
			||||||
 | 
								put_page(page);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1919,12 +2077,12 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 | 
				
			||||||
		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 | 
							VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 | 
				
			||||||
		page = pte_page(pte);
 | 
							page = pte_page(pte);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		head = try_get_compound_head(page, 1);
 | 
							head = try_grab_compound_head(page, 1, flags);
 | 
				
			||||||
		if (!head)
 | 
							if (!head)
 | 
				
			||||||
			goto pte_unmap;
 | 
								goto pte_unmap;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 | 
							if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 | 
				
			||||||
			put_page(head);
 | 
								put_compound_head(head, 1, flags);
 | 
				
			||||||
			goto pte_unmap;
 | 
								goto pte_unmap;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1980,7 +2138,10 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		SetPageReferenced(page);
 | 
							SetPageReferenced(page);
 | 
				
			||||||
		pages[*nr] = page;
 | 
							pages[*nr] = page;
 | 
				
			||||||
		get_page(page);
 | 
							if (unlikely(!try_grab_page(page, flags))) {
 | 
				
			||||||
 | 
								undo_dev_pagemap(nr, nr_start, flags, pages);
 | 
				
			||||||
 | 
								return 0;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
		(*nr)++;
 | 
							(*nr)++;
 | 
				
			||||||
		pfn++;
 | 
							pfn++;
 | 
				
			||||||
	} while (addr += PAGE_SIZE, addr != end);
 | 
						} while (addr += PAGE_SIZE, addr != end);
 | 
				
			||||||
| 
						 | 
					@ -2054,18 +2215,6 @@ static int record_subpages(struct page *page, unsigned long addr,
 | 
				
			||||||
	return nr;
 | 
						return nr;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void put_compound_head(struct page *page, int refs, unsigned int flags)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
	VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
 | 
					 | 
				
			||||||
	/*
 | 
					 | 
				
			||||||
	 * Calling put_page() for each ref is unnecessarily slow. Only the last
 | 
					 | 
				
			||||||
	 * ref needs a put_page().
 | 
					 | 
				
			||||||
	 */
 | 
					 | 
				
			||||||
	if (refs > 1)
 | 
					 | 
				
			||||||
		page_ref_sub(page, refs - 1);
 | 
					 | 
				
			||||||
	put_page(page);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef CONFIG_ARCH_HAS_HUGEPD
 | 
					#ifdef CONFIG_ARCH_HAS_HUGEPD
 | 
				
			||||||
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 | 
					static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 | 
				
			||||||
				      unsigned long sz)
 | 
									      unsigned long sz)
 | 
				
			||||||
| 
						 | 
					@ -2099,7 +2248,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 | 
				
			||||||
	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
 | 
						page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
 | 
				
			||||||
	refs = record_subpages(page, addr, end, pages + *nr);
 | 
						refs = record_subpages(page, addr, end, pages + *nr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	head = try_get_compound_head(head, refs);
 | 
						head = try_grab_compound_head(head, refs, flags);
 | 
				
			||||||
	if (!head)
 | 
						if (!head)
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2159,7 +2308,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 | 
				
			||||||
	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 | 
						page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 | 
				
			||||||
	refs = record_subpages(page, addr, end, pages + *nr);
 | 
						refs = record_subpages(page, addr, end, pages + *nr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	head = try_get_compound_head(pmd_page(orig), refs);
 | 
						head = try_grab_compound_head(pmd_page(orig), refs, flags);
 | 
				
			||||||
	if (!head)
 | 
						if (!head)
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2193,7 +2342,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 | 
				
			||||||
	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 | 
						page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 | 
				
			||||||
	refs = record_subpages(page, addr, end, pages + *nr);
 | 
						refs = record_subpages(page, addr, end, pages + *nr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	head = try_get_compound_head(pud_page(orig), refs);
 | 
						head = try_grab_compound_head(pud_page(orig), refs, flags);
 | 
				
			||||||
	if (!head)
 | 
						if (!head)
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2222,7 +2371,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 | 
				
			||||||
	page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
 | 
						page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
 | 
				
			||||||
	refs = record_subpages(page, addr, end, pages + *nr);
 | 
						refs = record_subpages(page, addr, end, pages + *nr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	head = try_get_compound_head(pgd_page(orig), refs);
 | 
						head = try_grab_compound_head(pgd_page(orig), refs, flags);
 | 
				
			||||||
	if (!head)
 | 
						if (!head)
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2505,11 +2654,11 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * get_user_pages_fast() - pin user pages in memory
 | 
					 * get_user_pages_fast() - pin user pages in memory
 | 
				
			||||||
 * @start:	starting user address
 | 
					 * @start:      starting user address
 | 
				
			||||||
 * @nr_pages:	number of pages from start to pin
 | 
					 * @nr_pages:   number of pages from start to pin
 | 
				
			||||||
 * @gup_flags:	flags modifying pin behaviour
 | 
					 * @gup_flags:  flags modifying pin behaviour
 | 
				
			||||||
 * @pages:	array that receives pointers to the pages pinned.
 | 
					 * @pages:      array that receives pointers to the pages pinned.
 | 
				
			||||||
 *		Should be at least nr_pages long.
 | 
					 *              Should be at least nr_pages long.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 | 
					 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 | 
				
			||||||
 * If not successful, it will fall back to taking the lock and
 | 
					 * If not successful, it will fall back to taking the lock and
 | 
				
			||||||
| 
						 | 
					@ -2543,9 +2692,18 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 | 
					 * pin_user_pages_fast() - pin user pages in memory without taking locks
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * For now, this is a placeholder function, until various call sites are
 | 
					 * @start:      starting user address
 | 
				
			||||||
 * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
 | 
					 * @nr_pages:   number of pages from start to pin
 | 
				
			||||||
 * this is identical to get_user_pages_fast().
 | 
					 * @gup_flags:  flags modifying pin behaviour
 | 
				
			||||||
 | 
					 * @pages:      array that receives pointers to the pages pinned.
 | 
				
			||||||
 | 
					 *              Should be at least nr_pages long.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 | 
				
			||||||
 | 
					 * get_user_pages_fast() for documentation on the function arguments, because
 | 
				
			||||||
 | 
					 * the arguments here are identical.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 | 
				
			||||||
 | 
					 * see Documentation/vm/pin_user_pages.rst for further details.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
 | 
					 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
 | 
				
			||||||
 * is NOT intended for Case 2 (RDMA: long-term pins).
 | 
					 * is NOT intended for Case 2 (RDMA: long-term pins).
 | 
				
			||||||
| 
						 | 
					@ -2553,21 +2711,39 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 | 
				
			||||||
int pin_user_pages_fast(unsigned long start, int nr_pages,
 | 
					int pin_user_pages_fast(unsigned long start, int nr_pages,
 | 
				
			||||||
			unsigned int gup_flags, struct page **pages)
 | 
								unsigned int gup_flags, struct page **pages)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	/*
 | 
						/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 | 
				
			||||||
	 * This is a placeholder, until the pin functionality is activated.
 | 
						if (WARN_ON_ONCE(gup_flags & FOLL_GET))
 | 
				
			||||||
	 * Until then, just behave like the corresponding get_user_pages*()
 | 
							return -EINVAL;
 | 
				
			||||||
	 * routine.
 | 
					
 | 
				
			||||||
	 */
 | 
						gup_flags |= FOLL_PIN;
 | 
				
			||||||
	return get_user_pages_fast(start, nr_pages, gup_flags, pages);
 | 
						return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
 | 
					EXPORT_SYMBOL_GPL(pin_user_pages_fast);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * pin_user_pages_remote() - pin pages of a remote process (task != current)
 | 
					 * pin_user_pages_remote() - pin pages of a remote process (task != current)
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * For now, this is a placeholder function, until various call sites are
 | 
					 * @tsk:	the task_struct to use for page fault accounting, or
 | 
				
			||||||
 * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
 | 
					 *		NULL if faults are not to be recorded.
 | 
				
			||||||
 * this is identical to get_user_pages_remote().
 | 
					 * @mm:		mm_struct of target mm
 | 
				
			||||||
 | 
					 * @start:	starting user address
 | 
				
			||||||
 | 
					 * @nr_pages:	number of pages from start to pin
 | 
				
			||||||
 | 
					 * @gup_flags:	flags modifying lookup behaviour
 | 
				
			||||||
 | 
					 * @pages:	array that receives pointers to the pages pinned.
 | 
				
			||||||
 | 
					 *		Should be at least nr_pages long. Or NULL, if caller
 | 
				
			||||||
 | 
					 *		only intends to ensure the pages are faulted in.
 | 
				
			||||||
 | 
					 * @vmas:	array of pointers to vmas corresponding to each page.
 | 
				
			||||||
 | 
					 *		Or NULL if the caller does not require them.
 | 
				
			||||||
 | 
					 * @locked:	pointer to lock flag indicating whether lock is held and
 | 
				
			||||||
 | 
					 *		subsequently whether VM_FAULT_RETRY functionality can be
 | 
				
			||||||
 | 
					 *		utilised. Lock must initially be held.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 | 
				
			||||||
 | 
					 * get_user_pages_remote() for documentation on the function arguments, because
 | 
				
			||||||
 | 
					 * the arguments here are identical.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 | 
				
			||||||
 | 
					 * see Documentation/vm/pin_user_pages.rst for details.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
 | 
					 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
 | 
				
			||||||
 * is NOT intended for Case 2 (RDMA: long-term pins).
 | 
					 * is NOT intended for Case 2 (RDMA: long-term pins).
 | 
				
			||||||
| 
						 | 
					@ -2577,22 +2753,33 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 | 
				
			||||||
			   unsigned int gup_flags, struct page **pages,
 | 
								   unsigned int gup_flags, struct page **pages,
 | 
				
			||||||
			   struct vm_area_struct **vmas, int *locked)
 | 
								   struct vm_area_struct **vmas, int *locked)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	/*
 | 
						/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 | 
				
			||||||
	 * This is a placeholder, until the pin functionality is activated.
 | 
						if (WARN_ON_ONCE(gup_flags & FOLL_GET))
 | 
				
			||||||
	 * Until then, just behave like the corresponding get_user_pages*()
 | 
							return -EINVAL;
 | 
				
			||||||
	 * routine.
 | 
					
 | 
				
			||||||
	 */
 | 
						gup_flags |= FOLL_PIN;
 | 
				
			||||||
	return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
 | 
						return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
 | 
				
			||||||
				     vmas, locked);
 | 
									       pages, vmas, locked);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
EXPORT_SYMBOL(pin_user_pages_remote);
 | 
					EXPORT_SYMBOL(pin_user_pages_remote);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * pin_user_pages() - pin user pages in memory for use by other devices
 | 
					 * pin_user_pages() - pin user pages in memory for use by other devices
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * For now, this is a placeholder function, until various call sites are
 | 
					 * @start:	starting user address
 | 
				
			||||||
 * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
 | 
					 * @nr_pages:	number of pages from start to pin
 | 
				
			||||||
 * this is identical to get_user_pages().
 | 
					 * @gup_flags:	flags modifying lookup behaviour
 | 
				
			||||||
 | 
					 * @pages:	array that receives pointers to the pages pinned.
 | 
				
			||||||
 | 
					 *		Should be at least nr_pages long. Or NULL, if caller
 | 
				
			||||||
 | 
					 *		only intends to ensure the pages are faulted in.
 | 
				
			||||||
 | 
					 * @vmas:	array of pointers to vmas corresponding to each page.
 | 
				
			||||||
 | 
					 *		Or NULL if the caller does not require them.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 | 
				
			||||||
 | 
					 * FOLL_PIN is set.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 | 
				
			||||||
 | 
					 * see Documentation/vm/pin_user_pages.rst for details.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
 | 
					 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
 | 
				
			||||||
 * is NOT intended for Case 2 (RDMA: long-term pins).
 | 
					 * is NOT intended for Case 2 (RDMA: long-term pins).
 | 
				
			||||||
| 
						 | 
					@ -2601,11 +2788,12 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
 | 
				
			||||||
		    unsigned int gup_flags, struct page **pages,
 | 
							    unsigned int gup_flags, struct page **pages,
 | 
				
			||||||
		    struct vm_area_struct **vmas)
 | 
							    struct vm_area_struct **vmas)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	/*
 | 
						/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 | 
				
			||||||
	 * This is a placeholder, until the pin functionality is activated.
 | 
						if (WARN_ON_ONCE(gup_flags & FOLL_GET))
 | 
				
			||||||
	 * Until then, just behave like the corresponding get_user_pages*()
 | 
							return -EINVAL;
 | 
				
			||||||
	 * routine.
 | 
					
 | 
				
			||||||
	 */
 | 
						gup_flags |= FOLL_PIN;
 | 
				
			||||||
	return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
 | 
						return __gup_longterm_locked(current, current->mm, start, nr_pages,
 | 
				
			||||||
 | 
									     pages, vmas, gup_flags);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
EXPORT_SYMBOL(pin_user_pages);
 | 
					EXPORT_SYMBOL(pin_user_pages);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -958,6 +958,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
 | 
						WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 | 
				
			||||||
 | 
						if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 | 
				
			||||||
 | 
								 (FOLL_PIN | FOLL_GET)))
 | 
				
			||||||
 | 
							return NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (flags & FOLL_WRITE && !pmd_write(*pmd))
 | 
						if (flags & FOLL_WRITE && !pmd_write(*pmd))
 | 
				
			||||||
		return NULL;
 | 
							return NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -973,7 +978,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 | 
				
			||||||
	 * device mapped pages can only be returned if the
 | 
						 * device mapped pages can only be returned if the
 | 
				
			||||||
	 * caller will manage the page reference count.
 | 
						 * caller will manage the page reference count.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (!(flags & FOLL_GET))
 | 
						if (!(flags & (FOLL_GET | FOLL_PIN)))
 | 
				
			||||||
		return ERR_PTR(-EEXIST);
 | 
							return ERR_PTR(-EEXIST);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
 | 
						pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
 | 
				
			||||||
| 
						 | 
					@ -981,7 +986,8 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 | 
				
			||||||
	if (!*pgmap)
 | 
						if (!*pgmap)
 | 
				
			||||||
		return ERR_PTR(-EFAULT);
 | 
							return ERR_PTR(-EFAULT);
 | 
				
			||||||
	page = pfn_to_page(pfn);
 | 
						page = pfn_to_page(pfn);
 | 
				
			||||||
	get_page(page);
 | 
						if (!try_grab_page(page, flags))
 | 
				
			||||||
 | 
							page = ERR_PTR(-ENOMEM);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return page;
 | 
						return page;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -1101,6 +1107,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 | 
				
			||||||
	if (flags & FOLL_WRITE && !pud_write(*pud))
 | 
						if (flags & FOLL_WRITE && !pud_write(*pud))
 | 
				
			||||||
		return NULL;
 | 
							return NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 | 
				
			||||||
 | 
						if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 | 
				
			||||||
 | 
								 (FOLL_PIN | FOLL_GET)))
 | 
				
			||||||
 | 
							return NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (pud_present(*pud) && pud_devmap(*pud))
 | 
						if (pud_present(*pud) && pud_devmap(*pud))
 | 
				
			||||||
		/* pass */;
 | 
							/* pass */;
 | 
				
			||||||
	else
 | 
						else
 | 
				
			||||||
| 
						 | 
					@ -1112,8 +1123,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * device mapped pages can only be returned if the
 | 
						 * device mapped pages can only be returned if the
 | 
				
			||||||
	 * caller will manage the page reference count.
 | 
						 * caller will manage the page reference count.
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (!(flags & FOLL_GET))
 | 
						if (!(flags & (FOLL_GET | FOLL_PIN)))
 | 
				
			||||||
		return ERR_PTR(-EEXIST);
 | 
							return ERR_PTR(-EEXIST);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
 | 
						pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
 | 
				
			||||||
| 
						 | 
					@ -1121,7 +1134,8 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 | 
				
			||||||
	if (!*pgmap)
 | 
						if (!*pgmap)
 | 
				
			||||||
		return ERR_PTR(-EFAULT);
 | 
							return ERR_PTR(-EFAULT);
 | 
				
			||||||
	page = pfn_to_page(pfn);
 | 
						page = pfn_to_page(pfn);
 | 
				
			||||||
	get_page(page);
 | 
						if (!try_grab_page(page, flags))
 | 
				
			||||||
 | 
							page = ERR_PTR(-ENOMEM);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return page;
 | 
						return page;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -1497,8 +1511,13 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	page = pmd_page(*pmd);
 | 
						page = pmd_page(*pmd);
 | 
				
			||||||
	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
 | 
						VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (!try_grab_page(page, flags))
 | 
				
			||||||
 | 
							return ERR_PTR(-ENOMEM);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (flags & FOLL_TOUCH)
 | 
						if (flags & FOLL_TOUCH)
 | 
				
			||||||
		touch_pmd(vma, addr, pmd, flags);
 | 
							touch_pmd(vma, addr, pmd, flags);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 | 
						if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
		 * We don't mlock() pte-mapped THPs. This way we can avoid
 | 
							 * We don't mlock() pte-mapped THPs. This way we can avoid
 | 
				
			||||||
| 
						 | 
					@ -1535,8 +1554,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 | 
				
			||||||
skip_mlock:
 | 
					skip_mlock:
 | 
				
			||||||
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 | 
						page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 | 
				
			||||||
	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
 | 
						VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
 | 
				
			||||||
	if (flags & FOLL_GET)
 | 
					 | 
				
			||||||
		get_page(page);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
out:
 | 
					out:
 | 
				
			||||||
	return page;
 | 
						return page;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										54
									
								
								mm/hugetlb.c
									
									
									
									
									
								
							
							
						
						
									
										54
									
								
								mm/hugetlb.c
									
									
									
									
									
								
							| 
						 | 
					@ -4375,19 +4375,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 | 
				
			||||||
		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 | 
							pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 | 
				
			||||||
		page = pte_page(huge_ptep_get(pte));
 | 
							page = pte_page(huge_ptep_get(pte));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/*
 | 
					 | 
				
			||||||
		 * Instead of doing 'try_get_page()' below in the same_page
 | 
					 | 
				
			||||||
		 * loop, just check the count once here.
 | 
					 | 
				
			||||||
		 */
 | 
					 | 
				
			||||||
		if (unlikely(page_count(page) <= 0)) {
 | 
					 | 
				
			||||||
			if (pages) {
 | 
					 | 
				
			||||||
				spin_unlock(ptl);
 | 
					 | 
				
			||||||
				remainder = 0;
 | 
					 | 
				
			||||||
				err = -ENOMEM;
 | 
					 | 
				
			||||||
				break;
 | 
					 | 
				
			||||||
			}
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
		 * If subpage information not requested, update counters
 | 
							 * If subpage information not requested, update counters
 | 
				
			||||||
		 * and skip the same_page loop below.
 | 
							 * and skip the same_page loop below.
 | 
				
			||||||
| 
						 | 
					@ -4405,7 +4392,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 | 
				
			||||||
same_page:
 | 
					same_page:
 | 
				
			||||||
		if (pages) {
 | 
							if (pages) {
 | 
				
			||||||
			pages[i] = mem_map_offset(page, pfn_offset);
 | 
								pages[i] = mem_map_offset(page, pfn_offset);
 | 
				
			||||||
			get_page(pages[i]);
 | 
								/*
 | 
				
			||||||
 | 
								 * try_grab_page() should always succeed here, because:
 | 
				
			||||||
 | 
								 * a) we hold the ptl lock, and b) we've just checked
 | 
				
			||||||
 | 
								 * that the huge page is present in the page tables. If
 | 
				
			||||||
 | 
								 * the huge page is present, then the tail pages must
 | 
				
			||||||
 | 
								 * also be present. The ptl prevents the head page and
 | 
				
			||||||
 | 
								 * tail pages from being rearranged in any way. So this
 | 
				
			||||||
 | 
								 * page must be available at this point, unless the page
 | 
				
			||||||
 | 
								 * refcount overflowed:
 | 
				
			||||||
 | 
								 */
 | 
				
			||||||
 | 
								if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
 | 
				
			||||||
 | 
									spin_unlock(ptl);
 | 
				
			||||||
 | 
									remainder = 0;
 | 
				
			||||||
 | 
									err = -ENOMEM;
 | 
				
			||||||
 | 
									break;
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if (vmas)
 | 
							if (vmas)
 | 
				
			||||||
| 
						 | 
					@ -4965,6 +4967,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 | 
				
			||||||
	struct page *page = NULL;
 | 
						struct page *page = NULL;
 | 
				
			||||||
	spinlock_t *ptl;
 | 
						spinlock_t *ptl;
 | 
				
			||||||
	pte_t pte;
 | 
						pte_t pte;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 | 
				
			||||||
 | 
						if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 | 
				
			||||||
 | 
								 (FOLL_PIN | FOLL_GET)))
 | 
				
			||||||
 | 
							return NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
retry:
 | 
					retry:
 | 
				
			||||||
	ptl = pmd_lockptr(mm, pmd);
 | 
						ptl = pmd_lockptr(mm, pmd);
 | 
				
			||||||
	spin_lock(ptl);
 | 
						spin_lock(ptl);
 | 
				
			||||||
| 
						 | 
					@ -4977,8 +4985,18 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 | 
				
			||||||
	pte = huge_ptep_get((pte_t *)pmd);
 | 
						pte = huge_ptep_get((pte_t *)pmd);
 | 
				
			||||||
	if (pte_present(pte)) {
 | 
						if (pte_present(pte)) {
 | 
				
			||||||
		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
 | 
							page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
 | 
				
			||||||
		if (flags & FOLL_GET)
 | 
							/*
 | 
				
			||||||
			get_page(page);
 | 
							 * try_grab_page() should always succeed here, because: a) we
 | 
				
			||||||
 | 
							 * hold the pmd (ptl) lock, and b) we've just checked that the
 | 
				
			||||||
 | 
							 * huge pmd (head) page is present in the page tables. The ptl
 | 
				
			||||||
 | 
							 * prevents the head page and tail pages from being rearranged
 | 
				
			||||||
 | 
							 * in any way. So this page must be available at this point,
 | 
				
			||||||
 | 
							 * unless the page refcount overflowed:
 | 
				
			||||||
 | 
							 */
 | 
				
			||||||
 | 
							if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
 | 
				
			||||||
 | 
								page = NULL;
 | 
				
			||||||
 | 
								goto out;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
	} else {
 | 
						} else {
 | 
				
			||||||
		if (is_hugetlb_entry_migration(pte)) {
 | 
							if (is_hugetlb_entry_migration(pte)) {
 | 
				
			||||||
			spin_unlock(ptl);
 | 
								spin_unlock(ptl);
 | 
				
			||||||
| 
						 | 
					@ -4999,7 +5017,7 @@ struct page * __weak
 | 
				
			||||||
follow_huge_pud(struct mm_struct *mm, unsigned long address,
 | 
					follow_huge_pud(struct mm_struct *mm, unsigned long address,
 | 
				
			||||||
		pud_t *pud, int flags)
 | 
							pud_t *pud, int flags)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	if (flags & FOLL_GET)
 | 
						if (flags & (FOLL_GET | FOLL_PIN))
 | 
				
			||||||
		return NULL;
 | 
							return NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
 | 
						return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
 | 
				
			||||||
| 
						 | 
					@ -5008,7 +5026,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 | 
				
			||||||
struct page * __weak
 | 
					struct page * __weak
 | 
				
			||||||
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
 | 
					follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	if (flags & FOLL_GET)
 | 
						if (flags & (FOLL_GET | FOLL_PIN))
 | 
				
			||||||
		return NULL;
 | 
							return NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 | 
						return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue