forked from mirrors/linux
commit 63551ae0fe
A lot of the code in arch/*/mm/hugetlbpage.c is quite similar. This patch attempts to consolidate a lot of the code across the archs, putting the combined version in mm/hugetlb.c. There are a couple of uglyish hacks in order to convert all the hugepage archs, but the result is a very large reduction in the total amount of code. It also means things like hugepage lazy allocation could be implemented in one place, instead of six.

Tested, at least a little, on ppc64, i386 and x86_64.

Notes:
- this patch changes the meaning of set_huge_pte() to be more analogous to set_pte()
- does SH4 need a special huge_ptep_get_and_clear()?

Acked-by: William Lee Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

mm/hugetlb.c (435 lines, 10 KiB, C)
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

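/* Pool counters and the per-node freelists below are protected by hugetlb_lock. */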
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static DEFINE_SPINLOCK(hugetlb_lock);

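/* Add a free hugepage to its node's freelist; caller must hold hugetlb_lock. */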
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

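/*
 * Take a free hugepage, preferring the local node's freelist but falling
 * back to any node with free pages.  Caller must hold hugetlb_lock.
 */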
static struct page *dequeue_huge_page(void)
{
	int nid = numa_node_id();
	struct page *page = NULL;

	if (list_empty(&hugepage_freelists[nid])) {
		for (nid = 0; nid < MAX_NUMNODES; ++nid)
			if (!list_empty(&hugepage_freelists[nid]))
				break;
	}
	if (nid >= 0 && nid < MAX_NUMNODES &&
	    !list_empty(&hugepage_freelists[nid])) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

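/*
 * Allocate a fresh hugepage from the buddy allocator, round-robining the
 * starting node so the pool spreads across the machine.
 */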
static struct page *alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
	}
	return page;
}

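/*
 * Compound page destructor, installed in page[1].mapping by
 * alloc_huge_page(): return a no-longer-referenced hugepage to
 * the free pool.
 */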
void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);
	page[1].mapping = NULL;

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

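/*
 * Take a hugepage from the pool, install free_huge_page() as its
 * compound-page destructor and zero its contents.
 */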
struct page *alloc_huge_page(void)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page();
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_count(page, 1);
	page[1].mapping = (void *)free_huge_page;
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_highpage(&page[i]);
	return page;
}

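/* Populate the initial pool, sized by the hugepages= boot parameter. */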
static int __init hugetlb_init(void)
{
	unsigned long i;
	struct page *page;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		page = alloc_fresh_huge_page();
		if (!page)
			break;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

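/* Parse the hugepages=N boot parameter into max_huge_pages. */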
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
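/*
 * Hand a hugepage back to the buddy allocator, clearing any leftover
 * page flags first.  Caller must hold hugetlb_lock.
 */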
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
		set_page_count(&page[i], 0);
	}
	set_page_count(page, 1);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
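/*
 * When shrinking the pool, free lowmem hugepages preferentially and leave
 * highmem pages queued, since lowmem is the scarcer resource for the rest
 * of the kernel.
 */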
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

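/* Grow or shrink the pool to count pages; returns the resulting pool size. */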
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		struct page *page = alloc_fresh_huge_page();
		if (!page)
			return nr_huge_pages;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

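/* sysctl handler: resize the pool once the new max_huge_pages value is parsed. */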
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
EXPORT_SYMBOL(hugetlb_total_pages);

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

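/* Build a huge PTE for page with the protections implied by the VMA flags. */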
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
{
	pte_t entry;

	if (vma->vm_flags & VM_WRITE) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

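/*
 * Copy a parent's hugetlb PTEs into the child at fork().  Since hugetlb
 * pages are prefaulted and shared, only page refcounts and rss change;
 * no new pages are allocated.
 */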
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;

	while (addr < end) {
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		src_pte = huge_pte_offset(src, addr);
		BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
		entry = *src_pte;
		ptepage = pte_page(entry);
		get_page(ptepage);
		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
		set_huge_pte_at(dst, addr, dst_pte, entry);
		addr += HPAGE_SIZE;
	}
	return 0;

nomem:
	return -ENOMEM;
}

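/*
 * Tear down the huge PTEs in [start, end) and drop the page references
 * they held; callers such as zap_hugepage_range() hold mm->page_table_lock.
 */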
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	for (address = start; address < end; address += HPAGE_SIZE) {
		pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
		if (pte_none(pte))
			continue;
		page = pte_page(pte);
		put_page(page);
	}
	add_mm_counter(mm, rss,  -((end - start) >> PAGE_SHIFT));
	flush_tlb_range(vma, start, end);
}

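/* Locked wrapper around unmap_hugepage_range(). */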
void zap_hugepage_range(struct vm_area_struct *vma,
			unsigned long start, unsigned long length)
{
	struct mm_struct *mm = vma->vm_mm;

	spin_lock(&mm->page_table_lock);
	unmap_hugepage_range(vma, start, start + length);
	spin_unlock(&mm->page_table_lock);
}

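/*
 * Instantiate every hugepage in the mapping up front: hugetlb pages are
 * prefaulted rather than demand-faulted here, taking each page from the
 * page cache or allocating a fresh one from the pool.
 */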
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr;
	int ret = 0;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(vma->vm_start & ~HPAGE_MASK);
	BUG_ON(vma->vm_end & ~HPAGE_MASK);

	hugetlb_prefault_arch_hook(mm);

	spin_lock(&mm->page_table_lock);
	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		unsigned long idx;
		pte_t *pte = huge_pte_alloc(mm, addr);
		struct page *page;

		if (!pte) {
			ret = -ENOMEM;
			goto out;
		}
		if (!pte_none(*pte))
			hugetlb_clean_stale_pgtable(pte);

		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
		page = find_get_page(mapping, idx);
		if (!page) {
			/* charge the fs quota first */
			if (hugetlb_get_quota(mapping)) {
				ret = -ENOMEM;
				goto out;
			}
			page = alloc_huge_page();
			if (!page) {
				hugetlb_put_quota(mapping);
				ret = -ENOMEM;
				goto out;
			}
			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
			if (!ret) {
				unlock_page(page);
			} else {
				hugetlb_put_quota(mapping);
				free_huge_page(page);
				goto out;
			}
		}
		add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
	}
out:
	spin_unlock(&mm->page_table_lock);
	return ret;
}

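/*
 * get_user_pages() support for hugetlb VMAs: walk the huge PTEs and
 * hand back the constituent PAGE_SIZE subpages.
 */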
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	BUG_ON(!is_vm_hugetlb_page(vma));

	vpfn = vaddr/PAGE_SIZE;
	while (vaddr < vma->vm_end && remainder) {

		if (pages) {
			pte_t *pte;
			struct page *page;

			/* Some archs (sparc64, sh*) have multiple
			 * pte_ts to each hugepage.  We have to make
			 * sure we get the first, for the page
			 * indexing below to work. */
			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

			/* hugetlb should be locked, and hence, prefaulted */
			WARN_ON(!pte || pte_none(*pte));

			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

			WARN_ON(!PageCompound(page));

			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}

	*length = remainder;
	*position = vaddr;

	return i;
}