Return-Path: <mbligh@w-mbligh>
X-Sieve: cmu-sieve 2.0
Return-path: <agl@us.ibm.com>
Envelope-to: mbligh@localhost
Delivery-date: Wed, 17 Mar 2004 14:23:35 -0800
Received: from w-mbligh.beaverton.ibm.com
	([127.0.0.1] helo=mail.aracnet.com ident=mbligh)
	by w-mbligh.beaverton.ibm.com with esmtp (Exim 3.35 #1 (Debian))
	id 1B3jRm-0002x1-00
	for <mbligh@localhost>; Wed, 17 Mar 2004 14:23:34 -0800
Received: from psmtp.com (exprod5mx18.postini.com [12.158.34.158])
	by obsidian.spiritone.com (8.12.10/8.12.8) with SMTP id i2HMQSkT010825
	for <mbligh@aracnet.com>; Wed, 17 Mar 2004 14:26:28 -0800
Delivered-To: <mbligh@aracnet.com>
Received: from source ([32.97.110.131]) by exprod5mx18.postini.com ([12.158.34.245]) with SMTP;
	Wed, 17 Mar 2004 17:19:46 EST
Received: from westrelay04.boulder.ibm.com (westrelay04.boulder.ibm.com [9.17.193.32])
	by e33.co.us.ibm.com (8.12.10/8.12.2) with ESMTP id i2HMJjfS792882
	for <mbligh@aracnet.com>; Wed, 17 Mar 2004 17:19:45 -0500
Received: from DYN317989BLD.beaverton.ibm.com (d03av02.boulder.ibm.com [9.17.193.82])
	by westrelay04.boulder.ibm.com (8.12.10/NCO/VER6.6) with ESMTP id i2HMJi3D113458
	for <mbligh@aracnet.com>; Wed, 17 Mar 2004 15:19:45 -0700
Subject: 2.6.4-mjb1 : 760-implicit_hugetlb
From: Adam Litke <agl@us.ibm.com>
To: Martin Bligh <mbligh@aracnet.com>
Content-Type: text/plain
Organization: IBM
Message-Id: <1079561652.5224.1.camel@agtpad>
Mime-Version: 1.0
X-Mailer: Ximian Evolution 1.4.5 
Date: Wed, 17 Mar 2004 14:14:13 -0800
Content-Transfer-Encoding: 7bit
X-Accept: 2.6 or must-fix

This patch adds transparent ("implicit") hugetlb backing for anonymous
mmap()s and SysV shm segments.  A new MAP_HUGETLB flag requests huge
pages explicitly, and four new sysctls (shm-use-hugepages,
mmap-use-hugepages, shm-hugepages-per-file, mmap-hugepages-min-mapping)
allow suitably sized, HPAGE-aligned mappings to be converted
automatically.  When an implicit huge page mapping cannot be satisfied,
do_mmap_pgoff() falls back to normal pages; on ppc64, a failed
hugetlb_get_unmapped_area() also closes a 32-bit task's low huge page
range again.
diff -upN reference/arch/ppc64/mm/hugetlbpage.c current/arch/ppc64/mm/hugetlbpage.c
--- reference/arch/ppc64/mm/hugetlbpage.c	2004-04-29 10:39:11.000000000 -0700
+++ current/arch/ppc64/mm/hugetlbpage.c	2004-04-29 10:39:27.000000000 -0700
@@ -307,6 +307,21 @@ int prepare_hugepage_range(unsigned long
 	return -EINVAL;
 }
 
+int close_32bit_htlbpage_range(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	BUG_ON(mm->context.low_hpages == 0);
+
+	/* Check if any vmas are in the region */
+	vma = find_vma(mm, TASK_HPAGE_BASE_32);
+	if (vma && vma->vm_start < TASK_HPAGE_END_32)
+		return -EBUSY;
+
+	mm->context.low_hpages = 0;
+	return 0;
+}
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma)
 {
@@ -637,8 +652,11 @@ unsigned long hugetlb_get_unmapped_area(
 
 	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (addr + len > end)
+		if (addr + len > end) {
+			if (test_thread_flag(TIF_32BIT))
+				close_32bit_htlbpage_range(current->mm);
 			return -ENOMEM;
+		}
 		if (!vma || (addr + len) <= vma->vm_start)
 			return addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c	2004-04-29 10:39:24.000000000 -0700
+++ current/fs/hugetlbfs/inode.c	2004-04-29 10:39:27.000000000 -0700
@@ -26,12 +26,17 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/err.h>
 
 #include <asm/uaccess.h>
+#include <asm/mman.h>
 
 /* some random number */
 #define HUGETLBFS_MAGIC	0x958458f6
 
+extern int mmap_use_hugepages;
+extern int mmap_hugepages_map_sz;
+
 static struct super_operations hugetlbfs_ops;
 static struct address_space_operations hugetlbfs_aops;
 struct file_operations hugetlbfs_file_operations;
@@ -82,7 +87,7 @@ static int hugetlbfs_file_mmap(struct fi
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 #else
-static unsigned long
+unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
 {
diff -upN reference/include/asm-i386/mman.h current/include/asm-i386/mman.h
--- reference/include/asm-i386/mman.h	2003-10-14 15:50:32.000000000 -0700
+++ current/include/asm-i386/mman.h	2004-04-29 10:39:27.000000000 -0700
@@ -16,6 +16,7 @@
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
 
 #define MAP_GROWSDOWN	0x0100		/* stack-like segment */
+#define MAP_HUGETLB	0x0400		/* Backed by hugetlb pages */
 #define MAP_DENYWRITE	0x0800		/* ETXTBSY */
 #define MAP_EXECUTABLE	0x1000		/* mark it as an executable */
 #define MAP_LOCKED	0x2000		/* pages are locked */
diff -upN reference/include/asm-ppc64/mman.h current/include/asm-ppc64/mman.h
--- reference/include/asm-ppc64/mman.h	2003-10-01 11:48:24.000000000 -0700
+++ current/include/asm-ppc64/mman.h	2004-04-29 10:39:27.000000000 -0700
@@ -26,6 +26,7 @@
 #define MAP_LOCKED	0x80
 
 #define MAP_GROWSDOWN	0x0100		/* stack-like segment */
+#define MAP_HUGETLB	0x0400		/* Backed with hugetlb pages */
 #define MAP_DENYWRITE	0x0800		/* ETXTBSY */
 #define MAP_EXECUTABLE	0x1000		/* mark it as an executable */
 
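For reference, a hypothetical userspace sketch (not part of the patch) of
the explicit interface: the MAP_HUGETLB value is the one added to the two
mman.h files above, the caller needs CAP_IPC_LOCK, and len should be a
multiple of HPAGE_SIZE.

/* Hypothetical example -- not part of this patch. */
#include <sys/mman.h>
#include <stdio.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB	0x0400	/* value added above for i386/ppc64 */
#endif

int main(void)
{
	/* 16MB is a multiple of HPAGE_SIZE on i386 (2MB/4MB) and ppc64 (16MB) */
	size_t len = 16UL << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	munmap(p, len);
	return 0;
}
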
diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h
--- reference/include/linux/hugetlb.h	2004-04-07 14:54:36.000000000 -0700
+++ current/include/linux/hugetlb.h	2004-04-29 10:39:27.000000000 -0700
@@ -50,6 +50,9 @@ mark_mm_hugetlb(struct mm_struct *mm, st
 int prepare_hugepage_range(unsigned long addr, unsigned long len);
 #endif
 
+unsigned long try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long pgoff, unsigned long *flags);
+
 #else /* !CONFIG_HUGETLB_PAGE */
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
@@ -123,12 +126,19 @@ static inline void set_file_hugepages(st
 {
 	file->f_op = &hugetlbfs_file_operations;
 }
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *, unsigned long, unsigned long,
+			  unsigned long, unsigned long);
 #else /* !CONFIG_HUGETLBFS */
 
 #define is_file_hugepages(file)		0
 #define set_file_hugepages(file)	BUG()
 #define hugetlb_zero_setup(size)	ERR_PTR(-ENOSYS)
 
+static inline unsigned long
+hugetlb_get_unmapped_area(struct file * a, unsigned long b, unsigned long c,
+			  unsigned long d, unsigned long e) { return -ENOSYS; }
 #endif /* !CONFIG_HUGETLBFS */
 
 #endif /* _LINUX_HUGETLB_H */
diff -upN reference/include/linux/mman.h current/include/linux/mman.h
--- reference/include/linux/mman.h	2003-10-14 15:50:34.000000000 -0700
+++ current/include/linux/mman.h	2004-04-29 10:39:27.000000000 -0700
@@ -58,6 +58,9 @@ calc_vm_flag_bits(unsigned long flags)
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
+#ifdef CONFIG_HUGETLB_PAGE
+	       _calc_vm_trans(flags, MAP_HUGETLB,    VM_HUGETLB   ) |
+#endif
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }
 
diff -upN reference/include/linux/sysctl.h current/include/linux/sysctl.h
--- reference/include/linux/sysctl.h	2004-04-07 14:54:37.000000000 -0700
+++ current/include/linux/sysctl.h	2004-04-29 10:39:27.000000000 -0700
@@ -131,6 +131,10 @@ enum
 	KERN_PRINTK_RATELIMIT_BURST=61,	/* int: tune printk ratelimiting */
 	KERN_PTY=62,		/* dir: pty driver */
 	KERN_NGROUPS_MAX=63,	/* int: NGROUPS_MAX */
+	KERN_SHMUSEHUGEPAGES=64,       /* int: back shm with huge pages */
+	KERN_MMAPUSEHUGEPAGES=65,      /* int: back anon mmap with huge pages */
+	KERN_HPAGES_PER_FILE=66,       /* int: max huge pages per shm file */
+	KERN_HPAGES_MAP_SZ=67,         /* int: min MB for implicit hugetlb mmap */
 };
 
 
diff -upN reference/ipc/shm.c current/ipc/shm.c
--- reference/ipc/shm.c	2004-04-07 14:54:37.000000000 -0700
+++ current/ipc/shm.c	2004-04-29 10:39:27.000000000 -0700
@@ -32,6 +32,9 @@
 
 #define shm_flags	shm_perm.mode
 
+extern int shm_use_hugepages;
+extern int shm_hugepages_per_file;
+
 static struct file_operations shm_file_operations;
 static struct vm_operations_struct shm_vm_ops;
 
@@ -165,6 +168,31 @@ static struct vm_operations_struct shm_v
 	.nopage	= shmem_nopage,
 };
 
+#ifdef CONFIG_HUGETLBFS
+int shm_with_hugepages(int shmflag, size_t size)
+{
+	/* flag specified explicitly */
+	if (shmflag & SHM_HUGETLB)
+		return 1;
+	/* Are we disabled? */
+	if (!shm_use_hugepages)
+		return 0;
+	/* Must be HPAGE aligned */
+	if (size & ~HPAGE_MASK)
+		return 0;
+	/* Are we under the max per file? */
+	if ((size >> HPAGE_SHIFT) > shm_hugepages_per_file)
+		return 0;
+	/* Do we have enough free huge pages? */
+	if (!is_hugepage_mem_enough(size))
+		return 0;
+
+	return 1;
+}
+#else
+int shm_with_hugepages(int shmflag, size_t size) { return 0; }
+#endif
+
 static int newseg (key_t key, int shmflg, size_t size)
 {
 	int error;
@@ -194,8 +222,10 @@ static int newseg (key_t key, int shmflg
 		return error;
 	}
 
-	if (shmflg & SHM_HUGETLB)
+	if (shm_with_hugepages(shmflg, size)) {
+		shmflg |= SHM_HUGETLB;
 		file = hugetlb_zero_setup(size);
+	}
 	else {
 		sprintf (name, "SYSV%08x", key);
 		file = shmem_file_setup(name, size, VM_ACCOUNT);
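
On the shm side, huge page backing is either requested explicitly or
inferred by shm_with_hugepages() above.  A hypothetical sketch (not part
of the patch) of the explicit form; the SHM_HUGETLB value is the kernel's
existing 04000 flag, which contemporary libcs may not yet export.

/* Hypothetical example -- not part of this patch. */
#include <sys/ipc.h>
#include <sys/shm.h>
#include <stdio.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB	04000	/* kernel flag value */
#endif

int main(void)
{
	size_t size = 16UL << 20;	/* must be a multiple of HPAGE_SIZE */
	int id = shmget(IPC_PRIVATE, size,
			SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);

	if (id < 0) {
		perror("shmget(SHM_HUGETLB)");
		return 1;
	}
	shmctl(id, IPC_RMID, NULL);
	return 0;
}

With shm-use-hugepages=1 the SHM_HUGETLB flag may be omitted: newseg()
above ORs it in whenever the size is HPAGE-aligned, fits under
shm-hugepages-per-file, and enough free huge pages are available.
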
diff -upN reference/kernel/sysctl.c current/kernel/sysctl.c
--- reference/kernel/sysctl.c	2004-04-29 10:39:19.000000000 -0700
+++ current/kernel/sysctl.c	2004-04-29 10:39:27.000000000 -0700
@@ -64,6 +64,8 @@ extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
+extern int shm_use_hugepages, shm_hugepages_per_file;
+extern int mmap_use_hugepages, mmap_hugepages_map_sz;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -615,6 +617,40 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_HUGETLBFS
+	{
+		.ctl_name	= KERN_SHMUSEHUGEPAGES,
+		.procname	= "shm-use-hugepages",
+		.data		= &shm_use_hugepages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_MMAPUSEHUGEPAGES,
+		.procname	= "mmap-use-hugepages",
+		.data		= &mmap_use_hugepages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_HPAGES_PER_FILE,
+		.procname	= "shm-hugepages-per-file",
+		.data		= &shm_hugepages_per_file,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_HPAGES_MAP_SZ,
+		.procname	= "mmap-hugepages-min-mapping",
+		.data		= &mmap_hugepages_map_sz,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
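The knobs land in /proc/sys/kernel under the .procname values above.  A
minimal sketch (not part of the patch) of enabling the feature from
userspace; the 256 written here just restates the compiled-in
mmap_hugepages_map_sz default from mm/mmap.c below.

/* Hypothetical example -- not part of this patch. */
#include <stdio.h>

/* Write a value to one of the sysctls added above. */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* back eligible shm segments and anonymous mmaps with huge pages */
	write_sysctl("/proc/sys/kernel/shm-use-hugepages", "1");
	write_sysctl("/proc/sys/kernel/mmap-use-hugepages", "1");
	/* convert only anonymous mmaps of at least 256MB */
	write_sysctl("/proc/sys/kernel/mmap-hugepages-min-mapping", "256");
	return 0;
}
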
diff -upN reference/mm/mmap.c current/mm/mmap.c
--- reference/mm/mmap.c	2004-04-29 10:39:14.000000000 -0700
+++ current/mm/mmap.c	2004-04-29 10:39:27.000000000 -0700
@@ -21,6 +21,7 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/mount.h>
+#include <linux/err.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
@@ -62,6 +63,9 @@ EXPORT_SYMBOL(sysctl_overcommit_ratio);
 EXPORT_SYMBOL(sysctl_max_map_count);
 EXPORT_SYMBOL(vm_committed_space);
 
+int mmap_use_hugepages = 0;
+int mmap_hugepages_map_sz = 256;
+
 /*
  * Requires inode->i_mapping->i_shared_sem
  */
@@ -474,6 +478,46 @@ static struct vm_area_struct *vma_merge(
 	return NULL;
 }
 
+#ifdef CONFIG_HUGETLBFS
+int mmap_hugetlb_implicit(unsigned long len)
+{
+	/* Are we enabled? */
+	if (!mmap_use_hugepages)
+		return 0;
+	/* Must be HPAGE aligned */
+	if (len & ~HPAGE_MASK)
+		return 0;
+	/* Are we under the minimum size? */
+	if (mmap_hugepages_map_sz
+	    && len < (mmap_hugepages_map_sz << 20))
+		return 0;
+
+	return 1;
+}
+#else
+int mmap_hugetlb_implicit(unsigned long len) { return 0; }
+#endif
+
+unsigned long
+try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long *flags)
+{
+	if (!capable(CAP_IPC_LOCK))
+		return -EPERM;
+
+	if (*flags & MAP_HUGETLB) {
+		return hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
+	}
+
+	if (mmap_hugetlb_implicit(len)) {
+		addr = hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
+		if (!(addr & ~HPAGE_MASK))
+			*flags |= MAP_HUGETLB;
+		return addr;
+	}
+	return -ENOMEM;
+}
+
 /*
  * The caller must hold down_write(current->mm->mmap_sem).
  */
@@ -490,7 +534,8 @@ unsigned long do_mmap_pgoff(struct file 
 	int error;
 	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
-	unsigned long charged = 0;
+	unsigned long charged = 0, addr_save = addr;
+	int hugetlb_explicit = (flags & MAP_HUGETLB) != 0;
 
 	if (file) {
 		if (is_file_hugepages(file))
@@ -521,8 +566,14 @@ unsigned long do_mmap_pgoff(struct file 
 
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
+	 * VM_HUGETLB will never appear in vm_flags when CONFIG_HUGETLB_PAGE
+	 * is unset.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	addr = try_hugetlb_get_unmapped_area(NULL, addr, len, pgoff, &flags);
+	if (!(flags & MAP_HUGETLB))
+hugetlb_fallback:
+		addr = get_unmapped_area(file, addr_save, len, pgoff, flags);
+
 	if (addr & ~PAGE_MASK)
 		return addr;
 
@@ -671,10 +722,44 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
-	} else if (vm_flags & VM_SHARED) {
-		error = shmem_zero_setup(vma);
-		if (error)
-			goto free_vma;
+	} else if ((vm_flags & VM_SHARED) || (vm_flags & VM_HUGETLB)) {
+		if (!is_vm_hugetlb_page(vma)) {
+			error = shmem_zero_setup(vma);
+			if (error)
+				goto free_vma;
+		} else {
+			/*
+			 * Presumably hugetlb_zero_setup() acquires a
+			 * reference count for us. The difference
+			 * between this and the shmem_zero_setup()
+			 * case is that we can encounter an error
+			 * _after_ allocating the file. The error
+			 * path was adjusted slightly to fput() for us.
+			 */
+			struct file *new_file = hugetlb_zero_setup(len);
+			if (IS_ERR(new_file)) {
+				if (hugetlb_explicit) {
+					error = PTR_ERR(new_file);
+					goto free_vma;
+				} else {
+					/*
+					 * We tried an implicit hugetlb mmap
+					 * but we failed to get the pages.
+					 * We basically have to start over.
+					 */
+					flags &= ~MAP_HUGETLB;
+					kmem_cache_free(vm_area_cachep, vma);
+					if (charged)
+						vm_unacct_memory(charged);
+					goto hugetlb_fallback;
+				}
+			} else {
+				vma->vm_file = new_file;
+				error = new_file->f_op->mmap(new_file, vma);
+				if (error)
+					goto unmap_and_free_vma;
+			}
+		}
 	}
 
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
@@ -722,11 +807,21 @@ out:	
 unmap_and_free_vma:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
-	vma->vm_file = NULL;
-	fput(file);
 
-	/* Undo any partial mapping done by a device driver. */
+	/*
+	 * Undo any partial mapping done by a device driver.
+	 * hugetlb wants to know the vma's file etc., so nuke
+	 * the file afterward.
+	 */
 	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+
+	/*
+	 * vma->vm_file may be different from file in the hugetlb case.
+	 */
+	if (vma->vm_file)
+		fput(vma->vm_file); 
+	vma->vm_file = NULL;
+
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
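
With the mm/mmap.c changes above, the implicit path needs no new
userspace interface at all.  A hypothetical sketch, assuming
mmap-use-hugepages=1, CAP_IPC_LOCK, and the 256MB
mmap-hugepages-min-mapping default:

/* Hypothetical example -- not part of this patch. */
#include <sys/mman.h>

int main(void)
{
	/* 512MB: HPAGE-aligned and above the 256MB minimum-mapping default */
	size_t len = 512UL << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/*
	 * If the huge page pool is too small, do_mmap_pgoff() above
	 * quietly retries via the hugetlb_fallback path, so p ends up
	 * backed by normal pages instead of failing.
	 */
	return p == MAP_FAILED;
}
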
diff -upN reference/mm/shmem.c current/mm/shmem.c
--- reference/mm/shmem.c	2004-04-07 14:54:38.000000000 -0700
+++ current/mm/shmem.c	2004-04-29 10:39:27.000000000 -0700
@@ -40,6 +40,29 @@
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 
+int shm_use_hugepages;
+
+/*
+ * On 64-bit archs the vmalloc area is very large, so we allocate the
+ * per-file page array in vmalloc there; 32-bit archs are bounded by
+ * the max kmalloc size of 128k.
+ *
+ * Assuming 2M pages (x86 and x86-64), these default settings allow
+ * up to 128G of huge pages in a single file on 64-bit archs and 64G
+ * on 32-bit archs.  In practice, tweaking is only needed to go past
+ * 128G of huge pages per file on 64-bit archs.
+ *
+ * This sysctl is in huge page units (each page is HPAGE_SIZE).
+ */
+#ifdef CONFIG_HUGETLBFS
+#if BITS_PER_LONG == 64
+int shm_hugepages_per_file = 128UL << (30 - HPAGE_SHIFT);
+#else
+int shm_hugepages_per_file = 131072 / sizeof(struct page *);
+#endif
+#endif
+
+
 /* This magic number is used in glibc for posix shared memory */
 #define TMPFS_MAGIC	0x01021994