	[PATCH] lightweight robust futexes: core
Add the core infrastructure for robust futexes: structure definitions,
the new syscalls and the do_exit() based cleanup mechanism.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent e9056f13bf
commit 0771dfefc9

6 changed files with 279 additions and 1 deletion
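To make the new ABI concrete before the diff, here is a minimal user-space registration sketch. It is hedged: the two structs are local mirrors of the ABI definitions this patch adds to <linux/futex.h> (user space cannot use the kernel's __user annotation), SYS_set_robust_list is assumed to be available from <sys/syscall.h>, and register_robust_list() is an illustrative name, not part of this patch.

	/* Hypothetical user-space sketch -- mirrors the ABI structs, not part of the patch. */
	#include <stddef.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	struct robust_list { struct robust_list *next; };
	struct robust_list_head {
		struct robust_list list;
		long futex_offset;
		struct robust_list *list_op_pending;
	};

	static __thread struct robust_list_head rhead;

	static long register_robust_list(void)
	{
		rhead.list.next = &rhead.list;	/* empty list points back to itself */
		rhead.futex_offset = 0;		/* layout-dependent: distance from a
						 * list entry to its futex word
						 * (0 here only for illustration) */
		rhead.list_op_pending = NULL;

		/* The kernel accepts exactly one size, sizeof(*head): */
		return syscall(SYS_set_robust_list, &rhead, sizeof(rhead));
	}

Each thread would call this once; sys_get_robust_list() is the debugging counterpart, subject to the euid/CAP_SYS_PTRACE check visible in kernel/futex.c below.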
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 
+#include <linux/sched.h>
+
 /* Second argument to futex syscall */
 
 
@@ -11,10 +13,103 @@
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ */
+
+/*
+ * Per-lock list entry - embedded in user-space locks, somewhere close
+ * to the futex field. (Note: user-space uses a double-linked list to
+ * achieve O(1) list add and remove, but the kernel only needs to know
+ * about the forward link)
+ *
+ * NOTE: this structure is part of the syscall ABI, and must not be
+ * changed.
+ */
+struct robust_list {
+	struct robust_list __user *next;
+};
+
+/*
+ * Per-thread list head:
+ *
+ * NOTE: this structure is part of the syscall ABI, and must only be
+ * changed if the change is first communicated with the glibc folks.
+ * (When an incompatible change is done, we'll increase the structure
+ *  size, which glibc will detect)
+ */
+struct robust_list_head {
+	/*
+	 * The head of the list. Points back to itself if empty:
+	 */
+	struct robust_list list;
+
+	/*
+	 * This relative offset is set by user-space, it gives the kernel
+	 * the relative position of the futex field to examine. This way
+	 * we keep userspace flexible, to freely shape its data-structure,
+	 * without hardcoding any particular offset into the kernel:
+	 */
+	long futex_offset;
+
+	/*
+	 * The death of the thread may race with userspace setting
+	 * up a lock's links. So to handle this race, userspace first
+	 * sets this field to the address of the to-be-taken lock,
+	 * then does the lock acquire, and then adds itself to the
+	 * list, and then clears this field. Hence the kernel will
+	 * always have full knowledge of all locks that the thread
+	 * _might_ have taken. We check the owner TID in any case,
+	 * so only truly owned locks will be handled.
+	 */
+	struct robust_list __user *list_op_pending;
+};
+
+/*
+ * Are there any waiters for this robust futex:
+ */
+#define FUTEX_WAITERS		0x80000000
+
+/*
+ * The kernel signals via this bit that a thread holding a futex
+ * has exited without unlocking the futex. The kernel also does
+ * a FUTEX_WAKE on such futexes, after setting the bit, to wake
+ * up any possible waiters:
+ */
+#define FUTEX_OWNER_DIED	0x40000000
+
+/*
+ * Reserved bit:
+ */
+#define FUTEX_OWNER_PENDING	0x20000000
+
+/*
+ * The rest of the robust-futex field is for the TID:
+ */
+#define FUTEX_TID_MASK		0x1fffffff
+
+/*
+ * A limit of one million locks held per thread (!) ought to be enough
+ * for some time. This also protects against a deliberately circular
+ * list. Not worth introducing an rlimit for this:
+ */
+#define ROBUST_LIST_LIMIT	1048576
+
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
+extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
+
+#ifdef CONFIG_FUTEX
+extern void exit_robust_list(struct task_struct *curr);
+#else
+static inline void exit_robust_list(struct task_struct *curr)
+{
+}
+#endif
+
 #define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
 #define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
 #define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
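The bit layout above implies a specific user-space locking protocol: the low 29 bits (FUTEX_TID_MASK) hold the owner's TID, FUTEX_WAITERS advertises sleepers, and FUTEX_OWNER_DIED is set by the kernel at owner exit. What follows is a hedged sketch of the acquire fast path a glibc-style implementation might use, following the list_op_pending discipline described in the header comment. robust_trylock(), struct my_lock and the singly-linked insertion are illustrative simplifications (real userspace keeps a doubly-linked list for O(1) removal), and the struct mirrors repeat the ABI definitions above.

	/* Illustrative acquire fast path -- an assumption-laden sketch, not this patch. */
	#include <stddef.h>
	#include <stdint.h>

	#define FUTEX_WAITERS		0x80000000
	#define FUTEX_OWNER_DIED	0x40000000

	struct robust_list { struct robust_list *next; };
	struct robust_list_head {
		struct robust_list list;
		long futex_offset;
		struct robust_list *list_op_pending;
	};

	struct my_lock {
		struct robust_list list;	/* linkage the kernel walks at exit */
		uint32_t futex;			/* 0 == free, else owner TID plus flag bits */
	};

	extern __thread struct robust_list_head rhead;	/* registered at thread start;
							 * rhead.futex_offset would be
							 * offsetof(struct my_lock, futex) -
							 * offsetof(struct my_lock, list) */

	/* 0: acquired; 1: previous owner died (recovery needed); -1: contended. */
	static int robust_trylock(struct my_lock *lock, uint32_t tid)
	{
		uint32_t old;

		/* Publish the attempt first: if this thread dies between the
		 * cmpxchg and the list insertion, the kernel still finds the
		 * lock via list_op_pending and can mark it FUTEX_OWNER_DIED. */
		rhead.list_op_pending = &lock->list;

		old = __sync_val_compare_and_swap(&lock->futex, 0, tid);
		if (old != 0) {
			rhead.list_op_pending = NULL;
			if (old & FUTEX_OWNER_DIED)
				return 1;	/* reclaim (not shown): cmpxchg our TID
						 * in place of the dead owner's */
			return -1;		/* contended: fall back to FUTEX_WAIT */
		}

		/* Acquired: link into the per-thread list, then clear the marker. */
		lock->list.next = rhead.list.next;
		rhead.list.next = &lock->list;
		rhead.list_op_pending = NULL;
		return 0;
	}

Unlock reverses the steps: unlink the entry, then atomically clear the TID word, issuing a FUTEX_WAKE if FUTEX_WAITERS was set. The kernel's exit-time walk checks the TID in each word, so entries owned by other threads are skipped.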
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@
 #include <linux/topology.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
+#include <linux/futex.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -872,6 +873,8 @@ struct task_struct {
 	int cpuset_mems_generation;
 	int cpuset_mem_spread_rotor;
#endif
+	struct robust_list_head __user *robust_list;
+
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
 };
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -28,7 +28,8 @@
 #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
 
 /*
- * A maximum of 4 million PIDs should be enough for a while:
+ * A maximum of 4 million PIDs should be enough for a while.
+ * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
  */
 #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
 	(sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
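A quick check of the arithmetic in that note, as a standalone snippet (illustrative only, not part of the patch): FUTEX_TID_MASK leaves 29 bits for the TID, i.e. values up to 536,870,911, comfortably above the 4-million PID_MAX_LIMIT.

	/* Standalone sanity check of the comment's arithmetic. */
	#include <assert.h>

	int main(void)
	{
		assert(0x1fffffff == (1 << 29) - 1);	/* FUTEX_TID_MASK == 2^29 - 1 */
		assert((1 << 29) - 1 == 536870911);	/* the "500+ million" in the note */
		assert(4 * 1024 * 1024 <= 0x1fffffff);	/* PID_MAX_LIMIT fits in the mask */
		return 0;
	}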
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
+#include <linux/futex.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -852,6 +853,8 @@ fastcall NORET_TYPE void do_exit(long code)
 		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
+	if (unlikely(tsk->robust_list))
+		exit_robust_list(tsk);
 	exit_mm(tsk);
 
 	exit_sem(tsk);
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
  *  Removed page pinning, fix privately mapped COW pages and other cleanups
  *  (C) Copyright 2003, 2004 Jamie Lokier
  *
+ *  Robust futex support started by Ingo Molnar
+ *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,174 @@ static int futex_fd(unsigned long uaddr, int signal)
 	goto out;
 }
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list - set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+asmlinkage long
+sys_set_robust_list(struct robust_list_head __user *head,
+		    size_t len)
+{
+	/*
+	 * The kernel knows only one size for now:
+	 */
+	if (unlikely(len != sizeof(*head)))
+		return -EINVAL;
+
+	current->robust_list = head;
+
+	return 0;
+}
+
+/**
+ * sys_get_robust_list - get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+asmlinkage long
+sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+		    size_t __user *len_ptr)
+{
+	struct robust_list_head *head;
+	unsigned long ret;
+
+	if (!pid)
+		head = current->robust_list;
+	else {
+		struct task_struct *p;
+
+		ret = -ESRCH;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (!p)
+			goto err_unlock;
+		ret = -EPERM;
+		if ((current->euid != p->euid) && (current->euid != p->uid) &&
+				!capable(CAP_SYS_PTRACE))
+			goto err_unlock;
+		head = p->robust_list;
+		read_unlock(&tasklist_lock);
+	}
+
+	if (put_user(sizeof(*head), len_ptr))
+		return -EFAULT;
+	return put_user(head, head_ptr);
+
+err_unlock:
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
+{
+	unsigned int futex_val;
+
+repeat:
+	if (get_user(futex_val, uaddr))
+		return -1;
+
+	if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
+		/*
+		 * Ok, this dying thread is truly holding a futex
+		 * of interest. Set the OWNER_DIED bit atomically
+		 * via cmpxchg, and if the value had FUTEX_WAITERS
+		 * set, wake up a waiter (if any). (We have to do a
+		 * futex_wake() even if OWNER_DIED is already set -
+		 * to handle the rare but possible case of recursive
+		 * thread-death.) The rest of the cleanup is done in
+		 * userspace.
+		 */
+		if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
+					 futex_val | FUTEX_OWNER_DIED) !=
+								   futex_val)
+			goto repeat;
+
+		if (futex_val & FUTEX_WAITERS)
+			futex_wake((unsigned long)uaddr, 1);
+	}
+	return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void exit_robust_list(struct task_struct *curr)
+{
+	struct robust_list_head __user *head = curr->robust_list;
+	struct robust_list __user *entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT;
+	unsigned long futex_offset;
+
+	/*
+	 * Fetch the list head (which was registered earlier, via
+	 * sys_set_robust_list()):
+	 */
+	if (get_user(entry, &head->list.next))
+		return;
+	/*
+	 * Fetch the relative futex offset:
+	 */
+	if (get_user(futex_offset, &head->futex_offset))
+		return;
+	/*
+	 * Fetch any possibly pending lock-add first, and handle it
+	 * if it exists:
+	 */
+	if (get_user(pending, &head->list_op_pending))
+		return;
+	if (pending)
+		handle_futex_death((void *)pending + futex_offset, curr);
+
+	while (entry != &head->list) {
+		/*
+		 * A pending lock might already be on the list, so
+		 * don't process it twice:
+		 */
+		if (entry != pending)
+			if (handle_futex_death((void *)entry + futex_offset,
+						curr))
+				return;
+
+		/*
+		 * Fetch the next entry in the list:
+		 */
+		if (get_user(entry, &entry->next))
+			return;
+		/*
+		 * Avoid excessively long or circular lists:
+		 */
+		if (!--limit)
+			break;
+
+		cond_resched();
+	}
+}
+
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 		unsigned long uaddr2, int val2, int val3)
 {
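On the waiter side, handle_futex_death() above only issues a futex_wake() when FUTEX_WAITERS is set, so waiters must keep that bit set while sleeping; after wakeup they can distinguish a normal release from an owner death by testing FUTEX_OWNER_DIED. A hedged waiter-loop sketch follows; futex_wait() and wait_for_owner() are illustrative wrappers, and error handling and memory-ordering details are deliberately omitted.

	/* Illustrative waiter loop -- an assumption-laden sketch, not this patch. */
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#define FUTEX_WAIT		0
	#define FUTEX_WAITERS		0x80000000
	#define FUTEX_OWNER_DIED	0x40000000

	static long futex_wait(uint32_t *uaddr, uint32_t val)
	{
		/* Sleeps only if *uaddr still equals val (no timeout here). */
		return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, NULL, NULL, 0);
	}

	/* Returns the last observed word: 0 means free, FUTEX_OWNER_DIED set
	 * means the kernel's exit-time cleanup fired and recovery is needed. */
	static uint32_t wait_for_owner(uint32_t *futex_word)
	{
		uint32_t val = *(volatile uint32_t *)futex_word;

		while (val != 0 && !(val & FUTEX_OWNER_DIED)) {
			/* Keep FUTEX_WAITERS set while sleeping -- otherwise
			 * handle_futex_death() would skip the futex_wake(). */
			if (!(val & FUTEX_WAITERS) &&
			    !__sync_bool_compare_and_swap(futex_word, val,
							  val | FUTEX_WAITERS)) {
				val = *(volatile uint32_t *)futex_word;
				continue;	/* word changed under us; re-examine */
			}
			futex_wait(futex_word, val | FUTEX_WAITERS);
			val = *(volatile uint32_t *)futex_word;
		}
		return val;
	}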
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);