diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1bd8dfcb037b..899fc7f20edd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -19,6 +19,18 @@ union ktime;
 #define FUTEX_TRYLOCK_PI 8
 #define FUTEX_CMP_REQUEUE_PI 9
 
+#define FUTEX_PRIVATE_FLAG 128
+#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG
+
+#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
+#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
@@ -114,8 +126,18 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
  * Don't rearrange members without looking at hash_futex().
  *
  * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
+ * We use the two low order bits of offset to tell what is the kind of key :
+ *       00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
+ *            (no reference on an inode or mm)
+ *       01 : Shared futex (PTHREAD_PROCESS_SHARED)
+ *            mapped on a file (reference on the underlying inode)
+ *       10 : Shared futex (PTHREAD_PROCESS_SHARED)
+ *            (but private mapping on an mm, and reference taken on it)
+*/
+
+#define FUT_OFF_INODE    1 /* We set bit 0 if key has a reference on inode */
+#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+
 union futex_key {
 	u32 __user *uaddr;
 	struct {
@@ -134,7 +156,8 @@ union futex_key {
 		int offset;
 	} both;
 };
-int get_futex_key(u32 __user *uaddr, union futex_key *key);
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *shared,
+		  union futex_key *key);
 void get_futex_key_refs(union futex_key *key);
 void drop_futex_key_refs(union futex_key *key);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 4a60ef55dab4..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
  * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar
  * Copyright (C) 2006 Timesys Corp., Thomas Gleixner
  *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -150,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 		&& key1->both.offset == key2->both.offset);
 }
 
-/*
- * Get parameters which are the keys for a futex.
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ *	&current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
  *
  * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
  * offset_within_page). For private mappings, it's (uaddr, current->mm).
  * We can usually work out the index without swapping in the page.
  *
- * Returns: 0, or negative error code.
- * The key words are stored in *key on success.
- *
- * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ * fshared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to &current->mm->mmap_sem and
+ * caller must have taken the reader lock. but NOT any spinlocks.
  */
-int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+		  union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -174,10 +184,24 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * The futex address must be "naturally" aligned.
 	 */
 	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((key->both.offset % sizeof(u32)) != 0))
+	if (unlikely((address % sizeof(u32)) != 0))
 		return -EINVAL;
 	address -= key->both.offset;
 
+	/*
+	 * PROCESS_PRIVATE futexes are fast.
+	 * As the mm cannot disappear under us and the 'key' only needs
+	 * virtual address, we dont even have to find the underlying vma.
+	 * Note : We do have to check 'uaddr' is a valid user address,
+	 * but access_ok() should be faster than find_vma()
+	 */
+	if (!fshared) {
+		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+			return -EFAULT;
+		key->private.mm = mm;
+		key->private.address = address;
+		return 0;
+	}
 	/*
 	 * The futex is hashed differently depending on whether
 	 * it's in a shared or private mapping. So check vma first.
@@ -205,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * mappings of _writable_ handles.
 	 */
 	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
 		return 0;
@@ -214,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * Linear file mappings are also simple.
 	 */
 	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
-	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
@@ -242,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
  * Take a reference to the resource addressed by a key.
  * Can be called while holding spinlocks.
  *
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
  */
 inline void get_futex_key_refs(union futex_key *key)
 {
-	if (key->both.ptr != 0) {
-		if (key->both.offset & 1)
+	if (key->both.ptr == 0)
+		return;
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+		case FUT_OFF_INODE:
 			atomic_inc(&key->shared.inode->i_count);
-		else
+			break;
+		case FUT_OFF_MMSHARED:
 			atomic_inc(&key->private.mm->mm_count);
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -262,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
  */
 void drop_futex_key_refs(union futex_key *key)
 {
-	if (key->both.ptr != 0) {
-		if (key->both.offset & 1)
+	if (key->both.ptr == 0)
+		return;
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+		case FUT_OFF_INODE:
 			iput(key->shared.inode);
-		else
+			break;
+		case FUT_OFF_MMSHARED:
 			mmdrop(key->private.mm);
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -283,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 }
 
 /*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * if fshared is non NULL, current->mm->mmap_sem is already held
  */
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address,
+			      struct rw_semaphore *fshared, int attempt)
 {
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
+	int ret = -EFAULT;
 
-	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
-	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
-		return -EFAULT;
+	if (attempt > 2)
+		return ret;
 
-	switch (handle_mm_fault(mm, vma, address, 1)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	default:
-		return -EFAULT;
+	if (!fshared)
+		down_read(&mm->mmap_sem);
+	vma = find_vma(mm, address);
+	if (vma && address >= vma->vm_start &&
+	    (vma->vm_flags & VM_WRITE)) {
+		switch (handle_mm_fault(mm, vma, address, 1)) {
+		case VM_FAULT_MINOR:
+			ret = 0;
+			current->min_flt++;
+			break;
+		case VM_FAULT_MAJOR:
+			ret = 0;
+			current->maj_flt++;
+			break;
+		}
 	}
-	return 0;
+	if (!fshared)
+		up_read(&mm->mmap_sem);
+	return ret;
 }
 
 /*
@@ -647,7 +688,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+		      int nr_wake)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -655,9 +697,10 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 	union futex_key key;
 	int ret;
 
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -679,7 +722,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 	spin_unlock(&hb->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -746,7 +790,9 @@ retry:
  * and requeue the next nr_requeue waiters following hashed on
  * one physical page to another physical page (PI-futex uaddr2)
  */
-static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue_pi(u32 __user *uaddr1,
+			    struct rw_semaphore *fshared,
+			    u32 __user *uaddr2,
 			    int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
@@ -765,12 +811,13 @@ retry:
 	/*
 	 * First take all the futex related locks:
 	 */
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -793,7 +840,8 @@ retry:
 		 * If we would have faulted, release mmap_sem, fault
 		 * it in and start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 
 		ret = get_user(curval, uaddr1);
 
@@ -927,7 +975,8 @@ out_unlock:
 	drop_futex_key_refs(&key1);
 
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -936,7 +985,8 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+	      u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1, key2;
@@ -946,12 +996,13 @@ futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
 	int ret, op_ret, attempt = 0;
 
 retryfull:
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -991,11 +1042,10 @@ retry:
 		 * still holding the mmap_sem.
 		 */
 		if (attempt++) {
-			if (futex_handle_fault((unsigned long)uaddr2,
-					       attempt)) {
-				ret = -EFAULT;
+			ret = futex_handle_fault((unsigned long)uaddr2,
+						 fshared, attempt);
+			if (ret)
 				goto out;
-			}
 			goto retry;
 		}
 
@@ -1003,7 +1053,8 @@ retry:
 		 * If we would have faulted, release mmap_sem,
 		 * fault it in and start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 
 		ret = get_user(dummy, uaddr2);
 		if (ret)
@@ -1040,7 +1091,8 @@ retry:
 	if (hb1 != hb2)
 		spin_unlock(&hb2->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -1048,7 +1100,8 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
 */
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+			 u32 __user *uaddr2,
 			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
@@ -1058,12 +1111,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 	int ret, drop_count = 0;
 
 retry:
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1086,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 		 * If we would have faulted, release mmap_sem, fault
 		 * it in and start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 
 		ret = get_user(curval, uaddr1);
 
@@ -1139,7 +1194,8 @@ out_unlock:
 	drop_futex_key_refs(&key1);
 
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -1273,7 +1329,8 @@ static void unqueue_me_pi(struct futex_q *q)
  * The cur->mm semaphore must be held, it is released at return of this
  * function.
  */
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
+				struct futex_q *q,
 				struct futex_hash_bucket *hb,
 				struct task_struct *curr)
 {
@@ -1300,7 +1357,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(q);
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	/*
 	 * We own it, so we have to replace the pending owner
 	 * TID. This must be atomic as we have preserve the
@@ -1321,8 +1379,15 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	return ret;
 }
 
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'arg3' shared capability
+ */
+#define ARG3_SHARED 1
+
 static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+		      u32 val, ktime_t *abs_time)
 {
 	struct task_struct *curr = current;
 	DECLARE_WAITQUEUE(wait, curr);
@@ -1335,9 +1400,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	q.pi_state = NULL;
 
 retry:
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
@@ -1360,8 +1426,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	 * a wakeup when *uaddr != val on entry to the syscall. This is
 	 * rare, but normal.
 	 *
-	 * We hold the mmap semaphore, so the mapping cannot have changed
-	 * since we looked it up in get_futex_key.
+	 * for shared futexes, we hold the mmap semaphore, so the mapping
+	 * cannot have changed since we looked it up in get_futex_key.
 	 */
 	ret = get_futex_value_locked(&uval, uaddr);
 
@@ -1372,7 +1438,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		 * If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
-		up_read(&curr->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 
 		ret = get_user(uval, uaddr);
 
@@ -1399,7 +1466,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -1469,7 +1537,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		else
 			ret = rt_mutex_timed_lock(lock, to, 1);
 
-		down_read(&curr->mm->mmap_sem);
+		if (fshared)
+			down_read(fshared);
 		spin_lock(q.lock_ptr);
 
 		/*
@@ -1486,7 +1555,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 			/* mmap_sem and hash_bucket lock are unlocked at
 			   return of this function */
-			ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+			ret = fixup_pi_state_owner(uaddr, fshared,
+						   &q, hb, curr);
 		} else {
 			/*
 			 * Catch the rare case, where the lock was released
@@ -1499,7 +1569,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		}
 		/* Unqueue and drop the lock */
 		unqueue_me_pi(&q);
-		up_read(&curr->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 	}
 
 	debug_rt_mutex_free_waiter(&q.waiter);
@@ -1528,6 +1599,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 		restart->arg0 = (unsigned long)uaddr;
 		restart->arg1 = (unsigned long)val;
 		restart->arg2 = (unsigned long)abs_time;
+		restart->arg3 = 0;
+		if (fshared)
+			restart->arg3 |= ARG3_SHARED;
 		return -ERESTART_RESTARTBLOCK;
 	}
 
@@ -1535,7 +1609,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	queue_unlock(&q, hb);
 
 out_release_sem:
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -1545,9 +1620,12 @@ static long futex_wait_restart(struct restart_block *restart)
 	u32 __user *uaddr = (u32 __user *)restart->arg0;
 	u32 val = (u32)restart->arg1;
 	ktime_t *abs_time = (ktime_t *)restart->arg2;
+	struct rw_semaphore *fshared = NULL;
 
 	restart->fn = do_no_restart_syscall;
-	return (long)futex_wait(uaddr, val, abs_time);
+	if (restart->arg3 & ARG3_SHARED)
+		fshared = &current->mm->mmap_sem;
+	return (long)futex_wait(uaddr, fshared, val, abs_time);
 }
 
 
@@ -1602,8 +1680,8 @@ static void set_pi_futex_owner(struct futex_hash_bucket *hb,
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
 */
-static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
-			 int trylock)
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+			 int detect, ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct task_struct *curr = current;
@@ -1624,9 +1702,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	q.pi_state = NULL;
 
 retry:
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
@@ -1747,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	WARN_ON(!q.pi_state);
 	/*
@@ -1761,7 +1841,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 			ret = ret ? 0 : -EWOULDBLOCK;
 	}
 
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 	spin_lock(q.lock_ptr);
 
 	/*
@@ -1770,7 +1851,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 */
 	if (!ret && q.pi_state->owner != curr)
 		/* mmap_sem is unlocked at return of this function */
-		ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+		ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
 	else {
 		/*
 		 * Catch the rare case, where the lock was released
@@ -1783,7 +1864,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 		}
 		/* Unqueue and drop the lock */
 		unqueue_me_pi(&q);
-		up_read(&curr->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 	}
 
 	if (!detect && ret == -EDEADLK && 0)
@@ -1795,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	queue_unlock(&q, hb);
 
 out_release_sem:
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 
 uaddr_faulted:
@@ -1806,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
+		ret = futex_handle_fault((unsigned long)uaddr, fshared,
+					 attempt);
+		if (ret)
 			goto out_unlock_release_sem;
-		}
 		goto retry_locked;
 	}
 
 	queue_unlock(&q, hb);
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
@@ -1828,7 +1912,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -1848,9 +1932,10 @@ retry:
 	/*
 	 * First take all the futex related locks:
 	 */
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1909,7 +1994,8 @@ retry_locked:
 out_unlock:
 	spin_unlock(&hb->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	return ret;
 
@@ -1921,15 +2007,16 @@ pi_faulted:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
+		ret = futex_handle_fault((unsigned long)uaddr, fshared,
+					 attempt);
+		if (ret)
 			goto out_unlock;
-		}
 		goto retry_locked;
 	}
 
 	spin_unlock(&hb->lock);
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
@@ -1981,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	struct futex_q *q;
 	struct file *filp;
 	int ret, err;
+	struct rw_semaphore *fshared;
 	static unsigned long printk_interval;
 
 	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -2022,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	}
 	q->pi_state = NULL;
 
-	down_read(&current->mm->mmap_sem);
-	err = get_futex_key(uaddr, &q->key);
+	fshared = &current->mm->mmap_sem;
+	down_read(fshared);
+	err = get_futex_key(uaddr, fshared, &q->key);
 
 	if (unlikely(err != 0)) {
-		up_read(&current->mm->mmap_sem);
+		up_read(fshared);
 		kfree(q);
 		goto error;
 	}
@@ -2038,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 	filp->private_data = q;
 
 	queue_me(q, ret, filp);
-	up_read(&current->mm->mmap_sem);
+	up_read(fshared);
 
 	/* Now we map fd to filp, so userspace can access it */
 	fd_install(ret, filp);
@@ -2167,7 +2256,7 @@ retry:
 		 */
 		if (!pi) {
 			if (uval & FUTEX_WAITERS)
-				futex_wake(uaddr, 1);
+				futex_wake(uaddr, &curr->mm->mmap_sem, 1);
 		}
 	}
 	return 0;
@@ -2223,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
 		return;
 
 	if (pending)
-		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 
 	while (entry != &head->list) {
 		/*
@@ -2253,38 +2343,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
 	int ret;
+	int cmd = op & FUTEX_CMD_MASK;
+	struct rw_semaphore *fshared = NULL;
 
-	switch (op) {
+	if (!(op & FUTEX_PRIVATE_FLAG))
+		fshared = &current->mm->mmap_sem;
+
+	switch (cmd) {
 	case FUTEX_WAIT:
-		ret = futex_wait(uaddr, val, timeout);
+		ret = futex_wait(uaddr, fshared, val, timeout);
 		break;
 	case FUTEX_WAKE:
-		ret = futex_wake(uaddr, val);
+		ret = futex_wake(uaddr, fshared, val);
 		break;
 	case FUTEX_FD:
 		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
 		ret = futex_fd(uaddr, val);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
 		break;
 	case FUTEX_CMP_REQUEUE:
-		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
 		break;
 	case FUTEX_WAKE_OP:
-		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+		ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
 		break;
 	case FUTEX_LOCK_PI:
-		ret = futex_lock_pi(uaddr, val, timeout, 0);
+		ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
 		break;
 	case FUTEX_UNLOCK_PI:
-		ret = futex_unlock_pi(uaddr);
+		ret = futex_unlock_pi(uaddr, fshared);
 		break;
 	case FUTEX_TRYLOCK_PI:
-		ret = futex_lock_pi(uaddr, 0, timeout, 1);
+		ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
 		break;
 	case FUTEX_CMP_REQUEUE_PI:
-		ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+		ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
 		break;
 	default:
 		ret = -ENOSYS;
@@ -2300,23 +2395,24 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	struct timespec ts;
 	ktime_t t, *tp = NULL;
 	u32 val2 = 0;
+	int cmd = op & FUTEX_CMD_MASK;
 
-	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
+	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
 		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&ts))
 			return -EINVAL;
 
 		t = timespec_to_ktime(ts);
-		if (op == FUTEX_WAIT)
+		if (cmd == FUTEX_WAIT)
 			t = ktime_add(ktime_get(), t);
 		tp = &t;
 	}
 	/*
-	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
-	    || op == FUTEX_CMP_REQUEUE_PI)
+	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
+	    || cmd == FUTEX_CMP_REQUEUE_PI)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
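
Usage note (not part of the patch): userspace selects the new PROCESS_PRIVATE fast path simply by OR-ing FUTEX_PRIVATE_FLAG into the futex command, which is what the FUTEX_*_PRIVATE constants above encode. Below is a minimal sketch of a private wait/wake pair, assuming the new constants are visible to userspace via <linux/futex.h>; the helper names futex_wait_private() and futex_wake_private() are hypothetical, used only for illustration.

	#include <linux/futex.h>	/* FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE (added by this patch) */
	#include <sys/syscall.h>	/* SYS_futex */
	#include <unistd.h>		/* syscall() */

	/* Illustrative helper: block while *uaddr == expected.  With the
	 * _PRIVATE command the kernel skips the mmap_sem/find_vma() work
	 * and keys the futex on (current->mm, uaddr) only. */
	static int futex_wait_private(unsigned int *uaddr, unsigned int expected)
	{
		return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, expected,
			       NULL, NULL, 0);
	}

	/* Illustrative helper: wake up to nr_wake waiters queued on the
	 * same process-private key. */
	static int futex_wake_private(unsigned int *uaddr, int nr_wake)
	{
		return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, nr_wake,
			       NULL, NULL, 0);
	}

Both sides of a PTHREAD_PROCESS_PRIVATE lock must agree on the flag: a waiter queued with FUTEX_WAIT_PRIVATE hashes on the (mm, address) key with the two low offset bits clear, so it is not matched by a plain shared-key FUTEX_WAKE on the same address, since do_futex() only passes a NULL fshared when FUTEX_PRIVATE_FLAG is set.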