forked from mirrors/linux
		
	net: implement lockless setsockopt(SO_PEEK_OFF)
syzbot reported a lockdep violation [1] involving af_unix
support of SO_PEEK_OFF.
Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
sk_peek_off field), there is really no point to enforce a pointless
thread safety in the kernel.
After this patch :
- setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
- skb_consume_udp() no longer has to acquire the socket lock.
- af_unix no longer needs a special version of sk_set_peek_off(),
  because it does not lock u->iolock anymore.
As a followup, we could replace prot->set_peek_off to be a boolean
and avoid an indirect call, since we always use sk_set_peek_off().
[1]
WARNING: possible circular locking dependency detected
6.8.0-rc4-syzkaller-00267-g0f1dd5e91e2b #0 Not tainted
syz-executor.2/30025 is trying to acquire lock:
 ffff8880765e7d80 (&u->iolock){+.+.}-{3:3}, at: unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789
but task is already holding lock:
 ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1691 [inline]
 ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sockopt_lock_sock net/core/sock.c:1060 [inline]
 ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sk_setsockopt+0xe52/0x3360 net/core/sock.c:1193
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (sk_lock-AF_UNIX){+.+.}-{0:0}:
        lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
        lock_sock_nested+0x48/0x100 net/core/sock.c:3524
        lock_sock include/net/sock.h:1691 [inline]
        __unix_dgram_recvmsg+0x1275/0x12c0 net/unix/af_unix.c:2415
        sock_recvmsg_nosec+0x18e/0x1d0 net/socket.c:1046
        ____sys_recvmsg+0x3c0/0x470 net/socket.c:2801
        ___sys_recvmsg net/socket.c:2845 [inline]
        do_recvmmsg+0x474/0xae0 net/socket.c:2939
        __sys_recvmmsg net/socket.c:3018 [inline]
        __do_sys_recvmmsg net/socket.c:3041 [inline]
        __se_sys_recvmmsg net/socket.c:3034 [inline]
        __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3034
       do_syscall_64+0xf9/0x240
       entry_SYSCALL_64_after_hwframe+0x6f/0x77
-> #0 (&u->iolock){+.+.}-{3:3}:
        check_prev_add kernel/locking/lockdep.c:3134 [inline]
        check_prevs_add kernel/locking/lockdep.c:3253 [inline]
        validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869
        __lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137
        lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
        __mutex_lock_common kernel/locking/mutex.c:608 [inline]
        __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752
        unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789
       sk_setsockopt+0x207e/0x3360
        do_sock_setsockopt+0x2fb/0x720 net/socket.c:2307
        __sys_setsockopt+0x1ad/0x250 net/socket.c:2334
        __do_sys_setsockopt net/socket.c:2343 [inline]
        __se_sys_setsockopt net/socket.c:2340 [inline]
        __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340
       do_syscall_64+0xf9/0x240
       entry_SYSCALL_64_after_hwframe+0x6f/0x77
other info that might help us debug this:
 Possible unsafe locking scenario:
       CPU0                    CPU1
       ----                    ----
  lock(sk_lock-AF_UNIX);
                               lock(&u->iolock);
                               lock(sk_lock-AF_UNIX);
  lock(&u->iolock);
 *** DEADLOCK ***
1 lock held by syz-executor.2/30025:
  #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1691 [inline]
  #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sockopt_lock_sock net/core/sock.c:1060 [inline]
  #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sk_setsockopt+0xe52/0x3360 net/core/sock.c:1193
stack backtrace:
CPU: 0 PID: 30025 Comm: syz-executor.2 Not tainted 6.8.0-rc4-syzkaller-00267-g0f1dd5e91e2b #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
Call Trace:
 <TASK>
  __dump_stack lib/dump_stack.c:88 [inline]
  dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106
  check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187
  check_prev_add kernel/locking/lockdep.c:3134 [inline]
  check_prevs_add kernel/locking/lockdep.c:3253 [inline]
  validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869
  __lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137
  lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
  __mutex_lock_common kernel/locking/mutex.c:608 [inline]
  __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752
  unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789
 sk_setsockopt+0x207e/0x3360
  do_sock_setsockopt+0x2fb/0x720 net/socket.c:2307
  __sys_setsockopt+0x1ad/0x250 net/socket.c:2334
  __do_sys_setsockopt net/socket.c:2343 [inline]
  __se_sys_setsockopt net/socket.c:2340 [inline]
  __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340
 do_syscall_64+0xf9/0x240
 entry_SYSCALL_64_after_hwframe+0x6f/0x77
RIP: 0033:0x7f78a1c7dda9
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f78a0fde0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 00007f78a1dac050 RCX: 00007f78a1c7dda9
RDX: 000000000000002a RSI: 0000000000000001 RDI: 0000000000000006
RBP: 00007f78a1cca47a R08: 0000000000000004 R09: 0000000000000000
R10: 0000000020000180 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000000006e R14: 00007f78a1dac050 R15: 00007ffe5cd81ae8
Fixes: 859051dd16 ("bpf: Implement cgroup sockaddr hooks for unix sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Cc: Daan De Meyer <daan.j.demeyer@gmail.com>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									3b1ae9b71c
								
							
						
					
					
						commit
						56667da739
					
				
					 3 changed files with 15 additions and 34 deletions
				
			
		|  | @ -1188,6 +1188,17 @@ int sk_setsockopt(struct sock *sk, int level, int optname, | ||||||
| 		 */ | 		 */ | ||||||
| 		WRITE_ONCE(sk->sk_txrehash, (u8)val); | 		WRITE_ONCE(sk->sk_txrehash, (u8)val); | ||||||
| 		return 0; | 		return 0; | ||||||
|  | 	case SO_PEEK_OFF: | ||||||
|  | 		{ | ||||||
|  | 		int (*set_peek_off)(struct sock *sk, int val); | ||||||
|  | 
 | ||||||
|  | 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off; | ||||||
|  | 		if (set_peek_off) | ||||||
|  | 			ret = set_peek_off(sk, val); | ||||||
|  | 		else | ||||||
|  | 			ret = -EOPNOTSUPP; | ||||||
|  | 		return ret; | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	sockopt_lock_sock(sk); | 	sockopt_lock_sock(sk); | ||||||
|  | @ -1430,18 +1441,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname, | ||||||
| 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); | 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); | ||||||
| 		break; | 		break; | ||||||
| 
 | 
 | ||||||
| 	case SO_PEEK_OFF: |  | ||||||
| 		{ |  | ||||||
| 		int (*set_peek_off)(struct sock *sk, int val); |  | ||||||
| 
 |  | ||||||
| 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off; |  | ||||||
| 		if (set_peek_off) |  | ||||||
| 			ret = set_peek_off(sk, val); |  | ||||||
| 		else |  | ||||||
| 			ret = -EOPNOTSUPP; |  | ||||||
| 		break; |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 	case SO_NOFCS: | 	case SO_NOFCS: | ||||||
| 		sock_valbool_flag(sk, SOCK_NOFCS, valbool); | 		sock_valbool_flag(sk, SOCK_NOFCS, valbool); | ||||||
| 		break; | 		break; | ||||||
|  |  | ||||||
|  | @ -1589,12 +1589,7 @@ int udp_init_sock(struct sock *sk) | ||||||
| 
 | 
 | ||||||
| void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) | void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) | ||||||
| { | { | ||||||
| 	if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) { | 	sk_peek_offset_bwd(sk, len); | ||||||
| 		bool slow = lock_sock_fast(sk); |  | ||||||
| 
 |  | ||||||
| 		sk_peek_offset_bwd(sk, len); |  | ||||||
| 		unlock_sock_fast(sk, slow); |  | ||||||
| 	} |  | ||||||
| 
 | 
 | ||||||
| 	if (!skb_unref(skb)) | 	if (!skb_unref(skb)) | ||||||
| 		return; | 		return; | ||||||
|  |  | ||||||
|  | @ -782,19 +782,6 @@ static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); | ||||||
| static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, | static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, | ||||||
| 				  int); | 				  int); | ||||||
| 
 | 
 | ||||||
| static int unix_set_peek_off(struct sock *sk, int val) |  | ||||||
| { |  | ||||||
| 	struct unix_sock *u = unix_sk(sk); |  | ||||||
| 
 |  | ||||||
| 	if (mutex_lock_interruptible(&u->iolock)) |  | ||||||
| 		return -EINTR; |  | ||||||
| 
 |  | ||||||
| 	WRITE_ONCE(sk->sk_peek_off, val); |  | ||||||
| 	mutex_unlock(&u->iolock); |  | ||||||
| 
 |  | ||||||
| 	return 0; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_PROC_FS | #ifdef CONFIG_PROC_FS | ||||||
| static int unix_count_nr_fds(struct sock *sk) | static int unix_count_nr_fds(struct sock *sk) | ||||||
| { | { | ||||||
|  | @ -862,7 +849,7 @@ static const struct proto_ops unix_stream_ops = { | ||||||
| 	.read_skb =	unix_stream_read_skb, | 	.read_skb =	unix_stream_read_skb, | ||||||
| 	.mmap =		sock_no_mmap, | 	.mmap =		sock_no_mmap, | ||||||
| 	.splice_read =	unix_stream_splice_read, | 	.splice_read =	unix_stream_splice_read, | ||||||
| 	.set_peek_off =	unix_set_peek_off, | 	.set_peek_off =	sk_set_peek_off, | ||||||
| 	.show_fdinfo =	unix_show_fdinfo, | 	.show_fdinfo =	unix_show_fdinfo, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | @ -886,7 +873,7 @@ static const struct proto_ops unix_dgram_ops = { | ||||||
| 	.read_skb =	unix_read_skb, | 	.read_skb =	unix_read_skb, | ||||||
| 	.recvmsg =	unix_dgram_recvmsg, | 	.recvmsg =	unix_dgram_recvmsg, | ||||||
| 	.mmap =		sock_no_mmap, | 	.mmap =		sock_no_mmap, | ||||||
| 	.set_peek_off =	unix_set_peek_off, | 	.set_peek_off =	sk_set_peek_off, | ||||||
| 	.show_fdinfo =	unix_show_fdinfo, | 	.show_fdinfo =	unix_show_fdinfo, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | @ -909,7 +896,7 @@ static const struct proto_ops unix_seqpacket_ops = { | ||||||
| 	.sendmsg =	unix_seqpacket_sendmsg, | 	.sendmsg =	unix_seqpacket_sendmsg, | ||||||
| 	.recvmsg =	unix_seqpacket_recvmsg, | 	.recvmsg =	unix_seqpacket_recvmsg, | ||||||
| 	.mmap =		sock_no_mmap, | 	.mmap =		sock_no_mmap, | ||||||
| 	.set_peek_off =	unix_set_peek_off, | 	.set_peek_off =	sk_set_peek_off, | ||||||
| 	.show_fdinfo =	unix_show_fdinfo, | 	.show_fdinfo =	unix_show_fdinfo, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Eric Dumazet
						Eric Dumazet