forked from mirrors/linux
		
	When a ptp chardev (like /dev/ptp0) is open but the underlying
device is removed, closing this file leads to a race. This reproduces
easily in a kvm virtual machine:
ts# cat openptp0.c
int main() { ... fp = fopen("/dev/ptp0", "r"); ... sleep(10); }
ts# uname -r
5.5.0-rc3-46cf053e
ts# cat /proc/cmdline
... slub_debug=FZP
ts# modprobe ptp_kvm
ts# ./openptp0 &
[1] 670
opened /dev/ptp0, sleeping 10s...
ts# rmmod ptp_kvm
ts# ls /dev/ptp*
ls: cannot access '/dev/ptp*': No such file or directory
ts# ...woken up
[   48.010809] general protection fault: 0000 [#1] SMP
[   48.012502] CPU: 6 PID: 658 Comm: openptp0 Not tainted 5.5.0-rc3-46cf053e #25
[   48.014624] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
[   48.016270] RIP: 0010:module_put.part.0+0x7/0x80
[   48.017939] RSP: 0018:ffffb3850073be00 EFLAGS: 00010202
[   48.018339] RAX: 000000006b6b6b6b RBX: 6b6b6b6b6b6b6b6b RCX: ffff89a476c00ad0
[   48.018936] RDX: fffff65a08d3ea08 RSI: 0000000000000247 RDI: 6b6b6b6b6b6b6b6b
[   48.019470] ...                                              ^^^ a slub poison
[   48.023854] Call Trace:
[   48.024050]  __fput+0x21f/0x240
[   48.024288]  task_work_run+0x79/0x90
[   48.024555]  do_exit+0x2af/0xab0
[   48.024799]  ? vfs_write+0x16a/0x190
[   48.025082]  do_group_exit+0x35/0x90
[   48.025387]  __x64_sys_exit_group+0xf/0x10
[   48.025737]  do_syscall_64+0x3d/0x130
[   48.026056]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   48.026479] RIP: 0033:0x7f53b12082f6
[   48.026792] ...
[   48.030945] Modules linked in: ptp i6300esb watchdog [last unloaded: ptp_kvm]
[   48.045001] Fixing recursive fault but reboot is needed!
This happens in:
static void __fput(struct file *file)
{   ...
    if (file->f_op->release)
        file->f_op->release(inode, file); <<< cdev is kfree'd here
    if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
             !(mode & FMODE_PATH))) {
        cdev_put(inode->i_cdev); <<< cdev fields are accessed here
Namely:
__fput()
  posix_clock_release()
    kref_put(&clk->kref, delete_clock) <<< the last reference
      delete_clock()
        delete_ptp_clock()
          kfree(ptp) <<< cdev is embedded in ptp
  cdev_put
    module_put(p->owner) <<< *p is kfree'd, bang!
Here cdev is embedded in posix_clock which is embedded in ptp_clock.
The race happens because ptp_clock's lifetime is controlled by two
refcounts: kref and cdev.kobj in posix_clock. This is wrong.
Make ptp_clock's sysfs device a parent of cdev with cdev_device_add()
created especially for such cases. This way the parent device with its
ptp_clock is not released until all references to the cdev are released.
This adds a requirement that an initialized but not exposed struct
device should be provided to posix_clock_register() by a caller instead
of a simple dev_t.
This approach was adopted from the commit 72139dfa24 ("watchdog: Fix
the race between the release of watchdog_core_data and cdev"). See
details of the implementation in the commit 233ed09d7f ("chardev: add
helper function to register char devs with a struct device").
Link: https://lore.kernel.org/linux-fsdevel/20191125125342.6189-1-vdronov@redhat.com/T/#u
Analyzed-by: Stephen Johnston <sjohnsto@redhat.com>
Analyzed-by: Vern Lovejoy <vlovejoy@redhat.com>
Signed-off-by: Vladis Dronov <vdronov@redhat.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
		
	
			
		
			
				
	
	
		
			119 lines
		
	
	
	
		
			3.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			119 lines
		
	
	
	
		
			3.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0-or-later */
 | |
| /*
 | |
|  * posix-clock.h - support for dynamic clock devices
 | |
|  *
 | |
|  * Copyright (C) 2010 OMICRON electronics GmbH
 | |
|  */
 | |
| #ifndef _LINUX_POSIX_CLOCK_H_
 | |
| #define _LINUX_POSIX_CLOCK_H_
 | |
| 
 | |
| #include <linux/cdev.h>
 | |
| #include <linux/fs.h>
 | |
| #include <linux/poll.h>
 | |
| #include <linux/posix-timers.h>
 | |
| #include <linux/rwsem.h>
 | |
| 
 | |
| struct posix_clock;
 | |
| 
 | |
| /**
 | |
|  * struct posix_clock_operations - functional interface to the clock
 | |
|  *
 | |
|  * Every posix clock is represented by a character device. Drivers may
 | |
|  * optionally offer extended capabilities by implementing the
 | |
|  * character device methods. The character device file operations are
 | |
|  * first handled by the clock device layer, then passed on to the
 | |
|  * driver by calling these functions.
 | |
|  *
 | |
|  * @owner:          The clock driver should set to THIS_MODULE
 | |
|  * @clock_adjtime:  Adjust the clock
 | |
|  * @clock_gettime:  Read the current time
 | |
|  * @clock_getres:   Get the clock resolution
 | |
|  * @clock_settime:  Set the current time value
 | |
|  * @open:           Optional character device open method
 | |
|  * @release:        Optional character device release method
 | |
|  * @ioctl:          Optional character device ioctl method
 | |
|  * @read:           Optional character device read method
 | |
|  * @poll:           Optional character device poll method
 | |
|  */
 | |
| struct posix_clock_operations {
 | |
| 	struct module *owner;	/* the clock driver should set this to THIS_MODULE */
 | |
| 
 | |
| 	int  (*clock_adjtime)(struct posix_clock *pc, struct __kernel_timex *tx);	/* adjust the clock */
 | |
| 
 | |
| 	int  (*clock_gettime)(struct posix_clock *pc, struct timespec64 *ts);	/* read the current time */
 | |
| 
 | |
| 	int  (*clock_getres) (struct posix_clock *pc, struct timespec64 *ts);	/* get the clock resolution */
 | |
| 
 | |
| 	int  (*clock_settime)(struct posix_clock *pc,
 | |
| 			      const struct timespec64 *ts);	/* set the current time value */
 | |
| 
 | |
| 	/*
 | |
| 	 * Optional character device methods:
 | |
| 	 */
 | |
| 	long    (*ioctl)   (struct posix_clock *pc,
 | |
| 			    unsigned int cmd, unsigned long arg);
 | |
| 
 | |
| 	int     (*open)    (struct posix_clock *pc, fmode_t f_mode);
 | |
| 
 | |
| 	__poll_t (*poll)   (struct posix_clock *pc,
 | |
| 			    struct file *file, poll_table *wait);
 | |
| 
 | |
| 	int     (*release) (struct posix_clock *pc);
 | |
| 
 | |
| 	ssize_t (*read)    (struct posix_clock *pc,
 | |
| 			    uint flags, char __user *buf, size_t cnt);
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * struct posix_clock - represents a dynamic posix clock
 | |
|  *
 | |
|  * @ops:     Functional interface to the clock
 | |
|  * @cdev:    Character device instance for this clock
 | |
|  * @dev:     Pointer to the clock's device.
 | |
|  * @rwsem:   Protects the 'zombie' field from concurrent access.
 | |
|  * @zombie:  If 'zombie' is true, then the hardware has disappeared.
 | |
|  *
 | |
|  * Drivers should embed their struct posix_clock within a private
 | |
|  * structure, obtaining a reference to it during callbacks using
 | |
|  * container_of().
 | |
|  *
 | |
|  * Drivers should supply an initialized but not exposed struct device
 | |
|  * to posix_clock_register(). It is used to manage lifetime of the
 | |
|  * driver's private structure. Its 'release' field should be set to
 | |
|  * a release function for this private structure.
 | |
|  */
 | |
| struct posix_clock {
 | |
| 	struct posix_clock_operations ops;	/* functional interface to the clock */
 | |
| 	struct cdev cdev;			/* character device instance for this clock */
 | |
| 	struct device *dev;			/* pointer to the clock's device */
 | |
| 	struct rw_semaphore rwsem;		/* protects 'zombie' from concurrent access */
 | |
| 	bool zombie;				/* true once the hardware has disappeared */
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * posix_clock_register() - register a new clock
 | |
|  * @clk:   Pointer to the clock. Caller must provide 'ops' field
 | |
|  * @dev:   Pointer to the initialized device. Caller must provide
 | |
|  *         'release' field
 | |
|  *
 | |
|  * A clock driver calls this function to register itself with the
 | |
|  * clock device subsystem. If 'clk' points to dynamically allocated
 | |
|  * memory, then the caller must provide a 'release' function to free
 | |
|  * that memory.
 | |
|  *
 | |
|  * Returns zero on success, non-zero otherwise.
 | |
|  */
 | |
| int posix_clock_register(struct posix_clock *clk, struct device *dev);
 | |
| 
 | |
| /**
 | |
|  * posix_clock_unregister() - unregister a clock
 | |
|  * @clk: Clock instance previously registered via posix_clock_register()
 | |
|  *
 | |
|  * A clock driver calls this function to remove itself from the clock
 | |
|  * device subsystem. The posix_clock itself will remain (in an
 | |
|  * inactive state) until its reference count drops to zero, at which
 | |
|  * point it will be deallocated with its 'release' method.
 | |
|  */
 | |
| void posix_clock_unregister(struct posix_clock *clk);
 | |
| 
 | |
| #endif
 |