3
0
Fork 0
forked from mirrors/linux

vfs-6.16-rc1.misc

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaDBPTwAKCRCRxhvAZXjc
 om0+AQDMxKLweJXplqQQ7jxuvW2dEa60YpE2EalEKWGg9YA3KgEA3nI4kyKMKn7Y
 PRFXgIcKvhs62oJLKsq8SGQUqExqvAE=
 =atEw
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.16-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull misc vfs updates from Christian Brauner:
 "This contains the usual selections of misc updates for this cycle.

  Features:

   - Use folios for symlinks in the page cache

     FUSE already uses folios for its symlinks. Mirror that conversion
     in the generic code and the NFS code. That lets us get rid of a few
     folio->page->folio conversions in this path, and some of the few
     remaining users of read_cache_page() / read_mapping_page()

   - Try and make a few filesystem operations killable on the VFS
     inode->i_mutex level

   - Add sysctl vfs_cache_pressure_denom for bulk file operations

     Some workloads need to preserve more dentries than we currently
     allow through our sysctl interface

     On HDFS servers with 12 HDDs per server, HDFS datanode startup
     involves scanning all files and caching their metadata (including
     dentries and inodes) in memory. Each HDD contains approximately 2
     million files, resulting in a total of ~20 million cached dentries
     after initialization

     To minimize dentry reclamation, they set vfs_cache_pressure to 1.
     Despite this configuration, memory pressure conditions can still
     trigger reclamation of up to 50% of cached dentries, reducing the
     cache from 20 million to approximately 10 million entries. During
     the subsequent cache rebuild period, any HDFS datanode restart
     operation incurs substantial latency penalties until full cache
     recovery completes

     To maintain service stability, more dentries need to be preserved
     during memory reclamation. The current minimum reclaim ratio (1/100
     of total dentries) remains too aggressive for such workload. This
     patch introduces vfs_cache_pressure_denom for more granular cache
     pressure control

     The configuration [vfs_cache_pressure=1,
     vfs_cache_pressure_denom=10000] effectively maintains the full 20
     million dentry cache under memory pressure, preventing datanode
     restart performance degradation

   - Avoid some jumps in inode_permission() using likely()/unlikely()

   - Avoid a memory access which is most likely a cache miss when
     descending into devcgroup_inode_permission()

   - Add fastpath predicts for stat() and fdput()

   - Anonymous inodes currently don't come with a proper mode, causing
     issues in the kernel when we want to add useful VFS debug asserts.
     Fix that by giving them a proper mode and masking it off when we
     report it to userspace which relies on them not having any mode

   - Anonymous inodes currently allow changing inode attributes because
     the VFS falls back to simple_setattr() if i_op->setattr isn't
     implemented. This means the ownership and mode for every single
     user of anon_inode_inode can be changed. Block that as it's either
     useless or actively harmful. If specific ownership is needed the
     respective subsystem should allocate anonymous inodes from their
     own private superblock

   - Raise SB_I_NODEV and SB_I_NOEXEC on the anonymous inode superblock

   - Add proper tests for anonymous inode behavior

   - Make it easy to detect proper anonymous inodes and to ensure that
     we can detect them in codepaths such as readahead()

  Cleanups:

   - Port pidfs to the new anon_inode_{g,s}etattr() helpers

   - Try to remove the uselib() system call

   - Add unlikely branch hint on return path for poll

   - Add unlikely branch hint on return path for core_sys_select

   - Don't allow signals to interrupt getdents copying for fuse

   - Provide a size hint to dir_context for use during readdir()

   - Use writeback_iter directly in mpage_writepages

   - Update compression and mtime descriptions in initramfs
     documentation

   - Update main netfs API document

   - Remove useless plus one in super_cache_scan()

   - Remove unnecessary NULL-check guards during setns()

   - Add separate {get,put}_cgroup_ns no-op cases

  Fixes:

   - Fix typo in root= kernel parameter description

   - Use KERN_INFO for infof()|info_plog()|infofc()

   - Correct comments of fs_validate_description()

   - Mark an unlikely if condition with unlikely() in
     vfs_parse_monolithic_sep()

   - Delete macro fsparam_u32hex()

   - Remove unused and problematic validate_constant_table()

   - Fix potential unsigned integer underflow in fs_name()

   - Make file-nr output the total allocated file handles"

* tag 'vfs-6.16-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (43 commits)
  fs: Pass a folio to page_put_link()
  nfs: Use a folio in nfs_get_link()
  fs: Convert __page_get_link() to use a folio
  fs/read_write: make default_llseek() killable
  fs/open: make do_truncate() killable
  fs/open: make chmod_common() and chown_common() killable
  include/linux/fs.h: add inode_lock_killable()
  readdir: supply dir_context.count as readdir buffer size hint
  vfs: Add sysctl vfs_cache_pressure_denom for bulk file operations
  fuse: don't allow signals to interrupt getdents copying
  Documentation: fix typo in root= kernel parameter description
  include/cgroup: separate {get,put}_cgroup_ns no-op case
  kernel/nsproxy: remove unnecessary guards
  fs: use writeback_iter directly in mpage_writepages
  fs: remove useless plus one in super_cache_scan()
  fs: add S_ANON_INODE
  fs: remove uselib() system call
  device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission()
  fs/fs_parse: Remove unused and problematic validate_constant_table()
  fs: touch up predicts in inode_permission()
  ...
This commit is contained in:
Linus Torvalds 2025-05-26 09:02:39 -07:00
commit 181d8e399f
47 changed files with 1168 additions and 698 deletions

View file

@ -6268,7 +6268,7 @@
port and the regular usb controller gets disabled.
root= [KNL] Root filesystem
Usually this a a block device specifier of some kind,
Usually this is a block device specifier of some kind,
see the early_lookup_bdev comment in
block/early-lookup.c for details.
Alternatively this can be "ram" for the legacy initial

View file

@ -75,6 +75,7 @@ Currently, these files are in /proc/sys/vm:
- unprivileged_userfaultfd
- user_reserve_kbytes
- vfs_cache_pressure
- vfs_cache_pressure_denom
- watermark_boost_factor
- watermark_scale_factor
- zone_reclaim_mode
@ -1017,19 +1018,28 @@ vfs_cache_pressure
This percentage value controls the tendency of the kernel to reclaim
the memory which is used for caching of directory and inode objects.
At the default value of vfs_cache_pressure=100 the kernel will attempt to
reclaim dentries and inodes at a "fair" rate with respect to pagecache and
swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
never reclaim dentries and inodes due to memory pressure and this can easily
lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
causes the kernel to prefer to reclaim dentries and inodes.
At the default value of vfs_cache_pressure=vfs_cache_pressure_denom the kernel
will attempt to reclaim dentries and inodes at a "fair" rate with respect to
pagecache and swapcache reclaim. Decreasing vfs_cache_pressure causes the
kernel to prefer to retain dentry and inode caches. When vfs_cache_pressure=0,
the kernel will never reclaim dentries and inodes due to memory pressure and
this can easily lead to out-of-memory conditions. Increasing vfs_cache_pressure
beyond vfs_cache_pressure_denom causes the kernel to prefer to reclaim dentries
and inodes.
Increasing vfs_cache_pressure significantly beyond 100 may have negative
performance impact. Reclaim code needs to take various locks to find freeable
directory and inode objects. With vfs_cache_pressure=1000, it will look for
ten times more freeable objects than there are.
Increasing vfs_cache_pressure significantly beyond vfs_cache_pressure_denom may
have negative performance impact. Reclaim code needs to take various locks to
find freeable directory and inode objects. When vfs_cache_pressure equals
(10 * vfs_cache_pressure_denom), it will look for ten times more freeable
objects than there are.
Note: This setting should always be used together with vfs_cache_pressure_denom.
vfs_cache_pressure_denom
========================
Defaults to 100 (minimum allowed value). Requires corresponding
vfs_cache_pressure setting to take effect.
watermark_boost_factor
======================

View file

@ -4,20 +4,18 @@ initramfs buffer format
Al Viro, H. Peter Anvin
Last revision: 2002-01-13
Starting with kernel 2.5.x, the old "initial ramdisk" protocol is
getting {replaced/complemented} with the new "initial ramfs"
(initramfs) protocol. The initramfs contents is passed using the same
memory buffer protocol used by the initrd protocol, but the contents
With kernel 2.5.x, the old "initial ramdisk" protocol was complemented
with an "initial ramfs" protocol. The initramfs content is passed
using the same memory buffer protocol used by initrd, but the content
is different. The initramfs buffer contains an archive which is
expanded into a ramfs filesystem; this document details the format of
the initramfs buffer format.
expanded into a ramfs filesystem; this document details the initramfs
buffer format.
The initramfs buffer format is based around the "newc" or "crc" CPIO
formats, and can be created with the cpio(1) utility. The cpio
archive can be compressed using gzip(1). One valid version of an
initramfs buffer is thus a single .cpio.gz file.
archive can be compressed using gzip(1), or any other algorithm provided
via CONFIG_DECOMPRESS_*. One valid version of an initramfs buffer is
thus a single .cpio.gz file.
The full format of the initramfs buffer is defined by the following
grammar, where::
@ -25,12 +23,20 @@ grammar, where::
* is used to indicate "0 or more occurrences of"
(|) indicates alternatives
+ indicates concatenation
GZIP() indicates the gzip(1) of the operand
GZIP() indicates gzip compression of the operand
BZIP2() indicates bzip2 compression of the operand
LZMA() indicates lzma compression of the operand
XZ() indicates xz compression of the operand
LZO() indicates lzo compression of the operand
LZ4() indicates lz4 compression of the operand
ZSTD() indicates zstd compression of the operand
ALGN(n) means padding with null bytes to an n-byte boundary
initramfs := ("\0" | cpio_archive | cpio_gzip_archive)*
initramfs := ("\0" | cpio_archive | cpio_compressed_archive)*
cpio_gzip_archive := GZIP(cpio_archive)
cpio_compressed_archive := (GZIP(cpio_archive) | BZIP2(cpio_archive)
| LZMA(cpio_archive) | XZ(cpio_archive) | LZO(cpio_archive)
| LZ4(cpio_archive) | ZSTD(cpio_archive))
cpio_archive := cpio_file* + (<nothing> | cpio_trailer)
@ -75,6 +81,8 @@ c_chksum 8 bytes Checksum of data field if c_magic is 070702;
The c_mode field matches the contents of st_mode returned by stat(2)
on Linux, and encodes the file type and file permissions.
c_mtime is ignored unless CONFIG_INITRAMFS_PRESERVE_MTIME=y is set.
The c_filesize should be zero for any file which is not a regular file
or symlink.

View file

@ -671,7 +671,6 @@ The members are as follows:
fsparam_bool() fs_param_is_bool
fsparam_u32() fs_param_is_u32
fsparam_u32oct() fs_param_is_u32_octal
fsparam_u32hex() fs_param_is_u32_hex
fsparam_s32() fs_param_is_s32
fsparam_u64() fs_param_is_u64
fsparam_enum() fs_param_is_enum
@ -753,21 +752,6 @@ process the parameters it is given.
If a match is found, the corresponding value is returned. If a match
isn't found, the not_found value is returned instead.
* ::
bool validate_constant_table(const struct constant_table *tbl,
size_t tbl_size,
int low, int high, int special);
Validate a constant table. Checks that all the elements are appropriately
ordered, that there are no duplicates and that the values are between low
and high inclusive, though provision is made for one allowable special
value outside of that range. If no special value is required, special
should just be set to lie inside the low-to-high range.
If all is good, true is returned. If the table is invalid, errors are
logged to the kernel log buffer and false is returned.
* ::
bool fs_validate_description(const char *name,

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,6 @@ CONFIG_LOCALVERSION="amcore-002"
CONFIG_DEFAULT_HOSTNAME="amcore"
CONFIG_SYSVIPC=y
# CONFIG_FHANDLE is not set
# CONFIG_USELIB is not set
CONFIG_LOG_BUF_SHIFT=14
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
# CONFIG_AIO is not set

View file

@ -1,7 +1,6 @@
CONFIG_WERROR=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_USELIB=y
CONFIG_AUDIT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y

View file

@ -1,6 +1,5 @@
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_USELIB=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_IRQ_TIME_ACCOUNTING=y

View file

@ -24,9 +24,50 @@
#include <linux/uaccess.h>
#include "internal.h"
static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;
/*
* User space expects anonymous inodes to have no file type in st_mode.
*
* In particular, 'lsof' has this legacy logic:
*
* type = s->st_mode & S_IFMT;
* switch (type) {
* ...
* case 0:
* if (!strcmp(p, "anon_inode"))
* Lf->ntype = Ntype = N_ANON_INODE;
*
* to detect our old anon_inode logic.
*
* Rather than mess with our internal sane inode data, just fix it
* up here in getattr() by masking off the format bits.
*/
int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
/*
 * Fill @stat from the inode. Note that &nop_mnt_idmap is passed here
 * rather than the caller's @idmap; @query_flags is not consulted.
 */
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/* Clear the file type bits so the reported mode carries no S_IFMT type. */
stat->mode &= ~S_IFMT;
/* Always reports success. */
return 0;
}
/*
 * Refuse every attribute change: the arguments are not examined and
 * -EOPNOTSUPP is returned unconditionally.
 */
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
return -EOPNOTSUPP;
}
/*
 * Inode operations for anonymous inodes: getattr masks off the file type
 * bits and setattr rejects all changes.
 */
static const struct inode_operations anon_inode_operations = {
.getattr = anon_inode_getattr,
.setattr = anon_inode_setattr,
};
/*
* anon_inodefs_dname() is called from d_path().
*/
@ -45,6 +86,8 @@ static int anon_inodefs_init_fs_context(struct fs_context *fc)
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
if (!ctx)
return -ENOMEM;
fc->s_iflags |= SB_I_NOEXEC;
fc->s_iflags |= SB_I_NODEV;
ctx->dops = &anon_inodefs_dentry_operations;
return 0;
}
@ -66,6 +109,7 @@ static struct inode *anon_inode_make_secure_inode(
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
inode->i_op = &anon_inode_operations;
error = security_inode_init_security_anon(inode, &QSTR(name),
context_inode);
if (error) {
@ -313,6 +357,7 @@ static int __init anon_inode_init(void)
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
anon_inode_inode->i_op = &anon_inode_operations;
return 0;
}

View file

@ -68,12 +68,6 @@
static int load_elf_binary(struct linux_binprm *bprm);
#ifdef CONFIG_USELIB
static int load_elf_library(struct file *);
#else
#define load_elf_library NULL
#endif
/*
* If we don't support core dumping, then supply a NULL so we
* don't even try.
@ -101,7 +95,6 @@ static int elf_core_dump(struct coredump_params *cprm);
static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
@ -1384,75 +1377,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out;
}
#ifdef CONFIG_USELIB
/* This is really simpleminded and specialized - we are loading an
a.out library that is given an ELF header. */
static int load_elf_library(struct file *file)
{
struct elf_phdr *elf_phdata;
struct elf_phdr *eppnt;
int retval, error, i, j;
struct elfhdr elf_ex;
error = -ENOEXEC;
retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0);
if (retval < 0)
goto out;
if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
goto out;
/* First of all, some simple consistency checks */
if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
!elf_check_arch(&elf_ex) || !file->f_op->mmap)
goto out;
if (elf_check_fdpic(&elf_ex))
goto out;
/* Now read in all of the header information */
j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
/* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
error = -ENOMEM;
elf_phdata = kmalloc(j, GFP_KERNEL);
if (!elf_phdata)
goto out;
eppnt = elf_phdata;
error = -ENOEXEC;
retval = elf_read(file, eppnt, j, elf_ex.e_phoff);
if (retval < 0)
goto out_free_ph;
for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
if ((eppnt + i)->p_type == PT_LOAD)
j++;
if (j != 1)
goto out_free_ph;
while (eppnt->p_type != PT_LOAD)
eppnt++;
/* Now use mmap to map the library into memory. */
error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
eppnt,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED_NOREPLACE | MAP_PRIVATE,
0);
if (error != ELF_PAGESTART(eppnt->p_vaddr))
goto out_free_ph;
error = 0;
out_free_ph:
kfree(elf_phdata);
out:
return error;
}
#endif /* #ifdef CONFIG_USELIB */
#ifdef CONFIG_ELF_CORE
/*
* ELF core dumper

View file

@ -74,10 +74,11 @@
* arbitrary, since it's serialized on rename_lock
*/
static int sysctl_vfs_cache_pressure __read_mostly = 100;
static int sysctl_vfs_cache_pressure_denom __read_mostly = 100;
unsigned long vfs_pressure_ratio(unsigned long val)
{
return mult_frac(val, sysctl_vfs_cache_pressure, 100);
return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom);
}
EXPORT_SYMBOL_GPL(vfs_pressure_ratio);
@ -225,6 +226,14 @@ static const struct ctl_table vm_dcache_sysctls[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
.procname = "vfs_cache_pressure_denom",
.data = &sysctl_vfs_cache_pressure_denom,
.maxlen = sizeof(sysctl_vfs_cache_pressure_denom),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE_HUNDRED,
},
};
static int __init init_fs_dcache_sysctls(void)

View file

@ -115,66 +115,6 @@ bool path_noexec(const struct path *path)
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}
#ifdef CONFIG_USELIB
/*
* Note that a shared library must be both readable and executable due to
* security reasons.
*
* Also note that we take the address to load from the file itself.
*/
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
struct linux_binfmt *fmt;
struct file *file;
struct filename *tmp = getname(library);
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
.open_flag = O_LARGEFILE | O_RDONLY,
.acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
if (IS_ERR(tmp))
goto out;
file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
putname(tmp);
error = PTR_ERR(file);
if (IS_ERR(file))
goto out;
/*
* Check do_open_execat() for an explanation.
*/
error = -EACCES;
if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
path_noexec(&file->f_path))
goto exit;
error = -ENOEXEC;
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!fmt->load_shlib)
continue;
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
error = fmt->load_shlib(file);
read_lock(&binfmt_lock);
put_binfmt(fmt);
if (error != -ENOEXEC)
break;
}
read_unlock(&binfmt_lock);
exit:
fput(file);
out:
return error;
}
#endif /* #ifdef CONFIG_USELIB */
#ifdef CONFIG_MMU
/*
* The nascent bprm->mm is not visible until exec_mmap() but it can

View file

@ -284,6 +284,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
.ctx.count = INT_MAX,
.name = name,
};

View file

@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(get_max_files);
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
files_stat.nr_files = get_nr_files();
files_stat.nr_files = percpu_counter_sum_positive(&nr_files);
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

View file

@ -156,15 +156,19 @@ static int fs_index(const char __user * __name)
static int fs_name(unsigned int index, char __user * buf)
{
struct file_system_type * tmp;
int len, res;
int len, res = -EINVAL;
read_lock(&file_systems_lock);
for (tmp = file_systems; tmp; tmp = tmp->next, index--)
if (index <= 0 && try_module_get(tmp->owner))
for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
if (index == 0) {
if (try_module_get(tmp->owner))
res = 0;
break;
}
}
read_unlock(&file_systems_lock);
if (!tmp)
return -EINVAL;
if (res)
return res;
/* OK, we got the reference, so we can safely block */
len = strlen(tmp->name) + 1;

View file

@ -222,7 +222,7 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
char *value = strchr(key, '=');
if (value) {
if (value == key)
if (unlikely(value == key))
continue;
*value++ = 0;
v_len = strlen(value);
@ -449,6 +449,10 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
break;
case 'i':
printk(KERN_INFO "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
break;
default:
printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);

View file

@ -380,58 +380,9 @@ EXPORT_SYMBOL(fs_param_is_path);
#ifdef CONFIG_VALIDATE_FS_PARSER
/**
* validate_constant_table - Validate a constant table
* @tbl: The constant table to validate.
* @tbl_size: The size of the table.
* @low: The lowest permissible value.
* @high: The highest permissible value.
* @special: One special permissible value outside of the range.
*/
bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
int low, int high, int special)
{
size_t i;
bool good = true;
if (tbl_size == 0) {
pr_warn("VALIDATE C-TBL: Empty\n");
return true;
}
for (i = 0; i < tbl_size; i++) {
if (!tbl[i].name) {
pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
good = false;
} else if (i > 0 && tbl[i - 1].name) {
int c = strcmp(tbl[i-1].name, tbl[i].name);
if (c == 0) {
pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
i, tbl[i].name);
good = false;
}
if (c > 0) {
pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
i, tbl[i-1].name, tbl[i].name);
good = false;
}
}
if (tbl[i].value != special &&
(tbl[i].value < low || tbl[i].value > high)) {
pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
i, tbl[i].name, tbl[i].value, low, high);
good = false;
}
}
return good;
}
/**
* fs_validate_description - Validate a parameter description
* @name: The parameter name to search for.
* @desc: The parameter description to validate.
* fs_validate_description - Validate a parameter specification array
* @name: Owner name of the parameter specification array
* @desc: The parameter specification array to validate.
*/
bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc)

View file

@ -1676,7 +1676,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
goto out_err;
}
set_delayed_call(callback, page_put_link, &folio->page);
set_delayed_call(callback, page_put_link, folio);
return folio_address(folio);

View file

@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx,
fuse_add_dirent_to_cache(file, dirent, ctx->pos);
return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
dirent->type);
dirent->type | FILLDIR_FLAG_NOINTR);
}
static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
@ -419,7 +419,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
if (ff->readdir.pos == ctx->pos) {
res = FOUND_SOME;
if (!dir_emit(ctx, dirent->name, dirent->namelen,
dirent->ino, dirent->type))
dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR))
return FOUND_ALL;
ctx->pos = dirent->off;
}

View file

@ -344,3 +344,8 @@ static inline bool path_mounted(const struct path *path)
void file_f_owner_release(struct file *file);
bool file_seek_cur_needs_f_lock(struct file *file);
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags);
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);

View file

@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_fioasync(fd, filp, argp);
case FIOQSIZE:
if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
if (S_ISDIR(inode->i_mode) ||
(S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
S_ISLNK(inode->i_mode)) {
loff_t res = inode_get_bytes(inode);
return copy_to_user(argp, &res, sizeof(res)) ?
@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_file_dedupe_range(filp, argp);
case FIONREAD:
if (!S_ISREG(inode->i_mode))
if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
return vfs_ioctl(filp, cmd, arg);
return put_user(i_size_read(inode) - filp->f_pos,
@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_get_fs_sysfs_path(filp, argp);
default:
if (S_ISREG(inode->i_mode))
if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
return file_ioctl(filp, cmd, argp);
break;
}

View file

@ -1647,10 +1647,16 @@ struct inode *alloc_anon_inode(struct super_block *s)
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
inode->i_mode = S_IRUSR | S_IWUSR;
/*
* Historically anonymous inodes didn't have a type at all and
* userspace has come to rely on this. Internally they're just
* regular files but S_IFREG is masked off when reporting
* information to userspace.
*/
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
inode->i_flags |= S_PRIVATE | S_ANON_INODE;
simple_inode_init_ts(inode);
return inode;
}

View file

@ -445,10 +445,9 @@ static void clean_buffers(struct folio *folio, unsigned first_unmapped)
try_to_free_buffers(folio);
}
static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
void *data)
static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio,
struct mpage_data *mpd)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
@ -656,14 +655,16 @@ mpage_writepages(struct address_space *mapping,
struct mpage_data mpd = {
.get_block = get_block,
};
struct folio *folio = NULL;
struct blk_plug plug;
int ret;
int error;
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
while ((folio = writeback_iter(mapping, wbc, folio, &error)))
error = mpage_write_folio(wbc, folio, &mpd);
if (mpd.bio)
mpage_bio_submit_write(mpd.bio);
blk_finish_plug(&plug);
return ret;
return error;
}
EXPORT_SYMBOL(mpage_writepages);

View file

@ -571,14 +571,14 @@ int inode_permission(struct mnt_idmap *idmap,
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
if (retval)
if (unlikely(retval))
return retval;
if (unlikely(mask & MAY_WRITE)) {
/*
* Nobody gets write access to an immutable file.
*/
if (IS_IMMUTABLE(inode))
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
/*
@ -586,16 +586,16 @@ int inode_permission(struct mnt_idmap *idmap,
* written back improperly if their true value is unknown
* to the vfs.
*/
if (HAS_UNMAPPED_ID(idmap, inode))
if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
return -EACCES;
}
retval = do_inode_permission(idmap, inode, mask);
if (retval)
if (unlikely(retval))
return retval;
retval = devcgroup_inode_permission(inode, mask);
if (retval)
if (unlikely(retval))
return retval;
return security_inode_permission(inode, mask);
@ -1915,13 +1915,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
return ERR_PTR(-ELOOP);
if (!(nd->flags & LOOKUP_RCU)) {
if (unlikely(atime_needs_update(&last->link, inode))) {
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
touch_atime(&last->link);
cond_resched();
} else if (atime_needs_update(&last->link, inode)) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
touch_atime(&last->link);
}
error = security_inode_follow_link(link->dentry, inode,
@ -2434,9 +2434,12 @@ static int link_path_walk(const char *name, struct nameidata *nd)
nd->flags |= LOOKUP_PARENT;
if (IS_ERR(name))
return PTR_ERR(name);
while (*name=='/')
name++;
if (!*name) {
if (*name == '/') {
do {
name++;
} while (unlikely(*name == '/'));
}
if (unlikely(!*name)) {
nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
return 0;
}
@ -2449,7 +2452,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
idmap = mnt_idmap(nd->path.mnt);
err = may_lookup(idmap, nd);
if (err)
if (unlikely(err))
return err;
nd->last.name = name;
@ -5407,25 +5410,25 @@ EXPORT_SYMBOL(vfs_get_link);
static char *__page_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
struct page *page;
struct folio *folio;
struct address_space *mapping = inode->i_mapping;
if (!dentry) {
page = find_get_page(mapping, 0);
if (!page)
folio = filemap_get_folio(mapping, 0);
if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
if (!PageUptodate(page)) {
put_page(page);
if (!folio_test_uptodate(folio)) {
folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
return (char*)page;
folio = read_mapping_folio(mapping, 0, NULL);
if (IS_ERR(folio))
return ERR_CAST(folio);
}
set_delayed_call(callback, page_put_link, page);
set_delayed_call(callback, page_put_link, folio);
BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
return page_address(page);
return folio_address(folio);
}
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
@ -5435,6 +5438,17 @@ const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
}
EXPORT_SYMBOL_GPL(page_get_link_raw);
/**
* page_get_link() - An implementation of the get_link inode_operation.
* @dentry: The directory entry which is the symlink.
* @inode: The inode for the symlink.
* @callback: Used to drop the reference to the symlink.
*
* Filesystems which store their symlinks in the page cache should use
* this to implement the get_link() member of their inode_operations.
*
* Return: A pointer to the NUL-terminated symlink.
*/
const char *page_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
@ -5444,12 +5458,25 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode,
nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
EXPORT_SYMBOL(page_get_link);
/**
* page_put_link() - Drop the reference to the symlink.
* @arg: The folio which contains the symlink.
*
* This is used internally by page_get_link(). It is exported for use
* by filesystems which need to implement a variant of page_get_link()
* themselves. Despite the apparent symmetry, filesystems which use
* page_get_link() do not need to call page_put_link().
*
* The argument, while it has a void pointer type, must be a pointer to
* the folio which was retrieved from the page cache. The delayed_call
* infrastructure is used to drop the reference count once the caller
* is done with the symlink.
*/
void page_put_link(void *arg)
{
put_page(arg);
folio_put(arg);
}
EXPORT_SYMBOL(page_put_link);

View file

@ -40,31 +40,31 @@ static const char *nfs_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
struct page *page;
struct folio *folio;
void *err;
if (!dentry) {
err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
if (err)
return err;
page = find_get_page(inode->i_mapping, 0);
if (!page)
folio = filemap_get_folio(inode->i_mapping, 0);
if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
if (!PageUptodate(page)) {
put_page(page);
if (!folio_test_uptodate(folio)) {
folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
if (err)
return err;
page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
folio = read_cache_folio(&inode->i_data, 0, nfs_symlink_filler,
NULL);
if (IS_ERR(page))
return ERR_CAST(page);
if (IS_ERR(folio))
return ERR_CAST(folio);
}
set_delayed_call(done, page_put_link, page);
return page_address(page);
set_delayed_call(done, page_put_link, folio);
return folio_address(folio);
}
/*

View file

@ -60,7 +60,10 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
if (ret)
newattrs.ia_valid |= ret | ATTR_FORCE;
inode_lock(dentry->d_inode);
ret = inode_lock_killable(dentry->d_inode);
if (ret)
return ret;
/* Note any delegations or leases have already been broken: */
ret = notify_change(idmap, dentry, &newattrs, NULL);
inode_unlock(dentry->d_inode);
@ -635,7 +638,9 @@ int chmod_common(const struct path *path, umode_t mode)
if (error)
return error;
retry_deleg:
inode_lock(inode);
error = inode_lock_killable(inode);
if (error)
goto out_mnt_unlock;
error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
@ -650,6 +655,7 @@ int chmod_common(const struct path *path, umode_t mode)
if (!error)
goto retry_deleg;
}
out_mnt_unlock:
mnt_drop_write(path->mnt);
return error;
}
@ -769,7 +775,9 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
return -EINVAL;
if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
return -EINVAL;
inode_lock(inode);
error = inode_lock_killable(inode);
if (error)
return error;
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
setattr_should_drop_sgid(idmap, inode);

View file

@ -352,6 +352,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.ctx.count = INT_MAX,
.dentry = dentry,
.list = list,
.root = root,
@ -572,6 +573,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct ovl_cache_entry *p, *n;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
.ctx.count = INT_MAX,
.list = list,
.root = root,
};
@ -673,6 +675,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
struct ovl_readdir_translate *rdt =
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
bool res;
if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
@ -687,7 +690,10 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
name, namelen, rdt->xinowarn);
}
return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
ctx->count = orig_ctx->count;
return res;
}
static bool ovl_is_impure_dir(struct file *file)
@ -714,6 +720,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
.ctx.count = ctx->count,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(ofs),
.xinowarn = ovl_xino_warn(ofs),
@ -1074,6 +1081,7 @@ int ovl_check_d_type_supported(const struct path *realpath)
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
.ctx.count = INT_MAX,
.d_type_supported = false,
};
@ -1095,6 +1103,7 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
.ctx.count = INT_MAX,
.list = &list,
};
bool incompat = false;
@ -1179,6 +1188,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
.ctx.count = INT_MAX,
.list = &list,
};

View file

@ -569,36 +569,14 @@ static struct vfsmount *pidfs_mnt __ro_after_init;
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
return -EOPNOTSUPP;
return anon_inode_setattr(idmap, dentry, attr);
}
/*
* User space expects pidfs inodes to have no file type in st_mode.
*
* In particular, 'lsof' has this legacy logic:
*
* type = s->st_mode & S_IFMT;
* switch (type) {
* ...
* case 0:
* if (!strcmp(p, "anon_inode"))
* Lf->ntype = Ntype = N_ANON_INODE;
*
* to detect our old anon_inode logic.
*
* Rather than mess with our internal sane inode data, just fix it
* up here in getattr() by masking off the format bits.
*/
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->mode &= ~S_IFMT;
return 0;
return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
}
static const struct inode_operations pidfs_inode_operations = {
@ -826,7 +804,7 @@ static int pidfs_init_inode(struct inode *inode, void *data)
const struct pid *pid = data;
inode->i_private = data;
inode->i_flags |= S_PRIVATE;
inode->i_flags |= S_PRIVATE | S_ANON_INODE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;

View file

@ -332,7 +332,9 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file_inode(file);
loff_t retval;
inode_lock(inode);
retval = inode_lock_killable(inode);
if (retval)
return retval;
switch (whence) {
case SEEK_END:
offset += i_size_read(inode);

View file

@ -222,6 +222,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
.ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@ -252,7 +253,6 @@ struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
int prev_reclen;
int count;
int error;
};
@ -266,12 +266,16 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
int prev_reclen;
unsigned int flags = d_type;
BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@ -279,7 +283,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
@ -296,7 +300,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
buf->count -= reclen;
ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@ -311,7 +315,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
.count = count,
.ctx.count = count,
.current_dir = dirent
};
int error;
@ -329,7 +333,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
error = count - buf.ctx.count;
}
return error;
}
@ -338,7 +342,6 @@ struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
int prev_reclen;
int count;
int error;
};
@ -351,15 +354,19 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
int prev_reclen;
unsigned int flags = d_type;
BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
if (reclen > ctx->count)
return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *)dirent - prev_reclen;
@ -376,7 +383,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
ctx->count -= reclen;
return true;
efault_end:
@ -392,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
.count = count,
.ctx.count = count,
.current_dir = dirent
};
int error;
@ -411,7 +418,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
error = count - buf.ctx.count;
}
return error;
}
@ -475,6 +482,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
.ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@ -499,7 +507,6 @@ struct compat_getdents_callback {
struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
int prev_reclen;
int count;
int error;
};
@ -513,12 +520,16 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
namlen + 2, sizeof(compat_long_t));
int prev_reclen;
unsigned int flags = d_type;
BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@ -526,7 +537,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
@ -542,7 +553,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@ -557,8 +568,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
.ctx.count = count,
.current_dir = dirent,
.count = count
};
int error;
@ -575,7 +586,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
error = count - buf.ctx.count;
}
return error;
}

View file

@ -630,7 +630,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
ret = -EINVAL;
if (n < 0)
if (unlikely(n < 0))
goto out_nofds;
/* max_fds can increase, so grab it once to avoid race */
@ -857,7 +857,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
int fd = pollfd->fd;
__poll_t mask, filter;
if (fd < 0)
if (unlikely(fd < 0))
return 0;
CLASS(fd, f)(fd);

View file

@ -254,7 +254,7 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
int retval;
retval = security_inode_getattr(path);
if (retval)
if (unlikely(retval))
return retval;
return vfs_getattr_nosec(path, stat, request_mask, query_flags);
}
@ -425,7 +425,7 @@ SYSCALL_DEFINE2(stat, const char __user *, filename,
int error;
error = vfs_stat(filename, &stat);
if (error)
if (unlikely(error))
return error;
return cp_old_stat(&stat, statbuf);
@ -438,7 +438,7 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename,
int error;
error = vfs_lstat(filename, &stat);
if (error)
if (unlikely(error))
return error;
return cp_old_stat(&stat, statbuf);
@ -447,12 +447,13 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename,
SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
{
struct kstat stat;
int error = vfs_fstat(fd, &stat);
int error;
if (!error)
error = cp_old_stat(&stat, statbuf);
error = vfs_fstat(fd, &stat);
if (unlikely(error))
return error;
return error;
return cp_old_stat(&stat, statbuf);
}
#endif /* __ARCH_WANT_OLD_STAT */
@ -506,10 +507,12 @@ SYSCALL_DEFINE2(newstat, const char __user *, filename,
struct stat __user *, statbuf)
{
struct kstat stat;
int error = vfs_stat(filename, &stat);
int error;
if (error)
error = vfs_stat(filename, &stat);
if (unlikely(error))
return error;
return cp_new_stat(&stat, statbuf);
}
@ -520,7 +523,7 @@ SYSCALL_DEFINE2(newlstat, const char __user *, filename,
int error;
error = vfs_lstat(filename, &stat);
if (error)
if (unlikely(error))
return error;
return cp_new_stat(&stat, statbuf);
@ -534,8 +537,9 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
int error;
error = vfs_fstatat(dfd, filename, &stat, flag);
if (error)
if (unlikely(error))
return error;
return cp_new_stat(&stat, statbuf);
}
#endif
@ -543,12 +547,13 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
{
struct kstat stat;
int error = vfs_fstat(fd, &stat);
int error;
if (!error)
error = cp_new_stat(&stat, statbuf);
error = vfs_fstat(fd, &stat);
if (unlikely(error))
return error;
return error;
return cp_new_stat(&stat, statbuf);
}
#endif

View file

@ -201,7 +201,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
total_objects = dentries + inodes + fs_objects + 1;
total_objects = dentries + inodes + fs_objects;
if (!total_objects)
total_objects = 1;

View file

@ -90,7 +90,6 @@ struct linux_binfmt {
struct list_head lh;
struct module *module;
int (*load_binary)(struct linux_binprm *);
int (*load_shlib)(struct file *);
#ifdef CONFIG_COREDUMP
int (*core_dump)(struct coredump_params *cprm);
unsigned long min_coredump; /* minimal dump size */

View file

@ -785,6 +785,17 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
/*
 * get_cgroup_ns() - take an additional reference on a cgroup namespace.
 * @ns: namespace to pin. It is dereferenced unconditionally, so callers
 *      must not pass NULL.
 *
 * Increments the reference count stored at ns->ns.count.
 */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
refcount_inc(&ns->ns.count);
}
/*
 * put_cgroup_ns() - drop a reference on a cgroup namespace.
 * @ns: namespace to release. It is dereferenced unconditionally, so callers
 *      must not pass NULL.
 *
 * Decrements ns->ns.count; when refcount_dec_and_test() returns true the
 * namespace is handed to free_cgroup_ns().
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
	/* The count did not drop to zero: the namespace stays alive. */
	if (!refcount_dec_and_test(&ns->ns.count))
		return;

	free_cgroup_ns(ns);
}
#else /* !CONFIG_CGROUPS */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
@ -795,20 +806,11 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
return old_ns;
}
static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }
#endif /* !CONFIG_CGROUPS */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
if (ns)
refcount_inc(&ns->ns.count);
}
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
if (ns && refcount_dec_and_test(&ns->ns.count))
free_cgroup_ns(ns);
}
#ifdef CONFIG_CGROUPS
void cgroup_enter_frozen(void);

View file

@ -18,15 +18,16 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
short type, access = 0;
if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)))
return 0;
if (likely(!inode->i_rdev))
return 0;
if (S_ISBLK(inode->i_mode))
type = DEVCG_DEV_BLOCK;
else if (S_ISCHR(inode->i_mode))
else /* S_ISCHR by the test above */
type = DEVCG_DEV_CHAR;
else
return 0;
if (mask & MAY_WRITE)
access |= DEVCG_ACC_WRITE;

View file

@ -59,7 +59,7 @@ static inline struct fd CLONED_FD(struct file *f)
static inline void fdput(struct fd fd)
{
if (fd.word & FDPUT_FPUT)
if (unlikely(fd.word & FDPUT_FPUT))
fput(fd_file(fd));
}

View file

@ -866,6 +866,11 @@ static inline void inode_lock(struct inode *inode)
down_write(&inode->i_rwsem);
}
/*
 * inode_lock_killable() - killable counterpart of inode_lock().
 * @inode: inode whose i_rwsem should be taken for writing.
 *
 * Return: the value of down_write_killable() on inode->i_rwsem. Zero means
 * the lock is held; a nonzero value means the lock was NOT acquired and the
 * caller must bail out without unlocking (presumably -EINTR after a fatal
 * signal - confirm against down_write_killable()). The result is
 * __must_check because ignoring it would continue without the lock.
 */
static inline __must_check int inode_lock_killable(struct inode *inode)
{
return down_write_killable(&inode->i_rwsem);
}
static inline void inode_unlock(struct inode *inode)
{
up_write(&inode->i_rwsem);
@ -876,6 +881,11 @@ static inline void inode_lock_shared(struct inode *inode)
down_read(&inode->i_rwsem);
}
/*
 * inode_lock_shared_killable() - killable counterpart of inode_lock_shared().
 * @inode: inode whose i_rwsem should be taken for reading.
 *
 * Return: the value of down_read_killable() on inode->i_rwsem. Zero means
 * the shared lock is held; a nonzero value means the lock was NOT acquired
 * and the caller must bail out without unlocking (presumably -EINTR after a
 * fatal signal - confirm against down_read_killable()). The result is
 * __must_check because ignoring it would continue without the lock.
 */
static inline __must_check int inode_lock_shared_killable(struct inode *inode)
{
return down_read_killable(&inode->i_rwsem);
}
static inline void inode_unlock_shared(struct inode *inode)
{
up_read(&inode->i_rwsem);
@ -2070,8 +2080,18 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
/*
 * struct dir_context - state passed between a directory reader and the
 * filesystem that iterates the directory.
 * @actor: filldir_t callback that is handed each directory entry.
 * @pos:   directory offset; the getdents-style callers store it into the
 *         d_off of the last entry they returned.
 * @count: size hint for the filesystem, see the comment on the member.
 */
struct dir_context {
filldir_t actor;
loff_t pos;
/*
* Filesystems MUST NOT MODIFY count, but may use as a hint:
* 0 unknown
* > 0 space in buffer (assume at least one entry)
* INT_MAX unlimited
*/
int count;
};
/*
 * If OR-ed with d_type, pending signals are not checked.
 *
 * The bit has to stay outside S_DT_MASK: the filldir actors mask d_type
 * with S_DT_MASK to recover the real type after reading this flag.
 */
#define FILLDIR_FLAG_NOINTR 0x1000
/*
* These flags let !MMU mmap() govern direct device mapping vs immediate
* copying more easily for MAP_PRIVATE, especially for ROM filesystems.
@ -2343,6 +2363,7 @@ struct super_operations {
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@ -2399,6 +2420,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
struct inode *inode)

View file

@ -87,14 +87,9 @@ extern int lookup_constant(const struct constant_table tbl[], const char *name,
extern const struct constant_table bool_names[];
#ifdef CONFIG_VALIDATE_FS_PARSER
extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
int low, int high, int special);
extern bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc);
#else
static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
int low, int high, int special)
{ return true; }
static inline bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc)
{ return true; }
@ -125,8 +120,6 @@ static inline bool fs_validate_description(const char *name,
#define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL)
#define fsparam_u32oct(NAME, OPT) \
__fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8)
#define fsparam_u32hex(NAME, OPT) \
__fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *)16)
#define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL)
#define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL)
#define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array)

View file

@ -477,16 +477,6 @@ config CROSS_MEMORY_ATTACH
to directly read from or write to another process' address space.
See the man page for more details.
config USELIB
bool "uselib syscall (for libc5 and earlier)"
default ALPHA || M68K || SPARC
help
This option enables the uselib syscall, a system call used in the
dynamic linker from libc5 and earlier. glibc does not use this
system call. If you intend to run programs built on libc5 or
earlier, you may need to enable this syscall. Current systems
running glibc can safely disable this.
config AUDIT
bool "Auditing support"
depends on NET

View file

@ -128,17 +128,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
if (new_nsp->pid_ns_for_children)
put_pid_ns(new_nsp->pid_ns_for_children);
put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
if (new_nsp->ipc_ns)
put_ipc_ns(new_nsp->ipc_ns);
put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
if (new_nsp->uts_ns)
put_uts_ns(new_nsp->uts_ns);
put_uts_ns(new_nsp->uts_ns);
out_uts:
if (new_nsp->mnt_ns)
put_mnt_ns(new_nsp->mnt_ns);
put_mnt_ns(new_nsp->mnt_ns);
out_ns:
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);
@ -189,18 +185,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
void free_nsproxy(struct nsproxy *ns)
{
if (ns->mnt_ns)
put_mnt_ns(ns->mnt_ns);
if (ns->uts_ns)
put_uts_ns(ns->uts_ns);
if (ns->ipc_ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
if (ns->time_ns)
put_time_ns(ns->time_ns);
if (ns->time_ns_for_children)
put_time_ns(ns->time_ns_for_children);
put_mnt_ns(ns->mnt_ns);
put_uts_ns(ns->uts_ns);
put_ipc_ns(ns->ipc_ns);
put_pid_ns(ns->pid_ns_for_children);
put_time_ns(ns->time_ns);
put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);

View file

@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
CLASS(fd, f)(fd);
struct file *file;
const struct inode *inode;
if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
CLASS(fd, f)(fd);
if (fd_empty(f))
return -EBADF;
file = fd_file(f);
if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
(!S_ISREG(file_inode(fd_file(f))->i_mode) &&
!S_ISBLK(file_inode(fd_file(f))->i_mode)))
if (!file->f_mapping)
return -EINVAL;
if (!file->f_mapping->a_ops)
return -EINVAL;
inode = file_inode(file);
if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
return -EINVAL;
if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);

View file

@ -158,7 +158,6 @@ CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_TUN=y
CONFIG_UNIX=y
CONFIG_UPROBES=y
CONFIG_USELIB=y
CONFIG_USER_NS=y
CONFIG_VETH=y
CONFIG_VLAN_8021Q=y

View file

@ -128,7 +128,6 @@ CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_TUN=y
CONFIG_UNIX=y
CONFIG_UPROBES=y
CONFIG_USELIB=y
CONFIG_USER_NS=y
CONFIG_VETH=y
CONFIG_VLAN_8021Q=y

View file

@ -2,3 +2,4 @@
dnotify_test
devpts_pts
file_stressor
anon_inode_test

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += $(KHDR_INCLUDES)
TEST_GEN_PROGS := devpts_pts file_stressor
TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk

View file

@ -0,0 +1,69 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#define __SANE_USERSPACE_TYPES__
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include "../kselftest_harness.h"
#include "overlayfs/wrappers.h"
/*
 * Ownership changes on the descriptor returned by sys_fsopen() must be
 * refused: fchown() has to fail with errno set to EOPNOTSUPP.
 */
TEST(anon_inode_no_chown)
{
	int fd_context = sys_fsopen("tmpfs", 0);

	ASSERT_GE(fd_context, 0);

	ASSERT_LT(fchown(fd_context, 1234, 5678), 0);
	ASSERT_EQ(errno, EOPNOTSUPP);

	EXPECT_EQ(close(fd_context), 0);
}
/*
 * Mode changes on the descriptor returned by sys_fsopen() must be refused:
 * fchmod() has to fail with errno set to EOPNOTSUPP.
 */
TEST(anon_inode_no_chmod)
{
	int fd_context = sys_fsopen("tmpfs", 0);

	ASSERT_GE(fd_context, 0);

	ASSERT_LT(fchmod(fd_context, 0777), 0);
	ASSERT_EQ(errno, EOPNOTSUPP);

	EXPECT_EQ(close(fd_context), 0);
}
/*
 * Executing the descriptor returned by sys_fsopen() must be refused:
 * execveat() with AT_EMPTY_PATH has to fail with errno set to EACCES.
 */
TEST(anon_inode_no_exec)
{
	int fd_context = sys_fsopen("tmpfs", 0);

	ASSERT_GE(fd_context, 0);

	ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
	ASSERT_EQ(errno, EACCES);

	EXPECT_EQ(close(fd_context), 0);
}
/*
 * Re-opening the descriptor returned by sys_fsopen() through its
 * /proc/self/fd/<n> entry must be refused: open() has to fail with errno
 * set to ENXIO.
 */
TEST(anon_inode_no_open)
{
	/* "/proc/self/fd/" plus any decimal int fits comfortably in 64 bytes. */
	char path[64];
	int fd_context;

	fd_context = sys_fsopen("tmpfs", 0);
	ASSERT_GE(fd_context, 0);

	/*
	 * Derive the procfs path from the descriptor we actually got rather
	 * than dup2()-ing it onto a fixed number: a fixed target silently
	 * closes whatever is already open there and fails outright when
	 * RLIMIT_NOFILE is not larger than that number.
	 */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd_context);

	ASSERT_LT(open(path, 0), 0);
	ASSERT_EQ(errno, ENXIO);

	EXPECT_EQ(close(fd_context), 0);
}
TEST_HARNESS_MAIN