From 23e947091f6039e2897102a7ce41f03f91e8698e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 26 Jan 2024 16:56:37 -0800 Subject: [PATCH] 5.4-stable patches added patches: fs-add-mode_strip_sgid-helper.patch fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch powerpc-use-always-instead-of-always-y-in-for-crtsavres.o.patch --- queue-5.4/fs-add-mode_strip_sgid-helper.patch | 101 +++++ ...sgid-stripping-into-the-vfs_-helpers.patch | 361 ++++++++++++++++++ ...ve-highatomic-page-blocks-before-oom.patch | 104 ----- ...stead-of-always-y-in-for-crtsavres.o.patch | 46 +++ queue-5.4/series | 4 +- 5 files changed, 511 insertions(+), 105 deletions(-) create mode 100644 queue-5.4/fs-add-mode_strip_sgid-helper.patch create mode 100644 queue-5.4/fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch delete mode 100644 queue-5.4/mm-page_alloc-unreserve-highatomic-page-blocks-before-oom.patch create mode 100644 queue-5.4/powerpc-use-always-instead-of-always-y-in-for-crtsavres.o.patch diff --git a/queue-5.4/fs-add-mode_strip_sgid-helper.patch b/queue-5.4/fs-add-mode_strip_sgid-helper.patch new file mode 100644 index 00000000000..e2e241bcba9 --- /dev/null +++ b/queue-5.4/fs-add-mode_strip_sgid-helper.patch @@ -0,0 +1,101 @@ +From stable+bounces-15650-greg=kroah.com@vger.kernel.org Wed Jan 24 05:03:18 2024 +From: Mahmoud Adam +Date: Wed, 24 Jan 2024 14:00:25 +0100 +Subject: fs: add mode_strip_sgid() helper +To: +Cc: , , , , <--suppress-cc=body@amazon.com>, "Darrick J . Wong" , Christian Brauner , Jeff Layton , Amir Goldstein , Mahmoud Adam +Message-ID: <20240124130025.2292-2-mngyadam@amazon.com> + +From: Yang Xu + +commit 2b3416ceff5e6bd4922f6d1c61fb68113dd82302 upstream. + +[remove userns argument of helper for 5.4.y backport] + +Add a dedicated helper to handle the setgid bit when creating a new file +in a setgid directory. This is a preparatory patch for moving setgid +stripping into the vfs. The patch contains no functional changes. 
+ +Currently the setgid stripping logic is open-coded directly in +inode_init_owner() and the individual filesystems are responsible for +handling setgid inheritance. Since this has proven to be brittle as +evidenced by old issues we uncovered over the last months (see [1] to +[3] below) we will try to move this logic into the vfs. + +Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [1] +Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [2] +Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [3] +Link: https://lore.kernel.org/r/1657779088-2242-1-git-send-email-xuyang2018.jy@fujitsu.com +Reviewed-by: Darrick J. Wong +Reviewed-by: Christian Brauner (Microsoft) +Reviewed-and-Tested-by: Jeff Layton +Signed-off-by: Yang Xu +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +[commit 347750e1b69cef62966fbc5bd7dc579b4c00688a upstream + backported from 5.10.y, resolved context conflicts] +Signed-off-by: Mahmoud Adam +Signed-off-by: Greg Kroah-Hartman +--- + fs/inode.c | 34 ++++++++++++++++++++++++++++++---- + include/linux/fs.h | 1 + + 2 files changed, 31 insertions(+), 4 deletions(-) + +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -2100,10 +2100,8 @@ void inode_init_owner(struct inode *inod + /* Directories are special, and always inherit S_ISGID */ + if (S_ISDIR(mode)) + mode |= S_ISGID; +- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && +- !in_group_p(inode->i_gid) && +- !capable_wrt_inode_uidgid(dir, CAP_FSETID)) +- mode &= ~S_ISGID; ++ else ++ mode = mode_strip_sgid(dir, mode); + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +@@ -2359,3 +2357,31 @@ int vfs_ioc_fssetxattr_check(struct inod + return 0; + } + EXPORT_SYMBOL(vfs_ioc_fssetxattr_check); ++ ++/** ++ * mode_strip_sgid - handle the sgid bit for non-directories ++ * @dir: parent directory inode ++ * @mode: mode of the file to be created in @dir 
++ * ++ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit ++ * raised and @dir has the S_ISGID bit raised ensure that the caller is ++ * either in the group of the parent directory or they have CAP_FSETID ++ * in their user namespace and are privileged over the parent directory. ++ * In all other cases, strip the S_ISGID bit from @mode. ++ * ++ * Return: the new mode to use for the file ++ */ ++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode) ++{ ++ if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) ++ return mode; ++ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) ++ return mode; ++ if (in_group_p(dir->i_gid)) ++ return mode; ++ if (capable_wrt_inode_uidgid(dir, CAP_FSETID)) ++ return mode; ++ ++ return mode & ~S_ISGID; ++} ++EXPORT_SYMBOL(mode_strip_sgid); +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1743,6 +1743,7 @@ extern long compat_ptr_ioctl(struct file + extern void inode_init_owner(struct inode *inode, const struct inode *dir, + umode_t mode); + extern bool may_open_dev(const struct path *path); ++umode_t mode_strip_sgid(const struct inode *dir, umode_t mode); + /* + * VFS FS_IOC_FIEMAP helper definitions. + */ diff --git a/queue-5.4/fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch b/queue-5.4/fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch new file mode 100644 index 00000000000..5d092ac45a2 --- /dev/null +++ b/queue-5.4/fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch @@ -0,0 +1,361 @@ +From stable+bounces-15651-greg=kroah.com@vger.kernel.org Wed Jan 24 05:03:11 2024 +From: Mahmoud Adam +Date: Wed, 24 Jan 2024 14:00:26 +0100 +Subject: fs: move S_ISGID stripping into the vfs_*() helpers +To: +Cc: , , , , <--suppress-cc=body@amazon.com>, Dave Chinner , Christian Brauner , "Darrick J . Wong" , Jeff Layton , Amir Goldstein , Mahmoud Adam +Message-ID: <20240124130025.2292-3-mngyadam@amazon.com> + +From: Yang Xu + +commit 1639a49ccdce58ea248841ed9b23babcce6dbb0b upstream. 
+ +[remove userns argument of helpers for 5.4.y backport] + +Move setgid handling out of individual filesystems and into the VFS +itself to stop the proliferation of setgid inheritance bugs. + +Creating files that have both the S_IXGRP and S_ISGID bit raised in +directories that themselves have the S_ISGID bit set requires additional +privileges to avoid security issues. + +When a filesystem creates a new inode it needs to take care that the +caller is either in the group of the newly created inode or they have +CAP_FSETID in their current user namespace and are privileged over the +parent directory of the new inode. If any of these two conditions is +true then the S_ISGID bit can be raised for an S_IXGRP file and if not +it needs to be stripped. + +However, there are several key issues with the current implementation: + +* S_ISGID stripping logic is entangled with umask stripping. + + If a filesystem doesn't support or enable POSIX ACLs then umask + stripping is done directly in the vfs before calling into the + filesystem. + If the filesystem does support POSIX ACLs then umask stripping may be + done in the filesystem itself when calling posix_acl_create(). + + Since umask stripping has an effect on S_ISGID inheritance, e.g., by + stripping the S_IXGRP bit from the file to be created and all relevant + filesystems have to call posix_acl_create() before inode_init_owner() + where we currently take care of S_ISGID handling, S_ISGID handling is + order dependent. IOW, whether or not you get a setgid bit depends on + POSIX ACLs and umask and in what order they are called. + + Note that technically filesystems are free to impose their own + ordering between posix_acl_create() and inode_init_owner() meaning + that there's additional ordering issues that influence S_ISGID + inheritance. + +* Filesystems that don't rely on inode_init_owner() don't get S_ISGID + stripping logic. + + While that may be intentional (e.g. 
network filesystems might just + defer setgid stripping to a server) it is often just a security issue. + +This is not just ugly, it's unsustainably messy especially since we do +still have bugs in this area years after the initial round of setgid +bugfixes. + +So the current state is quite messy and while we won't be able to make +it completely clean as posix_acl_create() is still a filesystem specific +call we can improve the S_ISGID stripping situation quite a bit by +hoisting it out of inode_init_owner() and into the vfs creation +operations. This means we alleviate the burden for filesystems to handle +S_ISGID stripping correctly and can standardize the ordering between +S_ISGID and umask stripping in the vfs. + +We add a new helper vfs_prepare_mode() so S_ISGID handling is now done +in the VFS before umask handling. This means S_ISGID handling is +unaffected by whether umask stripping is done by the VFS +itself (if no POSIX ACLs are supported or enabled) or in the filesystem +in posix_acl_create() (if POSIX ACLs are supported). + +The vfs_prepare_mode() helper is called directly in vfs_*() helpers that +create new filesystem objects. We need to move them into there to make +sure that filesystems like overlayfs that have callchains like: + +sys_mknod() +-> do_mknodat(mode) + -> .mknod = ovl_mknod(mode) + -> ovl_create(mode) + -> vfs_mknod(mode) + +get S_ISGID stripping done when calling into lower filesystems via +vfs_*() creation helpers. Moving vfs_prepare_mode() into e.g. +vfs_mknod() takes care of that. This is in any case semantically cleaner +because S_ISGID stripping is a VFS security requirement. + +Security hooks so far have seen the mode with the umask applied but +without S_ISGID handling done. The relevant hooks are called outside of +vfs_*() creation helpers so by calling vfs_prepare_mode() from vfs_*() +helpers the security hooks would now see the mode without umask +stripping applied. 
For now we fix this by passing the mode with umask +settings applied to not risk any regressions for LSM hooks. IOW, nothing +changes for LSM hooks. It is worth pointing out that security hooks +never saw the mode that is seen by the filesystem when actually creating +the file. They have always been completely misplaced for that to work. + +The following filesystems use inode_init_owner() and thus relied on +S_ISGID stripping: spufs, 9p, bfs, btrfs, ext2, ext4, f2fs, hfsplus, +hugetlbfs, jfs, minix, nilfs2, ntfs3, ocfs2, omfs, overlayfs, ramfs, +reiserfs, sysv, ubifs, udf, ufs, xfs, zonefs, bpf, tmpfs. + +All of the above filesystems end up calling inode_init_owner() when new +filesystem objects are created through the ->mkdir(), ->mknod(), +->create(), ->tmpfile(), ->rename() inode operations. + +Since directories always inherit the S_ISGID bit, with the exception of +xfs when irix_sgid_inherit mode is turned on, S_ISGID stripping doesn't +apply. The ->symlink() and ->link() inode operations trivially inherit +the mode from the target and the ->rename() inode operation inherits the +mode from the source inode. All other creation inode operations will get +S_ISGID handling via vfs_prepare_mode() when called from their relevant +vfs_*() helpers. + +In addition to this there are filesystems which allow the creation of +filesystem objects through ioctl()s or - in the case of spufs - +circumventing the vfs in other ways. If filesystem objects are created +through ioctl()s the vfs doesn't know about it and can't apply regular +permission checking including S_ISGID logic. Therefore, a filesystem +relying on S_ISGID stripping in inode_init_owner() in their ioctl() +callpath will be affected by moving this logic into the vfs. We audited +those filesystems: + +* btrfs allows the creation of filesystem objects through various + ioctls(). Snapshot creation literally takes a snapshot and so the mode + is fully preserved and S_ISGID stripping doesn't apply. 
+ + Creating a new subvolume relies on inode_init_owner() in + btrfs_new_subvol_inode() but only creates directories and doesn't + raise S_ISGID. + +* ocfs2 has a peculiar implementation of reflinks. In contrast to e.g. + xfs and btrfs FICLONE/FICLONERANGE ioctl() that is only concerned with + the actual extents ocfs2 uses a separate ioctl() that also creates the + target file. + + IOW, ocfs2 circumvents the vfs entirely here and did indeed rely on + inode_init_owner() to strip the S_ISGID bit. This is the only place + where a filesystem needs to call mode_strip_sgid() directly but this + is self-inflicted pain. + +* spufs doesn't go through the vfs at all and doesn't use ioctl()s + either. Instead it has a dedicated system call spufs_create() which + allows the creation of filesystem objects. But spufs only creates + directories and doesn't allow S_ISGID bits, i.e. it specifically only + allows 0777 bits. + +* bpf uses vfs_mkobj() but also doesn't allow S_ISGID bits to be created. + +The patch will have an effect on ext2 when the EXT2_MOUNT_GRPID mount +option is used, on ext4 when the EXT4_MOUNT_GRPID mount option is used, +and on xfs when the XFS_FEAT_GRPID mount option is used. When any of +these filesystems are mounted with their respective GRPID option then +newly created files inherit the parent directory's group +unconditionally. In these cases none of the filesystems call +inode_init_owner() and thus never stripped the S_ISGID bit for newly +created files. Moving this logic into the VFS means that they now get +the S_ISGID bit stripped. This is a user visible change. If this leads +to regressions we will either need to figure out a better way or we need +to revert. However, given the various setgid bugs that we found just in +the last two years this is a regression risk we should take. + +Associated with this change is a new set of fstests to enforce the +semantics for all new filesystems. 
+ +Link: https://lore.kernel.org/ceph-devel/20220427092201.wvsdjbnc7b4dttaw@wittgenstein [1] +Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [2] +Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [3] +Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [4] +Link: https://lore.kernel.org/r/1657779088-2242-3-git-send-email-xuyang2018.jy@fujitsu.com +Suggested-by: Dave Chinner +Suggested-by: Christian Brauner (Microsoft) +Reviewed-by: Darrick J. Wong +Reviewed-and-Tested-by: Jeff Layton +Signed-off-by: Yang Xu +[: rewrote commit message] +Signed-off-by: Christian Brauner (Microsoft) +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +[commit 94ac142c19f1016283a1860b07de7fa555385d31 upstream + backported from 5.10.y, resolved context conflicts] +Signed-off-by: Mahmoud Adam +Signed-off-by: Greg Kroah-Hartman +--- + fs/inode.c | 2 - + fs/namei.c | 84 +++++++++++++++++++++++++++++++++++++++++++++---------- + fs/ocfs2/namei.c | 1 + 3 files changed, 70 insertions(+), 17 deletions(-) + +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -2100,8 +2100,6 @@ void inode_init_owner(struct inode *inod + /* Directories are special, and always inherit S_ISGID */ + if (S_ISDIR(mode)) + mode |= S_ISGID; +- else +- mode = mode_strip_sgid(dir, mode); + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -52,8 +52,8 @@ + * The new code replaces the old recursive symlink resolution with + * an iterative one (in case of non-nested symlink chains). It does + * this with calls to _follow_link(). +- * As a side effect, dir_namei(), _namei() and follow_link() are now +- * replaced with a single function lookup_dentry() that can handle all ++ * As a side effect, dir_namei(), _namei() and follow_link() are now ++ * replaced with a single function lookup_dentry() that can handle all + * the special cases of the former code. 
+ * + * With the new dcache, the pathname is stored at each inode, at least as +@@ -2900,6 +2900,63 @@ void unlock_rename(struct dentry *p1, st + } + EXPORT_SYMBOL(unlock_rename); + ++/** ++ * mode_strip_umask - handle vfs umask stripping ++ * @dir: parent directory of the new inode ++ * @mode: mode of the new inode to be created in @dir ++ * ++ * Umask stripping depends on whether or not the filesystem supports POSIX ++ * ACLs. If the filesystem doesn't support it umask stripping is done directly ++ * in here. If the filesystem does support POSIX ACLs umask stripping is ++ * deferred until the filesystem calls posix_acl_create(). ++ * ++ * Returns: mode ++ */ ++static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) ++{ ++ if (!IS_POSIXACL(dir)) ++ mode &= ~current_umask(); ++ return mode; ++} ++ ++/** ++ * vfs_prepare_mode - prepare the mode to be used for a new inode ++ * @dir: parent directory of the new inode ++ * @mode: mode of the new inode ++ * @mask_perms: allowed permission by the vfs ++ * @type: type of file to be created ++ * ++ * This helper consolidates and enforces vfs restrictions on the @mode of a new ++ * object to be created. ++ * ++ * Umask stripping depends on whether the filesystem supports POSIX ACLs (see ++ * the kernel documentation for mode_strip_umask()). Moving umask stripping ++ * after setgid stripping allows the same ordering for both non-POSIX ACL and ++ * POSIX ACL supporting filesystems. ++ * ++ * Note that it's currently valid for @type to be 0 if a directory is created. ++ * Filesystems raise that flag individually and we need to check whether each ++ * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a ++ * non-zero type. 
++ * ++ * Returns: mode to be passed to the filesystem ++ */ ++static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode, ++ umode_t mask_perms, umode_t type) ++{ ++ mode = mode_strip_sgid(dir, mode); ++ mode = mode_strip_umask(dir, mode); ++ ++ /* ++ * Apply the vfs mandated allowed permission mask and set the type of ++ * file to be created before we call into the filesystem. ++ */ ++ mode &= (mask_perms & ~S_IFMT); ++ mode |= (type & S_IFMT); ++ ++ return mode; ++} ++ + int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool want_excl) + { +@@ -2909,8 +2966,8 @@ int vfs_create(struct inode *dir, struct + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? */ +- mode &= S_IALLUGO; +- mode |= S_IFREG; ++ ++ mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG); + error = security_inode_create(dir, dentry, mode); + if (error) + return error; +@@ -3180,8 +3237,7 @@ static int lookup_open(struct nameidata + * O_EXCL open we want to return EEXIST not EROFS). 
+ */ + if (open_flag & O_CREAT) { +- if (!IS_POSIXACL(dir->d_inode)) +- mode &= ~current_umask(); ++ mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode); + if (unlikely(!got_write)) { + create_error = -EROFS; + open_flag &= ~O_CREAT; +@@ -3457,8 +3513,7 @@ struct dentry *vfs_tmpfile(struct dentry + child = d_alloc(dentry, &slash_name); + if (unlikely(!child)) + goto out_err; +- if (!IS_POSIXACL(dir)) +- mode &= ~current_umask(); ++ mode = vfs_prepare_mode(dir, mode, mode, mode); + error = dir->i_op->tmpfile(dir, child, mode); + if (error) + goto out_err; +@@ -3717,6 +3772,7 @@ int vfs_mknod(struct inode *dir, struct + if (!dir->i_op->mknod) + return -EPERM; + ++ mode = vfs_prepare_mode(dir, mode, mode, mode); + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; +@@ -3765,9 +3821,8 @@ retry: + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- if (!IS_POSIXACL(path.dentry->d_inode)) +- mode &= ~current_umask(); +- error = security_path_mknod(&path, dentry, mode, dev); ++ error = security_path_mknod(&path, dentry, ++ mode_strip_umask(path.dentry->d_inode, mode), dev); + if (error) + goto out; + switch (mode & S_IFMT) { +@@ -3815,7 +3870,7 @@ int vfs_mkdir(struct inode *dir, struct + if (!dir->i_op->mkdir) + return -EPERM; + +- mode &= (S_IRWXUGO|S_ISVTX); ++ mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0); + error = security_inode_mkdir(dir, dentry, mode); + if (error) + return error; +@@ -3842,9 +3897,8 @@ retry: + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + +- if (!IS_POSIXACL(path.dentry->d_inode)) +- mode &= ~current_umask(); +- error = security_path_mkdir(&path, dentry, mode); ++ error = security_path_mkdir(&path, dentry, ++ mode_strip_umask(path.dentry->d_inode, mode)); + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + done_path_create(&path, dentry); +--- a/fs/ocfs2/namei.c ++++ b/fs/ocfs2/namei.c +@@ -198,6 +198,7 @@ static struct inode *ocfs2_get_init_inod + * callers. 
*/ + if (S_ISDIR(mode)) + set_nlink(inode, 2); ++ mode = mode_strip_sgid(dir, mode); + inode_init_owner(inode, dir, mode); + status = dquot_initialize(inode); + if (status) diff --git a/queue-5.4/mm-page_alloc-unreserve-highatomic-page-blocks-before-oom.patch b/queue-5.4/mm-page_alloc-unreserve-highatomic-page-blocks-before-oom.patch deleted file mode 100644 index 3871c0fff55..00000000000 --- a/queue-5.4/mm-page_alloc-unreserve-highatomic-page-blocks-before-oom.patch +++ /dev/null @@ -1,104 +0,0 @@ -From ac3f3b0a55518056bc80ed32a41931c99e1f7d81 Mon Sep 17 00:00:00 2001 -From: Charan Teja Kalla -Date: Fri, 24 Nov 2023 16:27:25 +0530 -Subject: mm: page_alloc: unreserve highatomic page blocks before oom - -From: Charan Teja Kalla - -commit ac3f3b0a55518056bc80ed32a41931c99e1f7d81 upstream. - -__alloc_pages_direct_reclaim() is called from slowpath allocation where -high atomic reserves can be unreserved after there is a progress in -reclaim and yet no suitable page is found. Later should_reclaim_retry() -gets called from slow path allocation to decide if the reclaim needs to be -retried before OOM kill path is taken. - -should_reclaim_retry() checks the available(reclaimable + free pages) -memory against the min wmark levels of a zone and returns: - -a) true, if it is above the min wmark so that slow path allocation will - do the reclaim retries. - -b) false, thus slowpath allocation takes oom kill path. - -should_reclaim_retry() can also unreserves the high atomic reserves **but -only after all the reclaim retries are exhausted.** - -In a case where there are almost none reclaimable memory and free pages -contains mostly the high atomic reserves but allocation context can't use -these high atomic reserves, makes the available memory below min wmark -levels hence false is returned from should_reclaim_retry() leading the -allocation request to take OOM kill path. 
This can turn into a early oom -kill if high atomic reserves are holding lot of free memory and -unreserving of them is not attempted. - -(early)OOM is encountered on a VM with the below state: -[ 295.998653] Normal free:7728kB boost:0kB min:804kB low:1004kB -high:1204kB reserved_highatomic:8192KB active_anon:4kB inactive_anon:0kB -active_file:24kB inactive_file:24kB unevictable:1220kB writepending:0kB -present:70732kB managed:49224kB mlocked:0kB bounce:0kB free_pcp:688kB -local_pcp:492kB free_cma:0kB -[ 295.998656] lowmem_reserve[]: 0 32 -[ 295.998659] Normal: 508*4kB (UMEH) 241*8kB (UMEH) 143*16kB (UMEH) -33*32kB (UH) 7*64kB (UH) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB -0*4096kB = 7752kB - -Per above log, the free memory of ~7MB exist in the high atomic reserves -is not freed up before falling back to oom kill path. - -Fix it by trying to unreserve the high atomic reserves in -should_reclaim_retry() before __alloc_pages_direct_reclaim() can fallback -to oom kill path. - -Link: https://lkml.kernel.org/r/1700823445-27531-1-git-send-email-quic_charante@quicinc.com -Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") -Signed-off-by: Charan Teja Kalla -Reported-by: Chris Goldsworthy -Suggested-by: Michal Hocko -Acked-by: Michal Hocko -Acked-by: David Rientjes -Cc: Chris Goldsworthy -Cc: David Hildenbrand -Cc: Johannes Weiner -Cc: Mel Gorman -Cc: Pavankumar Kondeti -Cc: Vlastimil Babka -Cc: Joakim Tjernlund -Signed-off-by: Andrew Morton -Signed-off-by: Greg Kroah-Hartman ---- - mm/page_alloc.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -4335,14 +4335,9 @@ should_reclaim_retry(gfp_t gfp_mask, uns - else - (*no_progress_loops)++; - -- /* -- * Make sure we converge to OOM if we cannot make any progress -- * several times in the row. 
-- */ -- if (*no_progress_loops > MAX_RECLAIM_RETRIES) { -- /* Before OOM, exhaust highatomic_reserve */ -- return unreserve_highatomic_pageblock(ac, true); -- } -+ if (*no_progress_loops > MAX_RECLAIM_RETRIES) -+ goto out; -+ - - /* - * Keep reclaiming pages while there is a chance this will lead -@@ -4404,6 +4399,11 @@ out: - schedule_timeout_uninterruptible(1); - else - cond_resched(); -+out: -+ /* Before OOM, exhaust highatomic_reserve */ -+ if (!ret) -+ return unreserve_highatomic_pageblock(ac, true); -+ - return ret; - } - diff --git a/queue-5.4/powerpc-use-always-instead-of-always-y-in-for-crtsavres.o.patch b/queue-5.4/powerpc-use-always-instead-of-always-y-in-for-crtsavres.o.patch new file mode 100644 index 00000000000..9ba2da40846 --- /dev/null +++ b/queue-5.4/powerpc-use-always-instead-of-always-y-in-for-crtsavres.o.patch @@ -0,0 +1,46 @@ +From nathan@kernel.org Fri Jan 26 16:51:54 2024 +From: Nathan Chancellor +Date: Fri, 26 Jan 2024 10:37:02 -0700 +Subject: powerpc: Use always instead of always-y in for crtsavres.o +To: gregkh@linuxfoundation.org, sashal@kernel.org +Cc: stable@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-kbuild@vger.kernel.org, llvm@lists.linux.dev, Nathan Chancellor +Message-ID: <20240126-5-4-fix-lib-powerpc-backport-v1-1-2c110ed18b1d@kernel.org> + +From: Nathan Chancellor + +This commit is for linux-5.4.y only, it has no direct upstream +equivalent. 
+ +Prior to commit 5f2fb52fac15 ("kbuild: rename hostprogs-y/always to +hostprogs/always-y"), always-y did not exist, making the backport of +mainline commit 1b1e38002648 ("powerpc: add crtsavres.o to always-y +instead of extra-y") to linux-5.4.y as commit 245da9eebba0 ("powerpc: +add crtsavres.o to always-y instead of extra-y") incorrect, breaking the +build with linkers that need crtsavres.o: + + ld.lld: error: cannot open arch/powerpc/lib/crtsavres.o: No such file or directory + +Backporting the aforementioned kbuild commit is not suitable for stable +due to its size and number of conflicts, so transform the always-y usage +to an equivalent form using always, which resolves the build issues. + +Fixes: 245da9eebba0 ("powerpc: add crtsavres.o to always-y instead of extra-y") +Signed-off-by: Nathan Chancellor +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/lib/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/powerpc/lib/Makefile ++++ b/arch/powerpc/lib/Makefile +@@ -34,8 +34,8 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) + + # 64-bit linker creates .sfpr on demand for final link (vmlinux), + # so it is only needed for modules, and only for older linkers which + # do not support --save-restore-funcs +-ifeq ($(call ld-ifversion, -lt, 225000000, y),y) +-always-$(CONFIG_PPC64) += crtsavres.o ++ifeq ($(call ld-ifversion, -lt, 225000000, y)$(CONFIG_PPC64),yy) ++always += crtsavres.o + endif + + obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ diff --git a/queue-5.4/series b/queue-5.4/series index 7cc798de31a..1a07cefd329 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -20,5 +20,7 @@ nouveau-vmm-don-t-set-addr-on-the-fail-path-to-avoid-warning.patch ubifs-ubifs_symlink-fix-memleak-of-inode-i_link-in-error-path.patch rename-fix-the-locking-of-subdirectories.patch block-remove-special-casing-of-compound-pages.patch -mm-page_alloc-unreserve-highatomic-page-blocks-before-oom.patch 
mtd-spinand-macronix-fix-mx35lfxge4ad-page-size.patch +fs-add-mode_strip_sgid-helper.patch +fs-move-s_isgid-stripping-into-the-vfs_-helpers.patch +powerpc-use-always-instead-of-always-y-in-for-crtsavres.o.patch -- 2.47.3