]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
f2fs: fix lock priority inversion issue
authorChao Yu <chao@kernel.org>
Fri, 30 Jan 2026 13:28:08 +0000 (21:28 +0800)
committerJaegeuk Kim <jaegeuk@kernel.org>
Sat, 31 Jan 2026 03:24:39 +0000 (03:24 +0000)
If a userspace thread holds an f2fs rw semaphore, then due to its low
priority it may stay in runnable or preempted state for a long time. During
that time, it blocks any high priority thread which is trying to grab the
same rw semaphore, e.g. cp_rwsem, io_rwsem...

To fix this issue, let's check the thread's priority when it tries to grab
an f2fs_rwsem lock: if the priority is lower than a priority threshold, let's
uplift the priority before it enters the critical region of the lock, and
restore the priority after it leaves the critical region.

Meanwhile, introduce two new sysfs nodes:
- /sys/fs/f2fs/<disk>/adjust_lock_priority, a bitmask which is used to control
for which locks the functionality is enabled.
==========     ==================
Flag_Value     Flag_Description
==========     ==================
0x00000000     Disabled (default)
0x00000001     cp_rwsem
0x00000002     node_change
0x00000004     node_write
0x00000008     gc_lock
0x00000010     cp_global
0x00000020     io_rwsem
==========     ==================
- /sys/fs/f2fs/<disk>/lock_duration_priority, it is used to control the
priority threshold.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Documentation/ABI/testing/sysfs-fs-f2fs
fs/f2fs/checkpoint.c
fs/f2fs/f2fs.h
fs/f2fs/super.c
fs/f2fs/sysfs.c

index 9a8ec2290f6885e3f461c653a0455e2251d371e6..ea6474db8a319d2b07ae618fa5bb155d3a0174cd 100644 (file)
@@ -963,3 +963,27 @@ Description:       This sysfs entry can be used to change type of injected timeout:
                0x00000003     Simulate Non-IO type sleep time
                0x00000004     Simulate runnable time
                ==========     ===============================
+
+What:          /sys/fs/f2fs/<disk>/adjust_lock_priority
+Date:          January 2026
+Contact:       "Chao Yu" <chao@kernel.org>
+Description:   This sysfs entry is a bitmask which can be used to enable/disable adjusting the
+               priority of a task which is in a critical region covered by the given lock(s).
+               ==========     ==================
+               Flag_Value     Flag_Description
+               ==========     ==================
+               0x00000000     Disabled (default)
+               0x00000001     cp_rwsem
+               0x00000002     node_change
+               0x00000004     node_write
+               0x00000008     gc_lock
+               0x00000010     cp_global
+               0x00000020     io_rwsem
+               ==========     ==================
+
+What:          /sys/fs/f2fs/<disk>/lock_duration_priority
+Date:          January 2026
+Contact:       "Chao Yu" <chao@kernel.org>
+Description:   f2fs can tune priority of thread which has entered into critical region covered by
+               f2fs rw semaphore lock: a thread whose priority is lower than this value is raised
+               to it while it holds the lock. This sysfs entry can be used to control the priority
+               value, the range is [100,139], by default the value is 120.
index 5172396c0b019c9ae7412447afae0f70f0f48333..2f5a03e29d0b9d47b1260c6b3b429d02e9de4b8b 100644 (file)
@@ -90,16 +90,72 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
                        runnable_time, io_sleep_time, other_time);
 }
 
+static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
+{      /* Return true if taking @sem in this mode (@is_write) should uplift the caller's priority. */
+       if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1))) /* flag bit for a lock is (name - 1); NOTE(review): assumes sem->name >= 1, confirm callers */
+               return false;
+
+       switch (sem->name) {
+       /*
+        * writer is checkpoint which has high priority, let's just uplift
+        * priority for reader
+        */
+       case LOCK_NAME_CP_RWSEM:
+       case LOCK_NAME_NODE_CHANGE:
+       case LOCK_NAME_NODE_WRITE:
+               return !is_write;
+       case LOCK_NAME_GC_LOCK:
+       case LOCK_NAME_CP_GLOBAL:
+       case LOCK_NAME_IO_RWSEM:
+               return true; /* for these locks both readers and writers are uplifted */
+       default:
+               f2fs_bug_on(sem->sbi, 1); /* lock name not handled above */
+       }
+       return false; /* only reached via the default case, if f2fs_bug_on() returns */
+}
+
+static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
+                                               bool is_write)
+{      /* Lower current's nice value to the configured target and record in @lc what restore_priority() must undo. */
+       lc->need_restore = false; /* default: restore_priority() will be a no-op */
+       if (!sem->sbi->adjust_lock_priority) /* no flag set: feature is off for every lock */
+               return;
+       if (rt_task(current)) /* leave tasks for which rt_task() is true untouched */
+               return;
+       if (!need_uplift_priority(sem, is_write))
+               return;
+       lc->orig_nice = task_nice(current);
+       lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority); /* sysfs value is stored as a priority, convert to nice */
+       if (lc->orig_nice <= lc->new_nice) /* current nice is not above the target, nothing to change */
+               return;
+       set_user_nice(current, lc->new_nice);
+       lc->need_restore = true; /* nice was changed, ask restore_priority() to put it back */
+}
+
+static void restore_priority(struct f2fs_lock_context *lc)
+{      /* Put current's nice value back to lc->orig_nice if uplift_priority() changed it. */
+       if (!lc->need_restore) /* uplift_priority() did not change the nice value */
+               return;
+       /* someone has updated the priority */
+       if (task_nice(current) != lc->new_nice) /* keep the newer value rather than overwrite it */
+               return;
+       set_user_nice(current, lc->orig_nice);
+}
+
 void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
+       uplift_priority(sem, lc, false);
        f2fs_down_read(sem);
        trace_lock_elapsed_time_start(sem, lc);
 }
 
 int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
-       if (!f2fs_down_read_trylock(sem))
+       uplift_priority(sem, lc, false);
+       if (!f2fs_down_read_trylock(sem)) {
+               restore_priority(lc);
                return 0;
+       }
        trace_lock_elapsed_time_start(sem, lc);
        return 1;
 }
@@ -107,19 +163,24 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex
 void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
        f2fs_up_read(sem);
+       restore_priority(lc);
        trace_lock_elapsed_time_end(sem, lc, false);
 }
 
 void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
+       uplift_priority(sem, lc, true);
        f2fs_down_write(sem);
        trace_lock_elapsed_time_start(sem, lc);
 }
 
 int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
-       if (!f2fs_down_write_trylock(sem))
+       uplift_priority(sem, lc, true);
+       if (!f2fs_down_write_trylock(sem)) {
+               restore_priority(lc);
                return 0;
+       }
        trace_lock_elapsed_time_start(sem, lc);
        return 1;
 }
@@ -127,6 +188,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte
 void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
 {
        f2fs_up_write(sem);
+       restore_priority(lc);
        trace_lock_elapsed_time_end(sem, lc, true);
 }
 
index 29f81a496b72f446e39449c46c4eea7e408b3ea5..a6e7368fc40ae9220082f606d6f616ffc421e988 100644 (file)
@@ -185,6 +185,7 @@ enum f2fs_lock_name {
        LOCK_NAME_GC_LOCK,
        LOCK_NAME_CP_GLOBAL,
        LOCK_NAME_IO_RWSEM,
+       LOCK_NAME_MAX,
 };
 
 enum f2fs_timeout_type {
@@ -1447,7 +1448,10 @@ struct f2fs_time_stat {
 
 struct f2fs_lock_context {
        struct f2fs_time_stat ts;
+       int orig_nice;
+       int new_nice;
        bool lock_trace;
+       bool need_restore;
 };
 
 struct f2fs_gc_control {
@@ -1588,6 +1592,8 @@ enum node_type {
 /* a threshold of maximum elapsed time in critical region to print tracepoint */
 #define MAX_LOCK_ELAPSED_TIME          500
 
+#define F2FS_DEFAULT_TASK_PRIORITY             (DEFAULT_PRIO)
+
 static inline int f2fs_test_bit(unsigned int nr, char *addr);
 static inline void f2fs_set_bit(unsigned int nr, char *addr);
 static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1998,6 +2004,12 @@ struct f2fs_sb_info {
        /* max elapsed time threshold in critical region that lock covered */
        unsigned long long max_lock_elapsed_time;
 
+       /* bitmask of locks for which task priority is adjusted in critical region */
+       unsigned int adjust_lock_priority;
+
+       /* priority a task is uplifted to while in critical region covered by lock */
+       unsigned int lock_duration_priority;
+
 #ifdef CONFIG_F2FS_FS_COMPRESSION
        struct kmem_cache *page_array_slab;     /* page array entry */
        unsigned int page_array_slab_size;      /* default page array slab size */
index 9d421a07d2d5226c96801717c50f1f111ce3aad3..d5cf7265e5d3c0e52aad5ba3f87a73fc603ea97c 100644 (file)
@@ -4338,6 +4338,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
        spin_lock_init(&sbi->gc_remaining_trials_lock);
        atomic64_set(&sbi->current_atomic_write, 0);
        sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME;
+       sbi->adjust_lock_priority = 0;
+       sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY;
 
        sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ?
                4096 : sbi->blocksize;
index d01a2664a2508a6123e3345e4bfff3949c8d3e19..3a272e7edf23413be227f1b7dd71eca56e9c74b8 100644 (file)
@@ -955,6 +955,20 @@ out:
                return count;
        }
 
+       if (!strcmp(a->attr.name, "adjust_lock_priority")) {
+               if (t >= BIT(LOCK_NAME_MAX - 1))
+                       return -EINVAL;
+               sbi->adjust_lock_priority = t;
+               return count;
+       }
+
+       if (!strcmp(a->attr.name, "lock_duration_priority")) {
+               if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
+                       return -EINVAL;
+               sbi->lock_duration_priority = t;
+               return count;
+       }
+
        __sbi_store_value(a, sbi, ptr + a->offset, t);
 
        return count;
@@ -1272,6 +1286,8 @@ F2FS_SBI_GENERAL_RW_ATTR(carve_out);
 F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
 F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
 F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time);
+F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority);
+F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority);
 
 /* STAT_INFO ATTR */
 #ifdef CONFIG_F2FS_STAT_FS
@@ -1478,6 +1494,8 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(allocate_section_hint),
        ATTR_LIST(allocate_section_policy),
        ATTR_LIST(max_lock_elapsed_time),
+       ATTR_LIST(lock_duration_priority),
+       ATTR_LIST(adjust_lock_priority),
        NULL,
 };
 ATTRIBUTE_GROUPS(f2fs);