0x00000003 Simulate Non-IO type sleep time
0x00000004 Simulate runnable time
========== ===============================
+
+What: /sys/fs/f2fs/<disk>/adjust_lock_priority
+Date: January 2026
+Contact: "Chao Yu" <chao@kernel.org>
+Description: This sysfs entry can be used to enable/disable priority adjustment for a task
+ that is in a critical region covered by a lock. Flag values can be OR'ed together.
+ ========== ==================
+ Flag_Value Flag_Description
+ ========== ==================
+ 0x00000000 Disabled (default)
+ 0x00000001 cp_rwsem
+ 0x00000002 node_change
+ 0x00000004 node_write
+ 0x00000008 gc_lock
+ 0x00000010 cp_global
+ 0x00000020 io_rwsem
+ ========== ==================
+
+What: /sys/fs/f2fs/<disk>/lock_duration_priority
+Date: January 2026
+Contact: "Chao Yu" <chao@kernel.org>
+Description: f2fs can tune the priority of a thread that has entered a critical region covered
+ by an f2fs rw_semaphore lock. This sysfs entry controls the priority value; the valid
+ range is [100,139], and the default value is 120.
runnable_time, io_sleep_time, other_time);
}
+/*
+ * Decide whether the current task's priority should be raised for @sem.
+ *
+ * @sem:      lock being taken; sem->name selects bit (sem->name - 1) of
+ *            sbi->adjust_lock_priority.
+ * @is_write: true when the lock is being taken for write.
+ *
+ * Returns true only when the lock's bit is set in adjust_lock_priority and
+ * the lock/mode combination is eligible for uplift; false otherwise.
+ */
+static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
+{
+	/*
+	 * Validate the name before building the mask: for a name below
+	 * LOCK_NAME_CP_RWSEM, BIT(sem->name - 1) would shift by a negative
+	 * (or wrapped, huge) count, which is undefined behavior.
+	 */
+	if (sem->name < LOCK_NAME_CP_RWSEM || sem->name >= LOCK_NAME_MAX)
+		return false;
+
+	if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1)))
+		return false;
+
+	switch (sem->name) {
+	/*
+	 * writer is checkpoint which has high priority, let's just uplift
+	 * priority for reader
+	 */
+	case LOCK_NAME_CP_RWSEM:
+	case LOCK_NAME_NODE_CHANGE:
+	case LOCK_NAME_NODE_WRITE:
+		return !is_write;
+	case LOCK_NAME_GC_LOCK:
+	case LOCK_NAME_CP_GLOBAL:
+	case LOCK_NAME_IO_RWSEM:
+		return true;
+	default:
+		/* an in-range name without a case above is a coding error */
+		f2fs_bug_on(sem->sbi, 1);
+	}
+	return false;
+}
+
+/*
+ * Optionally raise the current task's priority before it takes @sem.
+ *
+ * @sem:      lock about to be taken.
+ * @lc:       per-acquisition context; receives the original and uplifted
+ *            nice values plus the need_restore flag read by
+ *            restore_priority().
+ * @is_write: true when the lock is being taken for write.
+ *
+ * Nothing is changed when the feature is disabled, when rt_task(current)
+ * is true, when need_uplift_priority() declines, or when the task's nice
+ * value is already at or below the target.
+ */
+static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
+						bool is_write)
+{
+	/* assume no uplift until the nice value has actually been changed */
+	lc->need_restore = false;
+
+	if (!sem->sbi->adjust_lock_priority || rt_task(current) ||
+	    !need_uplift_priority(sem, is_write))
+		return;
+
+	lc->orig_nice = task_nice(current);
+	lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority);
+
+	/* only ever raise priority: act when the target nice is strictly lower */
+	if (lc->new_nice < lc->orig_nice) {
+		set_user_nice(current, lc->new_nice);
+		lc->need_restore = true;
+	}
+}
+
+/*
+ * Put back the nice value saved in @lc by uplift_priority().
+ *
+ * Does nothing when no uplift was applied, or when the task's current nice
+ * value no longer equals the uplifted one (someone has updated the priority
+ * in the meantime, so that newer value is left in place).
+ */
+static void restore_priority(struct f2fs_lock_context *lc)
+{
+	if (lc->need_restore && task_nice(current) == lc->new_nice)
+		set_user_nice(current, lc->orig_nice);
+}
+
+/*
+ * Take @sem for read with tracing: uplift the caller's priority (when
+ * enabled) before acquiring the lock, then start elapsed-time tracing in @lc.
+ */
void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
+ uplift_priority(sem, lc, false);
f2fs_down_read(sem);
trace_lock_elapsed_time_start(sem, lc);
}
+/*
+ * Trylock variant of f2fs_down_read_trace(). The priority is uplifted before
+ * the attempt and restored right away if the attempt fails.
+ * Returns 1 with tracing started on success, 0 on failure.
+ */
int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
- if (!f2fs_down_read_trylock(sem))
+ uplift_priority(sem, lc, false);
+ if (!f2fs_down_read_trylock(sem)) {
+ /* lock not taken: undo the uplift before reporting failure */
+ restore_priority(lc);
return 0;
+ }
trace_lock_elapsed_time_start(sem, lc);
return 1;
}
+/*
+ * Release the read lock on @sem, restore any priority uplift recorded in @lc,
+ * then end elapsed-time tracing.
+ */
void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
f2fs_up_read(sem);
+ restore_priority(lc);
trace_lock_elapsed_time_end(sem, lc, false);
}
+/*
+ * Take @sem for write with tracing: uplift the caller's priority (when
+ * enabled) before acquiring the lock, then start elapsed-time tracing in @lc.
+ */
void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
+ uplift_priority(sem, lc, true);
f2fs_down_write(sem);
trace_lock_elapsed_time_start(sem, lc);
}
+/*
+ * Trylock variant of f2fs_down_write_trace(). The priority is uplifted before
+ * the attempt and restored right away if the attempt fails.
+ * Returns 1 with tracing started on success, 0 on failure.
+ */
int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
- if (!f2fs_down_write_trylock(sem))
+ uplift_priority(sem, lc, true);
+ if (!f2fs_down_write_trylock(sem)) {
+ /* lock not taken: undo the uplift before reporting failure */
+ restore_priority(lc);
return 0;
+ }
trace_lock_elapsed_time_start(sem, lc);
return 1;
}
+/*
+ * Release the write lock on @sem, restore any priority uplift recorded in @lc,
+ * then end elapsed-time tracing.
+ */
void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
f2fs_up_write(sem);
+ restore_priority(lc);
trace_lock_elapsed_time_end(sem, lc, true);
}
LOCK_NAME_GC_LOCK,
LOCK_NAME_CP_GLOBAL,
LOCK_NAME_IO_RWSEM,
+ LOCK_NAME_MAX,
};
enum f2fs_timeout_type {
struct f2fs_lock_context {
struct f2fs_time_stat ts;
+ int orig_nice; /* task_nice(current) sampled by uplift_priority() */
+ int new_nice; /* nice value derived from sbi->lock_duration_priority */
bool lock_trace;
+ bool need_restore; /* set once uplift_priority() has changed the nice value */
};
struct f2fs_gc_control {
/* a threshold of maximum elapsed time in critical region to print tracepoint */
#define MAX_LOCK_ELAPSED_TIME 500
+#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO)
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
/* max elapsed time threshold in critical region that lock covered */
unsigned long long max_lock_elapsed_time;
+ /* enable/disable to adjust task priority in critical region covered by lock */
+ unsigned int adjust_lock_priority;
+
+ /* adjust priority for task which is in critical region covered by lock */
+ unsigned int lock_duration_priority;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
spin_lock_init(&sbi->gc_remaining_trials_lock);
atomic64_set(&sbi->current_atomic_write, 0);
sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME;
+ sbi->adjust_lock_priority = 0;
+ sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY;
sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ?
4096 : sbi->blocksize;
return count;
}
+ if (!strcmp(a->attr.name, "adjust_lock_priority")) {
+ if (t >= BIT(LOCK_NAME_MAX - 1))
+ return -EINVAL;
+ sbi->adjust_lock_priority = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "lock_duration_priority")) {
+ if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
+ return -EINVAL;
+ sbi->lock_duration_priority = t;
+ return count;
+ }
+
__sbi_store_value(a, sbi, ptr + a->offset, t);
return count;
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time);
+F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority);
+F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(allocate_section_hint),
ATTR_LIST(allocate_section_policy),
ATTR_LIST(max_lock_elapsed_time),
+ ATTR_LIST(lock_duration_priority),
+ ATTR_LIST(adjust_lock_priority),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);