From: Greg Kroah-Hartman Date: Thu, 18 Oct 2018 17:13:46 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v4.18.16~2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5a784215be8a71ab6497b3106195b3ccf9774387;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: ext4-avoid-running-out-of-journal-credits-when-appending-to-an-inline-file.patch hv-properly-delay-kvp-packets-when-negotiation-is-in-progress.patch macintosh-rack-meter-convert-cputime64_t-use-to-u64.patch sched-cputime-convert-kcpustat-to-nsecs.patch sched-cputime-fix-ksoftirqd-cputime-accounting-regression.patch sched-cputime-increment-kcpustat-directly-on-irqtime-account.patch --- diff --git a/queue-4.9/ext4-avoid-running-out-of-journal-credits-when-appending-to-an-inline-file.patch b/queue-4.9/ext4-avoid-running-out-of-journal-credits-when-appending-to-an-inline-file.patch new file mode 100644 index 00000000000..9fd3466b249 --- /dev/null +++ b/queue-4.9/ext4-avoid-running-out-of-journal-credits-when-appending-to-an-inline-file.patch @@ -0,0 +1,125 @@ +From 8bc1379b82b8e809eef77a9fedbb75c6c297be19 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 16 Jun 2018 23:41:59 -0400 +Subject: ext4: avoid running out of journal credits when appending to an inline file + +From: Theodore Ts'o + +commit 8bc1379b82b8e809eef77a9fedbb75c6c297be19 upstream. + +Use a separate journal transaction if it turns out that we need to +convert an inline file to use a data block. Otherwise we could end +up failing due to not having journal credits. + +This addresses CVE-2018-10883. 
+ +https://bugzilla.kernel.org/show_bug.cgi?id=200071 + +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +[fengc@google.com: 4.4 and 4.9 backport: adjust context] +Signed-off-by: Chenbo Feng +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 3 --- + fs/ext4/inline.c | 38 +------------------------------------- + fs/ext4/xattr.c | 18 ++---------------- + 3 files changed, 3 insertions(+), 56 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3038,9 +3038,6 @@ extern struct buffer_head *ext4_get_firs + extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len); +-extern int ext4_try_to_evict_inline_data(handle_t *handle, +- struct inode *inode, +- int needed); + extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + + extern int ext4_convert_inline_data(struct inode *inode); +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -889,11 +889,11 @@ retry_journal: + flags |= AOP_FLAG_NOFS; + + if (ret == -ENOSPC) { ++ ext4_journal_stop(handle); + ret = ext4_da_convert_inline_data_to_extent(mapping, + inode, + flags, + fsdata); +- ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; +@@ -1865,42 +1865,6 @@ out: + return (error < 0 ? error : 0); + } + +-/* +- * Called during xattr set, and if we can sparse space 'needed', +- * just create the extent tree evict the data to the outer block. +- * +- * We use jbd2 instead of page cache to move data to the 1st block +- * so that the whole transaction can be committed as a whole and +- * the data isn't lost because of the delayed page cache write. 
+- */ +-int ext4_try_to_evict_inline_data(handle_t *handle, +- struct inode *inode, +- int needed) +-{ +- int error; +- struct ext4_xattr_entry *entry; +- struct ext4_inode *raw_inode; +- struct ext4_iloc iloc; +- +- error = ext4_get_inode_loc(inode, &iloc); +- if (error) +- return error; +- +- raw_inode = ext4_raw_inode(&iloc); +- entry = (struct ext4_xattr_entry *)((void *)raw_inode + +- EXT4_I(inode)->i_inline_off); +- if (EXT4_XATTR_LEN(entry->e_name_len) + +- EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { +- error = -ENOSPC; +- goto out; +- } +- +- error = ext4_convert_inline_data_nolock(handle, inode, &iloc); +-out: +- brelse(iloc.bh); +- return error; +-} +- + void ext4_inline_data_truncate(struct inode *inode, int *has_inline) + { + handle_t *handle; +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1086,22 +1086,8 @@ int ext4_xattr_ibody_inline_set(handle_t + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s, inode); +- if (error) { +- if (error == -ENOSPC && +- ext4_has_inline_data(inode)) { +- error = ext4_try_to_evict_inline_data(handle, inode, +- EXT4_XATTR_LEN(strlen(i->name) + +- EXT4_XATTR_SIZE(i->value_len))); +- if (error) +- return error; +- error = ext4_xattr_ibody_find(inode, i, is); +- if (error) +- return error; +- error = ext4_xattr_set_entry(i, s, inode); +- } +- if (error) +- return error; +- } ++ if (error) ++ return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); diff --git a/queue-4.9/hv-properly-delay-kvp-packets-when-negotiation-is-in-progress.patch b/queue-4.9/hv-properly-delay-kvp-packets-when-negotiation-is-in-progress.patch new file mode 100644 index 00000000000..9f4ca85e144 --- /dev/null +++ b/queue-4.9/hv-properly-delay-kvp-packets-when-negotiation-is-in-progress.patch @@ -0,0 +1,94 @@ +From a3ade8cc474d848676278660e65f5af1e9e094d9 Mon Sep 17 00:00:00 2001 +From: Long Li +Date: 
Sun, 30 Apr 2017 16:21:19 -0700 +Subject: HV: properly delay KVP packets when negotiation is in progress + +From: Long Li + +commit a3ade8cc474d848676278660e65f5af1e9e094d9 upstream. + +The host may send multiple negotiation packets +(due to timeout) before the KVP user-mode daemon +is connected. +We need to defer processing those packets +until the daemon is negotiated and connected. +It's okay for guest to respond +to all negotiation packets. + +In addition, the host may send multiple staged +KVP requests as soon as negotiation is done. +We need to properly process those packets using one +tasklet for exclusive access to ring buffer. + +This patch is based on the work of +Nick Meier . + +The above is the original changelog of +a3ade8cc474d ("HV: properly delay KVP packets when negotiation is in progress") + +Here I re-worked the original patch because the mainline version +can't work for the linux-4.4.y branch, on which channel->callback_event +doesn't exist yet. In the mainline, channel->callback_event was added by: +631e63a9f346 ("vmbus: change to per channel tasklet"). Here we don't want +to backport it to v4.4, as it requires extra supporting changes and fixes, +which are unnecessary as to the KVP bug we're trying to resolve. + +NOTE: before this patch is used, we should cherry-pick the other related +3 patches from the mainline first: + +The background of this backport request is that: recently Wang Jian reported +some KVP issues: https://github.com/LIS/lis-next/issues/593: +e.g. the /var/lib/hyperv/.kvp_pool_* files can not be updated, and sometimes +if the hv_kvp_daemon doesn't timely start, the host may not be able to query +the VM's IP address via KVP. + +Reported-by: Wang Jian +Tested-by: Wang Jian +Signed-off-by: Dexuan Cui +Signed-off-by: Long Li +Signed-off-by: K. Y. 
Srinivasan +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/hv/hv_kvp.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/drivers/hv/hv_kvp.c ++++ b/drivers/hv/hv_kvp.c +@@ -616,21 +616,22 @@ void hv_kvp_onchannelcallback(void *cont + NEGO_IN_PROGRESS, + NEGO_FINISHED} host_negotiatied = NEGO_NOT_STARTED; + +- if (host_negotiatied == NEGO_NOT_STARTED && +- kvp_transaction.state < HVUTIL_READY) { ++ if (kvp_transaction.state < HVUTIL_READY) { + /* + * If userspace daemon is not connected and host is asking + * us to negotiate we need to delay to not lose messages. + * This is important for Failover IP setting. + */ +- host_negotiatied = NEGO_IN_PROGRESS; +- schedule_delayed_work(&kvp_host_handshake_work, ++ if (host_negotiatied == NEGO_NOT_STARTED) { ++ host_negotiatied = NEGO_IN_PROGRESS; ++ schedule_delayed_work(&kvp_host_handshake_work, + HV_UTIL_NEGO_TIMEOUT * HZ); ++ } + return; + } + if (kvp_transaction.state > HVUTIL_READY) + return; +- ++recheck: + vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen, + &requestid); + +@@ -707,6 +708,8 @@ void hv_kvp_onchannelcallback(void *cont + VM_PKT_DATA_INBAND, 0); + + host_negotiatied = NEGO_FINISHED; ++ ++ goto recheck; + } + + } diff --git a/queue-4.9/macintosh-rack-meter-convert-cputime64_t-use-to-u64.patch b/queue-4.9/macintosh-rack-meter-convert-cputime64_t-use-to-u64.patch new file mode 100644 index 00000000000..431dce009e0 --- /dev/null +++ b/queue-4.9/macintosh-rack-meter-convert-cputime64_t-use-to-u64.patch @@ -0,0 +1,108 @@ +From 564b733c899f4e12a64946658960fce80cad0b05 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Tue, 31 Jan 2017 04:09:20 +0100 +Subject: macintosh/rack-meter: Convert cputime64_t use to u64 + +From: Frederic Weisbecker + +commit 564b733c899f4e12a64946658960fce80cad0b05 upstream. + +cputime_t is going to be removed and replaced by nsecs units, +so convert the drivers/macintosh/rack-meter.c use to u64.. 
+ +Signed-off-by: Frederic Weisbecker +Cc: Benjamin Herrenschmidt +Cc: Paul Mackerras +Cc: Michael Ellerman +Cc: Heiko Carstens +Cc: Martin Schwidefsky +Cc: Tony Luck +Cc: Fenghua Yu +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Stanislaw Gruszka +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1485832191-26889-5-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Ivan Delalande +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/macintosh/rack-meter.c | 32 ++++++++++++++++---------------- + 1 file changed, 16 insertions(+), 16 deletions(-) + +--- a/drivers/macintosh/rack-meter.c ++++ b/drivers/macintosh/rack-meter.c +@@ -52,8 +52,8 @@ struct rackmeter_dma { + struct rackmeter_cpu { + struct delayed_work sniffer; + struct rackmeter *rm; +- cputime64_t prev_wall; +- cputime64_t prev_idle; ++ u64 prev_wall; ++ u64 prev_idle; + int zero; + } ____cacheline_aligned; + +@@ -81,7 +81,7 @@ static int rackmeter_ignore_nice; + /* This is copied from cpufreq_ondemand, maybe we should put it in + * a common header somewhere + */ +-static inline cputime64_t get_cpu_idle_time(unsigned int cpu) ++static inline u64 get_cpu_idle_time(unsigned int cpu) + { + u64 retval; + +@@ -91,7 +91,7 @@ static inline cputime64_t get_cpu_idle_t + if (rackmeter_ignore_nice) + retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; + +- return nsecs_to_cputime64(retval); ++ return retval; + } + + static void rackmeter_setup_i2s(struct rackmeter *rm) +@@ -217,23 +217,23 @@ static void rackmeter_do_timer(struct wo + container_of(work, struct rackmeter_cpu, sniffer.work); + struct rackmeter *rm = rcpu->rm; + unsigned int cpu = smp_processor_id(); +- cputime64_t cur_jiffies, total_idle_ticks; +- unsigned int total_ticks, idle_ticks; ++ u64 cur_nsecs, total_idle_nsecs; ++ u64 total_nsecs, idle_nsecs; + int i, offset, load, cumm, pause; + +- cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); +- total_ticks = (unsigned int) (cur_jiffies - rcpu->prev_wall); +- rcpu->prev_wall = 
cur_jiffies; +- +- total_idle_ticks = get_cpu_idle_time(cpu); +- idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle); +- idle_ticks = min(idle_ticks, total_ticks); +- rcpu->prev_idle = total_idle_ticks; ++ cur_nsecs = jiffies64_to_nsecs(get_jiffies_64()); ++ total_nsecs = cur_nsecs - rcpu->prev_wall; ++ rcpu->prev_wall = cur_nsecs; ++ ++ total_idle_nsecs = get_cpu_idle_time(cpu); ++ idle_nsecs = total_idle_nsecs - rcpu->prev_idle; ++ idle_nsecs = min(idle_nsecs, total_nsecs); ++ rcpu->prev_idle = total_idle_nsecs; + + /* We do a very dumb calculation to update the LEDs for now, + * we'll do better once we have actual PWM implemented + */ +- load = (9 * (total_ticks - idle_ticks)) / total_ticks; ++ load = div64_u64(9 * (total_nsecs - idle_nsecs), total_nsecs); + + offset = cpu << 3; + cumm = 0; +@@ -278,7 +278,7 @@ static void rackmeter_init_cpu_sniffer(s + continue; + rcpu = &rm->cpu[cpu]; + rcpu->prev_idle = get_cpu_idle_time(cpu); +- rcpu->prev_wall = jiffies64_to_cputime64(get_jiffies_64()); ++ rcpu->prev_wall = jiffies64_to_nsecs(get_jiffies_64()); + schedule_delayed_work_on(cpu, &rm->cpu[cpu].sniffer, + msecs_to_jiffies(CPU_SAMPLING_RATE)); + } diff --git a/queue-4.9/sched-cputime-convert-kcpustat-to-nsecs.patch b/queue-4.9/sched-cputime-convert-kcpustat-to-nsecs.patch new file mode 100644 index 00000000000..cfa8de64bc4 --- /dev/null +++ b/queue-4.9/sched-cputime-convert-kcpustat-to-nsecs.patch @@ -0,0 +1,368 @@ +From 7fb1327ee9b92fca27662f9b9d60c7c3376d6c69 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Tue, 31 Jan 2017 04:09:19 +0100 +Subject: sched/cputime: Convert kcpustat to nsecs + +From: Frederic Weisbecker + +commit 7fb1327ee9b92fca27662f9b9d60c7c3376d6c69 upstream. + +Kernel CPU stats are stored in cputime_t which is an architecture +defined type, and hence a bit opaque and requiring accessors and mutators +for any operation. 
+ +Converting them to nsecs simplifies the code and is one step toward +the removal of cputime_t in the core code. + +Signed-off-by: Frederic Weisbecker +Cc: Benjamin Herrenschmidt +Cc: Paul Mackerras +Cc: Michael Ellerman +Cc: Heiko Carstens +Cc: Martin Schwidefsky +Cc: Tony Luck +Cc: Fenghua Yu +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Stanislaw Gruszka +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1485832191-26889-4-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +[colona: minor conflict as 527b0a76f41d ("sched/cpuacct: Avoid %lld seq_printf + warning") is missing from v4.9] +Signed-off-by: Ivan Delalande +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/appldata/appldata_os.c | 16 ++++---- + drivers/cpufreq/cpufreq.c | 6 +-- + drivers/cpufreq/cpufreq_governor.c | 2 - + drivers/cpufreq/cpufreq_stats.c | 1 + drivers/macintosh/rack-meter.c | 2 - + fs/proc/stat.c | 68 ++++++++++++++++++------------------- + fs/proc/uptime.c | 7 +-- + kernel/sched/cpuacct.c | 2 - + kernel/sched/cputime.c | 22 +++++------ + 9 files changed, 61 insertions(+), 65 deletions(-) + +--- a/arch/s390/appldata/appldata_os.c ++++ b/arch/s390/appldata/appldata_os.c +@@ -113,21 +113,21 @@ static void appldata_get_os_data(void *d + j = 0; + for_each_online_cpu(i) { + os_data->os_cpu[j].per_cpu_user = +- cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); + os_data->os_cpu[j].per_cpu_nice = +- cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); + os_data->os_cpu[j].per_cpu_system = +- cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); + os_data->os_cpu[j].per_cpu_idle = +- cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); + os_data->os_cpu[j].per_cpu_irq = +- 
cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); + os_data->os_cpu[j].per_cpu_softirq = +- cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); + os_data->os_cpu[j].per_cpu_iowait = +- cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); + os_data->os_cpu[j].per_cpu_steal = +- cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); ++ nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); + os_data->os_cpu[j].cpu_id = i; + j++; + } +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -132,7 +132,7 @@ static inline u64 get_cpu_idle_time_jiff + u64 cur_wall_time; + u64 busy_time; + +- cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); ++ cur_wall_time = jiffies64_to_nsecs(get_jiffies_64()); + + busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; +@@ -143,9 +143,9 @@ static inline u64 get_cpu_idle_time_jiff + + idle_time = cur_wall_time - busy_time; + if (wall) +- *wall = cputime_to_usecs(cur_wall_time); ++ *wall = div_u64(cur_wall_time, NSEC_PER_USEC); + +- return cputime_to_usecs(idle_time); ++ return div_u64(idle_time, NSEC_PER_USEC); + } + + u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) +--- a/drivers/cpufreq/cpufreq_governor.c ++++ b/drivers/cpufreq/cpufreq_governor.c +@@ -152,7 +152,7 @@ unsigned int dbs_update(struct cpufreq_p + if (ignore_nice) { + u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; + +- idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice); ++ idle_time += div_u64(cur_nice - j_cdbs->prev_cpu_nice, NSEC_PER_USEC); + j_cdbs->prev_cpu_nice = cur_nice; + } + +--- a/drivers/cpufreq/cpufreq_stats.c ++++ b/drivers/cpufreq/cpufreq_stats.c +@@ -13,7 +13,6 @@ + #include + #include + #include +-#include + + static 
DEFINE_SPINLOCK(cpufreq_stats_lock); + +--- a/drivers/macintosh/rack-meter.c ++++ b/drivers/macintosh/rack-meter.c +@@ -91,7 +91,7 @@ static inline cputime64_t get_cpu_idle_t + if (rackmeter_ignore_nice) + retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; + +- return retval; ++ return nsecs_to_cputime64(retval); + } + + static void rackmeter_setup_i2s(struct rackmeter *rm) +--- a/fs/proc/stat.c ++++ b/fs/proc/stat.c +@@ -21,23 +21,23 @@ + + #ifdef arch_idle_time + +-static cputime64_t get_idle_time(int cpu) ++static u64 get_idle_time(int cpu) + { +- cputime64_t idle; ++ u64 idle; + + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) +- idle += arch_idle_time(cpu); ++ idle += cputime_to_nsecs(arch_idle_time(cpu)); + return idle; + } + +-static cputime64_t get_iowait_time(int cpu) ++static u64 get_iowait_time(int cpu) + { +- cputime64_t iowait; ++ u64 iowait; + + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + if (cpu_online(cpu) && nr_iowait_cpu(cpu)) +- iowait += arch_idle_time(cpu); ++ iowait += cputime_to_nsecs(arch_idle_time(cpu)); + return iowait; + } + +@@ -45,32 +45,32 @@ static cputime64_t get_iowait_time(int c + + static u64 get_idle_time(int cpu) + { +- u64 idle, idle_time = -1ULL; ++ u64 idle, idle_usecs = -1ULL; + + if (cpu_online(cpu)) +- idle_time = get_cpu_idle_time_us(cpu, NULL); ++ idle_usecs = get_cpu_idle_time_us(cpu, NULL); + +- if (idle_time == -1ULL) ++ if (idle_usecs == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + else +- idle = usecs_to_cputime64(idle_time); ++ idle = idle_usecs * NSEC_PER_USEC; + + return idle; + } + + static u64 get_iowait_time(int cpu) + { +- u64 iowait, iowait_time = -1ULL; ++ u64 iowait, iowait_usecs = -1ULL; + + if (cpu_online(cpu)) +- iowait_time = get_cpu_iowait_time_us(cpu, NULL); ++ iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); + +- if (iowait_time == -1ULL) ++ if (iowait_usecs == -1ULL) 
+ /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + else +- iowait = usecs_to_cputime64(iowait_time); ++ iowait = iowait_usecs * NSEC_PER_USEC; + + return iowait; + } +@@ -115,16 +115,16 @@ static int show_stat(struct seq_file *p, + } + sum += arch_irq_stat(); + +- seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); ++ seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + + for_each_online_cpu(i) { +@@ -140,16 +140,16 @@ static int show_stat(struct seq_file *p, + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + seq_printf(p, "cpu%d", i); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); +- seq_put_decimal_ull(p, " ", 
cputime64_to_clock_t(idle)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); +- seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); ++ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + } + seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); +--- a/fs/proc/uptime.c ++++ b/fs/proc/uptime.c +@@ -5,23 +5,20 @@ + #include + #include + #include +-#include + + static int uptime_proc_show(struct seq_file *m, void *v) + { + struct timespec uptime; + struct timespec idle; +- u64 idletime; + u64 nsec; + u32 rem; + int i; + +- idletime = 0; ++ nsec = 0; + for_each_possible_cpu(i) +- idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; ++ nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + + get_monotonic_boottime(&uptime); +- nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; + seq_printf(m, "%lu.%02lu %lu.%02lu\n", +--- a/kernel/sched/cpuacct.c ++++ b/kernel/sched/cpuacct.c +@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], +- cputime64_to_clock_t(val[stat])); 
++ nsec_to_clock_t(val[stat])); + } + + return 0; +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -75,9 +75,9 @@ static cputime_t irqtime_account_update( + u64 *cpustat = kcpustat_this_cpu->cpustat; + cputime_t irq_cputime; + +- irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; ++ irq_cputime = nsecs_to_cputime64(irqtime - cpustat[idx]); + irq_cputime = min(irq_cputime, maxtime); +- cpustat[idx] += irq_cputime; ++ cpustat[idx] += cputime_to_nsecs(irq_cputime); + + return irq_cputime; + } +@@ -143,7 +143,7 @@ void account_user_time(struct task_struc + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ +- task_group_account_field(p, index, (__force u64) cputime); ++ task_group_account_field(p, index, cputime_to_nsecs(cputime)); + + /* Account for user time used */ + acct_account_cputime(p); +@@ -168,11 +168,11 @@ static void account_guest_time(struct ta + + /* Add guest time to cpustat. */ + if (task_nice(p) > 0) { +- cpustat[CPUTIME_NICE] += (__force u64) cputime; +- cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; ++ cpustat[CPUTIME_NICE] += cputime_to_nsecs(cputime); ++ cpustat[CPUTIME_GUEST_NICE] += cputime_to_nsecs(cputime); + } else { +- cpustat[CPUTIME_USER] += (__force u64) cputime; +- cpustat[CPUTIME_GUEST] += (__force u64) cputime; ++ cpustat[CPUTIME_USER] += cputime_to_nsecs(cputime); ++ cpustat[CPUTIME_GUEST] += cputime_to_nsecs(cputime); + } + } + +@@ -193,7 +193,7 @@ void __account_system_time(struct task_s + account_group_system_time(p, cputime); + + /* Add system time to cpustat. 
*/ +- task_group_account_field(p, index, (__force u64) cputime); ++ task_group_account_field(p, index, cputime_to_nsecs(cputime)); + + /* Account for system time used */ + acct_account_cputime(p); +@@ -234,7 +234,7 @@ void account_steal_time(cputime_t cputim + { + u64 *cpustat = kcpustat_this_cpu->cpustat; + +- cpustat[CPUTIME_STEAL] += (__force u64) cputime; ++ cpustat[CPUTIME_STEAL] += cputime_to_nsecs(cputime); + } + + /* +@@ -247,9 +247,9 @@ void account_idle_time(cputime_t cputime + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) +- cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; ++ cpustat[CPUTIME_IOWAIT] += cputime_to_nsecs(cputime); + else +- cpustat[CPUTIME_IDLE] += (__force u64) cputime; ++ cpustat[CPUTIME_IDLE] += cputime_to_nsecs(cputime); + } + + /* diff --git a/queue-4.9/sched-cputime-fix-ksoftirqd-cputime-accounting-regression.patch b/queue-4.9/sched-cputime-fix-ksoftirqd-cputime-accounting-regression.patch new file mode 100644 index 00000000000..4a621951a0d --- /dev/null +++ b/queue-4.9/sched-cputime-fix-ksoftirqd-cputime-accounting-regression.patch @@ -0,0 +1,140 @@ +From 25e2d8c1b9e327ed260edd13169cc22bc7a78bc6 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Tue, 25 Apr 2017 16:10:48 +0200 +Subject: sched/cputime: Fix ksoftirqd cputime accounting regression + +From: Frederic Weisbecker + +commit 25e2d8c1b9e327ed260edd13169cc22bc7a78bc6 upstream. + +irq_time_read() returns the irqtime minus the ksoftirqd time. This +is necessary because irq_time_read() is used to substract the IRQ time +from the sum_exec_runtime of a task. If we were to include the softirq +time of ksoftirqd, this task would substract its own CPU time everytime +it updates ksoftirqd->sum_exec_runtime which would therefore never +progress. + +But this behaviour got broken by: + + a499a5a14db ("sched/cputime: Increment kcpustat directly on irqtime account") + +... which now includes ksoftirqd softirq time in the time returned by +irq_time_read(). 
+ +This has resulted in wrong ksoftirqd cputime reported to userspace +through /proc/stat and thus "top" not showing ksoftirqd when it should +after intense networking load. + +ksoftirqd->stime happens to be correct but it gets scaled down by +sum_exec_runtime through task_cputime_adjusted(). + +To fix this, just account the strict IRQ time in a separate counter and +use it to report the IRQ time. + +Reported-and-tested-by: Jesper Dangaard Brouer +Signed-off-by: Frederic Weisbecker +Reviewed-by: Rik van Riel +Acked-by: Jesper Dangaard Brouer +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Stanislaw Gruszka +Cc: Thomas Gleixner +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1493129448-5356-1-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Ivan Delalande +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/cputime.c | 27 ++++++++++++++++----------- + kernel/sched/sched.h | 9 +++++++-- + 2 files changed, 23 insertions(+), 13 deletions(-) + +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -37,6 +37,18 @@ void disable_sched_clock_irqtime(void) + sched_clock_irqtime = 0; + } + ++static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, ++ enum cpu_usage_stat idx) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ ++ u64_stats_update_begin(&irqtime->sync); ++ cpustat[idx] += delta; ++ irqtime->total += delta; ++ irqtime->tick_delta += delta; ++ u64_stats_update_end(&irqtime->sync); ++} ++ + /* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. 
+@@ -44,7 +56,6 @@ void disable_sched_clock_irqtime(void) + void irqtime_account_irq(struct task_struct *curr) + { + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); +- u64 *cpustat = kcpustat_this_cpu->cpustat; + s64 delta; + int cpu; + +@@ -55,22 +66,16 @@ void irqtime_account_irq(struct task_str + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; + +- u64_stats_update_begin(&irqtime->sync); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ +- if (hardirq_count()) { +- cpustat[CPUTIME_IRQ] += delta; +- irqtime->tick_delta += delta; +- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { +- cpustat[CPUTIME_SOFTIRQ] += delta; +- irqtime->tick_delta += delta; +- } +- +- u64_stats_update_end(&irqtime->sync); ++ if (hardirq_count()) ++ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); ++ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) ++ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); + } + EXPORT_SYMBOL_GPL(irqtime_account_irq); + +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1743,6 +1743,7 @@ static inline void nohz_balance_exit_idl + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING + struct irqtime { ++ u64 total; + u64 tick_delta; + u64 irq_start_time; + struct u64_stats_sync sync; +@@ -1750,16 +1751,20 @@ struct irqtime { + + DECLARE_PER_CPU(struct irqtime, cpu_irqtime); + ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ +static inline u64 irq_time_read(int cpu) + { + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); +- u64 *cpustat = kcpustat_cpu(cpu).cpustat; + unsigned int seq; + u64 total; + + do { + seq = __u64_stats_fetch_begin(&irqtime->sync); +- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; ++ total = irqtime->total; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); + + return total; diff --git a/queue-4.9/sched-cputime-increment-kcpustat-directly-on-irqtime-account.patch b/queue-4.9/sched-cputime-increment-kcpustat-directly-on-irqtime-account.patch new file mode 100644 index 00000000000..5daf4c29444 --- /dev/null +++ b/queue-4.9/sched-cputime-increment-kcpustat-directly-on-irqtime-account.patch @@ -0,0 +1,166 @@ +From a499a5a14dbd1d0315a96fc62a8798059325e9e6 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Tue, 31 Jan 2017 04:09:32 +0100 +Subject: sched/cputime: Increment kcpustat directly on irqtime account + +From: Frederic Weisbecker + +commit a499a5a14dbd1d0315a96fc62a8798059325e9e6 upstream. + +The irqtime is accounted in nsecs and stored in +cpu_irq_time.hardirq_time and cpu_irq_time.softirq_time. Once the +accumulated amount reaches a new jiffy, this one gets accounted to the +kcpustat. + +This was necessary when kcpustat was stored in cputime_t, which could at +worst have jiffies granularity. But now kcpustat is stored in nsecs +so this whole discretization game with temporary irqtime storage has +become unnecessary. + +We can now directly account the irqtime to the kcpustat. 
+ +Signed-off-by: Frederic Weisbecker +Cc: Benjamin Herrenschmidt +Cc: Fenghua Yu +Cc: Heiko Carstens +Cc: Linus Torvalds +Cc: Martin Schwidefsky +Cc: Michael Ellerman +Cc: Paul Mackerras +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Stanislaw Gruszka +Cc: Thomas Gleixner +Cc: Tony Luck +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1485832191-26889-17-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Ivan Delalande +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/cputime.c | 50 ++++++++++++++++--------------------------------- + kernel/sched/sched.h | 7 +++--- + 2 files changed, 21 insertions(+), 36 deletions(-) + +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -44,6 +44,7 @@ void disable_sched_clock_irqtime(void) + void irqtime_account_irq(struct task_struct *curr) + { + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); ++ u64 *cpustat = kcpustat_this_cpu->cpustat; + s64 delta; + int cpu; + +@@ -61,49 +62,35 @@ void irqtime_account_irq(struct task_str + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. 
+ */ +- if (hardirq_count()) +- irqtime->hardirq_time += delta; +- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) +- irqtime->softirq_time += delta; ++ if (hardirq_count()) { ++ cpustat[CPUTIME_IRQ] += delta; ++ irqtime->tick_delta += delta; ++ } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { ++ cpustat[CPUTIME_SOFTIRQ] += delta; ++ irqtime->tick_delta += delta; ++ } + + u64_stats_update_end(&irqtime->sync); + } + EXPORT_SYMBOL_GPL(irqtime_account_irq); + +-static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) ++static cputime_t irqtime_tick_accounted(cputime_t maxtime) + { +- u64 *cpustat = kcpustat_this_cpu->cpustat; +- cputime_t irq_cputime; +- +- irq_cputime = nsecs_to_cputime64(irqtime - cpustat[idx]); +- irq_cputime = min(irq_cputime, maxtime); +- cpustat[idx] += cputime_to_nsecs(irq_cputime); +- +- return irq_cputime; +-} ++ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); ++ cputime_t delta; + +-static cputime_t irqtime_account_hi_update(cputime_t maxtime) +-{ +- return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), +- CPUTIME_IRQ, maxtime); +-} ++ delta = nsecs_to_cputime(irqtime->tick_delta); ++ delta = min(delta, maxtime); ++ irqtime->tick_delta -= cputime_to_nsecs(delta); + +-static cputime_t irqtime_account_si_update(cputime_t maxtime) +-{ +- return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), +- CPUTIME_SOFTIRQ, maxtime); ++ return delta; + } + + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ + + #define sched_clock_irqtime (0) + +-static cputime_t irqtime_account_hi_update(cputime_t dummy) +-{ +- return 0; +-} +- +-static cputime_t irqtime_account_si_update(cputime_t dummy) ++static cputime_t irqtime_tick_accounted(cputime_t dummy) + { + return 0; + } +@@ -290,10 +277,7 @@ static inline cputime_t account_other_ti + accounted = steal_account_process_time(max); + + if (accounted < max) +- accounted += irqtime_account_hi_update(max - accounted); +- +- if 
(accounted < max) +- accounted += irqtime_account_si_update(max - accounted); ++ accounted += irqtime_tick_accounted(max - accounted); + + return accounted; + } +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1742,8 +1743,7 @@ static inline void nohz_balance_exit_idl + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING + struct irqtime { +- u64 hardirq_time; +- u64 softirq_time; ++ u64 tick_delta; + u64 irq_start_time; + struct u64_stats_sync sync; + }; +@@ -1753,12 +1753,13 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqt + static inline u64 irq_time_read(int cpu) + { + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ u64 *cpustat = kcpustat_cpu(cpu).cpustat; + unsigned int seq; + u64 total; + + do { + seq = __u64_stats_fetch_begin(&irqtime->sync); +- total = irqtime->softirq_time + irqtime->hardirq_time; ++ total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); + + return total; diff --git a/queue-4.9/series b/queue-4.9/series index 748932d1362..f8416299fda 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -27,3 +27,9 @@ arc-build-get-rid-of-toolchain-check.patch arc-build-don-t-set-cross_compile-in-arch-s-makefile.patch hid-quirks-fix-support-for-apple-magic-keyboards.patch usb-gadget-serial-fix-oops-when-data-rx-d-after-close.patch +sched-cputime-convert-kcpustat-to-nsecs.patch +macintosh-rack-meter-convert-cputime64_t-use-to-u64.patch +sched-cputime-increment-kcpustat-directly-on-irqtime-account.patch +sched-cputime-fix-ksoftirqd-cputime-accounting-regression.patch +ext4-avoid-running-out-of-journal-credits-when-appending-to-an-inline-file.patch +hv-properly-delay-kvp-packets-when-negotiation-is-in-progress.patch