1) Using this_cpu_inc() is better than going through this_cpu_ptr():
- Single instruction on x86.
- Store tearing prevention.
2) Change tcf_action_update_stats() to use this_cpu_add().
3) Add WRITE_ONCE() to __qdisc_qstats_drop() and qstats_drop_inc()
in preparation for lockless "tc qdisc show".
$ scripts/bloat-o-meter -t vmlinux.old vmlinux.new
add/remove: 0/0 grow/shrink: 3/17 up/down: 72/-216 (-144)
Function old new delta
dualpi2_enqueue_skb 462 511 +49
tcf_ife_act 1061 1077 +16
taprio_enqueue 613 620 +7
codel_qdisc_enqueue 149 143 -6
tcf_vlan_act 684 676 -8
tcf_skbedit_act 626 618 -8
tcf_police_act 725 717 -8
tcf_mpls_act 1297 1289 -8
tcf_gate_act 310 302 -8
tcf_gact_act 222 214 -8
tcf_csum_act 2438 2430 -8
tcf_bpf_act 709 701 -8
tcf_action_update_stats 124 115 -9
pie_qdisc_enqueue 865 856 -9
pfifo_enqueue 116 107 -9
choke_enqueue 2069 2059 -10
plug_enqueue 139 128 -11
bfifo_enqueue 121 110 -11
tcf_nat_act 1501 1489 -12
gred_enqueue 1743 1668 -75
Total: Before=24388609, After=24388465, chg -0.00%
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20260501135916.2566766-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
static inline void tcf_action_inc_drop_qstats(struct tc_action *a)
{
if (likely(a->cpu_qstats)) {
- qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+ qstats_cpu_drop_inc(a->cpu_qstats);
return;
}
atomic_inc(&a->tcfa_drops);
static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
{
- sch->qstats.drops += count;
+ WRITE_ONCE(sch->qstats.drops, sch->qstats.drops + count);
}
static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
- qstats->drops++;
+ WRITE_ONCE(qstats->drops, qstats->drops + 1);
+}
+
+static inline void qstats_cpu_drop_inc(struct gnet_stats_queue __percpu *qstats)
+{
+ this_cpu_inc(qstats->drops);
}
static inline void qstats_cpu_overlimit_inc(struct gnet_stats_queue __percpu *qstats)
if (a->cpu_bstats) {
_bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- this_cpu_ptr(a->cpu_qstats)->drops += drops;
+ this_cpu_add(a->cpu_qstats->drops, drops);
if (hw)
_bstats_update(this_cpu_ptr(a->cpu_bstats_hw),
break;
case TC_ACT_SHOT:
action = filter_res;
- qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats));
+ qstats_cpu_drop_inc(prog->common.cpu_qstats);
break;
case TC_ACT_UNSPEC:
action = prog->tcf_action;
tlv_data = ife_decode(skb, &metalen);
if (unlikely(!tlv_data)) {
- qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ qstats_cpu_drop_inc(ife->common.cpu_qstats);
return TC_ACT_SHOT;
}
curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype,
&dlen, NULL);
if (!curr_data) {
- qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ qstats_cpu_drop_inc(ife->common.cpu_qstats);
return TC_ACT_SHOT;
}
}
if (WARN_ON(tlv_data != ifehdr_end)) {
- qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ qstats_cpu_drop_inc(ife->common.cpu_qstats);
return TC_ACT_SHOT;
}
* so lets be conservative.. */
if ((action == TC_ACT_SHOT) || exceed_mtu) {
drop:
- qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ qstats_cpu_drop_inc(ife->common.cpu_qstats);
return TC_ACT_SHOT;
}
return p->action;
drop:
- qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
+ qstats_cpu_drop_inc(m->common.cpu_qstats);
return TC_ACT_SHOT;
}
qstats_cpu_overlimit_inc(police->common.cpu_qstats);
inc_drops:
if (ret == TC_ACT_SHOT)
- qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats));
+ qstats_cpu_drop_inc(police->common.cpu_qstats);
end:
return ret;
}
return params->action;
err:
- qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
+ qstats_cpu_drop_inc(d->common.cpu_qstats);
return TC_ACT_SHOT;
}
if (ack) {
WRITE_ONCE(b->ack_drops, b->ack_drops + 1);
- sch->qstats.drops++;
+ qdisc_qstats_drop(sch);
ack_pkt_len = qdisc_pkt_len(ack);
WRITE_ONCE(b->bytes, b->bytes + ack_pkt_len);
q->buffer_used += skb->truesize - ack->truesize;
flow->cvars.count += i;
q->backlogs[idx] -= len;
q->memory_usage -= mem;
- sch->qstats.drops += i;
+ __qdisc_qstats_drop(sch, i);
sch->qstats.backlog -= len;
sch->q.qlen -= i;
return idx;
packets += u64_stats_read(&hw_stats->stats.bstats[i].packets);
sch->qstats.qlen += hw_stats->stats.qstats[i].qlen;
sch->qstats.backlog += hw_stats->stats.qstats[i].backlog;
- sch->qstats.drops += hw_stats->stats.qstats[i].drops;
+ __qdisc_qstats_drop(sch, hw_stats->stats.qstats[i].drops);
sch->qstats.requeues += hw_stats->stats.qstats[i].requeues;
sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits;
}