Values for both are rounded up to the nearest power of two. The
suggested flow count depends on the expected number of active connections
at any given time, which may be significantly less than the number of open
-connections. We have found that a value of 32768 for rps_sock_flow_entries
-works fairly well on a moderately loaded server.
+connections. We have found that a value of 65536 for rps_sock_flow_entries
+works fairly well on a moderately loaded server. Large servers might
+need 1048576 or even higher values.
+
+On a NUMA host it is advisable to spread the rps_sock_flow_entries table
+memory across all nodes, for example by interleaving the allocation:
+
+numactl --interleave=all bash -c "echo 1048576 >/proc/sys/net/core/rps_sock_flow_entries"
For a single queue device, the rps_flow_cnt value for the single queue
would normally be configured to the same value as rps_sock_flow_entries.
For a multi-queue device, the rps_flow_cnt for each queue might be
configured as rps_sock_flow_entries / N, where N is the number of
-queues. So for instance, if rps_sock_flow_entries is set to 32768 and there
+queues. So for instance, if rps_sock_flow_entries is set to 131072 and there
are 16 configured receive queues, rps_flow_cnt for each queue might be
-configured as 2048.
+configured as 8192.
Accelerated RFS
#include <linux/types.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
+#ifdef CONFIG_RPS
+#include <net/rps-types.h>
+#endif
struct skb_defer_node {
struct llist_head defer_list;
struct kmem_cache *skbuff_fclone_cache;
struct kmem_cache *skb_small_head_cache;
#ifdef CONFIG_RPS
- struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+ rps_tag_ptr rps_sock_flow_table;
u32 rps_cpu_mask;
#endif
struct skb_defer_node __percpu *skb_defer_nodes;
#include <net/hotdata.h>
#ifdef CONFIG_RPS
+#include <net/rps-types.h>
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
* meaning we use 32-6=26 bits for the hash.
*/
struct rps_sock_flow_table {
- u32 _mask;
-
- u32 ents[] ____cacheline_aligned_in_smp;
+ u32 ent;
};
-#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
-
-static inline u32 rps_sock_flow_table_mask(const struct rps_sock_flow_table *table)
-{
- return table->_mask;
-}
#define RPS_NO_CPU 0xffff
-static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
- u32 hash)
+static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
{
- unsigned int index = hash & rps_sock_flow_table_mask(table);
+ unsigned int index = hash & rps_tag_to_mask(tag_ptr);
u32 val = hash & ~net_hotdata.rps_cpu_mask;
+ struct rps_sock_flow_table *table;
/* We only give a hint, preemption can change CPU under us */
val |= raw_smp_processor_id();
+ table = rps_tag_to_table(tag_ptr);
/* The following WRITE_ONCE() is paired with the READ_ONCE()
* here, and another one in get_rps_cpu().
*/
- if (READ_ONCE(table->ents[index]) != val)
- WRITE_ONCE(table->ents[index], val);
+ if (READ_ONCE(table[index].ent) != val)
+ WRITE_ONCE(table[index].ent, val);
}
static inline void _sock_rps_record_flow_hash(__u32 hash)
{
- struct rps_sock_flow_table *sock_flow_table;
+ rps_tag_ptr tag_ptr;
if (!hash)
return;
rcu_read_lock();
- sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
- if (sock_flow_table)
- rps_record_sock_flow(sock_flow_table, hash);
+ tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
+ if (tag_ptr)
+ rps_record_sock_flow(tag_ptr, hash);
rcu_read_unlock();
}
static inline void _sock_rps_delete_flow(const struct sock *sk)
{
struct rps_sock_flow_table *table;
+ rps_tag_ptr tag_ptr;
u32 hash, index;
hash = READ_ONCE(sk->sk_rxhash);
return;
rcu_read_lock();
- table = rcu_dereference(net_hotdata.rps_sock_flow_table);
- if (table) {
- index = hash & rps_sock_flow_table_mask(table);
- if (READ_ONCE(table->ents[index]) != RPS_NO_CPU)
- WRITE_ONCE(table->ents[index], RPS_NO_CPU);
+ tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
+ if (tag_ptr) {
+ index = hash & rps_tag_to_mask(tag_ptr);
+ table = rps_tag_to_table(tag_ptr);
+ if (READ_ONCE(table[index].ent) != RPS_NO_CPU)
+ WRITE_ONCE(table[index].ent, RPS_NO_CPU);
}
rcu_read_unlock();
}
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
- const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
+ rps_tag_ptr global_tag_ptr;
struct rps_map *map;
int cpu = -1;
u32 tcpu;
if (!hash)
goto done;
- sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
- if (flow_table && sock_flow_table) {
+ global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
+ if (flow_table && global_tag_ptr) {
+ struct rps_sock_flow_table *sock_flow_table;
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 flow_id;
/* First check into global flow table if there is a match.
* This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
*/
- flow_id = hash & rps_sock_flow_table_mask(sock_flow_table);
- ident = READ_ONCE(sock_flow_table->ents[flow_id]);
+ flow_id = hash & rps_tag_to_mask(global_tag_ptr);
+ sock_flow_table = rps_tag_to_table(global_tag_ptr);
+ ident = READ_ONCE(sock_flow_table[flow_id].ent);
if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
goto try_rps;
static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct rps_sock_flow_table *o_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+ rps_tag_ptr o_tag_ptr, tag_ptr;
unsigned int orig_size, size;
- int ret, i;
struct ctl_table tmp = {
.data = &size,
.maxlen = sizeof(size),
.mode = table->mode
};
- struct rps_sock_flow_table *o_sock_table, *sock_table;
- static DEFINE_MUTEX(sock_flow_mutex);
void *tofree = NULL;
+ int ret, i;
+ u8 log;
mutex_lock(&sock_flow_mutex);
- o_sock_table = rcu_dereference_protected(
- net_hotdata.rps_sock_flow_table,
- lockdep_is_held(&sock_flow_mutex));
- size = o_sock_table ? rps_sock_flow_table_mask(o_sock_table) + 1 : 0;
+ o_tag_ptr = tag_ptr = net_hotdata.rps_sock_flow_table;
+
+ size = o_tag_ptr ? rps_tag_to_mask(o_tag_ptr) + 1 : 0;
+ o_sock_table = rps_tag_to_table(o_tag_ptr);
orig_size = size;
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
- if (write) {
- if (size) {
- if (size > 1<<29) {
- /* Enforce limit to prevent overflow */
+ if (!write)
+ goto unlock;
+
+ if (size) {
+ if (size > 1<<29) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ sock_table = o_sock_table;
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table = vmalloc_huge(size * sizeof(*sock_table),
+ GFP_KERNEL);
+ if (!sock_table) {
mutex_unlock(&sock_flow_mutex);
- return -EINVAL;
- }
- sock_table = o_sock_table;
- size = roundup_pow_of_two(size);
- if (size != orig_size) {
- sock_table =
- vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
- if (!sock_table) {
- mutex_unlock(&sock_flow_mutex);
- return -ENOMEM;
- }
- net_hotdata.rps_cpu_mask =
- roundup_pow_of_two(nr_cpu_ids) - 1;
- sock_table->_mask = size - 1;
+ return -ENOMEM;
}
+ net_hotdata.rps_cpu_mask =
+ roundup_pow_of_two(nr_cpu_ids) - 1;
+ log = ilog2(size);
+ tag_ptr = (rps_tag_ptr)sock_table | log;
+ }
- for (i = 0; i < size; i++)
- sock_table->ents[i] = RPS_NO_CPU;
- } else
- sock_table = NULL;
-
- if (sock_table != o_sock_table) {
- rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
- sock_table);
- if (sock_table) {
- static_branch_inc(&rps_needed);
- static_branch_inc(&rfs_needed);
- }
- if (o_sock_table) {
- static_branch_dec(&rps_needed);
- static_branch_dec(&rfs_needed);
- tofree = o_sock_table;
- }
+ for (i = 0; i < size; i++)
+ sock_table[i].ent = RPS_NO_CPU;
+ } else {
+ sock_table = NULL;
+ tag_ptr = 0UL;
+ }
+ if (tag_ptr != o_tag_ptr) {
+ smp_store_release(&net_hotdata.rps_sock_flow_table, tag_ptr);
+ if (sock_table) {
+ static_branch_inc(&rps_needed);
+ static_branch_inc(&rfs_needed);
+ }
+ if (o_sock_table) {
+ static_branch_dec(&rps_needed);
+ static_branch_dec(&rfs_needed);
+ tofree = o_sock_table;
}
}
+unlock:
mutex_unlock(&sock_flow_mutex);
kvfree_rcu_mightsleep(tofree);