	net: rfs: add hash collision detection
Receive Flow Steering is a nice solution, but it suffers from hash collisions when a mix of connected and unconnected traffic is received on the host and the flow hash table is populated. Also, clearing the flow in inet_release() makes RFS not very good for short-lived flows, as many packets can follow close() (FIN, ACK packets, ...).

This patch extends the information stored in the global hash table to include not only the cpu number but also the upper part of the hash value. I use a 32-bit value and dynamically split it in two parts: for hosts with 64 or fewer possible cpus, this gives 6 bits for the cpu number and 26 (32 - 6) bits for the upper part of the hash.

Since hash bucket selection uses the low-order bits of the hash, we get a full hash match if /proc/sys/net/core/rps_sock_flow_entries is big enough. If the hash found in the flow table does not match, we fall back to RPS (if it is enabled for the rxqueue). This means a packet for a non-connected flow can avoid the IPI to an unrelated/victim CPU. It also means we no longer have to clear the table at socket close time, which helps short-lived flow performance.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
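The encoding can be illustrated with a minimal userspace sketch (not the kernel code itself; the constants POSSIBLE_CPUS/TABLE_ENTRIES and the helpers record_flow()/lookup_flow() below are made up for illustration, assuming a 64-cpu host and a 4096-entry table). Each table entry packs the current CPU into the low bits and the high bits of the flow hash into the rest, so a later lookup can tell a genuine match from a hash-bucket collision and fall back to plain RPS on mismatch.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for rps_cpu_mask and rps_sock_flow_table;
 * names and sizes are hypothetical, not the kernel API.
 */
#define POSSIBLE_CPUS	64u			/* assume 64 possible cpus */
#define CPU_MASK	(POSSIBLE_CPUS - 1)	/* 0x3f -> 6 low bits for cpu */
#define TABLE_ENTRIES	4096u			/* must be a power of two */

static uint32_t ents[TABLE_ENTRIES];		/* one 32-bit value per bucket */

/* recvmsg path: remember "this flow was last consumed on this cpu" */
static void record_flow(uint32_t hash, uint32_t cpu)
{
	uint32_t index = hash & (TABLE_ENTRIES - 1);
	uint32_t val = (hash & ~CPU_MASK) | (cpu & CPU_MASK);

	ents[index] = val;
}

/* rx path: return the target cpu, or -1 when the bucket holds another flow */
static int lookup_flow(uint32_t hash)
{
	uint32_t ident = ents[hash & (TABLE_ENTRIES - 1)];

	if ((ident ^ hash) & ~CPU_MASK)		/* upper hash bits differ: collision */
		return -1;			/* caller falls back to plain RPS */
	return (int)(ident & CPU_MASK);
}

int main(void)
{
	record_flow(0xabcd1234, 5);

	printf("same flow  -> cpu %d\n", lookup_flow(0xabcd1234));	/* prints 5 */
	printf("other flow -> cpu %d\n", lookup_flow(0x00011234));	/* same bucket, prints -1 */
	return 0;
}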
This commit is contained in:
parent 096a4cfa58
commit 567e4b7973
6 changed files with 49 additions and 70 deletions
drivers/net/tun.c
@@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 {
 	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
 		  e->rxhash, e->queue_index);
-	sock_rps_reset_flow_hash(e->rps_rxhash);
 	hlist_del_rcu(&e->hash_link);
 	kfree_rcu(e, rcu);
 	--tun->flow_count;
@@ -373,10 +372,8 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
  */
 static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
 {
-	if (unlikely(e->rps_rxhash != hash)) {
-		sock_rps_reset_flow_hash(e->rps_rxhash);
+	if (unlikely(e->rps_rxhash != hash))
 		e->rps_rxhash = hash;
-	}
 }
 
 /* We try to identify a flow through its rxhash first. The reason that
include/linux/netdevice.h
@@ -644,39 +644,39 @@ struct rps_dev_flow_table {
 /*
  * The rps_sock_flow_table contains mappings of flows to the last CPU
  * on which they were processed by the application (set in recvmsg).
+ * Each entry is a 32bit value. Upper part is the high order bits
+ * of flow hash, lower part is cpu number.
+ * rps_cpu_mask is used to partition the space, depending on number of
+ * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
+ * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f,
+ * meaning we use 32-6=26 bits for the hash.
  */
 struct rps_sock_flow_table {
-	unsigned int mask;
-	u16 ents[0];
+	u32	mask;
+	u32	ents[0];
 };
-#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
-    ((_num) * sizeof(u16)))
+#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
 
 #define RPS_NO_CPU 0xffff
 
+extern u32 rps_cpu_mask;
+extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+
 static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
 					u32 hash)
 {
 	if (table && hash) {
-		unsigned int cpu, index = hash & table->mask;
+		unsigned int index = hash & table->mask;
+		u32 val = hash & ~rps_cpu_mask;
 
 		/* We only give a hint, preemption can change cpu under us */
-		cpu = raw_smp_processor_id();
+		val |= raw_smp_processor_id();
 
-		if (table->ents[index] != cpu)
-			table->ents[index] = cpu;
+		if (table->ents[index] != val)
+			table->ents[index] = val;
 	}
 }
 
-static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
-				       u32 hash)
-{
-	if (table && hash)
-		table->ents[hash & table->mask] = RPS_NO_CPU;
-}
-
-extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
-
 #ifdef CONFIG_RFS_ACCEL
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
include/net/sock.h
@@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 #endif
 }
 
-static inline void sock_rps_reset_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
-	struct rps_sock_flow_table *sock_flow_table;
-
-	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
-	rps_reset_sock_flow(sock_flow_table, hash);
-	rcu_read_unlock();
-#endif
-}
-
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
@@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
-static inline void sock_rps_reset_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
-	sock_rps_reset_flow_hash(sk->sk_rxhash);
-#endif
-}
-
 static inline void sock_rps_save_rxhash(struct sock *sk,
 					const struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	if (unlikely(sk->sk_rxhash != skb->hash)) {
-		sock_rps_reset_flow(sk);
+	if (unlikely(sk->sk_rxhash != skb->hash))
 		sk->sk_rxhash = skb->hash;
-	}
 #endif
 }
 
 static inline void sock_rps_reset_rxhash(struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	sock_rps_reset_flow(sk);
 	sk->sk_rxhash = 0;
 #endif
 }
net/core/dev.c
@@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
 
 struct static_key rps_needed __read_mostly;
 
@@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		       struct rps_dev_flow **rflowp)
 {
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
+	const struct rps_sock_flow_table *sock_flow_table;
+	struct netdev_rx_queue *rxqueue = dev->_rx;
 	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
+	struct rps_map *map;
 	int cpu = -1;
-	u16 tcpu;
+	u32 tcpu;
 	u32 hash;
 
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
 
 		if (unlikely(index >= dev->real_num_rx_queues)) {
 			WARN_ONCE(dev->real_num_rx_queues > 1,
 				  "%s received packet on queue %u, but number "
@@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 				  dev->name, index, dev->real_num_rx_queues);
 			goto done;
 		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
+		rxqueue += index;
+	}
 
+	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	map = rcu_dereference(rxqueue->rps_map);
-	if (map) {
-		if (map->len == 1 &&
-		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
-			tcpu = map->cpus[0];
-			if (cpu_online(tcpu))
-				cpu = tcpu;
+	if (!flow_table && !map)
 		goto done;
-		}
-	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
-		goto done;
-	}
 
 	skb_reset_network_header(skb);
 	hash = skb_get_hash(skb);
 	if (!hash)
 		goto done;
 
-	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
-		u16 next_cpu;
 		struct rps_dev_flow *rflow;
+		u32 next_cpu;
+		u32 ident;
+
+		/* First check into global flow table if there is a match */
+		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+		if ((ident ^ hash) & ~rps_cpu_mask)
+			goto try_rps;
+
+		next_cpu = ident & rps_cpu_mask;
+
+		/* OK, now we know there is a match,
+		 * we can look at the local (per receive queue) flow table
+		 */
 		rflow = &flow_table->flows[hash & flow_table->mask];
 		tcpu = rflow->cpu;
 
-		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
 		/*
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
@@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		}
 	}
 
+try_rps:
+
 	if (map) {
 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 		if (cpu_online(tcpu)) {
net/core/sysctl_net_core.c
@@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 				mutex_unlock(&sock_flow_mutex);
 				return -ENOMEM;
 			}
+			rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
 			sock_table->mask = size - 1;
 		} else
 			sock_table = orig_sock_table;
net/ipv4/af_inet.c
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
-		sock_rps_reset_flow(sk);
-
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
 