forked from mirrors/linux
		
	While doing a high throughput test on a BQL enabled NIC, I found a very
	high cost in ndo_start_xmit() when accessing BQL data.

	It turned out the problem was caused by the compiler trying to be smart,
	but involving a bad MESI transaction:

	  0.05 │  mov    0xc0(%rax),%edi    // LOAD dql->num_queued
	  0.48 │  mov    %edx,0xc8(%rax)    // STORE dql->last_obj_cnt = count
	 58.23 │  add    %edx,%edi
	  0.58 │  cmp    %edi,0xc4(%rax)
	  0.76 │  mov    %edi,0xc0(%rax)    // STORE dql->num_queued += count
	  0.72 │  js     bd8

	I got an incredible 10 % gain [1] by making sure the cpu does not attempt
	to get the cache line in Shared mode, but directly requests ownership.

	New code:

	  mov    %edx,0xc8(%rax)    // STORE dql->last_obj_cnt = count
	  add    %edx,0xc0(%rax)    // RMW dql->num_queued += count
	  mov    0xc4(%rax),%ecx    // LOAD dql->adj_limit
	  mov    0xc0(%rax),%edx    // LOAD dql->num_queued
	  cmp    %edx,%ecx

	The TX completion was running from another cpu, with a high interrupt
	rate.

	Note that I am using barrier() as a soft hint, as mb() here could be too
	heavy a cost.

	[1] This was a netperf TCP_STREAM with TSO disabled, but GSO enabled.

	Signed-off-by: Eric Dumazet <edumazet@google.com>
	Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>
		
			
				
	
	
		
			105 lines
		
	
	
	
		
			3.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			105 lines
		
	
	
	
		
			3.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * Dynamic queue limits (dql) - Definitions
 | 
						|
 *
 | 
						|
 * Copyright (c) 2011, Tom Herbert <therbert@google.com>
 | 
						|
 *
 | 
						|
 * This header file contains the definitions for dynamic queue limits (dql).
 | 
						|
 * dql would be used in conjunction with a producer/consumer type queue
 | 
						|
 * (possibly a HW queue).  Such a queue would have these general properties:
 | 
						|
 *
 | 
						|
 *   1) Objects are queued up to some limit specified as number of objects.
 | 
						|
 *   2) Periodically a completion process executes which retires consumed
 | 
						|
 *      objects.
 | 
						|
 *   3) Starvation occurs when limit has been reached, all queued data has
 | 
						|
 *      actually been consumed, but completion processing has not yet run
 | 
						|
 *      so queuing new data is blocked.
 | 
						|
 *   4) Minimizing the amount of queued data is desirable.
 | 
						|
 *
 | 
						|
 * The goal of dql is to calculate the limit as the minimum number of objects
 | 
						|
 * needed to prevent starvation.
 | 
						|
 *
 | 
						|
 * The primary functions of dql are:
 | 
						|
 *    dql_queued - called when objects are enqueued to record number of objects
 | 
						|
 *    dql_avail - returns how many objects are available to be queued based
 | 
						|
 *      on the object limit and how many objects are already enqueued
 | 
						|
 *    dql_completed - called at completion time to indicate how many objects
 | 
						|
 *      were retired from the queue
 | 
						|
 *
 | 
						|
 * The dql implementation does not implement any locking for the dql data
 | 
						|
 * structures, the higher layer should provide this.  dql_queued should
 | 
						|
 * be serialized to prevent concurrent execution of the function; this
 | 
						|
 * is also true for dql_completed.  However, dql_queued and dql_completed can
 | 
						|
 * be executed concurrently (i.e. they can be protected by different locks).
 | 
						|
 */
 | 
						|
 | 
						|
#ifndef _LINUX_DQL_H
 | 
						|
#define _LINUX_DQL_H
 | 
						|
 | 
						|
#ifdef __KERNEL__
 | 
						|
 | 
						|
struct dql {
 | 
						|
	/* Fields accessed in enqueue path (dql_queued) */
 | 
						|
	unsigned int	num_queued;		/* Total ever queued */
 | 
						|
	unsigned int	adj_limit;		/* limit + num_completed */
 | 
						|
	unsigned int	last_obj_cnt;		/* Count at last queuing */
 | 
						|
 | 
						|
	/* Fields accessed only by completion path (dql_completed) */
 | 
						|
 | 
						|
	unsigned int	limit ____cacheline_aligned_in_smp; /* Current limit */
 | 
						|
	unsigned int	num_completed;		/* Total ever completed */
 | 
						|
 | 
						|
	unsigned int	prev_ovlimit;		/* Previous over limit */
 | 
						|
	unsigned int	prev_num_queued;	/* Previous queue total */
 | 
						|
	unsigned int	prev_last_obj_cnt;	/* Previous queuing cnt */
 | 
						|
 | 
						|
	unsigned int	lowest_slack;		/* Lowest slack found */
 | 
						|
	unsigned long	slack_start_time;	/* Time slacks seen */
 | 
						|
 | 
						|
	/* Configuration */
 | 
						|
	unsigned int	max_limit;		/* Max limit */
 | 
						|
	unsigned int	min_limit;		/* Minimum limit */
 | 
						|
	unsigned int	slack_hold_time;	/* Time to measure slack */
 | 
						|
};
 | 
						|
 | 
						|
/*
 * Set some static maximums.
 *
 * DQL_MAX_OBJECT is the largest count accepted by a single dql_queued() call.
 * DQL_MAX_LIMIT is defined so that DQL_MAX_LIMIT + DQL_MAX_OBJECT equals
 * UINT_MAX / 2.
 * NOTE(review): presumably this keeps the unsigned differences used by dql
 * representable as a signed int - confirm against the dql implementation.
 */
#define DQL_MAX_OBJECT (UINT_MAX / 16)
#define DQL_MAX_LIMIT ((UINT_MAX / 2) - DQL_MAX_OBJECT)
 | 
						|
 | 
						|
/*
 | 
						|
 * Record number of objects queued. Assumes that caller has already checked
 | 
						|
 * availability in the queue with dql_avail.
 | 
						|
 */
 | 
						|
static inline void dql_queued(struct dql *dql, unsigned int count)
 | 
						|
{
 | 
						|
	BUG_ON(count > DQL_MAX_OBJECT);
 | 
						|
 | 
						|
	dql->last_obj_cnt = count;
 | 
						|
 | 
						|
	/* We want to force a write first, so that cpu do not attempt
 | 
						|
	 * to get cache line containing last_obj_cnt, num_queued, adj_limit
 | 
						|
	 * in Shared state, but directly does a Request For Ownership
 | 
						|
	 * It is only a hint, we use barrier() only.
 | 
						|
	 */
 | 
						|
	barrier();
 | 
						|
 | 
						|
	dql->num_queued += count;
 | 
						|
}
 | 
						|
 | 
						|
/* Returns how many objects can be queued, < 0 indicates over limit. */
 | 
						|
static inline int dql_avail(const struct dql *dql)
 | 
						|
{
 | 
						|
	return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued);
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Record number of completed objects and recalculate the limit.
 *
 * @dql:   queue limit state to update
 * @count: number of objects retired from the queue
 */
void dql_completed(struct dql *dql, unsigned int count);
 | 
						|
 | 
						|
/*
 * Reset dql state.
 *
 * @dql: queue limit state to reset
 */
void dql_reset(struct dql *dql);
 | 
						|
 | 
						|
/*
 * Initialize dql state.
 *
 * @dql:       queue limit state to initialize
 * @hold_time: NOTE(review): presumably the initial slack_hold_time - confirm
 *             against the implementation
 */
int dql_init(struct dql *dql, unsigned int hold_time);
 | 
						|
 | 
						|
#endif /* __KERNEL__ */
 | 
						|
 | 
						|
#endif /* _LINUX_DQL_H */
 |