forked from mirrors/linux
		
	net: sched: fix packet stuck problem for lockless qdisc
Lockless qdisc has below concurrent problem:
    cpu0                 cpu1
     .                     .
q->enqueue                 .
     .                     .
qdisc_run_begin()          .
     .                     .
dequeue_skb()              .
     .                     .
sch_direct_xmit()          .
     .                     .
     .                q->enqueue
     .             qdisc_run_begin()
     .            return and do nothing
     .                     .
qdisc_run_end()            .
cpu1 enqueue a skb without calling __qdisc_run() because cpu0
has not released the lock yet and spin_trylock() return false
for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
enqueued by cpu1 when calling dequeue_skb() because cpu1 may
enqueue the skb after cpu0 calling dequeue_skb() and before
cpu0 calling qdisc_run_end().
Lockless qdisc has below another concurrent problem when
tx_action is involved:
cpu0(serving tx_action)     cpu1             cpu2
          .                   .                .
          .              q->enqueue            .
          .            qdisc_run_begin()       .
          .              dequeue_skb()         .
          .                   .            q->enqueue
          .                   .                .
          .             sch_direct_xmit()      .
          .                   .         qdisc_run_begin()
          .                   .       return and do nothing
          .                   .                .
 clear __QDISC_STATE_SCHED    .                .
 qdisc_run_begin()            .                .
 return and do nothing        .                .
          .                   .                .
          .            qdisc_run_end()         .
This patch fixes the above data race by:
1. If the first spin_trylock() return false and STATE_MISSED is
   not set, set STATE_MISSED and retry another spin_trylock() in
   case other CPU may not see STATE_MISSED after it releases the
   lock.
2. reschedule if STATE_MISSED is set after the lock is released
   at the end of qdisc_run_end().
For tx_action case, STATE_MISSED is also set when cpu1 is at the
end if qdisc_run_end(), so tx_action will be rescheduled again
to dequeue the skb enqueued by cpu2.
Clear STATE_MISSED before retrying a dequeuing when dequeuing
returns NULL in order to reduce the overhead of the second
spin_trylock() and __netif_schedule() calling.
Also clear the STATE_MISSED before calling __netif_schedule()
at the end of qdisc_run_end() to avoid doing another round of
dequeuing in the pfifo_fast_dequeue().
The performance impact of this patch, tested using pktgen and
dummy netdev with pfifo_fast qdisc attached:
 threads  without+this_patch   with+this_patch      delta
    1        2.61Mpps            2.60Mpps           -0.3%
    2        3.97Mpps            3.82Mpps           -3.7%
    4        5.62Mpps            5.59Mpps           -0.5%
    8        2.78Mpps            2.77Mpps           -0.3%
   16        2.22Mpps            2.22Mpps           -0.0%
Fixes: 6b3ba9146f ("net: sched: allow qdiscs to handle locking")
Acked-by: Jakub Kicinski <kuba@kernel.org>
Tested-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									974271e5ed
								
							
						
					
					
						commit
						a90c57f2ce
					
				
					 2 changed files with 53 additions and 1 deletions
				
			
		| 
						 | 
				
			
			@ -36,6 +36,7 @@ struct qdisc_rate_table {
 | 
			
		|||
enum qdisc_state_t {
 | 
			
		||||
	__QDISC_STATE_SCHED,
 | 
			
		||||
	__QDISC_STATE_DEACTIVATED,
 | 
			
		||||
	__QDISC_STATE_MISSED,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
struct qdisc_size_table {
 | 
			
		||||
| 
						 | 
				
			
			@ -159,8 +160,33 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
 | 
			
		|||
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 | 
			
		||||
{
 | 
			
		||||
	if (qdisc->flags & TCQ_F_NOLOCK) {
 | 
			
		||||
		if (spin_trylock(&qdisc->seqlock))
 | 
			
		||||
			goto nolock_empty;
 | 
			
		||||
 | 
			
		||||
		/* If the MISSED flag is set, it means other thread has
 | 
			
		||||
		 * set the MISSED flag before second spin_trylock(), so
 | 
			
		||||
		 * we can return false here to avoid multi cpus doing
 | 
			
		||||
		 * the set_bit() and second spin_trylock() concurrently.
 | 
			
		||||
		 */
 | 
			
		||||
		if (test_bit(__QDISC_STATE_MISSED, &qdisc->state))
 | 
			
		||||
			return false;
 | 
			
		||||
 | 
			
		||||
		/* Set the MISSED flag before the second spin_trylock(),
 | 
			
		||||
		 * if the second spin_trylock() return false, it means
 | 
			
		||||
		 * other cpu holding the lock will do dequeuing for us
 | 
			
		||||
		 * or it will see the MISSED flag set after releasing
 | 
			
		||||
		 * lock and reschedule the net_tx_action() to do the
 | 
			
		||||
		 * dequeuing.
 | 
			
		||||
		 */
 | 
			
		||||
		set_bit(__QDISC_STATE_MISSED, &qdisc->state);
 | 
			
		||||
 | 
			
		||||
		/* Retry again in case other CPU may not see the new flag
 | 
			
		||||
		 * after it releases the lock at the end of qdisc_run_end().
 | 
			
		||||
		 */
 | 
			
		||||
		if (!spin_trylock(&qdisc->seqlock))
 | 
			
		||||
			return false;
 | 
			
		||||
 | 
			
		||||
nolock_empty:
 | 
			
		||||
		WRITE_ONCE(qdisc->empty, false);
 | 
			
		||||
	} else if (qdisc_is_running(qdisc)) {
 | 
			
		||||
		return false;
 | 
			
		||||
| 
						 | 
				
			
			@ -176,8 +202,15 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 | 
			
		|||
static inline void qdisc_run_end(struct Qdisc *qdisc)
 | 
			
		||||
{
 | 
			
		||||
	write_seqcount_end(&qdisc->running);
 | 
			
		||||
	if (qdisc->flags & TCQ_F_NOLOCK)
 | 
			
		||||
	if (qdisc->flags & TCQ_F_NOLOCK) {
 | 
			
		||||
		spin_unlock(&qdisc->seqlock);
 | 
			
		||||
 | 
			
		||||
		if (unlikely(test_bit(__QDISC_STATE_MISSED,
 | 
			
		||||
				      &qdisc->state))) {
 | 
			
		||||
			clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
 | 
			
		||||
			__netif_schedule(qdisc);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -640,8 +640,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 | 
			
		|||
{
 | 
			
		||||
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 | 
			
		||||
	struct sk_buff *skb = NULL;
 | 
			
		||||
	bool need_retry = true;
 | 
			
		||||
	int band;
 | 
			
		||||
 | 
			
		||||
retry:
 | 
			
		||||
	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
 | 
			
		||||
		struct skb_array *q = band2list(priv, band);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -652,6 +654,23 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 | 
			
		|||
	}
 | 
			
		||||
	if (likely(skb)) {
 | 
			
		||||
		qdisc_update_stats_at_dequeue(qdisc, skb);
 | 
			
		||||
	} else if (need_retry &&
 | 
			
		||||
		   test_bit(__QDISC_STATE_MISSED, &qdisc->state)) {
 | 
			
		||||
		/* Delay clearing the STATE_MISSED here to reduce
 | 
			
		||||
		 * the overhead of the second spin_trylock() in
 | 
			
		||||
		 * qdisc_run_begin() and __netif_schedule() calling
 | 
			
		||||
		 * in qdisc_run_end().
 | 
			
		||||
		 */
 | 
			
		||||
		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
 | 
			
		||||
 | 
			
		||||
		/* Make sure dequeuing happens after clearing
 | 
			
		||||
		 * STATE_MISSED.
 | 
			
		||||
		 */
 | 
			
		||||
		smp_mb__after_atomic();
 | 
			
		||||
 | 
			
		||||
		need_retry = false;
 | 
			
		||||
 | 
			
		||||
		goto retry;
 | 
			
		||||
	} else {
 | 
			
		||||
		WRITE_ONCE(qdisc->empty, true);
 | 
			
		||||
	}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue