	Revert "net: skb: introduce and use a single page frag cache"
This reverts commit dbae2b0628 ("net: skb: introduce and use a single page frag cache").

The intended goal of such change was to counter a performance regression introduced by commit 3226b158e6 ("net: avoid 32 x truesize under-estimation for tiny skbs").

Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block.

The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck.

Thanks to commit bf9f1baa27 ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level.

The revert itself required some additional mangling due to the introduction of the SKB_HEAD_ALIGN() helper and local lock infra in the affected code.

Suggested-by: Eric Dumazet <edumazet@google.com>
Fixes: dbae2b0628 ("net: skb: introduce and use a single page frag cache")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/e649212fde9f0fdee23909ca0d14158d32bb7425.1738877290.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
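To make the sizing argument above concrete, here is a small userspace sketch of the arithmetic; it is not kernel code, and the 64-byte cacheline and roughly 320-byte skb_shared_info footprint (hence the stand-in WITH_OVERHEAD()/HEAD_ALIGN() macros) are assumptions about a typical x86_64 build, not values taken from this commit. Under those assumptions a tiny napi_alloc_skb() request needs about 448 bytes of head storage, which fits the 512-byte block mentioned above, while the reverted allocator would have pinned a full 1K page fragment for it.

/* Hedged sketch, not kernel code: approximate the head sizing that, after
 * this revert, routes tiny napi_alloc_skb() requests to kmalloc()/the small
 * skb->head kmem_cache instead of a page fragment.  Constants are assumed
 * values for a typical x86_64 build.
 */
#include <stdio.h>

#define CACHELINE		64u
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define SHINFO			ALIGN_UP(320u, CACHELINE)	/* assumed skb_shared_info footprint */
#define WITH_OVERHEAD(x)	((x) - SHINFO)			/* stand-in for SKB_WITH_OVERHEAD() */
#define HEAD_ALIGN(x)		(ALIGN_UP((x), CACHELINE) + SHINFO)	/* stand-in for SKB_HEAD_ALIGN() */

int main(void)
{
	unsigned int len = 128;	/* a tiny virtio_net head, e.g. a small UDP packet */

	if (len <= WITH_OVERHEAD(1024))
		/* post-revert path: 128 + 320 = 448 bytes, fits a 512-byte block */
		printf("kmalloc head: %u bytes needed\n", HEAD_ALIGN(len));
	else
		/* larger requests keep using the per-CPU page frag cache */
		printf("page frag head: %u bytes needed\n", HEAD_ALIGN(len));
	return 0;
}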
parent cb827db50a
commit 011b033590

3 changed files with 22 additions and 99 deletions
include/linux/netdevice.h
@@ -4115,7 +4115,6 @@ void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
-void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 
 static inline void napi_free_frags(struct napi_struct *napi)
net/core/dev.c
@@ -6920,6 +6920,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
 	list_add_rcu(&napi->dev_list, higher); /* adds after higher */
 }
 
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_get_frags(napi);
+	WARN_ON_ONCE(skb && skb->head_frag);
+	napi_free_frags(napi);
+	local_bh_enable();
+}
+
 void netif_napi_add_weight_locked(struct net_device *dev,
 				  struct napi_struct *napi,
 				  int (*poll)(struct napi_struct *, int),
net/core/skbuff.c
@@ -220,67 +220,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 #define NAPI_SKB_CACHE_BULK	16
 #define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
-#if PAGE_SIZE == SZ_4K
-
-#define NAPI_HAS_SMALL_PAGE_FRAG	1
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)
-
-/* specialized page frag allocator using a single order 0 page
- * and slicing it into 1K sized fragment. Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
-	void *va;
-	u16 offset;
-	bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
-	struct page *page;
-	int offset;
-
-	offset = nc->offset - SZ_1K;
-	if (likely(offset >= 0))
-		goto use_frag;
-
-	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-	if (!page)
-		return NULL;
-
-	nc->va = page_address(page);
-	nc->pfmemalloc = page_is_pfmemalloc(page);
-	offset = PAGE_SIZE - SZ_1K;
-	page_ref_add(page, offset / SZ_1K);
-
-use_frag:
-	nc->offset = offset;
-	return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG	0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
-	return NULL;
-}
-
-#endif
-
 struct napi_alloc_cache {
 	local_lock_t bh_lock;
 	struct page_frag_cache page;
-	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
@@ -290,23 +232,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
 };
 
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
-	struct sk_buff *skb;
-
-	local_bh_disable();
-	skb = napi_get_frags(napi);
-	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
-	napi_free_frags(napi);
-	local_bh_enable();
-}
-
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -813,10 +738,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 
 	/* If requested length is either too small or too big,
 	 * we use kmalloc() for skb->head allocation.
-	 * When the small frag allocator is available, prefer it over kmalloc
-	 * for small fragments
 	 */
-	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
+	if (len <= SKB_WITH_OVERHEAD(1024) ||
 	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
 	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
 		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -826,32 +749,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 		goto skb_success;
 	}
 
+	len = SKB_HEAD_ALIGN(len);
+
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
 
 	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 	nc = this_cpu_ptr(&napi_alloc_cache);
-	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
-		/* we are artificially inflating the allocation size, but
-		 * that is not as bad as it may look like, as:
-		 * - 'len' less than GRO_MAX_HEAD makes little sense
-		 * - On most systems, larger 'len' values lead to fragment
-		 *   size above 512 bytes
-		 * - kmalloc would use the kmalloc-1k slab for such values
-		 * - Builds with smaller GRO_MAX_HEAD will very likely do
-		 *   little networking, as that implies no WiFi and no
-		 *   tunnels support, and 32 bits arches.
-		 */
-		len = SZ_1K;
 
-		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
-		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
-	} else {
-		len = SKB_HEAD_ALIGN(len);
-
-		data = page_frag_alloc(&nc->page, len, gfp_mask);
-		pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
-	}
+	data = page_frag_alloc(&nc->page, len, gfp_mask);
+	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
 	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
 	if (unlikely(!data))
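As a closing illustration, the following hedged userspace sketch mimics the top-down 1K slicing walk of the page_frag_alloc_1k() helper removed in the net/core/skbuff.c hunk above: each tiny head burns a full 1 KiB slice, at most four fragments share one 4 KiB page, and a fresh page (plus a refcount bias) is needed as soon as the current one is exhausted. malloc() stands in for alloc_pages_node() and the refcount bias is only printed, so this is a sketch of the mechanism, not the kernel implementation.

/* Hedged userspace sketch of the removed page_frag_alloc_1k() walk; not
 * kernel code.  malloc() replaces alloc_pages_node() and the refcount bias
 * is merely reported.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SZ	4096
#define SZ_1K	1024

struct frag_1k {
	char *va;	/* base of the current backing "page" */
	int offset;	/* offset of the most recently handed-out slice */
};

static void *frag_alloc_1k(struct frag_1k *nc)
{
	int offset = nc->offset - SZ_1K;

	if (offset < 0) {
		/* current page exhausted: grab a fresh one */
		nc->va = malloc(PAGE_SZ);
		if (!nc->va)
			return NULL;
		offset = PAGE_SZ - SZ_1K;
		printf("new page, refcount biased by %d extra references\n",
		       offset / SZ_1K);
	}
	nc->offset = offset;
	return nc->va + offset;
}

int main(void)
{
	struct frag_1k nc = { .va = NULL, .offset = 0 };
	int i;

	/* five tiny skb heads: the fifth allocation forces a second page */
	for (i = 0; i < 5; i++) {
		char *p = frag_alloc_1k(&nc);

		if (!p)
			return 1;
		printf("frag %d at page offset %ld\n", i, (long)(p - nc.va));
	}
	return 0;	/* backing pages are intentionally leaked in this sketch */
}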