forked from mirrors/linux
		
	dev: add per net_device packet type chains
When many pf_packet listeners are created on a lot of interfaces, the current implementation, which uses global packet type lists, scales poorly. This patch adds per-net_device packet type lists to fix this problem.

The patch was originally written by Eric Biederman for linux-2.6.29. Tested on linux-3.16.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									7b4ce694b2
								
							
						
					
					
						commit
						7866a62104
					
				
					 2 changed files with 89 additions and 51 deletions
				
			
		|  | @ -1514,6 +1514,8 @@ struct net_device { | ||||||
| 	struct list_head	napi_list; | 	struct list_head	napi_list; | ||||||
| 	struct list_head	unreg_list; | 	struct list_head	unreg_list; | ||||||
| 	struct list_head	close_list; | 	struct list_head	close_list; | ||||||
|  | 	struct list_head	ptype_all; | ||||||
|  | 	struct list_head	ptype_specific; | ||||||
| 
 | 
 | ||||||
| 	struct { | 	struct { | ||||||
| 		struct list_head upper; | 		struct list_head upper; | ||||||
|  |  | ||||||
							
								
								
									
										138
									
								
								net/core/dev.c
									
									
									
									
									
								
							
							
						
						
									
										138
									
								
								net/core/dev.c
									
									
									
									
									
								
							|  | @ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) | ||||||
| static inline struct list_head *ptype_head(const struct packet_type *pt) | static inline struct list_head *ptype_head(const struct packet_type *pt) | ||||||
| { | { | ||||||
| 	if (pt->type == htons(ETH_P_ALL)) | 	if (pt->type == htons(ETH_P_ALL)) | ||||||
| 		return &ptype_all; | 		return pt->dev ? &pt->dev->ptype_all : &ptype_all; | ||||||
| 	else | 	else | ||||||
| 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; | 		return pt->dev ? &pt->dev->ptype_specific : | ||||||
|  | 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  | @ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb, | ||||||
| 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); | 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static inline void deliver_ptype_list_skb(struct sk_buff *skb, | ||||||
|  | 					  struct packet_type **pt, | ||||||
|  | 					  struct net_device *dev, __be16 type, | ||||||
|  | 					  struct list_head *ptype_list) | ||||||
|  | { | ||||||
|  | 	struct packet_type *ptype, *pt_prev = *pt; | ||||||
|  | 
 | ||||||
|  | 	list_for_each_entry_rcu(ptype, ptype_list, list) { | ||||||
|  | 		if (ptype->type != type) | ||||||
|  | 			continue; | ||||||
|  | 		if (pt_prev) | ||||||
|  | 			deliver_skb(skb, pt_prev, dev); | ||||||
|  | 		pt_prev = ptype; | ||||||
|  | 	} | ||||||
|  | 	*pt = pt_prev; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) | static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) | ||||||
| { | { | ||||||
| 	if (!ptype->af_packet_priv || !skb->sk) | 	if (!ptype->af_packet_priv || !skb->sk) | ||||||
|  | @ -1757,45 +1775,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) | ||||||
| 	struct packet_type *ptype; | 	struct packet_type *ptype; | ||||||
| 	struct sk_buff *skb2 = NULL; | 	struct sk_buff *skb2 = NULL; | ||||||
| 	struct packet_type *pt_prev = NULL; | 	struct packet_type *pt_prev = NULL; | ||||||
|  | 	struct list_head *ptype_list = &ptype_all; | ||||||
| 
 | 
 | ||||||
| 	rcu_read_lock(); | 	rcu_read_lock(); | ||||||
| 	list_for_each_entry_rcu(ptype, &ptype_all, list) { | again: | ||||||
|  | 	list_for_each_entry_rcu(ptype, ptype_list, list) { | ||||||
| 		/* Never send packets back to the socket
 | 		/* Never send packets back to the socket
 | ||||||
| 		 * they originated from - MvS (miquels@drinkel.ow.org) | 		 * they originated from - MvS (miquels@drinkel.ow.org) | ||||||
| 		 */ | 		 */ | ||||||
| 		if ((ptype->dev == dev || !ptype->dev) && | 		if (skb_loop_sk(ptype, skb)) | ||||||
| 		    (!skb_loop_sk(ptype, skb))) { | 			continue; | ||||||
| 			if (pt_prev) { |  | ||||||
| 				deliver_skb(skb2, pt_prev, skb->dev); |  | ||||||
| 				pt_prev = ptype; |  | ||||||
| 				continue; |  | ||||||
| 			} |  | ||||||
| 
 | 
 | ||||||
| 			skb2 = skb_clone(skb, GFP_ATOMIC); | 		if (pt_prev) { | ||||||
| 			if (!skb2) | 			deliver_skb(skb2, pt_prev, skb->dev); | ||||||
| 				break; |  | ||||||
| 
 |  | ||||||
| 			net_timestamp_set(skb2); |  | ||||||
| 
 |  | ||||||
| 			/* skb->nh should be correctly
 |  | ||||||
| 			   set by sender, so that the second statement is |  | ||||||
| 			   just protection against buggy protocols. |  | ||||||
| 			 */ |  | ||||||
| 			skb_reset_mac_header(skb2); |  | ||||||
| 
 |  | ||||||
| 			if (skb_network_header(skb2) < skb2->data || |  | ||||||
| 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) { |  | ||||||
| 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n", |  | ||||||
| 						     ntohs(skb2->protocol), |  | ||||||
| 						     dev->name); |  | ||||||
| 				skb_reset_network_header(skb2); |  | ||||||
| 			} |  | ||||||
| 
 |  | ||||||
| 			skb2->transport_header = skb2->network_header; |  | ||||||
| 			skb2->pkt_type = PACKET_OUTGOING; |  | ||||||
| 			pt_prev = ptype; | 			pt_prev = ptype; | ||||||
|  | 			continue; | ||||||
| 		} | 		} | ||||||
|  | 
 | ||||||
|  | 		/* need to clone skb, done only once */ | ||||||
|  | 		skb2 = skb_clone(skb, GFP_ATOMIC); | ||||||
|  | 		if (!skb2) | ||||||
|  | 			goto out_unlock; | ||||||
|  | 
 | ||||||
|  | 		net_timestamp_set(skb2); | ||||||
|  | 
 | ||||||
|  | 		/* skb->nh should be correctly
 | ||||||
|  | 		 * set by sender, so that the second statement is | ||||||
|  | 		 * just protection against buggy protocols. | ||||||
|  | 		 */ | ||||||
|  | 		skb_reset_mac_header(skb2); | ||||||
|  | 
 | ||||||
|  | 		if (skb_network_header(skb2) < skb2->data || | ||||||
|  | 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) { | ||||||
|  | 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n", | ||||||
|  | 					     ntohs(skb2->protocol), | ||||||
|  | 					     dev->name); | ||||||
|  | 			skb_reset_network_header(skb2); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		skb2->transport_header = skb2->network_header; | ||||||
|  | 		skb2->pkt_type = PACKET_OUTGOING; | ||||||
|  | 		pt_prev = ptype; | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
|  | 	if (ptype_list == &ptype_all) { | ||||||
|  | 		ptype_list = &dev->ptype_all; | ||||||
|  | 		goto again; | ||||||
|  | 	} | ||||||
|  | out_unlock: | ||||||
| 	if (pt_prev) | 	if (pt_prev) | ||||||
| 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); | 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); | ||||||
| 	rcu_read_unlock(); | 	rcu_read_unlock(); | ||||||
|  | @ -2617,7 +2644,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, | ||||||
| 	unsigned int len; | 	unsigned int len; | ||||||
| 	int rc; | 	int rc; | ||||||
| 
 | 
 | ||||||
| 	if (!list_empty(&ptype_all)) | 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) | ||||||
| 		dev_queue_xmit_nit(skb, dev); | 		dev_queue_xmit_nit(skb, dev); | ||||||
| 
 | 
 | ||||||
| 	len = skb->len; | 	len = skb->len; | ||||||
|  | @ -3615,7 +3642,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) | ||||||
| 	struct packet_type *ptype, *pt_prev; | 	struct packet_type *ptype, *pt_prev; | ||||||
| 	rx_handler_func_t *rx_handler; | 	rx_handler_func_t *rx_handler; | ||||||
| 	struct net_device *orig_dev; | 	struct net_device *orig_dev; | ||||||
| 	struct net_device *null_or_dev; |  | ||||||
| 	bool deliver_exact = false; | 	bool deliver_exact = false; | ||||||
| 	int ret = NET_RX_DROP; | 	int ret = NET_RX_DROP; | ||||||
| 	__be16 type; | 	__be16 type; | ||||||
|  | @ -3658,11 +3684,15 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) | ||||||
| 		goto skip_taps; | 		goto skip_taps; | ||||||
| 
 | 
 | ||||||
| 	list_for_each_entry_rcu(ptype, &ptype_all, list) { | 	list_for_each_entry_rcu(ptype, &ptype_all, list) { | ||||||
| 		if (!ptype->dev || ptype->dev == skb->dev) { | 		if (pt_prev) | ||||||
| 			if (pt_prev) | 			ret = deliver_skb(skb, pt_prev, orig_dev); | ||||||
| 				ret = deliver_skb(skb, pt_prev, orig_dev); | 		pt_prev = ptype; | ||||||
| 			pt_prev = ptype; | 	} | ||||||
| 		} | 
 | ||||||
|  | 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { | ||||||
|  | 		if (pt_prev) | ||||||
|  | 			ret = deliver_skb(skb, pt_prev, orig_dev); | ||||||
|  | 		pt_prev = ptype; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| skip_taps: | skip_taps: | ||||||
|  | @ -3718,19 +3748,21 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) | ||||||
| 		skb->vlan_tci = 0; | 		skb->vlan_tci = 0; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	/* deliver only exact match when indicated */ |  | ||||||
| 	null_or_dev = deliver_exact ? skb->dev : NULL; |  | ||||||
| 
 |  | ||||||
| 	type = skb->protocol; | 	type = skb->protocol; | ||||||
| 	list_for_each_entry_rcu(ptype, | 
 | ||||||
| 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { | 	/* deliver only exact match when indicated */ | ||||||
| 		if (ptype->type == type && | 	if (likely(!deliver_exact)) { | ||||||
| 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev || | 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, | ||||||
| 		     ptype->dev == orig_dev)) { | 				       &ptype_base[ntohs(type) & | ||||||
| 			if (pt_prev) | 						   PTYPE_HASH_MASK]); | ||||||
| 				ret = deliver_skb(skb, pt_prev, orig_dev); | 	} | ||||||
| 			pt_prev = ptype; | 
 | ||||||
| 		} | 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, | ||||||
|  | 			       &orig_dev->ptype_specific); | ||||||
|  | 
 | ||||||
|  | 	if (unlikely(skb->dev != orig_dev)) { | ||||||
|  | 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, | ||||||
|  | 				       &skb->dev->ptype_specific); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (pt_prev) { | 	if (pt_prev) { | ||||||
|  | @ -6579,6 +6611,8 @@ void netdev_run_todo(void) | ||||||
| 
 | 
 | ||||||
| 		/* paranoia */ | 		/* paranoia */ | ||||||
| 		BUG_ON(netdev_refcnt_read(dev)); | 		BUG_ON(netdev_refcnt_read(dev)); | ||||||
|  | 		BUG_ON(!list_empty(&dev->ptype_all)); | ||||||
|  | 		BUG_ON(!list_empty(&dev->ptype_specific)); | ||||||
| 		WARN_ON(rcu_access_pointer(dev->ip_ptr)); | 		WARN_ON(rcu_access_pointer(dev->ip_ptr)); | ||||||
| 		WARN_ON(rcu_access_pointer(dev->ip6_ptr)); | 		WARN_ON(rcu_access_pointer(dev->ip6_ptr)); | ||||||
| 		WARN_ON(dev->dn_ptr); | 		WARN_ON(dev->dn_ptr); | ||||||
|  | @ -6761,6 +6795,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, | ||||||
| 	INIT_LIST_HEAD(&dev->adj_list.lower); | 	INIT_LIST_HEAD(&dev->adj_list.lower); | ||||||
| 	INIT_LIST_HEAD(&dev->all_adj_list.upper); | 	INIT_LIST_HEAD(&dev->all_adj_list.upper); | ||||||
| 	INIT_LIST_HEAD(&dev->all_adj_list.lower); | 	INIT_LIST_HEAD(&dev->all_adj_list.lower); | ||||||
|  | 	INIT_LIST_HEAD(&dev->ptype_all); | ||||||
|  | 	INIT_LIST_HEAD(&dev->ptype_specific); | ||||||
| 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; | 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; | ||||||
| 	setup(dev); | 	setup(dev); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Salam Noureddine
						Salam Noureddine