forked from mirrors/linux
		
	ipvlan: Introduce l3s mode
In a typical IPvlan L3 setup, the master is in the default-ns and each slave is in a different (slave) ns. In this setup, egress packet processing for traffic originating from a slave-ns will hit all NF_HOOKs in the slave-ns as well as in the default-ns. However, the same is not true for ingress processing: all these NF_HOOKs are hit only in the slave-ns, skipping them in the default-ns. IPvlan in L3 mode is restrictive, and if admins want to deploy iptables rules in the default-ns, this asymmetric data path makes it impossible to do so. This patch makes use of the l3_rcv() (added as part of l3mdev enhancements) to perform input route lookup on RX packets without changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN to change the skb->dev just before handing over skb to L4. Signed-off-by: Mahesh Bandewar <maheshb@google.com> CC: David Ahern <dsa@cumulusnetworks.com> Reviewed-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									e8bffe0cf9
								
							
						
					
					
						commit
						4fbae7d83c
					
				
					 6 changed files with 188 additions and 8 deletions
				
			
		|  | @ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module | |||
| 	There are no module parameters for this driver and it can be configured | ||||
| using IProute2/ip utility. | ||||
| 
 | ||||
| 	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 } | ||||
| 	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | l3 | l3s } | ||||
| 
 | ||||
| 	e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 | ||||
| 
 | ||||
|  | @ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be | |||
| used before packets are queued on the outbound device. In this mode the slaves | ||||
| will not receive nor can send multicast / broadcast traffic. | ||||
| 
 | ||||
| 4.3 L3S mode: | ||||
| 	This is very similar to the L3 mode except that iptables (conn-tracking) | ||||
| works in this mode and hence it is L3-symmetric (L3s). This mode has slightly lower | ||||
| performance, but that shouldn't matter since you are choosing it over plain-L3 | ||||
| mode to make conn-tracking work. | ||||
| 
 | ||||
| 5. What to choose (macvlan vs. ipvlan)? | ||||
| 	These two devices are very similar in many regards and the specific use | ||||
|  |  | |||
|  | @ -149,6 +149,7 @@ config IPVLAN | |||
|     tristate "IP-VLAN support" | ||||
|     depends on INET | ||||
|     depends on IPV6 | ||||
|     depends on NET_L3_MASTER_DEV | ||||
|     ---help--- | ||||
|       This allows one to create virtual devices off of a main interface | ||||
|       and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) | ||||
|  |  | |||
|  | @ -23,11 +23,13 @@ | |||
| #include <linux/if_vlan.h> | ||||
| #include <linux/ip.h> | ||||
| #include <linux/inetdevice.h> | ||||
| #include <linux/netfilter.h> | ||||
| #include <net/ip.h> | ||||
| #include <net/ip6_route.h> | ||||
| #include <net/rtnetlink.h> | ||||
| #include <net/route.h> | ||||
| #include <net/addrconf.h> | ||||
| #include <net/l3mdev.h> | ||||
| 
 | ||||
| #define IPVLAN_DRV	"ipvlan" | ||||
| #define IPV_DRV_VER	"0.1" | ||||
|  | @ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, | |||
| 				   const void *iaddr, bool is_v6); | ||||
| bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); | ||||
| void ipvlan_ht_addr_del(struct ipvl_addr *addr); | ||||
| struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, | ||||
| 			      u16 proto); | ||||
| unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, | ||||
| 			     const struct nf_hook_state *state); | ||||
| #endif /* __IPVLAN_H */ | ||||
|  |  | |||
|  | @ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 	case IPVLAN_MODE_L2: | ||||
| 		return ipvlan_xmit_mode_l2(skb, dev); | ||||
| 	case IPVLAN_MODE_L3: | ||||
| 	case IPVLAN_MODE_L3S: | ||||
| 		return ipvlan_xmit_mode_l3(skb, dev); | ||||
| 	} | ||||
| 
 | ||||
|  | @ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) | |||
| 		return ipvlan_handle_mode_l2(pskb, port); | ||||
| 	case IPVLAN_MODE_L3: | ||||
| 		return ipvlan_handle_mode_l3(pskb, port); | ||||
| 	case IPVLAN_MODE_L3S: | ||||
| 		return RX_HANDLER_PASS; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Should not reach here */ | ||||
|  | @ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) | |||
| 	kfree_skb(skb); | ||||
| 	return RX_HANDLER_CONSUMED; | ||||
| } | ||||
| 
 | ||||
| static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, | ||||
| 					    struct net_device *dev) | ||||
| { | ||||
| 	struct ipvl_addr *addr = NULL; | ||||
| 	struct ipvl_port *port; | ||||
| 	void *lyr3h; | ||||
| 	int addr_type; | ||||
| 
 | ||||
| 	if (!dev || !netif_is_ipvlan_port(dev)) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	port = ipvlan_port_get_rcu(dev); | ||||
| 	if (!port || port->mode != IPVLAN_MODE_L3S) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); | ||||
| 	if (!lyr3h) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); | ||||
| out: | ||||
| 	return addr; | ||||
| } | ||||
| 
 | ||||
| struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, | ||||
| 			      u16 proto) | ||||
| { | ||||
| 	struct ipvl_addr *addr; | ||||
| 	struct net_device *sdev; | ||||
| 
 | ||||
| 	addr = ipvlan_skb_to_addr(skb, dev); | ||||
| 	if (!addr) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	sdev = addr->master->dev; | ||||
| 	switch (proto) { | ||||
| 	case AF_INET: | ||||
| 	{ | ||||
| 		int err; | ||||
| 		struct iphdr *ip4h = ip_hdr(skb); | ||||
| 
 | ||||
| 		err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, | ||||
| 					   ip4h->tos, sdev); | ||||
| 		if (unlikely(err)) | ||||
| 			goto out; | ||||
| 		break; | ||||
| 	} | ||||
| 	case AF_INET6: | ||||
| 	{ | ||||
| 		struct dst_entry *dst; | ||||
| 		struct ipv6hdr *ip6h = ipv6_hdr(skb); | ||||
| 		int flags = RT6_LOOKUP_F_HAS_SADDR; | ||||
| 		struct flowi6 fl6 = { | ||||
| 			.flowi6_iif   = sdev->ifindex, | ||||
| 			.daddr        = ip6h->daddr, | ||||
| 			.saddr        = ip6h->saddr, | ||||
| 			.flowlabel    = ip6_flowinfo(ip6h), | ||||
| 			.flowi6_mark  = skb->mark, | ||||
| 			.flowi6_proto = ip6h->nexthdr, | ||||
| 		}; | ||||
| 
 | ||||
| 		skb_dst_drop(skb); | ||||
| 		dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); | ||||
| 		skb_dst_set(skb, dst); | ||||
| 		break; | ||||
| 	} | ||||
| 	default: | ||||
| 		break; | ||||
| 	} | ||||
| 
 | ||||
| out: | ||||
| 	return skb; | ||||
| } | ||||
| 
 | ||||
| unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, | ||||
| 			     const struct nf_hook_state *state) | ||||
| { | ||||
| 	struct ipvl_addr *addr; | ||||
| 	unsigned int len; | ||||
| 
 | ||||
| 	addr = ipvlan_skb_to_addr(skb, skb->dev); | ||||
| 	if (!addr) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	skb->dev = addr->master->dev; | ||||
| 	len = skb->len + ETH_HLEN; | ||||
| 	ipvlan_count_rx(addr->master, len, true, false); | ||||
| out: | ||||
| 	return NF_ACCEPT; | ||||
| } | ||||
|  |  | |||
|  | @ -9,24 +9,87 @@ | |||
| 
 | ||||
| #include "ipvlan.h" | ||||
| 
 | ||||
| static u32 ipvl_nf_hook_refcnt = 0; | ||||
| 
 | ||||
| static struct nf_hook_ops ipvl_nfops[] __read_mostly = { | ||||
| 	{ | ||||
| 		.hook     = ipvlan_nf_input, | ||||
| 		.pf       = NFPROTO_IPV4, | ||||
| 		.hooknum  = NF_INET_LOCAL_IN, | ||||
| 		.priority = INT_MAX, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.hook     = ipvlan_nf_input, | ||||
| 		.pf       = NFPROTO_IPV6, | ||||
| 		.hooknum  = NF_INET_LOCAL_IN, | ||||
| 		.priority = INT_MAX, | ||||
| 	}, | ||||
| }; | ||||
| 
 | ||||
| static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = { | ||||
| 	.l3mdev_l3_rcv = ipvlan_l3_rcv, | ||||
| }; | ||||
| 
 | ||||
| static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) | ||||
| { | ||||
| 	ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; | ||||
| } | ||||
| 
 | ||||
| static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) | ||||
| static int ipvlan_register_nf_hook(void) | ||||
| { | ||||
| 	int err = 0; | ||||
| 
 | ||||
| 	if (!ipvl_nf_hook_refcnt) { | ||||
| 		err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); | ||||
| 		if (!err) | ||||
| 			ipvl_nf_hook_refcnt = 1; | ||||
| 	} else { | ||||
| 		ipvl_nf_hook_refcnt++; | ||||
| 	} | ||||
| 
 | ||||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| static void ipvlan_unregister_nf_hook(void) | ||||
| { | ||||
| 	WARN_ON(!ipvl_nf_hook_refcnt); | ||||
| 
 | ||||
| 	ipvl_nf_hook_refcnt--; | ||||
| 	if (!ipvl_nf_hook_refcnt) | ||||
| 		_nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); | ||||
| } | ||||
| 
 | ||||
| static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval) | ||||
| { | ||||
| 	struct ipvl_dev *ipvlan; | ||||
| 	struct net_device *mdev = port->dev; | ||||
| 	int err = 0; | ||||
| 
 | ||||
| 	ASSERT_RTNL(); | ||||
| 	if (port->mode != nval) { | ||||
| 		if (nval == IPVLAN_MODE_L3S) { | ||||
| 			/* New mode is L3S */ | ||||
| 			err = ipvlan_register_nf_hook(); | ||||
| 			if (!err) { | ||||
| 				mdev->l3mdev_ops = &ipvl_l3mdev_ops; | ||||
| 				mdev->priv_flags |= IFF_L3MDEV_MASTER; | ||||
| 			} else | ||||
| 				return err; | ||||
| 		} else if (port->mode == IPVLAN_MODE_L3S) { | ||||
| 			/* Old mode was L3S */ | ||||
| 			mdev->priv_flags &= ~IFF_L3MDEV_MASTER; | ||||
| 			ipvlan_unregister_nf_hook(); | ||||
| 			mdev->l3mdev_ops = NULL; | ||||
| 		} | ||||
| 		list_for_each_entry(ipvlan, &port->ipvlans, pnode) { | ||||
| 			if (nval == IPVLAN_MODE_L3) | ||||
| 			if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) | ||||
| 				ipvlan->dev->flags |= IFF_NOARP; | ||||
| 			else | ||||
| 				ipvlan->dev->flags &= ~IFF_NOARP; | ||||
| 		} | ||||
| 		port->mode = nval; | ||||
| 	} | ||||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| static int ipvlan_port_create(struct net_device *dev) | ||||
|  | @ -74,6 +137,11 @@ static void ipvlan_port_destroy(struct net_device *dev) | |||
| 	struct ipvl_port *port = ipvlan_port_get_rtnl(dev); | ||||
| 
 | ||||
| 	dev->priv_flags &= ~IFF_IPVLAN_MASTER; | ||||
| 	if (port->mode == IPVLAN_MODE_L3S) { | ||||
| 		dev->priv_flags &= ~IFF_L3MDEV_MASTER; | ||||
| 		ipvlan_unregister_nf_hook(); | ||||
| 		dev->l3mdev_ops = NULL; | ||||
| 	} | ||||
| 	netdev_rx_handler_unregister(dev); | ||||
| 	cancel_work_sync(&port->wq); | ||||
| 	__skb_queue_purge(&port->backlog); | ||||
|  | @ -132,7 +200,8 @@ static int ipvlan_open(struct net_device *dev) | |||
| 	struct net_device *phy_dev = ipvlan->phy_dev; | ||||
| 	struct ipvl_addr *addr; | ||||
| 
 | ||||
| 	if (ipvlan->port->mode == IPVLAN_MODE_L3) | ||||
| 	if (ipvlan->port->mode == IPVLAN_MODE_L3 || | ||||
| 	    ipvlan->port->mode == IPVLAN_MODE_L3S) | ||||
| 		dev->flags |= IFF_NOARP; | ||||
| 	else | ||||
| 		dev->flags &= ~IFF_NOARP; | ||||
|  | @ -372,13 +441,14 @@ static int ipvlan_nl_changelink(struct net_device *dev, | |||
| { | ||||
| 	struct ipvl_dev *ipvlan = netdev_priv(dev); | ||||
| 	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); | ||||
| 	int err = 0; | ||||
| 
 | ||||
| 	if (data && data[IFLA_IPVLAN_MODE]) { | ||||
| 		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); | ||||
| 
 | ||||
| 		ipvlan_set_port_mode(port, nmode); | ||||
| 		err = ipvlan_set_port_mode(port, nmode); | ||||
| 	} | ||||
| 	return 0; | ||||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| static size_t ipvlan_nl_getsize(const struct net_device *dev) | ||||
|  | @ -473,10 +543,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, | |||
| 		unregister_netdevice(dev); | ||||
| 		return err; | ||||
| 	} | ||||
| 	err = ipvlan_set_port_mode(port, mode); | ||||
| 	if (err) { | ||||
| 		unregister_netdevice(dev); | ||||
| 		return err; | ||||
| 	} | ||||
| 
 | ||||
| 	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); | ||||
| 	ipvlan_set_port_mode(port, mode); | ||||
| 
 | ||||
| 	netif_stacked_transfer_operstate(phy_dev, dev); | ||||
| 	return 0; | ||||
| } | ||||
|  |  | |||
|  | @ -464,6 +464,7 @@ enum { | |||
/* Operating modes of an ipvlan port. */
enum ipvlan_mode {
	IPVLAN_MODE_L2 = 0,	/* L2 mode */
	IPVLAN_MODE_L3,		/* L3 mode */
	IPVLAN_MODE_L3S,	/* L3-symmetric mode */
	IPVLAN_MODE_MAX		/* number of modes; not a mode itself */
};
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Mahesh Bandewar
						Mahesh Bandewar