3
0
Fork 0
forked from mirrors/linux

netfilter pull request 25-05-23

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEjF9xRqF1emXiQiqU1w0aZmrPKyEFAmgwd00ACgkQ1w0aZmrP
 KyEfwA//RXQ3i8PCa7lKHxDRhVzG3rEvgXRmiXeNd+JjzsCnybBb7+wRf3dtBGWT
 +1s44Utx1JqosWxCVBulqYC5bqSC66789l5X2jhYJmUZxRrbcsqPngwnIrjb/XeK
 ZJM62wiRhkBQED7yZLGy+y4VHQiG8CEMt16AOQHk863aruWv1tT7up90CTtzA545
 4GF/grU3FC0PsoTLwzWyvqsWK+9uk3Y4Tifp5hU3w6uRD9EjX5tHCZlXXSqOF5gu
 KT26OYsePYXhJVZIwDf2oVLGi0EVTPB9IFxZSNgLqyXqu2ILAb9OwRNVTNfTP7Pg
 1RWJWmgqvRNs9OM2ecifYgQf/AfvCL0Cja1BJOjmvtICuGegrYH7G5YYQsMl9CoE
 7jBoTzpToSASat5+dwoz81Bvzh447dYxRE2VmbxmRTTWToQYS1KGBPc9e3u/n5Rr
 ruh8tRZ3/R0Fy+YLDkrJst3grh5RLITbuyu4ElJMArPU50mLTVYxKd6nA3BqwB5G
 1GmLfCzvQH3e6PKz6CNke1AytVDy/wLTXtcbLnze2Muaj4AqhtOe5Q8ypnOO0Vyk
 PsJ6U3rm2asd3GE9+AIx8gZBv8yCu1w9CiwLK8ybT2NETb2dEnqPgWeDyT7rpcaD
 sQOPsBE1q/TEp9gofbYCHBm5E2mX9UP7Q6EHCTekrI97xLq8Q2M=
 =fBhd
 -----END PGP SIGNATURE-----

Merge tag 'nf-next-25-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following batch contains Netfilter updates for net-next,
specifically 26 patches (5 patches adding/updating selftests,
4 fixes, 3 PREEMPT_RT fixes, and 14 patches to enhance nf_tables):

1) Improve selftest coverage for pipapo 4 bit group format, from
   Florian Westphal.

2) Fix incorrect dependencies when compiling a kernel without
   legacy ip{6}tables support, also from Florian.

3) Two patches to fix nft_fib vrf issues, including selftest updates
   to improve coverage, also from Florian Westphal.

4) Fix incorrect nesting in nft_tunnel's GENEVE support, from
   Fernando F. Mancera.

5) Three patches to fix PREEMPT_RT issues with nf_dup infrastructure
   and nft_inner to match in inner headers, from Sebastian Andrzej Siewior.

6) Integrate conntrack information into nft trace infrastructure,
   from Florian Westphal.

7) A series of 13 patches that allow specifying wildcard netdevices in
   netdev basechains and flowtables, e.g.

   table netdev filter {
       chain ingress {
           type filter hook ingress devices = { eth0, eth1, vlan* } priority 0; policy accept;
       }
   }

   This also allows for runtime hook registration on NETDEV_{UN}REGISTER
   events, from Phil Sutter.

netfilter pull request 25-05-23

* tag 'nf-next-25-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next: (26 commits)
  selftests: netfilter: Torture nftables netdev hooks
  netfilter: nf_tables: Add notifications for hook changes
  netfilter: nf_tables: Support wildcard netdev hook specs
  netfilter: nf_tables: Sort labels in nft_netdev_hook_alloc()
  netfilter: nf_tables: Handle NETDEV_CHANGENAME events
  netfilter: nf_tables: Wrap netdev notifiers
  netfilter: nf_tables: Respect NETDEV_REGISTER events
  netfilter: nf_tables: Prepare for handling NETDEV_REGISTER events
  netfilter: nf_tables: Have a list of nf_hook_ops in nft_hook
  netfilter: nf_tables: Pass nf_hook_ops to nft_unregister_flowtable_hook()
  netfilter: nf_tables: Introduce nft_register_flowtable_ops()
  netfilter: nf_tables: Introduce nft_hook_find_ops{,_rcu}()
  netfilter: nf_tables: Introduce functions freeing nft_hook objects
  netfilter: nf_tables: add packets conntrack state to debug trace info
  netfilter: conntrack: make nf_conntrack_id callable without a module dependency
  netfilter: nf_dup_netdev: Move the recursion counter struct netdev_xmit
  netfilter: nft_inner: Use nested-BH locking for nft_pcpu_tun_ctx
  netfilter: nf_dup{4, 6}: Move duplication check to task_struct
  netfilter: nft_tunnel: fix geneve_opt dump
  selftests: netfilter: nft_fib.sh: add type and oif tests with and without VRFs
  ...
====================

Link: https://patch.msgid.link/20250523132712.458507-1-pablo@netfilter.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
Paolo Abeni 2025-05-26 18:53:40 +02:00
commit f5b60d6a57
31 changed files with 1505 additions and 231 deletions

View file

@ -11,6 +11,9 @@ struct netdev_xmit {
#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
u8 sched_mirred_nest;
#endif
#if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
u8 nf_dup_skb_recursion;
#endif
};
#endif

View file

@ -95,6 +95,9 @@ enum nf_hook_ops_type {
};
struct nf_hook_ops {
struct list_head list;
struct rcu_head rcu;
/* User fills in from here down. */
nf_hookfn *hook;
struct net_device *dev;
@ -470,6 +473,7 @@ struct nf_ct_hook {
void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
void (*set_closing)(struct nf_conntrack *nfct);
int (*confirm)(struct sk_buff *skb);
u32 (*get_id)(const struct nf_conntrack *nfct);
};
extern const struct nf_ct_hook __rcu *nf_ct_hook;
@ -497,17 +501,6 @@ struct nf_defrag_hook {
extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook;
extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook;
/*
* nf_skb_duplicated - TEE target has sent a packet
*
* When a xtables target sends a packet, the OUTPUT and POSTROUTING
* hooks are traversed again, i.e. nft and xtables are invoked recursively.
*
* This is used by xtables TEE target to prevent the duplicated skb from
* being duplicated again.
*/
DECLARE_PER_CPU(bool, nf_skb_duplicated);
/*
* Contains bitmask of ctnetlink event subscribers, if any.
* Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag.

View file

@ -1044,6 +1044,7 @@ struct task_struct {
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
unsigned in_nf_duplicate:1;
#ifdef CONFIG_PREEMPT_RT
struct netdev_xmit net_xmit;
#endif

View file

@ -1142,6 +1142,11 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
struct nft_hook;
void nf_tables_chain_device_notify(const struct nft_chain *chain,
const struct nft_hook *hook,
const struct net_device *dev, int event);
enum nft_chain_types {
NFT_CHAIN_T_DEFAULT = 0,
NFT_CHAIN_T_ROUTE,
@ -1199,12 +1204,17 @@ struct nft_stats {
struct nft_hook {
struct list_head list;
struct nf_hook_ops ops;
struct list_head ops_list;
struct rcu_head rcu;
char ifname[IFNAMSIZ];
u8 ifnamelen;
};
struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
const struct net_device *dev);
struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
const struct net_device *dev);
/**
* struct nft_base_chain - nf_tables base chain
*

View file

@ -2,6 +2,7 @@
#ifndef _NFT_FIB_H_
#define _NFT_FIB_H_
#include <net/l3mdev.h>
#include <net/netfilter/nf_tables.h>
struct nft_fib {
@ -39,6 +40,14 @@ static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt)
return nft_fib_is_loopback(pkt->skb, indev);
}
/*
 * Return the l3mdev master ifindex used for a fib lookup.
 *
 * The lookup is done on @iif when the caller supplies one; otherwise it
 * falls back to the device the packet's skb is currently attached to.
 */
static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt,
						    const struct net_device *iif)
{
	if (iif)
		return l3mdev_master_ifindex_rcu(iif);

	return l3mdev_master_ifindex_rcu(pkt->skb->dev);
}
int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset);
int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[]);

View file

@ -142,6 +142,8 @@ enum nf_tables_msg_types {
NFT_MSG_DESTROYOBJ,
NFT_MSG_DESTROYFLOWTABLE,
NFT_MSG_GETSETELEM_RESET,
NFT_MSG_NEWDEV,
NFT_MSG_DELDEV,
NFT_MSG_MAX,
};
@ -1784,10 +1786,18 @@ enum nft_synproxy_attributes {
* enum nft_devices_attributes - nf_tables device netlink attributes
*
* @NFTA_DEVICE_NAME: name of this device (NLA_STRING)
* @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING)
* @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING)
* @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING)
* @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING)
*/
enum nft_devices_attributes {
NFTA_DEVICE_UNSPEC,
NFTA_DEVICE_NAME,
NFTA_DEVICE_TABLE,
NFTA_DEVICE_FLOWTABLE,
NFTA_DEVICE_CHAIN,
NFTA_DEVICE_SPEC,
__NFTA_DEVICE_MAX
};
#define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1)
@ -1841,6 +1851,10 @@ enum nft_xfrm_keys {
* @NFTA_TRACE_MARK: nfmark (NLA_U32)
* @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32)
* @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32)
* @NFTA_TRACE_CT_ID: conntrack id (NLA_U32)
* @NFTA_TRACE_CT_DIRECTION: packets direction (NLA_U8)
* @NFTA_TRACE_CT_STATUS: conntrack status (NLA_U32)
* @NFTA_TRACE_CT_STATE: packet state (new, established, ...) (NLA_U32)
*/
enum nft_trace_attributes {
NFTA_TRACE_UNSPEC,
@ -1861,6 +1875,10 @@ enum nft_trace_attributes {
NFTA_TRACE_NFPROTO,
NFTA_TRACE_POLICY,
NFTA_TRACE_PAD,
NFTA_TRACE_CT_ID,
NFTA_TRACE_CT_DIRECTION,
NFTA_TRACE_CT_STATUS,
NFTA_TRACE_CT_STATE,
__NFTA_TRACE_MAX
};
#define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1)

View file

@ -25,6 +25,8 @@ enum nfnetlink_groups {
#define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA
NFNLGRP_NFTRACE,
#define NFNLGRP_NFTRACE NFNLGRP_NFTRACE
NFNLGRP_NFT_DEV,
#define NFNLGRP_NFT_DEV NFNLGRP_NFT_DEV
__NFNLGRP_MAX,
};
#define NFNLGRP_MAX (__NFNLGRP_MAX - 1)

View file

@ -270,7 +270,7 @@ ipt_do_table(void *priv,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);

View file

@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
struct iphdr *iph;
local_bh_disable();
if (this_cpu_read(nf_skb_duplicated))
if (current->in_nf_duplicate)
goto out;
/*
* Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->ttl;
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
__this_cpu_write(nf_skb_duplicated, true);
current->in_nf_duplicate = true;
ip_local_out(net, skb->sk, skb);
__this_cpu_write(nf_skb_duplicated, false);
current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}

View file

@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
else
addr = iph->saddr;
*dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) {
*dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
return;
}
*dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr);
}
EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
@ -65,8 +70,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct flowi4 fl4 = {
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_proto = pkt->tprot,
.flowi4_uid = sock_net_uid(nft_net(pkt), NULL),
.flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)),
};
const struct net_device *oif;
const struct net_device *found;
@ -90,6 +95,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
else
oif = NULL;
fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif);
iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
if (!iph) {
regs->verdict.code = NFT_BREAK;

View file

@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);

View file

@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
const struct in6_addr *gw, int oif)
{
local_bh_disable();
if (this_cpu_read(nf_skb_duplicated))
if (current->in_nf_duplicate)
goto out;
skb = pskb_copy(skb, GFP_ATOMIC);
if (skb == NULL)
@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->hop_limit;
}
if (nf_dup_ipv6_route(net, skb, gw, oif)) {
__this_cpu_write(nf_skb_duplicated, true);
current->in_nf_duplicate = true;
ip6_local_out(net, skb->sk, skb);
__this_cpu_write(nf_skb_duplicated, false);
current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}

View file

@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
fl6->flowi6_mark = pkt->skb->mark;
fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev);
return lookup_flags;
}
@ -73,8 +74,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
else if (priv->flags & NFTA_FIB_F_OIF)
dev = nft_out(pkt);
fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev);
nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
{
const struct nft_fib *priv = nft_expr_priv(expr);
int noff = skb_network_offset(pkt->skb);
const struct net_device *found = NULL;
const struct net_device *oif = NULL;
u32 *dest = &regs->data[priv->dreg];
struct ipv6hdr *iph, _iph;
@ -165,7 +165,6 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
.flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
.flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)),
};
struct rt6_info *rt;
int lookup_flags;
@ -203,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
goto put_rt_err;
if (oif && oif != rt->rt6i_idev->dev &&
l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex)
goto put_rt_err;
if (!oif) {
found = rt->rt6i_idev->dev;
} else {
if (oif == rt->rt6i_idev->dev ||
l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex)
found = oif;
}
nft_fib_store_result(dest, priv, rt->rt6i_idev->dev);
nft_fib_store_result(dest, priv, found);
put_rt_err:
ip6_rt_put(rt);
}

View file

@ -31,9 +31,6 @@
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
DEFINE_PER_CPU(bool, nf_skb_duplicated);
EXPORT_SYMBOL_GPL(nf_skb_duplicated);
#ifdef CONFIG_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);

View file

@ -505,6 +505,11 @@ u32 nf_ct_get_id(const struct nf_conn *ct)
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);
/*
 * Adapter taking the generic struct nf_conntrack handle: convert it to the
 * full struct nf_conn and return that entry's id via nf_ct_get_id().
 */
static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct)
{
	const struct nf_conn *ct = nf_ct_to_nf_conn(nfct);

	return nf_ct_get_id(ct);
}
static void
clean_from_lists(struct nf_conn *ct)
{
@ -2710,6 +2715,7 @@ static const struct nf_ct_hook nf_conntrack_hook = {
.attach = nf_conntrack_attach,
.set_closing = nf_conntrack_set_closing,
.confirm = __nf_conntrack_confirm,
.get_id = nf_conntrack_get_id,
};
void nf_conntrack_init_end(void)

View file

@ -15,12 +15,26 @@
#define NF_RECURSION_LIMIT 2
static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
/*
 * Return a pointer to the nf_dup recursion counter for the current context.
 *
 * With CONFIG_PREEMPT_RT the counter lives in the current task's net_xmit
 * state; otherwise it is the per-CPU copy inside softnet_data.xmit.
 */
static u8 *nf_get_nf_dup_skb_recursion(void)
{
#ifdef CONFIG_PREEMPT_RT
	return &current->net_xmit.nf_dup_skb_recursion;
#else
	return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
#endif
}
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
goto err;
if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb_clear_tstamp(skb);
__this_cpu_inc(nf_dup_skb_recursion);
(*nf_dup_skb_recursion)++;
dev_queue_xmit(skb);
__this_cpu_dec(nf_dup_skb_recursion);
(*nf_dup_skb_recursion)--;
return;
err:
kfree_skb(skb);

View file

@ -300,40 +300,75 @@ void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
static int nft_netdev_register_hooks(struct net *net,
struct list_head *hook_list)
{
struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, j;
j = 0;
list_for_each_entry(hook, hook_list, list) {
err = nf_register_net_hook(net, &hook->ops);
if (err < 0)
goto err_register;
list_for_each_entry(ops, &hook->ops_list, list) {
err = nf_register_net_hook(net, ops);
if (err < 0)
goto err_register;
j++;
j++;
}
}
return 0;
err_register:
list_for_each_entry(hook, hook_list, list) {
if (j-- <= 0)
break;
list_for_each_entry(ops, &hook->ops_list, list) {
if (j-- <= 0)
break;
nf_unregister_net_hook(net, &hook->ops);
nf_unregister_net_hook(net, ops);
}
}
return err;
}
static void nft_netdev_hook_free_ops(struct nft_hook *hook)
{
struct nf_hook_ops *ops, *next;
list_for_each_entry_safe(ops, next, &hook->ops_list, list) {
list_del(&ops->list);
kfree(ops);
}
}
/*
 * Free @hook immediately: first release every nf_hook_ops entry on its
 * ops_list, then the nft_hook object itself. No RCU deferral happens here.
 */
static void nft_netdev_hook_free(struct nft_hook *hook)
{
nft_netdev_hook_free_ops(hook);
kfree(hook);
}
/*
 * RCU callback: recover the nft_hook that embeds @rcu and free it together
 * with its ops entries.
 */
static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu)
{
	nft_netdev_hook_free(container_of(rcu, struct nft_hook, rcu));
}
/*
 * Queue @hook for deferred freeing: call_rcu() will invoke
 * __nft_netdev_hook_free_rcu() on the hook's embedded rcu head.
 */
static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
{
call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
}
static void nft_netdev_unregister_hooks(struct net *net,
struct list_head *hook_list,
bool release_netdev)
{
struct nft_hook *hook, *next;
struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
nf_unregister_net_hook(net, &hook->ops);
list_for_each_entry(ops, &hook->ops_list, list)
nf_unregister_net_hook(net, ops);
if (release_netdev) {
list_del(&hook->list);
kfree_rcu(hook, rcu);
nft_netdev_hook_free_rcu(hook);
}
}
}
@ -2253,7 +2288,7 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
list_for_each_entry_safe(hook, next,
&basechain->hook_list, list) {
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
nft_netdev_hook_free_rcu(hook);
}
}
module_put(basechain->type->owner);
@ -2274,19 +2309,20 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
const struct nlattr *attr)
{
struct nf_hook_ops *ops;
struct net_device *dev;
struct nft_hook *hook;
int err;
hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
if (!hook) {
err = -ENOMEM;
goto err_hook_alloc;
}
if (!hook)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&hook->ops_list);
err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
if (err < 0)
goto err_hook_dev;
goto err_hook_free;
hook->ifnamelen = nla_len(attr);
@ -2294,18 +2330,22 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
* indirectly serializing all the other holders of the commit_mutex with
* the rtnl_mutex.
*/
dev = __dev_get_by_name(net, hook->ifname);
if (!dev) {
err = -ENOENT;
goto err_hook_dev;
}
hook->ops.dev = dev;
for_each_netdev(net, dev) {
if (strncmp(dev->name, hook->ifname, hook->ifnamelen))
continue;
ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT);
if (!ops) {
err = -ENOMEM;
goto err_hook_free;
}
ops->dev = dev;
list_add_tail(&ops->list, &hook->ops_list);
}
return hook;
err_hook_dev:
kfree(hook);
err_hook_alloc:
err_hook_free:
nft_netdev_hook_free(hook);
return ERR_PTR(err);
}
@ -2315,7 +2355,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
struct nft_hook *hook;
list_for_each_entry(hook, hook_list, list) {
if (!strcmp(hook->ifname, this->ifname))
if (!strncmp(hook->ifname, this->ifname,
min(hook->ifnamelen, this->ifnamelen)))
return hook;
}
@ -2345,7 +2386,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
}
if (nft_hook_list_find(hook_list, hook)) {
NL_SET_BAD_ATTR(extack, tmp);
kfree(hook);
nft_netdev_hook_free(hook);
err = -EEXIST;
goto err_hook;
}
@ -2363,7 +2404,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
err_hook:
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del(&hook->list);
kfree(hook);
nft_netdev_hook_free(hook);
}
return err;
}
@ -2506,7 +2547,7 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
list_for_each_entry_safe(h, next, &hook->list, list) {
list_del(&h->list);
kfree(h);
nft_netdev_hook_free(h);
}
module_put(hook->type->owner);
}
@ -2559,6 +2600,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
struct nft_chain_hook *hook, u32 flags)
{
struct nft_chain *chain;
struct nf_hook_ops *ops;
struct nft_hook *h;
basechain->type = hook->type;
@ -2567,8 +2609,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
if (nft_base_chain_netdev(family, hook->num)) {
list_splice_init(&hook->list, &basechain->hook_list);
list_for_each_entry(h, &basechain->hook_list, list)
nft_basechain_hook_init(&h->ops, family, hook, chain);
list_for_each_entry(h, &basechain->hook_list, list) {
list_for_each_entry(ops, &h->ops_list, list)
nft_basechain_hook_init(ops, family, hook, chain);
}
}
nft_basechain_hook_init(&basechain->ops, family, hook, chain);
@ -2787,15 +2831,17 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
list_for_each_entry_safe(h, next, &hook.list, list) {
h->ops.pf = basechain->ops.pf;
h->ops.hooknum = basechain->ops.hooknum;
h->ops.priority = basechain->ops.priority;
h->ops.priv = basechain->ops.priv;
h->ops.hook = basechain->ops.hook;
list_for_each_entry(ops, &h->ops_list, list) {
ops->pf = basechain->ops.pf;
ops->hooknum = basechain->ops.hooknum;
ops->priority = basechain->ops.priority;
ops->priv = basechain->ops.priv;
ops->hook = basechain->ops.hook;
}
if (nft_hook_list_find(&basechain->hook_list, h)) {
list_del(&h->list);
kfree(h);
nft_netdev_hook_free(h);
}
}
} else {
@ -2913,10 +2959,12 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
err_hooks:
if (nla[NFTA_CHAIN_HOOK]) {
list_for_each_entry_safe(h, next, &hook.list, list) {
if (unregister)
nf_unregister_net_hook(ctx->net, &h->ops);
if (unregister) {
list_for_each_entry(ops, &h->ops_list, list)
nf_unregister_net_hook(ctx->net, ops);
}
list_del(&h->list);
kfree_rcu(h, rcu);
nft_netdev_hook_free_rcu(h);
}
module_put(hook.type->owner);
}
@ -8785,6 +8833,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
struct netlink_ext_ack *extack, bool add)
{
struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
struct nf_hook_ops *ops;
struct nft_hook *hook;
int hooknum, priority;
int err;
@ -8839,11 +8888,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
}
list_for_each_entry(hook, &flowtable_hook->list, list) {
hook->ops.pf = NFPROTO_NETDEV;
hook->ops.hooknum = flowtable_hook->num;
hook->ops.priority = flowtable_hook->priority;
hook->ops.priv = &flowtable->data;
hook->ops.hook = flowtable->data.type->hook;
list_for_each_entry(ops, &hook->ops_list, list) {
ops->pf = NFPROTO_NETDEV;
ops->hooknum = flowtable_hook->num;
ops->priority = flowtable_hook->priority;
ops->priv = &flowtable->data;
ops->hook = flowtable->data.type->hook;
}
}
return err;
@ -8885,12 +8936,12 @@ nft_flowtable_type_get(struct net *net, u8 family)
}
/* Only called from error and netdev event paths. */
static void nft_unregister_flowtable_hook(struct net *net,
struct nft_flowtable *flowtable,
struct nft_hook *hook)
static void nft_unregister_flowtable_ops(struct net *net,
struct nft_flowtable *flowtable,
struct nf_hook_ops *ops)
{
nf_unregister_net_hook(net, &hook->ops);
flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
nf_unregister_net_hook(net, ops);
flowtable->data.type->setup(&flowtable->data, ops->dev,
FLOW_BLOCK_UNBIND);
}
@ -8900,14 +8951,14 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,
bool release_netdev)
{
struct nft_hook *hook, *next;
struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
nf_unregister_net_hook(net, &hook->ops);
flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
FLOW_BLOCK_UNBIND);
list_for_each_entry(ops, &hook->ops_list, list)
nft_unregister_flowtable_ops(net, flowtable, ops);
if (release_netdev) {
list_del(&hook->list);
kfree_rcu(hook, rcu);
nft_netdev_hook_free_rcu(hook);
}
}
}
@ -8919,6 +8970,26 @@ static void nft_unregister_flowtable_net_hooks(struct net *net,
__nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
}
/*
 * Bind @flowtable to ops->dev and register @ops as a netfilter hook.
 *
 * The flowtable type's setup() callback is invoked with FLOW_BLOCK_BIND
 * first. If the subsequent nf_register_net_hook() fails, the binding is
 * rolled back with FLOW_BLOCK_UNBIND.
 *
 * Returns 0 on success, or the error from whichever step failed.
 */
static int nft_register_flowtable_ops(struct net *net,
				      struct nft_flowtable *flowtable,
				      struct nf_hook_ops *ops)
{
	int ret;

	ret = flowtable->data.type->setup(&flowtable->data,
					  ops->dev, FLOW_BLOCK_BIND);
	if (ret < 0)
		return ret;

	ret = nf_register_net_hook(net, ops);
	if (ret)
		flowtable->data.type->setup(&flowtable->data,
					    ops->dev, FLOW_BLOCK_UNBIND);

	return ret;
}
static int nft_register_flowtable_net_hooks(struct net *net,
struct nft_table *table,
struct list_head *hook_list,
@ -8926,6 +8997,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
{
struct nft_hook *hook, *next;
struct nft_flowtable *ft;
struct nf_hook_ops *ops;
int err, i = 0;
list_for_each_entry(hook, hook_list, list) {
@ -8939,33 +9011,27 @@ static int nft_register_flowtable_net_hooks(struct net *net,
}
}
err = flowtable->data.type->setup(&flowtable->data,
hook->ops.dev,
FLOW_BLOCK_BIND);
if (err < 0)
goto err_unregister_net_hooks;
list_for_each_entry(ops, &hook->ops_list, list) {
err = nft_register_flowtable_ops(net, flowtable, ops);
if (err < 0)
goto err_unregister_net_hooks;
err = nf_register_net_hook(net, &hook->ops);
if (err < 0) {
flowtable->data.type->setup(&flowtable->data,
hook->ops.dev,
FLOW_BLOCK_UNBIND);
goto err_unregister_net_hooks;
i++;
}
i++;
}
return 0;
err_unregister_net_hooks:
list_for_each_entry_safe(hook, next, hook_list, list) {
if (i-- <= 0)
break;
list_for_each_entry(ops, &hook->ops_list, list) {
if (i-- <= 0)
break;
nft_unregister_flowtable_hook(net, flowtable, hook);
nft_unregister_flowtable_ops(net, flowtable, ops);
}
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
nft_netdev_hook_free_rcu(hook);
}
return err;
@ -8977,7 +9043,7 @@ static void nft_hooks_destroy(struct list_head *hook_list)
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
nft_netdev_hook_free_rcu(hook);
}
}
@ -8988,6 +9054,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
struct nft_hook *hook, *next;
struct nf_hook_ops *ops;
struct nft_trans *trans;
bool unregister = false;
u32 flags;
@ -9001,7 +9068,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
if (nft_hook_list_find(&flowtable->hook_list, hook)) {
list_del(&hook->list);
kfree(hook);
nft_netdev_hook_free(hook);
}
}
@ -9045,10 +9112,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
err_flowtable_update_hook:
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
if (unregister)
nft_unregister_flowtable_hook(ctx->net, flowtable, hook);
if (unregister) {
list_for_each_entry(ops, &hook->ops_list, list)
nft_unregister_flowtable_ops(ctx->net,
flowtable, ops);
}
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
nft_netdev_hook_free_rcu(hook);
}
return err;
@ -9194,7 +9264,7 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook
list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {
list_del(&this->list);
kfree(this);
nft_netdev_hook_free(this);
}
}
@ -9557,7 +9627,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
flowtable->data.type->free(&flowtable->data);
list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
nft_netdev_hook_free_rcu(hook);
}
kfree(flowtable->name);
module_put(flowtable->data.type->owner);
@ -9590,46 +9660,190 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
return -EMSGSIZE;
}
static void nft_flowtable_event(unsigned long event, struct net_device *dev,
struct nft_flowtable *flowtable)
/*
 * Look up the nf_hook_ops entry on @hook's ops_list that is bound to @dev.
 *
 * Uses the plain (non-RCU) list iterator.
 * NOTE(review): presumably the caller must hold whatever lock protects
 * ops_list against concurrent modification - confirm against callers.
 *
 * Returns the matching entry, or NULL if @dev has no entry on this hook.
 */
struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
				      const struct net_device *dev)
{
	struct nf_hook_ops *cur;

	list_for_each_entry(cur, &hook->ops_list, list) {
		if (cur->dev != dev)
			continue;

		return cur;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(nft_hook_find_ops);
/*
 * Look up the nf_hook_ops entry on @hook's ops_list that is bound to @dev,
 * walking the list with the RCU list iterator.
 *
 * NOTE(review): presumably meant to be called inside an RCU read-side
 * critical section - confirm against callers.
 *
 * Returns the matching entry, or NULL if @dev has no entry on this hook.
 */
struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
					  const struct net_device *dev)
{
	struct nf_hook_ops *cur;

	list_for_each_entry_rcu(cur, &hook->ops_list, list) {
		if (cur->dev != dev)
			continue;

		return cur;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu);
/*
 * Send a device notification to the NFNLGRP_NFT_DEV netlink group.
 *
 * @table: table owning the chain or flowtable
 * @attr:  attribute type used to carry @name (the callers in this file
 *         pass NFTA_DEVICE_CHAIN or NFTA_DEVICE_FLOWTABLE)
 * @name:  name of the chain or flowtable hooking into the device
 * @hook:  hook whose ifname spec is reported as NFTA_DEVICE_SPEC
 * @dev:   device the event refers to; also determines the netns
 * @event: NETDEV_REGISTER yields NFT_MSG_NEWDEV, anything else
 *         yields NFT_MSG_DELDEV
 *
 * Nothing is built when the group has no listeners. If the message cannot
 * be allocated or filled in, -ENOBUFS is signalled to the group instead.
 */
static void
nf_tables_device_notify(const struct nft_table *table, int attr,
			const char *name, const struct nft_hook *hook,
			const struct net_device *dev, int event)
{
	struct net *net = dev_net(dev);
	struct nlmsghdr *nlh;
	struct sk_buff *skb;
	int msg_type;

	if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV))
		return;

	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!skb)
		goto err_report;

	if (event == NETDEV_REGISTER)
		msg_type = NFT_MSG_NEWDEV;
	else
		msg_type = NFT_MSG_DELDEV;
	msg_type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, msg_type);

	nlh = nfnl_msg_put(skb, 0, 0, msg_type, 0, table->family,
			   NFNETLINK_V0, nft_base_seq(net));
	if (!nlh)
		goto err_free;

	/* Attribute order: table, chain/flowtable name, hook spec, device. */
	if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name))
		goto err_free;
	if (nla_put_string(skb, attr, name))
		goto err_free;
	if (nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname))
		goto err_free;
	if (nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
		goto err_free;

	nlmsg_end(skb, nlh);
	nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV,
		       nlmsg_report(nlh), GFP_KERNEL);
	return;

err_free:
	kfree_skb(skb);
err_report:
	nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS);
}
/*
 * Report a device event for a netdev base chain: forwards to
 * nf_tables_device_notify() with the chain's table and name, tagging the
 * name as NFTA_DEVICE_CHAIN.
 */
void
nf_tables_chain_device_notify(const struct nft_chain *chain,
const struct nft_hook *hook,
const struct net_device *dev, int event)
{
nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN,
chain->name, hook, dev, event);
}
/*
 * Report a device event for a flowtable: forwards to
 * nf_tables_device_notify() with the flowtable's table and name, tagging
 * the name as NFTA_DEVICE_FLOWTABLE.
 */
static void
nf_tables_flowtable_device_notify(const struct nft_flowtable *ft,
const struct nft_hook *hook,
const struct net_device *dev, int event)
{
nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE,
ft->name, hook, dev, event);
}
static int nft_flowtable_event(unsigned long event, struct net_device *dev,
struct nft_flowtable *flowtable, bool changename)
{
struct nf_hook_ops *ops;
struct nft_hook *hook;
bool match;
list_for_each_entry(hook, &flowtable->hook_list, list) {
if (hook->ops.dev != dev)
continue;
ops = nft_hook_find_ops(hook, dev);
match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
/* flow_offload_netdev_event() cleans up entries for us. */
nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook);
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
switch (event) {
case NETDEV_UNREGISTER:
/* NOP if not found or new name still matching */
if (!ops || (changename && match))
continue;
/* flow_offload_netdev_event() cleans up entries for us. */
nft_unregister_flowtable_ops(dev_net(dev),
flowtable, ops);
list_del_rcu(&ops->list);
kfree_rcu(ops, rcu);
break;
case NETDEV_REGISTER:
/* NOP if not matching or already registered */
if (!match || (changename && ops))
continue;
ops = kzalloc(sizeof(struct nf_hook_ops),
GFP_KERNEL_ACCOUNT);
if (!ops)
return 1;
ops->pf = NFPROTO_NETDEV;
ops->hooknum = flowtable->hooknum;
ops->priority = flowtable->data.priority;
ops->priv = &flowtable->data;
ops->hook = flowtable->data.type->hook;
ops->dev = dev;
if (nft_register_flowtable_ops(dev_net(dev),
flowtable, ops)) {
kfree(ops);
return 1;
}
list_add_tail_rcu(&ops->list, &hook->ops_list);
break;
}
nf_tables_flowtable_device_notify(flowtable, hook, dev, event);
break;
}
return 0;
}
/*
 * Apply a netdev event to every flowtable of every table in @dev's netns.
 *
 * @event:      event passed through to nft_flowtable_event()
 * @dev:        device the event refers to
 * @changename: passed through unchanged to nft_flowtable_event()
 *
 * Stops at the first flowtable whose handler reports failure.
 * Returns 1 in that case, 0 if all flowtables were processed.
 */
static int __nf_tables_flowtable_event(unsigned long event,
				       struct net_device *dev,
				       bool changename)
{
	struct net *net = dev_net(dev);
	struct nftables_pernet *nft_net = nft_pernet(net);
	struct nft_flowtable *ft;
	struct nft_table *table;
	int failed;

	list_for_each_entry(table, &nft_net->tables, list) {
		list_for_each_entry(ft, &table->flowtables, list) {
			failed = nft_flowtable_event(event, dev, ft,
						     changename);
			if (failed)
				return 1;
		}
	}

	return 0;
}
/*
 * Netdevice notifier callback for flowtables.
 *
 * Only NETDEV_REGISTER, NETDEV_UNREGISTER and NETDEV_CHANGENAME are handled;
 * every other event returns NOTIFY_DONE without taking any lock.
 *
 * A rename is processed as a REGISTER pass followed by an UNREGISTER pass,
 * both with changename set. If the REGISTER pass fails, the UNREGISTER pass
 * is skipped and NOTIFY_BAD is returned.
 *
 * All work is done with the per-netns commit_mutex held; the mutex is
 * released exactly once on every path that acquired it.
 */
static int nf_tables_flowtable_event(struct notifier_block *this,
				     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct nftables_pernet *nft_net;
	int ret = NOTIFY_DONE;
	struct net *net;

	if (event != NETDEV_REGISTER &&
	    event != NETDEV_UNREGISTER &&
	    event != NETDEV_CHANGENAME)
		return NOTIFY_DONE;

	net = dev_net(dev);
	nft_net = nft_pernet(net);
	mutex_lock(&nft_net->commit_mutex);

	if (event == NETDEV_CHANGENAME) {
		if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) {
			ret = NOTIFY_BAD;
			goto out_unlock;
		}
		__nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true);
	} else if (__nf_tables_flowtable_event(event, dev, false)) {
		ret = NOTIFY_BAD;
	}
out_unlock:
	mutex_unlock(&nft_net->commit_mutex);
	return ret;
}
static struct notifier_block nf_tables_flowtable_notifier = {

View file

@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
bool nft_chain_offload_support(const struct nft_base_chain *basechain)
{
struct nf_hook_ops *ops;
struct net_device *dev;
struct nft_hook *hook;
@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain)
return false;
list_for_each_entry(hook, &basechain->hook_list, list) {
if (hook->ops.pf != NFPROTO_NETDEV ||
hook->ops.hooknum != NF_NETDEV_INGRESS)
return false;
list_for_each_entry(ops, &hook->ops_list, list) {
if (ops->pf != NFPROTO_NETDEV ||
ops->hooknum != NF_NETDEV_INGRESS)
return false;
dev = hook->ops.dev;
if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists())
return false;
dev = ops->dev;
if (!dev->netdev_ops->ndo_setup_tc &&
!flow_indr_dev_exists())
return false;
}
}
return true;
@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain,
const struct net_device *this_dev,
enum flow_block_command cmd)
{
struct net_device *dev;
struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, i = 0;
list_for_each_entry(hook, &basechain->hook_list, list) {
dev = hook->ops.dev;
if (this_dev && this_dev != dev)
continue;
list_for_each_entry(ops, &hook->ops_list, list) {
if (this_dev && this_dev != ops->dev)
continue;
err = nft_chain_offload_cmd(basechain, dev, cmd);
if (err < 0 && cmd == FLOW_BLOCK_BIND) {
if (!this_dev)
goto err_flow_block;
err = nft_chain_offload_cmd(basechain, ops->dev, cmd);
if (err < 0 && cmd == FLOW_BLOCK_BIND) {
if (!this_dev)
goto err_flow_block;
return err;
return err;
}
i++;
}
i++;
}
return 0;
err_flow_block:
list_for_each_entry(hook, &basechain->hook_list, list) {
if (i-- <= 0)
break;
list_for_each_entry(ops, &hook->ops_list, list) {
if (i-- <= 0)
break;
dev = hook->ops.dev;
nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND);
nft_chain_offload_cmd(basechain, ops->dev,
FLOW_BLOCK_UNBIND);
}
}
return err;
}
@ -638,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n
found = NULL;
basechain = nft_base_chain(chain);
list_for_each_entry(hook, &basechain->hook_list, list) {
if (hook->ops.dev != dev)
if (!nft_hook_find_ops(hook, dev))
continue;
found = hook;

View file

@ -15,6 +15,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
@ -90,6 +91,49 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb,
return 0;
}
/*
 * Add conntrack attributes for @skb to the trace message @nlskb.
 *
 * Emits nothing when no conntrack hook is registered, or when the packet has
 * no conntrack entry and is not marked untracked. Otherwise the ct state is
 * always emitted; direction, id and (if non-zero) status follow only when a
 * conntrack entry is attached.
 *
 * Returns 0 on success (including "nothing to add"), -1 if an attribute
 * could not be added.
 */
static int nf_trace_fill_ct_info(struct sk_buff *nlskb,
				 const struct sk_buff *skb)
{
	const struct nf_ct_hook *hooks = rcu_dereference(nf_ct_hook);
	enum ip_conntrack_info ctinfo;
	const struct nf_conn *ct;
	u32 state, id, status;
	u8 dir;

	if (!hooks)
		return 0;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct)
		state = NF_CT_STATE_BIT(ctinfo);
	else if (ctinfo == IP_CT_UNTRACKED)
		state = NF_CT_STATE_UNTRACKED_BIT;
	else
		return 0;	/* not seen by conntrack or invalid */

	if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state)))
		return -1;

	/* Untracked packets carry only the state attribute. */
	if (!ct)
		return 0;

	id = hooks->get_id(&ct->ct_general);
	status = READ_ONCE(ct->status);
	dir = CTINFO2DIR(ctinfo);

	if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir))
		return -1;

	if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id))
		return -1;

	if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status)))
		return -1;

	return 0;
}
static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
const struct nft_pktinfo *pkt)
{
@ -210,7 +254,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
nla_total_size(sizeof(__be32)) + /* trace type */
nla_total_size(0) + /* VERDICT, nested */
nla_total_size(sizeof(u32)) + /* verdict code */
nla_total_size(sizeof(u32)) + /* id */
nla_total_size(sizeof(u32)) + /* ct id */
nla_total_size(sizeof(u8)) + /* ct direction */
nla_total_size(sizeof(u32)) + /* ct state */
nla_total_size(sizeof(u32)) + /* ct status */
nla_total_size(sizeof(u32)) + /* trace id */
nla_total_size(NFT_TRACETYPE_LL_HSIZE) +
nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +
nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) +
@ -291,6 +339,10 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
if (nf_trace_fill_pkt_info(skb, pkt))
goto nla_put_failure;
if (nf_trace_fill_ct_info(skb, pkt->skb))
goto nla_put_failure;
info->packet_dumped = true;
}

View file

@ -86,6 +86,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES,
[NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT,
[NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES,
[NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES,
};
static struct nfnl_net *nfnl_pernet(struct net *net)

View file

@ -318,38 +318,68 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
},
};
static void nft_netdev_event(unsigned long event, struct net_device *dev,
struct nft_base_chain *basechain)
static int nft_netdev_event(unsigned long event, struct net_device *dev,
struct nft_base_chain *basechain, bool changename)
{
struct nft_table *table = basechain->chain.table;
struct nf_hook_ops *ops;
struct nft_hook *hook;
bool match;
list_for_each_entry(hook, &basechain->hook_list, list) {
if (hook->ops.dev != dev)
continue;
ops = nft_hook_find_ops(hook, dev);
match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT))
nf_unregister_net_hook(dev_net(dev), &hook->ops);
switch (event) {
case NETDEV_UNREGISTER:
/* NOP if not found or new name still matching */
if (!ops || (changename && match))
continue;
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
if (!(table->flags & NFT_TABLE_F_DORMANT))
nf_unregister_net_hook(dev_net(dev), ops);
list_del_rcu(&ops->list);
kfree_rcu(ops, rcu);
break;
case NETDEV_REGISTER:
/* NOP if not matching or already registered */
if (!match || (changename && ops))
continue;
ops = kmemdup(&basechain->ops,
sizeof(struct nf_hook_ops),
GFP_KERNEL_ACCOUNT);
if (!ops)
return 1;
ops->dev = dev;
if (!(table->flags & NFT_TABLE_F_DORMANT) &&
nf_register_net_hook(dev_net(dev), ops)) {
kfree(ops);
return 1;
}
list_add_tail_rcu(&ops->list, &hook->ops_list);
break;
}
nf_tables_chain_device_notify(&basechain->chain,
hook, dev, event);
break;
}
return 0;
}
static int nf_tables_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
static int __nf_tables_netdev_event(unsigned long event,
struct net_device *dev,
bool changename)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct nft_base_chain *basechain;
struct nftables_pernet *nft_net;
struct nft_chain *chain;
struct nft_table *table;
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
nft_net = nft_pernet(dev_net(dev));
mutex_lock(&nft_net->commit_mutex);
list_for_each_entry(table, &nft_net->tables, list) {
if (table->family != NFPROTO_NETDEV &&
table->family != NFPROTO_INET)
@ -364,12 +394,40 @@ static int nf_tables_netdev_event(struct notifier_block *this,
basechain->ops.hooknum != NF_INET_INGRESS)
continue;
nft_netdev_event(event, dev, basechain);
if (nft_netdev_event(event, dev, basechain, changename))
return 1;
}
}
mutex_unlock(&nft_net->commit_mutex);
return 0;
}
return NOTIFY_DONE;
/*
 * Netdevice notifier callback for netdev/ingress base chains.
 *
 * Register, unregister and rename events are dispatched to
 * __nf_tables_netdev_event() under the per-netns commit_mutex; anything else
 * is ignored. A rename runs a REGISTER pass and, only if that succeeded, an
 * UNREGISTER pass, both with changename set.
 *
 * Returns NOTIFY_BAD when a pass reported failure, NOTIFY_DONE otherwise.
 */
static int nf_tables_netdev_event(struct notifier_block *this,
				  unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct nftables_pernet *nft_net;
	int ret = NOTIFY_DONE;

	switch (event) {
	case NETDEV_REGISTER:
	case NETDEV_UNREGISTER:
	case NETDEV_CHANGENAME:
		break;
	default:
		return NOTIFY_DONE;
	}

	nft_net = nft_pernet(dev_net(dev));
	mutex_lock(&nft_net->commit_mutex);

	if (event != NETDEV_CHANGENAME) {
		if (__nf_tables_netdev_event(event, dev, false))
			ret = NOTIFY_BAD;
	} else if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) {
		ret = NOTIFY_BAD;
	} else {
		__nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true);
	}

	mutex_unlock(&nft_net->commit_mutex);
	return ret;
}
static struct notifier_block nf_tables_netdev_notifier = {

View file

@ -175,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev,
bool found = false;
list_for_each_entry_rcu(hook, &ft->hook_list, list) {
if (hook->ops.dev != dev)
if (!nft_hook_find_ops_rcu(hook, dev))
continue;
found = true;

View file

@ -23,7 +23,14 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx);
/*
 * Tunnel context bundled with the local lock that is taken around accesses
 * to it (see the local_lock_nested_bh() users of nft_pcpu_tun_ctx).
 */
struct nft_inner_tun_ctx_locked {
struct nft_inner_tun_ctx ctx;
local_lock_t bh_lock;
};
/* One context + lock instance per CPU. */
static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
/* Same layout as nft_expr but it embeds the private expression data area. */
struct __nft_expr {
@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
struct nft_inner_tun_ctx *this_cpu_tun_ctx;
local_bh_disable();
this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
local_bh_enable();
local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
return false;
}
*tun_ctx = *this_cpu_tun_ctx;
local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
local_bh_enable();
return true;
@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
struct nft_inner_tun_ctx *this_cpu_tun_ctx;
local_bh_disable();
this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)
*this_cpu_tun_ctx = *tun_ctx;
local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
local_bh_enable();
}

View file

@ -621,10 +621,10 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
struct geneve_opt *opt;
int offset = 0;
inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
if (!inner)
goto failure;
while (opts->len > offset) {
inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
if (!inner)
goto failure;
opt = (struct geneve_opt *)(opts->u.data + offset);
if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
opt->opt_class) ||
@ -634,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
opt->length * 4, opt->opt_data))
goto inner_failure;
offset += sizeof(*opt) + opt->length * 4;
nla_nest_end(skb, inner);
}
nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
return 0;

View file

@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));
}
#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
.me = THIS_MODULE,
},
#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "TCPOPTSTRIP",
.family = NFPROTO_IPV6,

View file

@ -48,7 +48,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_mark_tginfo2),
.me = THIS_MODULE,
},
#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES)
#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP)
{
.name = "MARK",
.revision = 2,

View file

@ -24,6 +24,7 @@ TEST_PROGS += nft_concat_range.sh
TEST_PROGS += nft_conntrack_helper.sh
TEST_PROGS += nft_fib.sh
TEST_PROGS += nft_flowtable.sh
TEST_PROGS += nft_interface_stress.sh
TEST_PROGS += nft_meta.sh
TEST_PROGS += nft_nat.sh
TEST_PROGS += nft_nat_zones.sh

View file

@ -32,7 +32,6 @@ source lib.sh
IP0=172.30.30.1
IP1=172.30.30.2
DUMMYNET=10.9.9
PFXL=30
ret=0
@ -52,8 +51,6 @@ trap cleanup EXIT
setup_ns ns0 ns1
ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.forwarding=1
if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then
echo "SKIP: Could not add veth device"
exit $ksft_skip
@ -64,18 +61,13 @@ if ! ip -net "$ns0" li add tvrf type vrf table 9876; then
exit $ksft_skip
fi
ip -net "$ns0" link add dummy0 type dummy
ip -net "$ns0" li set veth0 master tvrf
ip -net "$ns0" li set dummy0 master tvrf
ip -net "$ns0" li set tvrf up
ip -net "$ns0" li set veth0 up
ip -net "$ns0" li set dummy0 up
ip -net "$ns1" li set veth0 up
ip -net "$ns0" addr add $IP0/$PFXL dev veth0
ip -net "$ns1" addr add $IP1/$PFXL dev veth0
ip -net "$ns0" addr add $DUMMYNET.1/$PFXL dev dummy0
listener_ready()
{
@ -216,35 +208,9 @@ EOF
fi
}
test_fib()
{
ip netns exec "$ns0" nft -f - <<EOF
flush ruleset
table ip t {
counter fibcount { }
chain prerouting {
type filter hook prerouting priority 0;
meta iifname veth0 ip daddr $DUMMYNET.2 fib daddr oif dummy0 counter name fibcount notrack
}
}
EOF
ip -net "$ns1" route add 10.9.9.0/24 via "$IP0" dev veth0
ip netns exec "$ns1" ping -q -w 1 -c 1 "$DUMMYNET".2 > /dev/null
if ip netns exec "$ns0" nft list counter t fibcount | grep -q "packets 1"; then
echo "PASS: fib lookup returned exepected output interface"
else
echo "FAIL: fib lookup did not return exepected output interface"
ret=1
return
fi
}
test_ct_zone_in
test_masquerade_vrf "default"
test_masquerade_vrf "pfifo"
test_masquerade_veth
test_fib
exit $ret

View file

@ -15,10 +15,12 @@ source lib.sh
# Available test groups:
# - reported_issues: check for issues that were reported in the past
# - correctness: check that packets match given entries, and only those
# - correctness_large: same but with additional non-matching entries
# - concurrency: attempt races between insertion, deletion and lookup
# - timeout: check that packets match entries until they expire
# - performance: estimate matching rate, compare with rbtree and hash baselines
TESTS="reported_issues correctness concurrency timeout"
TESTS="reported_issues correctness correctness_large concurrency timeout"
[ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}"
# Set types, defined by TYPE_ variables below
@ -1257,9 +1259,7 @@ send_nomatch() {
# - add ranged element, check that packets match it
# - check that packets outside range don't match it
# - remove some elements, check that packets don't match anymore
test_correctness() {
setup veth send_"${proto}" set || return ${ksft_skip}
test_correctness_main() {
range_size=1
for i in $(seq "${start}" $((start + count))); do
end=$((start + range_size))
@ -1293,6 +1293,163 @@ test_correctness() {
done
}
# Plain correctness run: call setup in veth mode for the current protocol,
# then execute the shared correctness checks.
# Returns ${ksft_skip} when setup fails.
test_correctness() {
	if ! setup veth send_"${proto}" set; then
		return ${ksft_skip}
	fi

	test_correctness_main
}
# Repeat the correctness tests, but add extra non-matching entries.
# This exercises the more compact '4 bit group' representation that
# gets picked when the default 8-bit representation exceeds
# NFT_PIPAPO_LT_SIZE_HIGH bytes of memory.
# See usage of NFT_PIPAPO_LT_SIZE_HIGH in pipapo_lt_bits_adjust().
#
# The format() helper is way too slow when generating lots of
# entries so it's not used here.
# Correctness run with a large number of filler elements.
#
# After setup, a single "add element" command containing dcount generated
# dummy entries (matching the current $type_spec) is piped into nft, then the
# shared correctness checks are run on top of that.
#
# Returns ${ksft_skip} when setup fails, 1 when the filler elements cannot be
# added (including an unhandled $type_spec).
test_correctness_large() {
	setup veth send_"${proto}" set || return ${ksft_skip}

	# number of dummy (filler) entries to add.
	local dcount=16385

	(
	echo -n "add element inet filter test { "

	case "$type_spec" in
	"ether_addr . ipv4_addr")
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			format_mac $((1000000 + i))
			printf ". 172.%i.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256))
		done
		;;
	"inet_proto . ipv6_addr")
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "%i . " $((RANDOM%256))
			format_addr6 $((1000000 + i))
		done
		;;
	"inet_service . inet_proto")
		# smaller key sizes, need more entries to hit the
		# 4-bit threshold.
		dcount=65536
		for i in $(seq 1 $dcount); do
			local proto=$((RANDOM%256))

			# Test uses UDP to match, as it also fails when matching
			# an entry that doesn't exist, so skip 'udp' entries
			# to not trigger a wrong failure.
			[ $proto -eq 17 ] && proto=18
			[ $i -gt 1 ] && echo ", "
			printf "%i . %i " $(((i%65534) + 1)) $((proto))
		done
		;;
	"inet_service . ipv4_addr")
		dcount=32768
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "%i . 172.%i.%i.%i " $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((i%256))
		done
		;;
	"ipv4_addr . ether_addr")
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "172.%i.%i.%i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256))
			format_mac $((1000000 + i))
		done
		;;
	"ipv4_addr . inet_service")
		dcount=32768
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "172.%i.%i.%i . %i" $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1))
		done
		;;
	"ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr")
		dcount=65536
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "172.%i.%i.%i . %i . " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1))
			format_mac $((1000000 + i))
			printf ". %i . 192.168.%i.%i" $((RANDOM%256)) $((RANDOM%256)) $((i%256))
		done
		;;
	"ipv4_addr . inet_service . inet_proto")
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "172.%i.%i.%i . %i . %i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256))
		done
		;;
	"ipv4_addr . inet_service . inet_proto . ipv4_addr")
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "172.%i.%i.%i . %i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256))
		done
		;;
	"ipv4_addr . inet_service . ipv4_addr")
		dcount=32768
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			printf "172.%i.%i.%i . %i . 192.168.%i.%i " $((RANDOM%256)) $((RANDOM%256)) $((i%256)) $(((RANDOM%65534) + 1)) $((RANDOM%256)) $((RANDOM%256))
		done
		;;
	"ipv6_addr . ether_addr")
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			format_addr6 $((i + 1000000))
			echo -n " . "
			format_mac $((1000000 + i))
		done
		;;
	"ipv6_addr . inet_service")
		dcount=32768
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			format_addr6 $((i + 1000000))
			echo -n " . $(((RANDOM%65534) + 1))"
		done
		;;
	"ipv6_addr . inet_service . ether_addr")
		dcount=32768
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			format_addr6 $((i + 1000000))
			echo -n " . $(((RANDOM%65534) + 1)) . "
			format_mac $((i + 1000000))
		done
		;;
	"ipv6_addr . inet_service . ether_addr . inet_proto")
		dcount=65536
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			format_addr6 $((i + 1000000))
			echo -n " . $(((RANDOM%65534) + 1)) . "
			format_mac $((i + 1000000))
			echo -n " . $((RANDOM%256))"
		done
		;;
	"ipv6_addr . inet_service . ipv6_addr . inet_service")
		dcount=32768
		for i in $(seq 1 $dcount); do
			[ $i -gt 1 ] && echo ", "
			format_addr6 $((i + 1000000))
			echo -n " . $(((RANDOM%65534) + 1)) . "
			format_addr6 $((i + 2123456))
			echo -n " . $((RANDOM%256))"
		done
		;;
	*)
		# stdout is piped into nft, so the diagnostic must go to
		# stderr to be visible. The truncated command then makes
		# nft fail, which triggers the "|| return 1" below.
		echo "Unhandled $type_spec" 1>&2
		return 1
		;;
	esac
	echo -n "}"

	) | nft -f - || return 1

	test_correctness_main
}
# Concurrency test template:
# - add all the elements
# - start a thread for each physical thread that:

View file

@ -3,6 +3,10 @@
# This tests the fib expression.
#
# Kselftest framework requirement - SKIP code is 4.
#
# 10.0.1.99 10.0.1.1 10.0.2.1 10.0.2.99
# dead:1::99 dead:1::1 dead:2::1 dead:2::99
# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2
source lib.sh
@ -72,6 +76,89 @@ table inet filter {
EOF
}
# Load the fib address-type test ruleset into a network namespace.
# $1: name of the network namespace.
# The same "filter" table is loaded once for the ip and once for the ip6
# family. Echo requests seen in prerouting/input/forward jump to type_match_in,
# those seen in output/postrouting jump to type_match_out; both chains only
# count fib address type matches.
load_type_ruleset() {
local netns=$1
for family in ip ip6;do
ip netns exec "$netns" nft -f /dev/stdin <<EOF
table $family filter {
chain type_match_in {
fib daddr type local counter comment "daddr configured on other iface"
fib daddr . iif type local counter comment "daddr configured on iif"
fib daddr type unicast counter comment "daddr not local"
fib daddr . iif type unicast counter comment "daddr not configured on iif"
}
chain type_match_out {
fib daddr type unicast counter
fib daddr . oif type unicast counter
fib daddr type local counter
fib daddr . oif type local counter
}
chain prerouting {
type filter hook prerouting priority 0;
icmp type echo-request counter jump type_match_in
icmpv6 type echo-request counter jump type_match_in
}
chain input {
type filter hook input priority 0;
icmp type echo-request counter jump type_match_in
icmpv6 type echo-request counter jump type_match_in
}
chain forward {
type filter hook forward priority 0;
icmp type echo-request counter jump type_match_in
icmpv6 type echo-request counter jump type_match_in
}
chain output {
type filter hook output priority 0;
icmp type echo-request counter jump type_match_out
icmpv6 type echo-request counter jump type_match_out
}
chain postrouting {
type filter hook postrouting priority 0;
icmp type echo-request counter jump type_match_out
icmpv6 type echo-request counter jump type_match_out
}
}
EOF
done
}
# Flush the ip and ip6 "filter" tables in namespace $1, then load the fib
# type ruleset into it again.
reload_type_ruleset() {
	local fam

	for fam in ip ip6; do
		ip netns exec "$1" nft flush table "$fam" filter
	done

	load_type_ruleset "$1"
}
# Check one fib type counter for a single address family.
# $1: family (ip/ip6)    $2: expected packet count    $3: network namespace
# $4: chain to list      $5: rule text to look for    $6: tag for the error message
# Returns 0 when a listed rule matching $5 shows "packets $2". Otherwise
# prints a diagnostic plus the chain listing, sets ret=1 and returns 1.
check_fib_type_counter_family() {
	local family="$1"
	local want="$2"
	local ns="$3"
	local chain="$4"
	local what="$5"
	local errmsg="$6"

	if ip netns exec "$ns" nft list chain "$family" filter "$chain" | grep "$what" | grep -q "packets $want"; then
		return 0
	fi

	echo "Netns $ns $family fib type counter doesn't match expected packet count of $want for $what $errmsg" 1>&2
	ip netns exec "$ns" nft list chain "$family" filter "$chain"
	ret=1
	return 1
}
# Run check_fib_type_counter_family for ip, then ip6, with the given
# arguments; returns 1 as soon as one family fails.
check_fib_type_counter() {
	local fam

	for fam in "ip" "ip6"; do
		check_fib_type_counter_family "$fam" "$@" || return 1
	done
}
load_ruleset_count() {
local netns=$1
@ -90,6 +177,7 @@ check_drops() {
if dmesg | grep -q ' nft_rpfilter: ';then
dmesg | grep ' nft_rpfilter: '
echo "FAIL: rpfilter did drop packets"
ret=1
return 1
fi
@ -164,17 +252,496 @@ test_ping() {
return 0
}
# Expect that ns1 can NOT ping the given destinations.
# $1: IPv4 destination    $2: IPv6 destination
# Returns 1 (after printing a FAIL line to stderr) as soon as one of them
# answers, 0 when both pings fail.
test_ping_unreachable() {
	local daddr

	for daddr in "$1" "$2"; do
		if ip netns exec "$ns1" ping -c 1 -w 1 -q "$daddr" > /dev/null; then
			echo "FAIL: ${ns1} could reach $daddr" 1>&2
			return 1
		fi
	done

	return 0
}
# Check fib address type classification on the router.
# $1: short notice appended to the PASS/FAIL line.
# Three pings are sent from ns1: to an address on the router's incoming
# interface, to a router address on another interface, and to another host.
# After each ping the type_match_in counters are checked and the ruleset is
# reloaded so the next step starts from fresh counters.
# Sets ret=1 on counter mismatch; returns 1 early if a ping fails.
test_fib_type() {
local notice="$1"
local errmsg="addr-on-if"
local lret=0
if ! load_type_ruleset "$nsrouter";then
echo "SKIP: Could not load fib type ruleset"
# only downgrade to SKIP when no failure was recorded before.
[ $ret -eq 0 ] && ret=$ksft_skip
return
fi
# makes router receive packet for addresses configured on incoming
# interface.
test_ping 10.0.1.1 dead:1::1 || return 1
# expectation: triggers all 'local' in prerouting/input.
check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1
check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type local" "$errmsg" || lret=1
reload_type_ruleset "$nsrouter"
# makes router receive packet for address configured on a different (but local)
# interface.
test_ping 10.0.2.1 dead:2::1 || return 1
# expectation: triggers 'unicast' in prerouting/input for daddr . iif and local for 'daddr'.
errmsg="addr-on-host"
check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type local" "$errmsg" || lret=1
check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1
reload_type_ruleset "$nsrouter"
# destination is another host: both lookups are expected to yield 'unicast'.
test_ping 10.0.2.99 dead:2::99 || return 1
errmsg="addr-on-otherhost"
check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr type unicast" "$errmsg" || lret=1
check_fib_type_counter 2 "$nsrouter" "type_match_in" "fib daddr . iif type unicast" "$errmsg" || lret=1
if [ $lret -eq 0 ];then
echo "PASS: fib expression address types match ($notice)"
else
echo "FAIL: fib expression address types match ($notice)"
ret=1
fi
}
# Create dummy0 and the tvrf VRF device (table 9876) in the router namespace,
# enslave dummy0 to tvrf and bring both up.
# Returns 1 after printing a SKIP line when either device cannot be created.
test_fib_vrf_dev_add_dummy()
{
	ip -net "$nsrouter" link add dummy0 type dummy || {
		echo "SKIP: VRF tests: dummy device type not supported"
		return 1
	}

	ip -net "$nsrouter" link add tvrf type vrf table 9876 || {
		echo "SKIP: VRF tests: vrf device type not supported"
		return 1
	}

	ip -net "$nsrouter" link set dummy0 master tvrf
	ip -net "$nsrouter" link set dummy0 up
	ip -net "$nsrouter" link set tvrf up
}
# Replace the router's ruleset with the one used by the fib VRF tests.
# Returns 1 after printing a SKIP line when the ruleset cannot be loaded;
# in that case ret is set to $ksft_skip unless a failure was already recorded.
load_ruleset_vrf()
{
# Due to the many different possible combinations using named counters
# or one-rule-per-expected-result is complex.
#
# Instead, add dynamic sets for the fib modes
# (fib address type, fib output interface lookup .. ),
# and then add the obtained fib results to them.
#
# The test is successful if the sets contain the expected results
# and no unexpected extra entries existed.
ip netns exec "$nsrouter" nft -f - <<EOF
flush ruleset
table inet t {
set fibif4 {
typeof meta iif . ip daddr . fib daddr oif
flags dynamic
counter
}
set fibif4iif {
typeof meta iif . ip daddr . fib daddr . iif oif
flags dynamic
counter
}
set fibif6 {
typeof meta iif . ip6 daddr . fib daddr oif
flags dynamic
counter
}
set fibif6iif {
typeof meta iif . ip6 daddr . fib daddr . iif oif
flags dynamic
counter
}
set fibtype4 {
typeof meta iif . ip daddr . fib daddr type
flags dynamic
counter
}
set fibtype4iif {
typeof meta iif . ip daddr . fib daddr . iif type
flags dynamic
counter
}
set fibtype6 {
typeof meta iif . ip6 daddr . fib daddr type
flags dynamic
counter
}
set fibtype6iif {
typeof meta iif . ip6 daddr . fib daddr . iif type
flags dynamic
counter
}
chain fib_test {
meta nfproto ipv4 jump {
add @fibif4 { meta iif . ip daddr . fib daddr oif }
add @fibif4iif { meta iif . ip daddr . fib daddr . iif oif }
add @fibtype4 { meta iif . ip daddr . fib daddr type }
add @fibtype4iif { meta iif . ip daddr . fib daddr . iif type }
add @fibif4 { meta iif . ip saddr . fib saddr oif }
add @fibif4iif { meta iif . ip saddr . fib saddr . iif oif }
}
meta nfproto ipv6 jump {
add @fibif6 { meta iif . ip6 daddr . fib daddr oif }
add @fibif6iif { meta iif . ip6 daddr . fib daddr . iif oif }
add @fibtype6 { meta iif . ip6 daddr . fib daddr type }
add @fibtype6iif { meta iif . ip6 daddr . fib daddr . iif type }
add @fibif6 { meta iif . ip6 saddr . fib saddr oif }
add @fibif6iif { meta iif . ip6 saddr . fib saddr . iif oif }
}
}
chain prerouting {
type filter hook prerouting priority 0;
icmp type echo-request counter jump fib_test
# neighbour discovery to be ignored.
icmpv6 type echo-request counter jump fib_test
}
}
EOF
# $? is the exit status of the nft invocation above.
if [ $? -ne 0 ] ;then
echo "SKIP: Could not load ruleset for fib vrf test"
[ $ret -eq 0 ] && ret=$ksft_skip
return 1
fi
}
# Look up one expected element in a dynamic set of table "inet t" on the
# router and remove it afterwards.
# $1: set name   $2: iif name   $3: address   $4: expected fib result
# $5: expected packet count of the element counter (defaults to 1)
# On a missing element or failed deletion: prints a FAIL line and the set
# listing, sets ret=1 and returns 1. Returns 0 otherwise.
check_type()
{
	local setname="$1"
	local iifname="$2"
	local addr="$3"
	local type="$4"
	local count="${5:-1}"
	local failmsg=""

	# The element is deleted once found; this allows to check if anything
	# unexpected appeared at the end of the test run: all dynamic sets
	# should be empty by then.
	if ! ip netns exec "$nsrouter" nft get element inet t "$setname" { "$iifname" . "$addr" . "$type" } |grep -q "counter packets $count"; then
		failmsg="FAIL: did not find $iifname . $addr . $type in $setname"
	elif ! ip netns exec "$nsrouter" nft delete element inet t "$setname" { "$iifname" . "$addr" . "$type" } ; then
		failmsg="FAIL: can't delete $iifname . $addr . $type in $setname"
	fi

	[ -z "$failmsg" ] && return 0

	echo "$failmsg"
	ip netns exec "$nsrouter" nft list set inet t "$setname"
	ret=1
	return 1
}
# check_type wrapper: expect fib result "local" with a packet count of 1.
# Arguments (set name, iif name, address) are forwarded unchanged; "$@" is
# quoted so they are not subject to word splitting or globbing.
check_local()
{
	check_type "$@" "local" 1
}
# check_type wrapper: expect fib result "unicast" with a packet count of 1.
# Arguments (set name, iif name, address) are forwarded unchanged; "$@" is
# quoted so they are not subject to word splitting or globbing.
check_unicast()
{
	check_type "$@" "unicast" 1
}
# check_type wrapper that forwards all of its arguments unchanged; "$@" is
# quoted so they are not subject to word splitting or globbing.
check_rpf()
{
	check_type "$@"
}
# Verify that every dynamic fib set of table "inet t" on the router is empty.
# For each set that still lists elements a FAIL line and the set contents are
# printed and ret is set to 1. Returns 1 if any set was non-empty, else 0.
check_fib_vrf_sets_empty()
{
	local setname=""
	local lret=0

	# A non-empty set means that we have seen unexpected packets OR
	# that a fib lookup provided unexpected results.
	for setname in "fibif4" "fibif4iif" "fibif6" "fibif6iif" \
		       "fibtype4" "fibtype4iif" "fibtype6" "fibtype6iif"; do
		ip netns exec "$nsrouter" nft list set inet t "$setname" | grep -q elements || continue

		echo "FAIL: $setname not empty"
		ip netns exec "$nsrouter" nft list set inet t "$setname"
		ret=1
		lret=1
	done

	return $lret
}
# Verify the recorded fib results for packets arriving on veth0 while veth0
# is not part of the VRF.
# $1: message printed with the PASS/FAIL verdict.
# Every expected element is checked and deleted via check_type and its
# wrappers; at the end all sets must be empty. Sets ret=1 on failure.
check_fib_vrf_type()
{
local msg="$1"
local addr
# the incoming interface is always veth0. As it's not linked to a VRF,
# the 'tvrf' device should NOT show up anywhere.
local ifname="veth0"
local lret=0
# local_veth0, local_veth1
for addr in "10.0.1.1" "10.0.2.1"; do
check_local fibtype4 "$ifname" "$addr" || lret=1
check_type fibif4 "$ifname" "$addr" "0" || lret=1
done
for addr in "dead:1::1" "dead:2::1";do
check_local fibtype6 "$ifname" "$addr" || lret=1
check_type fibif6 "$ifname" "$addr" "0" || lret=1
done
# when restricted to the incoming interface, 10.0.1.1 should
# be 'local', but 10.0.2.1 unicast.
check_local fibtype4iif "$ifname" "10.0.1.1" || lret=1
check_unicast fibtype4iif "$ifname" "10.0.2.1" || lret=1
# same for the ipv6 addresses.
check_local fibtype6iif "$ifname" "dead:1::1" || lret=1
check_unicast fibtype6iif "$ifname" "dead:2::1" || lret=1
# None of these addresses should find a valid route when restricting
# to the incoming interface (we ask for daddr; 10.0.1.1 and 10.0.2.1
# are reachable via 'lo').
for addr in "10.0.1.1" "10.0.2.1" "10.9.9.1" "10.9.9.2";do
check_type fibif4iif "$ifname" "$addr" "0" || lret=1
done
# expect default route (veth1), dummy0 is part of VRF but iif isn't.
for addr in "10.9.9.1" "10.9.9.2";do
check_unicast fibtype4 "$ifname" "$addr" || lret=1
check_unicast fibtype4iif "$ifname" "$addr" || lret=1
check_type fibif4 "$ifname" "$addr" "veth1" || lret=1
done
for addr in "dead:9::1" "dead:9::2";do
check_unicast fibtype6 "$ifname" "$addr" || lret=1
check_unicast fibtype6iif "$ifname" "$addr" || lret=1
check_type fibif6 "$ifname" "$addr" "veth1" || lret=1
done
# same for the IPv6 equivalent addresses.
for addr in "dead:1::1" "dead:2::1" "dead:9::1" "dead:9::2";do
check_type fibif6iif "$ifname" "$addr" "0" || lret=1
done
check_unicast fibtype4 "$ifname" "10.0.2.99" || lret=1
check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1
check_unicast fibtype6 "$ifname" "dead:2::99" || lret=1
check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1
check_type fibif4 "$ifname" "10.0.2.99" "veth1" || lret=1
check_type fibif4iif "$ifname" "10.0.2.99" 0 || lret=1
check_type fibif6 "$ifname" "dead:2::99" "veth1" || lret=1
check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1
# elements keyed on the ns1 address (10.0.1.99 / dead:1::99) come from the
# saddr lookups in the ruleset; expected counter value is 5 here.
check_rpf fibif4 "$ifname" "10.0.1.99" "veth0" 5 || lret=1
check_rpf fibif4iif "$ifname" "10.0.1.99" "veth0" 5 || lret=1
check_rpf fibif6 "$ifname" "dead:1::99" "veth0" 5 || lret=1
check_rpf fibif6iif "$ifname" "dead:1::99" "veth0" 5 || lret=1
check_fib_vrf_sets_empty || lret=1
if [ $lret -eq 0 ];then
echo "PASS: $msg"
else
echo "FAIL: $msg"
ret=1
fi
}
# Verify the recorded fib results for the configuration where veth0 is part
# of the tvrf VRF.
# $1: message printed with the PASS/FAIL verdict.
# Every expected element is checked and deleted via check_type and its
# wrappers; at the end all sets must be empty. Sets ret=1 on failure.
check_fib_veth_vrf_type()
{
	local msg="$1"
	local addr
	local ifname
	local t
	local lret=0

	# as veth0 is now part of tvrf interface, packets will be seen
	# twice, once with iif veth0, then with iif tvrf.

	for ifname in "veth0" "tvrf"; do
		for addr in "10.0.1.1" "10.9.9.1"; do
			check_local fibtype4 "$ifname" "$addr" || lret=1
			# addr local, but nft_fib doesn't return routes with RTN_LOCAL.
			check_type fibif4 "$ifname" "$addr" 0 || lret=1
			check_type fibif4iif "$ifname" "$addr" 0 || lret=1
		done

		for addr in "dead:1::1" "dead:9::1"; do
			check_local fibtype6 "$ifname" "$addr" || lret=1
			# same, address is local but no route is returned for lo.
			check_type fibif6 "$ifname" "$addr" 0 || lret=1
			check_type fibif6iif "$ifname" "$addr" 0 || lret=1
		done

		for t in fibtype4 fibtype4iif; do
			check_unicast "$t" "$ifname" 10.9.9.2 || lret=1
		done
		for t in fibtype6 fibtype6iif; do
			check_unicast "$t" "$ifname" dead:9::2 || lret=1
		done

		check_unicast fibtype4iif "$ifname" "10.9.9.1" || lret=1
		check_unicast fibtype6iif "$ifname" "dead:9::1" || lret=1

		check_unicast fibtype4 "$ifname" "10.0.2.99" || lret=1
		check_unicast fibtype4iif "$ifname" "10.0.2.99" || lret=1
		check_unicast fibtype6 "$ifname" "dead:2::99" || lret=1
		check_unicast fibtype6iif "$ifname" "dead:2::99" || lret=1

		check_type fibif4 "$ifname" "10.0.2.99" "veth1" || lret=1
		check_type fibif6 "$ifname" "dead:2::99" "veth1" || lret=1
		check_type fibif4 "$ifname" "10.9.9.2" "dummy0" || lret=1
		check_type fibif6 "$ifname" "dead:9::2" "dummy0" || lret=1

		# restricted to iif -- MUST NOT provide result, it's != $ifname.
		check_type fibif4iif "$ifname" "10.0.2.99" 0 || lret=1
		check_type fibif6iif "$ifname" "dead:2::99" 0 || lret=1

		check_rpf fibif4 "$ifname" "10.0.1.99" "veth0" 4 || lret=1
		check_rpf fibif6 "$ifname" "dead:1::99" "veth0" 4 || lret=1
		check_rpf fibif4iif "$ifname" "10.0.1.99" "$ifname" 4 || lret=1
		check_rpf fibif6iif "$ifname" "dead:1::99" "$ifname" 4 || lret=1
	done

	check_local fibtype4iif "veth0" "10.0.1.1" || lret=1
	check_local fibtype6iif "veth0" "dead:1::1" || lret=1

	check_unicast fibtype4iif "tvrf" "10.0.1.1" || lret=1
	check_unicast fibtype6iif "tvrf" "dead:1::1" || lret=1

	# 10.9.9.2 should not provide a result for iif veth, but
	# should when iif is tvrf.
	# This is because it's reachable via dummy0 which is part of
	# tvrf. iif veth0 MUST conceal the dummy0 result (i.e. return oif 0).
	check_type fibif4iif "veth0" "10.9.9.2" 0 || lret=1
	check_type fibif6iif "veth0" "dead:9::2" 0 || lret=1

	check_type fibif4iif "tvrf" "10.9.9.2" "tvrf" || lret=1
	check_type fibif6iif "tvrf" "dead:9::2" "tvrf" || lret=1

	check_fib_vrf_sets_empty || lret=1

	if [ $lret -eq 0 ];then
		echo "PASS: $msg"
	else
		echo "FAIL: $msg"
		ret=1
	fi
}
# Extends nsrouter config by adding dummy0+vrf.
#
#  10.0.1.99    10.0.1.1           10.0.2.1       10.0.2.99
# dead:1::99    dead:1::1          dead:2::1      dead:2::99
# ns1 <-------> [ veth0 ] nsrouter [veth1] <-------> ns2
#                         [dummy0]
#                         10.9.9.1
#                         dead:9::1
#                          [tvrf]
# Run the fib checks in two rounds around a vrf device in $nsrouter:
# first with veth0 outside of tvrf, then with veth0 enslaved to tvrf.
#
# Takes no arguments.  If test_fib_vrf_dev_add_dummy fails, the global
# $ret is set to $ksft_skip (only when it was still 0) and the function
# returns early.  It also returns early when load_ruleset_vrf fails.
# The PASS/FAIL verdicts are produced by check_fib_vrf_type and
# check_fib_veth_vrf_type.
test_fib_vrf()
{
# NOTE(review): cntname is not referenced anywhere in this function --
# confirm no callee relies on it before removing.
local cntname=""
if ! test_fib_vrf_dev_add_dummy; then
# do not overwrite an earlier failure with "skip"
[ $ret -eq 0 ] && ret=$ksft_skip
return
fi
# addresses on dummy0, plus default routes pointing at the ns2 addresses
ip -net "$nsrouter" addr add "10.9.9.1"/24 dev dummy0
ip -net "$nsrouter" addr add "dead:9::1"/64 dev dummy0 nodad
ip -net "$nsrouter" route add default via 10.0.2.99
ip -net "$nsrouter" route add default via dead:2::99
load_ruleset_vrf || return
# no echo reply for these addresses: The dummy interface is part of tvrf,
# but veth0 (incoming interface) isn't linked to it.
# These run in the background, concurrently with the pings below; their
# exit status is not inspected here, only "wait"ed for.
test_ping_unreachable "10.9.9.1" "dead:9::1" &
test_ping_unreachable "10.9.9.2" "dead:9::2" &
# expect replies from these.
test_ping "10.0.1.1" "dead:1::1"
test_ping "10.0.2.1" "dead:2::1"
test_ping "10.0.2.99" "dead:2::99"
# let the background pings finish before evaluating the results
wait
check_fib_vrf_type "fib expression address types match (iif not in vrf)"
# second round: this time, make veth0 (rx interface) part of the vrf.
# 10.9.9.1 / dead:9::1 become reachable from ns1, while ns2
# becomes unreachable.
ip -net "$nsrouter" link set veth0 master tvrf
# NOTE(review): presumably the IPv6 address has to be re-added because
# enslaving veth0 removed it -- confirm.
ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
# this reload should not be needed, but in case
# there is some error (missing or unexpected entry) this will prevent them
# from leaking into round 2.
load_ruleset_vrf || return
test_ping "10.0.1.1" "dead:1::1"
test_ping "10.9.9.1" "dead:9::1"
# ns2 should no longer be reachable (veth1 not in vrf)
test_ping_unreachable "10.0.2.99" "dead:2::99" &
# vrf via dummy0, but host doesn't exist
test_ping_unreachable "10.9.9.2" "dead:9::2" &
# again: only wait for the background pings, then evaluate the results
wait
check_fib_veth_vrf_type "fib expression address types match (iif in vrf)"
}
ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
test_ping 10.0.2.1 dead:2::1 || exit 1
check_drops || exit 1
check_drops
test_ping 10.0.2.99 dead:2::99 || exit 1
check_drops || exit 1
check_drops
echo "PASS: fib expression did not cause unwanted packet drops"
[ $ret -eq 0 ] && echo "PASS: fib expression did not cause unwanted packet drops"
load_input_ruleset "$ns1"
test_ping 127.0.0.1 ::1
check_drops
test_ping 10.0.1.99 dead:1::99
check_drops
[ $ret -eq 0 ] && echo "PASS: fib expression did not discard loopback packets"
load_input_ruleset "$ns1"
@ -234,7 +801,7 @@ ip -net "$nsrouter" addr del dead:2::1/64 dev veth0
# ... pbr ruleset for the router, check iif+oif.
if ! load_pbr_ruleset "$nsrouter";then
echo "SKIP: Could not load fib forward ruleset"
exit $ksft_skip
[ "$ret" -eq 0 ] && ret=$ksft_skip
fi
ip -net "$nsrouter" rule add from all table 128
@ -245,11 +812,36 @@ ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1
# drop main ipv4 table
ip -net "$nsrouter" -4 rule delete table main
if ! test_ping 10.0.2.99 dead:2::99;then
ip -net "$nsrouter" nft list ruleset
echo "FAIL: fib mismatch in pbr setup"
exit 1
if test_ping 10.0.2.99 dead:2::99;then
echo "PASS: fib expression forward check with policy based routing"
else
echo "FAIL: fib expression forward check with policy based routing"
ret=1
fi
echo "PASS: fib expression forward check with policy based routing"
exit 0
test_fib_type "policy routing"
ip netns exec "$nsrouter" nft delete table ip filter
ip netns exec "$nsrouter" nft delete table ip6 filter
# Un-do policy routing changes
ip -net "$nsrouter" rule del from all table 128
ip -net "$nsrouter" rule del from all iif veth0 table 129
ip -net "$nsrouter" route del table 128 to 10.0.1.0/24 dev veth0
ip -net "$nsrouter" route del table 129 to 10.0.2.0/24 dev veth1
ip -net "$ns1" -4 route del default
ip -net "$ns1" -6 route del default
ip -net "$ns1" -4 route add default via 10.0.1.1
ip -net "$ns1" -6 route add default via dead:1::1
ip -net "$nsrouter" -4 rule add from all table main priority 32766
test_fib_type "default table"
ip netns exec "$nsrouter" nft delete table ip filter
ip netns exec "$nsrouter" nft delete table ip6 filter
test_fib_vrf
exit $ret

View file

@ -0,0 +1,151 @@
#!/bin/bash -e
#
# SPDX-License-Identifier: GPL-2.0
#
# Torture nftables' netdevice notifier callbacks and related code by frequent
# renaming of interfaces which netdev-family chains and flowtables hook into.
source lib.sh

checktool "nft --version" "run test without nft tool"
checktool "iperf3 --version" "run test without iperf3 tool"

# Seconds to keep the stress running: 80% of the kselftest timeout
# (60 when kselftest_timeout is unset or empty), but never more than 48.
TEST_RUNTIME=$((${kselftest_timeout:-60} * 8 / 10))
if [[ $TEST_RUNTIME -gt 48 ]]; then
	TEST_RUNTIME=48
fi

trap "cleanup_all_ns" EXIT
setup_ns nsc nsr nss

# connect_endpoint NS DEV PEER NET
#
# Create veth DEV in namespace NS with its peer PEER placed in the router
# namespace, give DEV the address NET.1/24, bring it up and install a
# default route via NET.2.
connect_endpoint()
{
	local ns=$1 dev=$2 peer=$3 net=$4

	ip -net $ns link add $dev type veth peer name $peer netns $nsr
	ip -net $ns addr add $net.1/24 dev $dev
	ip -net $ns link set $dev up
	ip -net $ns route add default via $net.2
}

# router_port DEV ADDR
#
# Assign ADDR/24 to DEV inside the router namespace and bring DEV up.
router_port()
{
	local dev=$1 addr=$2

	ip -net $nsr addr add $addr/24 dev $dev
	ip -net $nsr link set $dev up
}

# client (nsc) and server (nss) each get one veth towards the router (nsr)
connect_endpoint $nsc cr0 rc0 10.0.0
connect_endpoint $nss sr0 rs0 10.1.0

router_port rc0 10.0.0.2
router_port rs0 10.1.0.2

# let the router namespace forward IPv4 between its two ports
ip netns exec $nsr sysctl -q net.ipv4.ip_forward=1
ip netns exec $nsr sysctl -q net.ipv4.conf.all.forwarding=1
# Print the nft ruleset for the router namespace on stdout:
#  - table netdev t: one ingress chain with a counter per device name
#    rc0..rc9 and rs0..rs9
#  - table ip t: flowtables ft_0..ft_9, each bound to rcN and rsN, and a
#    forward chain offloading tcp from rcN to rsN into ft_N
emit_ruleset()
{
	local idx dev

	printf '%s\n' "table netdev t {"
	for idx in {0..9}; do
		for dev in rc$idx rs$idx; do
			printf '%s\n' \
				"chain chain_$dev {" \
				"type filter hook ingress device $dev priority 0" \
				"counter" \
				"}"
		done
	done
	printf '%s\n' "}"

	printf '%s\n' "table ip t {"
	for idx in {0..9}; do
		printf '%s\n' \
			"flowtable ft_$idx {" \
			"hook ingress priority 0" \
			"devices = { rc$idx, rs$idx }" \
			"}"
	done
	printf '%s\n' \
		"chain c {" \
		"type filter hook forward priority 0"
	for idx in {0..9}; do
		printf '%s\n' \
			"iifname rc$idx oifname rs$idx ip protocol tcp counter flow add @ft_$idx"
	done
	printf '%s\n' "counter" "}" "}"
}

# Without a loadable ruleset there is nothing to stress: skip the test.
if ! emit_ruleset | ip netns exec $nsr nft -f -; then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi
# Background job: rename the router-side veth devices forever.  Starting
# from rc0/rs0, every iteration renames rc$o -> rc$n and rs$o -> rs$n,
# with the index cycling 0 -> 1 -> ... -> 9 -> 0.
for ((o=0, n=1; ; o=n, n++, n %= 10)); do
ip -net $nsr link set rc$o name rc$n
ip -net $nsr link set rs$o name rs$n
done &
rename_loop_pid=$!
# Background job: dump the ruleset in a tight loop, output discarded.
while true; do ip netns exec $nsr nft list ruleset >/dev/null 2>&1; done &
nft_list_pid=$!
# Background job: keep an nft monitor running, its stdout discarded.
ip netns exec $nsr nft monitor >/dev/null &
nft_monitor_pid=$!
# iperf3 server in the server namespace.
# NOTE(review): "-1" is presumably iperf3's one-off mode (serve a single
# client, then exit) -- confirm against the iperf3 manual.
ip netns exec $nss iperf3 --server --daemon -1
# sed program: from the "[SUM] ... receiver" line print only the number
# that precedes "Kbits/sec".
summary_expr='s,^\[SUM\] .* \([0-9\.]\+\) Kbits/sec .* receiver,\1,p'
# Run the client against 10.1.0.1 for $TEST_RUNTIME seconds while the
# background jobs are active; $rate receives the number extracted by sed
# (empty when no matching line was printed).
rate=$(ip netns exec $nsc iperf3 \
--format k -c 10.1.0.1 --time $TEST_RUNTIME \
--length 56 --parallel 10 -i 0 | sed -n "$summary_expr")
# Stop the three background jobs and wait until they are gone.
kill $nft_list_pid
kill $nft_monitor_pid
kill $rename_loop_pid
wait
# Wildcard flowtable stress: add a flowtable hooking into every device
# matching "wild*", then create and delete many such devices in parallel.
#
# The nft invocation is evaluated directly as the "if" condition: the
# script runs with errexit (#!/bin/bash -e), so running it as a plain
# command and looking at $? afterwards would terminate the script on
# failure and the SKIP branch could never be taken.
if ! ip netns exec $nsr nft -f - <<EOF
table ip t {
flowtable ft_wild {
hook ingress priority 0
devices = { wild* }
}
}
EOF
then
	echo "SKIP wildcard tests: not supported by host's nft?"
else
	# create wild0..wild99, all at once
	for ((i = 0; i < 100; i++)); do
		ip -net $nsr link add wild$i type dummy &
	done
	wait

	# delete them again in parallel, the last 20 first
	for ((i = 80; i < 100; i++)); do
		ip -net $nsr link del wild$i &
	done
	for ((i = 0; i < 80; i++)); do
		ip -net $nsr link del wild$i &
	done
	wait

	# ten parallel workers, each adding and then deleting its own
	# batch of ten devices sequentially
	for ((i = 0; i < 100; i += 10)); do
		(
		for ((j = 0; j < 10; j++)); do
			ip -net $nsr link add wild$((i + j)) type dummy
		done
		for ((j = 0; j < 10; j++)); do
			ip -net $nsr link del wild$((i + j))
		done
		) &
	done
	wait
fi
# Final verdict.  The test fails if
#  - /proc/sys/kernel/tainted does not compare equal to 0,
#  - the throughput extracted from iperf3 is not greater than zero, or
#  - a kmemleak debugfs file exists and is not empty.
if ! [[ $(</proc/sys/kernel/tainted) -eq 0 ]]; then
	echo "FAIL: Kernel is tainted!"
	exit $ksft_fail
fi

if ! [[ $rate -gt 0 ]]; then
	echo "FAIL: Zero throughput in iperf3"
	exit $ksft_fail
fi

if [[ -f /sys/kernel/debug/kmemleak && \
      -n $(</sys/kernel/debug/kmemleak) ]]; then
	echo "FAIL: non-empty kmemleak report"
	exit $ksft_fail
fi

exit $ksft_pass