android_kernel_samsung_msm8976/net/netfilter/nf_conntrack_netlink.c

3153 lines
76 KiB
C
Raw Normal View History

/* Connection tracking via netlink socket. Allows for user space
* protocol helpers and general trouble making from userspace.
*
* (C) 2001 by Jay Schulist <jschlst@samba.org>
* (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
* (C) 2003 by Patrick Mchardy <kaber@trash.net>
* (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org>
*
* Initial connection tracking via netlink development funded and
* generally made possible by Network Robots, Inc. (www.networkrobots.com)
*
* Further development of this code funded by Astaro AG (http://www.astaro.com)
*
* This software may be used and distributed according to the terms
* of the GNU General Public License, incorporated herein by reference.
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rculist_nulls.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/security.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/netlink.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/slab.h>
#include <linux/netfilter.h>
#include <net/netlink.h>
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_labels.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l4proto.h>
#include <net/netfilter/nf_nat_helper.h>
#endif
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
MODULE_LICENSE("GPL");
static char __initdata version[] = "0.93";
static inline int
ctnetlink_dump_tuples_proto(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
struct nlattr *nest_parms;
nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum))
goto nla_put_failure;
if (likely(l4proto->tuple_to_nlattr))
ret = l4proto->tuple_to_nlattr(skb, tuple);
nla_nest_end(skb, nest_parms);
return ret;
nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_tuples_ip(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_l3proto *l3proto)
{
int ret = 0;
struct nlattr *nest_parms;
nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (likely(l3proto->tuple_to_nlattr))
ret = l3proto->tuple_to_nlattr(skb, tuple);
nla_nest_end(skb, nest_parms);
return ret;
nla_put_failure:
return -1;
}
[NETFILTER]: Kill some supper dupper bloatry /me awards the bloatiest-of-all-net/-.c-code award to nf_conntrack_netlink.c, congratulations to all the authors :-/! Hall of (unquestionable) fame (measured per inline, top 10 under net/): -4496 ctnetlink_parse_tuple netfilter/nf_conntrack_netlink.c -2165 ctnetlink_dump_tuples netfilter/nf_conntrack_netlink.c -2115 __ip_vs_get_out_rt ipv4/ipvs/ip_vs_xmit.c -1924 xfrm_audit_helper_pktinfo xfrm/xfrm_state.c -1799 ctnetlink_parse_tuple_proto netfilter/nf_conntrack_netlink.c -1268 ctnetlink_parse_tuple_ip netfilter/nf_conntrack_netlink.c -1093 ctnetlink_exp_dump_expect netfilter/nf_conntrack_netlink.c -1060 void ccid3_update_send_interval dccp/ccids/ccid3.c -983 ctnetlink_dump_tuples_proto netfilter/nf_conntrack_netlink.c -827 ctnetlink_exp_dump_tuple netfilter/nf_conntrack_netlink.c (i386 / gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13) / allyesconfig except CONFIG_FORCED_INLINING) ...and I left < 200 byte gains as future work item. After iterative inline removal, I finally have this: net/netfilter/nf_conntrack_netlink.c: ctnetlink_exp_fill_info | -1104 ctnetlink_new_expect | -1572 ctnetlink_fill_info | -1303 ctnetlink_new_conntrack | -2230 ctnetlink_get_expect | -341 ctnetlink_del_expect | -352 ctnetlink_expect_event | -1110 ctnetlink_conntrack_event | -1548 ctnetlink_del_conntrack | -729 ctnetlink_get_conntrack | -728 10 functions changed, 11017 bytes removed, diff: -11017 net/netfilter/nf_conntrack_netlink.c: ctnetlink_parse_tuple | +419 dump_nat_seq_adj | +183 ctnetlink_dump_counters | +166 ctnetlink_dump_tuples | +261 ctnetlink_exp_dump_expect | +633 ctnetlink_change_status | +460 6 functions changed, 2122 bytes added, diff: +2122 net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11017 bytes removed, diff: -8895 Without a number of CONFIG.*DEBUGs, I got this: net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11029 bytes removed, diff: -8907 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-06 07:11:31 +00:00
static int
ctnetlink_dump_tuples(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
int ret;
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
rcu_read_lock();
l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
if (ret >= 0) {
l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
tuple->dst.protonum);
ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
}
rcu_read_unlock();
return ret;
}
static inline int
ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
{
long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
if (timeout < 0)
timeout = 0;
if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
{
struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
int ret;
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
if (!l4proto->to_nlattr)
return 0;
nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED);
if (!nest_proto)
goto nla_put_failure;
ret = l4proto->to_nlattr(skb, nest_proto, ct);
nla_nest_end(skb, nest_proto);
return ret;
nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_helper;
const struct nf_conn_help *help = nfct_help(ct);
struct nf_conntrack_helper *helper;
if (!help)
return 0;
helper = rcu_dereference(help->helper);
if (!helper)
goto out;
nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED);
if (!nest_helper)
goto nla_put_failure;
if (nla_put_string(skb, CTA_HELP_NAME, helper->name))
goto nla_put_failure;
if (helper->to_nlattr)
helper->to_nlattr(skb, ct);
nla_nest_end(skb, nest_helper);
out:
return 0;
nla_put_failure:
return -1;
}
[NETFILTER]: Kill some supper dupper bloatry /me awards the bloatiest-of-all-net/-.c-code award to nf_conntrack_netlink.c, congratulations to all the authors :-/! Hall of (unquestionable) fame (measured per inline, top 10 under net/): -4496 ctnetlink_parse_tuple netfilter/nf_conntrack_netlink.c -2165 ctnetlink_dump_tuples netfilter/nf_conntrack_netlink.c -2115 __ip_vs_get_out_rt ipv4/ipvs/ip_vs_xmit.c -1924 xfrm_audit_helper_pktinfo xfrm/xfrm_state.c -1799 ctnetlink_parse_tuple_proto netfilter/nf_conntrack_netlink.c -1268 ctnetlink_parse_tuple_ip netfilter/nf_conntrack_netlink.c -1093 ctnetlink_exp_dump_expect netfilter/nf_conntrack_netlink.c -1060 void ccid3_update_send_interval dccp/ccids/ccid3.c -983 ctnetlink_dump_tuples_proto netfilter/nf_conntrack_netlink.c -827 ctnetlink_exp_dump_tuple netfilter/nf_conntrack_netlink.c (i386 / gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13) / allyesconfig except CONFIG_FORCED_INLINING) ...and I left < 200 byte gains as future work item. After iterative inline removal, I finally have this: net/netfilter/nf_conntrack_netlink.c: ctnetlink_exp_fill_info | -1104 ctnetlink_new_expect | -1572 ctnetlink_fill_info | -1303 ctnetlink_new_conntrack | -2230 ctnetlink_get_expect | -341 ctnetlink_del_expect | -352 ctnetlink_expect_event | -1110 ctnetlink_conntrack_event | -1548 ctnetlink_del_conntrack | -729 ctnetlink_get_conntrack | -728 10 functions changed, 11017 bytes removed, diff: -11017 net/netfilter/nf_conntrack_netlink.c: ctnetlink_parse_tuple | +419 dump_nat_seq_adj | +183 ctnetlink_dump_counters | +166 ctnetlink_dump_tuples | +261 ctnetlink_exp_dump_expect | +633 ctnetlink_change_status | +460 6 functions changed, 2122 bytes added, diff: +2122 net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11017 bytes removed, diff: -8895 Without a number of CONFIG.*DEBUGs, I got this: net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11029 bytes removed, diff: -8907 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-06 07:11:31 +00:00
static int
dump_counters(struct sk_buff *skb, u64 pkts, u64 bytes,
enum ip_conntrack_dir dir)
{
enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
struct nlattr *nest_count;
nest_count = nla_nest_start(skb, type | NLA_F_NESTED);
if (!nest_count)
goto nla_put_failure;
if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) ||
nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)))
goto nla_put_failure;
nla_nest_end(skb, nest_count);
return 0;
nla_put_failure:
return -1;
}
static int
ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
enum ip_conntrack_dir dir, int type)
{
struct nf_conn_counter *acct;
u64 pkts, bytes;
acct = nf_conn_acct_find(ct);
if (!acct)
return 0;
if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
pkts = atomic64_xchg(&acct[dir].packets, 0);
bytes = atomic64_xchg(&acct[dir].bytes, 0);
} else {
pkts = atomic64_read(&acct[dir].packets);
bytes = atomic64_read(&acct[dir].bytes);
}
return dump_counters(skb, pkts, bytes, dir);
}
static int
ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_count;
const struct nf_conn_tstamp *tstamp;
tstamp = nf_conn_tstamp_find(ct);
if (!tstamp)
return 0;
nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
if (!nest_count)
goto nla_put_failure;
if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) ||
(tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP,
cpu_to_be64(tstamp->stop))))
goto nla_put_failure;
nla_nest_end(skb, nest_count);
return 0;
nla_put_failure:
return -1;
}
#ifdef CONFIG_NF_CONNTRACK_MARK
static inline int
ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
#else
#define ctnetlink_dump_mark(a, b) (0)
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static inline int
ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_secctx;
int len, ret;
char *secctx;
ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
if (ret)
return 0;
ret = -1;
nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
if (!nest_secctx)
goto nla_put_failure;
if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
goto nla_put_failure;
nla_nest_end(skb, nest_secctx);
ret = 0;
nla_put_failure:
security_release_secctx(secctx, len);
return ret;
}
#else
#define ctnetlink_dump_secctx(a, b) (0)
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
static int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
if (!labels)
return 0;
return nla_total_size(labels->words * sizeof(long));
}
static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
unsigned int len, i;
if (!labels)
return 0;
len = labels->words * sizeof(long);
i = 0;
do {
if (labels->bits[i] != 0)
return nla_put(skb, CTA_LABELS, len, labels->bits);
i++;
} while (i < labels->words);
return 0;
}
#else
#define ctnetlink_dump_labels(a, b) (0)
#define ctnetlink_label_size(a) (0)
#endif
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
static inline int
ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_parms;
if (!(ct->status & IPS_EXPECTED))
return 0;
nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
return 0;
nla_put_failure:
return -1;
}
#ifdef CONFIG_NF_NAT_NEEDED
[NETFILTER]: Kill some supper dupper bloatry /me awards the bloatiest-of-all-net/-.c-code award to nf_conntrack_netlink.c, congratulations to all the authors :-/! Hall of (unquestionable) fame (measured per inline, top 10 under net/): -4496 ctnetlink_parse_tuple netfilter/nf_conntrack_netlink.c -2165 ctnetlink_dump_tuples netfilter/nf_conntrack_netlink.c -2115 __ip_vs_get_out_rt ipv4/ipvs/ip_vs_xmit.c -1924 xfrm_audit_helper_pktinfo xfrm/xfrm_state.c -1799 ctnetlink_parse_tuple_proto netfilter/nf_conntrack_netlink.c -1268 ctnetlink_parse_tuple_ip netfilter/nf_conntrack_netlink.c -1093 ctnetlink_exp_dump_expect netfilter/nf_conntrack_netlink.c -1060 void ccid3_update_send_interval dccp/ccids/ccid3.c -983 ctnetlink_dump_tuples_proto netfilter/nf_conntrack_netlink.c -827 ctnetlink_exp_dump_tuple netfilter/nf_conntrack_netlink.c (i386 / gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13) / allyesconfig except CONFIG_FORCED_INLINING) ...and I left < 200 byte gains as future work item. After iterative inline removal, I finally have this: net/netfilter/nf_conntrack_netlink.c: ctnetlink_exp_fill_info | -1104 ctnetlink_new_expect | -1572 ctnetlink_fill_info | -1303 ctnetlink_new_conntrack | -2230 ctnetlink_get_expect | -341 ctnetlink_del_expect | -352 ctnetlink_expect_event | -1110 ctnetlink_conntrack_event | -1548 ctnetlink_del_conntrack | -729 ctnetlink_get_conntrack | -728 10 functions changed, 11017 bytes removed, diff: -11017 net/netfilter/nf_conntrack_netlink.c: ctnetlink_parse_tuple | +419 dump_nat_seq_adj | +183 ctnetlink_dump_counters | +166 ctnetlink_dump_tuples | +261 ctnetlink_exp_dump_expect | +633 ctnetlink_change_status | +460 6 functions changed, 2122 bytes added, diff: +2122 net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11017 bytes removed, diff: -8895 Without a number of CONFIG.*DEBUGs, I got this: net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11029 bytes removed, diff: -8907 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-06 07:11:31 +00:00
static int
dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
{
struct nlattr *nest_parms;
nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (nla_put_be32(skb, CTA_NAT_SEQ_CORRECTION_POS,
htonl(natseq->correction_pos)) ||
nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_BEFORE,
htonl(natseq->offset_before)) ||
nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_AFTER,
htonl(natseq->offset_after)))
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
return 0;
nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nf_nat_seq *natseq;
struct nf_conn_nat *nat = nfct_nat(ct);
if (!(ct->status & IPS_SEQ_ADJUST) || !nat)
return 0;
natseq = &nat->seq[IP_CT_DIR_ORIGINAL];
if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1)
return -1;
natseq = &nat->seq[IP_CT_DIR_REPLY];
if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1)
return -1;
return 0;
}
#else
#define ctnetlink_dump_nat_seq_adj(a, b) (0)
#endif
static inline int
ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
static inline int
ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
static int
ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct nf_conn *ct)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = nf_ct_l3num(ct);
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
if (nf_ct_zone(ct) &&
nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
goto nla_put_failure;
if (ctnetlink_dump_status(skb, ct) < 0 ||
ctnetlink_dump_timeout(skb, ct) < 0 ||
ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, type) < 0 ||
ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, type) < 0 ||
ctnetlink_dump_timestamp(skb, ct) < 0 ||
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
ctnetlink_dump_mark(skb, ct) < 0 ||
ctnetlink_dump_secctx(skb, ct) < 0 ||
ctnetlink_dump_labels(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
ctnetlink_dump_use(skb, ct) < 0 ||
ctnetlink_dump_master(skb, ct) < 0 ||
ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
goto nla_put_failure;
nlmsg_end(skb, nlh);
return skb->len;
nlmsg_failure:
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -1;
}
static inline size_t
ctnetlink_proto_size(const struct nf_conn *ct)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
size_t len = 0;
rcu_read_lock();
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
len += l3proto->nla_size;
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
len += l4proto->nla_size;
rcu_read_unlock();
return len;
}
static inline size_t
ctnetlink_counters_size(const struct nf_conn *ct)
{
if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
return 0;
return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */
+ 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
+ 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */
;
}
static inline int
ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
int len, ret;
ret = security_secid_to_secctx(ct->secmark, NULL, &len);
if (ret)
return 0;
return nla_total_size(0) /* CTA_SECCTX */
+ nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
#else
return 0;
#endif
}
static inline size_t
ctnetlink_timestamp_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
return 0;
return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
#else
return 0;
#endif
}
static inline size_t
ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
return NLMSG_ALIGN(sizeof(struct nfgenmsg))
+ 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+ 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+ 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ ctnetlink_counters_size(ct)
+ ctnetlink_timestamp_size(ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ nla_total_size(0) /* CTA_PROTOINFO */
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
#ifdef CONFIG_NF_NAT_NEEDED
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
+ nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
+ ctnetlink_proto_size(ct)
+ ctnetlink_label_size(ct)
;
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
struct net *net;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
struct nf_conn *ct = item->ct;
struct sk_buff *skb;
unsigned int type;
unsigned int flags = 0, group;
netfilter: conntrack: optional reliable conntrack event delivery This patch improves ctnetlink event reliability if one broadcast listener has set the NETLINK_BROADCAST_ERROR socket option. The logic is the following: if an event delivery fails, we keep the undelivered events in the missed event cache. Once the next packet arrives, we add the new events (if any) to the missed events in the cache and we try a new delivery, and so on. Thus, if ctnetlink fails to deliver an event, we try to deliver them once we see a new packet. Therefore, we may lose state transitions but the userspace process gets in sync at some point. At worst case, if no events were delivered to userspace, we make sure that destroy events are successfully delivered. Basically, if ctnetlink fails to deliver the destroy event, we remove the conntrack entry from the hashes and we insert them in the dying list, which contains inactive entries. Then, the conntrack timer is added with an extra grace timeout of random32() % 15 seconds to trigger the event again (this grace timeout is tunable via /proc). The use of a limited random timeout value allows distributing the "destroy" resends, thus, avoiding accumulating lots "destroy" events at the same time. Event delivery may re-order but we can identify them by means of the tuple plus the conntrack ID. The maximum number of conntrack entries (active or inactive) is still handled by nf_conntrack_max. Thus, we may start dropping packets at some point if we accumulate a lot of inactive conntrack entries that did not successfully report the destroy event to userspace. During my stress tests consisting of setting a very small buffer of 2048 bytes for conntrackd and the NETLINK_BROADCAST_ERROR socket flag, and generating lots of very small connections, I noticed very few destroy entries on the fly waiting to be resend. A simple way to test this patch consist of creating a lot of entries, set a very small Netlink buffer in conntrackd (+ a patch which is not in the git tree to set the BROADCAST_ERROR flag) and invoke `conntrack -F'. For expectations, no changes are introduced in this patch. Currently, event delivery is only done for new expectations (no events from expectation expiration, removal and confirmation). In that case, they need a per-expectation event cache to implement the same idea that is exposed in this patch. This patch can be useful to provide reliable flow-accouting. We still have to add a new conntrack extension to store the creation and destroy time. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: Patrick McHardy <kaber@trash.net>
2009-06-13 10:30:52 +00:00
int err;
/* ignore our fake conntrack entry */
if (nf_ct_is_untracked(ct))
return 0;
if (events & (1 << IPCT_DESTROY)) {
type = IPCTNL_MSG_CT_DELETE;
group = NFNLGRP_CONNTRACK_DESTROY;
} else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
type = IPCTNL_MSG_CT_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
group = NFNLGRP_CONNTRACK_NEW;
} else if (events) {
type = IPCTNL_MSG_CT_NEW;
group = NFNLGRP_CONNTRACK_UPDATE;
} else
return 0;
net = nf_ct_net(ct);
if (!item->report && !nfnetlink_has_listeners(net, group))
return 0;
skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC);
if (skb == NULL)
goto errout;
type |= NFNL_SUBSYS_CTNETLINK << 8;
nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = nf_ct_l3num(ct);
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
rcu_read_lock();
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
if (nf_ct_zone(ct) &&
nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
goto nla_put_failure;
if (ctnetlink_dump_id(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_status(skb, ct) < 0)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
if (ctnetlink_dump_counters(skb, ct,
IP_CT_DIR_ORIGINAL, type) < 0 ||
ctnetlink_dump_counters(skb, ct,
IP_CT_DIR_REPLY, type) < 0 ||
ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
} else {
if (ctnetlink_dump_timeout(skb, ct) < 0)
goto nla_put_failure;
if (events & (1 << IPCT_PROTOINFO)
&& ctnetlink_dump_protoinfo(skb, ct) < 0)
goto nla_put_failure;
if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
&& ctnetlink_dump_helpinfo(skb, ct) < 0)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_SECMARK
if ((events & (1 << IPCT_SECMARK) || ct->secmark)
&& ctnetlink_dump_secctx(skb, ct) < 0)
goto nla_put_failure;
#endif
if (events & (1 << IPCT_LABEL) &&
ctnetlink_dump_labels(skb, ct) < 0)
goto nla_put_failure;
if (events & (1 << IPCT_RELATED) &&
ctnetlink_dump_master(skb, ct) < 0)
goto nla_put_failure;
if (events & (1 << IPCT_NATSEQADJ) &&
ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
goto nla_put_failure;
}
#ifdef CONFIG_NF_CONNTRACK_MARK
if ((events & (1 << IPCT_MARK) || ct->mark)
&& ctnetlink_dump_mark(skb, ct) < 0)
goto nla_put_failure;
#endif
rcu_read_unlock();
nlmsg_end(skb, nlh);
err = nfnetlink_send(skb, net, item->portid, group, item->report,
GFP_ATOMIC);
netfilter: conntrack: optional reliable conntrack event delivery This patch improves ctnetlink event reliability if one broadcast listener has set the NETLINK_BROADCAST_ERROR socket option. The logic is the following: if an event delivery fails, we keep the undelivered events in the missed event cache. Once the next packet arrives, we add the new events (if any) to the missed events in the cache and we try a new delivery, and so on. Thus, if ctnetlink fails to deliver an event, we try to deliver them once we see a new packet. Therefore, we may lose state transitions but the userspace process gets in sync at some point. At worst case, if no events were delivered to userspace, we make sure that destroy events are successfully delivered. Basically, if ctnetlink fails to deliver the destroy event, we remove the conntrack entry from the hashes and we insert them in the dying list, which contains inactive entries. Then, the conntrack timer is added with an extra grace timeout of random32() % 15 seconds to trigger the event again (this grace timeout is tunable via /proc). The use of a limited random timeout value allows distributing the "destroy" resends, thus, avoiding accumulating lots "destroy" events at the same time. Event delivery may re-order but we can identify them by means of the tuple plus the conntrack ID. The maximum number of conntrack entries (active or inactive) is still handled by nf_conntrack_max. Thus, we may start dropping packets at some point if we accumulate a lot of inactive conntrack entries that did not successfully report the destroy event to userspace. During my stress tests consisting of setting a very small buffer of 2048 bytes for conntrackd and the NETLINK_BROADCAST_ERROR socket flag, and generating lots of very small connections, I noticed very few destroy entries on the fly waiting to be resend. A simple way to test this patch consist of creating a lot of entries, set a very small Netlink buffer in conntrackd (+ a patch which is not in the git tree to set the BROADCAST_ERROR flag) and invoke `conntrack -F'. For expectations, no changes are introduced in this patch. Currently, event delivery is only done for new expectations (no events from expectation expiration, removal and confirmation). In that case, they need a per-expectation event cache to implement the same idea that is exposed in this patch. This patch can be useful to provide reliable flow-accouting. We still have to add a new conntrack extension to store the creation and destroy time. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: Patrick McHardy <kaber@trash.net>
2009-06-13 10:30:52 +00:00
if (err == -ENOBUFS || err == -EAGAIN)
return -ENOBUFS;
return 0;
nla_put_failure:
rcu_read_unlock();
nlmsg_cancel(skb, nlh);
nlmsg_failure:
kfree_skb(skb);
errout:
if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
return -ENOBUFS;
return 0;
}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */
static int ctnetlink_done(struct netlink_callback *cb)
{
if (cb->args[1])
nf_ct_put((struct nf_conn *)cb->args[1]);
if (cb->data)
kfree(cb->data);
return 0;
}
struct ctnetlink_dump_filter {
struct {
u_int32_t val;
u_int32_t mask;
} mark;
};
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct nf_conn *ct, *last;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
int res;
#ifdef CONFIG_NF_CONNTRACK_MARK
const struct ctnetlink_dump_filter *filter = cb->data;
#endif
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
spin_lock_bh(&nf_conntrack_lock);
last = (struct nf_conn *)cb->args[1];
for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
restart:
hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
if (l3proto && nf_ct_l3num(ct) != l3proto)
continue;
if (cb->args[1]) {
if (ct != last)
continue;
cb->args[1] = 0;
}
#ifdef CONFIG_NF_CONNTRACK_MARK
if (filter && !((ct->mark & filter->mark.mask) ==
filter->mark.val)) {
continue;
}
#endif
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
rcu_read_lock();
res =
ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct);
rcu_read_unlock();
if (res < 0) {
nf_conntrack_get(&ct->ct_general);
cb->args[1] = (unsigned long)ct;
goto out;
}
}
if (cb->args[1]) {
cb->args[1] = 0;
goto restart;
}
}
out:
spin_unlock_bh(&nf_conntrack_lock);
if (last) {
/* nf ct hash resize happened, now clear the leftover. */
if ((struct nf_conn *)cb->args[1] == last)
cb->args[1] = 0;
nf_ct_put(last);
}
return skb->len;
}
static inline int
ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_IP_MAX+1];
struct nf_conntrack_l3proto *l3proto;
int ret = 0;
nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
rcu_read_lock();
l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
if (likely(l3proto->nlattr_to_tuple)) {
ret = nla_validate_nested(attr, CTA_IP_MAX,
l3proto->nla_policy);
if (ret == 0)
ret = l3proto->nlattr_to_tuple(tb, tuple);
}
rcu_read_unlock();
return ret;
}
static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
[CTA_PROTO_NUM] = { .type = NLA_U8 },
};
static inline int
ctnetlink_parse_tuple_proto(struct nlattr *attr,
struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_PROTO_MAX+1];
struct nf_conntrack_l4proto *l4proto;
int ret = 0;
ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy);
if (ret < 0)
return ret;
if (!tb[CTA_PROTO_NUM])
return -EINVAL;
tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
if (likely(l4proto->nlattr_to_tuple)) {
ret = nla_validate_nested(attr, CTA_PROTO_MAX,
l4proto->nla_policy);
if (ret == 0)
ret = l4proto->nlattr_to_tuple(tb, tuple);
}
rcu_read_unlock();
return ret;
}
static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
[CTA_TUPLE_IP] = { .type = NLA_NESTED },
[CTA_TUPLE_PROTO] = { .type = NLA_NESTED },
};
[NETFILTER]: Kill some supper dupper bloatry /me awards the bloatiest-of-all-net/-.c-code award to nf_conntrack_netlink.c, congratulations to all the authors :-/! Hall of (unquestionable) fame (measured per inline, top 10 under net/): -4496 ctnetlink_parse_tuple netfilter/nf_conntrack_netlink.c -2165 ctnetlink_dump_tuples netfilter/nf_conntrack_netlink.c -2115 __ip_vs_get_out_rt ipv4/ipvs/ip_vs_xmit.c -1924 xfrm_audit_helper_pktinfo xfrm/xfrm_state.c -1799 ctnetlink_parse_tuple_proto netfilter/nf_conntrack_netlink.c -1268 ctnetlink_parse_tuple_ip netfilter/nf_conntrack_netlink.c -1093 ctnetlink_exp_dump_expect netfilter/nf_conntrack_netlink.c -1060 void ccid3_update_send_interval dccp/ccids/ccid3.c -983 ctnetlink_dump_tuples_proto netfilter/nf_conntrack_netlink.c -827 ctnetlink_exp_dump_tuple netfilter/nf_conntrack_netlink.c (i386 / gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13) / allyesconfig except CONFIG_FORCED_INLINING) ...and I left < 200 byte gains as future work item. After iterative inline removal, I finally have this: net/netfilter/nf_conntrack_netlink.c: ctnetlink_exp_fill_info | -1104 ctnetlink_new_expect | -1572 ctnetlink_fill_info | -1303 ctnetlink_new_conntrack | -2230 ctnetlink_get_expect | -341 ctnetlink_del_expect | -352 ctnetlink_expect_event | -1110 ctnetlink_conntrack_event | -1548 ctnetlink_del_conntrack | -729 ctnetlink_get_conntrack | -728 10 functions changed, 11017 bytes removed, diff: -11017 net/netfilter/nf_conntrack_netlink.c: ctnetlink_parse_tuple | +419 dump_nat_seq_adj | +183 ctnetlink_dump_counters | +166 ctnetlink_dump_tuples | +261 ctnetlink_exp_dump_expect | +633 ctnetlink_change_status | +460 6 functions changed, 2122 bytes added, diff: +2122 net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11017 bytes removed, diff: -8895 Without a number of CONFIG.*DEBUGs, I got this: net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11029 bytes removed, diff: -8907 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-06 07:11:31 +00:00
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
struct nf_conntrack_tuple *tuple,
enum ctattr_type type, u_int8_t l3num)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
memset(tuple, 0, sizeof(*tuple));
nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
if (!tb[CTA_TUPLE_IP])
return -EINVAL;
tuple->src.l3num = l3num;
err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple);
if (err < 0)
return err;
if (!tb[CTA_TUPLE_PROTO])
return -EINVAL;
err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple);
if (err < 0)
return err;
/* orig and expect tuples get DIR_ORIGINAL */
if (type == CTA_TUPLE_REPLY)
tuple->dst.dir = IP_CT_DIR_REPLY;
else
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
return 0;
}
static int
ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
{
if (attr)
#ifdef CONFIG_NF_CONNTRACK_ZONES
*zone = ntohs(nla_get_be16(attr));
#else
return -EOPNOTSUPP;
#endif
else
*zone = 0;
return 0;
}
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
[CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN - 1 },
};
static inline int
ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
struct nlattr **helpinfo)
{
struct nlattr *tb[CTA_HELP_MAX+1];
nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
if (!tb[CTA_HELP_NAME])
return -EINVAL;
*helper_name = nla_data(tb[CTA_HELP_NAME]);
if (tb[CTA_HELP_INFO])
*helpinfo = tb[CTA_HELP_INFO];
return 0;
}
#define __CTA_LABELS_MAX_LENGTH ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)
static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
[CTA_TUPLE_ORIG] = { .type = NLA_NESTED },
[CTA_TUPLE_REPLY] = { .type = NLA_NESTED },
[CTA_STATUS] = { .type = NLA_U32 },
[CTA_PROTOINFO] = { .type = NLA_NESTED },
[CTA_HELP] = { .type = NLA_NESTED },
[CTA_NAT_SRC] = { .type = NLA_NESTED },
[CTA_TIMEOUT] = { .type = NLA_U32 },
[CTA_MARK] = { .type = NLA_U32 },
[CTA_ID] = { .type = NLA_U32 },
[CTA_NAT_DST] = { .type = NLA_NESTED },
[CTA_TUPLE_MASTER] = { .type = NLA_NESTED },
[CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED },
[CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },
[CTA_ZONE] = { .type = NLA_U16 },
[CTA_MARK_MASK] = { .type = NLA_U32 },
[CTA_LABELS] = { .type = NLA_BINARY,
.len = __CTA_LABELS_MAX_LENGTH },
[CTA_LABELS_MASK] = { .type = NLA_BINARY,
.len = __CTA_LABELS_MAX_LENGTH },
};
static int
ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
u16 zone;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
if (err < 0)
return err;
if (cda[CTA_TUPLE_ORIG])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
else if (cda[CTA_TUPLE_REPLY])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
else {
/* Flush the whole table */
nf_conntrack_flush_report(net,
NETLINK_CB(skb).portid,
nlmsg_report(nlh));
return 0;
}
if (err < 0)
return err;
h = nf_conntrack_find_get(net, zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
if (cda[CTA_ID]) {
u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
if (id != (u32)(unsigned long)ct) {
nf_ct_put(ct);
return -ENOENT;
}
}
if (del_timer(&ct->timeout)) {
if (nf_conntrack_event_report(IPCT_DESTROY, ct,
NETLINK_CB(skb).portid,
nlmsg_report(nlh)) < 0) {
nf_ct_delete_from_lists(ct);
/* we failed to report the event, try later */
nf_ct_dying_timeout(ct);
nf_ct_put(ct);
return 0;
}
/* death_by_timeout would report the event again */
set_bit(IPS_DYING_BIT, &ct->status);
netfilter: conntrack: optional reliable conntrack event delivery This patch improves ctnetlink event reliability if one broadcast listener has set the NETLINK_BROADCAST_ERROR socket option. The logic is the following: if an event delivery fails, we keep the undelivered events in the missed event cache. Once the next packet arrives, we add the new events (if any) to the missed events in the cache and we try a new delivery, and so on. Thus, if ctnetlink fails to deliver an event, we try to deliver them once we see a new packet. Therefore, we may lose state transitions but the userspace process gets in sync at some point. At worst case, if no events were delivered to userspace, we make sure that destroy events are successfully delivered. Basically, if ctnetlink fails to deliver the destroy event, we remove the conntrack entry from the hashes and we insert them in the dying list, which contains inactive entries. Then, the conntrack timer is added with an extra grace timeout of random32() % 15 seconds to trigger the event again (this grace timeout is tunable via /proc). The use of a limited random timeout value allows distributing the "destroy" resends, thus, avoiding accumulating lots "destroy" events at the same time. Event delivery may re-order but we can identify them by means of the tuple plus the conntrack ID. The maximum number of conntrack entries (active or inactive) is still handled by nf_conntrack_max. Thus, we may start dropping packets at some point if we accumulate a lot of inactive conntrack entries that did not successfully report the destroy event to userspace. During my stress tests consisting of setting a very small buffer of 2048 bytes for conntrackd and the NETLINK_BROADCAST_ERROR socket flag, and generating lots of very small connections, I noticed very few destroy entries on the fly waiting to be resend. A simple way to test this patch consist of creating a lot of entries, set a very small Netlink buffer in conntrackd (+ a patch which is not in the git tree to set the BROADCAST_ERROR flag) and invoke `conntrack -F'. For expectations, no changes are introduced in this patch. Currently, event delivery is only done for new expectations (no events from expectation expiration, removal and confirmation). In that case, they need a per-expectation event cache to implement the same idea that is exposed in this patch. This patch can be useful to provide reliable flow-accouting. We still have to add a new conntrack extension to store the creation and destroy time. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: Patrick McHardy <kaber@trash.net>
2009-06-13 10:30:52 +00:00
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
}
nf_ct_put(ct);
return 0;
}
static int
ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
struct sk_buff *skb2 = NULL;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
u16 zone;
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_table,
.done = ctnetlink_done,
};
#ifdef CONFIG_NF_CONNTRACK_MARK
if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
struct ctnetlink_dump_filter *filter;
filter = kzalloc(sizeof(struct ctnetlink_dump_filter),
GFP_ATOMIC);
if (filter == NULL)
return -ENOMEM;
filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
filter->mark.mask =
ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
c.data = filter;
}
#endif
return netlink_dump_start(ctnl, skb, nlh, &c);
}
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
if (err < 0)
return err;
if (cda[CTA_TUPLE_ORIG])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
else if (cda[CTA_TUPLE_REPLY])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
else
return -EINVAL;
if (err < 0)
return err;
h = nf_conntrack_find_get(net, zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (skb2 == NULL) {
nf_ct_put(ct);
return -ENOMEM;
}
rcu_read_lock();
err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
NFNL_MSG_TYPE(nlh->nlmsg_type), ct);
rcu_read_unlock();
nf_ct_put(ct);
if (err <= 0)
goto free;
err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
if (err < 0)
goto out;
return 0;
free:
kfree_skb(skb2);
out:
/* this avoids a loop in nfnetlink. */
return err == -EAGAIN ? -ENOBUFS : err;
}
static int ctnetlink_done_list(struct netlink_callback *cb)
{
if (cb->args[1])
nf_ct_put((struct nf_conn *)cb->args[1]);
return 0;
}
static int
ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb,
struct hlist_nulls_head *list)
{
struct nf_conn *ct, *last;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
int res;
if (cb->args[2])
return 0;
spin_lock_bh(&nf_conntrack_lock);
last = (struct nf_conn *)cb->args[1];
restart:
hlist_nulls_for_each_entry(h, n, list, hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (l3proto && nf_ct_l3num(ct) != l3proto)
continue;
if (cb->args[1]) {
if (ct != last)
continue;
cb->args[1] = 0;
}
rcu_read_lock();
res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct);
rcu_read_unlock();
if (res < 0) {
nf_conntrack_get(&ct->ct_general);
cb->args[1] = (unsigned long)ct;
goto out;
}
}
if (cb->args[1]) {
cb->args[1] = 0;
goto restart;
} else
cb->args[2] = 1;
out:
spin_unlock_bh(&nf_conntrack_lock);
if (last)
nf_ct_put(last);
return skb->len;
}
static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
return ctnetlink_dump_list(skb, cb, &net->ct.dying);
}
static int
ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_dying,
.done = ctnetlink_done_list,
};
return netlink_dump_start(ctnl, skb, nlh, &c);
}
return -EOPNOTSUPP;
}
static int
ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
return ctnetlink_dump_list(skb, cb, &net->ct.unconfirmed);
}
static int
ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_unconfirmed,
.done = ctnetlink_done_list,
};
return netlink_dump_start(ctnl, skb, nlh, &c);
}
return -EOPNOTSUPP;
}
#ifdef CONFIG_NF_NAT_NEEDED
static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
{
typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
int err;
parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
if (!parse_nat_setup) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
if (request_module("nf-nat") < 0) {
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
return -EOPNOTSUPP;
}
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
if (nfnetlink_parse_nat_setup_hook)
return -EAGAIN;
#endif
return -EOPNOTSUPP;
}
err = parse_nat_setup(ct, manip, attr);
if (err == -EAGAIN) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) {
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
return -EOPNOTSUPP;
}
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
#else
err = -EOPNOTSUPP;
#endif
}
return err;
}
#endif
[NETFILTER]: Kill some supper dupper bloatry /me awards the bloatiest-of-all-net/-.c-code award to nf_conntrack_netlink.c, congratulations to all the authors :-/! Hall of (unquestionable) fame (measured per inline, top 10 under net/): -4496 ctnetlink_parse_tuple netfilter/nf_conntrack_netlink.c -2165 ctnetlink_dump_tuples netfilter/nf_conntrack_netlink.c -2115 __ip_vs_get_out_rt ipv4/ipvs/ip_vs_xmit.c -1924 xfrm_audit_helper_pktinfo xfrm/xfrm_state.c -1799 ctnetlink_parse_tuple_proto netfilter/nf_conntrack_netlink.c -1268 ctnetlink_parse_tuple_ip netfilter/nf_conntrack_netlink.c -1093 ctnetlink_exp_dump_expect netfilter/nf_conntrack_netlink.c -1060 void ccid3_update_send_interval dccp/ccids/ccid3.c -983 ctnetlink_dump_tuples_proto netfilter/nf_conntrack_netlink.c -827 ctnetlink_exp_dump_tuple netfilter/nf_conntrack_netlink.c (i386 / gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13) / allyesconfig except CONFIG_FORCED_INLINING) ...and I left < 200 byte gains as future work item. After iterative inline removal, I finally have this: net/netfilter/nf_conntrack_netlink.c: ctnetlink_exp_fill_info | -1104 ctnetlink_new_expect | -1572 ctnetlink_fill_info | -1303 ctnetlink_new_conntrack | -2230 ctnetlink_get_expect | -341 ctnetlink_del_expect | -352 ctnetlink_expect_event | -1110 ctnetlink_conntrack_event | -1548 ctnetlink_del_conntrack | -729 ctnetlink_get_conntrack | -728 10 functions changed, 11017 bytes removed, diff: -11017 net/netfilter/nf_conntrack_netlink.c: ctnetlink_parse_tuple | +419 dump_nat_seq_adj | +183 ctnetlink_dump_counters | +166 ctnetlink_dump_tuples | +261 ctnetlink_exp_dump_expect | +633 ctnetlink_change_status | +460 6 functions changed, 2122 bytes added, diff: +2122 net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11017 bytes removed, diff: -8895 Without a number of CONFIG.*DEBUGs, I got this: net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11029 bytes removed, diff: -8907 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-06 07:11:31 +00:00
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
unsigned long d;
unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
d = ct->status ^ status;
if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
/* unchangeable */
return -EBUSY;
if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
/* SEEN_REPLY bit can only be set */
return -EBUSY;
if (d & IPS_ASSURED && !(status & IPS_ASSURED))
/* ASSURED bit can only be set */
return -EBUSY;
/* Be careful here, modifying NAT bits can screw up things,
* so don't let users modify them directly if they don't pass
* nf_nat_range. */
ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
return 0;
}
static int
ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
{
#ifdef CONFIG_NF_NAT_NEEDED
int ret;
if (cda[CTA_NAT_DST]) {
ret = ctnetlink_parse_nat_setup(ct,
NF_NAT_MANIP_DST,
cda[CTA_NAT_DST]);
if (ret < 0)
return ret;
}
if (cda[CTA_NAT_SRC]) {
ret = ctnetlink_parse_nat_setup(ct,
NF_NAT_MANIP_SRC,
cda[CTA_NAT_SRC]);
if (ret < 0)
return ret;
}
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static inline int
ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
{
struct nf_conntrack_helper *helper;
struct nf_conn_help *help = nfct_help(ct);
char *helpname = NULL;
struct nlattr *helpinfo = NULL;
int err;
/* don't change helper of sibling connections */
if (ct->master)
return -EBUSY;
err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
if (err < 0)
return err;
if (!strcmp(helpname, "")) {
if (help && help->helper) {
/* we had a helper before ... */
nf_ct_remove_expectations(ct);
RCU_INIT_POINTER(help->helper, NULL);
}
return 0;
}
helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
nf_ct_protonum(ct));
if (helper == NULL) {
#ifdef CONFIG_MODULES
spin_unlock_bh(&nf_conntrack_lock);
if (request_module("nfct-helper-%s", helpname) < 0) {
spin_lock_bh(&nf_conntrack_lock);
return -EOPNOTSUPP;
}
spin_lock_bh(&nf_conntrack_lock);
helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
nf_ct_protonum(ct));
if (helper)
return -EAGAIN;
#endif
return -EOPNOTSUPP;
}
if (help) {
if (help->helper == helper) {
/* update private helper data if allowed. */
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
return 0;
} else
return -EBUSY;
}
/* we cannot set a helper for an existing conntrack */
return -EOPNOTSUPP;
}
static inline int
ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[])
{
u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
if (!del_timer(&ct->timeout))
return -ETIME;
ct->timeout.expires = jiffies + timeout * HZ;
add_timer(&ct->timeout);
/* Refresh the NAT type entry. */
#if defined(CONFIG_IP_NF_TARGET_NATTYPE_MODULE)
(void)nattype_refresh_timer(ct->nattype_entry, ct->timeout.expires);
#endif
return 0;
}
static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
[CTA_PROTOINFO_TCP] = { .type = NLA_NESTED },
[CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED },
[CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
};
static inline int
ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[])
{
const struct nlattr *attr = cda[CTA_PROTOINFO];
struct nlattr *tb[CTA_PROTOINFO_MAX+1];
struct nf_conntrack_l4proto *l4proto;
int err = 0;
nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
rcu_read_lock();
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
if (l4proto->from_nlattr)
err = l4proto->from_nlattr(tb, ct);
rcu_read_unlock();
return err;
}
#ifdef CONFIG_NF_NAT_NEEDED
static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = {
[CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 },
[CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 },
[CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 },
};
static inline int
change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr)
{
struct nlattr *cda[CTA_NAT_SEQ_MAX+1];
nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy);
if (!cda[CTA_NAT_SEQ_CORRECTION_POS])
return -EINVAL;
natseq->correction_pos =
ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS]));
if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE])
return -EINVAL;
natseq->offset_before =
ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE]));
if (!cda[CTA_NAT_SEQ_OFFSET_AFTER])
return -EINVAL;
natseq->offset_after =
ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER]));
return 0;
}
static int
ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
const struct nlattr * const cda[])
{
int ret = 0;
struct nf_conn_nat *nat = nfct_nat(ct);
if (!nat)
return 0;
if (cda[CTA_NAT_SEQ_ADJ_ORIG]) {
ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL],
cda[CTA_NAT_SEQ_ADJ_ORIG]);
if (ret < 0)
return ret;
ct->status |= IPS_SEQ_ADJUST;
}
if (cda[CTA_NAT_SEQ_ADJ_REPLY]) {
ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY],
cda[CTA_NAT_SEQ_ADJ_REPLY]);
if (ret < 0)
return ret;
ct->status |= IPS_SEQ_ADJUST;
}
return 0;
}
#endif
static int
ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
size_t len = nla_len(cda[CTA_LABELS]);
const void *mask = cda[CTA_LABELS_MASK];
if (len & (sizeof(u32)-1)) /* must be multiple of u32 */
return -EINVAL;
if (mask) {
if (nla_len(cda[CTA_LABELS_MASK]) == 0 ||
nla_len(cda[CTA_LABELS_MASK]) != len)
return -EINVAL;
mask = nla_data(cda[CTA_LABELS_MASK]);
}
len /= sizeof(u32);
return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len);
#else
return -EOPNOTSUPP;
#endif
}
static int
ctnetlink_change_conntrack(struct nf_conn *ct,
const struct nlattr * const cda[])
{
int err;
/* only allow NAT changes and master assignation for new conntracks */
if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER])
return -EOPNOTSUPP;
if (cda[CTA_HELP]) {
err = ctnetlink_change_helper(ct, cda);
if (err < 0)
return err;
}
if (cda[CTA_TIMEOUT]) {
err = ctnetlink_change_timeout(ct, cda);
if (err < 0)
return err;
}
if (cda[CTA_STATUS]) {
err = ctnetlink_change_status(ct, cda);
if (err < 0)
return err;
}
if (cda[CTA_PROTOINFO]) {
err = ctnetlink_change_protoinfo(ct, cda);
if (err < 0)
return err;
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
if (cda[CTA_MARK])
ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
#endif
#ifdef CONFIG_NF_NAT_NEEDED
if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
err = ctnetlink_change_nat_seq_adj(ct, cda);
if (err < 0)
return err;
}
#endif
if (cda[CTA_LABELS]) {
err = ctnetlink_attach_labels(ct, cda);
if (err < 0)
return err;
}
return 0;
}
static struct nf_conn *
ctnetlink_create_conntrack(struct net *net, u16 zone,
const struct nlattr * const cda[],
struct nf_conntrack_tuple *otuple,
struct nf_conntrack_tuple *rtuple,
u8 u3)
{
struct nf_conn *ct;
int err = -EINVAL;
struct nf_conntrack_helper *helper;
struct nf_conn_tstamp *tstamp;
ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
if (IS_ERR(ct))
return ERR_PTR(-ENOMEM);
if (!cda[CTA_TIMEOUT])
goto err1;
ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
rcu_read_lock();
if (cda[CTA_HELP]) {
char *helpname = NULL;
struct nlattr *helpinfo = NULL;
err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
if (err < 0)
goto err2;
helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
nf_ct_protonum(ct));
if (helper == NULL) {
rcu_read_unlock();
#ifdef CONFIG_MODULES
if (request_module("nfct-helper-%s", helpname) < 0) {
err = -EOPNOTSUPP;
goto err1;
}
rcu_read_lock();
helper = __nf_conntrack_helper_find(helpname,
nf_ct_l3num(ct),
nf_ct_protonum(ct));
if (helper) {
err = -EAGAIN;
goto err2;
}
rcu_read_unlock();
#endif
err = -EOPNOTSUPP;
goto err1;
} else {
struct nf_conn_help *help;
help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
if (help == NULL) {
err = -ENOMEM;
goto err2;
}
/* set private helper data if allowed. */
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
/* not in hash table yet so not strictly necessary */
RCU_INIT_POINTER(help->helper, helper);
}
} else {
/* try an implicit helper assignation */
err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
if (err < 0)
goto err2;
}
if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) {
err = ctnetlink_change_nat(ct, cda);
if (err < 0)
goto err2;
}
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
if (cda[CTA_STATUS]) {
err = ctnetlink_change_status(ct, cda);
if (err < 0)
goto err2;
}
#ifdef CONFIG_NF_NAT_NEEDED
if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
err = ctnetlink_change_nat_seq_adj(ct, cda);
if (err < 0)
goto err2;
}
#endif
memset(&ct->proto, 0, sizeof(ct->proto));
if (cda[CTA_PROTOINFO]) {
err = ctnetlink_change_protoinfo(ct, cda);
if (err < 0)
goto err2;
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
if (cda[CTA_MARK])
ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
#endif
/* setup master conntrack: this is a confirmed expectation */
if (cda[CTA_TUPLE_MASTER]) {
struct nf_conntrack_tuple master;
struct nf_conntrack_tuple_hash *master_h;
struct nf_conn *master_ct;
err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3);
if (err < 0)
goto err2;
master_h = nf_conntrack_find_get(net, zone, &master);
if (master_h == NULL) {
err = -ENOENT;
goto err2;
}
master_ct = nf_ct_tuplehash_to_ctrack(master_h);
__set_bit(IPS_EXPECTED_BIT, &ct->status);
ct->master = master_ct;
}
tstamp = nf_conn_tstamp_find(ct);
if (tstamp)
tstamp->start = ktime_to_ns(ktime_get_real());
err = nf_conntrack_hash_check_insert(ct);
if (err < 0)
goto err2;
rcu_read_unlock();
return ct;
err2:
rcu_read_unlock();
err1:
nf_conntrack_free(ct);
return ERR_PTR(err);
}
static int
ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple otuple, rtuple;
struct nf_conntrack_tuple_hash *h = NULL;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nf_conn *ct;
u_int8_t u3 = nfmsg->nfgen_family;
u16 zone;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
if (err < 0)
return err;
if (cda[CTA_TUPLE_ORIG]) {
err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
if (err < 0)
return err;
}
if (cda[CTA_TUPLE_REPLY]) {
err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
if (err < 0)
return err;
}
if (cda[CTA_TUPLE_ORIG])
h = nf_conntrack_find_get(net, zone, &otuple);
else if (cda[CTA_TUPLE_REPLY])
h = nf_conntrack_find_get(net, zone, &rtuple);
if (h == NULL) {
err = -ENOENT;
if (nlh->nlmsg_flags & NLM_F_CREATE) {
enum ip_conntrack_events events;
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
return -EINVAL;
ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
&rtuple, u3);
if (IS_ERR(ct))
return PTR_ERR(ct);
err = 0;
if (test_bit(IPS_EXPECTED_BIT, &ct->status))
events = IPCT_RELATED;
else
events = IPCT_NEW;
if (cda[CTA_LABELS] &&
ctnetlink_attach_labels(ct, cda) == 0)
events |= (1 << IPCT_LABEL);
nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
(1 << IPCT_ASSURED) |
(1 << IPCT_HELPER) |
(1 << IPCT_PROTOINFO) |
(1 << IPCT_NATSEQADJ) |
(1 << IPCT_MARK) | events,
ct, NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_put(ct);
}
return err;
}
/* implicit 'else' */
err = -EEXIST;
ct = nf_ct_tuplehash_to_ctrack(h);
if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
spin_lock_bh(&nf_conntrack_lock);
err = ctnetlink_change_conntrack(ct, cda);
spin_unlock_bh(&nf_conntrack_lock);
if (err == 0) {
nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
(1 << IPCT_ASSURED) |
(1 << IPCT_HELPER) |
(1 << IPCT_LABEL) |
(1 << IPCT_PROTOINFO) |
(1 << IPCT_NATSEQADJ) |
(1 << IPCT_MARK),
ct, NETLINK_CB(skb).portid,
nlmsg_report(nlh));
}
}
nf_ct_put(ct);
return err;
}
static int
ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
__u16 cpu, const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = AF_UNSPEC;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = htons(cpu);
if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) ||
nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
htonl(st->search_restart)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
return skb->len;
nla_put_failure:
nlmsg_failure:
nlmsg_cancel(skb, nlh);
return -1;
}
static int
ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int cpu;
struct net *net = sock_net(skb->sk);
if (cb->args[0] == nr_cpu_ids)
return 0;
for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
const struct ip_conntrack_stat *st;
if (!cpu_possible(cpu))
continue;
st = per_cpu_ptr(net->ct.stat, cpu);
if (ctnetlink_ct_stat_cpu_fill_info(skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
cpu, st) < 0)
break;
}
cb->args[0] = cpu;
return skb->len;
}
static int
ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_ct_stat_cpu_dump,
};
return netlink_dump_start(ctnl, skb, nlh, &c);
}
return 0;
}
static int
ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct net *net)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
unsigned int nr_conntracks = atomic_read(&net->ct.count);
event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = AF_UNSPEC;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
return skb->len;
nla_put_failure:
nlmsg_failure:
nlmsg_cancel(skb, nlh);
return -1;
}
static int
ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
struct sk_buff *skb2;
int err;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (skb2 == NULL)
return -ENOMEM;
err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
nlh->nlmsg_seq,
NFNL_MSG_TYPE(nlh->nlmsg_type),
sock_net(skb->sk));
if (err <= 0)
goto free;
err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
if (err < 0)
goto out;
return 0;
free:
kfree_skb(skb2);
out:
/* this avoids a loop in nfnetlink. */
return err == -EAGAIN ? -ENOBUFS : err;
}
#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
static size_t
ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
{
return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+ 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+ 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ nla_total_size(0) /* CTA_PROTOINFO */
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
#ifdef CONFIG_NF_NAT_NEEDED
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
+ nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
#endif
+ ctnetlink_proto_size(ct)
;
}
static int
ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
{
struct nlattr *nest_parms;
rcu_read_lock();
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
if (nf_ct_zone(ct)) {
if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
goto nla_put_failure;
}
if (ctnetlink_dump_id(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_status(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_timeout(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_protoinfo(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_helpinfo(skb, ct) < 0)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_SECMARK
if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0)
goto nla_put_failure;
#endif
if (ct->master && ctnetlink_dump_master(skb, ct) < 0)
goto nla_put_failure;
if ((ct->status & IPS_SEQ_ADJUST) &&
ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_MARK
if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
goto nla_put_failure;
#endif
if (ctnetlink_dump_labels(skb, ct) < 0)
goto nla_put_failure;
rcu_read_unlock();
return 0;
nla_put_failure:
rcu_read_unlock();
return -ENOSPC;
}
static int
ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
{
int err;
if (cda[CTA_TIMEOUT]) {
err = ctnetlink_change_timeout(ct, cda);
if (err < 0)
return err;
}
if (cda[CTA_STATUS]) {
err = ctnetlink_change_status(ct, cda);
if (err < 0)
return err;
}
if (cda[CTA_HELP]) {
err = ctnetlink_change_helper(ct, cda);
if (err < 0)
return err;
}
if (cda[CTA_LABELS]) {
err = ctnetlink_attach_labels(ct, cda);
if (err < 0)
return err;
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
if (cda[CTA_MARK])
ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
#endif
return 0;
}
static int
ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
{
struct nlattr *cda[CTA_MAX+1];
int ret;
nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy);
spin_lock_bh(&nf_conntrack_lock);
ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
spin_unlock_bh(&nf_conntrack_lock);
return ret;
}
static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
.build_size = ctnetlink_nfqueue_build_size,
.build = ctnetlink_nfqueue_build,
.parse = ctnetlink_nfqueue_parse,
};
#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
/***********************************************************************
* EXPECT
***********************************************************************/
static inline int
ctnetlink_exp_dump_tuple(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
enum ctattr_expect type)
{
struct nlattr *nest_parms;
nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, tuple) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
return 0;
nla_put_failure:
return -1;
}
static inline int
ctnetlink_exp_dump_mask(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple_mask *mask)
{
int ret;
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple m;
struct nlattr *nest_parms;
memset(&m, 0xFF, sizeof(m));
memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
m.src.u.all = mask->src.u.all;
m.dst.protonum = tuple->dst.protonum;
nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
rcu_read_lock();
l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
if (ret >= 0) {
l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
tuple->dst.protonum);
ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
netfilter: ctnetlink: fix lockep splats net/netfilter/nf_conntrack_proto.c:70 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 3 locks held by conntrack/3235: nfnl_lock+0x17/0x20 netlink_dump+0x32/0x240 ctnetlink_dump_table+0x3e/0x170 [nf_conntrack_netlink] stack backtrace: Pid: 3235, comm: conntrack Tainted: G W 3.2.0+ #511 Call Trace: [<ffffffff8108ce45>] lockdep_rcu_suspicious+0xe5/0x100 [<ffffffffa00ec6e1>] __nf_ct_l4proto_find+0x81/0xb0 [nf_conntrack] [<ffffffffa0115675>] ctnetlink_fill_info+0x215/0x5f0 [nf_conntrack_netlink] [<ffffffffa0115dc1>] ctnetlink_dump_table+0xd1/0x170 [nf_conntrack_netlink] [<ffffffff815fbdbf>] netlink_dump+0x7f/0x240 [<ffffffff81090f9d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff815fd34f>] netlink_dump_start+0xdf/0x190 [<ffffffffa0111490>] ? ctnetlink_change_nat_seq_adj+0x160/0x160 [nf_conntrack_netlink] [<ffffffffa0115cf0>] ? ctnetlink_get_conntrack+0x2a0/0x2a0 [nf_conntrack_netlink] [<ffffffffa0115ad9>] ctnetlink_get_conntrack+0x89/0x2a0 [nf_conntrack_netlink] [<ffffffff81603a47>] nfnetlink_rcv_msg+0x467/0x5f0 [<ffffffff81603a7c>] ? nfnetlink_rcv_msg+0x49c/0x5f0 [<ffffffff81603922>] ? nfnetlink_rcv_msg+0x342/0x5f0 [<ffffffff81071b21>] ? get_parent_ip+0x11/0x50 [<ffffffff816035e0>] ? nfnetlink_subsys_register+0x60/0x60 [<ffffffff815fed49>] netlink_rcv_skb+0xa9/0xd0 [<ffffffff81603475>] nfnetlink_rcv+0x15/0x20 [<ffffffff815fe70e>] netlink_unicast+0x1ae/0x1f0 [<ffffffff815fea16>] netlink_sendmsg+0x2c6/0x320 [<ffffffff815b2a87>] sock_sendmsg+0x117/0x130 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff811250dc>] ? might_fault+0x9c/0xb0 [<ffffffff81125093>] ? might_fault+0x53/0xb0 [<ffffffff815b5991>] ? move_addr_to_kernel+0x71/0x80 [<ffffffff815b644e>] sys_sendto+0xfe/0x130 [<ffffffff815b5c94>] ? sys_bind+0xb4/0xd0 [<ffffffff817a8a0e>] ? retint_swapgs+0xe/0x13 [<ffffffff817afcd2>] system_call_fastpath+0x16/0x1b Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
2012-03-05 02:24:29 +00:00
}
rcu_read_unlock();
if (unlikely(ret < 0))
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
return 0;
nla_put_failure:
return -1;
}
static const union nf_inet_addr any_addr;
[NETFILTER]: Kill some supper dupper bloatry /me awards the bloatiest-of-all-net/-.c-code award to nf_conntrack_netlink.c, congratulations to all the authors :-/! Hall of (unquestionable) fame (measured per inline, top 10 under net/): -4496 ctnetlink_parse_tuple netfilter/nf_conntrack_netlink.c -2165 ctnetlink_dump_tuples netfilter/nf_conntrack_netlink.c -2115 __ip_vs_get_out_rt ipv4/ipvs/ip_vs_xmit.c -1924 xfrm_audit_helper_pktinfo xfrm/xfrm_state.c -1799 ctnetlink_parse_tuple_proto netfilter/nf_conntrack_netlink.c -1268 ctnetlink_parse_tuple_ip netfilter/nf_conntrack_netlink.c -1093 ctnetlink_exp_dump_expect netfilter/nf_conntrack_netlink.c -1060 void ccid3_update_send_interval dccp/ccids/ccid3.c -983 ctnetlink_dump_tuples_proto netfilter/nf_conntrack_netlink.c -827 ctnetlink_exp_dump_tuple netfilter/nf_conntrack_netlink.c (i386 / gcc (GCC) 4.1.2 20070626 (Red Hat 4.1.2-13) / allyesconfig except CONFIG_FORCED_INLINING) ...and I left < 200 byte gains as future work item. After iterative inline removal, I finally have this: net/netfilter/nf_conntrack_netlink.c: ctnetlink_exp_fill_info | -1104 ctnetlink_new_expect | -1572 ctnetlink_fill_info | -1303 ctnetlink_new_conntrack | -2230 ctnetlink_get_expect | -341 ctnetlink_del_expect | -352 ctnetlink_expect_event | -1110 ctnetlink_conntrack_event | -1548 ctnetlink_del_conntrack | -729 ctnetlink_get_conntrack | -728 10 functions changed, 11017 bytes removed, diff: -11017 net/netfilter/nf_conntrack_netlink.c: ctnetlink_parse_tuple | +419 dump_nat_seq_adj | +183 ctnetlink_dump_counters | +166 ctnetlink_dump_tuples | +261 ctnetlink_exp_dump_expect | +633 ctnetlink_change_status | +460 6 functions changed, 2122 bytes added, diff: +2122 net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11017 bytes removed, diff: -8895 Without a number of CONFIG.*DEBUGs, I got this: net/netfilter/nf_conntrack_netlink.o: 16 functions changed, 2122 bytes added, 11029 bytes removed, diff: -8907 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-06 07:11:31 +00:00
static int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
const struct nf_conntrack_expect *exp)
{
struct nf_conn *master = exp->master;
long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
struct nf_conn_help *help;
#ifdef CONFIG_NF_NAT_NEEDED
struct nlattr *nest_parms;
struct nf_conntrack_tuple nat_tuple = {};
#endif
struct nf_ct_helper_expectfn *expfn;
if (timeout < 0)
timeout = 0;
if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
goto nla_put_failure;
if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0)
goto nla_put_failure;
if (ctnetlink_exp_dump_tuple(skb,
&master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
CTA_EXPECT_MASTER) < 0)
goto nla_put_failure;
#ifdef CONFIG_NF_NAT_NEEDED
if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
exp->saved_proto.all) {
nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir)))
goto nla_put_failure;
nat_tuple.src.l3num = nf_ct_l3num(master);
nat_tuple.src.u3 = exp->saved_addr;
nat_tuple.dst.protonum = nf_ct_protonum(master);
nat_tuple.src.u = exp->saved_proto;
if (ctnetlink_exp_dump_tuple(skb, &nat_tuple,
CTA_EXPECT_NAT_TUPLE) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
}
#endif
if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) ||
nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
goto nla_put_failure;
help = nfct_help(master);
if (help) {
struct nf_conntrack_helper *helper;
helper = rcu_dereference(help->helper);
if (helper &&
nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name))
goto nla_put_failure;
}
expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn);
if (expfn != NULL &&
nla_put_string(skb, CTA_EXPECT_FN, expfn->name))
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
static int
ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
int event, const struct nf_conntrack_expect *exp)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = exp->tuple.src.l3num;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
nlmsg_end(skb, nlh);
return skb->len;
nlmsg_failure:
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -1;
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
{
struct nf_conntrack_expect *exp = item->exp;
struct net *net = nf_ct_exp_net(exp);
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct sk_buff *skb;
unsigned int type, group;
int flags = 0;
if (events & (1 << IPEXP_DESTROY)) {
type = IPCTNL_MSG_EXP_DELETE;
group = NFNLGRP_CONNTRACK_EXP_DESTROY;
} else if (events & (1 << IPEXP_NEW)) {
type = IPCTNL_MSG_EXP_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
group = NFNLGRP_CONNTRACK_EXP_NEW;
} else
return 0;
if (!item->report && !nfnetlink_has_listeners(net, group))
return 0;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (skb == NULL)
goto errout;
type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = exp->tuple.src.l3num;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
rcu_read_lock();
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
rcu_read_unlock();
nlmsg_end(skb, nlh);
nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC);
return 0;
nla_put_failure:
rcu_read_unlock();
nlmsg_cancel(skb, nlh);
nlmsg_failure:
kfree_skb(skb);
errout:
nfnetlink_set_err(net, 0, 0, -ENOBUFS);
return 0;
}
#endif
static int ctnetlink_exp_done(struct netlink_callback *cb)
{
if (cb->args[1])
nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
return 0;
}
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
rcu_read_lock();
last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 01:06:00 +00:00
hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
hnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (cb->args[1]) {
if (exp != last)
continue;
cb->args[1] = 0;
}
if (ctnetlink_exp_fill_info(skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
if (!atomic_inc_not_zero(&exp->use))
continue;
cb->args[1] = (unsigned long)exp;
goto out;
}
}
if (cb->args[1]) {
cb->args[1] = 0;
goto restart;
}
}
out:
rcu_read_unlock();
if (last)
nf_ct_expect_put(last);
return skb->len;
}
static int
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
struct nf_conn *ct = cb->data;
struct nf_conn_help *help = nfct_help(ct);
u_int8_t l3proto = nfmsg->nfgen_family;
if (cb->args[0])
return 0;
rcu_read_lock();
last = (struct nf_conntrack_expect *)cb->args[1];
restart:
hlist_for_each_entry(exp, &help->expectations, lnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (cb->args[1]) {
if (exp != last)
continue;
cb->args[1] = 0;
}
if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
if (!atomic_inc_not_zero(&exp->use))
continue;
cb->args[1] = (unsigned long)exp;
goto out;
}
}
if (cb->args[1]) {
cb->args[1] = 0;
goto restart;
}
cb->args[0] = 1;
out:
rcu_read_unlock();
if (last)
nf_ct_expect_put(last);
return skb->len;
}
static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
int err;
struct net *net = sock_net(ctnl);
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
u16 zone = 0;
struct netlink_dump_control c = {
.dump = ctnetlink_exp_ct_dump_table,
.done = ctnetlink_exp_done,
};
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
if (err < 0)
return err;
if (cda[CTA_EXPECT_ZONE]) {
err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
if (err < 0)
return err;
}
h = nf_conntrack_find_get(net, zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
c.data = ct;
err = netlink_dump_start(ctnl, skb, nlh, &c);
nf_ct_put(ct);
return err;
}
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
[CTA_EXPECT_MASTER] = { .type = NLA_NESTED },
[CTA_EXPECT_TUPLE] = { .type = NLA_NESTED },
[CTA_EXPECT_MASK] = { .type = NLA_NESTED },
[CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
[CTA_EXPECT_ID] = { .type = NLA_U32 },
[CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN - 1 },
[CTA_EXPECT_ZONE] = { .type = NLA_U16 },
[CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
[CTA_EXPECT_CLASS] = { .type = NLA_U32 },
[CTA_EXPECT_NAT] = { .type = NLA_NESTED },
[CTA_EXPECT_FN] = { .type = NLA_NUL_STRING },
};
static int
ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
struct sk_buff *skb2;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
u16 zone;
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (cda[CTA_EXPECT_MASTER])
return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda);
else {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_dump_table,
.done = ctnetlink_exp_done,
};
return netlink_dump_start(ctnl, skb, nlh, &c);
}
}
err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
if (err < 0)
return err;
if (cda[CTA_EXPECT_TUPLE])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
else if (cda[CTA_EXPECT_MASTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
else
return -EINVAL;
if (err < 0)
return err;
exp = nf_ct_expect_find_get(net, zone, &tuple);
if (!exp)
return -ENOENT;
if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
if (ntohl(id) != (u32)(unsigned long)exp) {
nf_ct_expect_put(exp);
return -ENOENT;
}
}
err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (skb2 == NULL) {
nf_ct_expect_put(exp);
goto out;
}
rcu_read_lock();
err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
rcu_read_unlock();
nf_ct_expect_put(exp);
if (err <= 0)
goto free;
err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
if (err < 0)
goto out;
return 0;
free:
kfree_skb(skb2);
out:
/* this avoids a loop in nfnetlink. */
return err == -EAGAIN ? -ENOBUFS : err;
}
static int
ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
struct net *net = sock_net(ctnl);
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple tuple;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 01:06:00 +00:00
struct hlist_node *next;
u_int8_t u3 = nfmsg->nfgen_family;
unsigned int i;
u16 zone;
int err;
if (cda[CTA_EXPECT_TUPLE]) {
/* delete a single expect by tuple */
err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
if (err < 0)
return err;
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
if (err < 0)
return err;
/* bump usage count to 2 */
exp = nf_ct_expect_find_get(net, zone, &tuple);
if (!exp)
return -ENOENT;
if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
if (ntohl(id) != (u32)(unsigned long)exp) {
nf_ct_expect_put(exp);
return -ENOENT;
}
}
/* after list removal, usage count == 1 */
spin_lock_bh(&nf_conntrack_lock);
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
spin_unlock_bh(&nf_conntrack_lock);
/* have to put what we 'get' above.
* after this line usage count == 0 */
nf_ct_expect_put(exp);
} else if (cda[CTA_EXPECT_HELP_NAME]) {
char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
struct nf_conn_help *m_help;
/* delete all expectations for this helper */
spin_lock_bh(&nf_conntrack_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 01:06:00 +00:00
hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i],
hnode) {
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
}
spin_unlock_bh(&nf_conntrack_lock);
} else {
/* This basically means we have to flush everything*/
spin_lock_bh(&nf_conntrack_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 01:06:00 +00:00
hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i],
hnode) {
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
NETLINK_CB(skb).portid,
nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
}
spin_unlock_bh(&nf_conntrack_lock);
}
return 0;
}
static int
ctnetlink_change_expect(struct nf_conntrack_expect *x,
const struct nlattr * const cda[])
{
if (cda[CTA_EXPECT_TIMEOUT]) {
if (!del_timer(&x->timeout))
return -ETIME;
x->timeout.expires = jiffies +
ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
add_timer(&x->timeout);
}
return 0;
}
static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
[CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 },
[CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED },
};
static int
ctnetlink_parse_expect_nat(const struct nlattr *attr,
struct nf_conntrack_expect *exp,
u_int8_t u3)
{
#ifdef CONFIG_NF_NAT_NEEDED
struct nlattr *tb[CTA_EXPECT_NAT_MAX+1];
struct nf_conntrack_tuple nat_tuple = {};
int err;
nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy);
if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE])
return -EINVAL;
err = ctnetlink_parse_tuple((const struct nlattr * const *)tb,
&nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);
if (err < 0)
return err;
exp->saved_addr = nat_tuple.src.u3;
exp->saved_proto = nat_tuple.src.u;
exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR]));
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int
ctnetlink_create_expect(struct net *net, u16 zone,
const struct nlattr * const cda[],
u_int8_t u3,
u32 portid, int report)
{
struct nf_conntrack_tuple tuple, mask, master_tuple;
struct nf_conntrack_tuple_hash *h = NULL;
struct nf_conntrack_expect *exp;
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_helper *helper = NULL;
u_int32_t class = 0;
int err = 0;
/* caller guarantees that those three CTA_EXPECT_* exist */
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
if (err < 0)
return err;
err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
if (err < 0)
return err;
err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
if (err < 0)
return err;
/* Look for master conntrack of this expectation */
h = nf_conntrack_find_get(net, zone, &master_tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
/* Look for helper of this expectation */
if (cda[CTA_EXPECT_HELP_NAME]) {
const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
nf_ct_protonum(ct));
if (helper == NULL) {
#ifdef CONFIG_MODULES
if (request_module("nfct-helper-%s", helpname) < 0) {
err = -EOPNOTSUPP;
goto out;
}
helper = __nf_conntrack_helper_find(helpname,
nf_ct_l3num(ct),
nf_ct_protonum(ct));
if (helper) {
err = -EAGAIN;
goto out;
}
#endif
err = -EOPNOTSUPP;
goto out;
}
}
if (cda[CTA_EXPECT_CLASS] && helper) {
class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS]));
if (class > helper->expect_class_max) {
err = -EINVAL;
goto out;
}
}
exp = nf_ct_expect_alloc(ct);
if (!exp) {
err = -ENOMEM;
goto out;
}
help = nfct_help(ct);
if (!help) {
if (!cda[CTA_EXPECT_TIMEOUT]) {
err = -EINVAL;
goto err_out;
}
exp->timeout.expires =
jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
exp->flags = NF_CT_EXPECT_USERSPACE;
if (cda[CTA_EXPECT_FLAGS]) {
exp->flags |=
ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
}
} else {
if (cda[CTA_EXPECT_FLAGS]) {
exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
exp->flags &= ~NF_CT_EXPECT_USERSPACE;
} else
exp->flags = 0;
}
if (cda[CTA_EXPECT_FN]) {
const char *name = nla_data(cda[CTA_EXPECT_FN]);
struct nf_ct_helper_expectfn *expfn;
expfn = nf_ct_helper_expectfn_find_by_name(name);
if (expfn == NULL) {
err = -EINVAL;
goto err_out;
}
exp->expectfn = expfn->expectfn;
} else
exp->expectfn = NULL;
exp->class = class;
exp->master = ct;
exp->helper = helper;
memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3));
exp->mask.src.u.all = mask.src.u.all;
if (cda[CTA_EXPECT_NAT]) {
err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT],
exp, u3);
if (err < 0)
goto err_out;
}
err = nf_ct_expect_related_report(exp, portid, report);
err_out:
nf_ct_expect_put(exp);
out:
nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
return err;
}
static int
ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
u16 zone;
int err;
if (!cda[CTA_EXPECT_TUPLE]
|| !cda[CTA_EXPECT_MASK]
|| !cda[CTA_EXPECT_MASTER])
return -EINVAL;
err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
if (err < 0)
return err;
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
if (err < 0)
return err;
spin_lock_bh(&nf_conntrack_lock);
exp = __nf_ct_expect_find(net, zone, &tuple);
if (!exp) {
spin_unlock_bh(&nf_conntrack_lock);
err = -ENOENT;
if (nlh->nlmsg_flags & NLM_F_CREATE) {
err = ctnetlink_create_expect(net, zone, cda,
u3,
NETLINK_CB(skb).portid,
nlmsg_report(nlh));
}
return err;
}
err = -EEXIST;
if (!(nlh->nlmsg_flags & NLM_F_EXCL))
err = ctnetlink_change_expect(exp, cda);
spin_unlock_bh(&nf_conntrack_lock);
return err;
}
static int
ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
if (nlh == NULL)
goto nlmsg_failure;
nfmsg = nlmsg_data(nlh);
nfmsg->nfgen_family = AF_UNSPEC;
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = htons(cpu);
if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
return skb->len;
nla_put_failure:
nlmsg_failure:
nlmsg_cancel(skb, nlh);
return -1;
}
static int
ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int cpu;
struct net *net = sock_net(skb->sk);
if (cb->args[0] == nr_cpu_ids)
return 0;
for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
const struct ip_conntrack_stat *st;
if (!cpu_possible(cpu))
continue;
st = per_cpu_ptr(net->ct.stat, cpu);
if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
cpu, st) < 0)
break;
}
cb->args[0] = cpu;
return skb->len;
}
static int
ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_stat_cpu_dump,
};
return netlink_dump_start(ctnl, skb, nlh, &c);
}
return 0;
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static struct nf_ct_event_notifier ctnl_notifier = {
.fcn = ctnetlink_conntrack_event,
};
static struct nf_exp_event_notifier ctnl_notifier_exp = {
.fcn = ctnetlink_expect_event,
};
#endif
static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
.attr_count = CTA_MAX,
.policy = ct_nla_policy },
[IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
.attr_count = CTA_MAX,
.policy = ct_nla_policy },
[IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
.attr_count = CTA_MAX,
.policy = ct_nla_policy },
[IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
.attr_count = CTA_MAX,
.policy = ct_nla_policy },
[IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu },
[IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct },
[IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying },
[IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed },
};
static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
[IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
.attr_count = CTA_EXPECT_MAX,
.policy = exp_nla_policy },
[IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
.attr_count = CTA_EXPECT_MAX,
.policy = exp_nla_policy },
[IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
.attr_count = CTA_EXPECT_MAX,
.policy = exp_nla_policy },
[IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu },
};
static const struct nfnetlink_subsystem ctnl_subsys = {
.name = "conntrack",
.subsys_id = NFNL_SUBSYS_CTNETLINK,
.cb_count = IPCTNL_MSG_MAX,
.cb = ctnl_cb,
};
static const struct nfnetlink_subsystem ctnl_exp_subsys = {
.name = "conntrack_expect",
.subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
.cb_count = IPCTNL_MSG_EXP_MAX,
.cb = ctnl_exp_cb,
};
MODULE_ALIAS("ip_conntrack_netlink");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
static int __net_init ctnetlink_net_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
int ret;
ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register notifier.\n");
goto err_out;
}
ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
if (ret < 0) {
pr_err("ctnetlink_init: cannot expect register notifier.\n");
goto err_unreg_notifier;
}
#endif
return 0;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
err_unreg_notifier:
nf_conntrack_unregister_notifier(net, &ctnl_notifier);
err_out:
return ret;
#endif
}
static void ctnetlink_net_exit(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
nf_conntrack_unregister_notifier(net, &ctnl_notifier);
#endif
}
static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
list_for_each_entry(net, net_exit_list, exit_list)
ctnetlink_net_exit(net);
netfilter: ctnetlink: netns exit must wait for callbacks [ Upstream commit 18a110b022a5c02e7dc9f6109d0bd93e58ac6ebb ] Curtis Taylor and Jon Maxwell reported and debugged a crash on 3.10 based kernel. Crash occurs in ctnetlink_conntrack_events because net->nfnl socket is NULL. The nfnl socket was set to NULL by netns destruction running on another cpu. The exiting network namespace calls the relevant destructors in the following order: 1. ctnetlink_net_exit_batch This nulls out the event callback pointer in struct netns. 2. nfnetlink_net_exit_batch This nulls net->nfnl socket and frees it. 3. nf_conntrack_cleanup_net_list This removes all remaining conntrack entries. This is order is correct. The only explanation for the crash so ar is: cpu1: conntrack is dying, eviction occurs: -> nf_ct_delete() -> nf_conntrack_event_report \ -> nf_conntrack_eventmask_report -> notify->fcn() (== ctnetlink_conntrack_events). cpu1: a. fetches rcu protected pointer to obtain ctnetlink event callback. b. gets interrupted. cpu2: runs netns exit handlers: a runs ctnetlink destructor, event cb pointer set to NULL. b runs nfnetlink destructor, nfnl socket is closed and set to NULL. cpu1: c. resumes and trips over NULL net->nfnl. Problem appears to be that ctnetlink_net_exit_batch only prevents future callers of nf_conntrack_eventmask_report() from obtaining the callback. It doesn't wait of other cpus that might have already obtained the callbacks address. I don't see anything in upstream kernels that would prevent similar crash: We need to wait for all cpus to have exited the event callback. Fixes: 9592a5c01e79dbc59eb56fa ("netfilter: ctnetlink: netns support") Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: Sasha Levin <sashal@kernel.org> Signed-off-by: Lee Jones <lee.jones@linaro.org> Change-Id: Iad93478e7bf5dc987ae5d1cd10cf261af41c313a
2019-11-15 11:39:23 +00:00
/* wait for other cpus until they are done with ctnl_notifiers */
synchronize_rcu();
}
static struct pernet_operations ctnetlink_net_ops = {
.init = ctnetlink_net_init,
.exit_batch = ctnetlink_net_exit_batch,
};
static int __init ctnetlink_init(void)
{
int ret;
pr_info("ctnetlink v%s: registering with nfnetlink.\n", version);
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
goto err_out;
}
ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n");
goto err_unreg_subsys;
}
ret = register_pernet_subsys(&ctnetlink_net_ops);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register pernet operations\n");
goto err_unreg_exp_subsys;
}
#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
/* setup interaction between nf_queue and nf_conntrack_netlink. */
RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);
#endif
return 0;
err_unreg_exp_subsys:
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
err_unreg_subsys:
nfnetlink_subsys_unregister(&ctnl_subsys);
err_out:
return ret;
}
static void __exit ctnetlink_exit(void)
{
pr_info("ctnetlink: unregistering from nfnetlink.\n");
unregister_pernet_subsys(&ctnetlink_net_ops);
nfnetlink_subsys_unregister(&ctnl_exp_subsys);
nfnetlink_subsys_unregister(&ctnl_subsys);
#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
RCU_INIT_POINTER(nfq_ct_hook, NULL);
#endif
synchronize_rcu();
}
module_init(ctnetlink_init);
module_exit(ctnetlink_exit);