android_kernel_samsung_msm8976/include/net/inet_timewait_sock.h

234 lines
6.8 KiB
C
Raw Normal View History

/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for a generic INET TIMEWAIT sock
*
* From code originally in net/tcp.h
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _INET_TIMEWAIT_SOCK_
#define _INET_TIMEWAIT_SOCK_
#include <linux/kmemcheck.h>
#include <linux/list.h>
#include <linux/timer.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <net/inet_sock.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/timewait_sock.h>
#include <linux/atomic.h>
struct inet_hashinfo;
#define INET_TWDR_RECYCLE_SLOTS_LOG 5
#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
/*
* If time > 4sec, it is "slow" path, no recycling is required,
* so that we select tick to get range about 4 seconds.
*/
#if HZ <= 16 || HZ > 4096
# error Unsupported: HZ <= 16 or HZ > 4096
#elif HZ <= 32
# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 64
# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 128
# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 256
# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 512
# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 1024
# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#elif HZ <= 2048
# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#else
# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
#endif
/* TIME_WAIT reaping mechanism. */
#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
#define INET_TWDR_TWKILL_QUOTA 100
struct inet_timewait_death_row {
/* Short-time timewait calendar */
int twcal_hand;
unsigned long twcal_jiffie;
struct timer_list twcal_timer;
struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];
spinlock_t death_lock;
int tw_count;
int period;
u32 thread_slots;
struct work_struct twkill_work;
struct timer_list tw_timer;
int slot;
struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
struct inet_hashinfo *hashinfo;
int sysctl_tw_recycle;
int sysctl_max_tw_buckets;
};
extern void inet_twdr_hangman(unsigned long data);
2006-11-22 14:55:48 +00:00
extern void inet_twdr_twkill_work(struct work_struct *work);
extern void inet_twdr_twcal_tick(unsigned long data);
struct inet_bind_bucket;
/*
* This is a TIME_WAIT sock. It works around the memory consumption
* problems of sockets in such a state on heavily loaded servers, but
* without violating the protocol specification.
*/
struct inet_timewait_sock {
/*
* Now struct sock also uses sock_common, so please just
* don't add nothing before this first member (__tw_common) --acme
*/
struct sock_common __tw_common;
#define tw_family __tw_common.skc_family
#define tw_state __tw_common.skc_state
#define tw_reuse __tw_common.skc_reuse
soreuseport: initialise timewait reuseport field commit 3099a52918937ab86ec47038ad80d377ba16c531 upstream. syzbot reported an uninit-value in inet_csk_bind_conflict() [1] It turns out we never propagated sk->sk_reuseport into timewait socket. [1] BUG: KMSAN: uninit-value in inet_csk_bind_conflict+0x5f9/0x990 net/ipv4/inet_connection_sock.c:151 CPU: 1 PID: 3589 Comm: syzkaller008242 Not tainted 4.16.0+ #82 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 inet_csk_bind_conflict+0x5f9/0x990 net/ipv4/inet_connection_sock.c:151 inet_csk_get_port+0x1d28/0x1e40 net/ipv4/inet_connection_sock.c:320 inet6_bind+0x121c/0x1820 net/ipv6/af_inet6.c:399 SYSC_bind+0x3f2/0x4b0 net/socket.c:1474 SyS_bind+0x54/0x80 net/socket.c:1460 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x4416e9 RSP: 002b:00007ffce6d15c88 EFLAGS: 00000217 ORIG_RAX: 0000000000000031 RAX: ffffffffffffffda RBX: 0100000000000000 RCX: 00000000004416e9 RDX: 000000000000001c RSI: 0000000020402000 RDI: 0000000000000004 RBP: 0000000000000000 R08: 00000000e6d15e08 R09: 00000000e6d15e08 R10: 0000000000000004 R11: 0000000000000217 R12: 0000000000009478 R13: 00000000006cd448 R14: 0000000000000000 R15: 0000000000000000 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_save_stack mm/kmsan/kmsan.c:293 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:684 __msan_chain_origin+0x69/0xc0 mm/kmsan/kmsan_instr.c:521 tcp_time_wait+0xf17/0xf50 net/ipv4/tcp_minisocks.c:283 tcp_rcv_state_process+0xebe/0x6490 net/ipv4/tcp_input.c:6003 tcp_v6_do_rcv+0x11dd/0x1d90 net/ipv6/tcp_ipv6.c:1331 sk_backlog_rcv include/net/sock.h:908 [inline] __release_sock+0x2d6/0x680 net/core/sock.c:2271 release_sock+0x97/0x2a0 net/core/sock.c:2786 tcp_close+0x277/0x18f0 net/ipv4/tcp.c:2269 inet_release+0x240/0x2a0 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:435 sock_release net/socket.c:595 [inline] sock_close+0xe0/0x300 net/socket.c:1149 __fput+0x49e/0xa10 fs/file_table.c:209 ____fput+0x37/0x40 fs/file_table.c:243 task_work_run+0x243/0x2c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x10e1/0x38d0 kernel/exit.c:867 do_group_exit+0x1a0/0x360 kernel/exit.c:970 SYSC_exit_group+0x21/0x30 kernel/exit.c:981 SyS_exit_group+0x25/0x30 kernel/exit.c:979 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_save_stack mm/kmsan/kmsan.c:293 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:684 __msan_chain_origin+0x69/0xc0 mm/kmsan/kmsan_instr.c:521 inet_twsk_alloc+0xaef/0xc00 net/ipv4/inet_timewait_sock.c:182 tcp_time_wait+0xd9/0xf50 net/ipv4/tcp_minisocks.c:258 tcp_rcv_state_process+0xebe/0x6490 net/ipv4/tcp_input.c:6003 tcp_v6_do_rcv+0x11dd/0x1d90 net/ipv6/tcp_ipv6.c:1331 sk_backlog_rcv include/net/sock.h:908 [inline] __release_sock+0x2d6/0x680 net/core/sock.c:2271 release_sock+0x97/0x2a0 net/core/sock.c:2786 tcp_close+0x277/0x18f0 net/ipv4/tcp.c:2269 inet_release+0x240/0x2a0 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:435 sock_release net/socket.c:595 [inline] sock_close+0xe0/0x300 net/socket.c:1149 __fput+0x49e/0xa10 fs/file_table.c:209 ____fput+0x37/0x40 fs/file_table.c:243 task_work_run+0x243/0x2c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x10e1/0x38d0 kernel/exit.c:867 do_group_exit+0x1a0/0x360 kernel/exit.c:970 SYSC_exit_group+0x21/0x30 kernel/exit.c:981 SyS_exit_group+0x25/0x30 kernel/exit.c:979 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmem_cache_alloc+0xaab/0xb90 mm/slub.c:2756 inet_twsk_alloc+0x13b/0xc00 net/ipv4/inet_timewait_sock.c:163 tcp_time_wait+0xd9/0xf50 net/ipv4/tcp_minisocks.c:258 tcp_rcv_state_process+0xebe/0x6490 net/ipv4/tcp_input.c:6003 tcp_v6_do_rcv+0x11dd/0x1d90 net/ipv6/tcp_ipv6.c:1331 sk_backlog_rcv include/net/sock.h:908 [inline] __release_sock+0x2d6/0x680 net/core/sock.c:2271 release_sock+0x97/0x2a0 net/core/sock.c:2786 tcp_close+0x277/0x18f0 net/ipv4/tcp.c:2269 inet_release+0x240/0x2a0 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:435 sock_release net/socket.c:595 [inline] sock_close+0xe0/0x300 net/socket.c:1149 __fput+0x49e/0xa10 fs/file_table.c:209 ____fput+0x37/0x40 fs/file_table.c:243 task_work_run+0x243/0x2c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x10e1/0x38d0 kernel/exit.c:867 do_group_exit+0x1a0/0x360 kernel/exit.c:970 SYSC_exit_group+0x21/0x30 kernel/exit.c:981 SyS_exit_group+0x25/0x30 kernel/exit.c:979 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: da5e36308d9f ("soreuseport: TCP/IPv4 implementation") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net> [bwh: Backported to 3.16: adjust context] Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
2018-04-07 20:42:43 +00:00
#define tw_reuseport __tw_common.skc_reuseport
#define tw_bound_dev_if __tw_common.skc_bound_dev_if
#define tw_node __tw_common.skc_nulls_node
#define tw_bind_node __tw_common.skc_bind_node
#define tw_refcnt __tw_common.skc_refcnt
[INET]: speedup inet (tcp/dccp) lookups Arnaldo and I agreed it could be applied now, because I have other pending patches depending on this one (Thank you Arnaldo) (The other important patch moves skc_refcnt in a separate cache line, so that the SMP/NUMA performance doesnt suffer from cache line ping pongs) 1) First some performance data : -------------------------------- tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established() The most time critical code is : sk_for_each(sk, node, &head->chain) { if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ } The sk_for_each() does use prefetch() hints but only the begining of "struct sock" is prefetched. As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far away from the begining of "struct sock", it has to bring into CPU cache cold cache line. Each iteration has to use at least 2 cache lines. This can be problematic if some chains are very long. 2) The goal ----------- The idea I had is to change things so that INET_MATCH() may return FALSE in 99% of cases only using the data already in the CPU cache, using one cache line per iteration. 3) Description of the patch --------------------------- Adds a new 'unsigned int skc_hash' field in 'struct sock_common', filling a 32 bits hole on 64 bits platform. struct sock_common { unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse; int skc_bound_dev_if; struct hlist_node skc_node; struct hlist_node skc_bind_node; atomic_t skc_refcnt; + unsigned int skc_hash; struct proto *skc_prot; }; Store in this 32 bits field the full hash, not masked by (ehash_size - 1) Using this full hash as the first comparison done in INET_MATCH permits us immediatly skip the element without touching a second cache line in case of a miss. Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to sk_hash and tw_hash) already contains the slot number if we mask with (ehash_size - 1) File include/net/inet_hashtables.h 64 bits platforms : #define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) ((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) 32bits platforms: #define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ (((__sk)->sk_hash == (__hash)) && \ (inet_sk(__sk)->daddr == (__saddr)) && \ (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) - Adds a prefetch(head->chain.first) in __inet_lookup_established()/__tcp_v4_check_established() and __inet6_lookup_established()/__tcp_v6_check_established() and __dccp_v4_check_established() to bring into cache the first element of the list, before the {read|write}_lock(&head->lock); Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-03 21:13:38 +00:00
#define tw_hash __tw_common.skc_hash
#define tw_prot __tw_common.skc_prot
#define tw_net __tw_common.skc_net
#define tw_daddr __tw_common.skc_daddr
#define tw_rcv_saddr __tw_common.skc_rcv_saddr
net: move inet_dport/inet_num in sock_common commit 68835aba4d9b (net: optimize INET input path further) moved some fields used for tcp/udp sockets lookup in the first cache line of struct sock_common. This patch moves inet_dport/inet_num as well, filling a 32bit hole on 64 bit arches and reducing number of cache line misses in lookups. Also change INET_MATCH()/INET_TW_MATCH() to perform the ports match before addresses match, as this check is more discriminant. Remove the hash check from MATCH() macros because we dont need to re validate the hash value after taking a refcount on socket, and use likely/unlikely compiler hints, as the sk_hash/hash check makes the following conditional tests 100% predicted by cpu. Introduce skc_addrpair/skc_portpair pair values to better document the alignment requirements of the port/addr pairs used in the various MATCH() macros, and remove some casts. The namespace check can also be done at last. This slightly improves TCP/UDP lookup times. IP/TCP early demux needs inet->rx_dst_ifindex and TCP needs inet->min_ttl, lets group them together in same cache line. With help from Ben Hutchings & Joe Perches. Idea of this patch came after Ling Ma proposal to move skc_hash to the beginning of struct sock_common, and should allow him to submit a final version of his patch. My tests show an improvement doing so. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Cc: Joe Perches <joe@perches.com> Cc: Ling Ma <ling.ma.program@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-30 09:49:27 +00:00
#define tw_addrpair __tw_common.skc_addrpair
#define tw_dport __tw_common.skc_dport
#define tw_num __tw_common.skc_num
#define tw_portpair __tw_common.skc_portpair
int tw_timeout;
volatile unsigned char tw_substate;
unsigned char tw_rcv_wscale;
/* Socket demultiplex comparisons on incoming packets. */
/* these three are in inet_sock */
__be16 tw_sport;
kmemcheck_bitfield_begin(flags);
/* And these are ours. */
unsigned int tw_ipv6only : 1,
tw_transparent : 1,
tw_pad : 6, /* 6 bits hole */
tw_tos : 8,
tw_ipv6_offset : 16;
kmemcheck_bitfield_end(flags);
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node;
};
#define tw_tclass tw_tos
static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
struct hlist_nulls_head *list)
{
hlist_nulls_add_head_rcu(&tw->tw_node, list);
}
static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
struct hlist_head *list)
{
hlist_add_head(&tw->tw_bind_node, list);
}
static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
{
return !hlist_unhashed(&tw->tw_death_node);
}
static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
{
tw->tw_death_node.pprev = NULL;
}
static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
__hlist_del(&tw->tw_death_node);
inet_twsk_dead_node_init(tw);
}
static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
if (inet_twsk_dead_hashed(tw)) {
__inet_twsk_del_dead_node(tw);
return 1;
}
return 0;
}
#define inet_twsk_for_each(tw, node, head) \
hlist_nulls_for_each_entry(tw, node, head, tw_node)
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 01:06:00 +00:00
#define inet_twsk_for_each_inmate(tw, jail) \
hlist_for_each_entry(tw, jail, tw_death_node)
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 01:06:00 +00:00
#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \
hlist_for_each_entry_safe(tw, safe, jail, tw_death_node)
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{
return (struct inet_timewait_sock *)sk;
}
static inline __be32 sk_rcv_saddr(const struct sock *sk)
{
/* both inet_sk() and inet_twsk() store rcv_saddr in skc_rcv_saddr */
return sk->__sk_common.skc_rcv_saddr;
}
extern void inet_twsk_put(struct inet_timewait_sock *tw);
extern int inet_twsk_unhash(struct inet_timewait_sock *tw);
extern int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo);
extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
const int state);
extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw,
struct sock *sk,
struct inet_hashinfo *hashinfo);
extern void inet_twsk_schedule(struct inet_timewait_sock *tw,
struct inet_timewait_death_row *twdr,
const int timeo, const int timewait_len);
extern void inet_twsk_deschedule(struct inet_timewait_sock *tw,
struct inet_timewait_death_row *twdr);
extern void inet_twsk_purge(struct inet_hashinfo *hashinfo,
netns : fix kernel panic in timewait socket destruction How to reproduce ? - create a network namespace - use tcp protocol and get timewait socket - exit the network namespace - after a moment (when the timewait socket is destroyed), the kernel panics. # BUG: unable to handle kernel NULL pointer dereference at 0000000000000007 IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8 PGD 119985067 PUD 11c5c0067 PMD 0 Oops: 0000 [1] SMP CPU 1 Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table] Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3 RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8 RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30 RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00 RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200 R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00 R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffff8800bff9e000, task ffff88011ff76690) Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a 0000000000000008 0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7 ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108 Call Trace: <IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e [<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e [<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193 [<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd [<ffffffff8200d08c>] ? call_softirq+0x1c/0x28 [<ffffffff8200e611>] ? do_softirq+0x2c/0x68 [<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9 [<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70 <EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b [<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7 65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0 48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00 RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8 RSP <ffff88011ff7fed0> CR2: 0000000000000007 This patch provides a function to purge all timewait sockets related to a network namespace. The timewait sockets life cycle is not tied with the network namespace, that means the timewait sockets stay alive while the network namespace dies. The timewait sockets are for avoiding to receive a duplicate packet from the network, if the network namespace is freed, the network stack is removed, so no chance to receive any packets from the outside world. Furthermore, having a pending destruction timer on these sockets with a network namespace freed is not safe and will lead to an oops if the timer callback which try to access data belonging to the namespace like for example in: inet_twdr_do_twkill_work -> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); Purging the timewait sockets at the network namespace destruction will: 1) speed up memory freeing for the namespace 2) fix kernel panic on asynchronous timewait destruction Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Acked-by: Denis V. Lunev <den@openvz.org> Acked-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-08 20:17:27 +00:00
struct inet_timewait_death_row *twdr, int family);
static inline
struct net *twsk_net(const struct inet_timewait_sock *twsk)
{
return read_pnet(&twsk->tw_net);
}
static inline
void twsk_net_set(struct inet_timewait_sock *twsk, struct net *net)
{
write_pnet(&twsk->tw_net, net);
}
#endif /* _INET_TIMEWAIT_SOCK_ */