From 0a62a7f918bee88fececa8673f6195352dcd83f5 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 26 Mar 2014 13:03:12 +0900 Subject: [PATCH] net: support marking accepting TCP sockets When using mark-based routing, sockets returned from accept() may need to be marked differently depending on the incoming connection request. This is the case, for example, if different socket marks identify different networks: a listening socket may want to accept connections from all networks, but each connection should be marked with the network that the request came in on, so that subsequent packets are sent on the correct network. This patch adds a sysctl to mark TCP sockets based on the fwmark of the incoming SYN packet. If enabled, and an unmarked socket receives a SYN, then the SYN packet's fwmark is written to the connection's inet_request_sock, and later written back to the accepted socket when the connection is established. If the socket already has a nonzero mark, then the behaviour is the same as it is today, i.e., the listening socket's fwmark is used. Black-box tested using user-mode linux: - IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the mark of the incoming SYN packet. - The socket returned by accept() is marked with the mark of the incoming SYN packet. - Tested with syncookies=1 and syncookies=2. Change-Id: I5e8c9b989762a93f3eb5a0c1b4df44f62d57f3cb Signed-off-by: Lorenzo Colitti --- include/net/inet_sock.h | 10 ++++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/inet_connection_sock.c | 6 ++++-- net/ipv4/syncookies.c | 3 ++- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/syncookies.c | 4 +++- net/ipv6/tcp_ipv6.c | 3 ++- 9 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 8cd2e1d0ec32..c80b3733d7ce 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -89,6 +89,7 @@ struct inet_request_sock { no_srccheck: 1; kmemcheck_bitfield_end(flags); struct ip_options_rcu *opt; + u32 ir_mark; }; static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) @@ -96,6 +97,15 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) return (struct inet_request_sock *)sk; } +static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb) +{ + if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) { + return skb->mark; + } else { + return sk->sk_mark; + } +} + struct inet_cork { unsigned int flags; __be32 addr; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 446db341c5b2..886942628077 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -57,6 +57,7 @@ struct netns_ipv4 { unsigned int sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; int sysctl_fwmark_reflect; + int sysctl_tcp_fwmark_accept; atomic_t rt_genid; atomic_t dev_addr_genid; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 19d66cefd7d3..454c56acdf6c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -361,7 +361,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct ip_options_rcu *opt = inet_rsk(req)->opt; struct net *net = sock_net(sk); - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, @@ -394,7 +394,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, struct rtable *rt; fl4 = &newinet->cork.fl.u.ip4; - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, @@ -616,6 +616,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; + newsk->sk_mark = inet_rsk(req)->ir_mark; + newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eab2a7fb15d1..ce1f98c51a15 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -312,6 +312,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; + ireq->ir_mark = inet_request_mark(sk, skb); ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -347,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + flowi4_init_output(&fl4, 0, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5ba9b363c7c8..ac6051d4c9d8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -794,6 +794,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_fwmark_accept", + .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0cb86ceb652f..4e9044391b46 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1350,6 +1350,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(sk, skb); + ireq->ir_mark = inet_request_mark(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 02dd203d9eac..36fe742329fb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -69,7 +69,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, final_p = fl6_update_dst(&fl6, np->opt, &final); fl6.saddr = treq->loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = inet_rsk(req)->ir_mark; fl6.fl6_dport = inet_rsk(req)->rmt_port; fl6.fl6_sport = inet_rsk(req)->loc_port; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 8e951d8d3b81..ca7d749f2bf0 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -215,6 +215,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq6->iif = inet6_iif(skb); + ireq->ir_mark = inet_request_mark(sk, skb); + req->expires = 0UL; req->retrans = 0; ireq->ecn_ok = ecn_ok; @@ -241,7 +243,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) final_p = fl6_update_dst(&fl6, np->opt, &final); fl6.saddr = ireq6->loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = inet_rsk(req)->ir_mark; fl6.fl6_dport = inet_rsk(req)->rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 035812b2428b..b9baa5627b8d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -493,7 +493,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, fl6.saddr = treq->loc_addr; fl6.flowlabel = 0; fl6.flowi6_oif = treq->iif; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = inet_rsk(req)->ir_mark; fl6.fl6_dport = inet_rsk(req)->rmt_port; fl6.fl6_sport = inet_rsk(req)->loc_port; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); @@ -1144,6 +1144,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) TCP_ECN_create_request(req, tcp_hdr(skb)); treq->iif = sk->sk_bound_dev_if; + inet_rsk(req)->ir_mark = inet_request_mark(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if &&