android_kernel_samsung_msm8976/net/netfilter/ipvs/ip_vs_ctl.c


/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the NetFilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mutex.h>
#include <net/net_namespace.h>
#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_route.h>
#endif
#include <net/route.h>
#include <net/sock.h>
#include <net/genetlink.h>
#include <asm/uaccess.h>
#include <net/ip_vs.h>
/* Mutex for IPVS sockopts; [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
/* sysctl variables */
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;
int ip_vs_get_debug_level(void)
{
return sysctl_ip_vs_debug_level;
}
#endif
/* Protos */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c; is there a better way? */
static bool __ip_vs_addr_is_local_v6(struct net *net,
const struct in6_addr *addr)
{
struct flowi6 fl6 = {
.daddr = *addr,
};
struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
bool is_local;
is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
dst_release(dst);
return is_local;
}
#endif
#ifdef CONFIG_SYSCTL
/*
* update_defense_level is called from keventd and from sysctl,
* so it needs to protect itself from softirqs
*/
static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
static int old_secure_tcp = 0;
int availmem;
int nomem;
int to_change = -1;
/* we only count free and buffered memory (in pages) */
si_meminfo(&i);
availmem = i.freeram + i.bufferram;
/* however in linux 2.5 the i.bufferram is total page cache size,
   we need to adjust it */
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
nomem = (availmem < ipvs->sysctl_amemthresh);
local_bh_disable();
/* drop_entry */
spin_lock(&ipvs->dropentry_lock);
switch (ipvs->sysctl_drop_entry) {
case 0:
atomic_set(&ipvs->dropentry, 0);
break;
case 1:
if (nomem) {
atomic_set(&ipvs->dropentry, 1);
ipvs->sysctl_drop_entry = 2;
} else {
atomic_set(&ipvs->dropentry, 0);
}
break;
case 2:
if (nomem) {
atomic_set(&ipvs->dropentry, 1);
} else {
atomic_set(&ipvs->dropentry, 0);
ipvs->sysctl_drop_entry = 1;
}
break;
case 3:
atomic_set(&ipvs->dropentry, 1);
break;
}
spin_unlock(&ipvs->dropentry_lock);
/* drop_packet */
spin_lock(&ipvs->droppacket_lock);
switch (ipvs->sysctl_drop_packet) {
case 0:
ipvs->drop_rate = 0;
break;
case 1:
if (nomem) {
ipvs->drop_rate = ipvs->drop_counter
= ipvs->sysctl_amemthresh /
(ipvs->sysctl_amemthresh-availmem);
ipvs->sysctl_drop_packet = 2;
} else {
ipvs->drop_rate = 0;
}
break;
case 2:
if (nomem) {
ipvs->drop_rate = ipvs->drop_counter
= ipvs->sysctl_amemthresh /
(ipvs->sysctl_amemthresh-availmem);
} else {
ipvs->drop_rate = 0;
ipvs->sysctl_drop_packet = 1;
}
break;
case 3:
ipvs->drop_rate = ipvs->sysctl_am_droprate;
break;
}
spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp */
spin_lock(&ipvs->securetcp_lock);
switch (ipvs->sysctl_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
to_change = 0;
break;
case 1:
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
ipvs->sysctl_secure_tcp = 2;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
}
break;
case 2:
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
if (old_secure_tcp < 2)
to_change = 1;
break;
}
old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
ip_vs_protocol_timeout_change(ipvs,
ipvs->sysctl_secure_tcp > 1);
spin_unlock(&ipvs->securetcp_lock);
local_bh_enable();
}
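/*
 * For reference, the drop_rate set above is consumed on the packet path as
 * a "drop one in every drop_rate packets" countdown; a minimal sketch of
 * that pattern (the in-tree consumer is ip_vs_todrop() in
 * include/net/ip_vs.h; this paraphrase is illustrative only):
 */
#if 0
static int todrop_sketch(struct netns_ipvs *ipvs)
{
	if (!ipvs->drop_rate)
		return 0;		/* defense inactive: keep the packet */
	if (--ipvs->drop_counter > 0)
		return 0;		/* not this packet's turn */
	ipvs->drop_counter = ipvs->drop_rate;
	return 1;			/* drop this packet */
}
#endif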
/*
* Timer for checking the defense
*/
#define DEFENSE_TIMER_PERIOD 1*HZ
static void defense_work_handler(struct work_struct *work)
{
struct netns_ipvs *ipvs =
container_of(work, struct netns_ipvs, defense_work.work);
update_defense_level(ipvs);
if (atomic_read(&ipvs->dropentry))
ip_vs_random_dropentry(ipvs->net);
schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
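/*
 * The delayed work above re-arms itself; a minimal sketch of how it is
 * armed the first time, assuming the usual per-netns control init path
 * (ip_vs_control_net_init_sysctl() later in this file):
 */
#if 0
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
#endif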
#endif
int
ip_vs_use_count_inc(void)
{
return try_module_get(THIS_MODULE);
}
void
ip_vs_use_count_dec(void)
{
module_put(THIS_MODULE);
}
/*
* Hash table: for virtual service lookups
*/
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
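/* With IP_VS_SVC_TAB_BITS == 8, each of the two service tables below has
 * 1 << 8 == 256 buckets and IP_VS_SVC_TAB_MASK is 0xff. */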
/* the service table hashed by <protocol, addr, port> */
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
* Returns hash value for virtual service
*/
static inline unsigned int
ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
const union nf_inet_addr *addr, __be16 port)
{
register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
__u32 ahash;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
ahash = ntohl(addr_fold);
ahash ^= ((size_t) net >> 8);
return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
IP_VS_SVC_TAB_MASK;
}
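/*
 * Worked example, ignoring the per-netns fold for clarity: for a TCP
 * service on 10.0.0.1 port 80, ntohl(addr_fold) == 0x0a000001 and
 * porth == 80, so the bucket is
 * (IPPROTO_TCP ^ 0x0a000001 ^ (80 >> 8) ^ 80) & 0xff == 0x57.
 */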
/*
* Returns hash value of fwmark for virtual service lookup
*/
static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
{
return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
/*
* Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
* or in the ip_vs_svc_fwm_table by fwmark.
* Should be called with locked tables.
*/
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
unsigned int hash;
if (svc->flags & IP_VS_SVC_F_HASHED) {
pr_err("%s(): request for already hashed, called from %pF\n",
__func__, __builtin_return_address(0));
return 0;
}
if (svc->fwmark == 0) {
/*
* Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
*/
hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
&svc->addr, svc->port);
hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
* Hash it by fwmark in svc_fwm_table
*/
hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
svc->flags |= IP_VS_SVC_F_HASHED;
/* increase its refcnt because it is referenced by the svc table */
atomic_inc(&svc->refcnt);
return 1;
}
/*
* Unhashes a service from svc_table / svc_fwm_table.
* Should be called with locked tables.
*/
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
pr_err("%s(): request for unhash flagged, called from %pF\n",
__func__, __builtin_return_address(0));
return 0;
}
if (svc->fwmark == 0) {
/* Remove it from the svc_table table */
hlist_del_rcu(&svc->s_list);
} else {
/* Remove it from the svc_fwm_table table */
hlist_del_rcu(&svc->f_list);
}
svc->flags &= ~IP_VS_SVC_F_HASHED;
atomic_dec(&svc->refcnt);
return 1;
}
/*
* Get service by {netns, proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
__ip_vs_service_find(struct net *net, int af, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
unsigned int hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
&& (svc->protocol == protocol)
&& net_eq(svc->net, net)) {
/* HIT */
return svc;
}
}
return NULL;
}
/*
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
{
unsigned int hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
hash = ip_vs_svc_fwm_hashkey(net, fwmark);
hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af
&& net_eq(svc->net, net)) {
/* HIT */
return svc;
}
}
return NULL;
}
/* Find service, called under RCU lock */
struct ip_vs_service *
ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
struct netns_ipvs *ipvs = net_ipvs(net);
/*
* Check the table hashed by fwmark first
*/
if (fwmark) {
svc = __ip_vs_svc_fwm_find(net, af, fwmark);
if (svc)
goto out;
}
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
&& atomic_read(&ipvs->ftpsvc_counter)
&& (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
&& atomic_read(&ipvs->nullsvc_counter)) {
/*
* Check if the catch-all port (port zero) exists
*/
svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
}
out:
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
svc ? "hit" : "not hit");
return svc;
}
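/*
 * A minimal caller sketch (illustrative, not built): the lookup must run
 * under rcu_read_lock() and the returned svc stays valid only inside
 * that read-side section unless a reference is taken.
 */
#if 0
	union nf_inet_addr vaddr = { .ip = htonl(0x0a000001) }; /* 10.0.0.1 */
	__be16 vport = htons(80);
	struct ip_vs_service *svc;

	rcu_read_lock();
	svc = ip_vs_service_find(net, AF_INET, 0 /* fwmark */, IPPROTO_TCP,
				 &vaddr, vport);
	if (svc)
		use_service(svc);	/* use_service() is hypothetical */
	rcu_read_unlock();
#endif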
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
atomic_inc(&svc->refcnt);
dest->svc = svc;
}
static void ip_vs_service_free(struct ip_vs_service *svc)
{
if (svc->stats.cpustats)
free_percpu(svc->stats.cpustats);
kfree(svc);
}
static void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = dest->svc;
dest->svc = NULL;
if (atomic_dec_and_test(&svc->refcnt)) {
IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
ip_vs_service_free(svc);
}
}
/*
* Returns hash value for real service
*/
static inline unsigned int ip_vs_rs_hashkey(int af,
const union nf_inet_addr *addr,
__be16 port)
{
register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
& IP_VS_RTAB_MASK;
}
/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned int hash;
if (dest->in_rs_table)
return;
/*
* Hash by proto,addr,port,
* which are the parameters of the real service.
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
dest->in_rs_table = 1;
}
/* Unhash ip_vs_dest from rs_table. */
static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
* Remove it from the rs_table table.
*/
if (dest->in_rs_table) {
hlist_del_rcu(&dest->d_list);
dest->in_rs_table = 0;
}
}
/* Check if real service by <proto,addr,port> is present */
bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
const union nf_inet_addr *daddr, __be16 dport)
{
struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash;
struct ip_vs_dest *dest;
/* Check for "full" addressed entries */
hash = ip_vs_rs_hashkey(af, daddr, dport);
rcu_read_lock();
hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
(dest->protocol == protocol || dest->vfwmark)) {
/* HIT */
rcu_read_unlock();
return true;
}
}
rcu_read_unlock();
return false;
}
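/*
 * Note: this predicate lets the output path recognize reply traffic
 * coming from a configured real server; it takes and drops the RCU read
 * lock itself, so callers need no extra locking.
 */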
/* Lookup destination by {addr,port} in the given service
* Called under RCU lock.
*/
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
struct ip_vs_dest *dest;
/*
* Find the destination for the given service
*/
list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->af == svc->af)
&& ip_vs_addr_equal(svc->af, &dest->addr, daddr)
&& (dest->port == dport)) {
/* HIT */
return dest;
}
}
return NULL;
}
/*
* Find destination by {daddr,dport,vaddr,protocol}
* Created to be used in ip_vs_process_message() in
* the backup synchronization daemon. It finds the
* destination to be bound to the received connection
* on the backup.
* Called under RCU lock, no refcnt is returned.
*/
struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
__be16 vport, __u16 protocol, __u32 fwmark,
__u32 flags)
{
struct ip_vs_dest *dest;
struct ip_vs_service *svc;
__be16 port = dport;
svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
port = 0;
dest = ip_vs_lookup_dest(svc, daddr, port);
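/* Second-chance lookup: when port was zeroed above, port ^ dport equals
 * dport, so the retry uses the real destination port; when port == dport,
 * the retry uses the wildcard port 0 instead. */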
if (!dest)
dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
return dest;
}
void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
struct ip_vs_dest_dst *dest_dst = container_of(head,
struct ip_vs_dest_dst,
rcu_head);
dst_release(dest_dst->dst_cache);
kfree(dest_dst);
}
/* Release dest_dst and dst_cache for dest in user context */
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
struct ip_vs_dest_dst *old;
old = rcu_dereference_protected(dest->dest_dst, 1);
if (old) {
RCU_INIT_POINTER(dest->dest_dst, NULL);
call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
}
/*
* Lookup dest by {svc,addr,port} in the destination trash.
* The destination trash is used to hold the destinations that are removed
* from the service table but are still referenced by some conn entries.
* The reason for the destination trash is that when a dest is temporarily
* down (either by the administrator or by a monitor program), it can be
* picked back from the trash, the remaining connections to the dest can
* continue, and the counting information of the dest is also useful for
* scheduling.
*/
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
struct ip_vs_dest *dest;
struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
spin_lock_bh(&ipvs->dest_trash_lock);
list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
/* We can not reuse dest while in grace period
* because conns still can use dest->svc
*/
if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
continue;
if (dest->af == svc->af &&
ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
dest->port == dport &&
dest->vfwmark == svc->fwmark &&
dest->protocol == svc->protocol &&
(svc->fwmark ||
(ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
dest->vport == svc->port))) {
/* HIT */
list_del(&dest->t_list);
ip_vs_dest_hold(dest);
goto out;
}
}
dest = NULL;
out:
spin_unlock_bh(&ipvs->dest_trash_lock);
return dest;
}
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
__ip_vs_dst_cache_reset(dest);
__ip_vs_unbind_svc(dest);
free_percpu(dest->stats.cpustats);
kfree(dest);
}
/*
* Clean up all the destinations in the trash
* Called by the ip_vs_control_cleanup()
*
* When ip_vs_control_cleanup() is activated by ipvs module exit,
* the service tables must have been flushed and all the connections
* are expired, and the refcnt of each destination in the trash must
* be 0, so we simply release them here.
*/
static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
struct netns_ipvs *ipvs = net_ipvs(net);
del_timer_sync(&ipvs->dest_trash_timer);
/* No need to use dest_trash_lock */
list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
list_del(&dest->t_list);
ip_vs_dest_free(dest);
}
}
static void
ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
spin_lock_bh(&src->lock);
IP_VS_SHOW_STATS_COUNTER(conns);
IP_VS_SHOW_STATS_COUNTER(inpkts);
IP_VS_SHOW_STATS_COUNTER(outpkts);
IP_VS_SHOW_STATS_COUNTER(inbytes);
IP_VS_SHOW_STATS_COUNTER(outbytes);
ip_vs_read_estimator(dst, src);
spin_unlock_bh(&src->lock);
}
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
spin_lock_bh(&stats->lock);
/* get current counters as zero point, rates are zeroed */
#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
IP_VS_ZERO_STATS_COUNTER(conns);
IP_VS_ZERO_STATS_COUNTER(inpkts);
IP_VS_ZERO_STATS_COUNTER(outpkts);
IP_VS_ZERO_STATS_COUNTER(inbytes);
IP_VS_ZERO_STATS_COUNTER(outbytes);
ip_vs_zero_estimator(stats);
spin_unlock_bh(&stats->lock);
}
/*
* Update a destination in the given service
*/
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
struct ip_vs_scheduler *sched;
int conn_flags;
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
ip_vs_rs_hash(ipvs, dest);
}
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
if (!dest->svc) {
__ip_vs_bind_svc(dest, svc);
} else {
if (dest->svc != svc) {
__ip_vs_unbind_svc(dest);
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
}
}
/* set the dest status flags */
dest->flags |= IP_VS_DEST_F_AVAILABLE;
if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
spin_lock_bh(&dest->dst_lock);
__ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
sched = rcu_dereference_protected(svc->scheduler, 1);
if (add) {
ip_vs_start_estimator(svc->net, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
if (sched->add_dest)
sched->add_dest(svc, dest);
} else {
if (sched->upd_dest)
sched->upd_dest(svc, dest);
}
}
/*
* Create a destination for the given service
*/
static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
struct ip_vs_dest **dest_p)
{
struct ip_vs_dest *dest;
unsigned int atype;
EnterFunction(2);
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6) {
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
!__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
return -EINVAL;
} else
#endif
{
atype = inet_addr_type(svc->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
if (dest == NULL)
return -ENOMEM;
dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!dest->stats.cpustats)
goto err_alloc;
dest->af = svc->af;
dest->protocol = svc->protocol;
dest->vaddr = svc->addr;
dest->vport = svc->port;
dest->vfwmark = svc->fwmark;
ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
dest->port = udest->port;
atomic_set(&dest->activeconns, 0);
atomic_set(&dest->inactconns, 0);
atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 1);
INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
*dest_p = dest;
LeaveFunction(2);
return 0;
err_alloc:
kfree(dest);
return -ENOMEM;
}
/*
* Add a destination into an existing service
*/
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
union nf_inet_addr daddr;
__be16 dport = udest->port;
int ret;
EnterFunction(2);
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
}
if (udest->l_threshold > udest->u_threshold) {
pr_err("%s(): lower threshold is higher than upper threshold\n",
__func__);
return -ERANGE;
}
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
return -EEXIST;
}
/*
* Check if the dest already exists in the trash and
* is from the same service
*/
dest = ip_vs_trash_get_dest(svc, &daddr, dport);
if (dest != NULL) {
IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
"dest->refcnt=%d, service %u/%s:%u\n",
IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
ret = ip_vs_new_dest(svc, udest, &dest);
}
LeaveFunction(2);
return ret;
}
/*
* Edit a destination in the given service
*/
static int
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
union nf_inet_addr daddr;
__be16 dport = udest->port;
EnterFunction(2);
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
}
if (udest->l_threshold > udest->u_threshold) {
pr_err("%s(): lower threshold is higher than upper threshold\n",
__func__);
return -ERANGE;
}
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
return -ENOENT;
}
__ip_vs_update_dest(svc, dest, udest, 0);
LeaveFunction(2);
return 0;
}
static void ip_vs_dest_wait_readers(struct rcu_head *head)
{
struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
rcu_head);
/* End of grace period after unlinking */
clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
}
/*
* Delete a destination (must be already unlinked from the service)
*/
static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
bool cleanup)
{
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_stop_estimator(net, &dest->stats);
/*
* Remove it from the d-linked list with the real services.
*/
ip_vs_rs_unhash(dest);
if (!cleanup) {
set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
}
spin_lock_bh(&ipvs->dest_trash_lock);
IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
if (list_empty(&ipvs->dest_trash) && !cleanup)
mod_timer(&ipvs->dest_trash_timer,
jiffies + IP_VS_DEST_TRASH_PERIOD);
/* dest lives in trash without reference */
list_add(&dest->t_list, &ipvs->dest_trash);
spin_unlock_bh(&ipvs->dest_trash_lock);
ip_vs_dest_put(dest);
}
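/*
 * Summary of the teardown above: the dest is unhashed from rs_table, an
 * RCU callback (ip_vs_dest_wait_readers) clears the REMOVING bit after
 * the grace period, and the entry then sits in dest_trash without a
 * reference until ip_vs_dest_trash_expire() frees it once its refcnt
 * drops to zero.
 */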
/*
* Unlink a destination from the given service
*/
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
int svcupd)
{
dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
/*
* Remove it from the d-linked destination list.
*/
list_del_rcu(&dest->n_list);
svc->num_dests--;
if (svcupd) {
struct ip_vs_scheduler *sched;
sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched->del_dest)
sched->del_dest(svc, dest);
}
}
/*
* Delete a destination server in the given service
*/
static int
ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
__be16 dport = udest->port;
EnterFunction(2);
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
return -ENOENT;
}
/*
* Unlink dest from the service
*/
__ip_vs_unlink_dest(svc, dest, 1);
/*
* Delete the destination
*/
__ip_vs_del_dest(svc->net, dest, false);
LeaveFunction(2);
return 0;
}
static void ip_vs_dest_trash_expire(unsigned long data)
{
struct net *net = (struct net *) data;
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_dest *dest, *next;
spin_lock(&ipvs->dest_trash_lock);
list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
/* Skip if dest is in grace period */
if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
continue;
if (atomic_read(&dest->refcnt) > 0)
continue;
IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
dest->vfwmark,
IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
ntohs(dest->port));
list_del(&dest->t_list);
ip_vs_dest_free(dest);
}
if (!list_empty(&ipvs->dest_trash))
mod_timer(&ipvs->dest_trash_timer,
jiffies + IP_VS_DEST_TRASH_PERIOD);
spin_unlock(&ipvs->dest_trash_lock);
}
/*
* Add a service into the service hash table
*/
static int
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
/* Lookup the scheduler by 'u->sched_name' */
sched = ip_vs_scheduler_get(u->sched_name);
if (sched == NULL) {
pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
ret = -ENOENT;
goto out_err;
}
if (u->pe_name && *u->pe_name) {
pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
ret = -ENOENT;
goto out_err;
}
}
#ifdef CONFIG_IP_VS_IPV6
if (u->af == AF_INET6) {
__u32 plen = (__force __u32) u->netmask;
if (plen < 1 || plen > 128) {
ret = -EINVAL;
goto out_err;
}
}
#endif
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
goto out_err;
}
svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!svc->stats.cpustats) {
ret = -ENOMEM;
goto out_err;
}
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
svc->protocol = u->protocol;
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
ret = ip_vs_bind_scheduler(svc, sched);
if (ret)
goto out_err;
sched = NULL;
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
ip_vs_start_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
*svc_p = svc;
/* Now there is a service - full throttle */
ipvs->enable = 1;
return 0;
out_err:
if (svc != NULL) {
ip_vs_unbind_scheduler(svc, sched);
ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
/* decrease the module use count */
ip_vs_use_count_dec();
return ret;
}
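/*
 * A minimal sketch of driving ip_vs_add_service() (illustrative only;
 * the real callers fill ip_vs_service_user_kern from the sockopt and
 * netlink interfaces later in this file, under __ip_vs_mutex):
 */
#if 0
	struct ip_vs_service_user_kern u = {
		.af		= AF_INET,
		.protocol	= IPPROTO_TCP,
		.port		= htons(80),
		.sched_name	= "wlc",
		.timeout	= 300,			/* seconds */
		.netmask	= htonl(0xffffffff),
	};
	struct ip_vs_service *svc;
	int err;

	u.addr.ip = htonl(0x0a000001);			/* 10.0.0.1 */
	err = ip_vs_add_service(net, &u, &svc);
#endif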
/*
* Edit a service and bind it with a new scheduler
*/
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
struct ip_vs_scheduler *sched, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
/*
* Lookup the scheduler, by 'u->sched_name'
*/
sched = ip_vs_scheduler_get(u->sched_name);
if (sched == NULL) {
pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
return -ENOENT;
}
old_sched = sched;
if (u->pe_name && *u->pe_name) {
pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
ret = -ENOENT;
goto out;
}
old_pe = pe;
}
#ifdef CONFIG_IP_VS_IPV6
if (u->af == AF_INET6) {
__u32 plen = (__force __u32) u->netmask;
if (plen < 1 || plen > 128) {
ret = -EINVAL;
goto out;
}
}
#endif
old_sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched != old_sched) {
/* Bind the new scheduler */
ret = ip_vs_bind_scheduler(svc, sched);
if (ret) {
old_sched = sched;
goto out;
}
/* Unbind the old scheduler on success */
ip_vs_unbind_scheduler(svc, old_sched);
}
/*
* Set the flags and timeout value
*/
svc->flags = u->flags | IP_VS_SVC_F_HASHED;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
old_pe = rcu_dereference_protected(svc->pe, 1);
if (pe != old_pe)
rcu_assign_pointer(svc->pe, pe);
out:
ip_vs_scheduler_put(old_sched);
ip_vs_pe_put(old_pe);
return ret;
}
static void ip_vs_service_rcu_free(struct rcu_head *head)
{
struct ip_vs_service *svc;
svc = container_of(head, struct ip_vs_service, rcu_head);
ip_vs_service_free(svc);
}
/*
* Delete a service from the service list
* - The service must be unlinked, unlocked and not referenced!
* - We are called under _bh lock
*/
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
struct ip_vs_pe *old_pe;
struct netns_ipvs *ipvs = net_ipvs(svc->net);
pr_info("%s: enter\n", __func__);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services--;
ip_vs_stop_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
old_sched = rcu_dereference_protected(svc->scheduler, 1);
ip_vs_unbind_scheduler(svc, old_sched);
ip_vs_scheduler_put(old_sched);
/* Unbind persistence engine, keep svc->pe */
old_pe = rcu_dereference_protected(svc->pe, 1);
ip_vs_pe_put(old_pe);
/*
* Unlink the whole destination list
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
__ip_vs_del_dest(svc->net, dest, cleanup);
}
/*
* Update the virtual service counters
*/
if (svc->port == FTPPORT)
atomic_dec(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_dec(&ipvs->nullsvc_counter);
/*
* Free the service if nobody refers to it
*/
if (atomic_dec_and_test(&svc->refcnt)) {
IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}
/* decrease the module use count */
ip_vs_use_count_dec();
}
/*
* Unlink a service from list and try to delete it if its refcnt reached 0
*/
static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
/* Hold svc to avoid double release from dest_trash */
atomic_inc(&svc->refcnt);
/*
* Unhash it from the service table
*/
ip_vs_svc_unhash(svc);
__ip_vs_del_service(svc, cleanup);
}
/*
* Delete a service from the service list
*/
static int ip_vs_del_service(struct ip_vs_service *svc)
{
if (svc == NULL)
return -EEXIST;
ip_vs_unlink_service(svc, false);
return 0;
}
/*
* Flush all the virtual services
*/
static int ip_vs_flush(struct net *net, bool cleanup)
{
int idx;
struct ip_vs_service *svc;
struct hlist_node *n;
/*
* Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
s_list) {
if (net_eq(svc->net, net))
ip_vs_unlink_service(svc, cleanup);
}
}
/*
* Flush the service table hashed by fwmark
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
f_list) {
if (net_eq(svc->net, net))
ip_vs_unlink_service(svc, cleanup);
}
}
return 0;
}
/*
* Delete service by {netns} in the service table.
* Called by __ip_vs_cleanup()
*/
void ip_vs_service_net_cleanup(struct net *net)
{
EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
ip_vs_flush(net, true);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
/* Put all references for device (dst_cache) */
static inline void
ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
struct ip_vs_dest_dst *dest_dst;
spin_lock_bh(&dest->dst_lock);
dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
if (dest_dst && dest_dst->dst_cache->dev == dev) {
IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
dev->name,
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
__ip_vs_dst_cache_reset(dest);
}
spin_unlock_bh(&dest->dst_lock);
}
/* Netdev event receiver
* Currently only NETDEV_DOWN is handled to release refs to cached dsts
*/
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = ptr;
struct net *net = dev_net(dev);
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
unsigned int idx;
if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
}
}
}
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
}
}
}
}
spin_lock_bh(&ipvs->dest_trash_lock);
list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
ip_vs_forget_dev(dest, dev);
}
spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
return NOTIFY_DONE;
}
/*
* Zero counters in a service or all services
*/
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
list_for_each_entry(dest, &svc->destinations, n_list) {
ip_vs_zero_stats(&dest->stats);
}
ip_vs_zero_stats(&svc->stats);
return 0;
}
static int ip_vs_zero_all(struct net *net)
{
int idx;
struct ip_vs_service *svc;
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
}
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
}
ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
return 0;
}
#ifdef CONFIG_SYSCTL
static int zero;
static int three = 3;
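/*
 * Handler for the defense-mode sysctls (drop_entry, drop_packet,
 * secure_tcp).  Only values 0..3 are accepted; anything else is
 * silently reverted to the previous value.  On a valid change the
 * defense level is recomputed immediately instead of waiting for
 * the periodic timer.
 */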
static int
proc_do_defense_mode(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = current->nsproxy->net_ns;
int *valp = table->data;
int val = *valp;
int rc;
rc = proc_dointvec(table, write, buffer, lenp, ppos);
if (write && (*valp != val)) {
if ((*valp < 0) || (*valp > 3)) {
/* Restore the correct value */
*valp = val;
} else {
update_defense_level(net_ipvs(net));
}
}
return rc;
}
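/*
 * Handler for the sync_threshold sysctl, a pair of ints
 * (threshold, period).  A write is reverted to the old pair when
 * either value is negative, or when threshold >= period while the
 * period is non-zero.
 */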
static int
proc_do_sync_threshold(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val[2];
int rc;
/* backup the value first */
memcpy(val, valp, sizeof(val));
rc = proc_dointvec(table, write, buffer, lenp, ppos);
if (write && (valp[0] < 0 || valp[1] < 0 ||
(valp[0] >= valp[1] && valp[1]))) {
/* Restore the correct value */
memcpy(valp, val, sizeof(val));
}
return rc;
}
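/*
 * Handler for the sync_version sysctl: only the values 0 and 1 are
 * accepted, other writes restore the previous value.
 */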
static int
proc_do_sync_mode(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val = *valp;
int rc;
rc = proc_dointvec(table, write, buffer, lenp, ppos);
if (write && (*valp != val)) {
if ((*valp < 0) || (*valp > 1)) {
/* Restore the correct value */
*valp = val;
}
}
return rc;
}
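/*
 * Handler for the sync_ports sysctl: the number of sync sockets must
 * be at least 1 and a power of two, otherwise the write is reverted.
 */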
static int
proc_do_sync_ports(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val = *valp;
int rc;
rc = proc_dointvec(table, write, buffer, lenp, ppos);
if (write && (*valp != val)) {
if (*valp < 1 || !is_power_of_2(*valp)) {
/* Restore the correct value */
*valp = val;
}
}
return rc;
}
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
* aligning with netns init in ip_vs_control_net_init()
*/
static struct ctl_table vs_vars[] = {
{
.procname = "amemthresh",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "am_droprate",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "drop_entry",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "drop_packet",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
#ifdef CONFIG_IP_VS_NFCT
{
.procname = "conntrack",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
{
.procname = "secure_tcp",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "snat_reroute",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.procname = "sync_version",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_do_sync_mode,
},
{
.procname = "sync_ports",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_do_sync_ports,
},
{
.procname = "sync_qlen_max",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sync_sock_size",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "cache_bypass",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "expire_nodest_conn",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "expire_quiescent_template",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sync_threshold",
.maxlen =
sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
.mode = 0644,
.proc_handler = proc_do_sync_threshold,
},
{
.procname = "sync_refresh_period",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "sync_retries",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &three,
},
{
.procname = "nat_icmp_send",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "pmtu_disc",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "backup_only",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
.data = &sysctl_ip_vs_debug_level,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
#if 0
{
.procname = "timeout_established",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_synsent",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_synrecv",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_finwait",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_timewait",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_close",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_closewait",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_lastack",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_listen",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_synack",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_udp",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "timeout_icmp",
.data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
#endif
{ }
};
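/*
 * Illustrative note: once registered for a netns, these knobs appear
 * under /proc/sys/net/ipv4/vs/, e.g.
 *   sysctl -w net.ipv4.vs.sync_ports=8	(must be a power of 2)
 *   sysctl -w net.ipv4.vs.drop_entry=1	(defense mode, 0..3)
 * Most entries leave .data unset here; it is filled in per netns at
 * registration time (see ip_vs_control_net_init()).
 */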
#endif
#ifdef CONFIG_PROC_FS
struct ip_vs_iter {
struct seq_net_private p; /* Do not move this, netns depends upon it */
struct hlist_head *table;
int bucket;
};
/*
* Write the contents of the VS rule table to a PROCfs file.
* (It is kept just for backward compatibility)
*/
static inline const char *ip_vs_fwd_name(unsigned int flags)
{
switch (flags & IP_VS_CONN_F_FWD_MASK) {
case IP_VS_CONN_F_LOCALNODE:
return "Local";
case IP_VS_CONN_F_TUNNEL:
return "Tunnel";
case IP_VS_CONN_F_DROUTE:
return "Route";
default:
return "Masq";
}
}
/* Get the Nth entry in the two lists */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
struct net *net = seq_file_net(seq);
struct ip_vs_iter *iter = seq->private;
int idx;
struct ip_vs_service *svc;
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
return svc;
}
}
}
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
f_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
return svc;
}
}
}
return NULL;
}
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}
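/*
 * Advance the /proc/net/ip_vs iterator: finish the current hash
 * bucket, then walk the remaining buckets of the protocol table and
 * finally fall through to the fwmark table.  Runs under the
 * rcu_read_lock() taken in ip_vs_info_seq_start().
 */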
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct hlist_node *e;
struct ip_vs_iter *iter;
struct ip_vs_service *svc;
++*pos;
if (v == SEQ_START_TOKEN)
return ip_vs_info_array(seq, 0);
svc = v;
iter = seq->private;
if (iter->table == ip_vs_svc_table) {
/* next service in table hashed by protocol */
e = rcu_dereference(hlist_next_rcu(&svc->s_list));
if (e)
return hlist_entry(e, struct ip_vs_service, s_list);
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
hlist_for_each_entry_rcu(svc,
&ip_vs_svc_table[iter->bucket],
s_list) {
return svc;
}
}
iter->table = ip_vs_svc_fwm_table;
iter->bucket = -1;
goto scan_fwmark;
}
/* next service in hashed by fwmark */
e = rcu_dereference(hlist_next_rcu(&svc->f_list));
if (e)
return hlist_entry(e, struct ip_vs_service, f_list);
scan_fwmark:
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
hlist_for_each_entry_rcu(svc,
&ip_vs_svc_fwm_table[iter->bucket],
f_list)
return svc;
}
return NULL;
}
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
rcu_read_unlock();
}
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN) {
seq_printf(seq,
"IP Virtual Server version %d.%d.%d (size=%d)\n",
NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
seq_puts(seq,
"Prot LocalAddress:Port Scheduler Flags\n");
seq_puts(seq,
" -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
} else {
const struct ip_vs_service *svc = v;
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
seq_printf(seq, "%s [%pI6]:%04X %s ",
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
sched->name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET) ? "ops " : "");
} else {
seq_printf(seq, "FWM %08X %s %s",
svc->fwmark, sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET) ? "ops " : "");
}
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
seq_printf(seq, "persistent %d %08X\n",
svc->timeout,
ntohl(svc->netmask));
else
seq_putc(seq, '\n');
list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
if (dest->af == AF_INET6)
seq_printf(seq,
" -> [%pI6]:%04X"
" %-7s %-6d %-10d %-10d\n",
&dest->addr.in6,
ntohs(dest->port),
ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
atomic_read(&dest->weight),
atomic_read(&dest->activeconns),
atomic_read(&dest->inactconns));
else
#endif
seq_printf(seq,
" -> %08X:%04X "
"%-7s %-6d %-10d %-10d\n",
ntohl(dest->addr.ip),
ntohs(dest->port),
ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
atomic_read(&dest->weight),
atomic_read(&dest->activeconns),
atomic_read(&dest->inactconns));
}
}
return 0;
}
static const struct seq_operations ip_vs_info_seq_ops = {
.start = ip_vs_info_seq_start,
.next = ip_vs_info_seq_next,
.stop = ip_vs_info_seq_stop,
.show = ip_vs_info_seq_show,
};
static int ip_vs_info_open(struct inode *inode, struct file *file)
{
return seq_open_net(inode, file, &ip_vs_info_seq_ops,
sizeof(struct ip_vs_iter));
}
static const struct file_operations ip_vs_info_fops = {
.owner = THIS_MODULE,
.open = ip_vs_info_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
};
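/* Dump netns-wide totals and rate estimates (/proc/net/ip_vs_stats) */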
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
struct ip_vs_stats_user show;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Total Incoming Outgoing Incoming Outgoing\n");
seq_printf(seq,
" Conns Packets Packets Bytes Bytes\n");
ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
show.inpkts, show.outpkts,
(unsigned long long) show.inbytes,
(unsigned long long) show.outbytes);
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
seq_printf(seq, "%8X %8X %8X %16X %16X\n",
show.cps, show.inpps, show.outpps,
show.inbps, show.outbps);
return 0;
}
static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
return single_open_net(inode, file, ip_vs_stats_show);
}
static const struct file_operations ip_vs_stats_fops = {
.owner = THIS_MODULE,
.open = ip_vs_stats_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release_net,
};
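/*
 * Per-CPU variant (/proc/net/ip_vs_stats_percpu): the byte counters
 * are 64-bit, so each CPU's pair is sampled inside a u64_stats
 * seqcount retry loop to get a consistent snapshot on 32-bit machines.
 */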
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_stats_user rates;
int i;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Total Incoming Outgoing Incoming Outgoing\n");
seq_printf(seq,
"CPU Conns Packets Packets Bytes Bytes\n");
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
unsigned int start;
__u64 inbytes, outbytes;
do {
start = u64_stats_fetch_begin_bh(&u->syncp);
inbytes = u->ustats.inbytes;
outbytes = u->ustats.outbytes;
} while (u64_stats_fetch_retry_bh(&u->syncp, start));
seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
i, u->ustats.conns, u->ustats.inpkts,
u->ustats.outpkts, (__u64)inbytes,
(__u64)outbytes);
}
spin_lock_bh(&tot_stats->lock);
seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
tot_stats->ustats.conns, tot_stats->ustats.inpkts,
tot_stats->ustats.outpkts,
(unsigned long long) tot_stats->ustats.inbytes,
(unsigned long long) tot_stats->ustats.outbytes);
ip_vs_read_estimator(&rates, tot_stats);
spin_unlock_bh(&tot_stats->lock);
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
seq_printf(seq, " %8X %8X %8X %16X %16X\n",
rates.cps,
rates.inpps,
rates.outpps,
rates.inbps,
rates.outbps);
return 0;
}
static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
{
return single_open_net(inode, file, ip_vs_stats_percpu_show);
}
static const struct file_operations ip_vs_stats_percpu_fops = {
.owner = THIS_MODULE,
.open = ip_vs_stats_percpu_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release_net,
};
#endif
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
struct ip_vs_proto_data *pd;
#endif
IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
u->tcp_timeout,
u->tcp_fin_timeout,
u->udp_timeout);
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
pd->timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
#endif
return 0;
}
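/*
 * For reference (userspace side, not verified here): this is what
 * "ipvsadm --set <tcp> <tcpfin> <udp>" reaches via the
 * IP_VS_SO_SET_TIMEOUT sockopt below; a zero field leaves that
 * timeout unchanged.
 */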
#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN SVCDEST_ARG_LEN
static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
[SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
[SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
[SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
};
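/*
 * Translate the legacy (IPv4-only) sockopt structures into the
 * kernel-internal ones used by the shared add/edit/del paths.
 */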
static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
struct ip_vs_service_user *usvc_compat)
{
memset(usvc, 0, sizeof(*usvc));
usvc->af = AF_INET;
usvc->protocol = usvc_compat->protocol;
usvc->addr.ip = usvc_compat->addr;
usvc->port = usvc_compat->port;
usvc->fwmark = usvc_compat->fwmark;
/* Deep copy of sched_name is not needed here */
usvc->sched_name = usvc_compat->sched_name;
usvc->flags = usvc_compat->flags;
usvc->timeout = usvc_compat->timeout;
usvc->netmask = usvc_compat->netmask;
}
static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
struct ip_vs_dest_user *udest_compat)
{
memset(udest, 0, sizeof(*udest));
udest->addr.ip = udest_compat->addr;
udest->port = udest_compat->port;
udest->conn_flags = udest_compat->conn_flags;
udest->weight = udest_compat->weight;
udest->u_threshold = udest_compat->u_threshold;
udest->l_threshold = udest_compat->l_threshold;
}
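/*
 * Legacy setsockopt() entry point.  Checks CAP_NET_ADMIN, validates
 * the argument length for the command, copies the argument from
 * userspace, then dispatches: sync-daemon commands run under
 * ipvs->sync_mutex, all service/dest commands under the global
 * __ip_vs_mutex.
 */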
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
struct net *net = sock_net(sk);
int ret;
unsigned char arg[MAX_ARG_LEN];
struct ip_vs_service_user *usvc_compat;
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
struct ip_vs_dest_user *udest_compat;
struct ip_vs_dest_user_kern udest;
struct netns_ipvs *ipvs = net_ipvs(net);
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
return -EINVAL;
if (len < 0 || len > MAX_ARG_LEN)
return -EINVAL;
if (len != set_arglen[SET_CMDID(cmd)]) {
pr_err("set_ctl: len %u != %u\n",
len, set_arglen[SET_CMDID(cmd)]);
return -EINVAL;
}
if (copy_from_user(arg, user, len) != 0)
return -EFAULT;
/* increase the module use count */
ip_vs_use_count_inc();
/* Handle daemons since they have another lock */
if (cmd == IP_VS_SO_SET_STARTDAEMON ||
cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
ret = -ERESTARTSYS;
goto out_dec;
}
if (cmd == IP_VS_SO_SET_STARTDAEMON)
ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
dm->syncid);
else
ret = stop_sync_thread(net, dm->state);
mutex_unlock(&ipvs->sync_mutex);
goto out_dec;
}
if (mutex_lock_interruptible(&__ip_vs_mutex)) {
ret = -ERESTARTSYS;
goto out_dec;
}
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
ret = ip_vs_flush(net, false);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
}
usvc_compat = (struct ip_vs_service_user *)arg;
udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
/* We only use the new structs internally, so copy userspace compat
* structs to extended internal versions */
ip_vs_copy_usvc_compat(&usvc, usvc_compat);
ip_vs_copy_udest_compat(&udest, udest_compat);
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
ret = ip_vs_zero_all(net);
goto out_unlock;
}
}
if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
IP_VS_SCHEDNAME_MAXLEN) {
ret = -EINVAL;
goto out_unlock;
}
/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
usvc.protocol != IPPROTO_SCTP) {
pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
usvc.protocol, &usvc.addr.ip,
ntohs(usvc.port));
ret = -EFAULT;
goto out_unlock;
}
/* Lookup the exact service by <protocol, addr, port> or fwmark */
rcu_read_lock();
if (usvc.fwmark == 0)
svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
rcu_read_unlock();
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
ret = -ESRCH;
goto out_unlock;
}
switch (cmd) {
case IP_VS_SO_SET_ADD:
if (svc != NULL)
ret = -EEXIST;
else
ret = ip_vs_add_service(net, &usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, &usvc);
break;
case IP_VS_SO_SET_DEL:
ret = ip_vs_del_service(svc);
if (!ret)
goto out_unlock;
break;
case IP_VS_SO_SET_ZERO:
ret = ip_vs_zero_service(svc);
break;
case IP_VS_SO_SET_ADDDEST:
ret = ip_vs_add_dest(svc, &udest);
break;
case IP_VS_SO_SET_EDITDEST:
ret = ip_vs_edit_dest(svc, &udest);
break;
case IP_VS_SO_SET_DELDEST:
ret = ip_vs_del_dest(svc, &udest);
break;
default:
ret = -EINVAL;
}
out_unlock:
mutex_unlock(&__ip_vs_mutex);
out_dec:
/* decrease the module use count */
ip_vs_use_count_dec();
return ret;
}
static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
struct ip_vs_scheduler *sched;
sched = rcu_dereference_protected(src->scheduler, 1);
dst->protocol = src->protocol;
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
dst->num_dests = src->num_dests;
ip_vs_copy_stats(&dst->stats, &src->stats);
}
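/*
 * Copy up to get->num_services entries from both hash tables to
 * userspace; only IPv4 services are visible via this legacy interface.
 */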
static inline int
__ip_vs_get_service_entries(struct net *net,
const struct ip_vs_get_services *get,
struct ip_vs_get_services __user *uptr)
{
int idx, count = 0;
struct ip_vs_service *svc;
struct ip_vs_service_entry entry;
int ret = 0;
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
goto out;
memset(&entry, 0, sizeof(entry));
ip_vs_copy_service(&entry, svc);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
ret = -EFAULT;
goto out;
}
count++;
}
}
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
goto out;
memset(&entry, 0, sizeof(entry));
ip_vs_copy_service(&entry, svc);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
ret = -EFAULT;
goto out;
}
count++;
}
}
out:
return ret;
}
static inline int
__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
struct ip_vs_get_dests __user *uptr)
{
struct ip_vs_service *svc;
union nf_inet_addr addr = { .ip = get->addr };
int ret = 0;
rcu_read_lock();
if (get->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
rcu_read_unlock();
if (svc) {
int count = 0;
struct ip_vs_dest *dest;
struct ip_vs_dest_entry entry;
memset(&entry, 0, sizeof(entry));
list_for_each_entry(dest, &svc->destinations, n_list) {
if (count >= get->num_dests)
break;
entry.addr = dest->addr.ip;
entry.port = dest->port;
entry.conn_flags = atomic_read(&dest->conn_flags);
entry.weight = atomic_read(&dest->weight);
entry.u_threshold = dest->u_threshold;
entry.l_threshold = dest->l_threshold;
entry.activeconns = atomic_read(&dest->activeconns);
entry.inactconns = atomic_read(&dest->inactconns);
entry.persistconns = atomic_read(&dest->persistconns);
ip_vs_copy_stats(&entry.stats, &dest->stats);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
ret = -EFAULT;
break;
}
count++;
}
} else
ret = -ESRCH;
return ret;
}
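/* Report the current tcp/tcpfin/udp timeouts, in seconds */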
static inline void
__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
struct ip_vs_proto_data *pd;
#endif
memset(u, 0, sizeof(*u));
#ifdef CONFIG_IP_VS_PROTO_TCP
pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
u->udp_timeout =
pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
[GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
[GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
[GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
};
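/*
 * Legacy getsockopt() entry point.  Mirrors do_ip_vs_set_ctl(): the
 * daemon query takes ipvs->sync_mutex, everything else is serialized
 * by __ip_vs_mutex.
 */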
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
unsigned char arg[128];
int ret = 0;
unsigned int copylen;
struct net *net = sock_net(sk);
struct netns_ipvs *ipvs = net_ipvs(net);
BUG_ON(!net);
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
return -EINVAL;
if (*len < get_arglen[GET_CMDID(cmd)]) {
pr_err("get_ctl: len %u < %u\n",
*len, get_arglen[GET_CMDID(cmd)]);
return -EINVAL;
}
copylen = get_arglen[GET_CMDID(cmd)];
if (copylen > 128)
return -EINVAL;
if (copy_from_user(arg, user, copylen) != 0)
return -EFAULT;
/*
* Handle daemons first since they have their own locking
*/
if (cmd == IP_VS_SO_GET_DAEMON) {
struct ip_vs_daemon_user d[2];
memset(&d, 0, sizeof(d));
if (mutex_lock_interruptible(&ipvs->sync_mutex))
return -ERESTARTSYS;
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
sizeof(d[0].mcast_ifn));
d[0].syncid = ipvs->master_syncid;
}
if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
sizeof(d[1].mcast_ifn));
d[1].syncid = ipvs->backup_syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
ret = -EFAULT;
mutex_unlock(&ipvs->sync_mutex);
return ret;
}
if (mutex_lock_interruptible(&__ip_vs_mutex))
return -ERESTARTSYS;
switch (cmd) {
case IP_VS_SO_GET_VERSION:
{
char buf[64];
sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
ret = -EFAULT;
goto out;
}
*len = strlen(buf)+1;
}
break;
case IP_VS_SO_GET_INFO:
{
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
info.size = ip_vs_conn_tab_size;
info.num_services = ipvs->num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
}
break;
case IP_VS_SO_GET_SERVICES:
{
struct ip_vs_get_services *get;
int size;
get = (struct ip_vs_get_services *)arg;
size = sizeof(*get) +
sizeof(struct ip_vs_service_entry) * get->num_services;
if (*len != size) {
pr_err("length: %u != %u\n", *len, size);
ret = -EINVAL;
goto out;
}
ret = __ip_vs_get_service_entries(net, get, user);
}
break;
case IP_VS_SO_GET_SERVICE:
{
struct ip_vs_service_entry *entry;
struct ip_vs_service *svc;
union nf_inet_addr addr;
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
rcu_read_lock();
if (entry->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET,
entry->protocol, &addr,
entry->port);
rcu_read_unlock();
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
ret = -EFAULT;
} else
ret = -ESRCH;
}
break;
case IP_VS_SO_GET_DESTS:
{
struct ip_vs_get_dests *get;
int size;
get = (struct ip_vs_get_dests *)arg;
size = sizeof(*get) +
sizeof(struct ip_vs_dest_entry) * get->num_dests;
if (*len != size) {
pr_err("length: %u != %u\n", *len, size);
ret = -EINVAL;
goto out;
}
ret = __ip_vs_get_dest_entries(net, get, user);
}
break;
case IP_VS_SO_GET_TIMEOUT:
{
struct ip_vs_timeout_user t;
__ip_vs_get_timeouts(net, &t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
break;
default:
ret = -EINVAL;
}
out:
mutex_unlock(&__ip_vs_mutex);
return ret;
}
static struct nf_sockopt_ops ip_vs_sockopts = {
.pf = PF_INET,
.set_optmin = IP_VS_BASE_CTL,
.set_optmax = IP_VS_SO_SET_MAX+1,
.set = do_ip_vs_set_ctl,
.get_optmin = IP_VS_BASE_CTL,
.get_optmax = IP_VS_SO_GET_MAX+1,
.get = do_ip_vs_get_ctl,
.owner = THIS_MODULE,
};
/*
* Generic Netlink interface
*/
/* IPVS genetlink family */
static struct genl_family ip_vs_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
.name = IPVS_GENL_NAME,
.version = IPVS_GENL_VERSION,
.maxattr = IPVS_CMD_MAX,
.netnsok = true, /* Make ipvsadm work on netns */
};
/* Policy used for first-level command attributes */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
[IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
[IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
[IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
[IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
[IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
};
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
[IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
[IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
.len = IP_VS_IFNAME_MAXLEN - 1 },
[IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
};
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
[IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
[IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
[IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
.len = sizeof(union nf_inet_addr) },
[IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
[IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
.len = IP_VS_SCHEDNAME_MAXLEN - 1 },
[IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
.len = IP_VS_PENAME_MAXLEN },
[IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
.len = sizeof(struct ip_vs_flags) },
[IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
};
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
[IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
.len = sizeof(union nf_inet_addr) },
[IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
[IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
};
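/* Nest an attribute of @container_type holding a snapshot of @stats
 * (packet/byte counters and estimated rates).
 */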
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
struct ip_vs_stats *stats)
{
struct ip_vs_stats_user ustats;
struct nlattr *nl_stats = nla_nest_start(skb, container_type);
if (!nl_stats)
return -EMSGSIZE;
ip_vs_copy_stats(&ustats, stats);
if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
goto nla_put_failure;
nla_nest_end(skb, nl_stats);
return 0;
nla_put_failure:
nla_nest_cancel(skb, nl_stats);
return -EMSGSIZE;
}
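/* Put a nested IPVS_CMD_ATTR_SERVICE attribute describing @svc:
 * address family, fwmark or protocol/address/port, scheduler and pe
 * names, flags, timeout (in seconds), netmask and statistics.
 */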
static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_service *svc)
{
struct ip_vs_scheduler *sched;
struct ip_vs_pe *pe;
struct nlattr *nl_service;
struct ip_vs_flags flags = { .flags = svc->flags,
.mask = ~0 };
nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
if (!nl_service)
return -EMSGSIZE;
if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
goto nla_put_failure;
if (svc->fwmark) {
if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
goto nla_put_failure;
} else {
if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
goto nla_put_failure;
}
sched = rcu_dereference_protected(svc->scheduler, 1);
pe = rcu_dereference_protected(svc->pe, 1);
if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
(pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
goto nla_put_failure;
if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
goto nla_put_failure;
nla_nest_end(skb, nl_service);
return 0;
nla_put_failure:
nla_nest_cancel(skb, nl_service);
return -EMSGSIZE;
}
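/* Emit one NLM_F_MULTI IPVS_CMD_NEW_SERVICE message for @svc as part
 * of a GET_SERVICE dump.
 */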
static int ip_vs_genl_dump_service(struct sk_buff *skb,
struct ip_vs_service *svc,
struct netlink_callback *cb)
{
void *hdr;
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_SERVICE);
if (!hdr)
return -EMSGSIZE;
if (ip_vs_genl_fill_service(skb, svc) < 0)
goto nla_put_failure;
return genlmsg_end(skb, hdr);
nla_put_failure:
genlmsg_cancel(skb, hdr);
return -EMSGSIZE;
}
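/* Dump all services in the caller's netns, walking both the address
 * and the fwmark hash tables; cb->args[0] holds the resume position.
 */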
static int ip_vs_genl_dump_services(struct sk_buff *skb,
struct netlink_callback *cb)
{
int idx = 0, i;
int start = cb->args[0];
struct ip_vs_service *svc;
struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
goto nla_put_failure;
}
}
}
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
goto nla_put_failure;
}
}
}
nla_put_failure:
mutex_unlock(&__ip_vs_mutex);
cb->args[0] = idx;
return skb->len;
}
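/* Parse a nested service attribute into @usvc. The address family
 * plus either a fwmark or protocol/address/port must be present; with
 * @full_entry the scheduler name, flags, timeout and netmask are also
 * required. A matching existing service (if any) is returned in
 * @ret_svc.
 */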
static int ip_vs_genl_parse_service(struct net *net,
struct ip_vs_service_user_kern *usvc,
struct nlattr *nla, int full_entry,
struct ip_vs_service **ret_svc)
{
struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
struct ip_vs_service *svc;
/* Parse mandatory identifying service fields first */
if (nla == NULL ||
nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
return -EINVAL;
nla_af = attrs[IPVS_SVC_ATTR_AF];
nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
nla_port = attrs[IPVS_SVC_ATTR_PORT];
nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
return -EINVAL;
memset(usvc, 0, sizeof(*usvc));
usvc->af = nla_get_u16(nla_af);
#ifdef CONFIG_IP_VS_IPV6
if (usvc->af != AF_INET && usvc->af != AF_INET6)
#else
if (usvc->af != AF_INET)
#endif
return -EAFNOSUPPORT;
if (nla_fwmark) {
usvc->protocol = IPPROTO_TCP;
usvc->fwmark = nla_get_u32(nla_fwmark);
} else {
usvc->protocol = nla_get_u16(nla_protocol);
nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
usvc->port = nla_get_be16(nla_port);
usvc->fwmark = 0;
}
rcu_read_lock();
if (usvc->fwmark)
svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
rcu_read_unlock();
*ret_svc = svc;
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
*nla_netmask;
struct ip_vs_flags flags;
nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
return -EINVAL;
nla_memcpy(&flags, nla_flags, sizeof(flags));
/* prefill flags from service if it already exists */
if (svc)
usvc->flags = svc->flags;
/* set new flags from userland */
usvc->flags = (usvc->flags & ~flags.mask) |
(flags.flags & flags.mask);
usvc->sched_name = nla_data(nla_sched);
usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
usvc->timeout = nla_get_u32(nla_timeout);
usvc->netmask = nla_get_be32(nla_netmask);
}
return 0;
}
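/* Look up a service from a nested service attribute; returns NULL if
 * no such service exists and ERR_PTR() on parse errors.
 */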
static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
return ret ? ERR_PTR(ret) : svc;
}
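/* Put a nested IPVS_CMD_ATTR_DEST attribute describing @dest:
 * address, port, forwarding method, weight, thresholds, connection
 * counters and statistics.
 */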
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
struct nlattr *nl_dest;
nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
if (!nl_dest)
return -EMSGSIZE;
if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
(atomic_read(&dest->conn_flags) &
IP_VS_CONN_F_FWD_MASK)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
atomic_read(&dest->weight)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
atomic_read(&dest->activeconns)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
atomic_read(&dest->inactconns)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
atomic_read(&dest->persistconns)))
goto nla_put_failure;
if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
goto nla_put_failure;
nla_nest_end(skb, nl_dest);
return 0;
nla_put_failure:
nla_nest_cancel(skb, nl_dest);
return -EMSGSIZE;
}
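/* Emit one NLM_F_MULTI IPVS_CMD_NEW_DEST message for @dest as part of
 * a GET_DEST dump.
 */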
static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
struct netlink_callback *cb)
{
void *hdr;
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_DEST);
if (!hdr)
return -EMSGSIZE;
if (ip_vs_genl_fill_dest(skb, dest) < 0)
goto nla_put_failure;
return genlmsg_end(skb, hdr);
nla_put_failure:
genlmsg_cancel(skb, hdr);
return -EMSGSIZE;
}
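/* Dump the destinations of the service identified in the dump
 * request; cb->args[0] holds the resume position.
 */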
static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct netlink_callback *cb)
{
int idx = 0;
int start = cb->args[0];
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
/* Try to find the service for which to dump destinations */
if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
goto out_err;
svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc) || svc == NULL)
goto out_err;
/* Dump the destinations */
list_for_each_entry(dest, &svc->destinations, n_list) {
if (++idx <= start)
continue;
if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
idx--;
goto nla_put_failure;
}
}
nla_put_failure:
cb->args[0] = idx;
out_err:
mutex_unlock(&__ip_vs_mutex);
return skb->len;
}
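/* Parse a nested destination attribute into @udest. Address and port
 * must be present; with @full_entry the forwarding method, weight and
 * upper/lower thresholds are also required.
 */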
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
struct nlattr *nla, int full_entry)
{
struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
struct nlattr *nla_addr, *nla_port;
/* Parse mandatory identifying destination fields first */
if (nla == NULL ||
nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
return -EINVAL;
nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
nla_port = attrs[IPVS_DEST_ATTR_PORT];
if (!(nla_addr && nla_port))
return -EINVAL;
memset(udest, 0, sizeof(*udest));
nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
udest->port = nla_get_be16(nla_port);
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
*nla_l_thresh;
nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
return -EINVAL;
udest->conn_flags = nla_get_u32(nla_fwd)
& IP_VS_CONN_F_FWD_MASK;
udest->weight = nla_get_u32(nla_weight);
udest->u_threshold = nla_get_u32(nla_u_thresh);
udest->l_threshold = nla_get_u32(nla_l_thresh);
}
return 0;
}
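/* Put a nested IPVS_CMD_ATTR_DAEMON attribute with the sync daemon
 * state, multicast interface name and sync ID.
 */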
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
const char *mcast_ifn, __u32 syncid)
{
struct nlattr *nl_daemon;
nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
if (!nl_daemon)
return -EMSGSIZE;
if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
goto nla_put_failure;
nla_nest_end(skb, nl_daemon);
return 0;
nla_put_failure:
nla_nest_cancel(skb, nl_daemon);
return -EMSGSIZE;
}
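/* Emit one NLM_F_MULTI IPVS_CMD_NEW_DAEMON message as part of a
 * GET_DAEMON dump.
 */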
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
const char *mcast_ifn, __u32 syncid,
struct netlink_callback *cb)
{
void *hdr;
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_DAEMON);
if (!hdr)
return -EMSGSIZE;
if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
goto nla_put_failure;
return genlmsg_end(skb, hdr);
nla_put_failure:
genlmsg_cancel(skb, hdr);
return -EMSGSIZE;
}
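/* Report the running master and/or backup sync daemons; cb->args[0]
 * and cb->args[1] record which of the two have already been emitted.
 */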
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct net *net = skb_sknet(skb);
struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&ipvs->sync_mutex);
if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
ipvs->master_mcast_ifn,
ipvs->master_syncid, cb) < 0)
goto nla_put_failure;
cb->args[0] = 1;
}
if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
ipvs->backup_mcast_ifn,
ipvs->backup_syncid, cb) < 0)
goto nla_put_failure;
cb->args[1] = 1;
}
nla_put_failure:
mutex_unlock(&ipvs->sync_mutex);
return skb->len;
}
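/* Start a sync daemon from the parsed daemon attributes; state,
 * multicast interface and sync ID are all mandatory.
 */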
static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
{
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
return start_sync_thread(net,
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
}
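/* Stop the sync daemon with the given state (master or backup). */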
static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
{
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
return stop_sync_thread(net,
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
}
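/* Update the TCP, TCP-FIN and UDP timeouts from whichever attributes
 * are present, keeping the current values for the rest.
 */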
static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
__ip_vs_get_timeouts(net, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
t.tcp_fin_timeout =
nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
return ip_vs_set_timeout(net, &t);
}
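/* .doit handler for IPVS_CMD_NEW_DAEMON and IPVS_CMD_DEL_DAEMON:
 * parses the nested daemon attributes under sync_mutex and starts or
 * stops the sync thread accordingly.
 */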
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
int ret = 0, cmd;
struct net *net;
struct netns_ipvs *ipvs;
net = skb_sknet(skb);
ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
mutex_lock(&ipvs->sync_mutex);
if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
info->attrs[IPVS_CMD_ATTR_DAEMON],
ip_vs_daemon_policy)) {
ret = -EINVAL;
goto out;
}
if (cmd == IPVS_CMD_NEW_DAEMON)
ret = ip_vs_genl_new_daemon(net, daemon_attrs);
else
ret = ip_vs_genl_del_daemon(net, daemon_attrs);
out:
mutex_unlock(&ipvs->sync_mutex);
}
return ret;
}
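/* .doit handler for all commands that modify IPVS state: flush,
 * configuration, zeroing, and service/destination add/edit/delete,
 * serialized by __ip_vs_mutex.
 */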
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_service *svc = NULL;
struct ip_vs_service_user_kern usvc;
struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
int need_full_svc = 0, need_full_dest = 0;
struct net *net;
net = skb_sknet(skb);
cmd = info->genlhdr->cmd;
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
ret = ip_vs_flush(net, false);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
ret = ip_vs_genl_set_config(net, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
ret = ip_vs_zero_all(net);
goto out;
}
/* All following commands require a service argument, so check if we
* received a valid one. We need a full service specification when
* adding / editing a service. Only identifying members otherwise. */
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
need_full_svc = 1;
ret = ip_vs_genl_parse_service(net, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
need_full_svc, &svc);
if (ret)
goto out;
/* Unless we're adding a new service, the service must already exist */
if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
ret = -ESRCH;
goto out;
}
/* Destination commands require a valid destination argument. For
* adding / editing a destination, we need a full destination
* specification. */
if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
cmd == IPVS_CMD_DEL_DEST) {
if (cmd != IPVS_CMD_DEL_DEST)
need_full_dest = 1;
ret = ip_vs_genl_parse_dest(&udest,
info->attrs[IPVS_CMD_ATTR_DEST],
need_full_dest);
if (ret)
goto out;
}
switch (cmd) {
case IPVS_CMD_NEW_SERVICE:
if (svc == NULL)
ret = ip_vs_add_service(net, &usvc, &svc);
else
ret = -EEXIST;
break;
case IPVS_CMD_SET_SERVICE:
ret = ip_vs_edit_service(svc, &usvc);
break;
case IPVS_CMD_DEL_SERVICE:
ret = ip_vs_del_service(svc);
/* do not use svc, it can be freed */
break;
case IPVS_CMD_NEW_DEST:
ret = ip_vs_add_dest(svc, &udest);
break;
case IPVS_CMD_SET_DEST:
ret = ip_vs_edit_dest(svc, &udest);
break;
case IPVS_CMD_DEL_DEST:
ret = ip_vs_del_dest(svc, &udest);
break;
case IPVS_CMD_ZERO:
ret = ip_vs_zero_service(svc);
break;
default:
ret = -EINVAL;
}
out:
mutex_unlock(&__ip_vs_mutex);
return ret;
}
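/* .doit handler for IPVS_CMD_GET_SERVICE, IPVS_CMD_GET_INFO and
 * IPVS_CMD_GET_CONFIG: builds and sends a single reply message.
 */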
static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *msg;
void *reply;
int ret, cmd, reply_cmd;
struct net *net;
net = skb_sknet(skb);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
reply_cmd = IPVS_CMD_NEW_SERVICE;
else if (cmd == IPVS_CMD_GET_INFO)
reply_cmd = IPVS_CMD_SET_INFO;
else if (cmd == IPVS_CMD_GET_CONFIG)
reply_cmd = IPVS_CMD_SET_CONFIG;
else {
pr_err("unknown Generic Netlink command\n");
return -EINVAL;
}
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
mutex_lock(&__ip_vs_mutex);
reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
if (reply == NULL)
goto nla_put_failure;
switch (cmd) {
case IPVS_CMD_GET_SERVICE:
{
struct ip_vs_service *svc;
svc = ip_vs_genl_find_service(net,
info->attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc)) {
ret = PTR_ERR(svc);
goto out_err;
} else if (svc) {
ret = ip_vs_genl_fill_service(msg, svc);
if (ret)
goto nla_put_failure;
} else {
ret = -ESRCH;
goto out_err;
}
break;
}
case IPVS_CMD_GET_CONFIG:
{
struct ip_vs_timeout_user t;
__ip_vs_get_timeouts(net, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
t.tcp_timeout) ||
nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
t.tcp_fin_timeout))
goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
goto nla_put_failure;
#endif
break;
}
case IPVS_CMD_GET_INFO:
if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
IP_VS_VERSION_CODE) ||
nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
ip_vs_conn_tab_size))
goto nla_put_failure;
break;
}
genlmsg_end(msg, reply);
ret = genlmsg_reply(msg, info);
goto out;
nla_put_failure:
pr_err("not enough space in Netlink message\n");
ret = -EMSGSIZE;
out_err:
nlmsg_free(msg);
out:
mutex_unlock(&__ip_vs_mutex);
return ret;
}
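/* IPVS Generic Netlink operations; every command requires
 * GENL_ADMIN_PERM.
 */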
static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
{
.cmd = IPVS_CMD_NEW_SERVICE,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_SET_SERVICE,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_DEL_SERVICE,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_GET_SERVICE,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_get_cmd,
.dumpit = ip_vs_genl_dump_services,
.policy = ip_vs_cmd_policy,
},
{
.cmd = IPVS_CMD_NEW_DEST,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_SET_DEST,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_DEL_DEST,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_GET_DEST,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.dumpit = ip_vs_genl_dump_dests,
},
{
.cmd = IPVS_CMD_NEW_DAEMON,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_DEL_DAEMON,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_GET_DAEMON,
.flags = GENL_ADMIN_PERM,
.dumpit = ip_vs_genl_dump_daemons,
},
{
.cmd = IPVS_CMD_SET_CONFIG,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_GET_CONFIG,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_get_cmd,
},
{
.cmd = IPVS_CMD_GET_INFO,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_get_cmd,
},
{
.cmd = IPVS_CMD_ZERO,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_FLUSH,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_set_cmd,
},
};
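/* Register the IPVS Generic Netlink family and its operations. */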
static int __init ip_vs_genl_register(void)
{
return genl_register_family_with_ops(&ip_vs_genl_family,
ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
}
static void ip_vs_genl_unregister(void)
{
genl_unregister_family(&ip_vs_genl_family);
}
/* End of Generic Netlink interface definitions */
/*
* per netns init/exit func.
*/
#ifdef CONFIG_SYSCTL
static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
{
int idx;
struct netns_ipvs *ipvs = net_ipvs(net);
struct ctl_table *tbl;
atomic_set(&ipvs->dropentry, 0);
spin_lock_init(&ipvs->dropentry_lock);
spin_lock_init(&ipvs->droppacket_lock);
spin_lock_init(&ipvs->securetcp_lock);
if (!net_eq(net, &init_net)) {
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
tbl[0].procname = NULL;
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
idx = 0;
ipvs->sysctl_amemthresh = 1024;
tbl[idx++].data = &ipvs->sysctl_amemthresh;
ipvs->sysctl_am_droprate = 10;
tbl[idx++].data = &ipvs->sysctl_am_droprate;
tbl[idx++].data = &ipvs->sysctl_drop_entry;
tbl[idx++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
tbl[idx++].data = &ipvs->sysctl_conntrack;
#endif
tbl[idx++].data = &ipvs->sysctl_secure_tcp;
ipvs->sysctl_snat_reroute = 1;
tbl[idx++].data = &ipvs->sysctl_snat_reroute;
ipvs->sysctl_sync_ver = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ver;
ipvs->sysctl_sync_ports = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ports;
ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
ipvs->sysctl_sync_sock_size = 0;
tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
tbl[idx].data = &ipvs->sysctl_sync_threshold;
tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
tbl[idx++].data = &ipvs->sysctl_sync_retries;
tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
ipvs->sysctl_pmtu_disc = 1;
tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
tbl[idx++].data = &ipvs->sysctl_backup_only;
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
if (ipvs->sysctl_hdr == NULL) {
if (!net_eq(net, &init_net))
kfree(tbl);
return -ENOMEM;
}
ip_vs_start_estimator(net, &ipvs->tot_stats);
ipvs->sysctl_tbl = tbl;
/* Schedule defense work */
INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
return 0;
}
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
}
#else
static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
#endif
static struct notifier_block ip_vs_dst_notifier = {
.notifier_call = ip_vs_dst_event,
};
int __net_init ip_vs_control_net_init(struct net *net)
{
int idx;
struct netns_ipvs *ipvs = net_ipvs(net);
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
INIT_LIST_HEAD(&ipvs->dest_trash);
spin_lock_init(&ipvs->dest_trash_lock);
setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
(unsigned long) net);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
/* procfs stats */
ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!ipvs->tot_stats.cpustats)
return -ENOMEM;
spin_lock_init(&ipvs->tot_stats.lock);
proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
proc_create("ip_vs_stats_percpu", 0, net->proc_net,
&ip_vs_stats_percpu_fops);
if (ip_vs_control_net_init_sysctl(net))
goto err;
return 0;
err:
free_percpu(ipvs->tot_stats.cpustats);
return -ENOMEM;
}
void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
/* Some dest can be in grace period even before cleanup, we have to
* defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called.
*/
rcu_barrier();
ip_vs_trash_cleanup(net);
ip_vs_stop_estimator(net, &ipvs->tot_stats);
ip_vs_control_net_cleanup_sysctl(net);
remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
remove_proc_entry("ip_vs_stats", net->proc_net);
remove_proc_entry("ip_vs", net->proc_net);
free_percpu(ipvs->tot_stats.cpustats);
}
int __init ip_vs_register_nl_ioctl(void)
{
int ret;
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
goto err_sock;
}
ret = ip_vs_genl_register();
if (ret) {
pr_err("cannot register Generic Netlink interface.\n");
goto err_genl;
}
return 0;
err_genl:
nf_unregister_sockopt(&ip_vs_sockopts);
err_sock:
return ret;
}
void ip_vs_unregister_nl_ioctl(void)
{
ip_vs_genl_unregister();
nf_unregister_sockopt(&ip_vs_sockopts);
}
int __init ip_vs_control_init(void)
{
int idx;
int ret;
EnterFunction(2);
/* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
smp_wmb(); /* Do we really need it now? */
ret = register_netdevice_notifier(&ip_vs_dst_notifier);
if (ret < 0)
return ret;
LeaveFunction(2);
return 0;
}
void ip_vs_control_cleanup(void)
{
EnterFunction(2);
unregister_netdevice_notifier(&ip_vs_dst_notifier);
LeaveFunction(2);
}