sctp: Implement quick failover draft from tsvwg

I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations this way, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and quickly avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Neil Horman 2012-07-21 07:56:07 +00:00 committed by David S. Miller
parent e3906486f6
commit 5aa93bcf66
10 changed files with 221 additions and 15 deletions

View file

@ -1440,6 +1440,20 @@ path_max_retrans - INTEGER
Default: 5
pf_retrans - INTEGER
The number of retransmissions that will be attempted on a given path
before traffic is redirected to an alternate transport (should one
exist). Note this is distinct from path_max_retrans, as a path that
passes the pf_retrans threshold can still be used. It is only
deprioritized when a transmission path is selected by the stack. This
setting is primarily used to enable fast failover mechanisms without
having to reduce path_max_retrans to a very low value. See:
http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
for details. Note also that a value of pf_retrans > path_max_retrans
disables this feature.
Default: 0
rto_initial - INTEGER
The initial round trip timeout value in milliseconds that will be used
in calculating round trip times. This is the initial time interval

View file

@ -334,6 +334,7 @@ typedef enum {
typedef enum {
SCTP_TRANSPORT_UP,
SCTP_TRANSPORT_DOWN,
SCTP_TRANSPORT_PF,
} sctp_transport_cmd_t;
/* These are the address scopes defined mainly for IPv4 addresses

View file

@ -161,6 +161,12 @@ extern struct sctp_globals {
int max_retrans_path;
int max_retrans_init;
/* Potentially-Failed.Max.Retrans sysctl value
* taken from:
* http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
*/
int pf_retrans;
/*
* Policy for performing sctp/socket accounting
* 0 - do socket level accounting, all assocs share sk_sndbuf
@ -258,6 +264,7 @@ extern struct sctp_globals {
#define sctp_sndbuf_policy (sctp_globals.sndbuf_policy)
#define sctp_rcvbuf_policy (sctp_globals.rcvbuf_policy)
#define sctp_max_retrans_path (sctp_globals.max_retrans_path)
#define sctp_pf_retrans (sctp_globals.pf_retrans)
#define sctp_max_retrans_init (sctp_globals.max_retrans_init)
#define sctp_sack_timeout (sctp_globals.sack_timeout)
#define sctp_hb_interval (sctp_globals.hb_interval)
@ -990,10 +997,15 @@ struct sctp_transport {
/* This is the max_retrans value for the transport and will
* be initialized from the assocs value. This can be changed
* using SCTP_SET_PEER_ADDR_PARAMS socket option.
* using the SCTP_SET_PEER_ADDR_PARAMS socket option.
*/
__u16 pathmaxrxt;
/* This is the partially failed retrans value for the transport
* and will be initialized from the assocs value. This can be changed
* using the SCTP_PEER_ADDR_THLDS socket option
*/
int pf_retrans;
/* PMTU : The current known path MTU. */
__u32 pathmtu;
@ -1664,6 +1676,12 @@ struct sctp_association {
*/
int max_retrans;
/* This is the partially failed retrans value for the association
* and will be initialized from the pf_retrans sysctl value. This can be
* changed using the SCTP_PEER_ADDR_THLDS socket option
*/
int pf_retrans;
/* Maximum number of times the endpoint will retransmit INIT */
__u16 max_init_attempts;

View file

@ -93,6 +93,7 @@ typedef __s32 sctp_assoc_t;
#define SCTP_GET_ASSOC_NUMBER 28 /* Read only */
#define SCTP_GET_ASSOC_ID_LIST 29 /* Read only */
#define SCTP_AUTO_ASCONF 30
#define SCTP_PEER_ADDR_THLDS 31
/* Internal Socket Options. Some of the sctp library functions are
* implemented using these socket options.
@ -649,6 +650,7 @@ struct sctp_paddrinfo {
*/
enum sctp_spinfo_state {
SCTP_INACTIVE,
SCTP_PF,
SCTP_ACTIVE,
SCTP_UNCONFIRMED,
SCTP_UNKNOWN = 0xffff /* Value used for transport state unknown */
@ -741,4 +743,13 @@ typedef struct {
int sd;
} sctp_peeloff_arg_t;
/*
* Peer Address Thresholds socket option (SCTP_PEER_ADDR_THLDS).
*
* Argument block copied to and from user space by the set/get handlers
* for SCTP_PEER_ADDR_THLDS.
*/
struct sctp_paddrthlds {
/* Identifies the association the request applies to. */
sctp_assoc_t spt_assoc_id;
/* Peer address selecting a single transport; when sctp_is_any()
* matches it, the handlers operate on the association as a whole.
*/
struct sockaddr_storage spt_address;
/* Path max retransmissions (pathmaxrxt). On set, a value of zero
* leaves the current setting unchanged.
*/
__u16 spt_pathmaxrxt;
/* Partially-failed threshold (pf_retrans). Always applied on set. */
__u16 spt_pathpfthld;
};
#endif /* __net_sctp_user_h__ */

View file

@ -124,6 +124,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
* socket values.
*/
asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
asoc->pf_retrans = sctp_pf_retrans;
asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
@ -686,6 +688,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
/* Set the path max_retrans. */
peer->pathmaxrxt = asoc->pathmaxrxt;
/* And the partial failure retrans threshold */
peer->pf_retrans = asoc->pf_retrans;
/* Initialize the peer's SACK delay timeout based on the
* association configured value.
*/
@ -841,6 +846,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
struct sctp_ulpevent *event;
struct sockaddr_storage addr;
int spc_state = 0;
bool ulp_notify = true;
/* Record the transition on the transport. */
switch (command) {
@ -854,6 +860,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
spc_state = SCTP_ADDR_CONFIRMED;
else
spc_state = SCTP_ADDR_AVAILABLE;
/* Don't inform ULP about transition from PF to
* active state and set cwnd to 1, see SCTP
* Quick failover draft section 5.1, point 5
*/
if (transport->state == SCTP_PF) {
ulp_notify = false;
transport->cwnd = 1;
}
transport->state = SCTP_ACTIVE;
break;
@ -872,6 +886,11 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
spc_state = SCTP_ADDR_UNREACHABLE;
break;
case SCTP_TRANSPORT_PF:
transport->state = SCTP_PF;
ulp_notify = false;
break;
default:
return;
}
@ -879,12 +898,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
* user.
*/
memset(&addr, 0, sizeof(struct sockaddr_storage));
memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
0, spc_state, error, GFP_ATOMIC);
if (event)
sctp_ulpq_tail_event(&asoc->ulpq, event);
if (ulp_notify) {
memset(&addr, 0, sizeof(struct sockaddr_storage));
memcpy(&addr, &transport->ipaddr,
transport->af_specific->sockaddr_len);
event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
0, spc_state, error, GFP_ATOMIC);
if (event)
sctp_ulpq_tail_event(&asoc->ulpq, event);
}
/* Select new active and retran paths. */
@ -900,7 +922,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
transports) {
if ((t->state == SCTP_INACTIVE) ||
(t->state == SCTP_UNCONFIRMED))
(t->state == SCTP_UNCONFIRMED) ||
(t->state == SCTP_PF))
continue;
if (!first || t->last_time_heard > first->last_time_heard) {
second = first;

View file

@ -792,7 +792,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
if (!new_transport)
new_transport = asoc->peer.active_path;
} else if ((new_transport->state == SCTP_INACTIVE) ||
(new_transport->state == SCTP_UNCONFIRMED)) {
(new_transport->state == SCTP_UNCONFIRMED) ||
(new_transport->state == SCTP_PF)) {
/* If the chunk is Heartbeat or Heartbeat Ack,
* send it to chunk->transport, even if it's
* inactive.
@ -987,7 +988,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
new_transport = chunk->transport;
if (!new_transport ||
((new_transport->state == SCTP_INACTIVE) ||
(new_transport->state == SCTP_UNCONFIRMED)))
(new_transport->state == SCTP_UNCONFIRMED) ||
(new_transport->state == SCTP_PF)))
new_transport = asoc->peer.active_path;
if (new_transport->state == SCTP_UNCONFIRMED)
continue;

View file

@ -76,6 +76,8 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
sctp_cmd_seq_t *commands,
gfp_t gfp);
static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
struct sctp_transport *t);
/********************************************************************
* Helper functions
********************************************************************/
@ -470,7 +472,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
* notification SHOULD be sent to the upper layer.
*
*/
static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
struct sctp_association *asoc,
struct sctp_transport *transport,
int is_hb)
{
@ -495,6 +498,23 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
transport->error_count++;
}
/* If the transport error count is greater than the pf_retrans
* threshold, and less than pathmaxrxt, then mark this transport
* as Partially Failed, see SCTP Quick Failover Draft, section 5.1,
* point 1
*/
if ((transport->state != SCTP_PF) &&
(asoc->pf_retrans < transport->pathmaxrxt) &&
(transport->error_count > asoc->pf_retrans)) {
sctp_assoc_control_transport(asoc, transport,
SCTP_TRANSPORT_PF,
0);
/* Update the hb timer to resend a heartbeat every rto */
sctp_cmd_hb_timer_update(commands, transport);
}
if (transport->state != SCTP_INACTIVE &&
(transport->error_count > transport->pathmaxrxt)) {
SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
@ -699,6 +719,10 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
SCTP_HEARTBEAT_SUCCESS);
}
if (t->state == SCTP_PF)
sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
SCTP_HEARTBEAT_SUCCESS);
/* The receiver of the HEARTBEAT ACK should also perform an
* RTT measurement for that destination transport address
* using the time value carried in the HEARTBEAT ACK chunk.
@ -1565,8 +1589,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_STRIKE:
/* Mark one strike against a transport. */
sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
0);
sctp_do_8_2_transport_strike(commands, asoc,
cmd->obj.transport, 0);
break;
case SCTP_CMD_TRANSPORT_IDLE:
@ -1576,7 +1600,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_TRANSPORT_HB_SENT:
t = cmd->obj.transport;
sctp_do_8_2_transport_strike(asoc, t, 1);
sctp_do_8_2_transport_strike(commands, asoc,
t, 1);
t->hb_sent = 1;
break;

View file

@ -3478,6 +3478,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
}
/*
 * SCTP_PEER_ADDR_THLDS (setsockopt side)
 *
 * Update the path-max-retransmit and partially-failed thresholds.  If the
 * address supplied in the argument satisfies sctp_is_any(), the new values
 * are applied to the association and to every transport on its peer
 * transport list; otherwise only the single transport matching the address
 * is changed.  A zero spt_pathmaxrxt leaves the current pathmaxrxt value
 * untouched, whereas spt_pathpfthld is always written.  See Section 6.1 of:
 * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
 *
 * Returns 0 on success, -EINVAL when optlen is smaller than the argument
 * structure, -EFAULT when the argument cannot be copied from user space,
 * and -ENOENT when no matching association or transport is found.
 */
static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
					    char __user *optval,
					    unsigned int optlen)
{
	struct sctp_paddrthlds thlds;
	struct sctp_association *assoc;
	struct sctp_transport *tp;

	if (optlen < sizeof(thlds))
		return -EINVAL;

	if (copy_from_user(&thlds, (struct sctp_paddrthlds __user *)optval,
			   sizeof(thlds)))
		return -EFAULT;

	if (!sctp_is_any(sk, (const union sctp_addr *)&thlds.spt_address)) {
		/* A specific address was given: touch that transport only. */
		tp = sctp_addr_id2transport(sk, &thlds.spt_address,
					    thlds.spt_assoc_id);
		if (!tp)
			return -ENOENT;

		if (thlds.spt_pathmaxrxt)
			tp->pathmaxrxt = thlds.spt_pathmaxrxt;
		tp->pf_retrans = thlds.spt_pathpfthld;

		return 0;
	}

	/* No specific address: apply to the whole association. */
	assoc = sctp_id2assoc(sk, thlds.spt_assoc_id);
	if (!assoc)
		return -ENOENT;

	list_for_each_entry(tp, &assoc->peer.transport_addr_list, transports) {
		if (thlds.spt_pathmaxrxt)
			tp->pathmaxrxt = thlds.spt_pathmaxrxt;
		tp->pf_retrans = thlds.spt_pathpfthld;
	}

	/* Also record the values on the association itself. */
	if (thlds.spt_pathmaxrxt)
		assoc->pathmaxrxt = thlds.spt_pathmaxrxt;
	assoc->pf_retrans = thlds.spt_pathpfthld;

	return 0;
}
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@ -3627,6 +3677,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
case SCTP_AUTO_ASCONF:
retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
break;
case SCTP_PEER_ADDR_THLDS:
retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
break;
default:
retval = -ENOPROTOOPT;
break;
@ -5498,6 +5551,51 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
return 0;
}
/*
 * SCTP_PEER_ADDR_THLDS (getsockopt side)
 *
 * Report the path-max-retransmit and partially-failed thresholds.  The
 * caller's argument is read first to learn which association/address is
 * being queried.  If the address satisfies sctp_is_any(), the values stored
 * on the association are returned; otherwise the values of the single
 * matching transport are returned.  See Section 6.1 of:
 * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
 *
 * Returns 0 on success, -EINVAL when len is smaller than the argument
 * structure, -EFAULT when user memory cannot be read or written, and
 * -ENOENT when no matching association or transport is found.
 */
static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
					    char __user *optval,
					    int len,
					    int __user *optlen)
{
	struct sctp_paddrthlds thlds;
	struct sctp_association *assoc;
	struct sctp_transport *tp;

	if (len < sizeof(struct sctp_paddrthlds))
		return -EINVAL;

	/* Only ever transfer exactly one argument structure. */
	len = sizeof(struct sctp_paddrthlds);
	if (copy_from_user(&thlds, (struct sctp_paddrthlds __user *)optval, len))
		return -EFAULT;

	if (!sctp_is_any(sk, (const union sctp_addr *)&thlds.spt_address)) {
		/* A specific address was given: report that transport. */
		tp = sctp_addr_id2transport(sk, &thlds.spt_address,
					    thlds.spt_assoc_id);
		if (!tp)
			return -ENOENT;

		thlds.spt_pathmaxrxt = tp->pathmaxrxt;
		thlds.spt_pathpfthld = tp->pf_retrans;
	} else {
		/* No specific address: report the association's values. */
		assoc = sctp_id2assoc(sk, thlds.spt_assoc_id);
		if (!assoc)
			return -ENOENT;

		thlds.spt_pathpfthld = assoc->pf_retrans;
		thlds.spt_pathmaxrxt = assoc->pathmaxrxt;
	}

	/* Write back the length first, then the filled-in structure. */
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &thlds, len))
		return -EFAULT;

	return 0;
}
SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@ -5636,6 +5734,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
case SCTP_AUTO_ASCONF:
retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
break;
case SCTP_PEER_ADDR_THLDS:
retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
break;
default:
retval = -ENOPROTOOPT;
break;

View file

@ -140,6 +140,15 @@ static ctl_table sctp_table[] = {
.extra1 = &one,
.extra2 = &int_max
},
{
.procname = "pf_retrans",
.data = &sctp_pf_retrans,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &int_max
},
{
.procname = "max_init_retransmits",
.data = &sctp_max_retrans_init,

View file

@ -87,6 +87,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
/* Initialize the default path max_retrans. */
peer->pathmaxrxt = sctp_max_retrans_path;
peer->pf_retrans = sctp_pf_retrans;
INIT_LIST_HEAD(&peer->transmitted);
INIT_LIST_HEAD(&peer->send_ready);
@ -595,7 +596,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
{
unsigned long timeout;
timeout = t->rto + sctp_jitter(t->rto);
if (t->state != SCTP_UNCONFIRMED)
if ((t->state != SCTP_UNCONFIRMED) &&
(t->state != SCTP_PF))
timeout += t->hbinterval;
timeout += jiffies;
return timeout;