From d1e8c3336e8bbabcafe9bfa335f4fd0c03620f82 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Sun, 26 Aug 2012 19:13:55 +0200 Subject: [PATCH] ipv4: fix path MTU discovery with connection tracking IPv4 conntrack defragments incoming packet at the PRE_ROUTING hook and (in case of forwarded packets) refragments them at POST_ROUTING independent of the IP_DF flag. Refragmentation uses the dst_mtu() of the local route without caring about the original fragment sizes, thereby breaking PMTUD. This patch fixes this by keeping track of the largest received fragment with IP_DF set and generates an ICMP fragmentation required error during refragmentation if that size exceeds the MTU. Change-Id: Ibac77b728baba05841286ea5a8a2089d56e6ad65 Signed-off-by: Patrick McHardy <kaber@trash.net> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: David S. Miller <davem@davemloft.net> --- include/net/inet_frag.h | 2 ++ include/net/ip.h | 2 ++ net/ipv4/ip_fragment.c | 8 +++++++- net/ipv4/ip_output.c | 4 +++- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index b289bd286d77..992d412dc7d0 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -29,6 +29,8 @@ struct inet_frag_queue { #define INET_FRAG_COMPLETE 4 #define INET_FRAG_FIRST_IN 2 #define INET_FRAG_LAST_IN 1 + + u16 max_size; }; #define INETFRAGS_HASHSZ 64 diff --git a/include/net/ip.h b/include/net/ip.h index fa70cbe60ab0..c2425b2a052f 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -43,6 +43,8 @@ struct inet_skb_parm { #define IPSKB_FRAG_COMPLETE BIT(3) #define IPSKB_REROUTED BIT(4) #define IPSKB_DOREDIRECT BIT(5) + + u16 frag_max_size; }; static inline unsigned int ip_hdrlen(const struct sk_buff *skb) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index f4597939b9e9..0fa9752f1383 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -515,6 +515,10 @@ found: if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len + ihl > qp->q.max_size) + qp->q.max_size = skb->len + ihl; + if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; @@ -632,9 +636,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; + IPCB(head)->frag_max_size = qp->q.max_size; iph = ip_hdr(head); - iph->frag_off = 0; + /* max_size != 0 implies at least one fragment had IP_DF set */ + iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; ip_send_check(iph); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 803f1ecd5fc0..3bf2ac7fd069 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -475,7 +475,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); - if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(ip_skb_dst_mtu(skb)));