android_kernel_samsung_msm8976/net/core/neighbour.c

3128 lines
76 KiB
C
Raw Normal View History

/*
* Generic address resolution entity
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Fixes:
* Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
* Harald Welte Add neighbour cache statistics like rtstat
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/neighbour.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/log2.h>
#define DEBUG
#define NEIGH_DEBUG 1
#define neigh_dbg(level, fmt, ...) \
do { \
if (level <= NEIGH_DEBUG) \
pr_debug(fmt, ##__VA_ARGS__); \
} while (0)
#define PNEIGH_HASHMASK 0xF
static void neigh_timer_handler(unsigned long arg);
static void __neigh_notify(struct neighbour *n, int type, int flags);
static void neigh_update_notify(struct neighbour *neigh);
static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
struct net_device *dev);
static struct neigh_table *neigh_tables;
static unsigned neigh_probe_enable;
#ifdef CONFIG_PROC_FS
static const struct file_operations neigh_stat_seq_fops;
#endif
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
- All the scans/updates to hash buckets MUST be made under this lock.
- NOTHING clever should be made under this lock: no callbacks
to protocol backends, no attempts to send something to network.
It will result in deadlocks, if backend/driver wants to use neighbour
cache.
- If the entry requires some non-trivial actions, increase
its reference count and release table lock.
Neighbour entries are protected:
- with reference count.
- with rwlock neigh->lock
Reference count prevents destruction.
neigh->lock mainly serializes ll address data and its validity state.
However, the same lock is used to protect another entry fields:
- timer
- resolution queue
Again, nothing clever shall be made under neigh->lock,
the most complicated procedure, which we allow is dev->hard_header.
It is supposed, that dev->hard_header is simplistic and does
not make callbacks to neighbour tables.
The last lock is neigh_tbl_lock. It is pure SMP lock, protecting
list of neighbour tables. This list is used only in process context,
*/
static DEFINE_RWLOCK(neigh_tbl_lock);
static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
kfree_skb(skb);
return -ENETDOWN;
}
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
if (neigh->parms->neigh_cleanup)
neigh->parms->neigh_cleanup(neigh);
__neigh_notify(neigh, RTM_DELNEIGH, 0);
neigh_release(neigh);
}
/*
* It is random distribution in the interval (1/2)*base...(3/2)*base.
* It corresponds to default IPv6 settings and is not overridable,
* because it is really reasonable choice.
*/
unsigned long neigh_rand_reach_time(unsigned long base)
{
return base ? (net_random() % base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
static int neigh_forced_gc(struct neigh_table *tbl)
{
int shrunk = 0;
int i;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np;
np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
/* Neighbour record may be discarded if:
* - nobody refers to it.
* - it is not permanent
*/
write_lock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
n->dead = 1;
shrunk = 1;
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
np = &n->next;
}
}
tbl->last_flush = jiffies;
write_unlock_bh(&tbl->lock);
return shrunk;
}
static void neigh_add_timer(struct neighbour *n, unsigned long when)
{
neigh_hold(n);
if (unlikely(mod_timer(&n->timer, when))) {
printk("NEIGH: BUG, double timer add, state is %x\n",
n->nud_state);
dump_stack();
}
}
static int neigh_del_timer(struct neighbour *n)
{
if ((n->nud_state & NUD_IN_TIMER) &&
del_timer(&n->timer)) {
neigh_release(n);
return 1;
}
return 0;
}
static void pneigh_queue_purge(struct sk_buff_head *list)
{
struct sk_buff *skb;
while ((skb = skb_dequeue(list)) != NULL) {
dev_put(skb->dev);
kfree_skb(skb);
}
}
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
{
int i;
struct neigh_hash_table *nht;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
if (dev && n->dev != dev) {
np = &n->next;
continue;
}
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
write_lock(&n->lock);
neigh_del_timer(n);
n->dead = 1;
if (atomic_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
We must destroy neighbour entry,
but someone still uses it.
The destroy will be delayed until
the last user releases us, but
we must kill timers etc. and move
it to safe state.
*/
__skb_queue_purge(&n->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
n->arp_queue_len_bytes = 0;
n->output = neigh_blackhole;
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
else
n->nud_state = NUD_NONE;
neigh_dbg(2, "neigh %p is stray\n", n);
}
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
}
}
}
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
write_unlock_bh(&tbl->lock);
}
EXPORT_SYMBOL(neigh_changeaddr);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
pneigh_ifdown_and_unlock(tbl, dev);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
return 0;
}
EXPORT_SYMBOL(neigh_ifdown);
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
int entries;
entries = atomic_inc_return(&tbl->entries) - 1;
if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
if (!neigh_forced_gc(tbl) &&
entries >= tbl->gc_thresh3)
goto out_entries;
}
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
__skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
atomic_set(&n->refcnt, 1);
n->dead = 1;
out:
return n;
out_entries:
atomic_dec(&tbl->entries);
goto out;
}
static void neigh_get_hash_rnd(u32 *x)
{
get_random_bytes(x, sizeof(*x));
*x |= 1;
}
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
size_t size = (1 << shift) * sizeof(struct neighbour *);
struct neigh_hash_table *ret;
struct neighbour __rcu **buckets;
int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
if (size <= PAGE_SIZE)
buckets = kzalloc(size, GFP_ATOMIC);
else
buckets = (struct neighbour __rcu **)
__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
get_order(size));
if (!buckets) {
kfree(ret);
return NULL;
}
ret->hash_buckets = buckets;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
neigh_get_hash_rnd(&ret->hash_rnd[i]);
return ret;
}
static void neigh_hash_free_rcu(struct rcu_head *head)
{
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
struct neighbour __rcu **buckets = nht->hash_buckets;
if (size <= PAGE_SIZE)
kfree(buckets);
else
free_pages((unsigned long)buckets, get_order(size));
kfree(nht);
}
static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
unsigned long new_shift)
{
unsigned int i, hash;
struct neigh_hash_table *new_nht, *old_nht;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
old_nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
new_nht = neigh_hash_alloc(new_shift);
if (!new_nht)
return old_nht;
for (i = 0; i < (1 << old_nht->hash_shift); i++) {
struct neighbour *n, *next;
for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
lockdep_is_held(&tbl->lock));
n != NULL;
n = next) {
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
hash >>= (32 - new_nht->hash_shift);
next = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
rcu_assign_pointer(n->next,
rcu_dereference_protected(
new_nht->hash_buckets[hash],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(new_nht->hash_buckets[hash], n);
}
}
rcu_assign_pointer(tbl->nht, new_nht);
call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
return new_nht;
}
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n;
int key_len = tbl->key_len;
u32 hash_val;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
n != NULL;
n = rcu_dereference_bh(n->next)) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
if (!atomic_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
rcu_read_unlock_bh();
return n;
}
EXPORT_SYMBOL(neigh_lookup);
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
const void *pkey)
{
struct neighbour *n;
int key_len = tbl->key_len;
u32 hash_val;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
n != NULL;
n = rcu_dereference_bh(n->next)) {
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
if (!memcmp(n->primary_key, pkey, key_len) &&
net_eq(dev_net(n->dev), net)) {
if (!atomic_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
rcu_read_unlock_bh();
return n;
}
EXPORT_SYMBOL(neigh_lookup_nodev);
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
u32 hash_val;
int key_len = tbl->key_len;
int error;
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht;
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
}
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
dev_hold(dev);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
if (dev->netdev_ops->ndo_neigh_construct) {
error = dev->netdev_ops->ndo_neigh_construct(n);
if (error < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
}
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock));
n1 != NULL;
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
if (want_ref)
neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
n->dead = 0;
if (want_ref)
neigh_hold(n);
rcu_assign_pointer(n->next,
rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
write_unlock_bh(&tbl->lock);
out_neigh_release:
neigh_release(n);
goto out;
}
EXPORT_SYMBOL(__neigh_create);
static u32 pneigh_hash(const void *pkey, int key_len)
{
u32 hash_val = *(u32 *)(pkey + key_len - 4);
hash_val ^= (hash_val >> 16);
hash_val ^= hash_val >> 8;
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
return hash_val;
}
static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
struct net *net,
const void *pkey,
int key_len,
struct net_device *dev)
{
while (n) {
if (!memcmp(n->key, pkey, key_len) &&
net_eq(pneigh_net(n), net) &&
(n->dev == dev || !n->dev))
return n;
n = n->next;
}
return NULL;
}
struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
struct net *net, const void *pkey, struct net_device *dev)
{
int key_len = tbl->key_len;
u32 hash_val = pneigh_hash(pkey, key_len);
return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
net, pkey, key_len, dev);
}
EXPORT_SYMBOL_GPL(__pneigh_lookup);
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
struct net *net, const void *pkey,
struct net_device *dev, int creat)
{
struct pneigh_entry *n;
int key_len = tbl->key_len;
u32 hash_val = pneigh_hash(pkey, key_len);
read_lock_bh(&tbl->lock);
n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
net, pkey, key_len, dev);
read_unlock_bh(&tbl->lock);
if (n || !creat)
goto out;
ASSERT_RTNL();
n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
if (!n)
goto out;
write_pnet(&n->net, hold_net(net));
memcpy(n->key, pkey, key_len);
n->dev = dev;
if (dev)
dev_hold(dev);
if (tbl->pconstructor && tbl->pconstructor(n)) {
if (dev)
dev_put(dev);
release_net(net);
kfree(n);
n = NULL;
goto out;
}
write_lock_bh(&tbl->lock);
n->next = tbl->phash_buckets[hash_val];
tbl->phash_buckets[hash_val] = n;
write_unlock_bh(&tbl->lock);
out:
return n;
}
EXPORT_SYMBOL(pneigh_lookup);
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
struct net_device *dev)
{
struct pneigh_entry *n, **np;
int key_len = tbl->key_len;
u32 hash_val = pneigh_hash(pkey, key_len);
write_lock_bh(&tbl->lock);
for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
np = &n->next) {
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
net_eq(pneigh_net(n), net)) {
*np = n->next;
write_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
release_net(pneigh_net(n));
kfree(n);
return 0;
}
}
write_unlock_bh(&tbl->lock);
return -ENOENT;
}
static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
struct net_device *dev)
{
struct pneigh_entry *n, **np, *freelist = NULL;
u32 h;
for (h = 0; h <= PNEIGH_HASHMASK; h++) {
np = &tbl->phash_buckets[h];
while ((n = *np) != NULL) {
if (!dev || n->dev == dev) {
*np = n->next;
n->next = freelist;
freelist = n;
continue;
}
np = &n->next;
}
}
write_unlock_bh(&tbl->lock);
while ((n = freelist)) {
freelist = n->next;
n->next = NULL;
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
release_net(pneigh_net(n));
kfree(n);
}
return -ENOENT;
}
static void neigh_parms_destroy(struct neigh_parms *parms);
static inline void neigh_parms_put(struct neigh_parms *parms)
{
if (atomic_dec_and_test(&parms->refcnt))
neigh_parms_destroy(parms);
}
/*
* neighbour must already be out of the table;
*
*/
void neigh_destroy(struct neighbour *neigh)
{
struct net_device *dev = neigh->dev;
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
pr_warn("Destroying alive neighbour %pK\n", neigh);
dump_stack();
return;
}
if (neigh_del_timer(neigh))
pr_warn("Impossible event\n");
write_lock_bh(&neigh->lock);
__skb_queue_purge(&neigh->arp_queue);
write_unlock_bh(&neigh->lock);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh->arp_queue_len_bytes = 0;
if (dev->netdev_ops->ndo_neigh_destroy)
dev->netdev_ops->ndo_neigh_destroy(neigh);
dev_put(dev);
neigh_parms_put(neigh->parms);
neigh_dbg(2, "neigh %p is destroyed\n", neigh);
atomic_dec(&neigh->tbl->entries);
kfree_rcu(neigh, rcu);
}
EXPORT_SYMBOL(neigh_destroy);
/* Neighbour state is suspicious;
disable fast path.
Called with write_locked neigh.
*/
static void neigh_suspect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->output = neigh->ops->output;
}
/* Neighbour state is OK;
enable fast path.
Called with write_locked neigh.
*/
static void neigh_connect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is connected\n", neigh);
neigh->output = neigh->ops->connected_output;
}
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
struct neighbour *n;
struct neighbour __rcu **np;
unsigned int i;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
/*
* periodically recompute ReachableTime from random function
*/
if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
tbl->last_rand = jiffies;
for (p = &tbl->parms; p; p = p->next)
p->reachable_time =
neigh_rand_reach_time(p->base_reachable_time);
}
if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
unsigned int state;
write_lock(&n->lock);
state = n->nud_state;
if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
write_unlock(&n->lock);
goto next_elt;
}
if (time_before(n->used, n->confirmed))
n->used = n->confirmed;
if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(jiffies, n->used + n->parms->gc_staletime))) {
*np = n->next;
n->dead = 1;
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
next_elt:
np = &n->next;
}
/*
* It's fine to release lock here, even if hash table
* grows while we are preempted.
*/
write_unlock_bh(&tbl->lock);
cond_resched();
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
}
out:
/* Cycle through all hash buckets every base_reachable_time/2 ticks.
* ARP entry timeouts range from 1/2 base_reachable_time to 3/2
* base_reachable_time.
*/
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
tbl->parms.base_reachable_time >> 1);
write_unlock_bh(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
int max_probes = p->ucast_probes + p->app_probes;
if (!(n->nud_state & NUD_PROBE))
max_probes += p->mcast_probes;
return max_probes;
}
static void neigh_invalidate(struct neighbour *neigh)
__releases(neigh->lock)
__acquires(neigh->lock)
{
struct sk_buff *skb;
NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
neigh_dbg(2, "neigh %p is failed\n", neigh);
neigh->updated = jiffies;
/* It is very thin place. report_unreachable is very complicated
routine. Particularly, it can hit the same neighbour entry!
So that, we try to be accurate and avoid dead loop. --ANK
*/
while (neigh->nud_state == NUD_FAILED &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
write_unlock(&neigh->lock);
neigh->ops->error_report(neigh, skb);
write_lock(&neigh->lock);
}
__skb_queue_purge(&neigh->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh->arp_queue_len_bytes = 0;
}
static void neigh_probe(struct neighbour *neigh)
__releases(neigh->lock)
{
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
/* keep skb alive even if arp_queue overflows */
if (skb)
skb = skb_copy(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
if (neigh->ops->solicit)
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
kfree_skb(skb);
}
/* Called when a timer expires for a neighbour entry. */
static void neigh_timer_handler(unsigned long arg)
{
unsigned long now, next;
struct neighbour *neigh = (struct neighbour *)arg;
unsigned int state;
int notify = 0;
write_lock(&neigh->lock);
state = neigh->nud_state;
now = jiffies;
next = now + HZ;
if (!(state & NUD_IN_TIMER))
goto out;
if (state & NUD_REACHABLE) {
if (time_before_eq(now,
neigh->confirmed + neigh->parms->reachable_time)) {
neigh_dbg(2, "neigh %p is still alive\n", neigh);
next = neigh->confirmed + neigh->parms->reachable_time;
} else if (time_before_eq(now,
neigh->used + neigh->parms->delay_probe_time)) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
neigh_suspect(neigh);
next = now + neigh->parms->delay_probe_time;
} else {
neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->nud_state = NUD_STALE;
neigh->updated = jiffies;
neigh_suspect(neigh);
notify = 1;
}
} else if (state & NUD_DELAY) {
if (time_before_eq(now,
neigh->confirmed + neigh->parms->delay_probe_time)) {
neigh_dbg(2, "neigh %p is now reachable\n", neigh);
neigh->nud_state = NUD_REACHABLE;
neigh->updated = jiffies;
neigh_connect(neigh);
notify = 1;
next = neigh->confirmed + neigh->parms->reachable_time;
} else {
neigh_dbg(2, "neigh %p is probed\n", neigh);
neigh->nud_state = NUD_PROBE;
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
next = now + neigh->parms->retrans_time;
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
next = now + neigh->parms->retrans_time;
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
neigh->nud_state = NUD_FAILED;
notify = 1;
neigh_invalidate(neigh);
goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
if (time_before(next, jiffies + HZ/2))
next = jiffies + HZ/2;
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
if (neigh_probe_enable) {
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE | NUD_STALE))
neigh_probe(neigh);
} else if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
neigh_probe(neigh);
} else {
out:
write_unlock(&neigh->lock);
}
if (notify)
neigh_update_notify(neigh);
neigh_release(neigh);
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
int rc;
bool immediate_probe = false;
write_lock_bh(&neigh->lock);
rc = 0;
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
if (neigh->dead)
goto out_dead;
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
unsigned long next, now = jiffies;
atomic_set(&neigh->probes, neigh->parms->ucast_probes);
net: neigh: fix multiple neigh timer scheduling commit 071c37983d99da07797294ea78e9da1a6e287144 upstream. Neigh timer can be scheduled multiple times from userspace adding multiple neigh entries and forcing the neigh timer scheduling passing NTF_USE in the netlink requests. This will result in a refcount leak and in the following dump stack: [ 32.465295] NEIGH: BUG, double timer add, state is 8 [ 32.465308] CPU: 0 PID: 416 Comm: double_timer_ad Not tainted 5.2.0+ #65 [ 32.465311] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-2.fc30 04/01/2014 [ 32.465313] Call Trace: [ 32.465318] dump_stack+0x7c/0xc0 [ 32.465323] __neigh_event_send+0x20c/0x880 [ 32.465326] ? ___neigh_create+0x846/0xfb0 [ 32.465329] ? neigh_lookup+0x2a9/0x410 [ 32.465332] ? neightbl_fill_info.constprop.0+0x800/0x800 [ 32.465334] neigh_add+0x4f8/0x5e0 [ 32.465337] ? neigh_xmit+0x620/0x620 [ 32.465341] ? find_held_lock+0x85/0xa0 [ 32.465345] rtnetlink_rcv_msg+0x204/0x570 [ 32.465348] ? rtnl_dellink+0x450/0x450 [ 32.465351] ? mark_held_locks+0x90/0x90 [ 32.465354] ? match_held_lock+0x1b/0x230 [ 32.465357] netlink_rcv_skb+0xc4/0x1d0 [ 32.465360] ? rtnl_dellink+0x450/0x450 [ 32.465363] ? netlink_ack+0x420/0x420 [ 32.465366] ? netlink_deliver_tap+0x115/0x560 [ 32.465369] ? __alloc_skb+0xc9/0x2f0 [ 32.465372] netlink_unicast+0x270/0x330 [ 32.465375] ? netlink_attachskb+0x2f0/0x2f0 [ 32.465378] netlink_sendmsg+0x34f/0x5a0 [ 32.465381] ? netlink_unicast+0x330/0x330 [ 32.465385] ? move_addr_to_kernel.part.0+0x20/0x20 [ 32.465388] ? netlink_unicast+0x330/0x330 [ 32.465391] sock_sendmsg+0x91/0xa0 [ 32.465394] ___sys_sendmsg+0x407/0x480 [ 32.465397] ? copy_msghdr_from_user+0x200/0x200 [ 32.465401] ? _raw_spin_unlock_irqrestore+0x37/0x40 [ 32.465404] ? lockdep_hardirqs_on+0x17d/0x250 [ 32.465407] ? __wake_up_common_lock+0xcb/0x110 [ 32.465410] ? __wake_up_common+0x230/0x230 [ 32.465413] ? netlink_bind+0x3e1/0x490 [ 32.465416] ? netlink_setsockopt+0x540/0x540 [ 32.465420] ? __fget_light+0x9c/0xf0 [ 32.465423] ? sockfd_lookup_light+0x8c/0xb0 [ 32.465426] __sys_sendmsg+0xa5/0x110 [ 32.465429] ? __ia32_sys_shutdown+0x30/0x30 [ 32.465432] ? __fd_install+0xe1/0x2c0 [ 32.465435] ? lockdep_hardirqs_off+0xb5/0x100 [ 32.465438] ? mark_held_locks+0x24/0x90 [ 32.465441] ? do_syscall_64+0xf/0x270 [ 32.465444] do_syscall_64+0x63/0x270 [ 32.465448] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix the issue unscheduling neigh_timer if selected entry is in 'IN_TIMER' receiving a netlink request with NTF_USE flag set Change-Id: Ie3b230fa2876d99ea8a10de49d3d2134a857dd36 Reported-by: Marek Majkowski <marek@cloudflare.com> Fixes: 0c5c2d308906 ("neigh: Allow for user space users of the neighbour table") Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com> Reviewed-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
2019-07-14 21:36:11 +00:00
neigh_del_timer(neigh);
neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
next = now + max(neigh->parms->retrans_time, HZ/2);
neigh_add_timer(neigh, next);
immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
kfree_skb(skb);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
net: neigh: fix multiple neigh timer scheduling commit 071c37983d99da07797294ea78e9da1a6e287144 upstream. Neigh timer can be scheduled multiple times from userspace adding multiple neigh entries and forcing the neigh timer scheduling passing NTF_USE in the netlink requests. This will result in a refcount leak and in the following dump stack: [ 32.465295] NEIGH: BUG, double timer add, state is 8 [ 32.465308] CPU: 0 PID: 416 Comm: double_timer_ad Not tainted 5.2.0+ #65 [ 32.465311] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-2.fc30 04/01/2014 [ 32.465313] Call Trace: [ 32.465318] dump_stack+0x7c/0xc0 [ 32.465323] __neigh_event_send+0x20c/0x880 [ 32.465326] ? ___neigh_create+0x846/0xfb0 [ 32.465329] ? neigh_lookup+0x2a9/0x410 [ 32.465332] ? neightbl_fill_info.constprop.0+0x800/0x800 [ 32.465334] neigh_add+0x4f8/0x5e0 [ 32.465337] ? neigh_xmit+0x620/0x620 [ 32.465341] ? find_held_lock+0x85/0xa0 [ 32.465345] rtnetlink_rcv_msg+0x204/0x570 [ 32.465348] ? rtnl_dellink+0x450/0x450 [ 32.465351] ? mark_held_locks+0x90/0x90 [ 32.465354] ? match_held_lock+0x1b/0x230 [ 32.465357] netlink_rcv_skb+0xc4/0x1d0 [ 32.465360] ? rtnl_dellink+0x450/0x450 [ 32.465363] ? netlink_ack+0x420/0x420 [ 32.465366] ? netlink_deliver_tap+0x115/0x560 [ 32.465369] ? __alloc_skb+0xc9/0x2f0 [ 32.465372] netlink_unicast+0x270/0x330 [ 32.465375] ? netlink_attachskb+0x2f0/0x2f0 [ 32.465378] netlink_sendmsg+0x34f/0x5a0 [ 32.465381] ? netlink_unicast+0x330/0x330 [ 32.465385] ? move_addr_to_kernel.part.0+0x20/0x20 [ 32.465388] ? netlink_unicast+0x330/0x330 [ 32.465391] sock_sendmsg+0x91/0xa0 [ 32.465394] ___sys_sendmsg+0x407/0x480 [ 32.465397] ? copy_msghdr_from_user+0x200/0x200 [ 32.465401] ? _raw_spin_unlock_irqrestore+0x37/0x40 [ 32.465404] ? lockdep_hardirqs_on+0x17d/0x250 [ 32.465407] ? __wake_up_common_lock+0xcb/0x110 [ 32.465410] ? __wake_up_common+0x230/0x230 [ 32.465413] ? netlink_bind+0x3e1/0x490 [ 32.465416] ? netlink_setsockopt+0x540/0x540 [ 32.465420] ? __fget_light+0x9c/0xf0 [ 32.465423] ? sockfd_lookup_light+0x8c/0xb0 [ 32.465426] __sys_sendmsg+0xa5/0x110 [ 32.465429] ? __ia32_sys_shutdown+0x30/0x30 [ 32.465432] ? __fd_install+0xe1/0x2c0 [ 32.465435] ? lockdep_hardirqs_off+0xb5/0x100 [ 32.465438] ? mark_held_locks+0x24/0x90 [ 32.465441] ? do_syscall_64+0xf/0x270 [ 32.465444] do_syscall_64+0x63/0x270 [ 32.465448] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix the issue unscheduling neigh_timer if selected entry is in 'IN_TIMER' receiving a netlink request with NTF_USE flag set Change-Id: Ie3b230fa2876d99ea8a10de49d3d2134a857dd36 Reported-by: Marek Majkowski <marek@cloudflare.com> Fixes: 0c5c2d308906 ("neigh: Allow for user space users of the neighbour table") Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com> Reviewed-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
2019-07-14 21:36:11 +00:00
neigh_del_timer(neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
neigh_add_timer(neigh,
jiffies + neigh->parms->delay_probe_time);
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
while (neigh->arp_queue_len_bytes + skb->truesize >
neigh->parms->queue_len_bytes) {
struct sk_buff *buff;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
buff = __skb_dequeue(&neigh->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
if (immediate_probe)
neigh_probe(neigh);
else
write_unlock(&neigh->lock);
local_bh_enable();
return rc;
out_dead:
if (neigh->nud_state & NUD_STALE)
goto out_unlock_bh;
write_unlock_bh(&neigh->lock);
kfree_skb(skb);
return 1;
}
EXPORT_SYMBOL(__neigh_event_send);
static void neigh_update_hhs(struct neighbour *neigh)
{
struct hh_cache *hh;
void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
= NULL;
if (neigh->dev->header_ops)
update = neigh->dev->header_ops->cache_update;
if (update) {
hh = &neigh->hh;
if (hh->hh_len) {
write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
write_sequnlock_bh(&hh->hh_lock);
}
}
}
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
-- flags
NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
if it is different.
NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
lladdr instead of overriding it
if it is different.
It also allows to retain current state
if lladdr is unchanged.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
a router.
Caller MUST hold reference count on the entry.
*/
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
u32 flags)
{
u8 old;
int err;
int notify = 0;
struct net_device *dev;
int update_isrouter = 0;
write_lock_bh(&neigh->lock);
dev = neigh->dev;
old = neigh->nud_state;
err = -EPERM;
if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
if (neigh->dead)
goto out;
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
neigh_suspect(neigh);
neigh->nud_state = new;
err = 0;
notify = old & NUD_VALID;
if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
(new & NUD_FAILED)) {
neigh_invalidate(neigh);
notify = 1;
}
goto out;
}
/* Compare new lladdr with cached one */
if (!dev->addr_len) {
/* First case: device needs no address. */
lladdr = neigh->ha;
} else if (lladdr) {
/* The second case: if something is already cached
and a new address is proposed:
- compare new & old
- if they are different, check override flag
*/
if ((old & NUD_VALID) &&
!memcmp(lladdr, neigh->ha, dev->addr_len))
lladdr = neigh->ha;
} else {
/* No address is supplied; if we know something,
use it, otherwise discard the request.
*/
err = -EINVAL;
if (!(old & NUD_VALID))
goto out;
lladdr = neigh->ha;
}
/* Update confirmed timestamp for neighbour entry after we
* received ARP packet even if it doesn't change IP to MAC binding.
*/
if (new & NUD_CONNECTED)
neigh->confirmed = jiffies;
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
err = 0;
update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
if (old & NUD_VALID) {
if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
update_isrouter = 0;
if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
(old & NUD_CONNECTED)) {
lladdr = neigh->ha;
new = NUD_STALE;
} else
goto out;
} else {
if (lladdr == neigh->ha && new == NUD_STALE &&
((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
(old & NUD_CONNECTED))
)
new = old;
}
}
/* Update timestamp only once we know we will make a change to the
neighbour: update neigh timestamps iff update is effective [ Upstream commit 77d7123342dcf6442341b67816321d71da8b2b16 ] It's a common practice to send gratuitous ARPs after moving an IP address to another device to speed up healing of a service. To fulfill service availability constraints, the timing of network peers updating their caches to point to a new location of an IP address can be particularly important. Sometimes neigh_update calls won't touch neither lladdr nor state, for example if an update arrives in locktime interval. The neigh->updated value is tested by the protocol specific neigh code, which in turn will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the call to neigh_update() or not. As a result, we may effectively ignore the update request, bailing out of touching the neigh entry, except that we still bump its timestamps inside neigh_update. This may be a problem for updates arriving in quick succession. For example, consider the following scenario: A service is moved to another device with its IP address. The new device sends three gratuitous ARP requests into the network with ~1 seconds interval between them. Just before the first request arrives to one of network peer nodes, its neigh entry for the IP address transitions from STALE to DELAY. This transition, among other things, updates neigh->updated. Once the kernel receives the first gratuitous ARP, it ignores it because its arrival time is inside the locktime interval. The kernel still bumps neigh->updated. Then the second gratuitous ARP request arrives, and it's also ignored because it's still in the (new) locktime interval. Same happens for the third request. The node eventually heals itself (after delay_first_probe_time seconds since the initial transition to DELAY state), but it just wasted some time and require a new ARP request/reply round trip. This unfortunate behaviour both puts more load on the network, as well as reduces service availability. This patch changes neigh_update so that it bumps neigh->updated (as well as neigh->confirmed) only once we are sure that either lladdr or entry state will change). In the scenario described above, it means that the second gratuitous ARP request will actually update the entry lladdr. Ideally, we would update the neigh entry on the very first gratuitous ARP request. The locktime mechanism is designed to ignore ARP updates in a short timeframe after a previous ARP update was honoured by the kernel layer. This would require tracking timestamps for state transitions separately from timestamps when actual updates are received. This would probably involve changes in neighbour struct. Therefore, the patch doesn't tackle the issue of the first gratuitous APR ignored, leaving it for a follow-up. Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2017-05-16 15:44:24 +00:00
* neighbour entry. Otherwise we risk to move the locktime window with
* noop updates and ignore relevant ARP updates.
*/
if (new != old || lladdr != neigh->ha)
neighbour: update neigh timestamps iff update is effective [ Upstream commit 77d7123342dcf6442341b67816321d71da8b2b16 ] It's a common practice to send gratuitous ARPs after moving an IP address to another device to speed up healing of a service. To fulfill service availability constraints, the timing of network peers updating their caches to point to a new location of an IP address can be particularly important. Sometimes neigh_update calls won't touch neither lladdr nor state, for example if an update arrives in locktime interval. The neigh->updated value is tested by the protocol specific neigh code, which in turn will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the call to neigh_update() or not. As a result, we may effectively ignore the update request, bailing out of touching the neigh entry, except that we still bump its timestamps inside neigh_update. This may be a problem for updates arriving in quick succession. For example, consider the following scenario: A service is moved to another device with its IP address. The new device sends three gratuitous ARP requests into the network with ~1 seconds interval between them. Just before the first request arrives to one of network peer nodes, its neigh entry for the IP address transitions from STALE to DELAY. This transition, among other things, updates neigh->updated. Once the kernel receives the first gratuitous ARP, it ignores it because its arrival time is inside the locktime interval. The kernel still bumps neigh->updated. Then the second gratuitous ARP request arrives, and it's also ignored because it's still in the (new) locktime interval. Same happens for the third request. The node eventually heals itself (after delay_first_probe_time seconds since the initial transition to DELAY state), but it just wasted some time and require a new ARP request/reply round trip. This unfortunate behaviour both puts more load on the network, as well as reduces service availability. This patch changes neigh_update so that it bumps neigh->updated (as well as neigh->confirmed) only once we are sure that either lladdr or entry state will change). In the scenario described above, it means that the second gratuitous ARP request will actually update the entry lladdr. Ideally, we would update the neigh entry on the very first gratuitous ARP request. The locktime mechanism is designed to ignore ARP updates in a short timeframe after a previous ARP update was honoured by the kernel layer. This would require tracking timestamps for state transitions separately from timestamps when actual updates are received. This would probably involve changes in neighbour struct. Therefore, the patch doesn't tackle the issue of the first gratuitous APR ignored, leaving it for a follow-up. Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <alexander.levin@microsoft.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2017-05-16 15:44:24 +00:00
neigh->updated = jiffies;
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_PROBE)
atomic_set(&neigh->probes, 0);
if (new & NUD_IN_TIMER)
neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
neigh->parms->reachable_time :
0)));
neigh->nud_state = new;
notify = 1;
}
if (lladdr != neigh->ha) {
write_seqlock(&neigh->ha_lock);
memcpy(&neigh->ha, lladdr, dev->addr_len);
write_sequnlock(&neigh->ha_lock);
neigh_update_hhs(neigh);
if (!(new & NUD_CONNECTED))
neigh->confirmed = jiffies -
(neigh->parms->base_reachable_time << 1);
notify = 1;
}
if (new == old)
goto out;
if (new & NUD_CONNECTED)
neigh_connect(neigh);
else
neigh_suspect(neigh);
if (!(old & NUD_VALID)) {
struct sk_buff *skb;
/* Again: avoid dead loop if something went wrong */
while (neigh->nud_state & NUD_VALID &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
struct dst_entry *dst = skb_dst(skb);
struct neighbour *n2, *n1 = neigh;
write_unlock_bh(&neigh->lock);
neigh: fix rcu splat in neigh_update() when use dst_get_neighbour to get neighbour, we need rcu_read_lock to protect, since dst_get_neighbour uses rcu_dereference. The bug was reported by Ari Savolainen <ari.m.savolainen@gmail.com> [ 105.612095] [ 105.612096] =================================================== [ 105.612100] [ INFO: suspicious rcu_dereference_check() usage. ] [ 105.612101] --------------------------------------------------- [ 105.612103] include/net/dst.h:91 invoked rcu_dereference_check() without protection! [ 105.612105] [ 105.612106] other info that might help us debug this: [ 105.612106] [ 105.612108] [ 105.612108] rcu_scheduler_active = 1, debug_locks = 0 [ 105.612110] 1 lock held by dnsmasq/2618: [ 105.612111] #0: (rtnl_mutex){+.+.+.}, at: [<ffffffff815df8c7>] rtnl_lock+0x17/0x20 [ 105.612120] [ 105.612121] stack backtrace: [ 105.612123] Pid: 2618, comm: dnsmasq Not tainted 3.1.0-rc1 #41 [ 105.612125] Call Trace: [ 105.612129] [<ffffffff810ccdcb>] lockdep_rcu_dereference+0xbb/0xc0 [ 105.612132] [<ffffffff815dc5a9>] neigh_update+0x4f9/0x5f0 [ 105.612135] [<ffffffff815da001>] ? neigh_lookup+0xe1/0x220 [ 105.612139] [<ffffffff81639298>] arp_req_set+0xb8/0x230 [ 105.612142] [<ffffffff8163a59f>] arp_ioctl+0x1bf/0x310 [ 105.612146] [<ffffffff810baa40>] ? lock_hrtimer_base.isra.26+0x30/0x60 [ 105.612150] [<ffffffff8163fb75>] inet_ioctl+0x85/0x90 [ 105.612154] [<ffffffff815b5520>] sock_do_ioctl+0x30/0x70 [ 105.612157] [<ffffffff815b55d3>] sock_ioctl+0x73/0x280 [ 105.612162] [<ffffffff811b7698>] do_vfs_ioctl+0x98/0x570 [ 105.612165] [<ffffffff811a5c40>] ? fget_light+0x340/0x3a0 [ 105.612168] [<ffffffff811b7bbf>] sys_ioctl+0x4f/0x80 [ 105.612172] [<ffffffff816fdcab>] system_call_fastpath+0x16/0x1b Reported-by: Ari Savolainen <ari.m.savolainen@gmail.com> Signed-off-by: RongQing <roy.qing.li@gmail.com> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-10-17 22:32:42 +00:00
rcu_read_lock();
/* Why not just use 'neigh' as-is? The problem is that
* things such as shaper, eql, and sch_teql can end up
* using alternative, different, neigh objects to output
* the packet in the output path. So what we need to do
* here is re-lookup the top-level neigh in the path so
* we can reinject the packet there.
*/
n2 = NULL;
if (dst) {
n2 = dst_neigh_lookup_skb(dst, skb);
if (n2)
n1 = n2;
}
n1->output(n1, skb);
if (n2)
neigh_release(n2);
neigh: fix rcu splat in neigh_update() when use dst_get_neighbour to get neighbour, we need rcu_read_lock to protect, since dst_get_neighbour uses rcu_dereference. The bug was reported by Ari Savolainen <ari.m.savolainen@gmail.com> [ 105.612095] [ 105.612096] =================================================== [ 105.612100] [ INFO: suspicious rcu_dereference_check() usage. ] [ 105.612101] --------------------------------------------------- [ 105.612103] include/net/dst.h:91 invoked rcu_dereference_check() without protection! [ 105.612105] [ 105.612106] other info that might help us debug this: [ 105.612106] [ 105.612108] [ 105.612108] rcu_scheduler_active = 1, debug_locks = 0 [ 105.612110] 1 lock held by dnsmasq/2618: [ 105.612111] #0: (rtnl_mutex){+.+.+.}, at: [<ffffffff815df8c7>] rtnl_lock+0x17/0x20 [ 105.612120] [ 105.612121] stack backtrace: [ 105.612123] Pid: 2618, comm: dnsmasq Not tainted 3.1.0-rc1 #41 [ 105.612125] Call Trace: [ 105.612129] [<ffffffff810ccdcb>] lockdep_rcu_dereference+0xbb/0xc0 [ 105.612132] [<ffffffff815dc5a9>] neigh_update+0x4f9/0x5f0 [ 105.612135] [<ffffffff815da001>] ? neigh_lookup+0xe1/0x220 [ 105.612139] [<ffffffff81639298>] arp_req_set+0xb8/0x230 [ 105.612142] [<ffffffff8163a59f>] arp_ioctl+0x1bf/0x310 [ 105.612146] [<ffffffff810baa40>] ? lock_hrtimer_base.isra.26+0x30/0x60 [ 105.612150] [<ffffffff8163fb75>] inet_ioctl+0x85/0x90 [ 105.612154] [<ffffffff815b5520>] sock_do_ioctl+0x30/0x70 [ 105.612157] [<ffffffff815b55d3>] sock_ioctl+0x73/0x280 [ 105.612162] [<ffffffff811b7698>] do_vfs_ioctl+0x98/0x570 [ 105.612165] [<ffffffff811a5c40>] ? fget_light+0x340/0x3a0 [ 105.612168] [<ffffffff811b7bbf>] sys_ioctl+0x4f/0x80 [ 105.612172] [<ffffffff816fdcab>] system_call_fastpath+0x16/0x1b Reported-by: Ari Savolainen <ari.m.savolainen@gmail.com> Signed-off-by: RongQing <roy.qing.li@gmail.com> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-10-17 22:32:42 +00:00
rcu_read_unlock();
write_lock_bh(&neigh->lock);
}
__skb_queue_purge(&neigh->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
(neigh->flags | NTF_ROUTER) :
(neigh->flags & ~NTF_ROUTER);
}
write_unlock_bh(&neigh->lock);
if (notify)
neigh_update_notify(neigh);
return err;
}
EXPORT_SYMBOL(neigh_update);
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
u8 *lladdr, void *saddr,
struct net_device *dev)
{
struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
lladdr || !dev->addr_len);
if (neigh) {
if (neigh_probe_enable) {
if (!(neigh->nud_state == NUD_REACHABLE)) {
neigh_update(neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_OVERRIDE);
write_lock(&neigh->lock);
neigh_probe(neigh);
neigh_update_notify(neigh);
}
} else {
neigh_update(neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_OVERRIDE);
}
}
return neigh;
}
EXPORT_SYMBOL(neigh_event_ns);
/* called with read_lock_bh(&n->lock); */
static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
{
struct net_device *dev = dst->dev;
__be16 prot = dst->ops->protocol;
struct hh_cache *hh = &n->hh;
write_lock_bh(&n->lock);
/* Only one thread can come in here and initialize the
* hh_cache entry.
*/
if (!hh->hh_len)
dev->header_ops->cache(n, hh, prot);
write_unlock_bh(&n->lock);
}
/* This function can be used in contexts, where only old dev_queue_xmit
* worked, f.e. if you want to override normal output path (eql, shaper),
* but resolution is not made yet.
*/
int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
__skb_pull(skb, skb_network_offset(skb));
if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
skb->len) < 0 &&
dev_rebuild_header(skb))
return 0;
return dev_queue_xmit(skb);
}
EXPORT_SYMBOL(neigh_compat_output);
/* Slow and careful. */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
int rc = 0;
if (!dst)
goto discard;
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
if (dev->header_ops->cache && !neigh->hh.hh_len)
neigh_hh_init(neigh, dst);
do {
__skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
rc = dev_queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
discard:
neigh_dbg(1, "%s: dst=%pK neigh=%pK\n", __func__, dst, neigh);
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
goto out;
}
EXPORT_SYMBOL(neigh_resolve_output);
/* As fast as possible without hh cache */
int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct net_device *dev = neigh->dev;
unsigned int seq;
int err;
do {
__skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
err = dev_queue_xmit(skb);
else {
err = -EINVAL;
kfree_skb(skb);
}
return err;
}
EXPORT_SYMBOL(neigh_connected_output);
int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
{
return dev_queue_xmit(skb);
}
EXPORT_SYMBOL(neigh_direct_output);
static void neigh_proxy_process(unsigned long arg)
{
struct neigh_table *tbl = (struct neigh_table *)arg;
long sched_next = 0;
unsigned long now = jiffies;
struct sk_buff *skb, *n;
spin_lock(&tbl->proxy_queue.lock);
skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
long tdif = NEIGH_CB(skb)->sched_next - now;
if (tdif <= 0) {
struct net_device *dev = skb->dev;
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 19:32:42 +00:00
__skb_unlink(skb, &tbl->proxy_queue);
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 19:32:42 +00:00
if (tbl->proxy_redo && netif_running(dev)) {
rcu_read_lock();
tbl->proxy_redo(skb);
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 19:32:42 +00:00
rcu_read_unlock();
} else {
kfree_skb(skb);
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 19:32:42 +00:00
}
dev_put(dev);
} else if (!sched_next || tdif < sched_next)
sched_next = tdif;
}
del_timer(&tbl->proxy_timer);
if (sched_next)
mod_timer(&tbl->proxy_timer, jiffies + sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
unsigned long now = jiffies;
unsigned long sched_next = now + (net_random() % p->proxy_delay);
if (tbl->proxy_queue.qlen > p->proxy_qlen) {
kfree_skb(skb);
return;
}
NEIGH_CB(skb)->sched_next = sched_next;
NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
if (del_timer(&tbl->proxy_timer)) {
if (time_before(tbl->proxy_timer.expires, sched_next))
sched_next = tbl->proxy_timer.expires;
}
skb_dst_drop(skb);
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
mod_timer(&tbl->proxy_timer, sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
EXPORT_SYMBOL(pneigh_enqueue);
static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
struct net *net, int ifindex)
{
struct neigh_parms *p;
for (p = &tbl->parms; p; p = p->next) {
if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
(!p->dev && !ifindex && net_eq(net, &init_net)))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
return p;
}
return NULL;
}
struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
struct neigh_table *tbl)
{
struct neigh_parms *p;
struct net *net = dev_net(dev);
const struct net_device_ops *ops = dev->netdev_ops;
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
p->tbl = tbl;
atomic_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(p->base_reachable_time);
dev_hold(dev);
p->dev = dev;
write_pnet(&p->net, hold_net(net));
p->sysctl_table = NULL;
if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
release_net(net);
dev_put(dev);
kfree(p);
return NULL;
}
write_lock_bh(&tbl->lock);
p->next = tbl->parms.next;
tbl->parms.next = p;
write_unlock_bh(&tbl->lock);
}
return p;
}
EXPORT_SYMBOL(neigh_parms_alloc);
static void neigh_rcu_free_parms(struct rcu_head *head)
{
struct neigh_parms *parms =
container_of(head, struct neigh_parms, rcu_head);
neigh_parms_put(parms);
}
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
{
struct neigh_parms **p;
if (!parms || parms == &tbl->parms)
return;
write_lock_bh(&tbl->lock);
for (p = &tbl->parms.next; *p; p = &(*p)->next) {
if (*p == parms) {
*p = parms->next;
parms->dead = 1;
write_unlock_bh(&tbl->lock);
if (parms->dev)
dev_put(parms->dev);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
return;
}
}
write_unlock_bh(&tbl->lock);
neigh_dbg(1, "%s: not found\n", __func__);
}
EXPORT_SYMBOL(neigh_parms_release);
static void neigh_parms_destroy(struct neigh_parms *parms)
{
release_net(neigh_parms_net(parms));
kfree(parms);
}
static struct lock_class_key neigh_table_proxy_queue_class;
static void neigh_table_init_no_netlink(struct neigh_table *tbl)
{
unsigned long now = jiffies;
unsigned long phsize;
write_pnet(&tbl->parms.net, &init_net);
atomic_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
neigh_rand_reach_time(tbl->parms.base_reachable_time);
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
#ifdef CONFIG_PROC_FS
if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,
&neigh_stat_seq_fops, tbl))
panic("cannot create neighbour proc dir entry");
#endif
RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
if (!tbl->nht || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
if (!tbl->entry_size)
tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
tbl->key_len, NEIGH_PRIV_ALIGN);
else
WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
rwlock_init(&tbl->lock);
INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
tbl->parms.reachable_time);
setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
skb_queue_head_init_class(&tbl->proxy_queue,
&neigh_table_proxy_queue_class);
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
[NEIGH]: Fix IP-over-ATM and ARP interaction. The classical IP over ATM code maintains its own IPv4 <-> <ATM stuff> ARP table, using the standard neighbour-table code. The neigh_table_init function adds this neighbour table to a linked list of all neighbor tables which is used by the functions neigh_delete() neigh_add() and neightbl_set(), all called by the netlink code. Once the ATM neighbour table is added to the list, there are two tables with family == AF_INET there, and ARP entries sent via netlink go into the first table with matching family. This is indeterminate and often wrong. To see the bug, on a kernel with CLIP enabled, create a standard IPv4 ARP entry by pinging an unused address on a local subnet. Then attempt to complete that entry by doing ip neigh replace <ip address> lladdr <some mac address> nud reachable Looking at the ARP tables by using ip neigh show will reveal two ARP entries for the same address. One of these can be found in /proc/net/arp, and the other in /proc/net/atm/arp. This patch adds a new function, neigh_table_init_no_netlink() which does everything the neigh_table_init() does, except add the table to the netlink all-arp-tables chain. In addition neigh_table_init() has a check that all tables on the chain have a distinct address family. The init call in clip.c is changed to call neigh_table_init_no_netlink(). Since ATM ARP tables are rather more complicated than can currently be handled by the available rtattrs in the netlink protocol, no functionality is lost by this patch, and non-ATM ARP manipulation via netlink is rescued. A more complete solution would involve a rtattr for ATM ARP entries and some way for the netlink code to give neigh_add and friends more information than just address family with which to find the correct ARP table. [ I've changed the assertion checking in neigh_table_init() to not use BUG_ON() while holding neigh_tbl_lock. Instead we remember that we found an existing tbl with the same family, and after dropping the lock we'll give a diagnostic kernel log message and a stack dump. -DaveM ] Signed-off-by: Simon Kelley <simon@thekelleys.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-05-12 21:56:08 +00:00
}
void neigh_table_init(struct neigh_table *tbl)
{
struct neigh_table *tmp;
neigh_table_init_no_netlink(tbl);
write_lock(&neigh_tbl_lock);
[NEIGH]: Fix IP-over-ATM and ARP interaction. The classical IP over ATM code maintains its own IPv4 <-> <ATM stuff> ARP table, using the standard neighbour-table code. The neigh_table_init function adds this neighbour table to a linked list of all neighbor tables which is used by the functions neigh_delete() neigh_add() and neightbl_set(), all called by the netlink code. Once the ATM neighbour table is added to the list, there are two tables with family == AF_INET there, and ARP entries sent via netlink go into the first table with matching family. This is indeterminate and often wrong. To see the bug, on a kernel with CLIP enabled, create a standard IPv4 ARP entry by pinging an unused address on a local subnet. Then attempt to complete that entry by doing ip neigh replace <ip address> lladdr <some mac address> nud reachable Looking at the ARP tables by using ip neigh show will reveal two ARP entries for the same address. One of these can be found in /proc/net/arp, and the other in /proc/net/atm/arp. This patch adds a new function, neigh_table_init_no_netlink() which does everything the neigh_table_init() does, except add the table to the netlink all-arp-tables chain. In addition neigh_table_init() has a check that all tables on the chain have a distinct address family. The init call in clip.c is changed to call neigh_table_init_no_netlink(). Since ATM ARP tables are rather more complicated than can currently be handled by the available rtattrs in the netlink protocol, no functionality is lost by this patch, and non-ATM ARP manipulation via netlink is rescued. A more complete solution would involve a rtattr for ATM ARP entries and some way for the netlink code to give neigh_add and friends more information than just address family with which to find the correct ARP table. [ I've changed the assertion checking in neigh_table_init() to not use BUG_ON() while holding neigh_tbl_lock. Instead we remember that we found an existing tbl with the same family, and after dropping the lock we'll give a diagnostic kernel log message and a stack dump. -DaveM ] Signed-off-by: Simon Kelley <simon@thekelleys.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-05-12 21:56:08 +00:00
for (tmp = neigh_tables; tmp; tmp = tmp->next) {
if (tmp->family == tbl->family)
break;
}
tbl->next = neigh_tables;
neigh_tables = tbl;
write_unlock(&neigh_tbl_lock);
[NEIGH]: Fix IP-over-ATM and ARP interaction. The classical IP over ATM code maintains its own IPv4 <-> <ATM stuff> ARP table, using the standard neighbour-table code. The neigh_table_init function adds this neighbour table to a linked list of all neighbor tables which is used by the functions neigh_delete() neigh_add() and neightbl_set(), all called by the netlink code. Once the ATM neighbour table is added to the list, there are two tables with family == AF_INET there, and ARP entries sent via netlink go into the first table with matching family. This is indeterminate and often wrong. To see the bug, on a kernel with CLIP enabled, create a standard IPv4 ARP entry by pinging an unused address on a local subnet. Then attempt to complete that entry by doing ip neigh replace <ip address> lladdr <some mac address> nud reachable Looking at the ARP tables by using ip neigh show will reveal two ARP entries for the same address. One of these can be found in /proc/net/arp, and the other in /proc/net/atm/arp. This patch adds a new function, neigh_table_init_no_netlink() which does everything the neigh_table_init() does, except add the table to the netlink all-arp-tables chain. In addition neigh_table_init() has a check that all tables on the chain have a distinct address family. The init call in clip.c is changed to call neigh_table_init_no_netlink(). Since ATM ARP tables are rather more complicated than can currently be handled by the available rtattrs in the netlink protocol, no functionality is lost by this patch, and non-ATM ARP manipulation via netlink is rescued. A more complete solution would involve a rtattr for ATM ARP entries and some way for the netlink code to give neigh_add and friends more information than just address family with which to find the correct ARP table. [ I've changed the assertion checking in neigh_table_init() to not use BUG_ON() while holding neigh_tbl_lock. Instead we remember that we found an existing tbl with the same family, and after dropping the lock we'll give a diagnostic kernel log message and a stack dump. -DaveM ] Signed-off-by: Simon Kelley <simon@thekelleys.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-05-12 21:56:08 +00:00
if (unlikely(tmp)) {
pr_err("Registering multiple tables for family %d\n",
tbl->family);
[NEIGH]: Fix IP-over-ATM and ARP interaction. The classical IP over ATM code maintains its own IPv4 <-> <ATM stuff> ARP table, using the standard neighbour-table code. The neigh_table_init function adds this neighbour table to a linked list of all neighbor tables which is used by the functions neigh_delete() neigh_add() and neightbl_set(), all called by the netlink code. Once the ATM neighbour table is added to the list, there are two tables with family == AF_INET there, and ARP entries sent via netlink go into the first table with matching family. This is indeterminate and often wrong. To see the bug, on a kernel with CLIP enabled, create a standard IPv4 ARP entry by pinging an unused address on a local subnet. Then attempt to complete that entry by doing ip neigh replace <ip address> lladdr <some mac address> nud reachable Looking at the ARP tables by using ip neigh show will reveal two ARP entries for the same address. One of these can be found in /proc/net/arp, and the other in /proc/net/atm/arp. This patch adds a new function, neigh_table_init_no_netlink() which does everything the neigh_table_init() does, except add the table to the netlink all-arp-tables chain. In addition neigh_table_init() has a check that all tables on the chain have a distinct address family. The init call in clip.c is changed to call neigh_table_init_no_netlink(). Since ATM ARP tables are rather more complicated than can currently be handled by the available rtattrs in the netlink protocol, no functionality is lost by this patch, and non-ATM ARP manipulation via netlink is rescued. A more complete solution would involve a rtattr for ATM ARP entries and some way for the netlink code to give neigh_add and friends more information than just address family with which to find the correct ARP table. [ I've changed the assertion checking in neigh_table_init() to not use BUG_ON() while holding neigh_tbl_lock. Instead we remember that we found an existing tbl with the same family, and after dropping the lock we'll give a diagnostic kernel log message and a stack dump. -DaveM ] Signed-off-by: Simon Kelley <simon@thekelleys.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-05-12 21:56:08 +00:00
dump_stack();
}
}
EXPORT_SYMBOL(neigh_table_init);
int neigh_table_clear(struct neigh_table *tbl)
{
struct neigh_table **tp;
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
write_lock(&neigh_tbl_lock);
for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
if (*tp == tbl) {
*tp = tbl->next;
break;
}
}
write_unlock(&neigh_tbl_lock);
call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
neigh_hash_free_rcu);
tbl->nht = NULL;
kfree(tbl->phash_buckets);
tbl->phash_buckets = NULL;
[NET]: Remove /proc/net/stat/*_arp_cache upon module removal neigh_table_init_no_netlink() creates them, but they aren't removed anywhere. Steps to reproduce: modprobe clip rmmod clip cat /proc/net/stat/clip_arp_cache BUG: unable to handle kernel paging request at virtual address f89d7758 printing eip: c05a99da *pdpt = 0000000000004001 *pde = 0000000004408067 *pte = 0000000000000000 Oops: 0000 [#1] PREEMPT SMP Modules linked in: atm af_packet ipv6 binfmt_misc sbs sbshc fan dock battery backlight ac power_supply parport loop rtc_cmos rtc_core rtc_lib serio_raw button k8temp hwmon amd_rng sr_mod cdrom shpchp pci_hotplug ehci_hcd ohci_hcd uhci_hcd usbcore Pid: 2082, comm: cat Not tainted (2.6.24-rc1-b1d08ac064268d0ae2281e98bf5e82627e0f0c56-bloat #4) EIP: 0060:[<c05a99da>] EFLAGS: 00210256 CPU: 0 EIP is at neigh_stat_seq_next+0x26/0x3f EAX: 00000001 EBX: f89d7600 ECX: c587bf40 EDX: 00000000 ESI: 00000000 EDI: 00000001 EBP: 00000400 ESP: c587bf1c DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 Process cat (pid: 2082, ti=c587b000 task=c5984e10 task.ti=c587b000) Stack: c06228cc c5313790 c049e5c0 0804f000 c45a7b00 c53137b0 00000000 00000000 00000082 00000001 00000000 00000000 00000000 fffffffb c58d6780 c049e437 c45a7b00 c04b1f93 c587bfa0 00000400 0804f000 00000400 0804f000 c04b1f2f Call Trace: [<c049e5c0>] seq_read+0x189/0x281 [<c049e437>] seq_read+0x0/0x281 [<c04b1f93>] proc_reg_read+0x64/0x77 [<c04b1f2f>] proc_reg_read+0x0/0x77 [<c048907e>] vfs_read+0x80/0xd1 [<c0489491>] sys_read+0x41/0x67 [<c04080fa>] sysenter_past_esp+0x6b/0xc1 ======================= Code: e9 ec 8d 05 00 56 8b 11 53 8b 40 70 8b 58 3c eb 29 0f a3 15 80 91 7b c0 19 c0 85 c0 8d 42 01 74 17 89 c6 c1 fe 1f 89 01 89 71 04 <8b> 83 58 01 00 00 f7 d0 8b 04 90 eb 09 89 c2 83 fa 01 7e d2 31 EIP: [<c05a99da>] neigh_stat_seq_next+0x26/0x3f SS:ESP 0068:c587bf1c Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-11-06 05:28:13 +00:00
remove_proc_entry(tbl->id, init_net.proc_net_stat);
free_percpu(tbl->stats);
tbl->stats = NULL;
return 0;
}
EXPORT_SYMBOL(neigh_table_clear);
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *dst_attr;
struct neigh_table *tbl;
struct net_device *dev = NULL;
int err = -EINVAL;
ASSERT_RTNL();
if (nlmsg_len(nlh) < sizeof(*ndm))
goto out;
dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
if (dst_attr == NULL)
goto out;
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
}
}
read_lock(&neigh_tbl_lock);
for (tbl = neigh_tables; tbl; tbl = tbl->next) {
struct neighbour *neigh;
if (tbl->family != ndm->ndm_family)
continue;
read_unlock(&neigh_tbl_lock);
if (nla_len(dst_attr) < tbl->key_len)
goto out;
if (ndm->ndm_flags & NTF_PROXY) {
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
goto out;
}
if (dev == NULL)
goto out;
neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
if (neigh == NULL) {
err = -ENOENT;
goto out;
}
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN);
neigh_release(neigh);
goto out;
}
read_unlock(&neigh_tbl_lock);
err = -EAFNOSUPPORT;
out:
return err;
}
static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct neigh_table *tbl;
struct net_device *dev = NULL;
int err;
ASSERT_RTNL();
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
if (err < 0)
goto out;
err = -EINVAL;
if (tb[NDA_DST] == NULL)
goto out;
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
}
if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
goto out;
}
read_lock(&neigh_tbl_lock);
for (tbl = neigh_tables; tbl; tbl = tbl->next) {
int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
struct neighbour *neigh;
void *dst, *lladdr;
if (tbl->family != ndm->ndm_family)
continue;
read_unlock(&neigh_tbl_lock);
if (nla_len(tb[NDA_DST]) < tbl->key_len)
goto out;
dst = nla_data(tb[NDA_DST]);
lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
if (ndm->ndm_flags & NTF_PROXY) {
struct pneigh_entry *pn;
err = -ENOBUFS;
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
pn = pneigh_lookup(tbl, net, dst, dev, 1);
if (pn) {
pn->flags = ndm->ndm_flags;
err = 0;
}
goto out;
}
if (dev == NULL)
goto out;
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
goto out;
}
neigh = __neigh_lookup_errno(tbl, dst, dev);
if (IS_ERR(neigh)) {
err = PTR_ERR(neigh);
goto out;
}
} else {
if (nlh->nlmsg_flags & NLM_F_EXCL) {
err = -EEXIST;
neigh_release(neigh);
goto out;
}
if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
if (ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
} else
err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
neigh_release(neigh);
goto out;
}
read_unlock(&neigh_tbl_lock);
err = -EAFNOSUPPORT;
out:
return err;
}
static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
{
struct nlattr *nest;
nest = nla_nest_start(skb, NDTA_PARMS);
if (nest == NULL)
return -ENOBUFS;
if ((parms->dev &&
nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) ||
/* approximative value for deprecated QUEUE_LEN (in packets) */
nla_put_u32(skb, NDTPA_QUEUE_LEN,
parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) ||
nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) ||
nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) ||
nla_put_u32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes) ||
nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||
nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
parms->base_reachable_time) ||
nla_put_msecs(skb, NDTPA_GC_STALETIME, parms->gc_staletime) ||
nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
parms->delay_probe_time) ||
nla_put_msecs(skb, NDTPA_RETRANS_TIME, parms->retrans_time) ||
nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay) ||
nla_put_msecs(skb, NDTPA_PROXY_DELAY, parms->proxy_delay) ||
nla_put_msecs(skb, NDTPA_LOCKTIME, parms->locktime))
goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -EMSGSIZE;
}
static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
u32 pid, u32 seq, int type, int flags)
{
struct nlmsghdr *nlh;
struct ndtmsg *ndtmsg;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) ||
nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
goto nla_put_failure;
{
unsigned long now = jiffies;
unsigned int flush_delta = now - tbl->last_flush;
unsigned int rand_delta = now - tbl->last_rand;
struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
.ndtc_entry_size = tbl->entry_size,
.ndtc_entries = atomic_read(&tbl->entries),
.ndtc_last_flush = jiffies_to_msecs(flush_delta),
.ndtc_last_rand = jiffies_to_msecs(rand_delta),
.ndtc_proxy_qlen = tbl->proxy_queue.qlen,
};
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
ndc.ndtc_hash_rnd = nht->hash_rnd[0];
ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
rcu_read_unlock_bh();
if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
goto nla_put_failure;
}
{
int cpu;
struct ndt_stats ndst;
memset(&ndst, 0, sizeof(ndst));
for_each_possible_cpu(cpu) {
struct neigh_statistics *st;
st = per_cpu_ptr(tbl->stats, cpu);
ndst.ndts_allocs += st->allocs;
ndst.ndts_destroys += st->destroys;
ndst.ndts_hash_grows += st->hash_grows;
ndst.ndts_res_failed += st->res_failed;
ndst.ndts_lookups += st->lookups;
ndst.ndts_hits += st->hits;
ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
ndst.ndts_forced_gc_runs += st->forced_gc_runs;
}
if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
goto nla_put_failure;
}
BUG_ON(tbl->parms.dev);
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
goto nla_put_failure;
read_unlock_bh(&tbl->lock);
return nlmsg_end(skb, nlh);
nla_put_failure:
read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int neightbl_fill_param_info(struct sk_buff *skb,
struct neigh_table *tbl,
struct neigh_parms *parms,
u32 pid, u32 seq, int type,
unsigned int flags)
{
struct ndtmsg *ndtmsg;
struct nlmsghdr *nlh;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 ||
neightbl_fill_parms(skb, parms) < 0)
goto errout;
read_unlock_bh(&tbl->lock);
return nlmsg_end(skb, nlh);
errout:
read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
[NDTA_NAME] = { .type = NLA_STRING },
[NDTA_THRESH1] = { .type = NLA_U32 },
[NDTA_THRESH2] = { .type = NLA_U32 },
[NDTA_THRESH3] = { .type = NLA_U32 },
[NDTA_GC_INTERVAL] = { .type = NLA_U64 },
[NDTA_PARMS] = { .type = NLA_NESTED },
};
static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
[NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
[NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
[NDTPA_GC_STALETIME] = { .type = NLA_U64 },
[NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
[NDTPA_RETRANS_TIME] = { .type = NLA_U64 },
[NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
[NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
};
static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
struct nlattr *tb[NDTA_MAX+1];
int err;
err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
nl_neightbl_policy);
if (err < 0)
goto errout;
if (tb[NDTA_NAME] == NULL) {
err = -EINVAL;
goto errout;
}
ndtmsg = nlmsg_data(nlh);
read_lock(&neigh_tbl_lock);
for (tbl = neigh_tables; tbl; tbl = tbl->next) {
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0)
break;
}
if (tbl == NULL) {
err = -ENOENT;
goto errout_locked;
}
/*
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
write_lock_bh(&tbl->lock);
if (tb[NDTA_PARMS]) {
struct nlattr *tbp[NDTPA_MAX+1];
struct neigh_parms *p;
int i, ifindex = 0;
err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
nl_ntbl_parm_policy);
if (err < 0)
goto errout_tbl_lock;
if (tbp[NDTPA_IFINDEX])
ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
p = lookup_neigh_parms(tbl, net, ifindex);
if (p == NULL) {
err = -ENOENT;
goto errout_tbl_lock;
}
for (i = 1; i <= NDTPA_MAX; i++) {
if (tbp[i] == NULL)
continue;
switch (i) {
case NDTPA_QUEUE_LEN:
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
p->queue_len_bytes = nla_get_u32(tbp[i]) *
SKB_TRUESIZE(ETH_FRAME_LEN);
break;
case NDTPA_QUEUE_LENBYTES:
p->queue_len_bytes = nla_get_u32(tbp[i]);
break;
case NDTPA_PROXY_QLEN:
p->proxy_qlen = nla_get_u32(tbp[i]);
break;
case NDTPA_APP_PROBES:
p->app_probes = nla_get_u32(tbp[i]);
break;
case NDTPA_UCAST_PROBES:
p->ucast_probes = nla_get_u32(tbp[i]);
break;
case NDTPA_MCAST_PROBES:
p->mcast_probes = nla_get_u32(tbp[i]);
break;
case NDTPA_BASE_REACHABLE_TIME:
p->base_reachable_time = nla_get_msecs(tbp[i]);
break;
case NDTPA_GC_STALETIME:
p->gc_staletime = nla_get_msecs(tbp[i]);
break;
case NDTPA_DELAY_PROBE_TIME:
p->delay_probe_time = nla_get_msecs(tbp[i]);
break;
case NDTPA_RETRANS_TIME:
p->retrans_time = nla_get_msecs(tbp[i]);
break;
case NDTPA_ANYCAST_DELAY:
p->anycast_delay = nla_get_msecs(tbp[i]);
break;
case NDTPA_PROXY_DELAY:
p->proxy_delay = nla_get_msecs(tbp[i]);
break;
case NDTPA_LOCKTIME:
p->locktime = nla_get_msecs(tbp[i]);
break;
}
}
}
err = -ENOENT;
if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
!net_eq(net, &init_net))
goto errout_tbl_lock;
if (tb[NDTA_THRESH1])
tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
if (tb[NDTA_THRESH2])
tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
if (tb[NDTA_THRESH3])
tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
if (tb[NDTA_GC_INTERVAL])
tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
err = 0;
errout_tbl_lock:
write_unlock_bh(&tbl->lock);
errout_locked:
read_unlock(&neigh_tbl_lock);
errout:
return err;
}
static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
int family, tidx, nidx = 0;
int tbl_skip = cb->args[0];
int neigh_skip = cb->args[1];
struct neigh_table *tbl;
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
read_lock(&neigh_tbl_lock);
for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
struct neigh_parms *p;
if (tidx < tbl_skip || (family && tbl->family != family))
continue;
if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
NLM_F_MULTI) <= 0)
break;
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
for (nidx = 0, p = tbl->parms.next; p; p = p->next) {
if (!net_eq(neigh_parms_net(p), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
continue;
if (nidx < neigh_skip)
goto next;
if (neightbl_fill_param_info(skb, tbl, p,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGHTBL,
NLM_F_MULTI) <= 0)
goto out;
next:
nidx++;
}
neigh_skip = 0;
}
out:
read_unlock(&neigh_tbl_lock);
cb->args[0] = tidx;
cb->args[1] = nidx;
return skb->len;
}
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
u32 pid, u32 seq, int type, unsigned int flags)
{
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndm = nlmsg_data(nlh);
ndm->ndm_family = neigh->ops->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
ndm->ndm_flags = neigh->flags;
ndm->ndm_type = neigh->type;
ndm->ndm_ifindex = neigh->dev->ifindex;
if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
goto nla_put_failure;
read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
if (neigh->nud_state & NUD_VALID) {
char haddr[MAX_ADDR_LEN];
neigh_ha_snapshot(haddr, neigh, neigh->dev);
if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
read_unlock_bh(&neigh->lock);
goto nla_put_failure;
}
}
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
read_unlock_bh(&neigh->lock);
if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
return nlmsg_end(skb, nlh);
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
u32 pid, u32 seq, int type, unsigned int flags,
struct neigh_table *tbl)
{
struct nlmsghdr *nlh;
struct ndmsg *ndm;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndm = nlmsg_data(nlh);
ndm->ndm_family = tbl->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
ndm->ndm_flags = pn->flags | NTF_PROXY;
ndm->ndm_type = RTN_UNICAST;
ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
ndm->ndm_state = NUD_NONE;
if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
goto nla_put_failure;
return nlmsg_end(skb, nlh);
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static void neigh_update_notify(struct neighbour *neigh)
{
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
__neigh_notify(neigh, RTM_NEWNEIGH, 0);
}
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct neighbour *n;
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
struct neigh_hash_table *nht;
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference_bh(n->next)) {
if (!net_eq(dev_net(n->dev), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
continue;
if (idx < s_idx)
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI) <= 0) {
rc = -1;
goto out;
}
next:
idx++;
}
}
rc = skb->len;
out:
rcu_read_unlock_bh();
cb->args[1] = h;
cb->args[2] = idx;
return rc;
}
static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
struct pneigh_entry *n;
struct net *net = sock_net(skb->sk);
int rc, h, s_h = cb->args[3];
int idx, s_idx = idx = cb->args[4];
read_lock_bh(&tbl->lock);
for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
if (pneigh_net(n) != net)
continue;
if (idx < s_idx)
goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI, tbl) <= 0) {
read_unlock_bh(&tbl->lock);
rc = -1;
goto out;
}
next:
idx++;
}
}
read_unlock_bh(&tbl->lock);
rc = skb->len;
out:
cb->args[3] = h;
cb->args[4] = idx;
return rc;
}
static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
struct neigh_table *tbl;
int t, family, s_t;
int proxy = 0;
int err;
read_lock(&neigh_tbl_lock);
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
/* check for full ndmsg structure presence, family member is
* the same for both structures
*/
if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
proxy = 1;
s_t = cb->args[0];
for (tbl = neigh_tables, t = 0; tbl;
tbl = tbl->next, t++) {
if (t < s_t || (family && tbl->family != family))
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
if (proxy)
err = pneigh_dump_table(tbl, skb, cb);
else
err = neigh_dump_table(tbl, skb, cb);
if (err < 0)
break;
}
read_unlock(&neigh_tbl_lock);
cb->args[0] = t;
return skb->len;
}
void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
{
int chain;
struct neigh_hash_table *nht;
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
read_lock(&tbl->lock); /* avoid resizes */
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
n != NULL;
n = rcu_dereference_bh(n->next))
cb(n, cookie);
}
read_unlock(&tbl->lock);
rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_for_each);
/* The tbl->lock must be held as a writer and BH disabled. */
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
int chain;
struct neigh_hash_table *nht;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
struct neighbour __rcu **np;
np = &nht->hash_buckets[chain];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
n->dead = 1;
} else
np = &n->next;
write_unlock(&n->lock);
if (release)
neigh_cleanup_and_release(n);
}
}
}
EXPORT_SYMBOL(__neigh_for_each_release);
#ifdef CONFIG_PROC_FS
static struct neighbour *neigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_hash_table *nht = state->nht;
struct neighbour *n = NULL;
int bucket = state->bucket;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
n = rcu_dereference_bh(nht->hash_buckets[bucket]);
while (n) {
if (!net_eq(dev_net(n->dev), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
goto next;
if (state->neigh_sub_iter) {
loff_t fakep = 0;
void *v;
v = state->neigh_sub_iter(state, n, &fakep);
if (!v)
goto next;
}
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
break;
if (n->nud_state & ~NUD_NOARP)
break;
next:
n = rcu_dereference_bh(n->next);
}
if (n)
break;
}
state->bucket = bucket;
return n;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
struct neighbour *n,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_hash_table *nht = state->nht;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
}
n = rcu_dereference_bh(n->next);
while (1) {
while (n) {
if (!net_eq(dev_net(n->dev), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
goto next;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
goto next;
}
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
break;
if (n->nud_state & ~NUD_NOARP)
break;
next:
n = rcu_dereference_bh(n->next);
}
if (n)
break;
if (++state->bucket >= (1 << nht->hash_shift))
break;
n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
}
if (n && pos)
--(*pos);
return n;
}
static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
{
struct neighbour *n = neigh_get_first(seq);
if (n) {
--(*pos);
while (*pos) {
n = neigh_get_next(seq, n, pos);
if (!n)
break;
}
}
return *pos ? NULL : n;
}
static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
int bucket = state->bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
pn = tbl->phash_buckets[bucket];
while (pn && !net_eq(pneigh_net(pn), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
pn = pn->next;
if (pn)
break;
}
state->bucket = bucket;
return pn;
}
static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
struct pneigh_entry *pn,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
do {
pn = pn->next;
} while (pn && !net_eq(pneigh_net(pn), net));
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
pn = tbl->phash_buckets[state->bucket];
while (pn && !net_eq(pneigh_net(pn), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
pn = pn->next;
if (pn)
break;
}
if (pn && pos)
--(*pos);
return pn;
}
static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
{
struct pneigh_entry *pn = pneigh_get_first(seq);
if (pn) {
--(*pos);
while (*pos) {
pn = pneigh_get_next(seq, pn, pos);
if (!pn)
break;
}
}
return *pos ? NULL : pn;
}
static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
void *rc;
loff_t idxpos = *pos;
rc = neigh_get_idx(seq, &idxpos);
if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
rc = pneigh_get_idx(seq, &idxpos);
return rc;
}
void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
neigh: fix use-after-free read in pneigh_get_next [ Upstream commit f3e92cb8e2eb8c27d109e6fd73d3a69a8c09e288 ] Nine years ago, I added RCU handling to neighbours, not pneighbours. (pneigh are not commonly used) Unfortunately I missed that /proc dump operations would use a common entry and exit point : neigh_seq_start() and neigh_seq_stop() We need to read_lock(tbl->lock) or risk use-after-free while iterating the pneigh structures. We might later convert pneigh to RCU and revert this patch. sysbot reported : BUG: KASAN: use-after-free in pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 Read of size 8 at addr ffff888097f2a700 by task syz-executor.0/9825 CPU: 1 PID: 9825 Comm: syz-executor.0 Not tainted 5.2.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132 pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 neigh_seq_next+0xdb/0x210 net/core/neighbour.c:3240 seq_read+0x9cf/0x1110 fs/seq_file.c:258 proc_reg_read+0x1fc/0x2c0 fs/proc/inode.c:221 do_loop_readv_writev fs/read_write.c:714 [inline] do_loop_readv_writev fs/read_write.c:701 [inline] do_iter_read+0x4a4/0x660 fs/read_write.c:935 vfs_readv+0xf0/0x160 fs/read_write.c:997 kernel_readv fs/splice.c:359 [inline] default_file_splice_read+0x475/0x890 fs/splice.c:414 do_splice_to+0x127/0x180 fs/splice.c:877 splice_direct_to_actor+0x2d2/0x970 fs/splice.c:954 do_splice_direct+0x1da/0x2a0 fs/splice.c:1063 do_sendfile+0x597/0xd00 fs/read_write.c:1464 __do_sys_sendfile64 fs/read_write.c:1525 [inline] __se_sys_sendfile64 fs/read_write.c:1511 [inline] __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4592c9 Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4aab51dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000004592c9 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000080000000 R11: 0000000000000246 R12: 00007f4aab51e6d4 R13: 00000000004c689d R14: 00000000004db828 R15: 00000000ffffffff Allocated by task 9827: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_kmalloc mm/kasan/common.c:489 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503 __do_kmalloc mm/slab.c:3660 [inline] __kmalloc+0x15c/0x740 mm/slab.c:3669 kmalloc include/linux/slab.h:552 [inline] pneigh_lookup+0x19c/0x4a0 net/core/neighbour.c:731 arp_req_set_public net/ipv4/arp.c:1010 [inline] arp_req_set+0x613/0x720 net/ipv4/arp.c:1026 arp_ioctl+0x652/0x7f0 net/ipv4/arp.c:1226 inet_ioctl+0x2a0/0x340 net/ipv4/af_inet.c:926 sock_do_ioctl+0xd8/0x2f0 net/socket.c:1043 sock_ioctl+0x3ed/0x780 net/socket.c:1194 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 9824: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459 __cache_free mm/slab.c:3432 [inline] kfree+0xcf/0x220 mm/slab.c:3755 pneigh_ifdown_and_unlock net/core/neighbour.c:812 [inline] __neigh_ifdown+0x236/0x2f0 net/core/neighbour.c:356 neigh_ifdown+0x20/0x30 net/core/neighbour.c:372 arp_ifdown+0x1d/0x21 net/ipv4/arp.c:1274 inetdev_destroy net/ipv4/devinet.c:319 [inline] inetdev_event+0xa14/0x11f0 net/ipv4/devinet.c:1544 notifier_call_chain+0xc2/0x230 kernel/notifier.c:95 __raw_notifier_call_chain kernel/notifier.c:396 [inline] raw_notifier_call_chain+0x2e/0x40 kernel/notifier.c:403 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1749 call_netdevice_notifiers_extack net/core/dev.c:1761 [inline] call_netdevice_notifiers net/core/dev.c:1775 [inline] rollback_registered_many+0x9b9/0xfc0 net/core/dev.c:8178 rollback_registered+0x109/0x1d0 net/core/dev.c:8220 unregister_netdevice_queue net/core/dev.c:9267 [inline] unregister_netdevice_queue+0x1ee/0x2c0 net/core/dev.c:9260 unregister_netdevice include/linux/netdevice.h:2631 [inline] __tun_detach+0xd8a/0x1040 drivers/net/tun.c:724 tun_detach drivers/net/tun.c:741 [inline] tun_chr_close+0xe0/0x180 drivers/net/tun.c:3451 __fput+0x2ff/0x890 fs/file_table.c:280 ____fput+0x16/0x20 fs/file_table.c:313 task_work_run+0x145/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:185 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:168 prepare_exit_to_usermode arch/x86/entry/common.c:199 [inline] syscall_return_slowpath arch/x86/entry/common.c:279 [inline] do_syscall_64+0x58e/0x680 arch/x86/entry/common.c:304 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff888097f2a700 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 0 bytes inside of 64-byte region [ffff888097f2a700, ffff888097f2a740) The buggy address belongs to the page: page:ffffea00025fca80 refcount:1 mapcount:0 mapping:ffff8880aa400340 index:0x0 flags: 0x1fffc0000000200(slab) raw: 01fffc0000000200 ffffea000250d548 ffffea00025726c8 ffff8880aa400340 raw: 0000000000000000 ffff888097f2a000 0000000100000020 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888097f2a600: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc ffff888097f2a680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff888097f2a700: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ^ ffff888097f2a780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888097f2a800: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-06-15 23:28:48 +00:00
__acquires(tbl->lock)
__acquires(rcu_bh)
{
struct neigh_seq_state *state = seq->private;
state->tbl = tbl;
state->bucket = 0;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
rcu_read_lock_bh();
state->nht = rcu_dereference_bh(tbl->nht);
neigh: fix use-after-free read in pneigh_get_next [ Upstream commit f3e92cb8e2eb8c27d109e6fd73d3a69a8c09e288 ] Nine years ago, I added RCU handling to neighbours, not pneighbours. (pneigh are not commonly used) Unfortunately I missed that /proc dump operations would use a common entry and exit point : neigh_seq_start() and neigh_seq_stop() We need to read_lock(tbl->lock) or risk use-after-free while iterating the pneigh structures. We might later convert pneigh to RCU and revert this patch. sysbot reported : BUG: KASAN: use-after-free in pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 Read of size 8 at addr ffff888097f2a700 by task syz-executor.0/9825 CPU: 1 PID: 9825 Comm: syz-executor.0 Not tainted 5.2.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132 pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 neigh_seq_next+0xdb/0x210 net/core/neighbour.c:3240 seq_read+0x9cf/0x1110 fs/seq_file.c:258 proc_reg_read+0x1fc/0x2c0 fs/proc/inode.c:221 do_loop_readv_writev fs/read_write.c:714 [inline] do_loop_readv_writev fs/read_write.c:701 [inline] do_iter_read+0x4a4/0x660 fs/read_write.c:935 vfs_readv+0xf0/0x160 fs/read_write.c:997 kernel_readv fs/splice.c:359 [inline] default_file_splice_read+0x475/0x890 fs/splice.c:414 do_splice_to+0x127/0x180 fs/splice.c:877 splice_direct_to_actor+0x2d2/0x970 fs/splice.c:954 do_splice_direct+0x1da/0x2a0 fs/splice.c:1063 do_sendfile+0x597/0xd00 fs/read_write.c:1464 __do_sys_sendfile64 fs/read_write.c:1525 [inline] __se_sys_sendfile64 fs/read_write.c:1511 [inline] __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4592c9 Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4aab51dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000004592c9 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000080000000 R11: 0000000000000246 R12: 00007f4aab51e6d4 R13: 00000000004c689d R14: 00000000004db828 R15: 00000000ffffffff Allocated by task 9827: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_kmalloc mm/kasan/common.c:489 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503 __do_kmalloc mm/slab.c:3660 [inline] __kmalloc+0x15c/0x740 mm/slab.c:3669 kmalloc include/linux/slab.h:552 [inline] pneigh_lookup+0x19c/0x4a0 net/core/neighbour.c:731 arp_req_set_public net/ipv4/arp.c:1010 [inline] arp_req_set+0x613/0x720 net/ipv4/arp.c:1026 arp_ioctl+0x652/0x7f0 net/ipv4/arp.c:1226 inet_ioctl+0x2a0/0x340 net/ipv4/af_inet.c:926 sock_do_ioctl+0xd8/0x2f0 net/socket.c:1043 sock_ioctl+0x3ed/0x780 net/socket.c:1194 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 9824: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459 __cache_free mm/slab.c:3432 [inline] kfree+0xcf/0x220 mm/slab.c:3755 pneigh_ifdown_and_unlock net/core/neighbour.c:812 [inline] __neigh_ifdown+0x236/0x2f0 net/core/neighbour.c:356 neigh_ifdown+0x20/0x30 net/core/neighbour.c:372 arp_ifdown+0x1d/0x21 net/ipv4/arp.c:1274 inetdev_destroy net/ipv4/devinet.c:319 [inline] inetdev_event+0xa14/0x11f0 net/ipv4/devinet.c:1544 notifier_call_chain+0xc2/0x230 kernel/notifier.c:95 __raw_notifier_call_chain kernel/notifier.c:396 [inline] raw_notifier_call_chain+0x2e/0x40 kernel/notifier.c:403 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1749 call_netdevice_notifiers_extack net/core/dev.c:1761 [inline] call_netdevice_notifiers net/core/dev.c:1775 [inline] rollback_registered_many+0x9b9/0xfc0 net/core/dev.c:8178 rollback_registered+0x109/0x1d0 net/core/dev.c:8220 unregister_netdevice_queue net/core/dev.c:9267 [inline] unregister_netdevice_queue+0x1ee/0x2c0 net/core/dev.c:9260 unregister_netdevice include/linux/netdevice.h:2631 [inline] __tun_detach+0xd8a/0x1040 drivers/net/tun.c:724 tun_detach drivers/net/tun.c:741 [inline] tun_chr_close+0xe0/0x180 drivers/net/tun.c:3451 __fput+0x2ff/0x890 fs/file_table.c:280 ____fput+0x16/0x20 fs/file_table.c:313 task_work_run+0x145/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:185 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:168 prepare_exit_to_usermode arch/x86/entry/common.c:199 [inline] syscall_return_slowpath arch/x86/entry/common.c:279 [inline] do_syscall_64+0x58e/0x680 arch/x86/entry/common.c:304 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff888097f2a700 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 0 bytes inside of 64-byte region [ffff888097f2a700, ffff888097f2a740) The buggy address belongs to the page: page:ffffea00025fca80 refcount:1 mapcount:0 mapping:ffff8880aa400340 index:0x0 flags: 0x1fffc0000000200(slab) raw: 01fffc0000000200 ffffea000250d548 ffffea00025726c8 ffff8880aa400340 raw: 0000000000000000 ffff888097f2a000 0000000100000020 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888097f2a600: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc ffff888097f2a680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff888097f2a700: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ^ ffff888097f2a780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888097f2a800: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-06-15 23:28:48 +00:00
read_lock(&tbl->lock);
return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(neigh_seq_start);
void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct neigh_seq_state *state;
void *rc;
if (v == SEQ_START_TOKEN) {
rc = neigh_get_first(seq);
goto out;
}
state = seq->private;
if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) {
rc = neigh_get_next(seq, v, NULL);
if (rc)
goto out;
if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY))
rc = pneigh_get_first(seq);
} else {
BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
rc = pneigh_get_next(seq, v, NULL);
}
out:
++(*pos);
return rc;
}
EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
neigh: fix use-after-free read in pneigh_get_next [ Upstream commit f3e92cb8e2eb8c27d109e6fd73d3a69a8c09e288 ] Nine years ago, I added RCU handling to neighbours, not pneighbours. (pneigh are not commonly used) Unfortunately I missed that /proc dump operations would use a common entry and exit point : neigh_seq_start() and neigh_seq_stop() We need to read_lock(tbl->lock) or risk use-after-free while iterating the pneigh structures. We might later convert pneigh to RCU and revert this patch. sysbot reported : BUG: KASAN: use-after-free in pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 Read of size 8 at addr ffff888097f2a700 by task syz-executor.0/9825 CPU: 1 PID: 9825 Comm: syz-executor.0 Not tainted 5.2.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132 pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 neigh_seq_next+0xdb/0x210 net/core/neighbour.c:3240 seq_read+0x9cf/0x1110 fs/seq_file.c:258 proc_reg_read+0x1fc/0x2c0 fs/proc/inode.c:221 do_loop_readv_writev fs/read_write.c:714 [inline] do_loop_readv_writev fs/read_write.c:701 [inline] do_iter_read+0x4a4/0x660 fs/read_write.c:935 vfs_readv+0xf0/0x160 fs/read_write.c:997 kernel_readv fs/splice.c:359 [inline] default_file_splice_read+0x475/0x890 fs/splice.c:414 do_splice_to+0x127/0x180 fs/splice.c:877 splice_direct_to_actor+0x2d2/0x970 fs/splice.c:954 do_splice_direct+0x1da/0x2a0 fs/splice.c:1063 do_sendfile+0x597/0xd00 fs/read_write.c:1464 __do_sys_sendfile64 fs/read_write.c:1525 [inline] __se_sys_sendfile64 fs/read_write.c:1511 [inline] __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4592c9 Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4aab51dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000004592c9 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000080000000 R11: 0000000000000246 R12: 00007f4aab51e6d4 R13: 00000000004c689d R14: 00000000004db828 R15: 00000000ffffffff Allocated by task 9827: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_kmalloc mm/kasan/common.c:489 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503 __do_kmalloc mm/slab.c:3660 [inline] __kmalloc+0x15c/0x740 mm/slab.c:3669 kmalloc include/linux/slab.h:552 [inline] pneigh_lookup+0x19c/0x4a0 net/core/neighbour.c:731 arp_req_set_public net/ipv4/arp.c:1010 [inline] arp_req_set+0x613/0x720 net/ipv4/arp.c:1026 arp_ioctl+0x652/0x7f0 net/ipv4/arp.c:1226 inet_ioctl+0x2a0/0x340 net/ipv4/af_inet.c:926 sock_do_ioctl+0xd8/0x2f0 net/socket.c:1043 sock_ioctl+0x3ed/0x780 net/socket.c:1194 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 9824: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459 __cache_free mm/slab.c:3432 [inline] kfree+0xcf/0x220 mm/slab.c:3755 pneigh_ifdown_and_unlock net/core/neighbour.c:812 [inline] __neigh_ifdown+0x236/0x2f0 net/core/neighbour.c:356 neigh_ifdown+0x20/0x30 net/core/neighbour.c:372 arp_ifdown+0x1d/0x21 net/ipv4/arp.c:1274 inetdev_destroy net/ipv4/devinet.c:319 [inline] inetdev_event+0xa14/0x11f0 net/ipv4/devinet.c:1544 notifier_call_chain+0xc2/0x230 kernel/notifier.c:95 __raw_notifier_call_chain kernel/notifier.c:396 [inline] raw_notifier_call_chain+0x2e/0x40 kernel/notifier.c:403 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1749 call_netdevice_notifiers_extack net/core/dev.c:1761 [inline] call_netdevice_notifiers net/core/dev.c:1775 [inline] rollback_registered_many+0x9b9/0xfc0 net/core/dev.c:8178 rollback_registered+0x109/0x1d0 net/core/dev.c:8220 unregister_netdevice_queue net/core/dev.c:9267 [inline] unregister_netdevice_queue+0x1ee/0x2c0 net/core/dev.c:9260 unregister_netdevice include/linux/netdevice.h:2631 [inline] __tun_detach+0xd8a/0x1040 drivers/net/tun.c:724 tun_detach drivers/net/tun.c:741 [inline] tun_chr_close+0xe0/0x180 drivers/net/tun.c:3451 __fput+0x2ff/0x890 fs/file_table.c:280 ____fput+0x16/0x20 fs/file_table.c:313 task_work_run+0x145/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:185 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:168 prepare_exit_to_usermode arch/x86/entry/common.c:199 [inline] syscall_return_slowpath arch/x86/entry/common.c:279 [inline] do_syscall_64+0x58e/0x680 arch/x86/entry/common.c:304 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff888097f2a700 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 0 bytes inside of 64-byte region [ffff888097f2a700, ffff888097f2a740) The buggy address belongs to the page: page:ffffea00025fca80 refcount:1 mapcount:0 mapping:ffff8880aa400340 index:0x0 flags: 0x1fffc0000000200(slab) raw: 01fffc0000000200 ffffea000250d548 ffffea00025726c8 ffff8880aa400340 raw: 0000000000000000 ffff888097f2a000 0000000100000020 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888097f2a600: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc ffff888097f2a680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff888097f2a700: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ^ ffff888097f2a780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888097f2a800: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-06-15 23:28:48 +00:00
__releases(tbl->lock)
__releases(rcu_bh)
{
neigh: fix use-after-free read in pneigh_get_next [ Upstream commit f3e92cb8e2eb8c27d109e6fd73d3a69a8c09e288 ] Nine years ago, I added RCU handling to neighbours, not pneighbours. (pneigh are not commonly used) Unfortunately I missed that /proc dump operations would use a common entry and exit point : neigh_seq_start() and neigh_seq_stop() We need to read_lock(tbl->lock) or risk use-after-free while iterating the pneigh structures. We might later convert pneigh to RCU and revert this patch. sysbot reported : BUG: KASAN: use-after-free in pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 Read of size 8 at addr ffff888097f2a700 by task syz-executor.0/9825 CPU: 1 PID: 9825 Comm: syz-executor.0 Not tainted 5.2.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132 pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 neigh_seq_next+0xdb/0x210 net/core/neighbour.c:3240 seq_read+0x9cf/0x1110 fs/seq_file.c:258 proc_reg_read+0x1fc/0x2c0 fs/proc/inode.c:221 do_loop_readv_writev fs/read_write.c:714 [inline] do_loop_readv_writev fs/read_write.c:701 [inline] do_iter_read+0x4a4/0x660 fs/read_write.c:935 vfs_readv+0xf0/0x160 fs/read_write.c:997 kernel_readv fs/splice.c:359 [inline] default_file_splice_read+0x475/0x890 fs/splice.c:414 do_splice_to+0x127/0x180 fs/splice.c:877 splice_direct_to_actor+0x2d2/0x970 fs/splice.c:954 do_splice_direct+0x1da/0x2a0 fs/splice.c:1063 do_sendfile+0x597/0xd00 fs/read_write.c:1464 __do_sys_sendfile64 fs/read_write.c:1525 [inline] __se_sys_sendfile64 fs/read_write.c:1511 [inline] __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4592c9 Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4aab51dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000004592c9 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000080000000 R11: 0000000000000246 R12: 00007f4aab51e6d4 R13: 00000000004c689d R14: 00000000004db828 R15: 00000000ffffffff Allocated by task 9827: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_kmalloc mm/kasan/common.c:489 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503 __do_kmalloc mm/slab.c:3660 [inline] __kmalloc+0x15c/0x740 mm/slab.c:3669 kmalloc include/linux/slab.h:552 [inline] pneigh_lookup+0x19c/0x4a0 net/core/neighbour.c:731 arp_req_set_public net/ipv4/arp.c:1010 [inline] arp_req_set+0x613/0x720 net/ipv4/arp.c:1026 arp_ioctl+0x652/0x7f0 net/ipv4/arp.c:1226 inet_ioctl+0x2a0/0x340 net/ipv4/af_inet.c:926 sock_do_ioctl+0xd8/0x2f0 net/socket.c:1043 sock_ioctl+0x3ed/0x780 net/socket.c:1194 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 9824: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459 __cache_free mm/slab.c:3432 [inline] kfree+0xcf/0x220 mm/slab.c:3755 pneigh_ifdown_and_unlock net/core/neighbour.c:812 [inline] __neigh_ifdown+0x236/0x2f0 net/core/neighbour.c:356 neigh_ifdown+0x20/0x30 net/core/neighbour.c:372 arp_ifdown+0x1d/0x21 net/ipv4/arp.c:1274 inetdev_destroy net/ipv4/devinet.c:319 [inline] inetdev_event+0xa14/0x11f0 net/ipv4/devinet.c:1544 notifier_call_chain+0xc2/0x230 kernel/notifier.c:95 __raw_notifier_call_chain kernel/notifier.c:396 [inline] raw_notifier_call_chain+0x2e/0x40 kernel/notifier.c:403 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1749 call_netdevice_notifiers_extack net/core/dev.c:1761 [inline] call_netdevice_notifiers net/core/dev.c:1775 [inline] rollback_registered_many+0x9b9/0xfc0 net/core/dev.c:8178 rollback_registered+0x109/0x1d0 net/core/dev.c:8220 unregister_netdevice_queue net/core/dev.c:9267 [inline] unregister_netdevice_queue+0x1ee/0x2c0 net/core/dev.c:9260 unregister_netdevice include/linux/netdevice.h:2631 [inline] __tun_detach+0xd8a/0x1040 drivers/net/tun.c:724 tun_detach drivers/net/tun.c:741 [inline] tun_chr_close+0xe0/0x180 drivers/net/tun.c:3451 __fput+0x2ff/0x890 fs/file_table.c:280 ____fput+0x16/0x20 fs/file_table.c:313 task_work_run+0x145/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:185 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:168 prepare_exit_to_usermode arch/x86/entry/common.c:199 [inline] syscall_return_slowpath arch/x86/entry/common.c:279 [inline] do_syscall_64+0x58e/0x680 arch/x86/entry/common.c:304 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff888097f2a700 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 0 bytes inside of 64-byte region [ffff888097f2a700, ffff888097f2a740) The buggy address belongs to the page: page:ffffea00025fca80 refcount:1 mapcount:0 mapping:ffff8880aa400340 index:0x0 flags: 0x1fffc0000000200(slab) raw: 01fffc0000000200 ffffea000250d548 ffffea00025726c8 ffff8880aa400340 raw: 0000000000000000 ffff888097f2a000 0000000100000020 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888097f2a600: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc ffff888097f2a680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff888097f2a700: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ^ ffff888097f2a780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888097f2a800: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-06-15 23:28:48 +00:00
struct neigh_seq_state *state = seq->private;
struct neigh_table *tbl = state->tbl;
read_unlock(&tbl->lock);
rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_seq_stop);
/* statistics via seq_file */
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
struct neigh_table *tbl = seq->private;
int cpu;
if (*pos == 0)
return SEQ_START_TOKEN;
for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
return per_cpu_ptr(tbl->stats, cpu);
}
return NULL;
}
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct neigh_table *tbl = seq->private;
int cpu;
for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
return per_cpu_ptr(tbl->stats, cpu);
}
return NULL;
}
static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
{
}
static int neigh_stat_seq_show(struct seq_file *seq, void *v)
{
struct neigh_table *tbl = seq->private;
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n");
return 0;
}
seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
"%08lx %08lx %08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
st->destroys,
st->hash_grows,
st->lookups,
st->hits,
st->res_failed,
st->rcv_probes_mcast,
st->rcv_probes_ucast,
st->periodic_gc_runs,
st->forced_gc_runs,
st->unres_discards
);
return 0;
}
static const struct seq_operations neigh_stat_seq_ops = {
.start = neigh_stat_seq_start,
.next = neigh_stat_seq_next,
.stop = neigh_stat_seq_stop,
.show = neigh_stat_seq_show,
};
static int neigh_stat_seq_open(struct inode *inode, struct file *file)
{
int ret = seq_open(file, &neigh_stat_seq_ops);
if (!ret) {
struct seq_file *sf = file->private_data;
sf->private = PDE_DATA(inode);
}
return ret;
};
static const struct file_operations neigh_stat_seq_fops = {
.owner = THIS_MODULE,
.open = neigh_stat_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#endif /* CONFIG_PROC_FS */
static inline size_t neigh_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ndmsg))
+ nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ nla_total_size(sizeof(struct nda_cacheinfo))
+ nla_total_size(4); /* NDA_PROBES */
}
static void __neigh_notify(struct neighbour *n, int type, int flags)
{
struct net *net = dev_net(n->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
err = neigh_fill_info(skb, n, 0, 0, type, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
2009-02-25 07:18:28 +00:00
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
return;
errout:
if (err < 0)
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 08:13:18 +00:00
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
#ifdef CONFIG_ARPD
void neigh_app_ns(struct neighbour *n)
{
__neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
}
EXPORT_SYMBOL(neigh_app_ns);
#endif /* CONFIG_ARPD */
#ifdef CONFIG_SYSCTL
static int zero;
static int int_max = INT_MAX;
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int size, ret;
ctl_table tmp = *ctl;
tmp.extra1 = &zero;
tmp.extra2 = &unres_qlen_max;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
tmp.data = &size;
size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
if (write && !ret)
*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
return ret;
}
enum {
NEIGH_VAR_MCAST_PROBE,
NEIGH_VAR_UCAST_PROBE,
NEIGH_VAR_APP_PROBE,
NEIGH_VAR_RETRANS_TIME,
NEIGH_VAR_BASE_REACHABLE_TIME,
NEIGH_VAR_DELAY_PROBE_TIME,
NEIGH_VAR_GC_STALETIME,
NEIGH_VAR_QUEUE_LEN,
NEIGH_VAR_QUEUE_LEN_BYTES,
NEIGH_VAR_PROXY_QLEN,
NEIGH_VAR_ANYCAST_DELAY,
NEIGH_VAR_PROXY_DELAY,
NEIGH_VAR_LOCKTIME,
NEIGH_VAR_RETRANS_TIME_MS,
NEIGH_VAR_BASE_REACHABLE_TIME_MS,
NEIGH_VAR_GC_INTERVAL,
NEIGH_VAR_GC_THRESH1,
NEIGH_VAR_GC_THRESH2,
NEIGH_VAR_GC_THRESH3,
NEIGH_VAR_PROBE,
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
NEIGH_VAR_MAX
};
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_MCAST_PROBE] = {
.procname = "mcast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_UCAST_PROBE] = {
.procname = "ucast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_APP_PROBE] = {
.procname = "app_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_RETRANS_TIME] = {
.procname = "retrans_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_BASE_REACHABLE_TIME] = {
.procname = "base_reachable_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_DELAY_PROBE_TIME] = {
.procname = "delay_first_probe_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_GC_STALETIME] = {
.procname = "gc_stale_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_QUEUE_LEN] = {
.procname = "unres_qlen",
.maxlen = sizeof(int),
.mode = 0644,
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
.proc_handler = proc_unres_qlen,
},
[NEIGH_VAR_QUEUE_LEN_BYTES] = {
.procname = "unres_qlen_bytes",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_PROXY_QLEN] = {
.procname = "proxy_qlen",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_ANYCAST_DELAY] = {
.procname = "anycast_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_PROXY_DELAY] = {
.procname = "proxy_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_LOCKTIME] = {
.procname = "locktime",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_RETRANS_TIME_MS] = {
.procname = "retrans_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {
.procname = "base_reachable_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
[NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
[NEIGH_VAR_PROBE] = {
.procname = "neigh_probe",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{},
},
};
int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
char *p_name, proc_handler *handler)
{
struct neigh_sysctl_table *t;
const char *dev_name_source = NULL;
char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
if (!t)
goto err;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes;
t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes;
t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time;
t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time;
t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime;
t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes;
t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes;
t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen;
t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay;
t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay;
t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time;
t->neigh_vars[NEIGH_VAR_PROBE].data = &neigh_probe_enable;
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
dev_name_source = "default";
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
}
if (handler) {
/* RetransTime */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;
/* ReachableTime */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;
/* RetransTime (in milliseconds)*/
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;
/* ReachableTime (in milliseconds) */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 12:07:14 +00:00
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
}
/* Don't export sysctls to unprivileged users */
if (neigh_parms_net(p)->user_ns != &init_user_ns)
t->neigh_vars[0].procname = NULL;
snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
p_name, dev_name_source);
t->sysctl_header =
register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
if (!t->sysctl_header)
goto free;
p->sysctl_table = t;
return 0;
free:
kfree(t);
err:
return -ENOBUFS;
}
EXPORT_SYMBOL(neigh_sysctl_register);
void neigh_sysctl_unregister(struct neigh_parms *p)
{
if (p->sysctl_table) {
struct neigh_sysctl_table *t = p->sysctl_table;
p->sysctl_table = NULL;
unregister_net_sysctl_table(t->sysctl_header);
kfree(t);
}
}
EXPORT_SYMBOL(neigh_sysctl_unregister);
#endif /* CONFIG_SYSCTL */
static int __init neigh_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);
rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
NULL);
rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL);
return 0;
}
subsys_initcall(neigh_init);