mirror of https://github.com/team-infusion-developers/android_kernel_samsung_msm8976.git
synced 2024-10-21 02:49:11 +00:00
f348d70a32
Implement half-closed device notification by adding a new POLLRDHUP (and its alias EPOLLRDHUP) bit to the existing poll/select sets. Because changing the existing POLLHUP handling, which does not correctly report half-closed devices, was considered too risky, this implementation leaves the current POLLHUP reporting unchanged and simply adds a new bit that is set in the few places where it makes sense. The same thing was discussed and conceptually agreed quite some time ago: http://lkml.org/lkml/2003/7/12/116

Since this new event bit is added to the existing Linux poll infrastructure, even the existing poll/select system calls will be able to use it. The existing POLLHUP handling is left as is.

The pollrdhup-2.6.16.rc5-0.10.diff defines POLLRDHUP for all the existing archs and sets the bit in the six relevant files. The other attached diff is the simple change required to sys/epoll.h to add the EPOLLRDHUP definition. There is "a stupid program" to test POLLRDHUP delivery here: http://www.xmailserver.org/pollrdhup-test.c. It tests poll(2), but since the delivery is the same, epoll(2) will work equally well.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
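As a brief illustration, here is a minimal user-space sketch (not from the patch) of detecting a peer half-close with the new bit. It assumes a libc that exposes POLLRDHUP, which in glibc requires _GNU_SOURCE, and a connected stream socket fd:

    #define _GNU_SOURCE
    #include <poll.h>

    /* Wait for data or a peer half-close on connected socket fd.
     * POLLRDHUP fires once the peer has shut down its write side;
     * POLLHUP keeps its old meaning and is unchanged by the patch. */
    static int wait_readable_or_rdhup(int fd)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };

            if (poll(&pfd, 1, -1) < 0)
                    return -1;              /* poll error */
            if (pfd.revents & POLLRDHUP)
                    return 0;               /* peer sent FIN: drain, then close */
            return 1;                       /* ordinary data is readable */
    }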
535 lines · 13 KiB · C
/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old
 *			udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

/*
 * Wait for a packet..
 */
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
	int error;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (!skb_queue_empty(&sk->sk_receive_queue))
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk->sk_sleep, &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}

/**
 *	skb_recv_datagram - Receive a datagram skbuff
 *	@sk: socket
 *	@flags: MSG_ flags
 *	@noblock: blocking operation?
 *	@err: error code returned
 *
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
 *	the long standing peek and read race for datagram sockets. If you
 *	alter this routine remember it must be re-entrant.
 *
 *	This function will lock the socket if a skb is returned, so the caller
 *	needs to unlock the socket in that case (usually by calling
 *	skb_free_datagram)
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	* 8) Great win.)
 *	*			     --ANK (980729)
 *
 *	The order of the tests when we find no data waiting are specified
 *	quite explicitly by POSIX 1003.1g, don't change them without having
 *	the standard around please.
 */
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
				  int noblock, int *err)
{
	struct sk_buff *skb;
	long timeo;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, noblock);

	do {
		/* Again only user level code calls this function, so nothing
		 * interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		if (flags & MSG_PEEK) {
			unsigned long cpu_flags;

			spin_lock_irqsave(&sk->sk_receive_queue.lock,
					  cpu_flags);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				atomic_inc(&skb->users);
			spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
					       cpu_flags);
		} else
			skb = skb_dequeue(&sk->sk_receive_queue);

		if (skb)
			return skb;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (!wait_for_packet(sk, err, &timeo));

	return NULL;

no_packet:
	*err = error;
	return NULL;
}

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
}
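
#if 0
/*
 * Illustrative sketch only, not part of this file: the typical
 * recvmsg()-style caller pattern for the helpers above, assuming a
 * connectionless protocol whose payload starts at offset 0 in the
 * skb. "example_recvmsg" is a hypothetical name and error handling
 * is abbreviated.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg,
			   size_t len, int noblock)
{
	int err;
	struct sk_buff *skb;

	skb = skb_recv_datagram(sk, msg->msg_flags, noblock, &err);
	if (!skb)
		return err;

	if (len > skb->len)
		len = skb->len;		/* truncate the copy, not the skb */

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
	skb_free_datagram(sk, skb);	/* drop the reference we hold */
	return err ? err : len;
}
#endif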

/**
 *	skb_kill_datagram - Free a datagram skbuff forcibly
 *	@sk: socket
 *	@skb: datagram skbuff
 *	@flags: MSG_ flags
 *
 *	This function frees a datagram skbuff that was received by
 *	skb_recv_datagram. The flags argument must match the one
 *	used for skb_recv_datagram.
 *
 *	If the MSG_PEEK flag is set, and the packet is still on the
 *	receive queue of the socket, it will be taken off the queue
 *	before it is freed.
 *
 *	This function currently only disables BH when acquiring the
 *	sk_receive_queue lock. Therefore it must not be used in a
 *	context where that lock is acquired in an IRQ context.
 */

void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	if (flags & MSG_PEEK) {
		spin_lock_bh(&sk->sk_receive_queue.lock);
		if (skb == skb_peek(&sk->sk_receive_queue)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			atomic_dec(&skb->users);
		}
		spin_unlock_bh(&sk->sk_receive_queue.lock);
	}

	kfree_skb(skb);
}

EXPORT_SYMBOL(skb_kill_datagram);
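
#if 0
/*
 * Illustrative sketch only, not part of this file: discarding a
 * datagram that was looked at with MSG_PEEK, e.g. after a failed
 * sanity check. skb_kill_datagram() unlinks the peeked skb from the
 * receive queue (if it is still there) and frees it, so the bad
 * packet is not seen again on the next receive.
 */
static void example_drop_peeked(struct sock *sk, int noblock)
{
	int err;
	struct sk_buff *skb = skb_recv_datagram(sk, MSG_PEEK, noblock, &err);

	if (skb)
		skb_kill_datagram(sk, skb, MSG_PEEK);
}
#endif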

/**
 *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: io vector to copy to
 *	@len: amount of data to copy from buffer to iovec
 *
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;

		for (; list; list = list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				if (copy > len)
					copy = len;
				if (skb_copy_datagram_iovec(list,
							    offset - start,
							    to, copy))
					goto fault;
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
			}
			start = end;
		}
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      u8 __user *to, int len,
				      unsigned int *csump)
{
	int start = skb_headlen(skb);
	int pos = 0;
	int i, copy = start - offset;

	/* Copy header. */
	if (copy > 0) {
		int err = 0;
		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			unsigned int csum2;
			int err = 0;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr +
						      frag->page_offset +
						      offset - start,
						      to, copy, 0, &err);
			kunmap(page);
			if (err)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;

		for (; list; list=list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				unsigned int csum2 = 0;
				if (copy > len)
					copy = len;
				if (skb_copy_and_csum_datagram(list,
							       offset - start,
							       to, copy,
							       &csum2))
					goto fault;
				*csump = csum_block_add(*csump, csum2, pos);
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
				to += copy;
				pos += copy;
			}
			start = end;
		}
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}

unsigned int __skb_checksum_complete(struct sk_buff *skb)
{
	unsigned int sum;

	sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_HW))
			netdev_rx_csum_fault(skb->dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);

/**
 *	skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
 *	@skb: skbuff
 *	@hlen: hardware length
 *	@iov: io vector
 *
 *	Caller _must_ check that skb will fit to this iovec.
 *
 *	Returns: 0       - success.
 *		 -EINVAL - checksum failure.
 *		 -EFAULT - fault during copy. Beware, in this case iovec
 *			   can be modified!
 */
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
				     int hlen, struct iovec *iov)
{
	unsigned int csum;
	int chunk = skb->len - hlen;

	/* Skip filled elements.
	 * Pretty silly, look at memcpy_toiovec, though 8)
	 */
	while (!iov->iov_len)
		iov++;

	if (iov->iov_len < chunk) {
		if (__skb_checksum_complete(skb))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
					       chunk, &csum))
			goto fault;
		if ((unsigned short)csum_fold(csum))
			goto csum_error;
		if (unlikely(skb->ip_summed == CHECKSUM_HW))
			netdev_rx_csum_fault(skb->dev);
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;
csum_error:
	return -EINVAL;
fault:
	return -EFAULT;
}
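
#if 0
/*
 * Illustrative sketch only, not part of this file: how a UDP-style
 * receive path picks between the plain and the checksumming copy.
 * When the checksum is already known to be good, copy without
 * verifying; otherwise fold verification into the copy to user
 * space. "hlen" is the length of protocol headers to skip.
 */
static int example_copy_to_user(struct sk_buff *skb, struct msghdr *msg,
				int hlen)
{
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		return skb_copy_datagram_iovec(skb, hlen, msg->msg_iov,
					       skb->len - hlen);

	return skb_copy_and_csum_datagram_iovec(skb, hlen, msg->msg_iov);
}
#endif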

/**
 *	datagram_poll - generic datagram poll
 *	@file: file struct
 *	@sock: socket
 *	@wait: poll table
 *
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: when you _don't_ use this routine for this protocol,
 *	and you use a different write policy from sock_writeable()
 *	then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	poll_wait(file, sk->sk_sleep, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue) ||
	    (sk->sk_shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}

EXPORT_SYMBOL(datagram_poll);
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
EXPORT_SYMBOL(skb_copy_datagram_iovec);
EXPORT_SYMBOL(skb_free_datagram);
EXPORT_SYMBOL(skb_recv_datagram);
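
#if 0
/*
 * Illustrative sketch only, not part of this file: a protocol that
 * queues datagrams on sk_receive_queue and uses the default
 * sock_writeable() write policy can plug datagram_poll straight into
 * its proto_ops, as the in-tree datagram protocols do. All names and
 * members other than .poll are hypothetical placeholders.
 */
static const struct proto_ops example_dgram_ops = {
	.poll		= datagram_poll,
	/* ... the remaining proto_ops members ... */
};
#endif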