Commit c33585c5 authored by Manfred Spraul's avatar Manfred Spraul Committed by Linus Torvalds

[PATCH] pipe bugfix /cleanup

pipe_write contains a wakeup storm, 2 writers that write into the same
fifo can wake each other up, and spend 100% cpu time with
wakeup/schedule, without making any progress.

The only regression I'm aware of is that

  $ dd if=/dev/zero | grep not_there

will fail due to OOM, because grep does something like

	for(;;) {
		rlen = read(fd, buf, len);
		if (rlen == len) {
			len *= 2;
			buf = realloc(buf, len);
		}
	}

if it operates on pipes, and due to the improved syscall merging, read
will always return the maximum possible amount of data. But that's a grep
bug, not a kernel problem.
parent d8ac8dd7
...@@ -25,6 +25,9 @@ ...@@ -25,6 +25,9 @@
* *
* FIFOs and Pipes now generate SIGIO for both readers and writers. * FIFOs and Pipes now generate SIGIO for both readers and writers.
* -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
*
* pipe_read & write cleanup
* -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
*/ */
/* Drop the inode semaphore and wait for a pipe event, atomically */ /* Drop the inode semaphore and wait for a pipe event, atomically */
...@@ -44,52 +47,23 @@ static ssize_t ...@@ -44,52 +47,23 @@ static ssize_t
pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{ {
struct inode *inode = filp->f_dentry->d_inode; struct inode *inode = filp->f_dentry->d_inode;
ssize_t size, read, ret; int do_wakeup;
ssize_t ret;
/* Seeks are not allowed on pipes. */
ret = -ESPIPE;
read = 0;
if (ppos != &filp->f_pos)
goto out_nolock;
/* Always return 0 on null read. */ /* pread is not allowed on pipes. */
ret = 0; if (unlikely(ppos != &filp->f_pos))
if (count == 0) return -ESPIPE;
goto out_nolock;
/* Get the pipe semaphore */ /* Null read succeeds. */
ret = -ERESTARTSYS; if (unlikely(count == 0))
if (down_interruptible(PIPE_SEM(*inode))) return 0;
goto out_nolock;
if (PIPE_EMPTY(*inode)) { do_wakeup = 0;
do_more_read:
ret = 0; ret = 0;
if (!PIPE_WRITERS(*inode)) down(PIPE_SEM(*inode));
goto out;
ret = -EAGAIN;
if (filp->f_flags & O_NONBLOCK)
goto out;
for (;;) { for (;;) {
PIPE_WAITING_READERS(*inode)++; int size = PIPE_LEN(*inode);
pipe_wait(inode); if (size) {
PIPE_WAITING_READERS(*inode)--;
ret = -ERESTARTSYS;
if (signal_pending(current))
goto out;
ret = 0;
if (!PIPE_EMPTY(*inode))
break;
if (!PIPE_WRITERS(*inode))
goto out;
}
}
/* Read what data is available. */
ret = -EFAULT;
while (count > 0 && (size = PIPE_LEN(*inode))) {
char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode); char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
ssize_t chars = PIPE_MAX_RCHUNK(*inode); ssize_t chars = PIPE_MAX_RCHUNK(*inode);
...@@ -98,43 +72,56 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) ...@@ -98,43 +72,56 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
if (chars > size) if (chars > size)
chars = size; chars = size;
if (copy_to_user(buf, pipebuf, chars)) if (copy_to_user(buf, pipebuf, chars)) {
goto out; if (!ret) ret = -EFAULT;
break;
}
ret += chars;
read += chars;
PIPE_START(*inode) += chars; PIPE_START(*inode) += chars;
PIPE_START(*inode) &= (PIPE_SIZE - 1); PIPE_START(*inode) &= (PIPE_SIZE - 1);
PIPE_LEN(*inode) -= chars; PIPE_LEN(*inode) -= chars;
count -= chars; count -= chars;
buf += chars; buf += chars;
do_wakeup = 1;
} }
if (!count)
/* Cache behaviour optimization */ break; /* common path: read succeeded */
if (!PIPE_LEN(*inode)) if (PIPE_LEN(*inode)) /* test for cyclic buffers */
PIPE_START(*inode) = 0; continue;
if (!PIPE_WRITERS(*inode))
if (count && PIPE_WAITING_WRITERS(*inode) && !(filp->f_flags & O_NONBLOCK)) { break;
/* if (!PIPE_WAITING_WRITERS(*inode)) {
* We know that we are going to sleep: signal /* syscall merging: Usually we must not sleep
* writers synchronously that there is more * if O_NONBLOCK is set, or if we got some data.
* room. * But if a writer sleeps in kernel space, then
* we can wait for that data without violating POSIX.
*/ */
wake_up_interruptible_sync(PIPE_WAIT(*inode)); if (ret)
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); break;
if (!PIPE_EMPTY(*inode)) if (filp->f_flags & O_NONBLOCK) {
BUG(); ret = -EAGAIN;
goto do_more_read; break;
} }
/* Signal writers asynchronously that there is more room. */ }
if (signal_pending(current)) {
if (!ret) ret = -ERESTARTSYS;
break;
}
if (do_wakeup) {
wake_up_interruptible(PIPE_WAIT(*inode)); wake_up_interruptible(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
}
ret = read; pipe_wait(inode);
out: }
up(PIPE_SEM(*inode)); up(PIPE_SEM(*inode));
out_nolock: /* Signal writers asynchronously that there is more room. */
if (read) if (do_wakeup) {
ret = read; wake_up_interruptible_sync(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
}
if (ret > 0)
UPDATE_ATIME(inode);
return ret; return ret;
} }
...@@ -142,116 +129,90 @@ static ssize_t ...@@ -142,116 +129,90 @@ static ssize_t
pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
{ {
struct inode *inode = filp->f_dentry->d_inode; struct inode *inode = filp->f_dentry->d_inode;
ssize_t free, written, ret; ssize_t ret;
size_t min;
int do_wakeup;
/* Seeks are not allowed on pipes. */ /* pwrite is not allowed on pipes. */
ret = -ESPIPE; if (unlikely(ppos != &filp->f_pos))
written = 0; return -ESPIPE;
if (ppos != &filp->f_pos)
goto out_nolock;
/* Null write succeeds. */ /* Null write succeeds. */
ret = 0; if (unlikely(count == 0))
if (count == 0) return 0;
goto out_nolock;
ret = -ERESTARTSYS;
if (down_interruptible(PIPE_SEM(*inode)))
goto out_nolock;
/* No readers yields SIGPIPE. */
if (!PIPE_READERS(*inode))
goto sigpipe;
/* If count <= PIPE_BUF, we have to make it atomic. */
free = (count <= PIPE_BUF ? count : 1);
/* Wait, or check for, available space. */
if (filp->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
if (PIPE_FREE(*inode) < free)
goto out;
} else {
while (PIPE_FREE(*inode) < free) {
PIPE_WAITING_WRITERS(*inode)++;
pipe_wait(inode);
PIPE_WAITING_WRITERS(*inode)--;
ret = -ERESTARTSYS;
if (signal_pending(current))
goto out;
if (!PIPE_READERS(*inode)) do_wakeup = 0;
goto sigpipe; ret = 0;
} min = count;
if (min > PIPE_BUF)
min = 1;
down(PIPE_SEM(*inode));
for (;;) {
int free;
if (!PIPE_READERS(*inode)) {
send_sig(SIGPIPE, current, 0);
if (!ret) ret = -EPIPE;
break;
} }
free = PIPE_FREE(*inode);
/* Copy into available space. */ if (free >= min) {
ret = -EFAULT; /* transfer data */
while (count > 0) {
int space;
char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
ssize_t chars = PIPE_MAX_WCHUNK(*inode); ssize_t chars = PIPE_MAX_WCHUNK(*inode);
char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
if ((space = PIPE_FREE(*inode)) != 0) { /* Always wakeup, even if the copy fails. Otherwise
* we lock up (O_NONBLOCK-)readers that sleep due to
* syscall merging.
*/
do_wakeup = 1;
if (chars > count) if (chars > count)
chars = count; chars = count;
if (chars > space) if (chars > free)
chars = space; chars = free;
if (copy_from_user(pipebuf, buf, chars)) if (copy_from_user(pipebuf, buf, chars)) {
goto out; if (!ret) ret = -EFAULT;
break;
}
written += chars; ret += chars;
PIPE_LEN(*inode) += chars; PIPE_LEN(*inode) += chars;
count -= chars; count -= chars;
buf += chars; buf += chars;
space = PIPE_FREE(*inode); }
if (!count)
break;
if (PIPE_FREE(*inode) && ret) {
/* handle cyclic data buffers */
min = 1;
continue; continue;
} }
if (filp->f_flags & O_NONBLOCK) {
ret = written; if (!ret) ret = -EAGAIN;
if (filp->f_flags & O_NONBLOCK)
break; break;
}
do { if (signal_pending(current)) {
/* if (!ret) ret = -ERESTARTSYS;
* Synchronous wake-up: it knows that this process break;
* is going to give up this CPU, so it doesn't have }
* to do idle reschedules. if (do_wakeup) {
*/
wake_up_interruptible_sync(PIPE_WAIT(*inode)); wake_up_interruptible_sync(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
do_wakeup = 0;
}
PIPE_WAITING_WRITERS(*inode)++; PIPE_WAITING_WRITERS(*inode)++;
pipe_wait(inode); pipe_wait(inode);
PIPE_WAITING_WRITERS(*inode)--; PIPE_WAITING_WRITERS(*inode)--;
if (signal_pending(current))
goto out;
if (!PIPE_READERS(*inode))
goto sigpipe;
} while (!PIPE_FREE(*inode));
ret = -EFAULT;
} }
up(PIPE_SEM(*inode));
/* Signal readers asynchronously that there is more data. */ if (do_wakeup) {
wake_up_interruptible(PIPE_WAIT(*inode)); wake_up_interruptible(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
}
if (ret > 0) {
inode->i_ctime = inode->i_mtime = CURRENT_TIME; inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty(inode); mark_inode_dirty(inode);
}
out:
up(PIPE_SEM(*inode));
out_nolock:
if (written)
ret = written;
return ret; return ret;
sigpipe:
if (written)
goto out;
up(PIPE_SEM(*inode));
send_sig(SIGPIPE, current, 0);
return -EPIPE;
} }
static ssize_t static ssize_t
...@@ -525,7 +486,7 @@ struct inode* pipe_new(struct inode* inode) ...@@ -525,7 +486,7 @@ struct inode* pipe_new(struct inode* inode)
PIPE_BASE(*inode) = (char*) page; PIPE_BASE(*inode) = (char*) page;
PIPE_START(*inode) = PIPE_LEN(*inode) = 0; PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0; PIPE_WAITING_WRITERS(*inode) = 0;
PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
*PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL; *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
......
...@@ -9,7 +9,6 @@ struct pipe_inode_info { ...@@ -9,7 +9,6 @@ struct pipe_inode_info {
unsigned int start; unsigned int start;
unsigned int readers; unsigned int readers;
unsigned int writers; unsigned int writers;
unsigned int waiting_readers;
unsigned int waiting_writers; unsigned int waiting_writers;
unsigned int r_counter; unsigned int r_counter;
unsigned int w_counter; unsigned int w_counter;
...@@ -28,7 +27,6 @@ struct pipe_inode_info { ...@@ -28,7 +27,6 @@ struct pipe_inode_info {
#define PIPE_LEN(inode) ((inode).i_pipe->len) #define PIPE_LEN(inode) ((inode).i_pipe->len)
#define PIPE_READERS(inode) ((inode).i_pipe->readers) #define PIPE_READERS(inode) ((inode).i_pipe->readers)
#define PIPE_WRITERS(inode) ((inode).i_pipe->writers) #define PIPE_WRITERS(inode) ((inode).i_pipe->writers)
#define PIPE_WAITING_READERS(inode) ((inode).i_pipe->waiting_readers)
#define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers) #define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers)
#define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter) #define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter)
#define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter) #define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment