Commit a4369a58 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] force_successful_syscall_return()

From: David Mosberger <davidm@napali.hpl.hp.com>, Christoph Hellwig

I believe this is the last outstanding piece that prevents ia64 from being
fully in sync with Linus' tree (yes, there are some minor ACPI changes
outstanding and a toolchain bug that's left to fix, but other than that, I
think we're clean).

Many architectures (alpha, ia64, ppc, ppc64, sparc, and sparc64 at least)
use a syscall convention which provides for a return value and a separate
error flag.  On those architectures, it can be beneficial if the kernel
provides a mechanism to signal that a syscall call has completed
successfully, even when the returned value is potentially a (small)
negative number.  The patch below provides a hook for such a mechanism via
a macro called force_successful_syscall_return().  On x86, this would be
simply a no-op (because on x86, user-level has to be hacked to handle such
cases).  On Alpha, it would be something along the lines of:

 #define force_successful_syscall_return()  ptregs->r0 = 0

where "ptregs" is a pointer to the user's ptregs structure of the current
task.  On ia64, we have been using this for a long time:

 static inline void force_successful_syscall_return (void) {
	ia64_task_regs(current)->r8 = 0;
 }

The other architectures (ppc, ppc64, sparc, and sparc64) currently have no
mechanism to force a syscall return to be successful.  But since the
syscall convention already provide for a separate error flag, the arch
maintainers could change this if they wanted to.

There are only 3 places in the platform-independent portion of the kernel
that need this macro:

 - memory_lseek() in drivers/char/mem.c
 - fs/fcntl.c for F_GETOWN
 - lseek for /proc/mem in fs/proc/array.c

Ideally, there are a couple of other places that could benefit from this
macro:

 - sys_getpriority()
 - sys_shmat()
 - sys_brk()
 - do_mmap2()
 - do_mremap()

but these are not so critical, because the can be worked around in
platform-specific code (e.g., see arch/ia64/kernel/sys_ia64.c).

Note that for the above 3 cases, handling them in user level is rather
suboptimal:

 - it would affect all lseek() syscalls, even though only /proc/mem and
   /dev/mem need the special treatment (at least until there are
   filesystems that can handle files >= 2^63 bytes)

 - all fcntl() calls would be affected, even though only F_GETOWN needs
   the special treatment

so I think handling these in the kernel for the platforms that can makes
tons of sense.

The only limitation of force_successful_syscall_return() is that it doesn't
help with system calls performed by the kernel.  But the kernel does that
so rarely and for such a limited set of syscalls that this is not a real
problem.
parent 792fb4b7
......@@ -23,6 +23,7 @@
#include <linux/capability.h>
#include <linux/smp_lock.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
#include <asm/uaccess.h>
#include <asm/io.h>
......@@ -524,20 +525,22 @@ static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
{
loff_t ret;
lock_kernel();
down(&file->f_dentry->d_inode->i_sem);
switch (orig) {
case 0:
file->f_pos = offset;
ret = file->f_pos;
force_successful_syscall_return();
break;
case 1:
file->f_pos += offset;
ret = file->f_pos;
force_successful_syscall_return();
break;
default:
ret = -EINVAL;
}
unlock_kernel();
up(&file->f_dentry->d_inode->i_sem);
return ret;
}
......
......@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
......@@ -318,6 +319,7 @@ static long do_fcntl(unsigned int fd, unsigned int cmd,
* to fix this will be in libc.
*/
err = filp->f_owner.pid;
force_successful_syscall_return();
break;
case F_SETOWN:
err = f_setown(filp, arg, 1);
......
......@@ -557,7 +557,24 @@ static ssize_t mem_write(struct file * file, const char * buf,
}
#endif
static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
{
switch (orig) {
case 0:
file->f_pos = offset;
break;
case 1:
file->f_pos += offset;
break;
default:
return -EINVAL;
}
force_successful_syscall_return();
return file->f_pos;
}
static struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
......
......@@ -70,6 +70,19 @@ struct switch_stack {
#define user_mode(regs) (((regs)->ps & 8) != 0)
#define instruction_pointer(regs) ((regs)->pc)
extern void show_regs(struct pt_regs *);
/*
* TODO: if kernel-only threads do not have a dummy pt_regs structure at the
* top of the stack, this would cause kernel stack corruption. Either check
* first that we're not dealing with a kernel thread or change the kernel
* stacks to allocate a dummy pt_regs structure.
*/
#define alpha_task_regs(task) ((struct pt_regs *) \
((long) task->thread_info + PAGE_SIZE) - 1)
#define force_successful_syscall_return() (alpha_task_regs(current)->r0 = 0)
#endif
#endif
......@@ -250,11 +250,10 @@ struct switch_stack {
extern void ia64_increment_ip (struct pt_regs *pt);
extern void ia64_decrement_ip (struct pt_regs *pt);
static inline void
force_successful_syscall_return (void)
{
ia64_task_regs(current)->r8 = 0;
}
#define force_successful_syscall_return() \
do { \
ia64_task_regs(current)->r8 = 0; \
} while (0)
#endif /* !__KERNEL__ */
......
......@@ -92,6 +92,23 @@ static inline void ptrace_unlink(struct task_struct *child)
if (unlikely(child->ptrace))
__ptrace_unlink(child);
}
#ifndef force_successful_syscall_return
/*
* System call handlers that, upon successful completion, need to return a
* negative value should call force_successful_syscall_return() right before
* returning. On architectures where the syscall convention provides for a
* separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly
* others), this macro can be used to ensure that the error flag will not get
* set. On architectures which do not support a separate error flag, the macro
* is a no-op and the spurious error condition needs to be filtered out by some
* other means (e.g., in user-level, by passing an extra argument to the
* syscall handler, or something along those lines).
*/
#define force_successful_syscall_return() do { } while (0)
#endif
#endif
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment