fs: pass ppos=NULL to .read()/.write() of FMODE_STREAM files
This amends commit 10dce8af ("fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock") in how position is passed into .read()/.write() handler for stream-like files: Rasmus noticed that we currently pass 0 as position and ignore any position change if that is done by a file implementation. This papers over bugs if ppos is used in files that declare themselves as being stream-like as such bugs will go unnoticed. Even if a file implementation is correctly converted into using stream_open, its read/write later could be changed to use ppos and even though that won't be working correctly, that bug might go unnoticed without someone doing wrong behaviour analysis. It is thus better to pass ppos=NULL into read/write for stream-like files as that don't give any chance for ppos usage bugs because it will oops if ppos is ever used inside .read() or .write(). Rasmus also made the point that FMODE_STREAM patch added 2 branches into ksys_read/ksys_write patch which is too much unnecessary overhead and could be reduces. Let's rework the code to pass ppos=NULL and have less branches. The first approach could be to revert file_pos_read and file_pos_write into their pre-FMODE_STREAM version and just do --- a/fs/read_write.c +++ b/fs/read_write.c @@ -575,7 +575,7 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) if (f.file) { loff_t pos = file_pos_read(f.file); - ret = vfs_read(f.file, buf, count, &pos); + ret = vfs_read(f.file, buf, count, (file->f_mode & FMODE_STREAM) ? NULL : &pos); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); this adds no branches at all as `(file->f_mode & FMODE_STREAM) ? NULL : &pos)` translates to single cmov instruction on x86_64: if (f.file) { 18e5: 48 89 c3 mov %rax,%rbx 18e8: 48 83 e3 fc and $0xfffffffffffffffc,%rbx 18ec: 74 79 je 1967 <ksys_read+0xa7> 18ee: 48 89 c5 mov %rax,%rbp loff_t pos = f.file->f_pos; 18f1: 48 8b 43 68 mov 0x68(%rbx),%rax ret = vfs_read(f.file, buf, count, (f.file->f_mode & FMODE_STREAM) ? NULL : &pos); 18f5: 48 89 e1 mov %rsp,%rcx 18f8: f6 43 46 20 testb $0x20,0x46(%rbx) 18fc: 4c 89 e6 mov %r12,%rsi 18ff: 4c 89 ea mov %r13,%rdx 1902: 48 89 df mov %rbx,%rdi loff_t pos = f.file->f_pos; 1905: 48 89 04 24 mov %rax,(%rsp) ret = vfs_read(f.file, buf, count, (f.file->f_mode & FMODE_STREAM) ? NULL : &pos); 1909: b8 00 00 00 00 mov $0x0,%eax 190e: 48 0f 45 c8 cmovne %rax,%rcx <-- NOTE 1912: e8 00 00 00 00 callq 1917 <ksys_read+0x57> 1913: R_X86_64_PLT32 vfs_read-0x4 However this leaves on unconditional write into file->f_pos after vfs_read call, and even though it should not be affecting semantic, it will bug under race detector (e.g. KTSAN) and probably it might add some inter-CPU overhead if read and write handlers are running on different cores. If we instead move `file->f_mode & FMODE_STREAM` checking into vfs functions that actually dispatch IO and care to load/store file->f_fpos only for non-stream files, even though it is 3 branches if looking at C source, gcc generates only 1 more branch compared to how it was pre-FMODE_STREAM. For example consider updated ksys_read: ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { - loff_t pos = file_pos_read(f.file); - ret = vfs_read(f.file, buf, count, &pos); - if (ret >= 0) - file_pos_write(f.file, pos); + loff_t pos, *ppos = f.file->f_mode & FMODE_STREAM ? NULL : &pos; + if (ppos) + pos = f.file->f_pos; + ret = vfs_read(f.file, buf, count, ppos); + if (ret >= 0 && ppos) + f.file->f_pos = pos; fdput_pos(f); } return ret; The code that gcc generates here is essentially as if the source was: if (f.file->f_mode & FMODE_STREAM) { ret = vfs_read(f.file, buf, count, NULL); } else { loff_t pos = f.file->f_pos; ret = vfs_read(f.file, buf, count, ppos); if (ret >= 0) f.file->f_pos = pos; } fdput_pos(f); i.e. it branches only once after checking file->f_mode and then has two distinct codepaths each with its own vfs_read call: if (f.file) { 18e5: 48 89 c5 mov %rax,%rbp 18e8: 48 83 e5 fc and $0xfffffffffffffffc,%rbp 18ec: 0f 84 89 00 00 00 je 197b <ksys_read+0xbb> 18f2: 48 89 c3 mov %rax,%rbx loff_t pos, *ppos = (f.file->f_mode & FMODE_STREAM) ? NULL : &pos; 18f5: f6 45 46 20 testb $0x20,0x46(%rbp) 18f9: 74 3b je 1936 <ksys_read+0x76> <-- branch pos/no-pos ret = vfs_read(f.file, buf, count, ppos); 18fb: 4c 89 e6 mov %r12,%rsi <-- start of IO without pos 18fe: 31 c9 xor %ecx,%ecx 1900: 4c 89 ea mov %r13,%rdx 1903: 48 89 ef mov %rbp,%rdi 1906: e8 00 00 00 00 callq 190b <ksys_read+0x4b> 1907: R_X86_64_PLT32 vfs_read-0x4 <-- vfs_read(..., ppos=NULL) 190b: 49 89 c4 mov %rax,%r12 if (f.flags & FDPUT_POS_UNLOCK) 190e: f6 c3 02 test $0x2,%bl 1911: 75 51 jne 1964 <ksys_read+0xa4> if (fd.flags & FDPUT_FPUT) 1913: 83 e3 01 and $0x1,%ebx 1916: 75 59 jne 1971 <ksys_read+0xb1> } 1918: 48 8b 4c 24 08 mov 0x8(%rsp),%rcx 191d: 65 48 33 0c 25 28 00 xor %gs:0x28,%rcx 1924: 00 00 1926: 4c 89 e0 mov %r12,%rax 1929: 75 59 jne 1984 <ksys_read+0xc4> 192b: 48 83 c4 10 add $0x10,%rsp 192f: 5b pop %rbx 1930: 5d pop %rbp 1931: 41 5c pop %r12 1933: 41 5d pop %r13 1935: c3 retq pos = f.file->f_pos; 1936: 48 8b 45 68 mov 0x68(%rbp),%rax <-- start of IO with pos ret = vfs_read(f.file, buf, count, ppos); 193a: 4c 89 e6 mov %r12,%rsi 193d: 48 89 e1 mov %rsp,%rcx 1940: 4c 89 ea mov %r13,%rdx 1943: 48 89 ef mov %rbp,%rdi pos = f.file->f_pos; 1946: 48 89 04 24 mov %rax,(%rsp) ret = vfs_read(f.file, buf, count, ppos); 194a: e8 00 00 00 00 callq 194f <ksys_read+0x8f> 194b: R_X86_64_PLT32 vfs_read-0x4 <-- vfs_read(..., ppos=&pos) 194f: 49 89 c4 mov %rax,%r12 if (ret >= 0 && ppos) 1952: 48 85 c0 test %rax,%rax 1955: 78 b7 js 190e <ksys_read+0x4e> f.file->f_pos = pos; 1957: 48 8b 04 24 mov (%rsp),%rax 195b: 48 89 45 68 mov %rax,0x68(%rbp) if (f.flags & FDPUT_POS_UNLOCK) ... If adding one branch to handle FMODE_STREAM is acceptable, this approach should be ok. Not duplicating vfs_read calls at C level is better as C-level code is more clear without such duplication, and gcc cares to optimize the function properly to have only 1 branch depending on file->f_mode. If even 1 extra branch is unacceptable, we should indeed go with the first option described in this patch but be prepared for race-detector bug reports and probably some inter-CPU overhead. Overal I would suggest to use simpler approach presented by hereby patch unless 1-extra jump could be shown to cause noticable slowdowns in practice. Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Showing
Please register or sign in to comment