Commit 4142e0d1 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'osync_cleanup' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6

* 'osync_cleanup' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6:
  fsync: wait for data writeout completion before calling ->fsync
  vfs: Remove generic_osync_inode() and sync_page_range{_nolock}()
  fat: Opencode sync_page_range_nolock()
  pohmelfs: Use new syncing helper
  xfs: Convert sync_page_range() to simple filemap_write_and_wait_range()
  ocfs2: Update syncing after splicing to match generic version
  ntfs: Use new syncing helpers and update comments
  ext4: Remove syncing logic from ext4_file_write
  ext3: Remove syncing logic from ext3_file_write
  ext2: Update comment about generic_osync_inode
  vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  vfs: Rename generic_file_aio_write_nolock
  ocfs2: Use __generic_file_aio_write instead of generic_file_aio_write_nolock
  pohmelfs: Use __generic_file_aio_write instead of generic_file_aio_write_nolock
  vfs: Remove syncing from generic_file_direct_write() and generic_file_buffered_write()
  vfs: Export __generic_file_aio_write() and add some comments
  vfs: Introduce filemap_fdatawait_range
parents 33f1de69 2daea67e
......@@ -246,7 +246,7 @@ static const struct file_operations raw_fops = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.write = do_sync_write,
.aio_write = generic_file_aio_write_nolock,
.aio_write = blkdev_aio_write,
.open = raw_open,
.release= raw_release,
.ioctl = raw_ioctl,
......
......@@ -921,16 +921,16 @@ ssize_t pohmelfs_write(struct file *file, const char __user *buf,
if (ret)
goto err_out_unlock;
ret = generic_file_aio_write_nolock(&kiocb, &iov, 1, pos);
ret = __generic_file_aio_write(&kiocb, &iov, 1, &kiocb.ki_pos);
*ppos = kiocb.ki_pos;
mutex_unlock(&inode->i_mutex);
WARN_ON(ret < 0);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (ret > 0) {
ssize_t err;
err = sync_page_range(inode, mapping, pos, ret);
err = generic_write_sync(file, pos, ret);
if (err < 0)
ret = err;
WARN_ON(ret < 0);
......
......@@ -1404,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return blkdev_ioctl(bdev, mode, cmd, arg);
}
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
*
* Does not take i_mutex for the write and thus is not for general purpose
* use.
*/
ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
if (err < 0 && ret > 0)
ret = err;
}
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_aio_write);
/*
* Try to release a page associated with block device when the system
* is under memory pressure.
......@@ -1436,7 +1463,7 @@ const struct file_operations def_blk_fops = {
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write_nolock,
.aio_write = blkdev_aio_write,
.mmap = generic_file_mmap,
.fsync = block_fsync,
.unlocked_ioctl = block_ioctl,
......
......@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
unlock_buffer(bh);
mark_buffer_dirty_inode(bh, inode);
/* We used to sync bh here if IS_SYNC(inode).
* But we now rely upon generic_osync_inode()
* But we now rely upon generic_write_sync()
* and b_inode_buffers. But not for directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
......
......@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
return 0;
}
static ssize_t
ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
ssize_t ret;
int err;
ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
/*
* Skip flushing if there was an error, or if nothing was written.
*/
if (ret <= 0)
return ret;
/*
* If the inode is IS_SYNC, or is O_SYNC and we are doing data
* journalling then we need to make sure that we force the transaction
* to disk to keep all metadata uptodate synchronously.
*/
if (file->f_flags & O_SYNC) {
/*
* If we are non-data-journaled, then the dirty data has
* already been flushed to backing store by generic_osync_inode,
* and the inode has been flushed too if there have been any
* modifications other than mere timestamp updates.
*
* Open question --- do we care about flushing timestamps too
* if the inode is IS_SYNC?
*/
if (!ext3_should_journal_data(inode))
return ret;
goto force_commit;
}
/*
* So we know that there has been no forced data flush. If the inode
* is marked IS_SYNC, we need to force one ourselves.
*/
if (!IS_SYNC(inode))
return ret;
/*
* Open question #2 --- should we force data to disk here too? If we
* don't, the only impact is that data=writeback filesystems won't
* flush data to disk automatically on IS_SYNC, only metadata (but
* historically, that is what ext2 has done.)
*/
force_commit:
err = ext3_force_commit(inode->i_sb);
if (err)
return err;
return ret;
}
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = ext3_file_write,
.aio_write = generic_file_aio_write,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
......
......@@ -58,10 +58,7 @@ static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
ssize_t ret;
int err;
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
/*
* If we have encountered a bitmap-format file, the size limit
......@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
}
}
ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
/*
* Skip flushing if there was an error, or if nothing was written.
*/
if (ret <= 0)
return ret;
/*
* If the inode is IS_SYNC, or is O_SYNC and we are doing data
* journalling then we need to make sure that we force the transaction
* to disk to keep all metadata uptodate synchronously.
*/
if (file->f_flags & O_SYNC) {
/*
* If we are non-data-journaled, then the dirty data has
* already been flushed to backing store by generic_osync_inode,
* and the inode has been flushed too if there have been any
* modifications other than mere timestamp updates.
*
* Open question --- do we care about flushing timestamps too
* if the inode is IS_SYNC?
*/
if (!ext4_should_journal_data(inode))
return ret;
goto force_commit;
}
/*
* So we know that there has been no forced data flush. If the inode
* is marked IS_SYNC, we need to force one ourselves.
*/
if (!IS_SYNC(inode))
return ret;
/*
* Open question #2 --- should we force data to disk here too? If we
* don't, the only impact is that data=writeback filesystems won't
* flush data to disk automatically on IS_SYNC, only metadata (but
* historically, that is what ext2 has done.)
*/
force_commit:
err = ext4_force_commit(inode->i_sb);
if (err)
return err;
return ret;
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
static struct vm_operations_struct ext4_file_vm_ops = {
......
......@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
if (IS_SYNC(inode))
err = sync_page_range_nolock(inode, mapping, start, count);
if (IS_SYNC(inode)) {
int err2;
/*
* Opencode syncing since we don't have a file open to use
* standard fsync path.
*/
err = filemap_fdatawrite_range(mapping, start,
start + count - 1);
err2 = sync_mapping_buffers(mapping);
if (!err)
err = err2;
err2 = write_inode_now(inode, 1);
if (!err)
err = err2;
if (!err) {
err = filemap_fdatawait_range(mapping, start,
start + count - 1);
}
}
out:
return err;
}
......
......@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
MSDOS_I(inode)->i_start = new_dclus;
MSDOS_I(inode)->i_logstart = new_dclus;
/*
* Since generic_osync_inode() synchronize later if
* this is not directory, we don't here.
* Since generic_write_sync() synchronizes regular files later,
* we sync here only directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
ret = fat_sync_inode(inode);
......
......@@ -1242,57 +1242,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
EXPORT_SYMBOL(sync_inode);
/**
* generic_osync_inode - flush all dirty data for a given inode to disk
* @inode: inode to write
* @mapping: the address_space that should be flushed
* @what: what to write and wait upon
*
* This can be called by file_write functions for files which have the
* O_SYNC flag set, to flush dirty writes to disk.
*
* @what is a bitmask, specifying which part of the inode's data should be
* written and waited upon.
*
* OSYNC_DATA: i_mapping's dirty data
* OSYNC_METADATA: the buffers at i_mapping->private_list
* OSYNC_INODE: the inode itself
*/
int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
{
int err = 0;
int need_write_inode_now = 0;
int err2;
if (what & OSYNC_DATA)
err = filemap_fdatawrite(mapping);
if (what & (OSYNC_METADATA|OSYNC_DATA)) {
err2 = sync_mapping_buffers(mapping);
if (!err)
err = err2;
}
if (what & OSYNC_DATA) {
err2 = filemap_fdatawait(mapping);
if (!err)
err = err2;
}
spin_lock(&inode_lock);
if ((inode->i_state & I_DIRTY) &&
((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
need_write_inode_now = 1;
spin_unlock(&inode_lock);
if (need_write_inode_now) {
err2 = write_inode_now(inode, 1);
if (!err)
err = err2;
}
else
inode_sync_wait(inode);
return err;
}
EXPORT_SYMBOL(generic_osync_inode);
......@@ -2076,14 +2076,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
*ppos = pos;
if (cached_page)
page_cache_release(cached_page);
/* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
if (likely(!status)) {
if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
status = generic_osync_inode(vi, mapping,
OSYNC_METADATA|OSYNC_DATA);
}
}
pagevec_lru_add_file(&lru_pvec);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
written ? "written" : "status", (unsigned long)written,
......@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
mutex_lock(&inode->i_mutex);
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = sync_page_range(inode, mapping, pos, ret);
if (ret > 0) {
int err = generic_write_sync(file, pos, ret);
if (err < 0)
ret = err;
}
......@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = sync_page_range(inode, mapping, *ppos - ret, ret);
if (ret > 0) {
int err = generic_write_sync(file, *ppos - ret, ret);
if (err < 0)
ret = err;
}
......
......@@ -384,13 +384,12 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
* it is dirty in the inode meta data rather than the data page cache of the
* inode, and thus there are no data pages that need writing out. Therefore, a
* full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
* other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
* ensure ->write_inode is called from generic_osync_inode() and this needs to
* happen or the file data would not necessarily hit the device synchronously,
* even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC
* simply "feels" better than just I_DIRTY_SYNC, since the file data has not
* actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
* would suggest.
* other hand, is not sufficient, because ->write_inode needs to be called even
* in case of fdatasync. This needs to happen or the file data would not
* necessarily hit the device synchronously, even though the vfs inode has the
* O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
* I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
* which is not what I_DIRTY_SYNC on its own would suggest.
*/
void __mark_mft_record_dirty(ntfs_inode *ni)
{
......
......@@ -1871,8 +1871,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
goto out_dio;
}
} else {
written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
*ppos);
written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
}
out_dio:
......@@ -1880,18 +1879,21 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
/*
* The generic write paths have handled getting data
* to disk, but since we don't make use of the dirty
* inode list, a manual journal commit is necessary
* here.
*/
if (old_size != i_size_read(inode) ||
old_clusters != OCFS2_I(inode)->ip_clusters) {
ret = filemap_fdatawrite_range(file->f_mapping, pos,
pos + count - 1);
if (ret < 0)
written = ret;
if (!ret && (old_size != i_size_read(inode) ||
old_clusters != OCFS2_I(inode)->ip_clusters)) {
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
if (!ret)
ret = filemap_fdatawait_range(file->f_mapping, pos,
pos + count - 1);
}
/*
......@@ -1991,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
if (ret > 0) {
unsigned long nr_pages;
*ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/*
* If file or inode is SYNC and we actually wrote some data,
* sync it.
*/
if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err;
mutex_lock(&inode->i_mutex);
err = ocfs2_rw_lock(inode, 1);
if (err < 0) {
mlog_errno(err);
} else {
err = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
ocfs2_rw_unlock(inode, 1);
}
mutex_unlock(&inode->i_mutex);
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
err = generic_write_sync(out, *ppos, ret);
if (err)
ret = err;
}
else
*ppos += ret;
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
......
......@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (ret > 0) {
unsigned long nr_pages;
*ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/*
* If file or inode is SYNC and we actually wrote some data,
* sync it.
*/
if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err;
mutex_lock(&inode->i_mutex);
err = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
mutex_unlock(&inode->i_mutex);
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
err = generic_write_sync(out, *ppos, ret);
if (err)
ret = err;
}
else
*ppos += ret;
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
......
......@@ -178,19 +178,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
}
/**
* vfs_fsync - perform a fsync or fdatasync on a file
* vfs_fsync_range - helper to sync a range of data & metadata to disk
* @file: file to sync
* @dentry: dentry of @file
* @data: only perform a fdatasync operation
* @start: offset in bytes of the beginning of data range to sync
* @end: offset in bytes of the end of data range (inclusive)
* @datasync: perform only datasync
*
* Write back data and metadata for @file to disk. If @datasync is
* set only metadata needed to access modified file data is written.
* Write back data in range @start..@end and metadata for @file to disk. If
* @datasync is set only metadata needed to access modified file data is
* written.
*
* In case this function is called from nfsd @file may be %NULL and
* only @dentry is set. This can only happen when the filesystem
* implements the export_operations API.
*/
int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
loff_t end, int datasync)
{
const struct file_operations *fop;
struct address_space *mapping;
......@@ -214,7 +218,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
goto out;
}
ret = filemap_fdatawrite(mapping);
ret = filemap_write_and_wait_range(mapping, start, end);
/*
* We need to protect against concurrent writers, which could cause
......@@ -225,12 +229,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
err = filemap_fdatawait(mapping);
if (!ret)
ret = err;
out:
return ret;
}
EXPORT_SYMBOL(vfs_fsync_range);
/**
* vfs_fsync - perform a fsync or fdatasync on a file
* @file: file to sync
* @dentry: dentry of @file
* @datasync: only perform a fdatasync operation
*
* Write back data and metadata for @file to disk. If @datasync is
* set only metadata needed to access modified file data is written.
*
* In case this function is called from nfsd @file may be %NULL and
* only @dentry is set. This can only happen when the filesystem
* implements the export_operations API.
*/
int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
......@@ -256,6 +277,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
return do_fsync(fd, 1);
}
/**
* generic_write_sync - perform syncing after a write if file / inode is sync
* @file: file to which the write happened
* @pos: offset where the write started
* @count: length of the write
*
* This is just a simple wrapper about our general syncing function.
*/
int generic_write_sync(struct file *file, loff_t pos, loff_t count)
{
if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
return 0;
return vfs_fsync_range(file, file->f_path.dentry, pos,
pos + count - 1, 1);
}
EXPORT_SYMBOL(generic_write_sync);
/*
* sys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
......
......@@ -817,7 +817,8 @@ xfs_write(
xfs_iunlock(xip, iolock);
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
error2 = sync_page_range(inode, mapping, pos, ret);
error2 = filemap_write_and_wait_range(mapping, pos,
pos + ret - 1);
if (!error)
error = error2;
if (need_i_mutex)
......
......@@ -1455,11 +1455,6 @@ int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
#define DT_SOCK 12
#define DT_WHT 14
#define OSYNC_METADATA (1<<0)
#define OSYNC_DATA (1<<1)
#define OSYNC_INODE (1<<2)
int generic_osync_inode(struct inode *, struct address_space *, int);
/*
* This is the "filldir" function type, used by readdir() to let
* the kernel specify what kind of dirent layout it wants to have.
......@@ -2086,6 +2081,8 @@ extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
loff_t lend);
extern int filemap_write_and_wait(struct address_space *mapping);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
......@@ -2096,7 +2093,10 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
extern int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end);
extern int vfs_fsync_range(struct file *file, struct dentry *dentry,
loff_t start, loff_t end, int datasync);
extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
extern void sync_supers(void);
extern void emergency_sync(void);
extern void emergency_remount(void);
......@@ -2202,9 +2202,9 @@ extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
loff_t *);
extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
unsigned long *, loff_t, loff_t *, size_t, size_t);
extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
......@@ -2214,6 +2214,10 @@ extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t l
extern int generic_segment_checks(const struct iovec *iov,
unsigned long *nr_segs, size_t *count, int access_flags);
/* fs/block_dev.c */
extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
/* fs/splice.c */
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
......
......@@ -150,10 +150,6 @@ int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
int sync_page_range(struct inode *inode, struct address_space *mapping,
loff_t pos, loff_t count);
int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
loff_t pos, loff_t count);
void set_page_dirty_balance(struct page *page, int page_mkwrite);
void writeback_set_ratelimit(void);
......
......@@ -39,11 +39,10 @@
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
#include <linux/buffer_head.h> /* for generic_osync_inode */
#include <linux/buffer_head.h> /* for try_to_free_buffers */
#include <asm/mman.h>
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
......@@ -307,68 +306,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
}
/**
* sync_page_range - write and wait on all pages in the passed range
* @inode: target inode
* @mapping: target address_space
* @pos: beginning offset in pages to write
* @count: number of bytes to write
*
* Write and wait upon all the pages in the passed range. This is a "data
* integrity" operation. It waits upon in-flight writeout before starting and
* waiting upon new writeout. If there was an IO error, return it.
* filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
* @mapping: address space structure to wait for
* @start: offset in bytes where the range starts
* @end: offset in bytes where the range ends (inclusive)
*
* We need to re-take i_mutex during the generic_osync_inode list walk because
* it is otherwise livelockable.
*/
int sync_page_range(struct inode *inode, struct address_space *mapping,
loff_t pos, loff_t count)
{
pgoff_t start = pos >> PAGE_CACHE_SHIFT;
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;
if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0) {
mutex_lock(&inode->i_mutex);
ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
mutex_unlock(&inode->i_mutex);
}
if (ret == 0)
ret = wait_on_page_writeback_range(mapping, start, end);
return ret;
}
EXPORT_SYMBOL(sync_page_range);
/**
* sync_page_range_nolock - write & wait on all pages in the passed range without locking
* @inode: target inode
* @mapping: target address_space
* @pos: beginning offset in pages to write
* @count: number of bytes to write
* Walk the list of under-writeback pages of the given address space
* in the given range and wait for all of them.
*
* Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
* as it forces O_SYNC writers to different parts of the same file
* to be serialised right until io completion.
* This is just a simple wrapper so that callers don't have to convert offsets
* to page indexes themselves
*/
int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
loff_t pos, loff_t count)
int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
loff_t end)
{
pgoff_t start = pos >> PAGE_CACHE_SHIFT;
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;
if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0)
ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
if (ret == 0)
ret = wait_on_page_writeback_range(mapping, start, end);
return ret;
return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
end >> PAGE_CACHE_SHIFT);
}
EXPORT_SYMBOL(sync_page_range_nolock);
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
* filemap_fdatawait - wait for all under-writeback pages to complete
......@@ -2167,20 +2122,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
*ppos = end;
}
/*
* Sync the fs metadata but not the minor inode changes and
* of course not the data as we did direct DMA for the IO.
* i_mutex is held, which protects generic_osync_inode() from
* livelocking. AIO O_DIRECT ops attempt to sync metadata here.
*/
out:
if ((written >= 0 || written == -EIOCBQUEUED) &&
((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
if (err < 0)
written = err;
}
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
......@@ -2312,8 +2254,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
ssize_t status;
struct iov_iter i;
......@@ -2323,16 +2263,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
if (likely(status >= 0)) {
written += status;
*ppos = pos + status;
/*
* For now, when the user asks for O_SYNC, we'll actually give
* O_DSYNC
*/
if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (!a_ops->writepage || !is_sync_kiocb(iocb))
status = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
}
}
/*
......@@ -2348,8 +2278,26 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
}
EXPORT_SYMBOL(generic_file_buffered_write);
static ssize_t
__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
/**
* __generic_file_aio_write - write data to a file
* @iocb: IO state structure (file, offset, etc.)
* @iov: vector with data to write
* @nr_segs: number of segments in the vector
* @ppos: position where to write
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
* It expects i_mutex to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
*/
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
......@@ -2447,51 +2395,37 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
current->backing_dev_info = NULL;
return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_aio_write);
ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
const struct iovec *iov, unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
&iocb->ki_pos);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ssize_t err;
err = sync_page_range_nolock(inode, mapping, pos, ret);
if (err < 0)
ret = err;
}
return ret;
}
EXPORT_SYMBOL(generic_file_aio_write_nolock);
/**
* generic_file_aio_write - write data to a file
* @iocb: IO state structure
* @iov: vector with data to write
* @nr_segs: number of segments in the vector
* @pos: position in file where to write
*
* This is a wrapper around __generic_file_aio_write() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
*/
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
&iocb->ki_pos);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
err = sync_page_range(inode, mapping, pos, ret);
if (err < 0)
err = generic_write_sync(file, pos, ret);
if (err < 0 && ret > 0)
ret = err;
}
return ret;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment