Commit fe7e689f authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] async write errors: report truncate and io errors on close

From: Oliver Xymoron <oxymoron@waste.org>

These patches add the infrastructure for reporting asynchronous write errors
to block devices to userspace.  Errors which are detected due to pdflush or VM
writeout are reported at the next fsync, fdatasync, or msync on the given
file, and on close if the error occurs in time.

We do this by propagating any errors into page->mapping->error when they are
detected.  In fsync(), msync(), fdatasync() and close() we return that error
and zero it out.


The Open Group say close() _may_ fail if an I/O error occurred while reading
from or writing to the file system.  Well, in this implementation close() can
return -EIO or -ENOSPC.  And in that case it will succeed, not fail - perhaps
that is what they meant.


There are three patches in this series and testing has only been performed
with all three applied.
parent a5bfb7f3
......@@ -170,15 +170,29 @@ static void buffer_io_error(struct buffer_head *bh)
* Default synchronous end-of-IO handler.. Just mark it up-to-date and
* unlock the buffer. This is what ll_rw_block uses too.
*/
void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
/*
 * Synchronous end-of-read-I/O completion handler: record whether the read
 * succeeded in the buffer's uptodate state, then unlock the buffer and drop
 * the reference taken when the I/O was submitted.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
if (uptodate) {
set_buffer_uptodate(bh);
} else {
/*
 * A failed read is not reported here: this happens, for
 * example, due to failed READA (read-ahead) attempts.
 */
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
put_bh(bh);
}
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
char b[BDEVNAME_SIZE];
if (uptodate) {
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh);
printk(KERN_WARNING "lost page write due to I/O error on %s\n",
bdevname(bh->b_bdev, b));
set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
......@@ -550,6 +564,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
*/
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
char b[BDEVNAME_SIZE];
static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
unsigned long flags;
struct buffer_head *tmp;
......@@ -562,6 +577,9 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh);
printk(KERN_WARNING "lost page write due to I/O error on %s\n",
bdevname(bh->b_bdev, b));
page->mapping->error = -EIO;
clear_buffer_uptodate(bh);
SetPageError(page);
}
......@@ -1288,7 +1306,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
if (buffer_dirty(bh))
buffer_error();
get_bh(bh);
bh->b_end_io = end_buffer_io_sync;
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
......@@ -2646,8 +2664,10 @@ int submit_bh(int rw, struct buffer_head * bh)
buffer_error();
if (rw == READ && buffer_dirty(bh))
buffer_error();
set_buffer_req(bh);
/* Only clear out a write error when rewriting */
if (test_set_buffer_req(bh) && rw == WRITE)
clear_buffer_write_io_error(bh);
/*
* from here on down, it's all bio -- do the initial mapping,
......@@ -2707,13 +2727,14 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
continue;
get_bh(bh);
bh->b_end_io = end_buffer_io_sync;
if (rw == WRITE) {
bh->b_end_io = end_buffer_write_sync;
if (test_clear_buffer_dirty(bh)) {
submit_bh(WRITE, bh);
continue;
}
} else {
bh->b_end_io = end_buffer_read_sync;
if (!buffer_uptodate(bh)) {
submit_bh(rw, bh);
continue;
......@@ -2734,7 +2755,7 @@ void sync_dirty_buffer(struct buffer_head *bh)
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_io_sync;
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
wait_on_buffer(bh);
} else {
......@@ -2793,6 +2814,8 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = head;
do {
check_ttfb_buffer(page, bh);
if (buffer_write_io_error(bh))
page->mapping->error = -EIO;
if (buffer_busy(bh))
goto failed;
if (!buffer_uptodate(bh) && !buffer_req(bh))
......
......@@ -2431,7 +2431,7 @@ static int ext3_get_inode_loc(struct inode *inode,
* read the block from disk
*/
get_bh(bh);
bh->b_end_io = end_buffer_io_sync;
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
......
......@@ -145,6 +145,7 @@ static struct inode *alloc_inode(struct super_block *sb)
mapping->dirtied_when = 0;
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
mapping->error = 0;
if (sb->s_bdev)
mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
memset(&inode->u, 0, sizeof(inode->u));
......
......@@ -388,6 +388,7 @@ static struct bio *
mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
unsigned long end_index;
......@@ -562,6 +563,11 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
if (bio)
bio = mpage_bio_submit(WRITE, bio);
*ret = page->mapping->a_ops->writepage(page, wbc);
/*
* The caller has a ref on the inode, so *mapping is stable
*/
if (*ret < 0)
mapping->error = *ret;
out:
return bio;
}
......@@ -663,6 +669,8 @@ mpage_writepages(struct address_space *mapping,
test_clear_page_dirty(page)) {
if (writepage) {
ret = (*writepage)(page, wbc);
if (ret < 0)
mapping->error = ret;
} else {
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret, wbc);
......
......@@ -643,7 +643,7 @@ int ntfs_read_compressed_block(struct page *page)
continue;
}
atomic_inc(&tbh->b_count);
tbh->b_end_io = end_buffer_io_sync;
tbh->b_end_io = end_buffer_read_sync;
submit_bh(READ, tbh);
}
......
......@@ -944,15 +944,32 @@ asmlinkage long sys_creat(const char __user * pathname, int mode)
*/
int filp_close(struct file *filp, fl_owner_t id)
{
int retval;
struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
int retval = 0, err;
/* Report and clear outstanding errors */
err = filp->f_error;
if (err) {
filp->f_error = 0;
retval = err;
}
err = mapping->error;
if (!retval)
retval = err;
mapping->error = 0;
if (!file_count(filp)) {
printk(KERN_ERR "VFS: Close: file count is 0\n");
return 0;
return retval;
}
if (filp->f_op && filp->f_op->flush) {
err = filp->f_op->flush(filp);
if (!retval)
retval = err;
}
retval = 0;
if (filp->f_op && filp->f_op->flush)
retval = filp->f_op->flush(filp);
dnotify_flush(filp, id);
locks_remove_posix(filp, id);
fput(filp);
......
......@@ -24,8 +24,9 @@ enum bh_state_bits {
BH_Async_Read, /* Is under end_buffer_async_read I/O */
BH_Async_Write, /* Is under end_buffer_async_write I/O */
BH_Delay, /* Buffer is not yet allocated on disk */
BH_Boundary, /* Block is followed by a discontiguity */
BH_Write_EIO, /* I/O error on write */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
*/
......@@ -109,12 +110,14 @@ TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
TAS_BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new)
BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay);
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO,write_io_error)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
#define touch_buffer(bh) mark_page_accessed(bh->b_page)
......@@ -139,7 +142,8 @@ void set_bh_page(struct buffer_head *bh,
int try_to_free_buffers(struct page *);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
void end_buffer_async_write(struct buffer_head *bh, int uptodate);
/* Things to do with buffers at mapping->private_list */
......
......@@ -332,6 +332,7 @@ struct address_space {
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
int error; /* write error for fsync */
};
struct block_device {
......
......@@ -180,7 +180,8 @@ EXPORT_SYMBOL(d_splice_alias);
EXPORT_SYMBOL(d_lookup);
EXPORT_SYMBOL(d_path);
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(end_buffer_io_sync);
EXPORT_SYMBOL(end_buffer_read_sync);
EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(end_buffer_async_write);
EXPORT_SYMBOL(__mark_inode_dirty);
EXPORT_SYMBOL(get_empty_filp);
......
......@@ -203,6 +203,14 @@ int filemap_fdatawait(struct address_space * mapping)
spin_lock(&mapping->page_lock);
}
spin_unlock(&mapping->page_lock);
/* Check for outstanding write errors */
if (mapping->error) {
if (!ret)
ret = mapping->error;
mapping->error = 0;
}
return ret;
}
......
......@@ -235,6 +235,27 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
return 0;
}
/*
* We detected a synchronous write error writing a page out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
* prevents it from being freed up. But we have a ref on the page and once
* that page is locked, the mapping is pinned.
*
* We're allowed to run sleeping lock_page() here because we know the caller has
* __GFP_FS.
*/
/*
 * Record a synchronous writepage failure (typically -ENOSPC) in the
 * address_space error field so a later fsync(), msync() or close() can
 * report it.  Holding the page lock pins the mapping while we touch it.
 */
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
lock_page(page);
/* The page may have been truncated away from @mapping before we
 * acquired the lock; only record the error if it still belongs. */
if (page->mapping == mapping)
mapping->error = error;
unlock_page(page);
}
/*
* shrink_list returns the number of reclaimed pages
*/
......@@ -358,7 +379,8 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
if (res == WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
goto activate_locked;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment