Commit 5439ca6b authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 bug fixes from Ted Ts'o:
 "Various bug fixes for ext4.  Perhaps the most serious bug fixed is one
  which could cause file system corruptions when performing file punch
  operations."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: avoid hang when mounting non-journal filesystems with orphan list
  ext4: lock i_mutex when truncating orphan inodes
  ext4: do not try to write superblock on ro remount w/o journal
  ext4: include journal blocks in df overhead calcs
  ext4: remove unaligned AIO warning printk
  ext4: fix an incorrect comment about i_mutex
  ext4: fix deadlock in journal_unmap_buffer()
  ext4: split off ext4_journalled_invalidatepage()
  jbd2: fix assertion failure in jbd2_journal_flush()
  ext4: check dioread_nolock on remount
  ext4: fix extent tree corruption caused by hole punch
parents a7a88b23 0e9a9a1a
......@@ -2226,13 +2226,14 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
* removes index from the index block.
*/
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
struct ext4_ext_path *path, int depth)
{
int err;
ext4_fsblk_t leaf;
/* free index block */
path--;
depth--;
path = path + depth;
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
......@@ -2257,6 +2258,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
while (--depth >= 0) {
if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
break;
path--;
err = ext4_ext_get_access(handle, inode, path);
if (err)
break;
path->p_idx->ei_block = (path+1)->p_idx->ei_block;
err = ext4_ext_dirty(handle, inode, path);
if (err)
break;
}
return err;
}
......@@ -2599,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/* if this leaf is free, then we should
* remove it from index block above */
if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
err = ext4_ext_rm_idx(handle, inode, path + depth);
err = ext4_ext_rm_idx(handle, inode, path, depth);
out:
return err;
......@@ -2802,7 +2816,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
/* index is empty, remove it;
* handle must be already prepared by the
* truncatei_leaf() */
err = ext4_ext_rm_idx(handle, inode, path + i);
err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
......
......@@ -108,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
/* Unaligned direct AIO must be serialized; see comment above */
if (unaligned_aio) {
static unsigned long unaligned_warn_time;
/* Warn about this once per day */
if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
ext4_msg(inode->i_sb, KERN_WARNING,
"Unaligned AIO/DIO on inode %ld by %s; "
"performance will be poor.",
inode->i_ino, current->comm);
mutex_lock(ext4_aio_mutex(inode));
ext4_unwritten_wait(inode);
}
......
......@@ -109,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync)
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*
* i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
......
......@@ -2880,8 +2880,6 @@ static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offs
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
trace_ext4_invalidatepage(page, offset);
/*
......@@ -2889,16 +2887,34 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
*/
if (ext4_should_dioread_nolock(page->mapping->host))
ext4_invalidatepage_free_endio(page, offset);
/* No journalling happens on data buffers when this function is used */
WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
block_invalidatepage(page, offset);
}
static int __ext4_journalled_invalidatepage(struct page *page,
unsigned long offset)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
trace_ext4_journalled_invalidatepage(page, offset);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
if (offset == 0)
ClearPageChecked(page);
if (journal)
jbd2_journal_invalidatepage(journal, page, offset);
else
block_invalidatepage(page, offset);
return jbd2_journal_invalidatepage(journal, page, offset);
}
/* Wrapper for aops... */
static void ext4_journalled_invalidatepage(struct page *page,
unsigned long offset)
{
WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
......@@ -3264,7 +3280,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = ext4_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
......@@ -4304,6 +4320,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
return err;
}
/*
* In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
* buffers that are attached to a page stradding i_size and are undergoing
* commit. In that case we have to wait for commit to finish and try again.
*/
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
struct page *page;
unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = 0;
int ret;
offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
/*
* All buffers in the last page remain valid? Then there's nothing to
* do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
* blocksize case
*/
if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
inode->i_size >> PAGE_CACHE_SHIFT);
if (!page)
return;
ret = __ext4_journalled_invalidatepage(page, offset);
unlock_page(page);
page_cache_release(page);
if (ret != -EBUSY)
return;
commit_tid = 0;
read_lock(&journal->j_state_lock);
if (journal->j_committing_transaction)
commit_tid = journal->j_committing_transaction->t_tid;
read_unlock(&journal->j_state_lock);
if (commit_tid)
jbd2_log_wait_commit(journal, commit_tid);
}
}
/*
* ext4_setattr()
*
......@@ -4417,16 +4474,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
if (attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
/* Inode size will be reduced, wait for dio in flight.
* Temporarily disable dioread_nolock to prevent
* livelock. */
if (attr->ia_size != inode->i_size) {
loff_t oldsize = inode->i_size;
i_size_write(inode, attr->ia_size);
/*
* Blocks are going to be removed from the inode. Wait
* for dio in flight. Temporarily disable
* dioread_nolock to prevent livelock.
*/
if (orphan) {
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
ext4_inode_resume_unlocked_dio(inode);
if (!ext4_should_journal_data(inode)) {
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
ext4_inode_resume_unlocked_dio(inode);
} else
ext4_wait_for_tail_page_commit(inode);
}
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, oldsize, inode->i_size);
}
ext4_truncate(inode);
}
......
......@@ -2648,7 +2648,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
if (!EXT4_SB(inode->i_sb)->s_journal)
if ((!EXT4_SB(inode->i_sb)->s_journal) &&
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
......
......@@ -1645,9 +1645,7 @@ static int parse_options(char *options, struct super_block *sb,
unsigned int *journal_ioprio,
int is_remount)
{
#ifdef CONFIG_QUOTA
struct ext4_sb_info *sbi = EXT4_SB(sb);
#endif
char *p;
substring_t args[MAX_OPT_ARGS];
int token;
......@@ -1696,6 +1694,16 @@ static int parse_options(char *options, struct super_block *sb,
}
}
#endif
if (test_opt(sb, DIOREAD_NOLOCK)) {
int blocksize =
BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (blocksize < PAGE_CACHE_SIZE) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"dioread_nolock if block size != PAGE_SIZE");
return 0;
}
}
return 1;
}
......@@ -2212,7 +2220,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
__func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
mutex_lock(&inode->i_mutex);
ext4_truncate(inode);
mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
ext4_msg(sb, KERN_DEBUG,
......@@ -3223,6 +3233,10 @@ int ext4_calculate_overhead(struct super_block *sb)
memset(buf, 0, PAGE_SIZE);
cond_resched();
}
/* Add the journal blocks as well */
if (sbi->s_journal)
overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen);
sbi->s_overhead = overhead;
smp_wmb();
free_page((unsigned long) buf);
......@@ -3436,15 +3450,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
clear_opt(sb, DELALLOC);
}
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (test_opt(sb, DIOREAD_NOLOCK)) {
if (blocksize < PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"dioread_nolock if block size != PAGE_SIZE");
goto failed_mount;
}
}
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
......@@ -3486,6 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
ext4_msg(sb, KERN_ERR,
......@@ -4725,7 +4731,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
ext4_setup_system_zone(sb);
if (sbi->s_journal == NULL)
if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
#ifdef CONFIG_QUOTA
......
......@@ -209,7 +209,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
if (!new_transaction)
goto alloc_transaction;
write_lock(&journal->j_state_lock);
if (!journal->j_running_transaction) {
if (!journal->j_running_transaction &&
!journal->j_barrier_count) {
jbd2_get_transaction(journal, new_transaction);
new_transaction = NULL;
}
......@@ -1839,7 +1840,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
BUFFER_TRACE(bh, "entry");
retry:
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
......@@ -1934,14 +1934,11 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* for commit and try again.
*/
if (partial_page) {
tid_t tid = journal->j_committing_transaction->t_tid;
jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
write_unlock(&journal->j_state_lock);
jbd2_log_wait_commit(journal, tid);
goto retry;
return -EBUSY;
}
/*
* OK, buffer won't be reachable after truncate. We just set
......@@ -2002,21 +1999,23 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* @page: page to flush
* @offset: length of page to invalidate.
*
* Reap page buffers containing data after offset in page.
*
* Reap page buffers containing data after offset in page. Can return -EBUSY
* if buffers are part of the committing transaction and the page is straddling
* i_size. Caller then has to wait for current commit and try again.
*/
void jbd2_journal_invalidatepage(journal_t *journal,
struct page *page,
unsigned long offset)
int jbd2_journal_invalidatepage(journal_t *journal,
struct page *page,
unsigned long offset)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
int may_free = 1;
int ret = 0;
if (!PageLocked(page))
BUG();
if (!page_has_buffers(page))
return;
return 0;
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
......@@ -2030,9 +2029,11 @@ void jbd2_journal_invalidatepage(journal_t *journal,
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
may_free &= journal_unmap_buffer(journal, bh,
offset > 0);
ret = journal_unmap_buffer(journal, bh, offset > 0);
unlock_buffer(bh);
if (ret < 0)
return ret;
may_free &= ret;
}
curr_off = next_off;
bh = next;
......@@ -2043,6 +2044,7 @@ void jbd2_journal_invalidatepage(journal_t *journal,
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
return 0;
}
/*
......
......@@ -1098,7 +1098,7 @@ void jbd2_journal_set_triggers(struct buffer_head *,
extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
extern void journal_sync_buffer (struct buffer_head *);
extern void jbd2_journal_invalidatepage(journal_t *,
extern int jbd2_journal_invalidatepage(journal_t *,
struct page *, unsigned long);
extern int jbd2_journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
extern int jbd2_journal_stop(handle_t *);
......
......@@ -451,7 +451,7 @@ DEFINE_EVENT(ext4__page_op, ext4_releasepage,
TP_ARGS(page)
);
TRACE_EVENT(ext4_invalidatepage,
DECLARE_EVENT_CLASS(ext4_invalidatepage_op,
TP_PROTO(struct page *page, unsigned long offset),
TP_ARGS(page, offset),
......@@ -477,6 +477,18 @@ TRACE_EVENT(ext4_invalidatepage,
(unsigned long) __entry->index, __entry->offset)
);
DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage,
TP_PROTO(struct page *page, unsigned long offset),
TP_ARGS(page, offset)
);
DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage,
TP_PROTO(struct page *page, unsigned long offset),
TP_ARGS(page, offset)
);
TRACE_EVENT(ext4_discard_blocks,
TP_PROTO(struct super_block *sb, unsigned long long blk,
unsigned long long count),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment