Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (64 commits) ext4: Update documentation about quota mount options ext4: replace MAX_DEFRAG_SIZE with EXT_MAX_BLOCK ext4: Fix the alloc on close after a truncate hueristic ext4: Add a tracepoint for ext4_alloc_da_blocks() ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags ext4: limit block allocations for indirect-block files to < 2^32 ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT ext4: Add null extent check to ext_get_path ext4: Replace BUG_ON() with ext4_error() in move_extents.c ext4: Replace get_ext_path macro with an inline funciton ext4: Fix include/trace/events/ext4.h to work with Systemtap ext4: Fix initalization of s_flex_groups ext4: Always set dx_node's fake_dirent explicitly. ext4: Fix async commit mode to be safe by using a barrier ext4: Don't update superblock write time when filesystem is read-only ext4: Clarify the locking details in mballoc ext4: check for need init flag in ext4_mb_load_buddy ext4: move ext4_mb_init_group() function earlier in the mballoc.c ext4: Make non-journal fsync work properly ext4: Assure that metadata blocks are written during fsync in no journal mode ...

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (64 commits) ext4: Update documentation about quota mount options ext4: replace MAX_DEFRAG_SIZE with EXT_MAX_BLOCK ext4: Fix the alloc on close after a truncate hueristic ext4: Add a tracepoint for ext4_alloc_da_blocks() ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags ext4: limit block allocations for indirect-block files to < 2^32 ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT ext4: Add null extent check to ext_get_path ext4: Replace BUG_ON() with ext4_error() in move_extents.c ext4: Replace get_ext_path macro with an inline funciton ext4: Fix include/trace/events/ext4.h to work with Systemtap ext4: Fix initalization of s_flex_groups ext4: Always set dx_node's fake_dirent explicitly. ext4: Fix async commit mode to be safe by using a barrier ext4: Don't update superblock write time when filesystem is read-only ext4: Clarify the locking details in mballoc ext4: check for need init flag in ext4_mb_load_buddy ext4: move ext4_mb_init_group() function earlier in the mballoc.c ext4: Make non-journal fsync work properly ext4: Assure that metadata blocks are written during fsync in no journal mode ...
3530c188 · Linus Torvalds · 6952b61d · 1358870d · 3530c188 · 3530c188
Commit 3530c188 authored Sep 18, 2009 by Linus Torvalds
25 changed files
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -134,15 +134,9 @@ ro                   	Mount filesystem read only. Note that ext4 will
                     	mount options "ro,noload" can be used to prevent
 		     	writes to the filesystem.

-journal_checksum	Enable checksumming of the journal transactions.
-			This will allow the recovery code in e2fsck and the
-			kernel to detect corruption in the kernel.  It is a
-			compatible change and will be ignored by older kernels.
-
 journal_async_commit	Commit block can be written to disk without waiting
 			for descriptor blocks. If enabled older kernels cannot
-			mount the device. This will enable 'journal_checksum'
-			internally.
+			mount the device.

 journal=update		Update the ext4 file system's journal to the current
 			format.
@@ -263,10 +257,18 @@ resuid=n		The user ID which may use the reserved blocks.

 sb=n			Use alternate superblock at this location.

-quota
-noquota
-grpquota
-usrquota
+quota			These options are ignored by the filesystem. They
+noquota			are used only by quota tools to recognize volumes
+grpquota		where quota should be turned on. See documentation
+usrquota		in the quota-tools package for more details
+			(http://sourceforge.net/projects/linuxquota).
+
+jqfmt=<quota type>	These options tell filesystem details about quota
+usrjquota=<file>	so that quota information can be properly updated
+grpjquota=<file>	during journal replay. They replace the above
+			quota options. See documentation in the quota-tools
+			package for more details
+			(http://sourceforge.net/projects/linuxquota).

 bh		(*)	ext4 associates buffer heads to data pages to
 nobh			(a) cache disk block mapping information

--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT

 	  To enable backwards compatibility so that systems that are
 	  still expecting to mount ext4 filesystems using ext4dev,
-	  chose Y here.   This feature will go away by 2.6.31, so
+	  choose Y here.   This feature will go away by 2.6.31, so
 	  please arrange to get your userspace programs fixed!

 config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY

 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+	bool "EXT4 debugging support"
+	depends on EXT4_FS
+	help
+	  Enables run-time debugging support for the ext4 filesystem.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	 * new bitmap information
 	 */
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	ext4_mb_update_group_info(grp, blocks_freed);
+	grp->bb_free += blocks_freed;
 	up_write(&grp->alloc_sem);

 	/* We dirtied the bitmap block */

--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;


 /* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE		1
+#define EXT4_MB_HINT_MERGE		0x0001
 /* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED		2
+#define EXT4_MB_HINT_RESERVED		0x0002
 /* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA		4
+#define EXT4_MB_HINT_METADATA		0x0004
 /* first blocks in the file */
-#define EXT4_MB_HINT_FIRST		8
+#define EXT4_MB_HINT_FIRST		0x0008
 /* search for the best chunk */
-#define EXT4_MB_HINT_BEST		16
+#define EXT4_MB_HINT_BEST		0x0010
 /* data is being allocated */
-#define EXT4_MB_HINT_DATA		32
+#define EXT4_MB_HINT_DATA		0x0020
 /* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC		64
+#define EXT4_MB_HINT_NOPREALLOC		0x0040
 /* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC	128
+#define EXT4_MB_HINT_GROUP_ALLOC	0x0080
 /* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY		256
+#define EXT4_MB_HINT_GOAL_ONLY		0x0100
 /* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL		512
+#define EXT4_MB_HINT_TRY_GOAL		0x0200
 /* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED      1024
+#define EXT4_MB_DELALLOC_RESERVED	0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC		0x0800


 struct ext4_allocation_request {
@@ -111,6 +113,21 @@ struct ext4_allocation_request {
 	unsigned int flags;
 };

+/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+	struct inode *inode;
+	sector_t b_blocknr;		/* start block number of extent */
+	size_t b_size;			/* size of extent */
+	unsigned long b_state;		/* state of the extent */
+	unsigned long first_page, next_page;	/* extent of pages */
+	struct writeback_control *wbc;
+	int io_done;
+	int pages_written;
+	int retval;
+};
+
 /*
 * Special inodes numbers
 */
@@ -251,7 +268,6 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE		0x00100000 /* Inode is migrating */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */

 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
 #define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */

 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
 #endif
 };

+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
+
 /*
 * Structure of an inode on the disk
 */
@@ -456,7 +476,6 @@ struct move_extent {
 	__u64 len;		/* block length to be moved */
 	__u64 moved_len;	/* moved block length */
 };
-#define MAX_DEFRAG_SIZE         ((1UL<<31) - 1)

 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
 	ext4_group_t s_groups_count;	/* Number of groups in the fs */
+	ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
 	unsigned long s_overhead_last;  /* Last calculated overhead */
 	unsigned long s_blocks_last;    /* Last seen block count */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
 	atomic_t s_mb_lost_chunks;
 	atomic_t s_mb_preallocated;
 	atomic_t s_mb_discarded;
+	atomic_t s_lock_busy;

 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 		ext4_fsblk_t, unsigned long, int, unsigned long *);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
-		ext4_grpblk_t add);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 struct ext4_group_info {
 	unsigned long   bb_state;
 	struct rb_root  bb_free_root;
-	unsigned short  bb_first_free;
-	unsigned short  bb_free;
-	unsigned short  bb_fragments;
+	ext4_grpblk_t	bb_first_free;	/* first free block */
+	ext4_grpblk_t	bb_free;	/* total free blocks */
+	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
 	struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
 #endif
 	struct rw_semaphore alloc_sem;
-	unsigned short  bb_counters[];
+	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
+					 * regions, index is order.
+					 * bb_counters[3] = 5 means
+					 * 5 free 8-block regions. */
 };

 #define EXT4_GROUP_INFO_NEED_INIT_BIT	0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))

+#define EXT4_MAX_CONTENTION		8
+#define EXT4_CONTENTION_THRESHOLD	2
+
 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
 					      ext4_group_t group)
 {
 	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
 }

+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+	return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
-	spin_lock(ext4_group_lock_ptr(sb, group));
+	spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+	if (spin_trylock(lock))
+		/*
+		 * We're able to grab the lock right away, so drop the
+		 * lock contention counter.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+	else {
+		/*
+		 * The lock is busy, so bump the contention counter,
+		 * and then wait on the spin lock.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+				  EXT4_MAX_CONTENTION);
+		spin_lock(lock);
+	}
 }

 static inline void ext4_unlock_group(struct super_block *sb,

--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
 #define CHECK_BINSEARCH__

 /*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
 */
 #define EXT_DEBUG__
 #ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
 #define EXT_BREAK      1
 #define EXT_REPEAT     2

+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
 #define EXT_MAX_BLOCK	0xffffffff

 /*

--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
 						  handle, err);
 	}
 	else
-		brelse(bh);
+		bforget(bh);
 	return err;
 }

@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
 						  handle, err);
 	}
 	else
-		brelse(bh);
+		bforget(bh);
 	return err;
 }

@@ -89,6 +89,9 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 			ext4_journal_abort_handle(where, __func__, bh,
 						  handle, err);
 	} else {
+		if (inode && bh)
+			mark_buffer_dirty_inode(bh, inode);
+		else
 			mark_buffer_dirty(bh);
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);

--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret = 0;
+	int err, ret = 0;

 	J_ASSERT(ext4_journal_current_handle() == NULL);

@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}

+	if (!journal)
+		ret = sync_mapping_buffers(inode->i_mapping);
+
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		goto out;

@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 			.sync_mode = WB_SYNC_ALL,
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
-		ret = sync_inode(inode, &wbc);
-		if (journal && (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		err = sync_inode(inode, &wbc);
+		if (ret == 0)
+			ret = err;
 	}
 out:
+	if (journal && (journal->j_flags & JBD2_BARRIER))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 }
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)

 		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
 		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-			i, ext4_free_inodes_count(sb, gdp), x);
+			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);

--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 					me.donor_start, me.len, &me.moved_len);
 		fput(donor_filp);

-		if (!err)
-			if (copy_to_user((struct move_extent *)arg,
-				&me, sizeof(me)))
+		if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
 			return -EFAULT;
+
 		return err;
 	}


--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@

 /*
 */
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...)	printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...)	                                        \
+	do {								\
+		if ((n) <= mb_enable_debug) {		        	\
+			printk(KERN_DEBUG "(%s, %d): %s: ",		\
+			       __FILE__, __LINE__, __func__);		\
+			printk(fmt, ## a);				\
+		}							\
+	} while (0)
 #else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
 #endif

 /*
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
 	unsigned		pa_deleted;
 	ext4_fsblk_t		pa_pstart;	/* phys. block */
 	ext4_lblk_t		pa_lstart;	/* log. block */
-	unsigned short		pa_len;		/* len of preallocated chunk */
-	unsigned short		pa_free;	/* how many blocks are free */
+	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
+	ext4_grpblk_t		pa_free;	/* how many blocks are free */
 	unsigned short		pa_type;	/* pa type. inode or group */
 	spinlock_t		*pa_obj_lock;
 	struct inode		*pa_inode;	/* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
 	ext4_lblk_t fe_logical;
 	ext4_grpblk_t fe_start;
 	ext4_group_t fe_group;
-	int fe_len;
+	ext4_grpblk_t fe_len;
 };

 /*

--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,

 	down_write(&EXT4_I(inode)->i_data_sem);
 	/*
-	 * if EXT4_EXT_MIGRATE is cleared a block allocation
+	 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
 	 * happened after we started the migrate. We need to
 	 * fail the migrate
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
 		retval = -EAGAIN;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto err_out;
 	} else
-		EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
-							~EXT4_EXT_MIGRATE;
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
 	/*
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
 	 * when we add extents we extent the journal
 	 */
 	/*
-	 * Even though we take i_mutex we can still cause block allocation
-	 * via mmap write to holes. If we have allocated new blocks we fail
-	 * migrate.  New block allocation will clear EXT4_EXT_MIGRATE flag.
-	 * The flag is updated with i_data_sem held to prevent racing with
-	 * block allocation.
+	 * Even though we take i_mutex we can still cause block
+	 * allocation via mmap write to holes. If we have allocated
+	 * new blocks we fail migrate.  New block allocation will
+	 * clear EXT4_STATE_EXT_MIGRATE flag.  The flag is updated
+	 * with i_data_sem held to prevent racing with block
+	 * allocation.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+	EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
 	up_read((&EXT4_I(inode)->i_data_sem));

 	handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ int ext4_ext_migrate(struct inode *inode)
 	tmp_inode->i_nlink = 0;

 	ext4_journal_stop(handle);
-
+	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);

 	return retval;

--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			return retval;

 		if (blocks == 1 && !dx_fallback &&
-		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
-			return make_indexed_dir(handle, dentry, inode, bh);
+		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+			retval = make_indexed_dir(handle, dentry, inode, bh);
+			if (retval == -ENOSPC)
+				brelse(bh);
+			return retval;
+		}
 		brelse(bh);
 	}
 	bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	if (retval == -ENOSPC)
+		brelse(bh);
+	return retval;
 }

 /*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
+		memset(&node2->fake, 0, sizeof(struct fake_dirent));
 		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
 							   sb->s_blocksize);
-		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, frame->bh);
 		if (err)
@@ -1657,6 +1664,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	if (!de)
 		goto cleanup;
 	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	if (err != -ENOSPC)
 		bh = NULL;
 	goto cleanup;

@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
 	struct inode *inode = old_dentry->d_inode;
 	int err, retries = 0;

-	if (EXT4_DIR_LINK_MAX(inode))
+	if (inode->i_nlink >= EXT4_LINK_MAX)
 		return -EMLINK;

 	/*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto end_rename;
 		retval = -EMLINK;
 		if (!new_inode && new_dir != old_dir &&
-				new_dir->i_nlink >= EXT4_LINK_MAX)
+		    EXT4_DIR_LINK_MAX(new_dir))
 			goto end_rename;
 	}
 	if (!new_bh) {

--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	struct inode *inode = NULL;
 	handle_t *handle;
 	int gdb_off, gdb_num;
-	int num_grp_locked = 0;
 	int err, err2;

 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         * using the new disk blocks.
         */

-	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 	/* Update group descriptor block for new group */
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 					 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * descriptor
 	 */
 	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
-	if (err) {
-		ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+	if (err)
 		goto exit_journal;
-	}

 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)

 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
-	ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);

 	ext4_handle_dirty_metadata(handle, NULL, primary);


--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "mballoc.h"

 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 		errstr = "Out of memory";
 		break;
 	case -EROFS:
-		if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+		if (!sb || (EXT4_SB(sb)->s_journal &&
+			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 			errstr = "Journal has aborted";
 		else
 			errstr = "Readonly filesystem";
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
 			*journal_devnum = option;
 			break;
 		case Opt_journal_checksum:
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
-			break;
+			break;	/* Kept for backwards compatibility */
 		case Opt_journal_async_commit:
 			set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
 			break;
 		case Opt_noload:
 			set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);

 		flex_group = ext4_flex_group(sbi, i);
-		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
-			   ext4_free_inodes_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
-			   ext4_free_blks_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
-			   ext4_used_dirs_count(sb, gdp));
+		atomic_add(ext4_free_inodes_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+		atomic_add(ext4_free_blks_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+		atomic_add(ext4_used_dirs_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].used_dirs);
 	}

 	return 1;
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };

+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
+		return 0;
+	}
+
+	if (readonly)
+		return 1;
+
+	/* Check that feature set is OK for a read-write mount */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
+			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+				~EXT4_FEATURE_RO_COMPAT_SUPP));
+		return 0;
+	}
+	/*
+	 * Large file size enabled file system can only be mounted
+	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		if (sizeof(blkcnt_t) < sizeof(u64)) {
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+				 "cannot be mounted RDWR without "
+				 "CONFIG_LBDAF");
+			return 0;
+		}
+	}
+	return 1;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned int db_count;
 	unsigned int i;
 	int needs_recovery, has_huge_files;
-	int features;
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
 	 */
-	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
-	if (features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
-			~EXT4_FEATURE_INCOMPAT_SUPP));
-		goto failed_mount;
-	}
-	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
-	if (!(sb->s_flags & MS_RDONLY) && features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount RDWR because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
-			~EXT4_FEATURE_RO_COMPAT_SUPP));
+	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
 		goto failed_mount;
-	}
-	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-	if (has_huge_files) {
-		/*
-		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBDAF
-		 */
-		if (sizeof(root->i_blocks) < sizeof(u64) &&
-				!(sb->s_flags & MS_RDONLY)) {
-			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
-					"files cannot be mounted read-write "
-					"without CONFIG_LBDAF");
-			goto failed_mount;
-		}
-	}
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);

 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}

+	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
 						      has_huge_files);
 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}

-	if (ext4_blocks_count(es) >
-		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+	/*
+	 * Test whether we have more sectors than will fit in sector_t,
+	 * and whether the max offset is addressable by the page cache.
+	 */
+	if ((ext4_blocks_count(es) >
+	     (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+	    (ext4_blocks_count(es) >
+	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
-			" too large to mount safely");
+			 " too large to mount safely on this system");
 		if (sizeof(sector_t) < 8)
 			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+		ret = -EFBIG;
 		goto failed_mount;
 	}

@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 	sbi->s_groups_count = blocks_count;
+	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
 	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount4;
 	}

-	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-		jbd2_journal_set_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
 	jbd2_journal_set_features(sbi->s_journal,
 				  JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
-		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+		jbd2_journal_set_features(sbi->s_journal, 0, 0,
 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	} else {
-		jbd2_journal_clear_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+	else
+		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	}

 	/* We have now updated the journal if required, so we can
 	 * validate the data journaling mode. */
@@ -3208,6 +3225,17 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
 		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			if (sbi->s_journal)
 				ext4_mark_recovery_complete(sb, es);
 		} else {
-			int ret;
-			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
-				ext4_msg(sb, KERN_WARNING, "couldn't "
-				       "remount RDWR because of unsupported "
-				       "optional features (%x)",
-				(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
-					~EXT4_FEATURE_RO_COMPAT_SUPP));
+			/* Make sure we can mount this feature set readwrite */
+			if (!ext4_feature_set_ok(sb, 0)) {
 				err = -EROFS;
 				goto restore_opts;
 			}
-
 			/*
 			 * Make sure the group descriptor checksums
 			 * are sane.  If they aren't, refuse to remount r/w.

--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+			ext4_fsblk_t goal, block;
+
+			goal = ext4_group_first_block_no(sb,
 						EXT4_I(inode)->i_block_group);
-			ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+			/* non-extent files can't have physical blocks past 2^32 */
+			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+			block = ext4_new_meta_blocks(handle, inode,
 						  goal, NULL, &error);
 			if (error)
 				goto cleanup;
+
+			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
 			ea_idebug(inode, "creating block %d", block);

 			new_bh = sb_getblk(sb, block);

--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <trace/events/jbd2.h>

 /*
@@ -711,6 +712,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
+		if (journal->j_flags & JBD2_BARRIER)
+			blkdev_issue_flush(journal->j_dev, NULL);
 	}

 	/*

--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)

 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}

 	journal->j_first = first;
 	journal->j_last = last;

--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	INIT_LIST_HEAD(&transaction->t_private_list);

 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);

 	J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 		  __jbd2_log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	if (unlikely(new_transaction))		/* It's usually NULL */
 		kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 		handle = ERR_PTR(err);
 		goto out;
 	}
-
-	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	return handle;
 }
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
 	__jbd2_log_start_commit(journal, transaction->t_tid);
 	spin_unlock(&journal->j_state_lock);

+	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
 	ret = start_this_handle(journal, handle);
 	return ret;

--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -652,7 +652,7 @@ struct transaction_s
 	 * This transaction is being forced and some process is
 	 * waiting for it to finish.
 	 */
-	int t_synchronous_commit:1;
+	unsigned int t_synchronous_commit:1;

 	/*
 	 * For use by the filesystem to store fs-specific data

--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -159,7 +159,7 @@ TRACE_EVENT(jbd2_submit_inode_data,
 	),

 	TP_printk("dev %s ino %lu",
-		  jbd2_dev_to_name(__entry->dev), __entry->ino)
+		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
 );

 #endif /* _TRACE_JBD2_H */