Commit d0d3f1f0 authored by Linus Torvalds's avatar Linus Torvalds

Merge bk://kernel.bkbits.net/vojtech/x86-64

into home.transmeta.com:/home/torvalds/v2.5/linux
parents 4a69c79b 9a3e1a96
...@@ -141,17 +141,14 @@ you have done so you need to call journal_dirty_{meta,}data().
Or if you've asked for access to a buffer you now know is no longer
required to be pushed back on the device you can call journal_forget()
in much the same way as you might have used bforget() in the past.
</para>
<para>
A journal_flush() may be called at any time to commit and checkpoint
all your transactions.
</para>
<para>
Then at umount time, in your put_super() (2.4) or write_super() (2.5)
you can call journal_destroy() to clean up your in-core journal object.
</para>
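<para>
As a purely illustrative sketch of that shutdown path (the MYFS_SB()
accessor and the s_journal field are invented names, not part of jbd),
the put_super() of a hypothetical "myfs" might be:
</para>
<programlisting>
static void myfs_put_super(struct super_block *sb)
{
        journal_t *journal = MYFS_SB(sb)->s_journal;

        /* commit and checkpoint everything still outstanding */
        journal_flush(journal);

        /* tear down the in-core journal object */
        journal_destroy(journal);
}
</programlisting>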
...@@ -168,8 +165,8 @@ on another journal. Since transactions can't be nested/batched
across differing journals, and another filesystem other than
yours (say ext3) may be modified in a later syscall.
</para>
<para>
The second case to bear in mind is that journal_start() can
block if there isn't enough space in the journal for your transaction
(based on the passed nblocks param) - when it blocks it merely(!) needs to
...@@ -180,10 +177,14 @@ were semaphores and include them in your semaphore ordering rules to prevent
deadlocks. Note that journal_extend() has similar blocking behaviour to
journal_start() so you can deadlock here just as easily as on journal_start().
</para>
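<para>
A minimal sketch of what "treat it like a semaphore" means in practice
(myfs_sem and journal_stop() are used here for illustration; check the
jbd headers for the exact API you pair with journal_start()):
</para>
<programlisting>
/* Always acquire in this order, everywhere: handle first, then the
 * semaphore.  Taking them the other way round in another codepath can
 * deadlock once journal_start() has to wait for journal space. */
handle = journal_start(journal, nblocks);
down(&sbi->myfs_sem);

/* ... modify and journal your metadata ... */

up(&sbi->myfs_sem);
journal_stop(handle);
</programlisting>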
<para>
Try to reserve the right number of blocks the first time. ;-). This will
be the maximum number of blocks you are going to touch in this transaction.
I advise having a look at ext3_jbd.h, at the very least, to see the basis
on which ext3 makes these decisions.
</para>
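<para>
As a rough illustration (the numbers and the MYFS_* name below are made
up, not ext3's real values), such an estimate usually ends up as a small
per-operation macro that is handed straight to journal_start():
</para>
<programlisting>
/* worst case for one block of file data: the data block itself, one
 * indirect block, the block bitmap, the group descriptor and the inode */
#define MYFS_WRITE_CREDITS      (1 + 1 + 1 + 1 + 1)

handle = journal_start(journal, MYFS_WRITE_CREDITS);
</programlisting>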
<para>
Another wriggle to watch out for is your on-disk block allocation strategy.
Why? Because, if you undo a delete, you need to ensure you haven't reused any
...@@ -211,6 +212,30 @@ The opportunities for abuse and DOS attacks with this should be obvious,
if you allow unprivileged userspace to trigger codepaths containing these
calls.
</para>
<para>
A new feature of jbd since 2.5.25 is commit callbacks. With the new
journal_callback_set() function you can ask the journalling layer
to call you back when the transaction is finally committed to disk, so that
you can do some of your own management. The key to this is the
journal_callback struct; this maintains the internal callback information,
but you can extend it like this:
</para>
<programlisting>
struct myfs_callback_s {
        /* data structure element required by jbd */
        struct journal_callback for_jbd;
        /* stuff for myfs allocated together */
        myfs_inode *i_committed;
};
</programlisting>
<para>
This would be useful if you needed to know when data was committed to a
particular inode.
</para>
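<para>
A sketch of how the callback side might look (the myfs_* names are
invented; check jbd.h for the exact journal_callback_set() prototype,
which is assumed here to take the handle, the callback function and the
embedded struct journal_callback):
</para>
<programlisting>
/* runs once the transaction containing our update is on disk */
static void myfs_commit_callback(struct journal_callback *jcb, int error)
{
        /* safe because for_jbd is the first member of myfs_callback_s */
        struct myfs_callback_s *cb = (struct myfs_callback_s *)jcb;

        myfs_inode_is_now_committed(cb->i_committed, error);
        kfree(cb);
}

/* somewhere inside a running handle, for a myfs inode "ip" */
cb = kmalloc(sizeof(*cb), GFP_NOFS);
if (cb) {
        cb->i_committed = ip;
        journal_callback_set(handle, myfs_commit_callback, &cb->for_jbd);
}
</programlisting>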
</sect1>
<sect1>
......
February 2003 Kernel Parameters v2.5.59
~~~~~~~~~~~~~~~~~
The following is a consolidated list of the kernel parameters as implemented
...@@ -60,6 +60,7 @@ restrictions referred to are that the relevant option is valid if:
V4L Video For Linux support is enabled.
VGA The VGA console has been enabled.
VT Virtual terminal support is enabled.
WDT Watchdog support is enabled.
XT IBM PC/XT MFM hard disk support is enabled.
In addition, the following text indicates that the option:
...@@ -98,6 +99,9 @@ running once the system is up.
advansys= [HW,SCSI]
See header of drivers/scsi/advansys.c.
advwdt= [HW,WDT] Advantech WDT
Format: <iostart>,<iostop>
aedsp16= [HW,OSS] Audio Excel DSP 16
Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
See also header of sound/oss/aedsp16.c.
...@@ -111,6 +115,9 @@ running once the system is up.
aic7xxx= [HW,SCSI]
See Documentation/scsi/aic7xxx.txt.
aic79xx= [HW,SCSI]
See Documentation/scsi/aic79xx.txt.
allowdma0 [ISAPNP]
AM53C974= [HW,SCSI]
...@@ -231,19 +238,11 @@ running once the system is up.
cs89x0_media= [HW,NET]
Format: { rj45 | aui | bnc }
ctc= [HW,NET]
See drivers/s390/net/ctcmain.c, comment before function
ctc_setup().
cyclades= [HW,SERIAL] Cyclades multi-serial port adapter.
dasd= [HW,NET]
See header of drivers/s390/block/dasd_devmap.c.
dasd_discipline=
[HW,NET]
See header of drivers/s390/block/dasd.c.
db9= [HW,JOY]
db9_2=
db9_3=
...@@ -254,9 +253,6 @@ running once the system is up.
Format: <area>[,<node>]
See also Documentation/networking/decnet.txt.
decr_overclock= [PPC]
decr_overclock_proc0=
devfs= [DEVFS]
See Documentation/filesystems/devfs/boot-options.
...@@ -305,6 +301,9 @@ running once the system is up.
This option is obsoleted by the "netdev=" option, which
has equivalent usage. See its documentation for details.
eurwdt= [HW,WDT] Eurotech CPU-1220/1410 onboard watchdog.
Format: <io>[,<irq>]
fd_mcs= [HW,SCSI]
See header of drivers/scsi/fd_mcs.c.
...@@ -350,7 +349,9 @@ running once the system is up.
hisax= [HW,ISDN]
See Documentation/isdn/README.HiSax.
hugepages= [HW,IA-32,IA-64] Maximal number of HugeTLB pages.
noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing
i8042_direct [HW] Non-translated mode
i8042_dumbkbd
...@@ -394,6 +395,10 @@ running once the system is up.
inttest= [IA64]
io7= [HW] IO7 for Marvel based alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
ip= [IP_PNP]
See Documentation/nfsroot.txt.
...@@ -495,6 +500,7 @@ running once the system is up.
mdacon= [MDA]
Format: <first>,<last>
Specifies range of consoles to be captured by the MDA.
mem=exactmap [KNL,BOOT,IA-32] Enable setting of an exact
E820 memory map, as specified by the user.
...@@ -576,6 +582,8 @@ running once the system is up.
nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects.
noexec [IA-64]
nofxsr [BUGS=IA-32]
nohighio [BUGS=IA-32] Disable highmem block I/O.
...@@ -599,7 +607,9 @@ running once the system is up.
noresume [SWSUSP] Disables resume and restore original swap space.
no-scroll [VGA] Disables scrollback.
This is required for the Braillex ib80-piezo Braille
reader made by F.H. Papenmeier (Germany).
nosbagart [IA-64]
...@@ -809,6 +819,9 @@ running once the system is up.
See a comment before function sbpcd_setup() in
drivers/cdrom/sbpcd.c.
sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
Format: <io>[,<timeout>[,<isapnp>]]
scsi_debug_*= [SCSI]
See drivers/scsi/scsi_debug.c.
...@@ -997,9 +1010,6 @@ running once the system is up.
spia_pedr=
spia_peddr=
spread_lpevents=
[PPC]
sscape= [HW,OSS]
Format: <io>,<irq>,<dma>,<mpu_io>,<mpu_irq>
...@@ -1009,6 +1019,19 @@ running once the system is up.
st0x= [HW,SCSI]
See header of drivers/scsi/seagate.c.
sti= [HW]
Format: <num>
Set the STI (builtin display/keyboard on the HP-PARISC
machines) console (graphic card) which should be used
as the initial boot-console.
See also comment in drivers/video/console/sticore.c.
sti_font= [HW]
See comment in drivers/video/console/sticore.c.
stifb= [HW]
Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
stram_swap= [HW,M68k]
swiotlb= [IA-64] Number of I/O TLB slabs
...@@ -1079,7 +1102,7 @@ running once the system is up.
wd7000= [HW,SCSI]
See header of drivers/scsi/wd7000.c.
wdt= [WDT] Watchdog
See Documentation/watchdog.txt.
xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
......
...@@ -207,19 +207,34 @@ static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
spin_unlock_irqrestore(&ioapic_lock, flags);
}
#if defined(CONFIG_SMP)
# include <asm/processor.h> /* kernel_thread() */
typedef struct { # include <linux/kernel_stat.h> /* kstat */
unsigned int cpu; # include <linux/slab.h> /* kmalloc() */
unsigned long timestamp; # include <linux/timer.h> /* time_after() */
} ____cacheline_aligned irq_balance_t;
# if CONFIG_BALANCED_IRQ_DEBUG
static irq_balance_t irq_balance[NR_IRQS] __cacheline_aligned # define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
= { [ 0 ... NR_IRQS-1 ] = { 0, 0 } }; # define Dprintk(x...) do { TDprintk(x); } while (0)
# else
# define TDprintk(x...)
# define Dprintk(x...)
# endif
extern unsigned long irq_affinity [NR_IRQS];
unsigned long __cacheline_aligned irq_balance_mask [NR_IRQS];
static int irqbalance_disabled __initdata = 0;
static int physical_balance = 0;
#endif struct irq_cpu_info {
unsigned long * last_irq;
unsigned long * irq_delta;
unsigned long irq;
} irq_cpu_data[NR_CPUS];
#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))
...@@ -227,9 +242,223 @@ extern unsigned long irq_affinity [NR_IRQS];
#define IRQ_ALLOWED(cpu,allowed_mask) \
((1 << cpu) & (allowed_mask))
#define CPU_TO_PACKAGEINDEX(i) \
((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i)
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
#define BALANCED_IRQ_MORE_DELTA (HZ/10)
#define BALANCED_IRQ_LESS_DELTA (HZ)
long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
static inline void balance_irq(int cpu, int irq);
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
{
int i, j;
Dprintk("Rotating IRQs among CPUs.\n");
for (i = 0; i < NR_CPUS; i++) {
for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
if (!irq_desc[j].action)
continue;
/* Is it a significant load ? */
if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < useful_load_threshold)
continue;
balance_irq(i, j);
}
}
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
static void do_irq_balance(void)
{
int i, j;
unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
unsigned long move_this_load = 0;
int max_loaded = 0, min_loaded = 0;
unsigned long useful_load_threshold = balanced_irq_interval + 10;
int selected_irq;
int tmp_loaded, first_attempt = 1;
unsigned long tmp_cpu_irq;
unsigned long imbalance = 0;
unsigned long allowed_mask;
unsigned long target_cpu_mask;
for (i = 0; i < NR_CPUS; i++) {
int package_index;
CPU_IRQ(i) = 0;
if (!cpu_online(i))
continue;
package_index = CPU_TO_PACKAGEINDEX(i);
for (j = 0; j < NR_IRQS; j++) {
unsigned long value_now, delta;
/* Is this an active IRQ? */
if (!irq_desc[j].action)
continue;
if ( package_index == i )
IRQ_DELTA(package_index,j) = 0;
/* Determine the total count per processor per IRQ */
value_now = (unsigned long) kstat_cpu(i).irqs[j];
/* Determine the activity per processor per IRQ */
delta = value_now - LAST_CPU_IRQ(i,j);
/* Update last_cpu_irq[][] for the next time */
LAST_CPU_IRQ(i,j) = value_now;
/* Ignore IRQs whose rate is less than the clock */
if (delta < useful_load_threshold)
continue;
/* update the load for the processor or package total */
IRQ_DELTA(package_index,j) += delta;
/* Keep track of the higher numbered sibling as well */
if (i != package_index)
CPU_IRQ(i) += delta;
/*
* We have sibling A and sibling B in the package
*
* cpu_irq[A] = load for cpu A + load for cpu B
* cpu_irq[B] = load for cpu B
*/
CPU_IRQ(package_index) += delta;
}
}
/* Find the least loaded processor package */
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i))
continue;
if (physical_balance && i > cpu_sibling_map[i])
continue;
if (min_cpu_irq > CPU_IRQ(i)) {
min_cpu_irq = CPU_IRQ(i);
min_loaded = i;
}
}
max_cpu_irq = ULONG_MAX;
tryanothercpu:
/* Look for heaviest loaded processor.
* We may come back to get the next heaviest loaded processor.
* Skip processors with trivial loads.
*/
tmp_cpu_irq = 0;
tmp_loaded = -1;
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i))
continue;
if (physical_balance && i > cpu_sibling_map[i])
continue;
if (max_cpu_irq <= CPU_IRQ(i))
continue;
if (tmp_cpu_irq < CPU_IRQ(i)) {
tmp_cpu_irq = CPU_IRQ(i);
tmp_loaded = i;
}
}
if (tmp_loaded == -1) {
/* In the case of small number of heavy interrupt sources,
* loading some of the cpus too much. We use Ingo's original
* approach to rotate them around.
*/
if (!first_attempt && imbalance >= useful_load_threshold) {
rotate_irqs_among_cpus(useful_load_threshold);
return;
}
goto not_worth_the_effort;
}
first_attempt = 0; /* heaviest search */
max_cpu_irq = tmp_cpu_irq; /* load */
max_loaded = tmp_loaded; /* processor */
imbalance = (max_cpu_irq - min_cpu_irq) / 2;
Dprintk("max_loaded cpu = %d\n", max_loaded);
Dprintk("min_loaded cpu = %d\n", min_loaded);
Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
Dprintk("load imbalance = %lu\n", imbalance);
/* if imbalance is less than approx 10% of max load, then
* observe diminishing returns action. - quit
*/
if (imbalance < (max_cpu_irq >> 3)) {
Dprintk("Imbalance too trivial\n");
goto not_worth_the_effort;
}
tryanotherirq:
/* if we select an IRQ to move that can't go where we want, then
* see if there is another one to try.
*/
move_this_load = 0;
selected_irq = -1;
for (j = 0; j < NR_IRQS; j++) {
/* Is this an active IRQ? */
if (!irq_desc[j].action)
continue;
if (imbalance <= IRQ_DELTA(max_loaded,j))
continue;
/* Try to find the IRQ that is closest to the imbalance
* without going over.
*/
if (move_this_load < IRQ_DELTA(max_loaded,j)) {
move_this_load = IRQ_DELTA(max_loaded,j);
selected_irq = j;
}
}
if (selected_irq == -1) {
goto tryanothercpu;
}
imbalance = move_this_load;
/* For physical_balance case, we accumulated both load
* values in the one of the siblings cpu_irq[],
* to use the same code for physical and logical processors
* as much as possible.
*
* NOTE: the cpu_irq[] array holds the sum of the load for
* sibling A and sibling B in the slot for the lowest numbered
* sibling (A), _AND_ the load for sibling B in the slot for
* the higher numbered sibling.
*
* We seek the least loaded sibling by making the comparison
* (A+B)/2 vs B
*/
if (physical_balance && (CPU_IRQ(min_loaded) >> 1) > CPU_IRQ(cpu_sibling_map[min_loaded]))
min_loaded = cpu_sibling_map[min_loaded];
allowed_mask = cpu_online_map & irq_affinity[selected_irq];
target_cpu_mask = 1 << min_loaded;
if (target_cpu_mask & allowed_mask) {
irq_desc_t *desc = irq_desc + selected_irq;
Dprintk("irq = %d moved to cpu = %d\n", selected_irq, min_loaded);
/* mark for change destination */
spin_lock(&desc->lock);
irq_balance_mask[selected_irq] = target_cpu_mask;
spin_unlock(&desc->lock);
/* Since we made a change, come back sooner to
* check for more variation.
*/
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
goto tryanotherirq;
not_worth_the_effort:
/* if we did not find an IRQ to move, then adjust the time interval upward */
balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
Dprintk("IRQ worth rotating not found\n");
return;
}
static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
{
...@@ -257,34 +486,113 @@ static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned lon
return cpu;
}
static inline void balance_irq(int irq) static inline void balance_irq (int cpu, int irq)
{ {
irq_balance_t *entry = irq_balance + irq;
unsigned long now = jiffies; unsigned long now = jiffies;
unsigned long allowed_mask;
unsigned int new_cpu;
if (no_balance_irq) if (no_balance_irq)
return; return;
if (unlikely(time_after(now, entry->timestamp + IRQ_BALANCE_INTERVAL))) { allowed_mask = cpu_online_map & irq_affinity[irq];
unsigned long allowed_mask; new_cpu = move(cpu, allowed_mask, now, 1);
unsigned int new_cpu; if (cpu != new_cpu) {
int random_number; irq_desc_t *desc = irq_desc + irq;
spin_lock(&desc->lock);
irq_balance_mask[irq] = cpu_to_logical_apicid(new_cpu);
spin_unlock(&desc->lock);
}
}
int balanced_irq(void *unused)
{
int i;
unsigned long prev_balance_time = jiffies;
long time_remaining = balanced_irq_interval;
daemonize();
sigfillset(&current->blocked);
sprintf(current->comm, "kirqd");
/* push everything to CPU 0 to give us a starting point. */
for (i = 0 ; i < NR_IRQS ; i++)
irq_balance_mask[i] = 1 << 0;
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
time_remaining = schedule_timeout(time_remaining);
if (time_after(jiffies, prev_balance_time+balanced_irq_interval)) {
Dprintk("balanced_irq: calling do_irq_balance() %lu\n", jiffies);
do_irq_balance();
prev_balance_time = jiffies;
time_remaining = balanced_irq_interval;
}
}
}
rdtscl(random_number); static int __init balanced_irq_init(void)
random_number &= 1; {
int i;
struct cpuinfo_x86 *c;
c = &boot_cpu_data;
if (irqbalance_disabled)
return 0;
/* Enable physical balance only if more than 1 physical processor is present */
if (smp_num_siblings > 1 && cpu_online_map >> 2)
physical_balance = 1;
allowed_mask = cpu_online_map & irq_affinity[irq]; for (i = 0; i < NR_CPUS; i++) {
entry->timestamp = now; if (!cpu_online(i))
new_cpu = move(entry->cpu, allowed_mask, now, random_number); continue;
if (entry->cpu != new_cpu) { irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
entry->cpu = new_cpu; irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
set_ioapic_affinity(irq, cpu_to_logical_apicid(new_cpu)); if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
printk(KERN_ERR "balanced_irq_init: out of memory");
goto failed;
} }
memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
} }
printk(KERN_INFO "Starting balanced_irq\n");
if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
return 0;
else
printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
failed:
for (i = 0; i < NR_CPUS; i++) {
if(irq_cpu_data[i].irq_delta)
kfree(irq_cpu_data[i].irq_delta);
if(irq_cpu_data[i].last_irq)
kfree(irq_cpu_data[i].last_irq);
}
return 0;
} }
static int __init irqbalance_disable(char *str)
{
irqbalance_disabled = 1;
return 0;
}
__setup("noirqbalance", irqbalance_disable);
static void set_ioapic_affinity (unsigned int irq, unsigned long mask);
static inline void move_irq(int irq)
{
/* note - we hold the desc->lock */
if (unlikely(irq_balance_mask[irq])) {
set_ioapic_affinity(irq, irq_balance_mask[irq]);
irq_balance_mask[irq] = 0;
}
}
__initcall(balanced_irq_init);
#else /* !SMP */
static inline void move_irq(int irq) { }
#endif /* defined(CONFIG_SMP) */
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
...@@ -1307,7 +1615,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq) ...@@ -1307,7 +1615,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq)
*/
static void ack_edge_ioapic_irq(unsigned int irq)
{
move_irq(irq);
if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
== (IRQ_PENDING | IRQ_DISABLED))
mask_IO_APIC_irq(irq);
...@@ -1347,7 +1655,7 @@ static void end_level_ioapic_irq (unsigned int irq)
unsigned long v;
int i;
move_irq(irq);
/*
* It appears there is an erratum which affects at least version 0x11
* of I/O APIC (that's the 82093AA and cores integrated into various
......
...@@ -86,7 +86,7 @@ void enable_hlt(void)
*/
void default_idle(void)
{
if (!hlt_counter && current_cpu_data.hlt_works_ok) {
local_irq_disable();
if (!need_resched())
safe_halt();
......
...@@ -26,7 +26,6 @@ static long htlbpagemem;
int htlbpage_max;
static long htlbzone_pages;
struct vm_operations_struct hugetlb_vm_ops;
static LIST_HEAD(htlbpage_freelist);
static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
...@@ -46,6 +45,7 @@ static struct page *alloc_hugetlb_page(void)
htlbpagemem--;
spin_unlock(&htlbpage_lock);
set_page_count(page, 1);
page->lru.prev = (void *)huge_page_release;
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
clear_highpage(&page[i]);
return page;
...@@ -134,6 +134,7 @@ follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(pte);
if (pages) {
page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
pages[i] = page;
}
if (vmas)
...@@ -150,6 +151,82 @@ follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i;
}
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address, int write)
{
unsigned long start = address;
int length = 1;
int nr;
struct page *page;
nr = follow_hugetlb_page(mm, vma, &page, NULL, &start, &length, 0);
if (nr == 1)
return page;
return NULL;
}
/*
* If virtual address `addr' lies within a huge page, return its controlling
* VMA, else NULL.
*/
struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
{
if (mm->used_hugetlb) {
struct vm_area_struct *vma = find_vma(mm, addr);
if (vma && is_vm_hugetlb_page(vma))
return vma;
}
return NULL;
}
int pmd_huge(pmd_t pmd)
{
return 0;
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
return NULL;
}
#else
struct page *
follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address, int write)
{
return NULL;
}
struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
{
return NULL;
}
int pmd_huge(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_PSE);
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
struct page *page;
page = pte_page(*(pte_t *)pmd);
if (page) {
page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
}
return page;
}
#endif
void free_huge_page(struct page *page)
{
BUG_ON(page_count(page));
...@@ -171,7 +248,8 @@ void huge_page_release(struct page *page)
free_huge_page(page);
}
void unmap_hugepage_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
...@@ -181,8 +259,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsig
BUG_ON(start & (HPAGE_SIZE - 1));
BUG_ON(end & (HPAGE_SIZE - 1));
spin_lock(&htlbpage_lock);
spin_unlock(&htlbpage_lock);
for (address = start; address < end; address += HPAGE_SIZE) {
pte = huge_pte_offset(mm, address);
if (pte_none(*pte))
...@@ -195,7 +271,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsig
flush_tlb_range(vma, start, end);
}
void
zap_hugepage_range(struct vm_area_struct *vma,
unsigned long start, unsigned long length)
{
struct mm_struct *mm = vma->vm_mm;
spin_lock(&mm->page_table_lock);
...@@ -206,6 +284,7 @@ void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigne
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
struct mm_struct *mm = current->mm;
struct inode *inode = mapping->host;
unsigned long addr;
int ret = 0;
...@@ -229,6 +308,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
page = find_get_page(mapping, idx);
if (!page) {
loff_t i_size;
page = alloc_hugetlb_page();
if (!page) {
ret = -ENOMEM;
...@@ -240,6 +320,9 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
free_huge_page(page);
goto out;
}
i_size = (loff_t)(idx + 1) * HPAGE_SIZE;
if (i_size > inode->i_size)
inode->i_size = i_size;
}
set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
}
...@@ -298,8 +381,8 @@ int try_to_free_low(int count)
int set_hugetlb_mem_size(int count)
{
int lcount;
struct page *page;
extern long htlbzone_pages;
extern struct list_head htlbpage_freelist;
...@@ -315,11 +398,6 @@ int set_hugetlb_mem_size(int count)
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
SetPageReserved(map);
map++;
}
spin_lock(&htlbpage_lock);
list_add(&page->list, &htlbpage_freelist);
htlbpagemem++;
...@@ -341,7 +419,8 @@ int set_hugetlb_mem_size(int count)
return (int) htlbzone_pages;
}
int hugetlb_sysctl_handler(ctl_table *table, int write,
struct file *file, void *buffer, size_t *length)
{
proc_dointvec(table, write, file, buffer, length);
htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
...@@ -358,15 +437,13 @@ __setup("hugepages=", hugetlb_setup);
static int __init hugetlb_init(void)
{
int i;
struct page *page;
for (i = 0; i < htlbpage_max; ++i) {
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (!page)
break;
for (j = 0; j < HPAGE_SIZE/PAGE_SIZE; ++j)
SetPageReserved(&page[j]);
spin_lock(&htlbpage_lock);
list_add(&page->list, &htlbpage_freelist);
spin_unlock(&htlbpage_lock);
...@@ -395,7 +472,14 @@ int is_hugepage_mem_enough(size_t size)
return 1;
}
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
* hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
* this far.
*/
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
unsigned long address, int unused)
{
BUG();
return NULL;
......
...@@ -18,7 +18,6 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
static struct vm_operations_struct hugetlb_vm_ops;
struct list_head htlbpage_freelist;
spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
extern long htlbpagemem;
...@@ -227,6 +226,7 @@ follow_hugetlb_page (struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(pte);
if (pages) {
page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
pages[i] = page;
}
if (vmas)
...@@ -303,11 +303,6 @@ set_hugetlb_mem_size (int count)
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
SetPageReserved(map);
map++;
}
spin_lock(&htlbpage_lock);
list_add(&page->list, &htlbpage_freelist);
htlbpagemem++;
...@@ -327,7 +322,7 @@ set_hugetlb_mem_size (int count)
map = page;
for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active |
1 << PG_private | 1<< PG_writeback);
map++;
}
...@@ -337,6 +332,14 @@ set_hugetlb_mem_size (int count)
return (int) htlbzone_pages;
}
static struct page *
hugetlb_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
{
BUG();
return NULL;
}
static struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
.close = zap_hugetlb_resources,
};
...@@ -288,6 +288,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(pte);
if (pages) {
page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
pages[i] = page;
}
if (vmas)
...@@ -584,11 +585,6 @@ int set_hugetlb_mem_size(int count)
page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
SetPageReserved(map);
map++;
}
spin_lock(&htlbpage_lock);
list_add(&page->list, &htlbpage_freelist);
htlbpagemem++;
...@@ -613,7 +609,6 @@ int set_hugetlb_mem_size(int count)
map->flags &= ~(1UL << PG_locked | 1UL << PG_error |
1UL << PG_referenced |
1UL << PG_dirty | 1UL << PG_active |
1UL << PG_reserved |
1UL << PG_private | 1UL << PG_writeback);
set_page_count(page, 0);
map++;
...@@ -624,6 +619,14 @@ int set_hugetlb_mem_size(int count)
return (int) htlbzone_pages;
}
static struct page *
hugetlb_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
{
BUG();
return NULL;
}
static struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
.close = zap_hugetlb_resources,
};
...@@ -25,7 +25,6 @@ static long htlbpagemem;
int htlbpage_max;
static long htlbzone_pages;
struct vm_operations_struct hugetlb_vm_ops;
static LIST_HEAD(htlbpage_freelist);
static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
...@@ -134,6 +133,7 @@ follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(pte);
if (pages) {
page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
get_page(page);
pages[i] = page;
}
if (vmas)
...@@ -204,6 +204,7 @@ void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigne
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
struct mm_struct *mm = current->mm;
struct inode *inode = mapping->host;
unsigned long addr;
int ret = 0;
...@@ -227,6 +228,8 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
page = find_get_page(mapping, idx);
if (!page) {
loff_t i_size;
page = alloc_hugetlb_page();
if (!page) {
ret = -ENOMEM;
...@@ -238,6 +241,9 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
free_huge_page(page);
goto out;
}
i_size = (loff_t)(idx + 1) * HPAGE_SIZE;
if (i_size > inode->i_size)
inode->i_size = i_size;
}
set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
}
...@@ -263,11 +269,6 @@ int set_hugetlb_mem_size(int count)
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
SetPageReserved(map);
map++;
}
spin_lock(&htlbpage_lock);
list_add(&page->list, &htlbpage_freelist);
htlbpagemem++;
...@@ -286,8 +287,9 @@ int set_hugetlb_mem_size(int count)
spin_unlock(&htlbpage_lock);
map = page;
for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
map->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced |
1 << PG_dirty | 1 << PG_active |
1 << PG_private | 1<< PG_writeback);
set_page_count(map, 0);
map++;
...@@ -346,7 +348,8 @@ int hugetlb_report_meminfo(char *buf)
HPAGE_SIZE/1024);
}
static struct page *
hugetlb_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
{
BUG();
return NULL;
......
...@@ -27,6 +27,8 @@
#include <linux/completion.h>
#include <linux/slab.h>
static void blk_unplug_work(void *data);
/*
* For the allocated request tables
*/
...@@ -237,6 +239,14 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
blk_queue_hardsect_size(q, 512);
blk_queue_dma_alignment(q, 511);
q->unplug_thresh = 4; /* hmm */
q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
if (q->unplug_delay == 0)
q->unplug_delay = 1;
init_timer(&q->unplug_timer);
INIT_WORK(&q->unplug_work, blk_unplug_work, q);
/*
* by default assume old behaviour and bounce for any highmem page
*/
...@@ -960,6 +970,7 @@ void blk_plug_device(request_queue_t *q)
if (!blk_queue_plugged(q)) {
spin_lock(&blk_plug_lock);
list_add_tail(&q->plug_list, &blk_plug_list);
mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
spin_unlock(&blk_plug_lock);
}
}
...@@ -974,6 +985,7 @@ int blk_remove_plug(request_queue_t *q)
if (blk_queue_plugged(q)) {
spin_lock(&blk_plug_lock);
list_del_init(&q->plug_list);
del_timer(&q->unplug_timer);
spin_unlock(&blk_plug_lock);
return 1;
}
...@@ -992,6 +1004,8 @@ static inline void __generic_unplug_device(request_queue_t *q)
if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
return;
del_timer(&q->unplug_timer);
/*
* was plugged, fire request_fn if queue has stuff to do
*/
...@@ -1020,6 +1034,18 @@ void generic_unplug_device(void *data)
spin_unlock_irq(q->queue_lock);
}
static void blk_unplug_work(void *data)
{
generic_unplug_device(data);
}
static void blk_unplug_timeout(unsigned long data)
{
request_queue_t *q = (request_queue_t *)data;
schedule_work(&q->unplug_work);
}
/**
* blk_start_queue - restart a previously stopped queue
* @q: The &request_queue_t in question
...@@ -1164,6 +1190,9 @@ void blk_cleanup_queue(request_queue_t * q)
count -= __blk_cleanup_queue(&q->rq[READ]);
count -= __blk_cleanup_queue(&q->rq[WRITE]);
del_timer_sync(&q->unplug_timer);
flush_scheduled_work();
if (count)
printk("blk_cleanup_queue: leaked requests (%d)\n", count);
...@@ -1269,6 +1298,9 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
blk_queue_make_request(q, __make_request);
blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
q->unplug_timer.function = blk_unplug_timeout;
q->unplug_timer.data = (unsigned long)q;
blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
...@@ -1811,7 +1843,15 @@ static int __make_request(request_queue_t *q, struct bio *bio)
out:
if (freereq)
__blk_put_request(q, freereq);
if (blk_queue_plugged(q)) {
int nr_queued = (queue_nr_requests - q->rq[0].count) +
(queue_nr_requests - q->rq[1].count);
if (nr_queued == q->unplug_thresh)
__generic_unplug_device(q);
}
spin_unlock_irq(q->queue_lock);
return 0;
end_io:
......
...@@ -350,15 +350,10 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
int ret;
pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
do {
if (bio_rw(bio) == WRITE)
ret = lo_send(lo, bio, lo->lo_blocksize, pos);
else
ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
} while (++bio->bi_idx < bio->bi_vcnt);
return ret;
}
......
...@@ -19,7 +19,7 @@ comment "Video Adapters"
config VIDEO_BT848
tristate "BT848 Video For Linux"
depends on VIDEO_DEV && PCI && I2C_ALGOBIT && SOUND
---help---
Support for BT848 based frame grabber/overlay boards. This includes
the Miro, Hauppauge and STB boards. Please read the material in
......
...@@ -127,9 +127,10 @@ void __wait_on_buffer(struct buffer_head * bh)
get_bh(bh);
do {
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
if (buffer_locked(bh)) {
blk_run_queues();
if (buffer_locked(bh))
io_schedule();
}
} while (buffer_locked(bh));
put_bh(bh);
finish_wait(wqh, &wait);
...@@ -959,8 +960,6 @@ create_buffers(struct page * page, unsigned long size, int retry)
* the reserve list is empty, we're sure there are
* async buffer heads in use.
*/
blk_run_queues();
free_more_memory();
goto try_again;
}
......
...@@ -300,6 +300,8 @@ void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long a
pgd = pgd_offset(tsk->mm, address);
pte_chain = pte_chain_alloc(GFP_KERNEL);
if (!pte_chain)
goto out_sig;
spin_lock(&tsk->mm->page_table_lock);
pmd = pmd_alloc(tsk->mm, pgd, address);
if (!pmd)
...@@ -325,6 +327,7 @@ void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long a
return;
out:
spin_unlock(&tsk->mm->page_table_lock);
out_sig:
__free_page(page);
force_sig(SIGKILL, tsk);
pte_chain_free(pte_chain);
......
...@@ -99,6 +99,34 @@ int ext3_forget(handle_t *handle, int is_metadata,
return err;
}
/*
* Work out how many blocks we need to progress with the next chunk of a
* truncate transaction.
*/
static unsigned long blocks_for_truncate(struct inode *inode)
{
unsigned long needed;
needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
/* Give ourselves just enough room to cope with inodes in which
* i_blocks is corrupt: we've seen disk corruptions in the past
* which resulted in random data in an inode which looked enough
* like a regular file for ext3 to try to delete it. Things
* will go a bit crazy if that happens, but at least we should
* try not to panic the whole kernel. */
if (needed < 2)
needed = 2;
/* But we need to bound the transaction so we don't overflow the
* journal. */
if (needed > EXT3_MAX_TRANS_DATA)
needed = EXT3_MAX_TRANS_DATA;
return EXT3_DATA_TRANS_BLOCKS + needed;
}
/*
* Truncate transactions can be complex and absolutely huge. So we need to
* be able to restart the transaction at a convenient checkpoint to make
...@@ -112,14 +140,9 @@ int ext3_forget(handle_t *handle, int is_metadata,
static handle_t *start_transaction(struct inode *inode)
{
long needed;
handle_t *result;
needed = inode->i_blocks; result = ext3_journal_start(inode, blocks_for_truncate(inode));
if (needed > EXT3_MAX_TRANS_DATA)
needed = EXT3_MAX_TRANS_DATA;
result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
if (!IS_ERR(result))
return result;
...@@ -135,14 +158,9 @@ static handle_t *start_transaction(struct inode *inode)
*/
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
long needed;
if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
return 0;
needed = inode->i_blocks; if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
if (needed > EXT3_MAX_TRANS_DATA)
needed = EXT3_MAX_TRANS_DATA;
if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
return 0;
return 1;
}
...@@ -154,11 +172,8 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
*/
static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
{
long needed = inode->i_blocks;
if (needed > EXT3_MAX_TRANS_DATA)
needed = EXT3_MAX_TRANS_DATA;
jbd_debug(2, "restarting handle %p\n", handle);
return ext3_journal_restart(handle, blocks_for_truncate(inode));
}
/*
......
...@@ -61,6 +61,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
sb->s_op->dirty_inode(inode);
}
/*
* make sure that changes are seen by all cpus before we test i_state
* -- mikulas
*/
smp_mb();
/* avoid the locking if we can */
if ((inode->i_state & flags) == flags)
return;
...@@ -137,6 +143,12 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_state |= I_LOCK;
inode->i_state &= ~I_DIRTY;
/*
* smp_rmb(); note: if you remove write_lock below, you must add this.
* mark_inode_dirty doesn't take spinlock, make sure that inode is not
* read speculatively by this cpu before &= ~I_DIRTY -- mikulas
*/
write_lock(&mapping->page_lock);
if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
...@@ -334,7 +346,6 @@ writeback_inodes(struct writeback_control *wbc)
}
spin_unlock(&sb_lock);
spin_unlock(&inode_lock);
blk_run_queues();
}
/*
......
...@@ -34,6 +34,7 @@ static struct super_operations hugetlbfs_ops;
static struct address_space_operations hugetlbfs_aops;
struct file_operations hugetlbfs_file_operations;
static struct inode_operations hugetlbfs_dir_inode_operations;
static struct inode_operations hugetlbfs_inode_operations;
static struct backing_dev_info hugetlbfs_backing_dev_info = {
.ra_pages = 0, /* No readahead */
...@@ -44,7 +45,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
size_t len;
int ret;
if (!capable(CAP_IPC_LOCK))
...@@ -65,14 +65,51 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
vma->vm_ops = &hugetlb_vm_ops;
ret = hugetlb_prefault(mapping, vma);
len = (vma->vm_end - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
if (inode->i_size < len)
inode->i_size = len;
up(&inode->i_sem); up(&inode->i_sem);
return ret; return ret;
} }
/*
* Called under down_write(mmap_sem), page_table_lock is not held
*/
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
#else
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
if (len & ~HPAGE_MASK)
return -EINVAL;
if (len > TASK_SIZE)
return -ENOMEM;
if (addr) {
addr = ALIGN(addr, HPAGE_SIZE);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
addr = ALIGN(mm->free_area_cache, HPAGE_SIZE);
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (TASK_SIZE - len < addr)
return -ENOMEM;
if (!vma || addr + len <= vma->vm_start)
return addr;
addr = ALIGN(vma->vm_end, HPAGE_SIZE);
}
}
#endif
/* /*
* Read a page. Again trivial. If it didn't already exist * Read a page. Again trivial. If it didn't already exist
* in the page cache, it is zero-filled. * in the page cache, it is zero-filled.
...@@ -83,12 +120,14 @@ static int hugetlbfs_readpage(struct file *file, struct page * page) ...@@ -83,12 +120,14 @@ static int hugetlbfs_readpage(struct file *file, struct page * page)
return -EINVAL; return -EINVAL;
} }
static int hugetlbfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) static int hugetlbfs_prepare_write(struct file *file,
struct page *page, unsigned offset, unsigned to)
{ {
return -EINVAL; return -EINVAL;
} }
static int hugetlbfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) static int hugetlbfs_commit_write(struct file *file,
struct page *page, unsigned offset, unsigned to)
{ {
return -EINVAL; return -EINVAL;
} }
...@@ -103,28 +142,8 @@ void huge_pagevec_release(struct pagevec *pvec) ...@@ -103,28 +142,8 @@ void huge_pagevec_release(struct pagevec *pvec)
pagevec_reinit(pvec); pagevec_reinit(pvec);
} }
void truncate_partial_hugepage(struct page *page, unsigned partial) void truncate_huge_page(struct page *page)
{
int i;
const unsigned piece = partial & (PAGE_SIZE - 1);
const unsigned tailstart = PAGE_SIZE - piece;
const unsigned whole_pages = partial / PAGE_SIZE;
const unsigned last_page_offset = HPAGE_SIZE/PAGE_SIZE - whole_pages;
for (i = HPAGE_SIZE/PAGE_SIZE - 1; i >= last_page_offset; ++i)
memclear_highpage_flush(&page[i], 0, PAGE_SIZE);
if (!piece)
return;
memclear_highpage_flush(&page[last_page_offset - 1], tailstart, piece);
}
void truncate_huge_page(struct address_space *mapping, struct page *page)
{ {
if (page->mapping != mapping)
return;
clear_page_dirty(page); clear_page_dirty(page);
ClearPageUptodate(page); ClearPageUptodate(page);
remove_from_page_cache(page); remove_from_page_cache(page);
...@@ -133,52 +152,13 @@ void truncate_huge_page(struct address_space *mapping, struct page *page) ...@@ -133,52 +152,13 @@ void truncate_huge_page(struct address_space *mapping, struct page *page)
void truncate_hugepages(struct address_space *mapping, loff_t lstart) void truncate_hugepages(struct address_space *mapping, loff_t lstart)
{ {
const pgoff_t start = (lstart + HPAGE_SIZE - 1) >> HPAGE_SHIFT; const pgoff_t start = lstart >> HPAGE_SHIFT;
const unsigned partial = lstart & (HPAGE_SIZE - 1);
struct pagevec pvec; struct pagevec pvec;
pgoff_t next; pgoff_t next;
int i; int i;
pagevec_init(&pvec, 0); pagevec_init(&pvec, 0);
next = start; next = start;
while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
pgoff_t page_index = page->index;
if (page_index > next)
next = page_index;
++next;
if (TestSetPageLocked(page))
continue;
if (PageWriteback(page)) {
unlock_page(page);
continue;
}
truncate_huge_page(mapping, page);
unlock_page(page);
}
huge_pagevec_release(&pvec);
cond_resched();
}
if (partial) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
wait_on_page_writeback(page);
truncate_partial_hugepage(page, partial);
unlock_page(page);
huge_page_release(page);
}
}
next = start;
while (1) { while (1) {
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
if (next == start) if (next == start)
...@@ -191,11 +171,10 @@ void truncate_hugepages(struct address_space *mapping, loff_t lstart) ...@@ -191,11 +171,10 @@ void truncate_hugepages(struct address_space *mapping, loff_t lstart)
struct page *page = pvec.pages[i]; struct page *page = pvec.pages[i];
lock_page(page); lock_page(page);
wait_on_page_writeback(page);
if (page->index > next) if (page->index > next)
next = page->index; next = page->index;
++next; ++next;
truncate_huge_page(mapping, page); truncate_huge_page(page);
unlock_page(page); unlock_page(page);
} }
huge_pagevec_release(&pvec); huge_pagevec_release(&pvec);
...@@ -259,70 +238,73 @@ static void hugetlbfs_drop_inode(struct inode *inode) ...@@ -259,70 +238,73 @@ static void hugetlbfs_drop_inode(struct inode *inode)
hugetlbfs_forget_inode(inode); hugetlbfs_forget_inode(inode);
} }
static void hugetlb_vmtruncate_list(struct list_head *list, unsigned long pgoff) /*
* h_pgoff is in HPAGE_SIZE units.
* vma->vm_pgoff is in PAGE_SIZE units.
*/
static void
hugetlb_vmtruncate_list(struct list_head *list, unsigned long h_pgoff)
{ {
unsigned long start, end, length, delta;
struct vm_area_struct *vma; struct vm_area_struct *vma;
list_for_each_entry(vma, list, shared) { list_for_each_entry(vma, list, shared) {
start = vma->vm_start; unsigned long h_vm_pgoff;
end = vma->vm_end; unsigned long v_length;
length = end - start; unsigned long h_length;
unsigned long v_offset;
if (vma->vm_pgoff >= pgoff) {
zap_hugepage_range(vma, start, length); h_vm_pgoff = vma->vm_pgoff << (HPAGE_SHIFT - PAGE_SHIFT);
v_length = vma->vm_end - vma->vm_start;
h_length = v_length >> HPAGE_SHIFT;
v_offset = (h_pgoff - h_vm_pgoff) << HPAGE_SHIFT;
/*
* Is this VMA fully outside the truncation point?
*/
if (h_vm_pgoff >= h_pgoff) {
zap_hugepage_range(vma, vma->vm_start, v_length);
continue; continue;
} }
length >>= PAGE_SHIFT; /*
delta = pgoff - vma->vm_pgoff; * Is this VMA fully inside the truncation point?
if (delta >= length) */
if (h_vm_pgoff + (v_length >> HPAGE_SHIFT) <= h_pgoff)
continue; continue;
start += delta << PAGE_SHIFT; /*
length = (length - delta) << PAGE_SHIFT; * The VMA straddles the truncation point. v_offset is the
zap_hugepage_range(vma, start, length); * offset (in bytes) into the VMA where the point lies.
*/
zap_hugepage_range(vma,
vma->vm_start + v_offset,
v_length - v_offset);
} }
} }
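To make the unit juggling above concrete, a worked example with assumed sizes (4 KB base pages, 2 MB huge pages, i.e. HPAGE_SHIFT = 21): a VMA whose h_vm_pgoff is 2 and whose v_length is 8 MB covers huge pages 2..5 of the file, so truncating at h_pgoff = 3 gives:

	v_offset = (h_pgoff - h_vm_pgoff) << HPAGE_SHIFT;	/* (3 - 2) << 21 = 2 MB  */
	zap_hugepage_range(vma, vma->vm_start + v_offset,	/* unmaps huge pages     */
			   v_length - v_offset);		/* 3, 4 and 5 (6 MB)     */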
/*
* Expanding truncates are not allowed.
*/
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{ {
unsigned long pgoff; unsigned long pgoff;
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
unsigned long limit;
pgoff = (offset + HPAGE_SIZE - 1) >> HPAGE_SHIFT; if (offset > inode->i_size)
return -EINVAL;
if (inode->i_size < offset) BUG_ON(offset & ~HPAGE_MASK);
goto do_expand; pgoff = offset >> HPAGE_SHIFT;
inode->i_size = offset; inode->i_size = offset;
down(&mapping->i_shared_sem); down(&mapping->i_shared_sem);
if (list_empty(&mapping->i_mmap) && list_empty(&mapping->i_mmap_shared))
goto out_unlock;
if (!list_empty(&mapping->i_mmap)) if (!list_empty(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
if (!list_empty(&mapping->i_mmap_shared)) if (!list_empty(&mapping->i_mmap_shared))
hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
out_unlock:
up(&mapping->i_shared_sem); up(&mapping->i_shared_sem);
truncate_hugepages(mapping, offset); truncate_hugepages(mapping, offset);
return 0; return 0;
do_expand:
limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
if (limit != RLIM_INFINITY && offset > limit)
goto out_sig;
if (offset > inode->i_sb->s_maxbytes)
goto out;
inode->i_size = offset;
return 0;
out_sig:
send_sig(SIGXFSZ, current, 0);
out:
return -EFBIG;
} }
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
...@@ -341,14 +323,9 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -341,14 +323,9 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
error = security_inode_setattr(dentry, attr); error = security_inode_setattr(dentry, attr);
if (error) if (error)
goto out; goto out;
if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
if (error)
goto out;
if (ia_valid & ATTR_SIZE) { if (ia_valid & ATTR_SIZE) {
error = -EINVAL;
if (!(attr->ia_size & ~HPAGE_MASK))
error = hugetlb_vmtruncate(inode, attr->ia_size); error = hugetlb_vmtruncate(inode, attr->ia_size);
if (error) if (error)
goto out; goto out;
...@@ -364,8 +341,8 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -364,8 +341,8 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
return error; return error;
} }
static struct inode * static struct inode *hugetlbfs_get_inode(struct super_block *sb,
hugetlbfs_get_inode(struct super_block *sb, int mode, dev_t dev) int mode, dev_t dev)
{ {
struct inode * inode = new_inode(sb); struct inode * inode = new_inode(sb);
...@@ -377,13 +354,14 @@ hugetlbfs_get_inode(struct super_block *sb, int mode, dev_t dev) ...@@ -377,13 +354,14 @@ hugetlbfs_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_blocks = 0; inode->i_blocks = 0;
inode->i_rdev = NODEV; inode->i_rdev = NODEV;
inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info = &hugetlbfs_backing_dev_info; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) { switch (mode & S_IFMT) {
default: default:
init_special_inode(inode, mode, dev); init_special_inode(inode, mode, dev);
break; break;
case S_IFREG: case S_IFREG:
inode->i_op = &hugetlbfs_inode_operations;
inode->i_fop = &hugetlbfs_file_operations; inode->i_fop = &hugetlbfs_file_operations;
break; break;
case S_IFDIR: case S_IFDIR:
...@@ -405,8 +383,8 @@ hugetlbfs_get_inode(struct super_block *sb, int mode, dev_t dev) ...@@ -405,8 +383,8 @@ hugetlbfs_get_inode(struct super_block *sb, int mode, dev_t dev)
* File creation. Allocate an inode, and we're done.. * File creation. Allocate an inode, and we're done..
*/ */
/* SMP-safe */ /* SMP-safe */
static int static int hugetlbfs_mknod(struct inode *dir,
hugetlbfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) struct dentry *dentry, int mode, dev_t dev)
{ {
struct inode * inode = hugetlbfs_get_inode(dir->i_sb, mode, dev); struct inode * inode = hugetlbfs_get_inode(dir->i_sb, mode, dev);
int error = -ENOSPC; int error = -ENOSPC;
...@@ -419,7 +397,7 @@ hugetlbfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) ...@@ -419,7 +397,7 @@ hugetlbfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
return error; return error;
} }
static int hugetlbfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{ {
int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
if (!retval) if (!retval)
...@@ -432,7 +410,8 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode) ...@@ -432,7 +410,8 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode)
return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
} }
static int hugetlbfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) static int hugetlbfs_symlink(struct inode *dir,
struct dentry *dentry, const char *symname)
{ {
struct inode *inode; struct inode *inode;
int error = -ENOSPC; int error = -ENOSPC;
...@@ -450,15 +429,25 @@ static int hugetlbfs_symlink(struct inode * dir, struct dentry *dentry, const ch ...@@ -450,15 +429,25 @@ static int hugetlbfs_symlink(struct inode * dir, struct dentry *dentry, const ch
return error; return error;
} }
/*
* For direct-IO reads into hugetlb pages
*/
int hugetlbfs_set_page_dirty(struct page *page)
{
return 0;
}
static struct address_space_operations hugetlbfs_aops = { static struct address_space_operations hugetlbfs_aops = {
.readpage = hugetlbfs_readpage, .readpage = hugetlbfs_readpage,
.prepare_write = hugetlbfs_prepare_write, .prepare_write = hugetlbfs_prepare_write,
.commit_write = hugetlbfs_commit_write .commit_write = hugetlbfs_commit_write,
.set_page_dirty = hugetlbfs_set_page_dirty,
}; };
struct file_operations hugetlbfs_file_operations = { struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap, .mmap = hugetlbfs_file_mmap,
.fsync = simple_sync_file, .fsync = simple_sync_file,
.get_unmapped_area = hugetlb_get_unmapped_area,
}; };
static struct inode_operations hugetlbfs_dir_inode_operations = { static struct inode_operations hugetlbfs_dir_inode_operations = {
...@@ -474,12 +463,17 @@ static struct inode_operations hugetlbfs_dir_inode_operations = { ...@@ -474,12 +463,17 @@ static struct inode_operations hugetlbfs_dir_inode_operations = {
.setattr = hugetlbfs_setattr, .setattr = hugetlbfs_setattr,
}; };
static struct inode_operations hugetlbfs_inode_operations = {
.setattr = hugetlbfs_setattr,
};
static struct super_operations hugetlbfs_ops = { static struct super_operations hugetlbfs_ops = {
.statfs = simple_statfs, .statfs = simple_statfs,
.drop_inode = hugetlbfs_drop_inode, .drop_inode = hugetlbfs_drop_inode,
}; };
static int hugetlbfs_fill_super(struct super_block * sb, void * data, int silent) static int
hugetlbfs_fill_super(struct super_block * sb, void * data, int silent)
{ {
struct inode * inode; struct inode * inode;
struct dentry * root; struct dentry * root;
......
...@@ -732,14 +732,21 @@ static journal_t * journal_init_common (void) ...@@ -732,14 +732,21 @@ static journal_t * journal_init_common (void)
* need to set up all of the mapping information to tell the journaling * need to set up all of the mapping information to tell the journaling
* system where the journal blocks are. * system where the journal blocks are.
* *
*/
/**
* journal_t * journal_init_dev() - creates and initialises a journal structure
* @bdev: Block device on which to create the journal
* @fs_dev: Device which hold journalled filesystem for this journal.
* @start: Block nr Start of journal.
* @len: Length of the journal in blocks.
* @blocksize: blocksize of journalling device
* @returns: a newly created journal_t *
*
* journal_init_dev creates a journal which maps a fixed contiguous * journal_init_dev creates a journal which maps a fixed contiguous
* range of blocks on an arbitrary block device. * range of blocks on an arbitrary block device.
* *
* journal_init_inode creates a journal which maps an on-disk inode as
* the journal. The inode must exist already, must support bmap() and
* must have all data blocks preallocated.
*/ */
journal_t * journal_init_dev(struct block_device *bdev, journal_t * journal_init_dev(struct block_device *bdev,
struct block_device *fs_dev, struct block_device *fs_dev,
int start, int len, int blocksize) int start, int len, int blocksize)
...@@ -764,6 +771,14 @@ journal_t * journal_init_dev(struct block_device *bdev, ...@@ -764,6 +771,14 @@ journal_t * journal_init_dev(struct block_device *bdev,
return journal; return journal;
} }
/**
* journal_t * journal_init_inode () - creates a journal which maps to an inode.
* @inode: An inode to create the journal in
*
* journal_init_inode creates a journal which maps an on-disk inode as
* the journal. The inode must exist already, must support bmap() and
* must have all data blocks preallocated.
*/
journal_t * journal_init_inode (struct inode *inode) journal_t * journal_init_inode (struct inode *inode)
{ {
struct buffer_head *bh; struct buffer_head *bh;
...@@ -852,12 +867,15 @@ static int journal_reset (journal_t *journal) ...@@ -852,12 +867,15 @@ static int journal_reset (journal_t *journal)
return 0; return 0;
} }
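A rough mount-time sketch of the two initialisation paths documented above (the myfs_* names, the reserved journal inode and the numeric constants are purely illustrative; error handling is abbreviated):

	#include <linux/jbd.h>

	static journal_t *myfs_init_journal(struct inode *journal_inode,
					    struct block_device *jdev,
					    struct block_device *fs_dev, int external)
	{
		if (external)
			/* journal lives on its own device, at a fixed block range
			 * (start block 1, 8192 blocks, 1K blocksize here) */
			return journal_init_dev(jdev, fs_dev, 1, 8192, 1024);

		/* journal lives in a preallocated inode that supports bmap() */
		return journal_init_inode(journal_inode);
	}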
/* /**
* int journal_create() - Initialise the new journal file
* @journal: Journal to create. This structure must have been initialised
*
* Given a journal_t structure which tells us which disk blocks we can * Given a journal_t structure which tells us which disk blocks we can
* use, create a new journal superblock and initialise all of the * use, create a new journal superblock and initialise all of the
* journal fields from scratch. */ * journal fields from scratch.
**/
int journal_create (journal_t *journal) int journal_create(journal_t *journal)
{ {
unsigned long blocknr; unsigned long blocknr;
struct buffer_head *bh; struct buffer_head *bh;
...@@ -920,11 +938,14 @@ int journal_create (journal_t *journal) ...@@ -920,11 +938,14 @@ int journal_create (journal_t *journal)
return journal_reset(journal); return journal_reset(journal);
} }
/* /**
* void journal_update_superblock() - Update journal sb on disk.
* @journal: The journal to update.
* @wait: Set to '0' if you don't want to wait for IO completion.
*
* Update a journal's dynamic superblock fields and write it to disk, * Update a journal's dynamic superblock fields and write it to disk,
* optionally waiting for the IO to complete. * optionally waiting for the IO to complete.
*/ */
void journal_update_superblock(journal_t *journal, int wait) void journal_update_superblock(journal_t *journal, int wait)
{ {
journal_superblock_t *sb = journal->j_superblock; journal_superblock_t *sb = journal->j_superblock;
...@@ -1040,12 +1061,14 @@ static int load_superblock(journal_t *journal) ...@@ -1040,12 +1061,14 @@ static int load_superblock(journal_t *journal)
} }
/* /**
* int journal_load() - Read journal from disk.
* @journal: Journal to act on.
*
* Given a journal_t structure which tells us which disk blocks contain * Given a journal_t structure which tells us which disk blocks contain
* a journal, read the journal from disk to initialise the in-memory * a journal, read the journal from disk to initialise the in-memory
* structures. * structures.
*/ */
int journal_load(journal_t *journal) int journal_load(journal_t *journal)
{ {
int err; int err;
...@@ -1090,11 +1113,13 @@ int journal_load(journal_t *journal) ...@@ -1090,11 +1113,13 @@ int journal_load(journal_t *journal)
return -EIO; return -EIO;
} }
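A sketch of how the create/load pair is typically split between first use and ordinary mounts (hypothetical helper, abbreviated error handling):

	static int myfs_attach_journal(journal_t *journal, int newly_created)
	{
		int err;

		if (newly_created)
			err = journal_create(journal);	/* write a fresh journal superblock  */
		else
			err = journal_load(journal);	/* read the sb, replay if it was dirty */

		if (err)
			journal_destroy(journal);	/* hand the journal_t back on failure */
		return err;
	}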
/* /**
* void journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
* Release a journal_t structure once it is no longer in use by the * Release a journal_t structure once it is no longer in use by the
* journaled object. * journaled object.
*/ */
void journal_destroy (journal_t *journal) void journal_destroy (journal_t *journal)
{ {
/* Wait for the commit thread to wake up and die. */ /* Wait for the commit thread to wake up and die. */
...@@ -1131,8 +1156,12 @@ void journal_destroy (journal_t *journal) ...@@ -1131,8 +1156,12 @@ void journal_destroy (journal_t *journal)
} }
/* Published API: Check whether the journal uses all of a given set of /**
* features. Return true (non-zero) if it does. */ *int journal_check_used_features () - Check if features specified are used.
*
* Check whether the journal uses all of a given set of
* features. Return true (non-zero) if it does.
**/
int journal_check_used_features (journal_t *journal, unsigned long compat, int journal_check_used_features (journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat) unsigned long ro, unsigned long incompat)
...@@ -1154,7 +1183,10 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, ...@@ -1154,7 +1183,10 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
return 0; return 0;
} }
/* Published API: Check whether the journaling code supports the use of /**
* int journal_check_available_features() - Check feature set in journalling layer
*
* Check whether the journaling code supports the use of
* all of a given set of features on this journal. Return true * all of a given set of features on this journal. Return true
* (non-zero) if it can. */ * (non-zero) if it can. */
...@@ -1183,8 +1215,13 @@ int journal_check_available_features (journal_t *journal, unsigned long compat, ...@@ -1183,8 +1215,13 @@ int journal_check_available_features (journal_t *journal, unsigned long compat,
return 0; return 0;
} }
/* Published API: Mark a given journal feature as present on the /**
* superblock. Returns true if the requested features could be set. */ * int journal_set_features () - Mark a given journal feature in the superblock
*
* Mark a given journal feature as present on the
* superblock. Returns true if the requested features could be set.
*
*/
int journal_set_features (journal_t *journal, unsigned long compat, int journal_set_features (journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat) unsigned long ro, unsigned long incompat)
...@@ -1210,12 +1247,12 @@ int journal_set_features (journal_t *journal, unsigned long compat, ...@@ -1210,12 +1247,12 @@ int journal_set_features (journal_t *journal, unsigned long compat,
} }
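A sketch of the probe-then-set pattern these three calls support (JFS_FEATURE_INCOMPAT_REVOKE is used only as an illustration; consult the jbd headers for the feature bits that actually exist):

	if (!journal_check_used_features(journal, 0, 0,
					 JFS_FEATURE_INCOMPAT_REVOKE)) {
		if (!journal_check_available_features(journal, 0, 0,
					 JFS_FEATURE_INCOMPAT_REVOKE))
			return -EINVAL;		/* this jbd cannot drive the feature */
		journal_set_features(journal, 0, 0,
					 JFS_FEATURE_INCOMPAT_REVOKE);
	}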
/* /**
* Published API: * int journal_update_format () - Update on-disk journal structure.
*
* Given an initialised but unloaded journal struct, poke about in the * Given an initialised but unloaded journal struct, poke about in the
* on-disk structure to update it to the most recent supported version. * on-disk structure to update it to the most recent supported version.
*/ */
int journal_update_format (journal_t *journal) int journal_update_format (journal_t *journal)
{ {
journal_superblock_t *sb; journal_superblock_t *sb;
...@@ -1265,7 +1302,10 @@ static int journal_convert_superblock_v1(journal_t *journal, ...@@ -1265,7 +1302,10 @@ static int journal_convert_superblock_v1(journal_t *journal,
} }
/* /**
* int journal_flush () - Flush journal
* @journal: Journal to act on.
*
* Flush all data for a given journal to disk and empty the journal. * Flush all data for a given journal to disk and empty the journal.
* Filesystems can use this when remounting readonly to ensure that * Filesystems can use this when remounting readonly to ensure that
* recovery does not need to happen on remount. * recovery does not need to happen on remount.
...@@ -1319,12 +1359,16 @@ int journal_flush (journal_t *journal) ...@@ -1319,12 +1359,16 @@ int journal_flush (journal_t *journal)
return err; return err;
} }
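A sketch of the remount-read-only use mentioned in the comment (hypothetical helper):

	static int myfs_remount_ro(journal_t *journal)
	{
		/* commit and checkpoint everything; the journal ends up empty,
		 * so a later mount needs no recovery */
		return journal_flush(journal);
	}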
/* /**
* int journal_wipe() - Wipe journal contents
* @journal: Journal to act on.
* @write: flag (see below)
*
* Wipe out all of the contents of a journal, safely. This will produce * Wipe out all of the contents of a journal, safely. This will produce
* a warning if the journal contains any valid recovery information. * a warning if the journal contains any valid recovery information.
* Must be called between journal_init_*() and journal_load(). * Must be called between journal_init_*() and journal_load().
* *
* If (write) is non-zero, then we wipe out the journal on disk; otherwise * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
* we merely suppress recovery. * we merely suppress recovery.
*/ */
...@@ -1373,43 +1417,11 @@ const char * journal_dev_name(journal_t *journal) ...@@ -1373,43 +1417,11 @@ const char * journal_dev_name(journal_t *journal)
} }
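A sketch of the norecovery-style mount the comment is aimed at (the mount flag is hypothetical); note the call sits between journal_init_*() and journal_load():

	journal = journal_init_inode(journal_inode);
	if (mount_flags & MYFS_MOUNT_NORECOVERY)
		journal_wipe(journal, 0);	/* suppress replay, keep the journal on disk */
	err = journal_load(journal);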
/* /*
* journal_abort: perform a complete, immediate shutdown of the ENTIRE * Journal abort has very specific semantics, which we describe
* journal (not of a single transaction). This operation cannot be * for journal abort.
* undone without closing and reopening the journal.
*
* The journal_abort function is intended to support higher level error
* recovery mechanisms such as the ext2/ext3 remount-readonly error
* mode.
*
* Journal abort has very specific semantics. Any existing dirty,
* unjournaled buffers in the main filesystem will still be written to
* disk by bdflush, but the journaling mechanism will be suspended
* immediately and no further transaction commits will be honoured.
*
* Any dirty, journaled buffers will be written back to disk without
* hitting the journal. Atomicity cannot be guaranteed on an aborted
* filesystem, but we _do_ attempt to leave as much data as possible
* behind for fsck to use for cleanup.
* *
* Any attempt to get a new transaction handle on a journal which is in * Two internal functions, which provide abort to the jbd layer
* ABORT state will just result in an -EROFS error return. A * itself are here.
* journal_stop on an existing handle will return -EIO if we have
* entered abort state during the update.
*
* Recursive transactions are not disturbed by journal abort until the
* final journal_stop, which will receive the -EIO error.
*
* Finally, the journal_abort call allows the caller to supply an errno
* which will be recored (if possible) in the journal superblock. This
* allows a client to record failure conditions in the middle of a
* transaction without having to complete the transaction to record the
* failure to disk. ext3_error, for example, now uses this
* functionality.
*
* Errors which originate from within the journaling layer will NOT
* supply an errno; a null errno implies that absolutely no further
* writes are done to the journal (unless there are any already in
* progress).
*/ */
/* Quick version for internal journal use (doesn't lock the journal). /* Quick version for internal journal use (doesn't lock the journal).
...@@ -1447,7 +1459,52 @@ void __journal_abort_soft (journal_t *journal, int errno) ...@@ -1447,7 +1459,52 @@ void __journal_abort_soft (journal_t *journal, int errno)
journal_update_superblock(journal, 1); journal_update_superblock(journal, 1);
} }
/* Full version for external use */ /**
* void journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
* @errno: an error number to record in the journal indicating
* the reason for the shutdown.
*
* Perform a complete, immediate shutdown of the ENTIRE
* journal (not of a single transaction). This operation cannot be
* undone without closing and reopening the journal.
*
* The journal_abort function is intended to support higher level error
* recovery mechanisms such as the ext2/ext3 remount-readonly error
* mode.
*
* Journal abort has very specific semantics. Any existing dirty,
* unjournaled buffers in the main filesystem will still be written to
* disk by bdflush, but the journaling mechanism will be suspended
* immediately and no further transaction commits will be honoured.
*
* Any dirty, journaled buffers will be written back to disk without
* hitting the journal. Atomicity cannot be guaranteed on an aborted
* filesystem, but we _do_ attempt to leave as much data as possible
* behind for fsck to use for cleanup.
*
* Any attempt to get a new transaction handle on a journal which is in
* ABORT state will just result in an -EROFS error return. A
* journal_stop on an existing handle will return -EIO if we have
* entered abort state during the update.
*
* Recursive transactions are not disturbed by journal abort until the
* final journal_stop, which will receive the -EIO error.
*
* Finally, the journal_abort call allows the caller to supply an errno
* which will be recorded (if possible) in the journal superblock. This
* allows a client to record failure conditions in the middle of a
* transaction without having to complete the transaction to record the
* failure to disk. ext3_error, for example, now uses this
* functionality.
*
* Errors which originate from within the journaling layer will NOT
* supply an errno; a null errno implies that absolutely no further
* writes are done to the journal (unless there are any already in
* progress).
*
*/
void journal_abort (journal_t *journal, int errno) void journal_abort (journal_t *journal, int errno)
{ {
lock_journal(journal); lock_journal(journal);
...@@ -1455,6 +1512,17 @@ void journal_abort (journal_t *journal, int errno) ...@@ -1455,6 +1512,17 @@ void journal_abort (journal_t *journal, int errno)
unlock_journal(journal); unlock_journal(journal);
} }
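A sketch of the ext3_error-style use described above (hypothetical wrapper):

	static void myfs_handle_error(journal_t *journal)
	{
		/* record the failure in the journal superblock and stop committing */
		journal_abort(journal, -EIO);

		/* from now on journal_start() fails with -EROFS and a pending
		 * journal_stop() returns -EIO; the caller would typically flip
		 * the filesystem read-only rather than finish the transaction */
	}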
/**
* int journal_errno () - returns the journal's error state.
* @journal: journal to examine.
*
* This is the errno number set with journal_abort(), the last
* time the journal was mounted - if the journal was stopped
* without calling abort this will be 0.
*
* If the journal has been aborted on this mount time -EROFS will
* be returned.
*/
int journal_errno (journal_t *journal) int journal_errno (journal_t *journal)
{ {
int err; int err;
...@@ -1468,6 +1536,14 @@ int journal_errno (journal_t *journal) ...@@ -1468,6 +1536,14 @@ int journal_errno (journal_t *journal)
return err; return err;
} }
/**
* int journal_clear_err () - clears the journal's error state
*
* An error must be cleared or Acked to take a FS out of readonly
* mode.
*/
int journal_clear_err (journal_t *journal) int journal_clear_err (journal_t *journal)
{ {
int err = 0; int err = 0;
...@@ -1481,6 +1557,13 @@ int journal_clear_err (journal_t *journal) ...@@ -1481,6 +1557,13 @@ int journal_clear_err (journal_t *journal)
return err; return err;
} }
/**
* void journal_ack_err() - Ack journal err.
*
* An error must be cleared or Acked to take a FS out of readonly
* mode.
*/
void journal_ack_err (journal_t *journal) void journal_ack_err (journal_t *journal)
{ {
lock_journal(journal); lock_journal(journal);
......
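A sketch of how a client might consume the recorded error when deciding whether it may go read-write again (abbreviated, hypothetical flow):

	err = journal_errno(journal);
	if (err == -EROFS) {
		/* aborted during this mount: refuse to leave read-only mode */
	} else if (err) {
		/* stale error from an earlier mount: clear it (journal_ack_err()
		 * would serve equally well) before allowing read-write operation */
		journal_clear_err(journal);
	}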
...@@ -206,20 +206,22 @@ do { \ ...@@ -206,20 +206,22 @@ do { \
var -= ((journal)->j_last - (journal)->j_first); \ var -= ((journal)->j_last - (journal)->j_first); \
} while (0) } while (0)
/* /**
* journal_recover * int journal_recover(journal_t *journal) - recovers an on-disk journal
* @journal: the journal to recover
* *
* The primary function for recovering the log contents when mounting a * The primary function for recovering the log contents when mounting a
* journaled device. * journaled device.
* */
int journal_recover(journal_t *journal)
{
/*
* Recovery is done in three passes. In the first pass, we look for the * Recovery is done in three passes. In the first pass, we look for the
* end of the log. In the second, we assemble the list of revoke * end of the log. In the second, we assemble the list of revoke
* blocks. In the third and final pass, we replay any un-revoked blocks * blocks. In the third and final pass, we replay any un-revoked blocks
* in the log. * in the log.
*/ */
int journal_recover(journal_t *journal)
{
int err; int err;
journal_superblock_t * sb; journal_superblock_t * sb;
...@@ -263,20 +265,23 @@ int journal_recover(journal_t *journal) ...@@ -263,20 +265,23 @@ int journal_recover(journal_t *journal)
return err; return err;
} }
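Filesystems normally reach these two through journal_load() and journal_wipe() rather than calling them directly; a minimal sketch of the choice they embody (the 'replay' flag is illustrative only):

	if (replay)
		err = journal_recover(journal);		/* three-pass scan and replay */
	else
		err = journal_skip_recovery(journal);	/* note and discard the log   */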
/* /**
* journal_skip_recovery * int journal_skip_recovery() - Start journal and wipe existing records
* @journal: journal to startup
* *
* Locate any valid recovery information from the journal and set up the * Locate any valid recovery information from the journal and set up the
* journal structures in memory to ignore it (presumably because the * journal structures in memory to ignore it (presumably because the
* caller has evidence that it is out of date). * caller has evidence that it is out of date).
* * This function does'nt appear to be exorted..
*/
int journal_skip_recovery(journal_t *journal)
{
/*
* We perform one pass over the journal to allow us to tell the user how * We perform one pass over the journal to allow us to tell the user how
* much recovery information is being erased, and to let us initialise * much recovery information is being erased, and to let us initialise
* the journal transaction sequence numbers to the next unused ID. * the journal transaction sequence numbers to the next unused ID.
*/ */
int journal_skip_recovery(journal_t *journal)
{
int err; int err;
journal_superblock_t * sb; journal_superblock_t * sb;
......
...@@ -222,19 +222,20 @@ static handle_t *new_handle(int nblocks) ...@@ -222,19 +222,20 @@ static handle_t *new_handle(int nblocks)
return handle; return handle;
} }
/* /**
* Obtain a new handle. * handle_t *journal_start() - Obtain a new handle.
* @journal: Journal to start transaction on.
* @nblocks: number of block buffers we might modify
* *
* We make sure that the transaction can guarantee at least nblocks of * We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee * modified buffers in the log. We block until the log can guarantee
* that much space. * that much space.
* *
* This function is visible to journal users (like ext2fs), so is not * This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked. * called with the journal already locked.
* *
* Return a pointer to a newly allocated handle, or NULL on failure * Return a pointer to a newly allocated handle, or NULL on failure
*/ */
handle_t *journal_start(journal_t *journal, int nblocks) handle_t *journal_start(journal_t *journal, int nblocks)
{ {
handle_t *handle = journal_current_handle(); handle_t *handle = journal_current_handle();
...@@ -324,7 +325,11 @@ static int try_start_this_handle(journal_t *journal, handle_t *handle) ...@@ -324,7 +325,11 @@ static int try_start_this_handle(journal_t *journal, handle_t *handle)
return ret; return ret;
} }
/* /**
* handle_t *journal_try_start() - Don't block, but try and get a handle
* @journal: Journal to start transaction on.
* @nblocks: number of block buffers we might modify
*
* Try to start a handle, but non-blockingly. If we weren't able * Try to start a handle, but non-blockingly. If we weren't able
* to, return an ERR_PTR value. * to, return an ERR_PTR value.
*/ */
...@@ -368,8 +373,10 @@ handle_t *journal_try_start(journal_t *journal, int nblocks) ...@@ -368,8 +373,10 @@ handle_t *journal_try_start(journal_t *journal, int nblocks)
return handle; return handle;
} }
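A minimal sketch of falling back from the non-blocking variant to the blocking one (the credit count and helper name are illustrative, error handling is abbreviated):

	#include <linux/err.h>
	#include <linux/jbd.h>

	static int myfs_small_update(journal_t *journal)
	{
		handle_t *handle;

		handle = journal_try_start(journal, 8);		/* don't sleep for log space */
		if (!handle || IS_ERR(handle))
			handle = journal_start(journal, 8);	/* fall back: may block      */
		if (!handle || IS_ERR(handle))
			return -EIO;				/* illustrative error handling */

		/* journal_get_write_access() / journal_dirty_metadata() go here */

		return journal_stop(handle);	/* -EIO if the journal was aborted meanwhile */
	}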
/* /**
* journal_extend: extend buffer credits. * int journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
* *
* Some transactions, such as large extends and truncates, can be done * Some transactions, such as large extends and truncates, can be done
* atomically all at once or in several stages. The operation requests * atomically all at once or in several stages. The operation requests
...@@ -377,7 +384,7 @@ handle_t *journal_try_start(journal_t *journal, int nblocks) ...@@ -377,7 +384,7 @@ handle_t *journal_try_start(journal_t *journal, int nblocks)
* extend its credit if it needs more. * extend its credit if it needs more.
* *
* journal_extend tries to give the running handle more buffer credits. * journal_extend tries to give the running handle more buffer credits.
* It does not guarantee that allocation: this is a best-effort only. * It does not guarantee that allocation - this is a best-effort only.
* The calling process MUST be able to deal cleanly with a failure to * The calling process MUST be able to deal cleanly with a failure to
* extend here. * extend here.
* *
...@@ -386,7 +393,6 @@ handle_t *journal_try_start(journal_t *journal, int nblocks) ...@@ -386,7 +393,6 @@ handle_t *journal_try_start(journal_t *journal, int nblocks)
* return code < 0 implies an error * return code < 0 implies an error
* return code > 0 implies normal transaction-full status. * return code > 0 implies normal transaction-full status.
*/ */
int journal_extend (handle_t *handle, int nblocks) int journal_extend (handle_t *handle, int nblocks)
{ {
transaction_t *transaction = handle->h_transaction; transaction_t *transaction = handle->h_transaction;
...@@ -435,8 +441,12 @@ int journal_extend (handle_t *handle, int nblocks) ...@@ -435,8 +441,12 @@ int journal_extend (handle_t *handle, int nblocks)
} }
/* /**
* journal_restart: restart a handle for a multi-transaction filesystem * int journal_restart() - restart a handle.
* @handle: handle to restart
* @nblocks: nr credits requested
*
* Restart a handle for a multi-transaction filesystem
* operation. * operation.
* *
* If the journal_extend() call above fails to grant new buffer credits * If the journal_extend() call above fails to grant new buffer credits
...@@ -478,8 +488,9 @@ int journal_restart(handle_t *handle, int nblocks) ...@@ -478,8 +488,9 @@ int journal_restart(handle_t *handle, int nblocks)
} }
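A sketch of the extend-or-restart pattern both comments describe, much as the ext3 helpers near the top of this diff use it:

	if (journal_extend(handle, nblocks) != 0) {
		/* no room left in the running transaction: commit what this
		 * handle has done so far and start again with fresh credits */
		err = journal_restart(handle, nblocks);
		if (err)
			return err;
	}
	/* handle now has at least nblocks of credit either way */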
/* /**
* Barrier operation: establish a transaction barrier. * void journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
* *
* This locks out any further updates from being started, and blocks * This locks out any further updates from being started, and blocks
* until all existing updates have completed, returning only once the * until all existing updates have completed, returning only once the
...@@ -487,7 +498,6 @@ int journal_restart(handle_t *handle, int nblocks) ...@@ -487,7 +498,6 @@ int journal_restart(handle_t *handle, int nblocks)
* *
* The journal lock should not be held on entry. * The journal lock should not be held on entry.
*/ */
void journal_lock_updates (journal_t *journal) void journal_lock_updates (journal_t *journal)
{ {
lock_journal(journal); lock_journal(journal);
...@@ -515,12 +525,14 @@ void journal_lock_updates (journal_t *journal) ...@@ -515,12 +525,14 @@ void journal_lock_updates (journal_t *journal)
down(&journal->j_barrier); down(&journal->j_barrier);
} }
/* /**
* void journal_unlock_updates (journal_t* journal) - release barrier
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with journal_lock_updates(). * Release a transaction barrier obtained with journal_lock_updates().
* *
* Should be called without the journal lock held. * Should be called without the journal lock held.
*/ */
void journal_unlock_updates (journal_t *journal) void journal_unlock_updates (journal_t *journal)
{ {
lock_journal(journal); lock_journal(journal);
...@@ -566,9 +578,6 @@ static void jbd_unexpected_dirty_buffer(struct journal_head *jh) ...@@ -566,9 +578,6 @@ static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
} }
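A sketch of the barrier pair (this is essentially how journal_flush() quiesces the journal around its own work):

	journal_lock_updates(journal);	/* wait for running handles, block new ones */
	/* the journal is quiescent here: no transaction is being built */
	journal_unlock_updates(journal);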
/* /*
* journal_get_write_access: notify intent to modify a buffer for metadata
* (not data) update.
*
* If the buffer is already part of the current transaction, then there * If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior * is nothing we need to do. If it is already part of a prior
* transaction which we are still committing to disk, then we need to * transaction which we are still committing to disk, then we need to
...@@ -577,7 +586,6 @@ static void jbd_unexpected_dirty_buffer(struct journal_head *jh) ...@@ -577,7 +586,6 @@ static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
* the handle's metadata buffer credits (unless the buffer is already * the handle's metadata buffer credits (unless the buffer is already
* part of the transaction, that is). * part of the transaction, that is).
* *
* Returns an error code or 0 on success.
*/ */
static int static int
...@@ -786,6 +794,17 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) ...@@ -786,6 +794,17 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
return error; return error;
} }
/**
* int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
*
* Returns an error code or 0 on success.
*
* In full data journalling mode the buffer may be of type BJ_AsyncData,
* because we're write()ing a buffer which is also part of a shared mapping.
*/
int journal_get_write_access (handle_t *handle, struct buffer_head *bh) int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
{ {
transaction_t *transaction = handle->h_transaction; transaction_t *transaction = handle->h_transaction;
...@@ -816,6 +835,13 @@ int journal_get_write_access (handle_t *handle, struct buffer_head *bh) ...@@ -816,6 +835,13 @@ int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
* There is no lock ranking violation: it was a newly created, * There is no lock ranking violation: it was a newly created,
* unlocked buffer beforehand. */ * unlocked buffer beforehand. */
/**
* int journal_get_create_access () - notify intent to use newly created bh
* @handle: transaction to add the new buffer to
* @bh: new buffer.
*
* Call this if you create a new bh.
*/
int journal_get_create_access (handle_t *handle, struct buffer_head *bh) int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
{ {
transaction_t *transaction = handle->h_transaction; transaction_t *transaction = handle->h_transaction;
...@@ -875,13 +901,14 @@ int journal_get_create_access (handle_t *handle, struct buffer_head *bh) ...@@ -875,13 +901,14 @@ int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
/* /**
* journal_get_undo_access: Notify intent to modify metadata with non- * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
* rewindable consequences * @handle: transaction
* @bh: buffer to undo
* *
* Sometimes there is a need to distinguish between metadata which has * Sometimes there is a need to distinguish between metadata which has
* been committed to disk and that which has not. The ext3fs code uses * been committed to disk and that which has not. The ext3fs code uses
* this for freeing and allocating space: we have to make sure that we * this for freeing and allocating space, we have to make sure that we
* do not reuse freed space until the deallocation has been committed, * do not reuse freed space until the deallocation has been committed,
* since if we overwrote that space we would make the delete * since if we overwrote that space we would make the delete
* un-rewindable in case of a crash. * un-rewindable in case of a crash.
...@@ -893,13 +920,12 @@ int journal_get_create_access (handle_t *handle, struct buffer_head *bh) ...@@ -893,13 +920,12 @@ int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
* as we know that the buffer has definitely been committed to disk. * as we know that the buffer has definitely been committed to disk.
* *
* We never need to know which transaction the committed data is part * We never need to know which transaction the committed data is part
* of: buffers touched here are guaranteed to be dirtied later and so * of, buffers touched here are guaranteed to be dirtied later and so
* will be committed to a new transaction in due course, at which point * will be committed to a new transaction in due course, at which point
* we can discard the old committed data pointer. * we can discard the old committed data pointer.
* *
* Returns error number or 0 on success. * Returns error number or 0 on success.
*/ */
int journal_get_undo_access (handle_t *handle, struct buffer_head *bh) int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
{ {
journal_t *journal = handle->h_transaction->t_journal; journal_t *journal = handle->h_transaction->t_journal;
...@@ -942,21 +968,23 @@ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh) ...@@ -942,21 +968,23 @@ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
return err; return err;
} }
/* /**
* journal_dirty_data: mark a buffer as containing dirty data which * int journal_dirty_data() - mark a buffer as containing dirty data which needs to be flushed before we can commit the current transaction.
* needs to be flushed before we can commit the current transaction. * @handle: transaction
* @bh: bufferhead to mark
* *
* The buffer is placed on the transaction's data list and is marked as * The buffer is placed on the transaction's data list and is marked as
* belonging to the transaction. * belonging to the transaction.
* *
* Returns error number or 0 on success. * Returns error number or 0 on success.
* */
int journal_dirty_data (handle_t *handle, struct buffer_head *bh)
{
/*
* journal_dirty_data() can be called via page_launder->ext3_writepage * journal_dirty_data() can be called via page_launder->ext3_writepage
* by kswapd. So it cannot block. Happily, there's nothing here * by kswapd. So it cannot block. Happily, there's nothing here
* which needs lock_journal if `async' is set. * which needs lock_journal if `async' is set.
*/ */
int journal_dirty_data (handle_t *handle, struct buffer_head *bh)
{
journal_t *journal = handle->h_transaction->t_journal; journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0; int need_brelse = 0;
struct journal_head *jh; struct journal_head *jh;
...@@ -1097,24 +1125,28 @@ int journal_dirty_data (handle_t *handle, struct buffer_head *bh) ...@@ -1097,24 +1125,28 @@ int journal_dirty_data (handle_t *handle, struct buffer_head *bh)
return 0; return 0;
} }
/* /**
* journal_dirty_metadata: mark a buffer as containing dirty metadata * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
* which needs to be journaled as part of the current transaction. * @handle: transaction to add buffer to.
* @bh: buffer to mark
*
* mark dirty metadata which needs to be journaled as part of the current transaction.
* *
* The buffer is placed on the transaction's metadata list and is marked * The buffer is placed on the transaction's metadata list and is marked
* as belonging to the transaction. * as belonging to the transaction.
* *
* Returns error number or 0 on success.
*/
int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
{
/*
* Special care needs to be taken if the buffer already belongs to the * Special care needs to be taken if the buffer already belongs to the
* current committing transaction (in which case we should have frozen * current committing transaction (in which case we should have frozen
* data present for that commit). In that case, we don't relink the * data present for that commit). In that case, we don't relink the
* buffer: that only gets done when the old transaction finally * buffer: that only gets done when the old transaction finally
* completes its commit. * completes its commit.
* *
* Returns error number or 0 on success.
*/ */
int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction; transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal; journal_t *journal = transaction->t_journal;
struct journal_head *jh = bh2jh(bh); struct journal_head *jh = bh2jh(bh);
...@@ -1199,9 +1231,12 @@ void journal_release_buffer (handle_t *handle, struct buffer_head *bh) ...@@ -1199,9 +1231,12 @@ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
} }
#endif #endif
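Pulling the buffer-level calls together, a sketch of one metadata update inside a running handle (error handling abbreviated):

	err = journal_get_write_access(handle, bh);	/* declare intent first    */
	if (err)
		return err;
	/* ... modify the buffer contents here ... */
	err = journal_dirty_metadata(handle, bh);	/* hand it to the journal  */

	/* a freshly allocated block would use journal_get_create_access()
	 * instead of the write-access call; metadata being freed, whose old
	 * contents must survive until commit, uses journal_get_undo_access() */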
/* /**
* journal_forget: bforget() for potentially-journaled buffers. We can * void journal_forget() - bforget() for potentially-journaled buffers.
* only do the bforget if there are no commits pending against the * @handle: transaction handle
* @bh: bh to 'forget'
*
* We can only do the bforget if there are no commits pending against the
* buffer. If the buffer is dirty in the current running transaction we * buffer. If the buffer is dirty in the current running transaction we
* can safely unlink it. * can safely unlink it.
* *
...@@ -1213,7 +1248,6 @@ void journal_release_buffer (handle_t *handle, struct buffer_head *bh) ...@@ -1213,7 +1248,6 @@ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
* Allow this call even if the handle has aborted --- it may be part of * Allow this call even if the handle has aborted --- it may be part of
* the caller's cleanup after an abort. * the caller's cleanup after an abort.
*/ */
void journal_forget (handle_t *handle, struct buffer_head *bh) void journal_forget (handle_t *handle, struct buffer_head *bh)
{ {
transaction_t *transaction = handle->h_transaction; transaction_t *transaction = handle->h_transaction;
...@@ -1352,8 +1386,14 @@ void journal_sync_buffer(struct buffer_head *bh) ...@@ -1352,8 +1386,14 @@ void journal_sync_buffer(struct buffer_head *bh)
} }
#endif #endif
/* /**
* Register a callback function for this handle. The function will be * void journal_callback_set() - Register a callback function for this handle.
* @handle: handle to attach the callback to.
* @func: function to callback.
* @jcb: structure with additional information required by func(), and
* some space for jbd internal information.
*
* The function will be
* called when the transaction that this handle is part of has been * called when the transaction that this handle is part of has been
* committed to disk with the original callback data struct and the * committed to disk with the original callback data struct and the
* error status of the journal as parameters. There is no guarantee of * error status of the journal as parameters. There is no guarantee of
...@@ -1374,7 +1414,11 @@ void journal_callback_set(handle_t *handle, ...@@ -1374,7 +1414,11 @@ void journal_callback_set(handle_t *handle,
jcb->jcb_func = func; jcb->jcb_func = func;
} }
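A sketch of registering a commit callback, assuming the two-argument callback form described above and a hypothetical structure that embeds the jbd callback struct so the handler can find its own data:

	struct myfs_commit_waiter {
		struct journal_callback jcb;	/* kept first so the cast below works */
		struct inode *inode;
	};

	static void myfs_commit_done(struct journal_callback *jcb, int error)
	{
		struct myfs_commit_waiter *w = (struct myfs_commit_waiter *)jcb;
		/* the transaction this handle belonged to is now on disk; error is
		 * non-zero if the journal was aborted meanwhile; safe to act on
		 * w->inode here */
	}

	/* while the handle is still open: */
	journal_callback_set(handle, myfs_commit_done, &waiter->jcb);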
/*
/**
* int journal_stop() - complete a transaction
* @handle: transaction to complete.
*
* All done for a particular handle. * All done for a particular handle.
* *
* There is not much action needed here. We just return any remaining * There is not much action needed here. We just return any remaining
...@@ -1387,7 +1431,6 @@ void journal_callback_set(handle_t *handle, ...@@ -1387,7 +1431,6 @@ void journal_callback_set(handle_t *handle,
* return -EIO if a journal_abort has been executed since the * return -EIO if a journal_abort has been executed since the
* transaction began. * transaction began.
*/ */
int journal_stop(handle_t *handle) int journal_stop(handle_t *handle)
{ {
transaction_t *transaction = handle->h_transaction; transaction_t *transaction = handle->h_transaction;
...@@ -1473,8 +1516,10 @@ int journal_stop(handle_t *handle) ...@@ -1473,8 +1516,10 @@ int journal_stop(handle_t *handle)
return err; return err;
} }
/* /** int journal_force_commit() - force any uncommitted transactions
* For synchronous operations: force any uncommitted trasnactions * @journal: journal to force
*
* For synchronous operations: force any uncommitted transactions
* to disk. May seem kludgy, but it reuses all the handle batching * to disk. May seem kludgy, but it reuses all the handle batching
* code in a very simple manner. * code in a very simple manner.
*/ */
...@@ -1667,6 +1712,26 @@ static inline int __journal_try_to_free_buffer(struct buffer_head *bh) ...@@ -1667,6 +1712,26 @@ static inline int __journal_try_to_free_buffer(struct buffer_head *bh)
return 0; return 0;
} }
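A sketch of the synchronous-path use the comment describes (hypothetical sync helper):

	static int myfs_sync_fs(journal_t *journal)
	{
		/* opens and closes a handle internally, so anything sitting in
		 * the running transaction is pushed out to the log */
		return journal_force_commit(journal);
	}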
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: 'IO' mode for try_to_free_buffers()
*
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
* so try_to_free_buffers() can reap them.
*
* This function returns non-zero if we wish try_to_free_buffers()
* to be called. We do this if the page is releasable by try_to_free_buffers().
* We also do it if the page has locked or dirty buffers and the caller wants
* us to perform sync or async writeout.
*/
int journal_try_to_free_buffers(journal_t *journal,
struct page *page, int unused_gfp_mask)
{
/* /*
* journal_try_to_free_buffers(). Try to remove all this page's buffers * journal_try_to_free_buffers(). Try to remove all this page's buffers
* from the journal. * from the journal.
...@@ -1689,9 +1754,6 @@ static inline int __journal_try_to_free_buffer(struct buffer_head *bh) ...@@ -1689,9 +1754,6 @@ static inline int __journal_try_to_free_buffer(struct buffer_head *bh)
* cannot happen because we never reallocate freed data as metadata * cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes? * while the data is part of a transaction. Yes?
*/ */
int journal_try_to_free_buffers(journal_t *journal,
struct page *page, int unused_gfp_mask)
{
struct buffer_head *head; struct buffer_head *head;
struct buffer_head *bh; struct buffer_head *bh;
int ret = 0; int ret = 0;
...@@ -1886,8 +1948,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) ...@@ -1886,8 +1948,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
return may_free; return may_free;
} }
/* /**
* Return non-zero if the page's buffers were successfully reaped * int journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
* @offset: length of page to invalidate.
*
* Reap page buffers containing data after offset in page.
*
* Return non-zero if the page's buffers were successfully reaped.
*/ */
int journal_invalidatepage(journal_t *journal, int journal_invalidatepage(journal_t *journal,
struct page *page, struct page *page,
......
...@@ -116,6 +116,49 @@ mpage_alloc(struct block_device *bdev, ...@@ -116,6 +116,49 @@ mpage_alloc(struct block_device *bdev,
return bio; return bio;
} }
/*
* support function for mpage_readpages. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
* the page, which allows readpage to avoid triggering a duplicate call
* to get_block.
*
* The idea is to avoid adding buffers to pages that don't already have
* them. So when the buffer is up to date and the page size == block size,
* this marks the page up to date instead of adding new buffers.
*/
static void
map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
{
struct inode *inode = page->mapping->host;
struct buffer_head *page_bh, *head;
int block = 0;
if (!page_has_buffers(page)) {
/*
* don't make any buffers if there is only one buffer on
* the page and the page just needs to be set up to date
*/
if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
buffer_uptodate(bh)) {
SetPageUptodate(page);
return;
}
create_empty_buffers(page, 1 << inode->i_blkbits, 0);
}
head = page_buffers(page);
page_bh = head;
do {
if (block == page_block) {
page_bh->b_state = bh->b_state;
page_bh->b_bdev = bh->b_bdev;
page_bh->b_blocknr = bh->b_blocknr;
break;
}
page_bh = page_bh->b_this_page;
block++;
} while (page_bh != head);
}
/** /**
* mpage_readpages - populate an address space with some pages, and * mpage_readpages - populate an address space with some pages, and
* start reads against them. * start reads against them.
...@@ -186,6 +229,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, ...@@ -186,6 +229,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = (inode->i_size + blocksize - 1) >> blkbits; last_block = (inode->i_size + blocksize - 1) >> blkbits;
bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; for (page_block = 0; page_block < blocks_per_page;
page_block++, block_in_file++) { page_block++, block_in_file++) {
bh.b_state = 0; bh.b_state = 0;
...@@ -201,6 +245,17 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, ...@@ -201,6 +245,17 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
continue; continue;
} }
/* some filesystems will copy data into the page during
* the get_block call, in which case we don't want to
* read it again. map_buffer_to_page copies the data
* we just collected from get_block into the page's buffers
* so readpage doesn't have to repeat the get_block call
*/
if (buffer_uptodate(&bh)) {
map_buffer_to_page(page, &bh, page_block);
goto confused;
}
if (first_hole != blocks_per_page) if (first_hole != blocks_per_page)
goto confused; /* hole -> non-hole */ goto confused; /* hole -> non-hole */
...@@ -256,7 +311,10 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, ...@@ -256,7 +311,10 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
confused: confused:
if (bio) if (bio)
bio = mpage_bio_submit(READ, bio); bio = mpage_bio_submit(READ, bio);
if (!PageUptodate(page))
block_read_full_page(page, get_block); block_read_full_page(page, get_block);
else
unlock_page(page);
goto out; goto out;
} }
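For reference, a hedged sketch of the get_block behaviour the new buffer_uptodate() check above is catering for: a filesystem that keeps small file tails inline can satisfy the read inside get_block itself, map the buffer to block 0 and mark it up to date, and do_mpage_readpage() then falls back to block_read_full_page()/unlock_page() rather than issuing a duplicate read. All myfs_* helpers are hypothetical; note this relies on the bh.b_page assignment added above.

static int myfs_get_block(struct inode *inode, sector_t block,
			  struct buffer_head *bh_result, int create)
{
	if (myfs_block_is_inline_tail(inode, block)) {		/* hypothetical */
		char *kaddr = kmap(bh_result->b_page);
		myfs_copy_tail_into(inode, block, kaddr);	/* hypothetical */
		kunmap(bh_result->b_page);
		map_bh(bh_result, inode->i_sb, 0);	/* block 0: valid data, not for io */
		set_buffer_uptodate(bh_result);		/* triggers the new check above */
		return 0;
	}
	return myfs_map_real_block(inode, block, bh_result, create);	/* hypothetical */
}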
...@@ -344,6 +402,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, ...@@ -344,6 +402,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
sector_t boundary_block = 0; sector_t boundary_block = 0;
struct block_device *boundary_bdev = NULL; struct block_device *boundary_bdev = NULL;
int length; int length;
struct buffer_head map_bh;
if (page_has_buffers(page)) { if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page); struct buffer_head *head = page_buffers(page);
...@@ -401,8 +460,8 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, ...@@ -401,8 +460,8 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
BUG_ON(!PageUptodate(page)); BUG_ON(!PageUptodate(page));
block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = (inode->i_size - 1) >> blkbits; last_block = (inode->i_size - 1) >> blkbits;
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) { for (page_block = 0; page_block < blocks_per_page; ) {
struct buffer_head map_bh;
map_bh.b_state = 0; map_bh.b_state = 0;
if (get_block(inode, block_in_file, &map_bh, 1)) if (get_block(inode, block_in_file, &map_bh, 1))
...@@ -559,7 +618,6 @@ mpage_writepages(struct address_space *mapping, ...@@ -559,7 +618,6 @@ mpage_writepages(struct address_space *mapping,
int (*writepage)(struct page *page, struct writeback_control *wbc); int (*writepage)(struct page *page, struct writeback_control *wbc);
if (wbc->nonblocking && bdi_write_congested(bdi)) { if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1; wbc->encountered_congestion = 1;
return 0; return 0;
} }
...@@ -614,7 +672,6 @@ mpage_writepages(struct address_space *mapping, ...@@ -614,7 +672,6 @@ mpage_writepages(struct address_space *mapping,
if (ret || (--(wbc->nr_to_write) <= 0)) if (ret || (--(wbc->nr_to_write) <= 0))
done = 1; done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) { if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1; wbc->encountered_congestion = 1;
done = 1; done = 1;
} }
......
...@@ -535,6 +535,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, ...@@ -535,6 +535,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (retval) if (retval)
goto fput_in; goto fput_in;
retval = security_file_permission (in_file, MAY_READ);
if (retval)
goto fput_in;
/* /*
* Get output file, and verify that it is ok.. * Get output file, and verify that it is ok..
*/ */
...@@ -556,6 +560,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, ...@@ -556,6 +560,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (retval) if (retval)
goto fput_out; goto fput_out;
retval = security_file_permission (out_file, MAY_WRITE);
if (retval)
goto fput_out;
if (!ppos) if (!ppos)
ppos = &in_file->f_pos; ppos = &in_file->f_pos;
......
...@@ -11,6 +11,8 @@ ...@@ -11,6 +11,8 @@
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/unaligned.h> #include <asm/unaligned.h>
#include <linux/buffer_head.h> #include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
/* args for the create parameter of reiserfs_get_block */ /* args for the create parameter of reiserfs_get_block */
#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ #define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
...@@ -263,6 +265,9 @@ static int _get_block_create_0 (struct inode * inode, long block, ...@@ -263,6 +265,9 @@ static int _get_block_create_0 (struct inode * inode, long block,
ret = 0 ; ret = 0 ;
if (blocknr) { if (blocknr) {
map_bh(bh_result, inode->i_sb, blocknr); map_bh(bh_result, inode->i_sb, blocknr);
if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
set_buffer_boundary(bh_result);
}
} else } else
// We do not return -ENOENT if there is a hole but page is uptodate, because it means // We do not return -ENOENT if there is a hole but page is uptodate, because it means
// That there is some MMAPED data associated with it that is yet to be written to disk. // That there is some MMAPED data associated with it that is yet to be written to disk.
...@@ -286,7 +291,7 @@ static int _get_block_create_0 (struct inode * inode, long block, ...@@ -286,7 +291,7 @@ static int _get_block_create_0 (struct inode * inode, long block,
return -ENOENT; return -ENOENT;
} }
/* if we've got a direct item, and the buffer was uptodate, /* if we've got a direct item, and the buffer or page was uptodate,
** we don't want to pull data off disk again. skip to the ** we don't want to pull data off disk again. skip to the
** end, where we map the buffer and return ** end, where we map the buffer and return
*/ */
...@@ -367,7 +372,9 @@ static int _get_block_create_0 (struct inode * inode, long block, ...@@ -367,7 +372,9 @@ static int _get_block_create_0 (struct inode * inode, long block,
finished: finished:
pathrelse (&path); pathrelse (&path);
/* I _really_ doubt that you want it. Chris? */ /* this buffer has valid data, but isn't valid for io. mapping it to
* block #0 tells the rest of reiserfs it just has a tail in it
*/
map_bh(bh_result, inode->i_sb, 0); map_bh(bh_result, inode->i_sb, 0);
set_buffer_uptodate (bh_result); set_buffer_uptodate (bh_result);
return 0; return 0;
...@@ -842,6 +849,12 @@ int reiserfs_get_block (struct inode * inode, sector_t block, ...@@ -842,6 +849,12 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
return retval; return retval;
} }
static int
reiserfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
}
// //
// BAD: new directories have stat data of new type and all other items // BAD: new directories have stat data of new type and all other items
...@@ -1809,13 +1822,19 @@ static int map_block_for_writepage(struct inode *inode, ...@@ -1809,13 +1822,19 @@ static int map_block_for_writepage(struct inode *inode,
int use_get_block = 0 ; int use_get_block = 0 ;
int bytes_copied = 0 ; int bytes_copied = 0 ;
int copy_size ; int copy_size ;
int trans_running = 0;
/* catch places below that try to log something without starting a trans */
th.t_trans_id = 0;
if (!buffer_uptodate(bh_result)) {
buffer_error();
return -EIO;
}
kmap(bh_result->b_page) ; kmap(bh_result->b_page) ;
start_over: start_over:
reiserfs_write_lock(inode->i_sb); reiserfs_write_lock(inode->i_sb);
journal_begin(&th, inode->i_sb, jbegin_count) ;
reiserfs_update_inode_transaction(inode) ;
make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ; make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
research: research:
...@@ -1841,7 +1860,6 @@ static int map_block_for_writepage(struct inode *inode, ...@@ -1841,7 +1860,6 @@ static int map_block_for_writepage(struct inode *inode,
goto out ; goto out ;
} }
set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode); set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
set_buffer_uptodate(bh_result);
} else if (is_direct_le_ih(ih)) { } else if (is_direct_le_ih(ih)) {
char *p ; char *p ;
p = page_address(bh_result->b_page) ; p = page_address(bh_result->b_page) ;
...@@ -1850,7 +1868,20 @@ static int map_block_for_writepage(struct inode *inode, ...@@ -1850,7 +1868,20 @@ static int map_block_for_writepage(struct inode *inode,
fs_gen = get_generation(inode->i_sb) ; fs_gen = get_generation(inode->i_sb) ;
copy_item_head(&tmp_ih, ih) ; copy_item_head(&tmp_ih, ih) ;
if (!trans_running) {
/* vs-3050 is gone, no need to drop the path */
journal_begin(&th, inode->i_sb, jbegin_count) ;
reiserfs_update_inode_transaction(inode) ;
trans_running = 1;
if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
goto research;
}
}
reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
goto research; goto research;
...@@ -1861,7 +1892,6 @@ static int map_block_for_writepage(struct inode *inode, ...@@ -1861,7 +1892,6 @@ static int map_block_for_writepage(struct inode *inode,
journal_mark_dirty(&th, inode->i_sb, bh) ; journal_mark_dirty(&th, inode->i_sb, bh) ;
bytes_copied += copy_size ; bytes_copied += copy_size ;
set_block_dev_mapped(bh_result, 0, inode); set_block_dev_mapped(bh_result, 0, inode);
set_buffer_uptodate(bh_result);
/* are there still bytes left? */ /* are there still bytes left? */
if (bytes_copied < bh_result->b_size && if (bytes_copied < bh_result->b_size &&
...@@ -1878,7 +1908,10 @@ static int map_block_for_writepage(struct inode *inode, ...@@ -1878,7 +1908,10 @@ static int map_block_for_writepage(struct inode *inode,
out: out:
pathrelse(&path) ; pathrelse(&path) ;
if (trans_running) {
journal_end(&th, inode->i_sb, jbegin_count) ; journal_end(&th, inode->i_sb, jbegin_count) ;
trans_running = 0;
}
reiserfs_write_unlock(inode->i_sb); reiserfs_write_unlock(inode->i_sb);
/* this is where we fill in holes in the file. */ /* this is where we fill in holes in the file. */
...@@ -1894,49 +1927,77 @@ static int map_block_for_writepage(struct inode *inode, ...@@ -1894,49 +1927,77 @@ static int map_block_for_writepage(struct inode *inode,
} }
} }
kunmap(bh_result->b_page) ; kunmap(bh_result->b_page) ;
if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
/* we've copied data from the page into the direct item, so the
* buffer in the page is now clean, mark it to reflect that.
*/
lock_buffer(bh_result);
clear_buffer_dirty(bh_result);
unlock_buffer(bh_result);
}
return retval ; return retval ;
} }
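The restructuring above defers journal_begin() until the lookup actually finds a direct item that needs logging, so writes that only touch unformatted nodes never open a transaction. Stripped of the reiserfs details, this is the pattern (the example_* helpers are hypothetical):

static int example_map_or_log(struct inode *inode, struct buffer_head *bh,
			      int jbegin_count)
{
	struct reiserfs_transaction_handle th;
	int trans_running = 0;
	int ret;

	th.t_trans_id = 0;			/* catch logging without a trans */
	ret = example_cheap_lookup(inode, bh);	/* hypothetical: common case, no journal */
	if (ret <= 0)
		goto out;
	journal_begin(&th, inode->i_sb, jbegin_count);
	trans_running = 1;
	ret = example_log_direct_item(&th, inode, bh);	/* hypothetical */
out:
	if (trans_running)
		journal_end(&th, inode->i_sb, jbegin_count);
	return ret;
}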
/* helper func to get a buffer head ready for writepage to send to /*
** ll_rw_block * does the right thing for deciding when to lock a buffer and
*/ * mark it for io during a writepage. make sure the buffer is
static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) { * dirty before sending it here though.
struct buffer_head *bh ;
int i;
for(i = 0 ; i < nr ; i++) {
bh = bhp[i] ;
lock_buffer(bh) ;
mark_buffer_async_write(bh) ;
/* submit_bh doesn't care if the buffer is dirty, but nobody
** later on in the call chain will be cleaning it. So, we
** clean the buffer here, it still gets written either way.
*/ */
clear_buffer_dirty(bh) ; static void lock_buffer_for_writepage(struct page *page,
set_buffer_uptodate(bh) ; struct writeback_control *wbc,
submit_bh(WRITE, bh) ; struct buffer_head *bh)
{
if (wbc->sync_mode != WB_SYNC_NONE) {
lock_buffer(bh);
} else {
if (test_set_buffer_locked(bh)) {
__set_page_dirty_nobuffers(page);
return;
}
}
if (test_clear_buffer_dirty(bh)) {
if (!buffer_uptodate(bh))
buffer_error();
mark_buffer_async_write(bh);
} else {
unlock_buffer(bh);
} }
} }
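The sync_mode distinction above matters at the call site: background writeback must not block on a buffer somebody else holds locked (the page is just redirtied), while data-integrity writeback waits for the lock. A hedged sketch of the two kinds of callers, using only existing writeback_control fields:

struct writeback_control background_wbc = {
	.sync_mode   = WB_SYNC_NONE,	/* pdflush-style: trylock, redirty on contention */
	.nr_to_write = 1024,
};

struct writeback_control sync_wbc = {
	.sync_mode   = WB_SYNC_ALL,	/* fsync-style: lock_buffer() and wait */
	.nr_to_write = LONG_MAX,
};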
/*
* mason@suse.com: updated in 2.5.54 to follow the same general io
* start/recovery path as __block_write_full_page, along with special
* code to handle reiserfs tails.
*/
static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
struct inode *inode = page->mapping->host ; struct inode *inode = page->mapping->host ;
unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
unsigned last_offset = PAGE_CACHE_SIZE;
int error = 0; int error = 0;
unsigned long block ; unsigned long block ;
unsigned cur_offset = 0 ; struct buffer_head *head, *bh;
struct buffer_head *head, *bh ;
int partial = 0 ; int partial = 0 ;
struct buffer_head *arr[PAGE_CACHE_SIZE/512] ; int nr = 0;
int nr = 0 ;
if (!page_has_buffers(page)) /* The page dirty bit is cleared before writepage is called, which
block_prepare_write(page, 0, 0, NULL) ; * means we have to tell create_empty_buffers to make dirty buffers
* The page really should be up to date at this point, so tossing
* in the BH_Uptodate is just a sanity check.
*/
if (!page_has_buffers(page)) {
if (!PageUptodate(page))
buffer_error();
create_empty_buffers(page, inode->i_sb->s_blocksize,
(1 << BH_Dirty) | (1 << BH_Uptodate));
}
head = page_buffers(page) ;
/* last page in the file, zero out any contents past the /* last page in the file, zero out any contents past the
** last byte in the file ** last byte in the file
*/ */
if (page->index >= end_index) { if (page->index >= end_index) {
char *kaddr; char *kaddr;
unsigned last_offset;
last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ; last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
/* no file contents in this page */ /* no file contents in this page */
...@@ -1949,66 +2010,107 @@ static int reiserfs_write_full_page(struct page *page, struct writeback_control ...@@ -1949,66 +2010,107 @@ static int reiserfs_write_full_page(struct page *page, struct writeback_control
flush_dcache_page(page) ; flush_dcache_page(page) ;
kunmap_atomic(kaddr, KM_USER0) ; kunmap_atomic(kaddr, KM_USER0) ;
} }
head = page_buffers(page) ;
bh = head ; bh = head ;
block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits) ; block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits) ;
do { do {
/* if this offset in the page is outside the file */ get_bh(bh);
if (cur_offset >= last_offset) { if (buffer_dirty(bh)) {
if (!buffer_uptodate(bh))
partial = 1 ;
} else {
/* fast path, buffer mapped to an unformatted node */
if (buffer_mapped(bh) && bh->b_blocknr != 0) { if (buffer_mapped(bh) && bh->b_blocknr != 0) {
arr[nr++] = bh ; /* buffer mapped to an unformatted node */
lock_buffer_for_writepage(page, wbc, bh);
} else { } else {
/* buffer not mapped yet, or points to a direct item. /* not mapped yet, or it points to a direct item, search
** search and dirty or log * the btree for the mapping info, and log any direct
* items found
*/ */
if ((error = map_block_for_writepage(inode, bh, block))) { if ((error = map_block_for_writepage(inode, bh, block))) {
goto fail ; goto fail ;
} }
/* map_block_for_writepage either found an unformatted node
** and mapped it for us, or it found a direct item
** and logged the changes.
*/
if (buffer_mapped(bh) && bh->b_blocknr != 0) { if (buffer_mapped(bh) && bh->b_blocknr != 0) {
arr[nr++] = bh ; lock_buffer_for_writepage(page, wbc, bh);
} }
} }
} }
bh = bh->b_this_page ; bh = bh->b_this_page;
cur_offset += bh->b_size ; block++;
block++ ;
} while(bh != head) ; } while(bh != head) ;
if (!partial)
SetPageUptodate(page) ;
BUG_ON(PageWriteback(page)); BUG_ON(PageWriteback(page));
SetPageWriteback(page); SetPageWriteback(page);
unlock_page(page); unlock_page(page);
/* if this page only had a direct item, it is very possible for /*
** nr == 0 without there being any kind of error. * since any buffer might be the only dirty buffer on the page,
* the first submit_bh can bring the page out of writeback.
* be careful with the buffers.
*/ */
if (nr) { do {
submit_bh_for_writepage(arr, nr) ; struct buffer_head *next = bh->b_this_page;
} else { if (buffer_async_write(bh)) {
end_page_writeback(page) ; submit_bh(WRITE, bh);
nr++;
}
put_bh(bh);
bh = next;
} while(bh != head);
error = 0;
done:
if (nr == 0) {
/*
* if this page only had a direct item, it is very possible for
* no io to be required without there being an error. Or,
* someone else could have locked them and sent them down the
* pipe without locking the page
*/
do {
if (!buffer_uptodate(bh)) {
partial = 1;
break;
} }
} while(bh != head);
return 0 ; if (!partial)
SetPageUptodate(page);
end_page_writeback(page);
}
return error;
fail: fail:
if (nr) { /* catches various errors, we need to make sure any valid dirty blocks
SetPageWriteback(page); * get to the media. The page is currently locked and not marked for
unlock_page(page); * writeback
submit_bh_for_writepage(arr, nr) ; */
ClearPageUptodate(page);
bh = head;
do {
get_bh(bh);
if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else { } else {
unlock_page(page) ; /*
* clear any dirty bits that might have come from getting
* attached to a dirty page
*/
clear_buffer_dirty(bh);
} }
ClearPageUptodate(page) ; bh = bh->b_this_page;
return error ; } while(bh != head);
SetPageError(page);
BUG_ON(PageWriteback(page));
SetPageWriteback(page);
unlock_page(page);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
submit_bh(WRITE, bh);
nr++;
}
put_bh(bh);
bh = next;
} while(bh != head);
goto done;
} }
...@@ -2115,6 +2217,7 @@ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) ...@@ -2115,6 +2217,7 @@ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
struct address_space_operations reiserfs_address_space_operations = { struct address_space_operations reiserfs_address_space_operations = {
.writepage = reiserfs_writepage, .writepage = reiserfs_writepage,
.readpage = reiserfs_readpage, .readpage = reiserfs_readpage,
.readpages = reiserfs_readpages,
.releasepage = reiserfs_releasepage, .releasepage = reiserfs_releasepage,
.sync_page = block_sync_page, .sync_page = block_sync_page,
.prepare_write = reiserfs_prepare_write, .prepare_write = reiserfs_prepare_write,
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <linux/major.h> #include <linux/major.h>
#include <linux/genhd.h> #include <linux/genhd.h>
#include <linux/list.h> #include <linux/list.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/wait.h> #include <linux/wait.h>
...@@ -188,6 +190,14 @@ struct request_queue ...@@ -188,6 +190,14 @@ struct request_queue
unplug_fn *unplug_fn; unplug_fn *unplug_fn;
merge_bvec_fn *merge_bvec_fn; merge_bvec_fn *merge_bvec_fn;
/*
* Auto-unplugging state
*/
struct timer_list unplug_timer;
int unplug_thresh; /* After this many requests */
unsigned long unplug_delay; /* After this many jiffies */
struct work_struct unplug_work;
struct backing_dev_info backing_dev_info; struct backing_dev_info backing_dev_info;
/* /*
......
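A hedged sketch of how the new auto-unplug state above might be initialized and driven; only the field names come from the hunk, the example_* handlers are hypothetical.

static void example_unplug_work(void *data)		/* hypothetical */
{
	request_queue_t *q = data;
	q->unplug_fn(q);				/* actually run the queue */
}

static void example_unplug_timeout(unsigned long data)	/* hypothetical */
{
	request_queue_t *q = (request_queue_t *) data;
	schedule_work(&q->unplug_work);			/* can't unplug from timer/irq context */
}

static void example_init_unplug(request_queue_t *q)	/* hypothetical */
{
	INIT_WORK(&q->unplug_work, example_unplug_work, q);
	init_timer(&q->unplug_timer);
	q->unplug_timer.function = example_unplug_timeout;
	q->unplug_timer.data = (unsigned long) q;
	q->unplug_thresh = 4;			/* unplug once this many requests are queued */
	q->unplug_delay = (3 * HZ) / 1000;	/* ...or after ~3ms, whichever comes first */
	if (q->unplug_delay == 0)
		q->unplug_delay = 1;
}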
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
* indirection blocks, the group and superblock summaries, and the data * indirection blocks, the group and superblock summaries, and the data
* block to complete the transaction. */ * block to complete the transaction. */
#define EXT3_SINGLEDATA_TRANS_BLOCKS 8 #define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
/* Extended attributes may touch two data buffers, two bitmap buffers, /* Extended attributes may touch two data buffers, two bitmap buffers,
* and two group and summaries. */ * and two group and summaries. */
...@@ -58,7 +58,7 @@ extern int ext3_writepage_trans_blocks(struct inode *inode); ...@@ -58,7 +58,7 @@ extern int ext3_writepage_trans_blocks(struct inode *inode);
* start off at the maximum transaction size and grow the transaction * start off at the maximum transaction size and grow the transaction
* optimistically as we go. */ * optimistically as we go. */
#define EXT3_MAX_TRANS_DATA 64 #define EXT3_MAX_TRANS_DATA 64U
/* We break up a large truncate or write transaction once the handle's /* We break up a large truncate or write transaction once the handle's
* buffer credits gets this low, we need either to extend the * buffer credits gets this low, we need either to extend the
...@@ -67,7 +67,7 @@ extern int ext3_writepage_trans_blocks(struct inode *inode); ...@@ -67,7 +67,7 @@ extern int ext3_writepage_trans_blocks(struct inode *inode);
* one block, plus two quota updates. Quota allocations are not * one block, plus two quota updates. Quota allocations are not
* needed. */ * needed. */
#define EXT3_RESERVE_TRANS_BLOCKS 12 #define EXT3_RESERVE_TRANS_BLOCKS 12U
#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 #define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
......
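The U suffixes above are not cosmetic: these constants end up in comparisons against unsigned quantities through the strictly type-checked min()/max() macros, and a plain int literal would give the two sides different types. A minimal illustration (the function and variable names are made up):

static unsigned int example_trans_chunk(unsigned int needed)
{
	/* the type-checked min() requires both sides to be unsigned int,
	 * which is exactly what the U on EXT3_MAX_TRANS_DATA provides
	 */
	return min(needed, EXT3_MAX_TRANS_DATA);
}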
...@@ -20,16 +20,32 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *); ...@@ -20,16 +20,32 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
void huge_page_release(struct page *); void huge_page_release(struct page *);
int hugetlb_report_meminfo(char *); int hugetlb_report_meminfo(char *);
int is_hugepage_mem_enough(size_t); int is_hugepage_mem_enough(size_t);
struct page *follow_huge_addr(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write);
struct vm_area_struct *hugepage_vma(struct mm_struct *mm,
unsigned long address);
struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write);
int pmd_huge(pmd_t pmd);
extern int htlbpage_max; extern int htlbpage_max;
static inline void
mark_mm_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma)
{
if (is_vm_hugetlb_page(vma))
mm->used_hugetlb = 1;
}
#else /* !CONFIG_HUGETLB_PAGE */ #else /* !CONFIG_HUGETLB_PAGE */
static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
{ {
return 0; return 0;
} }
#define follow_hugetlb_page(m,v,p,vs,a,b,i) ({ BUG(); 0; }) #define follow_hugetlb_page(m,v,p,vs,a,b,i) ({ BUG(); 0; })
#define follow_huge_addr(mm, vma, addr, write) 0
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
#define zap_hugepage_range(vma, start, len) BUG() #define zap_hugepage_range(vma, start, len) BUG()
...@@ -37,6 +53,14 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) ...@@ -37,6 +53,14 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
#define huge_page_release(page) BUG() #define huge_page_release(page) BUG()
#define is_hugepage_mem_enough(size) 0 #define is_hugepage_mem_enough(size) 0
#define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_meminfo(buf) 0
#define hugepage_vma(mm, addr) 0
#define mark_mm_hugetlb(mm, vma) do { } while (0)
#define follow_huge_pmd(mm, addr, pmd, write) 0
#define pmd_huge(x) 0
#ifndef HPAGE_MASK
#define HPAGE_MASK 0 /* Keep the compiler happy */
#endif
#endif /* !CONFIG_HUGETLB_PAGE */ #endif /* !CONFIG_HUGETLB_PAGE */
......
...@@ -63,7 +63,38 @@ extern void * __jbd_kmalloc (const char *where, size_t size, int flags, int retr ...@@ -63,7 +63,38 @@ extern void * __jbd_kmalloc (const char *where, size_t size, int flags, int retr
#define JFS_MIN_JOURNAL_BLOCKS 1024 #define JFS_MIN_JOURNAL_BLOCKS 1024
#ifdef __KERNEL__ #ifdef __KERNEL__
/**
* typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
*
* All filesystem modifications made by the process go
* through this handle. Recursive operations (such as quota operations)
* are gathered into a single update.
*
* The buffer credits field is used to account for journaled buffers
* being modified by the running process. To ensure that there is
* enough log space for all outstanding operations, we need to limit the
* number of outstanding buffers possible at any time. When the
* operation completes, any buffer credits not used are credited back to
* the transaction, so that at all times we know how many buffers the
* outstanding updates on a transaction might possibly touch.
*
* This is an opaque datatype.
**/
typedef struct handle_s handle_t; /* Atomic operation type */ typedef struct handle_s handle_t; /* Atomic operation type */
/**
* typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
*
* journal_t is linked to from the fs superblock structure.
*
* We use the journal_t to keep track of all outstanding transaction
* activity on the filesystem, and to manage the state of the log
* writing process.
*
* This is an opaque datatype.
**/
typedef struct journal_s journal_t; /* Journal control structure */ typedef struct journal_s journal_t; /* Journal control structure */
#endif #endif
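To make the credit accounting described in the handle_t documentation concrete, here is a hedged sketch of a single-buffer update using the existing JBD calls (error handling trimmed to the essentials):

static int example_update_one_block(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	handle = journal_start(journal, 1);		/* reserve credit for one buffer */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = journal_get_write_access(handle, bh);	/* declare intent to modify bh */
	if (!err) {
		/* ... modify the buffer contents here ... */
		err = journal_dirty_metadata(handle, bh);	/* consumes one credit */
	}
	journal_stop(handle);				/* unused credits go back to the transaction */
	return err;
}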
...@@ -252,6 +283,20 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) ...@@ -252,6 +283,20 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
} }
#define HAVE_JOURNAL_CALLBACK_STATUS #define HAVE_JOURNAL_CALLBACK_STATUS
/**
* struct journal_callback - Base structure for callback information.
* @jcb_list: list information for other callbacks attached to the same handle.
* @jcb_func: Function to call with this callback structure.
*
 * This struct is a 'seed' structure for use with your own callback
* structs. If you are using callbacks you must allocate one of these
* or another struct of your own definition which has this struct
 * as its first element and pass it to journal_callback_set().
*
* This is used internally by jbd to maintain callback information.
*
* See journal_callback_set for more information.
**/
struct journal_callback { struct journal_callback {
struct list_head jcb_list; struct list_head jcb_list;
void (*jcb_func)(struct journal_callback *jcb, int error); void (*jcb_func)(struct journal_callback *jcb, int error);
...@@ -260,18 +305,21 @@ struct journal_callback { ...@@ -260,18 +305,21 @@ struct journal_callback {
struct jbd_revoke_table_s; struct jbd_revoke_table_s;
/* The handle_t type represents a single atomic update being performed /**
* by some process. All filesystem modifications made by the process go * struct handle_s - The handle_s type is the concrete type associated with handle_t.
* through this handle. Recursive operations (such as quota operations) * @h_transaction: Which compound transaction is this update a part of?
* are gathered into a single update. * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
* * @h_ref: Reference count on this handle
* The buffer credits field is used to account for journaled buffers * @h_jcb: List of application registered callbacks for this handle.
* being modified by the running process. To ensure that there is * @h_err: Field for caller's use to track errors through large fs operations
* enough log space for all outstanding operations, we need to limit the * @h_sync: flag for sync-on-close
* number of outstanding buffers possible at any time. When the * @h_jdata: flag to force data journaling
* operation completes, any buffer credits not used are credited back to * @h_aborted: flag indicating fatal error on handle
* the transaction, so that at all times we know how many buffers the **/
* outstanding updates on a transaction might possibly touch. */
/* Docbook can't yet cope with the bit fields, but we will leave the documentation
* in so it can be fixed later.
*/
struct handle_s struct handle_s
{ {
...@@ -284,8 +332,8 @@ struct handle_s ...@@ -284,8 +332,8 @@ struct handle_s
/* Reference count on this handle */ /* Reference count on this handle */
int h_ref; int h_ref;
/* Field for caller's use to track errors through large fs /* Field for caller's use to track errors through large fs */
operations */ /* operations */
int h_err; int h_err;
/* List of application registered callbacks for this handle. /* List of application registered callbacks for this handle.
...@@ -412,21 +460,58 @@ struct transaction_s ...@@ -412,21 +460,58 @@ struct transaction_s
struct list_head t_jcb; struct list_head t_jcb;
}; };
/**
/* The journal_t maintains all of the journaling state information for a * struct journal_s - The journal_s type is the concrete type associated with journal_t.
* single filesystem. It is linked to from the fs superblock structure. * @j_flags: General journaling state flags
* * @j_errno: Is there an outstanding uncleared error on the journal (from a prior abort)?
* We use the journal_t to keep track of all outstanding transaction * @j_sb_buffer: First part of superblock buffer
* activity on the filesystem, and to manage the state of the log * @j_superblock: Second part of superblock buffer
* writing process. */ * @j_format_version: Version of the superblock format
* @j_barrier_count: Number of processes waiting to create a barrier lock
* @j_barrier: The barrier lock itself
* @j_running_transaction: The current running transaction..
* @j_committing_transaction: the transaction we are pushing to disk
* @j_checkpoint_transactions: a linked circular list of all transactions waiting for checkpointing
* @j_wait_transaction_locked: Wait queue for waiting for a locked transaction to start committing, or for a barrier lock to be released
* @j_wait_logspace: Wait queue for waiting for checkpointing to complete
* @j_wait_done_commit: Wait queue for waiting for commit to complete
* @j_wait_checkpoint: Wait queue to trigger checkpointing
* @j_wait_commit: Wait queue to trigger commit
* @j_wait_updates: Wait queue to wait for updates to complete
* @j_checkpoint_sem: Semaphore for locking against concurrent checkpoints
* @j_sem: The main journal lock, used by lock_journal()
* @j_head: Journal head - identifies the first unused block in the journal
* @j_tail: Journal tail - identifies the oldest still-used block in the journal.
* @j_free: Journal free - how many free blocks are there in the journal?
* @j_first: The block number of the first usable block
* @j_last: The block number one beyond the last usable block
* @j_dev: Device where we store the journal
* @j_blocksize: blocksize for the location where we store the journal.
 * @j_blk_offset: starting block offset into the device where we store the journal
* @j_fs_dev: Device which holds the client fs. For internal journal this will be equal to j_dev
* @j_maxlen: Total maximum capacity of the journal region on disk.
* @j_inode: Optional inode where we store the journal. If present, all journal block numbers are mapped into this inode via bmap().
* @j_tail_sequence: Sequence number of the oldest transaction in the log
* @j_transaction_sequence: Sequence number of the next transaction to grant
* @j_commit_sequence: Sequence number of the most recently committed transaction
* @j_commit_request: Sequence number of the most recent transaction wanting commit
* @j_uuid: Uuid of client object.
* @j_task: Pointer to the current commit thread for this journal
* @j_max_transaction_buffers: Maximum number of metadata buffers to allow in a single compound commit transaction
* @j_commit_interval: What is the maximum transaction lifetime before we begin a commit?
* @j_commit_timer: The timer used to wakeup the commit thread
* @j_commit_timer_active: Timer flag
* @j_all_journals: Link all journals together - system-wide
* @j_revoke: The revoke table - maintains the list of revoked blocks in the current transaction.
**/
struct journal_s struct journal_s
{ {
/* General journaling state flags */ /* General journaling state flags */
unsigned long j_flags; unsigned long j_flags;
/* Is there an outstanding uncleared error on the journal (from /* Is there an outstanding uncleared error on the journal (from */
* a prior abort)? */ /* a prior abort)? */
int j_errno; int j_errno;
/* The superblock buffer */ /* The superblock buffer */
...@@ -448,13 +533,13 @@ struct journal_s ...@@ -448,13 +533,13 @@ struct journal_s
/* ... the transaction we are pushing to disk ... */ /* ... the transaction we are pushing to disk ... */
transaction_t * j_committing_transaction; transaction_t * j_committing_transaction;
/* ... and a linked circular list of all transactions waiting /* ... and a linked circular list of all transactions waiting */
* for checkpointing. */ /* for checkpointing. */
/* Protected by journal_datalist_lock */ /* Protected by journal_datalist_lock */
transaction_t * j_checkpoint_transactions; transaction_t * j_checkpoint_transactions;
/* Wait queue for waiting for a locked transaction to start /* Wait queue for waiting for a locked transaction to start */
committing, or for a barrier lock to be released */ /* committing, or for a barrier lock to be released */
wait_queue_head_t j_wait_transaction_locked; wait_queue_head_t j_wait_transaction_locked;
/* Wait queue for waiting for checkpointing to complete */ /* Wait queue for waiting for checkpointing to complete */
...@@ -481,33 +566,33 @@ struct journal_s ...@@ -481,33 +566,33 @@ struct journal_s
/* Journal head: identifies the first unused block in the journal. */ /* Journal head: identifies the first unused block in the journal. */
unsigned long j_head; unsigned long j_head;
/* Journal tail: identifies the oldest still-used block in the /* Journal tail: identifies the oldest still-used block in the */
* journal. */ /* journal. */
unsigned long j_tail; unsigned long j_tail;
/* Journal free: how many free blocks are there in the journal? */ /* Journal free: how many free blocks are there in the journal? */
unsigned long j_free; unsigned long j_free;
/* Journal start and end: the block numbers of the first usable /* Journal start and end: the block numbers of the first usable */
* block and one beyond the last usable block in the journal. */ /* block and one beyond the last usable block in the journal. */
unsigned long j_first, j_last; unsigned long j_first, j_last;
/* Device, blocksize and starting block offset for the location /* Device, blocksize and starting block offset for the location */
* where we store the journal. */ /* where we store the journal. */
struct block_device * j_dev; struct block_device * j_dev;
int j_blocksize; int j_blocksize;
unsigned int j_blk_offset; unsigned int j_blk_offset;
/* Device which holds the client fs. For internal journal this /* Device which holds the client fs. For internal journal this */
* will be equal to j_dev. */ /* will be equal to j_dev. */
struct block_device * j_fs_dev; struct block_device * j_fs_dev;
/* Total maximum capacity of the journal region on disk. */ /* Total maximum capacity of the journal region on disk. */
unsigned int j_maxlen; unsigned int j_maxlen;
/* Optional inode where we store the journal. If present, all /* Optional inode where we store the journal. If present, all */
* journal block numbers are mapped into this inode via /* journal block numbers are mapped into this inode via */
* bmap(). */ /* bmap(). */
struct inode * j_inode; struct inode * j_inode;
/* Sequence number of the oldest transaction in the log */ /* Sequence number of the oldest transaction in the log */
...@@ -519,23 +604,23 @@ struct journal_s ...@@ -519,23 +604,23 @@ struct journal_s
/* Sequence number of the most recent transaction wanting commit */ /* Sequence number of the most recent transaction wanting commit */
tid_t j_commit_request; tid_t j_commit_request;
/* Journal uuid: identifies the object (filesystem, LVM volume /* Journal uuid: identifies the object (filesystem, LVM volume */
* etc) backed by this journal. This will eventually be /* etc) backed by this journal. This will eventually be */
* replaced by an array of uuids, allowing us to index multiple /* replaced by an array of uuids, allowing us to index multiple */
* devices within a single journal and to perform atomic updates /* devices within a single journal and to perform atomic updates */
* across them. */ /* across them. */
__u8 j_uuid[16]; __u8 j_uuid[16];
/* Pointer to the current commit thread for this journal */ /* Pointer to the current commit thread for this journal */
struct task_struct * j_task; struct task_struct * j_task;
/* Maximum number of metadata buffers to allow in a single /* Maximum number of metadata buffers to allow in a single */
* compound commit transaction */ /* compound commit transaction */
int j_max_transaction_buffers; int j_max_transaction_buffers;
/* What is the maximum transaction lifetime before we begin a /* What is the maximum transaction lifetime before we begin a */
* commit? */ /* commit? */
unsigned long j_commit_interval; unsigned long j_commit_interval;
/* The timer used to wakeup the commit thread: */ /* The timer used to wakeup the commit thread: */
...@@ -545,8 +630,8 @@ struct journal_s ...@@ -545,8 +630,8 @@ struct journal_s
/* Link all journals together - system-wide */ /* Link all journals together - system-wide */
struct list_head j_all_journals; struct list_head j_all_journals;
/* The revoke table: maintains the list of revoked blocks in the /* The revoke table: maintains the list of revoked blocks in the */
current transaction. */ /* current transaction. */
struct jbd_revoke_table_s *j_revoke; struct jbd_revoke_table_s *j_revoke;
}; };
......
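Following the struct journal_callback documentation added earlier in this file, a hedged sketch of the embedding it describes; the my_* names are hypothetical, and journal_callback_set() is assumed to take the handle, the callback function, and the embedded callback structure.

struct my_commit_data {
	struct journal_callback jcb;	/* must be the first member */
	struct inode *inode;		/* whatever the callback needs */
};

/* runs once the handle's transaction has committed; error is non-zero
 * if the journal was aborted instead
 */
static void my_commit_done(struct journal_callback *jcb, int error)
{
	struct my_commit_data *d = (struct my_commit_data *) jcb;

	/* ... finish the deferred work against d->inode ... */
}

static void my_register_callback(handle_t *handle, struct my_commit_data *d)
{
	journal_callback_set(handle, my_commit_done, &d->jcb);
}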
...@@ -208,24 +208,55 @@ struct page { ...@@ -208,24 +208,55 @@ struct page {
* Also, many kernel routines increase the page count before a critical * Also, many kernel routines increase the page count before a critical
* routine so they can be sure the page doesn't go away from under them. * routine so they can be sure the page doesn't go away from under them.
*/ */
#define get_page(p) atomic_inc(&(p)->count)
#define __put_page(p) atomic_dec(&(p)->count)
#define put_page_testzero(p) \ #define put_page_testzero(p) \
({ \ ({ \
BUG_ON(page_count(page) == 0); \ BUG_ON(page_count(page) == 0); \
atomic_dec_and_test(&(p)->count); \ atomic_dec_and_test(&(p)->count); \
}) })
#define page_count(p) atomic_read(&(p)->count) #define page_count(p) atomic_read(&(p)->count)
#define set_page_count(p,v) atomic_set(&(p)->count, v) #define set_page_count(p,v) atomic_set(&(p)->count, v)
#define __put_page(p) atomic_dec(&(p)->count)
extern void FASTCALL(__page_cache_release(struct page *)); extern void FASTCALL(__page_cache_release(struct page *));
#ifdef CONFIG_HUGETLB_PAGE
static inline void get_page(struct page *page)
{
if (PageCompound(page))
page = (struct page *)page->lru.next;
atomic_inc(&page->count);
}
static inline void put_page(struct page *page)
{
if (PageCompound(page)) {
page = (struct page *)page->lru.next;
if (page->lru.prev) { /* destructor? */
(*(void (*)(struct page *))page->lru.prev)(page);
return;
}
}
if (!PageReserved(page) && put_page_testzero(page))
__page_cache_release(page);
}
#else /* CONFIG_HUGETLB_PAGE */
static inline void get_page(struct page *page)
{
atomic_inc(&page->count);
}
static inline void put_page(struct page *page) static inline void put_page(struct page *page)
{ {
if (!PageReserved(page) && put_page_testzero(page)) if (!PageReserved(page) && put_page_testzero(page))
__page_cache_release(page); __page_cache_release(page);
} }
#endif /* CONFIG_HUGETLB_PAGE */
/* /*
* Multiple processes may "see" the same page. E.g. for untouched * Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of * mappings of /dev/null, all processes see the same page full of
......
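The get_page()/put_page() variants above rely on every sub-page of a compound page carrying a pointer back to the head page in lru.next, with an optional destructor stashed in the head page's lru.prev. A hedged sketch of how such a page could be wired up so those helpers work; the prep function is hypothetical and not part of this patch.

static void example_prep_compound_page(struct page *head, int nr_pages,
					void (*dtor)(struct page *))
{
	int i;

	head->lru.prev = (void *) dtor;		/* destructor, called on the last put */
	for (i = 0; i < nr_pages; i++) {
		struct page *p = head + i;

		SetPageCompound(p);
		p->lru.next = (void *) head;	/* every sub-page points at the head */
	}
}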
...@@ -72,7 +72,8 @@ ...@@ -72,7 +72,8 @@
#define PG_direct 16 /* ->pte_chain points directly at pte */ #define PG_direct 16 /* ->pte_chain points directly at pte */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be recalimed asap */ #define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
/* /*
* Global page accounting. One instance per CPU. Only unsigned longs are * Global page accounting. One instance per CPU. Only unsigned longs are
...@@ -251,6 +252,10 @@ extern void get_full_page_state(struct page_state *ret); ...@@ -251,6 +252,10 @@ extern void get_full_page_state(struct page_state *ret);
#define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags) #define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags)
#define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags)
#define PageCompound(page) test_bit(PG_compound, &(page)->flags)
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
/* /*
* The PageSwapCache predicate doesn't use a PG_flag at this time, * The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day. * but it may again do so one day.
......
...@@ -201,7 +201,9 @@ struct mm_struct { ...@@ -201,7 +201,9 @@ struct mm_struct {
unsigned long swap_address; unsigned long swap_address;
unsigned dumpable:1; unsigned dumpable:1;
#ifdef CONFIG_HUGETLB_PAGE
int used_hugetlb;
#endif
/* Architecture-specific MM context */ /* Architecture-specific MM context */
mm_context_t context; mm_context_t context;
......
...@@ -37,30 +37,120 @@ ...@@ -37,30 +37,120 @@
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#include <asm/spinlock.h> #include <asm/spinlock.h>
/* #else
* !CONFIG_SMP and spin_lock_init not previously defined
* (e.g. by including include/asm/spinlock.h)
*/
#elif !defined(spin_lock_init)
#ifndef CONFIG_PREEMPT #if !defined(CONFIG_PREEMPT) && !defined(CONFIG_DEBUG_SPINLOCK)
# define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic) # define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
# define ATOMIC_DEC_AND_LOCK # define ATOMIC_DEC_AND_LOCK
#endif #endif
#ifdef CONFIG_DEBUG_SPINLOCK
#define SPINLOCK_MAGIC 0x1D244B3C
typedef struct {
unsigned long magic;
volatile unsigned long lock;
volatile unsigned int babble;
const char *module;
char *owner;
int oline;
} spinlock_t;
#define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0}
#define spin_lock_init(x) \
do { \
(x)->magic = SPINLOCK_MAGIC; \
(x)->lock = 0; \
(x)->babble = 5; \
(x)->module = __FILE__; \
(x)->owner = NULL; \
(x)->oline = 0; \
} while (0)
#define CHECK_LOCK(x) \
do { \
if ((x)->magic != SPINLOCK_MAGIC) { \
printk(KERN_ERR "%s:%d: spin_is_locked on uninitialized spinlock %p.\n", \
__FILE__, __LINE__, (x)); \
} \
} while(0)
#define _raw_spin_lock(x) \
do { \
CHECK_LOCK(x); \
if ((x)->lock&&(x)->babble) { \
printk("%s:%d: spin_lock(%s:%p) already locked by %s/%d\n", \
__FILE__,__LINE__, (x)->module, \
(x), (x)->owner, (x)->oline); \
(x)->babble--; \
} \
(x)->lock = 1; \
(x)->owner = __FILE__; \
(x)->oline = __LINE__; \
} while (0)
/* without debugging, spin_is_locked on UP always says
* FALSE. --> printk if already locked. */
#define spin_is_locked(x) \
({ \
CHECK_LOCK(x); \
if ((x)->lock&&(x)->babble) { \
printk("%s:%d: spin_is_locked(%s:%p) already locked by %s/%d\n", \
__FILE__,__LINE__, (x)->module, \
(x), (x)->owner, (x)->oline); \
(x)->babble--; \
} \
0; \
})
/* without debugging, spin_trylock on UP always says
* TRUE. --> printk if already locked. */
#define _raw_spin_trylock(x) \
({ \
CHECK_LOCK(x); \
if ((x)->lock&&(x)->babble) { \
printk("%s:%d: spin_trylock(%s:%p) already locked by %s/%d\n", \
__FILE__,__LINE__, (x)->module, \
(x), (x)->owner, (x)->oline); \
(x)->babble--; \
} \
(x)->lock = 1; \
(x)->owner = __FILE__; \
(x)->oline = __LINE__; \
1; \
})
#define spin_unlock_wait(x) \
do { \
CHECK_LOCK(x); \
if ((x)->lock&&(x)->babble) { \
printk("%s:%d: spin_unlock_wait(%s:%p) owned by %s/%d\n", \
__FILE__,__LINE__, (x)->module, (x), \
(x)->owner, (x)->oline); \
(x)->babble--; \
}\
} while (0)
#define _raw_spin_unlock(x) \
do { \
CHECK_LOCK(x); \
if (!(x)->lock&&(x)->babble) { \
printk("%s:%d: spin_unlock(%s:%p) not locked\n", \
__FILE__,__LINE__, (x)->module, (x));\
(x)->babble--; \
} \
(x)->lock = 0; \
} while (0)
#else
/* /*
* gcc versions before ~2.95 have a nasty bug with empty initializers. * gcc versions before ~2.95 have a nasty bug with empty initializers.
*/ */
#if (__GNUC__ > 2) #if (__GNUC__ > 2)
typedef struct { } spinlock_t; typedef struct { } spinlock_t;
typedef struct { } rwlock_t;
#define SPIN_LOCK_UNLOCKED (spinlock_t) { } #define SPIN_LOCK_UNLOCKED (spinlock_t) { }
#define RW_LOCK_UNLOCKED (rwlock_t) { }
#else #else
typedef struct { int gcc_is_buggy; } spinlock_t; typedef struct { int gcc_is_buggy; } spinlock_t;
typedef struct { int gcc_is_buggy; } rwlock_t;
#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
#endif #endif
/* /*
...@@ -72,6 +162,18 @@ ...@@ -72,6 +162,18 @@
#define _raw_spin_trylock(lock) ((void)(lock), 1) #define _raw_spin_trylock(lock) ((void)(lock), 1)
#define spin_unlock_wait(lock) do { (void)(lock); } while(0) #define spin_unlock_wait(lock) do { (void)(lock); } while(0)
#define _raw_spin_unlock(lock) do { (void)(lock); } while(0) #define _raw_spin_unlock(lock) do { (void)(lock); } while(0)
#endif /* CONFIG_DEBUG_SPINLOCK */
/* RW spinlocks: No debug version */
#if (__GNUC__ > 2)
typedef struct { } rwlock_t;
#define RW_LOCK_UNLOCKED (rwlock_t) { }
#else
typedef struct { int gcc_is_buggy; } rwlock_t;
#define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
#endif
#define rwlock_init(lock) do { (void)(lock); } while(0) #define rwlock_init(lock) do { (void)(lock); } while(0)
#define _raw_read_lock(lock) do { (void)(lock); } while(0) #define _raw_read_lock(lock) do { (void)(lock); } while(0)
#define _raw_read_unlock(lock) do { (void)(lock); } while(0) #define _raw_read_unlock(lock) do { (void)(lock); } while(0)
......
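What the new UP debug spinlock buys in practice: instead of silently compiling the locks away, an obvious double lock or stray unlock is reported via printk. A small hedged example built only from the macros defined above:

static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;

static void example(void)
{
	spin_lock(&example_lock);
	spin_lock(&example_lock);	/* with CONFIG_DEBUG_SPINLOCK on UP this printks
					 * "already locked by ..." instead of deadlocking */
	spin_unlock(&example_lock);
	spin_unlock(&example_lock);	/* second unlock printks "not locked" */
}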
...@@ -177,6 +177,7 @@ static int worker_thread(void *__startup) ...@@ -177,6 +177,7 @@ static int worker_thread(void *__startup)
current->flags |= PF_IOTHREAD; current->flags |= PF_IOTHREAD;
cwq->thread = current; cwq->thread = current;
set_user_nice(current, -10);
set_cpus_allowed(current, 1UL << cpu); set_cpus_allowed(current, 1UL << cpu);
spin_lock_irq(&current->sig->siglock); spin_lock_irq(&current->sig->siglock);
......
...@@ -259,9 +259,10 @@ void wait_on_page_bit(struct page *page, int bit_nr) ...@@ -259,9 +259,10 @@ void wait_on_page_bit(struct page *page, int bit_nr)
do { do {
prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE);
if (test_bit(bit_nr, &page->flags)) {
sync_page(page); sync_page(page);
if (test_bit(bit_nr, &page->flags))
io_schedule(); io_schedule();
}
} while (test_bit(bit_nr, &page->flags)); } while (test_bit(bit_nr, &page->flags));
finish_wait(waitqueue, &wait); finish_wait(waitqueue, &wait);
} }
...@@ -326,10 +327,11 @@ void __lock_page(struct page *page) ...@@ -326,10 +327,11 @@ void __lock_page(struct page *page)
while (TestSetPageLocked(page)) { while (TestSetPageLocked(page)) {
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
if (PageLocked(page)) {
sync_page(page); sync_page(page);
if (PageLocked(page))
io_schedule(); io_schedule();
} }
}
finish_wait(wqh, &wait); finish_wait(wqh, &wait);
} }
EXPORT_SYMBOL(__lock_page); EXPORT_SYMBOL(__lock_page);
......
...@@ -53,8 +53,11 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -53,8 +53,11 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *pte, entry; pte_t *pte, entry;
pgd_t *pgd; pgd_t *pgd;
pmd_t *pmd; pmd_t *pmd;
struct pte_chain *pte_chain = NULL; struct pte_chain *pte_chain;
pte_chain = pte_chain_alloc(GFP_KERNEL);
if (!pte_chain)
goto err;
pgd = pgd_offset(mm, addr); pgd = pgd_offset(mm, addr);
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
...@@ -62,7 +65,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -62,7 +65,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pmd) if (!pmd)
goto err_unlock; goto err_unlock;
pte_chain = pte_chain_alloc(GFP_KERNEL);
pte = pte_alloc_map(mm, pmd, addr); pte = pte_alloc_map(mm, pmd, addr);
if (!pte) if (!pte)
goto err_unlock; goto err_unlock;
...@@ -87,6 +89,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -87,6 +89,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
err_unlock: err_unlock:
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
pte_chain_free(pte_chain); pte_chain_free(pte_chain);
err:
return err; return err;
} }
......
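The reordering above follows the rule this hunk depends on: pte_chain_alloc(GFP_KERNEL) may sleep, so it has to happen before page_table_lock is taken, and a failed allocation now has a sane exit path. A minimal restatement of the pattern (not the actual install_page function):

static int example_prealloc_then_lock(struct mm_struct *mm)
{
	struct pte_chain *pte_chain;
	int err = -ENOMEM;

	pte_chain = pte_chain_alloc(GFP_KERNEL);	/* may sleep: do it unlocked */
	if (!pte_chain)
		goto out;

	spin_lock(&mm->page_table_lock);		/* no sleeping from here on */
	/* ... install the pte, consuming pte_chain as needed ... */
	err = 0;
	spin_unlock(&mm->page_table_lock);

	pte_chain_free(pte_chain);			/* frees whatever was not consumed */
out:
	return err;
}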
...@@ -607,13 +607,22 @@ follow_page(struct mm_struct *mm, unsigned long address, int write) ...@@ -607,13 +607,22 @@ follow_page(struct mm_struct *mm, unsigned long address, int write)
pmd_t *pmd; pmd_t *pmd;
pte_t *ptep, pte; pte_t *ptep, pte;
unsigned long pfn; unsigned long pfn;
struct vm_area_struct *vma;
vma = hugepage_vma(mm, address);
if (vma)
return follow_huge_addr(mm, vma, address, write);
pgd = pgd_offset(mm, address); pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || pgd_bad(*pgd)) if (pgd_none(*pgd) || pgd_bad(*pgd))
goto out; goto out;
pmd = pmd_offset(pgd, address); pmd = pmd_offset(pgd, address);
if (pmd_none(*pmd) || pmd_bad(*pmd)) if (pmd_none(*pmd))
goto out;
if (pmd_huge(*pmd))
return follow_huge_pmd(mm, address, pmd, write);
if (pmd_bad(*pmd))
goto out; goto out;
ptep = pte_offset_map(pmd, address); ptep = pte_offset_map(pmd, address);
...@@ -926,9 +935,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, ...@@ -926,9 +935,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
struct page *old_page, *new_page; struct page *old_page, *new_page;
unsigned long pfn = pte_pfn(pte); unsigned long pfn = pte_pfn(pte);
struct pte_chain *pte_chain = NULL; struct pte_chain *pte_chain = NULL;
int ret;
if (!pfn_valid(pfn)) if (unlikely(!pfn_valid(pfn))) {
goto bad_wp_page; /*
* This should really halt the system so it can be debugged or
* at least the kernel stops what it's doing before it corrupts
* data, but for the moment just pretend this is OOM.
*/
pte_unmap(page_table);
printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
address);
goto oom;
}
old_page = pfn_to_page(pfn); old_page = pfn_to_page(pfn);
if (!TestSetPageLocked(old_page)) { if (!TestSetPageLocked(old_page)) {
...@@ -936,10 +955,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, ...@@ -936,10 +955,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
unlock_page(old_page); unlock_page(old_page);
if (reuse) { if (reuse) {
flush_cache_page(vma, address); flush_cache_page(vma, address);
establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); establish_pte(vma, address, page_table,
pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
pte_unmap(page_table); pte_unmap(page_table);
spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR;
return VM_FAULT_MINOR; goto out;
} }
} }
pte_unmap(page_table); pte_unmap(page_table);
...@@ -950,11 +970,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, ...@@ -950,11 +970,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
page_cache_get(old_page); page_cache_get(old_page);
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
pte_chain = pte_chain_alloc(GFP_KERNEL);
if (!pte_chain)
goto no_mem;
new_page = alloc_page(GFP_HIGHUSER); new_page = alloc_page(GFP_HIGHUSER);
if (!new_page) if (!new_page)
goto no_mem; goto no_mem;
copy_cow_page(old_page,new_page,address); copy_cow_page(old_page,new_page,address);
pte_chain = pte_chain_alloc(GFP_KERNEL);
/* /*
* Re-check the pte - we dropped the lock * Re-check the pte - we dropped the lock
...@@ -973,25 +995,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, ...@@ -973,25 +995,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
new_page = old_page; new_page = old_page;
} }
pte_unmap(page_table); pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
page_cache_release(new_page); page_cache_release(new_page);
page_cache_release(old_page); page_cache_release(old_page);
pte_chain_free(pte_chain); ret = VM_FAULT_MINOR;
return VM_FAULT_MINOR; goto out;
bad_wp_page:
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address);
/*
* This should really halt the system so it can be debugged or
* at least the kernel stops what it's doing before it corrupts
* data, but for the moment just pretend this is OOM.
*/
return VM_FAULT_OOM;
no_mem: no_mem:
page_cache_release(old_page); page_cache_release(old_page);
return VM_FAULT_OOM; oom:
ret = VM_FAULT_OOM;
out:
spin_unlock(&mm->page_table_lock);
pte_chain_free(pte_chain);
return ret;
} }
static void vmtruncate_list(struct list_head *head, unsigned long pgoff) static void vmtruncate_list(struct list_head *head, unsigned long pgoff)
...@@ -1286,6 +1302,7 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1286,6 +1302,7 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page * new_page; struct page * new_page;
pte_t entry; pte_t entry;
struct pte_chain *pte_chain; struct pte_chain *pte_chain;
int ret;
if (!vma->vm_ops || !vma->vm_ops->nopage) if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table, return do_anonymous_page(mm, vma, page_table,
...@@ -1301,6 +1318,10 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1301,6 +1318,10 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (new_page == NOPAGE_OOM) if (new_page == NOPAGE_OOM)
return VM_FAULT_OOM; return VM_FAULT_OOM;
pte_chain = pte_chain_alloc(GFP_KERNEL);
if (!pte_chain)
goto oom;
/* /*
* Should we do an early C-O-W break? * Should we do an early C-O-W break?
*/ */
...@@ -1308,7 +1329,7 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1308,7 +1329,7 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page * page = alloc_page(GFP_HIGHUSER); struct page * page = alloc_page(GFP_HIGHUSER);
if (!page) { if (!page) {
page_cache_release(new_page); page_cache_release(new_page);
return VM_FAULT_OOM; goto oom;
} }
copy_user_highpage(page, new_page, address); copy_user_highpage(page, new_page, address);
page_cache_release(new_page); page_cache_release(new_page);
...@@ -1316,7 +1337,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1316,7 +1337,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
new_page = page; new_page = page;
} }
pte_chain = pte_chain_alloc(GFP_KERNEL);
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address); page_table = pte_offset_map(pmd, address);
...@@ -1346,15 +1366,20 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1346,15 +1366,20 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap(page_table); pte_unmap(page_table);
page_cache_release(new_page); page_cache_release(new_page);
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
pte_chain_free(pte_chain); ret = VM_FAULT_MINOR;
return VM_FAULT_MINOR; goto out;
} }
/* no need to invalidate: a not-present page shouldn't be cached */ /* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry); update_mmu_cache(vma, address, entry);
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
ret = VM_FAULT_MAJOR;
goto out;
oom:
ret = VM_FAULT_OOM;
out:
pte_chain_free(pte_chain); pte_chain_free(pte_chain);
return VM_FAULT_MAJOR; return ret;
} }
/* /*
...@@ -1422,6 +1447,10 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, ...@@ -1422,6 +1447,10 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
pgd = pgd_offset(mm, address); pgd = pgd_offset(mm, address);
inc_page_state(pgfault); inc_page_state(pgfault);
if (is_vm_hugetlb_page(vma))
return VM_FAULT_SIGBUS; /* mapping truncation does this. */
/* /*
* We need the page table lock to synchronize with kswapd * We need the page table lock to synchronize with kswapd
* and the SMP-safe atomic PTE updates. * and the SMP-safe atomic PTE updates.
......
...@@ -362,6 +362,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -362,6 +362,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
if (mapping) if (mapping)
up(&mapping->i_shared_sem); up(&mapping->i_shared_sem);
mark_mm_hugetlb(mm, vma);
mm->map_count++; mm->map_count++;
validate_mm(mm); validate_mm(mm);
} }
...@@ -1222,6 +1223,11 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) ...@@ -1222,6 +1223,11 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0; return 0;
/* we have start < mpnt->vm_end */ /* we have start < mpnt->vm_end */
if (is_vm_hugetlb_page(mpnt)) {
if ((start & ~HPAGE_MASK) || (len & ~HPAGE_MASK))
return -EINVAL;
}
/* if it doesn't overlap, we have nothing.. */ /* if it doesn't overlap, we have nothing.. */
end = start + len; end = start + len;
if (mpnt->vm_start >= end) if (mpnt->vm_start >= end)
...@@ -1423,7 +1429,6 @@ void exit_mmap(struct mm_struct *mm) ...@@ -1423,7 +1429,6 @@ void exit_mmap(struct mm_struct *mm)
kmem_cache_free(vm_area_cachep, vma); kmem_cache_free(vm_area_cachep, vma);
vma = next; vma = next;
} }
} }
/* Insert vm structure into process list sorted by address /* Insert vm structure into process list sorted by address
......
...@@ -24,9 +24,9 @@ ...@@ -24,9 +24,9 @@
static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
{ {
pgd_t * pgd; pgd_t *pgd;
pmd_t * pmd; pmd_t *pmd;
pte_t * pte = NULL; pte_t *pte = NULL;
pgd = pgd_offset(mm, addr); pgd = pgd_offset(mm, addr);
if (pgd_none(*pgd)) if (pgd_none(*pgd))
...@@ -73,8 +73,8 @@ static inline int page_table_present(struct mm_struct *mm, unsigned long addr) ...@@ -73,8 +73,8 @@ static inline int page_table_present(struct mm_struct *mm, unsigned long addr)
static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
{ {
pmd_t * pmd; pmd_t *pmd;
pte_t * pte = NULL; pte_t *pte = NULL;
pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr);
if (pmd) if (pmd)
...@@ -88,7 +88,7 @@ copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, ...@@ -88,7 +88,7 @@ copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst,
{ {
int error = 0; int error = 0;
pte_t pte; pte_t pte;
struct page * page = NULL; struct page *page = NULL;
if (pte_present(*src)) if (pte_present(*src))
page = pte_page(*src); page = pte_page(*src);
...@@ -183,12 +183,12 @@ static int move_page_tables(struct vm_area_struct *vma, ...@@ -183,12 +183,12 @@ static int move_page_tables(struct vm_area_struct *vma,
return -1; return -1;
} }
static unsigned long move_vma(struct vm_area_struct * vma, static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long addr, unsigned long old_len, unsigned long new_len,
unsigned long new_addr) unsigned long new_addr)
{ {
struct mm_struct * mm = vma->vm_mm; struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct * new_vma, * next, * prev; struct vm_area_struct *new_vma, *next, *prev;
int allocated_vma; int allocated_vma;
int split = 0; int split = 0;
...@@ -196,14 +196,16 @@ static unsigned long move_vma(struct vm_area_struct * vma, ...@@ -196,14 +196,16 @@ static unsigned long move_vma(struct vm_area_struct * vma,
next = find_vma_prev(mm, new_addr, &prev); next = find_vma_prev(mm, new_addr, &prev);
if (next) { if (next) {
if (prev && prev->vm_end == new_addr && if (prev && prev->vm_end == new_addr &&
can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
!(vma->vm_flags & VM_SHARED)) {
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
prev->vm_end = new_addr + new_len; prev->vm_end = new_addr + new_len;
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
new_vma = prev; new_vma = prev;
if (next != prev->vm_next) if (next != prev->vm_next)
BUG(); BUG();
if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { if (prev->vm_end == next->vm_start &&
can_vma_merge(next, prev->vm_flags)) {
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
prev->vm_end = next->vm_end; prev->vm_end = next->vm_end;
__vma_unlink(mm, next, prev); __vma_unlink(mm, next, prev);
...@@ -214,7 +216,8 @@ static unsigned long move_vma(struct vm_area_struct * vma, ...@@ -214,7 +216,8 @@ static unsigned long move_vma(struct vm_area_struct * vma,
kmem_cache_free(vm_area_cachep, next); kmem_cache_free(vm_area_cachep, next);
} }
} else if (next->vm_start == new_addr + new_len && } else if (next->vm_start == new_addr + new_len &&
can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { can_vma_merge(next, vma->vm_flags) &&
!vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
next->vm_start = new_addr; next->vm_start = new_addr;
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
...@@ -223,7 +226,8 @@ static unsigned long move_vma(struct vm_area_struct * vma, ...@@ -223,7 +226,8 @@ static unsigned long move_vma(struct vm_area_struct * vma,
} else { } else {
prev = find_vma(mm, new_addr-1); prev = find_vma(mm, new_addr-1);
if (prev && prev->vm_end == new_addr && if (prev && prev->vm_end == new_addr &&
can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
!(vma->vm_flags & VM_SHARED)) {
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
prev->vm_end = new_addr + new_len; prev->vm_end = new_addr + new_len;
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
...@@ -249,7 +253,7 @@ static unsigned long move_vma(struct vm_area_struct * vma, ...@@ -249,7 +253,7 @@ static unsigned long move_vma(struct vm_area_struct * vma,
INIT_LIST_HEAD(&new_vma->shared); INIT_LIST_HEAD(&new_vma->shared);
new_vma->vm_start = new_addr; new_vma->vm_start = new_addr;
new_vma->vm_end = new_addr+new_len; new_vma->vm_end = new_addr+new_len;
new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT;
if (new_vma->vm_file) if (new_vma->vm_file)
get_file(new_vma->vm_file); get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open) if (new_vma->vm_ops && new_vma->vm_ops->open)
...@@ -428,7 +432,8 @@ unsigned long do_mremap(unsigned long addr, ...@@ -428,7 +432,8 @@ unsigned long do_mremap(unsigned long addr,
if (vma->vm_flags & VM_SHARED) if (vma->vm_flags & VM_SHARED)
map_flags |= MAP_SHARED; map_flags |= MAP_SHARED;
new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
vma->vm_pgoff, map_flags);
ret = new_addr; ret = new_addr;
if (new_addr & ~PAGE_MASK) if (new_addr & ~PAGE_MASK)
goto out; goto out;
......
...@@ -237,7 +237,6 @@ static void background_writeout(unsigned long _min_pages) ...@@ -237,7 +237,6 @@ static void background_writeout(unsigned long _min_pages)
break; break;
} }
} }
blk_run_queues();
} }
/* /*
...@@ -308,7 +307,6 @@ static void wb_kupdate(unsigned long arg) ...@@ -308,7 +307,6 @@ static void wb_kupdate(unsigned long arg)
} }
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
} }
blk_run_queues();
if (time_before(next_jif, jiffies + HZ)) if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ; next_jif = jiffies + HZ;
mod_timer(&wb_timer, next_jif); mod_timer(&wb_timer, next_jif);
......
...@@ -85,6 +85,62 @@ static void bad_page(const char *function, struct page *page) ...@@ -85,6 +85,62 @@ static void bad_page(const char *function, struct page *page)
page->mapping = NULL; page->mapping = NULL;
} }
#ifndef CONFIG_HUGETLB_PAGE
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
#else
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
* The first PAGE_SIZE page is called the "head page".
*
* The remaining PAGE_SIZE pages are called "tail pages".
*
* All pages have PG_compound set. All pages have their lru.next pointing at
* the head page (even the head page has this).
*
* The head page's lru.prev, if non-zero, holds the address of the compound
* page's put_page() function.
*
* The order of the allocation is stored in the first tail page's lru.prev.
* This is only for debug at present. This usage means that zero-order pages
* may not be compound.
*/
static void prep_compound_page(struct page *page, int order)
{
int i;
int nr_pages = 1 << order;
page->lru.prev = NULL;
page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
SetPageCompound(p);
p->lru.next = (void *)page;
}
}
static void destroy_compound_page(struct page *page, int order)
{
int i;
int nr_pages = 1 << order;
if (page[1].lru.prev != (void *)order)
bad_page(__FUNCTION__, page);
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
if (!PageCompound(p))
bad_page(__FUNCTION__, page);
if (p->lru.next != (void *)page)
bad_page(__FUNCTION__, page);
ClearPageCompound(p);
}
}
#endif /* CONFIG_HUGETLB_PAGE */
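(For illustration only, not part of the commit: given the layout that prep_compound_page() establishes above, any constituent page of a compound allocation can be walked back to its head page and order. A minimal sketch, assuming only the field usage documented in the comment block; the helper names are hypothetical and do not appear in this patch.)
static inline struct page *compound_head_of(struct page *p)
{
	/* every page in the compound group, head included, has lru.next pointing at the head page */
	return (struct page *)p->lru.next;
}
static inline unsigned long compound_order_of(struct page *p)
{
	struct page *head = (struct page *)p->lru.next;
	/* the allocation order is stashed (as a cast pointer) in the first tail page's lru.prev */
	return (unsigned long)head[1].lru.prev;
}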
/* /*
* Freeing function for a buddy system allocator. * Freeing function for a buddy system allocator.
* *
...@@ -114,6 +170,8 @@ static inline void __free_pages_bulk (struct page *page, struct page *base, ...@@ -114,6 +170,8 @@ static inline void __free_pages_bulk (struct page *page, struct page *base,
{ {
unsigned long page_idx, index; unsigned long page_idx, index;
if (order)
destroy_compound_page(page, order);
page_idx = page - base; page_idx = page - base;
if (page_idx & ~mask) if (page_idx & ~mask)
BUG(); BUG();
...@@ -409,6 +467,12 @@ void free_cold_page(struct page *page) ...@@ -409,6 +467,12 @@ void free_cold_page(struct page *page)
free_hot_cold_page(page, 1); free_hot_cold_page(page, 1);
} }
/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
{ {
unsigned long flags; unsigned long flags;
...@@ -435,6 +499,8 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) ...@@ -435,6 +499,8 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
spin_lock_irqsave(&zone->lock, flags); spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order); page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags); spin_unlock_irqrestore(&zone->lock, flags);
if (order && page)
prep_compound_page(page, order);
} }
if (page != NULL) { if (page != NULL) {
......
...@@ -236,10 +236,8 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, ...@@ -236,10 +236,8 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
* uptodate then the caller will launch readpage again, and * uptodate then the caller will launch readpage again, and
* will then handle the error. * will then handle the error.
*/ */
if (ret) { if (ret)
read_pages(mapping, filp, &page_pool, ret); read_pages(mapping, filp, &page_pool, ret);
blk_run_queues();
}
BUG_ON(!list_empty(&page_pool)); BUG_ON(!list_empty(&page_pool));
out: out:
return ret; return ret;
......
...@@ -439,7 +439,6 @@ struct arraycache_init initarray_generic __initdata = { { 0, BOOT_CPUCACHE_ENTRI ...@@ -439,7 +439,6 @@ struct arraycache_init initarray_generic __initdata = { { 0, BOOT_CPUCACHE_ENTRI
static kmem_cache_t cache_cache = { static kmem_cache_t cache_cache = {
.lists = LIST3_INIT(cache_cache.lists), .lists = LIST3_INIT(cache_cache.lists),
/* Allow for boot cpu != 0 */ /* Allow for boot cpu != 0 */
.array = { [0 ... NR_CPUS-1] = &initarray_cache.cache },
.batchcount = 1, .batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES, .limit = BOOT_CPUCACHE_ENTRIES,
.objsize = sizeof(kmem_cache_t), .objsize = sizeof(kmem_cache_t),
...@@ -611,6 +610,7 @@ void __init kmem_cache_init(void) ...@@ -611,6 +610,7 @@ void __init kmem_cache_init(void)
init_MUTEX(&cache_chain_sem); init_MUTEX(&cache_chain_sem);
INIT_LIST_HEAD(&cache_chain); INIT_LIST_HEAD(&cache_chain);
list_add(&cache_cache.next, &cache_chain); list_add(&cache_cache.next, &cache_chain);
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
cache_estimate(0, cache_cache.objsize, 0, cache_estimate(0, cache_cache.objsize, 0,
&left_over, &cache_cache.num); &left_over, &cache_cache.num);
......
...@@ -957,7 +957,6 @@ int kswapd(void *p) ...@@ -957,7 +957,6 @@ int kswapd(void *p)
finish_wait(&pgdat->kswapd_wait, &wait); finish_wait(&pgdat->kswapd_wait, &wait);
get_page_state(&ps); get_page_state(&ps);
balance_pgdat(pgdat, 0, &ps); balance_pgdat(pgdat, 0, &ps);
blk_run_queues();
} }
} }
......