diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 5005bc264e5234d71efefccde436cdc06eb6fa01..55d1c4824a7d5d1a43b30777d2681d177b32fc14 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -511,6 +511,7 @@ config FRAME_POINTER
 	 Normally you should say N.
 
 config IOMMU_DEBUG
+       depends on GART_IOMMU && DEBUG_KERNEL
        bool "Force IOMMU to on" 
        help
          Force the IOMMU to on even when you have less than 4GB of memory and add 
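The new dependency only gates the prompt; the behavior it controls is the #ifdef block in pci-gart.c (next file in this patch), and it also explains why the defconfig below now leaves the symbol unset. For quick reference, a condensed C view of what flipping CONFIG_IOMMU_DEBUG changes, using the defaults this patch introduces:

/* Condensed sketch of the CONFIG_IOMMU_DEBUG defaults after this patch;
 * see arch/x86_64/kernel/pci-gart.c below for the real block. */
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow = 1;	/* debug: panic instead of falling back */
int force_iommu = 1;		/* debug: remap even below 4GB */
#else
int panic_on_overflow = 0;
int force_iommu = 0;
#endif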
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 7d505b808ab35fc510794e5470f926d1a71596a9..6f08987116848d6ac5120bd4376da39f12d18afd 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -743,7 +743,7 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_INIT_DEBUG is not set
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_FRAME_POINTER is not set
-CONFIG_IOMMU_DEBUG=y
+# CONFIG_IOMMU_DEBUG is not set
 CONFIG_IOMMU_LEAK=y
 CONFIG_MCE_DEBUG=y
 
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 2e367f9c83beed6074e8f045569eedba9e5b2a95..917a607965499ea8fefaee1c655218806807529c 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -44,12 +44,13 @@ static int no_agp;
 #ifdef CONFIG_IOMMU_DEBUG
 int panic_on_overflow = 1; 
 int force_iommu = 1;
-int sac_force_size = 0; 
 #else
-int panic_on_overflow = 1; /* for testing */
+int panic_on_overflow = 0;
 int force_iommu = 0;
-int sac_force_size = 256*1024*1024;
 #endif
+int iommu_merge = 0; 
+int iommu_sac_force = 0; 
+int iommu_fullflush = 1;
 
 /* Allocation bitmap for the remapping area */ 
 static spinlock_t iommu_bitmap_lock = SPIN_LOCK_UNLOCKED;
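The three new tunables replace the old sac_force_size heuristic with explicit switches. Since they are plain ints, they can be flipped at boot time; a hedged sketch of such a hook follows, where the __setup() option strings are illustrative assumptions, not something this patch adds:

/* Illustration only: the actual iommu= parser in this tree may accept
 * different option strings. */
static int __init iommu_knobs_setup(char *str)
{
	if (!strncmp(str, "fullflush", 9))
		iommu_fullflush = 1;	/* flush the GART TLB on every call */
	else if (!strncmp(str, "nofullflush", 11))
		iommu_fullflush = 0;	/* flush only when entries changed */
	else if (!strncmp(str, "merge", 5))
		iommu_merge = 1;	/* let sg entries share mappings */
	else if (!strncmp(str, "forcesac", 8))
		iommu_sac_force = 1;	/* prefer 32bit single address cycle */
	return 1;
}
__setup("iommu=", iommu_knobs_setup);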
@@ -125,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
 /* 
  * Use global flush state to avoid races with multiple flushers.
  */
-static void __flush_gart(struct pci_dev *dev)
+static void flush_gart(struct pci_dev *dev)
 { 
 	unsigned long flags;
 	int bus = dev ? dev->bus->number : -1;
@@ -134,13 +135,17 @@ static void __flush_gart(struct pci_dev *dev)
 	int i;
 
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
-	/* recheck flush count inside lock */
-	if (need_flush) { 
+	if (need_flush || iommu_fullflush) { 
 		for (i = 0; northbridges[i]; i++) {
+			u32 w;
 			if (bus >= 0 && !(cpu_isset_const(i, bus_cpumask)))
 				continue;
 			pci_write_config_dword(northbridges[i], 0x9c, 
 					       northbridge_flush_word[i] | 1); 
+			/* Make sure the hardware actually executed the flush. */
+			do { 
+				pci_read_config_dword(northbridges[i], 0x9c, &w);
+			} while (w & 1);
 			flushed++;
 		} 
 		if (!flushed) 
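The added read-back loop is the completion handshake for the AMD K8 northbridge GART: writing bit 0 of config register 0x9c triggers the TLB flush, and the hardware clears that bit once the invalidation has finished. Previously the code fired the write and moved on, so a mapping could be reused while stale GART entries were still cached. Isolated, the per-northbridge pattern from the hunk above is:

/* Write-then-poll flush for one northbridge: setting bit 0 of reg 0x9c
 * starts the flush, and the chip clears the bit when done, so the loop
 * guarantees completion before we return. */
static void gart_flush_nb(struct pci_dev *nb, u32 flush_word)
{
	u32 w;

	pci_write_config_dword(nb, 0x9c, flush_word | 1);
	do {
		pci_read_config_dword(nb, 0x9c, &w);
	} while (w & 1);
}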
@@ -150,12 +155,6 @@ static void __flush_gart(struct pci_dev *dev)
 	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
 } 
 
-static inline void flush_gart(struct pci_dev *dev)
-{ 
-	if (need_flush)
-		__flush_gart(dev);
-} 
-
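Folding the wrapper away serves two purposes: the need_flush test now happens under iommu_bitmap_lock instead of as a racy unlocked read, and the new iommu_fullflush mode (which must flush even when need_flush is clear) can no longer be short-circuited. A standalone sketch of the control-flow difference, with the hardware access and locking stubbed out:

#include <stdio.h>

static int need_flush;			/* set when GART PTEs are modified */
static int iommu_fullflush = 1;		/* new knob: always flush */

static void flush_hw(void) { printf("flush all northbridges\n"); }

/* old shape: unlocked fast path; skips the flush when need_flush is 0,
 * which would also skip the unconditional fullflush behavior */
static void flush_gart_old(void)
{
	if (need_flush) {	/* unlocked read */
		flush_hw();
		need_flush = 0;
	}
}

/* new shape: single function, test made under the (elided) bitmap lock */
static void flush_gart_new(void)
{
	/* spin_lock_irqsave(&iommu_bitmap_lock, flags); */
	if (need_flush || iommu_fullflush)
		flush_hw();
	need_flush = 0;
	/* spin_unlock_irqrestore(&iommu_bitmap_lock, flags); */
}

int main(void)
{
	flush_gart_old();	/* prints nothing: need_flush is 0 */
	flush_gart_new();	/* flushes anyway: iommu_fullflush is set */
	return 0;
}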
 /* 
  * Allocate memory for a consistent mapping.
  * All mappings are consistent here, so this is just a wrapper around
@@ -174,11 +173,16 @@ void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
 	} else {
 		dma_mask = hwdev->consistent_dma_mask; 
 	}
+
 	if (dma_mask == 0) 
 		dma_mask = 0xffffffff; 
 	if (dma_mask < 0xffffffff || no_iommu)
 		gfp |= GFP_DMA;
 
+	/* Kludge to make it bug-to-bug compatible with i386. i386
+	   uses the normal dma_mask for alloc_consistent. */
+	dma_mask &= hwdev->dma_mask;
+
 	memory = (void *)__get_free_pages(gfp, get_order(size));
 	if (memory == NULL) {
 		return NULL; 
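The mask handling in pci_alloc_consistent() now has three steps: default a zero consistent mask to 32 bits, pick GFP_DMA when the mask cannot cover all of memory, then clamp by the streaming dma_mask for i386 compatibility. A standalone sketch of that chain (the function name and the GFP stand-in are mine):

#include <assert.h>
#include <stdint.h>

#define GFP_DMA_FLAG 0x1	/* stand-in for the kernel's GFP_DMA */

static uint64_t pick_consistent_mask(uint64_t consistent_mask,
				     uint64_t streaming_mask,
				     int no_iommu, unsigned *gfp)
{
	uint64_t mask = consistent_mask;

	if (mask == 0)
		mask = 0xffffffffULL;		/* default to 32bit */
	if (mask < 0xffffffffULL || no_iommu)
		*gfp |= GFP_DMA_FLAG;		/* must allocate low memory */
	/* i386 bug-to-bug compatibility: the later address check uses
	 * the stricter of the consistent and streaming masks */
	mask &= streaming_mask;
	return mask;
}

int main(void)
{
	unsigned gfp = 0;

	/* a 31bit consistent mask forces a low-memory allocation */
	assert(pick_consistent_mask(0x7fffffffULL, ~0ULL, 0, &gfp)
	       == 0x7fffffffULL);
	assert(gfp & GFP_DMA_FLAG);
	return 0;
}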
@@ -394,7 +398,9 @@ static int __pci_map_cont(struct scatterlist *sg, int start, int stopat,
 	
 	for (i = start; i < stopat; i++) {
 		struct scatterlist *s = &sg[i];
-		unsigned long start_addr = s->dma_address;
+		unsigned long pages, addr;
+		unsigned long phys_addr = s->dma_address;
+		
 		BUG_ON(i > start && s->offset);
 		if (i == start) {
 			*sout = *s; 
@@ -403,8 +409,10 @@ static int __pci_map_cont(struct scatterlist *sg, int start, int stopat,
 		} else { 
 			sout->length += s->length; 
 		}
-		unsigned long addr = start_addr;
-		while (addr < start_addr + s->length) { 
+
+		addr = phys_addr;
+		pages = to_pages(s->offset, s->length); 
+		while (pages--) { 
 			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 
 			SET_LEAK(iommu_page);
 			addr += PAGE_SIZE;
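The rewritten loop counts pages with to_pages() instead of walking from dma_address to dma_address + length, because a segment that starts at a non-zero offset inside its first page touches one more page than length alone suggests. Assuming the usual round-up definition of to_pages() (the helper already used elsewhere in this file), the arithmetic is:

#include <assert.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Assumed shape of to_pages(): pages touched by `size` bytes starting
 * `offset` bytes into a page, i.e. ceil((offset + size) / PAGE_SIZE). */
static unsigned long to_pages(unsigned long offset, unsigned long size)
{
	return (offset + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	assert(to_pages(0, PAGE_SIZE) == 1);	/* fits exactly in one page */
	assert(to_pages(1, PAGE_SIZE) == 2);	/* offset spills into a 2nd */
	/* the old "while (addr < start + length)" walk covered only one
	   page in the second case, leaving the tail of the buffer unmapped */
	return 0;
}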
@@ -437,7 +445,7 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir)
 	int out;
 	int start;
 	unsigned long pages = 0;
-	int need = 0;
+	int need = 0, nextneed;
 
 	unsigned long size = 0; 
 
@@ -453,13 +461,14 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir)
 		BUG_ON(s->length == 0); 
 
 		size += s->length; 
+		nextneed = need_iommu(dev, addr, s->length); 
 
 		/* Handle the previous not yet processed entries */
 		if (i > start) {
 			struct scatterlist *ps = &sg[i-1];
 			/* Can only merge when the last chunk ends on a page 
-			   boundary. */
-			if (!force_iommu || !need || (i-1 > start && ps->offset) ||
+			   boundary and the new one doesn't have an offset. */
+			if (!iommu_merge || !nextneed || !need || s->offset ||
 			    (ps->offset + ps->length) % PAGE_SIZE) { 
 				if (pci_map_cont(sg, start, i, sg+out, pages, 
 						 need) < 0)
@@ -470,7 +479,7 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir)
 			}
 	}
 
-		need = need_iommu(dev, addr, s->length); 
+		need = nextneed;
 		pages += to_pages(s->offset, s->length);
 	}
 	if (pci_map_cont(sg, start, i, sg+out, pages, need) < 0)
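Written out as a positive predicate, the new test permits merging two adjacent scatterlist entries only when merging is enabled, both the previous and the next entry actually need the IOMMU, the next entry starts at page offset 0, and the previous one ends exactly on a page boundary. A standalone sketch (struct and function names are mine; the logic is the inverse of the break-out condition above):

#include <assert.h>

#define PAGE_SIZE 4096UL

struct seg { unsigned long offset, length; };

static int can_merge(int iommu_merge, int prev_needs_iommu,
		     int next_needs_iommu,
		     const struct seg *prev, const struct seg *next)
{
	return iommu_merge &&
	       prev_needs_iommu && next_needs_iommu &&
	       next->offset == 0 &&	/* new chunk starts page aligned */
	       (prev->offset + prev->length) % PAGE_SIZE == 0;
}

int main(void)
{
	struct seg a = { .offset = 0, .length = 2 * PAGE_SIZE };
	struct seg b = { .offset = 0, .length = 512 };

	assert(can_merge(1, 1, 1, &a, &b));	/* aligned join: mergeable */
	assert(!can_merge(0, 1, 1, &a, &b));	/* iommu_merge off: never */
	return 0;
}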
@@ -544,14 +553,12 @@ int pci_dma_supported(struct pci_dev *dev, u64 mask)
 
 	   The problem with this is that if we overflow the IOMMU area and
 	   return DAC as a fallback address, the device may not handle it correctly.
-	   As a compromise we only do this if the IOMMU area is >= 256MB 
-	   which should make overflow unlikely enough.
 	   
 	   As a special case some controllers have a 39bit address mode 
 	   that is as efficient as 32bit (aic79xx). Don't force SAC for these.
 	   Assume all masks <= 40 bits are of this type. Normally this doesn't
 	   make any difference, but gives more gentle handling of IOMMU overflow. */
-	if (force_iommu && (mask > 0xffffffffffULL) && (iommu_size >= sac_force_size)){ 
+	if (iommu_sac_force && (mask >= 0xffffffffffULL)) { 
 		printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->slot_name,mask);
 		return 0; 
 	}
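On the arithmetic: 0xffffffffffULL is 2^40 - 1, so with the new test SAC is forced for any device advertising a 40-bit or wider DMA mask, while the sub-40-bit controllers called out in the comment (aic79xx-style 39-bit) keep their native mask. A standalone sketch of the decision:

#include <assert.h>
#include <stdint.h>

/* Sketch of the SAC-force test in pci_dma_supported() above: returning 0
 * tells the caller the mask is unsupported, steering it to the 32bit
 * (single address cycle) IOMMU path instead of 64bit DAC. */
static int mask_supported(int sac_force, uint64_t mask)
{
	if (sac_force && mask >= 0xffffffffffULL)	/* >= 40 bits */
		return 0;
	return 1;
}

int main(void)
{
	assert(mask_supported(1, 0xffffffffULL));	/* 32bit: untouched */
	assert(mask_supported(1, 0x7fffffffffULL));	/* 39bit: untouched */
	assert(!mask_supported(1, ~0ULL));		/* 64bit: forced to SAC */
	return 0;
}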
@@ -680,7 +687,7 @@ static int __init pci_iommu_init(void)
 	unsigned long iommu_start;
 	struct pci_dev *dev;
 		
-#ifndef CONFIG_AGP_AMD_8151
+#ifndef CONFIG_AGP_AMD64
 	no_agp = 1; 
 #else
 	/* Makefile puts PCI initialization via subsys_initcall first. */
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 69bd270513e1e446f8e0c50fc89a9ebac5a6cc41..278c626badd80e7cd24c903f6048c4cc2a066bee 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -4,6 +4,8 @@
 #include <linux/string.h>
 #include <asm/proto.h>
 
+int iommu_merge = 0;
+
 /* 
  * Dummy IO MMU functions
  */
diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h
index bf9738f9a23bc1597dd7f5ed351a7fdf71e569d3..7d37aa7f3945be46e0c81ec5021fa859f1b07396 100644
--- a/include/asm-x86_64/io.h
+++ b/include/asm-x86_64/io.h
@@ -304,8 +304,8 @@ static inline int isa_check_signature(unsigned long io_addr,
 /* Disable vmerge for now. Need to fix the block layer code
    to check for non-IOMMU addresses first.
    When the IOMMU is forced it is safe to enable. */
-extern int force_iommu; 
-#define BIO_VERMGE_BOUNDARY (force_iommu ? 4096 : 0)
+extern int iommu_merge;
+#define BIO_VMERGE_BOUNDARY (iommu_merge ? 4096 : 0)
 
 #endif /* __KERNEL__ */
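This hunk also fixes the misspelled macro (BIO_VERMGE_BOUNDARY), which meant the block layer never saw the intended override and fell back to its default of 0. With the corrected name, a zero boundary disables virtual merging and 4096 permits it on 4K-aligned joins. A sketch of the kind of check the block layer performs with this value, using simplified arguments (the real test is the BIOVEC_VIRT_MERGEABLE macro in include/linux/bio.h):

/* Two adjacent bio_vecs are "virtually mergeable" when the end of the
 * first and the start of the second both fall on the boundary, so the
 * IOMMU can remap them into one contiguous bus-address range. */
#define BIO_VMERGE_BOUNDARY	(iommu_merge ? 4096 : 0)

extern int iommu_merge;

static int virt_mergeable(unsigned long end_of_first,
			  unsigned long start_of_second)
{
	unsigned long boundary = BIO_VMERGE_BOUNDARY;

	if (!boundary)		/* 0 disables virtual merging entirely */
		return 0;
	return ((end_of_first | start_of_second) & (boundary - 1)) == 0;
}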