diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index f91e63489463eea685a767c554e14eda28085b9b..040862e6c6a08b66214da4514f178d0c7a597398 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -523,20 +523,30 @@ void __init mem_init(void)
 #endif
 }
 
-#ifdef CONFIG_X86_PAE
-struct kmem_cache_s *pae_pgd_cachep;
+kmem_cache_t *pgd_cache;
+kmem_cache_t *pmd_cache;
 
 void __init pgtable_cache_init(void)
 {
-	/*
-	 * PAE pgds must be 16-byte aligned:
-	 */
-	pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0,
-		SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
-	if (!pae_pgd_cachep)
-		panic("init_pae(): Cannot alloc pae_pgd SLAB cache");
+	if (PTRS_PER_PMD > 1) {
+		pmd_cache = kmem_cache_create("pmd",
+					PTRS_PER_PMD*sizeof(pmd_t),
+					PTRS_PER_PMD*sizeof(pmd_t),
+					0,
+					pmd_ctor,
+					NULL);
+		if (!pmd_cache)
+			panic("pgtable_cache_init(): cannot create pmd cache");
+	}
+	pgd_cache = kmem_cache_create("pgd",
+				PTRS_PER_PGD*sizeof(pgd_t),
+				PTRS_PER_PGD*sizeof(pgd_t),
+				0,
+				pgd_ctor,
+				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+	if (!pgd_cache)
+		panic("pgtable_cache_init(): cannot create pgd cache");
 }
-#endif
 
 /*
  * This function cannot be __init, since exceptions don't work in that
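
The init.c hunk above replaces the PAE-only pae_pgd cache with a pgd cache
and (for PAE) a pmd cache whose align argument equals the object size: under
PAE that makes "pgd" objects 32 bytes at 32-byte alignment, satisfying the
hardware's PDPT alignment requirement, and "pmd" objects a full page; without
PAE the pgd itself is the page-sized object and no pmd cache is created. The
point of passing pgd_ctor/pmd_ctor is the slab contract of this era: a
constructor runs when a slab page is populated, not on every allocation, and
objects must be returned to the cache in constructed state, so
kmem_cache_alloc() hands out pgds whose kernel half already mirrors
swapper_pg_dir. A minimal userspace model of that contract (the toy_* names
are hypothetical, not kernel API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy cache: models "construct once, reuse constructed" (illustrative
 * sketch only; the real slab constructs whole pages of objects at a time). */
struct toy_cache {
	size_t objsize;
	void (*ctor)(void *obj);
	void *cached;			/* one-object "partial slab" */
};

static void *toy_cache_alloc(struct toy_cache *c)
{
	void *obj;

	if (c->cached) {		/* reuse: already constructed */
		obj = c->cached;
		c->cached = NULL;
		return obj;
	}
	obj = malloc(c->objsize);
	if (obj)
		c->ctor(obj);		/* runs once per object, not per alloc */
	return obj;
}

static void toy_cache_free(struct toy_cache *c, void *obj)
{
	/* the caller must leave obj constructed, as pgd_free() does by
	 * freeing the pmds without touching the pgd page's contents */
	if (!c->cached)
		c->cached = obj;
	else
		free(obj);
}

static void zero_ctor(void *obj)
{
	memset(obj, 0, 64);		/* stands in for pgd_ctor's memcpy/memset */
}

int main(void)
{
	struct toy_cache c = { 64, zero_ctor, NULL };
	char *pgd = toy_cache_alloc(&c);	/* constructed on first alloc */

	toy_cache_free(&c, pgd);		/* goes back still constructed */
	pgd = toy_cache_alloc(&c);		/* reused: ctor did NOT rerun */
	printf("still zeroed: %d\n", pgd[0]);
	free(pgd);
	return 0;
}

This is why, in the non-PAE case, pgd_alloc() later in this patch degenerates
to a plain kmem_cache_alloc(pgd_cache, GFP_KERNEL).
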
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 33d970f4ff0aca64a75f14a5353fe719db93006a..04b488ccb15dbda760120b61ec145f6ed78ae350 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -67,19 +67,22 @@ static void flush_kernel_map(void *dummy)
 
 static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
 { 
+	struct page *page;
+	unsigned long flags;
+
 	set_pte_atomic(kpte, pte); 	/* change init_mm */
-#ifndef CONFIG_X86_PAE
-	{
-		struct list_head *l;
-		spin_lock(&mmlist_lock);
-		list_for_each(l, &init_mm.mmlist) {
-			struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist);
-			pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address);
-			set_pte_atomic((pte_t *)pmd, pte);
-		}
-		spin_unlock(&mmlist_lock);
+	if (PTRS_PER_PMD > 1)	/* PAE: kernel pmd is shared, init_mm update suffices */
+		return;
+
+	spin_lock_irqsave(&pgd_lock, flags);
+	for (page = pgd_list; page; page = (struct page *)page->index) {
+		pgd_t *pgd;
+		pmd_t *pmd;
+		pgd = (pgd_t *)page_address(page) + pgd_index(address);
+		pmd = pmd_offset(pgd, address);
+		set_pte_atomic((pte_t *)pmd, pte);
 	}
-#endif
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 /* 
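
What makes the pgd_list walk above necessary only when PTRS_PER_PMD == 1 is
the folded middle level of two-level paging: pmd_offset() is just a cast of
the pgd entry pointer, roughly as include/asm-i386/pgtable-2level.h of this
era defines it, so the set_pte_atomic((pte_t *)pmd, pte) in the loop is
really rewriting one pgd slot, and each pgd on the list carries its own
private copy of that slot:

/* paraphrase of pmd_offset() from include/asm-i386/pgtable-2level.h:
 * the pmd is folded into the pgd, so the "pmd entry" for an address
 * IS the pgd entry, and writing it updates that pgd directly */
static inline pmd_t *pmd_offset(pgd_t *dir, unsigned long address)
{
	return (pmd_t *)dir;
}

Under PAE the kernel slot of every pgd points at the one shared kernel pmd,
so the set_pte_atomic() on init_mm is globally visible and the function can
return early. This also shows why the change can't be left to lazy
vmalloc-style fault fixup: after a large-page split the stale pgd entries
are still valid mappings, so no fault ever occurs to trigger a resync.
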
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 941c2aa5236c3d50fbfd24ed1a79288ba64ee551..49561339cbcc6c7048983e70e27f1b9e4baf747e 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -12,6 +12,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/spinlock.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -151,61 +152,108 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return pte;
 }
 
-#ifdef CONFIG_X86_PAE
+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
+{
+	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+}
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
+/*
+ * List of all pgd's, needed on non-PAE so that pageattr.c can update
+ * kernel pmd entries in both cached and in-use pgd's; PAE doesn't need
+ * it because the kernel pmd is shared by every pgd. If PAE ever stopped
+ * sharing the pmd, a similar tactic would be needed. This is essentially
+ * codepath-based locking against pageattr.c; it is the only case in
+ * which a valid change to kernel pagetables can't be synchronized
+ * lazily by vmalloc faults (those work because attached pagetables are
+ * never freed). If the locking proves to be a bottleneck, a ticketing
+ * scheme with checks at dup_mmap(), exec(), and the other mmlist
+ * addition points could be used instead. This scheme was chosen on
+ * manfred's recommendation and because it touches no core code.
+ * -- wli
+ */
+spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
+struct page *pgd_list;
+
+static inline void pgd_list_add(pgd_t *pgd)
 {
-	int i;
-	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
-
-	if (pgd) {
-		for (i = 0; i < USER_PTRS_PER_PGD; i++) {
-			unsigned long pmd = __get_free_page(GFP_KERNEL);
-			if (!pmd)
-				goto out_oom;
-			clear_page(pmd);
-			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
-		}
-		memcpy(pgd + USER_PTRS_PER_PGD,
+	struct page *page = virt_to_page(pgd);
+	page->index = (unsigned long)pgd_list;		/* ->index is the next link */
+	if (pgd_list)
+		pgd_list->private = (unsigned long)&page->index;
+	pgd_list = page;
+	page->private = (unsigned long)&pgd_list;	/* ->private points at the previous link */
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+	struct page *next, **pprev, *page = virt_to_page(pgd);
+	next = (struct page *)page->index;
+	pprev = (struct page **)page->private;
+	*pprev = next;
+	if (next)
+		next->private = (unsigned long)pprev;
+}
+
+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+	unsigned long flags;
+
+	if (PTRS_PER_PMD == 1)	/* !PAE: serialize against pageattr.c's pgd_list walk */
+		spin_lock_irqsave(&pgd_lock, flags);
+
+	memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-	}
-	return pgd;
-out_oom:
-	for (i--; i >= 0; i--)
-		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pae_pgd_cachep, pgd);
-	return NULL;
+
+	if (PTRS_PER_PMD > 1)
+		return;
+
+	pgd_list_add(pgd);
+	spin_unlock_irqrestore(&pgd_lock, flags);
+	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));	/* user entries only; pageattr.c touches only kernel ones */
 }
 
-void pgd_free(pgd_t *pgd)
+/* never called when PTRS_PER_PMD > 1 */
+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 {
-	int i;
+	unsigned long flags; /* can be called from interrupt context */
 
-	for (i = 0; i < USER_PTRS_PER_PGD; i++)
-		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pae_pgd_cachep, pgd);
+	spin_lock_irqsave(&pgd_lock, flags);
+	pgd_list_del(pgd);
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-#else
-
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
+	int i;
+	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
 
-	if (pgd) {
-		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
-		memcpy(pgd + USER_PTRS_PER_PGD,
-			swapper_pg_dir + USER_PTRS_PER_PGD,
-			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+	if (PTRS_PER_PMD == 1 || !pgd)
+		return pgd;
+
+	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+		if (!pmd)
+			goto out_oom;
+		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));	/* 1 == _PAGE_PRESENT */
 	}
 	return pgd;
+
+out_oom:
+	for (i--; i >= 0; i--)
+		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));	/* -1 drops _PAGE_PRESENT */
+	kmem_cache_free(pgd_cache, pgd);
+	return NULL;
 }
 
 void pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
-}
-
-#endif /* CONFIG_X86_PAE */
+	int i;
 
+	/* in the PAE case user pgd entries are overwritten before use */
+	if (PTRS_PER_PMD > 1)
+		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+	/* in the non-PAE case, clear_page_tables() clears user pgd entries */
+	kmem_cache_free(pgd_cache, pgd);
+}
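
pgd_list_add() and pgd_list_del() above open-code an intrusive doubly-linked
list through two struct page fields that are otherwise unused for slab pages:
page->index holds the next pointer, and page->private holds a pointer to the
previous element's next field (or to pgd_list itself for the head element).
Keeping "a pointer to the previous link" instead of "a pointer to the
previous node" is what lets pgd_dtor() unlink in O(1) with no special case
for removing the head. A standalone sketch of the same pattern (hypothetical
node type, not the kernel structures):

#include <assert.h>
#include <stddef.h>

/* hypothetical node type demonstrating the next/pprev idiom */
struct node {
	struct node *next;
	struct node **pprev;	/* previous node's ->next, or the list head */
};

static struct node *list_head;

static void node_add(struct node *n)
{
	n->next = list_head;
	if (list_head)
		list_head->pprev = &n->next;
	list_head = n;
	n->pprev = &list_head;
}

static void node_del(struct node *n)
{
	*n->pprev = n->next;	/* works for head and interior alike */
	if (n->next)
		n->next->pprev = n->pprev;
}

int main(void)
{
	struct node a = { NULL, NULL }, b = { NULL, NULL };

	node_add(&a);
	node_add(&b);		/* list is now: b -> a */
	node_del(&a);		/* tail delete: no head test needed */
	assert(list_head == &b && b.next == NULL);
	node_del(&b);		/* head delete: same two statements */
	assert(list_head == NULL);
	return 0;
}

This is the same shape as the kernel's hlist_node (next/pprev); it is
open-coded here, presumably because slab already overloads page->list for
its own bookkeeping in this era, leaving index and private as the free
fields.
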
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index 0ca6393cbe4c209a69d6c275ae98a37b38a55f20..147acd8530e2be59627a2d861f36d0587a4bf010 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -123,6 +123,4 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
 #define PTE_FILE_MAX_BITS       32
 
-extern struct kmem_cache_s *pae_pgd_cachep;
-
 #endif /* _I386_PGTABLE_3LEVEL_H */
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 1c0c38375349a71c81ba9154abc99859a8c1daaa..1ba07e9421d87a1036d70ac3fe70436c4b5f56e7 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -21,15 +21,27 @@
 #include <asm/bitops.h>
 #endif
 
-extern pgd_t swapper_pg_dir[1024];
-extern void paging_init(void);
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
  */
-extern unsigned long empty_zero_page[1024];
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+extern unsigned long empty_zero_page[1024];
+extern pgd_t swapper_pg_dir[1024];
+extern kmem_cache_t *pgd_cache;
+extern kmem_cache_t *pmd_cache;
+extern spinlock_t pgd_lock;
+extern struct page *pgd_list;
+
+void pmd_ctor(void *, kmem_cache_t *, unsigned long);
+void pgd_ctor(void *, kmem_cache_t *, unsigned long);
+void pgd_dtor(void *, kmem_cache_t *, unsigned long);
+void pgtable_cache_init(void);
+void paging_init(void);
 
 #endif /* !__ASSEMBLY__ */
 
@@ -41,20 +53,8 @@ extern unsigned long empty_zero_page[1024];
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
-
-/*
- * Need to initialise the X86 PAE caches
- */
-extern void pgtable_cache_init(void);
-
 #else
 # include <asm/pgtable-2level.h>
-
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 #endif
 #endif
 
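
With the caches, constructors, and pgtable_cache_init() declared
unconditionally here, the old PAE-only extern and the non-PAE no-op macro
both disappear: the configuration choice collapses into ordinary C
conditionals on PTRS_PER_PMD, which the included 2-level/3-level headers
define as a compile-time constant (1 without PAE, 512 with). The compiler
folds the constant and discards the dead branch, but, unlike #ifdef, both
branches remain visible and type-checked in every configuration. A userspace
illustration of the idiom (the constant is a hypothetical stand-in for the
config-selected value):

#include <stdio.h>

#define PTRS_PER_PMD 1	/* stand-in: 1 = two-level, 512 = PAE three-level */

static const char *pgd_style(void)
{
	/* both arms compile under either value; the dead one is folded
	 * away, exactly like the PTRS_PER_PMD tests in the patch */
	if (PTRS_PER_PMD > 1)
		return "pgd entries point at slab-allocated pmd pages";
	return "pgd entries are themselves the folded pmd level";
}

int main(void)
{
	printf("%s\n", pgd_style());
	return 0;
}
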