Commit 98eb235b authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] page unmapping debug

From: Manfred Spraul <manfred@colorfullife.com>

Manfred's latest page unmapping debug patch.

The patch adds support for a special debug mode to both the page and the slab
allocator: Unused pages are removed from the kernel linear mapping.  This
means that now any access to freed memory will cause an immediate exception.
Right now, read accesses remain totally unnoticed and write accesses may be
caught by the slab poisoning, but usually far too late for a meaningful bug
report.

The implementation is based on a new arch-dependent function,
kernel_map_pages(), that removes the pages from the linear mapping.  It's
right now only implemented for i386.

Changelog:

- Add kernel_map_pages() for i386, based on change_page_attr.  If
  DEBUG_PAGEALLOC is not set, then the function is an empty stub.  The stub
  is in <linux/mm.h>, i.e.  it exists for all archs.

- Make change_page_attr irq safe.  Note that it's not fully irq safe due to
  the lack of the tlb flush ipi, but it's good enough for kernel_map_pages().
  Another problem is that kernel_map_pages() is not permitted to fail, thus
  PSE is disabled if DEBUG_PAGEALLOC is enabled.

- use kernel_map_pages() for the page allocator.

- use kernel_map_pages for the slab allocator.

  I couldn't resist and added additional debugging support into mm/slab.c:

  * at kfree time, the complete backtrace of the kfree caller is stored
    in the freed object.

  * a ptrinfo() function that dumps all known data about a kernel virtual
    address: the pte value, if it belongs to a slab cache the cache name and
    additional info.

  * merging of common code: new helper function obj_dbglen and obj_dbghdr
    for the conversion between the user visible object pointers/len and the
    actual, internal addresses and len values.
parent 17003453
......@@ -1339,6 +1339,14 @@ config DEBUG_SPINLOCK
best used in conjunction with the NMI watchdog so that spinlock
deadlocks are also debuggable.
config DEBUG_PAGEALLOC
bool "Page alloc debugging"
depends on DEBUG_KERNEL
help
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
of memory corruptions.
config DEBUG_HIGHMEM
bool "Highmem debugging"
depends on DEBUG_KERNEL && HIGHMEM
......
......@@ -430,6 +430,14 @@ void __init early_cpu_init(void)
rise_init_cpu();
nexgen_init_cpu();
umc_init_cpu();
#ifdef CONFIG_DEBUG_PAGEALLOC
/* pse is not compatible with on-the-fly unmapping,
* disable it even if the cpus claim to support it.
*/
clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
disable_pse = 1;
#endif
}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
......
......@@ -13,6 +13,10 @@
#include <asm/processor.h>
#include <asm/tlbflush.h>
static spinlock_t cpa_lock = SPIN_LOCK_UNLOCKED;
static struct list_head df_list = LIST_HEAD_INIT(df_list);
static inline pte_t *lookup_address(unsigned long address)
{
pgd_t *pgd = pgd_offset_k(address);
......@@ -31,10 +35,15 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot)
{
int i;
unsigned long addr;
struct page *base = alloc_pages(GFP_KERNEL, 0);
struct page *base;
pte_t *pbase;
spin_unlock_irq(&cpa_lock);
base = alloc_pages(GFP_KERNEL, 0);
spin_lock_irq(&cpa_lock);
if (!base)
return NULL;
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
......@@ -87,7 +96,7 @@ static inline void revert_page(struct page *kpte_page, unsigned long address)
}
static int
__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage)
__change_page_attr(struct page *page, pgprot_t prot)
{
pte_t *kpte;
unsigned long address;
......@@ -123,7 +132,7 @@ __change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage)
}
if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) {
*oldpage = kpte_page;
list_add(&kpte_page->list, &df_list);
revert_page(kpte_page, address);
}
return 0;
......@@ -134,12 +143,6 @@ static inline void flush_map(void)
on_each_cpu(flush_kernel_map, NULL, 1, 1);
}
struct deferred_page {
struct deferred_page *next;
struct page *fpage;
};
static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
/*
* Change the page attributes of a page in the linear mapping.
*
......@@ -156,47 +159,54 @@ static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
int change_page_attr(struct page *page, int numpages, pgprot_t prot)
{
int err = 0;
struct page *fpage;
int i;
unsigned long flags;
down_write(&init_mm.mmap_sem);
spin_lock_irqsave(&cpa_lock, flags);
for (i = 0; i < numpages; i++, page++) {
fpage = NULL;
err = __change_page_attr(page, prot, &fpage);
err = __change_page_attr(page, prot);
if (err)
break;
if (fpage) {
struct deferred_page *df;
df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL);
if (!df) {
flush_map();
__free_page(fpage);
} else {
df->next = df_list;
df->fpage = fpage;
df_list = df;
}
}
}
up_write(&init_mm.mmap_sem);
spin_unlock_irqrestore(&cpa_lock, flags);
return err;
}
void global_flush_tlb(void)
{
struct deferred_page *df, *next_df;
LIST_HEAD(l);
struct list_head* n;
down_read(&init_mm.mmap_sem);
df = xchg(&df_list, NULL);
up_read(&init_mm.mmap_sem);
BUG_ON(irqs_disabled());
spin_lock_irq(&cpa_lock);
list_splice_init(&df_list, &l);
spin_unlock_irq(&cpa_lock);
flush_map();
for (; df; df = next_df) {
next_df = df->next;
if (df->fpage)
__free_page(df->fpage);
kfree(df);
n = l.next;
while (n != &l) {
struct page *pg = list_entry(n, struct page, list);
n = n->next;
__free_page(pg);
}
}
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Map (enable != 0) or unmap (enable == 0) @numpages pages starting at
 * @page in the kernel linear mapping, by switching their protection
 * between PAGE_KERNEL and an empty pgprot via change_page_attr().
 * This is the i386 backend of the DEBUG_PAGEALLOC debug mode: once a
 * freed page is unmapped here, any access to it faults immediately.
 */
void kernel_map_pages(struct page *page, int numpages, int enable)
{
	/* highmem pages are not in the linear mapping, nothing to change */
	if (PageHighMem(page))
		return;
	/* the return value is ignored - the calls cannot fail,
	 * large pages are disabled at boot time.
	 */
	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
	/* we should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu.
	 */
	__flush_tlb_all();
}
EXPORT_SYMBOL(kernel_map_pages);
#endif
EXPORT_SYMBOL(change_page_attr);
EXPORT_SYMBOL(global_flush_tlb);
......@@ -17,4 +17,9 @@
void global_flush_tlb(void);
int change_page_attr(struct page *page, int numpages, pgprot_t prot);
#ifdef CONFIG_DEBUG_PAGEALLOC
/* internal debugging function */
void kernel_map_pages(struct page *page, int numpages, int enable);
#endif
#endif /* _I386_CACHEFLUSH_H */
......@@ -609,5 +609,13 @@ extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
int write);
extern int remap_page_range(struct vm_area_struct *vma, unsigned long from,
unsigned long to, unsigned long size, pgprot_t prot);
#ifndef CONFIG_DEBUG_PAGEALLOC
/*
 * Stub used when DEBUG_PAGEALLOC is not configured (and on archs that
 * do not implement it): kernel_map_pages() compiles away to nothing,
 * so callers in the page and slab allocators need no #ifdefs.
 */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
}
#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
......@@ -114,6 +114,8 @@ extern kmem_cache_t *signal_cachep;
extern kmem_cache_t *sighand_cachep;
extern kmem_cache_t *bio_cachep;
void ptrinfo(unsigned long addr);
#endif /* __KERNEL__ */
#endif /* _LINUX_SLAB_H */
......@@ -32,6 +32,8 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <asm/tlbflush.h>
DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS);
struct pglist_data *pgdat_list;
......@@ -265,6 +267,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
mod_page_state(pgfree, 1 << order);
free_pages_check(__FUNCTION__, page);
list_add(&page->list, &list);
kernel_map_pages(page, 1<<order, 0);
free_pages_bulk(page_zone(page), 1, &list, order);
}
......@@ -440,6 +443,7 @@ static void free_hot_cold_page(struct page *page, int cold)
struct per_cpu_pages *pcp;
unsigned long flags;
kernel_map_pages(page, 1, 0);
inc_page_state(pgfree);
free_pages_check(__FUNCTION__, page);
pcp = &zone->pageset[get_cpu()].pcp[cold];
......@@ -556,7 +560,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
return page;
goto got_pg;
}
min += z->pages_low * sysctl_lower_zone_protection;
}
......@@ -579,7 +583,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
return page;
goto got_pg;
}
min += local_min * sysctl_lower_zone_protection;
}
......@@ -594,7 +598,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
page = buffered_rmqueue(z, order, cold);
if (page)
return page;
goto got_pg;
}
goto nopage;
}
......@@ -622,7 +626,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
if (page)
return page;
goto got_pg;
}
min += z->pages_low * sysctl_lower_zone_protection;
}
......@@ -653,6 +657,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
current->comm, order, gfp_mask);
}
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
return page;
}
/*
......
......@@ -89,7 +89,11 @@
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
......@@ -351,6 +355,34 @@ struct kmem_cache_s {
#define POISON_AFTER 0x6b /* for use-after-free poisoning */
#define POISON_END 0xa5 /* end-byte of poisoning */
/*
 * Debug-layout helpers.  With slab debugging enabled, extra debug words
 * are placed around each object: obj_dbghead() is the byte offset of the
 * user-visible object inside the raw slab object, obj_dbglen() the total
 * number of debug bytes added to the object size.  They centralize the
 * conversion between user-visible object pointers/lengths and the
 * internal ones.
 */
static inline int obj_dbghead(kmem_cache_t *cachep)
{
	/* one red-zone word precedes the object */
	if (cachep->flags & SLAB_RED_ZONE)
		return BYTES_PER_WORD;
	return 0;
}

static inline int obj_dbglen(kmem_cache_t *cachep)
{
	int len = 0;

	if (cachep->flags & SLAB_RED_ZONE) {
		/* red-zone words before and after the object */
		len += 2*BYTES_PER_WORD;
	}
	if (cachep->flags & SLAB_STORE_USER) {
		/* one extra word per object; presumably the last-user
		 * (caller) address - confirm against the store/dump code */
		len += BYTES_PER_WORD;
	}
	return len;
}
#else
/* !DEBUG: no debug padding around objects */
static inline int obj_dbghead(kmem_cache_t *cachep)
{
	return 0;
}
static inline int obj_dbglen(kmem_cache_t *cachep)
{
	return 0;
}
#endif
/*
......@@ -765,16 +797,45 @@ static inline void kmem_freepages (kmem_cache_t *cachep, void *addr)
}
#if DEBUG
static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller)
{
int size = cachep->objsize;
if (cachep->flags & SLAB_RED_ZONE) {
addr += BYTES_PER_WORD;
size -= 2*BYTES_PER_WORD;
int size = cachep->objsize-obj_dbglen(cachep);
addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
if (size < 5*sizeof(unsigned long))
return;
*addr++=0x12345678;
*addr++=caller;
*addr++=smp_processor_id();
size -= 3*sizeof(unsigned long);
{
unsigned long *sptr = &caller;
unsigned long svalue;
while (((long) sptr & (THREAD_SIZE-1)) != 0) {
svalue = *sptr++;
if (kernel_text_address(svalue)) {
*addr++=svalue;
size -= sizeof(unsigned long);
if (size <= sizeof(unsigned long))
break;
}
if (cachep->flags & SLAB_STORE_USER) {
size -= BYTES_PER_WORD;
}
}
*addr++=0x87654321;
}
#endif
/*
 * Fill the user-visible part of an object with the poison byte @val,
 * terminated by POISON_END in the last byte.  The debug head/len helpers
 * skip any red-zone / store-user words surrounding the object.
 */
static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
{
	/* size/start of the user-visible part of the object only */
	int size = cachep->objsize-obj_dbglen(cachep);
	addr = &((char*)addr)[obj_dbghead(cachep)];

	memset(addr, val, size);
	/* marker byte so a later scan can detect a modified tail */
	*(unsigned char *)(addr+size-1) = POISON_END;
}
......@@ -796,15 +857,11 @@ static void *scan_poisoned_obj(unsigned char* addr, unsigned int size)
static void check_poison_obj(kmem_cache_t *cachep, void *addr)
{
int size = cachep->objsize;
void *end;
if (cachep->flags & SLAB_RED_ZONE) {
addr += BYTES_PER_WORD;
size -= 2*BYTES_PER_WORD;
}
if (cachep->flags & SLAB_STORE_USER) {
size -= BYTES_PER_WORD;
}
int size = cachep->objsize-obj_dbglen(cachep);
addr = &((char*)addr)[obj_dbghead(cachep)];
end = scan_poisoned_obj(addr, size);
if (end) {
int s;
......@@ -858,8 +915,16 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
void *objp = slabp->s_mem + cachep->objsize * i;
int objlen = cachep->objsize;
if (cachep->flags & SLAB_POISON)
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
else
check_poison_obj(cachep, objp);
#else
check_poison_obj(cachep, objp);
#endif
}
if (cachep->flags & SLAB_STORE_USER)
objlen -= BYTES_PER_WORD;
......@@ -952,6 +1017,10 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
}
#if FORCED_DEBUG
#ifdef CONFIG_DEBUG_PAGEALLOC
if (size < PAGE_SIZE-3*BYTES_PER_WORD && size > 128)
size = PAGE_SIZE-3*BYTES_PER_WORD;
#endif
/*
* Enable redzoning and last user accounting, except
* - for caches with forced alignment: redzoning would violate the
......@@ -1404,6 +1473,8 @@ static void cache_init_objs (kmem_cache_t * cachep,
slab_error(cachep, "constructor overwrote the"
" start of an object");
}
if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
#else
if (cachep->ctor)
cachep->ctor(objp, cachep, ctor_flags);
......@@ -1584,24 +1655,27 @@ static inline void *cache_free_debugcheck (kmem_cache_t * cachep, void * objp, v
* caller can perform a verify of its state (debugging).
* Called without the cache-lock held.
*/
if (cachep->flags & SLAB_RED_ZONE) {
cachep->ctor(objp+BYTES_PER_WORD,
cachep->ctor(objp+obj_dbghead(cachep),
cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
} else {
cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
}
}
if (cachep->flags & SLAB_POISON && cachep->dtor) {
/* we want to cache poison the object,
* call the destruction callback
*/
if (cachep->flags & SLAB_RED_ZONE)
cachep->dtor(objp+BYTES_PER_WORD, cachep, 0);
else
cachep->dtor(objp, cachep, 0);
cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
}
if (cachep->flags & SLAB_POISON)
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
store_stackinfo(cachep, objp, POISON_AFTER);
kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
} else {
poison_obj(cachep, objp, POISON_AFTER);
}
#else
poison_obj(cachep, objp, POISON_AFTER);
#endif
}
#endif
return objp;
}
......@@ -1617,6 +1691,7 @@ static inline void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
entries++;
BUG_ON(entries > cachep->num);
BUG_ON(i < 0 || i >= cachep->num);
}
BUG_ON(entries != cachep->num - slabp->inuse);
#endif
......@@ -1747,7 +1822,14 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
else
check_poison_obj(cachep, objp);
#else
check_poison_obj(cachep, objp);
#endif
poison_obj(cachep, objp, POISON_BEFORE);
}
if (cachep->flags & SLAB_STORE_USER) {
......@@ -2085,16 +2167,7 @@ free_percpu(const void *objp)
unsigned int kmem_cache_size(kmem_cache_t *cachep)
{
unsigned int objlen = cachep->objsize;
#if DEBUG
if (cachep->flags & SLAB_RED_ZONE)
objlen -= 2*BYTES_PER_WORD;
if (cachep->flags & SLAB_STORE_USER)
objlen -= BYTES_PER_WORD;
#endif
return objlen;
return cachep->objsize-obj_dbglen(cachep);
}
kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
......@@ -2626,3 +2699,70 @@ unsigned int ksize(const void *objp)
return size;
}
/*
 * Debugging aid: dump everything known about a kernel virtual address -
 * its page-table entry, the struct page, and, if the page belongs to a
 * slab cache, the cache name, the slab, the object the address points
 * into, and that object's red-zone words.
 */
void ptrinfo(unsigned long addr)
{
	struct page *page;

	printk("Dumping data about address %p.\n", (void*)addr);
	if (!virt_addr_valid((void*)addr)) {
		printk("virt addr invalid.\n");
		return;
	}
	/* walk the kernel page tables and report the mapping for addr */
	do {
		pgd_t *pgd = pgd_offset_k(addr);
		pmd_t *pmd;
		if (pgd_none(*pgd)) {
			printk("No pgd.\n");
			break;
		}
		pmd = pmd_offset(pgd, addr);
		if (pmd_none(*pmd)) {
			printk("No pmd.\n");
			break;
		}
#ifdef CONFIG_X86
		if (pmd_large(*pmd)) {
			/* large (PSE) mapping: no pte level to dump */
			printk("Large page.\n");
			break;
		}
#endif
		printk("normal page, pte_val 0x%llx\n",
			(unsigned long long)pte_val(*pte_offset_kernel(pmd, addr)));
	} while(0);

	page = virt_to_page((void*)addr);
	printk("struct page at %p, flags %lxh.\n", page, page->flags);

	if (PageSlab(page)) {
		kmem_cache_t *c;
		struct slab *s;
		unsigned long flags;
		int objnr;
		void *objp;

		c = GET_PAGE_CACHE(page);
		printk("belongs to cache %s.\n",c->name);

		spin_lock_irqsave(&c->spinlock, flags);
		s = GET_PAGE_SLAB(page);
		printk("slabp %p with %d inuse objects (from %d).\n",
			s, s->inuse, c->num);
		check_slabp(c,s);

		/* locate the object containing addr within the slab */
		objnr = (addr-(unsigned long)s->s_mem)/c->objsize;
		objp = s->s_mem+c->objsize*objnr;
		printk("points into object no %d, starting at %p, len %d.\n",
			objnr, objp, c->objsize);
		if (objnr >= c->num) {
			printk("Bad obj number.\n");
		} else {
			/* re-map the object's pages in case DEBUG_PAGEALLOC
			 * unmapped them, so the words below can be read */
			kernel_map_pages(virt_to_page(objp), c->objsize/PAGE_SIZE, 1);

			/* first word and the two trailing words of the raw
			 * object; with SLAB_RED_ZONE these are the red-zone/
			 * debug words - confirm layout against obj_dbglen() */
			printk("redzone: %lxh/%lxh/%lxh.\n",
				((unsigned long*)objp)[0],
				((unsigned long*)(objp+c->objsize))[-2],
				((unsigned long*)(objp+c->objsize))[-1]);
		}
		spin_unlock_irqrestore(&c->spinlock, flags);
	}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment