Commit bf5ae502 authored by David S. Miller

Merge bk://kernel.bkbits.net/acme/net-2.5

into nuts.ninka.net:/home/davem/src/BK/net-2.5
parents 05b52f33 901d6cc4
@@ -5,7 +5,8 @@ obj-$(CONFIG_OPROFILE) += oprofile.o
 DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 	oprof.o cpu_buffer.o buffer_sync.o \
 	event_buffer.o oprofile_files.o \
-	oprofilefs.o oprofile_stats.o )
+	oprofilefs.o oprofile_stats.o \
+	timer_int.o )
oprofile-y := $(DRIVER_OBJS) common.o
oprofile-$(CONFIG_ALPHA_GENERIC) += op_model_ev4.o \
......
@@ -175,7 +175,7 @@ oprofile_arch_init(struct oprofile_operations **ops)
 	}
 	if (!lmodel)
-		return ENODEV;
+		return -ENODEV;
 	model = lmodel;
 	oprof_axp_ops.cpu_type = lmodel->cpu_type;
......
@@ -114,6 +114,15 @@ zdisk bzdisk: vmlinux
 install fdimage fdimage144 fdimage288: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
+prepare: include/asm-$(ARCH)/asm_offsets.h
+CLEAN_FILES += include/asm-$(ARCH)/asm_offsets.h
+arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \
+	include/config/MARKER
+include/asm-$(ARCH)/asm_offsets.h: arch/$(ARCH)/kernel/asm-offsets.s
+	$(call filechk,gen-asm-offsets)
 archclean:
 	$(Q)$(MAKE) $(clean)=arch/i386/boot
......
/*
* Generate definitions needed by assembly language modules.
* This code generates raw asm output which is post-processed
* to extract and format the required data.
*/
#include <linux/signal.h>
#include <asm/ucontext.h>
#include "sigframe.h"
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define BLANK() asm volatile("\n->" : : )
void foo(void)
{
DEFINE(SIGCONTEXT_eax, offsetof (struct sigcontext, eax));
DEFINE(SIGCONTEXT_ebx, offsetof (struct sigcontext, ebx));
DEFINE(SIGCONTEXT_ecx, offsetof (struct sigcontext, ecx));
DEFINE(SIGCONTEXT_edx, offsetof (struct sigcontext, edx));
DEFINE(SIGCONTEXT_esi, offsetof (struct sigcontext, esi));
DEFINE(SIGCONTEXT_edi, offsetof (struct sigcontext, edi));
DEFINE(SIGCONTEXT_ebp, offsetof (struct sigcontext, ebp));
DEFINE(SIGCONTEXT_esp, offsetof (struct sigcontext, esp));
DEFINE(SIGCONTEXT_eip, offsetof (struct sigcontext, eip));
BLANK();
DEFINE(RT_SIGFRAME_sigcontext,
offsetof (struct rt_sigframe, uc.uc_mcontext));
}
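This is the standard asm-offsets trick: each DEFINE() forces the compiler to emit a marker line beginning with "->" into arch/$(ARCH)/kernel/asm-offsets.s, and the gen-asm-offsets filechk rule in the Makefile hunk above rewrites those markers into a C header. A sketch of the round trip (the offset value is illustrative, not taken from this commit):

	/* emitted into asm-offsets.s by DEFINE(SIGCONTEXT_eax, ...): */
	-> SIGCONTEXT_eax $44 offsetof (struct sigcontext, eax)

	/* turned by the filechk rule into include/asm-$(ARCH)/asm_offsets.h: */
	#define SIGCONTEXT_eax 44 /* offsetof (struct sigcontext, eax) */

The vsyscall unwind code further down can then use the SIGCONTEXT_* symbols as plain assembler constants without duplicating the struct layout by hand.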
......
struct sigframe
{
char *pretcode;
int sig;
struct sigcontext sc;
struct _fpstate fpstate;
unsigned long extramask[_NSIG_WORDS-1];
char retcode[8];
};
struct rt_sigframe
{
char *pretcode;
int sig;
struct siginfo *pinfo;
void *puc;
struct siginfo info;
struct ucontext uc;
struct _fpstate fpstate;
char retcode[8];
};
@@ -23,6 +23,7 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include "sigframe.h"
 #define DEBUG_SIG 0
@@ -126,28 +127,6 @@ sys_sigaltstack(const stack_t *uss, stack_t *uoss)
  * Do a signal return; undo the signal stack.
  */
-struct sigframe
-{
-	char *pretcode;
-	int sig;
-	struct sigcontext sc;
-	struct _fpstate fpstate;
-	unsigned long extramask[_NSIG_WORDS-1];
-	char retcode[8];
-};
-struct rt_sigframe
-{
-	char *pretcode;
-	int sig;
-	struct siginfo *pinfo;
-	void *puc;
-	struct siginfo info;
-	struct ucontext uc;
-	struct _fpstate fpstate;
-	char retcode[8];
-};
 static int
 restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
 {
......
@@ -7,6 +7,7 @@
  */
 #include <asm/unistd.h>
+#include <asm/asm_offsets.h>
/* XXX
@@ -18,21 +19,124 @@
.globl __kernel_sigreturn
.type __kernel_sigreturn,@function
__kernel_sigreturn:
-.LSTART_kernel_sigreturn:
+.LSTART_sigreturn:
popl %eax /* XXX does this mean it needs unwind info? */
movl $__NR_sigreturn, %eax
int $0x80
.LEND_sigreturn:
.size __kernel_sigreturn,.-.LSTART_sigreturn
.text
.balign 32
.globl __kernel_rt_sigreturn
.type __kernel_rt_sigreturn,@function
__kernel_rt_sigreturn:
-.LSTART_kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
movl $__NR_rt_sigreturn, %eax
int $0x80
.LEND_rt_sigreturn:
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI1:
.long .LENDCIEDLSI1-.LSTARTCIEDLSI1
.LSTARTCIEDLSI1:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0 /* DW_CFA_nop */
.align 4
.LENDCIEDLSI1:
.long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
.LSTARTFDEDLSI1:
.long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: The dwarf2 unwind routines will subtract 1 from the
return address to get an address in the middle of the
presumed call instruction. Since we didn't get here via
a call, we need to include the nop before the real start
to make up for it. */
.long .LSTART_sigreturn-1-. /* PC-relative start address */
.long .LEND_sigreturn-.LSTART_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
complicated by the fact that the "CFA" is always assumed to
be the value of the stack pointer in the caller. This means
that we must define the CFA of this body of code to be the
saved value of the stack pointer in the sigcontext. Which
also means that there is no fixed relation to the other
saved registers, which means that we must use DW_CFA_expression
to compute their addresses. It also means that when we
adjust the stack with the popl, we have to do it all over again. */
#define do_cfa_expr(offset) \
.byte 0x0f; /* DW_CFA_def_cfa_expression */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
.byte 0x06; /* DW_OP_deref */ \
1:
#define do_expr(regno, offset) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno; /* regno */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
1:
do_cfa_expr(SIGCONTEXT_esp+4)
do_expr(0, SIGCONTEXT_eax+4)
do_expr(1, SIGCONTEXT_ecx+4)
do_expr(2, SIGCONTEXT_edx+4)
do_expr(3, SIGCONTEXT_ebx+4)
do_expr(5, SIGCONTEXT_ebp+4)
do_expr(6, SIGCONTEXT_esi+4)
do_expr(7, SIGCONTEXT_edi+4)
do_expr(8, SIGCONTEXT_eip+4)
.byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
do_cfa_expr(SIGCONTEXT_esp)
do_expr(0, SIGCONTEXT_eax)
do_expr(1, SIGCONTEXT_ecx)
do_expr(2, SIGCONTEXT_edx)
do_expr(3, SIGCONTEXT_ebx)
do_expr(5, SIGCONTEXT_ebp)
do_expr(6, SIGCONTEXT_esi)
do_expr(7, SIGCONTEXT_edi)
do_expr(8, SIGCONTEXT_eip)
.align 4
.LENDFDEDLSI1:
.long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
.LSTARTFDEDLSI2:
.long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: See above wrt unwind library assumptions. */
.long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
.long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
slightly less complicated than the above, since we don't
modify the stack pointer in the process. */
do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
.align 4
.LENDFDEDLSI2:
.previous
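To make the encoding concrete, here is one entry decoded by hand (assuming SIGCONTEXT_eax is 44; the real value depends on the struct sigcontext layout generated above). do_expr(0, SIGCONTEXT_eax+4) expands to:

	.byte 0x10	/* DW_CFA_expression */
	.uleb128 0	/* DWARF register 0 = %eax */
	.uleb128 2	/* expression length: 2 bytes */
	.byte 0x74	/* DW_OP_breg4: push %esp + offset */
	.sleb128 48	/* SIGCONTEXT_eax (44) + 4 */

so the unwinder finds the saved %eax at %esp + 48. do_cfa_expr() differs only in using DW_CFA_def_cfa_expression and appending DW_OP_deref, because the CFA is the stack-pointer value stored in the sigcontext's esp slot, not the slot's address.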
@@ -74,11 +74,13 @@ SYSENTER_RETURN:
 	.long .Lenter_kernel-.Lpush_edx
 	.byte 0x0e /* DW_CFA_def_cfa_offset */
 	.byte 0x10 /* RA at offset 16 now */
+	.byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
 	/* Finally the epilogue. */
 	.byte 0x04 /* DW_CFA_advance_loc4 */
 	.long .Lpop_ebp-.Lenter_kernel
 	.byte 0x0e /* DW_CFA_def_cfa_offset */
 	.byte 0x12 /* RA at offset 12 now */
+	.byte 0xc5 /* DW_CFA_restore %ebp */
 	.byte 0x04 /* DW_CFA_advance_loc4 */
 	.long .Lpop_edx-.Lpop_ebp
 	.byte 0x0e /* DW_CFA_def_cfa_offset */
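Decoded with the standard DWARF constants (the data alignment factor of -4 is assumed to match this file's CIE, which is not shown in the hunk): 0x85 is DW_CFA_offset for register 5 (%ebp) and its uleb128 operand 4 scales to -16, recording that %ebp is saved at CFA-16 after the push; 0xc5 is DW_CFA_restore for the same register, cancelling that rule once %ebp has been popped in the epilogue.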
......
@@ -505,36 +505,20 @@ void __init mem_init(void)
 #endif
 }
 #include <linux/slab.h>
-kmem_cache_t *pmd_cache;
-kmem_cache_t *pgd_cache;
-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
+#if CONFIG_X86_PAE
+struct kmem_cache_s *pae_pgd_cachep;
 void __init pgtable_cache_init(void)
 {
-	if (PTRS_PER_PMD > 1) {
-		pmd_cache = kmem_cache_create("pae_pmd",
-					PTRS_PER_PMD*sizeof(pmd_t),
-					0,
-					SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-					pmd_ctor,
-					NULL);
-		if (!pmd_cache)
-			panic("pgtable_cache_init(): cannot create pmd cache");
-	}
 	/*
 	 * PAE pgds must be 16-byte aligned:
 	 */
-	pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, pgd_ctor, NULL);
-	if (!pgd_cache)
-		panic("pgtable_cache_init(): Cannot create pgd cache");
+	pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0,
+		SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+	if (!pae_pgd_cachep)
+		panic("init_pae(): Cannot alloc pae_pgd SLAB cache");
 }
+#endif
 /*
  * This function cannot be __init, since exceptions don't work in that
......
@@ -151,60 +151,61 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return pte;
 }
-extern kmem_cache_t *pmd_cache;
-extern kmem_cache_t *pgd_cache;
+#if CONFIG_X86_PAE
-void pmd_ctor(void *__pmd, kmem_cache_t *pmd_cache, unsigned long flags)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	clear_page(__pmd);
+	int i;
+	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
+	if (pgd) {
+		for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+			unsigned long pmd = __get_free_page(GFP_KERNEL);
+			if (!pmd)
+				goto out_oom;
+			clear_page(pmd);
+			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
+		}
+		memcpy(pgd + USER_PTRS_PER_PGD,
+			swapper_pg_dir + USER_PTRS_PER_PGD,
+			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+	}
+	return pgd;
+out_oom:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
+	return NULL;
 }
-void pgd_ctor(void *__pgd, kmem_cache_t *pgd_cache, unsigned long flags)
+void pgd_free(pgd_t *pgd)
 {
-	pgd_t *pgd = __pgd;
+	int i;
-	if (PTRS_PER_PMD == 1)
-		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
-	memcpy(pgd + USER_PTRS_PER_PGD,
-		swapper_pg_dir + USER_PTRS_PER_PGD,
-		(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
 }
+#else
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	int i;
-	pgd_t *pgd = kmem_cache_alloc(pgd_cache, SLAB_KERNEL);
-	if (PTRS_PER_PMD == 1)
-		return pgd;
-	else if (!pgd)
-		return NULL;
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, SLAB_KERNEL);
-		if (!pmd)
-			goto out_oom;
-		set_pgd(pgd + i, __pgd(1 + __pa((unsigned long long)((unsigned long)pmd))));
+	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
+	if (pgd) {
+		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
+		memcpy(pgd + USER_PTRS_PER_PGD,
+			swapper_pg_dir + USER_PTRS_PER_PGD,
+			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 	}
 	return pgd;
-out_oom:
-	for (i--; i >= 0; --i)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pgd_cache, (void *)pgd);
-	return NULL;
 }
 void pgd_free(pgd_t *pgd)
 {
-	int i;
+	free_page((unsigned long)pgd);
+}
-	if (PTRS_PER_PMD > 1) {
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-			set_pgd(pgd + i, __pgd(0));
-		}
-	}
+#endif /* CONFIG_X86_PAE */
-	kmem_cache_free(pgd_cache, (void *)pgd);
-}
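A subtlety shared by both versions above: a PAE pgd entry holds the pmd's physical address with the low (present) bit set, so the teardown paths must strip that bit before converting back to a virtual address. The round trip, shown in isolation:

	unsigned long pmd = __get_free_page(GFP_KERNEL);	/* page-aligned, low bits free */
	set_pgd(pgd + i, __pgd(1 + __pa(pmd)));	/* physical address | present bit */
	/* ... later, the inverse: */
	free_page((unsigned long)__va(pgd_val(pgd[i]) - 1));	/* -1 strips the present bit */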
@@ -3,8 +3,9 @@ obj-$(CONFIG_OPROFILE) += oprofile.o
 DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 	oprof.o cpu_buffer.o buffer_sync.o \
 	event_buffer.o oprofile_files.o \
-	oprofilefs.o oprofile_stats.o )
+	oprofilefs.o oprofile_stats.o \
+	timer_int.o )
-oprofile-y := $(DRIVER_OBJS) init.o timer_int.o
+oprofile-y := $(DRIVER_OBJS) init.o
 oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \
 	op_model_ppro.o op_model_p4.o
@@ -11,22 +11,19 @@
 #include <linux/init.h>
 /* We support CPUs that have performance counters like the Pentium Pro
- * with NMI mode samples. Other x86 CPUs use a simple interrupt keyed
- * off the timer interrupt, which cannot profile interrupts-disabled
- * code unlike the NMI-based code.
+ * with the NMI mode driver.
  */
 extern int nmi_init(struct oprofile_operations ** ops);
 extern void nmi_exit(void);
-extern void timer_init(struct oprofile_operations ** ops);
 int __init oprofile_arch_init(struct oprofile_operations ** ops)
 {
 #ifdef CONFIG_X86_LOCAL_APIC
-	if (!nmi_init(ops))
+	return nmi_init(ops);
+#else
+	return -ENODEV;
 #endif
-		timer_init(ops);
-	return 0;
 }
......
@@ -314,13 +314,13 @@ int __init nmi_init(struct oprofile_operations ** ops)
 	__u8 family = current_cpu_data.x86;
 	if (!cpu_has_apic)
-		return 0;
+		return -ENODEV;
 	switch (vendor) {
 		case X86_VENDOR_AMD:
 			/* Needs to be at least an Athlon (or hammer in 32bit mode) */
 			if (family < 6)
-				return 0;
+				return -ENODEV;
 			model = &op_athlon_spec;
 			nmi_ops.cpu_type = "i386/athlon";
 			break;
@@ -331,30 +331,30 @@ int __init nmi_init(struct oprofile_operations ** ops)
 			/* Pentium IV */
 			case 0xf:
 				if (!p4_init())
-					return 0;
+					return -ENODEV;
 				break;
 			/* A P6-class processor */
 			case 6:
 				if (!ppro_init())
-					return 0;
+					return -ENODEV;
 				break;
 			default:
-				return 0;
+				return -ENODEV;
 		}
 		break;
 #endif /* !CONFIG_X86_64 */
 		default:
-			return 0;
+			return -ENODEV;
 	}
 	init_driverfs();
 	using_nmi = 1;
 	*ops = &nmi_ops;
 	printk(KERN_INFO "oprofile: using NMI interrupt.\n");
-	return 1;
+	return 0;
 }
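Taken together with the init.c change above, nmi_init()'s convention flips from boolean (1 on success, 0 on failure) to the usual kernel one (0 on success, negative errno on failure), which is what allows oprofile_arch_init() to simply propagate its return value.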
......
@@ -3,6 +3,7 @@ obj-$(CONFIG_OPROFILE) += oprofile.o
 DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 	oprof.o cpu_buffer.o buffer_sync.o \
 	event_buffer.o oprofile_files.o \
-	oprofilefs.o oprofile_stats.o )
+	oprofilefs.o oprofile_stats.o \
+	timer_int.o )
-oprofile-y := $(DRIVER_OBJS) init.o timer_int.o
+oprofile-y := $(DRIVER_OBJS) init.o
@@ -15,8 +15,7 @@ extern void timer_init(struct oprofile_operations ** ops);
 int __init oprofile_arch_init(struct oprofile_operations ** ops)
 {
-	timer_init(ops);
-	return 0;
+	return -ENODEV;
 }
......
/**
* @file timer_int.c
*
* @remark Copyright 2002 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/irq.h>
#include <linux/oprofile.h>
#include <linux/profile.h>
#include <asm/ptrace.h>
static int timer_notify(struct notifier_block * self, unsigned long val, void * data)
{
struct pt_regs * regs = (struct pt_regs *)data;
int cpu = smp_processor_id();
unsigned long pc = regs->iaoq[0];
int is_kernel = !user_mode(regs);
oprofile_add_sample(pc, is_kernel, 0, cpu);
return 0;
}
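(On parisc, regs->iaoq[0], the front of the instruction-address queue, is the closest equivalent of the instruction_pointer(regs) used by the other copies of this file below.)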
static struct notifier_block timer_notifier = {
.notifier_call = timer_notify,
};
static int timer_start(void)
{
return register_profile_notifier(&timer_notifier);
}
static void timer_stop(void)
{
unregister_profile_notifier(&timer_notifier);
}
static struct oprofile_operations timer_ops = {
.start = timer_start,
.stop = timer_stop,
.cpu_type = "timer"
};
void __init timer_init(struct oprofile_operations ** ops)
{
*ops = &timer_ops;
printk(KERN_INFO "oprofile: using timer interrupt.\n");
}
@@ -3,6 +3,7 @@ obj-$(CONFIG_OPROFILE) += oprofile.o
 DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
 	oprof.o cpu_buffer.o buffer_sync.o \
 	event_buffer.o oprofile_files.o \
-	oprofilefs.o oprofile_stats.o )
+	oprofilefs.o oprofile_stats.o \
+	timer_int.o )
-oprofile-y := $(DRIVER_OBJS) init.o timer_int.o
+oprofile-y := $(DRIVER_OBJS) init.o
@@ -15,8 +15,7 @@ extern void timer_init(struct oprofile_operations ** ops);
 int __init oprofile_arch_init(struct oprofile_operations ** ops)
 {
-	timer_init(ops);
-	return 0;
+	return -ENODEV;
 }
......
/**
* @file timer_int.c
*
* @remark Copyright 2002 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/irq.h>
#include <linux/oprofile.h>
#include <linux/profile.h>
#include <asm/ptrace.h>
static int timer_notify(struct notifier_block * self, unsigned long val, void * data)
{
struct pt_regs * regs = (struct pt_regs *)data;
int cpu = smp_processor_id();
unsigned long pc = instruction_pointer(regs);
int is_kernel = !user_mode(regs);
oprofile_add_sample(pc, is_kernel, 0, cpu);
return 0;
}
static struct notifier_block timer_notifier = {
.notifier_call = timer_notify,
};
static int timer_start(void)
{
return register_profile_notifier(&timer_notifier);
}
static void timer_stop(void)
{
unregister_profile_notifier(&timer_notifier);
}
static struct oprofile_operations timer_ops = {
.start = timer_start,
.stop = timer_stop,
.cpu_type = "timer"
};
void __init timer_init(struct oprofile_operations ** ops)
{
*ops = &timer_ops;
printk(KERN_INFO "oprofile: using timer interrupt.\n");
}
......@@ -3,6 +3,7 @@ obj-$(CONFIG_OPROFILE) += oprofile.o
DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
oprof.o cpu_buffer.o buffer_sync.o \
event_buffer.o oprofile_files.o \
oprofilefs.o oprofile_stats.o )
oprofilefs.o oprofile_stats.o \
timer_int.o )
oprofile-y := $(DRIVER_OBJS) init.o timer_int.o
oprofile-y := $(DRIVER_OBJS) init.o
@@ -15,8 +15,7 @@ extern void timer_init(struct oprofile_operations ** ops);
 int __init oprofile_arch_init(struct oprofile_operations ** ops)
 {
-	timer_init(ops);
-	return 0;
+	return -ENODEV;
 }
......
/**
* @file timer_int.c
*
* @remark Copyright 2002 OProfile authors
* @remark Read the file COPYING
*
* @author John Levon <levon@movementarian.org>
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/irq.h>
#include <linux/oprofile.h>
#include <linux/profile.h>
#include <asm/ptrace.h>
static int timer_notify(struct notifier_block * self, unsigned long val, void * data)
{
struct pt_regs * regs = (struct pt_regs *)data;
int cpu = smp_processor_id();
unsigned long pc = instruction_pointer(regs);
int is_kernel = !user_mode(regs);
oprofile_add_sample(pc, is_kernel, 0, cpu);
return 0;
}
static struct notifier_block timer_notifier = {
.notifier_call = timer_notify,
};
static int timer_start(void)
{
return register_profile_notifier(&timer_notifier);
}
static void timer_stop(void)
{
unregister_profile_notifier(&timer_notifier);
}
static struct oprofile_operations timer_ops = {
.start = timer_start,
.stop = timer_stop,
.cpu_type = "timer"
};
void __init timer_init(struct oprofile_operations ** ops)
{
*ops = &timer_ops;
printk(KERN_INFO "oprofile: using timer interrupt.\n");
}
@@ -9,9 +9,10 @@ obj-$(CONFIG_OPROFILE) += oprofile.o
 DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 	oprof.o cpu_buffer.o buffer_sync.o \
 	event_buffer.o oprofile_files.o \
-	oprofilefs.o oprofile_stats.o )
+	oprofilefs.o oprofile_stats.o \
+	timer_int.o )
-oprofile-objs := $(DRIVER_OBJS) init.o timer_int.o
+oprofile-objs := $(DRIVER_OBJS) init.o
 oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o
@@ -23,11 +24,9 @@ $(obj)/op_model_athlon.c: ${INCL}
 	@ln -sf ../../i386/oprofile/op_model_athlon.c $(obj)/op_model_athlon.c
 $(obj)/init.c: ${INCL}
 	@ln -sf ../../i386/oprofile/init.c $(obj)/init.c
-$(obj)/timer_int.c: ${INCL}
-	@ln -sf ../../i386/oprofile/timer_int.c $(obj)/timer_int.c
 $(obj)/op_counter.h:
 	@ln -sf ../../i386/oprofile/op_counter.h $(obj)/op_counter.h
 $(obj)/op_x86_model.h:
 	@ln -sf ../../i386/oprofile/op_x86_model.h $(obj)/op_x86_model.h
-clean-files += op_x86_model.h op_counter.h timer_int.c init.c \
+clean-files += op_x86_model.h op_counter.h init.c \
 	op_model_athlon.c nmi_int.c
@@ -58,8 +58,8 @@ static int exit_task_notify(struct notifier_block * self, unsigned long val, voi
  * must concern ourselves with. First, when a task is about to
  * exit (exit_mmap()), we should process the buffer to deal with
  * any samples in the CPU buffer, before we lose the ->mmap information
- * we need. Second, a task may unmap (part of) an executable mmap,
- * so we want to process samples before that happens too
+ * we need. It is vital to get this case correct, otherwise we can
+ * end up trying to access a freed task_struct.
  */
 static int mm_notify(struct notifier_block * self, unsigned long val, void * data)
 {
@@ -67,6 +67,29 @@ static int mm_notify(struct notifier_block * self, unsigned long val, void * dat
 	return 0;
 }
+/* Second, a task may unmap (part of) an executable mmap,
+ * so we want to process samples before that happens too. This is merely
+ * a quality-of-implementation (QOI) issue, not a correctness one.
+ */
+static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
+{
+	/* Note that we cannot sync the buffers directly, because we might end up
+	 * taking the mmap_sem that we hold now inside of event_buffer_read()
+	 * on a page fault, whilst holding buffer_sem - deadlock.
+	 *
+	 * This would mean a threaded reader of the event buffer, but we should
+	 * prevent it anyway.
+	 *
+	 * Delaying the work in a context that doesn't hold the mmap_sem means
+	 * that we won't lose samples from other mappings that current() may
+	 * have. Note that either way, we lose any pending samples for what is
+	 * being unmapped.
+	 */
+	schedule_work(&sync_wq);
+	return 0;
+}
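sync_wq itself is defined elsewhere in buffer_sync.c; a sketch of the deferred side, with the handler name and body assumed rather than taken from this diff:

	/* Sketch only: runs from keventd, where no caller's mmap_sem is held,
	 * so it can take buffer_sem and then each task's mmap_sem in the
	 * normal order. */
	static void wq_sync_buffers(void * data)
	{
		sync_cpu_buffers();
	}
	static DECLARE_WORK(sync_wq, wq_sync_buffers, NULL);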
/* We need to be told about new modules so we don't attribute to a previously
* loaded module, or drop the samples on the floor.
@@ -92,7 +115,7 @@ static struct notifier_block exit_task_nb = {
 };
 static struct notifier_block exec_unmap_nb = {
-	.notifier_call = mm_notify,
+	.notifier_call = munmap_notify,
 };
 static struct notifier_block exit_mmap_nb = {
@@ -147,6 +170,8 @@ void sync_stop(void)
 	profile_event_unregister(EXIT_MMAP, &exit_mmap_nb);
 	profile_event_unregister(EXEC_UNMAP, &exec_unmap_nb);
 	del_timer_sync(&sync_timer);
+	/* timer might have queued work, make sure it's completed. */
+	flush_scheduled_work();
 }
@@ -296,6 +321,8 @@ static void add_sample(struct mm_struct * mm, struct op_sample * s, int in_kerne
 		add_sample_entry(s->eip, s->event);
 	} else if (mm) {
 		add_us_sample(mm, s);
+	} else {
+		atomic_inc(&oprofile_stats.sample_lost_no_mm);
 	}
 }
@@ -310,26 +337,23 @@ static void release_mm(struct mm_struct * mm)
 /* Take the task's mmap_sem to protect ourselves from
  * races when we do lookup_dcookie().
  */
-static struct mm_struct * take_task_mm(struct task_struct * task)
+static struct mm_struct * take_tasks_mm(struct task_struct * task)
 {
-	struct mm_struct * mm = task->mm;
-	/* if task->mm !NULL, mm_count must be at least 1. It cannot
-	 * drop to 0 without the task exiting, which will have to sleep
-	 * on buffer_sem first. So we do not need to mark mm_count
-	 * ourselves.
+	struct mm_struct * mm;
+	/* Subtle. We don't need to keep a reference to this task's mm,
+	 * because, for the mm to be freed on another CPU, that would have
+	 * to go through the task exit notifier, which ends up sleeping
+	 * on the buffer_sem we hold, so we end up with mutual exclusion
+	 * anyway.
 	 */
+	task_lock(task);
+	mm = task->mm;
+	task_unlock(task);
 	if (mm) {
-		/* More ugliness. If a task took its mmap
-		 * sem then came to sleep on buffer_sem we
-		 * will deadlock waiting for it. So we can
-		 * but try. This will lose samples :/
-		 */
-		if (!down_read_trylock(&mm->mmap_sem)) {
-			/* FIXME: this underestimates samples lost */
-			atomic_inc(&oprofile_stats.sample_lost_mmap_sem);
-			mm = NULL;
-		}
+		/* needed to walk the task's VMAs */
+		down_read(&mm->mmap_sem);
 	}
 	return mm;
@@ -399,7 +423,7 @@ static void sync_buffer(struct oprofile_cpu_buffer * cpu_buf)
 			new = (struct task_struct *)s->event;
 			release_mm(mm);
-			mm = take_task_mm(new);
+			mm = take_tasks_mm(new);
 			cookie = get_exec_dcookie(mm);
 			add_user_ctx_switch(new->pid, cookie);
@@ -460,4 +484,3 @@ static void timer_ping(unsigned long data)
schedule_work(&sync_wq);
/* timer is re-added by the scheduled task */
}
@@ -151,11 +151,15 @@ ssize_t event_buffer_read(struct file * file, char * buf, size_t count, loff_t *
 	if (count != max || *offset)
 		return -EINVAL;
 	/* wait for the event buffer to fill up with some data */
 	wait_event_interruptible(buffer_wait, atomic_read(&buffer_ready));
 	if (signal_pending(current))
 		return -EINTR;
+	/* can't currently happen */
+	if (!atomic_read(&buffer_ready))
+		return -EAGAIN;
 	down(&buffer_sem);
 	atomic_set(&buffer_ready, 0);
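For the wait above to finish, a producer elsewhere must set buffer_ready and wake buffer_wait; a hedged sketch of that counterpart (its name and exact shape are assumed, as it is not part of this diff):

	void wake_up_buffer_waiter(void)
	{
		down(&buffer_sem);
		atomic_set(&buffer_ready, 1);
		wake_up(&buffer_wait);
		up(&buffer_sem);
	}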
......
@@ -28,6 +28,8 @@ int oprofile_setup(void)
 {
 	int err;
+	down(&start_sem);
 	if ((err = alloc_cpu_buffers()))
 		goto out;
@@ -45,7 +47,6 @@ int oprofile_setup(void)
 	if ((err = sync_start()))
 		goto out3;
-	down(&start_sem);
 	is_setup = 1;
 	up(&start_sem);
 	return 0;
@@ -58,6 +59,7 @@ int oprofile_setup(void)
 out1:
 	free_cpu_buffers();
 out:
+	up(&start_sem);
 	return err;
 }
@@ -106,27 +108,34 @@ void oprofile_stop(void)
 void oprofile_shutdown(void)
 {
+	down(&start_sem);
 	sync_stop();
 	if (oprofile_ops->shutdown)
 		oprofile_ops->shutdown();
 	/* down() is also necessary to synchronise all pending events
 	 * before freeing */
 	down(&buffer_sem);
 	is_setup = 0;
 	up(&buffer_sem);
 	free_event_buffer();
 	free_cpu_buffers();
+	up(&start_sem);
 }
+extern void timer_init(struct oprofile_operations ** ops);
 static int __init oprofile_init(void)
 {
 	int err;
 	/* Architecture must fill in the interrupt ops and the
-	 * logical CPU type.
+	 * logical CPU type, or we can fall back to the timer
+	 * interrupt profiler.
 	 */
 	err = oprofile_arch_init(&oprofile_ops);
+	if (err == -ENODEV) {
+		timer_init(&oprofile_ops);
+		err = 0;
+	}
 	if (err)
 		goto out;
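Under this contract an architecture either fills in *ops and returns 0, or returns -ENODEV to request the fallback, so a port with no hardware driver reduces to the stub already seen for parisc, ppc and sparc64 above:

	int __init oprofile_arch_init(struct oprofile_operations ** ops)
	{
		/* no CPU-specific driver; the core falls back to timer_init() */
		return -ENODEV;
	}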
......
@@ -31,7 +31,7 @@ void oprofile_reset_stats(void)
 		cpu_buf->sample_lost_task_exit = 0;
 	}
-	atomic_set(&oprofile_stats.sample_lost_mmap_sem, 0);
+	atomic_set(&oprofile_stats.sample_lost_no_mm, 0);
 	atomic_set(&oprofile_stats.event_lost_overflow, 0);
 }
@@ -68,8 +68,8 @@ void oprofile_create_stats_files(struct super_block * sb, struct dentry * root)
 			&cpu_buf->sample_lost_task_exit);
 	}
-	oprofilefs_create_ro_atomic(sb, dir, "sample_lost_mmap_sem",
-		&oprofile_stats.sample_lost_mmap_sem);
+	oprofilefs_create_ro_atomic(sb, dir, "sample_lost_no_mm",
+		&oprofile_stats.sample_lost_no_mm);
 	oprofilefs_create_ro_atomic(sb, dir, "event_lost_overflow",
 		&oprofile_stats.event_lost_overflow);
 }
@@ -13,7 +13,7 @@
 #include <asm/atomic.h>
 struct oprofile_stat_struct {
-	atomic_t sample_lost_mmap_sem;
+	atomic_t sample_lost_no_mm;
 	atomic_t event_lost_overflow;
 };
......
@@ -14,8 +14,6 @@
 #include <linux/oprofile.h>
 #include <asm/ptrace.h>
-#include "op_counter.h"
 static int timer_notify(struct notifier_block * self, unsigned long val, void * data)
 {
 	struct pt_regs * regs = (struct pt_regs *)data;
......
@@ -20,11 +20,11 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p
  * Allocate and free page tables.
  */
-pgd_t *pgd_alloc(struct mm_struct *);
-void pgd_free(pgd_t *pgd);
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(pgd_t *pgd);
-pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
-struct page *pte_alloc_one(struct mm_struct *, unsigned long);
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
static inline void pte_free_kernel(pte_t *pte)
{
......
@@ -123,4 +123,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
 #define PTE_FILE_MAX_BITS 32
+extern struct kmem_cache_s *pae_pgd_cachep;
 #endif /* _I386_PGTABLE_3LEVEL_H */
@@ -41,12 +41,21 @@ extern unsigned long empty_zero_page[1024];
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
+/*
+ * Need to initialise the X86 PAE caches
+ */
+extern void pgtable_cache_init(void);
 #else
 # include <asm/pgtable-2level.h>
-#endif
-void pgtable_cache_init(void);
+/*
+ * No page table caches to initialise
+ */
+#define pgtable_cache_init() do { } while (0)
+#endif
 #endif
#define PMD_SIZE (1UL << PMD_SHIFT)
@@ -183,6 +192,7 @@ extern unsigned long pg0[1024];
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
  */
+static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
 static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
 static inline int pte_exec(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
 static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
......
@@ -51,6 +51,7 @@
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/pgtable.h>
 #include <linux/swapops.h>
@@ -688,6 +689,45 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		vma = find_extend_vma(mm, start);
+#ifdef FIXADDR_START
+		if (!vma && start >= FIXADDR_START && start < FIXADDR_TOP) {
+			static struct vm_area_struct fixmap_vma = {
+				/* Catch users - if there are any valid
+				   ones, we can make this be "&init_mm" or
+				   something. */
+				.vm_mm = NULL,
+				.vm_start = FIXADDR_START,
+				.vm_end = FIXADDR_TOP,
+				.vm_page_prot = PAGE_READONLY,
+				.vm_flags = VM_READ | VM_EXEC,
+			};
+			unsigned long pg = start & PAGE_MASK;
+			pgd_t *pgd;
+			pmd_t *pmd;
+			pte_t *pte;
+			pgd = pgd_offset_k(pg);
+			if (!pgd)
+				return i ? : -EFAULT;
+			pmd = pmd_offset(pgd, pg);
+			if (!pmd)
+				return i ? : -EFAULT;
+			pte = pte_offset_kernel(pmd, pg);
+			if (!pte || !pte_present(*pte) || !pte_user(*pte) ||
+			    !(write ? pte_write(*pte) : pte_read(*pte)))
+				return i ? : -EFAULT;
+			if (pages) {
+				pages[i] = pte_page(*pte);
+				get_page(pages[i]);
+			}
+			if (vmas)
+				vmas[i] = &fixmap_vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+			continue;
+		}
+#endif
 		if (!vma || (pages && (vma->vm_flags & VM_IO))
 					|| !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
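A note on the idiom used throughout this block: i ? : -EFAULT is the GNU conditional-with-omitted-middle extension; it returns the number of pages already pinned when some progress was made, and -EFAULT only when none was, so partial results are reported rather than discarded.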
......
@@ -508,6 +508,8 @@ static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
 	rt_deadline = 0;
+	get_random_bytes(&rt_hash_rnd, 4);
 	for (i = rt_hash_mask; i >= 0; i--) {
 		spin_lock_bh(&rt_hash_table[i].lock);
 		rth = rt_hash_table[i].chain;
@@ -570,7 +572,6 @@ static void rt_secret_rebuild(unsigned long dummy)
 {
 	unsigned long now = jiffies;
-	get_random_bytes(&rt_hash_rnd, 4);
 	rt_cache_flush(0);
 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 }
......