diff -urN v2.4.17/Makefile v2.4.17-tr.diff/Makefile
--- v2.4.17/Makefile	Tue Jan  1 14:09:32 2002
+++ v2.4.17-tr.diff/Makefile	Tue Jan  1 22:19:53 2002
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 17
-EXTRAVERSION =
+EXTRAVERSION = -str
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -urN v2.4.17/arch/i386/config.in v2.4.17-tr.diff/arch/i386/config.in
--- v2.4.17/arch/i386/config.in	Tue Jan  1 14:09:34 2002
+++ v2.4.17-tr.diff/arch/i386/config.in	Tue Jan  1 21:28:19 2002
@@ -53,6 +53,7 @@
    define_bool CONFIG_RWSEM_GENERIC_SPINLOCK y
    define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM n
    define_bool CONFIG_X86_PPRO_FENCE y
+   define_bool CONFIG_X86_STR_NEEDS_MASK y
 else
    define_bool CONFIG_X86_WP_WORKS_OK y
    define_bool CONFIG_X86_INVLPG y
@@ -62,18 +63,21 @@
    define_bool CONFIG_X86_POPAD_OK y
    define_bool CONFIG_RWSEM_GENERIC_SPINLOCK n
    define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM y
+   define_bool CONFIG_X86_STR_NEEDS_MASK n
 fi
 if [ "$CONFIG_M486" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 4
    define_bool CONFIG_X86_USE_STRING_486 y
    define_bool CONFIG_X86_ALIGNMENT_16 y
    define_bool CONFIG_X86_PPRO_FENCE y
+   define_bool CONFIG_X86_STR_NEEDS_MASK y
 fi
 if [ "$CONFIG_M586" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 5
    define_bool CONFIG_X86_USE_STRING_486 y
    define_bool CONFIG_X86_ALIGNMENT_16 y
    define_bool CONFIG_X86_PPRO_FENCE y
+   define_bool CONFIG_X86_STR_NEEDS_MASK y
 fi
 if [ "$CONFIG_M586TSC" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 5
@@ -81,6 +85,7 @@
    define_bool CONFIG_X86_ALIGNMENT_16 y
    define_bool CONFIG_X86_TSC y
    define_bool CONFIG_X86_PPRO_FENCE y
+   define_bool CONFIG_X86_STR_NEEDS_MASK y
 fi
 if [ "$CONFIG_M586MMX" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 5
@@ -89,6 +94,7 @@
    define_bool CONFIG_X86_TSC y
    define_bool CONFIG_X86_GOOD_APIC y
    define_bool CONFIG_X86_PPRO_FENCE y
+   define_bool CONFIG_X86_STR_NEEDS_MASK y
 fi
 if [ "$CONFIG_M686" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 5
@@ -173,6 +179,7 @@
    define_bool CONFIG_X86_PAE y
 fi
 
+bool 'Use stack-based current' CONFIG_USE_STACK_CURRENT
 bool 'Math emulation' CONFIG_MATH_EMULATION
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
 bool 'Symmetric multi-processing support' CONFIG_SMP
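
The CONFIG_X86_STR_NEEDS_MASK plumbing above records one CPU quirk: str
stores the 16-bit task register selector, and only P6 and later CPUs
guarantee that the upper half of a 32-bit destination is zeroed.  A
minimal sketch of the consumer side (this mirrors the get_TR() helper
added to asm-i386/smp.h below, minus the dummy memory operand used
there to keep gcc from caching the result):

static inline unsigned int read_tr(void)
{
	unsigned int tr;

	__asm__("str %w0" : "=r" (tr));
#ifdef CONFIG_X86_STR_NEEDS_MASK
	tr &= 0xffff;	/* 386/486/Pentium leave the high bits undefined */
#endif
	return tr;
}
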
diff -urN v2.4.17/arch/i386/kernel/entry.S v2.4.17-tr.diff/arch/i386/kernel/entry.S
--- v2.4.17/arch/i386/kernel/entry.S	Mon Nov 12 17:49:47 2001
+++ v2.4.17-tr.diff/arch/i386/kernel/entry.S	Wed Jan  2 03:33:23 2002
@@ -3,6 +3,7 @@
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
+#include <asm/current_asm.h>
 
 /*
  * entry.S contains the system-call and fault low-level handling routines.
@@ -128,10 +129,6 @@
 	.long 3b,6b;	\
 .previous
 
-#define GET_CURRENT(reg) \
-	movl $-8192, reg; \
-	andl %esp, reg
-
 ENTRY(lcall7)
 	pushfl			# We get a different stack layout with call gates,
 	pushl %eax		# which has to be cleaned up later..
@@ -144,7 +141,7 @@
 	movl %ecx,CS(%esp)	#
 	movl %esp,%ebx
 	pushl %ebx
-	andl $-8192,%ebx	# GET_CURRENT
+	GET_CURRENT(bx)
 	movl exec_domain(%ebx),%edx	# Get the execution domain
 	movl 4(%edx),%edx	# Get the lcall7 handler for the domain
 	pushl $0x7
@@ -165,7 +162,7 @@
 	movl %ecx,CS(%esp)	#
 	movl %esp,%ebx
 	pushl %ebx
-	andl $-8192,%ebx	# GET_CURRENT
+	GET_CURRENT(bx)
 	movl exec_domain(%ebx),%edx	# Get the execution domain
 	movl 4(%edx),%edx	# Get the lcall7 handler for the domain
 	pushl $0x27
@@ -179,7 +176,7 @@
 	pushl %ebx
 	call SYMBOL_NAME(schedule_tail)
 	addl $4, %esp
-	GET_CURRENT(%ebx)
+	GET_CURRENT(bx)
 	testb $0x02,tsk_ptrace(%ebx)	# PT_TRACESYS
 	jne tracesys_exit
 	jmp	ret_from_sys_call
@@ -193,12 +190,25 @@
 
 ENTRY(system_call)
 	pushl %eax			# save orig_eax
-	SAVE_ALL
-	GET_CURRENT(%ebx)
-	testb $0x02,tsk_ptrace(%ebx)	# PT_TRACESYS
-	jne tracesys
+	pushl %es
+	cld
+	pushl %ds
+	pushl %eax
+	pushl %ebp
+	pushl %edi
+	pushl %esi
+	pushl %edx
+	movl $(__KERNEL_DS),%edx
+	pushl %ecx
+	movl %edx,%ds
+	pushl %ebx
+	GET_CURRENTa(bx)
 	cmpl $(NR_syscalls),%eax
-	jae badsys
+	GET_CURRENTb(bx)
+	movl %edx,%es
+	jae badsys		# deferred: ret_from_sys_call needs current in %ebx
+	testb $0x02,tsk_ptrace(%ebx)	# PT_TRACESYS
+	jne tracesys
 	call *SYMBOL_NAME(sys_call_table)(,%eax,4)
 	movl %eax,EAX(%esp)		# save the return value
 ENTRY(ret_from_sys_call)
@@ -246,7 +256,7 @@
 
 	ALIGN
 ENTRY(ret_from_intr)
-	GET_CURRENT(%ebx)
+	GET_CURRENT(bx)
 ret_from_exception:
 	movl EFLAGS(%esp),%eax		# mix EFLAGS and CS
 	movb CS(%esp),%al
@@ -286,7 +296,7 @@
 	movl $(__KERNEL_DS),%edx
 	movl %edx,%ds
 	movl %edx,%es
-	GET_CURRENT(%ebx)
+	GET_CURRENT(bx)
 	call *%edi
 	addl $8,%esp
 	jmp ret_from_exception
@@ -304,7 +314,7 @@
 ENTRY(device_not_available)
 	pushl $-1		# mark this as an int
 	SAVE_ALL
-	GET_CURRENT(%ebx)
+	GET_CURRENT(bx)
 	movl %cr0,%eax
 	testl $0x4,%eax			# EM (math emulation bit)
 	jne device_not_available_emulate
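
Under CONFIG_USE_STACK_CURRENT the GET_CURRENT() macro gains one
dereference over the old mask-only sequence: the 8K-aligned stack no
longer contains the task_struct itself, only a back-pointer to it in
its first word.  A C rendering of the new lookup (it matches the
inline get_current() added to include/asm-i386/current.h below):

static inline struct task_struct *stack_current(void)
{
	unsigned long esp;

	__asm__("movl %%esp,%0" : "=r" (esp));
	return *(struct task_struct **)(esp & ~8191UL);	/* word 0 of the stack */
}
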
diff -urN v2.4.17/arch/i386/kernel/head.S v2.4.17-tr.diff/arch/i386/kernel/head.S
--- v2.4.17/arch/i386/kernel/head.S	Tue Jul  3 21:15:01 2001
+++ v2.4.17-tr.diff/arch/i386/kernel/head.S	Tue Jan  1 21:12:26 2002
@@ -261,13 +261,13 @@
 #ifdef CONFIG_SMP
 	movb ready, %cl	
 	cmpb $1,%cl
-	je 1f			# the first CPU calls start_kernel
+	je 1f			# the first CPU calls initialize_primary
 				# all other CPUs call initialize_secondary
 	call SYMBOL_NAME(initialize_secondary)
 	jmp L6
 1:
 #endif
-	call SYMBOL_NAME(start_kernel)
+	call SYMBOL_NAME(initialize_primary)
 L6:
 	jmp L6			# main should never return here, but
 				# just in case, we know what happens.
@@ -320,7 +320,8 @@
 	ret
 
 ENTRY(stack_start)
-	.long SYMBOL_NAME(init_task_union)+8192
+	.long SYMBOL_NAME(init_task_stack)+8192
 	.long __KERNEL_DS
+	.long 0			# reserve the stack_start.eip slot written by smpboot.c
 
 /* This is the default interrupt "handler" :-) */
diff -urN v2.4.17/arch/i386/kernel/init_task.c v2.4.17-tr.diff/arch/i386/kernel/init_task.c
--- v2.4.17/arch/i386/kernel/init_task.c	Mon Sep 24 02:16:02 2001
+++ v2.4.17-tr.diff/arch/i386/kernel/init_task.c	Tue Jan  1 21:12:26 2002
@@ -13,14 +13,18 @@
 
 /*
  * Initial task structure.
- *
+ */
+union task_union init_task_union =
+		{ INIT_TASK(init_task_union.task) };
+/*
  * We need to make sure that this is 8192-byte aligned due to the
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union task_union init_task_union 
+
+unsigned long init_task_stack[THREAD_SIZE/sizeof(unsigned long)]
 	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_TASK(init_task_union.task) };
+	{ (unsigned long)&init_task_union,};
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
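
The boot stack follows the same convention as the dynamically
allocated ones: word 0 of init_task_stack points back at init_task, so
GET_CURRENT() works before alloc_task_struct() has ever run.  A
hypothetical sanity check (check_init_stack is illustrative, not part
of the patch):

#include <linux/sched.h>

static void __init check_init_stack(void)
{
	if (init_task_stack[0] != (unsigned long)&init_task_union)
		BUG();
}
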
diff -urN v2.4.17/arch/i386/kernel/irq.c v2.4.17-tr.diff/arch/i386/kernel/irq.c
--- v2.4.17/arch/i386/kernel/irq.c	Mon Nov 12 17:49:47 2001
+++ v2.4.17-tr.diff/arch/i386/kernel/irq.c	Tue Jan  1 21:12:26 2002
@@ -223,7 +223,6 @@
 			continue;
 		}
 		esp &= ~(THREAD_SIZE-1);
-		esp += sizeof(struct task_struct);
 		show_stack((void*)esp);
  	}
 	printk("\nCPU %d:",cpu);
diff -urN v2.4.17/arch/i386/kernel/ldt.c v2.4.17-tr.diff/arch/i386/kernel/ldt.c
--- v2.4.17/arch/i386/kernel/ldt.c	Thu Nov  1 16:39:57 2001
+++ v2.4.17-tr.diff/arch/i386/kernel/ldt.c	Tue Jan  1 21:12:26 2002
@@ -12,11 +12,13 @@
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
+#include <linux/per_cpu.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
+#include <asm/descfn.h>
 
 /*
  * read_ldt() is not really atomic - this is not a problem since
diff -urN v2.4.17/arch/i386/kernel/process.c v2.4.17-tr.diff/arch/i386/kernel/process.c
--- v2.4.17/arch/i386/kernel/process.c	Fri Oct 12 03:11:49 2001
+++ v2.4.17-tr.diff/arch/i386/kernel/process.c	Tue Jan  1 21:12:26 2002
@@ -569,6 +569,63 @@
 	new_mm->context.cpuvalid = ~0UL;	/* valid on all CPU's - they can't have stale data */
 }
 
+struct full_task_struct
+{
+	struct task_struct tsk;
+	struct task_struct_info info;
+};
+
+static kmem_cache_t * tsk_cache;
+
+struct task_struct * alloc_task_struct(void)
+{
+	struct full_task_struct *f = kmem_cache_alloc(tsk_cache, GFP_KERNEL);
+	if (!f)
+		return NULL;
+	f->info.kstack = (void*)__get_free_pages(GFP_KERNEL,1);
+	if (!f->info.kstack) {
+		kmem_cache_free(tsk_cache, f);
+		return NULL;
+	}
+	*(void**)f->info.kstack = &f->tsk;
+	atomic_set(&f->info.users, 1);	
+	return &f->tsk;
+}
+
+void get_task_struct(struct task_struct *tsk)
+{
+	struct full_task_struct *f = (struct full_task_struct*)tsk;
+	atomic_inc(&f->info.users);
+}
+
+void free_task_struct(struct task_struct *tsk)
+{
+	struct full_task_struct *f = (struct full_task_struct*)tsk;
+	if(atomic_dec_and_test(&f->info.users)) {
+		free_pages((unsigned long) f->info.kstack, 1);
+		kmem_cache_free(tsk_cache, f);
+	}
+}
+
+void __init init_tsk_allocator(void)
+{
+	tsk_cache = kmem_cache_create("task_cache",
+					 sizeof(struct full_task_struct),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL, NULL);
+	if (!tsk_cache)
+		panic("Cannot create task struct cache");
+}
+
+extern asmlinkage void start_kernel(void);
+void __init initialize_primary(void)
+{
+	struct full_task_struct *f = (struct full_task_struct*)&init_task;
+	atomic_set(&f->info.users, 1);
+	f->info.kstack = init_task_stack;
+	start_kernel();
+}
 /*
  * Save a segment.
  */
@@ -580,8 +637,9 @@
 	struct task_struct * p, struct pt_regs * regs)
 {
 	struct pt_regs * childregs;
+	struct full_task_struct *f = (struct full_task_struct *)p;
 
-	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
+	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) f->info.kstack)) - 1;
 	struct_cpy(childregs, regs);
 	childregs->eax = 0;
 	childregs->esp = esp;
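
The task_struct now comes from a slab cache with its 2-page stack
allocated separately, and the pair is reference-counted via
info.users.  A hypothetical caller showing the intended lifetime rules
(example_waiter is illustrative, not part of the patch):

void example_waiter(struct task_struct *child)
{
	get_task_struct(child);		/* info.users: 1 -> 2 */
	/* ... child exits and drops its own reference ... */
	/* child and its kstack are still valid: we hold a reference */
	free_task_struct(child);	/* last ref: stack pages + slab freed */
}
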
diff -urN v2.4.17/arch/i386/kernel/setup.c v2.4.17-tr.diff/arch/i386/kernel/setup.c
--- v2.4.17/arch/i386/kernel/setup.c	Tue Jan  1 14:09:34 2002
+++ v2.4.17-tr.diff/arch/i386/kernel/setup.c	Tue Jan  1 21:12:26 2002
@@ -109,6 +109,7 @@
 #include <asm/cobalt.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
+#include <asm/descfn.h>
 #include <asm/e820.h>
 #include <asm/dma.h>
 #include <asm/mpspec.h>
@@ -2870,6 +2871,7 @@
 };
 
 unsigned long cpu_initialized __initdata = 0;
+int cpucount;
 
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
@@ -2879,14 +2881,15 @@
  */
 void __init cpu_init (void)
 {
-	int nr = smp_processor_id();
+	int nr = cpucount;
+	struct task_struct *cur = init_tasks[nr];
 	struct tss_struct * t = &init_tss[nr];
 
 	if (test_and_set_bit(nr, &cpu_initialized)) {
 		printk(KERN_WARNING "CPU#%d already initialized!\n", nr);
 		for (;;) __sti();
 	}
-	printk(KERN_INFO "Initializing CPU#%d\n", nr);
+	printk(KERN_INFO "Initializing CPU#%d/%d\n", nr, cpucount);
 
 	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
 		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
@@ -2911,17 +2914,21 @@
 	 * set up and load the per-CPU TSS and LDT
 	 */
 	atomic_inc(&init_mm.mm_count);
-	current->active_mm = &init_mm;
-	if(current->mm)
+	cur->active_mm = &init_mm;
+	if(cur->mm)
 		BUG();
-	enter_lazy_tlb(&init_mm, current, nr);
+	enter_lazy_tlb(&init_mm, cur, nr);
 
-	t->esp0 = current->thread.esp0;
+	t->esp0 = cur->thread.esp0;
 	set_tss_desc(nr,t);
 	gdt_table[__TSS(nr)].b &= 0xfffffdff;
 	load_TR(nr);
 	load_LDT(&init_mm);
 
+	printk("cur: %p  processor: %d  smp_processor_id(): %d  current: %p",
+		cur, cur->processor, smp_processor_id(), current);
+	smp_per_cpu_data()->curr = cur;
+
 	/*
 	 * Clear all 6 debug registers:
 	 */
@@ -2935,8 +2942,8 @@
 	/*
 	 * Force FPU initialization:
 	 */
-	current->flags &= ~PF_USEDFPU;
-	current->used_math = 0;
+	cur->flags &= ~PF_USEDFPU;
+	cur->used_math = 0;
 	stts();
 }
 
diff -urN v2.4.17/arch/i386/kernel/smpboot.c v2.4.17-tr.diff/arch/i386/kernel/smpboot.c
--- v2.4.17/arch/i386/kernel/smpboot.c	Tue Jan  1 14:09:34 2002
+++ v2.4.17-tr.diff/arch/i386/kernel/smpboot.c	Tue Jan  1 21:14:57 2002
@@ -447,7 +447,7 @@
 		synchronize_tsc_ap();
 }
 
-int cpucount;
+extern int cpucount;
 
 extern int cpu_idle(void);
 
@@ -480,6 +480,12 @@
  * from the task structure
  * This function must not return.
  */
+extern struct {
+	void * esp;
+	unsigned short ss;
+	unsigned long eip;
+} stack_start;
+
 void __init initialize_secondary(void)
 {
 	/*
@@ -491,14 +497,9 @@
 		"movl %0,%%esp\n\t"
 		"jmp *%1"
 		:
-		:"r" (current->thread.esp),"r" (current->thread.eip));
+		:"r" (stack_start.esp),"r" (stack_start.eip));
 }
 
-extern struct {
-	void * esp;
-	unsigned short ss;
-} stack_start;
-
 static int __init fork_by_hand(void)
 {
 	struct pt_regs regs;
@@ -510,14 +511,14 @@
 }
 
 /* which physical APIC ID maps to which logical CPU number */
-volatile int physical_apicid_2_cpu[MAX_APICID];
+volatile int physical_apicid_to_cpu[MAX_APICID];
 /* which logical CPU number maps to which physical APIC ID */
-volatile int cpu_2_physical_apicid[NR_CPUS];
+volatile int cpu_to_physical_apicid[NR_CPUS];
 
 /* which logical APIC ID maps to which logical CPU number */
-volatile int logical_apicid_2_cpu[MAX_APICID];
+volatile int logical_apicid_to_cpu[MAX_APICID];
 /* which logical CPU number maps to which logical APIC ID */
-volatile int cpu_2_logical_apicid[NR_CPUS];
+volatile int cpu_to_logical_apicid[NR_CPUS];
 
 static inline void init_cpu_to_apicid(void)
 /* Initialize all maps between cpu number and apicids */
@@ -525,12 +526,12 @@
 	int apicid, cpu;
 
 	for (apicid = 0; apicid < MAX_APICID; apicid++) {
-		physical_apicid_2_cpu[apicid] = -1;
-		logical_apicid_2_cpu[apicid] = -1;
+		physical_apicid_to_cpu[apicid] = -1;
+		logical_apicid_to_cpu[apicid] = -1;
 	}
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		cpu_2_physical_apicid[cpu] = -1;
-		cpu_2_logical_apicid[cpu] = -1;
+		cpu_to_physical_apicid[cpu] = -1;
+		cpu_to_logical_apicid[cpu] = -1;
 	}
 }
 
@@ -541,11 +542,11 @@
  */
 {
 	if (clustered_apic_mode) {
-		logical_apicid_2_cpu[apicid] = cpu;	
-		cpu_2_logical_apicid[cpu] = apicid;
+		logical_apicid_to_cpu[apicid] = cpu;	
+		cpu_to_logical_apicid[cpu] = apicid;
 	} else {
-		physical_apicid_2_cpu[apicid] = cpu;	
-		cpu_2_physical_apicid[cpu] = apicid;
+		physical_apicid_to_cpu[apicid] = cpu;	
+		cpu_to_physical_apicid[cpu] = apicid;
 	}
 }
 
@@ -556,11 +557,11 @@
  */
 {
 	if (clustered_apic_mode) {
-		logical_apicid_2_cpu[apicid] = -1;	
-		cpu_2_logical_apicid[cpu] = -1;
+		logical_apicid_to_cpu[apicid] = -1;	
+		cpu_to_logical_apicid[cpu] = -1;
 	} else {
-		physical_apicid_2_cpu[apicid] = -1;	
-		cpu_2_physical_apicid[cpu] = -1;
+		physical_apicid_to_cpu[apicid] = -1;	
+		cpu_to_physical_apicid[cpu] = -1;
 	}
 }
 
@@ -808,7 +809,7 @@
 
 	map_cpu_to_boot_apicid(cpu, apicid);
 
-	idle->thread.eip = (unsigned long) start_secondary;
+	stack_start.eip = idle->thread.eip = (unsigned long) start_secondary;
 
 	del_from_runqueue(idle);
 	unhash_process(idle);
@@ -819,7 +820,7 @@
 
 	/* So we see what's up   */
 	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
-	stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
+	stack_start.esp = (void *) (THREAD_SIZE + (char *)TSK_TO_KSTACK(idle));
 
 	/*
 	 * This grunge runs the startup process for
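
The AP bring-up can no longer derive anything from the stack pages of
"idle", and the AP cannot compute current this early either: its TSS
(and therefore TR) is only loaded later in cpu_init(), and it is not
yet running on a stack with a back-pointer word.  So do_boot_cpu()
publishes both values through stack_start; as plain C (a sketch using
names from this patch):

static void __init publish_ap_state(struct task_struct *idle)
{
	stack_start.eip = idle->thread.eip;	/* == start_secondary */
	stack_start.esp = (char *)TSK_TO_KSTACK(idle) + THREAD_SIZE;
}
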
diff -urN v2.4.17/arch/i386/kernel/traps.c v2.4.17-tr.diff/arch/i386/kernel/traps.c
--- v2.4.17/arch/i386/kernel/traps.c	Fri Oct 12 03:11:49 2001
+++ v2.4.17-tr.diff/arch/i386/kernel/traps.c	Tue Jan  1 21:12:26 2002
@@ -209,7 +209,7 @@
 	printk("ds: %04x   es: %04x   ss: %04x\n",
 		regs->xds & 0xffff, regs->xes & 0xffff, ss);
 	printk("Process %s (pid: %d, stackpage=%08lx)",
-		current->comm, current->pid, 4096+(unsigned long)current);
+		current->comm, current->pid, (long)TSK_TO_KSTACK(current));
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
diff -urN v2.4.17/arch/i386/lib/getuser.S v2.4.17-tr.diff/arch/i386/lib/getuser.S
--- v2.4.17/arch/i386/lib/getuser.S	Mon Jan 12 16:42:52 1998
+++ v2.4.17-tr.diff/arch/i386/lib/getuser.S	Tue Jan  1 22:41:10 2002
@@ -27,8 +27,6 @@
 .align 4
 .globl __get_user_1
 __get_user_1:
-	movl %esp,%edx
-	andl $0xffffe000,%edx
 	cmpl addr_limit(%edx),%eax
 	jae bad_get_user
 1:	movzbl (%eax),%edx
@@ -39,9 +37,7 @@
 .globl __get_user_2
 __get_user_2:
 	addl $1,%eax
-	movl %esp,%edx
 	jc bad_get_user
-	andl $0xffffe000,%edx
 	cmpl addr_limit(%edx),%eax
 	jae bad_get_user
 2:	movzwl -1(%eax),%edx
@@ -52,9 +48,7 @@
 .globl __get_user_4
 __get_user_4:
 	addl $3,%eax
-	movl %esp,%edx
 	jc bad_get_user
-	andl $0xffffe000,%edx
 	cmpl addr_limit(%edx),%eax
 	jae bad_get_user
 3:	movl -3(%eax),%edx
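
With the mask sequence gone, __get_user_N no longer derives current
from %esp; callers must now enter with current already in %edx, which
the uaccess.h hunk below arranges via an extra "1" (current) input
constraint.  A C model of the trimmed __get_user_1 (a sketch, not the
real fastcall):

static long get_user_1_model(struct task_struct *tsk,
			     const unsigned char *uaddr, unsigned char *val)
{
	if ((unsigned long)uaddr >= tsk->addr_limit.seg)
		return -EFAULT;		/* bad_get_user */
	*val = *uaddr;			/* real code adds an exception-table fixup */
	return 0;
}
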
diff -urN v2.4.17/arch/i386/mm/fault.c v2.4.17-tr.diff/arch/i386/mm/fault.c
--- v2.4.17/arch/i386/mm/fault.c	Fri Oct 12 03:11:49 2001
+++ v2.4.17-tr.diff/arch/i386/mm/fault.c	Tue Jan  1 21:12:26 2002
@@ -24,6 +24,7 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/hardirq.h>
+#include <asm/desc.h>
 
 extern void die(const char *,struct pt_regs *,long);
 
@@ -134,7 +135,6 @@
 }
 
 asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
-extern unsigned long idt;
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -295,7 +295,7 @@
 	if (boot_cpu_data.f00f_bug) {
 		unsigned long nr;
 		
-		nr = (address - idt) >> 3;
+		nr = (address - (unsigned long)idt) >> 3;
 
 		if (nr == 6) {
 			do_invalid_op(regs, 0);
diff -urN v2.4.17/include/asm-i386/current.h v2.4.17-tr.diff/include/asm-i386/current.h
--- v2.4.17/include/asm-i386/current.h	Fri Aug 14 19:35:22 1998
+++ v2.4.17-tr.diff/include/asm-i386/current.h	Tue Jan  1 22:20:03 2002
@@ -1,15 +1,25 @@
 #ifndef _I386_CURRENT_H
 #define _I386_CURRENT_H
 
-struct task_struct;
+#include <asm/smp.h>
+#include <linux/per_cpu.h>
 
-static inline struct task_struct * get_current(void)
+static inline struct task_struct *get_current(void) __attribute__((const));
+#if defined(CONFIG_USE_STACK_CURRENT)
+static inline struct task_struct *get_current(void)
 {
-	struct task_struct *current;
-	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
-	return current;
- }
- 
+	struct task_struct **ptsk;
+	__asm__ __volatile__("andl %%esp,%0" : "=r" (ptsk) : "0" (~8191UL));
+	return *ptsk;
+}
+#else
+static inline struct task_struct *get_current(void)
+{
+	return smp_per_cpu_data()->curr;
+}
+#endif
+
+/* Note: asm/uaccess.h passes current into getuser.S in %edx; keep in sync. */
 #define current get_current()
 
 #endif /* !(_I386_CURRENT_H) */
diff -urN v2.4.17/include/asm-i386/current_asm.h v2.4.17-tr.diff/include/asm-i386/current_asm.h
--- v2.4.17/include/asm-i386/current_asm.h	Wed Dec 31 19:00:00 1969
+++ v2.4.17-tr.diff/include/asm-i386/current_asm.h	Wed Jan  2 01:38:26 2002
@@ -0,0 +1,54 @@
+/* asm/current_asm.h
+ */
+#ifndef __ASM__CURRENT_ASM_H
+#define __ASM__CURRENT_ASM_H
+
+#include <linux/config.h>
+#include <linux/per_cpu.h>
+#include <asm/desc.h>
+
+#define EREG(reg)	%e/**/reg
+
+#ifdef CONFIG_USE_STACK_CURRENT
+/* Pull the pointer to current off the bottom of the stack. */
+#define GET_CURRENT(r)	\
+	movl $-8192,EREG(r) ;	\
+	andl %esp,EREG(r) ;		\
+	movl (EREG(r)),EREG(r)
+
+/* entry.S issues the lookup in two halves (GET_CURRENTa/GET_CURRENTb)
+ * so SMP can hide the latency of str; here the first half does all of
+ * the work and the second is empty.
+ */
+#define GET_CURRENTa(r)	GET_CURRENT(r)
+#define GET_CURRENTb(r)
+
+#else /* !CONFIG_USE_STACK_CURRENT */
+
+#ifdef CONFIG_SMP
+/* Pass in the short-name version of the register,
+ * e.g. GET_CURRENT(bx)
+ * All of this braindamage comes to us c/o a bug in gas: the
+ * opcode we want should actually be generated by strl, but 
+ * unfortunately gas doesn't realize that the operand size 
+ * prefix applies to str.  Please take a wet noodle and thread 
+ * it into my eye as that will be less painful than dealing 
+ * with this mess.  -ben
+ */
+#define GET_CURRENTa(reg)				\
+	str %reg
+
+#define GET_CURRENTb(reg)				\
+	; aligned_data_adjusted = aligned_data-(__FIRST_TSS_ENTRY << (LOG2_PER_CPU_SIZE - 2))	\
+	; movl aligned_data_adjusted(,EREG(reg),(1 << (LOG2_PER_CPU_SIZE-5))),EREG(reg)
+
+#define GET_CURRENT(reg)	GET_CURRENTa(reg) ; GET_CURRENTb(reg)
+
+#else	/* Uniprocessor: see linux/per_cpu.h */
+#define GET_CURRENT(reg)	movl (aligned_data),EREG(reg)
+#define GET_CURRENTa(reg)	GET_CURRENT(reg)
+#define GET_CURRENTb(reg)
+#endif
+
+#endif /*def CONFIG_USE_STACK_CURRENT */
+#endif /* __ASM__CURRENT_ASM_H */
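
The GET_CURRENTb() addressing arithmetic checks out as follows, using
the constants from this patch (__FIRST_TSS_ENTRY = 12,
LOG2_PER_CPU_SIZE = 8): TR for cpu n is ((n << 2) + 12) << 3, i.e.
n*32 + 96, and the scaled index plus the adjusted displacement lands
exactly on aligned_data[n].  A runnable user-space check:

#include <assert.h>

int main(void)
{
	unsigned long base = 0x100000;	/* stand-in for &aligned_data */
	int cpu;

	for (cpu = 0; cpu < 8; cpu++) {
		unsigned long tr = ((cpu << 2) + 12) << 3;
		unsigned long adjusted = base - (12 << (8 - 2));
		unsigned long slot = adjusted + tr * (1 << (8 - 5));

		assert(slot == base + cpu * 256);	/* PER_CPU_SIZE */
	}
	return 0;
}
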
diff -urN v2.4.17/include/asm-i386/desc.h v2.4.17-tr.diff/include/asm-i386/desc.h
--- v2.4.17/include/asm-i386/desc.h	Mon Aug 13 15:12:08 2001
+++ v2.4.17-tr.diff/include/asm-i386/desc.h	Tue Jan  1 21:38:13 2002
@@ -34,7 +34,7 @@
  *
  * Entry into gdt where to find first TSS.
  */
-#define __FIRST_TSS_ENTRY 12
+#define __FIRST_TSS_ENTRY 12	/* Note!  Must be divisible by 4!  See smp.h. */
 #define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY+1)
 
 #define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
@@ -60,40 +60,6 @@
 
 #define __load_LDT(n) __asm__ __volatile__("lldt %%ax"::"a" (__LDT(n)<<3))
 
-/*
- * This is the ldt that every process will get unless we need
- * something other than this.
- */
-extern struct desc_struct default_ldt[];
-extern void set_intr_gate(unsigned int irq, void * addr);
-extern void set_ldt_desc(unsigned int n, void *addr, unsigned int size);
-extern void set_tss_desc(unsigned int n, void *addr);
-
-static inline void clear_LDT(void)
-{
-	int cpu = smp_processor_id();
-	set_ldt_desc(cpu, &default_ldt[0], 5);
-	__load_LDT(cpu);
-}
-
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT (struct mm_struct *mm)
-{
-	int cpu = smp_processor_id();
-	void *segments = mm->context.segments;
-	int count = LDT_ENTRIES;
-
-	if (!segments) {
-		segments = &default_ldt[0];
-		count = 5;
-	}
-		
-	set_ldt_desc(cpu, segments, count);
-	__load_LDT(cpu);
-}
-
 #endif /* !__ASSEMBLY__ */
 
 #endif
diff -urN v2.4.17/include/asm-i386/descfn.h v2.4.17-tr.diff/include/asm-i386/descfn.h
--- v2.4.17/include/asm-i386/descfn.h	Wed Dec 31 19:00:00 1969
+++ v2.4.17-tr.diff/include/asm-i386/descfn.h	Tue Jan  1 21:51:55 2002
@@ -0,0 +1,42 @@
+#ifndef __ARCH_DESCFN_H
+#define __ARCH_DESCFN_H
+
+#ifndef __ARCH_DESC_H
+#include <asm/desc.h>
+#endif
+
+/*
+ * This is the ldt that every process will get unless we need
+ * something other than this.
+ */
+extern struct desc_struct default_ldt[];
+extern void set_intr_gate(unsigned int irq, void * addr);
+extern void set_ldt_desc(unsigned int n, void *addr, unsigned int size);
+extern void set_tss_desc(unsigned int n, void *addr);
+
+static inline void clear_LDT(void)
+{
+	int cpu = smp_processor_id();
+	set_ldt_desc(cpu, &default_ldt[0], 5);
+	__load_LDT(cpu);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT (struct mm_struct *mm)
+{
+	int cpu = smp_processor_id();
+	void *segments = mm->context.segments;
+	int count = LDT_ENTRIES;
+
+	if (!segments) {
+		segments = &default_ldt[0];
+		count = 5;
+	}
+		
+	set_ldt_desc(cpu, segments, count);
+	__load_LDT(cpu);
+}
+
+#endif /* __ARCH_DESCFN_H */
diff -urN v2.4.17/include/asm-i386/mmu_context.h v2.4.17-tr.diff/include/asm-i386/mmu_context.h
--- v2.4.17/include/asm-i386/mmu_context.h	Sat Jan  5 02:58:05 2002
+++ v2.4.17-tr.diff/include/asm-i386/mmu_context.h	Tue Jan  1 21:51:55 2002
@@ -5,6 +5,7 @@
 #include <asm/desc.h>
 #include <asm/atomic.h>
 #include <asm/pgalloc.h>
+#include <asm/descfn.h>
 
 /*
  * possibly do the LDT unload here?
diff -urN v2.4.17/include/asm-i386/processor.h v2.4.17-tr.diff/include/asm-i386/processor.h
--- v2.4.17/include/asm-i386/processor.h	Wed Jan  2 19:11:25 2002
+++ v2.4.17-tr.diff/include/asm-i386/processor.h	Tue Jan  1 21:51:54 2002
@@ -14,6 +14,7 @@
 #include <asm/types.h>
 #include <asm/sigcontext.h>
 #include <asm/cpufeature.h>
+#include <asm/atomic.h>
 #include <linux/cache.h>
 #include <linux/config.h>
 #include <linux/threads.h>
@@ -383,6 +384,16 @@
 	unsigned long	io_bitmap[IO_BITMAP_SIZE+1];
 };
 
+struct task_struct_info
+{
+	void *kstack;
+	atomic_t users;
+};
+
+/* the init task stack is allocated externally */
+#define INIT_TASK_SIZE	(sizeof(struct task_struct) + sizeof(struct task_struct_info))
+extern unsigned long init_task_stack[];
+
 #define INIT_THREAD  {						\
 	0,							\
 	0, 0, 0, 0, 						\
@@ -395,7 +406,7 @@
 
 #define INIT_TSS  {						\
 	0,0, /* back_link, __blh */				\
-	sizeof(init_stack) + (long) &init_stack, /* esp0 */	\
+	0, /* esp0 */ 						\
 	__KERNEL_DS, 0, /* ss0 */				\
 	0,0,0,0,0,0, /* stack1, stack2 */			\
 	0, /* cr3 */						\
@@ -444,16 +455,19 @@
 }
 
 unsigned long get_wchan(struct task_struct *p);
-#define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
-#define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
+#define TSK_TO_KSTACK(tsk) \
+	((unsigned long *) ((struct task_struct_info *)((tsk)+1))->kstack)
+
+#define KSTK_EIP(tsk)	(TSK_TO_KSTACK(tsk)[2043])
+#define KSTK_ESP(tsk)	(TSK_TO_KSTACK(tsk)[2046])
 
 #define THREAD_SIZE (2*PAGE_SIZE)
-#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1))
-#define free_task_struct(p) free_pages((unsigned long) (p), 1)
-#define get_task_struct(tsk)      atomic_inc(&virt_to_page(tsk)->count)
+void init_tsk_allocator(void);
+struct task_struct * alloc_task_struct(void);
+void get_task_struct(struct task_struct *tsk);
+void free_task_struct(struct task_struct *tsk);
 
 #define init_task	(init_task_union.task)
-#define init_stack	(init_task_union.stack)
 
 struct microcode {
 	unsigned int hdrver;
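
The new KSTK_EIP/KSTK_ESP indices are the old ones rebased from the
task page onto the stack base: pt_regs (15 longs on i386, with eip at
slot 10 and esp at slot 13) sits at the very top of the 8K stack.  A
runnable check of the constants:

#include <assert.h>

int main(void)
{
	int stack_longs = 8192 / 4;		/* THREAD_SIZE in longs */
	int regs_base = stack_longs - 15;	/* pt_regs starts at 2033 */

	assert(regs_base + 10 == 2043);		/* KSTK_EIP index */
	assert(regs_base + 13 == 2046);		/* KSTK_ESP index */
	return 0;
}
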
diff -urN v2.4.17/include/asm-i386/smp.h v2.4.17-tr.diff/include/asm-i386/smp.h
--- v2.4.17/include/asm-i386/smp.h	Wed Jan  2 19:11:25 2002
+++ v2.4.17-tr.diff/include/asm-i386/smp.h	Tue Jan  1 21:51:54 2002
@@ -8,6 +8,7 @@
 #include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/ptrace.h>
+#include <asm/desc.h>
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -33,6 +34,7 @@
 #else
 # define INT_DELIVERY_MODE 1     /* logical delivery */
 # define TARGET_CPUS 0x01
+# define smp_per_cpu_data()	per_data(0)
 #endif
 
 #ifndef clustered_apic_mode
@@ -86,10 +88,10 @@
  * the real APIC ID <-> CPU # mapping.
  */
 #define MAX_APICID 256
-extern volatile int cpu_to_physical_apicid[NR_CPUS];
-extern volatile int physical_apicid_to_cpu[MAX_APICID];
-extern volatile int cpu_to_logical_apicid[NR_CPUS];
-extern volatile int logical_apicid_to_cpu[MAX_APICID];
+extern volatile int physical_apicid_to_cpu[];
+extern volatile int cpu_to_physical_apicid[];
+extern volatile int cpu_to_logical_apicid[];
+extern volatile int logical_apicid_to_cpu[];
 
 /*
  * General functions that each host system must provide.
@@ -103,8 +105,37 @@
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
+extern int dummy_cpu_id;
 
-#define smp_processor_id() (current->processor)
+static inline unsigned get_TR(void) __attribute__ ((pure));
+static inline unsigned get_TR(void)
+{
+	unsigned tr;
+	/* The PAIN!  The HORROR!
+	 * Technically this is wrong, wrong, wrong, but 
+	 * gas doesn't know about strl.  *sigh*  Please 
+	 * flog them with a wet noodle repeatedly.
+	 * The extra parameter is a dummy value to prevent
+	 * gcc from assuming that the value is const across
+	 * function calls.  Fun!  -ben
+	 */
+	__asm__ ("str %w0" : "=r" (tr) : "m" (dummy_cpu_id));
+#if defined(CONFIG_X86_STR_NEEDS_MASK)
+	tr &= 0xffff;	/* < PPro leaves the high bits undefined. */
+#endif
+	return tr;
+}
+
+#define smp_processor_id()	( (get_TR() >> 5) - (__FIRST_TSS_ENTRY >> 2) )
+
+/* There is no way to tell gcc that the low bits of get_TR 
+ * are always 0, hence the following macro to produce 
+ * optimal code.  -ben
+ */
+#define smp_per_cpu_data()	\
+	( (struct per_cpu_data *)					\
+	  ( (get_TR() << (LOG2_PER_CPU_SIZE - 5)) + (long)&aligned_data \
+		- (__FIRST_TSS_ENTRY << (LOG2_PER_CPU_SIZE - 2)) ) )
 
 static __inline int hard_smp_processor_id(void)
 {
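
smp_processor_id() simply inverts the TSS selector layout: with
__TSS(n) = (n << 2) + __FIRST_TSS_ENTRY and TR = __TSS(n) << 3, the
expression (TR >> 5) - (__FIRST_TSS_ENTRY >> 2) recovers n exactly,
provided __FIRST_TSS_ENTRY is divisible by 4 (hence the new comment in
desc.h).  A runnable check:

#include <assert.h>

#define __FIRST_TSS_ENTRY 12
#define __TSS(n) (((n) << 2) + __FIRST_TSS_ENTRY)

int main(void)
{
	int n;

	for (n = 0; n < 32; n++) {
		unsigned tr = __TSS(n) << 3;	/* what str returns */
		assert((tr >> 5) - (__FIRST_TSS_ENTRY >> 2) == n);
	}
	return 0;
}
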
diff -urN v2.4.17/include/asm-i386/smpboot.h v2.4.17-tr.diff/include/asm-i386/smpboot.h
--- v2.4.17/include/asm-i386/smpboot.h	Wed Jan  2 19:13:55 2002
+++ v2.4.17-tr.diff/include/asm-i386/smpboot.h	Tue Jan  1 21:55:38 2002
@@ -36,21 +36,21 @@
  * Mappings between logical cpu number and logical / physical apicid
  * The first four macros are trivial, but it keeps the abstraction consistent
  */
-extern volatile int logical_apicid_2_cpu[];
-extern volatile int cpu_2_logical_apicid[];
-extern volatile int physical_apicid_2_cpu[];
-extern volatile int cpu_2_physical_apicid[];
+extern volatile int logical_apicid_to_cpu[];
+extern volatile int cpu_to_logical_apicid[];
+extern volatile int physical_apicid_to_cpu[];
+extern volatile int cpu_to_physical_apicid[];
 
-#define logical_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_logical_apicid(cpu) cpu_2_logical_apicid[cpu]
-#define physical_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_physical_apicid(cpu) cpu_2_physical_apicid[cpu]
+#define logical_apicid_to_cpu(apicid) logical_apicid_to_cpu[apicid]
+#define cpu_to_logical_apicid(cpu) cpu_to_logical_apicid[cpu]
+#define physical_apicid_to_cpu(apicid) physical_apicid_to_cpu[apicid]
+#define cpu_to_physical_apicid(cpu) cpu_to_physical_apicid[cpu]
 #ifdef CONFIG_MULTIQUAD			/* use logical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_logical_apicid[cpu]
+#define boot_apicid_to_cpu(apicid) logical_apicid_to_cpu[apicid]
+#define cpu_to_boot_apicid(cpu) cpu_to_logical_apicid[cpu]
 #else /* !CONFIG_MULTIQUAD */		/* use physical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_physical_apicid[cpu]
+#define boot_apicid_to_cpu(apicid) physical_apicid_to_cpu[apicid]
+#define cpu_to_boot_apicid(cpu) cpu_to_physical_apicid[cpu]
 #endif /* CONFIG_MULTIQUAD */
 
 
diff -urN v2.4.17/include/asm-i386/uaccess.h v2.4.17-tr.diff/include/asm-i386/uaccess.h
--- v2.4.17/include/asm-i386/uaccess.h	Sat Jan  5 02:58:05 2002
+++ v2.4.17-tr.diff/include/asm-i386/uaccess.h	Tue Jan  1 22:20:03 2002
@@ -109,7 +109,7 @@
 #define __get_user_x(size,ret,x,ptr) \
 	__asm__ __volatile__("call __get_user_" #size \
 		:"=a" (ret),"=d" (x) \
-		:"0" (ptr))
+		:"0" (ptr), "1" (current))
 
 /* Careful: we have to cast the result to the type of the pointer for sign reasons */
 #define get_user(x,ptr)							\
diff -urN v2.4.17/include/linux/per_cpu.h v2.4.17-tr.diff/include/linux/per_cpu.h
--- v2.4.17/include/linux/per_cpu.h	Wed Dec 31 19:00:00 1969
+++ v2.4.17-tr.diff/include/linux/per_cpu.h	Tue Jan  1 21:31:34 2002
@@ -0,0 +1,33 @@
+#ifndef __LINUX__PER_CPU__H
+#define __LINUX__PER_CPU__H
+
+#define LOG2_PER_CPU_SIZE	8
+#define PER_CPU_SIZE		(1 << LOG2_PER_CPU_SIZE)
+
+#ifndef __ASSEMBLY__
+#include <asm/timex.h>	/* cycles_t */
+struct task_struct;
+
+struct per_cpu_data {
+	/* Assembly code relies on curr being the first member of this
+	 * structure.  Update that code if the layout is rearranged.
+	 */
+	struct task_struct	*curr;
+	cycles_t		last_schedule;
+};
+
+union aligned_data {
+	struct per_cpu_data	data;
+	char __pad [PER_CPU_SIZE];
+
+	/* Make sure the padding is large enough by forcing an error 
+	 * if it isn't.  -ben
+	 */
+	char __pad2 [PER_CPU_SIZE - sizeof(struct per_cpu_data)];
+};
+
+extern union aligned_data aligned_data[];
+
+#define per_data(nr)	(&aligned_data[nr].data)
+#endif
+#endif
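
The __pad2 member doubles as a compile-time assertion: if struct
per_cpu_data ever outgrows PER_CPU_SIZE, the array length goes
negative and the build breaks.  The same trick in isolation
(SIZE_CHECK is illustrative, not part of the patch):

#define SIZE_CHECK(tag, limit) \
	struct tag##_fits { char pad[(int)((limit) - sizeof(struct tag))]; }

struct small { long a, b, c, d; };
SIZE_CHECK(small, 64);	/* fits: compiles */
/* SIZE_CHECK(small, 8); -> negative array size: compile error */
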
diff -urN v2.4.17/init/main.c v2.4.17-tr.diff/init/main.c
--- v2.4.17/init/main.c	Tue Jan  1 14:09:35 2002
+++ v2.4.17-tr.diff/init/main.c	Tue Jan  1 21:12:26 2002
@@ -548,7 +548,6 @@
  * Interrupts are still disabled. Do necessary setups, then
  * enable them
  */
-	lock_kernel();
 	printk(linux_banner);
 	setup_arch(&command_line);
 	printk("Kernel command line: %s\n", saved_command_line);
@@ -559,6 +558,13 @@
 	softirq_init();
 	time_init();
 
+	/* At the very least, this has to come after trap_init as x86
+	 * needs to perform CPU setup before current is valid.  This 
+	 * should be okay as we're still running with interrupts disabled 
+	 * and no other CPUs are up yet.  -ben
+	 */
+	lock_kernel();
+
 	/*
 	 * HACK ALERT! This is early. We're enabling the console before
 	 * we've done PCI setups etc, and console_init() must be aware of
@@ -596,6 +602,9 @@
 	mempages = num_physpages;
 
 	fork_init(mempages);
+#ifdef __i386__
+	init_tsk_allocator();
+#endif
 	proc_caches_init();
 	vfs_caches_init(mempages);
 	buffer_init(mempages);
diff -urN v2.4.17/kernel/ksyms.c v2.4.17-tr.diff/kernel/ksyms.c
--- v2.4.17/kernel/ksyms.c	Tue Jan  1 14:09:35 2002
+++ v2.4.17-tr.diff/kernel/ksyms.c	Tue Jan  1 21:12:26 2002
@@ -448,6 +448,7 @@
 
 EXPORT_SYMBOL(kstat);
 EXPORT_SYMBOL(nr_running);
+EXPORT_SYMBOL(aligned_data);
 
 /* misc */
 EXPORT_SYMBOL(panic);
diff -urN v2.4.17/kernel/sched.c v2.4.17-tr.diff/kernel/sched.c
--- v2.4.17/kernel/sched.c	Tue Jan  1 14:09:35 2002
+++ v2.4.17-tr.diff/kernel/sched.c	Tue Jan  1 21:38:56 2002
@@ -29,6 +29,7 @@
 #include <linux/completion.h>
 #include <linux/prefetch.h>
 #include <linux/compiler.h>
+#include <linux/per_cpu.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -98,16 +99,10 @@
  * We align per-CPU scheduling data on cacheline boundaries,
  * to prevent cacheline ping-pong.
  */
-static union {
-	struct schedule_data {
-		struct task_struct * curr;
-		cycles_t last_schedule;
-	} schedule_data;
-	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+union aligned_data aligned_data[NR_CPUS] __cacheline_aligned;
 
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define cpu_curr(cpu)		per_data(cpu)->curr
+#define last_schedule(cpu)	per_data(cpu)->last_schedule
 
 struct kernel_stat kstat;
 extern struct task_struct *child_reaper;
@@ -548,7 +543,7 @@
  */
 asmlinkage void schedule(void)
 {
-	struct schedule_data * sched_data;
+	struct per_cpu_data * sched_data;
 	struct task_struct *prev, *next, *p;
 	struct list_head *tmp;
 	int this_cpu, c;
@@ -559,7 +554,7 @@
 	if (!current->active_mm) BUG();
 need_resched_back:
 	prev = current;
-	this_cpu = prev->processor;
+	this_cpu = smp_processor_id();	/* This is better than current->processor on UP */
 
 	if (unlikely(in_interrupt())) {
 		printk("Scheduling in interrupt\n");
@@ -572,7 +567,7 @@
 	 * 'sched_data' is protected by the fact that we can run
 	 * only one process per CPU.
 	 */
-	sched_data = & aligned_data[this_cpu].schedule_data;
+	sched_data = per_data(this_cpu);
 
 	spin_lock_irq(&runqueue_lock);
 
@@ -1048,7 +1043,7 @@
 	// Subtract non-idle processes running on other CPUs.
 	for (i = 0; i < smp_num_cpus; i++) {
 		int cpu = cpu_logical_map(i);
-		if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
+		if (per_data(cpu)->curr != idle_task(cpu))
 			nr_pending--;
 	}
 #else
@@ -1301,17 +1296,18 @@
 
 void __init init_idle(void)
 {
-	struct schedule_data * sched_data;
-	sched_data = &aligned_data[smp_processor_id()].schedule_data;
+	struct per_cpu_data * sched_data;
+	int cpu = smp_processor_id();
+	sched_data = per_data(cpu);
 
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
-			smp_processor_id(), current->pid);
+			cpu, current->pid);
 		del_from_runqueue(current);
 	}
 	sched_data->curr = current;
 	sched_data->last_schedule = get_cycles();
-	clear_bit(current->processor, &wait_init_idle);
+	clear_bit(cpu, &wait_init_idle);
 }
 
 extern void init_timervecs (void);
