diff -urN v2.4.19/AIO-NOTES aio-2.4.19.diff/AIO-NOTES
--- v2.4.19/AIO-NOTES	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/AIO-NOTES	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,3 @@
+- aio context destruction is now synchronous: it waits for all pending
+  ios to complete.  As a result, a task that is exiting will be delayed
+  until its outstanding ios have finished.
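
As a point of reference, here is a minimal user-space sketch of the lifecycle
the note above describes.  It is not part of the patch: the syscall numbers
are the i386 ones this patch adds to sys_call_table (io_setup=245,
io_destroy=246, io_getevents=247, io_submit=248), and aio_context_t,
struct iocb and struct io_event are assumed to be provided by the patch's
aio headers (or a libaio build of them).

    #include <unistd.h>
    #include <string.h>
    #include <fcntl.h>
    #include <sys/syscall.h>
    #include <linux/aio.h>		/* assumed: exports the aio ABI types */

    int main(void)
    {
    	aio_context_t ctx = 0;		/* must be zero before io_setup */
    	struct iocb cb, *cbs[1] = { &cb };
    	struct io_event ev;
    	static char buf[4096];
    	int fd = open("/etc/hostname", O_RDONLY);

    	syscall(245, 32, &ctx);			/* io_setup(32, &ctx) */

    	memset(&cb, 0, sizeof(cb));		/* reserved fields must be 0 */
    	cb.aio_fildes = fd;
    	cb.aio_lio_opcode = IOCB_CMD_PREAD;
    	cb.aio_buf = (unsigned long)buf;
    	cb.aio_nbytes = sizeof(buf);
    	cb.aio_offset = 0;
    	syscall(248, ctx, 1, cbs);		/* io_submit(ctx, 1, cbs) */

    	syscall(247, ctx, 1, 1, &ev, NULL);	/* io_getevents: reap one */

    	/* With this patch, io_destroy() -- and plain process exit -- now
    	 * blocks until any IO still in flight on the context completes. */
    	syscall(246, ctx);			/* io_destroy(ctx) */
    	return 0;
    }
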
diff -urN v2.4.19/MAINTAINERS aio-2.4.19.diff/MAINTAINERS
--- v2.4.19/MAINTAINERS	Fri Aug  9 13:49:02 2002
+++ aio-2.4.19.diff/MAINTAINERS	Mon Sep 16 21:54:13 2002
@@ -228,6 +228,12 @@
 L:	linux-net@vger.kernel.org
 S:	Maintained
 
+ASYNC IO
+P:	Benjamin LaHaise
+M:	bcrl@redhat.com
+L:	linux-aio@kvack.org
+S:	Maintained
+
 AX.25 NETWORK LAYER
 P:	Matthias Welwarsky
 M:	dg2fef@afthd.tu-darmstadt.de
diff -urN v2.4.19/arch/i386/Makefile aio-2.4.19.diff/arch/i386/Makefile
--- v2.4.19/arch/i386/Makefile	Thu May  3 11:22:07 2001
+++ aio-2.4.19.diff/arch/i386/Makefile	Mon Sep 16 21:54:13 2002
@@ -22,6 +22,7 @@
 LINKFLAGS =-T $(TOPDIR)/arch/i386/vmlinux.lds $(LDFLAGS)
 
 CFLAGS += -pipe
+CFLAGS+=-freorder-blocks
 
 # prevent gcc from keeping the stack 16 byte aligned
 CFLAGS += $(shell if $(CC) -mpreferred-stack-boundary=2 -S -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-mpreferred-stack-boundary=2"; fi)
@@ -98,7 +99,7 @@
 DRIVERS += arch/i386/math-emu/math.o
 endif
 
-arch/i386/kernel: dummy
+arch/i386/kernel: dummy include/linux/compile.h
 	$(MAKE) linuxsubdirs SUBDIRS=arch/i386/kernel
 
 arch/i386/mm: dummy
diff -urN v2.4.19/arch/i386/kernel/entry.S aio-2.4.19.diff/arch/i386/kernel/entry.S
--- v2.4.19/arch/i386/kernel/entry.S	Fri Aug  9 13:49:03 2002
+++ aio-2.4.19.diff/arch/i386/kernel/entry.S	Mon Sep 16 21:54:13 2002
@@ -45,6 +45,7 @@
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/smp.h>
+#include <asm/unistd.h>
 
 EBX		= 0x00
 ECX		= 0x04
@@ -639,6 +640,13 @@
 	.long SYMBOL_NAME(sys_ni_syscall)	/* 240 reserved for futex */
 	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for sched_setaffinity */
 	.long SYMBOL_NAME(sys_ni_syscall)	/* reserved for sched_getaffinity */
+	.long SYMBOL_NAME(sys_ni_syscall)       /* reserved for set_thread_area */
+	.long SYMBOL_NAME(sys_ni_syscall)       /* reserved for get_thread_area */
+	.long SYMBOL_NAME(sys_io_setup)         /* 245 */
+	.long SYMBOL_NAME(sys_io_destroy)
+	.long SYMBOL_NAME(sys_io_getevents)
+	.long SYMBOL_NAME(sys_io_submit)
+	.long SYMBOL_NAME(sys_io_cancel)
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long SYMBOL_NAME(sys_ni_syscall)
diff -urN v2.4.19/arch/i386/kernel/irq.c aio-2.4.19.diff/arch/i386/kernel/irq.c
--- v2.4.19/arch/i386/kernel/irq.c	Mon Nov 12 17:49:47 2001
+++ aio-2.4.19.diff/arch/i386/kernel/irq.c	Mon Sep 16 21:54:13 2002
@@ -577,7 +577,17 @@
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
 	unsigned int status;
+	long esp;
 
+	/* Debugging check for stack overflow: is there less than 2KB free? */
+	__asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191));
+	if (esp < (sizeof(struct task_struct) + 2048)) {
+		printk("do_IRQ: stack overflow: %ld\n",
+			esp - sizeof(struct task_struct));
+		__asm__ __volatile__("movl %%esp,%0" : "=r" (esp));
+		show_stack((void *)esp);
+	}
+	
 	kstat.irqs[cpu][irq]++;
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
diff -urN v2.4.19/arch/i386/kernel/semaphore.c aio-2.4.19.diff/arch/i386/kernel/semaphore.c
--- v2.4.19/arch/i386/kernel/semaphore.c	Fri Aug  9 13:49:03 2002
+++ aio-2.4.19.diff/arch/i386/kernel/semaphore.c	Mon Sep 16 21:54:13 2002
@@ -14,6 +14,7 @@
  */
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/worktodo.h>
 #include <asm/semaphore.h>
 
 /*
@@ -54,6 +55,54 @@
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
+void __wtd_down(struct semaphore * sem, struct worktodo *wtd);
+
+void __wtd_down_action(void *data)
+{
+	struct worktodo *wtd = data;
+	struct semaphore *sem;
+
+	wtd_pop(wtd);
+	sem = wtd->data;
+
+	__wtd_down(sem, wtd);
+}
+
+void __wtd_down_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct semaphore *sem = wtd->data;
+
+	__remove_wait_queue(&sem->wait, &wtd->wait);
+	wtd_push(wtd, __wtd_down_action, wtd);
+	wtd_queue(wtd);
+}
+
+void __wtd_down(struct semaphore * sem, struct worktodo *wtd)
+{
+	int gotit;
+	int sleepers;
+
+	init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter);
+	wtd->data = sem;
+
+	spin_lock_irq(&semaphore_lock);
+	sem->sleepers++;
+	sleepers = sem->sleepers;
+	gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait,
+			atomic_add_negative(sleepers - 1, &sem->count));
+	if (gotit)
+		sem->sleepers = 0;
+	else
+		sem->sleepers = 1;
+	spin_unlock_irq(&semaphore_lock);
+
+	if (gotit) {
+		wake_up(&sem->wait);
+		wtd_queue(wtd);
+	}
+}
+
 void __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
@@ -254,6 +303,21 @@
 	"popl %ecx\n\t"
 	"popl %edx\n\t"
 	"popl %eax\n\t"
+	"ret"
+);
+
+asm(
+".text\n"
+".align 4\n"
+".globl __wtd_down_failed\n"
+"__wtd_down_failed:\n\t"
+	"pushl %eax\n\t"
+	"pushl %edx\n\t"
+	"pushl %ecx\n\t"
+	"call __wtd_down\n\t"
+	"popl %ecx\n\t"
+	"popl %edx\n\t"
+	"popl %eax\n\t"
 	"ret"
 );
 
diff -urN v2.4.19/arch/i386/mm/fault.c aio-2.4.19.diff/arch/i386/mm/fault.c
--- v2.4.19/arch/i386/mm/fault.c	Fri Aug  9 13:49:03 2002
+++ aio-2.4.19.diff/arch/i386/mm/fault.c	Mon Sep 16 21:54:13 2002
@@ -27,6 +27,8 @@
 
 extern void die(const char *,struct pt_regs *,long);
 
+spinlock_t oops_lock = SPIN_LOCK_UNLOCKED;
+
 /*
  * Ugly, ugly, but the goto's result in better assembly..
  */
@@ -306,7 +308,7 @@
  * Oops. The kernel tried to access some bad page. We'll have to
  * terminate things with extreme prejudice.
  */
-
+	spin_lock(&oops_lock);
 	bust_spinlocks(1);
 
 	if (address < PAGE_SIZE)
@@ -327,6 +329,7 @@
 	}
 	die("Oops", regs, error_code);
 	bust_spinlocks(0);
+	spin_unlock(&oops_lock);
 	do_exit(SIGKILL);
 
 /*
diff -urN v2.4.19/arch/ia64/kernel/entry.S aio-2.4.19.diff/arch/ia64/kernel/entry.S
--- v2.4.19/arch/ia64/kernel/entry.S	Fri Aug  9 13:49:03 2002
+++ aio-2.4.19.diff/arch/ia64/kernel/entry.S	Mon Sep 16 21:54:13 2002
@@ -1154,11 +1154,11 @@
 	data8 ia64_ni_syscall			// 1235
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
-	data8 ia64_ni_syscall
-	data8 ia64_ni_syscall
-	data8 ia64_ni_syscall			// 1240
-	data8 ia64_ni_syscall
-	data8 ia64_ni_syscall
+	data8 sys_io_setup
+	data8 sys_io_destroy
+	data8 sys_io_getevents			// 1240
+	data8 sys_io_submit
+	data8 sys_io_cancel
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall			// 1245
diff -urN v2.4.19/arch/ia64/kernel/semaphore.c aio-2.4.19.diff/arch/ia64/kernel/semaphore.c
--- v2.4.19/arch/ia64/kernel/semaphore.c	Thu May  3 11:22:08 2001
+++ aio-2.4.19.diff/arch/ia64/kernel/semaphore.c	Mon Sep 16 21:54:13 2002
@@ -24,7 +24,7 @@
  * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
  */
 #include <linux/sched.h>
-
+#include <linux/worktodo.h>
 #include <asm/semaphore.h>
 
 /*
@@ -45,6 +45,70 @@
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
+void __wtd_down(struct semaphore * sem, struct worktodo *wtd);
+
+void __wtd_down_action(void *data)
+{
+	struct worktodo *wtd = data;
+	struct semaphore *sem;
+
+	wtd_pop(wtd);
+	sem = wtd->data;
+
+	__wtd_down(sem, wtd);
+}
+
+void __wtd_down_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct semaphore *sem = wtd->data;
+
+	__remove_wait_queue(&sem->wait, &wtd->wait);
+	wtd_push(wtd, __wtd_down_action, wtd);
+	wtd_queue(wtd);
+}
+
+void __wtd_down(struct semaphore * sem, struct worktodo *wtd)
+{
+	int gotit;
+	int sleepers;
+
+	init_waitqueue_func_entry(&wtd->wait, __wtd_down_waiter);
+	wtd->data = sem;
+
+	spin_lock_irq(&semaphore_lock);
+	sem->sleepers++;
+	sleepers = sem->sleepers;
+	gotit = add_wait_queue_exclusive_cond(&sem->wait, &wtd->wait,
+			atomic_add_negative(sleepers - 1, &sem->count));
+	if (gotit)
+		sem->sleepers = 0;
+	else
+		sem->sleepers = 1;
+	spin_unlock_irq(&semaphore_lock);
+
+	if (gotit) {
+		wake_up(&sem->wait);
+		wtd_queue(wtd);
+	}
+}
+
+/* Returns 0 if we acquired the semaphore, 1 if it was queued. */
+int wtd_down(struct worktodo *wtd, struct semaphore *sem)
+{
+#if WAITQUEUE_DEBUG
+	CHECK_MAGIC(sem->__magic);
+#endif
+	if (atomic_dec_return(&sem->count) < 0) {
+		__wtd_down(sem, wtd);
+		return 1;
+	}
+	else {
+		return 0;
+	}
+}
+
+
 void
 __down (struct semaphore *sem)
 {
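
Below is a rough sketch of how a kernel caller might use wtd_down().  It is
not part of the patch and assumes only the worktodo behaviour visible above:
wtd_push() installs (fn, data) as the worktodo's current action while saving
the previous one, wtd_pop() undoes that, and wtd_queue() later runs the
current action from process context.  do_the_work() and struct my_request
are hypothetical.

    #include <linux/worktodo.h>
    #include <asm/semaphore.h>

    struct my_request {
    	struct worktodo		wtd;	/* assumed initialised per worktodo.h */
    	struct semaphore	*sem;
    };

    static void my_request_locked(void *data)
    {
    	struct my_request *req = data;

    	/* Runs once req->sem has been acquired on our behalf. */
    	do_the_work(req);			/* hypothetical */
    	up(req->sem);
    }

    static void my_request_start(struct my_request *req)
    {
    	/* Stack the continuation first: the wakeup could fire as soon as
    	 * wtd_down() queues us. */
    	wtd_push(&req->wtd, my_request_locked, req);

    	if (!wtd_down(&req->wtd, req->sem)) {
    		/* Uncontended: we already hold the semaphore, so undo the
    		 * push and run the continuation directly. */
    		wtd_pop(&req->wtd);
    		my_request_locked(req);
    	}
    	/* Otherwise my_request_locked() is run later by the worktodo
    	 * machinery, as arranged by __wtd_down_waiter() above. */
    }
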
diff -urN v2.4.19/drivers/block/loop.c aio-2.4.19.diff/drivers/block/loop.c
--- v2.4.19/drivers/block/loop.c	Fri Aug  9 13:49:22 2002
+++ aio-2.4.19.diff/drivers/block/loop.c	Mon Sep 16 21:54:13 2002
@@ -283,7 +283,7 @@
 	spin_lock_irq(&lo->lo_lock);
 	file = lo->lo_backing_file;
 	spin_unlock_irq(&lo->lo_lock);
-	do_generic_file_read(file, &pos, &desc, lo_read_actor);
+	do_generic_file_read(file, &pos, &desc, lo_read_actor, 0);
 	return desc.error;
 }
 
diff -urN v2.4.19/drivers/char/raw.c aio-2.4.19.diff/drivers/char/raw.c
--- v2.4.19/drivers/char/raw.c	Fri Aug  9 13:49:27 2002
+++ aio-2.4.19.diff/drivers/char/raw.c	Mon Sep 16 21:54:13 2002
@@ -16,6 +16,8 @@
 #include <linux/capability.h>
 #include <linux/smp_lock.h>
 #include <asm/uaccess.h>
+#include <linux/kiovec.h>
+#include <linux/slab.h>
 
 #define dprintk(x...) 
 
@@ -35,6 +37,9 @@
 int	raw_release(struct inode *, struct file *);
 int	raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
 int	raw_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
+int	raw_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos);
+int	raw_kvec_write(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos);
+
 
 
 static struct file_operations raw_fops = {
@@ -43,6 +48,10 @@
 	open:		raw_open,
 	release:	raw_release,
 	ioctl:		raw_ioctl,
+	aio_read:	generic_file_aio_read,
+	aio_write:	generic_file_aio_write,
+	kvec_read:	raw_kvec_read,
+	kvec_write:	raw_kvec_write,
 };
 
 static struct file_operations raw_ctl_fops = {
@@ -271,7 +280,6 @@
 }
 
 
-
 ssize_t	raw_read(struct file *filp, char * buf, 
 		 size_t size, loff_t *offp)
 {
@@ -402,3 +410,99 @@
  out:	
 	return err;
 }
+
+static int raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos);
+int raw_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return raw_kvec_rw(file, READ, cb, size, pos);
+}
+
+int raw_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return raw_kvec_rw(file, WRITE, cb, size, pos);
+}
+
+int	raw_kvec_rw(struct file *filp, int rw, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	int		err;
+	unsigned	minor;
+	kdev_t		dev;
+	unsigned long	limit, blocknr, blocks;
+
+	unsigned	sector_size, sector_bits, sector_mask;
+	unsigned	max_sectors;
+	unsigned	i;
+
+	pr_debug("raw_kvec_rw: %p %d %d %p %d %d %Lu\n", filp, rw, nr, kiovec, flags, size, pos);
+	/*
+	 * First, a few checks on device size limits 
+	 */
+
+	minor = MINOR(filp->f_dentry->d_inode->i_rdev);
+	dev = to_kdev_t(raw_devices[minor].binding->bd_dev);
+	sector_size = raw_devices[minor].sector_size;
+	sector_bits = raw_devices[minor].sector_bits;
+	sector_mask = sector_size- 1;
+	max_sectors = 25000; //KIO_MAX_SECTORS >> (sector_bits - 9);
+	
+	if (blk_size[MAJOR(dev)])
+		limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits;
+	else
+		limit = INT_MAX;
+	pr_debug ("raw_kvec_rw: dev %d:%d (+%d)\n",
+		 MAJOR(dev), MINOR(dev), limit);
+
+	/* EOF at the end */
+	err = 0;
+	if (!size || (pos >> sector_bits) == limit) {
+		pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits);
+		cb.fn(cb.data, cb.vec, err);
+		return 0;
+	}
+
+	/* ENXIO for io beyond the end */
+	err = -ENXIO;
+	if ((pos >> sector_bits) >= limit) {
+		pr_debug("raw_kvec_rw: %Lu > %lu, %d\n", pos >> sector_bits, limit, sector_bits);
+		goto out;
+	}
+
+	err = -EINVAL;
+	if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
+		pr_debug("pos(%Ld)/size(%lu) wrong(%d)\n", pos, size, sector_mask);
+		goto out;
+	}
+
+	/* Verify that the scatter-gather list is sector aligned. */
+	for (i=0; i<cb.vec->nr; i++)
+		if ((cb.vec->veclet[i].offset & sector_mask) ||
+		    (cb.vec->veclet[i].length & sector_mask)) {
+			pr_debug("veclet offset/length wrong");
+			goto out;
+		}
+
+	/*
+	 * Split the IO into KIO_MAX_SECTORS chunks, mapping and
+	 * unmapping the single kiobuf as we go to perform each chunk of
+	 * IO.  
+	 */
+
+	blocknr = pos >> sector_bits;
+	blocks = size >> sector_bits;
+	if (blocks > max_sectors)
+		blocks = max_sectors;
+	if (blocks > limit - blocknr)
+		blocks = limit - blocknr;
+	err = -ENXIO;
+	if (!blocks) {
+		pr_debug("raw: !blocks %d %ld %ld\n", max_sectors, limit, blocknr);
+		goto out;
+	}
+
+	err = brw_kvec_async(rw, cb, dev, blocks, blocknr, sector_bits);
+out:
+	if (err)
+		printk(KERN_DEBUG "raw_kvec_rw: ret is %d\n", err);
+	return err;
+}
+
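
The kvec_cb_t contract relied on here deserves a note: brw_kvec_async()
(added to fs/buffer.c later in this patch) invokes cb.fn(cb.data, cb.vec, res)
exactly once, where res is the number of bytes transferred or a negative
errno, and the call may arrive from interrupt context.  A minimal completion
callback might look like the sketch below; it is not part of the patch, and
struct my_io, io->rw and complete_my_io() are hypothetical stand-ins.

    #include <linux/fs.h>		/* READ/WRITE */
    #include <linux/kiovec.h>		/* struct kvec, unmap_kvec, free_kvec */

    static void my_kvec_complete(void *data, struct kvec *vec, ssize_t res)
    {
    	struct my_io *io = data;	/* hypothetical per-IO state */

    	/* Drop the page pins taken by map_user_kvec(); passing 1 marks the
    	 * pages dirty, which is wanted when the device wrote into them on
    	 * a READ (same pattern as generic_aio_complete_rw in fs/aio.c). */
    	unmap_kvec(vec, io->rw == READ);
    	free_kvec(vec);

    	io->result = res;		/* byte count or negative errno */
    	complete_my_io(io);		/* hypothetical: wake the submitter */
    }
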
diff -urN v2.4.19/fs/Makefile aio-2.4.19.diff/fs/Makefile
--- v2.4.19/fs/Makefile	Thu Mar  7 16:40:03 2002
+++ aio-2.4.19.diff/fs/Makefile	Mon Sep 16 21:54:13 2002
@@ -22,6 +22,9 @@
 obj-y += noquot.o
 endif
 
+obj-y += aio.o
+export-objs += aio.o
+
 subdir-$(CONFIG_PROC_FS)	+= proc
 subdir-y			+= partitions
 
diff -urN v2.4.19/fs/aio.c aio-2.4.19.diff/fs/aio.c
--- v2.4.19/fs/aio.c	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/fs/aio.c	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,1387 @@
+/* fs/aio.c
+ *	An async IO implementation for Linux
+ *	Written by Benjamin LaHaise <bcrl@redhat.com>
+ *
+ *	Implements an efficient asynchronous io interface.
+ *
+ *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License along
+ *   with this program; if not, write to the Free Software Foundation, Inc.,
+ *   59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+//#define DEBUG 1
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/vmalloc.h>
+#include <linux/iobuf.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/brlock.h>
+#include <linux/aio.h>
+#include <linux/smp_lock.h>
+#include <linux/compiler.h>
+#include <linux/poll.h>
+#include <linux/brlock.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+
+#if DEBUG > 1
+#define dprintk		printk
+#else
+#define dprintk(x...)	do { ; } while (0)
+#endif
+
+/*------ sysctl variables----*/
+unsigned aio_nr;		/* current system wide number of aio requests */
+unsigned aio_max_nr = 0x10000;	/* system wide maximum number of aio requests */
+unsigned aio_max_size = 0x20000;	/* 128KB per chunk */
+unsigned aio_max_pinned;		/* set to mem/4 in aio_setup */
+/*----end sysctl variables---*/
+
+static kmem_cache_t	*kiocb_cachep;
+static kmem_cache_t	*kioctx_cachep;
+
+/* tunable.  Needs to be added to sysctl. */
+int max_aio_reqs = 0x10000;
+
+/* Used for rare fput completion. */
+static void aio_fput_routine(void *);
+static struct tq_struct	fput_tqueue = {
+	routine:	aio_fput_routine,
+};
+
+static spinlock_t	fput_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(fput_head);
+
+/* forward prototypes */
+static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res);
+static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res);
+
+/* aio_setup
+ *	Creates the slab caches used by the aio routines, panic on
+ *	failure as this is done early during the boot sequence.
+ */
+static int __init aio_setup(void)
+{
+	kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
+				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!kiocb_cachep)
+		panic("unable to create kiocb cache\n");
+
+	kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
+				0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!kioctx_cachep)
+		panic("unable to create kioctx cache");
+
+	aio_max_pinned = num_physpages/4;
+
+	printk(KERN_NOTICE "aio_setup: num_physpages = %u\n", aio_max_pinned);
+	printk(KERN_NOTICE "aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+
+	return 0;
+}
+
+static void ioctx_free_reqs(struct kioctx *ctx)
+{
+	struct list_head *pos, *next;
+	list_for_each_safe(pos, next, &ctx->free_reqs) {
+		struct kiocb *iocb = list_kiocb(pos);
+		list_del(&iocb->list);
+		kmem_cache_free(kiocb_cachep, iocb);
+	}
+}
+
+static void aio_free_ring(struct kioctx *ctx)
+{
+	struct aio_ring_info *info = &ctx->ring_info;
+
+	if (info->kvec) {
+		unmap_kvec(info->kvec, 1);
+		free_kvec(info->kvec);
+	}
+
+	if (info->mmap_size) {
+		down_write(&ctx->mm->mmap_sem);
+		do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
+		up_write(&ctx->mm->mmap_sem);
+	}
+
+	if (info->ring_pages && info->ring_pages != info->internal_pages)
+		kfree(info->ring_pages);
+	info->ring_pages = NULL;
+	info->nr = 0;
+}
+
+static int aio_setup_ring(struct kioctx *ctx)
+{
+	struct aio_ring *ring;
+	struct aio_ring_info *info = &ctx->ring_info;
+	unsigned nr_reqs = ctx->max_reqs;
+	unsigned long size;
+	int nr_pages, i;
+
+	/* Compensate for the ring buffer's head/tail overlap entry */
+	nr_reqs += 2;	/* 1 is required, 2 for good luck */
+
+	size = sizeof(struct aio_ring);
+	size += sizeof(struct io_event) * nr_reqs;
+	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+	if (nr_pages < 0)
+		return -EINVAL;
+
+	info->nr_pages = nr_pages;
+
+	nr_reqs = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+
+	info->nr = 0;
+	info->ring_pages = info->internal_pages;
+	if (nr_pages > AIO_RING_PAGES) {
+		info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+		if (!info->ring_pages)
+			return -ENOMEM;
+		memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
+	}
+
+	info->mmap_size = nr_pages * PAGE_SIZE;
+	dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
+	down_write(&ctx->mm->mmap_sem);
+	info->mmap_base = do_mmap(NULL, 0, info->mmap_size, 
+				  PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE,
+				  0);
+	up_write(&ctx->mm->mmap_sem);
+	if (IS_ERR((void *)info->mmap_base)) {
+		printk("mmap err: %ld\n", -info->mmap_base);
+		info->mmap_size = 0;
+		aio_free_ring(ctx);
+		return -EAGAIN;
+	}
+	dprintk("mmap address: 0x%08lx\n", info->mmap_base);
+	info->kvec = map_user_kvec(READ, info->mmap_base, info->mmap_size);
+	if (unlikely(IS_ERR(info->kvec))) {
+		info->kvec = NULL;
+		aio_free_ring(ctx);
+		return -EAGAIN;
+	}
+
+	if (unlikely(info->kvec->nr != nr_pages))
+		BUG();
+
+	for (i=0; i<nr_pages; i++) {
+		if (unlikely(info->kvec->veclet[i].offset))
+			BUG();
+		info->ring_pages[i] = info->kvec->veclet[i].page;
+		//printk("[%d] %p -> %p\n", i, info->kvec->veclet[i].page,
+		//	info->pages[i]);
+	}
+
+
+	ctx->user_id = info->mmap_base;
+
+	info->nr = nr_reqs;		/* trusted copy */
+
+	ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+	ring->nr = nr_reqs;	/* user copy */
+	ring->id = ctx->user_id;
+	kunmap_atomic(ring, KM_USER0);
+
+	return 0;
+}
+
+/* aio_ring_event: returns a pointer to the event at the given index from
+ * kmap_atomic(, km).  Release the pointer with put_aio_ring_event();
+ */
+static inline struct io_event *aio_ring_event(struct aio_ring_info *info, int nr, enum km_type km)
+{
+	struct io_event *events;
+#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
+#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
+
+	if (nr < AIO_EVENTS_FIRST_PAGE) {
+		struct aio_ring *ring;
+		ring = kmap_atomic(info->ring_pages[0], km);
+		return &ring->io_events[nr];
+	}
+	nr -= AIO_EVENTS_FIRST_PAGE;
+
+	events = kmap_atomic(info->ring_pages[1 + nr / AIO_EVENTS_PER_PAGE], km);
+
+	return events + (nr % AIO_EVENTS_PER_PAGE);
+}
+
+static inline void put_aio_ring_event(struct io_event *event, enum km_type km)
+{
+	void *p = (void *)((unsigned long)event & PAGE_MASK);
+	kunmap_atomic(p, km);
+}
+
+/* ioctx_alloc
+ *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
+ */
+static struct kioctx *ioctx_alloc(unsigned nr_reqs)
+{
+	struct kioctx *ctx;
+	unsigned i;
+
+	/* Prevent overflows */
+	if ((nr_reqs > (0x10000000U / sizeof(struct io_event))) ||
+	    (nr_reqs > (0x10000000U / sizeof(struct kiocb)))) {
+		pr_debug("ENOMEM: nr_reqs too high\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (nr_reqs > aio_max_nr)
+		return ERR_PTR(-EAGAIN);
+
+	ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->max_reqs = nr_reqs;
+	ctx->mm = current->mm;
+	atomic_inc(&ctx->mm->mm_count);
+
+	atomic_set(&ctx->users, 1);
+	spin_lock_init(&ctx->lock);
+	spin_lock_init(&ctx->ring_info.ring_lock);
+	init_waitqueue_head(&ctx->wait);
+
+	INIT_LIST_HEAD(&ctx->free_reqs);
+	INIT_LIST_HEAD(&ctx->active_reqs);
+	//ctx->user_id = ++current->mm->new_ioctx_id;
+
+	if (aio_setup_ring(ctx) < 0)
+		goto out_freectx;
+
+	/* Allocate nr_reqs iocbs for io.  Free iocbs are on the 
+	 * ctx->free_reqs list.  When active they migrate to the 
+	 * active_reqs list.  During completion and cancellation 
+	 * the request may temporarily not be on any list.
+	 */
+	for (i=0; i<nr_reqs; i++) {
+		struct kiocb *iocb = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
+		if (!iocb)
+			goto out_freering;
+		memset(iocb, 0, sizeof(*iocb));
+		iocb->key = i;
+		iocb->users = 0;
+		list_add(&iocb->list, &ctx->free_reqs);
+	}
+
+	/* now link into global list.  kludge.  FIXME */
+	br_write_lock(BR_AIO_REQ_LOCK);			
+	if (unlikely(aio_nr + ctx->max_reqs > aio_max_nr))
+		goto out_cleanup;
+	aio_nr += ctx->max_reqs;	/* undone by __put_ioctx */
+	ctx->next = current->mm->ioctx_list;
+	current->mm->ioctx_list = ctx;
+	br_write_unlock(BR_AIO_REQ_LOCK);
+
+	dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+		ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
+	return ctx;
+
+out_cleanup:
+	br_write_unlock(BR_AIO_REQ_LOCK);
+	ctx->max_reqs = 0;	/* prevent __put_ioctx from sub'ing aio_nr */
+	__put_ioctx(ctx);
+	return ERR_PTR(-EAGAIN);
+
+out_freering:
+	aio_free_ring(ctx);
+	ioctx_free_reqs(ctx);
+out_freectx:
+	kmem_cache_free(kioctx_cachep, ctx);
+	ctx = ERR_PTR(-ENOMEM);
+
+	dprintk("aio: error allocating ioctx %p\n", ctx);
+	return ctx;
+}
+
+/* aio_cancel_all
+ *	Cancels all outstanding aio requests on an aio context.  Used 
+ *	when the processes owning a context have all exited to encourage 
+ *	the rapid destruction of the kioctx.
+ */
+static void aio_cancel_all(struct kioctx *ctx)
+{
+	int (*cancel)(struct kiocb *);
+	spin_lock_irq(&ctx->lock);
+	ctx->dead = 1;
+	while (!list_empty(&ctx->active_reqs)) {
+		struct list_head *pos = ctx->active_reqs.next;
+		struct kiocb *iocb = list_kiocb(pos);
+		list_del_init(&iocb->list);
+		cancel = iocb->cancel;
+		if (cancel)
+			iocb->users++;
+		spin_unlock_irq(&ctx->lock);
+		if (cancel)
+			cancel(iocb);
+		spin_lock_irq(&ctx->lock);
+	}
+	spin_unlock_irq(&ctx->lock);
+}
+
+void wait_for_all_aios(struct kioctx *ctx)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	if (!ctx->reqs_active)
+		return;
+
+	add_wait_queue(&ctx->wait, &wait);
+	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+	while (ctx->reqs_active) {
+		printk("ctx->reqs_active = %d\n", ctx->reqs_active);
+		schedule();
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+	}
+	set_task_state(tsk, TASK_RUNNING);
+	remove_wait_queue(&ctx->wait, &wait);
+}
+
+/* exit_aio: called when the last user of mm goes away.  At this point, 
+ * there is no way for any new requests to be submitted or any of the 
+ * io_* syscalls to be called on the context.  However, there may be 
+ * outstanding requests which hold references to the context; as they 
+ * go away, they will call put_ioctx and release any pinned memory
+ * associated with the request (held via struct page * references).
+ */
+void exit_aio(struct mm_struct *mm)
+{
+	struct kioctx *ctx = mm->ioctx_list;
+	mm->ioctx_list = NULL;
+	while (ctx) {
+		struct kioctx *next = ctx->next;
+		ctx->next = NULL;
+		aio_cancel_all(ctx);
+
+		wait_for_all_aios(ctx);
+
+		if (1 != atomic_read(&ctx->users))
+			printk(KERN_DEBUG
+				"exit_aio:ioctx still alive: %d %d %d\n",
+				atomic_read(&ctx->users), ctx->dead,
+				ctx->reqs_active);
+		put_ioctx(ctx);
+		ctx = next;
+	}
+}
+
+/* __put_ioctx
+ *	Called when the last user of an aio context has gone away,
+ *	and the struct needs to be freed.
+ */
+void __put_ioctx(struct kioctx *ctx)
+{
+	unsigned nr_reqs = ctx->max_reqs;
+
+	if (unlikely(ctx->reqs_active))
+		BUG();
+
+	aio_free_ring(ctx);
+	mmdrop(ctx->mm);
+	ctx->mm = NULL;
+	pr_debug("__put_ioctx: freeing %p\n", ctx);
+	ioctx_free_reqs(ctx);
+	kmem_cache_free(kioctx_cachep, ctx);
+
+	br_write_lock(BR_AIO_REQ_LOCK);
+	aio_nr -= nr_reqs;
+	br_write_unlock(BR_AIO_REQ_LOCK);
+}
+
+/* aio_get_req
+ *	Allocate a slot for an aio request.  Increments the users count
+ * of the kioctx so that the kioctx stays around until all requests are
+ * complete.  Returns -EAGAIN if no requests are free.
+ */
+static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx));
+static struct kiocb *__aio_get_req(struct kioctx *ctx)
+{
+	struct kiocb *req = NULL;
+	struct aio_ring *ring;
+
+	/* Use cmpxchg instead of spin_lock? */
+	spin_lock_irq(&ctx->lock);
+	ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
+	if (likely(!list_empty(&ctx->free_reqs) &&
+	    (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)))) {
+		req = list_kiocb(ctx->free_reqs.next);
+		list_del(&req->list);
+		list_add(&req->list, &ctx->active_reqs);
+		ctx->reqs_active++;
+		req->user_obj = NULL;
+		get_ioctx(ctx);
+
+		if (unlikely(req->ctx != NULL))
+			BUG();
+		req->ctx = ctx;
+		if (unlikely(req->users))
+			BUG();
+		req->users = 1;
+	}
+	kunmap_atomic(ring, KM_USER0);
+	spin_unlock_irq(&ctx->lock);
+
+	return req;
+}
+
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+{
+	struct kiocb *req;
+	/* Handle a potential starvation case -- should be exceedingly rare as 
+	 * requests will be stuck on fput_head only if the aio_fput_routine is 
+	 * delayed and the requests were the last user of the struct file.
+	 */
+	req = __aio_get_req(ctx);
+	if (unlikely(NULL == req)) {
+		aio_fput_routine(NULL);
+		req = __aio_get_req(ctx);
+	}
+	return req;
+}
+
+static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+{
+	req->ctx = NULL;
+	req->filp = NULL;
+	req->user_obj = NULL;
+	ctx->reqs_active--;
+	list_add(&req->list, &ctx->free_reqs);
+
+	if (unlikely(!ctx->reqs_active && ctx->dead))
+		wake_up(&ctx->wait);
+}
+
+static void aio_fput_routine(void *data)
+{
+	spin_lock_irq(&fput_lock);
+	while (likely(!list_empty(&fput_head))) {
+		struct kiocb *req = list_kiocb(fput_head.next);
+		struct kioctx *ctx = req->ctx;
+
+		list_del(&req->list);
+		spin_unlock_irq(&fput_lock);
+
+		/* Complete the fput */
+		__fput(req->filp);
+
+		/* Link the iocb into the context's free list */
+		spin_lock_irq(&ctx->lock);
+		really_put_req(ctx, req);
+		spin_unlock_irq(&ctx->lock);
+
+		put_ioctx(ctx);
+		spin_lock_irq(&fput_lock);
+	}
+	spin_unlock_irq(&fput_lock);
+}
+
+/* __aio_put_req
+ *	Returns true if this put was the last user of the request.
+ */
+static inline int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+{
+	dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
+		req, atomic_read(&req->filp->f_count));
+
+	req->users --;
+	if (unlikely(req->users < 0))
+		BUG();
+	if (likely(req->users))
+		return 0;
+	list_del(&req->list);		/* remove from active_reqs */
+	req->cancel = NULL;
+
+	/* Must be done under the lock to serialise against cancellation.
+	 * Call this aio_fput as it duplicates fput via the fput_tqueue.
+	 */
+	if (unlikely(atomic_dec_and_test(&req->filp->f_count))) {
+		get_ioctx(ctx);
+		spin_lock(&fput_lock);
+		list_add(&req->list, &fput_head);
+		spin_unlock(&fput_lock);
+		schedule_task(&fput_tqueue);
+	} else
+		really_put_req(ctx, req);
+	return 1;
+}
+
+/* aio_put_req
+ *	Returns true if this put was the last user of the kiocb,
+ *	false if the request is still in use.
+ */
+int aio_put_req(struct kiocb *req)
+{
+	struct kioctx *ctx = req->ctx;
+	int ret;
+	spin_lock_irq(&ctx->lock);
+	ret = __aio_put_req(ctx, req);
+	spin_unlock_irq(&ctx->lock);
+	if (ret)
+		put_ioctx(ctx);
+	return ret;
+}
+
+/*	Lookup an ioctx id.  ioctx_list is lockless for reads.
+ *	FIXME: this is O(n) and is only suitable for development.
+ */
+static inline struct kioctx *lookup_ioctx(unsigned long ctx_id)
+{
+	struct kioctx *ioctx;
+	struct mm_struct *mm;
+
+	br_read_lock(BR_AIO_REQ_LOCK);
+	mm = current->mm;
+	for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
+		if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
+			get_ioctx(ioctx);
+			break;
+		}
+	br_read_unlock(BR_AIO_REQ_LOCK);
+
+	return ioctx;
+}
+
+/* aio_complete
+ *	Called when the io request on the given iocb is complete.
+ *	Returns true if this is the last user of the request.  The 
+ *	only other user of the request can be the cancellation code.
+ */
+int aio_complete(struct kiocb *iocb, long res, long res2)
+{
+	struct kioctx	*ctx = iocb->ctx;
+	struct aio_ring_info	*info = &ctx->ring_info;
+	struct aio_ring	*ring;
+	struct io_event	*event;
+	unsigned long	flags;
+	unsigned long	tail;
+	int		ret;
+
+	/* add a completion event to the ring buffer.
+	 * must be done holding ctx->lock to prevent
+	 * other code from messing with the tail
+	 * pointer since we might be called from irq
+	 * context.
+	 */
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+	tail = info->tail;
+	event = aio_ring_event(info, tail, KM_IRQ0);
+	tail = (tail + 1) % info->nr;
+
+	event->obj = (u64)(unsigned long)iocb->user_obj;
+	event->data = iocb->user_data;
+	event->res = res;
+	event->res2 = res2;
+
+	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
+		ctx, tail, iocb, iocb->user_obj, iocb->user_data, res, res2);
+
+	/* after flagging the request as done, we
+	 * must never even look at it again
+	 */
+	barrier();
+
+	info->tail = tail;
+	ring->tail = tail;
+
+	wmb();
+	if (!ring->woke)
+		ring->woke = 1;
+
+	put_aio_ring_event(event, KM_IRQ0);
+	kunmap_atomic(ring, KM_IRQ1);
+
+	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+
+	/* everything turned out well, dispose of the aiocb. */
+	ret = __aio_put_req(ctx, iocb);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+
+	if (ret)
+		put_ioctx(ctx);
+
+	return ret;
+}
+
+/* aio_read_evt
+ *	Pull an event off of the ioctx's event ring.  Returns the number of 
+ *	events fetched (0 or 1 ;-)
+ *	FIXME: make this use cmpxchg.
+ *	TODO: make the ringbuffer user mmap()able (requires FIXME).
+ */
+static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+{
+	struct aio_ring_info *info = &ioctx->ring_info;
+	struct aio_ring *ring;
+	unsigned long head;
+	int ret = 0;
+
+	ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+	dprintk("in aio_read_evt h%lu t%lu m%lu\n",
+		 (unsigned long)ring->head, (unsigned long)ring->tail,
+		 (unsigned long)ring->nr);
+	barrier();
+	if (ring->head == ring->tail)
+		goto out;
+
+	spin_lock(&info->ring_lock);
+
+	head = ring->head % info->nr;
+	if (head != ring->tail) {
+		struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+		*ent = *evp;
+		head = (head + 1) % info->nr;
+		barrier();
+		ring->head = head;
+		ret = 1;
+		put_aio_ring_event(evp, KM_USER1);
+	}
+	spin_unlock(&info->ring_lock);
+
+out:
+	kunmap_atomic(ring, KM_USER0);
+	dprintk("leaving aio_read_evt: %d  h%lu t%lu\n", ret,
+		 (unsigned long)ring->head, (unsigned long)ring->tail);
+	return ret;
+}
+
+struct timeout {
+	struct timer_list	timer;
+	int			timed_out;
+	struct task_struct	*p;
+};
+
+static void timeout_func(unsigned long data)
+{
+	struct timeout *to = (struct timeout *)data;
+
+	to->timed_out = 1;
+	wake_up_process(to->p);
+}
+
+static inline void init_timeout(struct timeout *to)
+{
+	init_timer(&to->timer);
+	to->timer.data = (unsigned long)to;
+	to->timer.function = timeout_func;
+	to->timed_out = 0;
+	to->p = current;
+}
+
+static inline void set_timeout(struct timeout *to, const struct timespec *ts)
+{
+	unsigned long how_long;
+
+	if (!ts->tv_sec && !ts->tv_nsec) {
+		to->timed_out = 1;
+		return;
+	}
+
+	how_long = ts->tv_sec * HZ;
+#define HZ_NS (1000000000 / HZ)
+	how_long += (ts->tv_nsec + HZ_NS - 1) / HZ_NS;
+	
+	to->timer.expires = jiffies + how_long;
+	add_timer(&to->timer);
+}
+
+static inline void update_ts(struct timespec *ts, long jiffies)
+{
+	struct timespec tmp;
+	jiffies_to_timespec(jiffies, &tmp);
+	ts->tv_sec -= tmp.tv_sec;
+	ts->tv_nsec -= tmp.tv_nsec;
+	if (ts->tv_nsec < 0) {
+		ts->tv_nsec += 1000000000;
+		ts->tv_sec -= 1;
+	}
+	if (ts->tv_sec < 0)
+		ts->tv_sec = ts->tv_nsec = 0;
+}
+
+static inline void clear_timeout(struct timeout *to)
+{
+	del_timer_sync(&to->timer);
+}
+
+static int read_events(struct kioctx *ctx,
+			long min_nr, long nr, 
+			struct io_event *event,
+			struct timespec *timeout)
+{
+	long			start_jiffies = jiffies;
+	struct task_struct	*tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	int			ret;
+	int			i = 0;
+	struct io_event		ent;
+	struct timeout		to;
+	struct timespec		ts;
+
+	/* needed to zero any padding within an entry (there shouldn't be 
+	 * any, but C is fun!)
+	 */
+	memset(&ent, 0, sizeof(ent));
+	ret = 0;
+
+	while (likely(i < nr)) {
+		ret = aio_read_evt(ctx, &ent);
+		if (unlikely(ret <= 0))
+			break;
+
+		dprintk("read event: %Lx %Lx %Lx %Lx\n",
+			ent.data, ent.obj, ent.res, ent.res2);
+
+		/* FIXME: split checks in two */
+		ret = -EFAULT;
+		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
+			dprintk("aio: lost an event due to EFAULT.\n");
+			break;
+		}
+		ret = 0;
+
+		/* Good, event copied to userland, update counts. */
+		event ++;
+		i ++;
+	}
+
+	if (i)
+		return i;
+	if (ret)
+		return ret;
+
+	/* End fast path */
+
+	init_timeout(&to);
+	if (timeout) {
+		ret = -EFAULT;
+		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+			goto out;
+
+		set_timeout(&to, &ts);
+		if (to.timed_out) {
+			timeout = 0;
+			clear_timeout(&to);
+		}
+	}
+
+	while (likely(i < nr)) {
+		add_wait_queue_exclusive_lifo(&ctx->wait, &wait);
+		do {
+			set_task_state(tsk, TASK_INTERRUPTIBLE);
+
+			ret = aio_read_evt(ctx, &ent);
+			if (ret)
+				break;
+			if (i)
+				break;
+			ret = 0;
+			if (to.timed_out)	/* Only check after read evt */
+				break;
+			schedule();
+			if (signal_pending(tsk)) {
+				ret = -EINTR;
+				break;
+			}
+			/*ret = aio_read_evt(ctx, &ent);*/
+		} while (1) ;
+
+		set_task_state(tsk, TASK_RUNNING);
+		remove_wait_queue(&ctx->wait, &wait);
+
+		if (unlikely(ret <= 0))
+			break;
+
+		ret = -EFAULT;
+		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
+			dprintk("aio: lost an event due to EFAULT.\n");
+			break;
+		}
+
+		/* Good, event copied to userland, update counts. */
+		event ++;
+		i ++;
+	}
+
+	if (timeout) {
+		clear_timeout(&to);
+		update_ts(&ts, jiffies - start_jiffies);
+		if (copy_to_user(timeout, &ts, sizeof(ts)))
+			ret = -EFAULT;
+	}
+out:
+	return i ? i : ret;
+}
+
+/* Take an ioctx and remove it from the list of ioctx's.  Protects 
+ * against races with itself via ->dead.
+ */
+static void io_destroy(struct kioctx *ioctx)
+{
+	struct kioctx **tmp;
+	int was_dead;
+
+	/* delete the entry from the list if someone else hasn't already */
+	br_write_lock(BR_AIO_REQ_LOCK);
+	was_dead = ioctx->dead;
+	ioctx->dead = 1;
+	for (tmp = &current->mm->ioctx_list; *tmp && *tmp != ioctx;
+	     tmp = &(*tmp)->next)
+		;
+	if (*tmp)
+		*tmp = ioctx->next;
+	br_write_unlock(BR_AIO_REQ_LOCK);
+
+	dprintk("aio_release(%p)\n", ioctx);
+	if (likely(!was_dead))
+		put_ioctx(ioctx);	/* twice for the list */
+
+	aio_cancel_all(ioctx);
+	wait_for_all_aios(ioctx);
+	put_ioctx(ioctx);	/* once for the lookup */
+}
+
+asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t *ctxp)
+{
+	struct kioctx *ioctx = NULL;
+	unsigned long ctx;
+	long ret;
+
+	ret = get_user(ctx, ctxp);
+	if (unlikely(ret))
+		goto out;
+
+	ret = -EINVAL;
+	if (unlikely(ctx || !nr_reqs || (int)nr_reqs < 0)) {
+		pr_debug("EINVAL: io_setup: ctx or nr_reqs > max\n");
+		goto out;
+	}
+
+	ret = -EAGAIN;
+	if (unlikely(nr_reqs > max_aio_reqs))
+		goto out;
+
+	ioctx = ioctx_alloc(nr_reqs);
+	ret = PTR_ERR(ioctx);
+	if (!IS_ERR(ioctx)) {
+		ret = put_user(ioctx->user_id, ctxp);
+		if (!ret)
+			return 0;
+		io_destroy(ioctx);
+	}
+
+out:
+	return ret;
+}
+
+/* aio_release
+ *	Release the kioctx associated with the userspace handle.
+ */
+asmlinkage long sys_io_destroy(aio_context_t ctx)
+{
+	struct kioctx *ioctx = lookup_ioctx(ctx);
+	if (likely(NULL != ioctx)) {
+		io_destroy(ioctx);
+		return 0;
+	}
+	pr_debug("EINVAL: io_destroy: invalid context id\n");
+	return -EINVAL;
+}
+
+ssize_t generic_aio_poll(struct file *file, struct kiocb *req, struct iocb *iocb)
+{
+	unsigned events = iocb->aio_buf;
+
+	/* Did the user set any bits they weren't supposed to?  (The
+	 * assignment above is actually a truncating cast.)
+	 */
+	if (unlikely(events != iocb->aio_buf))
+		return -EINVAL;
+	
+	return async_poll(req, events);
+}
+
+/* sys_io_submit
+ *	Copy an aiocb from userspace into kernel space, then convert it to
+ *	a kiocb, submit and repeat until done.  Error codes on copy/submit
+ *	only get returned for the first aiocb copied as otherwise the size
+ *	of aiocbs copied is returned (standard write semantics).
+ */
+asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+	struct kioctx *ctx;
+	long ret = 0;
+	int i;
+
+	if (unlikely(nr < 0))
+		return -EINVAL;
+
+	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
+		return -EFAULT;
+
+	ctx = lookup_ioctx(ctx_id);
+	if (unlikely(!ctx)) {
+		pr_debug("EINVAL: io_submit: invalid context id\n");
+		return -EINVAL;
+	}
+
+	for (i=0; i<nr; i++) {
+		ssize_t (*op)(struct file *, struct kiocb *, struct iocb *);
+		struct iocb *iocbp, tmp;
+		struct kiocb *req;
+		struct file *file;
+
+		if (unlikely(__get_user(iocbp, iocbpp + i))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (unlikely(copy_from_user(&tmp, iocbp, sizeof(tmp)))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/* enforce forwards compatibility on users */
+		if (unlikely(tmp.aio_reserved1 || tmp.aio_reserved2 ||
+			     tmp.aio_reserved3)) {
+			pr_debug("EINVAL: io_submit: reserve field set\n");
+			ret = -EINVAL;
+			break;
+		}
+
+		/* prevent overflows */
+		if (unlikely(
+		    (tmp.aio_buf != (unsigned long)tmp.aio_buf) ||
+		    (tmp.aio_nbytes != (size_t)tmp.aio_nbytes) ||
+		    ((ssize_t)tmp.aio_nbytes < 0)
+		   )) {
+			pr_debug("EINVAL: io_submit: overflow check\n");
+			break;
+		}
+
+		file = fget(tmp.aio_fildes);
+		if (unlikely(!file)) {
+			ret = -EBADF;
+			break;
+		}
+
+		req = aio_get_req(ctx);
+		if (unlikely(!req)) {
+			fput(file);
+			ret = -EAGAIN;
+			break;
+		}
+
+		req->filp = file;
+		tmp.aio_key = req->key;
+		ret = put_user(tmp.aio_key, &iocbp->aio_key);
+		if (unlikely(ret)) {
+			dprintk("EFAULT: aio_key\n");
+			goto out_put_req;
+		}
+
+		req->user_obj = iocbp;
+		req->user_data = tmp.aio_data;
+		req->buf = tmp.aio_buf;
+		req->pos = tmp.aio_offset;
+		req->size = tmp.aio_nbytes;
+		req->nr_transferred = 0;
+		req->rlim_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur;
+
+		ret = -EBADF;
+		if (IOCB_CMD_PREAD == tmp.aio_lio_opcode) {
+			op = file->f_op->aio_read;
+			if (unlikely(!(file->f_mode & FMODE_READ)))
+				goto out_put_req;
+		} else if (IOCB_CMD_PREADX == tmp.aio_lio_opcode) {
+			op = file->f_op->aio_readx;
+			if (unlikely(!(file->f_mode & FMODE_READ)))
+				goto out_put_req;
+		} else if (IOCB_CMD_PWRITE == tmp.aio_lio_opcode) {
+			op = file->f_op->aio_write;
+			if (unlikely(!(file->f_mode & FMODE_WRITE)))
+				goto out_put_req;
+		} else if (IOCB_CMD_FSYNC == tmp.aio_lio_opcode) {
+			op = file->f_op->aio_fsync;
+		} else if (IOCB_CMD_POLL == tmp.aio_lio_opcode) {
+			op = generic_aio_poll;
+		} else
+			op = NULL;
+
+		if (unlikely(!op)) {
+			printk("EINVAL: io_submit: no operation provided\n");
+			ret = -EINVAL;
+			goto out_put_req;
+		}
+
+		ret = op(file, req, &tmp);
+		if (likely(!ret))
+			continue;
+
+		pr_debug("io_submit: op returned %ld\n", ret);
+		aio_complete(req, ret, 0);
+		ret = 0;	/* A completion event was sent, so 
+				 * submit is a success. */
+		continue;
+
+	out_put_req:
+		aio_put_req(req);
+		break;
+	}
+
+	put_ioctx(ctx);
+	//run_task_queue(&tq_disk);
+	return i ? i : ret;
+}
+
+static void generic_aio_next_chunk(void *_iocb)
+{
+	int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t);
+	struct kiocb *iocb = _iocb;
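+	/* generic_aio_complete_rw() stashed the IO direction (READ/WRITE)
+	 * in ->this_size before queueing this continuation; it is reset
+	 * to a byte count just below. */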
+	int rw = iocb->this_size;
+	unsigned long buf = iocb->buf;
+	unsigned long old_fsize;
+	kvec_cb_t cb;
+	ssize_t res;
+
+	iocb->this_size = iocb->size - iocb->nr_transferred;
+	if (iocb->this_size > aio_max_size)
+		iocb->this_size = aio_max_size;
+
+	buf += iocb->nr_transferred;
+	cb.vec = mm_map_user_kvec(iocb->ctx->mm, rw, buf, iocb->this_size);
+	cb.fn = (rw == READ) ? generic_aio_complete_read
+			     : generic_aio_complete_write;
+	cb.data = iocb;
+
+	dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec);
+	if (unlikely(IS_ERR(cb.vec)))
+		goto done;
+
+	old_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	current->rlim[RLIMIT_FSIZE].rlim_cur = iocb->rlim_fsize;
+	kvec_op = (rw == READ) ? iocb->filp->f_op->kvec_read
+			       : iocb->filp->f_op->kvec_write;
+	dprintk("submit: %d %d %d\n", iocb->this_size, iocb->nr_transferred, iocb->size);
+	res = kvec_op(iocb->filp, cb, iocb->this_size,
+		      iocb->pos + iocb->nr_transferred);
+	current->rlim[RLIMIT_FSIZE].rlim_cur = old_fsize;
+	if (!res) {
+		dprintk("submit okay\n");
+		return;
+	}
+	dprintk("submit failed: %d\n", res);
+	
+	cb.fn(cb.data, cb.vec, res);
+	return;
+
+done:
+	if (unlikely(!iocb->nr_transferred))
+		BUG();
+	aio_complete(iocb, iocb->nr_transferred, 0);
+}
+
+static void generic_aio_complete_rw(int rw, void *_iocb, struct kvec *vec, ssize_t res)
+{
+	struct kiocb *iocb = _iocb;
+
+	unmap_kvec(vec, rw == READ);
+	free_kvec(vec);
+
+	if (res > 0)
+		iocb->nr_transferred += res;
+
+	/* Was this chunk successful?  Is there more left to transfer? */
+	if (res == iocb->this_size && iocb->nr_transferred < iocb->size) {
+		/* We may be in irq context, so queue processing in 
+		 * process context.
+		 */
+		iocb->this_size = rw;
+		INIT_TQUEUE(&iocb->u.tq, generic_aio_next_chunk, iocb);
+		schedule_task(&iocb->u.tq);
+		return;
+	}
+
+	aio_complete(iocb, iocb->nr_transferred ? iocb->nr_transferred : res,
+		     0);
+}
+
+static void generic_aio_complete_read(void *_iocb, struct kvec *vec, ssize_t res)
+{
+	generic_aio_complete_rw(READ, _iocb, vec, res);
+}
+
+static void generic_aio_complete_write(void *_iocb, struct kvec *vec, ssize_t res)
+{
+	generic_aio_complete_rw(WRITE, _iocb, vec, res);
+}
+
+ssize_t generic_aio_rw(int rw, struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size)
+{
+	int (*kvec_op)(struct file *, kvec_cb_t, size_t, loff_t);
+	unsigned long buf = iocb->aio_buf;
+	size_t size = iocb->aio_nbytes;
+	size_t	nr_read = 0;
+	loff_t pos = iocb->aio_offset;
+	kvec_cb_t cb;
+	ssize_t res;
+
+#if 0
+	if (likely(NULL != file->f_op->new_read)) {
+		nr_read = file->f_op->new_read(file, (void *)buf, size,
+					       &pos, F_ATOMIC);
+		dprintk("from new_read: nr_read: %ld\n", (long)nr_read);
+		if ((-EAGAIN == nr_read) || (-EWOULDBLOCKIO == nr_read))
+			nr_read = 0;
+		else if ((nr_read >= min_size) || (nr_read < 0)) {
+			dprintk("returning nr_read: %ld\n", (long)nr_read);
+			return nr_read;
+		}
+	}
+	dprintk("nr_read: %ld\n", (long)nr_read);
+#endif
+
+	req->nr_transferred = nr_read;
+	size -= nr_read;
+	if (size > aio_max_size)
+		/* We have to split up the request.  Pin the mm
+		 * struct for further use with map_user_kvec later.
+		 */
+		size = aio_max_size;
+	else
+		req->buf = 0;
+
+	req->this_size = size;
+
+	buf += nr_read;
+	cb.vec = map_user_kvec(rw, buf, size);
+	cb.fn = (rw == READ) ? generic_aio_complete_read
+			     : generic_aio_complete_write;
+	cb.data = req;
+
+	dprintk("generic_aio_rw: cb.vec=%p\n", cb.vec);
+	if (IS_ERR(cb.vec))
+		return nr_read ? nr_read : PTR_ERR(cb.vec);
+
+	kvec_op = (rw == READ) ? file->f_op->kvec_read : file->f_op->kvec_write;
+
+	res = kvec_op(file, cb, size, pos);
+	if (unlikely(res != 0)) {
+		/* If the first chunk was successful, we have to run
+		 * the callback to attempt the rest of the io.
+		 */
+		if (res == size && req->buf) {
+			cb.fn(cb.data, cb.vec, res);
+			return 0;
+		}
+
+		unmap_kvec(cb.vec, rw == READ);
+		free_kvec(cb.vec);
+		if (nr_read) {
+			if (res < 0)
+				res = 0;
+			res += nr_read;
+		}
+	}
+	return res;
+}
+
+ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb)
+{
+	return generic_aio_rw(READ, file, req, iocb, iocb->aio_nbytes);  
+}
+
+ssize_t generic_sock_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb)
+{
+	return generic_aio_rw(READ, file, req, iocb, 1);
+}
+
+ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size)
+{
+	return generic_aio_rw(WRITE, file, req, iocb, 1);
+#if 0
+	unsigned long buf = iocb.aio_buf;
+	size_t size = iocb.aio_nbytes;
+	loff_t pos = iocb.aio_offset;
+	ssize_t	nr_written = 0;
+	kvec_cb_t cb;
+	long res;
+#if 0
+	if (likely(NULL != file->f_op->new_write)) {
+		nr_written = file->f_op->new_write(file, (void *)buf, size,
+					       &pos, F_ATOMIC);
+		pr_debug("generic_aio_write: new_write: %ld\n", (long)nr_written);
+		if (-EAGAIN == nr_written)
+			nr_written = 0;
+		if ((nr_written >= min_size) || (nr_written < 0))
+			return nr_written;
+	}
+#endif
+
+	req->nr_transferred = nr_written;
+	size -= nr_written;
+	if (size > aio_max_size)
+		size = aio_max_size;
+	req->this_size = size;
+	buf += nr_written;
+	cb.vec = map_user_kvec(WRITE, buf, size);
+	cb.fn = generic_aio_complete_write;
+	cb.data = req;
+
+	if (IS_ERR(cb.vec)) {
+		pr_debug("generic_aio_write: map_user_kvec: %ld\n", PTR_ERR(cb.vec));
+		return nr_written ? nr_written : PTR_ERR(cb.vec);
+	}
+
+	res = file->f_op->kvec_write(file, cb, size, iocb.aio_offset);
+	pr_debug("generic_aio_write: kvec_write: %ld\n", res);
+	if (unlikely(res != 0)) {
+		unmap_kvec(cb.vec, 0);
+		free_kvec(cb.vec);
+		if (nr_written) {
+			if (res < 0)
+				res = 0;
+			res += nr_written;
+		}
+	}
+	return res;
+#endif
+}
+
+ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb)
+{
+	return generic_aio_write(file, req, iocb, iocb->aio_nbytes);	
+}
+
+/* lookup_kiocb
+ *	Finds a given iocb for cancellation.
+ *	MUST be called with ctx->lock held.
+ */
+struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb *iocb, u32 key)
+{
+	struct list_head *pos;
+	/* TODO: use a hash or array, this sucks. */
+	list_for_each(pos, &ctx->active_reqs) {
+		struct kiocb *kiocb = list_kiocb(pos);
+		if (kiocb->user_obj == iocb && kiocb->key == key)
+			return kiocb;
+	}
+	return NULL;
+}
+
+asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb)
+{
+	int (*cancel)(struct kiocb *iocb);
+	struct kioctx *ctx;
+	struct kiocb *kiocb;
+	u32 key;
+	int ret;
+
+	ret = get_user(key, &iocb->aio_key);
+	if (unlikely(ret))
+		return ret;
+
+	ctx = lookup_ioctx(ctx_id);
+	if (unlikely(!ctx))
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->lock);
+	ret = -EAGAIN;
+	kiocb = lookup_kiocb(ctx, iocb, key);
+	if (kiocb && kiocb->cancel) {
+		cancel = kiocb->cancel;
+		kiocb->users ++;
+	} else
+		cancel = NULL;
+	spin_unlock_irq(&ctx->lock);
+
+	if (NULL != cancel) {
+		printk("calling cancel\n");
+		ret = cancel(kiocb);
+	} else
+		printk("iocb has no cancel operation\n");
+
+	put_ioctx(ctx);
+
+	return ret;
+}
+
+asmlinkage long sys_io_wait(aio_context_t ctx_id, struct iocb *iocb, const struct timespec *timeout)
+{
+#if 0	/* FIXME.  later. */
+	struct kioctx *ioctx;
+	long ret = -EINVAL;
+	unsigned key;
+	long obj = (long)iocb;
+
+	ioctx = lookup_ioctx(ctx_id);
+	if (!ioctx)
+		goto out;
+
+	ret = get_user(key, &iocb->aio_key);
+	if (ret)
+		goto out;
+
+	ret = __aio_complete(ioctx, key, obj, !!timeout);
+	put_ioctx(ioctx);
+
+out:
+	return ret;
+#endif
+	return -ENOSYS;
+}
+
+asmlinkage long sys_io_getevents(aio_context_t ctx_id,
+				 long min_nr,
+				 long nr,
+				 struct io_event *events,
+				 struct timespec *timeout)
+{
+	struct kioctx *ioctx = lookup_ioctx(ctx_id);
+	long ret = -EINVAL;
+
+	if (likely(NULL != ioctx)) {
+		ret = read_events(ioctx, min_nr, nr, events, timeout);
+		put_ioctx(ioctx);
+	}
+
+	return ret;
+}
+
+__initcall(aio_setup);
+
+EXPORT_SYMBOL_GPL(generic_file_kvec_read);
+EXPORT_SYMBOL_GPL(generic_file_aio_read);
+EXPORT_SYMBOL_GPL(generic_file_kvec_write);
+EXPORT_SYMBOL_GPL(generic_file_aio_write);
+EXPORT_SYMBOL_GPL(generic_file_new_read);
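
To restate the contract sys_io_submit() places on the aio_read/aio_write file
operations: the op receives the kiocb and the user's iocb, returns 0 once the
request has been queued, and must eventually call aio_complete(req, result, 0);
any non-zero return is turned into a completion event by sys_io_submit()
itself.  A driver method might therefore look like the sketch below, which is
not from the patch; mydev_queue_read() and MYDEV_MAX_IO are invented names.

    static ssize_t mydev_aio_read(struct file *file, struct kiocb *req,
    				  struct iocb *iocb)
    {
    	struct mydev *dev = file->private_data;		/* hypothetical */

    	if (iocb->aio_nbytes > MYDEV_MAX_IO)		/* hypothetical limit */
    		return -EINVAL;		/* sys_io_submit() completes the
    					 * iocb with this error for us */

    	/* Queue the transfer; the device's completion handler later calls
    	 *	aio_complete(req, bytes_transferred, 0);
    	 * which posts the event into the context's ring buffer. */
    	return mydev_queue_read(dev, req,
    				(char *)(unsigned long)iocb->aio_buf,
    				iocb->aio_nbytes, iocb->aio_offset);
    }
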
diff -urN v2.4.19/fs/buffer.c aio-2.4.19.diff/fs/buffer.c
--- v2.4.19/fs/buffer.c	Fri Aug  9 13:50:13 2002
+++ aio-2.4.19.diff/fs/buffer.c	Mon Sep 16 21:54:13 2002
@@ -3014,3 +3014,220 @@
 
 module_init(bdflush_init)
 
+/* async kio interface */
+struct brw_cb {
+	kvec_cb_t		cb;
+	atomic_t		io_count;
+	int			nr;
+	struct buffer_head	*bh[1];
+};
+
+static inline void brw_cb_put(struct brw_cb *brw_cb)
+{
+	if (atomic_dec_and_test(&brw_cb->io_count)) {
+		ssize_t res = 0, err = 0;
+		int nr;
+
+		/* Walk the buffer heads associated with this request,
+		 * checking for errors and freeing them as we go.
+		 */
+		for (nr=0; nr < brw_cb->nr; nr++) {
+			struct buffer_head *bh = brw_cb->bh[nr];
+			if (!err && buffer_uptodate(bh))
+				res += bh->b_size;
+			else
+				err = -EIO;
+			kmem_cache_free(bh_cachep, bh);
+		}
+
+		if (!res)
+			res = err;
+
+		brw_cb->cb.fn(brw_cb->cb.data, brw_cb->cb.vec, res);
+
+		kfree(brw_cb);
+	}
+}
+
+/*
+ * IO completion routine for a buffer_head being used for kvec IO: we
+ * can't dispatch the kvec callback until io_count reaches 0.  
+ */
+
+static void end_buffer_io_kiobuf_async(struct buffer_head *bh, int uptodate)
+{
+	struct brw_cb *brw_cb;
+	
+	mark_buffer_uptodate(bh, uptodate);
+
+	brw_cb = bh->b_private;
+	unlock_buffer(bh);
+
+	brw_cb_put(brw_cb);
+}
+
+
+/*
+ * Start I/O on a physical range of kernel memory, defined by a kvec
+ * (much like a user-space iovec list).
+ *
+ * The pages referenced by the kvec must already be mapped and pinned
+ * (e.g. via map_user_kvec).  IO is submitted asynchronously; completion
+ * is reported through the kvec_cb_t callback once every block has
+ * finished.
+ *
+ * It is up to the caller to make sure that there are enough blocks
+ * passed in to completely map the vector to disk.
+ */
+
+int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift)
+{
+	struct kvec	*vec = cb.vec;
+	struct kveclet	*veclet;
+	int		err;
+	int		length;
+	unsigned	sector_size = 1 << sector_shift;
+	int		i;
+
+	struct brw_cb	*brw_cb;
+
+	if (!vec->nr)
+		BUG();
+
+	/* 
+	 * First, do some alignment and validity checks 
+	 */
+	length = 0;
+	for (veclet=vec->veclet, i=0; i < vec->nr; i++,veclet++) {
+		length += veclet->length;
+		if ((veclet->offset & (sector_size-1)) ||
+		    (veclet->length & (sector_size-1))) {
+			printk("brw_kiovec_async: tuple[%d]->offset=0x%x length=0x%x sector_size: 0x%x\n", i, veclet->offset, veclet->length, sector_size);
+			return -EINVAL;
+		}
+	}
+
+	if (length < (blocks << sector_shift))
+		BUG();
+
+	/* 
+	 * OK to walk down the iovec doing page IO on each page we find. 
+	 */
+	err = 0;
+
+	if (!blocks) {
+		printk("brw_kiovec_async: !i\n");
+		return -EINVAL;
+	}
+
+	/* FIXME: tie into userbeans here */
+	brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL);
+	if (!brw_cb)
+		return -ENOMEM;
+
+	brw_cb->cb = cb;
+	brw_cb->nr = 0;
+
+	/* This is ugly.  FIXME. */
+	for (i=0, veclet=vec->veclet; i<vec->nr; i++,veclet++) {
+		struct page *page = veclet->page;
+		unsigned offset = veclet->offset;
+		unsigned length = veclet->length;
+
+		if (!page)
+			BUG();
+
+		while (length > 0) {
+			struct buffer_head *tmp;
+			tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO);
+			err = -ENOMEM;
+			if (!tmp)
+				goto error;
+
+			tmp->b_dev = B_FREE;
+			tmp->b_size = sector_size;
+			set_bh_page(tmp, page, offset);
+			tmp->b_this_page = tmp;
+
+			init_buffer(tmp, end_buffer_io_kiobuf_async, NULL);
+			tmp->b_dev = dev;
+			tmp->b_blocknr = blknr++;
+			tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock)
+					| (1 << BH_Req);
+			tmp->b_private = brw_cb;
+
+			if (rw == WRITE) {
+				set_bit(BH_Uptodate, &tmp->b_state);
+				clear_bit(BH_Dirty, &tmp->b_state);
+			}
+
+			brw_cb->bh[brw_cb->nr++] = tmp;
+			length -= sector_size;
+			offset += sector_size;
+
+			if (offset >= PAGE_SIZE) {
+				offset = 0;
+				break;
+			}
+
+			if (brw_cb->nr >= blocks)
+				goto submit;
+		} /* End of block loop */
+	} /* End of page loop */		
+
+submit:
+	atomic_set(&brw_cb->io_count, brw_cb->nr+1);
+	/* okay, we've setup all our io requests, now fire them off! */
+	for (i=0; i<brw_cb->nr; i++) 
+		submit_bh(rw, brw_cb->bh[i]);
+	brw_cb_put(brw_cb);
+	run_task_queue(&tq_disk);
+	return 0;
+
+error:
+	if (brw_cb) {
+		/* We ran out of memory allocating buffer_heads.  Free the
+		   ones allocated so far and give up. */
+		for (i=0; i<brw_cb->nr; i++)
+			kmem_cache_free(bh_cachep, brw_cb->bh[i]);
+		kfree(brw_cb);
+	}
+
+	return err;
+}
+#if 0
+int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
+		kdev_t dev, int nr_blocks, unsigned long b[], int sector_size)
+{
+	int i;
+	int transferred = 0;
+	int err = 0;
+
+	if (!nr)
+		return 0;
+
+	/* queue up and trigger the io */
+	err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size);
+	if (err)
+		goto out;
+
+	/* wait on the last iovec first -- it's more likely to finish last */
+	for (i=nr; --i >= 0; )
+		kiobuf_wait_for_io(iovec[i]);
+
+	run_task_queue(&tq_disk);
+
+	/* okay, how much data actually got through? */
+	for (i=0; i<nr; i++) {
+		if (iovec[i]->errno) {
+			if (!err)
+				err = iovec[i]->errno;
+			break;
+		}
+		transferred += iovec[i]->length;
+	}
+
+out:
+	return transferred ? transferred : err;
+}
+#endif
diff -urN v2.4.19/fs/exec.c aio-2.4.19.diff/fs/exec.c
--- v2.4.19/fs/exec.c	Fri Aug  9 13:50:13 2002
+++ aio-2.4.19.diff/fs/exec.c	Mon Sep 16 21:54:13 2002
@@ -397,6 +397,7 @@
 	old_mm = current->mm;
 	if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
 		mm_release();
+		exit_aio(old_mm);
 		exit_mmap(old_mm);
 		return 0;
 	}
diff -urN v2.4.19/fs/ext2/file.c aio-2.4.19.diff/fs/ext2/file.c
--- v2.4.19/fs/ext2/file.c	Thu Nov  1 16:40:02 2001
+++ aio-2.4.19.diff/fs/ext2/file.c	Mon Sep 16 21:54:13 2002
@@ -40,6 +40,8 @@
  */
 struct file_operations ext2_file_operations = {
 	llseek:		generic_file_llseek,
+	kvec_read:	generic_file_kvec_read,
+	kvec_write:	generic_file_kvec_write,
 	read:		generic_file_read,
 	write:		generic_file_write,
 	ioctl:		ext2_ioctl,
@@ -47,6 +49,8 @@
 	open:		generic_file_open,
 	release:	ext2_release_file,
 	fsync:		ext2_sync_file,
+	aio_read:	generic_file_aio_read,
+	aio_write:	generic_file_aio_write,
 };
 
 struct inode_operations ext2_file_inode_operations = {
diff -urN v2.4.19/fs/ext3/file.c aio-2.4.19.diff/fs/ext3/file.c
--- v2.4.19/fs/ext3/file.c	Mon Nov 26 23:43:08 2001
+++ aio-2.4.19.diff/fs/ext3/file.c	Mon Sep 16 21:54:13 2002
@@ -78,6 +78,8 @@
 
 struct file_operations ext3_file_operations = {
 	llseek:		generic_file_llseek,	/* BKL held */
+	kvec_read:	generic_file_kvec_read,
+	kvec_write:	generic_file_kvec_write,	/* FIXME: attributes */
 	read:		generic_file_read,	/* BKL not held.  Don't need */
 	write:		ext3_file_write,	/* BKL not held.  Don't need */
 	ioctl:		ext3_ioctl,		/* BKL held */
@@ -85,6 +87,8 @@
 	open:		ext3_open_file,		/* BKL not held.  Don't need */
 	release:	ext3_release_file,	/* BKL not held.  Don't need */
 	fsync:		ext3_sync_file,		/* BKL held */
+	aio_read:	generic_file_aio_read,
+	aio_write:	generic_file_aio_write,
 };
 
 struct inode_operations ext3_file_inode_operations = {
diff -urN v2.4.19/fs/file_table.c aio-2.4.19.diff/fs/file_table.c
--- v2.4.19/fs/file_table.c	Mon Sep 24 02:16:04 2001
+++ aio-2.4.19.diff/fs/file_table.c	Mon Sep 16 21:54:13 2002
@@ -99,31 +99,35 @@
 
 void fput(struct file * file)
 {
+	if (atomic_dec_and_test(&file->f_count))
+		__fput(file);
+}
+
+void __fput(struct file * file)
+{
 	struct dentry * dentry = file->f_dentry;
 	struct vfsmount * mnt = file->f_vfsmnt;
 	struct inode * inode = dentry->d_inode;
 
-	if (atomic_dec_and_test(&file->f_count)) {
-		locks_remove_flock(file);
+	locks_remove_flock(file);
 
-		if (file->f_iobuf)
-			free_kiovec(1, &file->f_iobuf);
+	if (file->f_iobuf)
+		free_kiovec(1, &file->f_iobuf);
 
-		if (file->f_op && file->f_op->release)
-			file->f_op->release(inode, file);
-		fops_put(file->f_op);
-		if (file->f_mode & FMODE_WRITE)
-			put_write_access(inode);
-		file_list_lock();
-		file->f_dentry = NULL;
-		file->f_vfsmnt = NULL;
-		list_del(&file->f_list);
-		list_add(&file->f_list, &free_list);
-		files_stat.nr_free_files++;
-		file_list_unlock();
-		dput(dentry);
-		mntput(mnt);
-	}
+	if (file->f_op && file->f_op->release)
+		file->f_op->release(inode, file);
+	fops_put(file->f_op);
+	if (file->f_mode & FMODE_WRITE)
+		put_write_access(inode);
+	file_list_lock();
+	file->f_dentry = NULL;
+	file->f_vfsmnt = NULL;
+	list_del(&file->f_list);
+	list_add(&file->f_list, &free_list);
+	files_stat.nr_free_files++;
+	file_list_unlock();
+	dput(dentry);
+	mntput(mnt);
 }
 
 struct file * fget(unsigned int fd)
diff -urN v2.4.19/fs/locks.c aio-2.4.19.diff/fs/locks.c
--- v2.4.19/fs/locks.c	Thu Nov  1 16:40:02 2001
+++ aio-2.4.19.diff/fs/locks.c	Mon Sep 16 21:54:13 2002
@@ -440,7 +440,7 @@
 	while (!list_empty(&blocker->fl_block)) {
 		struct file_lock *waiter = list_entry(blocker->fl_block.next, struct file_lock, fl_block);
 
-		if (wait) {
+		if (0) {
 			locks_notify_blocked(waiter);
 			/* Let the blocked process remove waiter from the
 			 * block list when it gets scheduled.
diff -urN v2.4.19/fs/nfs/file.c aio-2.4.19.diff/fs/nfs/file.c
--- v2.4.19/fs/nfs/file.c	Thu Mar  7 16:40:04 2002
+++ aio-2.4.19.diff/fs/nfs/file.c	Mon Sep 16 21:54:13 2002
@@ -39,9 +39,13 @@
 static ssize_t nfs_file_write(struct file *, const char *, size_t, loff_t *);
 static int  nfs_file_flush(struct file *);
 static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
+static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos);
+static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos);
 
 struct file_operations nfs_file_operations = {
 	llseek:		generic_file_llseek,
+	kvec_read:	nfs_kvec_read,
+	kvec_write:	nfs_kvec_write,
 	read:		nfs_file_read,
 	write:		nfs_file_write,
 	mmap:		nfs_file_mmap,
@@ -50,6 +54,8 @@
 	release:	nfs_release,
 	fsync:		nfs_fsync,
 	lock:		nfs_lock,
+	aio_read:	generic_file_aio_read,
+	aio_write:	generic_file_aio_write,
 };
 
 struct inode_operations nfs_file_inode_operations = {
@@ -88,6 +94,28 @@
 	return status;
 }
 
+static int nfs_kvec_write(struct file *file, kvec_cb_t cb, size_t count, loff_t pos)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct inode * inode = dentry->d_inode;
+	int ret;
+	ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (!ret)
+		return generic_file_kvec_write(file, cb, count, pos);
+	return ret;
+}
+
+static int nfs_kvec_read(struct file *file, kvec_cb_t cb, size_t count, loff_t pos)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct inode * inode = dentry->d_inode;
+	int ret;
+	ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (!ret)
+		return generic_file_kvec_read(file, cb, count, pos);
+	return ret;
+}
+
 static ssize_t
 nfs_file_read(struct file * file, char * buf, size_t count, loff_t *ppos)
 {
diff -urN v2.4.19/fs/pipe.c aio-2.4.19.diff/fs/pipe.c
--- v2.4.19/fs/pipe.c	Fri Aug  9 13:50:14 2002
+++ aio-2.4.19.diff/fs/pipe.c	Mon Sep 16 21:54:13 2002
@@ -134,31 +134,235 @@
 	return ret;
 }
 
+static int pipe_kvec_read(struct file *filp, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return 0;
+}
+
+static int pipe_aio_read_cancel(struct kiocb *iocb)
+{
+	struct inode *inode = iocb->filp->f_dentry->d_inode;
+	struct pipe_inode_info *pipe = inode->i_pipe;
+	struct list_head *pos;
+	int found = 0;
+
+	pr_debug("cancelling aio pipe read(%p)\n", iocb);
+
+	/* To cancel an aio, we must first prevent writers from
+	 * removing it from the list.  We must block here as the 
+	 * cancellation may be from the process exit path.
+	 */
+	down(PIPE_SEM(*inode));
+
+	pr_debug("got semaphore\n");
+	spin_lock(&pipe->pipe_aio_lock);
+
+	list_for_each(pos, &pipe->read_iocb_list) {
+		if (pos == &iocb->u.list) {
+			list_del(pos);
+			found = 1;
+			break;
+		}
+	}
+	
+	spin_unlock(&pipe->pipe_aio_lock);
+	up(PIPE_SEM(*inode));
+	aio_put_req(iocb);
+
+	if (found) {
+		if (iocb->data) {
+			unmap_kvec(iocb->data, 1);
+			free_kvec(iocb->data);
+		}
+
+		aio_complete(iocb, iocb->nr_transferred, 0);
+		return 0;
+	}
+
+	return -EAGAIN;
+}
+
+static ssize_t pipe_aio_read (struct file *file, struct kiocb *iocb, struct iocb *uiocb)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int queued = 0, failed_sem = 0;
+
+	iocb->data = NULL;
+	iocb->cancel = pipe_aio_read_cancel;
+	iocb->this_size = iocb->size;
+	if (iocb->this_size > aio_max_size)
+		iocb->this_size = aio_max_size;
+
+	/* 0 length reads are always successful */
+	if (unlikely(!iocb->size)) {
+		aio_complete(iocb, 0, 0);
+		return 0;
+	}
+
+	iocb->data = map_user_kvec(READ, iocb->buf, iocb->this_size);
+	if (unlikely(IS_ERR(iocb->data))) {
+		pr_debug("pipe_aio_read: map_user_kvec=%ld\n", PTR_ERR(iocb->data));
+		return PTR_ERR(iocb->data);
+	}
+
+	/* down_trylock == 0 if we obtained the semaphore -> if the 
+	 * semaphore was not acquired, we queue the read request.
+	 */
+	failed_sem = down_trylock(PIPE_SEM(*inode));
+
+	spin_lock(&inode->i_pipe->pipe_aio_lock);
+	if (failed_sem || !list_empty(&inode->i_pipe->read_iocb_list)) {
+		pr_debug("queueing aio pipe read(%p)\n", iocb);
+		list_add_tail(&iocb->u.list, &inode->i_pipe->read_iocb_list);
+		queued = 1;
+	}
+	spin_unlock(&inode->i_pipe->pipe_aio_lock);
+
+	if (queued) {
+		if (!failed_sem)
+			up(PIPE_SEM(*inode));
+		return 0;
+	}
+
+	/* Okay, we're the first read request.  Try reading data, otherwise 
+	 * fall back and queue.
+	 */
+	if (PIPE_EMPTY(*inode)) {
+//do_more_read:
+		/* No writers?  EOF. */
+		if (!PIPE_WRITERS(*inode)) {
+			aio_complete(iocb, 0, 0);
+			goto out;
+		}
+
+		/* No data.  Oh well, queue it at the head. */
+		spin_lock(&inode->i_pipe->pipe_aio_lock);
+		list_add(&iocb->u.list, &inode->i_pipe->read_iocb_list);
+		spin_unlock(&inode->i_pipe->pipe_aio_lock);
+		up(PIPE_SEM(*inode));
+		return 0;
+	}
+
+	printk("sorry!\n");
+	//BUG();
+	spin_lock(&inode->i_pipe->pipe_aio_lock);
+	list_add(&iocb->u.list, &inode->i_pipe->read_iocb_list);
+	spin_unlock(&inode->i_pipe->pipe_aio_lock);
+	up(PIPE_SEM(*inode));
+	return 0;
+
+	//pfull = PIPE_FULL(*inode);
+
+out:
+	up(PIPE_SEM(*inode));
+	/* FIXME: writes may have been queued */
+
+	unmap_kvec(iocb->data, 1);
+	free_kvec(iocb->data);
+	iocb->data = NULL;
+
+	return 0;
+}
+
+/* do_pipe_write_aio:
+ *	Performs a pipe write when there exists an outstanding aio
+ *	read operation.  Returns the number of bytes written or -EFAULT.
+ */
+static inline ssize_t do_pipe_write_aio(struct pipe_inode_info *pipe,
+			const char *buf, size_t count, struct kiocb *iocb)
+{
+	ssize_t written = 0;
+	pr_debug("do_pipe_aio_write\n");
+
+	while (count > 0) {
+		size_t len;
+		len = min(iocb->this_size, count);
+		if (unlikely(copy_user_to_kvec(iocb->data, iocb->nr_transferred, buf, len))) {
+			pr_debug("EFAULT?\n");
+			break;
+		}
+		iocb->nr_transferred += len;
+		written += len;
+		buf += len;
+		count -= len;
+
+		if ((iocb->nr_transferred == iocb->this_size) ||
+		    (iocb->filp->f_flags & O_NONBLOCK)) {
+			struct list_head *first = NULL;
+
+			pr_debug("done this iocb\n");
+
+			/* Mark the pages as dirty and complete the request.
+			 */
+			unmap_kvec(iocb->data, 1);
+			free_kvec(iocb->data);
+
+			spin_lock(&pipe->pipe_aio_lock);
+			list_del(&iocb->u.list);
+			first = list_first(&pipe->read_iocb_list);
+			spin_unlock(&pipe->pipe_aio_lock);
+
+			aio_complete(iocb, iocb->nr_transferred, 0);
+
+			iocb = NULL;
+
+			/* No more aio reads? */
+			if (!first)
+				break;
+
+			pr_debug("processing another iocb\n");
+			iocb = list_entry(first, struct kiocb, u.list);
+		}
+	}
+
+	pr_debug("returning: %ld\n", written);
+
+	return written ? written : -EFAULT;
+}
+
 static ssize_t
 pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
+	struct list_head *iocb;
 	ssize_t free, written, ret;
 
 	/* Seeks are not allowed on pipes.  */
 	ret = -ESPIPE;
 	written = 0;
-	if (ppos != &filp->f_pos)
+	if (unlikely(ppos != &filp->f_pos))
 		goto out_nolock;
 
 	/* Null write succeeds.  */
 	ret = 0;
-	if (count == 0)
+	if (unlikely(count == 0))
 		goto out_nolock;
 
 	ret = -ERESTARTSYS;
-	if (down_interruptible(PIPE_SEM(*inode)))
+	if (unlikely(down_interruptible(PIPE_SEM(*inode))))
 		goto out_nolock;
 
 	/* No readers yields SIGPIPE.  */
-	if (!PIPE_READERS(*inode))
+	if (unlikely(!PIPE_READERS(*inode)))
 		goto sigpipe;
 
+	spin_lock(&inode->i_pipe->pipe_aio_lock);
+	iocb = list_first(&inode->i_pipe->read_iocb_list);
+	spin_unlock(&inode->i_pipe->pipe_aio_lock);
+
+	if (iocb) {
+		written = do_pipe_write_aio(inode->i_pipe, buf, count,
+					list_entry(iocb, struct kiocb, u.list));
+		if (unlikely(written < 0))
+			goto out;
+
+		count -= written;
+		buf += written;
+
+		if (!count)
+			goto out;
+	}
+
 	/* If count <= PIPE_BUF, we have to make it atomic.  */
 	free = (count <= PIPE_BUF ? count : 1);
 
@@ -340,6 +544,7 @@
 static int
 pipe_read_open(struct inode *inode, struct file *filp)
 {
+	filp->private_data = inode->i_pipe;
 	/* We could have perhaps used atomic_t, but this and friends
 	   below are the only places.  So it doesn't seem worthwhile.  */
 	down(PIPE_SEM(*inode));
@@ -352,6 +557,7 @@
 static int
 pipe_write_open(struct inode *inode, struct file *filp)
 {
+	filp->private_data = inode->i_pipe;
 	down(PIPE_SEM(*inode));
 	PIPE_WRITERS(*inode)++;
 	up(PIPE_SEM(*inode));
@@ -362,6 +568,7 @@
 static int
 pipe_rdwr_open(struct inode *inode, struct file *filp)
 {
+	filp->private_data = inode->i_pipe;
 	down(PIPE_SEM(*inode));
 	if (filp->f_mode & FMODE_READ)
 		PIPE_READERS(*inode)++;
@@ -379,6 +586,7 @@
 struct file_operations read_fifo_fops = {
 	llseek:		no_llseek,
 	read:		pipe_read,
+	aio_read:	pipe_aio_read,
 	write:		bad_pipe_w,
 	poll:		fifo_poll,
 	ioctl:		pipe_ioctl,
@@ -399,6 +607,7 @@
 struct file_operations rdwr_fifo_fops = {
 	llseek:		no_llseek,
 	read:		pipe_read,
+	aio_read:	pipe_aio_read,
 	write:		pipe_write,
 	poll:		fifo_poll,
 	ioctl:		pipe_ioctl,
@@ -409,6 +618,7 @@
 struct file_operations read_pipe_fops = {
 	llseek:		no_llseek,
 	read:		pipe_read,
+	aio_read:	pipe_aio_read,
 	write:		bad_pipe_w,
 	poll:		pipe_poll,
 	ioctl:		pipe_ioctl,
@@ -429,6 +639,7 @@
 struct file_operations rdwr_pipe_fops = {
 	llseek:		no_llseek,
 	read:		pipe_read,
+	aio_read:	pipe_aio_read,
 	write:		pipe_write,
 	poll:		pipe_poll,
 	ioctl:		pipe_ioctl,
@@ -454,6 +665,9 @@
 	PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
 	PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0;
 	PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
+	spin_lock_init(&inode->i_pipe->pipe_aio_lock);
+	INIT_LIST_HEAD(&inode->i_pipe->read_iocb_list);
+	INIT_LIST_HEAD(&inode->i_pipe->write_iocb_list);
 
 	return inode;
 fail_page:
diff -urN v2.4.19/fs/select.c aio-2.4.19.diff/fs/select.c
--- v2.4.19/fs/select.c	Mon Sep 24 02:16:05 2001
+++ aio-2.4.19.diff/fs/select.c	Mon Sep 16 21:54:13 2002
@@ -12,6 +12,12 @@
  *  24 January 2000
  *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
  *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ *  June 2001
+ *    Added async_poll implementation. -bcrl
+ *  Nov 2001
+ *    Async poll improvements from Suparna Bhattacharya
+ *  April 2002
+ *    smp safe async poll plus cancellation. -bcrl
  */
 
 #include <linux/slab.h>
@@ -19,6 +25,8 @@
 #include <linux/poll.h>
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
+#include <linux/aio.h>
+#include <linux/init.h>
 
 #include <asm/uaccess.h>
 
@@ -26,19 +34,36 @@
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
 struct poll_table_entry {
-	struct file * filp;
 	wait_queue_t wait;
 	wait_queue_head_t * wait_address;
+	struct file * filp;
+	poll_table * p;
 };
 
 struct poll_table_page {
+	unsigned long size;
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
 	struct poll_table_entry entries[0];
 };
 
 #define POLL_TABLE_FULL(table) \
-	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+	((unsigned long)((table)->entry+1) > \
+	 (table)->size + (unsigned long)(table))
+
+/* async poll uses only one entry per poll table as it is linked to an iocb */
+typedef struct async_poll_table_struct {
+	poll_table		pt;		
+	struct worktodo		wtd;
+	int			events;		/* event mask for async poll */
+	int			wake;
+	long			sync;
+	struct poll_table_page	pt_page;	/* one poll table page hdr */
+	struct poll_table_entry entries[1];	/* space for a single entry */
+} async_poll_table;
+
+
+static kmem_cache_t *async_poll_table_cache;
 
 /*
  * Ok, Peter made a complicated, but straightforward multiple_wait() function.
@@ -53,7 +78,7 @@
  * poll table.
  */
 
-void poll_freewait(poll_table* pt)
+void __poll_freewait(poll_table* pt, wait_queue_t *wait)
 {
 	struct poll_table_page * p = pt->table;
 	while (p) {
@@ -61,15 +86,154 @@
 		struct poll_table_page *old;
 
 		entry = p->entry;
+		if (entry == p->entries) /* may happen with async poll */
+			break;
 		do {
 			entry--;
-			remove_wait_queue(entry->wait_address,&entry->wait);
+			if (wait != &entry->wait)
+				remove_wait_queue(entry->wait_address,&entry->wait);
+			else
+				__remove_wait_queue(entry->wait_address,&entry->wait);
 			fput(entry->filp);
 		} while (entry > p->entries);
 		old = p;
 		p = p->next;
-		free_page((unsigned long) old);
+		if (old->size == PAGE_SIZE)
+			free_page((unsigned long) old);
 	}
+	if (pt->iocb)
+		kmem_cache_free(async_poll_table_cache, pt);
+}
+
+void poll_freewait(poll_table* pt)
+{
+	__poll_freewait(pt, NULL);
+}
+
+void async_poll_complete(void *data)
+{
+	async_poll_table *pasync = data;
+	poll_table *p = data;
+	struct kiocb	*iocb = p->iocb;
+	unsigned int	mask;
+
+	pasync->wake = 0;
+	wmb();
+	do {
+		mask = iocb->filp->f_op->poll(iocb->filp, p);
+		mask &= pasync->events | POLLERR | POLLHUP;
+		if (mask) {
+			poll_table *p2 = xchg(&iocb->data, NULL);
+			if (p2) {
+				poll_freewait(p2); 
+				aio_complete(iocb, mask, 0);
+			}
+			return;
+		}
+		pasync->sync = 0;
+		wmb();
+	} while (pasync->wake);
+}
+
+static void do_hack(async_poll_table *pasync, wait_queue_t *wait)
+{
+	struct kiocb	*iocb = pasync->pt.iocb;
+	unsigned int	mask;
+
+	mask = iocb->filp->f_op->poll(iocb->filp, NULL);
+	mask &= pasync->events | POLLERR | POLLHUP;
+	if (mask) {
+		poll_table *p2 = xchg(&iocb->data, NULL);
+		if (p2) {
+			__poll_freewait(p2, wait); 
+			aio_complete(iocb, mask, 0);
+		}
+		return;
+	}
+}
+
+static void async_poll_waiter(wait_queue_t *wait)
+{
+	struct poll_table_entry *entry = (struct poll_table_entry *)wait;
+	async_poll_table *pasync = (async_poll_table *)(entry->p);
+
+#if 1 /*OLS HACK*/
+	do_hack(pasync, wait);
+#else
+	/* avoid writes to the cacheline if possible for SMP */
+	if (!pasync->wake) {
+		pasync->wake = 1;
+		/* ensure only one wake up queues the wtd */
+		if (!pasync->sync && !test_and_set_bit(0, &pasync->sync))
+			wtd_queue(&pasync->wtd);
+	}
+#endif
+}
+
+int async_poll_cancel(struct kiocb *iocb)
+{
+	poll_table *p;
+
+	/* FIXME: almost right */
+	p = xchg(&iocb->data, NULL);
+	if (p) {
+		poll_freewait(p); 
+		aio_complete(iocb, 0, 0);
+		aio_put_req(iocb);
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+int async_poll(struct kiocb *iocb, int events)
+{
+	unsigned int mask;
+	async_poll_table *pasync;
+	poll_table *p;
+
+	/* Fast path */
+	if (iocb->filp->f_op && iocb->filp->f_op->poll) {
+		mask = iocb->filp->f_op->poll(iocb->filp, NULL);
+		mask &= events | POLLERR | POLLHUP;
+		if (mask & events)
+			return events;
+	}
+
+	pasync = kmem_cache_alloc(async_poll_table_cache, SLAB_KERNEL);
+	if (!pasync)
+		return -ENOMEM;
+
+	p = (poll_table *)pasync;
+	poll_initwait(p);
+	wtd_set_action(&pasync->wtd, async_poll_complete, pasync);
+	p->iocb = iocb;
+	pasync->wake = 0;
+	pasync->sync = 0;
+	pasync->events = events;
+	pasync->pt_page.entry = pasync->pt_page.entries;
+	pasync->pt_page.size = sizeof(pasync->pt_page);
+	p->table = &pasync->pt_page;
+
+	iocb->data = p;
+	wmb();
+	iocb->cancel = async_poll_cancel;
+
+	mask = DEFAULT_POLLMASK;
+#warning broken
+	iocb->users ++;
+	if (iocb->filp->f_op && iocb->filp->f_op->poll)
+		mask = iocb->filp->f_op->poll(iocb->filp, p);
+	mask &= events | POLLERR | POLLHUP;
+	if (mask && !test_and_set_bit(0, &pasync->sync))
+		aio_complete(iocb, mask, 0);
+
+	if (aio_put_req(iocb))
+		/* Must be freed after aio_complete to synchronise with 
+		 * cancellation of the request.
+		 */
+		poll_freewait(p);
+
+	return 0;
 }
 
 void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
@@ -85,6 +249,7 @@
 			__set_current_state(TASK_RUNNING);
 			return;
 		}
+		new_table->size = PAGE_SIZE;
 		new_table->entry = new_table->entries;
 		new_table->next = table;
 		p->table = new_table;
@@ -98,7 +263,11 @@
 	 	get_file(filp);
 	 	entry->filp = filp;
 		entry->wait_address = wait_address;
-		init_waitqueue_entry(&entry->wait, current);
+		entry->p = p;
+		if (p->iocb) /* async poll */
+			init_waitqueue_func_entry(&entry->wait, async_poll_waiter);
+		else
+			init_waitqueue_entry(&entry->wait, current);
 		add_wait_queue(wait_address,&entry->wait);
 	}
 }
@@ -494,3 +663,14 @@
 	poll_freewait(&table);
 	return err;
 }
+
+static int __init async_poll_init(void)
+{
+	async_poll_table_cache = kmem_cache_create("async poll table",
+                        sizeof(async_poll_table), 0, 0, NULL, NULL);
+	if (!async_poll_table_cache)
+		panic("unable to alloc poll_table_cache");
+	return 0;
+}
+
+module_init(async_poll_init);
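
The async poll machinery above is meant to be driven from an IOCB_CMD_POLL
request.  The glue lives in fs/aio.c, outside this hunk; a rough sketch of
what such a handler looks like follows, under the assumption that the
requested event mask rides in the iocb's aio_buf field.  async_poll() then
completes the kiocb itself once a matching event (or POLLERR/POLLHUP) fires.

	ssize_t generic_aio_poll(struct file *file, struct kiocb *req,
				 struct iocb *iocb)
	{
		/* event mask in aio_buf is an assumption of this sketch */
		return async_poll(req, (int)iocb->aio_buf);
	}
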
diff -urN v2.4.19/include/asm-i386/errno.h aio-2.4.19.diff/include/asm-i386/errno.h
--- v2.4.19/include/asm-i386/errno.h	Fri Aug  9 13:50:22 2002
+++ aio-2.4.19.diff/include/asm-i386/errno.h	Mon Sep 16 21:54:13 2002
@@ -128,5 +128,6 @@
 
 #define	ENOMEDIUM	123	/* No medium found */
 #define	EMEDIUMTYPE	124	/* Wrong medium type */
+#define ECANCELED	125	/* Operation canceled */
 
 #endif
diff -urN v2.4.19/include/asm-i386/kmap_types.h aio-2.4.19.diff/include/asm-i386/kmap_types.h
--- v2.4.19/include/asm-i386/kmap_types.h	Mon Sep 24 02:16:05 2001
+++ aio-2.4.19.diff/include/asm-i386/kmap_types.h	Mon Sep 16 21:54:13 2002
@@ -7,6 +7,8 @@
 	KM_SKB_DATA_SOFTIRQ,
 	KM_USER0,
 	KM_USER1,
+	KM_IRQ0,
+	KM_IRQ1,
 	KM_TYPE_NR
 };
 
diff -urN v2.4.19/include/asm-i386/param.h aio-2.4.19.diff/include/asm-i386/param.h
--- v2.4.19/include/asm-i386/param.h	Fri Oct 27 14:04:43 2000
+++ aio-2.4.19.diff/include/asm-i386/param.h	Mon Sep 16 21:54:13 2002
@@ -2,7 +2,8 @@
 #define _ASMi386_PARAM_H
 
 #ifndef HZ
-#define HZ 100
+//#define HZ 100
+#define HZ 1024
 #endif
 
 #define EXEC_PAGESIZE	4096
@@ -18,7 +19,7 @@
 #define MAXHOSTNAMELEN	64	/* max length of hostname */
 
 #ifdef __KERNEL__
-# define CLOCKS_PER_SEC	100	/* frequency at which times() counts */
+# define CLOCKS_PER_SEC	8192	/* frequency at which times() counts */
 #endif
 
 #endif
diff -urN v2.4.19/include/asm-i386/semaphore.h aio-2.4.19.diff/include/asm-i386/semaphore.h
--- v2.4.19/include/asm-i386/semaphore.h	Fri Aug  9 13:50:23 2002
+++ aio-2.4.19.diff/include/asm-i386/semaphore.h	Mon Sep 16 21:54:13 2002
@@ -131,6 +131,31 @@
 		:"memory");
 }
 
+/* Returns 0 if we acquired the semaphore, 1 if it was queued. */
+struct worktodo;
+static inline int wtd_down(struct worktodo *wtd, struct semaphore *sem)
+{
+	int ret = 0;
+#if WAITQUEUE_DEBUG
+	CHECK_MAGIC(sem->__magic);
+#endif
+
+	__asm__ __volatile__(
+		"# atomic down operation\n\t"
+		LOCK "decl %0\n\t"     /* --sem->count */
+		"js 2f\n"
+		"1:\n"
+		LOCK_SECTION_START("")
+		"2:\tcall __wtd_down_failed\n\t"
+		"movl $1,%1\n\t"
+		"jmp 1b\n"
+		LOCK_SECTION_END
+		:"=m" (sem->count), "=r" (ret)
+		:"c" (sem), "1" (ret), "d" (wtd)
+		:"memory");
+	return ret;
+}
+
 /*
  * Interruptible try to acquire a semaphore.  If we obtained
  * it, return zero.  If we were interrupted, returns -EINTR
diff -urN v2.4.19/include/asm-i386/unistd.h aio-2.4.19.diff/include/asm-i386/unistd.h
--- v2.4.19/include/asm-i386/unistd.h	Fri Aug  9 13:50:23 2002
+++ aio-2.4.19.diff/include/asm-i386/unistd.h	Mon Sep 16 21:54:13 2002
@@ -248,6 +248,12 @@
 #define __NR_sched_setaffinity	241
 #define __NR_sched_getaffinity	242
 
+#define __NR_io_setup           245
+#define __NR_io_destroy         246
+#define __NR_io_getevents       247
+#define __NR_io_submit          248
+#define __NR_io_cancel          249
+
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
 #define __syscall_return(type, res) \
diff -urN v2.4.19/include/asm-ia64/kmap_types.h aio-2.4.19.diff/include/asm-ia64/kmap_types.h
--- v2.4.19/include/asm-ia64/kmap_types.h	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/include/asm-ia64/kmap_types.h	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,15 @@
+#ifndef _ASM_KMAP_TYPES_H
+#define _ASM_KMAP_TYPES_H
+
+enum km_type {
+	KM_BOUNCE_READ,
+	KM_SKB_DATA,
+	KM_SKB_DATA_SOFTIRQ,
+	KM_USER0,
+	KM_USER1,
+	KM_IRQ0,
+	KM_IRQ1,
+	KM_TYPE_NR
+};
+
+#endif
diff -urN v2.4.19/include/linux/aio.h aio-2.4.19.diff/include/linux/aio.h
--- v2.4.19/include/linux/aio.h	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/include/linux/aio.h	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,130 @@
+#ifndef __LINUX__AIO_H
+#define __LINUX__AIO_H
+
+#include <linux/tqueue.h>
+#include <linux/kiovec.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+#include <linux/aio_abi.h>
+
+#define AIO_MAXSEGS		4
+#define AIO_KIOGRP_NR_ATOMIC	8
+
+struct kioctx;
+
+/* Notes on cancelling a kiocb:
+ *	If a kiocb is cancelled, aio_complete may return 0 to indicate 
+ *	that cancel has not yet disposed of the kiocb.  All cancel 
+ *	operations *must* call aio_put_req to dispose of the kiocb 
+ *	to guard against races with the completion code.
+ */
+#define KIOCB_C_CANCELLED	0x01
+#define KIOCB_C_COMPLETE	0x02
+
+struct kiocb {
+	struct list_head	list;
+	struct file	*filp;
+	struct kioctx	*ctx;
+	void		*user_obj;
+	__u64		user_data;
+	loff_t		pos;
+	unsigned long	buf;
+	size_t		nr_transferred;	/* used for chunking */
+	size_t		size;
+	size_t		this_size;
+	unsigned	key;		/* id of this request */
+	int		(*cancel)(struct kiocb *kiocb);
+	void		*data;		/* for use by the async op */
+	int		users;
+	union {
+		struct tq_struct	tq;	/* argh. */
+		struct list_head	list;
+	} u;
+	unsigned long		rlim_fsize;
+};
+
+struct aio_ring {
+	unsigned	id;	/* kernel internal index number */
+	unsigned	nr;	/* number of io_events */
+	unsigned	head;
+	unsigned	tail;
+
+	unsigned	woke;	/* set when a wakeup was sent */
+	unsigned	pad[3];
+
+
+	struct io_event		io_events[0];
+}; /* 128 bytes + ring size */
+
+#define aio_ring_avail(info, ring)	(((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr)
+
+#define AIO_RING_PAGES	8
+struct aio_ring_info {
+	//struct file		*mmap_file;
+	struct kvec		*kvec;
+	unsigned long		mmap_base;
+	unsigned long		mmap_size;
+
+	struct page		**ring_pages;
+	spinlock_t		ring_lock;
+	unsigned		nr_pages;
+
+	unsigned		nr, tail;
+
+	struct page		*internal_pages[AIO_RING_PAGES];
+};
+
+struct kioctx {
+	atomic_t		users;
+	int			dead;
+	struct mm_struct	*mm;
+
+	/* This needs improving */
+	unsigned long		user_id;
+	struct kioctx		*next;
+
+	wait_queue_head_t	wait;
+
+	spinlock_t		lock;
+
+	int			reqs_active;
+	struct list_head	free_reqs;
+	struct list_head	active_reqs;	/* used for cancellation */
+
+	unsigned		max_reqs;
+
+	struct aio_ring_info	ring_info;
+};
+
+/* prototypes */
+extern unsigned aio_max_size;
+
+extern int FASTCALL(aio_put_req(struct kiocb *iocb));
+extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2));
+extern void FASTCALL(__put_ioctx(struct kioctx *ctx));
+struct mm_struct;
+extern void FASTCALL(exit_aio(struct mm_struct *mm));
+
+#define get_ioctx(kioctx)	do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0)
+#define put_ioctx(kioctx)	do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0)
+
+#include <linux/aio_abi.h>
+
+static inline struct kiocb *list_kiocb(struct list_head *h)
+{
+	return list_entry(h, struct kiocb, list);
+}
+
+struct file;
+extern ssize_t generic_aio_poll(struct file *file, struct kiocb *req, struct iocb *iocb);
+extern ssize_t generic_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size);
+extern ssize_t generic_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb, size_t min_size);
+extern ssize_t generic_file_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
+extern ssize_t generic_file_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+extern ssize_t generic_sock_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
+
+/* for sysctl: */
+extern unsigned aio_nr, aio_max_nr, aio_max_size, aio_max_pinned;
+
+#endif /* __LINUX__AIO_H */
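
To make the cancellation notes above concrete: a cancel method detaches the
iocb from wherever the driver queued it, then reports completion and drops
its reference with aio_put_req().  The skeleton below is only illustrative
(my_dev and its pending list are invented names); the detach-then-complete
ordering mirrors async_poll_cancel() and pipe_aio_read_cancel() elsewhere in
this patch.

	struct my_dev {
		spinlock_t		lock;
		struct list_head	pending;	/* queued kiocbs, via u.list */
	};

	static int my_aio_cancel(struct kiocb *iocb)
	{
		struct my_dev *dev = iocb->data;	/* stashed at submit time */
		struct list_head *pos;
		int found = 0;

		spin_lock(&dev->lock);
		list_for_each(pos, &dev->pending) {
			if (pos == &iocb->u.list) {
				list_del(pos);
				found = 1;
				break;
			}
		}
		spin_unlock(&dev->lock);

		if (!found)
			return -EAGAIN;		/* completion beat us to it */

		aio_complete(iocb, iocb->nr_transferred, 0);
		aio_put_req(iocb);		/* required by the notes above */
		return 0;
	}
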
diff -urN v2.4.19/include/linux/aio_abi.h aio-2.4.19.diff/include/linux/aio_abi.h
--- v2.4.19/include/linux/aio_abi.h	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/include/linux/aio_abi.h	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,87 @@
+/* linux/aio_abi.h
+ *
+ * Copyright 2000,2001,2002 Red Hat.
+ *
+ * Written by Benjamin LaHaise <bcrl@redhat.com>
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation is hereby granted, provided that the above copyright
+ * notice appears in all copies.  This software is provided without any
+ * warranty, express or implied.  Red Hat makes no representations about
+ * the suitability of this software for any purpose.
+ *
+ * IN NO EVENT SHALL RED HAT BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+ * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
+ * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF RED HAT HAS BEEN ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * RED HAT DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND
+ * RED HAT HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+ * ENHANCEMENTS, OR MODIFICATIONS.
+ */
+#ifndef __LINUX__AIO_ABI_H
+#define __LINUX__AIO_ABI_H
+
+#include <asm/byteorder.h>
+
+typedef unsigned long	aio_context_t;
+
+enum {
+	IOCB_CMD_PREAD = 0,
+	IOCB_CMD_PWRITE = 1,
+	IOCB_CMD_FSYNC = 2,
+	IOCB_CMD_FDSYNC = 3,
+	IOCB_CMD_PREADX = 4,
+	IOCB_CMD_POLL = 5,
+	IOCB_CMD_NOOP = 6,
+};
+
+/* read() from /dev/aio returns these structures. */
+struct io_event {
+	__u64		data;		/* the data field from the iocb */
+	__u64		obj;		/* what iocb this event came from */
+	__s64		res;		/* result code for this event */
+	__s64		res2;		/* secondary result */
+};
+
+#if defined(__LITTLE_ENDIAN)
+#define PADDED(x,y)	x, y
+#elif defined(__BIG_ENDIAN)
+#define PADDED(x,y)	y, x
+#else
+#error edit for your odd byteorder.
+#endif
+
+/*
+ * we always use a 64bit off_t when communicating
+ * with userland.  It's up to libraries to do the
+ * proper padding and aio_error abstraction
+ */
+
+struct iocb {
+	/* these are internal to the kernel/libc. */
+	__u64	aio_data;	/* data to be returned in event's data */
+	__u32	PADDED(aio_key, aio_reserved1);
+				/* the kernel sets aio_key to the req # */
+
+	/* common fields */
+	__u16	aio_lio_opcode;	/* see IOCB_CMD_ above */
+	__s16	aio_reqprio;
+	__u32	aio_fildes;
+
+	__u64	aio_buf;
+	__u64	aio_nbytes;
+	__s64	aio_offset;
+
+	/* extra parameters */
+	__u64	aio_reserved2;
+	__u64	aio_reserved3;
+}; /* 64 bytes */
+
+#undef IFBIG
+#undef IFLITTLE
+
+#endif /* __LINUX__AIO_ABI_H */
+
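Combined with the syscall numbers reserved in asm-i386/unistd.h (245 through
249), the ABI above can be exercised from userland with raw syscall(2) calls.
The sketch below assumes the aio_abi.h definitions are visible to user
programs and that io_getevents takes (ctx, min_nr, nr, events, timeout);
both points should be checked against the library shipped with this patch,
and error handling is omitted for brevity.

	#include <linux/aio_abi.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <string.h>
	#include <stdio.h>

	int main(void)
	{
		aio_context_t ctx = 0;
		struct iocb cb, *cbs[1] = { &cb };
		struct io_event ev;
		static char buf[4096];
		int fd = open("/etc/hosts", O_RDONLY);

		syscall(245, 32, &ctx);			/* io_setup(nr_events, &ctx) */

		memset(&cb, 0, sizeof(cb));
		cb.aio_fildes = fd;
		cb.aio_lio_opcode = IOCB_CMD_PREAD;
		cb.aio_buf = (unsigned long)buf;
		cb.aio_nbytes = sizeof(buf);
		cb.aio_offset = 0;

		syscall(248, ctx, 1, cbs);		/* io_submit(ctx, nr, iocbpp) */
		syscall(247, ctx, 1, 1, &ev, NULL);	/* io_getevents, order assumed */
		printf("res=%lld\n", (long long)ev.res);

		syscall(246, ctx);			/* io_destroy(ctx) */
		return 0;
	}
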
diff -urN v2.4.19/include/linux/brlock.h aio-2.4.19.diff/include/linux/brlock.h
--- v2.4.19/include/linux/brlock.h	Sat Jun 15 05:08:15 2002
+++ aio-2.4.19.diff/include/linux/brlock.h	Mon Sep 16 21:54:13 2002
@@ -34,6 +34,7 @@
 enum brlock_indices {
 	BR_GLOBALIRQ_LOCK,
 	BR_NETPROTO_LOCK,
+	BR_AIO_REQ_LOCK,
 
 	__BR_END
 };
diff -urN v2.4.19/include/linux/errno.h aio-2.4.19.diff/include/linux/errno.h
--- v2.4.19/include/linux/errno.h	Tue Jun 11 22:19:17 2002
+++ aio-2.4.19.diff/include/linux/errno.h	Mon Sep 16 21:54:13 2002
@@ -21,6 +21,9 @@
 #define EBADTYPE	527	/* Type not supported by server */
 #define EJUKEBOX	528	/* Request initiated, but will not complete before timeout */
 
+/* Defined for TUX async IO */
+#define EWOULDBLOCKIO	530	/* Would block due to block-IO */
+
 #endif
 
 #endif
diff -urN v2.4.19/include/linux/file.h aio-2.4.19.diff/include/linux/file.h
--- v2.4.19/include/linux/file.h	Fri Aug  9 13:50:40 2002
+++ aio-2.4.19.diff/include/linux/file.h	Mon Sep 16 21:54:13 2002
@@ -5,6 +5,7 @@
 #ifndef __LINUX_FILE_H
 #define __LINUX_FILE_H
 
+extern void FASTCALL(__fput(struct file *));
 extern void FASTCALL(fput(struct file *));
 extern struct file * FASTCALL(fget(unsigned int fd));
  
diff -urN v2.4.19/include/linux/fs.h aio-2.4.19.diff/include/linux/fs.h
--- v2.4.19/include/linux/fs.h	Fri Aug  9 13:50:40 2002
+++ aio-2.4.19.diff/include/linux/fs.h	Mon Sep 16 21:54:13 2002
@@ -196,6 +196,8 @@
 #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
 
 #ifdef __KERNEL__
+#include <linux/aio.h>
+#include <linux/aio_abi.h>
 
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
@@ -825,6 +827,10 @@
  * read, write, poll, fsync, readv, writev can be called
  *   without the big kernel lock held in all filesystems.
  */
+
+#define F_ATOMIC	0x0001
+#define F_OFFSETOK	0x0002
+
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
@@ -844,6 +850,16 @@
 	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+
+	/* in-kernel fully async api */
+	int (*kvec_read)(struct file *, kvec_cb_t, size_t, loff_t);
+	int (*kvec_write)(struct file *, kvec_cb_t, size_t, loff_t);
+
+	/* userland aio ops */
+	ssize_t (*aio_read)(struct file *, struct kiocb *, struct iocb *);
+	ssize_t (*aio_readx)(struct file *, struct kiocb *, struct iocb *);
+	ssize_t (*aio_write)(struct file *, struct kiocb *, struct iocb *);
+	ssize_t (*aio_fsync)(struct file *, struct kiocb *, struct iocb *);
 };
 
 struct inode_operations {
@@ -1433,12 +1449,16 @@
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
+extern ssize_t generic_file_new_read(struct file *, char *, size_t, loff_t *, int);
 extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
-extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
-extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
-extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
+extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t, int);
+extern int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos);
+extern int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos);
+
 extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *);
 extern int generic_file_open(struct inode * inode, struct file * filp);
+extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
+extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
 
 extern struct file_operations generic_ro_fops;
 
diff -urN v2.4.19/include/linux/iobuf.h aio-2.4.19.diff/include/linux/iobuf.h
--- v2.4.19/include/linux/iobuf.h	Sat Jun 15 05:08:17 2002
+++ aio-2.4.19.diff/include/linux/iobuf.h	Mon Sep 16 21:54:13 2002
@@ -53,8 +53,10 @@
 
 	/* Dynamic state for IO completion: */
 	atomic_t	io_count;	/* IOs still in progress */
+	int		transferred;	/* Number of bytes of completed IO at the beginning of the buffer */
 	int		errno;		/* Status of completed IO */
 	void		(*end_io) (struct kiobuf *); /* Completion callback */
+	void		*end_io_data;
 	wait_queue_head_t wait_queue;
 };
 
@@ -80,6 +82,8 @@
 
 /* fs/buffer.c */
 
+int	brw_kiovec_async(int rw, int nr, struct kiobuf *iovec[], 
+		   kdev_t dev, int nr_blocks, unsigned long b[], int size);
 int	brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
 		   kdev_t dev, unsigned long b[], int size);
 
diff -urN v2.4.19/include/linux/kiovec.h aio-2.4.19.diff/include/linux/kiovec.h
--- v2.4.19/include/linux/kiovec.h	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/include/linux/kiovec.h	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,125 @@
+#ifndef __LINUX__KIOVEC_H
+#define __LINUX__KIOVEC_H
+
+struct page;
+#include <linux/list.h>
+
+struct kveclet {
+	struct page	*page;
+	unsigned	offset;
+	unsigned	length;
+};
+
+struct kvec {
+	unsigned	max_nr;
+	unsigned	nr;
+	struct kveclet	veclet[0];
+};
+
+struct kvec_cb {
+	struct kvec	*vec;
+	void		(*fn)(void *data, struct kvec *vec, ssize_t res);
+	void		*data;
+};
+
+struct kvec_cb_list {
+	struct list_head	list;
+	struct kvec_cb		cb;
+};
+
+#ifndef _LINUX_TYPES_H
+#include <linux/types.h>
+#endif
+#ifndef _LINUX_KDEV_T_H
+#include <linux/kdev_t.h>
+#endif
+#ifndef _ASM_KMAP_TYPES_H
+#include <asm/kmap_types.h>
+#endif
+
+extern struct kvec *FASTCALL(map_user_kvec(int rw, unsigned long va, size_t len));
+extern struct kvec *FASTCALL(mm_map_user_kvec(struct mm_struct *, int rw,
+				     unsigned long va, size_t len));
+extern void FASTCALL(unmap_kvec(struct kvec *, int dirtied));
+extern void FASTCALL(free_kvec(struct kvec *));
+
+/* brw_kvec_async:
+ *	Performs direct io to/from disk into cb.vec.  Count is the number
+ *	of sectors to read, sector_shift is the blocksize (which must be
+ *	compatible with the kernel's current idea of the device's sector
+ *	size) in log2.  blknr is the starting sector offset on dev.
+ *
+ */
+extern int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned count,
+			  unsigned long blknr, int sector_shift);
+
+/* Memory copy helpers usage:
+ * void foo(... struct kveclet *veclet...)
+ *
+ *	struct kvec_dst	dst;
+ *
+ *	kvec_dst_init(&dst, KM_USER0);			-- resets type
+ *	kvec_dst_set(&dst, veclet);			-- set target & clear offset
+ *	kvec_dst_map(&dst);				-- activates kmap
+ *	for (...)
+ *		memcpy_to_kvec_dst(&dst, data, size);	-- each copy appends
+ *	kvec_dst_unmap(&dst);				-- releases kmap
+ *
+ * Note that scheduling is not permitted between kvec_dst_map() and
+ * kvec_dst_unmap().  This is because internally the routines make use
+ * of an atomic kmap.
+ */
+struct kvec_dst {
+	char		*addr;
+	char		*dst;
+	struct kveclet	*let;
+	int		space;
+	int		offset;
+	enum km_type	type;
+};
+
+
+#define kvec_dst_set(Xdst, Xlet)					\
+	do {								\
+		struct kvec_dst *_dst = (Xdst);				\
+		struct kveclet *_let = (Xlet);				\
+		_dst->let = _let;					\
+		_dst->space = _let->length;				\
+		_dst->offset = 0;					\
+	} while(0)
+
+#define kvec_dst_map(Xdst)						\
+	do {								\
+		struct kvec_dst *_dst = (Xdst);				\
+		struct kveclet *_let = _dst->let;			\
+		_dst->dst = _dst->addr = kmap_atomic(_let->page, _dst->type);\
+		_dst->dst += _let->offset + _dst->offset;		\
+		_dst->space = _let->length - _dst->offset;		\
+		_dst->offset = 0;					\
+	} while(0)
+
+#define kvec_dst_init(Xdst, Xtype)					\
+	do {								\
+		(Xdst)->space = 0;					\
+		(Xdst)->addr = 0;					\
+		(Xdst)->offset = 0;					\
+		(Xdst)->type = Xtype;					\
+	} while(0)
+
+#define	kvec_dst_unmap(Xdst)						\
+	do {								\
+		struct kvec_dst *_dst = (Xdst);				\
+		kunmap_atomic(_dst->addr, _dst->type);			\
+		_dst->offset = _dst->dst - _dst->addr;			\
+		_dst->offset -= _dst->let->offset;			\
+		_dst->addr = NULL;					\
+	} while(0)
+
+extern void FASTCALL(memcpy_to_kvec_dst(struct kvec_dst *dst,
+					const char *from, long len));
+extern void FASTCALL(memcpy_from_kvec_dst(char *to,
+					  struct kvec_dst *from, long len));
+extern int FASTCALL(copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len));
+
+
+#endif /* __LINUX__KIOVEC_H */
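
As a usage sketch of the mapping and direct-I/O entry points above: pin the
user buffer with map_user_kvec(), attach a completion function through
kvec_cb_t, and hand the whole thing to brw_kvec_async().  The device,
starting sector and 512-byte sector shift are illustrative values, and the
callback's res convention (bytes transferred or negative errno) is assumed
rather than documented here.

	static void my_read_done(void *data, struct kvec *vec, ssize_t res)
	{
		/* res is assumed to be a byte count or a negative errno */
		unmap_kvec(vec, 1);		/* 1: pages were dirtied by the read */
		free_kvec(vec);
	}

	static int my_start_read(unsigned long uaddr, size_t len,
				 kdev_t dev, unsigned long blknr)
	{
		kvec_cb_t cb;

		cb.vec = map_user_kvec(READ, uaddr, len);
		if (IS_ERR(cb.vec))
			return PTR_ERR(cb.vec);
		cb.fn = my_read_done;
		cb.data = NULL;

		/* count is in sectors; sector_shift 9 means 512-byte sectors */
		return brw_kvec_async(READ, cb, dev, len >> 9, blknr, 9);
	}
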
diff -urN v2.4.19/include/linux/list.h aio-2.4.19.diff/include/linux/list.h
--- v2.4.19/include/linux/list.h	Sat Jun 15 05:08:15 2002
+++ aio-2.4.19.diff/include/linux/list.h	Mon Sep 16 21:54:13 2002
@@ -170,7 +170,8 @@
 #define list_for_each_prev(pos, head) \
 	for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
         	pos = pos->prev, prefetch(pos->prev))
-        	
+
+#define list_first(head)	(((head)->next != (head)) ? (head)->next: (struct list_head *) 0)
 
 #endif /* __KERNEL__ || _LVM_H_INCLUDE */
 
diff -urN v2.4.19/include/linux/mm.h aio-2.4.19.diff/include/linux/mm.h
--- v2.4.19/include/linux/mm.h	Fri Aug  9 13:50:41 2002
+++ aio-2.4.19.diff/include/linux/mm.h	Mon Sep 16 21:54:13 2002
@@ -653,7 +653,7 @@
 }
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+extern struct vm_area_struct * FASTCALL(find_vma(struct mm_struct * mm, unsigned long addr));
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
 
diff -urN v2.4.19/include/linux/net.h aio-2.4.19.diff/include/linux/net.h
--- v2.4.19/include/linux/net.h	Sat Jun 15 05:08:15 2002
+++ aio-2.4.19.diff/include/linux/net.h	Mon Sep 16 21:54:13 2002
@@ -83,6 +83,9 @@
 struct scm_cookie;
 struct vm_area_struct;
 struct page;
+struct iocb;
+struct kioctx;
+#include <linux/aio.h>		/* shut gcc up */
 
 struct proto_ops {
   int	family;
@@ -110,6 +113,8 @@
   int   (*recvmsg)	(struct socket *sock, struct msghdr *m, int total_len, int flags, struct scm_cookie *scm);
   int	(*mmap)		(struct file *file, struct socket *sock, struct vm_area_struct * vma);
   ssize_t (*sendpage)	(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+  int   (*kvec_read)	(struct socket *sock, kvec_cb_t cb, size_t size);
+  int   (*kvec_write)	(struct socket *sock, kvec_cb_t cb, size_t size);
 };
 
 struct net_proto_family 
diff -urN v2.4.19/include/linux/pagemap.h aio-2.4.19.diff/include/linux/pagemap.h
--- v2.4.19/include/linux/pagemap.h	Sat Jun 15 05:08:23 2002
+++ aio-2.4.19.diff/include/linux/pagemap.h	Mon Sep 16 21:54:13 2002
@@ -88,6 +88,7 @@
 extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index);
 extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index);
 extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash);
+extern wait_queue_head_t *FASTCALL(page_waitqueue(struct page *page));
 
 extern void ___wait_on_page(struct page *);
 
diff -urN v2.4.19/include/linux/pipe_fs_i.h aio-2.4.19.diff/include/linux/pipe_fs_i.h
--- v2.4.19/include/linux/pipe_fs_i.h	Thu May  3 11:22:20 2001
+++ aio-2.4.19.diff/include/linux/pipe_fs_i.h	Mon Sep 16 21:54:13 2002
@@ -1,6 +1,9 @@
 #ifndef _LINUX_PIPE_FS_I_H
 #define _LINUX_PIPE_FS_I_H
 
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
 #define PIPEFS_MAGIC 0x50495045
 struct pipe_inode_info {
 	wait_queue_head_t wait;
@@ -13,6 +16,10 @@
 	unsigned int waiting_writers;
 	unsigned int r_counter;
 	unsigned int w_counter;
+
+	spinlock_t		pipe_aio_lock;
+	struct list_head	read_iocb_list;
+	struct list_head	write_iocb_list;
 };
 
 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
diff -urN v2.4.19/include/linux/poll.h aio-2.4.19.diff/include/linux/poll.h
--- v2.4.19/include/linux/poll.h	Sat Jun 15 05:08:17 2002
+++ aio-2.4.19.diff/include/linux/poll.h	Mon Sep 16 21:54:13 2002
@@ -9,12 +9,15 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <asm/uaccess.h>
+#include <linux/worktodo.h>
 
 struct poll_table_page;
+struct kiocb;
 
 typedef struct poll_table_struct {
-	int error;
-	struct poll_table_page * table;
+	int			error;
+	struct poll_table_page	*table;
+	struct kiocb		*iocb;		/* iocb for async poll */
 } poll_table;
 
 extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
@@ -29,8 +32,11 @@
 {
 	pt->error = 0;
 	pt->table = NULL;
+	pt->iocb = NULL;
 }
+
 extern void poll_freewait(poll_table* pt);
+extern int async_poll(struct kiocb *iocb, int events);
 
 
 /*
diff -urN v2.4.19/include/linux/sched.h aio-2.4.19.diff/include/linux/sched.h
--- v2.4.19/include/linux/sched.h	Fri Aug  9 13:50:43 2002
+++ aio-2.4.19.diff/include/linux/sched.h	Mon Sep 16 21:54:13 2002
@@ -207,6 +207,7 @@
 
 extern int max_map_count;
 
+struct kioctx;
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	rb_root_t mm_rb;
@@ -235,6 +236,9 @@
 
 	/* Architecture-specific MM context */
 	mm_context_t context;
+
+	struct kioctx	*ioctx_list;
+	unsigned long	new_ioctx_id;
 };
 
 extern int mmlist_nr;
@@ -802,6 +806,7 @@
 
 extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+extern void FASTCALL(add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
 #define __wait_event(wq, condition) 					\
diff -urN v2.4.19/include/linux/skbuff.h aio-2.4.19.diff/include/linux/skbuff.h
--- v2.4.19/include/linux/skbuff.h	Fri Aug  9 13:50:44 2002
+++ aio-2.4.19.diff/include/linux/skbuff.h	Mon Sep 16 21:54:13 2002
@@ -1128,6 +1128,15 @@
 extern unsigned int		skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum);
 extern void			skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 
+/* skb <-> kvec helpers */
+extern void skb_copy_datagram_kvec(const struct sk_buff *skb, int offset,
+			struct kvec *vec, int len);
+extern int skb_copy_and_csum_datagram_kvec(const struct sk_buff *skb,
+		int offset, struct kvec *vec, int len);
+extern int skb_kvec_recv_datagram(struct sock * sk, kvec_cb_t cb, int len,
+        void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb));
+
+
 extern void skb_init(void);
 extern void skb_add_mtu(int mtu);
 
diff -urN v2.4.19/include/linux/sysctl.h aio-2.4.19.diff/include/linux/sysctl.h
--- v2.4.19/include/linux/sysctl.h	Fri Aug  9 13:50:44 2002
+++ aio-2.4.19.diff/include/linux/sysctl.h	Mon Sep 16 21:54:13 2002
@@ -546,6 +546,13 @@
 	FS_LEASES=13,	/* int: leases enabled */
 	FS_DIR_NOTIFY=14,	/* int: directory notification enabled */
 	FS_LEASE_TIME=15,	/* int: maximum time to wait for a lease break */
+	/* 16 == jbd-debug */
+	/* 17 == jbd-oom-retry */
+
+	FS_AIO_NR=18,		/* int: current number of aio requests */
+	FS_AIO_MAX_NR=19,	/* int: max system wide aio requests */
+	FS_AIO_MAX_SIZE=20,	/* int: max size of read/write chunks */
+	FS_AIO_MAX_PINNED=21,	/* long: max memory pinned (in pages) */
 };
 
 /* CTL_DEBUG names: */
diff -urN v2.4.19/include/linux/tasklet.h aio-2.4.19.diff/include/linux/tasklet.h
--- v2.4.19/include/linux/tasklet.h	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/include/linux/tasklet.h	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,154 @@
+#ifndef __LINUX__TASKLET_H
+#define __LINUX__TASKLET_H
+
+#include <linux/config.h>
+#include <asm/atomic.h>
+#include <asm/bitops.h>
+#include <asm/system.h>		/* for smp_mb */
+
+/* Tasklets --- multithreaded analogue of BHs.
+
+   Main feature distinguishing them from generic softirqs: a tasklet
+   runs on only one CPU at a time.
+
+   Main feature distinguishing them from BHs: different tasklets
+   may be run simultaneously on different CPUs.
+
+   Properties:
+   * If tasklet_schedule() is called, then tasklet is guaranteed
+     to be executed on some cpu at least once after this.
+   * If the tasklet is already scheduled, but its execution has not yet
+     started, it will be executed only once.
+   * If this tasklet is already running on another CPU (or schedule is called
+     from tasklet itself), it is rescheduled for later.
+   * A tasklet is strictly serialized wrt itself, but not
+     wrt other tasklets. If a client needs inter-tasklet synchronization,
+     it must provide that with spinlocks.
+ */
+
+struct tasklet_struct
+{
+	struct tasklet_struct *next;
+	unsigned long state;
+	atomic_t count;
+	void (*func)(unsigned long);
+	unsigned long data;
+	int	*unlocked;
+};
+
+#define DECLARE_TASKLET(name, func, data) \
+struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(0), func, data, NULL }
+
+#define DECLARE_TASKLET_DISABLED(name, func, data) \
+struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data, NULL }
+
+
+enum
+{
+	TASKLET_STATE_SCHED,	/* Tasklet is scheduled for execution */
+	TASKLET_STATE_RUN	/* Tasklet is running (SMP only) */
+};
+
+struct tasklet_head
+{
+	struct tasklet_struct *list;
+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
+
+extern struct tasklet_head tasklet_vec[NR_CPUS];
+extern struct tasklet_head tasklet_hi_vec[NR_CPUS];
+
+#ifdef CONFIG_SMP
+static inline int tasklet_trylock(struct tasklet_struct *t)
+{
+	return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
+}
+
+static inline void tasklet_unlock(struct tasklet_struct *t)
+{
+	smp_mb__before_clear_bit(); 
+	clear_bit(TASKLET_STATE_RUN, &(t)->state);
+}
+
+static inline void tasklet_unlock_self(struct tasklet_struct *t)
+{
+	*t->unlocked = 1;
+	t->unlocked = NULL;
+	tasklet_unlock(t);
+}
+
+static inline void tasklet_unlock_wait(struct tasklet_struct *t)
+{
+	while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
+}
+#else
+#define tasklet_trylock(t) 1
+#define tasklet_unlock_wait(t) do { } while (0)
+#define tasklet_unlock(t) do { } while (0)
+#endif
+
+extern void FASTCALL(__tasklet_schedule(struct tasklet_struct *t));
+
+static inline void tasklet_schedule(struct tasklet_struct *t)
+{
+	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
+		__tasklet_schedule(t);
+}
+
+extern void FASTCALL(__tasklet_hi_schedule(struct tasklet_struct *t));
+
+static inline void tasklet_hi_schedule(struct tasklet_struct *t)
+{
+	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
+		__tasklet_hi_schedule(t);
+}
+
+
+static inline void tasklet_disable_nosync(struct tasklet_struct *t)
+{
+	atomic_inc(&t->count);
+	smp_mb__after_atomic_inc();
+}
+
+static inline void tasklet_disable(struct tasklet_struct *t)
+{
+	tasklet_disable_nosync(t);
+	tasklet_unlock_wait(t);
+	smp_mb();
+}
+
+static inline void tasklet_enable(struct tasklet_struct *t)
+{
+	smp_mb__before_atomic_dec();
+	atomic_dec(&t->count);
+}
+
+static inline void tasklet_hi_enable(struct tasklet_struct *t)
+{
+	smp_mb__before_atomic_dec();
+	atomic_dec(&t->count);
+}
+
+extern void tasklet_kill(struct tasklet_struct *t);
+extern void tasklet_init(struct tasklet_struct *t,
+			 void (*func)(unsigned long), unsigned long data);
+
+#ifdef CONFIG_SMP
+
+#define SMP_TIMER_NAME(name) name##__thr
+
+#define SMP_TIMER_DEFINE(name, task) \
+DECLARE_TASKLET(task, name##__thr, 0); \
+static void name (unsigned long dummy) \
+{ \
+	tasklet_schedule(&(task)); \
+}
+
+#else /* CONFIG_SMP */
+
+#define SMP_TIMER_NAME(name) name
+#define SMP_TIMER_DEFINE(name, task)
+
+#endif /* CONFIG_SMP */
+
+
+#endif /* __LINUX__TASKLET_H */
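
A minimal example of the interface described in the comment block above:
declare a tasklet, kick it from interrupt context, and let it run once in
softirq context.

	static void my_tasklet_fn(unsigned long data)
	{
		/* softirq context; never runs concurrently with itself */
	}

	DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

	static void my_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
	{
		tasklet_schedule(&my_tasklet);	/* guaranteed to run at least once */
	}
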
diff -urN v2.4.19/include/linux/timex.h aio-2.4.19.diff/include/linux/timex.h
--- v2.4.19/include/linux/timex.h	Sat Jun 15 05:08:15 2002
+++ aio-2.4.19.diff/include/linux/timex.h	Mon Sep 16 21:54:13 2002
@@ -74,6 +74,10 @@
 # define SHIFT_HZ	9
 #elif HZ >= 768 && HZ < 1536
 # define SHIFT_HZ	10
+#elif HZ >= 1536 && HZ < 3120
+# define SHIFT_HZ	11
+#elif HZ >= 3120 && HZ < 6240
+# define SHIFT_HZ	12
 #else
 # error You lose.
 #endif
diff -urN v2.4.19/include/linux/tqueue.h aio-2.4.19.diff/include/linux/tqueue.h
--- v2.4.19/include/linux/tqueue.h	Sat Jun 15 05:08:15 2002
+++ aio-2.4.19.diff/include/linux/tqueue.h	Mon Sep 16 21:54:13 2002
@@ -67,6 +67,7 @@
 #define TQ_ACTIVE(q)		(!list_empty(&q))
 
 extern task_queue tq_timer, tq_immediate, tq_disk;
+extern struct tq_struct run_disk_tq;
 
 /*
  * To implement your own list of active bottom halfs, use the following
diff -urN v2.4.19/include/linux/types.h aio-2.4.19.diff/include/linux/types.h
--- v2.4.19/include/linux/types.h	Fri Aug  9 13:50:44 2002
+++ aio-2.4.19.diff/include/linux/types.h	Mon Sep 16 21:54:13 2002
@@ -127,4 +127,9 @@
 	char			f_fpack[6];
 };
 
+/* kernel typedefs -- they belong here. */
+#ifdef __KERNEL__
+typedef struct kvec_cb kvec_cb_t;
+#endif /* __KERNEL__ */
+
 #endif /* _LINUX_TYPES_H */
diff -urN v2.4.19/include/linux/wait.h aio-2.4.19.diff/include/linux/wait.h
--- v2.4.19/include/linux/wait.h	Sat Jun 15 05:08:15 2002
+++ aio-2.4.19.diff/include/linux/wait.h	Mon Sep 16 21:54:13 2002
@@ -28,17 +28,20 @@
 #define WAITQUEUE_DEBUG 0
 #endif
 
+typedef struct __wait_queue wait_queue_t;
+typedef void (*wait_queue_func_t)(wait_queue_t *wait);
+
 struct __wait_queue {
 	unsigned int flags;
 #define WQ_FLAG_EXCLUSIVE	0x01
 	struct task_struct * task;
 	struct list_head task_list;
+	wait_queue_func_t func;
 #if WAITQUEUE_DEBUG
 	long __magic;
 	long __waker;
 #endif
 };
-typedef struct __wait_queue wait_queue_t;
 
 /*
  * 'dual' spinlock architecture. Can be switched between spinlock_t and
@@ -137,6 +140,7 @@
 #endif
 
 #define __WAITQUEUE_INITIALIZER(name, tsk) {				\
+	func:		NULL,						\
 	task:		tsk,						\
 	task_list:	{ NULL, NULL },					\
 			 __WAITQUEUE_DEBUG_INIT(name)}
@@ -174,6 +178,22 @@
 #endif
 	q->flags = 0;
 	q->task = p;
+	q->func = NULL;
+#if WAITQUEUE_DEBUG
+	q->__magic = (long)&q->__magic;
+#endif
+}
+
+static inline void init_waitqueue_func_entry(wait_queue_t *q,
+					wait_queue_func_t func)
+{
+#if WAITQUEUE_DEBUG
+	if (!q || !func)
+		WQ_BUG();
+#endif
+	q->flags = 0;
+	q->task = NULL;
+	q->func = func;
 #if WAITQUEUE_DEBUG
 	q->__magic = (long)&q->__magic;
 #endif
@@ -231,6 +251,38 @@
 	list_del(&old->task_list);
 }
 
+#define add_wait_queue_cond(q, wait, cond) \
+	({							\
+		unsigned long flags;				\
+		int _raced = 0;					\
+		wq_write_lock_irqsave(&(q)->lock, flags);	\
+		(wait)->flags = 0;				\
+		__add_wait_queue((q), (wait));			\
+		rmb();						\
+		if (!(cond)) {					\
+			_raced = 1;				\
+			__remove_wait_queue((q), (wait));	\
+		}						\
+		wq_write_unlock_irqrestore(&(q)->lock, flags);	\
+		_raced;						\
+	})
+
+#define add_wait_queue_exclusive_cond(q, wait, cond) \
+	({							\
+		unsigned long flags;				\
+		int _raced = 0;					\
+		wq_write_lock_irqsave(&(q)->lock, flags);	\
+		(wait)->flags = WQ_FLAG_EXCLUSIVE;		\
+		__add_wait_queue_tail((q), (wait));		\
+		rmb();						\
+		if (!(cond)) {					\
+			_raced = 1;				\
+			__remove_wait_queue((q), (wait));	\
+		}						\
+		wq_write_unlock_irqrestore(&(q)->lock, flags);	\
+		_raced;						\
+	})
+
 #endif /* __KERNEL__ */
 
 #endif
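
The new func field lets a wait queue entry run a callback instead of waking a
task; __wake_up_common in kernel/sched.c (further down in this patch) invokes
it directly from the wakeup path.  A sketch of the intended pattern, modelled
on async_poll_waiter() in fs/select.c above: my_request is an invented type,
and it relies on the wait_queue_t being the first member so the callback can
recover the enclosing object with a cast.

	struct my_request {
		wait_queue_t		wait;	/* must be first for the cast below */
		struct worktodo		wtd;	/* see linux/worktodo.h in this patch */
	};

	static void my_waiter(wait_queue_t *wait)
	{
		struct my_request *req = (struct my_request *)wait;

		/* possibly called from interrupt context: defer the real
		 * work to process context via the worktodo helpers */
		wtd_queue(&req->wtd);
	}

	static void my_arm(struct my_request *req, wait_queue_head_t *whead)
	{
		init_waitqueue_func_entry(&req->wait, my_waiter);
		add_wait_queue(whead, &req->wait);
	}
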
diff -urN v2.4.19/include/linux/worktodo.h aio-2.4.19.diff/include/linux/worktodo.h
--- v2.4.19/include/linux/worktodo.h	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/include/linux/worktodo.h	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,77 @@
+/*
+ *	Written by Benjamin LaHaise.
+ *
+ *	Copyright 2000-2001 Red Hat, Inc.
+ *
+ *	#include "gpl.h"
+ *
+ *	Basic design idea from Jeff Merkey.
+ *	Stack based on ideas from Ingo Molnar.
+ */
+#ifndef __LINUX__WORKTODO_H
+#define __LINUX__WORKTODO_H
+
+#ifndef _LINUX_WAIT_H
+#include <linux/wait.h>
+#endif
+#ifndef _LINUX_TQUEUE_H
+#include <linux/tqueue.h>
+#endif
+
+struct wtd_stack {
+	void	(*fn)(void *data);
+	void	*data;
+};
+
+struct worktodo {
+	wait_queue_t		wait;
+	struct tq_struct	tq;
+
+	void			*data;	/* for use by the wtd_ primitives */
+
+	int			sp;
+	struct wtd_stack	stack[3];
+};
+
+/* FIXME NOTE: factor from kernel/context.c */
+#define wtd_init(wtd, routine) do {			\
+	INIT_TQUEUE(&(wtd)->tq, (routine), (wtd));	\
+	(wtd)->data = 0;				\
+	(wtd)->sp = 0;					\
+} while (0)
+
+#define wtd_queue(wtd)	schedule_task(&(wtd)->tq)
+
+#define wtd_push(wtd, action, wtddata)			\
+do {							\
+	(wtd)->stack[(wtd)->sp].fn = (wtd)->tq.routine;	\
+	(wtd)->stack[(wtd)->sp++].data = (wtd)->tq.data;\
+	(wtd)->tq.routine = action;			\
+	(wtd)->tq.data = wtddata;			\
+} while (0)
+
+static inline void wtd_pop(struct worktodo *wtd)
+{
+	if (wtd->sp) {
+		wtd->sp--;
+		wtd->tq.routine = wtd->stack[wtd->sp].fn;
+		wtd->tq.data = wtd->stack[wtd->sp].data;
+	}
+}
+
+#define wtd_set_action(wtd, action, wtddata)	INIT_TQUEUE(&(wtd)->tq, action, wtddata)
+
+struct page;
+struct buffer_head;
+struct semaphore;
+extern int wtd_lock_page(struct worktodo *wtd, struct page *page);
+extern int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh);
+extern int wtd_down(struct worktodo *wtd, struct semaphore *sem);
+
+#if 0	/* not implemented yet */
+extern int wtd_down(struct worktodo *wtd, struct semaphore *sem);
+extern void wtd_down_write(struct worktodo *wtd, struct rw_semaphore *sem);
+extern void wtd_down_read(struct worktodo *wtd, struct rw_semaphore *sem);
+#endif
+
+#endif /* __LINUX__WORKTODO_H */
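
The worktodo helpers amount to a small continuation mechanism on top of
schedule_task(): wtd_set_action() records a function and argument,
wtd_queue() hands it to keventd, and wtd_push()/wtd_pop() let an intermediate
step interpose itself in front of the caller's action.  Below is a sketch
with invented names (my_op, my_stage1, my_stage2).  The wtd_lock_page,
wtd_wait_on_buffer and wtd_down helpers declared above plug into the same
scheme, returning nonzero when the caller must back off and, presumably,
queueing the current action once the page, buffer or semaphore becomes
available.

	struct my_op {
		struct worktodo	wtd;
		/* ... */
	};

	static void my_stage2(void *data)
	{
		struct my_op *op = data;
		/* final step, runs in process context via keventd */
	}

	static void my_stage1(void *data)
	{
		struct my_op *op = data;

		/* ... intermediate step ... */

		wtd_pop(&op->wtd);	/* restore my_stage2 as the action */
		wtd_queue(&op->wtd);	/* and let keventd run it */
	}

	static void my_kickoff(struct my_op *op)
	{
		wtd_init(&op->wtd, my_stage2);		/* also resets the stack pointer */
		wtd_set_action(&op->wtd, my_stage2, op);/* pass op, not the wtd, as data */
		wtd_push(&op->wtd, my_stage1, op);	/* stage1 goes first */
		wtd_queue(&op->wtd);			/* schedule_task() underneath */
	}
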
diff -urN v2.4.19/include/net/sock.h aio-2.4.19.diff/include/net/sock.h
--- v2.4.19/include/net/sock.h	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/include/net/sock.h	Mon Sep 16 21:54:13 2002
@@ -105,8 +105,15 @@
 
 #include <asm/atomic.h>
 #include <net/dst.h>
+#include <linux/aio.h>
 
 
+struct sock_iocb {
+	struct list_head	list;
+	kvec_cb_t		cb;
+	struct kvec_dst		dst;
+};
+
 /* The AF_UNIX specific socket options */
 struct unix_opt {
 	struct unix_address	*addr;
@@ -560,6 +567,9 @@
 		struct sk_buff *tail;
 	} backlog;
 
+	struct list_head	kvec_read_list;
+	struct list_head	kvec_write_list;
+
 	rwlock_t		callback_lock;
 
 	/* Error queue, rarely used. */
@@ -721,6 +731,8 @@
 	int			(*recvmsg)(struct sock *sk, struct msghdr *msg,
 					int len, int noblock, int flags, 
 					int *addr_len);
+	int		(*kvec_read)(struct sock *, kvec_cb_t cb, int len);
+	int		(*kvec_write)(struct sock *, kvec_cb_t cb, int len);
 	int			(*bind)(struct sock *sk, 
 					struct sockaddr *uaddr, int addr_len);
 
@@ -795,7 +807,7 @@
 	if ((__sk)->backlog.tail != NULL) \
 		__release_sock(__sk); \
 	(__sk)->lock.users = 0; \
-        if (waitqueue_active(&((__sk)->lock.wq))) wake_up(&((__sk)->lock.wq)); \
+        wake_up(&((__sk)->lock.wq)); \
 	spin_unlock_bh(&((__sk)->lock.slock)); \
 } while(0)
 
diff -urN v2.4.19/include/net/tcp.h aio-2.4.19.diff/include/net/tcp.h
--- v2.4.19/include/net/tcp.h	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/include/net/tcp.h	Mon Sep 16 21:54:13 2002
@@ -732,6 +732,8 @@
 					    struct msghdr *msg,
 					    int len, int nonblock, 
 					    int flags, int *addr_len);
+extern int tcp_kvec_read(struct sock *sk, kvec_cb_t cb, int len);
+extern int tcp_kvec_write(struct sock *sk, kvec_cb_t cb, int len);
 
 extern int			tcp_listen_start(struct sock *sk);
 
diff -urN v2.4.19/kernel/fork.c aio-2.4.19.diff/kernel/fork.c
--- v2.4.19/kernel/fork.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/kernel/fork.c	Mon Sep 16 21:54:13 2002
@@ -48,6 +48,16 @@
 	wq_write_unlock_irqrestore(&q->lock, flags);
 }
 
+void add_wait_queue_exclusive_lifo(wait_queue_head_t *q, wait_queue_t * wait)
+{
+	unsigned long flags;
+
+	wq_write_lock_irqsave(&q->lock, flags);
+	wait->flags = WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue(q, wait);
+	wq_write_unlock_irqrestore(&q->lock, flags);
+}
+
 void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -228,6 +238,7 @@
 
 static struct mm_struct * mm_init(struct mm_struct * mm)
 {
+	mm->ioctx_list = NULL;
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
@@ -263,6 +274,8 @@
  */
 inline void __mmdrop(struct mm_struct *mm)
 {
+	if (mm->ioctx_list)
+		BUG();
 	BUG_ON(mm == &init_mm);
 	pgd_free(mm->pgd);
 	destroy_context(mm);
@@ -281,6 +294,7 @@
 		list_del(&mm->mmlist);
 		mmlist_nr--;
 		spin_unlock(&mmlist_lock);
+		exit_aio(mm);
 		exit_mmap(mm);
 		mmdrop(mm);
 	}
diff -urN v2.4.19/kernel/sched.c aio-2.4.19.diff/kernel/sched.c
--- v2.4.19/kernel/sched.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/kernel/sched.c	Mon Sep 16 21:54:13 2002
@@ -705,33 +705,44 @@
 }
 
 /*
- * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just wake everything
- * up.  If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
- * non-exclusive tasks and one exclusive task.
+ * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small
+ * +ve number) then we wake all the non-exclusive tasks and one exclusive task.
  *
  * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns zero
- * in this (rare) case, and we handle it by contonuing to scan the queue.
+ * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
 			 	     int nr_exclusive, const int sync)
 {
-	struct list_head *tmp;
+	struct list_head *tmp, *next;
 	struct task_struct *p;
 
 	CHECK_MAGIC_WQHEAD(q);
 	WQ_CHECK_LIST_HEAD(&q->task_list);
 	
-	list_for_each(tmp,&q->task_list) {
+	list_for_each_safe(tmp, next, &q->task_list) {
 		unsigned int state;
-                wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+		wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+		wait_queue_func_t func;
 
 		CHECK_MAGIC(curr->__magic);
+		func = curr->func;
+		if (func) {
+			unsigned flags = curr->flags;
+			func(curr);
+			if ((flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+				break;
+			continue;
+		}
 		p = curr->task;
 		state = p->state;
 		if (state & mode) {
 			WQ_NOTE_WAKER(curr);
-			if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+			if (try_to_wake_up(p, sync) &&
+			    (curr->flags & WQ_FLAG_EXCLUSIVE) &&
+			    !--nr_exclusive)
 				break;
 		}
 	}
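The __wake_up_common() change above adds a second kind of wait queue entry: when curr->func is set, the wakeup invokes that function in the waker's context instead of waking a task, honouring WQ_FLAG_EXCLUSIVE via the saved flags. Below is a rough sketch of such an entry, assuming wait_queue_t now carries a func field of type void (*)(wait_queue_t *) as the call site implies; the hand initialization is illustrative, and with wait queue debugging enabled the patch presumably provides an init helper that also fills the magic fields:

	#include <linux/wait.h>
	#include <linux/list.h>

	struct my_waiter {
		wait_queue_t	wait;		/* must be first for the cast below */
		int		fired;
	};

	/* Runs from __wake_up_common() in the waker's context: keep it short
	 * and non-blocking, typically just queueing deferred work (cf. wtd). */
	static void my_wakeup(wait_queue_t *curr)
	{
		struct my_waiter *w = (struct my_waiter *)curr;
		w->fired = 1;
	}

	static void init_callback_waiter(struct my_waiter *w)
	{
		w->fired = 0;
		w->wait.task = NULL;		/* no task: func does the work */
		w->wait.func = my_wakeup;
		INIT_LIST_HEAD(&w->wait.task_list);
	}

The entry would then be placed on a wait queue with add_wait_queue(), add_wait_queue_exclusive(), or the new add_wait_queue_exclusive_lifo() from the kernel/fork.c hunk above, which set or clear WQ_FLAG_EXCLUSIVE themselves.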
diff -urN v2.4.19/kernel/sysctl.c aio-2.4.19.diff/kernel/sysctl.c
--- v2.4.19/kernel/sysctl.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/kernel/sysctl.c	Mon Sep 16 21:54:13 2002
@@ -30,6 +30,7 @@
 #include <linux/init.h>
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 
@@ -284,6 +285,8 @@
 	{0}
 };
 
+extern int user_pinned_pages;
+
 static ctl_table fs_table[] = {
 	{FS_NRINODE, "inode-nr", &inodes_stat, 2*sizeof(int),
 	 0444, NULL, &proc_dointvec},
@@ -309,6 +312,16 @@
 	 sizeof(int), 0644, NULL, &proc_dointvec},
 	{FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int),
 	 0644, NULL, &proc_dointvec},
+	{FS_AIO_NR, "aio-nr", &aio_nr, sizeof(aio_nr),
+	 0444, NULL, &proc_dointvec},
+	{FS_AIO_MAX_NR, "aio-max-nr", &aio_max_nr, sizeof(aio_max_nr),
+	 0644, NULL, &proc_dointvec},
+	{FS_AIO_MAX_SIZE, "aio-max-size", &aio_max_size, sizeof(aio_max_size),
+	 0644, NULL, &proc_dointvec},
+	{FS_AIO_MAX_PINNED, "aio-max-pinned", &aio_max_pinned, sizeof(aio_max_pinned),
+	 0644, NULL, &proc_dointvec},
+	{FS_AIO_MAX_PINNED+1, "aio-pinned", &user_pinned_pages, 4,
+	 0644, NULL, &proc_dointvec},
 	{0}
 };
 
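Because fs_table is registered under the "fs" sysctl directory, the entries added above surface as /proc/sys/fs/aio-nr, aio-max-nr, aio-max-size, aio-max-pinned and aio-pinned. A small userspace sketch (illustrative only) that reads the current and maximum request counts:

	#include <stdio.h>

	int main(void)
	{
		unsigned long nr = 0, max_nr = 0;
		FILE *f;

		if ((f = fopen("/proc/sys/fs/aio-nr", "r")) != NULL) {
			if (fscanf(f, "%lu", &nr) != 1)
				nr = 0;
			fclose(f);
		}
		if ((f = fopen("/proc/sys/fs/aio-max-nr", "r")) != NULL) {
			if (fscanf(f, "%lu", &max_nr) != 1)
				max_nr = 0;
			fclose(f);
		}
		printf("aio-nr=%lu aio-max-nr=%lu\n", nr, max_nr);
		return 0;
	}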
diff -urN v2.4.19/mm/Makefile aio-2.4.19.diff/mm/Makefile
--- v2.4.19/mm/Makefile	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/mm/Makefile	Mon Sep 16 21:54:13 2002
@@ -17,5 +17,6 @@
 	    shmem.o
 
 obj-$(CONFIG_HIGHMEM) += highmem.o
+obj-y += wtd.o
 
 include $(TOPDIR)/Rules.make
diff -urN v2.4.19/mm/filemap.c aio-2.4.19.diff/mm/filemap.c
--- v2.4.19/mm/filemap.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/mm/filemap.c	Mon Sep 16 21:54:13 2002
@@ -29,6 +29,8 @@
 #include <asm/mman.h>
 
 #include <linux/highmem.h>
+#include <linux/worktodo.h>
+#include <linux/iobuf.h>
 
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -775,7 +777,7 @@
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-static inline wait_queue_head_t *page_waitqueue(struct page *page)
+static inline wait_queue_head_t *__page_waitqueue(struct page *page)
 {
 	const zone_t *zone = page_zone(page);
 	wait_queue_head_t *wait = zone->wait_table;
@@ -806,6 +808,13 @@
 	return &wait[hash];
 }
 
+wait_queue_head_t *page_waitqueue(struct page *page)
+{
+	return __page_waitqueue(page);
+}
+
+#define page_waitqueue(page) __page_waitqueue(page)
+
 /* 
  * Wait for a page to get unlocked.
  *
@@ -1186,7 +1195,7 @@
 
 static void generic_file_readahead(int reada_ok,
 	struct file * filp, struct inode * inode,
-	struct page * page)
+	struct page * page, int flags)
 {
 	unsigned long end_index;
 	unsigned long index = page->index;
@@ -1316,7 +1325,7 @@
  * This is really ugly. But the goto's actually try to clarify some
  * of the logic when it comes to error handling etc.
  */
-void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
+void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int flags)
 {
 	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
 	struct inode *inode = mapping->host;
@@ -1325,10 +1334,17 @@
 	int reada_ok;
 	int error;
 	int max_readahead = get_max_readahead(inode);
+	loff_t pos;
+
+	pos = *ppos;
+	if (unlikely(pos < 0)) {
+		desc->error = -EINVAL;
+		return;
+	}
 
 	cached_page = NULL;
-	index = *ppos >> PAGE_CACHE_SHIFT;
-	offset = *ppos & ~PAGE_CACHE_MASK;
+	index = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_CACHE_MASK;
 
 /*
  * If the current position is outside the previous read-ahead window, 
@@ -1375,13 +1391,17 @@
 
 		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 			
-		if (index > end_index)
+		if (index > end_index) {
+			desc->error = 0;
 			break;
+		}
 		nr = PAGE_CACHE_SIZE;
 		if (index == end_index) {
 			nr = inode->i_size & ~PAGE_CACHE_MASK;
-			if (nr <= offset)
+			if (nr <= offset) {
+				desc->error = 0;
 				break;
+			}
 		}
 
 		nr = nr - offset;
@@ -1401,7 +1421,7 @@
 
 		if (!Page_Uptodate(page))
 			goto page_not_up_to_date;
-		generic_file_readahead(reada_ok, filp, inode, page);
+		generic_file_readahead(reada_ok, filp, inode, page, flags);
 page_ok:
 		/* If users can be writing to this page using arbitrary
 		 * virtual addresses, take care about potential aliasing
@@ -1441,13 +1461,23 @@
  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
  */
 page_not_up_to_date:
-		generic_file_readahead(reada_ok, filp, inode, page);
+		generic_file_readahead(reada_ok, filp, inode, page, flags);
 
 		if (Page_Uptodate(page))
 			goto page_ok;
 
 		/* Get exclusive access to the page ... */
-		lock_page(page);
+		if (flags & F_ATOMIC) {
+			if (TryLockPage(page)) {
+				if (Page_Uptodate(page))
+					goto page_ok;
+				desc->error = -EWOULDBLOCKIO;
+				page_cache_release(page);
+				break;
+			}
+			printk(KERN_DEBUG "page_not_up_to_date: atomic trylock succeeded\n");
+		} else
+			lock_page(page);
 
 		/* Did it get unhashed before we got the lock? */
 		if (!page->mapping) {
@@ -1471,11 +1501,12 @@
 				goto page_ok;
 
 			/* Again, try some read-ahead while waiting for the page to finish.. */
-			generic_file_readahead(reada_ok, filp, inode, page);
-			wait_on_page(page);
+			generic_file_readahead(reada_ok, filp, inode, page, flags);
+			if (!(flags & F_ATOMIC))
+				wait_on_page(page);
 			if (Page_Uptodate(page))
 				goto page_ok;
-			error = -EIO;
+			error = (flags & F_ATOMIC) ? -EWOULDBLOCKIO : -EIO;
 		}
 
 		/* UHHUH! A synchronous read error occurred. Report it */
@@ -1484,6 +1515,11 @@
 		break;
 
 no_cached_page:
+		if (flags & F_ATOMIC) {
+			spin_unlock(&pagecache_lock);
+			desc->error = -EWOULDBLOCKIO;
+			break;
+		}
 		/*
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
@@ -1638,6 +1674,11 @@
  */
 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
 {
+	return generic_file_new_read(filp, buf, count, ppos, 0);
+}
+
+ssize_t generic_file_new_read(struct file * filp, char * buf, size_t count, loff_t *ppos, int flags)
+{
 	ssize_t retval;
 
 	if ((ssize_t) count < 0)
@@ -1657,7 +1698,7 @@
 			desc.count = count;
 			desc.buf = buf;
 			desc.error = 0;
-			do_generic_file_read(filp, ppos, &desc, file_read_actor);
+			do_generic_file_read(filp, ppos, &desc, file_read_actor, flags);
 
 			retval = desc.written;
 			if (!retval)
@@ -1782,7 +1823,7 @@
 		desc.count = count;
 		desc.buf = (char *) out_file;
 		desc.error = 0;
-		do_generic_file_read(in_file, ppos, &desc, file_send_actor);
+		do_generic_file_read(in_file, ppos, &desc, file_send_actor, 0);
 
 		retval = desc.written;
 		if (!retval)
@@ -3178,3 +3219,681 @@
 		panic("Failed to allocate page hash table\n");
 	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
 }
+
+/* address_space_map
+ *	Maps a series of pages from the page cache into the given array.
+ */
+static int address_space_map(struct address_space *as, unsigned long index,
+		int nr, struct page **pages,
+		int *nr_newp, struct page **new_pages)
+{
+	struct page *cached_page = NULL;
+	int nr_new = 0;
+	int ret;
+
+	if (unlikely(nr <= 0)) {
+		*nr_newp = nr_new;
+		return 0;
+	}
+
+	ret = 0;
+
+	spin_lock(&pagecache_lock);
+
+	while (nr > 0) {
+		struct page **hash = page_hash(as, index);
+		struct page *page;
+
+		page = __find_page_nolock(as, index, *hash);
+		if (page) {
+			page_cache_get(page);
+got_page:
+			pages[ret++] = page;
+			index++;
+			nr--;
+			continue;
+		}
+
+		if (cached_page) {
+			__add_to_page_cache(cached_page, as, index, hash);
+			nr_new++;
+			*new_pages++ = page = cached_page;
+			cached_page = NULL;
+			goto got_page;
+		}
+		spin_unlock(&pagecache_lock);
+
+		cached_page = page_cache_alloc(as);
+		if (!cached_page)
+			goto out;
+
+		/* Okay, we now have an allocated page.  Retry
+		 * the search and add. */
+		spin_lock(&pagecache_lock);
+	}
+
+	spin_unlock(&pagecache_lock);
+
+out:
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	*nr_newp = nr_new;
+	return ret ? ret : -ENOMEM;
+}
+
+struct iodesc {
+	struct worktodo	wtd;
+
+	struct page	*good_page;	/* the highest Uptodate page */
+	int		good_idx;
+	int		err;
+	int		did_read;
+	int		rw;
+	loff_t		pos;
+
+	struct page	**pages;
+	struct page	**new_pages;
+	struct page	**cur_pagep;
+	int		nr_pages;
+	int		nr_new_pages;
+
+	struct address_space *as;
+	struct file	*file;
+	kvec_cb_t	cb;
+
+	size_t		size;
+	unsigned long	transferred;
+	unsigned	offset;
+	struct kveclet	*veclet;
+
+	struct kvec_dst	src;
+
+	int		sync;
+	unsigned long	rlimit_fsize;
+
+#define READDESC_NR_DEF	3
+	struct page *def_pages[READDESC_NR_DEF];
+	struct page *def_new_pages[READDESC_NR_DEF];
+};
+
+static void __iodesc_free(struct iodesc *io, int unlock)
+{
+	kvec_cb_t cb;
+	ssize_t res;
+
+	if (unlock) {
+		unsigned i;
+		for (i=0; i<io->nr_pages; i++) {
+			struct page *page = io->pages[i];
+			UnlockPage(page);
+			page_cache_release(page);
+		}
+	} else {
+		unsigned i;
+		for (i=0; i<io->nr_pages; i++)
+			page_cache_release(io->pages[i]);
+	}
+
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+
+	cb = io->cb;
+	res = io->transferred ? io->transferred : io->err;
+	kfree(io);
+
+	cb.fn(cb.data, cb.vec, res);
+}
+
+/* By the time this function is called, all of the pages prior to
+ * the current good_idx have been released appropriately.  The remaining
+ * duties are to release any remaining pages and to honour O_SYNC.
+ */
+static void __iodesc_finish_write(struct iodesc *io)
+{
+	pr_debug("__iodesc_finish_write(%p)\n", io);
+
+	__iodesc_free(io, WRITE == io->rw);
+}
+
+/* This is mostly ripped from generic_file_write */
+static int __iodesc_write_page(struct iodesc *io, struct page *page)
+{
+	char *kaddr = kmap(page);
+	unsigned long bytes;
+	unsigned long offset;
+	long status;
+	int done = 0;
+
+	offset = io->offset;
+	kaddr += offset;
+
+	bytes = PAGE_CACHE_SIZE - offset;
+	if (io->size < bytes)
+		bytes = io->size;
+
+	pr_debug("__iodesc_write_page(%p (%lu), %lu %lu)\n", page, page->index, offset, bytes);
+
+	io->err = io->as->a_ops->prepare_write(io->file, page,
+						offset, offset + bytes);
+	if (unlikely(io->err)) {
+		pr_debug("prepare_write: %d\n", io->err);
+		kunmap(page);
+		return 1;
+	}
+
+	kvec_dst_map(&io->src);
+	memcpy_from_kvec_dst(kaddr, &io->src, bytes);
+	kvec_dst_unmap(&io->src);	/* commit_write may block */
+
+	flush_dcache_page(page);
+	status = io->as->a_ops->commit_write(io->file, page,
+						offset, offset+bytes);
+
+	/* We don't handle short writes */
+	if (status > 0 && status != bytes)
+		done = 1;
+
+	if (!status)
+		status = bytes;
+
+	if (likely(status > 0)) {
+		io->transferred += status;
+		io->size -= status;
+		io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1);
+
+		if (io->offset)
+			done = 1;
+	} else {
+		io->err = status;
+		done = 1;
+	}
+
+	kunmap(page);
+	return done;
+}
+
+void __iodesc_sync_wait_page(void *data)
+{
+	struct iodesc *io = data;
+
+	do {
+		struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers;
+
+		if (!head)
+			continue;
+
+		bh = head;
+		do {
+			if (buffer_locked(bh)) {
+				pr_debug("waiting on bh=%p io=%p\n", bh, io);
+				if (!wtd_wait_on_buffer(&io->wtd, bh))
+					return;
+			}
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+				pr_debug("io err bh=%p (%p)\n", bh, io);
+				io->err = -EIO;
+				break;
+			}
+		} while ((bh = bh->b_this_page) != head);
+	} while (!io->err && ++io->good_idx < io->nr_pages) ;
+
+	pr_debug("finish_write(%p)\n", io);
+	__iodesc_finish_write(io);
+}
+
+static void __iodesc_do_write(void *data)
+{
+	struct iodesc *io = data;
+	unsigned i;
+
+	for (i=0; i<io->nr_pages; i++) {
+		if (__iodesc_write_page(io, io->pages[i]))
+			break;
+	}
+
+	up(&io->file->f_dentry->d_inode->i_sem);
+
+	if (io->sync) {
+		io->good_idx = 0;
+
+		pr_debug("writing out pages(%p)\n", io);
+		for (i=0; i<io->nr_pages; i++) {
+			if (io->pages[i]->buffers)
+				writeout_one_page(io->pages[i]);
+		}
+
+		pr_debug("calling __iodesc_sync_wait_page(%p)\n", io);
+		wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io);
+		__iodesc_sync_wait_page(io);
+		return;
+	}
+
+	__iodesc_finish_write(io);
+}
+
+static void __iodesc_write_lock_next_page(void *data)
+{
+	struct iodesc *io = data;
+	pr_debug("__iodesc_write_lock_next_page(%p)\n", io);
+
+	while (io->good_idx < io->nr_pages) {
+		io->good_page = io->pages[io->good_idx++];
+		if (io->good_page == *io->cur_pagep)
+			io->cur_pagep++;
+		else {
+			if (!wtd_lock_page(&io->wtd, io->good_page))
+				return;
+		}
+	}
+
+	/* Is this faster? __iodesc_do_write(io); */
+	wtd_set_action(&io->wtd, __iodesc_do_write, io);
+	wtd_queue(&io->wtd);
+}
+
+static void __generic_file_write_iodesc(struct iodesc *io)
+{
+	struct inode *inode = io->file->f_dentry->d_inode;
+	time_t now = CURRENT_TIME;
+
+	remove_suid(inode);
+	if (inode->i_ctime != now || inode->i_mtime != now) {
+		inode->i_ctime = inode->i_mtime = now;
+		mark_inode_dirty_sync(inode);
+	}
+
+	wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io);
+	io->sync = !!(io->file->f_flags & O_SYNC);
+	io->good_idx = 0;
+	io->cur_pagep = io->new_pages;
+	__iodesc_write_lock_next_page(io);
+}
+
+static void __iodesc_read_finish(struct iodesc *io)
+{
+	struct page **src_pagep;
+	char *dst_addr, *src_addr;
+	int src_off;
+	size_t size;
+	size_t valid;
+
+	struct kveclet *veclet = io->veclet;
+	struct page *dst_page = veclet->page;
+	int dst_len = veclet->length;
+	int dst_off = veclet->offset;
+
+
+	pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx);
+	if (io->good_idx <= 0)
+		goto no_data;
+
+	size = io->size;
+	src_off = io->offset;
+	src_pagep = io->pages;
+	src_addr = kmap(*src_pagep);
+
+	valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT;
+	valid -= src_off;
+	pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off);
+
+	if (valid < size)
+		size = valid;
+
+	dst_addr = kmap(veclet->page);
+
+	while (size > 0) {
+		int this = PAGE_CACHE_SIZE - src_off;
+		if ((PAGE_SIZE - dst_off) < this)
+			this = PAGE_SIZE - dst_off;
+		if (size < this)
+			this = size;
+		pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n",
+			this, src_off, dst_off, dst_len);
+		memcpy(dst_addr + dst_off, src_addr + src_off, this);
+
+		src_off += this;
+		dst_off += this;
+		dst_len -= this;
+		size -= this;
+		io->transferred += this;
+		pr_debug("read_finish: this=%d transferred=%lu\n",
+			 this, io->transferred);
+
+		if (size <= 0)
+			break;
+
+		if (dst_len <= 0) {
+			kunmap(dst_page);
+			veclet++;
+			dst_page = veclet->page;
+			dst_off = veclet->offset;
+			dst_len = veclet->length;
+			dst_addr = kmap(dst_page);
+		}
+
+		if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */
+			kunmap(*src_pagep);
+			pr_debug("page(%lu)->count = %d\n",
+				 (*src_pagep)->index,
+				 atomic_read(&(*src_pagep)->count));
+			src_pagep++;
+			src_addr = kmap(*src_pagep);
+			src_off = 0;
+		}
+	}
+	kunmap(dst_page);
+	kunmap(*src_pagep);
+no_data:
+	__iodesc_free(io, 0);
+}
+
+static void __iodesc_make_uptodate(void *data)
+{
+	struct iodesc *io = data;
+	struct page *page = io->good_page;
+	int locked = 1;
+
+	pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index);
+again:
+	while (Page_Uptodate(page)) {
+		pr_debug("page index %lu uptodate\n", page->index);
+		if (locked) {
+			UnlockPage(page);
+			locked = 0;
+		}
+		io->did_read = 0;
+		io->good_idx++;
+		if (io->good_idx >= io->nr_pages) {
+			__iodesc_read_finish(io);
+			return;
+		}
+		page = io->good_page = io->pages[io->good_idx];
+		pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index);
+	}
+
+	if (!locked) {
+		if (!wtd_lock_page(&io->wtd, page))
+			return;
+		locked = 1;
+	}
+
+	if (!io->did_read) {
+		/* We haven't tried reading this page before, give it a go. */
+		pr_debug("attempting to read %lu\n", page->index);
+		io->did_read = 1;
+		locked = 0;
+		io->err = page->mapping->a_ops->readpage(io->file, page);
+		if (!io->err) {
+			if (Page_Uptodate(page))
+				goto again;
+			if (wtd_lock_page(&io->wtd, page)) {
+				locked = 1;
+				goto again;
+			}
+			return;
+		}
+	}
+
+	if (locked)
+		UnlockPage(page);
+
+	/* We've already read this page before.  Set err to -EIO and quit. */
+	if (!io->err)
+		io->err = -EIO;
+	__iodesc_read_finish(io);
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data);
+
+static void __generic_file_read_iodesc(struct iodesc *io, int mayblock)
+{
+	int (*readpage)(struct file *, struct page *);
+	int i;
+
+	wtd_set_action(&io->wtd, __iodesc_make_uptodate, io);
+	readpage = io->as->a_ops->readpage;
+	for (i=0; i<io->nr_new_pages; i++) {
+		int ret;
+		if (!mayblock) {
+			wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+			wtd_queue(&io->wtd);
+			return;
+		}
+		ret = readpage(io->file, io->new_pages[i]);
+		if (ret)
+			printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu) = %d\n", io->new_pages[i]->index, ret);
+	}
+
+	for (i=0; i<io->nr_pages; i++) {
+		struct page *page = io->pages[i];
+		if (Page_Uptodate(page)) {
+			pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index);
+			continue;
+		}
+
+		if (!mayblock) {
+			wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+			wtd_queue(&io->wtd);
+			return;
+		}
+		if (!TryLockPage(page)) {
+			int ret = readpage(io->file, page);
+			if (ret)
+				printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret);
+		}
+
+		if (!Page_Uptodate(page) && io->good_idx == -1) {
+			pr_debug("first good_idx=%d (%lu)\n", i, page->index);
+			io->good_idx = i;
+			io->good_page = page;
+		}
+	}
+
+	/* Whee, all the pages are uptodate! */
+	if (!io->good_page) {
+		pr_debug("all pages uptodate!\n");
+		io->good_idx = io->nr_pages;
+		__iodesc_read_finish(io);
+		return;
+	}
+
+	pr_debug("locking good_page\n");
+	if (wtd_lock_page(&io->wtd, io->good_page))
+		__iodesc_make_uptodate(io);
+	return;
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data)
+{
+	struct iodesc *io = data;
+	__generic_file_read_iodesc(io, 1);
+}
+
+static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+			 size_t size, loff_t pos);
+
+int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return generic_file_rw_kvec(file, READ, cb, size, pos);
+}
+
+int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return generic_file_rw_kvec(file, WRITE, cb, size, pos);
+}
+
+void wtd_rw_kvec_core(void *);
+int rw_kvec_core(struct iodesc *io);
+
+int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+			 size_t size, loff_t pos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int append = file->f_flags & O_APPEND;
+	struct iodesc *io = NULL;
+	int ret;
+
+	ret = -EINVAL;
+	if (unlikely(rw != READ && rw != WRITE))
+		goto out;
+
+	/* Don't check pos when appending, but otherwise do sanity
+	 * checks before allocating memory.  Negative offsets are invalid.
+	 */
+	if (unlikely(!append && pos < 0))
+		goto out;
+
+	ret = -ENOMEM;
+	io = kmalloc(sizeof(*io), GFP_KERNEL);
+	if (!io)
+		goto out;
+
+	memset(io, 0, sizeof(*io));
+	io->file = file;
+	io->rw = rw;
+	io->cb = cb;
+	io->size = size;
+	io->pos = pos;
+	io->rlimit_fsize = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	wtd_set_action(&io->wtd, wtd_rw_kvec_core, io);
+
+	if ((rw == READ) || (0 == wtd_down(&io->wtd, &inode->i_sem)))
+		return rw_kvec_core(io);
+
+	return 0;
+
+out:
+	if (!ret)
+		cb.fn(cb.data, cb.vec, ret);
+	return ret;
+}
+
+void wtd_rw_kvec_core(void *data)
+{
+	struct iodesc *io = data;
+	kvec_cb_t cb = io->cb;
+	int ret = rw_kvec_core(io);
+	if (ret)
+		cb.fn(cb.data, cb.vec, ret);
+}
+
+int rw_kvec_core(struct iodesc *io)
+{
+	int append = io->file->f_flags & O_APPEND;
+	struct inode *inode = io->file->f_dentry->d_inode;
+	struct address_space *as = inode->i_mapping;
+	unsigned long index;
+	unsigned long eindex;
+	unsigned long nr_pages;
+	int ret;
+
+	if (io->rw == WRITE) {
+		unsigned long long tmp;
+		loff_t limit;
+
+		/* We've already down'd the inode semaphore */
+		if (append)
+			io->pos = inode->i_size;
+
+		limit = io->rlimit_fsize;
+		if (likely(RLIM_INFINITY == limit))
+			limit = OFFSET_MAX;
+
+		/* Filesystem limits take precedence over user limits */
+		if (likely(inode->i_sb->s_maxbytes < limit))
+			limit = inode->i_sb->s_maxbytes;
+
+	        if (unlikely(io->pos >= limit)) {
+			pr_debug("maxbytes: %Ld\n", limit);
+			ret = 0;
+			if (io->size || io->pos > limit)
+				ret = -EFBIG;
+			goto out_io;
+		}
+
+		/* Clamp writes straddling limit. */
+		tmp = io->pos + io->size;
+		if (unlikely(tmp > (unsigned long long)limit))
+			io->size = limit - io->pos;
+	}
+
+	if (READ == io->rw) {
+		pr_debug("io->pos=%Ld i_size=%Ld\n", io->pos, inode->i_size);
+
+		if (io->pos > inode->i_size)
+			io->size = 0;
+		else if ((io->pos + io->size) > inode->i_size) {
+			size_t size = inode->i_size - io->pos;
+			if (size < io->size)
+				io->size = size;
+		}
+
+		pr_debug("io->size=%d\n", io->size);
+	}
+
+	ret = 0;
+	if (unlikely(!io->size))
+		goto out_io;
+
+	index = io->pos >> PAGE_CACHE_SHIFT;
+	eindex = (io->pos + io->size - 1) >> PAGE_CACHE_SHIFT;
+	nr_pages = eindex - index + 1;
+
+	pr_debug("nr_pages: %lu\n", nr_pages);
+
+	io->good_idx = -1;
+	io->good_page = NULL;
+	io->did_read = 0;
+	io->err = 0;
+	io->as = as;
+	io->offset = (unsigned long)io->pos & (PAGE_CACHE_SIZE - 1);
+	kvec_dst_init(&io->src, KM_USER0);
+	kvec_dst_set(&io->src, io->cb.vec->veclet);
+	io->veclet = io->cb.vec->veclet;
+	if (nr_pages < READDESC_NR_DEF) {
+		io->pages = io->def_pages;
+		io->new_pages = io->def_new_pages;
+	} else {
+		io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->pages)
+			goto out_io;
+
+		io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->new_pages)
+			goto out_pages;
+	}
+
+	ret = address_space_map(as, index, nr_pages, io->pages,
+			&io->nr_new_pages, io->new_pages);
+	pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages);
+	if (ret <= 0)
+		goto out_new_pages;
+
+	io->nr_pages = ret;
+	io->pages[io->nr_pages] = NULL;
+	io->new_pages[io->nr_new_pages] = NULL;
+
+	if (io->rw == READ)
+		__generic_file_read_iodesc(io, 0);
+	else if (io->rw == WRITE)
+		__generic_file_write_iodesc(io);
+
+	return 0;
+
+out_new_pages:
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+out_pages:
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+out_io:
+	if (io->rw == WRITE)
+		up(&inode->i_sem);
+	if (!ret)
+		io->cb.fn(io->cb.data, io->cb.vec, ret);
+	kfree(io);
+	return ret;
+}
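The mm/filemap.c hunk above is the heart of the buffered-aio path: generic_file_kvec_read()/generic_file_kvec_write() build an iodesc, map the affected page-cache pages, and complete by calling cb.fn(cb.data, cb.vec, bytes_or_errno) once the worktodo state machine finishes. A hedged sketch of a caller follows; submit_read() and read_complete() are illustrative, the prototype is assumed to be exported elsewhere in the patch, and the "zero return means the callback will fire" convention is inferred from the error paths above:

	#include <linux/fs.h>
	#include <linux/aio.h>

	static void read_complete(void *data, struct kvec *vec, ssize_t res)
	{
		/* res: bytes copied into 'vec', or a negative errno.  This may
		 * run from a worktodo worker, so do not block for long here. */
	}

	static int submit_read(struct file *file, struct kvec *vec,
			       size_t size, loff_t pos)
	{
		kvec_cb_t cb;

		cb.fn = read_complete;
		cb.data = file;
		cb.vec = vec;

		/* A zero return appears to mean the request was accepted and
		 * read_complete() will be (or already was) called; a negative
		 * return means it was rejected and no callback will run. */
		return generic_file_kvec_read(file, cb, size, pos);
	}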
diff -urN v2.4.19/mm/filemap.c.old aio-2.4.19.diff/mm/filemap.c.old
--- v2.4.19/mm/filemap.c.old	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/mm/filemap.c.old	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,3871 @@
+/*
+ *	linux/mm/filemap.c
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+/*
+ * This file handles the generic file mmap semantics used by
+ * most "normal" filesystems (but you don't /have/ to use this:
+ * the NFS filesystem used to do this differently, for example)
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/locks.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/smp_lock.h>
+#include <linux/blkdev.h>
+#include <linux/file.h>
+#include <linux/swapctl.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/iobuf.h>
+
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mman.h>
+
+#include <linux/highmem.h>
+#include <linux/worktodo.h>
+#include <linux/iobuf.h>
+
+/*
+ * Shared mappings implemented 30.11.1994. It's not fully working yet,
+ * though.
+ *
+ * Shared mappings now work. 15.8.1995  Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
+ */
+
+atomic_t page_cache_size = ATOMIC_INIT(0);
+unsigned int page_hash_bits;
+struct page **page_hash_table;
+
+int vm_max_readahead = 31;
+int vm_min_readahead = 3;
+EXPORT_SYMBOL(vm_max_readahead);
+EXPORT_SYMBOL(vm_min_readahead);
+
+
+spinlock_t pagecache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+/*
+ * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock 
+ *	with the pagecache_lock held.
+ *
+ * Ordering:
+ *	swap_lock ->
+ *		pagemap_lru_lock ->
+ *			pagecache_lock
+ */
+spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+
+#define CLUSTER_PAGES		(1 << page_cluster)
+#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
+
+static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
+static void add_page_to_hash_queue(struct page * page, struct page **p)
+{
+	struct page *next = *p;
+
+	*p = page;
+	page->next_hash = next;
+	page->pprev_hash = p;
+	if (next)
+		next->pprev_hash = &page->next_hash;
+	if (page->buffers)
+		PAGE_BUG(page);
+	atomic_inc(&page_cache_size);
+}
+
+static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
+{
+	struct list_head *head = &mapping->clean_pages;
+
+	mapping->nrpages++;
+	list_add(&page->list, head);
+	page->mapping = mapping;
+}
+
+static inline void remove_page_from_inode_queue(struct page * page)
+{
+	struct address_space * mapping = page->mapping;
+
+	mapping->nrpages--;
+	list_del(&page->list);
+	page->mapping = NULL;
+}
+
+static inline void remove_page_from_hash_queue(struct page * page)
+{
+	struct page *next = page->next_hash;
+	struct page **pprev = page->pprev_hash;
+
+	if (next)
+		next->pprev_hash = pprev;
+	*pprev = next;
+	page->pprev_hash = NULL;
+	atomic_dec(&page_cache_size);
+}
+
+/*
+ * Remove a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe.
+ */
+void __remove_inode_page(struct page *page)
+{
+	if (PageDirty(page)) BUG();
+	remove_page_from_inode_queue(page);
+	remove_page_from_hash_queue(page);
+}
+
+void remove_inode_page(struct page *page)
+{
+	if (!PageLocked(page))
+		PAGE_BUG(page);
+
+	spin_lock(&pagecache_lock);
+	__remove_inode_page(page);
+	spin_unlock(&pagecache_lock);
+}
+
+static inline int sync_page(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
+		return mapping->a_ops->sync_page(page);
+	return 0;
+}
+
+/*
+ * Add a page to the dirty page list.
+ */
+void set_page_dirty(struct page *page)
+{
+	if (!test_and_set_bit(PG_dirty, &page->flags)) {
+		struct address_space *mapping = page->mapping;
+
+		if (mapping) {
+			spin_lock(&pagecache_lock);
+			list_del(&page->list);
+			list_add(&page->list, &mapping->dirty_pages);
+			spin_unlock(&pagecache_lock);
+
+			if (mapping->host)
+				mark_inode_dirty_pages(mapping->host);
+		}
+	}
+}
+
+/**
+ * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
+ * @inode: the inode which pages we want to invalidate
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ */
+
+void invalidate_inode_pages(struct inode * inode)
+{
+	struct list_head *head, *curr;
+	struct page * page;
+
+	head = &inode->i_mapping->clean_pages;
+
+	spin_lock(&pagemap_lru_lock);
+	spin_lock(&pagecache_lock);
+	curr = head->next;
+
+	while (curr != head) {
+		page = list_entry(curr, struct page, list);
+		curr = curr->next;
+
+		/* We cannot invalidate something in dirty.. */
+		if (PageDirty(page))
+			continue;
+
+		/* ..or locked */
+		if (TryLockPage(page))
+			continue;
+
+		if (page->buffers && !try_to_free_buffers(page, 0))
+			goto unlock;
+
+		if (page_count(page) != 1)
+			goto unlock;
+
+		__lru_cache_del(page);
+		__remove_inode_page(page);
+		UnlockPage(page);
+		page_cache_release(page);
+		continue;
+unlock:
+		UnlockPage(page);
+		continue;
+	}
+
+	spin_unlock(&pagecache_lock);
+	spin_unlock(&pagemap_lru_lock);
+}
+
+static int do_flushpage(struct page *page, unsigned long offset)
+{
+	int (*flushpage) (struct page *, unsigned long);
+	flushpage = page->mapping->a_ops->flushpage;
+	if (flushpage)
+		return (*flushpage)(page, offset);
+	return block_flushpage(page, offset);
+}
+
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+	if (page->buffers)
+		do_flushpage(page, partial);
+}
+
+static void truncate_complete_page(struct page *page)
+{
+	/* Leave it on the LRU if it gets converted into anonymous buffers */
+	if (!page->buffers || do_flushpage(page, 0))
+		lru_cache_del(page);
+
+	/*
+	 * We remove the page from the page cache _after_ we have
+	 * destroyed all buffer-cache references to it. Otherwise some
+	 * other process might think this inode page is not in the
+	 * page cache and creates a buffer-cache alias to it causing
+	 * all sorts of fun problems ...  
+	 */
+	ClearPageDirty(page);
+	ClearPageUptodate(page);
+	remove_inode_page(page);
+	page_cache_release(page);
+}
+
+static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
+static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
+{
+	struct list_head *curr;
+	struct page * page;
+	int unlocked = 0;
+
+ restart:
+	curr = head->prev;
+	while (curr != head) {
+		unsigned long offset;
+
+		page = list_entry(curr, struct page, list);
+		offset = page->index;
+
+		/* Is one of the pages to truncate? */
+		if ((offset >= start) || (*partial && (offset + 1) == start)) {
+			int failed;
+
+			page_cache_get(page);
+			failed = TryLockPage(page);
+
+			list_del(head);
+			if (!failed)
+				/* Restart after this page */
+				list_add_tail(head, curr);
+			else
+				/* Restart on this page */
+				list_add(head, curr);
+
+			spin_unlock(&pagecache_lock);
+			unlocked = 1;
+
+ 			if (!failed) {
+				if (*partial && (offset + 1) == start) {
+					truncate_partial_page(page, *partial);
+					*partial = 0;
+				} else 
+					truncate_complete_page(page);
+
+				UnlockPage(page);
+			} else
+ 				wait_on_page(page);
+
+			page_cache_release(page);
+
+			if (current->need_resched) {
+				__set_current_state(TASK_RUNNING);
+				schedule();
+			}
+
+			spin_lock(&pagecache_lock);
+			goto restart;
+		}
+		curr = curr->prev;
+	}
+	return unlocked;
+}
+
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from with to truncate
+ *
+ * Truncate the page cache at a set offset, removing the pages
+ * that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
+ */
+void truncate_inode_pages(struct address_space * mapping, loff_t lstart) 
+{
+	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+	int unlocked;
+
+	spin_lock(&pagecache_lock);
+	do {
+		unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
+		unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
+		unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
+	} while (unlocked);
+	/* Traversed all three lists without dropping the lock */
+	spin_unlock(&pagecache_lock);
+}
+
+static inline int invalidate_this_page2(struct page * page,
+					struct list_head * curr,
+					struct list_head * head)
+{
+	int unlocked = 1;
+
+	/*
+	 * The page is locked and we hold the pagecache_lock as well
+	 * so both page_count(page) and page->buffers stays constant here.
+	 */
+	if (page_count(page) == 1 + !!page->buffers) {
+		/* Restart after this page */
+		list_del(head);
+		list_add_tail(head, curr);
+
+		page_cache_get(page);
+		spin_unlock(&pagecache_lock);
+		truncate_complete_page(page);
+	} else {
+		if (page->buffers) {
+			/* Restart after this page */
+			list_del(head);
+			list_add_tail(head, curr);
+
+			page_cache_get(page);
+			spin_unlock(&pagecache_lock);
+			block_invalidate_page(page);
+		} else
+			unlocked = 0;
+
+		ClearPageDirty(page);
+		ClearPageUptodate(page);
+	}
+
+	return unlocked;
+}
+
+static int FASTCALL(invalidate_list_pages2(struct list_head *));
+static int invalidate_list_pages2(struct list_head *head)
+{
+	struct list_head *curr;
+	struct page * page;
+	int unlocked = 0;
+
+ restart:
+	curr = head->prev;
+	while (curr != head) {
+		page = list_entry(curr, struct page, list);
+
+		if (!TryLockPage(page)) {
+			int __unlocked;
+
+			__unlocked = invalidate_this_page2(page, curr, head);
+			UnlockPage(page);
+			unlocked |= __unlocked;
+			if (!__unlocked) {
+				curr = curr->prev;
+				continue;
+			}
+		} else {
+			/* Restart on this page */
+			list_del(head);
+			list_add(head, curr);
+
+			page_cache_get(page);
+			spin_unlock(&pagecache_lock);
+			unlocked = 1;
+			wait_on_page(page);
+		}
+
+		page_cache_release(page);
+		if (current->need_resched) {
+			__set_current_state(TASK_RUNNING);
+			schedule();
+		}
+
+		spin_lock(&pagecache_lock);
+		goto restart;
+	}
+	return unlocked;
+}
+
+/**
+ * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
+ * free the pages because they're mapped.
+ * @mapping: the address_space which pages we want to invalidate
+ */
+void invalidate_inode_pages2(struct address_space * mapping)
+{
+	int unlocked;
+
+	spin_lock(&pagecache_lock);
+	do {
+		unlocked = invalidate_list_pages2(&mapping->clean_pages);
+		unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
+		unlocked |= invalidate_list_pages2(&mapping->locked_pages);
+	} while (unlocked);
+	spin_unlock(&pagecache_lock);
+}
+
+static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
+{
+	goto inside;
+
+	for (;;) {
+		page = page->next_hash;
+inside:
+		if (!page)
+			goto not_found;
+		if (page->mapping != mapping)
+			continue;
+		if (page->index == offset)
+			break;
+	}
+
+not_found:
+	return page;
+}
+
+static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
+{
+	struct list_head *curr;
+	struct page *page;
+	int retval = 0;
+
+	spin_lock(&pagecache_lock);
+	curr = head->next;
+	while (curr != head) {
+		page = list_entry(curr, struct page, list);
+		curr = curr->next;
+		if (!page->buffers)
+			continue;
+		if (page->index >= end)
+			continue;
+		if (page->index < start)
+			continue;
+
+		page_cache_get(page);
+		spin_unlock(&pagecache_lock);
+		lock_page(page);
+
+		/* The buffers could have been free'd while we waited for the page lock */
+		if (page->buffers)
+			retval |= fn(page);
+
+		UnlockPage(page);
+		spin_lock(&pagecache_lock);
+		curr = page->list.next;
+		page_cache_release(page);
+	}
+	spin_unlock(&pagecache_lock);
+
+	return retval;
+}
+
+/*
+ * Two-stage data sync: first start the IO, then go back and
+ * collect the information..
+ */
+int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
+{
+	int retval;
+
+	/* writeout dirty buffers on pages from both clean and dirty lists */
+	retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
+	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
+	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
+
+	/* now wait for locked buffers on pages from both clean and dirty lists */
+	retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
+	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
+	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
+
+	return retval;
+}
+
+/*
+ * In-memory filesystems have to fail their
+ * writepage function - and this has to be
+ * worked around in the VM layer..
+ *
+ * We
+ *  - mark the page dirty again (but do NOT
+ *    add it back to the inode dirty list, as
+ *    that would livelock in fdatasync)
+ *  - activate the page so that the page stealer
+ *    doesn't try to write it out over and over
+ *    again.
+ */
+int fail_writepage(struct page *page)
+{
+	/* Only activate on memory-pressure, not fsync.. */
+	if (PageLaunder(page)) {
+		activate_page(page);
+		SetPageReferenced(page);
+	}
+
+	/* Set the page dirty again, unlock */
+	SetPageDirty(page);
+	UnlockPage(page);
+	return 0;
+}
+
+EXPORT_SYMBOL(fail_writepage);
+
+/**
+ *      filemap_fdatasync - walk the list of dirty pages of the given address space
+ *     	and writepage() all of them.
+ * 
+ *      @mapping: address space structure to write
+ *
+ */
+int filemap_fdatasync(struct address_space * mapping)
+{
+	int ret = 0;
+	int (*writepage)(struct page *) = mapping->a_ops->writepage;
+
+	spin_lock(&pagecache_lock);
+
+        while (!list_empty(&mapping->dirty_pages)) {
+		struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
+
+		list_del(&page->list);
+		list_add(&page->list, &mapping->locked_pages);
+
+		if (!PageDirty(page))
+			continue;
+
+		page_cache_get(page);
+		spin_unlock(&pagecache_lock);
+
+		lock_page(page);
+
+		if (PageDirty(page)) {
+			int err;
+			ClearPageDirty(page);
+			err = writepage(page);
+			if (err && !ret)
+				ret = err;
+		} else
+			UnlockPage(page);
+
+		page_cache_release(page);
+		spin_lock(&pagecache_lock);
+	}
+	spin_unlock(&pagecache_lock);
+	return ret;
+}
+
+/**
+ *      filemap_fdatawait - walk the list of locked pages of the given address space
+ *     	and wait for all of them.
+ * 
+ *      @mapping: address space structure to wait for
+ *
+ */
+int filemap_fdatawait(struct address_space * mapping)
+{
+	int ret = 0;
+
+	spin_lock(&pagecache_lock);
+
+        while (!list_empty(&mapping->locked_pages)) {
+		struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
+
+		list_del(&page->list);
+		list_add(&page->list, &mapping->clean_pages);
+
+		if (!PageLocked(page))
+			continue;
+
+		page_cache_get(page);
+		spin_unlock(&pagecache_lock);
+
+		___wait_on_page(page);
+		if (PageError(page))
+			ret = -EIO;
+
+		page_cache_release(page);
+		spin_lock(&pagecache_lock);
+	}
+	spin_unlock(&pagecache_lock);
+	return ret;
+}
+
+/*
+ * Add a page to the inode page cache.
+ *
+ * The caller must have locked the page and 
+ * set all the page flags correctly..
+ */
+void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
+{
+	if (!PageLocked(page))
+		BUG();
+
+	page->index = index;
+	page_cache_get(page);
+	spin_lock(&pagecache_lock);
+	add_page_to_inode_queue(mapping, page);
+	add_page_to_hash_queue(page, page_hash(mapping, index));
+	spin_unlock(&pagecache_lock);
+
+	lru_cache_add(page);
+}
+
+/*
+ * This adds a page to the page cache, starting out as locked,
+ * owned by us, but unreferenced, not uptodate and with no errors.
+ */
+static inline void __add_to_page_cache(struct page * page,
+	struct address_space *mapping, unsigned long offset,
+	struct page **hash)
+{
+	unsigned long flags;
+
+	flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
+	page->flags = flags | (1 << PG_locked);
+	page_cache_get(page);
+	page->index = offset;
+	add_page_to_inode_queue(mapping, page);
+	add_page_to_hash_queue(page, hash);
+}
+
+void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
+{
+	spin_lock(&pagecache_lock);
+	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
+	spin_unlock(&pagecache_lock);
+	lru_cache_add(page);
+}
+
+int add_to_page_cache_unique(struct page * page,
+	struct address_space *mapping, unsigned long offset,
+	struct page **hash)
+{
+	int err;
+	struct page *alias;
+
+	spin_lock(&pagecache_lock);
+	alias = __find_page_nolock(mapping, offset, *hash);
+
+	err = 1;
+	if (!alias) {
+		__add_to_page_cache(page,mapping,offset,hash);
+		err = 0;
+	}
+
+	spin_unlock(&pagecache_lock);
+	if (!err)
+		lru_cache_add(page);
+	return err;
+}
+
+/*
+ * This adds the requested page to the page cache if it isn't already there,
+ * and schedules an I/O to read in its contents from disk.
+ */
+static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
+static int page_cache_read(struct file * file, unsigned long offset)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct page **hash = page_hash(mapping, offset);
+	struct page *page; 
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(mapping, offset, *hash);
+	spin_unlock(&pagecache_lock);
+	if (page)
+		return 0;
+
+	page = page_cache_alloc(mapping);
+	if (!page)
+		return -ENOMEM;
+
+	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
+		int error = mapping->a_ops->readpage(file, page);
+		page_cache_release(page);
+		return error;
+	}
+	/*
+	 * We arrive here in the unlikely event that someone 
+	 * raced with us and added our page to the cache first.
+	 */
+	page_cache_release(page);
+	return 0;
+}
+
+/*
+ * Read in an entire cluster at once.  A cluster is usually a 64k-
+ * aligned block that includes the page requested in "offset."
+ */
+static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
+					     unsigned long filesize));
+static int read_cluster_nonblocking(struct file * file, unsigned long offset,
+	unsigned long filesize)
+{
+	unsigned long pages = CLUSTER_PAGES;
+
+	offset = CLUSTER_OFFSET(offset);
+	while ((pages-- > 0) && (offset < filesize)) {
+		int error = page_cache_read(file, offset);
+		if (error < 0)
+			return error;
+		offset ++;
+	}
+
+	return 0;
+}
+
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+#if BITS_PER_LONG == 32
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e370001UL
+#elif BITS_PER_LONG == 64
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
+#else
+#error Define GOLDEN_RATIO_PRIME for your wordsize.
+#endif
+
+/*
+ * In order to wait for pages to become available there must be
+ * waitqueues associated with pages. By using a hash table of
+ * waitqueues where the bucket discipline is to maintain all
+ * waiters on the same queue and wake all when any of the pages
+ * become available, and for the woken contexts to check to be
+ * sure the appropriate page became available, this saves space
+ * at a cost of "thundering herd" phenomena during rare hash
+ * collisions.
+ */
+static inline wait_queue_head_t *__page_waitqueue(struct page *page)
+{
+	const zone_t *zone = page_zone(page);
+	wait_queue_head_t *wait = zone->wait_table;
+	unsigned long hash = (unsigned long)page;
+
+#if BITS_PER_LONG == 64
+	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
+	unsigned long n = hash;
+	n <<= 18;
+	hash -= n;
+	n <<= 33;
+	hash -= n;
+	n <<= 3;
+	hash += n;
+	n <<= 3;
+	hash -= n;
+	n <<= 4;
+	hash += n;
+	n <<= 2;
+	hash += n;
+#else
+	/* On some cpus multiply is faster, on others gcc will do shifts */
+	hash *= GOLDEN_RATIO_PRIME;
+#endif
+
+	hash >>= zone->wait_table_shift;
+
+	return &wait[hash];
+}
+
+wait_queue_head_t *page_waitqueue(struct page *page)
+{
+	return __page_waitqueue(page);
+}
+
+#define page_waitqueue(page) __page_waitqueue(page)
+
+/* 
+ * Wait for a page to get unlocked.
+ *
+ * This must be called with the caller "holding" the page,
+ * ie with increased "page->count" so that the page won't
+ * go away during the wait..
+ */
+void ___wait_on_page(struct page *page)
+{
+	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	add_wait_queue(waitqueue, &wait);
+	do {
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		if (!PageLocked(page))
+			break;
+		sync_page(page);
+		schedule();
+	} while (PageLocked(page));
+	__set_task_state(tsk, TASK_RUNNING);
+	remove_wait_queue(waitqueue, &wait);
+}
+
+/*
+ * Unlock the page and wake up sleepers in ___wait_on_page.
+ */
+void unlock_page(struct page *page)
+{
+	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	clear_bit(PG_launder, &(page)->flags);
+	smp_mb__before_clear_bit();
+	if (!test_and_clear_bit(PG_locked, &(page)->flags))
+		BUG();
+	smp_mb__after_clear_bit(); 
+	if (waitqueue_active(waitqueue))
+		wake_up_all(waitqueue);
+}
+
+/*
+ * Get a lock on the page, assuming we need to sleep
+ * to get it..
+ */
+static void __lock_page(struct page *page)
+{
+	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	add_wait_queue_exclusive(waitqueue, &wait);
+	for (;;) {
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		if (PageLocked(page)) {
+			sync_page(page);
+			schedule();
+		}
+		if (!TryLockPage(page))
+			break;
+	}
+	__set_task_state(tsk, TASK_RUNNING);
+	remove_wait_queue(waitqueue, &wait);
+}
+
+/*
+ * Get an exclusive lock on the page, optimistically
+ * assuming it's not locked..
+ */
+void lock_page(struct page *page)
+{
+	if (TryLockPage(page))
+		__lock_page(page);
+}
+
+/*
+ * a rather lightweight function, finding and getting a reference to a
+ * hashed page atomically.
+ */
+struct page * __find_get_page(struct address_space *mapping,
+			      unsigned long offset, struct page **hash)
+{
+	struct page *page;
+
+	/*
+	 * We scan the hash list read-only. Addition to and removal from
+	 * the hash-list needs a held write-lock.
+	 */
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(mapping, offset, *hash);
+	if (page)
+		page_cache_get(page);
+	spin_unlock(&pagecache_lock);
+	return page;
+}
+
+/*
+ * Same as above, but trylock it instead of incrementing the count.
+ */
+struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
+{
+	struct page *page;
+	struct page **hash = page_hash(mapping, offset);
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(mapping, offset, *hash);
+	if (page) {
+		if (TryLockPage(page))
+			page = NULL;
+	}
+	spin_unlock(&pagecache_lock);
+	return page;
+}
+
+/*
+ * Must be called with the pagecache lock held,
+ * will return with it held (but it may be dropped
+ * during blocking operations..
+ */
+static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
+static struct page * __find_lock_page_helper(struct address_space *mapping,
+					unsigned long offset, struct page *hash)
+{
+	struct page *page;
+
+	/*
+	 * We scan the hash list read-only. Addition to and removal from
+	 * the hash-list needs a held write-lock.
+	 */
+repeat:
+	page = __find_page_nolock(mapping, offset, hash);
+	if (page) {
+		page_cache_get(page);
+		if (TryLockPage(page)) {
+			spin_unlock(&pagecache_lock);
+			lock_page(page);
+			spin_lock(&pagecache_lock);
+
+			/* Has the page been re-allocated while we slept? */
+			if (page->mapping != mapping || page->index != offset) {
+				UnlockPage(page);
+				page_cache_release(page);
+				goto repeat;
+			}
+		}
+	}
+	return page;
+}
+
+/*
+ * Same as the above, but lock the page too, verifying that
+ * it's still valid once we own it.
+ */
+struct page * __find_lock_page (struct address_space *mapping,
+				unsigned long offset, struct page **hash)
+{
+	struct page *page;
+
+	spin_lock(&pagecache_lock);
+	page = __find_lock_page_helper(mapping, offset, *hash);
+	spin_unlock(&pagecache_lock);
+	return page;
+}
+
+/*
+ * Same as above, but create the page if required..
+ */
+struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
+{
+	struct page *page;
+	struct page **hash = page_hash(mapping, index);
+
+	spin_lock(&pagecache_lock);
+	page = __find_lock_page_helper(mapping, index, *hash);
+	spin_unlock(&pagecache_lock);
+	if (!page) {
+		struct page *newpage = alloc_page(gfp_mask);
+		if (newpage) {
+			spin_lock(&pagecache_lock);
+			page = __find_lock_page_helper(mapping, index, *hash);
+			if (likely(!page)) {
+				page = newpage;
+				__add_to_page_cache(page, mapping, index, hash);
+				newpage = NULL;
+			}
+			spin_unlock(&pagecache_lock);
+			if (newpage == NULL)
+				lru_cache_add(page);
+			else 
+				page_cache_release(newpage);
+		}
+	}
+	return page;	
+}
+
+/*
+ * Returns locked page at given index in given cache, creating it if needed.
+ */
+struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
+{
+	return find_or_create_page(mapping, index, mapping->gfp_mask);
+}
+
+
+/*
+ * Same as grab_cache_page, but do not wait if the page is unavailable.
+ * This is intended for speculative data generators, where the data can
+ * be regenerated if the page couldn't be grabbed.  This routine should
+ * be safe to call while holding the lock for another page.
+ */
+struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
+{
+	struct page *page, **hash;
+
+	hash = page_hash(mapping, index);
+	page = __find_get_page(mapping, index, hash);
+
+	if ( page ) {
+		if ( !TryLockPage(page) ) {
+			/* Page found and locked */
+			/* This test is overly paranoid, but what the heck... */
+			if ( unlikely(page->mapping != mapping || page->index != index) ) {
+				/* Someone reallocated this page under us. */
+				UnlockPage(page);
+				page_cache_release(page);
+				return NULL;
+			} else {
+				return page;
+			}
+		} else {
+			/* Page locked by someone else */
+			page_cache_release(page);
+			return NULL;
+		}
+	}
+
+	page = page_cache_alloc(mapping);
+	if ( unlikely(!page) )
+		return NULL;	/* Failed to allocate a page */
+
+	if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
+		/* Someone else grabbed the page already. */
+		page_cache_release(page);
+		return NULL;
+	}
+
+	return page;
+}
+
+#if 0
+#define PROFILE_READAHEAD
+#define DEBUG_READAHEAD
+#endif
+
+/*
+ * Read-ahead profiling information
+ * --------------------------------
+ * Every PROFILE_MAXREADCOUNT, the following information is written 
+ * to the syslog:
+ *   Percentage of asynchronous read-ahead.
+ *   Average of read-ahead fields context value.
+ * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
+ * to the syslog.
+ */
+
+#ifdef PROFILE_READAHEAD
+
+#define PROFILE_MAXREADCOUNT 1000
+
+static unsigned long total_reada;
+static unsigned long total_async;
+static unsigned long total_ramax;
+static unsigned long total_ralen;
+static unsigned long total_rawin;
+
+static void profile_readahead(int async, struct file *filp)
+{
+	unsigned long flags;
+
+	++total_reada;
+	if (async)
+		++total_async;
+
+	total_ramax	+= filp->f_ramax;
+	total_ralen	+= filp->f_ralen;
+	total_rawin	+= filp->f_rawin;
+
+	if (total_reada > PROFILE_MAXREADCOUNT) {
+		save_flags(flags);
+		cli();
+		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
+			restore_flags(flags);
+			return;
+		}
+
+		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
+			total_ramax/total_reada,
+			total_ralen/total_reada,
+			total_rawin/total_reada,
+			(total_async*100)/total_reada);
+#ifdef DEBUG_READAHEAD
+		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
+			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
+#endif
+
+		total_reada	= 0;
+		total_async	= 0;
+		total_ramax	= 0;
+		total_ralen	= 0;
+		total_rawin	= 0;
+
+		restore_flags(flags);
+	}
+}
+#endif  /* defined PROFILE_READAHEAD */
+
+/*
+ * Read-ahead context:
+ * -------------------
+ * The read ahead context fields of the "struct file" are the following:
+ * - f_raend : position of the first byte after the last page we tried to
+ *	       read ahead.
+ * - f_ramax : current read-ahead maximum size.
+ * - f_ralen : length of the current IO read block we tried to read-ahead.
+ * - f_rawin : length of the current read-ahead window.
+ *		if last read-ahead was synchronous then
+ *			f_rawin = f_ralen
+ *		otherwise (was asynchronous)
+ *			f_rawin = previous value of f_ralen + f_ralen
+ *
+ * Read-ahead limits:
+ * ------------------
+ * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
+ * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
+ *
+ * Synchronous read-ahead benefits:
+ * --------------------------------
+ * Using reasonable IO xfer length from peripheral devices increase system 
+ * performances.
+ * Reasonable means, in this context, not too large but not too small.
+ * The actual maximum value is:
+ *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined
+ *      and 32K if defined (4K page size assumed).
+ *
+ * Asynchronous read-ahead benefits:
+ * ---------------------------------
+ * Overlapping next read request and user process execution increase system 
+ * performance.
+ *
+ * Read-ahead risks:
+ * -----------------
+ * We have to guess which further data are needed by the user process.
+ * If these data are often not really needed, it's bad for system 
+ * performances.
+ * However, we know that files are often accessed sequentially by 
+ * application programs and it seems that it is possible to have some good 
+ * strategy in that guessing.
+ * We only try to read-ahead files that seems to be read sequentially.
+ *
+ * Asynchronous read-ahead risks:
+ * ------------------------------
+ * In order to maximize overlapping, we must start some asynchronous read 
+ * request from the device, as soon as possible.
+ * We must be very careful about:
+ * - The number of effective pending IO read requests.
+ *   ONE seems to be the only reasonable value.
+ * - The total memory pool usage for the file access stream.
+ *   This maximum memory usage is implicitly 2 IO read chunks:
+ *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
+ *   64k if defined (4K page size assumed).
+ */
+
+static inline int get_max_readahead(struct inode * inode)
+{
+	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
+		return vm_max_readahead;
+	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
+}
+
+static void generic_file_readahead(int reada_ok,
+	struct file * filp, struct inode * inode,
+	struct page * page, int flags)
+{
+	unsigned long end_index;
+	unsigned long index = page->index;
+	unsigned long max_ahead, ahead;
+	unsigned long raend;
+	int max_readahead = get_max_readahead(inode);
+
+	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+
+	raend = filp->f_raend;
+	max_ahead = 0;
+
+/*
+ * The current page is locked.
+ * If the current position is inside the previous read IO request, do not
+ * try to reread previously read ahead pages.
+ * Otherwise decide whether or not to read ahead some pages synchronously.
+ * If we are not going to read ahead, set the read ahead context for this 
+ * page only.
+ */
+	if (PageLocked(page)) {
+		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
+			raend = index;
+			if (raend < end_index)
+				max_ahead = filp->f_ramax;
+			filp->f_rawin = 0;
+			filp->f_ralen = 1;
+			if (!max_ahead) {
+				filp->f_raend  = index + filp->f_ralen;
+				filp->f_rawin += filp->f_ralen;
+			}
+		}
+	}
+/*
+ * The current page is not locked.
+ * If we were reading ahead and,
+ * if the current max read ahead size is not zero and,
+ * if the current position is inside the last read-ahead IO request,
+ *   then this is the moment to try to read ahead asynchronously.
+ * We will later force an unplug of the device in order to force the
+ * asynchronous read IO to start.
+ */
+	else if (reada_ok && filp->f_ramax && raend >= 1 &&
+		 index <= raend && index + filp->f_ralen >= raend) {
+/*
+ * Add ONE page to max_ahead in order to try to have about the same IO max size
+ * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
+ * Compute the position of the last page we have tried to read in order to 
+ * begin to read ahead just at the next page.
+ */
+		raend -= 1;
+		if (raend < end_index)
+			max_ahead = filp->f_ramax + 1;
+
+		if (max_ahead) {
+			filp->f_rawin = filp->f_ralen;
+			filp->f_ralen = 0;
+			reada_ok      = 2;
+		}
+	}
+/*
+ * Try to read ahead pages.
+ * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
+ * scheduler will do a good enough job to avoid issuing too many poor
+ * actual IO requests.
+ */
+	ahead = 0;
+	while (ahead < max_ahead) {
+		ahead ++;
+		if ((raend + ahead) >= end_index)
+			break;
+		if (page_cache_read(filp, raend + ahead) < 0)
+			break;
+	}
+/*
+ * If we tried to read ahead some pages,
+ * and if we tried to read ahead asynchronously,
+ *   try to force an unplug of the device in order to start the asynchronous
+ *   read IO request.
+ * Update the read-ahead context.
+ * Store the length of the current read-ahead window.
+ * Double the current max read ahead size.
+ *   That heuristic avoids doing large IO for files that are not really
+ *   accessed sequentially.
+ */
+	if (ahead) {
+		filp->f_ralen += ahead;
+		filp->f_rawin += filp->f_ralen;
+		filp->f_raend = raend + ahead + 1;
+
+		filp->f_ramax += filp->f_ramax;
+
+		if (filp->f_ramax > max_readahead)
+			filp->f_ramax = max_readahead;
+
+#ifdef PROFILE_READAHEAD
+		profile_readahead((reada_ok == 2), filp);
+#endif
+	}
+
+	return;
+}
+
+/*
+ * Mark a page as having seen activity.
+ *
+ * If it was already so marked, move it
+ * to the active queue and drop the referenced
+ * bit. Otherwise, just mark it for future
+ * action..
+ */
+void mark_page_accessed(struct page *page)
+{
+	if (!PageActive(page) && PageReferenced(page)) {
+		activate_page(page);
+		ClearPageReferenced(page);
+		return;
+	}
+
+	/* Mark the page referenced, AFTER checking for previous usage.. */
+	SetPageReferenced(page);
+}
+
+/*
+ * This is a generic file read routine, and uses the
+ * inode->i_op->readpage() function for the actual low-level
+ * stuff.
+ *
+ * This is really ugly. But the goto's actually try to clarify some
+ * of the logic when it comes to error handling etc.
+ */
+void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor, int flags)
+{
+	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long index, offset;
+	struct page *cached_page;
+	int reada_ok;
+	int error;
+	int max_readahead = get_max_readahead(inode);
+	loff_t pos;
+
+	pos = *ppos;
+	if (unlikely(pos < 0)) {
+		desc->error = -EINVAL;
+		return;
+	}
+
+	cached_page = NULL;
+	index = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_CACHE_MASK;
+
+/*
+ * If the current position is outside the previous read-ahead window, 
+ * we reset the current read-ahead context and set read ahead max to zero
+ * (it will be set to just the needed value later),
+ * otherwise, we assume that the file accesses are sequential enough to
+ * continue read-ahead.
+ */
+	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
+		reada_ok = 0;
+		filp->f_raend = 0;
+		filp->f_ralen = 0;
+		filp->f_ramax = 0;
+		filp->f_rawin = 0;
+	} else {
+		reada_ok = 1;
+	}
+/*
+ * Adjust the current value of read-ahead max.
+ * If the read operation stays within the first half page, force no readahead.
+ * Otherwise try to increase read ahead max just enough to do the read request.
+ * Then, use at least MIN_READAHEAD if read ahead is ok,
+ * and at most MAX_READAHEAD in all cases.
+ */
+	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
+		filp->f_ramax = 0;
+	} else {
+		unsigned long needed;
+
+		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
+
+		if (filp->f_ramax < needed)
+			filp->f_ramax = needed;
+
+		if (reada_ok && filp->f_ramax < vm_min_readahead)
+			filp->f_ramax = vm_min_readahead;
+		if (filp->f_ramax > max_readahead)
+			filp->f_ramax = max_readahead;
+	}
+
+	for (;;) {
+		struct page *page, **hash;
+		unsigned long end_index, nr, ret;
+
+		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+			
+		if (index > end_index) {
+			desc->error = 0;
+			break;
+		}
+		nr = PAGE_CACHE_SIZE;
+		if (index == end_index) {
+			nr = inode->i_size & ~PAGE_CACHE_MASK;
+			if (nr <= offset) {
+				desc->error = 0;
+				break;
+			}
+		}
+
+		nr = nr - offset;
+
+		/*
+		 * Try to find the data in the page cache..
+		 */
+		hash = page_hash(mapping, index);
+
+		spin_lock(&pagecache_lock);
+		page = __find_page_nolock(mapping, index, *hash);
+		if (!page)
+			goto no_cached_page;
+found_page:
+		page_cache_get(page);
+		spin_unlock(&pagecache_lock);
+
+		if (!Page_Uptodate(page))
+			goto page_not_up_to_date;
+		generic_file_readahead(reada_ok, filp, inode, page, flags);
+page_ok:
+		/* If users can be writing to this page using arbitrary
+		 * virtual addresses, take care about potential aliasing
+		 * before reading the page on the kernel side.
+		 */
+		if (mapping->i_mmap_shared != NULL)
+			flush_dcache_page(page);
+
+		/*
+		 * Mark the page accessed if we read the
+		 * beginning or we just did an lseek.
+		 */
+		if (!offset || !filp->f_reada)
+			mark_page_accessed(page);
+
+		/*
+		 * Ok, we have the page, and it's up-to-date, so
+		 * now we can copy it to user space...
+		 *
+		 * The actor routine returns how many bytes were actually used..
+		 * NOTE! This may not be the same as how much of a user buffer
+		 * we filled up (we may be padding etc), so we can only update
+		 * "pos" here (the actor routine has to update the user buffer
+		 * pointers and the remaining count).
+		 */
+		ret = actor(desc, page, offset, nr);
+		offset += ret;
+		index += offset >> PAGE_CACHE_SHIFT;
+		offset &= ~PAGE_CACHE_MASK;
+
+		page_cache_release(page);
+		if (ret == nr && desc->count)
+			continue;
+		break;
+
+/*
+ * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
+ */
+page_not_up_to_date:
+		generic_file_readahead(reada_ok, filp, inode, page, flags);
+
+		if (Page_Uptodate(page))
+			goto page_ok;
+
+		/* Get exclusive access to the page ... */
+		if (flags & F_ATOMIC) {
+			if (TryLockPage(page)) {
+				if (Page_Uptodate(page))
+					goto page_ok;
+				desc->error = -EWOULDBLOCKIO;
+				page_cache_release(page);
+				break;
+			}
+			printk("page_not_up_to_date: atomic trylock succeeded\n");
+		} else
+			lock_page(page);
+
+		/* Did it get unhashed before we got the lock? */
+		if (!page->mapping) {
+			UnlockPage(page);
+			page_cache_release(page);
+			continue;
+		}
+
+		/* Did somebody else fill it already? */
+		if (Page_Uptodate(page)) {
+			UnlockPage(page);
+			goto page_ok;
+		}
+
+readpage:
+		/* ... and start the actual read. The read will unlock the page. */
+		error = mapping->a_ops->readpage(filp, page);
+
+		if (!error) {
+			if (Page_Uptodate(page))
+				goto page_ok;
+
+			/* Again, try some read-ahead while waiting for the page to finish.. */
+			generic_file_readahead(reada_ok, filp, inode, page, flags);
+			if (!(flags & F_ATOMIC))
+				wait_on_page(page);
+			if (Page_Uptodate(page))
+				goto page_ok;
+			error = (flags & F_ATOMIC) ? -EWOULDBLOCKIO : -EIO;
+		}
+
+		/* UHHUH! A synchronous read error occurred. Report it */
+		desc->error = error;
+		page_cache_release(page);
+		break;
+
+no_cached_page:
+		if (flags & F_ATOMIC) {
+			spin_unlock(&pagecache_lock);
+			desc->error = -EWOULDBLOCKIO;
+			break;
+		}
+		/*
+		 * Ok, it wasn't cached, so we need to create a new
+		 * page..
+		 *
+		 * We get here with the page cache lock held.
+		 */
+		if (!cached_page) {
+			spin_unlock(&pagecache_lock);
+			cached_page = page_cache_alloc(mapping);
+			if (!cached_page) {
+				desc->error = -ENOMEM;
+				break;
+			}
+
+			/*
+			 * Somebody may have added the page while we
+			 * dropped the page cache lock. Check for that.
+			 */
+			spin_lock(&pagecache_lock);
+			page = __find_page_nolock(mapping, index, *hash);
+			if (page)
+				goto found_page;
+		}
+
+		/*
+		 * Ok, add the new page to the hash-queues...
+		 */
+		page = cached_page;
+		__add_to_page_cache(page, mapping, index, hash);
+		spin_unlock(&pagecache_lock);
+		lru_cache_add(page);		
+		cached_page = NULL;
+
+		goto readpage;
+	}
+
+	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+	filp->f_reada = 1;
+	if (cached_page)
+		page_cache_release(cached_page);
+	UPDATE_ATIME(inode);
+}
+
+static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
+{
+	ssize_t retval;
+	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
+	struct kiobuf * iobuf;
+	struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
+	struct inode * inode = mapping->host;
+
+	new_iobuf = 0;
+	iobuf = filp->f_iobuf;
+	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+		/*
+		 * A parallel read/write is using the preallocated iobuf
+		 * so just run slow and allocate a new one.
+		 */
+		retval = alloc_kiovec(1, &iobuf);
+		if (retval)
+			goto out;
+		new_iobuf = 1;
+	}
+
+	blocksize = 1 << inode->i_blkbits;
+	blocksize_bits = inode->i_blkbits;
+	blocksize_mask = blocksize - 1;
+	chunk_size = KIO_MAX_ATOMIC_IO << 10;
+
+	retval = -EINVAL;
+	if ((offset & blocksize_mask) || (count & blocksize_mask))
+		goto out_free;
+	if (!mapping->a_ops->direct_IO)
+		goto out_free;
+
+	/*
+	 * Flush to disk exclusively the _data_, metadata must remain
+	 * completely asynchronous or performance will go to /dev/null.
+	 */
+	retval = filemap_fdatasync(mapping);
+	if (retval == 0)
+		retval = fsync_inode_data_buffers(inode);
+	if (retval == 0)
+		retval = filemap_fdatawait(mapping);
+	if (retval < 0)
+		goto out_free;
+
+	progress = retval = 0;
+	while (count > 0) {
+		iosize = count;
+		if (iosize > chunk_size)
+			iosize = chunk_size;
+
+		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+		if (retval)
+			break;
+
+		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
+
+		if (rw == READ && retval > 0)
+			mark_dirty_kiobuf(iobuf, retval);
+		
+		if (retval >= 0) {
+			count -= retval;
+			buf += retval;
+			progress += retval;
+		}
+
+		unmap_kiobuf(iobuf);
+
+		if (retval != iosize)
+			break;
+	}
+
+	if (progress)
+		retval = progress;
+
+ out_free:
+	if (!new_iobuf)
+		clear_bit(0, &filp->f_iobuf_lock);
+	else
+		free_kiovec(1, &iobuf);
+ out:	
+	return retval;
+}
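+
+/*
+ * A minimal user-space sketch of the O_DIRECT path above (illustrative
+ * only, error handling omitted; a 4K filesystem block size is assumed,
+ * since offset and length must be multiples of the block size and the
+ * buffer should be suitably aligned as well):
+ *
+ *	#define _GNU_SOURCE
+ *	#include <fcntl.h>
+ *	#include <stdlib.h>
+ *	#include <unistd.h>
+ *
+ *	void *buf;
+ *	int fd = open("data", O_RDONLY | O_DIRECT);
+ *	if (fd >= 0 && posix_memalign(&buf, 4096, 65536) == 0)
+ *		read(fd, buf, 65536);
+ */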
+
+int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+{
+	char *kaddr;
+	unsigned long left, count = desc->count;
+
+	if (size > count)
+		size = count;
+
+	kaddr = kmap(page);
+	left = __copy_to_user(desc->buf, kaddr + offset, size);
+	kunmap(page);
+	
+	if (left) {
+		size -= left;
+		desc->error = -EFAULT;
+	}
+	desc->count = count - size;
+	desc->written += size;
+	desc->buf += size;
+	return size;
+}
+
+/*
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+{
+	return generic_file_new_read(filp, buf, count, ppos, 0);
+}
+
+ssize_t generic_file_new_read(struct file * filp, char * buf, size_t count, loff_t *ppos, int flags)
+{
+	ssize_t retval;
+
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+
+	if (filp->f_flags & O_DIRECT)
+		goto o_direct;
+
+	retval = -EFAULT;
+	if (access_ok(VERIFY_WRITE, buf, count)) {
+		retval = 0;
+
+		if (count) {
+			read_descriptor_t desc;
+
+			desc.written = 0;
+			desc.count = count;
+			desc.buf = buf;
+			desc.error = 0;
+			do_generic_file_read(filp, ppos, &desc, file_read_actor, flags);
+
+			retval = desc.written;
+			if (!retval)
+				retval = desc.error;
+		}
+	}
+ out:
+	return retval;
+
+ o_direct:
+	{
+		loff_t pos = *ppos, size;
+		struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+		struct inode *inode = mapping->host;
+
+		retval = 0;
+		if (!count)
+			goto out; /* skip atime */
+		size = inode->i_size;
+		if (pos < size) {
+			if (pos + count > size)
+				count = size - pos;
+			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
+			if (retval > 0)
+				*ppos = pos + retval;
+		}
+		UPDATE_ATIME(filp->f_dentry->d_inode);
+		goto out;
+	}
+}
+
+static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
+{
+	ssize_t written;
+	unsigned long count = desc->count;
+	struct file *file = (struct file *) desc->buf;
+
+	if (size > count)
+		size = count;
+
+ 	if (file->f_op->sendpage) {
+ 		written = file->f_op->sendpage(file, page, offset,
+					       size, &file->f_pos, size<count);
+	} else {
+		char *kaddr;
+		mm_segment_t old_fs;
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+
+		kaddr = kmap(page);
+		written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
+		kunmap(page);
+
+		set_fs(old_fs);
+	}
+	if (written < 0) {
+		desc->error = written;
+		written = 0;
+	}
+	desc->count = count - written;
+	desc->written += written;
+	return written;
+}
+
+asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+	ssize_t retval;
+	struct file * in_file, * out_file;
+	struct inode * in_inode, * out_inode;
+
+	/*
+	 * Get input file, and verify that it is ok..
+	 */
+	retval = -EBADF;
+	in_file = fget(in_fd);
+	if (!in_file)
+		goto out;
+	if (!(in_file->f_mode & FMODE_READ))
+		goto fput_in;
+	retval = -EINVAL;
+	in_inode = in_file->f_dentry->d_inode;
+	if (!in_inode)
+		goto fput_in;
+	if (!in_inode->i_mapping->a_ops->readpage)
+		goto fput_in;
+	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
+	if (retval)
+		goto fput_in;
+
+	/*
+	 * Get output file, and verify that it is ok..
+	 */
+	retval = -EBADF;
+	out_file = fget(out_fd);
+	if (!out_file)
+		goto fput_in;
+	if (!(out_file->f_mode & FMODE_WRITE))
+		goto fput_out;
+	retval = -EINVAL;
+	if (!out_file->f_op || !out_file->f_op->write)
+		goto fput_out;
+	out_inode = out_file->f_dentry->d_inode;
+	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
+	if (retval)
+		goto fput_out;
+
+	retval = 0;
+	if (count) {
+		read_descriptor_t desc;
+		loff_t pos = 0, *ppos;
+
+		retval = -EFAULT;
+		ppos = &in_file->f_pos;
+		if (offset) {
+			if (get_user(pos, offset))
+				goto fput_out;
+			ppos = &pos;
+		}
+
+		desc.written = 0;
+		desc.count = count;
+		desc.buf = (char *) out_file;
+		desc.error = 0;
+		do_generic_file_read(in_file, ppos, &desc, file_send_actor, 0);
+
+		retval = desc.written;
+		if (!retval)
+			retval = desc.error;
+		if (offset)
+			put_user(pos, offset);
+	}
+
+fput_out:
+	fput(out_file);
+fput_in:
+	fput(in_file);
+out:
+	return retval;
+}
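+
+/*
+ * A minimal user-space sketch of sendfile(2) as implemented above,
+ * copying the whole of in_fd to out_fd (illustrative only, error
+ * handling omitted):
+ *
+ *	#include <sys/sendfile.h>
+ *
+ *	off_t off = 0;
+ *	while (sendfile(out_fd, in_fd, &off, 65536) > 0)
+ *		;
+ */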
+
+static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	unsigned long max;
+
+	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
+		return -EINVAL;
+
+	/* Limit it to the size of the file.. */
+	max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
+	if (index > max)
+		return 0;
+	max -= index;
+	if (nr > max)
+		nr = max;
+
+	/* And limit it to a sane percentage of the inactive list.. */
+	max = nr_inactive_pages / 2;
+	if (nr > max)
+		nr = max;
+
+	while (nr) {
+		page_cache_read(file, index);
+		index++;
+		nr--;
+	}
+	return 0;
+}
+
+asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
+{
+	ssize_t ret;
+	struct file *file;
+
+	ret = -EBADF;
+	file = fget(fd);
+	if (file) {
+		if (file->f_mode & FMODE_READ) {
+			unsigned long start = offset >> PAGE_CACHE_SHIFT;
+			unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
+			ret = do_readahead(file, start, len);
+		}
+		fput(file);
+	}
+	return ret;
+}
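+
+/*
+ * A minimal user-space sketch of readahead(2) as implemented above,
+ * asking the kernel to populate the page cache with the first 1MB of a
+ * file (illustrative only; if the libc in use has no wrapper, the call
+ * can be issued via syscall() instead):
+ *
+ *	#define _GNU_SOURCE
+ *	#include <fcntl.h>
+ *
+ *	readahead(fd, 0, 1024 * 1024);
+ */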
+
+/*
+ * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
+ * sure this is sequential access, we don't need a flexible read-ahead
+ * window size -- we can always use a large fixed size window.
+ */
+static void nopage_sequential_readahead(struct vm_area_struct * vma,
+	unsigned long pgoff, unsigned long filesize)
+{
+	unsigned long ra_window;
+
+	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
+	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
+
+	/* vm_raend is zero if we haven't read ahead in this area yet.  */
+	if (vma->vm_raend == 0)
+		vma->vm_raend = vma->vm_pgoff + ra_window;
+
+	/*
+	 * If we've just faulted the page half-way through our window,
+	 * then schedule reads for the next window, and release the
+	 * pages in the previous window.
+	 */
+	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
+		unsigned long start = vma->vm_pgoff + vma->vm_raend;
+		unsigned long end = start + ra_window;
+
+		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
+			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
+		if (start > end)
+			return;
+
+		while ((start < end) && (start < filesize)) {
+			if (read_cluster_nonblocking(vma->vm_file,
+							start, filesize) < 0)
+				break;
+			start += CLUSTER_PAGES;
+		}
+		run_task_queue(&tq_disk);
+
+		/* if we're far enough past the beginning of this area,
+		   recycle pages that are in the previous window. */
+		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
+			unsigned long window = ra_window << PAGE_SHIFT;
+
+			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
+			end -= window + window;
+			filemap_sync(vma, end - window, window, MS_INVALIDATE);
+		}
+
+		vma->vm_raend += ra_window;
+	}
+
+	return;
+}
+
+/*
+ * filemap_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ */
+struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
+{
+	int error;
+	struct file *file = area->vm_file;
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+	struct page *page, **hash;
+	unsigned long size, pgoff, endoff;
+
+	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+
+retry_all:
+	/*
+	 * An external ptracer can access pages that normally aren't
+	 * accessible..
+	 */
+	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if ((pgoff >= size) && (area->vm_mm == current->mm))
+		return NULL;
+
+	/* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
+	if (size > endoff)
+		size = endoff;
+
+	/*
+	 * Do we have something in the page cache already?
+	 */
+	hash = page_hash(mapping, pgoff);
+retry_find:
+	page = __find_get_page(mapping, pgoff, hash);
+	if (!page)
+		goto no_cached_page;
+
+	/*
+	 * Ok, found a page in the page cache, now we need to check
+	 * that it's up-to-date.
+	 */
+	if (!Page_Uptodate(page))
+		goto page_not_uptodate;
+
+success:
+ 	/*
+	 * Try read-ahead for sequential areas.
+	 */
+	if (VM_SequentialReadHint(area))
+		nopage_sequential_readahead(area, pgoff, size);
+
+	/*
+	 * Found the page and have a reference on it, need to check sharing
+	 * and possibly copy it over to another page..
+	 */
+	mark_page_accessed(page);
+	flush_page_to_ram(page);
+	return page;
+
+no_cached_page:
+	/*
+	 * If the requested offset is within our file, try to read a whole 
+	 * cluster of pages at once.
+	 *
+	 * Otherwise, we're off the end of a privately mapped file,
+	 * so we need to map a zero page.
+	 */
+	if ((pgoff < size) && !VM_RandomReadHint(area))
+		error = read_cluster_nonblocking(file, pgoff, size);
+	else
+		error = page_cache_read(file, pgoff);
+
+	/*
+	 * The page we want has now been added to the page cache.
+	 * In the unlikely event that someone removed it in the
+	 * meantime, we'll just come back here and read it again.
+	 */
+	if (error >= 0)
+		goto retry_find;
+
+	/*
+	 * An error return from page_cache_read can result if the
+	 * system is low on memory, or a problem occurs while trying
+	 * to schedule I/O.
+	 */
+	if (error == -ENOMEM)
+		return NOPAGE_OOM;
+	return NULL;
+
+page_not_uptodate:
+	lock_page(page);
+
+	/* Did it get unhashed while we waited for it? */
+	if (!page->mapping) {
+		UnlockPage(page);
+		page_cache_release(page);
+		goto retry_all;
+	}
+
+	/* Did somebody else get it up-to-date? */
+	if (Page_Uptodate(page)) {
+		UnlockPage(page);
+		goto success;
+	}
+
+	if (!mapping->a_ops->readpage(file, page)) {
+		wait_on_page(page);
+		if (Page_Uptodate(page))
+			goto success;
+	}
+
+	/*
+	 * Umm, take care of errors if the page isn't up-to-date.
+	 * Try to re-read it _once_. We do this synchronously,
+	 * because there really aren't any performance issues here
+	 * and we need to check for errors.
+	 */
+	lock_page(page);
+
+	/* Somebody truncated the page on us? */
+	if (!page->mapping) {
+		UnlockPage(page);
+		page_cache_release(page);
+		goto retry_all;
+	}
+
+	/* Somebody else successfully read it in? */
+	if (Page_Uptodate(page)) {
+		UnlockPage(page);
+		goto success;
+	}
+	ClearPageError(page);
+	if (!mapping->a_ops->readpage(file, page)) {
+		wait_on_page(page);
+		if (Page_Uptodate(page))
+			goto success;
+	}
+
+	/*
+	 * Things didn't work out. Return zero to tell the
+	 * mm layer so, possibly freeing the page cache page first.
+	 */
+	page_cache_release(page);
+	return NULL;
+}
+
+/* Called with mm->page_table_lock held to protect against other
+ * threads/the swapper from ripping pte's out from under us.
+ */
+static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
+	unsigned long address, unsigned int flags)
+{
+	pte_t pte = *ptep;
+
+	if (pte_present(pte)) {
+		struct page *page = pte_page(pte);
+		if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
+			flush_tlb_page(vma, address);
+			set_page_dirty(page);
+		}
+	}
+	return 0;
+}
+
+static inline int filemap_sync_pte_range(pmd_t * pmd,
+	unsigned long address, unsigned long size, 
+	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
+{
+	pte_t * pte;
+	unsigned long end;
+	int error;
+
+	if (pmd_none(*pmd))
+		return 0;
+	if (pmd_bad(*pmd)) {
+		pmd_ERROR(*pmd);
+		pmd_clear(pmd);
+		return 0;
+	}
+	pte = pte_offset(pmd, address);
+	offset += address & PMD_MASK;
+	address &= ~PMD_MASK;
+	end = address + size;
+	if (end > PMD_SIZE)
+		end = PMD_SIZE;
+	error = 0;
+	do {
+		error |= filemap_sync_pte(pte, vma, address + offset, flags);
+		address += PAGE_SIZE;
+		pte++;
+	} while (address && (address < end));
+	return error;
+}
+
+static inline int filemap_sync_pmd_range(pgd_t * pgd,
+	unsigned long address, unsigned long size, 
+	struct vm_area_struct *vma, unsigned int flags)
+{
+	pmd_t * pmd;
+	unsigned long offset, end;
+	int error;
+
+	if (pgd_none(*pgd))
+		return 0;
+	if (pgd_bad(*pgd)) {
+		pgd_ERROR(*pgd);
+		pgd_clear(pgd);
+		return 0;
+	}
+	pmd = pmd_offset(pgd, address);
+	offset = address & PGDIR_MASK;
+	address &= ~PGDIR_MASK;
+	end = address + size;
+	if (end > PGDIR_SIZE)
+		end = PGDIR_SIZE;
+	error = 0;
+	do {
+		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
+		address = (address + PMD_SIZE) & PMD_MASK;
+		pmd++;
+	} while (address && (address < end));
+	return error;
+}
+
+int filemap_sync(struct vm_area_struct * vma, unsigned long address,
+	size_t size, unsigned int flags)
+{
+	pgd_t * dir;
+	unsigned long end = address + size;
+	int error = 0;
+
+	/* Acquire the lock early; it may be possible to avoid dropping
+	 * and reacquiring it repeatedly.
+	 */
+	spin_lock(&vma->vm_mm->page_table_lock);
+
+	dir = pgd_offset(vma->vm_mm, address);
+	flush_cache_range(vma->vm_mm, end - size, end);
+	if (address >= end)
+		BUG();
+	do {
+		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		dir++;
+	} while (address && (address < end));
+	flush_tlb_range(vma->vm_mm, end - size, end);
+
+	spin_unlock(&vma->vm_mm->page_table_lock);
+
+	return error;
+}
+
+static struct vm_operations_struct generic_file_vm_ops = {
+	nopage:		filemap_nopage,
+};
+
+/* This is used for a general mmap of a disk file */
+
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
+		if (!mapping->a_ops->writepage)
+			return -EINVAL;
+	}
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	UPDATE_ATIME(inode);
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+/*
+ * The msync() system call.
+ */
+
+/*
+ * MS_SYNC syncs the entire file - including mappings.
+ *
+ * MS_ASYNC initiates writeout of just the dirty mapped data.
+ * This provides no guarantee of file integrity - things like indirect
+ * blocks may not have started writeout.  MS_ASYNC is primarily useful
+ * where the application knows that it has finished with the data and
+ * wishes to intelligently schedule its own I/O traffic.
+ */
+static int msync_interval(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int flags)
+{
+	int ret = 0;
+	struct file * file = vma->vm_file;
+
+	if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
+		return -EBUSY;
+
+	if (file && (vma->vm_flags & VM_SHARED)) {
+		ret = filemap_sync(vma, start, end-start, flags);
+
+		if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
+			struct inode * inode = file->f_dentry->d_inode;
+
+			down(&inode->i_sem);
+			ret = filemap_fdatasync(inode->i_mapping);
+			if (flags & MS_SYNC) {
+				int err;
+
+				if (file->f_op && file->f_op->fsync) {
+					err = file->f_op->fsync(file, file->f_dentry, 1);
+					if (err && !ret)
+						ret = err;
+				}
+				err = filemap_fdatawait(inode->i_mapping);
+				if (err && !ret)
+					ret = err;
+			}
+			up(&inode->i_sem);
+		}
+	}
+	return ret;
+}
+
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error, error = -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+		goto out;
+	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+	/*
+	 * If the interval [start,end) covers some unmapped address ranges,
+	 * just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	unmapped_error = 0;
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = msync_interval(vma, start, end, flags);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = msync_interval(vma, start, vma->vm_end, flags);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+}
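+
+/*
+ * A minimal user-space sketch of the MS_ASYNC/MS_SYNC distinction
+ * documented above: MS_ASYNC only starts writeout of the dirty mapped
+ * data, MS_SYNC also waits for it (illustrative only, error handling
+ * omitted; fd and len are assumed to exist):
+ *
+ *	#include <sys/mman.h>
+ *
+ *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ *	p[0] = 1;
+ *	msync(p, len, MS_ASYNC);
+ *	msync(p, len, MS_SYNC);
+ */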
+
+static inline void setup_read_behavior(struct vm_area_struct * vma,
+	int behavior)
+{
+	VM_ClearReadHint(vma);
+	switch(behavior) {
+		case MADV_SEQUENTIAL:
+			vma->vm_flags |= VM_SEQ_READ;
+			break;
+		case MADV_RANDOM:
+			vma->vm_flags |= VM_RAND_READ;
+			break;
+		default:
+			break;
+	}
+	return;
+}
+
+static long madvise_fixup_start(struct vm_area_struct * vma,
+	unsigned long end, int behavior)
+{
+	struct vm_area_struct * n;
+	struct mm_struct * mm = vma->vm_mm;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_end = end;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	if (n->vm_file)
+		get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_start = end;
+	__insert_vm_struct(mm, n);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+static long madvise_fixup_end(struct vm_area_struct * vma,
+	unsigned long start, int behavior)
+{
+	struct vm_area_struct * n;
+	struct mm_struct * mm = vma->vm_mm;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_start = start;
+	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	if (n->vm_file)
+		get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_end = start;
+	__insert_vm_struct(mm, n);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+static long madvise_fixup_middle(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	struct vm_area_struct * left, * right;
+	struct mm_struct * mm = vma->vm_mm;
+
+	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!left)
+		return -EAGAIN;
+	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!right) {
+		kmem_cache_free(vm_area_cachep, left);
+		return -EAGAIN;
+	}
+	*left = *vma;
+	*right = *vma;
+	left->vm_end = start;
+	right->vm_start = end;
+	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+	left->vm_raend = 0;
+	right->vm_raend = 0;
+	if (vma->vm_file)
+		atomic_add(2, &vma->vm_file->f_count);
+
+	if (vma->vm_ops && vma->vm_ops->open) {
+		vma->vm_ops->open(left);
+		vma->vm_ops->open(right);
+	}
+	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_raend = 0;
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_start = start;
+	vma->vm_end = end;
+	setup_read_behavior(vma, behavior);
+	__insert_vm_struct(mm, left);
+	__insert_vm_struct(mm, right);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	int error = 0;
+
+	/* This caps the number of vma's this process can own */
+	if (vma->vm_mm->map_count > max_map_count)
+		return -ENOMEM;
+
+	if (start == vma->vm_start) {
+		if (end == vma->vm_end) {
+			setup_read_behavior(vma, behavior);
+			vma->vm_raend = 0;
+		} else
+			error = madvise_fixup_start(vma, end, behavior);
+	} else {
+		if (end == vma->vm_end)
+			error = madvise_fixup_end(vma, start, behavior);
+		else
+			error = madvise_fixup_middle(vma, start, end, behavior);
+	}
+
+	return error;
+}
+
+/*
+ * Schedule all required I/O operations, then run the disk queue
+ * to make sure they are started.  Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	long error = -EBADF;
+	struct file * file;
+	unsigned long size, rlim_rss;
+
+	/* Doesn't work if there's no mapped file. */
+	if (!vma->vm_file)
+		return error;
+	file = vma->vm_file;
+	size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+							PAGE_CACHE_SHIFT;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	/* Make sure this doesn't exceed the process's max rss. */
+	error = -EIO;
+	rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
+				LONG_MAX; /* default: see resource.h */
+	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
+		return error;
+
+	/* round to cluster boundaries if this isn't a "random" area. */
+	if (!VM_RandomReadHint(vma)) {
+		start = CLUSTER_OFFSET(start);
+		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
+
+		while ((start < end) && (start < size)) {
+			error = read_cluster_nonblocking(file, start, size);
+			start += CLUSTER_PAGES;
+			if (error < 0)
+				break;
+		}
+	} else {
+		while ((start < end) && (start < size)) {
+			error = page_cache_read(file, start);
+			start++;
+			if (error < 0)
+				break;
+		}
+	}
+
+	/* Don't wait for someone else to push these requests. */
+	run_task_queue(&tq_disk);
+
+	return error;
+}
+
+/*
+ * Application no longer needs these pages.  If the pages are dirty,
+ * it's OK to just throw them away.  The app will be more careful about
+ * data it wants to keep.  Be sure to free swap resources too.  The
+ * zap_page_range call sets things up for refill_inactive to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * refill_inactive to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do.  This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them.  There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	if (vma->vm_flags & VM_LOCKED)
+		return -EINVAL;
+
+	zap_page_range(vma->vm_mm, start, end - start);
+	return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+	unsigned long end, int behavior)
+{
+	long error = -EBADF;
+
+	switch (behavior) {
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+		error = madvise_behavior(vma, start, end, behavior);
+		break;
+
+	case MADV_WILLNEED:
+		error = madvise_willneed(vma, start, end);
+		break;
+
+	case MADV_DONTNEED:
+		error = madvise_dontneed(vma, start, end);
+		break;
+
+	default:
+		error = -EINVAL;
+		break;
+	}
+		
+	return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area.  The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques.  The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ *  MADV_NORMAL - the default behavior is to read clusters.  This
+ *		results in some read-ahead and read-behind.
+ *  MADV_RANDOM - the system should read the minimum amount of data
+ *		on any access, since it is unlikely that the appli-
+ *		cation will need more than what it asks for.
+ *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ *		once, so they can be aggressively read ahead, and
+ *		can be freed soon after they are accessed.
+ *  MADV_WILLNEED - the application is notifying the system to read
+ *		some pages ahead.
+ *  MADV_DONTNEED - the application is finished with the given range,
+ *		so the kernel can free resources associated with it.
+ *
+ * return values:
+ *  zero    - success
+ *  -EINVAL - start + len < 0, start is not page-aligned,
+ *		"behavior" is not a valid value, or application
+ *		is attempting to release locked or shared pages.
+ *  -ENOMEM - addresses in the specified range are not currently
+ *		mapped, or are outside the AS of the process.
+ *  -EIO    - an I/O error occurred while paging in data.
+ *  -EBADF  - map exists, but area maps something that isn't a file.
+ *  -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+
+	down_write(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = madvise_vma(vma, start, end,
+							behavior);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = madvise_vma(vma, start, vma->vm_end, behavior);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
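+
+/*
+ * A minimal user-space sketch of the madvise(2) interface implemented
+ * above: hint sequential access for a read-only mapping, then tell the
+ * kernel the range is no longer needed (illustrative only, error
+ * handling omitted; fd and len are assumed to exist):
+ *
+ *	#include <sys/mman.h>
+ *
+ *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+ *	madvise(p, len, MADV_SEQUENTIAL);
+ *	... read through the mapping ...
+ *	madvise(p, len, MADV_DONTNEED);
+ */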
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+	unsigned long pgoff)
+{
+	unsigned char present = 0;
+	struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
+	struct page * page, ** hash = page_hash(as, pgoff);
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(as, pgoff, *hash);
+	if ((page) && (Page_Uptodate(page)))
+		present = 1;
+	spin_unlock(&pagecache_lock);
+
+	return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+
+	error = -ENOMEM;
+	if (!vma->vm_file)
+		return error;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec" */
+	remaining = (end - start);
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = mincore_page(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ *		or len has a nonpositive value
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char * vec)
+{
+	int index = 0;
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	long error = -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_CACHE_MASK)
+		goto out;
+	len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = mincore_vma(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+}
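+
+/*
+ * A minimal user-space sketch of the mincore(2) interface implemented
+ * above, counting how many pages of an existing mapping p of len bytes
+ * are resident (illustrative only; 4K pages assumed, error handling
+ * omitted):
+ *
+ *	#include <stdlib.h>
+ *	#include <sys/mman.h>
+ *
+ *	size_t pages = (len + 4095) / 4096;
+ *	unsigned char *vec = malloc(pages);
+ *	size_t i, resident = 0;
+ *	if (mincore(p, len, vec) == 0)
+ *		for (i = 0; i < pages; i++)
+ *			resident += vec[i] & 1;
+ */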
+
+static inline
+struct page *__read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page **hash = page_hash(mapping, index);
+	struct page *page, *cached_page = NULL;
+	int err;
+repeat:
+	page = __find_get_page(mapping, index, hash);
+	if (!page) {
+		if (!cached_page) {
+			cached_page = page_cache_alloc(mapping);
+			if (!cached_page)
+				return ERR_PTR(-ENOMEM);
+		}
+		page = cached_page;
+		if (add_to_page_cache_unique(page, mapping, index, hash))
+			goto repeat;
+		cached_page = NULL;
+		err = filler(data, page);
+		if (err < 0) {
+			page_cache_release(page);
+			page = ERR_PTR(err);
+		}
+	}
+	if (cached_page)
+		page_cache_release(cached_page);
+	return page;
+}
+
+/*
+ * Read into the page cache. If a page already exists,
+ * and Page_Uptodate() is not set, try to fill the page.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *page;
+	int err;
+
+retry:
+	page = __read_cache_page(mapping, index, filler, data);
+	if (IS_ERR(page))
+		goto out;
+	mark_page_accessed(page);
+	if (Page_Uptodate(page))
+		goto out;
+
+	lock_page(page);
+	if (!page->mapping) {
+		UnlockPage(page);
+		page_cache_release(page);
+		goto retry;
+	}
+	if (Page_Uptodate(page)) {
+		UnlockPage(page);
+		goto out;
+	}
+	err = filler(data, page);
+	if (err < 0) {
+		page_cache_release(page);
+		page = ERR_PTR(err);
+	}
+ out:
+	return page;
+}
+
+static inline struct page * __grab_cache_page(struct address_space *mapping,
+				unsigned long index, struct page **cached_page)
+{
+	struct page *page, **hash = page_hash(mapping, index);
+repeat:
+	page = __find_lock_page(mapping, index, hash);
+	if (!page) {
+		if (!*cached_page) {
+			*cached_page = page_cache_alloc(mapping);
+			if (!*cached_page)
+				return NULL;
+		}
+		page = *cached_page;
+		if (add_to_page_cache_unique(page, mapping, index, hash))
+			goto repeat;
+		*cached_page = NULL;
+	}
+	return page;
+}
+
+inline void remove_suid(struct inode *inode)
+{
+	unsigned int mode;
+
+	/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
+	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
+
+	/* was any of the uid bits set? */
+	mode &= inode->i_mode;
+	if (mode && !capable(CAP_FSETID)) {
+		inode->i_mode &= ~mode;
+		mark_inode_dirty(inode);
+	}
+}
+
+/*
+ * Write to a file through the page cache. 
+ *
+ * We currently put everything into the page cache prior to writing it.
+ * This is not a problem when writing full pages. With partial pages,
+ * however, we first have to read the data into the cache, then
+ * dirty the page, and finally schedule it for writing. Alternatively, we
+ * could write-through just the portion of data that would go into that
+ * page, but that would kill performance for applications that write data
+ * line by line, and it's prone to race conditions.
+ *
+ * Note that this routine doesn't try to keep track of dirty pages. Each
+ * file system has to do this all by itself, unfortunately.
+ *							okir@monad.swb.de
+ */
+ssize_t
+generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode	*inode = mapping->host;
+	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	loff_t		pos;
+	struct page	*page, *cached_page;
+	ssize_t		written;
+	long		status = 0;
+	int		err;
+	unsigned	bytes;
+
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	cached_page = NULL;
+
+	down(&inode->i_sem);
+
+	pos = *ppos;
+	err = -EINVAL;
+	if (pos < 0)
+		goto out;
+
+	err = file->f_error;
+	if (err) {
+		file->f_error = 0;
+		goto out;
+	}
+
+	written = 0;
+
+	/* FIXME: this is for backwards compatibility with 2.4 */
+	if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+		pos = inode->i_size;
+
+	/*
+	 * Check whether we've reached the file size limit.
+	 */
+	err = -EFBIG;
+	
+	if (limit != RLIM_INFINITY) {
+		if (pos >= limit) {
+			send_sig(SIGXFSZ, current, 0);
+			goto out;
+		}
+		if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
+			/* send_sig(SIGXFSZ, current, 0); */
+			count = limit - (u32)pos;
+		}
+	}
+
+	/*
+	 *	LFS rule 
+	 */
+	if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
+		if (pos >= MAX_NON_LFS) {
+			send_sig(SIGXFSZ, current, 0);
+			goto out;
+		}
+		if (count > MAX_NON_LFS - (u32)pos) {
+			/* send_sig(SIGXFSZ, current, 0); */
+			count = MAX_NON_LFS - (u32)pos;
+		}
+	}
+
+	/*
+	 *	Are we about to exceed the fs block limit ?
+	 *
+	 *	If we have written data it becomes a short write.
+	 *	If we have exceeded the limit without writing data we send
+	 *	a signal and return EFBIG.
+	 *
+	 *	Linus' frestrict idea will clean these up nicely..
+	 */
+	 
+	if (!S_ISBLK(inode->i_mode)) {
+		if (pos >= inode->i_sb->s_maxbytes)
+		{
+			if (count || pos > inode->i_sb->s_maxbytes) {
+				send_sig(SIGXFSZ, current, 0);
+				err = -EFBIG;
+				goto out;
+			}
+			/* zero-length writes at ->s_maxbytes are OK */
+		}
+
+		if (pos + count > inode->i_sb->s_maxbytes)
+			count = inode->i_sb->s_maxbytes - pos;
+	} else {
+		if (is_read_only(inode->i_rdev)) {
+			err = -EPERM;
+			goto out;
+		}
+		if (pos >= inode->i_size) {
+			if (count || pos > inode->i_size) {
+				err = -ENOSPC;
+				goto out;
+			}
+		}
+
+		if (pos + count > inode->i_size)
+			count = inode->i_size - pos;
+	}
+
+	err = 0;
+	if (count == 0)
+		goto out;
+
+	remove_suid(inode);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	mark_inode_dirty_sync(inode);
+
+	if (file->f_flags & O_DIRECT)
+		goto o_direct;
+
+	do {
+		unsigned long index, offset;
+		long page_fault;
+		char *kaddr;
+
+		/*
+		 * Try to find the page in the cache. If it isn't there,
+		 * allocate a free page.
+		 */
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 */
+		{ volatile unsigned char dummy;
+			__get_user(dummy, buf);
+			__get_user(dummy, buf+bytes-1);
+		}
+
+		status = -ENOMEM;	/* we'll assign it later anyway */
+		page = __grab_cache_page(mapping, index, &cached_page);
+		if (!page)
+			break;
+
+		/* We have exclusive IO access to the page.. */
+		if (!PageLocked(page)) {
+			PAGE_BUG(page);
+		}
+
+		kaddr = kmap(page);
+		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
+		if (status)
+			goto sync_failure;
+		page_fault = __copy_from_user(kaddr+offset, buf, bytes);
+		flush_dcache_page(page);
+		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
+		if (page_fault)
+			goto fail_write;
+		if (!status)
+			status = bytes;
+
+		if (status >= 0) {
+			written += status;
+			count -= status;
+			pos += status;
+			buf += status;
+		}
+unlock:
+		kunmap(page);
+		/* Mark it unlocked again and drop the page.. */
+		SetPageReferenced(page);
+		UnlockPage(page);
+		page_cache_release(page);
+
+		if (status < 0)
+			break;
+	} while (count);
+done:
+	*ppos = pos;
+
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	/* For now, when the user asks for O_SYNC, we'll actually
+	 * provide O_DSYNC. */
+	if (status >= 0) {
+		if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
+			status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
+	}
+	
+out_status:	
+	err = written ? written : status;
+out:
+
+	up(&inode->i_sem);
+	return err;
+fail_write:
+	status = -EFAULT;
+	goto unlock;
+
+sync_failure:
+	/*
+	 * If blocksize < pagesize, prepare_write() may have instantiated a
+	 * few blocks outside i_size.  Trim these off again.
+	 */
+	kunmap(page);
+	UnlockPage(page);
+	page_cache_release(page);
+	if (pos + bytes > inode->i_size)
+		vmtruncate(inode, inode->i_size);
+	goto done;
+
+o_direct:
+	written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
+	if (written > 0) {
+		loff_t end = pos + written;
+		if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
+			inode->i_size = end;
+			mark_inode_dirty(inode);
+		}
+		*ppos = end;
+		invalidate_inode_pages2(mapping);
+	}
+	/*
+	 * Sync the fs metadata but not the minor inode changes and
+	 * of course not the data as we did direct DMA for the IO.
+	 */
+	if (written >= 0 && file->f_flags & O_SYNC)
+		status = generic_osync_inode(inode, OSYNC_METADATA);
+	goto out_status;
+}
+
+void __init page_cache_init(unsigned long mempages)
+{
+	unsigned long htable_size, order;
+
+	htable_size = mempages;
+	htable_size *= sizeof(struct page *);
+	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
+		;
+
+	do {
+		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
+
+		page_hash_bits = 0;
+		while((tmp >>= 1UL) != 0UL)
+			page_hash_bits++;
+
+		page_hash_table = (struct page **)
+			__get_free_pages(GFP_ATOMIC, order);
+	} while(page_hash_table == NULL && --order > 0);
+
+	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
+	if (!page_hash_table)
+		panic("Failed to allocate page hash table\n");
+	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
+}
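+
+/*
+ * Worked example (illustrative, assuming i386 with 4K pages and 4-byte
+ * pointers): with mempages = 32768 (128MB of RAM), htable_size is 128KB,
+ * so the loop above settles on order = 5, giving 32768 hash buckets and
+ * page_hash_bits = 15.
+ */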
+
+/* address_space_map
+ *	Maps a series of pages from the page cache into the given array.
+ */
+static int address_space_map(struct address_space *as, unsigned long index,
+		int nr, struct page **pages,
+		int *nr_newp, struct page **new_pages)
+{
+	struct page *cached_page = NULL;
+	int nr_new = 0;
+	int ret;
+
+	if (unlikely(nr <= 0)) {
+		*nr_newp = nr_new;
+		return 0;
+	}
+
+	ret = 0;
+
+	spin_lock(&pagecache_lock);
+
+	while (nr > 0) {
+		struct page **hash = page_hash(as, index);
+		struct page *page;
+
+		page = __find_page_nolock(as, index, *hash);
+		if (page) {
+			page_cache_get(page);
+got_page:
+			pages[ret++] = page;
+			index++;
+			nr--;
+			continue;
+		}
+
+		if (cached_page) {
+			__add_to_page_cache(cached_page, as, index, hash);
+			nr_new++;
+			*new_pages++ = page = cached_page;
+			cached_page = NULL;
+			goto got_page;
+		}
+		spin_unlock(&pagecache_lock);
+
+		cached_page = page_cache_alloc(as);
+		if (!cached_page)
+			goto out;
+
+		/* Okay, we now have an allocated page.  Retry
+		 * the search and add. */
+		spin_lock(&pagecache_lock);
+	}
+
+	spin_unlock(&pagecache_lock);
+
+out:
+	if (cached_page)
+		page_cache_release(cached_page);
+
+	*nr_newp = nr_new;
+	return ret ? ret : -ENOMEM;
+}
+
+struct iodesc {
+	struct worktodo	wtd;
+
+	struct page	*good_page;	/* the highest Uptodate page */
+	int		good_idx;
+	int		err;
+	int		did_read;
+	int		rw;
+
+	struct page	**pages;
+	struct page	**new_pages;
+	struct page	**cur_pagep;
+	int		nr_pages;
+	int		nr_new_pages;
+
+	struct address_space *as;
+	struct file	*file;
+	kvec_cb_t	cb;
+
+	size_t		size;
+	unsigned long	transferred;
+	unsigned	offset;
+	struct kveclet	*veclet;
+
+	struct kvec_dst	src;
+
+	int		sync;
+
+#define READDESC_NR_DEF	3
+	struct page *def_pages[READDESC_NR_DEF];
+	struct page *def_new_pages[READDESC_NR_DEF];
+};
+
+static void __iodesc_free(struct iodesc *io, int unlock)
+{
+	kvec_cb_t cb;
+	ssize_t res;
+
+	if (unlock) {
+		unsigned i;
+		for (i=0; i<io->nr_pages; i++) {
+			struct page *page = io->pages[i];
+			UnlockPage(page);
+			page_cache_release(page);
+		}
+	} else {
+		unsigned i;
+		for (i=0; i<io->nr_pages; i++)
+			page_cache_release(io->pages[i]);
+	}
+
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+
+	cb = io->cb;
+	res = io->transferred ? io->transferred : io->err;
+	kfree(io);
+
+	cb.fn(cb.data, cb.vec, res);
+}
+
+/* By the time this function is called, all of the pages prior to
+ * the current good_idx have been released appropriately.  The remaining
+ * duties are to release any remaining pages and to honour O_SYNC.
+ */
+static void __iodesc_finish_write(struct iodesc *io)
+{
+	pr_debug("__iodesc_finish_write(%p)\n", io);
+
+	__iodesc_free(io, WRITE == io->rw);
+}
+
+/* This is mostly ripped from generic_file_write */
+static int __iodesc_write_page(struct iodesc *io, struct page *page)
+{
+	char *kaddr = kmap(page);
+	unsigned long bytes;
+	unsigned long offset;
+	long status;
+	int done = 0;
+
+	offset = io->offset;
+	kaddr += offset;
+
+	bytes = PAGE_CACHE_SIZE - offset;
+	if (io->size < bytes)
+		bytes = io->size;
+
+	pr_debug("__iodesc_write_page(%p (%lu), %lu %lu %lu)\n", page, page->index, offset, bytes);
+
+	io->err = io->as->a_ops->prepare_write(io->file, page,
+						offset, offset + bytes);
+	if (unlikely(io->err)) {
+		pr_debug("prepare_write: %d\n", io->err);
+		kunmap(page);
+		return 1;
+	}
+
+	kvec_dst_map(&io->src);
+	memcpy_from_kvec_dst(kaddr, &io->src, bytes);
+	kvec_dst_unmap(&io->src);	/* commit_write may block */
+
+	flush_dcache_page(page);
+	status = io->as->a_ops->commit_write(io->file, page,
+						offset, offset+bytes);
+
+	/* We don't handle short writes */
+	if (status > 0 && status != bytes)
+		done = 1;
+
+	if (!status)
+		status = bytes;
+
+	if (likely(status > 0)) {
+		io->transferred += status;
+		io->size -= status;
+		io->offset = (offset + status) & (PAGE_CACHE_SIZE - 1);
+
+		if (io->offset)
+			done = 1;
+	} else {
+		io->err = status;
+		done = 1;
+	}
+
+	kunmap(page);
+	return done;
+}
+
+void __iodesc_sync_wait_page(void *data)
+{
+	struct iodesc *io = data;
+
+	do {
+		struct buffer_head *bh, *head = io->pages[io->good_idx]->buffers;
+
+		if (!head)
+			continue;
+
+		bh = head;
+		do {
+			if (buffer_locked(bh)) {
+				pr_debug("waiting on bh=%pi io=%p\n", bh, io);
+				if (!wtd_wait_on_buffer(&io->wtd, bh))
+					return;
+			}
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+				pr_debug("io err bh=%p (%p)\n", bh, io);
+				io->err = -EIO;
+				break;
+			}
+		} while ((bh = bh->b_this_page) != head);
+	} while (!io->err && ++io->good_idx < io->nr_pages);
+
+	pr_debug("finish_write(%p)\n", io);
+	__iodesc_finish_write(io);
+}
+
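+/* Copy the caller's data into the now-locked page-cache pages.  For
+ * O_SYNC requests start writeout and wait for the buffers via
+ * __iodesc_sync_wait_page(); otherwise complete immediately.
+ */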
+static void __iodesc_do_write(void *data)
+{
+	struct iodesc *io = data;
+	unsigned i;
+
+	for (i=0; i<io->nr_pages; i++) {
+		if (__iodesc_write_page(io, io->pages[i]))
+			break;
+	}
+
+	up(&io->file->f_dentry->d_inode->i_sem);
+
+	if (io->sync) {
+		io->good_idx = 0;
+
+		pr_debug("writing out pages(%p)\n", io);
+		for (i=0; i<io->nr_pages; i++) {
+			if (io->pages[i]->buffers)
+				writeout_one_page(io->pages[i]);
+		}
+
+		pr_debug("calling __iodesc_sync_wait_page(%p)\n", io);
+		wtd_set_action(&io->wtd, __iodesc_sync_wait_page, io);
+		__iodesc_sync_wait_page(io);
+		return;
+	}
+
+	__iodesc_finish_write(io);
+}
+
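+/* Lock each target page in turn.  Pages we just added to the page cache
+ * (tracked via cur_pagep) are skipped since they are already locked; the
+ * rest are locked asynchronously with wtd_lock_page() before the copy
+ * is queued.
+ */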
+static void __iodesc_write_lock_next_page(void *data)
+{
+	struct iodesc *io = data;
+	pr_debug("__iodesc_write_next_page(%p)\n", io);
+
+	while (io->good_idx < io->nr_pages) {
+		io->good_page = io->pages[io->good_idx++];
+		if (io->good_page == *io->cur_pagep)
+			io->cur_pagep++;
+		else {
+			if (!wtd_lock_page(&io->wtd, io->good_page))
+				return;
+		}
+	}
+
+	/* Is calling __iodesc_do_write(io) directly here faster?  For now, queue it. */
+	wtd_set_action(&io->wtd, __iodesc_do_write, io);
+	wtd_queue(&io->wtd);
+}
+
+static void __generic_file_write_iodesc(struct iodesc *io)
+{
+	struct inode *inode = io->file->f_dentry->d_inode;
+	time_t now = CURRENT_TIME;
+
+	remove_suid(inode);
+	if (inode->i_ctime != now || inode->i_mtime != now) {
+		inode->i_ctime = inode->i_mtime = now;
+		mark_inode_dirty_sync(inode);
+	}
+
+	wtd_set_action(&io->wtd, __iodesc_write_lock_next_page, io);
+	io->sync = !!(io->file->f_flags & O_SYNC);
+	io->good_idx = 0;
+	io->cur_pagep = io->new_pages;
+	__iodesc_write_lock_next_page(io);
+}
+
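+/* Copy the uptodate page-cache data (everything below good_idx) into
+ * the caller's kveclets, then drop the pages and complete the request.
+ */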
+static void __iodesc_read_finish(struct iodesc *io)
+{
+	struct page **src_pagep;
+	char *dst_addr, *src_addr;
+	int src_off;
+	size_t size;
+	size_t valid;
+
+	struct kveclet *veclet = io->veclet;
+	struct page *dst_page = veclet->page;
+	int dst_len = veclet->length;
+	int dst_off = veclet->offset;
+
+
+	pr_debug("__iodesc_read_finish: good_idx = %d\n", io->good_idx);
+	if (io->good_idx <= 0)
+		goto no_data;
+
+	size = io->size;
+	src_off = io->offset;
+	src_pagep = io->pages;
+	src_addr = kmap(*src_pagep);
+
+	valid = (size_t)io->good_idx << PAGE_CACHE_SHIFT;
+	valid -= src_off;
+	pr_debug("size=%d valid=%d src_off=%d\n", size, valid, src_off);
+
+	if (valid < size)
+		size = valid;
+
+	dst_addr = kmap(veclet->page);
+
+	while (size > 0) {
+		int this = PAGE_CACHE_SIZE - src_off;
+		if ((PAGE_SIZE - dst_off) < this)
+			this = PAGE_SIZE - dst_off;
+		if (size < this)
+			this = size;
+		pr_debug("this=%d src_off=%d dst_off=%d dst_len=%d\n",
+			this, src_off, dst_off, dst_len);
+		memcpy(dst_addr + dst_off, src_addr + src_off, this);
+
+		src_off += this;
+		dst_off += this;
+		dst_len -= this;
+		size -= this;
+		io->transferred += this;
+		pr_debug("read_finish: this=%d transferred=%d\n",
+			 this, io->transferred);
+
+		if (size <= 0)
+			break;
+
+		if (dst_len <= 0) {
+			kunmap(dst_page);
+			veclet++;
+			dst_page = veclet->page;
+			dst_off = veclet->offset;
+			dst_len = veclet->length;
+			dst_addr = kmap(dst_page);
+		}
+
+		if (src_off >= PAGE_SIZE) { /* FIXME: PAGE_CACHE_SIZE */
+			kunmap(*src_pagep);
+			pr_debug("page(%lu)->count = %d\n",
+				 (*src_pagep)->index,
+				 atomic_read(&(*src_pagep)->count));
+			src_pagep++;
+			src_addr = kmap(*src_pagep);
+			src_off = 0;
+		}
+	}
+	kunmap(dst_page);
+	kunmap(*src_pagep);
+no_data:
+	__iodesc_free(io, 0);
+}
+
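+/* Advance through the remaining pages: skip those already uptodate,
+ * give each other page one readpage() attempt, and fail the request
+ * with -EIO if a page still isn't uptodate after that.
+ */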
+static void __iodesc_make_uptodate(void *data)
+{
+	struct iodesc *io = data;
+	struct page *page = io->good_page;
+	int locked = 1;
+
+	pr_debug("__iodesc_make_uptodate: io=%p index=%lu\n", io, page->index);
+again:
+	while (Page_Uptodate(page)) {
+		pr_debug("page index %lu uptodate\n", page->index);
+		if (locked) {
+			UnlockPage(page);
+			locked = 0;
+		}
+		io->did_read = 0;
+		io->good_idx++;
+		if (io->good_idx >= io->nr_pages) {
+			__iodesc_read_finish(io);
+			return;
+		}
+		page = io->good_page = io->pages[io->good_idx];
+		pr_debug("__iodesc_make_uptodate: index=%lu\n", page->index);
+	}
+
+	if (!locked) {
+		if (!wtd_lock_page(&io->wtd, page))
+			return;
+		locked = 1;
+	}
+
+	if (!io->did_read) {
+		/* We haven't tried reading this page before, give it a go. */
+		pr_debug("attempting to read %lu\n", page->index);
+		io->did_read = 1;
+		locked = 0;
+		io->err = page->mapping->a_ops->readpage(io->file, page);
+		if (!io->err) {
+			if (Page_Uptodate(page))
+				goto again;
+			if (wtd_lock_page(&io->wtd, page)) {
+				locked = 1;
+				goto again;
+			}
+			return;
+		}
+	}
+
+	if (locked)
+		UnlockPage(page);
+
+	/* We've already tried to read this page.  Set err to -EIO and quit. */
+	if (!io->err)
+		io->err = -EIO;
+	__iodesc_read_finish(io);
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data);
+
+static void __generic_file_read_iodesc(struct iodesc *io, int mayblock)
+{
+	int (*readpage)(struct file *, struct page *);
+	int i;
+
+	wtd_set_action(&io->wtd, __iodesc_make_uptodate, io);
+	readpage = io->as->a_ops->readpage;
+	for (i=0; i<io->nr_new_pages; i++) {
+		int ret;
+		if (!mayblock) {
+			wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+			wtd_queue(&io->wtd);
+			return;
+		}
+		ret = readpage(io->file, io->new_pages[i]);
+		if (ret)
+			printk(KERN_DEBUG "__generic_file_read_kiovec: readpage(%lu) = %d\n", io->new_pages[i]->index, ret);
+	}
+
+	for (i=0; i<io->nr_pages; i++) {
+		struct page *page = io->pages[i];
+		if (Page_Uptodate(page)) {
+			pr_debug("__generic_file_read_iodesc: %lu is uptodate\n", page->index);
+			continue;
+		}
+
+		if (!mayblock) {
+			wtd_set_action(&io->wtd, __wtdgeneric_file_read_iodesc, io);
+			wtd_queue(&io->wtd);
+			return;
+		}
+		if (!TryLockPage(page)) {
+			int ret = readpage(io->file, page);
+			if (ret)
+				printk(KERN_DEBUG "__generic_file_read_iodesc: readpage(%lu): %d\n", page->index, ret);
+		}
+
+		if (!Page_Uptodate(page) && io->good_idx == -1) {
+			pr_debug("first good_idx=%d (%lu)\n", i, page->index);
+			io->good_idx = i;
+			io->good_page = page;
+		}
+	}
+
+	/* Whee, all the pages are uptodate! */
+	if (!io->good_page) {
+		pr_debug("all pages uptodate!\n");
+		io->good_idx = io->nr_pages;
+		__iodesc_read_finish(io);
+		return;
+	}
+
+	pr_debug("locking good_page\n");
+	if (wtd_lock_page(&io->wtd, io->good_page))
+		__iodesc_make_uptodate(io);
+	return;
+}
+
+static void __wtdgeneric_file_read_iodesc(void *data)
+{
+	struct iodesc *io = data;
+	__generic_file_read_iodesc(io, 1);
+}
+
+static int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+			 size_t size, loff_t pos);
+
+int generic_file_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return generic_file_rw_kvec(file, READ, cb, size, pos);
+}
+
+int generic_file_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	return generic_file_rw_kvec(file, WRITE, cb, size, pos);
+}
+
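+/* Common setup for kvec based reads and writes: validate the request,
+ * take i_sem and apply size limits for writes, clamp reads to i_size,
+ * map (allocating where needed) the page-cache pages covering the
+ * range, then hand off to the read or write state machine above.
+ */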
+int generic_file_rw_kvec(struct file *file, int rw, kvec_cb_t cb,
+			 size_t size, loff_t pos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *as = inode->i_mapping;
+	unsigned long index;
+	unsigned long eindex;
+	unsigned long nr_pages;
+	struct iodesc *io = NULL;
+	int ret;
+	int append = 0;
+
+	ret = -EINVAL;
+	if (unlikely(rw != READ && rw != WRITE))
+		goto out;
+
+	append = unlikely(0 != (file->f_flags & O_APPEND));
+
+	/* Don't check pos when appending, but otherwise do sanity
+	 * checks before allocating memory.  Negative offsets are invalid.
+	 */
+	if (unlikely(!append && pos < 0))
+		goto out;
+
+	ret = -ENOMEM;
+	io = kmalloc(sizeof(*io), GFP_KERNEL);
+	if (!io)
+		goto out;
+
+	memset(io, 0, sizeof(*io));
+	io->size = size;
+
+	/* FIXME: make the down a WTD_op */
+	if (rw == WRITE) {
+		unsigned long long tmp;
+		loff_t limit;
+
+		down(&inode->i_sem);
+		if (append)
+			pos = inode->i_size;
+
+		limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (likely(RLIM_INFINITY == limit))
+			limit = OFFSET_MAX;
+
+		/* Filesystem limits take precedence over user limits */
+		if (likely(inode->i_sb->s_maxbytes < limit))
+			limit = inode->i_sb->s_maxbytes;
+
+		if (unlikely(pos >= limit)) {
+			pr_debug("maxbytes: %Ld\n", limit);
+			ret = 0;
+			if (size || pos > limit)
+				ret = -EFBIG;
+			goto out_io;
+		}
+
+		/* Clamp writes straddling limit. */
+		tmp = pos + size;
+		if (unlikely(tmp > (unsigned long long)limit))
+			size = limit - pos;
+	}
+
+	if (READ == rw) {
+		pr_debug("pos=%Ld i_size=%Ld\n", pos, inode->i_size);
+
+		if (pos > inode->i_size)
+			size = 0;
+		else if ((pos + size) > inode->i_size)
+			size = inode->i_size - pos;
+
+		if (io->size < size)
+			size = io->size;
+		else if (size < io->size)
+			io->size = size;
+
+		pr_debug("io->size=%d size=%d\n", io->size, size);
+	}
+
+	ret = 0;
+	if (unlikely(!size))
+		goto out_io;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	eindex = (pos + size - 1) >> PAGE_CACHE_SHIFT;
+	nr_pages = eindex - index + 1;
+
+	pr_debug("nr_pages: %lu\n", nr_pages);
+
+	io->good_idx = -1;
+	io->good_page = NULL;
+	io->did_read = 0;
+	io->err = 0;
+	io->rw = rw;
+	io->as = as;
+	io->offset = (unsigned long)pos & (PAGE_CACHE_SIZE - 1);
+	io->file = file;
+	io->cb = cb;
+	kvec_dst_init(&io->src, KM_USER0);
+	kvec_dst_set(&io->src, cb.vec->veclet);
+	io->veclet = cb.vec->veclet;
+	if (nr_pages < READDESC_NR_DEF) {
+		io->pages = io->def_pages;
+		io->new_pages = io->def_new_pages;
+	} else {
+		io->pages = kmalloc(sizeof(*io->pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->pages)
+			goto out_io;
+
+		io->new_pages = kmalloc(sizeof(*io->new_pages) * (nr_pages + 1), GFP_KERNEL);
+		if (!io->new_pages)
+			goto out_pages;
+	}
+
+	ret = address_space_map(as, index, nr_pages, io->pages,
+			&io->nr_new_pages, io->new_pages);
+	pr_debug("as_map: %d (%d new)\n", ret, io->nr_new_pages);
+	if (ret <= 0)
+		goto out_new_pages;
+
+	io->nr_pages = ret;
+	io->pages[io->nr_pages] = NULL;
+	io->new_pages[io->nr_new_pages] = NULL;
+
+	if (rw == READ)
+		__generic_file_read_iodesc(io, 0);
+	else if (rw == WRITE)
+		__generic_file_write_iodesc(io);
+
+	return 0;
+
+out_new_pages:
+	if (io->new_pages != io->def_new_pages)
+		kfree(io->new_pages);
+out_pages:
+	if (io->pages != io->def_pages)
+		kfree(io->pages);
+out_io:
+	kfree(io);
+
+	if (rw == WRITE)
+		up(&inode->i_sem);
+out:
+	if (!ret)
+		cb.fn(cb.data, cb.vec, ret);
+	return ret;
+}
diff -urN v2.4.19/mm/memory.c aio-2.4.19.diff/mm/memory.c
--- v2.4.19/mm/memory.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/mm/memory.c	Mon Sep 16 21:54:13 2002
@@ -45,6 +45,8 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/compiler.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -425,6 +427,8 @@
 
 	pte = *ptep;
 	if (pte_present(pte)) {
+		struct page *page = pte_page(pte);
+		prefetch(page);
 		if (!write ||
 		    (pte_write(pte) && pte_dirty(pte)))
 			return pte_page(pte);
@@ -1495,3 +1499,272 @@
 	}
 	return page;
 }
+
+/*
+ * Force in an entire range of pages from the current process's user VA,
+ * and pin them in physical memory.  
+ * FIXME: some architectures need to flush the cache based on user addresses 
+ * here.  Someone please provide a better macro than flush_cache_page.
+ */
+
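+/* Typical life cycle (sketch): pin the user buffer with map_user_kvec()
+ * or mm_map_user_kvec(), perform the I/O against the returned kveclets,
+ * then unmap_kvec() (dirtying the pages for reads) and free_kvec().
+ */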
+#define dprintk(x...)
+atomic_t user_pinned_pages = ATOMIC_INIT(0);
+
+struct kvec *map_user_kvec(int rw, unsigned long ptr, size_t len)
+{
+	return mm_map_user_kvec(current->mm, rw, ptr, len);
+}
+
+struct kvec *mm_map_user_kvec(struct mm_struct *mm, int rw, unsigned long ptr,
+			      size_t len)
+{
+	struct kvec		*vec;
+	struct kveclet		*veclet;
+	unsigned long		end;
+	int			err;
+	struct vm_area_struct *	vma = NULL;
+	int			i;
+	int			datain = (rw == READ);
+	unsigned		nr_pages;
+
+	end = ptr + len;
+	if (unlikely(end < ptr))
+		return ERR_PTR(-EINVAL);
+
+	nr_pages = (ptr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	nr_pages -= ptr >> PAGE_SHIFT;
+	nr_pages++;	/* extra slot for the terminating dummy veclet */
+
+	atomic_add(nr_pages, &user_pinned_pages);
+	err = -EAGAIN;
+	if (unlikely(atomic_read(&user_pinned_pages) >= aio_max_pinned))
+		goto out_adjust;
+
+	vec = kmalloc(sizeof(struct kvec) + nr_pages * sizeof(struct kveclet),
+			GFP_KERNEL);
+	err = -ENOMEM;
+	if (unlikely(!vec))
+		goto out_adjust;
+
+	vec->nr = 0;
+	vec->max_nr = nr_pages;
+	veclet = vec->veclet;
+	
+	/* Make sure the iobuf is not already mapped somewhere. */
+	dprintk ("map_user_kiobuf: begin\n");
+	
+	down_read(&mm->mmap_sem);
+
+	err = -EFAULT;
+	
+	i = 0;
+
+	/* 
+	 * First of all, try to fault in all of the necessary pages
+	 */
+	while (ptr < end) {
+		struct page *map;
+		veclet->offset = ptr & ~PAGE_MASK;
+		veclet->length = PAGE_SIZE - veclet->offset;
+		if (len < veclet->length)
+			veclet->length = len;
+		ptr &= PAGE_MASK;
+		len -= veclet->length;
+
+		if (!vma || ptr >= vma->vm_end) {
+			vma = find_vma(mm, ptr);
+			if (unlikely(!vma))
+				goto out_unlock;
+			if (unlikely(vma->vm_start > ptr)) {
+				if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+					goto out_unlock;
+				if (unlikely(expand_stack(vma, ptr)))
+					goto out_unlock;
+			}
+			if (unlikely(((datain) && (!(vma->vm_flags & VM_WRITE))) ||
+					(!(vma->vm_flags & VM_READ)))) {
+				err = -EFAULT;
+				goto out_unlock;
+			}
+		}
+		spin_lock(&mm->page_table_lock);
+		while (unlikely(!(map = follow_page(mm, ptr, datain)))) {
+			int ret;
+
+			spin_unlock(&mm->page_table_lock);
+			ret = handle_mm_fault(mm, vma, ptr, datain);
+			if (ret <= 0) {
+				if (!ret)
+					goto out_unlock;
+				else {
+					err = -ENOMEM;
+					goto out_unlock;
+				}
+			}
+			spin_lock(&mm->page_table_lock);
+		}			
+		map = get_page_map(map);
+		if (likely(map != NULL)) {
+			flush_dcache_page(map);
+			atomic_inc(&map->count);
+		} else
+			printk (KERN_INFO "Mapped page missing [%d]\n", i);
+		spin_unlock(&mm->page_table_lock);
+		veclet->page = map;
+		veclet++;
+
+		ptr += PAGE_SIZE;
+		vec->nr = ++i;
+	}
+
+	veclet->page = NULL;	/* dummy for the prefetch in unmap_kvec */
+	veclet->length = 0;	/* bug checking ;-) */
+
+	up_read(&mm->mmap_sem);
+	dprintk ("map_user_kiobuf: end OK\n");
+	return vec;
+
+ out_unlock:
+	up_read(&mm->mmap_sem);
+	unmap_kvec(vec, 0);
+	kfree(vec);
+	dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw);
+	return ERR_PTR(err);
+
+ out_adjust:
+	atomic_sub(nr_pages, &user_pinned_pages);
+	dprintk("map_user_kvec: err(%d) rw=%d\n", err, rw);
+	return ERR_PTR(err);
+}
+
+/*
+ * Unmap all of the pages referenced by a kiobuf.  We release the pages,
+ * and unlock them if they were locked. 
+ */
+
+void unmap_kvec (struct kvec *vec, int dirtied)
+{
+	struct kveclet *veclet = vec->veclet;
+	struct kveclet *end = vec->veclet + vec->nr;
+	struct page *map = veclet->page;
+
+	prefetchw(map);
+	for (; veclet<end; map = (++veclet)->page) {
+		prefetchw(veclet[1].page);
+		if (likely(map != NULL) && !PageReserved(map)) {
+			if (dirtied) {
+				SetPageDirty(map);
+				flush_dcache_page(map);	/* FIXME */
+			}
+			__free_page(map);
+		}
+	}
+
+	atomic_sub(vec->max_nr, &user_pinned_pages);
+	vec->nr = 0;
+}
+
+void free_kvec(struct kvec *vec)
+{
+	if (unlikely(vec->nr))
+		BUG();
+	kfree(vec);
+}
+
+/* kvec memory copy helper: copies len bytes from the linear buffer
+ * 'from' into the kvec destination 'dst', advancing dst as it goes.
+ */
+void memcpy_to_kvec_dst(struct kvec_dst *dst, const char *from, long len)
+{
+	if (unlikely(len < 0))
+		BUG();
+	do {
+		int cnt = len;
+		if (dst->space < cnt)
+			cnt = dst->space;
+
+		memcpy(dst->dst, from, cnt);
+		from += cnt;
+		dst->space -= cnt;
+		dst->dst += cnt;
+		len -= cnt;
+		if (!dst->space && len) {
+			kvec_dst_unmap(dst);
+			dst->let++;
+			dst->offset = 0;
+			kvec_dst_map(dst);
+			if (unlikely(!dst->space))
+				BUG();
+		}
+	} while (len);
+}
+
+/* kvec memory copy helper: copies and consumes len bytes from the kvec
+ * source 'from' into the linear buffer 'to'.
+ */
+void memcpy_from_kvec_dst(char *to, struct kvec_dst *from, long len)
+{
+	if (unlikely(len < 0))
+		BUG();
+	do {
+		int cnt = len;
+		if (from->space < cnt)
+			cnt = from->space;
+
+		memcpy(to, from->dst, cnt);
+		to += cnt;
+		from->space -= cnt;
+		from->dst += cnt;
+		len -= cnt;
+		if (unlikely(!from->space && len)) {
+			kvec_dst_unmap(from);
+			from->let++;
+			from->offset = 0;
+			kvec_dst_map(from);
+			if (unlikely(!from->space))
+				BUG();
+		}
+	} while (len);
+}
+
+/* Copy len bytes from the user buffer 'from' into the kvec 'to',
+ * starting at byte 'offset' within the kvec.
+ */
+int copy_user_to_kvec(struct kvec *to, size_t offset, const char *from, size_t len)
+{
+	struct kveclet *let = to->veclet;
+	int ret = 0;
+
+	if ((ssize_t)len < 0)
+		BUG();
+
+	while (offset) {
+		if (offset < let->length)
+			break;
+		offset -= let->length;
+		let++;
+
+		if ((let - to->veclet) > to->nr)
+			BUG();
+	}
+
+	/* FIXME: kmap deadlockage */
+	while (len && !ret) {
+		char *dst = kmap(let->page);
+		size_t this;
+
+		this = let->length - offset;
+		if (len < this)
+			this = len;
+
+		offset += let->offset;
+		if (copy_from_user(dst+offset, from, this))
+			ret = -EFAULT;
+
+		from += this;
+		len -= this;
+		kunmap(let->page);
+		offset = 0;
+		let ++;
+	}
+
+	return ret;
+}
+
diff -urN v2.4.19/mm/mmap.c aio-2.4.19.diff/mm/mmap.c
--- v2.4.19/mm/mmap.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/mm/mmap.c	Mon Sep 16 21:54:13 2002
@@ -14,6 +14,7 @@
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
+#include <linux/compiler.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
diff -urN v2.4.19/mm/wtd.c aio-2.4.19.diff/mm/wtd.c
--- v2.4.19/mm/wtd.c	Wed Dec 31 19:00:00 1969
+++ aio-2.4.19.diff/mm/wtd.c	Mon Sep 16 21:54:13 2002
@@ -0,0 +1,73 @@
+#include <linux/worktodo.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+static void __wtd_lock_page_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct page *page = (struct page *)wtd->data;
+
+	if (!TryLockPage(page)) {
+		__remove_wait_queue(page_waitqueue(page), &wtd->wait);
+		wtd_queue(wtd);
+	} else
+		schedule_task(&run_disk_tq);
+}
+
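+/* Asynchronous lock_page(): returns 1 if the page lock was obtained
+ * immediately; otherwise 0 is returned and the worktodo is run (with
+ * the lock held) once the page becomes available.
+ */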
+int wtd_lock_page(struct worktodo *wtd, struct page *page)
+{
+	if (TryLockPage(page)) {
+		wtd->data = page;
+		init_waitqueue_func_entry(&wtd->wait, __wtd_lock_page_waiter);
+
+		/* Wakeups may race with TryLockPage, so try again within the wait 
+		 * queue spinlock.
+		 */
+		if (!add_wait_queue_cond(page_waitqueue(page), &wtd->wait,
+					TryLockPage(page))) {
+			/* Page is still locked.  Kick the disk queue... */
+			run_task_queue(&tq_disk);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static void __wtd_bh_waiter(wait_queue_t *wait)
+{
+	struct worktodo *wtd = (struct worktodo *)wait;
+	struct buffer_head *bh = (struct buffer_head *)wtd->data;
+
+	if (!buffer_locked(bh)) {
+		__remove_wait_queue(&bh->b_wait, &wtd->wait);
+		wtd_queue(wtd);
+	} else {
+		schedule_task(&run_disk_tq);
+	}
+}
+
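+/* Asynchronous wait_on_buffer(): returns 1 if the buffer is already
+ * unlocked; otherwise 0 is returned and the worktodo is queued once
+ * the buffer I/O completes.
+ */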
+int wtd_wait_on_buffer(struct worktodo *wtd, struct buffer_head *bh)
+{
+	if (!buffer_locked(bh)) {
+		return 1;
+	}
+	wtd->data = bh;
+	init_waitqueue_func_entry(&wtd->wait, __wtd_bh_waiter);
+	if (add_wait_queue_cond(&bh->b_wait, &wtd->wait, buffer_locked(bh)))
+		return 1;
+	run_task_queue(&tq_disk);
+	return 0;
+}
+
+void do_run_tq_disk(void *data)
+{
+	run_task_queue(&tq_disk);
+}
+
+struct tq_struct run_disk_tq = {
+	routine: do_run_tq_disk,
+	data: NULL
+};
+
diff -urN v2.4.19/net/core/datagram.c aio-2.4.19.diff/net/core/datagram.c
--- v2.4.19/net/core/datagram.c	Tue Jan  1 14:09:35 2002
+++ aio-2.4.19.diff/net/core/datagram.c	Mon Sep 16 21:54:13 2002
@@ -8,6 +8,8 @@
  *
  *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old udp.c code)
  *
+ *	Portions Copyright 2001 Red Hat, Inc.
+ *
  *	Fixes:
  *		Alan Cox	:	NULL return from skb_peek_copy() understood
  *		Alan Cox	:	Rewrote skb_read_datagram to avoid the skb_peek_copy stuff.
@@ -21,6 +23,7 @@
  *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
  *		Alan Cox	:	POSIXisms
  *		Pete Wyckoff    :       Unconnected accept() fix.
+ *		Benjamin LaHaise:	added kvec operations
  *
  */
 
@@ -37,6 +40,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/poll.h>
 #include <linux/highmem.h>
+#include <linux/worktodo.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -446,3 +450,321 @@
 
 	return mask;
 }
+
+/* Copy len bytes of the skb, starting at 'offset', into the kvec
+ * destination: the linear header first, then any page frags and the
+ * frag list.
+ */
+static inline void skb_copy_datagram_kvec_dst(const struct sk_buff *skb,
+		int offset, struct kvec_dst *dst, int len)
+{
+	int i, copy;
+	int start = skb->len - skb->data_len;
+
+	/* Copy header. */
+	if ((copy = start-offset) > 0) {
+		if (copy > len)
+			copy = len;
+		memcpy_to_kvec_dst(dst, skb->data + offset, copy);
+		if ((len -= copy) == 0)
+			return;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset+len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end-offset) > 0) {
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap_atomic(page, KM_USER1);
+			memcpy_to_kvec_dst(dst, vaddr + frag->page_offset +
+					     offset-start, copy);
+			kunmap_atomic(vaddr, KM_USER1);
+			if (!(len -= copy))
+				return;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list;
+
+		for (list = skb_shinfo(skb)->frag_list; list; list=list->next) {
+			int end;
+
+			BUG_TRAP(start <= offset+len);
+
+			end = start + list->len;
+			if ((copy = end-offset) > 0) {
+				if (copy > len)
+					copy = len;
+				skb_copy_datagram_kvec_dst(list, offset-start, dst, copy);
+				if ((len -= copy) == 0)
+					return;
+				offset += copy;
+			}
+			start = end;
+		}
+	}
+}
+
+void skb_copy_datagram_kvec(const struct sk_buff *skb, int offset,
+			   struct kvec *vec, int len)
+{
+	struct kvec_dst dst;
+	kvec_dst_init(&dst, KM_USER0);
+	kvec_dst_set(&dst, vec->veclet);
+	kvec_dst_map(&dst);
+	skb_copy_datagram_kvec_dst(skb, offset, &dst, len);
+	kvec_dst_unmap(&dst);
+}
+
+/* C++ would be better for this.  Please don't torture me with this code 
+ * ever again.
+ */
+static inline unsigned int csum_and_copy_to_dst(struct kvec_dst *dst,
+				 const char *from, int len, unsigned int csum)
+{
+	do {
+		int cnt = len;
+		if (dst->space < cnt)
+			cnt = dst->space;
+
+		/* csum_partial_copy_nocheck() both copies and checksums,
+		 * so no separate memcpy is needed. */
+		csum = csum_partial_copy_nocheck(from, dst->dst, cnt, csum);
+		from += cnt;
+		dst->space -= cnt;
+		dst->dst += cnt;
+		len -= cnt;
+		if (!dst->space && len) {
+			kvec_dst_unmap(dst);
+			dst->let++;
+			dst->offset = 0;
+			kvec_dst_map(dst);
+			if (!dst->space)
+				BUG();
+		}
+	} while (len);
+	return csum;
+}
+
+static inline void skb_copy_and_csum_datagram_kvec_dst(const struct sk_buff *skb, int offset, struct kvec_dst *dst, int len, unsigned int *csump)
+{
+	int i, copy;
+	int start = skb->len - skb->data_len;
+	int pos = 0;
+
+	/* Copy header. */
+	if ((copy = start-offset) > 0) {
+		if (copy > len)
+			copy = len;
+		*csump = csum_and_copy_to_dst(dst, skb->data+offset, copy, *csump);
+		if ((len -= copy) == 0)
+			return;
+		offset += copy;
+		pos = copy;
+	}
+
+	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset+len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end-offset) > 0) {
+			unsigned int csum2;
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap_atomic(page, KM_USER1);
+			csum2 = csum_and_copy_to_dst(dst,
+				vaddr + frag->page_offset + offset-start,
+				copy, 0);
+			kunmap_atomic(vaddr, KM_USER1);
+			*csump = csum_block_add(*csump, csum2, pos);
+			if (!(len -= copy))
+				return;
+			offset += copy;
+			pos += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list;
+
+		for (list = skb_shinfo(skb)->frag_list; list; list=list->next) {
+			int end;
+
+			BUG_TRAP(start <= offset+len);
+
+			end = start + list->len;
+			if ((copy = end-offset) > 0) {
+				unsigned int csum2 = 0;
+				if (copy > len)
+					copy = len;
+				skb_copy_and_csum_datagram_kvec_dst(list, offset-start, dst, copy, &csum2);
+				*csump = csum_block_add(*csump, csum2, pos);
+				if ((len -= copy) == 0)
+					return;
+				offset += copy;
+				pos += copy;
+			}
+			start = end;
+		}
+	}
+}
+
+int skb_copy_and_csum_datagram_kvec(const struct sk_buff *skb, int offset,
+			   struct kvec *vec, int len)
+{
+	unsigned int csum;
+	struct kvec_dst dst;
+
+	csum = csum_partial(skb->data, offset, skb->csum);
+
+	kvec_dst_init(&dst, KM_USER0);
+	kvec_dst_set(&dst, vec->veclet);
+	kvec_dst_map(&dst);
+	skb_copy_and_csum_datagram_kvec_dst(skb, offset, &dst, len, &csum);
+	kvec_dst_unmap(&dst);
+
+	if ((unsigned short)csum_fold(csum))
+		return -EINVAL;
+	return 0;
+}
+
+struct skb_async_info {
+	struct worktodo	wtd;
+	struct sock	*sk;
+	int		len;
+	void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb);
+	kvec_cb_t	cb;
+};
+static void skb_async_read_worker(void *_data);
+
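+/* Asynchronous skb_recv_datagram(): hand the first queued skb to the
+ * 'finish' callback, or park the worker on sk->sleep so it runs as
+ * soon as data or an error arrives.
+ */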
+int skb_kvec_recv_datagram(struct sock * sk, kvec_cb_t cb, int len,
+	void (*finish)(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb))
+{
+	struct skb_async_info *info = kmalloc(sizeof(struct skb_async_info), GFP_KERNEL);
+	if (info) {
+		wtd_set_action(&info->wtd, skb_async_read_worker, info);
+		info->sk = sk;
+		info->len = len;
+		info->finish = finish;
+		info->cb = cb;
+		skb_async_read_worker(info);
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+static void skb_async_read_waiter(wait_queue_t *wait)
+{
+	struct skb_async_info *info = (void *)wait;
+	__remove_wait_queue(info->sk->sleep, &info->wtd.wait);
+	wtd_queue(&info->wtd);
+}
+
+static void skb_async_read_worker(void *_data)
+{
+	struct skb_async_info	*info = _data;
+	struct sock *sk = info->sk;
+	struct sk_buff *skb;
+	int error;
+
+	/* Caller is allowed not to check sk->err before skb_recv_datagram() */
+	error = sock_error(sk);
+	if (error)
+		goto no_packet;
+
+
+	init_waitqueue_func_entry(&info->wtd.wait, skb_async_read_waiter);
+
+	/* Attempt to dequeue and process any skbs that have already arrived.
+	 * Note that add_wait_queue_cond is used to check against a race
+	 * where an skb is added to the queue after we checked but before 
+	 * the callback is added to the wait queue.
+	 */
+	do {
+		skb = skb_dequeue(&sk->receive_queue);
+		if (skb) {
+			info->finish(sk, info->cb, info->len, skb);
+			kfree(info);
+			return;
+		}
+	} while ( add_wait_queue_cond( sk->sleep, &info->wtd.wait,
+					(!(error = sock_error(sk)) &&
+					skb_queue_empty(&sk->receive_queue)) )
+		  && !error);
+
+	if (!error)
+		return;
+
+no_packet:
+	info->cb.fn(info->cb.data, info->cb.vec, error);
+	kfree(info);
+	return;
+}
+
+#if 0
+static void skb_async_read_worker(void *_data)
+{
+	struct skb_async_info	*info = _data;
+	int error;
+
+	/* Socket errors? */
+	error = sock_error(sk);
+	if (error)
+		goto out_err;
+
+	if (!skb_queue_empty(&sk->receive_queue))
+		goto ready;
+
+	/* Socket shut down? */
+	if (sk->shutdown & RCV_SHUTDOWN)
+		goto out_noerr;
+
+	/* Sequenced packets can come disconnected. If so we report the problem */
+	error = -ENOTCONN;
+	if(connection_based(sk) && !(sk->state==TCP_ESTABLISHED || sk->state==TCP_LISTEN))
+		goto out_err;
+
+	/* handle signals */
+	if (signal_pending(current))
+		goto interrupted;
+
+	/* here: queue sleep */
+	*timeo_p = schedule_timeout(*timeo_p);
+	return;
+
+ready:
+	current->state = TASK_RUNNING;
+	remove_wait_queue(sk->sleep, &wait);
+	return 0;
+
+interrupted:
+	error = sock_intr_errno(*timeo_p);
+out_err:
+	*err = error;
+out:
+	current->state = TASK_RUNNING;
+	remove_wait_queue(sk->sleep, &wait);
+	return error;
+out_noerr:
+	*err = 0;
+	error = 1;
+	goto out;
+}
+#endif
diff -urN v2.4.19/net/core/sock.c aio-2.4.19.diff/net/core/sock.c
--- v2.4.19/net/core/sock.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/net/core/sock.c	Mon Sep 16 21:54:13 2002
@@ -587,6 +587,8 @@
 	if(sk && zero_it) {
 		memset(sk, 0, sizeof(struct sock));
 		sk->family = family;
+		INIT_LIST_HEAD(&sk->kvec_read_list);
+		INIT_LIST_HEAD(&sk->kvec_write_list);
 		sock_lock_init(sk);
 	}
 
@@ -1117,7 +1119,7 @@
 void sock_def_wakeup(struct sock *sk)
 {
 	read_lock(&sk->callback_lock);
-	if (sk->sleep && waitqueue_active(sk->sleep))
+	if (sk->sleep)
 		wake_up_interruptible_all(sk->sleep);
 	read_unlock(&sk->callback_lock);
 }
@@ -1125,7 +1127,7 @@
 void sock_def_error_report(struct sock *sk)
 {
 	read_lock(&sk->callback_lock);
-	if (sk->sleep && waitqueue_active(sk->sleep))
+	if (sk->sleep)
 		wake_up_interruptible(sk->sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->callback_lock);
@@ -1134,7 +1136,7 @@
 void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->callback_lock);
-	if (sk->sleep && waitqueue_active(sk->sleep))
+	if (sk->sleep)
 		wake_up_interruptible(sk->sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->callback_lock);
@@ -1148,7 +1150,7 @@
 	 * progress.  --DaveM
 	 */
 	if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) {
-		if (sk->sleep && waitqueue_active(sk->sleep))
+		if (sk->sleep)
 			wake_up_interruptible(sk->sleep);
 
 		/* Should agree with poll, otherwise some programs break */
diff -urN v2.4.19/net/ipv4/af_inet.c aio-2.4.19.diff/net/ipv4/af_inet.c
--- v2.4.19/net/ipv4/af_inet.c	Fri Aug  9 13:50:46 2002
+++ aio-2.4.19.diff/net/ipv4/af_inet.c	Mon Sep 16 21:54:13 2002
@@ -729,6 +729,19 @@
 }
 
 
+int inet_kvec_read(struct socket *sock, kvec_cb_t cb, size_t len)
+{
+	struct sock *sk = sock->sk;
+
+	return sk->prot->kvec_read(sk, cb, len);
+}
+
+int inet_kvec_write(struct socket *sock, kvec_cb_t cb, size_t len)
+{
+	struct sock *sk = sock->sk;
+
+	return sk->prot->kvec_write(sk, cb, len);
+}
 
 int inet_recvmsg(struct socket *sock, struct msghdr *msg, int size,
 		 int flags, struct scm_cookie *scm)
@@ -960,7 +973,9 @@
 	sendmsg:	inet_sendmsg,
 	recvmsg:	inet_recvmsg,
 	mmap:		sock_no_mmap,
-	sendpage:	tcp_sendpage
+	sendpage:	tcp_sendpage,
+	kvec_read:	inet_kvec_read,
+	kvec_write:	inet_kvec_write,
 };
 
 struct proto_ops inet_dgram_ops = {
@@ -982,6 +997,8 @@
 	recvmsg:	inet_recvmsg,
 	mmap:		sock_no_mmap,
 	sendpage:	sock_no_sendpage,
+	kvec_read:	inet_kvec_read,
+	kvec_write:	inet_kvec_write,
 };
 
 struct net_proto_family inet_family_ops = {
diff -urN v2.4.19/net/ipv4/tcp.c aio-2.4.19.diff/net/ipv4/tcp.c
--- v2.4.19/net/ipv4/tcp.c	Fri Aug  9 13:50:47 2002
+++ aio-2.4.19.diff/net/ipv4/tcp.c	Mon Sep 16 21:54:13 2002
@@ -252,6 +252,7 @@
 #include <linux/init.h>
 #include <linux/smp_lock.h>
 #include <linux/fs.h>
+#include <linux/compiler.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -677,11 +678,266 @@
 	return 0;
 }
 
+struct tcp_write_async_info {
+	struct worktodo	wtd;
+	struct sock	*sk;
+	int		len;
+	int		done;
+	int		offset;
+	struct kveclet	*cur_let;
+	kvec_cb_t	cb;
+	spinlock_t	lock;
+};
+
+static void async_lock_sock_wait(wait_queue_t *wait)
+{
+	struct tcp_write_async_info *info = (void *)wait;
+	printk("async_lock_sock_wait(%p)\n", info);
+	if (!info->sk->lock.users) {
+		printk("async_lock_sock_wait: queuing\n");
+		__remove_wait_queue(info->sk->sleep, &info->wtd.wait);
+		wtd_queue(&info->wtd);
+	}
+}
+
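+/* Asynchronous lock_sock(): if a user context owns the socket, park the
+ * worktodo on sk->sleep and retry from the wakeup; otherwise mark the
+ * socket owned and queue the next step of the request.
+ */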
+static void async_lock_sock(void *data)
+{
+	struct tcp_write_async_info *info = data;
+	struct sock *sk;
+	printk(KERN_DEBUG "async_lock_sock(%p)\n", info);
+	sk = info->sk;
+	spin_lock_bh(&sk->lock.slock);
+	if (sk->lock.users) {
+		printk(KERN_DEBUG "async_lock_sock: waiting\n");
+		wtd_push(&info->wtd, async_lock_sock, info);
+		init_waitqueue_func_entry(&info->wtd.wait, async_lock_sock_wait);
+		if (!add_wait_queue_cond(sk->sleep, &info->wtd.wait, !sk->lock.users)) {
+			spin_unlock_bh(&sk->lock.slock);
+			return;
+		}
+		wtd_pop(&info->wtd);
+	}
+	printk(KERN_DEBUG "async_lock_sock: locking\n");
+	sk->lock.users = 1;
+	spin_unlock_bh(&sk->lock.slock);
+	wtd_queue(&info->wtd);
+}
+
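+/* Queue an asynchronous TCP write: allocate the per-request state, then
+ * asynchronously lock the socket and wait for the connection to be
+ * established before pushing the kvec through do_tcp_sendpages().
+ */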
+static void async_wait_for_tcp_connect(void *data);
+int tcp_kvec_write(struct sock *sk, kvec_cb_t cb, int len)
+{
+	struct tcp_write_async_info *info;
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	printk(KERN_DEBUG "tcp_kvec_write: %p\n", info);
+	if (!info)
+		return -ENOMEM;
+	wtd_init(&info->wtd, async_wait_for_tcp_connect);
+	info->sk = sk;
+	info->len = len;
+	info->done = 0;
+	info->offset = 0;
+	info->cur_let = cb.vec->veclet;
+	info->cb = cb;
+	spin_lock_init(&info->lock);
+	async_lock_sock(info);
+	return 0;
+}
+
+static void async_cn_wait_task(void *data)
+{
+	struct tcp_write_async_info *info = (void *)data;
+	async_lock_sock(info);
+}
+
+static void async_cn_wait(wait_queue_t *wait)
+{
+	struct tcp_write_async_info *info = (void *)wait;
+	__remove_wait_queue(info->sk->sleep, &info->wtd.wait);
+	wtd_set_action(&info->wtd, async_cn_wait_task, info);
+	wtd_queue(&info->wtd);
+}
+
+/* sock_get_iocb
+ *	Attempts to allocate a local socket iocb, which allows high
+ *	performance for the common cases of a small number of ios
+ *	outstanding per socket.
+ */
+struct sock_iocb *sock_get_iocb(struct sock *sk)
+{
+	struct sock_iocb *iocb;
+
+	iocb = kmalloc(sizeof(*iocb), GFP_KERNEL);
+	return iocb;
+}
+
+void sock_put_iocb(struct sock_iocb *iocb)
+{
+	kfree(iocb);
+}
+
+/* tcp_kvec_read_kick
+ *	Attempts to process an async read request.  Must be called with 
+ *	the socket lock held.
+ */
+void tcp_kvec_read_kick(struct sock *sk, struct sock_iocb *iocb)
+{
+	TCP_CHECK_TIMER(sk);
+#if 0
+	if (unlikely(TCP_LISTEN == sk->state))
+		goto out;
+#endif
+	return;
+}
+
+/* tcp_kvec_read
+ *	Queues an async read request on a socket.  If there were
+ *	no outstanding read requests, kicks the backlog processing.
+ */
+int tcp_kvec_read(struct sock *sk, kvec_cb_t cb, int size)
+{
+	struct sock_iocb *iocb;
+	printk("tcp_kvec_read(%p, %d): blah", sk, size);
+
+	iocb = sock_get_iocb(sk);
+	if (unlikely(NULL == iocb))
+		return -ENOMEM;
+
+	iocb->cb = cb;
+	kvec_dst_init(&iocb->dst, KM_USER0);
+
+	spin_lock_bh(&sk->lock.slock);
+	if (sk->lock.users != 0 || !list_empty(&sk->kvec_read_list)) {
+		list_add_tail(&iocb->list, &sk->kvec_read_list);
+		spin_unlock_bh(&sk->lock.slock);
+		return 0;
+	}
+	spin_unlock_bh(&sk->lock.slock);
+
+	/* We're the head read request and now own the socket lock;
+	 * attempt to kick off processing.
+	 */
+	tcp_kvec_read_kick(sk, iocb);
+	release_sock(sk);
+	return 0;
+}
+
+static void tcp_kvec_write_worker(struct tcp_write_async_info *info);
+static void async_wait_for_tcp_connect(void *data)
+{
+	struct tcp_write_async_info *info = data;
+	struct sock *sk = info->sk;
+	int err;
+	/* At this point the socket is locked for us. */
+	while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+		if (sk->err) {
+			err = sock_error(sk);
+			goto error;
+		}
+		if ((1 << sk->state) &
+		   ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+			err = -EPIPE;
+			goto error;
+		}
+
+		sk->tp_pinfo.af_tcp.write_pending++;
+		init_waitqueue_func_entry(&info->wtd.wait, async_cn_wait);
+
+		/* Add our worker to the socket queue, but make sure the socket 
+		 * state isn't changed from when we checked while we do so.
+		 */
+		if (!add_wait_queue_cond(sk->sleep, &info->wtd.wait,
+			((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+			)) {
+			release_sock(sk);
+			return;
+		}
+	}
+	/* sk is now locked *and* the connection is established, let's 
+	 * proceed to the data transfer stage.
+	 */
+	tcp_kvec_write_worker(info);
+	return;
+
+error:
+	release_sock(sk);
+	info->cb.fn(info->cb.data, info->cb.vec, err);
+	kfree(info);
+}
+
 static inline int tcp_memory_free(struct sock *sk)
 {
 	return sk->wmem_queued < sk->sndbuf;
 }
 
+static void async_wait_for_tcp_memory(struct tcp_write_async_info *info);
+static void async_wait_for_tcp_memory_done(void *data)
+{
+	struct tcp_write_async_info *info = data;
+	info->sk->tp_pinfo.af_tcp.write_pending--;
+	if (tcp_memory_free(info->sk))
+		tcp_kvec_write_worker(info);
+	else
+		async_wait_for_tcp_memory(info);
+}
+
+static void async_wait_for_tcp_memory_waiting(void *data)
+{
+	struct tcp_write_async_info *info = data;
+	wtd_set_action(&info->wtd, async_wait_for_tcp_memory_done, info);
+	async_lock_sock(info);
+}
+
+static void async_wait_for_tcp_memory_wake(wait_queue_t *wait)
+{
+	struct tcp_write_async_info *info = (void *)wait;
+	__remove_wait_queue(info->sk->sleep, &info->wtd.wait);
+	wtd_set_action(&info->wtd, async_wait_for_tcp_memory_waiting, info);
+	wtd_queue(&info->wtd);
+}
+
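+/* Asynchronous counterpart of wait_for_tcp_memory(): register on
+ * sk->sleep and resume tcp_kvec_write_worker() once send buffer space
+ * frees up, or complete the request on error/shutdown.
+ */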
+static void async_wait_for_tcp_memory(struct tcp_write_async_info *info)
+{
+	struct sock *sk = info->sk;
+	ssize_t res;
+	kvec_cb_t cb;
+	int raced = 0;
+
+	printk("async_wait_for_tcp_memory(%p)\n", info);
+	res = -EPIPE;
+	if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
+		goto err;
+
+	if (tcp_memory_free(sk))
+		printk("async_wait_for_tcp_memory: spinning?\n");
+
+	init_waitqueue_func_entry(&info->wtd.wait, async_wait_for_tcp_memory_wake);
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
+	set_bit(SOCK_NOSPACE, &sk->socket->flags);
+	raced = add_wait_queue_cond( sk->sleep, &info->wtd.wait,
+		!(sk->err || (sk->shutdown & SEND_SHUTDOWN) || tcp_memory_free(sk)) );
+
+	sk->tp_pinfo.af_tcp.write_pending++;
+	if (raced) {
+		/* Requeue to be run here: this allows other tasks to 
+		 * get rescheduled in case of bugs
+		 */
+		wtd_set_action(&info->wtd, async_wait_for_tcp_memory_done, info);
+		wtd_queue(&info->wtd);
+		return;
+	}
+
+	release_sock(sk);
+	return;
+
+err:
+	printk("async_wait_for_tcp_memory: err %ld\n", (long)res);
+	if (info->done)
+		res = info->done;
+	cb = info->cb;
+	kfree(info);
+	cb.fn(cb.data, cb.vec, res);
+}
+
 /*
  *	Wait for more memory for a socket
  */
@@ -692,9 +948,17 @@
 	long current_timeo = *timeo;
 	DECLARE_WAITQUEUE(wait, current);
 
+	if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
+		return -EPIPE;
+
 	if (tcp_memory_free(sk))
 		current_timeo = vm_wait = (net_random()%(HZ/5))+2;
 
+	if (!*timeo) {
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
+		return -EAGAIN;
+	}
+
 	add_wait_queue(sk->sleep, &wait);
 	for (;;) {
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
@@ -745,7 +1009,7 @@
 	goto out;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -824,7 +1088,7 @@
 	return err;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	int mss_now;
@@ -851,9 +1115,14 @@
 		int offset, size, copy, i;
 		struct page *page;
 
-		page = pages[poffset/PAGE_SIZE];
-		offset = poffset % PAGE_SIZE;
-		size = min_t(size_t, psize, PAGE_SIZE-offset);
+		while (poffset >= let->length) {
+			poffset -= let->length;
+			let++;
+		}
+
+		page = let->page;
+		offset = let->offset + poffset;
+		size = min_t(unsigned int, psize, let->length);
 
 		if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
 new_segment:
@@ -893,6 +1162,10 @@
 
 		copied += copy;
 		poffset += copy;
+		if (poffset >= let->length) {
+			poffset = 0;
+			let++;
+		}
 		if (!(psize -= copy))
 			goto out;
 
@@ -932,6 +1205,7 @@
 
 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
 {
+	struct kveclet let = { page, offset, size };
 	ssize_t res;
 	struct sock *sk = sock->sk;
 
@@ -941,16 +1215,54 @@
 	    !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
 		return sock_no_sendpage(sock, page, offset, size, flags);
 
-#undef TCP_ZC_CSUM_FLAGS
 
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
-	res = do_tcp_sendpages(sk, &page, offset, size, flags);
+	res = do_tcp_sendpages(sk, &let, 0, size, flags);
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
 	return res;
 }
 
+static void tcp_kvec_write_worker(struct tcp_write_async_info *info)
+{
+	struct sock *sk = info->sk;
+	int res;
+	if (!(sk->route_caps & NETIF_F_SG) || 
+	    !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
+		BUG();
+
+	res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT);
+	if (res > 0)
+		info->done += res;
+
+	if (res == -EAGAIN) {
+		printk("tcp_kvec_write_worker: -EAGAIN: queuing\n");
+		goto requeue;
+	}
+
+	while (res > info->cur_let->length) {
+		res -= info->cur_let->length;
+		info->cur_let++;
+	}
+
+	if (res <= 0 || (info->done >= info->len)) {
+		kvec_cb_t cb = info->cb;
+		printk("tcp_kvec_write_worker: error(%d)\n", res);
+		if (info->done)
+			res = info->done;
+		release_sock(sk);
+		kfree(info);
+		cb.fn(cb.data, cb.vec, res);
+		return;
+	}
+
+requeue:
+	async_wait_for_tcp_memory(info);
+}
+
+#undef TCP_ZC_CSUM_FLAGS
+
 #define TCP_PAGE(sk)	(sk->tp_pinfo.af_tcp.sndmsg_page)
 #define TCP_OFF(sk)	(sk->tp_pinfo.af_tcp.sndmsg_off)
 
diff -urN v2.4.19/net/ipv4/tcp_ipv4.c aio-2.4.19.diff/net/ipv4/tcp_ipv4.c
--- v2.4.19/net/ipv4/tcp_ipv4.c	Fri Aug  9 13:50:47 2002
+++ aio-2.4.19.diff/net/ipv4/tcp_ipv4.c	Mon Sep 16 21:54:13 2002
@@ -2298,6 +2298,8 @@
 	hash:		tcp_v4_hash,
 	unhash:		tcp_unhash,
 	get_port:	tcp_v4_get_port,
+	kvec_read:	tcp_kvec_read,
+	kvec_write:	tcp_kvec_write,
 };
 
 
diff -urN v2.4.19/net/ipv4/udp.c aio-2.4.19.diff/net/ipv4/udp.c
--- v2.4.19/net/ipv4/udp.c	Fri Aug  9 13:50:47 2002
+++ aio-2.4.19.diff/net/ipv4/udp.c	Mon Sep 16 21:54:13 2002
@@ -93,6 +93,7 @@
 #include <net/route.h>
 #include <net/inet_common.h>
 #include <net/checksum.h>
+#include <linux/compiler.h>
 
 /*
  *	Snmp MIB for the UDP layer
@@ -619,6 +620,74 @@
 		__udp_checksum_complete(skb);
 }
 
+void udp_kvec_read_finish(struct sock *sk, kvec_cb_t cb, int len, struct sk_buff *skb)
+{
+  	struct sockaddr_in *sin = NULL;
+	int msg_flags = 0;
+  	int copied, err;
+
+	if (!skb)
+		BUG();
+
+  	copied = skb->len - sizeof(struct udphdr);
+	if (copied > len) {
+		copied = len;
+		msg_flags |= MSG_TRUNC;
+	}
+
+	err = 0;
+
+	if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
+		skb_copy_datagram_kvec(skb, sizeof(struct udphdr),
+					      cb.vec, copied);
+	} else if (msg_flags&MSG_TRUNC) {
+		err = -EAGAIN;
+		if (unlikely(__udp_checksum_complete(skb))) {
+			UDP_INC_STATS_BH(UdpInErrors);
+			goto out_free;
+		}
+		err = 0;
+		skb_copy_datagram_kvec(skb, sizeof(struct udphdr),
+					      cb.vec, copied);
+	} else {
+		err = skb_copy_and_csum_datagram_kvec(skb,
+					sizeof(struct udphdr), cb.vec, copied);
+	}
+
+	if (err)
+		goto out_free;
+
+	//sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin)
+	{
+		sin->sin_family = AF_INET;
+		sin->sin_port = skb->h.uh->source;
+		sin->sin_addr.s_addr = skb->nh.iph->saddr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+  	}
+	//if (sk->protinfo.af_inet.cmsg_flags)
+	//	ip_cmsg_recv(msg, skb);
+	err = copied;
+  
+out_free:
+  	skb_free_datagram(sk, skb);
+  	cb.fn(cb.data, cb.vec, err);
+	return;
+}
+
+static int udp_kvec_read(struct sock *sk, kvec_cb_t cb, int len)
+{
+	return skb_kvec_recv_datagram(sk, cb, len, udp_kvec_read_finish);
+}
+
+static int udp_kvec_write(struct sock *sk, kvec_cb_t cb, int len)
+{
+	return -EINVAL;		/* TODO: someone please write ;-) */
+}
+
+
 /*
  * 	This should be easy, if there is something there we
  * 	return it, otherwise we block.
@@ -1037,6 +1106,8 @@
 	getsockopt:	ip_getsockopt,
 	sendmsg:	udp_sendmsg,
 	recvmsg:	udp_recvmsg,
+	kvec_read:	udp_kvec_read,
+	kvec_write:	udp_kvec_write,
 	backlog_rcv:	udp_queue_rcv_skb,
 	hash:		udp_v4_hash,
 	unhash:		udp_v4_unhash,
diff -urN v2.4.19/net/khttpd/datasending.c aio-2.4.19.diff/net/khttpd/datasending.c
--- v2.4.19/net/khttpd/datasending.c	Mon Sep 24 02:16:05 2001
+++ aio-2.4.19.diff/net/khttpd/datasending.c	Mon Sep 16 21:54:13 2002
@@ -127,7 +127,7 @@
 				desc.count = ReadSize;
 				desc.buf = (char *) CurrentRequest->sock;
 				desc.error = 0;
-				do_generic_file_read(CurrentRequest->filp, ppos, &desc, sock_send_actor);
+				do_generic_file_read(CurrentRequest->filp, ppos, &desc, sock_send_actor, 0);
 				if (desc.written>0)
 				{	
 					CurrentRequest->BytesSent += desc.written;
diff -urN v2.4.19/net/socket.c aio-2.4.19.diff/net/socket.c
--- v2.4.19/net/socket.c	Fri Aug  9 13:50:47 2002
+++ aio-2.4.19.diff/net/socket.c	Mon Sep 16 21:54:13 2002
@@ -44,6 +44,7 @@
  *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
  *		Tigran Aivazian	:	Made listen(2) backlog sanity checks 
  *					protocol-independent
+ *		Benjamin LaHaise:	real aio support.
  *
  *
  *		This program is free software; you can redistribute it and/or
@@ -104,6 +105,8 @@
 			  unsigned long count, loff_t *ppos);
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
+static int sock_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos);
+static int sock_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos);
 
 
 /*
@@ -123,7 +126,11 @@
 	fasync:		sock_fasync,
 	readv:		sock_readv,
 	writev:		sock_writev,
-	sendpage:	sock_sendpage
+	sendpage:	sock_sendpage,
+	aio_read:	generic_sock_aio_read,
+	aio_write:	generic_file_aio_write,
+	kvec_read:	sock_kvec_read,
+	kvec_write:	sock_kvec_write,
 };
 
 /*
@@ -533,13 +540,14 @@
 static ssize_t sock_read(struct file *file, char *ubuf,
 			 size_t size, loff_t *ppos)
 {
+	int read_flags = 0;
 	struct socket *sock;
 	struct iovec iov;
 	struct msghdr msg;
 	int flags;
 
-	if (ppos != &file->f_pos)
-		return -ESPIPE;
+	if (read_flags & ~F_ATOMIC)
+		return -EINVAL;
 	if (size==0)		/* Match SYS5 behaviour */
 		return 0;
 
@@ -554,6 +562,8 @@
 	iov.iov_base=ubuf;
 	iov.iov_len=size;
 	flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+	if (read_flags & F_ATOMIC)
+		flags |= MSG_DONTWAIT;
 
 	return sock_recvmsg(sock, &msg, size, flags);
 }
@@ -567,12 +577,13 @@
 static ssize_t sock_write(struct file *file, const char *ubuf,
 			  size_t size, loff_t *ppos)
 {
+	int flags = 0;
 	struct socket *sock;
 	struct msghdr msg;
 	struct iovec iov;
-	
-	if (ppos != &file->f_pos)
-		return -ESPIPE;
+
+	if (flags & ~F_ATOMIC)
+		return -EINVAL;
 	if(size==0)		/* Match SYS5 behaviour */
 		return 0;
 
@@ -585,6 +596,8 @@
 	msg.msg_control=NULL;
 	msg.msg_controllen=0;
 	msg.msg_flags=!(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+	if (flags & F_ATOMIC)
+		msg.msg_flags = MSG_DONTWAIT;
 	if (sock->type == SOCK_SEQPACKET)
 		msg.msg_flags |= MSG_EOR;
 	iov.iov_base=(void *)ubuf;
@@ -611,6 +624,29 @@
 	return sock->ops->sendpage(sock, page, offset, size, flags);
 }
 
+static int sock_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	struct socket *sock;
+	sock = socki_lookup(file->f_dentry->d_inode);
+	if ((int)size < 0 || (size_t)(int)size != size)
+		return -EINVAL;
+	if (sock->ops->kvec_read)
+		return sock->ops->kvec_read(sock, cb, size);
+	return -EOPNOTSUPP;
+}
+
+static int sock_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+	struct socket *sock;
+	sock = socki_lookup(file->f_dentry->d_inode);
+	if ((int)size < 0 || (size_t)(int)size != size)
+		return -EINVAL;
+	if (sock->ops->kvec_write)
+		return sock->ops->kvec_write(sock, cb, size);
+	return -EOPNOTSUPP;
+}
+
+
 int sock_readv_writev(int type, struct inode * inode, struct file * file,
 		      const struct iovec * iov, long count, long size)
 {
