[patch] total-epoll ( aka full epoll support for poll() enabled devices )

Davide Libenzi (davidel@xmailserver.org)
Fri, 1 Nov 2002 21:13:13 -0800 (PST)


This is the first version of epoll that supports every device implementing
file->f_op->poll(). There are many differences from the original epoll;
basically, it's another patch. Features:

*) Full support for all devices that support file->f_op->poll()

*) Multi-thread support. The wait interface is changed to ( a user space
   usage sketch follows this list ):

int epoll_wait(int epfd, struct pollfd *events, int maxevents,
               int timeout);

*) Custom hooks removed everywhere. Only f_op->poll() is used ( a sketch of
   the new hook mechanism follows the notes below )

*) Yes, ... it drops an event into the ready list @ EP_CTL_ADD if the fd is
   already ready :)

*) Smaller

*) Faster on my http test; pipe performance not measured yet ( only basic
   functional tests done with pipes )
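
For reference, here is a minimal user space sketch of the new interface ( not
part of the patch ). The epoll_create()/epoll_ctl()/epoll_wait() wrappers are
assumed to be thin syscall stubs for the three syscalls added here, since no
libc support exists yet; error handling and socket setup are omitted.

#include <sys/poll.h>
#include <sys/socket.h>

/* EP_CTL_* opcodes, as defined by include/linux/eventpoll.h below */
#define EP_CTL_ADD 1
#define EP_CTL_DEL 2
#define EP_CTL_MOD 3

/* Assumed user space stubs for the new system calls */
int epoll_create(int maxfds);
int epoll_ctl(int epfd, int op, int fd, unsigned int events);
int epoll_wait(int epfd, struct pollfd *events, int maxevents, int timeout);

void event_loop(int lfd)
{
	struct pollfd events[128];
	int i, nfds, epfd;

	/* Interest set sized for up to 1024 descriptors */
	epfd = epoll_create(1024);

	/* The kernel always folds POLLERR | POLLHUP into the requested events */
	epoll_ctl(epfd, EP_CTL_ADD, lfd, POLLIN);

	for (;;) {
		/* Several threads may call this on the same epfd */
		nfds = epoll_wait(epfd, events, 128, -1);
		for (i = 0; i < nfds; i++) {
			if (events[i].fd == lfd) {
				int cfd = accept(lfd, NULL, NULL);
				epoll_ctl(epfd, EP_CTL_ADD, cfd, POLLIN);
			} else {
				/* do I/O according to events[i].revents */
			}
		}
	}
}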

The patch is working fine on my machine but it's very new code.
Comments and test reports will be very welcome ...
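
For readers who want the shape of the new mechanism before reading the diff,
here is a condensed sketch. It is illustrative only: ep_ptable_queue_proc()
and the data structures match the eventpoll.c changes below, while ep_watch()
is a made-up helper name for code that really lives inline in ep_insert(), and
locking plus error paths are dropped. The poll_table grows a "qproc" callback,
so when epoll calls the target file's f_op->poll(), the driver's poll_wait()
is routed to epoll instead of the usual select/poll queueing; epoll attaches
wait queue entries whose wakeup function ( ep_poll_callback() ) moves the item
onto the ready list.

/* Called back from poll_wait() through the new pt->qproc pointer. It hooks
 * one of the item's wait queue entries ( pre-initialized at insert time with
 * init_waitqueue_func_entry(..., ep_poll_callback) ) onto the device's wait
 * queue head. */
static void ep_ptable_queue_proc(void *priv, wait_queue_head_t *whead)
{
	struct epitem *dpi = priv;

	if (dpi->nwait < EP_MAX_POLL_QUEUE) {
		add_wait_queue(whead, &dpi->wait[dpi->nwait].wait);
		dpi->wait[dpi->nwait].whead = whead;
		dpi->nwait++;
	}
}

/* At EP_CTL_ADD time any file that implements f_op->poll() can be watched,
 * with no per-subsystem hooks needed. */
static int ep_watch(struct eventpoll *ep, struct epitem *dpi, struct file *tfile)
{
	poll_table pt;
	int revents;

	/* queue = 1, and queueing is routed through ep_ptable_queue_proc() */
	poll_initwait_ex(&pt, 1, ep_ptable_queue_proc, dpi);
	revents = tfile->f_op->poll(tfile, &pt);
	poll_freewait(&pt);

	/* If the file is already ready, ep_insert() also queues it on rdllist */
	return revents;
}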

- Davide

drivers/char/eventpoll.c  | 1140 +++++++++++++++++-----------------------------
fs/Makefile               |    4
fs/fcblist.c              |  146 -----
fs/file_table.c           |    6
fs/pipe.c                 |   36 -
fs/select.c               |    8
include/linux/eventpoll.h |   31 -
include/linux/fcblist.h   |   71 --
include/linux/fs.h        |    4
include/linux/pipe_fs_i.h |    4
include/linux/poll.h      |   18
include/net/sock.h        |   12
net/ipv4/tcp.c            |    4
13 files changed, 477 insertions, 1007 deletions

diff -Nru linux-2.5.45.vanilla/drivers/char/eventpoll.c linux-2.5.45.epoll/drivers/char/eventpoll.c
--- linux-2.5.45.vanilla/drivers/char/eventpoll.c Wed Oct 30 16:42:27 2002
+++ linux-2.5.45.epoll/drivers/char/eventpoll.c Fri Nov 1 20:47:49 2002
@@ -20,7 +20,6 @@
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
-#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/miscdevice.h>
@@ -31,7 +30,6 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
-#include <linux/fcblist.h>
#include <linux/rwsem.h>
#include <asm/bitops.h>
#include <asm/uaccess.h>
@@ -63,19 +61,44 @@
#define DPI_SLAB_DEBUG 0
#endif /* #if DEBUG_DPI != 0 */

-#define INITIAL_HASH_BITS 7
-#define MAX_HASH_BITS 18
-#define RESIZE_LENGTH 2

+/* How many pollfds fit in a page ? */
+#define EP_POLLFDS_X_PAGE (PAGE_SIZE / sizeof(struct pollfd))
+
+/* Maximum number of wait queues we can attach to */
+#define EP_MAX_POLL_QUEUE 2
+
+/* Number of hash entries ( "struct list_head" ) inside a page */
+#define EP_HENTRY_X_PAGE (PAGE_SIZE / sizeof(struct list_head))
+
+/* Maximum size of the hash in pages */
+#define EP_MAX_HPAGES (MAX_FDS_IN_EVENTPOLL / EP_HENTRY_X_PAGE + 1)
+
+/* Macro to allocate a "struct epitem" from the slab cache */
#define DPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(dpi_cache, SLAB_KERNEL)
+
+/* Macro to free a "struct epitem" to the slab cache */
#define DPI_MEM_FREE(p) kmem_cache_free(dpi_cache, p)
-#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)

+/* Fast test to see if the file is an eventpoll file */
+#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)

/*
- * Type used for versioning events snapshots inside the double buffer.
+ * Remove the item from the list and perform its initialization.
+ * This is useful for us because we can test if the item is linked
+ * using "EP_IS_LINKED(p)".
*/
-typedef unsigned long long event_version_t;
+#define EP_LIST_DEL(p) do { list_del(p); INIT_LIST_HEAD(p); } while (0)
+
+/* Tells us if the item is currently linked */
+#define EP_IS_LINKED(p) (!list_empty(p))
+
+/* Get the "struct epitem" from a wait queue pointer */
+#define EP_ITEM_FROM_WAIT(p) ((struct epitem *) container_of(p, struct eppoll_entry, wait)->base)
+
+
+
+

/*
* This structure is stored inside the "private_data" member of the file
@@ -83,10 +106,12 @@
* interface.
*/
struct eventpoll {
+ /* Used to link to the "struct eventpoll" list */
+ struct list_head llink;
+
/*
- * Protect the evenpoll interface from sys_epoll_ctl(2), ioctl(EP_POLL)
- * and ->write() concurrency. It basically serialize the add/remove/edit
- * of items in the interest set.
+ * Protect the eventpoll interface from sys_epoll_ctl(2) concurrency.
+ * It basically serializes the add/remove/edit of items in the interest set.
*/
struct rw_semaphore acsem;

@@ -97,57 +122,33 @@
*/
rwlock_t lock;

- /* Wait queue used by sys_epoll_wait() and ioctl(EP_POLL) */
+ /* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;

/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;

- /* This is the hash used to store the "struct epitem" elements */
- struct list_head *hash;
-
- unsigned int hbits;
- unsigned int hmask;
- atomic_t hents;
- atomic_t resize;
-
- /* Number of pages currently allocated in each side of the double buffer */
- int numpages;
-
- /*
- * Current page set pointer, switched from "pages0" and "pages1" each time
- * ep_poll() returns events to the caller.
- */
- char **pages;
+ /* List of ready file descriptors */
+ struct list_head rdllist;

- /* Each one of these contains the pages allocated for each side of
- * the double buffer.
- */
- char *pages0[MAX_EVENTPOLL_PAGES];
- char *pages1[MAX_EVENTPOLL_PAGES];
+ /* Size of the hash */
+ int hsize;

- /*
- * Variable containing the vma base address where the double buffer
- * pages are mapped onto.
- */
- unsigned long vmabase;
+ /* Number of pages currently allocated for the hash */
+ int nhpages;

- /*
- * Certain functions cannot be called if the double buffer pages are
- * not allocated and if the memory mapping is not in place. This tells
- * us that everything is setup to fully use the interface.
- */
- atomic_t mmapped;
+ /* Pages for the "struct epitem" hash */
+ char *hpages[EP_MAX_HPAGES];

- /* Number of events currently available inside the current snapshot */
- int eventcnt;
+ /* Page used to transfer events to user space */
+ char *eventpage;
+};

- /*
- * Variable storing the current "version" of the snapshot. It is used
- * to validate the validity of the current slot pointed by the "index"
- * member of a "struct epitem".
- */
- event_version_t ver;
+/* Wait structure used by the poll hooks */
+struct eppoll_entry {
+ void *base;
+ wait_queue_t wait;
+ wait_queue_head_t *whead;
};

/*
@@ -158,6 +159,15 @@
/* List header used to link this structure to the eventpoll hash */
struct list_head llink;

+ /* List header used to link this structure to the eventpoll ready list */
+ struct list_head rdllink;
+
+ /* Number of active wait queues attached to poll operations */
+ int nwait;
+
+ /* Wait queue used to attach poll operations */
+ struct eppoll_entry wait[EP_MAX_POLL_QUEUE];
+
/* The "container" of this item */
struct eventpoll *ep;

@@ -168,17 +178,10 @@
struct pollfd pfd;

/*
- * The index inside the current double buffer that stores the active
- * event slot for this item ( file ).
+ * Used to keep track of the usage count of the structure. This prevents
+ * the structure from disappearing from underneath our processing.
*/
- int index;
-
- /*
- * The version that is used to validate if the current slot is still
- * valid or if it refers to an old snapshot. It is matches togheter
- * with the one inside the eventpoll structure.
- */
- event_version_t ver;
+ atomic_t usecnt;
};

@@ -187,33 +190,35 @@
static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
static int ep_alloc_pages(char **pages, int numpages);
static int ep_free_pages(char **pages, int numpages);
-static int ep_init(struct eventpoll *ep);
+static int ep_file_init(struct file *file, int maxfds);
+static int ep_hash_index(struct eventpoll *ep, struct file *file);
+static struct list_head *ep_hash_entry(struct eventpoll *ep, int index);
+static int ep_init(struct eventpoll *ep, int hsize);
static void ep_free(struct eventpoll *ep);
-static struct epitem *ep_find_nl(struct eventpoll *ep, int fd);
-static struct epitem *ep_find(struct eventpoll *ep, int fd);
-static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags);
-static int ep_insert(struct eventpoll *ep, struct pollfd *pfd);
+static struct epitem *ep_find(struct eventpoll *ep, struct file *file);
+static void ep_use_epitem(struct epitem *dpi);
+static void ep_release_epitem(struct epitem *dpi);
+static void ep_ptable_queue_proc(void *priv, wait_queue_head_t *whead);
+static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfile);
+static int ep_unlink(struct eventpoll *ep, struct epitem *dpi);
static int ep_remove(struct eventpoll *ep, struct epitem *dpi);
-static void notify_proc(struct file *file, void *data, unsigned long *local,
- long *event);
-static int open_eventpoll(struct inode *inode, struct file *file);
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync);
static int close_eventpoll(struct inode *inode, struct file *file);
static unsigned int poll_eventpoll(struct file *file, poll_table *wait);
-static int write_eventpoll(struct file *file, const char *buffer, size_t count,
- loff_t *ppos);
-static int ep_poll(struct eventpoll *ep, struct evpoll *dvp);
-static int ep_do_alloc_pages(struct eventpoll *ep, int numpages);
-static int ioctl_eventpoll(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg);
-static void eventpoll_mm_open(struct vm_area_struct * vma);
-static void eventpoll_mm_close(struct vm_area_struct * vma);
-static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma);
+static int ep_events_transfer(struct eventpoll *ep, struct pollfd *events, int maxevents);
+static int ep_poll(struct eventpoll *ep, struct pollfd *events, int maxevents,
+ int timeout);
static int eventpollfs_delete_dentry(struct dentry *dentry);
static struct inode *get_eventpoll_inode(void);
static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
int flags, char *dev_name, void *data);

+/* Used to link together all the "struct eventpoll" */
+static struct list_head eplist;
+
+/* Serialize the access to "eplist" */
+static rwlock_t eplock;

/* Slab cache used to allocate "struct epitem" */
static kmem_cache_t *dpi_cache;
@@ -223,28 +228,10 @@

/* File callbacks that implement the eventpoll file behaviour */
static struct file_operations eventpoll_fops = {
- .write = write_eventpoll,
- .ioctl = ioctl_eventpoll,
- .mmap = mmap_eventpoll,
- .open = open_eventpoll,
.release = close_eventpoll,
.poll = poll_eventpoll
};

-/* Memory mapping callbacks for the eventpoll file */
-static struct vm_operations_struct eventpoll_mmap_ops = {
- .open = eventpoll_mm_open,
- .close = eventpoll_mm_close,
-};
-
-/*
- * The "struct miscdevice" is used to register the eventpoll device
- * to make it suitable to be openend from a /dev file.
- */
-static struct miscdevice eventpoll_miscdev = {
- EVENTPOLL_MINOR, "eventpoll", &eventpoll_fops
-};
-
/*
* This is used to register the virtual file system from where
* eventpoll inodes are allocated.
@@ -262,17 +249,35 @@

+void ep_notify_file_close(struct file *file)
+{
+ unsigned long flags;
+ struct list_head *lnk;
+ struct eventpoll *ep;
+ struct epitem *dpi;
+
+ read_lock_irqsave(&eplock, flags);
+ list_for_each(lnk, &eplist) {
+ ep = list_entry(lnk, struct eventpoll, llink);
+
+ if ((dpi = ep_find(ep, file))) {
+ ep_remove(ep, dpi);
+ ep_release_epitem(dpi);
+ }
+ }
+ read_unlock_irqrestore(&eplock, flags);
+}
+
+
/*
* It opens an eventpoll file descriptor by allocating space for "maxfds"
* file descriptors. It is the kernel part of the userspace epoll_create(2).
*/
asmlinkage int sys_epoll_create(int maxfds)
{
- int error = -EINVAL, fd;
- unsigned long addr;
+ int error, fd;
struct inode *inode;
struct file *file;
- struct eventpoll *ep;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
current, maxfds));
@@ -281,6 +286,7 @@
* It is not possible to store more than MAX_FDS_IN_EVENTPOLL file
* descriptors inside the eventpoll interface.
*/
+ error = -EINVAL;
if (maxfds > MAX_FDS_IN_EVENTPOLL)
goto eexit_1;

@@ -292,36 +298,11 @@
if (error)
goto eexit_1;

- /*
- * Calls the code to initialize the eventpoll file. This code is
- * the same as the "open" file operation callback because inside
- * ep_getfd() we did what the kernel usually does before invoking
- * corresponding file "open" callback.
- */
- error = open_eventpoll(inode, file);
+ /* Setup the file internal data structure ( "struct eventpoll" ) */
+ error = ep_file_init(file, maxfds);
if (error)
goto eexit_2;

- /* The "private_data" member is setup by open_eventpoll() */
- ep = file->private_data;
-
- /* Alloc pages for the event double buffer */
- error = ep_do_alloc_pages(ep, EP_FDS_PAGES(maxfds + 1));
- if (error)
- goto eexit_2;
-
- /*
- * Create a user space mapping of the event double buffer to
- * avoid kernel to user space memory copy when returning events
- * to the caller.
- */
- down_write(&current->mm->mmap_sem);
- addr = do_mmap_pgoff(file, 0, EP_MAP_SIZE(maxfds + 1), PROT_READ,
- MAP_PRIVATE, 0);
- up_write(&current->mm->mmap_sem);
- error = PTR_ERR((void *) addr);
- if (IS_ERR((void *) addr))
- goto eexit_2;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, maxfds, fd));
@@ -344,8 +325,8 @@
*/
asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events)
{
- int error = -EBADF;
- struct file *file;
+ int error;
+ struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *dpi;
struct pollfd pfd;
@@ -353,17 +334,29 @@
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u)\n",
current, epfd, op, fd, events));

+ /* Get the "struct file *" for the eventpoll file */
+ error = -EBADF;
file = fget(epfd);
if (!file)
goto eexit_1;

+ /* Get the "struct file *" for the target file */
+ tfile = fget(fd);
+ if (!tfile)
+ goto eexit_2;
+
+ /* The target file descriptor must support poll */
+ error = -EPERM;
+ if (!tfile->f_op || !tfile->f_op->poll)
+ goto eexit_3;
+
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
if (!IS_FILE_EPOLL(file))
- goto eexit_2;
+ goto eexit_3;

/*
* At this point it is safe to assume that the "private_data" contains
@@ -371,20 +364,24 @@
*/
ep = file->private_data;

+ /*
+ * We need to hold this in write mode before doing operations that
+ * add/remove/change file descriptor inside the interest set.
+ */
down_write(&ep->acsem);

- pfd.fd = fd;
- pfd.events = events | POLLERR | POLLHUP;
- pfd.revents = 0;
-
- dpi = ep_find(ep, fd);
+ dpi = ep_find(ep, tfile);

error = -EINVAL;
switch (op) {
case EP_CTL_ADD:
- if (!dpi)
- error = ep_insert(ep, &pfd);
- else
+ if (!dpi) {
+ pfd.fd = fd;
+ pfd.events = events | POLLERR | POLLHUP;
+ pfd.revents = 0;
+
+ error = ep_insert(ep, &pfd, tfile);
+ } else
error = -EEXIST;
break;
case EP_CTL_DEL:
@@ -402,11 +399,20 @@
break;
}

+ /*
+ * The function ep_find() increments the usage count of the structure
+ * so, if this is not NULL, we need to release it.
+ */
+ if (dpi)
+ ep_release_epitem(dpi);
+
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u) = %d\n",
current, epfd, op, fd, events, error));

up_write(&ep->acsem);

+eexit_3:
+ fput(tfile);
eexit_2:
fput(file);
eexit_1:
@@ -418,17 +424,22 @@
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
-asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout)
+asmlinkage int sys_epoll_wait(int epfd, struct pollfd *events, int maxevents,
+ int timeout)
{
- int error = -EBADF;
- void *eaddr;
+ int error;
struct file *file;
struct eventpoll *ep;
- struct evpoll dvp;

- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d)\n",
- current, epfd, events, timeout));
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
+ current, epfd, events, maxevents, timeout));
+
+ /* Verify that the area passed by the user is writeable */
+ if ((error = verify_area(VERIFY_WRITE, events, maxevents * sizeof(struct pollfd))))
+ goto eexit_1;

+ /* Get the "struct file *" for the eventpoll file */
+ error = -EBADF;
file = fget(epfd);
if (!file)
goto eexit_1;
@@ -447,26 +458,14 @@
*/
ep = file->private_data;

- /*
- * It is possible that the user created an eventpoll file by open()ing
- * the corresponding /dev/ file and he did not perform the correct
- * initialization required by the old /dev/epoll interface. This test
- * protect us from this scenario.
- */
- error = -EINVAL;
- if (!atomic_read(&ep->mmapped))
- goto eexit_2;
+ down_write(&ep->acsem);

- dvp.ep_timeout = timeout;
- error = ep_poll(ep, &dvp);
- if (error > 0) {
- eaddr = (void *) (ep->vmabase + dvp.ep_resoff);
- if (copy_to_user(events, &eaddr, sizeof(struct pollfd *)))
- error = -EFAULT;
- }
+ error = ep_poll(ep, events, maxevents, timeout);

- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d) = %d\n",
- current, epfd, events, timeout, error));
+ up_write(&ep->acsem);
+
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
+ current, epfd, events, maxevents, timeout, error));

eexit_2:
fput(file);
@@ -527,9 +526,9 @@
* with write() to add/remove/change interest sets.
*/
file->f_pos = 0;
- file->f_flags = O_RDWR;
+ file->f_flags = O_RDONLY;
file->f_op = &eventpoll_fops;
- file->f_mode = FMODE_READ | FMODE_WRITE;
+ file->f_mode = FMODE_READ;
file->f_version = 0;
file->private_data = NULL;

@@ -548,7 +547,7 @@
eexit_2:
put_filp(file);
eexit_1:
- return error;
+ return error;
}

@@ -583,317 +582,316 @@
}

-static int ep_init(struct eventpoll *ep)
+static int ep_file_init(struct file *file, int maxfds)
{
- int ii, hentries;
+ int error;
+ unsigned long flags;
+ struct eventpoll *ep;

+ if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
+ return -ENOMEM;
+
+ memset(ep, 0, sizeof(*ep));
+
+ error = ep_init(ep, maxfds);
+ if (error) {
+ kfree(ep);
+ return error;
+ }
+
+ file->private_data = ep;
+
+ /* Add the structure to the linked list that links "struct eventpoll" */
+ write_lock_irqsave(&eplock, flags);
+ list_add(&ep->llink, &eplist);
+ write_unlock_irqrestore(&eplock, flags);
+
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
+ current, ep));
+ return 0;
+}
+
+
+static int ep_hash_index(struct eventpoll *ep, struct file *file)
+{
+
+ return (int) ((((unsigned long) file) / sizeof(struct file)) % ep->hsize);
+}
+
+
+static struct list_head *ep_hash_entry(struct eventpoll *ep, int index)
+{
+
+ return (struct list_head *) (ep->hpages[index / EP_HENTRY_X_PAGE] +
+ (index % EP_HENTRY_X_PAGE) * sizeof(struct list_head));
+}
+
+
+static int ep_init(struct eventpoll *ep, int hsize)
+{
+ int error, i;
+
+ INIT_LIST_HEAD(&ep->llink);
init_rwsem(&ep->acsem);
rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
- ep->hbits = INITIAL_HASH_BITS;
- ep->hmask = (1 << ep->hbits) - 1;
- atomic_set(&ep->hents, 0);
- atomic_set(&ep->resize, 0);
- atomic_set(&ep->mmapped, 0);
- ep->numpages = 0;
- ep->vmabase = 0;
- ep->pages = ep->pages0;
- ep->eventcnt = 0;
- ep->ver = 1;
+ INIT_LIST_HEAD(&ep->rdllist);

- hentries = ep->hmask + 1;
- if (!(ep->hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head))))
+ ep->eventpage = (char *) __get_free_pages(GFP_KERNEL, 0);
+ if (!ep->eventpage)
return -ENOMEM;
+ SetPageReserved(virt_to_page(ep->eventpage));
+
+ ep->hsize = hsize;
+ ep->nhpages = hsize / EP_HENTRY_X_PAGE + 1;
+ error = ep_alloc_pages(ep->hpages, ep->nhpages);
+ if (error)
+ goto eexit_1;

- for (ii = 0; ii < hentries; ii++)
- INIT_LIST_HEAD(&ep->hash[ii]);
+ for (i = 0; i < ep->hsize; i++)
+ INIT_LIST_HEAD(ep_hash_entry(ep, i));

return 0;
+eexit_1:
+ ClearPageReserved(virt_to_page(ep->eventpage));
+ free_pages((unsigned long) ep->eventpage, 0);
+ return error;
}

static void ep_free(struct eventpoll *ep)
{
- int ii;
+ int i;
+ unsigned long flags;
struct list_head *lsthead;

+ /* Remove the structure from the linked list that links "struct eventpoll" */
+ write_lock_irqsave(&eplock, flags);
+ list_del(&ep->llink);
+ write_unlock_irqrestore(&eplock, flags);
+
/*
* Walks through the whole hash by unregistering file callbacks and
* freeing each "struct epitem".
*/
- for (ii = 0; ii <= ep->hmask; ii++) {
- lsthead = &ep->hash[ii];
+ for (i = 0; i < ep->hsize; i++) {
+ lsthead = ep_hash_entry(ep, i);
while (!list_empty(lsthead)) {
struct epitem *dpi = list_entry(lsthead->next, struct epitem, llink);

- file_notify_delcb(dpi->file, notify_proc);
- list_del(lsthead->next);
- DPI_MEM_FREE(dpi);
+ ep_remove(ep, dpi);
}
}
- /*
- * At this point we can free the hash and the pages used for the event
- * double buffer. The ep_free() function is called from the "close"
- * file operations callback, and this garanties us that the pages are
- * already unmapped.
- */
- vfree(ep->hash);
- if (ep->numpages > 0) {
- ep_free_pages(ep->pages0, ep->numpages);
- ep_free_pages(ep->pages1, ep->numpages);
- }
+ /* Free hash pages */
+ if (ep->nhpages > 0)
+ ep_free_pages(ep->hpages, ep->nhpages);
+
+ /* Free the event buffer page */
+ ClearPageReserved(virt_to_page(ep->eventpage));
+ free_pages((unsigned long) ep->eventpage, 0);
}

-/*
- * No lock version of ep_find(), used when the code had to acquire the lock
- * before calling the function.
- */
-static struct epitem *ep_find_nl(struct eventpoll *ep, int fd)
+static struct epitem *ep_find(struct eventpoll *ep, struct file *file)
{
- struct epitem *dpi = NULL;
+ unsigned long flags;
struct list_head *lsthead, *lnk;
+ struct epitem *dpi = NULL;
+
+ read_lock_irqsave(&ep->lock, flags);

- lsthead = &ep->hash[fd & ep->hmask];
+ lsthead = ep_hash_entry(ep, ep_hash_index(ep, file));
list_for_each(lnk, lsthead) {
dpi = list_entry(lnk, struct epitem, llink);

- if (dpi->pfd.fd == fd) break;
+ if (dpi->file == file) {
+ ep_use_epitem(dpi);
+ break;
+ }
dpi = NULL;
}

- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%d) -> %p\n",
- current, fd, dpi));
+ read_unlock_irqrestore(&ep->lock, flags);
+
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
+ current, file, dpi));

return dpi;
}

-static struct epitem *ep_find(struct eventpoll *ep, int fd)
+/*
+ * Increment the usage count of the "struct epitem" making sure
+ * that the user will have a valid pointer to reference.
+ */
+static void ep_use_epitem(struct epitem *dpi)
{
- struct epitem *dpi;
- unsigned long flags;
-
- read_lock_irqsave(&ep->lock, flags);
-
- dpi = ep_find_nl(ep, fd);

- read_unlock_irqrestore(&ep->lock, flags);
-
- return dpi;
+ atomic_inc(&dpi->usecnt);
}

-static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags)
+/*
+ * Decrement ( release ) the usage count by signaling that the user
+ * has finished using the structure. It might lead to freeing the
+ * structure itself if the count goes to zero.
+ */
+static void ep_release_epitem(struct epitem *dpi)
{
- struct list_head *hash, *oldhash;
- unsigned int hbits = ep->hbits + 1;
- unsigned int hmask = (1 << hbits) - 1;
- int ii, res, hentries = hmask + 1;
- unsigned long flags = *kflags;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_hashresize(%p) bits=%u\n",
- current, ep, hbits));

- write_unlock_irqrestore(&ep->lock, flags);
-
- res = -ENOMEM;
- if (!(hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) {
- write_lock_irqsave(&ep->lock, flags);
- goto eexit_1;
- }
-
- for (ii = 0; ii < hentries; ii++)
- INIT_LIST_HEAD(&hash[ii]);
-
- write_lock_irqsave(&ep->lock, flags);
+ if (atomic_dec_and_test(&dpi->usecnt))
+ DPI_MEM_FREE(dpi);
+}

- oldhash = ep->hash;
- for (ii = 0; ii <= ep->hmask; ii++) {
- struct list_head *oldhead = &oldhash[ii], *lnk;

- while (!list_empty(oldhead)) {
- struct epitem *dpi = list_entry(lnk = oldhead->next, struct epitem, llink);
+static void ep_ptable_queue_proc(void *priv, wait_queue_head_t *whead)
+{
+ struct epitem *dpi = priv;

- list_del(lnk);
- list_add(lnk, &hash[dpi->pfd.fd & hmask]);
- }
+ if (dpi->nwait < EP_MAX_POLL_QUEUE) {
+ add_wait_queue(whead, &dpi->wait[dpi->nwait].wait);
+ dpi->wait[dpi->nwait].whead = whead;
+ dpi->nwait++;
}
-
- ep->hash = hash;
- ep->hbits = hbits;
- ep->hmask = hmask;
-
- write_unlock_irqrestore(&ep->lock, flags);
- vfree(oldhash);
- write_lock_irqsave(&ep->lock, flags);
-
- res = 0;
-eexit_1:
- *kflags = flags;
- atomic_dec(&ep->resize);
- return res;
}

-static int ep_insert(struct eventpoll *ep, struct pollfd *pfd)
+static int ep_insert(struct eventpoll *ep, struct pollfd *pfd, struct file *tfile)
{
- int error;
- struct epitem *dpi;
- struct file *file;
+ int error, i, revents;
unsigned long flags;
-
- if (atomic_read(&ep->hents) >= (ep->numpages * POLLFD_X_PAGE))
- return -E2BIG;
-
- file = fget(pfd->fd);
- if (!file)
- return -EBADF;
+ struct epitem *dpi;
+ poll_table pt;

error = -ENOMEM;
if (!(dpi = DPI_MEM_ALLOC()))
goto eexit_1;

+ /* Item initialization follow here ... */
INIT_LIST_HEAD(&dpi->llink);
+ INIT_LIST_HEAD(&dpi->rdllink);
dpi->ep = ep;
- dpi->file = file;
+ dpi->file = tfile;
dpi->pfd = *pfd;
- dpi->index = -1;
- dpi->ver = ep->ver - 1;
+ atomic_set(&dpi->usecnt, 1);
+ dpi->nwait = 0;
+ for (i = 0; i < EP_MAX_POLL_QUEUE; i++) {
+ init_waitqueue_func_entry(&dpi->wait[i].wait, ep_poll_callback);
+ dpi->wait[i].whead = NULL;
+ dpi->wait[i].base = dpi;
+ }
+
+ /* Attach the item to the poll hooks */
+ poll_initwait_ex(&pt, 1, ep_ptable_queue_proc, dpi);

+ revents = tfile->f_op->poll(tfile, &pt);
+
+ poll_freewait(&pt);
+
+ /* We have to drop the new item inside our item list to keep track of it */
write_lock_irqsave(&ep->lock, flags);

- list_add(&dpi->llink, &ep->hash[pfd->fd & ep->hmask]);
- atomic_inc(&ep->hents);
+ list_add(&dpi->llink, ep_hash_entry(ep, ep_hash_index(ep, tfile)));

- if (!atomic_read(&ep->resize) &&
- (atomic_read(&ep->hents) >> ep->hbits) > RESIZE_LENGTH &&
- ep->hbits < MAX_HASH_BITS) {
- atomic_inc(&ep->resize);
- ep_hashresize(ep, &flags);
- }
+ /* If the file is already "ready" we drop it inside the ready list */
+ if (revents & pfd->events)
+ list_add(&dpi->rdllink, &ep->rdllist);

write_unlock_irqrestore(&ep->lock, flags);

- file_notify_addcb(file, notify_proc, dpi);
-
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %d)\n",
current, ep, pfd->fd));

- error = 0;
-eexit_1:
- fput(file);
+ return 0;

+eexit_1:
return error;
}

-/*
- * Removes a "struct epitem" from the eventpoll hash and deallocates
- * all the associated resources.
- */
-static int ep_remove(struct eventpoll *ep, struct epitem *dpi)
+static int ep_unlink(struct eventpoll *ep, struct epitem *dpi)
{
+ int i;
unsigned long flags;
- struct pollfd *pfd, *lpfd;
- struct epitem *ldpi;
-
- /* First, removes the callback from the file callback list */
- file_notify_delcb(dpi->file, notify_proc);

write_lock_irqsave(&ep->lock, flags);

- list_del(&dpi->llink);
- atomic_dec(&ep->hents);
+ /*
+ * It can happen that this one is called for an item already unlinked.
+ * The check protects us from doing a double unlink ( crash ).
+ */
+ if (!EP_IS_LINKED(&dpi->llink))
+ goto not_linked;

/*
- * This is to remove stale events. We don't want that the removed file
- * has a pending event that might be associated with a file inserted
- * at a later time inside the eventpoll interface. this code checks
- * if the currently removed file has a valid pending event and, if it does,
- * manages things to remove it and decrement the currently available
- * event count.
+ * At this point it is safe to do the job, decrement the number of file
+ * descriptors stored inside the interest set and unlink the item
+ * from our list.
*/
- if (dpi->index >= 0 && dpi->ver == ep->ver && dpi->index < ep->eventcnt) {
- pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
- EVENT_PAGE_OFFSET(dpi->index));
- if (pfd->fd == dpi->pfd.fd && dpi->index < --ep->eventcnt) {
- lpfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(ep->eventcnt)] +
- EVENT_PAGE_OFFSET(ep->eventcnt));
- *pfd = *lpfd;
+ EP_LIST_DEL(&dpi->llink);

- if ((ldpi = ep_find_nl(ep, pfd->fd))) ldpi->index = dpi->index;
- }
- }
+ /* Removes poll hooks */
+ for (i = 0; i < dpi->nwait; i++)
+ remove_wait_queue(dpi->wait[i].whead, &dpi->wait[i].wait);
+
+ /*
+ * If the item we are going to remove is inside the ready file descriptors
+ * we want to remove it from this list to avoid stale events.
+ */
+ if (EP_IS_LINKED(&dpi->rdllink))
+ EP_LIST_DEL(&dpi->rdllink);

+not_linked:
write_unlock_irqrestore(&ep->lock, flags);

- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %d)\n",
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %d)\n",
current, ep, dpi->pfd.fd));

- /* At this point it is safe to free the eventpoll item */
- DPI_MEM_FREE(dpi);
-
return 0;
}

/*
- * This is the event notify callback that is called from fs/fcblist.c because
- * of the registration ( file_notify_addcb() ) done in ep_insert().
+ * Removes a "struct epitem" from the eventpoll hash and deallocates
+ * all the associated resources.
*/
-static void notify_proc(struct file *file, void *data, unsigned long *local,
- long *event)
+static int ep_remove(struct eventpoll *ep, struct epitem *dpi)
{
- struct epitem *dpi = data;
- struct eventpoll *ep = dpi->ep;
- struct pollfd *pfd;
+ int error;

- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: notify(%p, %p, %ld, %ld) ep=%p\n",
- current, file, data, event[0], event[1], ep));
+ /* Really unlink the item from the hash */
+ error = ep_unlink(ep, dpi);
+ if (error)
+ goto eexit_1;

- /*
- * We don't need to disable IRQs here because the callback dispatch
- * routine inside fs/fcblist.c already call us with disabled IRQ.
- */
- write_lock(&ep->lock);
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %d)\n",
+ current, ep, dpi->pfd.fd));

- /* We're not expecting any of those events. Jump out soon ... */
- if (!(dpi->pfd.events & event[1]))
- goto out;
+ /* At this point it is safe to free the eventpoll item */
+ ep_release_epitem(dpi);

- /*
- * This logic determins if an active even slot is available for the
- * currently signaled file, or if we have to make space for a new one
- * and increment the number of ready file descriptors ( ep->eventcnt ).
- */
- if (dpi->index < 0 || dpi->ver != ep->ver) {
- if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE))
- goto out;
- dpi->index = ep->eventcnt++;
- dpi->ver = ep->ver;
- pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
- EVENT_PAGE_OFFSET(dpi->index));
- *pfd = dpi->pfd;
- } else {
- pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
- EVENT_PAGE_OFFSET(dpi->index));
- if (pfd->fd != dpi->pfd.fd) {
- if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE))
- goto out;
- dpi->index = ep->eventcnt++;
- pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] +
- EVENT_PAGE_OFFSET(dpi->index));
- *pfd = dpi->pfd;
- }
- }
+ error = 0;
+eexit_1:
+ return error;
+}

- /*
- * Merge event bits into the corresponding event slot inside the
- * double buffer.
- */
- pfd->revents |= (pfd->events & event[1]);
+
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync)
+{
+ unsigned long flags;
+ struct epitem *dpi = EP_ITEM_FROM_WAIT(wait);
+ struct eventpoll *ep = dpi->ep;
+
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) dpi=%p ep=%p\n",
+ current, dpi->file, dpi, ep));
+
+ write_lock_irqsave(&ep->lock, flags);
+
+ if (EP_IS_LINKED(&dpi->rdllink))
+ goto is_linked;
+
+ list_add(&dpi->rdllink, &ep->rdllist);

/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
@@ -903,29 +901,9 @@
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
wake_up(&ep->poll_wait);
-out:
- write_unlock(&ep->lock);
-}
-
-
-static int open_eventpoll(struct inode *inode, struct file *file)
-{
- int res;
- struct eventpoll *ep;
-
- if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
- return -ENOMEM;
-
- memset(ep, 0, sizeof(*ep));
- if ((res = ep_init(ep))) {
- kfree(ep);
- return res;
- }
-
- file->private_data = ep;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: open() ep=%p\n", current, ep));
- return 0;
+is_linked:
+ write_unlock_irqrestore(&ep->lock, flags);
+ return 1;
}

@@ -948,98 +926,83 @@
struct eventpoll *ep = file->private_data;

poll_wait(file, &ep->poll_wait, wait);
- if (ep->eventcnt)
+ if (!list_empty(&ep->rdllist))
return POLLIN | POLLRDNORM;

return 0;
}

-static int write_eventpoll(struct file *file, const char *buffer, size_t count,
- loff_t *ppos)
+/*
+ * Perform the transfer of events to user space. Optimize the copy by
+ * caching one page of events at a time and then copying it to user space.
+ */
+static int ep_events_transfer(struct eventpoll *ep, struct pollfd *events, int maxevents)
{
- int rcount;
- struct eventpoll *ep = file->private_data;
- struct epitem *dpi;
- struct pollfd pfd;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d)\n", current, ep, count));
+ int eventcnt, pagefill, revents;
+ unsigned long flags;
+ struct list_head *lsthead = &ep->rdllist;
+ struct pollfd *pgevents = (struct pollfd *) ep->eventpage;
+ poll_table pt;

- /* The size of the write must be a multiple of sizeof(struct pollfd) */
- rcount = -EINVAL;
- if (count % sizeof(struct pollfd))
- goto eexit_1;
+ /* Initialize a non-queueing poll table, used only to read revents */
+ poll_initwait_ex(&pt, 0, NULL, NULL);

- /*
- * And we have also to verify that that area is correctly accessible
- * for the user.
- */
- if ((rcount = verify_area(VERIFY_READ, buffer, count)))
- goto eexit_1;
-
- down_write(&ep->acsem);
+ write_lock_irqsave(&ep->lock, flags);

- rcount = 0;
+ for (eventcnt = 0, pagefill = 0; eventcnt < maxevents && !list_empty(lsthead);) {
+ struct epitem *dpi = list_entry(lsthead->next, struct epitem, rdllink);

- while (count > 0) {
- if (__copy_from_user(&pfd, buffer, sizeof(pfd))) {
- rcount = -EFAULT;
- goto eexit_2;
- }
+ EP_LIST_DEL(&dpi->rdllink);

- dpi = ep_find(ep, pfd.fd);
+ revents = dpi->file->f_op->poll(dpi->file, &pt);
+ if (revents & dpi->pfd.events) {
+ pgevents[pagefill] = dpi->pfd;
+ pgevents[pagefill].revents = revents & pgevents[pagefill].events;
+ pagefill++;
+ if (pagefill == EP_POLLFDS_X_PAGE) {
+ write_unlock_irqrestore(&ep->lock, flags);
+
+ if (__copy_to_user(&events[eventcnt], pgevents,
+ pagefill * sizeof(struct pollfd))) {
+ poll_freewait(&pt);
+ return -EFAULT;
+ }
+ eventcnt += pagefill;
+ pagefill = 0;

- if (pfd.fd >= current->files->max_fds || !current->files->fd[pfd.fd])
- pfd.events = POLLREMOVE;
- if (pfd.events & POLLREMOVE) {
- if (dpi) {
- ep_remove(ep, dpi);
- rcount += sizeof(pfd);
+ write_lock_irqsave(&ep->lock, flags);
}
}
- else if (dpi) {
- dpi->pfd.events = pfd.events;
- rcount += sizeof(pfd);
- } else {
- pfd.revents = 0;
- if (!ep_insert(ep, &pfd))
- rcount += sizeof(pfd);
- }
+ }
+ write_unlock_irqrestore(&ep->lock, flags);

- buffer += sizeof(pfd);
- count -= sizeof(pfd);
+ if (pagefill) {
+ if (__copy_to_user(&events[eventcnt], pgevents,
+ pagefill * sizeof(struct pollfd)))
+ eventcnt = -EFAULT;
+ else
+ eventcnt += pagefill;
}

-eexit_2:
- up_write(&ep->acsem);
-eexit_1:
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d) = %d\n",
- current, ep, count, rcount));
+ poll_freewait(&pt);

- return rcount;
+ return eventcnt;
}

-static int ep_poll(struct eventpoll *ep, struct evpoll *dvp)
+static int ep_poll(struct eventpoll *ep, struct pollfd *events, int maxevents,
+ int timeout)
{
int res = 0;
- long timeout;
unsigned long flags;
+ long jtimeout;
wait_queue_t wait;

- /*
- * We don't want ep_poll() to be called if the correct sequence
- * of operations are performed to initialize it. This won't happen
- * for the system call interface but it could happen using the
- * old /dev/epoll interface, that is maintained for compatibility.
- */
- if (!atomic_read(&ep->mmapped))
- return -EINVAL;
-
write_lock_irqsave(&ep->lock, flags);

res = 0;
- if (!ep->eventcnt) {
+ if (list_empty(&ep->rdllist)) {
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
@@ -1053,8 +1016,8 @@
* and the overflow condition ( > MAX_SCHEDULE_TIMEOUT / HZ ). The
* passed timeout is in milliseconds, that why (t * HZ) / 1000.
*/
- timeout = dvp->ep_timeout == -1 || dvp->ep_timeout > MAX_SCHEDULE_TIMEOUT / HZ ?
- MAX_SCHEDULE_TIMEOUT: (dvp->ep_timeout * HZ) / 1000;
+ jtimeout = timeout == -1 || timeout > MAX_SCHEDULE_TIMEOUT / HZ ?
+ MAX_SCHEDULE_TIMEOUT: (timeout * HZ) / 1000;

for (;;) {
/*
@@ -1063,7 +1026,7 @@
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
- if (ep->eventcnt || !timeout)
+ if (!list_empty(&ep->rdllist) || !jtimeout)
break;
if (signal_pending(current)) {
res = -EINTR;
@@ -1071,7 +1034,7 @@
}

write_unlock_irqrestore(&ep->lock, flags);
- timeout = schedule_timeout(timeout);
+ jtimeout = schedule_timeout(jtimeout);
write_lock_irqsave(&ep->lock, flags);
}
remove_wait_queue(&ep->wq, &wait);
@@ -1079,249 +1042,12 @@
set_current_state(TASK_RUNNING);
}

- /*
- * If we've been wake up because of events became available, we need to:
- *
- * 1) null the number of available ready file descriptors
- * 2) increment the version of the current ( next ) snapshot
- * 3) swap the double buffer to return the current one to the caller
- * 4) set the current ( for the user, previous for the interface ) offset
- */
- if (!res && ep->eventcnt) {
- res = ep->eventcnt;
- ep->eventcnt = 0;
- ++ep->ver;
- if (ep->pages == ep->pages0) {
- ep->pages = ep->pages1;
- dvp->ep_resoff = 0;
- } else {
- ep->pages = ep->pages0;
- dvp->ep_resoff = ep->numpages * PAGE_SIZE;
- }
- }
-
write_unlock_irqrestore(&ep->lock, flags);

- return res;
-}
-
+ /* Transfer events to user space */
+ if (!list_empty(&ep->rdllist))
+ res = ep_events_transfer(ep, events, maxevents);

-static int ep_do_alloc_pages(struct eventpoll *ep, int numpages)
-{
- int res, pgalloc, pgcpy;
- unsigned long flags;
- char **pages, **pages0, **pages1;
-
- if (atomic_read(&ep->mmapped))
- return -EBUSY;
- if (numpages > MAX_EVENTPOLL_PAGES)
- return -EINVAL;
-
- pgalloc = numpages - ep->numpages;
- if ((pages = (char **) vmalloc(2 * (pgalloc + 1) * sizeof(char *))) == NULL)
- return -ENOMEM;
- pages0 = &pages[0];
- pages1 = &pages[pgalloc + 1];
-
- if ((res = ep_alloc_pages(pages0, pgalloc)))
- goto eexit_1;
-
- if ((res = ep_alloc_pages(pages1, pgalloc))) {
- ep_free_pages(pages0, pgalloc);
- goto eexit_1;
- }
-
- write_lock_irqsave(&ep->lock, flags);
- pgcpy = (ep->numpages + pgalloc) > numpages ? numpages - ep->numpages: pgalloc;
- if (pgcpy > 0) {
- memcpy(&ep->pages0[ep->numpages], pages0, pgcpy * sizeof(char *));
- memcpy(&ep->pages1[ep->numpages], pages1, pgcpy * sizeof(char *));
- ep->numpages += pgcpy;
- }
- write_unlock_irqrestore(&ep->lock, flags);
-
- if (pgcpy < pgalloc) {
- if (pgcpy < 0)
- pgcpy = 0;
- ep_free_pages(&pages0[pgcpy], pgalloc - pgcpy);
- ep_free_pages(&pages1[pgcpy], pgalloc - pgcpy);
- }
-
-eexit_1:
- vfree(pages);
- return res;
-}
-
-
-static int ioctl_eventpoll(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- int res;
- struct eventpoll *ep = file->private_data;
- struct epitem *dpi;
- unsigned long flags;
- struct pollfd pfd;
- struct evpoll dvp;
-
- switch (cmd) {
- case EP_ALLOC:
- res = ep_do_alloc_pages(ep, EP_FDS_PAGES(arg));
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ALLOC, %lu) == %d\n",
- current, ep, arg, res));
- return res;
-
- case EP_FREE:
- if (atomic_read(&ep->mmapped))
- return -EBUSY;
-
- res = -EINVAL;
- write_lock_irqsave(&ep->lock, flags);
- if (ep->numpages > 0) {
- ep_free_pages(ep->pages0, ep->numpages);
- ep_free_pages(ep->pages1, ep->numpages);
- ep->numpages = 0;
- ep->pages = ep->pages0;
- res = 0;
- }
- write_unlock_irqrestore(&ep->lock, flags);
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_FREE) == %d\n",
- current, ep, res));
- return res;
-
- case EP_POLL:
- if (copy_from_user(&dvp, (void *) arg, sizeof(struct evpoll)))
- return -EFAULT;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d)\n",
- current, ep, dvp.ep_timeout));
-
- res = ep_poll(ep, &dvp);
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d) == %d\n",
- current, ep, dvp.ep_timeout, res));
-
- if (res > 0 && copy_to_user((void *) arg, &dvp, sizeof(struct evpoll)))
- res = -EFAULT;
-
- return res;
-
- case EP_ISPOLLED:
- if (copy_from_user(&pfd, (void *) arg, sizeof(struct pollfd)))
- return 0;
-
- read_lock_irqsave(&ep->lock, flags);
-
- res = 0;
- if (!(dpi = ep_find_nl(ep, pfd.fd)))
- goto is_not_polled;
-
- pfd = dpi->pfd;
- res = 1;
-
- is_not_polled:
- read_unlock_irqrestore(&ep->lock, flags);
-
- if (res)
- copy_to_user((void *) arg, &pfd, sizeof(struct pollfd));
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ISPOLLED, %d) == %d\n",
- current, ep, pfd.fd, res));
- return res;
- }
-
- return -EINVAL;
-}
-
-
-static void eventpoll_mm_open(struct vm_area_struct * vma)
-{
- struct file *file = vma->vm_file;
- struct eventpoll *ep = file->private_data;
-
- if (ep) atomic_inc(&ep->mmapped);
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_open(%p)\n", current, ep));
-}
-
-
-static void eventpoll_mm_close(struct vm_area_struct * vma)
-{
- struct file *file = vma->vm_file;
- struct eventpoll *ep = file->private_data;
-
- if (ep && atomic_dec_and_test(&ep->mmapped))
- ep->vmabase = 0;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_close(%p)\n", current, ep));
-}
-
-
-static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma)
-{
- struct eventpoll *ep = file->private_data;
- unsigned long start;
- int ii, res, numpages;
- size_t mapsize;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx)\n",
- current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT));
-
- /*
- * We need the eventpoll file to be RW but we don't want it to be
- * mapped RW. This test perform the test and reject RW mmaping.
- */
- if (vma->vm_flags & VM_WRITE)
- return -EACCES;
-
- if ((vma->vm_pgoff << PAGE_SHIFT) != 0)
- return -EINVAL;
-
- /*
- * We need to verify that the mapped area covers all the allocated
- * double buffer.
- */
- mapsize = PAGE_ALIGN(vma->vm_end - vma->vm_start);
- numpages = mapsize >> PAGE_SHIFT;
-
- res = -EINVAL;
- if (numpages != (2 * ep->numpages))
- goto eexit_1;
-
- /*
- * Map the double buffer starting from "vma->vm_start" up to
- * "vma->vm_start + ep->numpages * PAGE_SIZE".
- */
- start = vma->vm_start;
- for (ii = 0; ii < ep->numpages; ii++) {
- if ((res = remap_page_range(vma, start, __pa(ep->pages0[ii]),
- PAGE_SIZE, vma->vm_page_prot)))
- goto eexit_1;
- start += PAGE_SIZE;
- }
- for (ii = 0; ii < ep->numpages; ii++) {
- if ((res = remap_page_range(vma, start, __pa(ep->pages1[ii]),
- PAGE_SIZE, vma->vm_page_prot)))
- goto eexit_1;
- start += PAGE_SIZE;
- }
- vma->vm_ops = &eventpoll_mmap_ops;
-
- /* Saves the base mapping address for later use in sys_epoll_wait(2) */
- ep->vmabase = vma->vm_start;
-
- /*
- * Ok, mapping has been done. We can open the door to functions that
- * requires the mapping to be in place.
- */
- atomic_set(&ep->mmapped, 1);
-
- res = 0;
-eexit_1:
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx) == %d\n",
- current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, res));
return res;
}

@@ -1374,11 +1100,17 @@
{
int error;

+ /* Initialize the list that will link "struct eventpoll" */
+ INIT_LIST_HEAD(&eplist);
+
+ /* Initialize the rwlock used to access "eplist" */
+ rwlock_init(&eplock);
+
/* Allocates slab cache used to allocate "struct epitem" items */
error = -ENOMEM;
dpi_cache = kmem_cache_create("eventpoll",
sizeof(struct epitem),
- __alignof__(struct epitem),
+ 0,
DPI_SLAB_DEBUG, NULL, NULL);
if (!dpi_cache)
goto eexit_1;
@@ -1397,21 +1129,10 @@
if (IS_ERR(eventpoll_mnt))
goto eexit_3;

- /*
- * This is to maintain compatibility with the old /dev/epoll interface.
- * We need to register a misc device so that the caller can open(2) it
- * through a file inside /dev.
- */
- error = misc_register(&eventpoll_miscdev);
- if (error)
- goto eexit_4;
-
printk(KERN_INFO "[%p] eventpoll: driver installed.\n", current);

- return error;
+ return 0;

-eexit_4:
- mntput(eventpoll_mnt);
eexit_3:
unregister_filesystem(&eventpoll_fs_type);
eexit_2:
@@ -1427,7 +1148,6 @@
/* Undo all operations done inside eventpoll_init() */
unregister_filesystem(&eventpoll_fs_type);
mntput(eventpoll_mnt);
- misc_deregister(&eventpoll_miscdev);
kmem_cache_destroy(dpi_cache);
}

diff -Nru linux-2.5.45.vanilla/fs/Makefile linux-2.5.45.epoll/fs/Makefile
--- linux-2.5.45.vanilla/fs/Makefile Wed Oct 30 16:42:59 2002
+++ linux-2.5.45.epoll/fs/Makefile Fri Nov 1 12:06:33 2002
@@ -6,14 +6,14 @@
#

export-objs := open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \
- fcntl.o read_write.o dcookies.o fcblist.o
+ fcntl.o read_write.o dcookies.o

obj-y := open.o read_write.o devices.o file_table.o buffer.o \
bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
- fs-writeback.o mpage.o direct-io.o aio.o fcblist.o
+ fs-writeback.o mpage.o direct-io.o aio.o

ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
diff -Nru linux-2.5.45.vanilla/fs/fcblist.c linux-2.5.45.epoll/fs/fcblist.c
--- linux-2.5.45.vanilla/fs/fcblist.c Wed Oct 30 16:43:07 2002
+++ linux-2.5.45.epoll/fs/fcblist.c Wed Dec 31 16:00:00 1969
@@ -1,146 +0,0 @@
-/*
- * linux/fs/fcblist.c ( File event callbacks handling )
- * Copyright (C) 2001,...,2002 Davide Libenzi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Davide Libenzi <davidel@xmailserver.org>
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/poll.h>
-#include <asm/bitops.h>
-#include <linux/fcblist.h>
-
-
-long ion_band_table[NSIGPOLL] = {
- ION_IN, /* POLL_IN */
- ION_OUT, /* POLL_OUT */
- ION_IN, /* POLL_MSG */
- ION_ERR, /* POLL_ERR */
- 0, /* POLL_PRI */
- ION_HUP /* POLL_HUP */
-};
-
-long poll_band_table[NSIGPOLL] = {
- POLLIN | POLLRDNORM, /* POLL_IN */
- POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */
- POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */
- POLLERR, /* POLL_ERR */
- POLLPRI | POLLRDBAND, /* POLL_PRI */
- POLLHUP | POLLERR /* POLL_HUP */
-};
-
-
-
-/*
- * Walk through the file callback list by calling each registered callback
- * with the event that happened on the "filep" file. Callbacks are called
- * by holding a read lock on the callback list lock, and also by keeping
- * local IRQs disabled.
- */
-void file_notify_event(struct file *filep, long *event)
-{
- unsigned long flags;
- struct list_head *lnk, *lsthead;
-
- read_lock_irqsave(&filep->f_cblock, flags);
-
- lsthead = &filep->f_cblist;
- list_for_each(lnk, lsthead) {
- struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink);
-
- fcbp->cbproc(filep, fcbp->data, fcbp->local, event);
- }
-
- read_unlock_irqrestore(&filep->f_cblock, flags);
-}
-
-
-/*
- * Add a new callback to the list of file callbacks.
- */
-int file_notify_addcb(struct file *filep,
- void (*cbproc)(struct file *, void *, unsigned long *, long *),
- void *data)
-{
- unsigned long flags;
- struct fcb_struct *fcbp;
-
- if (!(fcbp = (struct fcb_struct *) kmalloc(sizeof(struct fcb_struct), GFP_KERNEL)))
- return -ENOMEM;
-
- memset(fcbp, 0, sizeof(struct fcb_struct));
- fcbp->cbproc = cbproc;
- fcbp->data = data;
-
- write_lock_irqsave(&filep->f_cblock, flags);
- list_add_tail(&fcbp->llink, &filep->f_cblist);
- write_unlock_irqrestore(&filep->f_cblock, flags);
-
- return 0;
-}
-
-
-/*
- * Removes the callback "cbproc" from the file callback list.
- */
-int file_notify_delcb(struct file *filep,
- void (*cbproc)(struct file *, void *, unsigned long *, long *))
-{
- unsigned long flags;
- struct list_head *lnk, *lsthead;
-
- write_lock_irqsave(&filep->f_cblock, flags);
-
- lsthead = &filep->f_cblist;
- list_for_each(lnk, lsthead) {
- struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink);
-
- if (fcbp->cbproc == cbproc) {
- list_del(lnk);
- write_unlock_irqrestore(&filep->f_cblock, flags);
- kfree(fcbp);
- return 0;
- }
- }
-
- write_unlock_irqrestore(&filep->f_cblock, flags);
-
- return -ENOENT;
-}
-
-
-/*
- * It is called at file cleanup time and removes all the registered callbacks.
- */
-void file_notify_cleanup(struct file *filep)
-{
- unsigned long flags;
- struct list_head *lsthead;
-
- write_lock_irqsave(&filep->f_cblock, flags);
-
- lsthead = &filep->f_cblist;
- while (!list_empty(lsthead)) {
- struct fcb_struct *fcbp = list_entry(lsthead->next, struct fcb_struct, llink);
-
- list_del(lsthead->next);
- write_unlock_irqrestore(&filep->f_cblock, flags);
- kfree(fcbp);
- write_lock_irqsave(&filep->f_cblock, flags);
- }
-
- write_unlock_irqrestore(&filep->f_cblock, flags);
-}
-
diff -Nru linux-2.5.45.vanilla/fs/file_table.c linux-2.5.45.epoll/fs/file_table.c
--- linux-2.5.45.vanilla/fs/file_table.c Wed Oct 30 16:41:39 2002
+++ linux-2.5.45.epoll/fs/file_table.c Fri Nov 1 16:10:46 2002
@@ -8,12 +8,12 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
-#include <linux/fcblist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/security.h>
+#include <linux/eventpoll.h>

/* sysctl tunables... */
struct files_stat_struct files_stat = {0, 0, NR_FILE};
@@ -59,7 +59,6 @@
f->f_gid = current->fsgid;
f->f_owner.lock = RW_LOCK_UNLOCKED;
list_add(&f->f_list, &anon_list);
- file_notify_init(f);
file_list_unlock();
return f;
}
@@ -104,7 +103,6 @@
filp->f_uid = current->fsuid;
filp->f_gid = current->fsgid;
filp->f_op = dentry->d_inode->i_fop;
- file_notify_init(filp);
if (filp->f_op->open)
return filp->f_op->open(dentry->d_inode, filp);
else
@@ -126,7 +124,7 @@
struct vfsmount * mnt = file->f_vfsmnt;
struct inode * inode = dentry->d_inode;

- file_notify_cleanup(file);
+ ep_notify_file_close(file);
locks_remove_flock(file);

if (file->f_op && file->f_op->release)
diff -Nru linux-2.5.45.vanilla/fs/pipe.c linux-2.5.45.epoll/fs/pipe.c
--- linux-2.5.45.vanilla/fs/pipe.c Wed Oct 30 16:42:57 2002
+++ linux-2.5.45.epoll/fs/pipe.c Fri Nov 1 12:09:00 2002
@@ -11,7 +11,6 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
-#include <linux/fcblist.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -48,7 +47,7 @@
pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
struct inode *inode = filp->f_dentry->d_inode;
- int do_wakeup, pfull;
+ int do_wakeup;
ssize_t ret;

/* pread is not allowed on pipes. */
@@ -64,7 +63,6 @@
down(PIPE_SEM(*inode));
for (;;) {
int size = PIPE_LEN(*inode);
- pfull = PIPE_FULL(*inode);
if (size) {
char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
ssize_t chars = PIPE_MAX_RCHUNK(*inode);
@@ -110,18 +108,12 @@
if (!ret) ret = -ERESTARTSYS;
break;
}
- /* Send notification message */
- if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode))
- file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND);
if (do_wakeup) {
wake_up_interruptible_sync(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
}
pipe_wait(inode);
}
- /* Send notification message */
- if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode))
- file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND);
up(PIPE_SEM(*inode));
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
@@ -139,7 +131,7 @@
struct inode *inode = filp->f_dentry->d_inode;
ssize_t ret;
size_t min;
- int do_wakeup, pempty;
+ int do_wakeup;

/* pwrite is not allowed on pipes. */
if (unlikely(ppos != &filp->f_pos))
@@ -157,7 +149,6 @@
down(PIPE_SEM(*inode));
for (;;) {
int free;
- pempty = PIPE_EMPTY(*inode);
if (!PIPE_READERS(*inode)) {
send_sig(SIGPIPE, current, 0);
if (!ret) ret = -EPIPE;
@@ -203,9 +194,6 @@
if (!ret) ret = -ERESTARTSYS;
break;
}
- /* Send notification message */
- if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode))
- file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM);
if (do_wakeup) {
wake_up_interruptible_sync(PIPE_WAIT(*inode));
kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
@@ -215,9 +203,6 @@
pipe_wait(inode);
PIPE_WAITING_WRITERS(*inode)--;
}
- /* Send notification message */
- if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode))
- file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM);
up(PIPE_SEM(*inode));
if (do_wakeup) {
wake_up_interruptible(PIPE_WAIT(*inode));
@@ -281,22 +266,9 @@
static int
pipe_release(struct inode *inode, int decr, int decw)
{
- struct file *rdfile, *wrfile;
down(PIPE_SEM(*inode));
PIPE_READERS(*inode) -= decr;
PIPE_WRITERS(*inode) -= decw;
- rdfile = PIPE_READFILE(*inode);
- wrfile = PIPE_WRITEFILE(*inode);
- if (decr && !PIPE_READERS(*inode)) {
- PIPE_READFILE(*inode) = NULL;
- if (wrfile)
- file_send_notify(wrfile, ION_HUP, POLLHUP);
- }
- if (decw && !PIPE_WRITERS(*inode)) {
- PIPE_WRITEFILE(*inode) = NULL;
- if (rdfile)
- file_send_notify(rdfile, ION_HUP, POLLHUP);
- }
if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
struct pipe_inode_info *info = inode->i_pipe;
inode->i_pipe = NULL;
@@ -516,7 +488,6 @@
PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
PIPE_WAITING_WRITERS(*inode) = 0;
PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
- PIPE_READFILE(*inode) = PIPE_WRITEFILE(*inode) = NULL;
*PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;

return inode;
@@ -624,9 +595,6 @@
f2->f_op = &write_pipe_fops;
f2->f_mode = 2;
f2->f_version = 0;
-
- PIPE_READFILE(*inode) = f1;
- PIPE_WRITEFILE(*inode) = f2;

fd_install(i, f1);
fd_install(j, f2);
diff -Nru linux-2.5.45.vanilla/fs/select.c linux-2.5.45.epoll/fs/select.c
--- linux-2.5.45.vanilla/fs/select.c Wed Oct 30 16:41:37 2002
+++ linux-2.5.45.epoll/fs/select.c Fri Nov 1 14:15:44 2002
@@ -77,6 +77,14 @@
{
struct poll_table_page *table = p->table;

+ if (!p->queue)
+ return;
+
+ if (p->qproc) {
+ p->qproc(p->priv, wait_address);
+ return;
+ }
+
if (!table || POLL_TABLE_FULL(table)) {
struct poll_table_page *new_table;

diff -Nru linux-2.5.45.vanilla/include/linux/eventpoll.h linux-2.5.45.epoll/include/linux/eventpoll.h
--- linux-2.5.45.vanilla/include/linux/eventpoll.h Wed Oct 30 16:42:54 2002
+++ linux-2.5.45.epoll/include/linux/eventpoll.h Fri Nov 1 19:32:03 2002
@@ -14,37 +14,28 @@
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H

+/* Forward declarations to avoid compiler errors */
+struct file;
+struct pollfd;

-#define EVENTPOLL_MINOR 124
-#define POLLFD_X_PAGE (PAGE_SIZE / sizeof(struct pollfd))
+
+/* Maximum storage for the eventpoll interest set */
#define MAX_FDS_IN_EVENTPOLL (1024 * 128)
-#define MAX_EVENTPOLL_PAGES (MAX_FDS_IN_EVENTPOLL / POLLFD_X_PAGE)
-#define EVENT_PAGE_INDEX(n) ((n) / POLLFD_X_PAGE)
-#define EVENT_PAGE_REM(n) ((n) % POLLFD_X_PAGE)
-#define EVENT_PAGE_OFFSET(n) (((n) % POLLFD_X_PAGE) * sizeof(struct pollfd))
-#define EP_FDS_PAGES(n) (((n) + POLLFD_X_PAGE - 1) / POLLFD_X_PAGE)
-#define EP_MAP_SIZE(n) (EP_FDS_PAGES(n) * PAGE_SIZE * 2)
-
-
-struct evpoll {
- int ep_timeout;
- unsigned long ep_resoff;
-};
-
-#define EP_ALLOC _IOR('P', 1, int)
-#define EP_POLL _IOWR('P', 2, struct evpoll)
-#define EP_FREE _IO('P', 3)
-#define EP_ISPOLLED _IOWR('P', 4, struct pollfd)

+/* Valid opcodes to issue to "sys_epoll_ctl()" */
#define EP_CTL_ADD 1
#define EP_CTL_DEL 2
#define EP_CTL_MOD 3

+/* Kernel space functions implementing the user space "epoll" API */
asmlinkage int sys_epoll_create(int maxfds);
asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events);
-asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout);
+asmlinkage int sys_epoll_wait(int epfd, struct pollfd *events, int maxevents,
+ int timeout);

+/* Used in fs/file_table.c:__fput() to unlink files from the eventpoll interface */
+void ep_notify_file_close(struct file *file);

#endif
diff -Nru linux-2.5.45.vanilla/include/linux/fcblist.h linux-2.5.45.epoll/include/linux/fcblist.h
--- linux-2.5.45.vanilla/include/linux/fcblist.h Wed Oct 30 16:43:34 2002
+++ linux-2.5.45.epoll/include/linux/fcblist.h Wed Dec 31 16:00:00 1969
@@ -1,71 +0,0 @@
-/*
- * include/linux/fcblist.h ( File event callbacks handling )
- * Copyright (C) 2001,...,2002 Davide Libenzi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Davide Libenzi <davidel@xmailserver.org>
- *
- */
-
-#ifndef __LINUX_FCBLIST_H
-#define __LINUX_FCBLIST_H
-
-#include <linux/config.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-
-
-
-/* file callback notification events */
-#define ION_IN 1
-#define ION_OUT 2
-#define ION_HUP 3
-#define ION_ERR 4
-
-#define FCB_LOCAL_SIZE 4
-
-
-struct fcb_struct {
- struct list_head llink;
- void (*cbproc)(struct file *, void *, unsigned long *, long *);
- void *data;
- unsigned long local[FCB_LOCAL_SIZE];
-};
-
-
-extern long ion_band_table[];
-extern long poll_band_table[];
-
-
-void file_notify_event(struct file *filep, long *event);
-
-int file_notify_addcb(struct file *filep,
- void (*cbproc)(struct file *, void *, unsigned long *, long *),
- void *data);
-
-int file_notify_delcb(struct file *filep,
- void (*cbproc)(struct file *, void *, unsigned long *, long *));
-
-void file_notify_cleanup(struct file *filep);
-
-
-static inline void file_notify_init(struct file *filep)
-{
- rwlock_init(&filep->f_cblock);
- INIT_LIST_HEAD(&filep->f_cblist);
-}
-
-static inline void file_send_notify(struct file *filep, long ioevt, long plevt)
-{
- long event[] = { ioevt, plevt, -1 };
-
- file_notify_event(filep, event);
-}
-
-#endif
diff -Nru linux-2.5.45.vanilla/include/linux/fs.h linux-2.5.45.epoll/include/linux/fs.h
--- linux-2.5.45.vanilla/include/linux/fs.h Wed Oct 30 16:42:21 2002
+++ linux-2.5.45.epoll/include/linux/fs.h Fri Nov 1 12:10:05 2002
@@ -504,10 +504,6 @@

/* needed for tty driver, and maybe others */
void *private_data;
-
- /* file callback list */
- rwlock_t f_cblock;
- struct list_head f_cblist;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
diff -Nru linux-2.5.45.vanilla/include/linux/pipe_fs_i.h linux-2.5.45.epoll/include/linux/pipe_fs_i.h
--- linux-2.5.45.vanilla/include/linux/pipe_fs_i.h Wed Oct 30 16:43:07 2002
+++ linux-2.5.45.epoll/include/linux/pipe_fs_i.h Fri Nov 1 12:10:38 2002
@@ -12,8 +12,6 @@
unsigned int waiting_writers;
unsigned int r_counter;
unsigned int w_counter;
- struct file *rdfile;
- struct file *wrfile;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
};
@@ -32,8 +30,6 @@
#define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers)
#define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter)
#define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter)
-#define PIPE_READFILE(inode) ((inode).i_pipe->rdfile)
-#define PIPE_WRITEFILE(inode) ((inode).i_pipe->wrfile)
#define PIPE_FASYNC_READERS(inode) (&((inode).i_pipe->fasync_readers))
#define PIPE_FASYNC_WRITERS(inode) (&((inode).i_pipe->fasync_writers))

diff -Nru linux-2.5.45.vanilla/include/linux/poll.h linux-2.5.45.epoll/include/linux/poll.h
--- linux-2.5.45.vanilla/include/linux/poll.h Wed Oct 30 16:42:54 2002
+++ linux-2.5.45.epoll/include/linux/poll.h Fri Nov 1 14:28:06 2002
@@ -13,6 +13,9 @@
struct poll_table_page;

typedef struct poll_table_struct {
+ int queue;
+ void *priv;
+ void (*qproc)(void *, wait_queue_head_t *);
int error;
struct poll_table_page * table;
} poll_table;
@@ -27,9 +30,24 @@

static inline void poll_initwait(poll_table* pt)
{
+ pt->queue = 1;
+ pt->qproc = NULL;
+ pt->priv = NULL;
pt->error = 0;
pt->table = NULL;
}
+
+static inline void poll_initwait_ex(poll_table* pt, int queue,
+ void (*qproc)(void *, wait_queue_head_t *),
+ void *priv)
+{
+ pt->queue = queue;
+ pt->qproc = qproc;
+ pt->priv = priv;
+ pt->error = 0;
+ pt->table = NULL;
+}
+
extern void poll_freewait(poll_table* pt);

diff -Nru linux-2.5.45.vanilla/include/net/sock.h linux-2.5.45.epoll/include/net/sock.h
--- linux-2.5.45.vanilla/include/net/sock.h Wed Oct 30 16:43:33 2002
+++ linux-2.5.45.epoll/include/net/sock.h Fri Nov 1 12:11:53 2002
@@ -52,9 +52,6 @@
#include <asm/atomic.h>
#include <net/dst.h>
#include <net/scm.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/fcblist.h>

/*
* This structure really needs to be cleaned up.
@@ -769,13 +766,8 @@

static inline void sk_wake_async(struct sock *sk, int how, int band)
{
- if (sk->socket) {
- if (sk->socket->file)
- file_send_notify(sk->socket->file, ion_band_table[band - POLL_IN],
- poll_band_table[band - POLL_IN]);
- if (sk->socket->fasync_list)
- sock_wake_async(sk->socket, how, band);
- }
+ if (sk->socket && sk->socket->fasync_list)
+ sock_wake_async(sk->socket, how, band);
}

#define SOCK_MIN_SNDBUF 2048
diff -Nru linux-2.5.45.vanilla/net/ipv4/tcp.c linux-2.5.45.epoll/net/ipv4/tcp.c
--- linux-2.5.45.vanilla/net/ipv4/tcp.c Wed Oct 30 16:42:21 2002
+++ linux-2.5.45.epoll/net/ipv4/tcp.c Fri Nov 1 12:15:17 2002
@@ -476,8 +476,8 @@
if (sk->sleep && waitqueue_active(sk->sleep))
wake_up_interruptible(sk->sleep);

- if (!(sk->shutdown & SEND_SHUTDOWN))
- sk_wake_async(sk, 2, POLL_OUT);
+ if (sock->fasync_list && !(sk->shutdown & SEND_SHUTDOWN))
+ sock_wake_async(sock, 2, POLL_OUT);
}
}

-