[patch 13/13] lseek speedup

Andrew Morton (akpm@zip.com.au)
Tue, 16 Jul 2002 22:31:23 -0700


This is a fairly dopey patch to fix up the i_sem contention in lseek.
Better ideas are welcome, but I'm offline until Monday so don't think
I'm ignoring them...

We need to decide what we really want to lock in there. If multiple
threads are updating f_pos and/or i_size at the same time then they are
going to see strange results whatever the kernel does. I'd maintain
that the only real obligation which the kernel has in this situation is
to not oops, to not munch data and to not return competely outlandish
results.

And the only way we can return outlandish results is on 32-bit SMP if
one CPU reads i_size or f_pos while another CPU is in the middle of
modifying it.

- We only need to take i_sem for reading i_size on 32-bit machines
with SMP or preempt, so add the entertaining down32() and up32()
functions for that.

- Only take i_sem in the case where we actually need to read i_size:
SEEK_END.

So this patch only speeds up SEEK_SET, SEEK_CUR and SEEK_END on
64-bit. SEEK_END on 32-bit will continue to bounce i_sem around.

- Introduce a new spinlock in struct file. Its scope is currently
for providing atomicity to f_pos updates.

This lock could also be used for guarding consistency of the
readahead state. But after a close walkthrough the readahead code I
don't think readahead needs any locking (apart from the 32-bit reader
of i_size!). Any errors in there will be +/- 1 differences in the
numbers, and readahead just fixes things like that up anyway.

- Fiddled with the layout of struct file in an attempt to make it
more cache-friendly.

fs/file_table.c | 1
fs/read_write.c | 86 ++++++++++++++++++++++++++++-------------------------
include/linux/fs.h | 15 ++++++---
3 files changed, 57 insertions(+), 45 deletions(-)

--- 2.5.26/fs/read_write.c~lseek-speedup Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/fs/read_write.c Tue Jul 16 21:47:22 2002
@@ -20,53 +20,59 @@ struct file_operations generic_ro_fops =
mmap: generic_file_mmap,
};

+/*
+ * Assume reads and writes of long longs are atomic on 64-bit machines
+ */
+static inline void down32(struct semaphore *sem)
+{
+#if (BITS_PER_LONG != 64) && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT))
+ down(sem);
+#endif
+}
+
+static inline void up32(struct semaphore *sem)
+{
+#if (BITS_PER_LONG != 64) && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT))
+ up(sem);
+#endif
+}
+
loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- long long retval;
+ loff_t retval;
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;

- down(&inode->i_sem);
- switch (origin) {
- case 2:
- offset += inode->i_size;
- break;
- case 1:
- offset += file->f_pos;
+ if (origin == 2) {
+ down32(&inode->i_sem);
+ offset += inode->i_size;
+ up32(&inode->i_sem);
+ spin_lock(&file->f_lock);
+ } else if (origin == 1) {
+ spin_lock(&file->f_lock);
+ offset += file->f_pos;
+ } else {
+ spin_lock(&file->f_lock);
}
retval = -EINVAL;
- if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+ if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = ++event;
}
retval = offset;
}
- up(&inode->i_sem);
+ spin_unlock(&file->f_lock);
return retval;
}

loff_t remote_llseek(struct file *file, loff_t offset, int origin)
{
- long long retval;
+ loff_t ret;

lock_kernel();
- switch (origin) {
- case 2:
- offset += file->f_dentry->d_inode->i_size;
- break;
- case 1:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset>=0 && offset<=file->f_dentry->d_inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = ++event;
- }
- retval = offset;
- }
+ ret = generic_file_llseek(file, offset, origin);
unlock_kernel();
- return retval;
+ return ret;
}

loff_t no_llseek(struct file *file, loff_t offset, int origin)
@@ -76,15 +82,18 @@ loff_t no_llseek(struct file *file, loff

loff_t default_llseek(struct file *file, loff_t offset, int origin)
{
+ struct inode *inode = file->f_dentry->d_inode;
long long retval;

- lock_kernel();
- switch (origin) {
- case 2:
- offset += file->f_dentry->d_inode->i_size;
- break;
- case 1:
- offset += file->f_pos;
+ if (origin == 2) {
+ down32(&inode->i_sem);
+ offset += inode->i_size;
+ up32(&inode->i_sem);
+ } else if (origin == 1) {
+ spin_lock(&file->f_lock);
+ offset += file->f_pos;
+ } else {
+ spin_lock(&file->f_lock);
}
retval = -EINVAL;
if (offset >= 0) {
@@ -94,18 +103,15 @@ loff_t default_llseek(struct file *file,
}
retval = offset;
}
- unlock_kernel();
+ spin_unlock(&file->f_lock);
return retval;
}

static inline loff_t llseek(struct file *file, loff_t offset, int origin)
{
- loff_t (*fn)(struct file *, loff_t, int);
-
- fn = default_llseek;
if (file->f_op && file->f_op->llseek)
- fn = file->f_op->llseek;
- return fn(file, offset, origin);
+ return (*file->f_op->llseek)(file, offset, origin);
+ return default_llseek(file, offset, origin);
}

asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
--- 2.5.26/include/linux/fs.h~lseek-speedup Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/include/linux/fs.h Tue Jul 16 21:47:22 2002
@@ -475,20 +475,25 @@ struct file_ra_state {
};

struct file {
- struct list_head f_list;
struct dentry *f_dentry;
+ struct list_head f_list;
struct vfsmount *f_vfsmnt;
- struct file_operations *f_op;
- atomic_t f_count;
+
unsigned int f_flags;
mode_t f_mode;
+ struct file_operations *f_op;
+ unsigned long f_version;
+
+ atomic_t f_count;
+ spinlock_t f_lock; /* for f_pos atomicity */
loff_t f_pos;
+
struct fown_struct f_owner;
- unsigned int f_uid, f_gid;
+ unsigned int f_uid;
+ unsigned int f_gid;
int f_error;
struct file_ra_state f_ra;

- unsigned long f_version;

/* needed for tty driver, and maybe others */
void *private_data;
--- 2.5.26/fs/file_table.c~lseek-speedup Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/fs/file_table.c Tue Jul 16 21:47:22 2002
@@ -44,6 +44,7 @@ struct file * get_empty_filp(void)
new_one:
memset(f, 0, sizeof(*f));
atomic_set(&f->f_count,1);
+ spin_lock_init(&f->f_lock);
f->f_version = ++event;
f->f_uid = current->fsuid;
f->f_gid = current->fsgid;

.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/