[patch 11/13] don't hold i_sem during O_DIRECT writes to blockdevs

Andrew Morton (akpm@zip.com.au)
Sun, 28 Jul 2002 00:33:23 -0700


We're moving in the direction of deprecating the raw driver and
recommending that applications use O_DIRECT reads and writes against
blockdevs.

I don't know how acceptable this will be - there may be
operational/admin reasons for preferring the raw driver, and there are
certainly application porting issues. So we should continue to support
the raw driver well in 2.6.

One weakness of writes to blockdevs, compared with the raw driver, is
that they are serialised under i_sem. There is no need for this.

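This patch changes O_DIRECT writes to blockdevs so that they no longer
run under i_sem: the semaphore is still taken for the checks at the top
of generic_file_write(), but is dropped just before the direct I/O is
issued.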

filemap.c | 71 ++++++++++++++++++++++++++++++++++++++------------------------
1 files changed, 44 insertions(+), 27 deletions(-)

--- 2.5.29/mm/filemap.c~o_direct-i_sem Sat Jul 27 23:39:13 2002
+++ 2.5.29-akpm/mm/filemap.c Sat Jul 27 23:48:58 2002
@@ -1934,6 +1934,7 @@ generic_file_write(struct file *file, co
struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+ int isblk = S_ISBLK(inode->i_mode);
long status = 0;
loff_t pos;
struct page *page;
@@ -1953,19 +1954,19 @@ generic_file_write(struct file *file, co
pos = *ppos;
if (unlikely(pos < 0)) {
err = -EINVAL;
- goto out;
+ goto out_sem;
}

if (unlikely(file->f_error)) {
err = file->f_error;
file->f_error = 0;
- goto out;
+ goto out_sem;
}

written = 0;

/* FIXME: this is for backwards compatibility with 2.4 */
- if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+ if (!isblk && file->f_flags & O_APPEND)
pos = inode->i_size;

/*
@@ -1975,7 +1976,7 @@ generic_file_write(struct file *file, co
if (pos >= limit) {
send_sig(SIGXFSZ, current, 0);
err = -EFBIG;
- goto out;
+ goto out_sem;
}
if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
/* send_sig(SIGXFSZ, current, 0); */
@@ -1991,7 +1992,7 @@ generic_file_write(struct file *file, co
if (pos >= MAX_NON_LFS) {
send_sig(SIGXFSZ, current, 0);
err = -EFBIG;
- goto out;
+ goto out_sem;
}
if (count > MAX_NON_LFS - (u32)pos) {
/* send_sig(SIGXFSZ, current, 0); */
@@ -2006,12 +2007,12 @@ generic_file_write(struct file *file, co
* exceeded without writing data we send a signal and return EFBIG.
* Linus frestrict idea will clean these up nicely..
*/
- if (likely(!S_ISBLK(inode->i_mode))) {
+ if (likely(!isblk)) {
if (unlikely(pos >= inode->i_sb->s_maxbytes)) {
if (count || pos > inode->i_sb->s_maxbytes) {
send_sig(SIGXFSZ, current, 0);
err = -EFBIG;
- goto out;
+ goto out_sem;
}
/* zero-length writes at ->s_maxbytes are OK */
}
@@ -2021,12 +2022,12 @@ generic_file_write(struct file *file, co
} else {
if (bdev_read_only(inode->i_bdev)) {
err = -EPERM;
- goto out;
+ goto out_sem;
}
if (pos >= inode->i_size) {
if (count || pos > inode->i_size) {
err = -ENOSPC;
- goto out;
+ goto out_sem;
}
}

@@ -2036,7 +2037,7 @@ generic_file_write(struct file *file, co

err = 0;
if (count == 0)
- goto out;
+ goto out_sem;

remove_suid(file->f_dentry);
time_now = CURRENT_TIME;
@@ -2047,25 +2048,40 @@ generic_file_write(struct file *file, co
}

if (unlikely(file->f_flags & O_DIRECT)) {
- written = generic_file_direct_IO(WRITE, inode,
+ if (isblk) {
+ up(&inode->i_sem);
+ written = generic_file_direct_IO(WRITE, inode,
(char *)buf, pos, count);
- if (written > 0) {
- loff_t end = pos + written;
- if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
- inode->i_size = end;
- mark_inode_dirty(inode);
+ if (written > 0) {
+ if (mapping->nrpages)
+ invalidate_inode_pages2(mapping);
+ *ppos = pos + written;
}
- *ppos = end;
- if (mapping->nrpages)
- invalidate_inode_pages2(mapping);
+ err = written;
+ goto out;
+ } else {
+ written = generic_file_direct_IO(WRITE, inode,
+ (char *)buf, pos, count);
+ if (written > 0) {
+ loff_t end = pos + written;
+ if (end > inode->i_size) {
+ inode->i_size = end;
+ mark_inode_dirty(inode);
+ }
+ *ppos = end;
+ /*
+ * Sync the fs metadata but not the minor inode
+ * changes and of course not the data as we did
+ * direct DMA for the IO.
+ */
+ if (file->f_flags & O_SYNC)
+ status = generic_osync_inode(inode,
+ OSYNC_METADATA);
+ if (mapping->nrpages)
+ invalidate_inode_pages2(mapping);
+ }
+ goto out_status;
}
- /*
- * Sync the fs metadata but not the minor inode changes and
- * of course not the data as we did direct DMA for the IO.
- */
- if (written >= 0 && file->f_flags & O_SYNC)
- status = generic_osync_inode(inode, OSYNC_METADATA);
- goto out_status;
}

do {
@@ -2152,7 +2168,8 @@ generic_file_write(struct file *file, co

out_status:
err = written ? written : status;
-out:
+out_sem:
up(&inode->i_sem);
+out:
return err;
}

.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/