Al Viro produced patch for 2.5 to address this problem:
http://linux.bkbits.net:8080/linux-2.5/cset@1.290?nav=index.html|ChangeSet@-3w
I've done a fairly straightforward backport to the 2.4 tree. But, I'm not sure how it interacts with the VFS patches which came earlier in 2.5. It is quite stable on my 2-way; definitely no deadlocks. I'll be testing it on larger machines soon. I don't expect it to go into the mainline kernel right away, but I would like the secondary tree maintainers to pick it up and let people pound on it.
You can see the lockmeter data here:
http://www.sr71.net/ibm/bkl-ext2-dbench/
You may notice that the data from my patch show a slight drop in throughput (~2%). However, my testing conditions were less than ideal and the drop can probably be attributed to statistical error.
--
Dave Hansen
haveblue@us.ibm.com
diff -ur clean/fs/ext2/balloc.c linux/fs/ext2/balloc.c
--- clean/fs/ext2/balloc.c Mon Feb 25 11:38:08 2002
+++ linux/fs/ext2/balloc.c Wed Feb 27 09:11:39 2002
@@ -539,6 +539,7 @@
*/
#ifdef EXT2_PREALLOCATE
/* Writer: ->i_prealloc* */
+ write_lock(&inode->u.ext2_i.i_meta_lock);
if (prealloc_count && !*prealloc_count) {
int prealloc_goal;
unsigned long next_block = tmp + 1;
@@ -577,7 +578,7 @@
(k - 1));
}
#endif
-
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
j = tmp;
mark_buffer_dirty(bh);
diff -ur clean/fs/ext2/ialloc.c linux/fs/ext2/ialloc.c
--- clean/fs/ext2/ialloc.c Mon Feb 25 11:38:08 2002
+++ linux/fs/ext2/ialloc.c Wed Feb 27 09:11:39 2002
@@ -392,6 +392,7 @@
inode->u.ext2_i.i_block_group = group;
if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
+ rwlock_init(&inode->u.ext2_i.i_meta_lock);
insert_inode_hash(inode);
inode->i_generation = event++;
mark_inode_dirty(inode);
Only in linux/fs/ext2: ialloc.c.orig
diff -ur clean/fs/ext2/inode.c linux/fs/ext2/inode.c
--- clean/fs/ext2/inode.c Mon Feb 25 11:38:08 2002
+++ linux/fs/ext2/inode.c Wed Feb 27 09:11:39 2002
@@ -51,8 +51,6 @@
*/
void ext2_delete_inode (struct inode * inode)
{
- lock_kernel();
-
if (is_bad_inode(inode) ||
inode->i_ino == EXT2_ACL_IDX_INO ||
inode->i_ino == EXT2_ACL_DATA_INO)
@@ -60,6 +58,8 @@
inode->u.ext2_i.i_dtime = CURRENT_TIME;
mark_inode_dirty(inode);
ext2_update_inode(inode, IS_SYNC(inode));
+
+ lock_kernel();
inode->i_size = 0;
if (inode->i_blocks)
ext2_truncate (inode);
@@ -68,24 +68,26 @@
unlock_kernel();
return;
no_delete:
- unlock_kernel();
clear_inode(inode); /* We must guarantee clearing of inode... */
}
void ext2_discard_prealloc (struct inode * inode)
{
#ifdef EXT2_PREALLOCATE
- lock_kernel();
+ write_lock(&inode->u.ext2_i.i_meta_lock);
/* Writer: ->i_prealloc* */
if (inode->u.ext2_i.i_prealloc_count) {
unsigned short total = inode->u.ext2_i.i_prealloc_count;
unsigned long block = inode->u.ext2_i.i_prealloc_block;
inode->u.ext2_i.i_prealloc_count = 0;
inode->u.ext2_i.i_prealloc_block = 0;
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
/* Writer: end */
ext2_free_blocks (inode, block, total);
+ } else {
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
}
- unlock_kernel();
+
#endif
}
@@ -99,6 +101,7 @@
#ifdef EXT2_PREALLOCATE
/* Writer: ->i_prealloc* */
+ write_lock(&inode->u.ext2_i.i_meta_lock);
if (inode->u.ext2_i.i_prealloc_count &&
(goal == inode->u.ext2_i.i_prealloc_block ||
goal + 1 == inode->u.ext2_i.i_prealloc_block))
@@ -106,9 +109,11 @@
result = inode->u.ext2_i.i_prealloc_block++;
inode->u.ext2_i.i_prealloc_count--;
/* Writer: end */
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
ext2_debug ("preallocation hit (%lu/%lu).\n",
++alloc_hits, ++alloc_attempts);
} else {
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
ext2_discard_prealloc (inode);
ext2_debug ("preallocation miss (%lu/%lu).\n",
alloc_hits, ++alloc_attempts);
@@ -253,9 +258,11 @@
if (!bh)
goto failure;
/* Reader: pointers */
+ read_lock(&inode->u.ext2_i.i_meta_lock);
if (!verify_chain(chain, p))
goto changed;
add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+ read_unlock(&inode->u.ext2_i.i_meta_lock);
/* Reader: end */
if (!p->key)
goto no_block;
@@ -263,6 +270,7 @@
return NULL;
changed:
+ read_unlock(&inode->u.ext2_i.i_meta_lock);
*err = -EAGAIN;
goto no_block;
failure:
@@ -328,6 +336,8 @@
unsigned long *goal)
{
/* Writer: ->i_next_alloc* */
+
+ write_lock(&inode->u.ext2_i.i_meta_lock);
if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
inode->u.ext2_i.i_next_alloc_block++;
inode->u.ext2_i.i_next_alloc_goal++;
@@ -343,9 +353,11 @@
*goal = inode->u.ext2_i.i_next_alloc_goal;
if (!*goal)
*goal = ext2_find_near(inode, partial);
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
return 0;
}
/* Reader: end */
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
return -EAGAIN;
}
@@ -451,6 +463,7 @@
/* Verify that place we are splicing to is still there and vacant */
+ write_lock(&inode->u.ext2_i.i_meta_lock);
/* Writer: pointers, ->i_next_alloc* */
if (!verify_chain(chain, where-1) || *where->p)
/* Writer: end */
@@ -461,7 +474,7 @@
*where->p = where->key;
inode->u.ext2_i.i_next_alloc_block = block;
inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
-
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
/* Writer: end */
/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -517,7 +530,6 @@
if (depth == 0)
goto out;
- lock_kernel();
reread:
partial = ext2_get_branch(inode, depth, offsets, chain, &err);
@@ -539,7 +551,6 @@
brelse(partial->bh);
partial--;
}
- unlock_kernel();
out:
return err;
}
@@ -967,6 +978,7 @@
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
inode->u.ext2_i.i_prealloc_count = 0;
inode->u.ext2_i.i_block_group = block_group;
+ rwlock_init(&inode->u.ext2_i.i_meta_lock);
/*
* NOTE! The in-memory inode i_data array is in little-endian order
@@ -1149,9 +1161,7 @@
void ext2_write_inode (struct inode * inode, int wait)
{
- lock_kernel();
ext2_update_inode (inode, wait);
- unlock_kernel();
}
int ext2_sync_inode (struct inode *inode)
Only in linux/fs/ext2: inode.c.orig
diff -ur clean/include/linux/ext2_fs_i.h linux/include/linux/ext2_fs_i.h
--- clean/include/linux/ext2_fs_i.h Mon Sep 17 13:16:30 2001
+++ linux/include/linux/ext2_fs_i.h Wed Feb 27 09:11:39 2002
@@ -36,6 +36,7 @@
__u32 i_prealloc_count;
__u32 i_dir_start_lookup;
int i_new_inode:1; /* Is a freshly allocated inode */
+ rwlock_t i_meta_lock;
};
#endif /* _LINUX_EXT2_FS_I */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/