[patch 1/2] vfsmount_lock

Maneesh Soni (maneesh@in.ibm.com)
Wed, 21 May 2003 14:55:02 +0530


While path walking we do follow_mount or follow_down which uses
dcache_lock for serialisation. vfsmount related operations also use
dcache_lock for all updates. I think we can use a separate lock for
vfsmount related work and can improve path walking.

The following two patches does the same. The first one replaces
dcache_lock with new vfsmount_lock in namespace.c. The lock is
local to namespace.c and is not required outside. The second patch
uses RCU to have lock free lookup_mnt(). The patches are quite simple
and straight forward.

The lockmeter reults show reduced contention, and lock acquisitions
for dcache_lock while running dcachebench* on a 4-way SMP box

SPINLOCKS HOLD WAIT
UTIL CON MEAN( MAX ) MEAN( MAX )(% CPU) TOTAL NOWAIT SPIN RJECT NAME

baselkm-2569:
20.7% 20.9% 0.5us( 146us) 2.9us( 144us)(0.81%) 31590840 79.1% 20.9% 0% dcache_lock
mntlkm-2569:
14.3% 13.6% 0.4us( 170us) 2.9us( 187us)(0.42%) 23071746 86.4% 13.6% 0% dcache_lock

We get more than 8% improvement on 4-way SMP and 44% improvement on 16-way
NUMAQ while runing dcachebench*.

Average (usecs/iteration) Std. Deviation
(lower is better)
4-way SMP
2.5.69 15739.3 470.90
2.5.69-mnt 14459.6 298.51

16-way NUMAQ
2.5.69 120426.5 363.78
2.5.69-mnt 63225.8 427.60

*dcachebench is a microbenchmark written by Bill Hartner and is available at
http://www-124.ibm.com/developerworks/opensource/linuxperf/dcachebench/dcachebench.html

vfsmount_lock.patch
-------------------
- Patch for replacing dcache_lock with new vfsmount_lock for all mount
related operation. This removes the need to take dcache_lock while
doing follow_mount or follow_down operations in path walking.

fs/namei.c | 24 +++++++++------------
fs/namespace.c | 64 ++++++++++++++++++++++++++++++++-------------------------
2 files changed, 48 insertions(+), 40 deletions(-)

diff -puN fs/namespace.c~vfsmount_lock fs/namespace.c
--- linux-2.5.69/fs/namespace.c~vfsmount_lock 2003-05-21 10:53:00.000000000 +0530
+++ linux-2.5.69-maneesh/fs/namespace.c 2003-05-21 11:02:47.000000000 +0530
@@ -28,6 +28,8 @@ extern int do_remount_sb(struct super_bl
extern int __init init_rootfs(void);
extern int __init fs_subsys_init(void);

+/* spinlock for vfsmount related operation, inplace of dcache_lock */
+static spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
static struct list_head *mount_hashtable;
static int hash_mask, hash_bits;
static kmem_cache_t *mnt_cache;
@@ -69,30 +71,38 @@ void free_vfsmnt(struct vfsmount *mnt)
kmem_cache_free(mnt_cache, mnt);
}

+/*
+ * Now, lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
struct list_head * head = mount_hashtable + hash(mnt, dentry);
struct list_head * tmp = head;
- struct vfsmount *p;
+ struct vfsmount *p, *found = NULL;

+ spin_lock(&vfsmount_lock);
for (;;) {
tmp = tmp->next;
p = NULL;
if (tmp == head)
break;
p = list_entry(tmp, struct vfsmount, mnt_hash);
- if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry)
+ if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+ found = mntget(p);
break;
+ }
}
- return p;
+ spin_lock(&vfsmount_lock);
+ return found;
}

static int check_mnt(struct vfsmount *mnt)
{
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
while (mnt->mnt_parent != mnt)
mnt = mnt->mnt_parent;
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
return mnt == current->namespace->root;
}

@@ -273,15 +283,15 @@ void umount_tree(struct vfsmount *mnt)
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(&mnt->mnt_list);
if (mnt->mnt_parent == mnt) {
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
} else {
struct nameidata old_nd;
detach_mnt(mnt, &old_nd);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
path_release(&old_nd);
}
mntput(mnt);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
}
}

@@ -334,17 +344,17 @@ static int do_umount(struct vfsmount *mn
}

down_write(&current->namespace->sem);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);

if (atomic_read(&sb->s_active) == 1) {
/* last instance - try to be smart */
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
lock_kernel();
DQUOT_OFF(sb);
acct_auto_close(sb);
unlock_kernel();
security_sb_umount_close(mnt);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
}
retval = -EBUSY;
if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -352,7 +362,7 @@ static int do_umount(struct vfsmount *mn
umount_tree(mnt);
retval = 0;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
if (retval)
security_sb_umount_busy(mnt);
up_write(&current->namespace->sem);
@@ -442,17 +452,17 @@ static struct vfsmount *copy_tree(struct
q = clone_mnt(p, p->mnt_root);
if (!q)
goto Enomem;
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &nd);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
}
return res;
Enomem:
if (res) {
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
umount_tree(res);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
}
return NULL;
}
@@ -476,7 +486,7 @@ static int graft_tree(struct vfsmount *m
if (err)
goto out_unlock;

- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
struct list_head head;
attach_mnt(mnt, nd);
@@ -485,7 +495,7 @@ static int graft_tree(struct vfsmount *m
mntget(mnt);
err = 0;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
out_unlock:
up(&nd->dentry->d_inode->i_sem);
if (!err)
@@ -522,9 +532,9 @@ static int do_loopback(struct nameidata
if (mnt) {
err = graft_tree(mnt, nd);
if (err) {
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
umount_tree(mnt);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
} else
mntput(mnt);
}
@@ -589,7 +599,7 @@ static int do_move_mount(struct nameidat
if (IS_DEADDIR(nd->dentry->d_inode))
goto out1;

- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
goto out2;

@@ -613,7 +623,7 @@ static int do_move_mount(struct nameidat
detach_mnt(old_nd.mnt, &parent_nd);
attach_mnt(old_nd.mnt, nd);
out2:
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
out1:
up(&nd->dentry->d_inode->i_sem);
out:
@@ -794,9 +804,9 @@ int copy_namespace(int flags, struct tas
down_write(&tsk->namespace->sem);
/* First pass: copy the tree topology */
new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);

/* Second pass: switch the tsk->fs->* elements */
if (fs) {
@@ -1017,7 +1027,7 @@ asmlinkage long sys_pivot_root(const cha
if (new_nd.mnt->mnt_root != new_nd.dentry)
goto out2; /* not a mountpoint */
tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
if (tmp != new_nd.mnt) {
for (;;) {
if (tmp->mnt_parent == tmp)
@@ -1034,7 +1044,7 @@ asmlinkage long sys_pivot_root(const cha
detach_mnt(user_nd.mnt, &root_parent);
attach_mnt(user_nd.mnt, &old_nd);
attach_mnt(new_nd.mnt, &root_parent);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
chroot_fs_refs(&user_nd, &new_nd);
security_sb_post_pivotroot(&user_nd, &new_nd);
error = 0;
@@ -1051,7 +1061,7 @@ out0:
unlock_kernel();
return error;
out3:
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
goto out2;
}

diff -puN fs/namei.c~vfsmount_lock fs/namei.c
--- linux-2.5.69/fs/namei.c~vfsmount_lock 2003-05-21 10:53:00.000000000 +0530
+++ linux-2.5.69-maneesh/fs/namei.c 2003-05-21 10:53:00.000000000 +0530
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, str
return 1;
}

+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
{
int res = 0;
while (d_mountpoint(*dentry)) {
- struct vfsmount *mounted;
- spin_lock(&dcache_lock);
- mounted = lookup_mnt(*mnt, *dentry);
- if (!mounted) {
- spin_unlock(&dcache_lock);
+ struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+ if (!mounted)
break;
- }
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
+ *mnt = mounted;
dput(*dentry);
mntput(mounted->mnt_parent);
*dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount
return res;
}

+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
{
struct vfsmount *mounted;
-
- spin_lock(&dcache_lock);
+
mounted = lookup_mnt(*mnt, *dentry);
if (mounted) {
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
+ *mnt = mounted;
dput(*dentry);
mntput(mounted->mnt_parent);
*dentry = dget(mounted->mnt_root);
return 1;
}
- spin_unlock(&dcache_lock);
return 0;
}

_

-- 
Maneesh Soni
IBM Linux Technology Center, 
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/