[PATCH] VFS mountpoint expiry

David Howells (dhowells@redhat.com)
Fri, 04 Jul 2003 10:25:54 +0100


Hi Linus, Al,

Here's a patch to do mountpoint expiry along the lines suggested by Al.

Mountpoint expiry can be driven in two ways:

(1) From userspace using umount(<path>,MNT_EXPIRE).

If the mountpoint is in use, then -EBUSY is returned.

If not already set, this sets a flag in the vfsmount structure and
returns -EAGAIN.

If the flag is already set, the unmount is actually effected and 0 is
returned.

Note that mixing MNT_EXPIRE with MNT_FORCE or MNT_DETACH will incur a
-EINVAL error.

The flag is cleared should mntput() decrease the vfsmount's mnt_count to
one.

(2) A filesystem can call mark_mounts_for_expiry() with a list of vfsmounts
that should be checked. Such mounts are linked together through a new
list_head (mnt_fslink) in the vfsmount structure. The head of this list
should be retained by the module, and should only be accessed whilst
holding the dcache_lock.

For every vfsmount in the list, this function performs an action similar
to that performed by umount with MNT_EXPIRE.

With regard to mnt_fslink, should a namespace be cloned, every vfsmount
that is on such a list will be cloned as normal and the clone will be
added to the same list whilst the dcache_lock is held.

Should a "listed" vfsmount be moved, then it will be removed from the
list. Should a bind be performed on a vfsmount, then the resulting new
vfsmount will not be added to the list.

vfsmounts to go on this list can be obtained by playing tricks with a
follow_link() method attached to a directory - this method can then call
do_kern_mount() to obtain a new vfsmount, and then call do_add_mount() to
mount the vfsmount _and_ add it to the module's expiry list under the
appropriate locking.

This more or less permits a filesystem to do automounting at arbitrary
points within a tree without worrying about namespace issues.

Note that mnt_flags should now be accessed atomically with the bitops
(test_bit(), test_and_set_bit(), clear_bit(), set_bit() and so on).

David

diff -ur linux-2.5.74/fs/block_dev.c linux-2.5.74-auto/fs/block_dev.c
--- linux-2.5.74/fs/block_dev.c 2003-07-03 15:35:15.000000000 +0100
+++ linux-2.5.74-auto/fs/block_dev.c 2003-07-03 15:41:15.000000000 +0100
@@ -762,7 +762,7 @@
if (!S_ISBLK(inode->i_mode))
goto fail;
error = -EACCES;
- if (nd.mnt->mnt_flags & MNT_NODEV)
+ if (test_bit(__MNT_NODEV, &nd.mnt->mnt_flags))
goto fail;
error = bd_acquire(inode);
if (error)
diff -ur linux-2.5.74/fs/exec.c linux-2.5.74-auto/fs/exec.c
--- linux-2.5.74/fs/exec.c 2003-07-03 15:35:13.000000000 +0100
+++ linux-2.5.74-auto/fs/exec.c 2003-07-03 15:41:15.000000000 +0100
@@ -459,7 +459,7 @@
if (!err) {
struct inode *inode = nd.dentry->d_inode;
file = ERR_PTR(-EACCES);
- if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
+ if (!test_bit(__MNT_NOEXEC, &nd.mnt->mnt_flags) &&
S_ISREG(inode->i_mode)) {
int err = permission(inode, MAY_EXEC);
if (!err && !(inode->i_mode & 0111))
@@ -841,7 +841,7 @@
bprm->e_uid = current->euid;
bprm->e_gid = current->egid;

- if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
+ if(!test_bit(__MNT_NOSUID, &bprm->file->f_vfsmnt->mnt_flags)) {
/* Set-uid? */
if (mode & S_ISUID)
bprm->e_uid = inode->i_uid;
diff -ur linux-2.5.74/fs/namei.c linux-2.5.74-auto/fs/namei.c
--- linux-2.5.74/fs/namei.c 2003-07-03 15:35:13.000000000 +0100
+++ linux-2.5.74-auto/fs/namei.c 2003-07-03 15:41:15.000000000 +0100
@@ -269,6 +269,12 @@
mntput(nd->mnt);
}

+void path_release_on_umount(struct nameidata *nd)
+{
+ dput(nd->dentry);
+ _mntput(nd->mnt);
+}
+
/*
* Internal lookup() using the new generic dcache.
* SMP-safe
@@ -1147,7 +1153,7 @@
if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
flag &= ~O_TRUNC;
} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
- if (nd->mnt->mnt_flags & MNT_NODEV)
+ if (test_bit(__MNT_NODEV, &nd->mnt->mnt_flags))
return -EACCES;

flag &= ~O_TRUNC;
diff -ur linux-2.5.74/fs/namespace.c linux-2.5.74-auto/fs/namespace.c
--- linux-2.5.74/fs/namespace.c 2003-07-03 15:35:14.000000000 +0100
+++ linux-2.5.74-auto/fs/namespace.c 2003-07-03 15:41:15.000000000 +0100
@@ -48,6 +48,7 @@
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
INIT_LIST_HEAD(&mnt->mnt_list);
+ INIT_LIST_HEAD(&mnt->mnt_fslink);
if (name) {
int size = strlen(name)+1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -84,13 +85,9 @@
return p;
}

-static int check_mnt(struct vfsmount *mnt)
+static inline int check_mnt(struct vfsmount *mnt)
{
- spin_lock(&dcache_lock);
- while (mnt->mnt_parent != mnt)
- mnt = mnt->mnt_parent;
- spin_unlock(&dcache_lock);
- return mnt == current->namespace->root;
+ return mnt->mnt_namespace == current->namespace;
}

static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
@@ -136,12 +133,17 @@
struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);

if (mnt) {
mnt->mnt_flags = old->mnt_flags;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
mnt->mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = old->mnt_namespace;
+ spin_lock(&dcache_lock);
+ if (!list_empty(&old->mnt_fslink))
+ list_add(&mnt->mnt_fslink, &old->mnt_fslink);
+ spin_unlock(&dcache_lock);
}
return mnt;
}
@@ -203,9 +205,9 @@
{ 0, NULL }
};
static struct proc_fs_info mnt_info[] = {
- { MNT_NOSUID, ",nosuid" },
- { MNT_NODEV, ",nodev" },
- { MNT_NOEXEC, ",noexec" },
+ { __MNT_NOSUID, ",nosuid" },
+ { __MNT_NODEV, ",nodev" },
+ { __MNT_NOEXEC, ",noexec" },
{ 0, NULL }
};
struct proc_fs_info *fs_infop;
@@ -221,7 +223,7 @@
seq_puts(m, fs_infop->str);
}
for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
- if (mnt->mnt_flags & fs_infop->flag)
+ if (test_bit(fs_infop->flag, &mnt->mnt_flags))
seq_puts(m, fs_infop->str);
}
if (mnt->mnt_sb->s_op->show_options)
@@ -262,6 +264,7 @@
while (!list_empty(&kill)) {
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(&mnt->mnt_list);
+ list_del_init(&mnt->mnt_fslink);
if (mnt->mnt_parent == mnt) {
spin_unlock(&dcache_lock);
} else {
@@ -285,6 +288,25 @@
return retval;

/*
+ * If we've been asked to mark for expiry, we only _actually_ effect
+ * the unmount if:
+ * (1) the mark is already set (the mark is cleared by mntput()
+ * reducing the count to 1)
+ * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
+ */
+ if (flags & MNT_EXPIRE) {
+ if (mnt == current->fs->rootmnt ||
+ flags & (MNT_FORCE | MNT_DETACH))
+ return -EINVAL;
+
+ if (atomic_read(&mnt->mnt_count) != 2)
+ return -EBUSY;
+
+ if (!test_and_set_bit(__MNT_EXPIRY_MARK, &mnt->mnt_flags))
+ return -EAGAIN;
+ }
+
+ /*
* If we may have to abort operations to get out of this
* mount, and they will themselves hold resources we must
* allow the fs to do things. In the Unix tradition of
@@ -377,7 +399,7 @@

retval = do_umount(nd.mnt, flags);
dput_and_out:
- path_release(&nd);
+ path_release_on_umount(&nd);
out:
return retval;
}
@@ -530,6 +552,10 @@
}

if (mnt) {
+ spin_lock(&dcache_lock);
+ list_del_init(&mnt->mnt_fslink);
+ spin_unlock(&dcache_lock);
+
err = graft_tree(mnt, nd);
if (err) {
spin_lock(&dcache_lock);
@@ -550,7 +576,8 @@
* on it - tough luck.
*/

-static int do_remount(struct nameidata *nd,int flags,int mnt_flags,void *data)
+static int do_remount(struct nameidata *nd, int flags, unsigned long mnt_flags,
+ void *data)
{
int err;
struct super_block * sb = nd->mnt->mnt_sb;
@@ -622,6 +649,7 @@

detach_mnt(old_nd.mnt, &parent_nd);
attach_mnt(old_nd.mnt, nd);
+ list_del_init(&old_nd.mnt->mnt_fslink);
out2:
spin_unlock(&dcache_lock);
out1:
@@ -634,27 +662,14 @@
return err;
}

-static int do_add_mount(struct nameidata *nd, char *type, int flags,
- int mnt_flags, char *name, void *data)
+int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+ unsigned long mnt_flags, struct list_head *fslist)
{
- struct vfsmount *mnt;
int err;

- if (!type || !memchr(type, 0, PAGE_SIZE))
- return -EINVAL;
-
- /* we need capabilities... */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mnt = do_kern_mount(type, flags, name, data);
- err = PTR_ERR(mnt);
- if (IS_ERR(mnt))
- goto out;
-
down_write(&current->namespace->sem);
/* Something was mounted here while we slept */
- while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+ while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
if (!check_mnt(nd->mnt))
@@ -662,18 +677,122 @@

/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
- if (nd->mnt->mnt_sb == mnt->mnt_sb && nd->mnt->mnt_root == nd->dentry)
+ if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
+ nd->mnt->mnt_root == nd->dentry)
goto unlock;

- mnt->mnt_flags = mnt_flags;
- err = graft_tree(mnt, nd);
+ newmnt->mnt_flags = mnt_flags;
+ err = graft_tree(newmnt, nd);
+
+ if (err == 0 && fslist) {
+ spin_lock(&dcache_lock);
+ list_add_tail(&newmnt->mnt_fslink, fslist);
+ spin_unlock(&dcache_lock);
+ }
+
unlock:
up_write(&current->namespace->sem);
- mntput(mnt);
-out:
+ mntput(newmnt);
return err;
}

+EXPORT_SYMBOL_GPL(do_add_mount);
+
+static int do_new_mount(struct nameidata *nd, char *type, int flags,
+ unsigned long mnt_flags, char *name, void *data)
+{
+ struct vfsmount *mnt;
+
+ if (!type || !memchr(type, 0, PAGE_SIZE))
+ return -EINVAL;
+
+ /* we need capabilities... */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mnt = do_kern_mount(type, flags, name, data);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ return do_add_mount(mnt, nd, mnt_flags, NULL);
+}
+
+void mark_mounts_for_expiry(struct list_head *mounts)
+{
+ struct namespace *namespace;
+ struct list_head graveyard, *_p, *_n;
+ struct vfsmount *mnt;
+
+ INIT_LIST_HEAD(&graveyard);
+
+ spin_lock(&dcache_lock);
+
+ list_for_each_safe(_p, _n, mounts) {
+ mnt = list_entry(_p, struct vfsmount, mnt_fslink);
+
+ if (!test_and_set_bit(__MNT_EXPIRY_MARK, &mnt->mnt_flags) ||
+ atomic_read(&mnt->mnt_count) != 1)
+ continue;
+
+ mntget(mnt);
+ list_move(&mnt->mnt_fslink, &graveyard);
+ }
+
+ while (!list_empty(&graveyard)) {
+ mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink);
+ list_del_init(&mnt->mnt_fslink);
+
+ namespace = mnt->mnt_namespace;
+ if (!namespace || atomic_read(&namespace->count) <= 0)
+ continue;
+ get_namespace(namespace);
+
+ spin_unlock(&dcache_lock);
+ down_write(&namespace->sem);
+ spin_lock(&dcache_lock);
+
+ if (atomic_read(&mnt->mnt_count) == 2) {
+ struct dentry *xdentry;
+
+ list_del_init(&mnt->mnt_list);
+ list_del_init(&mnt->mnt_child);
+ list_del_init(&mnt->mnt_hash);
+ mnt->mnt_mountpoint->d_mounted--;
+
+ xdentry = xchg(&mnt->mnt_mountpoint, mnt->mnt_root);
+ mntput(xchg(&mnt->mnt_parent, mnt));
+ spin_unlock(&dcache_lock);
+
+ dput(xdentry);
+
+ if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
+ /* last instance - try to be smart */
+ lock_kernel();
+ DQUOT_OFF(mnt->mnt_sb);
+ acct_auto_close(mnt->mnt_sb);
+ unlock_kernel();
+ }
+
+ mntput(mnt);
+ }
+ else {
+ list_add_tail(&mnt->mnt_fslink, mounts);
+ spin_unlock(&dcache_lock);
+ }
+
+ up_write(&namespace->sem);
+
+ mntput(mnt);
+ put_namespace(namespace);
+
+ spin_lock(&dcache_lock);
+ }
+
+ spin_unlock(&dcache_lock);
+}
+
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+
static int copy_mount_options (const void __user *data, unsigned long *where)
{
int i;
@@ -726,7 +845,7 @@
{
struct nameidata nd;
int retval = 0;
- int mnt_flags = 0;
+ unsigned long mnt_flags = 0;

/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
@@ -765,7 +884,7 @@
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
else
- retval = do_add_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(&nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
path_release(&nd);
@@ -852,6 +971,24 @@
return -ENOMEM;
}

+void __put_namespace(struct namespace *namespace)
+{
+ struct vfsmount *mnt;
+
+ down_write(&namespace->sem);
+ spin_lock(&dcache_lock);
+
+ list_for_each_entry(mnt, &namespace->list, mnt_list) {
+ mnt->mnt_namespace = NULL;
+ }
+
+ umount_tree(namespace->root);
+
+ spin_unlock(&dcache_lock);
+ up_write(&namespace->sem);
+ kfree(namespace);
+}
+
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
char __user * type, unsigned long flags,
void __user * data)
@@ -1082,6 +1219,7 @@
init_rwsem(&namespace->sem);
list_add(&mnt->mnt_list, &namespace->list);
namespace->root = mnt;
+ mnt->mnt_namespace = namespace;

init_task.namespace = namespace;
read_lock(&tasklist_lock);
diff -ur linux-2.5.74/fs/super.c linux-2.5.74-auto/fs/super.c
--- linux-2.5.74/fs/super.c 2003-07-03 15:35:14.000000000 +0100
+++ linux-2.5.74-auto/fs/super.c 2003-07-03 15:41:15.000000000 +0100
@@ -21,6 +21,7 @@
*/

#include <linux/config.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/acct.h>
@@ -683,6 +684,7 @@
mnt->mnt_root = dget(sb->s_root);
mnt->mnt_mountpoint = sb->s_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = current->namespace;
up_write(&sb->s_umount);
put_filesystem(type);
return mnt;
@@ -697,6 +699,8 @@
return (struct vfsmount *)sb;
}

+EXPORT_SYMBOL_GPL(do_kern_mount);
+
struct vfsmount *kern_mount(struct file_system_type *type)
{
return do_kern_mount(type->name, 0, type->name, NULL);
diff -ur linux-2.5.74/include/linux/fs.h linux-2.5.74-auto/include/linux/fs.h
--- linux-2.5.74/include/linux/fs.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/fs.h 2003-07-03 15:41:15.000000000 +0100
@@ -572,6 +572,7 @@

#define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */
#define MNT_DETACH 0x00000002 /* Just detach from the tree */
+#define MNT_EXPIRE 0x00000004 /* Mark for expiry */

extern struct list_head super_blocks;
extern spinlock_t sb_lock;
diff -ur linux-2.5.74/include/linux/mount.h linux-2.5.74-auto/include/linux/mount.h
--- linux-2.5.74/include/linux/mount.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/mount.h 2003-07-03 15:41:15.000000000 +0100
@@ -14,9 +14,15 @@

#include <linux/list.h>

-#define MNT_NOSUID 1
-#define MNT_NODEV 2
-#define MNT_NOEXEC 4
+#define __MNT_NOSUID 0
+#define __MNT_NODEV 1
+#define __MNT_NOEXEC 2
+#define __MNT_EXPIRY_MARK 3
+
+#define MNT_NOSUID (1 << __MNT_NOSUID)
+#define MNT_NODEV (1 << __MNT_NODEV)
+#define MNT_NOEXEC (1 << __MNT_NOEXEC)
+#define MNT_EXPIRY_MARK (1 << __MNT_EXPIRY_MARK)

struct vfsmount
{
@@ -28,9 +34,11 @@
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
atomic_t mnt_count;
- int mnt_flags;
+ unsigned long mnt_flags;
char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
+ struct list_head mnt_fslink; /* link in fs-specific list */
+ struct namespace *mnt_namespace; /* containing namespace */
};

static inline struct vfsmount *mntget(struct vfsmount *mnt)
@@ -42,7 +50,7 @@

extern void __mntput(struct vfsmount *mnt);

-static inline void mntput(struct vfsmount *mnt)
+static inline void _mntput(struct vfsmount *mnt)
{
if (mnt) {
if (atomic_dec_and_test(&mnt->mnt_count))
@@ -50,10 +58,24 @@
}
}

+static inline void mntput(struct vfsmount *mnt)
+{
+ if (mnt) {
+ if (atomic_read(&mnt->mnt_count) == 2)
+ clear_bit(__MNT_EXPIRY_MARK, &mnt->mnt_flags);
+ _mntput(mnt);
+ }
+}
+
extern void free_vfsmnt(struct vfsmount *mnt);
extern struct vfsmount *alloc_vfsmnt(const char *name);
extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
const char *name, void *data);

+extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+ unsigned long mnt_flags, struct list_head *fslist);
+
+extern void mark_mounts_for_expiry(struct list_head *mounts);
+
#endif
#endif /* _LINUX_MOUNT_H */
diff -ur linux-2.5.74/include/linux/namei.h linux-2.5.74-auto/include/linux/namei.h
--- linux-2.5.74/include/linux/namei.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/namei.h 2003-07-03 15:41:15.000000000 +0100
@@ -42,6 +42,7 @@
extern int FASTCALL(path_walk(const char *, struct nameidata *));
extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
extern void path_release(struct nameidata *);
+extern void path_release_on_umount(struct nameidata *);

extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
diff -ur linux-2.5.74/include/linux/namespace.h linux-2.5.74-auto/include/linux/namespace.h
--- linux-2.5.74/include/linux/namespace.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/namespace.h 2003-07-03 15:41:15.000000000 +0100
@@ -15,16 +15,12 @@
extern void umount_tree(struct vfsmount *);
extern int copy_namespace(int, struct task_struct *);

+extern void __put_namespace(struct namespace *namespace);
+
static inline void put_namespace(struct namespace *namespace)
{
- if (atomic_dec_and_test(&namespace->count)) {
- down_write(&namespace->sem);
- spin_lock(&dcache_lock);
- umount_tree(namespace->root);
- spin_unlock(&dcache_lock);
- up_write(&namespace->sem);
- kfree(namespace);
- }
+ if (atomic_dec_and_test(&namespace->count))
+ __put_namespace(namespace);
}

static inline void exit_namespace(struct task_struct *p)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/