[PATCH] alternative API for raw devices

Alexander Viro (viro@math.psu.edu)
Wed, 1 May 2002 16:12:20 -0400 (EDT)


New option: CONFIG_RAW (tristate)

With that animal enabled you can say

# mount -t raw /dev/sda1 /dev/<whatever>

and get a raw device bound to sda1 visible on /dev/<whatever>. Old
raw devices still work - drivers do not conflict.

Actual IO code is pretty much copied from old driver. The main differences:
* device is originally created with ownership/permissions of the
block device we'd used; you can chmod/chown it at any time,
obviously.
* it's _not_ a character device - stat() will give you S_IFREG.
To check that <foo> is a new-style raw device, call statfs(2) and
compare .f_type with the rawfs magic (0x726177). That doesn't conflict
with the existing check for raw devices (stat(), check that it's
a character device and compare the major with RAW_MAJOR), so existing
software can be taught to check for raw devices in a
backwards-compatible way.

umount will undo the binding, obviously. The thing works and is very small
(less than 3Kb text+data+bss). BTW, it can be built as module.

I'm not sure if the name of config option is right - maybe CONFIG_RAW_FS
would be better (with CONFIG_RAW_DEV added when and if we would want to make
the old one conditional).

If nothing else, it's interesting as an example of doing a driver-exported
mini-fs instead of messing with ioctl().

Enjoy:

diff -urN C12-0/fs/Config.help C12-current/fs/Config.help
--- C12-0/fs/Config.help Wed May 1 15:34:58 2002
+++ C12-current/fs/Config.help Wed May 1 15:52:41 2002
@@ -6,6 +6,24 @@
<http://www.linuxdoc.org/docs.html#howto>. Probably the quota
support is only useful for multi user systems. If unsure, say N.

+CONFIG_RAW
+ If you say Y here, you will be able to work with raw devices without
+ any special tools - mount -t raw <block device> <file> will bind
+ a raw device to the block device and put it on top of <file>, and
+ umount <file> will undo that. No magic control devices, no ioctls,
+ just plain mount(2). Old raw devices are still there - the two drivers
+ are completely independent. If unsure, say Y.
+
+ Note: to check that <file> is a new-style raw device you need to either
+ look for raw mounted on <file> (in /proc/mounts or /etc/mtab) or
+ call statfs(<file>, &stat_buf) and compare stat_buf.f_type with
+ the rawfs magic (0x726177).
+
+ If you want to compile this as a module ( = code which can be
+ inserted in and removed from the running kernel whenever you want),
+ say M here and read <file:Documentation/modules.txt>. The module
+ will be called raw.o.
+
CONFIG_MINIX_FS
Minix is a simple operating system used in many classes about OS's.
The minix file system (method to organize files on a hard disk
diff -urN C12-0/fs/Config.in C12-current/fs/Config.in
--- C12-0/fs/Config.in Wed May 1 15:34:58 2002
+++ C12-current/fs/Config.in Wed May 1 15:45:22 2002
@@ -5,6 +5,7 @@
comment 'File systems'

bool 'Quota support' CONFIG_QUOTA
+tristate 'Filesystem interface to raw devices' CONFIG_RAW
tristate 'Kernel automounter support' CONFIG_AUTOFS_FS
tristate 'Kernel automounter version 4 support (also supports v3)' CONFIG_AUTOFS4_FS

diff -urN C12-0/fs/Makefile C12-current/fs/Makefile
--- C12-0/fs/Makefile Wed May 1 15:34:58 2002
+++ C12-current/fs/Makefile Wed May 1 15:45:22 2002
@@ -71,6 +71,7 @@
subdir-$(CONFIG_SUN_OPENPROMFS) += openpromfs
subdir-$(CONFIG_JFS_FS) += jfs

+obj-$(CONFIG_RAW) += raw.o

obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
diff -urN C12-0/fs/raw.c C12-current/fs/raw.c
--- C12-0/fs/raw.c Wed Dec 31 19:00:00 1969
+++ C12-current/fs/raw.c Wed May 1 15:51:43 2002
@@ -0,0 +1,310 @@
+/*
+ * fs/raw.c
+ *
+ * raw devices without a barf-bag
+ *
+ * derived from drivers/char/raw.c - actual IO operations are almost exact
+ * copy, API for controlling that beast replaced with sane one.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/iobuf.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+
+/*
+ * Per-superblock private data; a pointer to it is kept in
+ * s->u.generic_sbp of the rawfs superblock.
+ */
+struct raw_dev {
+ struct block_device *bdev; /* block device this mount is bound to */
+ int count; /* open files on this mount; updated under count_lock */
+};
+
+/* Fetch the raw_dev private data hanging off superblock @s. */
+static inline struct raw_dev *raw_dev(struct super_block *s)
+{
+	struct raw_dev *priv = s->u.generic_sbp;
+
+	return priv;
+}
+
+/*
+ * Taken around updates of raw_dev->count and around the superblock/inode
+ * setup done on the first open.
+ */
+static spinlock_t count_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * ->open() for the single rawfs file.
+ *
+ * Makes sure the file has a kiobuf, opens the bound block device in
+ * BDEV_RAW mode and, when this is the first opener, copies the device's
+ * hard sector size into the superblock (s_blocksize/s_blocksize_bits) and
+ * the device size into the inode.
+ *
+ * Returns 0 on success or the error from alloc_kiovec()/blkdev_get().
+ */
+static int raw_open(struct inode *inode, struct file *filp)
+{
+	struct super_block *sb = inode->i_sb;
+	struct raw_dev *raw = raw_dev(sb);
+	struct block_device *bdev = raw->bdev;
+	int hardsect;
+	int ret;
+
+	/* the kiobuf may already be attached to this file */
+	if (!filp->f_iobuf) {
+		ret = alloc_kiovec(1, &filp->f_iobuf);
+		if (ret)
+			return ret;
+	}
+
+	/* extra reference taken before handing bdev to blkdev_get() */
+	atomic_inc(&bdev->bd_count);
+	ret = blkdev_get(bdev, filp->f_mode, 0, BDEV_RAW);
+	if (ret)
+		return ret;
+
+	hardsect = bdev_hardsect_size(bdev);
+
+	spin_lock(&count_lock);
+	if (raw->count++ == 0) {
+		int shift = 0;
+
+		/* first opener: switch the superblock to the device geometry */
+		sb->s_blocksize = hardsect;
+		while (!(hardsect & 1)) {
+			hardsect >>= 1;
+			shift++;
+		}
+		sb->s_blocksize_bits = shift;
+		inode->i_size = bdev->bd_inode->i_size;
+	}
+	spin_unlock(&count_lock);
+
+	return 0;
+}
+
+/*
+ * ->release() for the rawfs file: drop one opener from the count and
+ * close the BDEV_RAW open of the bound block device.  Always returns 0.
+ */
+static int raw_release(struct inode *inode, struct file *filp)
+{
+	struct raw_dev *raw = raw_dev(inode->i_sb);
+
+	spin_lock(&count_lock);
+	raw->count -= 1;
+	spin_unlock(&count_lock);
+
+	blkdev_put(raw->bdev, BDEV_RAW);
+
+	return 0;
+}
+
+/*
+ * Common body of raw_read() and raw_write().
+ *
+ * Maps the user buffer @buf into a kiobuf and passes it to brw_kiovec()
+ * against the bound block device, starting at byte offset *@offp.
+ *
+ * @rw: READ or WRITE
+ * @filp: the open rawfs file
+ * @buf: user buffer
+ * @size: number of bytes requested
+ * @offp: file position; advanced by the number of bytes transferred
+ *
+ * Both *@offp and @size must be multiples of the sector size recorded in
+ * the superblock, otherwise -EINVAL is returned.  A request that runs past
+ * the end of the device is shortened to the blocks that are left.
+ *
+ * Returns the number of bytes transferred if any were, 0 for an empty
+ * request, -ENXIO for a non-empty request starting at or beyond the end of
+ * the device, or the error from the failing helper.
+ */
+static ssize_t rw_raw_dev(int rw, struct file *filp, char *buf,
+ size_t size, loff_t *offp)
+{
+ struct super_block *s = filp->f_dentry->d_inode->i_sb;
+ struct kiobuf *iobuf = filp->f_iobuf;
+ struct block_device *bdev = raw_dev(s)->bdev;
+ int sector_size = s->s_blocksize;
+ int sector_bits = s->s_blocksize_bits;
+ int sector_mask = sector_size - 1;
+ unsigned long limit = bdev->bd_inode->i_size >> sector_bits; /* device size in sectors */
+ sector_t blocknr = *offp >> sector_bits; /* first sector of the request */
+ size_t transferred = 0;
+ int new_iobuf = 0; /* set when a private kiobuf was allocated below */
+ int err;
+
+ /*
+ * Claim the per-file kiobuf; bit 0 of f_iobuf_lock marks it busy.
+ */
+
+ if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+ /*
+ * A parallel read/write is using the preallocated iobuf
+ * so just run slow and allocate a new one.
+ */
+ err = alloc_kiovec(1, &iobuf);
+ if (err)
+ goto out;
+ new_iobuf = 1;
+ }
+
+ /* offset and length must both be sector aligned */
+ err = -EINVAL;
+ if ((*offp & sector_mask) || (size & sector_mask))
+ goto out_free;
+ /* starting at/after the end: 0 for an empty request, else -ENXIO */
+ err = 0;
+ if (size)
+ err = -ENXIO;
+ if ((*offp >> sector_bits) >= limit)
+ goto out_free;
+
+ while (size > 0) {
+ unsigned long blocks = size >> sector_bits;
+ int iosize;
+
+ /* never go past the last sector of the device */
+ if (blocks > limit - blocknr)
+ blocks = limit - blocknr;
+ if (!blocks)
+ break;
+
+ /*
+ * NOTE(review): blocks is unsigned long but iosize is int and
+ * nothing here caps the size of a single request - confirm
+ * that very large sizes cannot overflow iosize.
+ */
+ iosize = blocks << sector_bits;
+
+ err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+ if (err)
+ break;
+
+ err = brw_kiovec(rw, 1, &iobuf, bdev, &blocknr, sector_size);
+
+ /* a read stored data into the user pages */
+ if (rw == READ && err > 0)
+ mark_dirty_kiobuf(iobuf, err);
+
+ if (err >= 0) {
+ transferred += err;
+ size -= err;
+ buf += err;
+ }
+
+ blocknr += blocks;
+
+ unmap_kiobuf(iobuf);
+
+ /* short transfer or error: stop here */
+ if (err != iosize)
+ break;
+ }
+
+ /* partial progress wins over a later error */
+ if (transferred) {
+ *offp += transferred;
+ err = transferred;
+ }
+
+out_free:
+ /* release whichever kiobuf was used */
+ if (!new_iobuf)
+ clear_bit(0, &filp->f_iobuf_lock);
+ else
+ free_kiovec(1, &iobuf);
+out:
+ return err;
+}
+
+/* ->read(): forward to the common transfer routine with rw == READ. */
+static ssize_t raw_read(struct file *filp, char * buf,
+			size_t size, loff_t *offp)
+{
+	ssize_t ret;
+
+	ret = rw_raw_dev(READ, filp, buf, size, offp);
+	return ret;
+}
+
+/*
+ * ->write(): forward to the common transfer routine with rw == WRITE.
+ * The const qualifier is cast away because rw_raw_dev() takes one buffer
+ * type for both directions.
+ */
+static ssize_t raw_write(struct file *filp, const char *buf,
+			 size_t size, loff_t *offp)
+{
+	char *ubuf = (char *) buf;
+	ssize_t ret;
+
+	ret = rw_raw_dev(WRITE, filp, ubuf, size, offp);
+	return ret;
+}
+
+/* File operations of the rawfs root inode (installed by raw_get_sb()). */
+static struct file_operations raw_fops = {
+ open: raw_open,
+ release: raw_release,
+ read: raw_read,
+ write: raw_write,
+};
+
+/*
+ * ->show_options(): emit " dev=<major>:<minor>" of the bound block
+ * device into the seq_file.  Always returns 0.
+ */
+static int raw_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct raw_dev *raw = raw_dev(mnt->mnt_sb);
+	dev_t devno = raw->bdev->bd_dev;
+
+	seq_printf(m, " dev=%d:%d", MAJOR(devno), MINOR(devno));
+
+	return 0;
+}
+
+/* Superblock operations for rawfs (installed by raw_get_sb()). */
+static struct super_operations s_ops = {
+ statfs: simple_statfs,
+ show_options: raw_show_options,
+};
+
+/*
+ * raw_get_sb - mount handler: bind a new rawfs superblock to a block device
+ * @fs_type:	filesystem type being mounted
+ * @flags:	mount flags (not looked at here)
+ * @dev_name:	pathname of the block device node to bind to
+ * @data:	mount data (not looked at here)
+ *
+ * Looks up @dev_name, checks that it is a block device node living on a
+ * mount without MNT_NODEV, takes a reference to its block_device and sets
+ * up an anonymous superblock whose root is a single regular file carrying
+ * the owner and permission bits of the device node.
+ *
+ * Returns the new superblock, or an ERR_PTR() on failure.
+ */
+static struct super_block *raw_get_sb(struct file_system_type *fs_type,
+	int flags, char *dev_name, void *data)
+{
+	struct inode *inode, *root;
+	struct super_block *s;
+	struct nameidata nd;
+	struct raw_dev *p = kmalloc(sizeof(struct raw_dev), GFP_KERNEL);
+	int error = -ENOMEM;
+
+	if (!p)
+		goto out;
+
+	/* sanity check for device name */
+	error = -EINVAL;
+	if (!dev_name || !*dev_name)
+		goto out1;
+
+	/* find it */
+	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+	if (error)
+		goto out1;
+
+	/* is it a block device? */
+	inode = nd.dentry->d_inode;
+	error = -ENOTBLK;
+	if (!S_ISBLK(inode->i_mode))
+		goto out2;
+
+	/* do we have it on nodev filesystem? */
+	error = -EACCES;
+	if (nd.mnt->mnt_flags & MNT_NODEV)
+		goto out2;
+
+	/* get struct block_device */
+	error = bd_acquire(inode);
+	if (error)
+		goto out2;
+
+	/* allocate superblock */
+	s = sget(fs_type, NULL, set_anon_super, NULL);
+	if (IS_ERR(s)) {
+		/* no superblock owns p or the bdev reference yet */
+		bdput(inode->i_bdev);
+		path_release(&nd);
+		kfree(p);
+		return s;
+	}
+
+	/*
+	 * set it up; from here on the superblock owns p and the bdev
+	 * reference, and raw_kill_sb() is what releases them
+	 */
+	s->s_blocksize = PAGE_CACHE_SIZE;
+	s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	s->s_magic = 0x726177;
+	s->s_op = &s_ops;
+	s->u.generic_sbp = p;
+	p->bdev = inode->i_bdev;
+	p->count = 0;
+
+	/* allocate (the only) inode */
+	error = -ENOMEM;
+	root = new_inode(s);
+	if (!root)
+		goto out3;
+
+	/* set it up: regular file with the device node's owner and mode */
+	root->i_mode = S_IFREG | (inode->i_mode & S_IRWXUGO);
+	root->i_uid = inode->i_uid;
+	root->i_gid = inode->i_gid;
+	root->i_blksize = PAGE_CACHE_SIZE;
+	root->i_blocks = 0;
+	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+	root->i_fop = &raw_fops;
+
+	/* make it root */
+	s->s_root = d_alloc_root(root);
+	if (!s->s_root)
+		goto out4;
+	s->s_flags |= MS_ACTIVE;
+	path_release(&nd);
+	return s;
+
+out4:
+	iput(root);
+out3:
+	/*
+	 * Dropping the superblock runs ->kill_sb(), i.e. raw_kill_sb(),
+	 * which does the bdput() and kfree(p).  Falling through to out1
+	 * from here would free p a second time, so finish up separately.
+	 */
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	path_release(&nd);
+	return ERR_PTR(error);
+out2:
+	path_release(&nd);
+out1:
+	kfree(p);
+out:
+	return ERR_PTR(error);
+}
+
+/*
+ * ->kill_sb(): tear down the anonymous superblock, then drop the
+ * block_device reference and free the private data set up at mount time.
+ */
+static void raw_kill_sb(struct super_block *s)
+{
+	/* grab the private data before the superblock is shut down */
+	struct raw_dev *raw = raw_dev(s);
+
+	kill_anon_super(s);
+
+	bdput(raw->bdev);
+	kfree(raw);
+}
+
+/* Filesystem type "raw", i.e. what mount -t raw selects. */
+static struct file_system_type raw_fs_type = {
+ owner: THIS_MODULE,
+ name: "raw",
+ get_sb: raw_get_sb,
+ kill_sb: raw_kill_sb,
+};
+
+/* Module/boot-time init: register the "raw" filesystem type. */
+static int __init init_rawfs(void)
+{
+	int err = register_filesystem(&raw_fs_type);
+
+	return err;
+}
+
+/* Module unload: unregister the "raw" filesystem type. */
+static void __exit exit_rawfs(void)
+{
+ unregister_filesystem(&raw_fs_type);
+}
+
+/* This file exports no symbols to other modules. */
+EXPORT_NO_SYMBOLS;
+MODULE_LICENSE("GPL"); /* fair is fair - derived from GPLed code */
+/* Entry and exit points: init_rawfs() on load, exit_rawfs() on unload. */
+module_init(init_rawfs)
+module_exit(exit_rawfs)

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/