Logics looks so: upon the final close() we finish all pending
IO and destroy all buffer_heads for device.  However, we leave pages
alive.  block_device stays alive as long as it has pages in cache.
unregister_blkdev() drops the cache for all devices with that major.
->bd_op handling had been sanitized, so the last remnants of rmmod
races for block devices should be gone now.
	Patch works here.  Please, help with testing.
diff -urN S15-pre2/fs/block_dev.c S15-pre2-new/fs/block_dev.c
--- S15-pre2/fs/block_dev.c	Tue Nov  6 11:36:38 2001
+++ S15-pre2-new/fs/block_dev.c	Fri Nov  9 19:01:50 2001
@@ -221,6 +221,18 @@
 static struct vfsmount *bd_mnt;
 
 /*
+ * drivers.
+ */
+
+static struct {
+	const char *name;
+	struct block_device_operations *bdops;
+	struct list_head bdevs;
+} blkdevs[MAX_BLKDEV];
+
+static LIST_HEAD(killable);
+
+/*
  * bdev cache handling - shamelessly stolen from inode.c
  * We use smaller hashtable, though.
  */
@@ -246,6 +258,8 @@
 		memset(bdev, 0, sizeof(*bdev));
 		sema_init(&bdev->bd_sem, 1);
 		INIT_LIST_HEAD(&bdev->bd_inodes);
+		INIT_LIST_HEAD(&bdev->bd_devs);
+		INIT_LIST_HEAD(&bdev->bd_list);
 	}
 }
 
@@ -274,6 +288,8 @@
 	err = PTR_ERR(bd_mnt);
 	if (IS_ERR(bd_mnt))
 		panic("Cannot create bdev pseudo-fs");
+	for (i = 0; i < MAX_BLKDEV; i++)
+		INIT_LIST_HEAD(&blkdevs[i].bdevs);
 }
 
 /*
@@ -296,6 +312,7 @@
 		if (bdev->bd_dev != dev)
 			continue;
 		atomic_inc(&bdev->bd_count);
+		list_del_init(&bdev->bd_list);
 		return bdev;
 	}
 	return NULL;
@@ -346,19 +363,46 @@
 	inode->i_mapping = &inode->i_data;
 }
 
+static void __bdput(struct block_device *bdev)
+{
+	struct list_head *p;
+	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
+		__bd_forget(list_entry(p, struct inode, i_devices));
+	}
+	list_del_init(&bdev->bd_hash);
+	list_del_init(&bdev->bd_devs);
+	list_del_init(&bdev->bd_list);
+	spin_unlock(&bdev_lock);
+	iput(bdev->bd_inode);
+	destroy_bdev(bdev);
+}
+
+void shrink_bdcache(void)
+{
+	struct block_device *bdev;
+	struct list_head *p;
+retry:
+	spin_lock(&bdev_lock);
+	list_for_each(p, &killable) {
+		bdev = list_entry(p, struct block_device, bd_list);
+		if (bdev->bd_inode->i_mapping->nrpages == 0) {
+			__bdput(bdev);
+			goto retry;
+		}
+	}
+	spin_unlock(&bdev_lock);
+}
+
 void bdput(struct block_device *bdev)
 {
 	if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) {
-		struct list_head *p;
 		if (bdev->bd_openers)
 			BUG();
-		list_del(&bdev->bd_hash);
-		while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
-			__bd_forget(list_entry(p, struct inode, i_devices));
-		}
-		spin_unlock(&bdev_lock);
-		iput(bdev->bd_inode);
-		destroy_bdev(bdev);
+		list_add(&bdev->bd_list, &killable);
+		if (bdev->bd_inode->i_mapping->nrpages == 0)
+			__bdput(bdev);
+		else
+			spin_unlock(&bdev_lock);
 	}
 }
  
@@ -368,6 +412,7 @@
 	spin_lock(&bdev_lock);
 	if (inode->i_bdev) {
 		atomic_inc(&inode->i_bdev->bd_count);
+		list_del_init(&inode->i_bdev->bd_list);
 		spin_unlock(&bdev_lock);
 		return 0;
 	}
@@ -396,11 +441,6 @@
 	spin_unlock(&bdev_lock);
 }
 
-static struct {
-	const char *name;
-	struct block_device_operations *bdops;
-} blkdevs[MAX_BLKDEV];
-
 int get_blkdev_list(char * p)
 {
 	int i;
@@ -415,62 +455,118 @@
 	return len;
 }
 
-/*
-	Return the function table of a device.
-	Load the driver if needed.
-*/
-const struct block_device_operations * get_blkfops(unsigned int major)
-{
-	const struct block_device_operations *ret = NULL;
-
-	/* major 0 is used for non-device mounts */
-	if (major && major < MAX_BLKDEV) {
-#ifdef CONFIG_KMOD
-		if (!blkdevs[major].bdops) {
-			char name[20];
-			sprintf(name, "block-major-%d", major);
-			request_module(name);
-		}
-#endif
-		ret = blkdevs[major].bdops;
-	}
-	return ret;
-}
-
 int register_blkdev(unsigned int major, const char * name, struct block_device_operations *bdops)
 {
+	spin_lock(&bdev_lock);
 	if (major == 0) {
 		for (major = MAX_BLKDEV-1; major > 0; major--) {
-			if (blkdevs[major].bdops == NULL) {
+			if (!blkdevs[major].name) {
 				blkdevs[major].name = name;
 				blkdevs[major].bdops = bdops;
+				spin_unlock(&bdev_lock);
 				return major;
 			}
 		}
+		spin_unlock(&bdev_lock);
 		return -EBUSY;
 	}
-	if (major >= MAX_BLKDEV)
+	if (major >= MAX_BLKDEV) {
+		spin_unlock(&bdev_lock);
 		return -EINVAL;
-	if (blkdevs[major].bdops && blkdevs[major].bdops != bdops)
+	}
+	if (blkdevs[major].name) {
+		spin_unlock(&bdev_lock);
 		return -EBUSY;
+	}
 	blkdevs[major].name = name;
 	blkdevs[major].bdops = bdops;
+	spin_unlock(&bdev_lock);
 	return 0;
 }
 
 int unregister_blkdev(unsigned int major, const char * name)
 {
+	struct list_head *p;
+	struct block_device *bdev;
 	if (major >= MAX_BLKDEV)
 		return -EINVAL;
-	if (!blkdevs[major].bdops)
-		return -EINVAL;
-	if (strcmp(blkdevs[major].name, name))
+	spin_lock(&bdev_lock);
+	if (!blkdevs[major].bdops || strcmp(blkdevs[major].name, name)) {
+		spin_unlock(&bdev_lock);
 		return -EINVAL;
-	blkdevs[major].name = NULL;
+	}
 	blkdevs[major].bdops = NULL;
+	while ( (p = blkdevs[major].bdevs.next) != &blkdevs[major].bdevs ) {
+		bdev = list_entry(p, struct block_device, bd_devs);
+		atomic_inc(&bdev->bd_count);
+		list_del_init(&bdev->bd_list);
+		list_del_init(p);
+		bdev->bd_op = NULL;
+		spin_unlock(&bdev_lock);
+		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+		bdput(bdev);
+		spin_lock(&bdev_lock);
+	}
+	blkdevs[major].name = NULL;
+	spin_unlock(&bdev_lock);
 	return 0;
 }
 
+static struct block_device_operations *get_bdops(int major)
+{
+	struct block_device_operations *ops = NULL;
+	spin_lock(&bdev_lock);
+	if (major < MAX_BLKDEV) {
+		ops = blkdevs[major].bdops;
+		if (ops && ops->owner && !try_inc_mod_count(ops->owner))
+			ops = NULL;
+	}
+	spin_unlock(&bdev_lock);
+	return ops;
+}
+
+static void put_bdops(struct block_device_operations *ops)
+{
+	if (ops->owner)
+		__MOD_DEC_USE_COUNT(ops->owner);
+}
+
+int bd_activate(struct block_device *bdev)
+{
+	struct block_device_operations *ops;
+	int major = MAJOR(to_kdev_t(bdev->bd_dev));
+	spin_lock(&bdev_lock);
+	ops = bdev->bd_op;
+	if (ops) {
+		if (!ops->owner || try_inc_mod_count(ops->owner)) {
+			spin_unlock(&bdev_lock);
+			return 1;
+		}
+		bdev->bd_op = NULL;
+		list_del_init(&bdev->bd_devs);
+	}
+	spin_unlock(&bdev_lock);
+	ops = get_bdops(major);
+#ifdef CONFIG_KMOD
+	if (!ops && major < MAX_BLKDEV) {
+		char name[20];
+		sprintf(name, "block-major-%d", major);
+		request_module(name);
+		ops = get_bdops(major);
+	}
+#endif
+	if (!ops)
+		return 0;
+	spin_lock(&bdev_lock);
+	if (!bdev->bd_op) {
+		bdev->bd_op = ops;
+		list_add(&bdev->bd_devs, &blkdevs[major].bdevs);
+	} else if (bdev->bd_op != ops)
+		BUG();
+	spin_unlock(&bdev_lock);
+	return 1;
+}
+
 /*
  * This routine checks whether a removable media has been changed,
  * and invalidates all buffer-cache-entries in that case. This
@@ -482,25 +578,15 @@
  */
 int check_disk_change(kdev_t dev)
 {
-	int i;
-	const struct block_device_operations * bdops = NULL;
+	struct block_device_operations * bdops = get_bdops(MAJOR(dev));
 
-	i = MAJOR(dev);
-	if (i < MAX_BLKDEV)
-		bdops = blkdevs[i].bdops;
-	if (bdops == NULL) {
-		devfs_handle_t de;
-
-		de = devfs_find_handle (NULL, NULL, i, MINOR (dev),
-					DEVFS_SPECIAL_BLK, 0);
-		if (de) bdops = devfs_get_ops (de);
-	}
 	if (bdops == NULL)
 		return 0;
-	if (bdops->check_media_change == NULL)
-		return 0;
-	if (!bdops->check_media_change(dev))
+
+	if (!bdops->check_media_change || !bdops->check_media_change(dev)) {
+		put_bdops(bdops);
 		return 0;
+	}
 
 	printk(KERN_DEBUG "VFS: Disk change detected on device %s\n",
 		bdevname(dev));
@@ -510,6 +596,7 @@
 
 	if (bdops->revalidate)
 		bdops->revalidate(dev);
+	put_bdops(bdops);
 	return 1;
 }
 
@@ -528,31 +615,25 @@
 
 static int do_open(struct block_device *bdev, struct inode *inode, struct file *file)
 {
-	int ret = -ENXIO;
+	int ret;
 	kdev_t dev = to_kdev_t(bdev->bd_dev);
 
+	if (!bd_activate(bdev))
+		return -ENXIO;
+
 	down(&bdev->bd_sem);
-	lock_kernel();
-	if (!bdev->bd_op)
-		bdev->bd_op = get_blkfops(MAJOR(dev));
-	if (bdev->bd_op) {
-		ret = 0;
-		if (bdev->bd_op->owner)
-			__MOD_INC_USE_COUNT(bdev->bd_op->owner);
-		if (bdev->bd_op->open)
-			ret = bdev->bd_op->open(inode, file);
-		if (!ret) {
-			bdev->bd_openers++;
-			bdev->bd_inode->i_size = blkdev_size(dev);
-			bdev->bd_inode->i_blkbits = blksize_bits(block_size(dev));
-		} else {
-			if (bdev->bd_op->owner)
-				__MOD_DEC_USE_COUNT(bdev->bd_op->owner);
-			if (!bdev->bd_openers)
-				bdev->bd_op = NULL;
-		}
+	ret = 0;
+	if (bdev->bd_op->open) {
+		lock_kernel();
+		ret = bdev->bd_op->open(inode, file);
+		unlock_kernel();
 	}
-	unlock_kernel();
+	if (!ret) {
+		bdev->bd_openers++;
+		bdev->bd_inode->i_size = blkdev_size(dev);
+		bdev->bd_inode->i_blkbits = blksize_bits(block_size(dev));
+	} else
+		put_bdops(bdev->bd_op);
 	up(&bdev->bd_sem);
 	if (ret)
 		bdput(bdev);
@@ -579,8 +660,6 @@
 
 int blkdev_open(struct inode * inode, struct file * filp)
 {
-	struct block_device *bdev;
-
 	/*
 	 * Preserve backwards compatibility and allow large file access
 	 * even if userspace doesn't ask for it explicitly. Some mkfs
@@ -590,9 +669,7 @@
 	filp->f_flags |= O_LARGEFILE;
 
 	bd_acquire(inode);
-	bdev = inode->i_bdev;
-
-	return do_open(bdev, inode, filp);
+	return do_open(inode->i_bdev, inode, filp);
 }	
 
 int blkdev_put(struct block_device *bdev, int kind)
@@ -607,14 +684,14 @@
 		__block_fsync(bd_inode);
 	else if (kind == BDEV_FS)
 		fsync_no_super(rdev);
-	if (!--bdev->bd_openers)
-		kill_bdev(bdev);
+	if (!--bdev->bd_openers) {
+		detach_metadata(bdev->bd_inode->i_mapping);
+		/* XXX: The next is needed if ->check... is broken */
+		/* truncate_inode_pages(bdev->bd_inode->i_mapping, 0); */
+	}
 	if (bdev->bd_op->release)
 		ret = bdev->bd_op->release(bd_inode, NULL);
-	if (bdev->bd_op->owner)
-		__MOD_DEC_USE_COUNT(bdev->bd_op->owner);
-	if (!bdev->bd_openers)
-		bdev->bd_op = NULL;
+	put_bdops(bdev->bd_op);
 	unlock_kernel();
 	up(&bdev->bd_sem);
 	bdput(bdev);
diff -urN S15-pre2/fs/devfs/base.c S15-pre2-new/fs/devfs/base.c
--- S15-pre2/fs/devfs/base.c	Tue Nov  6 11:36:39 2001
+++ S15-pre2-new/fs/devfs/base.c	Fri Nov  9 19:01:48 2001
@@ -1296,17 +1296,11 @@
 	printk ("%s: devfs_register(): NULL name pointer\n", DEVFS_NAME);
 	return NULL;
     }
-    if (ops == NULL)
+    if (ops == NULL && !S_ISBLK (mode) )
     {
-	if ( S_ISBLK (mode) ) ops = (void *) get_blkfops (major);
-	if (ops == NULL)
-	{
-	    printk ("%s: devfs_register(%s): NULL ops pointer\n",
-		    DEVFS_NAME, name);
-	    return NULL;
-	}
-	printk ("%s: devfs_register(%s): NULL ops, got %p from major table\n",
-		DEVFS_NAME, name, ops);
+	printk ("%s: devfs_register(%s): NULL ops pointer\n",
+		DEVFS_NAME, name);
+	return NULL;
     }
     if ( S_ISDIR (mode) )
     {
@@ -2000,7 +1994,6 @@
 int devfs_register_blkdev (unsigned int major, const char *name,
 			   struct block_device_operations *bdops)
 {
-    if (boot_options & OPTION_ONLY) return 0;
     return register_blkdev (major, name, bdops);
 }   /*  End Function devfs_register_blkdev  */
 
@@ -2034,7 +2027,6 @@
 
 int devfs_unregister_blkdev (unsigned int major, const char *name)
 {
-    if (boot_options & OPTION_ONLY) return 0;
     return unregister_blkdev (major, name);
 }   /*  End Function devfs_unregister_blkdev  */
 
@@ -2173,24 +2165,17 @@
 static int check_disc_changed (struct devfs_entry *de)
 {
     int tmp;
+    int res;
     kdev_t dev = MKDEV (de->u.fcb.u.device.major, de->u.fcb.u.device.minor);
-    struct block_device_operations *bdops = de->u.fcb.ops;
     extern int warn_no_part;
 
     if ( !S_ISBLK (de->mode) ) return 0;
-    if (bdops == NULL) return 0;
-    if (bdops->check_media_change == NULL) return 0;
-    if ( !bdops->check_media_change (dev) ) return 0;
-    printk ( KERN_DEBUG "VFS: Disk change detected on device %s\n",
-	     kdevname (dev) );
-    if (invalidate_device(dev, 0))
-	printk("VFS: busy inodes on changed media..\n");
     /*  Ugly hack to disable messages about unable to read partition table  */
     tmp = warn_no_part;
     warn_no_part = 0;
-    if (bdops->revalidate) bdops->revalidate (dev);
+    res = check_disk_change(dev);
     warn_no_part = tmp;
-    return 1;
+    return res;
 }   /*  End Function check_disc_changed  */
 
 
@@ -2366,12 +2351,8 @@
     {
 	inode->i_rdev = MKDEV (de->u.fcb.u.device.major,
 			       de->u.fcb.u.device.minor);
-	if (bd_acquire(inode) == 0)
-	{
-	    if (!inode->i_bdev->bd_op && de->u.fcb.ops)
-		inode->i_bdev->bd_op = de->u.fcb.ops;
-	}
-	else printk ("%s: get_vfs_inode(%d): no block device from bdget()\n",
+	if (bd_acquire(inode) != 0)
+		printk ("%s: get_vfs_inode(%d): no block device from bdget()\n",
 		     DEVFS_NAME, (int) inode->i_ino);
     }
     else if ( S_ISFIFO (de->inode.mode) ) inode->i_fop = &def_fifo_fops;
@@ -2478,10 +2459,7 @@
     df = &de->u.fcb;
     file->private_data = de->info;
     if ( S_ISBLK (inode->i_mode) )
-    {
 	file->f_op = &def_blk_fops;
-	if (df->ops) inode->i_bdev->bd_op = df->ops;
-    }
     else file->f_op = fops_get ( (struct file_operations*) df->ops );
     if (file->f_op)
 	err = file->f_op->open ? (*file->f_op->open) (inode, file) : 0;
diff -urN S15-pre2/fs/super.c S15-pre2-new/fs/super.c
--- S15-pre2/fs/super.c	Tue Nov  6 11:36:39 2001
+++ S15-pre2-new/fs/super.c	Fri Nov  9 19:01:48 2001
@@ -535,7 +535,6 @@
 {
 	struct inode *inode;
 	struct block_device *bdev;
-	struct block_device_operations *bdops;
 	struct super_block * s;
 	struct nameidata nd;
 	struct list_head *p;
@@ -559,9 +558,6 @@
 		goto out;
 	bd_acquire(inode);
 	bdev = inode->i_bdev;
-	bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
-	if (bdops) bdev->bd_op = bdops;
-	/* Done with lookups, semaphore down */
 	dev = to_kdev_t(bdev->bd_dev);
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
@@ -994,7 +990,6 @@
 	bdev = bdget(kdev_t_to_nr(ROOT_DEV));
 	if (!bdev)
 		panic(__FUNCTION__ ": unable to allocate root device");
-	bdev->bd_op = devfs_get_ops (handle);
 	path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
 	mode = FMODE_READ;
 	if (!(root_mountflags & MS_RDONLY))
diff -urN S15-pre2/include/linux/fs.h S15-pre2-new/include/linux/fs.h
--- S15-pre2/include/linux/fs.h	Fri Nov  9 18:58:20 2001
+++ S15-pre2-new/include/linux/fs.h	Fri Nov  9 19:01:49 2001
@@ -422,6 +422,8 @@
 	const struct block_device_operations *bd_op;
 	struct semaphore	bd_sem;	/* open/close mutex */
 	struct list_head	bd_inodes;
+	struct list_head	bd_devs;
+	struct list_head	bd_list;
 };
 
 struct inode {
@@ -1075,7 +1077,7 @@
 extern int blkdev_put(struct block_device *, int);
 
 /* fs/devices.c */
-extern const struct block_device_operations *get_blkfops(unsigned int);
+extern int bd_activate(struct block_device *);
 extern int register_chrdev(unsigned int, const char *, struct file_operations *);
 extern int unregister_chrdev(unsigned int, const char *);
 extern int chrdev_open(struct inode *, struct file *);
diff -urN S15-pre2/mm/filemap.c S15-pre2-new/mm/filemap.c
--- S15-pre2/mm/filemap.c	Fri Nov  9 18:58:21 2001
+++ S15-pre2-new/mm/filemap.c	Fri Nov  9 19:01:49 2001
@@ -244,6 +244,61 @@
 	page_cache_release(page);
 }
 
+static int detach_list_pages(struct list_head *head)
+{
+	struct list_head *curr;
+	struct page * page;
+	int unlocked = 0;
+
+ restart:
+	curr = head->prev;
+	while (curr != head) {
+		unsigned long offset;
+
+		page = list_entry(curr, struct page, list);
+		offset = page->index;
+
+		/* Is one of the pages to truncate? */
+		if (page->buffers) {
+			page_cache_get(page);
+			if (TryLockPage(page)) {
+				/* Restart on this page */
+				list_del(head);
+				list_add(head, curr);
+				spin_unlock(&pagecache_lock);
+				wait_on_page(page);
+			} else {
+				/* Restart after this page */
+				list_del(head);
+				list_add_tail(head, curr);
+				spin_unlock(&pagecache_lock);
+				try_to_free_buffers(page, 0);
+				UnlockPage(page);
+			}
+			page_cache_release(page);
+			spin_lock(&pagecache_lock);
+			unlocked = 1;
+			goto restart;
+		}
+		curr = curr->prev;
+	}
+	return unlocked;
+}
+
+void detach_metadata(struct address_space *mapping)
+{
+	int unlocked;
+
+	spin_lock(&pagecache_lock);
+	do {
+		unlocked = detach_list_pages(&mapping->clean_pages);
+		unlocked |= detach_list_pages(&mapping->dirty_pages);
+		unlocked |= detach_list_pages(&mapping->locked_pages);
+	} while (unlocked);
+	/* Traversed all three lists without dropping the lock */
+	spin_unlock(&pagecache_lock);
+}
+
 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
 static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
 {
diff -urN S15-pre2/mm/swapfile.c S15-pre2-new/mm/swapfile.c
--- S15-pre2/mm/swapfile.c	Tue Nov  6 11:36:41 2001
+++ S15-pre2-new/mm/swapfile.c	Fri Nov  9 19:01:48 2001
@@ -916,16 +916,12 @@
 
 	if (S_ISBLK(swap_inode->i_mode)) {
 		kdev_t dev = swap_inode->i_rdev;
-		struct block_device_operations *bdops;
 
 		p->swap_device = dev;
 		set_blocksize(dev, PAGE_SIZE);
 		
 		bd_acquire(swap_inode);
 		bdev = swap_inode->i_bdev;
-		bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
-		if (bdops) bdev->bd_op = bdops;
-
 		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
 		if (error)
 			goto bad_swap_2;
diff -urN S15-pre2/mm/vmscan.c S15-pre2-new/mm/vmscan.c
--- S15-pre2/mm/vmscan.c	Fri Nov  9 18:58:21 2001
+++ S15-pre2-new/mm/vmscan.c	Fri Nov  9 19:01:49 2001
@@ -576,6 +576,7 @@
 
 	shrink_dcache_memory(priority, gfp_mask);
 	shrink_icache_memory(priority, gfp_mask);
+	shrink_bdcache();
 #ifdef CONFIG_QUOTA
 	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 #endif
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/