Re: elevator algorithm in disk controller bad?

Chris Mason (mason@suse.com)
Fri, 11 Jan 2002 11:56:48 -0500


On Friday, January 11, 2002 02:08:39 AM -0800 Andre Hedrick <andre@linux-ide.org> wrote:

> Or something more up to date?
>
> ide.2.4.16.12102001.patch:+ flushcache: NULL,
> ide.2.4.16.12102001.patch:+static int do_idedisk_flushcache(ide_drive_t

Andre, filesystems don't understand what an ide_drive_t is ;-)

Here's something against 2.4.17 + ide.2.4.16.12102001 that tries to use
the flushcache stuff. It is almost entirely untested, I did make sure
the flushcache func is actually called, and returns zero. But, I didn't
make sure driver->flushcache wasn't the default noop func.

For this to actually be useful, we need a real interface, not some
internal reiserfs hack. But, I wanted to make sure I had the idea
right before going any further...patch below.

Jens, it shouldn't be hard to adapt this to your write barrier stuff too...

-chris

Index: linus.17/fs/reiserfs/journal.c
--- linus.17/fs/reiserfs/journal.c Thu, 13 Dec 2001 11:06:51 -0500 root (linux/b/0_journal.c 1.2.1.1.4.2 644)
+++ linus.17(w)/fs/reiserfs/journal.c Fri, 11 Jan 2002 11:14:27 -0500 root (linux/b/0_journal.c 1.2.1.1.4.2 644)
@@ -736,9 +736,20 @@
atomic_read(&(jl->j_commit_left)));
}

+ /* make sure all the log blocks are on disk before we flush
+ ** the commit
+ */
+ reiserfs_write_barrier(jl->j_commit_bh) ;
+
mark_buffer_dirty(jl->j_commit_bh) ;
ll_rw_block(WRITE, 1, &(jl->j_commit_bh)) ;
wait_on_buffer(jl->j_commit_bh) ;
+
+ /* make sure the commit is on disk before we allow anyone to start
+ ** writing the metadata
+ */
+ reiserfs_write_barrier(jl->j_commit_bh) ;
+
if (!buffer_uptodate(jl->j_commit_bh)) {
reiserfs_panic(s, "journal-615: buffer write failed\n") ;
}
@@ -820,6 +831,9 @@
if (trans_id >= SB_JOURNAL(p_s_sb)->j_last_flush_trans_id) {
if (buffer_locked((SB_JOURNAL(p_s_sb)->j_header_bh))) {
wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ;
+ /* no need for write barrier here, whoever locked the header bh
+ ** will do that for us.
+ */
if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) {
reiserfs_panic(p_s_sb, "journal-699: buffer write failed\n") ;
}
@@ -833,6 +847,9 @@
set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ;
ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ;
wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ;
+
+ reiserfs_write_barrier((SB_JOURNAL(p_s_sb)->j_header_bh)) ;
+
if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) {
printk( "reiserfs: journal-837: IO error during journal replay\n" );
return -EIO ;
@@ -1088,6 +1105,11 @@
if (flushall) {
update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % JOURNAL_BLOCK_COUNT, jl->j_trans_id) ;
}
+ /* upddate_journal_header_block takes care of all the write barrier
+ ** requirements. If writes from older journal lists get reordered before
+ ** the newest list triggers a flush on the journal header block, log replay
+ ** will fix everything for us.
+ */
remove_all_from_journal_list(s, jl, 0) ;
jl->j_len = 0 ;
atomic_set(&(jl->j_nonzerolen), 0) ;
@@ -1101,6 +1123,10 @@
}


+/* kupdate_one_transaction doesn't need to worry about write
+** barriers at all. That happens during the cleanups triggered by
+** flush_journal_list
+*/
static int kupdate_one_transaction(struct super_block *s,
struct reiserfs_journal_list *jl)
{
@@ -1747,6 +1773,11 @@
CURRENT_TIME - start) ;
}
if (!is_read_only(p_s_sb->s_dev) &&
+ /* _update_journal_header_block is the only place during replay
+ ** we trigger write barriers. That's ok, if we crash before we
+ ** hit this point, the same transactions will be replayed again,
+ ** fixing any write ordering problems introduced.
+ */
_update_journal_header_block(p_s_sb, SB_JOURNAL(p_s_sb)->j_start,
SB_JOURNAL(p_s_sb)->j_last_flush_trans_id))
{
Index: linus.17/fs/reiserfs/Makefile
--- linus.17/fs/reiserfs/Makefile Tue, 27 Nov 2001 23:33:36 -0500 root (linux/b/1_Makefile 1.3 644)
+++ linus.17(w)/fs/reiserfs/Makefile Fri, 11 Jan 2002 10:32:41 -0500 root (linux/b/1_Makefile 1.3 644)
@@ -9,7 +9,7 @@

O_TARGET := reiserfs.o
obj-y := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o super.o prints.o objectid.o \
-lbalance.o ibalance.o stree.o hashes.o buffer2.o tail_conversion.o journal.o resize.o tail_conversion.o version.o item_ops.o ioctl.o procfs.o
+lbalance.o ibalance.o stree.o hashes.o buffer2.o tail_conversion.o journal.o resize.o tail_conversion.o version.o item_ops.o ioctl.o procfs.o writecache.o

obj-m := $(O_TARGET)

Index: linus.17/include/linux/reiserfs_fs.h
--- linus.17/include/linux/reiserfs_fs.h Thu, 13 Dec 2001 11:06:51 -0500 root (linux/k/d/7_reiserfs_f 1.2.2.1.2.1 644)
+++ linus.17(w)/include/linux/reiserfs_fs.h Fri, 11 Jan 2002 10:47:55 -0500 root (linux/k/d/7_reiserfs_f 1.2.2.1.2.1 644)
@@ -1918,6 +1918,9 @@

/* ioctl's command */
#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long)
+
+/* from writecache.c */
+int reiserfs_write_barrier(struct buffer_head *bh) ;

#endif /* _LINUX_REISER_FS_H */

Index: linus.17/fs/reiserfs/writecache.c
--- linus.17/fs/reiserfs/writecache.c Fri, 11 Jan 2002 11:37:11 -0500 root ()
+++ linus.17(w)/fs/reiserfs/writecache.c Fri, 11 Jan 2002 11:26:34 -0500 root (linux/L/d/15_writecache 644)
@@ -0,0 +1,21 @@
+#include <linux/fs.h>
+#include <linux/ide.h>
+
+/* insert a write barrier on the device for this buffer. All previous
+** requests will be on disk before the next request is
+*/
+int reiserfs_write_barrier(struct buffer_head *bh) {
+ ide_drive_t *drive ;
+ ide_driver_t *d ;
+
+ /* This won't help on lvm or scsi or software raid */
+ drive = get_info_ptr(bh->b_dev) ;
+ if (drive) {
+ d = drive->driver ;
+ if (d->flushcache && d->flushcache(drive)) {
+ printk (KERN_INFO "%s: Write Cache FAILED Flushing!\n",drive->name);
+ return -EIO ;
+ }
+ }
+ return 0 ;
+}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/