[patch] write() throttling fix

Andrew Morton (akpm@zip.com.au)
Tue, 27 Aug 2002 21:31:52 -0700


The patch fixes a few problems in the writer throttling code. Mainly
in the situation where a single large file is being written out.

That file could be parked on sb->locked_inodes due to pdflush
writeback, and the writer throttling path coming out of
balance_dirty_pages() forgot to look for inodes on ->locked_inodes.

The net effect was that the amount of dirty memory was exceeding the
limit set in /proc/sys/vm/dirty_async_ratio, possibly to the point
where the system gets seriously choked.

The patch removes sb->locked_inodes altogether and teaches the
throttling code to look or inodes on sb->s_io as well as sb->s_dirty.

Also, just leave unwritten dirty pages on mapping->io_pages, and
unwritten dirty inodes on sb->s_io. Putting them back onto
->dirty_pages and ->dirty_inodes was fairly pointless, given that both
lists need to be looked at.

fs/fs-writeback.c | 42 +++++++++++++++---------------------------
fs/inode.c | 6 ------
fs/mpage.c | 3 +--
fs/super.c | 1 -
include/linux/fs.h | 1 -
5 files changed, 16 insertions(+), 37 deletions(-)

--- 2.5.32/fs/fs-writeback.c~throttling-fix Tue Aug 27 21:30:51 2002
+++ 2.5.32-akpm/fs/fs-writeback.c Tue Aug 27 21:30:51 2002
@@ -134,8 +134,6 @@ static void __sync_single_inode(struct i
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;

- list_move(&inode->i_list, &sb->s_locked_inodes);
-
BUG_ON(inode->i_state & I_LOCK);

/* Set I_LOCK, reset I_DIRTY */
@@ -163,12 +161,12 @@ static void __sync_single_inode(struct i
if (inode->i_state & I_DIRTY) { /* Redirtied */
list_add(&inode->i_list, &sb->s_dirty);
} else {
- if (!list_empty(&mapping->dirty_pages)) {
+ if (!list_empty(&mapping->dirty_pages) ||
+ !list_empty(&mapping->io_pages)) {
/* Not a whole-file writeback */
mapping->dirtied_when = orig_dirtied_when;
inode->i_state |= I_DIRTY_PAGES;
- list_add_tail(&inode->i_list,
- &sb->s_dirty);
+ list_add_tail(&inode->i_list, &sb->s_dirty);
} else if (atomic_read(&inode->i_count)) {
list_add(&inode->i_list, &inode_in_use);
} else {
@@ -205,7 +203,7 @@ __writeback_single_inode(struct inode *i
* If older_than_this is non-NULL, then only write out mappings which
* had their first dirtying at a time earlier than *older_than_this.
*
- * If we're a pdlfush thread, then implement pdlfush collision avoidance
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
* against the entire list.
*
* WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
@@ -221,6 +219,11 @@ __writeback_single_inode(struct inode *i
* FIXME: this linear search could get expensive with many fileystems. But
* how to fix? We need to go from an address_space to all inodes which share
* a queue with that address_space.
+ *
+ * The inodes to be written are parked on sb->s_io. They are moved back onto
+ * sb->s_dirty as they are selected for writing. This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * thrlttled threads: we don't want them all piling up on __wait_on_inode.
*/
static void
sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
@@ -241,7 +244,7 @@ sync_sb_inodes(struct backing_dev_info *
if (single_bdi && mapping->backing_dev_info != single_bdi) {
if (sb != blockdev_superblock)
break; /* inappropriate superblock */
- list_move(&inode->i_list, &inode->i_sb->s_dirty);
+ list_move(&inode->i_list, &sb->s_dirty);
continue; /* not this blockdev */
}

@@ -263,10 +266,11 @@ sync_sb_inodes(struct backing_dev_info *

BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
+ list_move(&inode->i_list, &sb->s_dirty);
__writeback_single_inode(inode, really_sync, nr_to_write);
if (sync_mode == WB_SYNC_HOLD) {
mapping->dirtied_when = jiffies;
- list_move(&inode->i_list, &inode->i_sb->s_dirty);
+ list_move(&inode->i_list, &sb->s_dirty);
}
if (current_is_pdflush())
writeback_release(bdi);
@@ -278,9 +282,8 @@ sync_sb_inodes(struct backing_dev_info *
}
out:
/*
- * Put the rest back, in the correct order.
+ * Leave any unwritten inodes on s_io.
*/
- list_splice_init(&sb->s_io, sb->s_dirty.prev);
return;
}

@@ -302,7 +305,7 @@ __writeback_unlocked_inodes(struct backi
spin_lock(&sb_lock);
sb = sb_entry(super_blocks.prev);
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
- if (!list_empty(&sb->s_dirty)) {
+ if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
spin_unlock(&sb_lock);
sync_sb_inodes(bdi, sb, sync_mode, nr_to_write,
older_than_this);
@@ -321,7 +324,7 @@ __writeback_unlocked_inodes(struct backi
* Note:
* We don't need to grab a reference to superblock here. If it has non-empty
* ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are
+ * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
* empty. Since __sync_single_inode() regains inode_lock before it finally moves
* inode from superblock lists we are OK.
*
@@ -352,19 +355,6 @@ void writeback_backing_dev(struct backin
sync_mode, older_than_this);
}

-static void __wait_on_locked(struct list_head *head)
-{
- struct list_head * tmp;
- while ((tmp = head->prev) != head) {
- struct inode *inode = list_entry(tmp, struct inode, i_list);
- __iget(inode);
- spin_unlock(&inode_lock);
- __wait_on_inode(inode);
- iput(inode);
- spin_lock(&inode_lock);
- }
-}
-
/*
* writeback and wait upon the filesystem's dirty inodes. The caller will
* do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
@@ -384,8 +374,6 @@ void sync_inodes_sb(struct super_block *
spin_lock(&inode_lock);
sync_sb_inodes(NULL, sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
&nr_to_write, NULL);
- if (wait)
- __wait_on_locked(&sb->s_locked_inodes);
spin_unlock(&inode_lock);
}

--- 2.5.32/fs/mpage.c~throttling-fix Tue Aug 27 21:30:51 2002
+++ 2.5.32-akpm/fs/mpage.c Tue Aug 27 21:30:51 2002
@@ -599,9 +599,8 @@ mpage_writepages(struct address_space *m
write_lock(&mapping->page_lock);
}
/*
- * Put the rest back, in the correct order.
+ * Leave any remaining dirty pages on ->io_pages
*/
- list_splice_init(&mapping->io_pages, mapping->dirty_pages.prev);
write_unlock(&mapping->page_lock);
pagevec_deactivate_inactive(&pvec);
if (bio)
--- 2.5.32/include/linux/fs.h~throttling-fix Tue Aug 27 21:30:51 2002
+++ 2.5.32-akpm/include/linux/fs.h Tue Aug 27 21:30:51 2002
@@ -655,7 +655,6 @@ struct super_block {

struct list_head s_dirty; /* dirty inodes */
struct list_head s_io; /* parked for writeback */
- struct list_head s_locked_inodes;/* inodes being synced */
struct list_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;

--- 2.5.32/fs/inode.c~throttling-fix Tue Aug 27 21:30:51 2002
+++ 2.5.32-akpm/fs/inode.c Tue Aug 27 21:30:51 2002
@@ -323,7 +323,6 @@ int invalidate_inodes(struct super_block
busy |= invalidate_list(&inode_unused, sb, &throw_away);
busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
busy |= invalidate_list(&sb->s_io, sb, &throw_away);
- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away);
spin_unlock(&inode_lock);

dispose_list(&throw_away);
@@ -1000,11 +999,6 @@ void remove_dquot_ref(struct super_block
inode = list_entry(act_head, struct inode, i_list);
if (IS_QUOTAINIT(inode))
remove_inode_dquot_ref(inode, type, &tofree_head);
- }
- list_for_each(act_head, &sb->s_locked_inodes) {
- inode = list_entry(act_head, struct inode, i_list);
- if (IS_QUOTAINIT(inode))
- remove_inode_dquot_ref(inode, type, &tofree_head);
}
spin_unlock(&inode_lock);
unlock_kernel();
--- 2.5.32/fs/super.c~throttling-fix Tue Aug 27 21:30:51 2002
+++ 2.5.32-akpm/fs/super.c Tue Aug 27 21:30:51 2002
@@ -58,7 +58,6 @@ static struct super_block *alloc_super(v
}
INIT_LIST_HEAD(&s->s_dirty);
INIT_LIST_HEAD(&s->s_io);
- INIT_LIST_HEAD(&s->s_locked_inodes);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_LIST_HEAD(&s->s_anon);

.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/