Re: balance_dirty_pages broken

Andrew Morton (akpm@digeo.com)
Wed, 16 Oct 2002 22:09:41 -0700


Doug Ledford wrote:
>
> Actually, I don't know if it's balance_dirty_pages fault or some other
> part of the kernel's fault, but it is broken here. Failure mode is that
> balance_dirty_pages would loop forever. Reason it would loop forever is
> because the ps struct had a negative entry for nr_dirty.

I've seen one other report of this. Something has gone wrong
with the page accounting. Could you please send me your
.config and a description of what filesytems you're using?

err. ramdisk? initrd? If so, does this fix?

drivers/block/rd.c | 173 +++++++++++++++++++++++------------------------------
fs/buffer.c | 18 +++--
kernel/ksyms.c | 1
3 files changed, 90 insertions(+), 102 deletions(-)

--- 2.5.43/drivers/block/rd.c~rd-cleanup Tue Oct 15 21:10:15 2002
+++ 2.5.43-akpm/drivers/block/rd.c Tue Oct 15 21:12:13 2002
@@ -45,12 +45,14 @@
#include <linux/config.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <asm/atomic.h>
+#include <linux/highmem.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev() */
+#include <linux/backing-dev.h>
+
#include <asm/uaccess.h>

/*
@@ -73,10 +75,7 @@ unsigned long initrd_start, initrd_end;
int initrd_below_start_ok;
#endif

-/* Various static variables go here. Most are used only in the RAM disk code.
- */
-
-static unsigned long rd_length[NUM_RAMDISKS]; /* Size of RAM disks in bytes */
+static unsigned long rd_length[NUM_RAMDISKS]; /* Size of RAM disks in bytes */
static struct gendisk *rd_disks[NUM_RAMDISKS];
static devfs_handle_t devfs_handle;
static struct block_device *rd_bdev[NUM_RAMDISKS];/* Protected device data */
@@ -87,7 +86,7 @@ static struct block_device *rd_bdev[NUM_
* architecture-specific setup routine (from the stored boot sector
* information).
*/
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE; /* Size of the RAM disks */
+
/*
* It would be very desiderable to have a soft-blocksize (that in the case
* of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because
@@ -101,26 +100,17 @@ int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
*/
int rd_blocksize = BLOCK_SIZE; /* blocksize of the RAM disks */

+/* Size of the RAM disks */
+int rd_size = (CONFIG_BLK_DEV_RAM_SIZE + (PAGE_CACHE_SIZE >> 10) - 1) &
+ (PAGE_CACHE_MASK >> 10);
+
/*
* Copyright (C) 2000 Linus Torvalds.
* 2000 Transmeta Corp.
* aops copied from ramfs.
*/
-static int ramdisk_readpage(struct file *file, struct page * page)
-{
- if (!PageUptodate(page)) {
- void *kaddr = kmap_atomic(page, KM_USER0);

- memset(kaddr, 0, PAGE_CACHE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- SetPageUptodate(page);
- }
- unlock_page(page);
- return 0;
-}
-
-static int ramdisk_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+static void wipe_page(struct page *page)
{
if (!PageUptodate(page)) {
void *kaddr = kmap_atomic(page, KM_USER0);
@@ -130,43 +120,26 @@ static int ramdisk_prepare_write(struct
kunmap_atomic(kaddr, KM_USER0);
SetPageUptodate(page);
}
- SetPageDirty(page);
- return 0;
}

-static int ramdisk_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+static int
+rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector, int minor)
{
- return 0;
-}
-
-static struct address_space_operations ramdisk_aops = {
- readpage: ramdisk_readpage,
- writepage: fail_writepage,
- prepare_write: ramdisk_prepare_write,
- commit_write: ramdisk_commit_write,
-};
-
-static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec,
- sector_t sector, int minor)
-{
- struct address_space * mapping;
+ struct address_space *mapping;
unsigned long index;
unsigned int vec_offset;
int offset, size, err;

err = 0;
mapping = rd_bdev[minor]->bd_inode->i_mapping;
-
index = sector >> (PAGE_CACHE_SHIFT - 9);
offset = (sector << 9) & ~PAGE_CACHE_MASK;
size = vec->bv_len;
vec_offset = vec->bv_offset;

do {
+ struct page *page;
int count;
- struct page * page;
- char * src, * dst;
- int unlock = 0;

count = PAGE_CACHE_SIZE - offset;
if (count > size)
@@ -176,53 +149,39 @@ static int rd_blkdev_pagecache_IO(int rw
page = find_get_page(mapping, index);
if (!page) {
page = grab_cache_page(mapping, index);
- err = -ENOMEM;
- if (!page)
+ if (!page) {
+ err = -ENOMEM;
goto out;
- err = 0;
-
- if (!PageUptodate(page)) {
- void *kaddr = kmap_atomic(page, KM_USER0);
-
- memset(kaddr, 0, PAGE_CACHE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- SetPageUptodate(page);
}
-
- unlock = 1;
+ wipe_page(page);
+ set_page_dirty(page);
+ unlock_page(page);
}
+ if (page != vec->bv_page || vec_offset != offset) {
+ if (rw == READ) {
+ char *src = kmap_atomic(page, KM_USER0);
+ char *dst = kmap_atomic(vec->bv_page, KM_USER1);
+
+ memcpy(dst + vec_offset, src + offset, count);
+ flush_dcache_page(vec->bv_page);
+ kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(dst, KM_USER1);
+ } else {
+ char *src = kmap_atomic(vec->bv_page, KM_USER0);
+ char *dst = kmap_atomic(page, KM_USER1);

- index++;
-
- if (rw == READ) {
- src = kmap(page);
- src += offset;
- dst = kmap(vec->bv_page) + vec_offset;
- } else {
- dst = kmap(page);
- dst += offset;
- src = kmap(vec->bv_page) + vec_offset;
+ memcpy(dst + offset, src + vec_offset, count);
+ flush_dcache_page(page);
+ kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(dst, KM_USER1);
+ }
}
+ page_cache_release(page);
offset = 0;
vec_offset += count;
-
- memcpy(dst, src, count);
-
- kunmap(page);
- kunmap(vec->bv_page);
-
- if (rw == READ) {
- flush_dcache_page(sbh->b_page);
- } else {
- SetPageDirty(page);
- }
- if (unlock)
- unlock_page(page);
- __free_page(page);
+ index++;
} while (size);
-
- out:
+out:
return err;
}

@@ -324,7 +283,6 @@ static ssize_t initrd_read(struct file *
return count;
}

-
static int initrd_release(struct inode *inode,struct file *file)
{
extern void free_initrd_mem(unsigned long, unsigned long);
@@ -343,14 +301,19 @@ static int initrd_release(struct inode *
return 0;
}

-
static struct file_operations initrd_fops = {
- read: initrd_read,
- release: initrd_release,
+ .read = initrd_read,
+ .release = initrd_release,
};

#endif

+struct address_space_operations ramdisk_aops;
+
+static struct backing_dev_info rd_backing_dev_info = {
+ .ra_pages = 0, /* No readahead */
+ .memory_backed = 1, /* Does not contribute to dirty memory */
+};

static int rd_open(struct inode * inode, struct file * filp)
{
@@ -375,22 +338,25 @@ static int rd_open(struct inode * inode,
* Immunize device against invalidate_buffers() and prune_icache().
*/
if (rd_bdev[unit] == NULL) {
- rd_bdev[unit] = bdget(kdev_t_to_nr(inode->i_rdev));
- rd_bdev[unit]->bd_openers++;
- rd_bdev[unit]->bd_block_size = rd_blocksize;
- rd_bdev[unit]->bd_inode->i_mapping->a_ops = &ramdisk_aops;
- rd_bdev[unit]->bd_inode->i_size = rd_length[unit];
- rd_bdev[unit]->bd_queue = &blk_dev[MAJOR_NR].request_queue;
- rd_bdev[unit]->bd_disk = get_disk(rd_disks[unit]);
+ struct block_device *b = bdget(kdev_t_to_nr(inode->i_rdev));
+
+ rd_bdev[unit] = b;
+ b->bd_openers++;
+ b->bd_block_size = rd_blocksize;
+ b->bd_inode->i_mapping->a_ops = &ramdisk_aops;
+ b->bd_inode->i_mapping->backing_dev_info = &rd_backing_dev_info;
+ b->bd_inode->i_size = rd_length[unit];
+ b->bd_queue = &blk_dev[MAJOR_NR].request_queue;
+ b->bd_disk = get_disk(rd_disks[unit]);
}

return 0;
}

static struct block_device_operations rd_bd_op = {
- owner: THIS_MODULE,
- open: rd_open,
- ioctl: rd_ioctl,
+ .owner = THIS_MODULE,
+ .open = rd_open,
+ .ioctl = rd_ioctl,
};

/* Before freeing the module, invalidate all of the protected buffers! */
@@ -416,6 +382,19 @@ static void __exit rd_cleanup (void)
unregister_blkdev( MAJOR_NR, "ramdisk" );
}

+/*
+ * If someone writes a ramdisk page with submit_bh(), we have a dirty page
+ * with clean buffers. try_to_free_buffers() will then propagate the buffer
+ * cleanness up into page-cleaness and the VM will evict the page.
+ *
+ * To stop that happening, the ramdisk address_space has a ->releasepage()
+ * which always fails.
+ */
+static int fail_releasepage(struct page *page, int offset)
+{
+ return 0;
+}
+
/* This is the registration and initialization section of the RAM disk driver */
static int __init rd_init (void)
{
@@ -428,6 +407,9 @@ static int __init rd_init (void)
rd_blocksize);
rd_blocksize = BLOCK_SIZE;
}
+ ramdisk_aops = def_blk_aops;
+ ramdisk_aops.writepage = fail_writepage;
+ ramdisk_aops.releasepage = fail_releasepage;

#ifdef CONFIG_BLK_DEV_INITRD
initrd_disk = alloc_disk(1);
@@ -519,9 +501,8 @@ __setup("ramdisk_blocksize=", ramdisk_bl
#endif

/* options - modular */
-MODULE_PARM (rd_size, "1i");
+MODULE_PARM(rd_size, "1i");
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
MODULE_PARM (rd_blocksize, "i");
MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
-
MODULE_LICENSE("GPL");
--- 2.5.43/fs/buffer.c~rd-cleanup Tue Oct 15 21:10:15 2002
+++ 2.5.43-akpm/fs/buffer.c Tue Oct 15 21:10:15 2002
@@ -388,14 +388,21 @@ __find_get_block_slow(struct block_devic
head = page_buffers(page);
bh = head;
do {
- if (bh->b_blocknr == block) {
+ if (bh->b_blocknr == block && buffer_mapped(bh)) {
ret = bh;
get_bh(bh);
goto out_unlock;
}
bh = bh->b_this_page;
} while (bh != head);
- buffer_error();
+ /*
+ * This path can happen if the page had some unmapped buffers, which
+ * will have b_blocknr == -1. When a ramdisk mapping's page was brought
+ * partially uptodate by mkfs and unmap_underlying_metadata searches
+ * for blocks in part of the page which wasn't touched by mkfs.
+ *
+ * buffer_error();
+ */
out_unlock:
spin_unlock(&bd_mapping->private_lock);
page_cache_release(page);
@@ -1463,7 +1470,7 @@ int try_to_release_page(struct page *pag
* @offset: the index of the truncation point
*
* block_invalidatepage() is called when all or part of the page has become
- * invalidatedby a truncate operation.
+ * invalidated by a truncate operation.
*
* block_invalidatepage() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
@@ -1617,8 +1624,6 @@ static int __block_write_full_page(struc
last_block = (inode->i_size - 1) >> inode->i_blkbits;

if (!page_has_buffers(page)) {
- if (S_ISBLK(inode->i_mode))
- buffer_error();
if (!PageUptodate(page))
buffer_error();
create_empty_buffers(page, 1 << inode->i_blkbits,
@@ -2605,7 +2610,8 @@ void free_buffer_head(struct buffer_head
}
EXPORT_SYMBOL(free_buffer_head);

-static void init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void
+init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
{
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
--- 2.5.43/kernel/ksyms.c~rd-cleanup Tue Oct 15 21:10:15 2002
+++ 2.5.43-akpm/kernel/ksyms.c Tue Oct 15 21:10:15 2002
@@ -138,6 +138,7 @@ EXPORT_SYMBOL(page_address);
EXPORT_SYMBOL(get_user_pages);

/* filesystem internal functions */
+EXPORT_SYMBOL_GPL(def_blk_aops);
EXPORT_SYMBOL(def_blk_fops);
EXPORT_SYMBOL(update_atime);
EXPORT_SYMBOL(get_fs_type);

.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/