bio_append patch

Adam J. Richter (adam@yggdrasil.com)
Sat, 22 Jun 2002 08:04:17 -0700


Here is a first draft of my bio_append patch. It includes
a rewrite of most of mpage.c.
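
For anyone skimming before reading the diff, the basic calling
convention of the new interface is roughly the following (an
illustrative sketch condensed from the rewritten ll_rw_kio(); the
function name, end_io callback and "private" cookie here are made up
for illustration, and error handling is omitted):

/* Illustrative sketch only -- not part of the patch. */
static void example_rw_pages(int rw, struct block_device *bdev,
			     sector_t sector, struct page **pages,
			     int nr_pages, void (*end_io)(struct bio *),
			     void *private)
{
	struct bio *bio = bio_alloc(GFP_NOIO, nr_pages);
	struct bio_vec bvec;
	int i;

	bio->bi_sector = sector;
	bio->bi_bdev = bdev;		/* bio_append needs bd_queue */
	bio->bi_end_io = end_io;	/* called once per submitted bio */
	bio->bi_private = private;
	bio->bi_rw = rw;

	for (i = 0; i < nr_pages; i++) {
		bvec.bv_page = pages[i];
		bvec.bv_offset = 0;
		bvec.bv_len = PAGE_SIZE;
		/* bio_append copies the bvec in; if the bio is full, it
		   submits the current bio and replaces it with an empty
		   one continuing at the next sector. */
		bio_append(&bio, &bvec, GFP_NOIO);
	}
	submit_bio(rw, bio);		/* submit whatever is left over */
}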

The new mpage routines never punt to the fs/buffer.c routines.
So, it may be possible to rename them to their fs/buffer.c counterparts
and delete the fs/buffer.c versions. Also, the new mpage routines
notice if you are trying to write ZERO_PAGE to a hole in a file and
will leave the hole in place in that case rather than allocate space
on disk unnecessarily.
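
For reference, the check that preserves holes is roughly this
(quoting the new mpage_xferpage() from the patch below):

	if (rw == WRITE && page == ZERO_PAGE(NULL) &&
	    !get_block(inode, block_in_file, bh, 0) &&
	    !buffer_mapped(bh)) {
		/* Writing all zeroes over a hole: leave it unallocated. */
		clear_buffer_dirty(bh);
		continue;
	}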

I know the code needs a little clarification in the form of
documentation and perhaps some symbol renaming.

Any comments, testing or benchmarking would be appreciated.
(I sure would like to know how much of a benefit, if any, we get from
building bigger IO requests.)

Note: I believe I have seen 2.5.24 without these changes lock up
under one process's constant IDE disk activity. So if you see a "soft
lockup", it would be helpful to check whether you can reproduce it
without this patch and whether it goes away with the patch applied.

Adam J. Richter __ ______________ 575 Oroville Road
adam@yggdrasil.com \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."

--- linux-2.5.24/include/linux/bio.h 2002-06-20 15:53:55.000000000 -0700
+++ linux/include/linux/bio.h 2002-06-20 06:36:16.000000000 -0700
@@ -37,8 +37,7 @@
#define BIO_BUG_ON
#endif

-#define BIO_MAX_SECTORS 128
-#define BIO_MAX_SIZE (BIO_MAX_SECTORS << 9)
+#define BIO_MAX_VECS 256

/*
* was unsigned short, but we might as well be ready for > 64kB I/O pages
@@ -78,6 +77,7 @@
* hardware coalescing is performed.
*/
unsigned short bi_hw_segments;
+ unsigned int bi_last_seg_size;

unsigned int bi_size; /* residual I/O count */
unsigned int bi_max; /* max bvl_vecs we can hold,
@@ -196,11 +196,14 @@
struct request_queue;
extern inline int bio_phys_segments(struct request_queue *, struct bio *);
extern inline int bio_hw_segments(struct request_queue *, struct bio *);
+extern int bio_max_iovecs(struct request_queue *q, int *iovec_size);

extern inline void __bio_clone(struct bio *, struct bio *);
extern struct bio *bio_clone(struct bio *, int);
extern struct bio *bio_copy(struct bio *, int, int);

+extern void bio_recycle (struct bio **bio, int gfp_mask);
+extern void bio_append(struct bio **bio_p, struct bio_vec *bv, int gfp_mask);
extern inline void bio_init(struct bio *);

extern int bio_ioctl(kdev_t, unsigned int, unsigned long);
--- linux-2.5.24/include/linux/blkdev.h 2002-06-20 15:53:47.000000000 -0700
+++ linux/include/linux/blkdev.h 2002-06-19 03:54:33.000000000 -0700
@@ -266,16 +266,26 @@
unsigned block_size_bits;
};

+struct partition_ops {
+ int (*check)(struct gendisk *hd,
+ struct block_device *bdev,
+ unsigned long first_sect,
+ int first_minor);
+ struct partition_ops *next;
+};
+
/*
* Used to indicate the default queue for drivers that don't bother
* to implement multiple queues. We have this access macro here
* so as to eliminate the need for each and every block device
* driver to know about the internal structure of blk_dev[].
*/
-#define BLK_DEFAULT_QUEUE(_MAJOR) &blk_dev[_MAJOR].request_queue
+#define BLK_DEFAULT_QUEUE(_MAJOR) (&blk_dev[_MAJOR].request_queue)

extern struct sec_size * blk_sec[MAX_BLKDEV];
extern struct blk_dev_struct blk_dev[MAX_BLKDEV];
+extern void register_partition_ops(struct partition_ops *ops);
+extern void unregister_partition_ops(struct partition_ops *ops);
extern void grok_partitions(kdev_t dev, long size);
extern int wipe_partitions(kdev_t dev);
extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size);
@@ -287,6 +297,10 @@
extern void blk_put_request(struct request *);
extern void blk_plug_device(request_queue_t *);
extern int blk_remove_plug(request_queue_t *);
+struct bio_vec;
+extern void blk_tally_bvecs(request_queue_t *q, struct bio *bio,
+ struct bio_vec *bvprv,
+ struct bio_vec *bv, int bv_count);
extern void blk_recount_segments(request_queue_t *, struct bio *);
extern inline int blk_phys_contig_segment(request_queue_t *q, struct bio *, struct bio *);
extern inline int blk_hw_contig_segment(request_queue_t *q, struct bio *, struct bio *);
@@ -410,4 +424,6 @@
page_cache_release(p.v);
}

+extern int bio_max_iovecs(request_queue_t *q, int *iovec_size);
+
#endif
--- linux-2.5.24/fs/bio.c 2002-06-20 15:53:43.000000000 -0700
+++ linux/fs/bio.c 2002-06-22 07:30:21.000000000 -0700
@@ -92,6 +92,13 @@
return bvl;
}

+/* Must correspond to the switch statement in bvec_alloc. */
+static inline int bi_max_to_size(int bi_max)
+{
+ static unsigned short bi_to_size[] = { 1, 4, 16, 64, 128, 256 };
+ return bi_to_size[bi_max];
+}
+
/*
* default destructor for a bio allocated with bio_alloc()
*/
@@ -118,6 +125,7 @@
bio->bi_idx = 0;
bio->bi_phys_segments = 0;
bio->bi_hw_segments = 0;
+ bio->bi_last_seg_size = 0;
bio->bi_size = 0;
bio->bi_end_io = NULL;
atomic_set(&bio->bi_cnt, 1);
@@ -317,6 +325,94 @@
return NULL;
}

+void bio_recycle (struct bio **bio, int gfp_mask)
+{
+ struct bio *old = *bio;
+ struct bio *newbio = bio_alloc(gfp_mask, bi_max_to_size(old->bi_max));
+
+ newbio->bi_sector = old->bi_sector + (old->bi_size >> 9);
+
+#define COPY(field) newbio->bi_ ## field = old->bi_ ## field
+ COPY(bdev);
+ COPY(rw);
+ COPY(end_io);
+ COPY(private);
+#undef COPY
+
+ submit_bio(old->bi_rw, old);
+ *bio = newbio;
+}
+
+
+/* bio_append appends an IO vector to a bio. If there is no room
+ in the bio, the bio will first be submitted and replaced with an
+ empty one.
+
+ bio_append expects to be called with bio->bi_vcnt indicating the
+ number of IO vectors already loaded. If the provided bio is
+ already full, bio_append will submit the current bio and allocate
+ a new one. */
+
+void bio_append(struct bio **bio_p, struct bio_vec *bv, int gfp_mask)
+{
+ struct bio *bio = *bio_p;
+ request_queue_t *q = bio->bi_bdev->bd_queue;
+
+ BUG_ON(q == NULL);
+ BUG_ON(bio->bi_io_vec == NULL);
+
+ if (bio->bi_vcnt != 0) {
+
+ if (bio->bi_vcnt < bi_max_to_size(bio->bi_max) &&
+ ((bio->bi_size + bv->bv_len) >> 9) <= q->max_sectors) {
+
+ unsigned int phys_segs = bio->bi_phys_segments;
+ unsigned int hw_segs = bio->bi_hw_segments;
+ unsigned int seg_size = bio->bi_last_seg_size;
+
+ blk_tally_bvecs(q, bio,
+ &bio->bi_io_vec[bio->bi_vcnt-1], bv, 1);
+
+ if (bio->bi_phys_segments <= q->max_phys_segments &&
+ bio->bi_hw_segments <= q->max_hw_segments)
+ goto goodbio;
+
+ bio->bi_phys_segments = phys_segs;
+ bio->bi_hw_segments = hw_segs;
+ bio->bi_last_seg_size = seg_size;
+
+ }
+ bio_recycle(&bio, gfp_mask);
+ *bio_p = bio;
+ }
+
+ blk_tally_bvecs(q, bio, NULL, bv, 1);
+ bio->bi_flags |= (1 << BIO_SEG_VALID);
+
+ goodbio:
+ bio->bi_io_vec[bio->bi_vcnt++] = *bv;
+ bio->bi_size += bv->bv_len;
+}
+
+int bio_max_iovecs(request_queue_t *q, int *iovec_size)
+{
+ unsigned max_iovecs = min(q->max_phys_segments, q->max_hw_segments);
+
+ if (q->max_segment_size != 0 && *iovec_size > q->max_segment_size)
+ *iovec_size = q->max_segment_size;
+
+ if (q->max_sectors != 0) {
+ unsigned int max_bytes = q->max_sectors << 9;
+ if (*iovec_size > max_bytes) {
+ *iovec_size = max_bytes;
+ return 1;
+ }
+ max_iovecs = min(max_iovecs, max_bytes / *iovec_size);
+ }
+
+ return max_iovecs;
+}
+
static void bio_end_io_kio(struct bio *bio)
{
struct kiobuf *kio = (struct kiobuf *) bio->bi_private;
@@ -339,9 +435,10 @@
**/
void ll_rw_kio(int rw, struct kiobuf *kio, struct block_device *bdev, sector_t sector)
{
- int i, offset, size, err, map_i, total_nr_pages, nr_pages;
- struct bio_vec *bvec;
+ int offset, size, err, map_i, total_nr_pages, nr_bvecs;
+ struct bio_vec bvec;
struct bio *bio;
+ int bytes_per_bvec, max_bvecs;

err = 0;
if ((rw & WRITE) && bdev_read_only(bdev)) {
@@ -367,17 +464,18 @@

map_i = 0;

-next_chunk:
- nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - 9);
- if (nr_pages > total_nr_pages)
- nr_pages = total_nr_pages;
+ max_bvecs = bio_max_iovecs(bdev->bd_queue, &bytes_per_bvec);
+
+ nr_bvecs = max_bvecs;
+ if (nr_bvecs > total_nr_pages)
+ nr_bvecs = total_nr_pages;

atomic_inc(&kio->io_count);

/*
* allocate bio and do initial setup
*/
- if ((bio = bio_alloc(GFP_NOIO, nr_pages)) == NULL) {
+ if ((bio = bio_alloc(GFP_NOIO, nr_bvecs)) == NULL) {
err = -ENOMEM;
goto out;
}
@@ -387,43 +485,37 @@
bio->bi_idx = 0;
bio->bi_end_io = bio_end_io_kio;
bio->bi_private = kio;
+ bio->bi_rw = rw;

- bvec = bio->bi_io_vec;
- for (i = 0; i < nr_pages; i++, bvec++, map_i++) {
+ while (total_nr_pages > 0) {
int nbytes = PAGE_SIZE - offset;
-
+
if (nbytes > size)
nbytes = size;
+ if (nbytes > bytes_per_bvec)
+ nbytes = bytes_per_bvec;

BUG_ON(kio->maplist[map_i] == NULL);

- if (bio->bi_size + nbytes > (BIO_MAX_SECTORS << 9))
- goto queue_io;
-
- bio->bi_vcnt++;
- bio->bi_size += nbytes;
-
- bvec->bv_page = kio->maplist[map_i];
- bvec->bv_len = nbytes;
- bvec->bv_offset = offset;
-
- /*
- * kiobuf only has an offset into the first page
- */
- offset = 0;
+ bvec.bv_page = kio->maplist[map_i];
+ bvec.bv_offset = offset;
+ bvec.bv_len = nbytes;
+
+ bio_append(&bio, &bvec, GFP_NOIO);
+
+ offset = (offset + nbytes) & PAGE_MASK;
+ if (offset == 0) {
+ total_nr_pages--;
+ map_i++;
+ }

sector += nbytes >> 9;
size -= nbytes;
- total_nr_pages--;
kio->offset += nbytes;
}

-queue_io:
submit_bio(rw, bio);

- if (total_nr_pages)
- goto next_chunk;
-
if (size) {
printk("ll_rw_kio: size %d left (kio %d)\n", size, kio->length);
BUG();
--- linux-2.5.24/fs/mpage.c 2002-06-20 15:53:47.000000000 -0700
+++ linux/fs/mpage.c 2002-06-22 07:48:57.000000000 -0700
@@ -2,12 +2,19 @@
* fs/mpage.c
*
* Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2001, Jens Axboe
+ * Copyright (C) 2002, Yggdrasil Computing, Inc.
*
* Contains functions related to preparing and submitting BIOs which contain
* multiple pagecache pages.
*
* 15May2002 akpm@zip.com.au
* Initial version
+ *
+ * 21June2002 adam@yggdrasil.com
+ * Rewrote at least half of it, in order to safely build
+ * large IO transfers and to support the cases that
+ * fs/buffer.c handled.
*/

#include <linux/kernel.h>
@@ -19,12 +26,40 @@
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/init.h>
+#include <asm/atomic.h>
+
+struct mpage_private {
+ atomic_t bvec_count;
+ atomic_t uptodate;
+};

-/*
- * The largest-sized BIO which this code will assemble, in bytes. Set this
- * to PAGE_CACHE_SIZE if your drivers are broken.
- */
-#define MPAGE_BIO_MAX_SIZE BIO_MAX_SIZE
+static mempool_t *mpage_priv_pool;
+static kmem_cache_t *mpage_priv_slab;
+
+#define MPAGE_PRIV_POOL_SIZE 100
+
+static void
+bvec_io_error(struct bio_vec *bvec)
+{
+ if (page_has_buffers(bvec->bv_page)) {
+ const int bv_end = bvec->bv_offset + bvec->bv_len;
+ int buf_offset = 0;
+ struct buffer_head *bh = page_buffers(bvec->bv_page);
+
+ while (buf_offset + bh->b_size <= bvec->bv_offset) {
+ buf_offset += bh->b_size;
+ bh = bh->b_this_page;
+ }
+ while (buf_offset < bv_end) {
+ clear_buffer_uptodate(bh);
+ buf_offset += bh->b_size;
+ bh = bh->b_this_page;
+ }
+ }
+}

/*
* I/O completion handler for multipage BIOs.
@@ -38,81 +73,298 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io_read(struct bio *bio)
+static void mpage_end_io(struct bio *bio)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec;
+ struct mpage_private *priv = bio->bi_private;
+ struct page *prevpage = NULL;
+ int rw;
+
+ if (!uptodate) {
+ int i;
+ bio_for_each_segment(bvec, bio, i) {
+ bvec_io_error(bvec);
+ }
+
+ if (priv != NULL)
+ atomic_set(&priv->uptodate, 0);
+ }

+ if (priv != NULL) {
+ if (!atomic_sub_and_test(bio->bi_vcnt, &priv->bvec_count))
+ return;
+ uptodate = atomic_read(&priv->uptodate);
+ mempool_free(priv, mpage_priv_pool);
+ }
+
+ rw = bio->bi_rw;
+ bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
do {
struct page *page = bvec->bv_page;

if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);

- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
+ if (page != prevpage) {
+ if (rw == READ) {
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+ else {
+ if (!uptodate)
+ SetPageError(page);
+ end_page_writeback(page);
+ }
}
- unlock_page(page);
} while (bvec >= bio->bi_io_vec);
+
bio_put(bio);
}

-static void mpage_end_io_write(struct bio *bio)
+static struct bio *
+mpage_alloc(struct address_space *mapping, int nr_pages, int rw)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct inode *inode = mapping->host;
+ int blocks_per_page;
+ struct bio *bio;

- do {
- struct page *page = bvec->bv_page;
+ if (inode->i_blkbits >= PAGE_CACHE_SHIFT)
+ blocks_per_page = 1;
+ else
+ blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;

- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
+ bio = bio_alloc(GFP_KERNEL,
+ min(nr_pages * blocks_per_page, BIO_MAX_VECS));

- if (!uptodate)
- SetPageError(page);
- end_page_writeback(page);
- } while (bvec >= bio->bi_io_vec);
- bio_put(bio);
+ /* We rely on bio_alloc clearing bi_idx, bi_vcnt and bi_size. */
+ bio->bi_bdev =
+ S_ISBLK(inode->i_mode) ? inode->i_bdev : inode->i_sb->s_bdev;
+
+ bio->bi_end_io = mpage_end_io;
+ bio->bi_private = NULL;
+ bio->bi_rw = rw;
+
+ return bio;
}

-struct bio *mpage_bio_submit(int rw, struct bio *bio)
+static inline void
+mpage_add_block(struct bio **bio_p,
+ struct bio_vec *bvec,
+ int offset,
+ int length,
+ sector_t sector)
{
- bio->bi_vcnt = bio->bi_idx;
- bio->bi_idx = 0;
- bio->bi_end_io = mpage_end_io_read;
- if (rw == WRITE)
- bio->bi_end_io = mpage_end_io_write;
- submit_bio(rw, bio);
- return NULL;
+ struct mpage_private *priv;
+ sector_t contig;
+ struct bio *old = *bio_p;
+
+ /* Deal with empty bvec and empty bio. */
+ if (bvec->bv_len == 0) {
+ bvec->bv_offset = offset;
+ if (old->bi_vcnt == 0)
+ old->bi_sector = sector;
+ }
+
+ contig = old->bi_sector + ((old->bi_size + bvec->bv_len)>>9);
+
+ if (sector == contig &&
+ bvec->bv_offset + bvec->bv_len == offset &&
+ (bvec->bv_len + length <=
+ old->bi_bdev->bd_queue->max_segment_size)) {
+
+ bvec->bv_len += length;
+ return;
+ }
+
+ /* We are definitely going to use multiple bvec's for this
+ page, which can possibly mean multiple bio's, either due
+ to bio_append possibly splitting the bio or us explicitly
+ doing so due to a transfer that is not contiguous on the
+ block device. */
+
+ priv = old->bi_private;
+
+ if (priv == NULL) {
+
+ /* Send off the transfers for other pages, which
+ do not need to use old->bi_private (they must not
+ share ours anyhow). */
+
+ if (old->bi_vcnt != 0)
+ bio_recycle(bio_p, GFP_KERNEL);
+
+ /* Variable "old" is not valid after this point. */
+
+ priv = mempool_alloc(mpage_priv_pool, GFP_KERNEL);
+ (*bio_p)->bi_private = priv;
+ atomic_set(&priv->uptodate, 1);
+ atomic_set(&priv->bvec_count, 1);
+ }
+
+ if (bvec->bv_len != 0) {
+ atomic_inc(&priv->bvec_count);
+ bio_append(bio_p, bvec, GFP_KERNEL);
+ }
+
+ if (sector != contig) {
+ if ((*bio_p)->bi_vcnt != 0)
+ bio_recycle(bio_p, GFP_KERNEL);
+ (*bio_p)->bi_sector = sector;
+ }
+
+ bvec->bv_offset = offset;
+ bvec->bv_len = length;
}

static struct bio *
-mpage_alloc(struct block_device *bdev,
- sector_t first_sector, int nr_vecs, int gfp_flags)
+mpage_xferpage(struct bio *bio, struct page *page, get_block_t *get_block)
{
- struct bio *bio;
+ struct inode *inode = page->mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ unsigned num_blocks;
+ struct bio_vec bvec;
+ sector_t block_in_file;
+ unsigned offset;
+ struct buffer_head bh_local, *bh;
+ void *page_addr = NULL;
+ int sector_offset, page_within_block, xfer_size, xfer_total;
+ unsigned int stride;
+ int rw = bio->bi_rw;
+ const int create = (rw == WRITE);
+ request_queue_t *q = bio->bi_bdev->bd_queue;
+ sector_t sector;
+
+ if (page_has_buffers(page))
+ bh = page_buffers(page);
+ else {
+ /* Make bh keep pointing to bh_local throughout the loop. */
+ bh = &bh_local;
+ bh_local.b_this_page = &bh_local;
+ }
+
+ page_within_block = page->index & ((blocksize-1) >> PAGE_CACHE_SHIFT);
+ sector_offset = page_within_block << (PAGE_CACHE_SHIFT - 9);
+
+ if (blkbits <= PAGE_CACHE_SHIFT) {
+ stride = blocksize;
+ num_blocks = PAGE_CACHE_SIZE >> blkbits;
+ block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
+ } else {
+ stride = PAGE_CACHE_SIZE;
+ num_blocks = 1;
+ block_in_file = page->index >> (blkbits - PAGE_CACHE_SHIFT);
+ }
+
+ /* FIXME. Right now you cannot have a file system on a block
+ device where the minimum transfer size (i.e., hardsect_size)
+ is larger than either the file system block size, or
+ PAGE_CACHE_SIZE. However, I think adding such support would
+ require changing most of the Linux filesystem drivers.
+ -Adam J. Richter, 2002 June 22, */
+
+ BUG_ON(q->hardsect_size > stride);
+
+ xfer_size = min(q->max_segment_size, stride);
+
+ /* Avoid accidentally extending files and directories. */
+ if (rw == WRITE && !S_ISBLK(inode->i_mode)) {
+ sector_t end_block = (inode->i_size + blocksize - 1)>>blkbits;
+ if (block_in_file + num_blocks > end_block)
+ num_blocks = end_block - block_in_file;
+ }
+
+ bvec.bv_len = 0;
+ bvec.bv_page = page;
+ for (offset = 0; num_blocks--;
+ offset += stride, block_in_file++, bh = bh->b_this_page) {
+
+ bh_local.b_state = 0;
+
+ if (!buffer_mapped(bh)) {
+
+ /*
+ * Do not write all zeroes into file holes.
+ */
+ if (rw == WRITE && page == ZERO_PAGE(NULL) &&
+ !get_block(inode, block_in_file, bh, 0) &&
+ !buffer_mapped(bh)) {
+ clear_buffer_dirty(bh);
+ continue;
+ }
+

- bio = bio_alloc(gfp_flags, nr_vecs);
+ if (get_block(inode, block_in_file, bh, create)) {
+ SetPageError(page);
+ continue;
+ }
+
+ if (!buffer_mapped(bh)) {
+
+ BUG_ON(rw != READ);
+
+ if (page_addr == NULL)
+ page_addr = kmap(page);
+
+ memset(page_addr + offset, 0, stride);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ }
+
+ sector = (bh->b_blocknr << (blkbits - 9)) + sector_offset;
+
+ for(xfer_total = 0; xfer_total < stride;
+ xfer_total += xfer_size)
+ mpage_add_block(&bio, &bvec, offset + xfer_total,
+ xfer_size,
+ sector + (xfer_total >> 9));
+
+ if (rw == WRITE)
+ clear_buffer_dirty(bh);

- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
}

- if (bio) {
- bio->bi_bdev = bdev;
- bio->bi_vcnt = nr_vecs;
- bio->bi_idx = 0;
- bio->bi_size = 0;
- bio->bi_sector = first_sector;
- bio->bi_io_vec[0].bv_page = NULL;
+ if (bvec.bv_len == 0) {
+ /* A hole in the file covered the entire page. */
+ SetPageUptodate(page);
+ unlock_page(page);
+
+ } else {
+ if (rw == WRITE) {
+ BUG_ON(PageWriteback(page));
+ SetPageWriteback(page);
+ unlock_page(page);
+ }
+ bio_append(&bio, &bvec, GFP_KERNEL);
+ if (bio->bi_private) {
+ /* Multiple transfers on the same page must not be
+ merged with transfers for another page, so
+ submit them now. Also, this balances out the
+ initial priv->bvec_count == 1 that was set
+ by mpage_add_block. */
+
+ bio_recycle(&bio, GFP_KERNEL);
+ bio->bi_private = NULL;
+ }
}
+
+ if (page_addr != NULL)
+ kunmap(page);
+
+ if (buffer_boundary(bh) && bio->bi_vcnt != 0)
+ bio_recycle(&bio, GFP_KERNEL);
+
return bio;
}

+
/**
* mpage_readpages - populate an address space with some pages, and
* start reads against them.
@@ -159,336 +411,97 @@
*
* This all causes the disk requests to be issued in the correct order.
*/
-static struct bio *
-do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
- sector_t *last_block_in_bio, get_block_t get_block)
-{
- struct inode *inode = page->mapping->host;
- const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
- const unsigned blocksize = 1 << blkbits;
- struct bio_vec *bvec;
- sector_t block_in_file;
- sector_t last_block;
- sector_t blocks[MAX_BUF_PER_PAGE];
- unsigned page_block;
- unsigned first_hole = blocks_per_page;
- struct block_device *bdev = NULL;
- struct buffer_head bh;
-
- if (page_has_buffers(page))
- goto confused;
-
- block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
- last_block = (inode->i_size + blocksize - 1) >> blkbits;
-
- for (page_block = 0; page_block < blocks_per_page;
- page_block++, block_in_file++) {
- bh.b_state = 0;
- if (block_in_file < last_block) {
- if (get_block(inode, block_in_file, &bh, 0))
- goto confused;
- }
-
- if (!buffer_mapped(&bh)) {
- if (first_hole == blocks_per_page)
- first_hole = page_block;
- continue;
- }
-
- if (first_hole != blocks_per_page)
- goto confused; /* hole -> non-hole */
-
- /* Contiguous blocks? */
- if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
- goto confused;
- blocks[page_block] = bh.b_blocknr;
- bdev = bh.b_bdev;
- }
-
- if (first_hole != blocks_per_page) {
- memset(kmap(page) + (first_hole << blkbits), 0,
- PAGE_CACHE_SIZE - (first_hole << blkbits));
- flush_dcache_page(page);
- kunmap(page);
- if (first_hole == 0) {
- SetPageUptodate(page);
- unlock_page(page);
- goto out;
- }
- }
-
- /*
- * This page will go to BIO. Do we need to send this BIO off first?
- */
- if (bio && (bio->bi_idx == bio->bi_vcnt ||
- *last_block_in_bio != blocks[0] - 1))
- bio = mpage_bio_submit(READ, bio);
-
- if (bio == NULL) {
- unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
-
- if (nr_bvecs > nr_pages)
- nr_bvecs = nr_pages;
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- nr_bvecs, GFP_KERNEL);
- if (bio == NULL)
- goto confused;
- }
-
- bvec = &bio->bi_io_vec[bio->bi_idx++];
- bvec->bv_page = page;
- bvec->bv_len = (first_hole << blkbits);
- bvec->bv_offset = 0;
- bio->bi_size += bvec->bv_len;
- if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
- bio = mpage_bio_submit(READ, bio);
- else
- *last_block_in_bio = blocks[blocks_per_page - 1];
-out:
- return bio;
-
-confused:
- if (bio)
- bio = mpage_bio_submit(READ, bio);
- block_read_full_page(page, get_block);
- goto out;
-}
-
int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages, get_block_t get_block)
+ unsigned nr_pages, get_block_t *get_block)
{
- struct bio *bio = NULL;
- unsigned page_idx;
- sector_t last_block_in_bio = 0;
+ struct bio *bio;
+
+ if (nr_pages == 0)
+ return 0;

- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ bio = mpage_alloc(mapping, nr_pages, READ);
+ while (nr_pages--) {
struct page *page = list_entry(pages->prev, struct page, list);

prefetchw(&page->flags);
list_del(&page->list);
if (!add_to_page_cache_unique(page, mapping, page->index))
- bio = do_mpage_readpage(bio, page,
- nr_pages - page_idx,
- &last_block_in_bio, get_block);
+ bio = mpage_xferpage(bio, page, get_block);
page_cache_release(page);
}
BUG_ON(!list_empty(pages));
- if (bio)
- mpage_bio_submit(READ, bio);
+
+ if (bio->bi_vcnt)
+ submit_bio(READ, bio);
+ else
+ bio_put(bio);
+
return 0;
}
+
EXPORT_SYMBOL(mpage_readpages);

/*
* This isn't called much at all
*/
-int mpage_readpage(struct page *page, get_block_t get_block)
+int mpage_readpage(struct page *page, get_block_t *get_block)
{
- struct bio *bio = NULL;
- sector_t last_block_in_bio = 0;
+ struct bio *bio = mpage_alloc(page->mapping, 1, READ);
+
+ bio = mpage_xferpage(bio, page, get_block);
+
+ if (bio->bi_vcnt)
+ submit_bio(READ, bio);
+ else
+ bio_put(bio);

- bio = do_mpage_readpage(bio, page, 1,
- &last_block_in_bio, get_block);
- if (bio)
- mpage_bio_submit(READ, bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);

-/*
- * Writing is not so simple.
- *
- * If the page has buffers then they will be used for obtaining the disk
- * mapping. We only support pages which are fully mapped-and-dirty, with a
- * special case for pages which are unmapped at the end: end-of-file.
- *
- * If the page has no buffers (preferred) then the page is mapped here.
- *
- * If all blocks are found to be contiguous then the page can go into the
- * BIO. Otherwise fall back to the mapping's writepage().
- *
- * FIXME: This code wants an estimate of how many pages are still to be
- * written, so it can intelligently allocate a suitably-sized BIO. For now,
- * just allocate full-size (16-page) BIOs.
- */
-static inline struct bio *
-mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
- sector_t *last_block_in_bio, int *ret)
-{
- struct inode *inode = page->mapping->host;
- const unsigned blkbits = inode->i_blkbits;
- unsigned long end_index;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
- struct bio_vec *bvec;
- sector_t last_block;
- sector_t block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
- unsigned page_block;
- unsigned first_unmapped = blocks_per_page;
- struct block_device *bdev = NULL;
- int boundary = 0;
-
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
- struct buffer_head *bh = head;
-
- /* If they're all mapped and dirty, do it */
- page_block = 0;
- do {
- BUG_ON(buffer_locked(bh));
- if (!buffer_mapped(bh)) {
- /*
- * unmapped dirty buffers are created by
- * __set_page_dirty_buffers -> mmapped data
- */
- if (buffer_dirty(bh))
- goto confused;
- if (first_unmapped == blocks_per_page)
- first_unmapped = page_block;
- continue;
- }
-
- if (first_unmapped != blocks_per_page)
- goto confused; /* hole -> non-hole */
-
- if (!buffer_dirty(bh) || !buffer_uptodate(bh))
- goto confused;
- if (page_block) {
- if (bh->b_blocknr != blocks[page_block-1] + 1)
- goto confused;
- }
- blocks[page_block++] = bh->b_blocknr;
- boundary = buffer_boundary(bh);
- bdev = bh->b_bdev;
- } while ((bh = bh->b_this_page) != head);
-
- if (first_unmapped)
- goto page_is_mapped;
-
- /*
- * Page has buffers, but they are all unmapped. The page was
- * created by pagein or read over a hole which was handled by
- * block_read_full_page(). If this address_space is also
- * using mpage_readpages then this can rarely happen.
- */
- goto confused;
- }

- /*
- * The page has no buffers: map it to disk
- */
- BUG_ON(!PageUptodate(page));
- block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
- last_block = (inode->i_size - 1) >> blkbits;
- for (page_block = 0; page_block < blocks_per_page; ) {
- struct buffer_head map_bh;
-
- map_bh.b_state = 0;
- if (get_block(inode, block_in_file, &map_bh, 1))
- goto confused;
- if (buffer_new(&map_bh))
- unmap_underlying_metadata(map_bh.b_bdev,
- map_bh.b_blocknr);
- if (page_block) {
- if (map_bh.b_blocknr != blocks[page_block-1] + 1)
- goto confused;
- }
- blocks[page_block++] = map_bh.b_blocknr;
- boundary = buffer_boundary(&map_bh);
- bdev = map_bh.b_bdev;
- if (block_in_file == last_block)
- break;
- block_in_file++;
- }
- if (page_block == 0)
- buffer_error();
-
- first_unmapped = page_block;
-
- end_index = inode->i_size >> PAGE_CACHE_SHIFT;
- if (page->index >= end_index) {
- unsigned offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
-
- if (page->index > end_index || !offset)
- goto confused;
- memset(kmap(page) + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap(page);
- }
-
-page_is_mapped:
-
- /*
- * This page will go to BIO. Do we need to send this BIO off first?
- */
- if (bio && (bio->bi_idx == bio->bi_vcnt ||
- *last_block_in_bio != blocks[0] - 1))
- bio = mpage_bio_submit(WRITE, bio);
-
- if (bio == NULL) {
- unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
-
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- nr_bvecs, GFP_NOFS);
- if (bio == NULL)
- goto confused;
- }
-
- /*
- * OK, we have our BIO, so we can now mark the buffers clean. Make
- * sure to only clean buffers which we know we'll be writing.
- */
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
- struct buffer_head *bh = head;
- unsigned buffer_counter = 0;
-
- do {
- if (buffer_counter++ == first_unmapped)
- break;
- clear_buffer_dirty(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
-
- bvec = &bio->bi_io_vec[bio->bi_idx++];
- bvec->bv_page = page;
- bvec->bv_len = (first_unmapped << blkbits);
- bvec->bv_offset = 0;
- bio->bi_size += bvec->bv_len;
- BUG_ON(PageWriteback(page));
- SetPageWriteback(page);
- unlock_page(page);
- if (boundary || (first_unmapped != blocks_per_page))
- bio = mpage_bio_submit(WRITE, bio);
- else
- *last_block_in_bio = blocks[blocks_per_page - 1];
- goto out;
-
-confused:
- if (bio)
- bio = mpage_bio_submit(WRITE, bio);
- *ret = page->mapping->a_ops->writepage(page);
-out:
- return bio;
+static inline int list_len(struct list_head *list)
+{
+ list_t *tmp;
+ int count = 0;
+ list_for_each(tmp, list)
+ count++;
+ return count;
}

/*
* This is a cut-n-paste of generic_writepages(). We _could_
* generalise that function. It'd get a bit messy. We'll see.
*/
+
int
mpage_writepages(struct address_space *mapping,
int *nr_to_write, get_block_t get_block)
{
- struct bio *bio = NULL;
- sector_t last_block_in_bio = 0;
- int ret = 0;
+ struct bio *bio;
int done = 0;
+ int pages_to_write;

write_lock(&mapping->page_lock);
+ pages_to_write =
+ nr_to_write ? *nr_to_write : list_len(&mapping->io_pages);
+
+ /*
+ * FIXME. The old code would actually do one iteration if
+ * pages_to_write was 0. Was that necessary?
+ */
+ if (pages_to_write <= 0)
+ pages_to_write = 1;
+
+#if 0
+ if (pages_to_write == 0) {
+ write_unlock(&mapping->page_lock);
+ return 0;
+ }
+#endif
+
+ bio = mpage_alloc(mapping, pages_to_write, WRITE);

list_splice(&mapping->dirty_pages, &mapping->io_pages);
INIT_LIST_HEAD(&mapping->dirty_pages);
@@ -498,11 +511,9 @@
struct page, list);
list_del(&page->list);
if (PageWriteback(page)) {
- if (PageDirty(page)) {
- list_add(&page->list, &mapping->dirty_pages);
- continue;
- }
- list_add(&page->list, &mapping->locked_pages);
+ list_add(&page->list, PageDirty(page) ?
+ &mapping->dirty_pages :
+ &mapping->locked_pages);
continue;
}
if (!PageDirty(page)) {
@@ -527,9 +538,8 @@
}
spin_unlock(&pagemap_lru_lock);
}
- bio = mpage_writepage(bio, page, get_block,
- &last_block_in_bio, &ret);
- if (ret || (nr_to_write && --(*nr_to_write) <= 0))
+ bio = mpage_xferpage(bio, page, get_block);
+ if (nr_to_write && --(*nr_to_write) <= 0)
done = 1;
} else {
unlock_page(page);
@@ -546,8 +556,50 @@
INIT_LIST_HEAD(&mapping->io_pages);
}
write_unlock(&mapping->page_lock);
- if (bio)
- mpage_bio_submit(WRITE, bio);
- return ret;
+ if (bio->bi_vcnt)
+ submit_bio(WRITE, bio);
+ else
+ bio_put(bio);
+
+ return 0;
}
+
EXPORT_SYMBOL(mpage_writepages);
+
+
+/* Below is copied from bio.c, so I added Jens to the copyright notice.
+ -Adam */
+
+static void *slab_pool_alloc(int gfp_mask, void *data)
+{
+ return kmem_cache_alloc(data, gfp_mask);
+}
+
+static void slab_pool_free(void *ptr, void *data)
+{
+ kmem_cache_free(data, ptr);
+}
+
+static int __init init_mpage(void)
+{
+ mpage_priv_slab = kmem_cache_create("mpage_priv",
+ sizeof(struct mpage_private), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!mpage_priv_slab)
+ panic("mpage: can't create slab cache\n");
+
+ mpage_priv_pool = mempool_create(MPAGE_PRIV_POOL_SIZE,
+ slab_pool_alloc, slab_pool_free,
+ mpage_priv_slab);
+ if (!mpage_priv_pool)
+ panic("mpage: can't create mempool\n");
+
+ printk("MPAGE_PRIV: pool of %d setup, %ZuKb (%Zd bytes/mpage_priv)\n",
+ MPAGE_PRIV_POOL_SIZE,
+ MPAGE_PRIV_POOL_SIZE * sizeof(struct mpage_private) >> 10,
+ sizeof(struct mpage_private));
+
+ return 0;
+}
+
+module_init(init_mpage);