Re: Patch??: linux-2.5.20/fs/bio.c - ll_rw_kio could generate bio's

Matthew Dobson (colpatch@us.ibm.com)
Thu, 06 Jun 2002 15:06:09 -0700


Adam,
Your patch version 3 did the trick for me. It booted past the panic, but
it's now hung right after init loads. After init spits out "INIT: " it
locks up. I've put in kdb, and all the CPUs seem to be spinning in the
TIF_NEED_RESCHED loop in poll_idle. I dunno if this is in any way
related to the block stuff or not...

But the panic stopped happening!

Cheers!

-Matt

Adam J. Richter wrote:
> Here is version 3 of my changes to ll_rw_kio in fs/bio.c and fs/mpage.c.
> The differences from the previous version are that do_mpage_readpage
> and mpage_writepage now precompute bdev differently, and I have
> tried to factor out a little bit of common code between bio.c and
> mpage.c in the form of a routine bio_append().
>
> I now know that I am actually running the mpage.c code, because I
> screwed up one of my changes and got to see the resulting kernel
> panic. On the other hand, I do not think that I have caused ll_rw_kio
> to be executed. So, I believe my ll_rw_kio changes are completely
> untested.
>
> For what it's worth, I am composing this email on a machine running
> the patch that I have attached below.
>
> Adam J. Richter __ ______________ 575 Oroville Road
> adam@yggdrasil.com \ / Milpitas, California 95035
> +1 408 309-6081 | g g d r a s i l United States of America
> "Free Software For The Rest Of Us."
>
>
> --- linux-2.5.20/include/linux/bio.h 2002-06-02 18:44:52.000000000 -0700
> +++ linux/include/linux/bio.h 2002-06-06 06:30:04.000000000 -0700
> @@ -34,9 +34,6 @@
> #define BIO_BUG_ON
> #endif
>
> -#define BIO_MAX_SECTORS 128
> -#define BIO_MAX_SIZE (BIO_MAX_SECTORS << 9)
> -
> /*
> * was unsigned short, but we might as well be ready for > 64kB I/O pages
> */
> @@ -201,6 +198,8 @@
> extern struct bio *bio_clone(struct bio *, int);
> extern struct bio *bio_copy(struct bio *, int, int);
>
> +extern int bio_append(struct bio **bio_p,
> + struct page *page, int len, int offset);
> extern inline void bio_init(struct bio *);
>
> extern int bio_ioctl(kdev_t, unsigned int, unsigned long);
> --- linux-2.5.20/include/linux/blkdev.h 2002-06-02 18:44:44.000000000 -0700
> +++ linux/include/linux/blkdev.h 2002-06-06 04:40:25.000000000 -0700
> @@ -191,9 +191,9 @@
> * queue settings
> */
> unsigned short max_sectors;
> - unsigned short max_phys_segments;
> - unsigned short max_hw_segments;
> + unsigned short max_phys_segments; /* <= max_sectors */
> + unsigned short max_hw_segments; /* <= max_sectors */
> unsigned short hardsect_size;
> unsigned int max_segment_size;
>
> @@ -418,5 +428,7 @@
> page_cache_release(p.v);
> }
>
> +extern int bio_max_iovecs(request_queue_t *q, int *iovec_size);
> +
> #endif
> --- linux-2.5.20/fs/bio.c 2002-06-02 18:44:40.000000000 -0700
> +++ linux/fs/bio.c 2002-06-06 06:13:52.000000000 -0700
> @@ -316,6 +316,64 @@
> return NULL;
> }
>
> +int bio_max_iovecs(request_queue_t *q, int *iovec_size)
> +{
> + const int max_bytes = q->max_sectors << 9;
> + int max_iovecs;
> +
> + if (*iovec_size > max_bytes) {
> + *iovec_size = max_bytes;
> + return 1;
> + }
> + max_iovecs = max_bytes / *iovec_size;
> + if (max_iovecs > q->max_phys_segments)
> + max_iovecs = q->max_phys_segments;
> + if (max_iovecs > q->max_hw_segments)
> + max_iovecs = q->max_hw_segments;
> + return max_iovecs;
> +}
> +
> +/* bio_append appends an IO vector to a bio. bio_append expects to be
> + called with bio->bi_idx indicating the maximum number of IO vectors
> + that this bio can hold, and with bio->bi_vcnt indicating the number
> + of IO vectors already loaded. If the provided bio is already full,
> + bio_append will submit the current bio and allocate a new one. */
> +
> +int bio_append(struct bio **bio_p, struct page *page, int len, int offset)
> +{
> + struct bio *bio = *bio_p;
> + struct bio_vec *vec;
> + if (bio->bi_idx == bio->bi_vcnt) {
> + struct bio *old = bio;
> + *bio_p = bio = bio_alloc(GFP_KERNEL, old->bi_vcnt);
> + if (bio != NULL) {
> + bio->bi_sector =
> + old->bi_sector + (old->bi_size >> 9);
> +
> +#define COPY(field) bio->bi_ ## field = old->bi_ ## field
> + COPY(bdev);
> + COPY(flags);
> + COPY(idx);
> + COPY(rw);
> + COPY(end_io);
> + COPY(private);
> +#undef COPY
> + }
> + old->bi_idx = 0;
> + submit_bio(old->bi_rw, old);
> + if (bio == NULL)
> + return -ENOMEM;
> + }
> + vec = &bio->bi_io_vec[bio->bi_vcnt++];
> + vec->bv_page = page;
> + vec->bv_len = len;
> + vec->bv_offset = offset;
> +
> + bio->bi_size += len;
> + return 0;
> +}
> +
> +
> static void bio_end_io_kio(struct bio *bio)
> {
> struct kiobuf *kio = (struct kiobuf *) bio->bi_private;
> @@ -338,10 +396,10 @@
> **/
> void ll_rw_kio(int rw, struct kiobuf *kio, struct block_device *bdev, sector_t sector)
> {
> - int i, offset, size, err, map_i, total_nr_pages, nr_pages;
> - struct bio_vec *bvec;
> + int offset, size, err, map_i, total_nr_pages, nr_bvecs;
> struct bio *bio;
> kdev_t dev = to_kdev_t(bdev->bd_dev);
> + int bytes_per_iovec, max_pages;
>
> err = 0;
> if ((rw & WRITE) && is_read_only(dev)) {
> @@ -367,63 +425,58 @@
>
> map_i = 0;
>
> -next_chunk:
> - nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - 9);
> - if (nr_pages > total_nr_pages)
> - nr_pages = total_nr_pages;
> + bytes_per_iovec = PAGE_SIZE;
> + max_pages = bio_max_iovecs(bdev->bd_queue, &bytes_per_iovec);
> +
> + nr_bvecs = max_pages;
> + if (nr_bvecs > total_nr_pages)
> + nr_bvecs = total_nr_pages;
>
> atomic_inc(&kio->io_count);
>
> /*
> * allocate bio and do initial setup
> */
> - if ((bio = bio_alloc(GFP_NOIO, nr_pages)) == NULL) {
> + if ((bio = bio_alloc(GFP_NOIO, nr_bvecs)) == NULL) {
> err = -ENOMEM;
> goto out;
> }
>
> + bio->bi_idx = nr_bvecs; /* for bio_append */
> + bio->bi_rw = rw;
> +
> bio->bi_sector = sector;
> bio->bi_bdev = bdev;
> - bio->bi_idx = 0;
> bio->bi_end_io = bio_end_io_kio;
> bio->bi_private = kio;
>
> - bvec = bio->bi_io_vec;
> - for (i = 0; i < nr_pages; i++, bvec++, map_i++) {
> + while (total_nr_pages > 0) {
> int nbytes = PAGE_SIZE - offset;
>
> if (nbytes > size)
> nbytes = size;
> + if (nbytes > bytes_per_iovec)
> + nbytes = bytes_per_iovec;
>
> BUG_ON(kio->maplist[map_i] == NULL);
>
> - if (bio->bi_size + nbytes > (BIO_MAX_SECTORS << 9))
> - goto queue_io;
> -
> - bio->bi_vcnt++;
> - bio->bi_size += nbytes;
> + err = bio_append(&bio, kio->maplist[map_i], nbytes, offset);
> + if (err)
> + goto out;
> +
> + offset = (offset + nbytes) & PAGE_MASK;
> + if (offset == 0) {
> + total_nr_pages--;
> + map_i++;
> + }
>
> - bvec->bv_page = kio->maplist[map_i];
> - bvec->bv_len = nbytes;
> - bvec->bv_offset = offset;
> -
> - /*
> - * kiobuf only has an offset into the first page
> - */
> - offset = 0;
> -
> - sector += nbytes >> 9;
> size -= nbytes;
> - total_nr_pages--;
> kio->offset += nbytes;
> }
>
> -queue_io:
> + bio->bi_idx = 0;
> submit_bio(rw, bio);
>
> - if (total_nr_pages)
> - goto next_chunk;
> -
> if (size) {
> printk("ll_rw_kio: size %d left (kio %d)\n", size, kio->length);
> BUG();
> --- linux-2.5.20/fs/mpage.c 2002-06-02 18:44:44.000000000 -0700
> +++ linux/fs/mpage.c 2002-06-06 06:14:13.000000000 -0700
> @@ -21,12 +21,6 @@
> #include <linux/mpage.h>
>
> /*
> - * The largest-sized BIO which this code will assemble, in bytes. Set this
> - * to PAGE_CACHE_SIZE if your drivers are broken.
> - */
> -#define MPAGE_BIO_MAX_SIZE BIO_MAX_SIZE
> -
> -/*
> * I/O completion handler for multipage BIOs.
> *
> * The mpage code never puts partial pages into a BIO (except for end-of-file).
> @@ -78,19 +72,15 @@
> bio_put(bio);
> }
>
> -struct bio *mpage_bio_submit(int rw, struct bio *bio)
> +struct bio *mpage_bio_submit(struct bio *bio)
> {
> - bio->bi_vcnt = bio->bi_idx;
> bio->bi_idx = 0;
> - bio->bi_end_io = mpage_end_io_read;
> - if (rw == WRITE)
> - bio->bi_end_io = mpage_end_io_write;
> - submit_bio(rw, bio);
> + submit_bio(bio->bi_rw, bio);
> return NULL;
> }
>
> static struct bio *
> -mpage_alloc(struct block_device *bdev,
> +mpage_alloc(struct block_device *bdev, int rw,
> sector_t first_sector, int nr_vecs, int gfp_flags)
> {
> struct bio *bio;
> @@ -98,11 +88,14 @@
> bio = bio_alloc(gfp_flags, nr_vecs);
> if (bio) {
> bio->bi_bdev = bdev;
> - bio->bi_vcnt = nr_vecs;
> - bio->bi_idx = 0;
> + bio->bi_vcnt = 0;
> + bio->bi_idx = nr_vecs;
> bio->bi_size = 0;
> bio->bi_sector = first_sector;
> - bio->bi_io_vec[0].bv_page = NULL;
> + bio->bi_rw = rw;
> + bio->bi_end_io = mpage_end_io_read;
> + if (rw == WRITE)
> + bio->bi_end_io = mpage_end_io_write;
> }
> return bio;
> }
> @@ -161,14 +154,19 @@
> const unsigned blkbits = inode->i_blkbits;
> const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
> const unsigned blocksize = 1 << blkbits;
> - struct bio_vec *bvec;
> sector_t block_in_file;
> sector_t last_block;
> sector_t blocks[MAX_BUF_PER_PAGE];
> unsigned page_block;
> unsigned first_hole = blocks_per_page;
> - struct block_device *bdev = NULL;
> + struct block_device *bdev =
> + S_ISBLK(inode->i_mode) ? inode->i_bdev : inode->i_sb->s_bdev;
> struct buffer_head bh;
> + int bvec_size = PAGE_SIZE;
> + const unsigned max_bvecs = bio_max_iovecs(bdev->bd_queue, &bvec_size);
> +
> + if (bvec_size != PAGE_SIZE)
> + goto confused;
>
> if (page_has_buffers(page))
> goto confused;
> @@ -197,7 +195,6 @@
> if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
> goto confused;
> blocks[page_block] = bh.b_blocknr;
> - bdev = bh.b_bdev;
> }
>
> if (first_hole != blocks_per_page) {
> @@ -215,28 +212,28 @@
> /*
> * This page will go to BIO. Do we need to send this BIO off first?
> */
> - if (bio && (bio->bi_idx == bio->bi_vcnt ||
> - *last_block_in_bio != blocks[0] - 1))
> - bio = mpage_bio_submit(READ, bio);
> -
> if (bio == NULL) {
> - unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
> + unsigned int nr_bvecs = max_bvecs;
>
> if (nr_bvecs > nr_pages)
> nr_bvecs = nr_pages;
> - bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
> + bio = mpage_alloc(bdev, READ, blocks[0] << (blkbits - 9),
> nr_bvecs, GFP_KERNEL);
> if (bio == NULL)
> goto confused;
> }
> + else if (*last_block_in_bio != blocks[0] - 1)
> + bio->bi_idx = bio->bi_vcnt; /* force bio_append to
> + allocate a new bio. */
> +
> + if (bio_append(&bio, page, first_hole << blkbits, 0) != 0)
> + goto confused;
> +
> + if (bio->bi_vcnt == 1)
> + bio->bi_sector = blocks[0] << (blkbits - 9);
>
> - bvec = &bio->bi_io_vec[bio->bi_idx++];
> - bvec->bv_page = page;
> - bvec->bv_len = (first_hole << blkbits);
> - bvec->bv_offset = 0;
> - bio->bi_size += bvec->bv_len;
> if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
> - bio = mpage_bio_submit(READ, bio);
> + bio->bi_idx = bio->bi_vcnt;
> else
> *last_block_in_bio = blocks[blocks_per_page - 1];
> out:
> @@ -244,7 +241,7 @@
>
> confused:
> if (bio)
> - bio = mpage_bio_submit(READ, bio);
> + bio = mpage_bio_submit(bio);
> block_read_full_page(page, get_block);
> goto out;
> }
> @@ -270,7 +267,7 @@
> }
> BUG_ON(!list_empty(pages));
> if (bio)
> - mpage_bio_submit(READ, bio);
> + mpage_bio_submit(bio);
> return 0;
> }
> EXPORT_SYMBOL(mpage_readpages);
> @@ -286,7 +283,7 @@
> bio = do_mpage_readpage(bio, page, 1,
> &last_block_in_bio, get_block);
> if (bio)
> - mpage_bio_submit(READ, bio);
> + mpage_bio_submit(bio);
> return 0;
> }
> EXPORT_SYMBOL(mpage_readpage);
> @@ -315,14 +312,19 @@
> const unsigned blkbits = inode->i_blkbits;
> unsigned long end_index;
> const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
> - struct bio_vec *bvec;
> sector_t last_block;
> sector_t block_in_file;
> sector_t blocks[MAX_BUF_PER_PAGE];
> unsigned page_block;
> unsigned first_unmapped = blocks_per_page;
> - struct block_device *bdev = NULL;
> + struct block_device *bdev =
> + S_ISBLK(inode->i_mode) ? inode->i_bdev : inode->i_sb->s_bdev;
> int boundary = 0;
> + int bvec_size = PAGE_SIZE;
> + const unsigned max_bvecs = bio_max_iovecs(bdev->bd_queue, &bvec_size);
> +
> + if (bvec_size != PAGE_SIZE)
> + goto confused;
>
> if (page_has_buffers(page)) {
> struct buffer_head *head = page_buffers(page);
> @@ -355,7 +357,6 @@
> }
> blocks[page_block++] = bh->b_blocknr;
> boundary = buffer_boundary(bh);
> - bdev = bh->b_bdev;
> } while ((bh = bh->b_this_page) != head);
>
> if (first_unmapped)
> @@ -391,7 +392,6 @@
> }
> blocks[page_block++] = map_bh.b_blocknr;
> boundary = buffer_boundary(&map_bh);
> - bdev = map_bh.b_bdev;
> if (block_in_file == last_block)
> break;
> block_in_file++;
> @@ -417,18 +417,14 @@
> /*
> * This page will go to BIO. Do we need to send this BIO off first?
> */
> - if (bio && (bio->bi_idx == bio->bi_vcnt ||
> - *last_block_in_bio != blocks[0] - 1))
> - bio = mpage_bio_submit(WRITE, bio);
> -
> if (bio == NULL) {
> - unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
> -
> - bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
> - nr_bvecs, GFP_NOFS);
> + bio = mpage_alloc(bdev, WRITE, blocks[0] << (blkbits - 9),
> + max_bvecs, GFP_NOFS);
> if (bio == NULL)
> goto confused;
> }
> + else if (*last_block_in_bio != blocks[0] - 1)
> + bio->bi_idx = bio->bi_vcnt;
>
> /*
> * OK, we have our BIO, so we can now mark the buffers clean. Make
> @@ -447,23 +443,21 @@
> } while (bh != head);
> }
>
> - bvec = &bio->bi_io_vec[bio->bi_idx++];
> - bvec->bv_page = page;
> - bvec->bv_len = (first_unmapped << blkbits);
> - bvec->bv_offset = 0;
> - bio->bi_size += bvec->bv_len;
> + bio_append(&bio, page, first_unmapped << blkbits, 0);
> + if (bio->bi_vcnt == 1)
> + bio->bi_sector = blocks[0] << (blkbits - 9);
> BUG_ON(PageWriteback(page));
> SetPageWriteback(page);
> unlock_page(page);
> if (boundary || (first_unmapped != blocks_per_page))
> - bio = mpage_bio_submit(WRITE, bio);
> + bio = mpage_bio_submit(bio);
> else
> *last_block_in_bio = blocks[blocks_per_page - 1];
> goto out;
>
> confused:
> if (bio)
> - bio = mpage_bio_submit(WRITE, bio);
> + bio = mpage_bio_submit(bio);
> *ret = block_write_full_page(page, get_block);
> out:
> return bio;
> @@ -541,7 +535,7 @@
> }
> write_unlock(&mapping->page_lock);
> if (bio)
> - mpage_bio_submit(WRITE, bio);
> + mpage_bio_submit(bio);
> return ret;
> }
> EXPORT_SYMBOL(mpage_writepages);
>

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/