[PATCH] sard disk profiling for 2.3.99-pre6

Stephen C. Tweedie (sct@redhat.com)
Fri, 28 Apr 2000 16:17:47 +0100


--EeQfGwPcQSOJBaQU
Content-Type: text/plain; charset=us-ascii

On Thu, Apr 20, 2000 at 05:14:24PM +0100, Stephen C. Tweedie wrote:
>
> Here is the "sard" code for 2.3.99-pre4.2. It provides /proc/partitions
> output in support of "sar -d" functionality, i.e. extensive IO reporting
> and profiling of disk activity.

Updated code is now in

ftp.uk.linux.org:/pub/linux/sct/fs/profiling/sard-0.5.tar.gz

This includes the kernel diff for both 2.2.15-pre20 and for
2.3.99-pre6.

The /proc/partitions output now includes tags for all of the new
fields, and all of the output fields from /proc are normalised to a
notional 1msec timer tick (so there is no dependency on HZ in the
output --- this makes the parsing much cleaner and more portable).

The 2.3 patch is attached. It is vastly simpler than the 2.2
patch, because the block device cleanups in 2.3 make registering
the per-device partition stat tables trivial.

--Stephen

--EeQfGwPcQSOJBaQU
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="sard-2.3.99-pre6.diff"

--- linux-2.3.99-pre6.sard/drivers/block/ll_rw_blk.c.~1~ Thu Apr 27 16:15:26 2000
+++ linux-2.3.99-pre6.sard/drivers/block/ll_rw_blk.c Fri Apr 28 13:28:18 2000
@@ -120,6 +120,14 @@
*/
int * max_sectors[MAX_BLKDEV];

+/*
+ * MUTEX locking to prevent concurrent fsync()s to the block devices.
+ * (Concurrent syncs thrash the disk enormously and result in much worse
+ * performance than serial syncs.)
+ */
+char blk_synclock[MAX_BLKDEV] = {0};
+
+
static inline int get_max_sectors(kdev_t dev)
{
if (!max_sectors[MAJOR(dev)])
@@ -411,6 +419,121 @@
printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
}

+/* Return up to two hd_structs on which to do IO accounting for a given
+ * request. On a partitioned device, we want to account both against
+ * the partition and against the whole disk. */
+static void locate_hd_struct(struct request *req,
+ struct hd_struct **hd1,
+ struct hd_struct **hd2)
+{
+ struct gendisk *gd = major_gendisk[MAJOR(req->rq_dev)];
+ int minor, whole_minor;
+
+ *hd1 = *hd2 = NULL;
+
+ if (!gd || !gd->part)
+ return;
+
+ /* Clear the partition bits to get the whole-disk minor. */
+ minor = MINOR(req->rq_dev);
+ whole_minor = (minor >> gd->minor_shift) << gd->minor_shift;
+ *hd1 = &gd->part[whole_minor];
+ if (minor != whole_minor)
+ *hd2 = &gd->part[minor];
+}
+
+/* Round off the performance stats on an hd_struct. The average IO
+ * queue length and utilisation statistics are maintained by observing
+ * the current state of the queue length and the amount of time it has
+ * been in this state for. Normally, that accounting is done on IO
+ * completion, but that can result in more than a second's worth of IO
+ * being accounted for within any one second, leading to >100%
+ * utilisation. To deal with that, we do a round-off before returning
+ * the results when reading /proc/partitions, accounting immediately for
+ * all queue usage up to the current jiffies and restarting the counters
+ * again. */
+void disk_round_stats(struct hd_struct *hd)
+{
+ unsigned long now = jiffies;
+
+ /* Use the cached "now" throughout: jiffies is volatile and may
+ * advance between reads, which would skew aveq relative to the
+ * last_queue_change timestamp saved just below. */
+ hd->aveq += (hd->ios_in_flight * (now - hd->last_queue_change));
+ hd->last_queue_change = now;
+
+ if (hd->ios_in_flight)
+ hd->io_ticks += (now - hd->last_idle_time);
+ hd->last_idle_time = now;
+}
+
+
+/* One fewer IO in flight: settle the stats first, then drop the count. */
+static inline void down_ios(struct hd_struct *hd)
+{
+ disk_round_stats(hd);
+ hd->ios_in_flight--;
+}
+
+/* One more IO in flight: settle the stats first, then raise the count. */
+static inline void up_ios(struct hd_struct *hd)
+{
+ disk_round_stats(hd);
+ hd->ios_in_flight++;
+}
+
+/* Charge the start of an IO against an hd_struct.  If "merge" is set
+ * the request was merged into an existing one, so only the merge and
+ * sector counters are bumped; a genuinely new request also raises the
+ * in-flight count via up_ios(). */
+static void account_io_start(struct hd_struct *hd, struct request *req,
+ int merge, int sectors)
+{
+ switch (req->cmd) {
+ case READ:
+ if (merge)
+ hd->rd_merges++;
+ hd->rd_sectors += sectors;
+ break;
+ case WRITE:
+ if (merge)
+ hd->wr_merges++;
+ hd->wr_sectors += sectors;
+ break;
+ default:
+ break; /* a bare label before "}" is invalid C: break is required */
+ }
+ if (!merge)
+ up_ios(hd);
+}
+
+/* Charge the completion of an IO against an hd_struct: accumulate the
+ * request's service time and bump the per-direction completion count,
+ * then drop the in-flight count via down_ios(). */
+static void account_io_end(struct hd_struct *hd, struct request *req)
+{
+ /* Unsigned subtraction gives the right answer across jiffies wrap. */
+ unsigned long duration = jiffies - req->start_time;
+ switch (req->cmd) {
+ case READ:
+ hd->rd_ticks += duration;
+ hd->rd_ios++;
+ break;
+ case WRITE:
+ hd->wr_ticks += duration;
+ hd->wr_ios++;
+ break;
+ default:
+ break; /* a bare label before "}" is invalid C: break is required */
+ }
+ down_ios(hd);
+}
+
+/* Start accounting a new or merged request against the whole disk
+ * and, where distinct, against the partition as well. */
+void req_new_io(struct request *req, int merge, int sectors)
+{
+ struct hd_struct *whole, *part;
+
+ locate_hd_struct(req, &whole, &part);
+ if (whole != NULL)
+ account_io_start(whole, req, merge, sectors);
+ if (part != NULL)
+ account_io_start(part, req, merge, sectors);
+}
+
+/* Finish accounting a completed request against the whole disk
+ * and, where distinct, against the partition as well. */
+void req_finished_io(struct request *req)
+{
+ struct hd_struct *whole, *part;
+
+ locate_hd_struct(req, &whole, &part);
+ if (whole != NULL)
+ account_io_end(whole, req);
+ if (part != NULL)
+ account_io_end(part, req);
+}
+
/*
* add-request adds a request to the linked list.
* It disables interrupts (aquires the request spinlock) so that it can muck
@@ -463,6 +586,7 @@
int max_segments)
{
struct request *next;
+ struct hd_struct *hd1, *hd2;

next = blkdev_next_request(req);
if (req->sector + req->nr_sectors != next->sector)
@@ -485,6 +609,15 @@
next->rq_status = RQ_INACTIVE;
list_del(&next->queue);
wake_up (&wait_for_request);
+
+ /* One last thing: we have removed a request, so we now have one
+ less expected IO to complete for accounting purposes. */
+
+ locate_hd_struct(req, &hd1, &hd2);
+ if (hd1)
+ down_ios(hd1);
+ if (hd2)
+ down_ios(hd2);
}

static inline void attempt_back_merge(request_queue_t * q,
@@ -687,6 +820,7 @@
req->bhtail = bh;
req->nr_sectors = req->hard_nr_sectors += count;
drive_stat_acct(req, count, 0);
+ req_new_io(req, 1, count);

elevator_merge_after(elevator, req, latency);

@@ -717,6 +851,7 @@
req->sector = req->hard_sector = sector;
req->nr_sectors = req->hard_nr_sectors += count;
drive_stat_acct(req, count, 0);
+ req_new_io(req, 1, count);

elevator_merge_before(elevator, req, latency);

@@ -764,6 +899,8 @@
req->bh = bh;
req->bhtail = bh;
req->q = q;
+ req->start_time = jiffies;
+ req_new_io(req, 0, count);
add_request(q, req, head, orig_latency);
elevator_account_request(elevator, req);

@@ -953,6 +1090,7 @@
BUG();
if (req->sem != NULL)
up(req->sem);
+ req_finished_io(req);
req->rq_status = RQ_INACTIVE;
wake_up(&wait_for_request);
}
--- linux-2.3.99-pre6.sard/drivers/scsi/scsi_lib.c.~1~ Thu Apr 27 16:15:26 2000
+++ linux-2.3.99-pre6.sard/drivers/scsi/scsi_lib.c Fri Apr 28 13:28:18 2000
@@ -455,6 +455,7 @@
if (req->sem != NULL) {
up(req->sem);
}
+ req_finished_io(req);
add_blkdev_randomness(MAJOR(req->rq_dev));

/*
--- linux-2.3.99-pre6.sard/fs/block_dev.c.~1~ Thu Apr 27 16:15:27 2000
+++ linux-2.3.99-pre6.sard/fs/block_dev.c Fri Apr 28 13:28:18 2000
@@ -11,6 +11,7 @@
#include <linux/malloc.h>
#include <linux/kmod.h>
#include <linux/devfs_fs_kernel.h>
+#include <linux/genhd.h>

#include <asm/uaccess.h>

--- linux-2.3.99-pre6.sard/fs/buffer.c.~1~ Thu Apr 27 16:15:27 2000
+++ linux-2.3.99-pre6.sard/fs/buffer.c Fri Apr 28 13:28:18 2000
@@ -274,6 +274,7 @@
* more buffers on the second pass).
*/
} while (wait && retry && ++pass<=2);
+ run_task_queue(&tq_disk);
return err;
}

--- linux-2.3.99-pre6.sard/fs/partitions/check.c.~1~ Thu Apr 27 16:15:27 2000
+++ linux-2.3.99-pre6.sard/fs/partitions/check.c Fri Apr 28 13:28:18 2000
@@ -39,6 +39,7 @@
extern void initrd_load(void);

struct gendisk *gendisk_head;
+struct gendisk *major_gendisk[MAX_BLKDEV] = {0,};
int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/

static int (*check_part[])(struct gendisk *hd, kdev_t dev, unsigned long first_sect, int first_minor) = {
@@ -113,6 +114,9 @@
(lvm_hd_name_ptr) ( buf, minor);
return buf;
#endif
+ case MD_MAJOR:
+ unit = (minor >> hd->minor_shift) + '0';
+ break;
case IDE9_MAJOR:
unit += 2;
case IDE8_MAJOR:
@@ -245,20 +249,38 @@
}

#ifdef CONFIG_PROC_FS
+
+/* Normalise the disk performance stats to a notional timer tick of
+ 1ms. */
+#define MSEC(x) ((x) * 1000 / HZ)
+
int get_partition_list(char * page)
{
struct gendisk *p;
+ struct hd_struct *hd;
char buf[64];
int n, len;

- len = sprintf(page, "major minor #blocks name\n\n");
+ len = sprintf(page, "major minor #blocks name rio rmerge rsect ruse wio wmerge wsect wuse running use aveq\n\n");
for (p = gendisk_head; p; p = p->next) {
for (n=0; n < (p->nr_real << p->minor_shift); n++) {
if (p->part[n].nr_sects && len < PAGE_SIZE - 80) {
- len += sprintf(page+len,
- "%4d %4d %10d %s\n",
+ hd = &p->part[n];
+ disk_round_stats(hd);
+ len += sprintf(page+len,
+ "%4d %4d %10d %-8s "
+ "%d %d %d %d %d %d %d %d %d %d %d\n",
p->major, n, p->sizes[n],
- disk_name(p, n, buf));
+ disk_name(p, n, buf),
+ hd->rd_ios, hd->rd_merges,
+ hd->rd_sectors,
+ MSEC(hd->rd_ticks),
+ hd->wr_ios, hd->wr_merges,
+ hd->wr_sectors,
+ MSEC(hd->wr_ticks),
+ hd->ios_in_flight,
+ MSEC(hd->io_ticks),
+ MSEC(hd->aveq));
}
}
}
@@ -403,7 +425,8 @@
{
if (!gdev)
return;
- grok_partitions(gdev, MINOR(dev)>>gdev->minor_shift, minors, size);
+ grok_partitions(gdev, MINOR(dev)>>gdev->minor_shift, minors, size);
+ major_gendisk[MAJOR(dev)] = gdev;
}

void grok_partitions(struct gendisk *dev, int drive, unsigned minors, long size)
--- linux-2.3.99-pre6.sard/include/linux/blkdev.h.~1~ Thu Apr 27 16:15:25 2000
+++ linux-2.3.99-pre6.sard/include/linux/blkdev.h Fri Apr 28 13:28:18 2000
@@ -30,6 +30,7 @@
kdev_t rq_dev;
int cmd; /* READ or WRITE */
int errors;
+ unsigned long start_time;
unsigned long sector;
unsigned long nr_sectors;
unsigned long hard_sector, hard_nr_sectors;
--- linux-2.3.99-pre6.sard/include/linux/genhd.h.~1~ Thu Feb 17 11:50:55 2000
+++ linux-2.3.99-pre6.sard/include/linux/genhd.h Fri Apr 28 13:28:18 2000
@@ -51,6 +51,22 @@
long nr_sects;
int type; /* currently RAID or normal */
devfs_handle_t de; /* primary (master) devfs entry */
+
+ /* Performance stats: */
+ unsigned int ios_in_flight;
+ unsigned int io_ticks;
+ unsigned int last_idle_time;
+ unsigned int last_queue_change;
+ unsigned int aveq;
+
+ unsigned int rd_ios;
+ unsigned int rd_merges;
+ unsigned int rd_ticks;
+ unsigned int rd_sectors;
+ unsigned int wr_ios;
+ unsigned int wr_merges;
+ unsigned int wr_ticks;
+ unsigned int wr_sectors;
};

#define GENHD_FL_REMOVABLE 1
@@ -75,6 +91,8 @@
};
#endif /* __KERNEL__ */

+extern struct gendisk *major_gendisk[];
+
#ifdef CONFIG_SOLARIS_X86_PARTITION

#define SOLARIS_X86_NUMSLICE 8
@@ -227,6 +245,19 @@
extern struct gendisk *gendisk_head; /* linked list of disks */

char *disk_name (struct gendisk *hd, int minor, char *buf);
+
+/*
+ * disk_round_stats is used to round off the IO statistics for a disk
+ * for a complete clock tick.
+ */
+void disk_round_stats(struct hd_struct *hd);
+
+/*
+ * Account for the completion of an IO request (used by drivers which
+ * bypass the normal end_request processing)
+ */
+struct request;
+void req_finished_io(struct request *);

extern void devfs_register_partitions (struct gendisk *dev, int minor,
int unregister);

--EeQfGwPcQSOJBaQU--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/