This is something I whipped together the other day for testing the
overhead of the O(N) insert scan that the deadline io scheduler does.
Instead of using a plain linked list, I adapted it to use an rb tree,
since the generic implementation was already available in the kernel.

Results are quite promising, the overhead is down a lot. I'm not
pushing this for inclusion, just sending it out so others can test,
etc. The patch is against 2.5.45.
===== drivers/block/ll_rw_blk.c 1.135 vs edited =====
--- 1.135/drivers/block/ll_rw_blk.c	Mon Oct 28 20:57:59 2002
+++ edited/drivers/block/ll_rw_blk.c	Thu Oct 31 14:31:09 2002
@@ -2175,8 +2183,8 @@
 	queue_nr_requests = (total_ram >> 9) & ~7;
 	if (queue_nr_requests < 16)
 		queue_nr_requests = 16;
-	if (queue_nr_requests > 128)
-		queue_nr_requests = 128;
+	if (queue_nr_requests > 512)
+		queue_nr_requests = 512;
 
 	batch_requests = queue_nr_requests / 8;
 	if (batch_requests > 8)
===== drivers/block/deadline-iosched.c 1.9 vs edited =====
--- 1.9/drivers/block/deadline-iosched.c	Tue Oct 29 12:19:59 2002
+++ edited/drivers/block/deadline-iosched.c	Thu Oct 31 10:48:09 2002
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/hash.h>
+#include <linux/rbtree.h>
 
 /*
  * feel free to try other values :-). read_expire value is the timeout for
@@ -33,7 +34,7 @@
  */
 static int writes_starved = 2;
 
-static const int deadline_hash_shift = 8;
+static const int deadline_hash_shift = 10;
 #define DL_HASH_BLOCK(sec)	((sec) >> 3)
 #define DL_HASH_FN(sec)		(hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
 #define DL_HASH_ENTRIES		(1 << deadline_hash_shift)
@@ -48,19 +49,20 @@
 	/*
 	 * run time data
 	 */
-	struct list_head sort_list[2];	/* sorted listed */
+	struct rb_root rb_list[2];
 	struct list_head read_fifo;	/* fifo list */
 	struct list_head *dispatch;	/* driver dispatch queue */
 	struct list_head *hash;		/* request hash */
 	sector_t last_sector;		/* last sector sent to drive */
 	unsigned long hash_valid_count;	/* barrier hash count */
 	unsigned int starved;		/* writes starved */
+	int rb_entries;
 
 	/*
 	 * settings that change how the i/o scheduler behaves
 	 */
 	unsigned int fifo_batch;
-	unsigned long read_expire;
+	unsigned int read_expire;
 	unsigned int seek_cost;
 	unsigned int writes_starved;
 };
@@ -69,10 +71,24 @@
  * pre-request data.
  */
 struct deadline_rq {
-	struct list_head fifo;
+	/*
+	 * rbtree index, key is the starting offset
+	 */
+	struct rb_node rb_node;
+	sector_t rb_key;
+
+	struct request *request;
+
+	/*
+	 * request hash, key is the ending offset (for back merge lookup)
+	 */
 	struct list_head hash;
 	unsigned long hash_valid_count;
-	struct request *request;
+
+	/*
+	 * expire fifo
+	 */
+	struct list_head fifo;
 	unsigned long expires;
 };
 
@@ -81,23 +97,23 @@
 #define RQ_DATA(rq)	((struct deadline_rq *) (rq)->elevator_private)
 
 /*
- * rq hash
+ * the back merge hash support functions
  */
-static inline void __deadline_del_rq_hash(struct deadline_rq *drq)
+static inline void __deadline_hash_del(struct deadline_rq *drq)
 {
 	drq->hash_valid_count = 0;
 	list_del_init(&drq->hash);
 }
 
 #define ON_HASH(drq)	(drq)->hash_valid_count
-static inline void deadline_del_rq_hash(struct deadline_rq *drq)
+static inline void deadline_hash_del(struct deadline_rq *drq)
 {
 	if (ON_HASH(drq))
-		__deadline_del_rq_hash(drq);
+		__deadline_hash_del(drq);
 }
 
 static inline void
-deadline_add_rq_hash(struct deadline_data *dd, struct deadline_rq *drq)
+deadline_hash_add(struct deadline_data *dd, struct deadline_rq *drq)
 {
 	struct request *rq = drq->request;
 
@@ -109,33 +125,30 @@
 
 #define list_entry_hash(ptr)	list_entry((ptr), struct deadline_rq, hash)
 static struct request *
-deadline_find_hash(struct deadline_data *dd, sector_t offset)
+deadline_hash_find(struct deadline_data *dd, sector_t offset)
 {
 	struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)];
 	struct list_head *entry, *next = hash_list->next;
-	struct deadline_rq *drq;
-	struct request *rq = NULL;
 
 	while ((entry = next) != hash_list) {
+		struct deadline_rq *drq = list_entry_hash(entry);
+		struct request *__rq = drq->request;
+
 		next = entry->next;
 		
-		drq = list_entry_hash(entry);
-
 		BUG_ON(!drq->hash_valid_count);
 
-		if (!rq_mergeable(drq->request)
+		if (!rq_mergeable(__rq)
 		    || drq->hash_valid_count != dd->hash_valid_count) {
-			__deadline_del_rq_hash(drq);
+			__deadline_hash_del(drq);
 			continue;
 		}
 
-		if (drq->request->sector + drq->request->nr_sectors == offset) {
-			rq = drq->request;
-			break;
-		}
+		if (__rq->sector + __rq->nr_sectors == offset)
+			return __rq;
 	}
 
-	return rq;
+	return NULL;
 }
 
 static sector_t deadline_get_last_sector(struct deadline_data *dd)
@@ -154,86 +167,135 @@
 	return last_sec;
 }
 
+/*
+ * rb tree support functions
+ */
+#define RB_NONE		(2)
+#define RB_EMPTY(root)	((root)->rb_node == NULL)
+#define ON_RB(node)	((node)->rb_color != RB_NONE)
+#define RB_CLEAR(node)	((node)->rb_color = RB_NONE)
+#define deadline_rb_entry(node)	rb_entry((node), struct deadline_rq, rb_node)
+#define DRQ_RB_ROOT(dd, drq)	(&(dd)->rb_list[rq_data_dir((drq)->request)])
+
+static inline int
+__deadline_rb_add(struct deadline_data *dd, struct deadline_rq *drq)
+{
+	struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node;
+	struct rb_node *parent = NULL;
+	struct deadline_rq *__drq;
+
+	while (*p) {
+		parent = *p;
+		__drq = deadline_rb_entry(parent);
+
+		if (drq->rb_key < __drq->rb_key)
+			p = &(*p)->rb_left;
+		else if (drq->rb_key > __drq->rb_key)
+			p = &(*p)->rb_right;
+		else
+			return 1;
+	}
+
+	rb_link_node(&drq->rb_node, parent, p);
+	return 0;
+}
+
+static void deadline_rb_add(struct deadline_data *dd, struct deadline_rq *drq)
+{
+	drq->rb_key = drq->request->sector;
+
+	if (!__deadline_rb_add(dd, drq)) {
+		rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
+		dd->rb_entries++;
+		return;
+	}
+
+	/*
+	 * this cannot happen
+	 */
+	blk_dump_rq_flags(drq->request, "deadline_rb_add alias");
+	list_add_tail(&drq->request->queuelist, dd->dispatch);
+}
+
+static inline void
+deadline_rb_del(struct deadline_data *dd, struct deadline_rq *drq)
+{
+	if (ON_RB(&drq->rb_node)) {
+		rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
+		RB_CLEAR(&drq->rb_node);
+		dd->rb_entries--;
+		BUG_ON(dd->rb_entries < 0);
+	}
+}
+
+static struct request *
+deadline_rb_find(struct deadline_data *dd, sector_t sector, int data_dir)
+{
+	struct rb_node *n = dd->rb_list[data_dir].rb_node;
+	struct deadline_rq *drq;
+
+	while (n) {
+		drq = deadline_rb_entry(n);
+
+		if (sector < drq->rb_key)
+			n = n->rb_left;
+		else if (sector > drq->rb_key)
+			n = n->rb_right;
+		else
+			return drq->request;
+	}
+
+	return NULL;
+}
+
 static int
 deadline_merge(request_queue_t *q, struct list_head **insert, struct bio *bio)
 {
 	struct deadline_data *dd = q->elevator.elevator_data;
-	const int data_dir = bio_data_dir(bio);
-	struct list_head *entry, *sort_list;
-	struct request *__rq;
 	int ret = ELEVATOR_NO_MERGE;
+	struct request *__rq;
 
 	/*
 	 * try last_merge to avoid going to hash
 	 */
 	ret = elv_try_last_merge(q, bio);
 	if (ret != ELEVATOR_NO_MERGE) {
-		*insert = q->last_merge;
+		__rq = list_entry_rq(q->last_merge);
 		goto out;
 	}
 
 	/*
 	 * see if the merge hash can satisfy a back merge
 	 */
-	if ((__rq = deadline_find_hash(dd, bio->bi_sector))) {
+	if ((__rq = deadline_hash_find(dd, bio->bi_sector))) {
 		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
 
 		if (elv_rq_merge_ok(__rq, bio)) {
-			*insert = &__rq->queuelist;
 			ret = ELEVATOR_BACK_MERGE;
 			goto out;
 		}
 	}
 
 	/*
-	 * scan list from back to find insertion point.
+	 * probably not worth the extra overhead
 	 */
-	entry = sort_list = &dd->sort_list[data_dir];
-	while ((entry = entry->prev) != sort_list) {
-		__rq = list_entry_rq(entry);
-
-		BUG_ON(__rq->flags & REQ_STARTED);
-
-		if (!(__rq->flags & REQ_CMD))
-			continue;
+	__rq = deadline_rb_find(dd, bio->bi_sector + bio_sectors(bio), bio_data_dir(bio));
+	if (__rq) {
+		BUG_ON(bio->bi_sector + bio_sectors(bio) != __rq->sector);
 
-		/*
-		 * it's not necessary to break here, and in fact it could make
-		 * us loose a front merge. emperical evidence shows this to
-		 * be a big waste of cycles though, so quit scanning
-		 */
-		if (!*insert && bio_rq_in_between(bio, __rq, sort_list)) {
-			*insert = &__rq->queuelist;
-			break;
-		}
-
-		if (__rq->flags & REQ_BARRIER)
-			break;
-
-		/*
-		 * checking for a front merge, hash will miss those
-		 */
-		if (__rq->sector - bio_sectors(bio) == bio->bi_sector) {
-			ret = elv_try_merge(__rq, bio);
-			if (ret != ELEVATOR_NO_MERGE) {
-				*insert = &__rq->queuelist;
-				break;
-			}
+		if (elv_rq_merge_ok(__rq, bio)) {
+			ret = ELEVATOR_FRONT_MERGE;
+			goto out;
 		}
 	}
 
-	/*
-	 * no insertion point found, check the very front
-	 */
-	if (!*insert && !list_empty(sort_list)) {
-		__rq = list_entry_rq(sort_list->next);
-
-		if (bio->bi_sector + bio_sectors(bio) < __rq->sector &&
-		    bio->bi_sector > deadline_get_last_sector(dd))
-			*insert = sort_list;
+	*insert = NULL;
+out:
+	if (ret != ELEVATOR_NO_MERGE) {
+		*insert = &__rq->queuelist;
+		q->last_merge = &__rq->queuelist;
 	}
 
-out:
 	return ret;
 }
 
@@ -242,8 +304,16 @@
 	struct deadline_data *dd = q->elevator.elevator_data;
 	struct deadline_rq *drq = RQ_DATA(req);
 
-	deadline_del_rq_hash(drq);
-	deadline_add_rq_hash(dd, drq);
+	deadline_hash_del(drq);
+	deadline_hash_add(dd, drq);
+
+	/*
+	 * if the merge was a front merge, we need to reposition the request
+	 */
+	if (req->sector != drq->rb_key) {
+		deadline_rb_del(dd, drq);
+		deadline_rb_add(dd, drq);
+	}
 
 	q->last_merge = &req->queuelist;
 }
@@ -258,8 +328,13 @@
 	BUG_ON(!drq);
 	BUG_ON(!dnext);
 
-	deadline_del_rq_hash(drq);
-	deadline_add_rq_hash(dd, drq);
+	deadline_hash_del(drq);
+	deadline_hash_add(dd, drq);
+
+	if (req->sector != drq->rb_key) {
+		deadline_rb_del(dd, drq);
+		deadline_rb_add(dd, drq);
+	}
 
 	/*
 	 * if dnext expires before drq, assign it's expire time to drq
@@ -274,16 +349,16 @@
 }
 
 /*
- * move request from sort list to dispatch queue. maybe remove from rq hash
- * here too?
+ * move request from sort list to dispatch queue.
  */
 static inline void
 deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
 {
 	struct deadline_rq *drq = RQ_DATA(rq);
 
-	list_move_tail(&rq->queuelist, dd->dispatch);
+	deadline_rb_del(dd, drq);
 	list_del_init(&drq->fifo);
+	list_add_tail(&rq->queuelist, dd->dispatch);
 }
 
 /*
@@ -291,12 +366,12 @@
  */
 static void deadline_move_requests(struct deadline_data *dd, struct request *rq)
 {
-	struct list_head *sort_head = &dd->sort_list[rq_data_dir(rq)];
 	sector_t last_sec = deadline_get_last_sector(dd);
 	int batch_count = dd->fifo_batch;
+	struct deadline_rq *drq = RQ_DATA(rq);
 
 	do {
-		struct list_head *nxt = rq->queuelist.next;
+		struct rb_node *rbnext = rb_next(&drq->rb_node);
 		int this_rq_cost;
 
 		/*
@@ -308,7 +383,7 @@
 		/*
 		 * if this is the last entry, don't bother doing accounting
 		 */
-		if (nxt == sort_head)
+		if (rbnext == NULL)
 			break;
 
 		this_rq_cost = dd->seek_cost;
@@ -320,7 +395,8 @@
 			break;
 
 		last_sec = rq->sector + rq->nr_sectors;
-		rq = list_entry_rq(nxt);
+		drq = deadline_rb_entry(rbnext);
+		rq = drq->request;
 	} while (1);
 }
 
@@ -343,25 +419,11 @@
 	return 0;
 }
 
-static struct request *deadline_next_request(request_queue_t *q)
+static int deadline_dispatch_requests(struct deadline_data *dd)
 {
-	struct deadline_data *dd = q->elevator.elevator_data;
+	const int writes = !RB_EMPTY(&dd->rb_list[WRITE]);
 	struct deadline_rq *drq;
-	struct list_head *nxt;
-	struct request *rq;
-	int writes;
-
-	/*
-	 * if still requests on the dispatch queue, just grab the first one
-	 */
-	if (!list_empty(&q->queue_head)) {
-dispatch:
-		rq = list_entry_rq(q->queue_head.next);
-		dd->last_sector = rq->sector + rq->nr_sectors;
-		return rq;
-	}
-
-	writes = !list_empty(&dd->sort_list[WRITE]);
+	int ret = 0;
 
 	/*
 	 * if we have expired entries on the fifo list, move some to dispatch
@@ -370,19 +432,20 @@
 		if (writes && (dd->starved++ >= dd->writes_starved))
 			goto dispatch_writes;
 
-		nxt = dd->read_fifo.next;
-		drq = list_entry_fifo(nxt);
+		drq = list_entry_fifo(dd->read_fifo.next);
 		deadline_move_requests(dd, drq->request);
-		goto dispatch;
+		ret = 1;
+		goto out;
 	}
 
-	if (!list_empty(&dd->sort_list[READ])) {
+	if (!RB_EMPTY(&dd->rb_list[READ])) {
 		if (writes && (dd->starved++ >= dd->writes_starved))
 			goto dispatch_writes;
 
-		nxt = dd->sort_list[READ].next;
-		deadline_move_requests(dd, list_entry_rq(nxt));
-		goto dispatch;
+		drq = deadline_rb_entry(rb_first(&dd->rb_list[READ]));
+		deadline_move_requests(dd, drq->request);
+		ret = 1;
+		goto out;
 	}
 
 	/*
@@ -391,14 +454,41 @@
 	 */
 	if (writes) {
 dispatch_writes:
-		nxt = dd->sort_list[WRITE].next;
-		deadline_move_requests(dd, list_entry_rq(nxt));
+		drq = deadline_rb_entry(rb_first(&dd->rb_list[WRITE]));
+		deadline_move_requests(dd, drq->request);
 		dd->starved = 0;
-		goto dispatch;
+		ret = 1;
 	}
 
-	BUG_ON(!list_empty(&dd->sort_list[READ]));
-	BUG_ON(writes);
+out:
+	return ret;
+}
+
+static struct request *deadline_next_request(request_queue_t *q)
+{
+	struct deadline_data *dd = q->elevator.elevator_data;
+	struct request *rq;
+
+	/*
+	 * if there are still requests on the dispatch queue, grab the first one
+	 */
+	if (!list_empty(&q->queue_head)) {
+dispatch:
+		rq = list_entry_rq(q->queue_head.next);
+		dd->last_sector = rq->sector + rq->nr_sectors;
+		return rq;
+	}
+
+	if (deadline_dispatch_requests(dd))
+		goto dispatch;
+
+	/*
+	 * if we have entries on the read or write sorted list, it's a bug
+	 * if deadline_dispatch_requests() didn't move any
+	 */
+	BUG_ON(!RB_EMPTY(&dd->rb_list[READ]));
+	BUG_ON(!RB_EMPTY(&dd->rb_list[WRITE]));
+
 	return NULL;
 }
 
@@ -409,28 +499,23 @@
 	struct deadline_rq *drq = RQ_DATA(rq);
 	const int data_dir = rq_data_dir(rq);
 
-	/*
-	 * flush hash on barrier insert, as not to allow merges before a
-	 * barrier.
-	 */
 	if (unlikely(rq->flags & REQ_BARRIER)) {
 		DL_INVALIDATE_HASH(dd);
 		q->last_merge = NULL;
 	}
 
-	/*
-	 * add to sort list
-	 */
-	if (!insert_here)
-		insert_here = dd->sort_list[data_dir].prev;
+	if (unlikely(!(rq->flags & REQ_CMD))) {
+		if (!insert_here)
+			insert_here = dd->dispatch->prev;
 
-	list_add(&rq->queuelist, insert_here);
-
-	if (unlikely(!(rq->flags & REQ_CMD)))
+		list_add(&rq->queuelist, insert_here);
 		return;
+	}
+
+	deadline_rb_add(dd, drq);
 
 	if (rq_mergeable(rq)) {
-		deadline_add_rq_hash(dd, drq);
+		deadline_hash_add(dd, drq);
 
 		if (!q->last_merge)
 			q->last_merge = &rq->queuelist;
@@ -443,15 +528,27 @@
 		drq->expires = jiffies + dd->read_expire;
 		list_add_tail(&drq->fifo, &dd->read_fifo);
 	}
+
+	/*
+	 * heuristic to offload the time spent moving entries out of
+	 * interrupt context. this happens when a driver queues new
+	 * request(s) from its isr
+	 */
+#if 0
+	if (dd->rb_entries >= 8)
+		deadline_dispatch_requests(dd);
+#endif
 }
 
 static void deadline_remove_request(request_queue_t *q, struct request *rq)
 {
+	struct deadline_data *dd = q->elevator.elevator_data;
 	struct deadline_rq *drq = RQ_DATA(rq);
 
 	if (drq) {
 		list_del_init(&drq->fifo);
-		deadline_del_rq_hash(drq);
+		deadline_hash_del(drq);
+		deadline_rb_del(dd, drq);
 	}
 }
 
@@ -459,8 +556,8 @@
 {
 	struct deadline_data *dd = q->elevator.elevator_data;
 
-	if (!list_empty(&dd->sort_list[WRITE]) ||
-	    !list_empty(&dd->sort_list[READ]) ||
+	if (!RB_EMPTY(&dd->rb_list[WRITE]) ||
+	    !RB_EMPTY(&dd->rb_list[READ]) ||
 	    !list_empty(&q->queue_head))
 		return 0;
 
@@ -473,7 +570,7 @@
 {
 	struct deadline_data *dd = q->elevator.elevator_data;
 
-	return &dd->sort_list[rq_data_dir(rq)];
+	return dd->dispatch;
 }
 
 static void deadline_exit(request_queue_t *q, elevator_t *e)
@@ -484,8 +581,8 @@
 	int i;
 
 	BUG_ON(!list_empty(&dd->read_fifo));
-	BUG_ON(!list_empty(&dd->sort_list[READ]));
-	BUG_ON(!list_empty(&dd->sort_list[WRITE]));
+	BUG_ON(!RB_EMPTY(&dd->rb_list[READ]));
+	BUG_ON(!RB_EMPTY(&dd->rb_list[WRITE]));
 
 	for (i = READ; i <= WRITE; i++) {
 		struct request_list *rl = &q->rq[i];
@@ -538,8 +635,8 @@
 		INIT_LIST_HEAD(&dd->hash[i]);
 
 	INIT_LIST_HEAD(&dd->read_fifo);
-	INIT_LIST_HEAD(&dd->sort_list[READ]);
-	INIT_LIST_HEAD(&dd->sort_list[WRITE]);
+	dd->rb_list[READ] = RB_ROOT;
+	dd->rb_list[WRITE] = RB_ROOT;
 	dd->dispatch = &q->queue_head;
 	dd->fifo_batch = fifo_batch;
 	dd->read_expire = read_expire;
@@ -567,6 +664,7 @@
 			memset(drq, 0, sizeof(*drq));
 			INIT_LIST_HEAD(&drq->fifo);
 			INIT_LIST_HEAD(&drq->hash);
+			RB_CLEAR(&drq->rb_node);
 			drq->request = rq;
 			rq->elevator_private = drq;
 		}
-- 
Jens Axboe