[PATCH] 2/5 VM changes: skip-writepage.patch

Nikita Danilov (Nikita@Namesys.COM)
Wed, 9 Jul 2003 12:47:05 +0400


Don't call ->writepage() from the VM scanner the first time a page is
encountered during a scan.

A new page flag, PG_skipped, is used for this. The flag is TestSet-ed just
before calling ->writepage() and is cleared when the page enters the inactive
list.

One can see this as a "second chance" algorithm for the dirty pages on the
inactive list.

BSD does the same: src/sys/vm/vm_pageout.c:vm_pageout_scan(),
PG_WINATCFLS flag.

The reason behind this is that ->writepages() performs writeout more
efficiently than ->writepage(). Skipping of a page could be made conditional
on zone->pressure.

On the other hand, avoiding ->writepage() increases the amount of scanning
performed by kswapd.

Results of copying 400M * 10 from ramfs to ext2 (512M of ram), averaged over 6
runs:

without patch:

           ELAPSED   SYSTEM   USER
TIME       255.649   42.734   5.403
DEVIATION   10.516    0.948   0.078

with patch:

TIME 158.847 51.059 5.590
DEVIATION 4.400 0.251 0.123

diff -puN mm/vmscan.c~skip-writepage mm/vmscan.c
--- i386/mm/vmscan.c~skip-writepage Wed Jul 9 12:24:50 2003
+++ i386-god/mm/vmscan.c Wed Jul 9 12:24:51 2003
@@ -232,6 +232,104 @@ static int may_write_to_queue(struct bac
return 0;
}

+/* possible outcome of pageout() */
+typedef enum {
+ /* failed to write page out, page is locked */
+ PAGE_KEEP,
+ /* move page to the active list, page is locked */
+ PAGE_ACTIVATE,
+ /* page has been sent to the disk successfully, page is unlocked */
+ PAGE_SUCCESS,
+ /* page is clean and locked */
+ PAGE_CLEAN
+} pageout_t;
+
+
+/*
+ * Called by shrink_list() for each dirty page. Calls ->writepage().
+ */
+static pageout_t pageout(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ /*
+ * If the page is dirty, only perform writeback if that write will be
+ * non-blocking. To prevent this allocation from being stalled by
+ * pagecache activity. But note that there may be stalls if we need
+ * to run get_block(). We could test PagePrivate for that.
+ *
+ * If this process is currently in generic_file_write() against this
+ * page's queue, we can perform writeback even if that will block.
+ *
+ * If the page is swapcache, write it back even if that would block,
+ * for some throttling. This happens by accident, because
+ * swap_backing_dev_info is bust: it doesn't reflect the congestion
+ * state of the swapdevs. Easy to fix, if needed. See
+ * swapfile.c:page_queue_congested().
+ */
+ if (!is_page_cache_freeable(page))
+ return PAGE_KEEP;
+ if (!mapping)
+ return PAGE_KEEP;
+ if (mapping->a_ops->writepage == NULL)
+ return PAGE_ACTIVATE;
+ if (!may_write_to_queue(mapping->backing_dev_info))
+ return PAGE_KEEP;
+ /*
+ * Don't call ->writepage when page is met for the first time during
+ * scanning. Reasons:
+ *
+ * 1. if memory pressure is not too high, skipping ->writepage()
+ * may avoid writing out page that will be re-dirtied (should not
+ * be too important, because scanning starts from the tail of
+ * inactive list, where pages are _supposed_ to be rarely used,
+ * but when under constant memory pressure, inactive list is
+ * rotated and so is more FIFO than LRU).
+ *
+ * 2. ->writepages() writes data more efficiently than
+ * ->writepage().
+ */
+ if (!TestSetPageSkipped(page))
+ return PAGE_KEEP;
+ spin_lock(&mapping->page_lock);
+ if (test_clear_page_dirty(page)) {
+ int res;
+ int is_async = current_is_kswapd() || current_is_pdflush();
+
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ /*
+ * synchronous page reclamation should be non blocking
+ * for the reasons outlined in the comment above. But
+ * in the asynchronous daemons blocking is ok.
+ */
+ .nonblocking = !is_async,
+ .for_reclaim = 1 /* XXX not used */
+ };
+
+ list_move(&page->list, &mapping->locked_pages);
+ spin_unlock(&mapping->page_lock);
+
+ SetPageReclaim(page);
+ res = mapping->a_ops->writepage(page, &wbc);
+
+ if (res == WRITEPAGE_ACTIVATE) {
+ ClearPageReclaim(page);
+ return PAGE_ACTIVATE;
+ }
+ if (!PageWriteback(page)) {
+ /* synchronous write or broken a_ops? */
+ ClearPageReclaim(page);
+ }
+ return PAGE_SUCCESS;
+ }
+ spin_unlock(&mapping->page_lock);
+ return PAGE_CLEAN;
+}
+
+
+
/*
* shrink_list returns the number of reclaimed pages
*/
@@ -313,62 +411,24 @@ shrink_list(struct list_head *page_list,
pte_chain_unlock(page);

/*
- * If the page is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in generic_file_write() against
- * this page's queue, we can perform writeback even if that
- * will block.
- *
- * If the page is swapcache, write it back even if that would
- * block, for some throttling. This happens by accident, because
- * swap_backing_dev_info is bust: it doesn't reflect the
- * congestion state of the swapdevs. Easy to fix, if needed.
- * See swapfile.c:page_queue_congested().
- */
- if (PageDirty(page)) {
- if (!is_page_cache_freeable(page))
- goto keep_locked;
- if (!mapping)
- goto keep_locked;
- if (mapping->a_ops->writepage == NULL)
- goto activate_locked;
- if (!may_enter_fs)
- goto keep_locked;
- if (!may_write_to_queue(mapping->backing_dev_info))
- goto keep_locked;
- spin_lock(&mapping->page_lock);
- if (test_clear_page_dirty(page)) {
- int res;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = SWAP_CLUSTER_MAX,
- .nonblocking = 1,
- .for_reclaim = 1,
- };
-
- list_move(&page->list, &mapping->locked_pages);
- spin_unlock(&mapping->page_lock);
-
- SetPageReclaim(page);
- res = mapping->a_ops->writepage(page, &wbc);
-
- if (res == WRITEPAGE_ACTIVATE) {
- ClearPageReclaim(page);
- goto activate_locked;
- }
- if (!PageWriteback(page)) {
- /* synchronous write or broken a_ops? */
- ClearPageReclaim(page);
- }
- goto keep;
+ * if calls to file system are allowed and @page is dirty, try
+ * to send it to the disk. If !may_enter_fs, try to
+ * ->releasepage() below anyway.
+ */
+ if (may_enter_fs && PageDirty(page)) {
+ switch (pageout(page)) {
+ case PAGE_KEEP:
+ goto keep_locked;
+ case PAGE_ACTIVATE:
+ goto activate_locked;
+ case PAGE_SUCCESS:
+ goto keep;
+ case PAGE_CLEAN:
+ ;
}
- spin_unlock(&mapping->page_lock);
}

+
/*
* If the page has buffers, try to free the buffer mappings
* associated with this page. If we succeed we try to free
@@ -679,6 +739,7 @@ refill_inactive_zone(struct zone *zone,
if (!TestClearPageActive(page))
BUG();
list_move(&page->lru, &zone->inactive_list);
+ ClearPageSkipped(page);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_inactive += pgmoved;
diff -puN include/linux/mm_inline.h~skip-writepage include/linux/mm_inline.h
--- i386/include/linux/mm_inline.h~skip-writepage Wed Jul 9 12:24:51 2003
+++ i386-god/include/linux/mm_inline.h Wed Jul 9 12:24:51 2003
@@ -10,6 +10,7 @@ static inline void
add_page_to_inactive_list(struct zone *zone, struct page *page)
{
list_add(&page->lru, &zone->inactive_list);
+ ClearPageSkipped(page);
zone->nr_inactive++;
}

diff -puN include/linux/page-flags.h~skip-writepage include/linux/page-flags.h
--- i386/include/linux/page-flags.h~skip-writepage Wed Jul 9 12:24:51 2003
+++ i386-god/include/linux/page-flags.h Wed Jul 9 12:24:51 2003
@@ -75,6 +75,7 @@
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
+#define PG_skipped 20 /* ->writepage() was skipped on this page */


/*
@@ -302,6 +303,12 @@ extern void get_full_page_state(struct p
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)

+#define PageSkipped(page) test_bit(PG_skipped, &(page)->flags)
+#define SetPageSkipped(page) set_bit(PG_skipped, &(page)->flags)
+#define TestSetPageSkipped(page) test_and_set_bit(PG_skipped, &(page)->flags)
+#define ClearPageSkipped(page) clear_bit(PG_skipped, &(page)->flags)
+#define TestClearPageSkipped(page) test_and_clear_bit(PG_skipped, &(page)->flags)
+
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.

_
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/