[PATCH] __alloc_pages cleanup -R6 Was: Re: Memory Problem in 2.4.10-pre2 / __alloc_pages failed

Roger Larsson (roger.larsson@skelleftea.mail.telia.com)
Fri, 31 Aug 2001 01:53:24 +0200


Hi,

A new version of the __alloc_pages{_limit} cleanup.
This time for 2.4.10-pre2.

Some ideas implemented in this code:
* Reserve memory below min for atomic and recursive allocations.
* When free pages are between min and low, free one more page than you want to allocate.
* When free pages are between low and high, free one less than wanted.
* When above high, don't free anything.
* First, select zones with more than 'high' pages free.
* Then, those where free + inactive_clean - inactive_target is above 'high'.
* When freeing, do it properly: don't steal directly reclaimed pages
  (questionable due to locking issues on SMP).
This will "regulate" the number of FREE_PAGES towards the PAGES_LOW.
Possibility for success of an atomic high order alloc is in some way
proportional with number of pages free.
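
For illustration only (this is not part of the patch): a minimal,
stand-alone C sketch of the reclaim-count rule described above. The
zone_sketch struct, its example values, and the pages_to_reclaim()
helper are made-up stand-ins for the real zone_t fields and for the
corresponding logic in __alloc_pages_limit().

#include <stdio.h>

struct zone_sketch {                /* hypothetical stand-in for zone_t */
	unsigned long free_pages;
	unsigned long pages_min;
	unsigned long pages_low;
	unsigned long pages_high;
};

/*
 * How many inactive_clean pages to reclaim before an order-N
 * allocation: one more than wanted below pages_low, one less
 * between pages_low and pages_high, nothing above pages_high.
 */
static unsigned long pages_to_reclaim(const struct zone_sketch *z,
				      unsigned int order)
{
	unsigned long wanted = 1UL << order;

	if (z->free_pages < z->pages_low)
		return wanted + 1;
	if (z->free_pages < z->pages_high)
		return wanted - 1;
	return 0;
}

int main(void)
{
	/* example numbers only */
	struct zone_sketch z = { .free_pages = 120, .pages_min = 64,
				 .pages_low = 128, .pages_high = 192 };

	printf("order-2 alloc would reclaim %lu pages first\n",
	       pages_to_reclaim(&z, 2));
	return 0;
}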

I have not been able to measure any performance degradation with this
patch, but I do not have an SMP PC...

/RogerL

-- 
Roger Larsson
Skellefteå
Sweden

*******************************************
Patch prepared by: roger.larsson@norran.net
Name of file: /home/roger/patches/patch-2.4.10-pre2-alloc_pages_limit-R6b

--- linux/mm/page_alloc.c.orig	Thu Aug 30 23:20:01 2001
+++ linux/mm/page_alloc.c	Fri Aug 31 00:56:20 2001
@@ -212,9 +212,13 @@
 	return NULL;
 }
 
-#define PAGES_MIN	0
-#define PAGES_LOW	1
-#define PAGES_HIGH	2
+#define PAGES_MEMALLOC		0
+#define PAGES_CRITICAL		1
+#define PAGES_MIN_FREE		2
+#define PAGES_NORMAL_FREE	3
+#define PAGES_HIGH_FREE		4
+#define PAGES_HIGH		5
+#define PAGES_INACTIVE_TARGET	6
 
 /*
  * This function does the dirty work for __alloc_pages
@@ -228,7 +232,7 @@
 	for (;;) {
 		zone_t *z = *(zone++);
-		unsigned long water_mark;
+		unsigned long water_mark, free_min, pages_to_reclaim;
 
 		if (!z)
 			break;
@@ -239,26 +243,85 @@
 		 * We allocate if the number of free + inactive_clean
 		 * pages is above the watermark.
 		 */
+
+		free_min = z->pages_min;
+
 		switch (limit) {
+			case PAGES_MEMALLOC:
+				free_min = 1;
+				water_mark = 0; /* there might be inactive_clean pages */
+				break;
+			case PAGES_CRITICAL:
+				/* XXX: is pages_min/4 a good amount to reserve for this? */
+				free_min = water_mark = z->pages_min / 4;
+				break;
 			default:
-			case PAGES_MIN:
+				printk(KERN_ERR
+				       "__alloc_pages_limit unknown limit (%d) using default\n",
+				       limit);
+			case PAGES_MIN_FREE:
 				water_mark = z->pages_min;
 				break;
-			case PAGES_LOW:
-				water_mark = z->pages_low;
+			case PAGES_NORMAL_FREE:
+				water_mark = (z->pages_min + z->pages_low) / 2;
 				break;
-			case PAGES_HIGH:
+			case PAGES_INACTIVE_TARGET:
+				water_mark = z->pages_high +
+					inactive_target - z->inactive_clean_pages;
+				break;
+			case PAGES_HIGH_FREE:
 				water_mark = z->pages_high;
+				break;
+			case PAGES_HIGH:
+				water_mark = z->pages_high - z->inactive_clean_pages;
+				break;
 		}
 
-		if (z->free_pages + z->inactive_clean_pages >= water_mark) {
-			struct page *page = NULL;
-			/* If possible, reclaim a page directly. */
-			if (direct_reclaim)
-				page = reclaim_page(z);
-			/* If that fails, fall back to rmqueue. */
-			if (!page)
-				page = rmqueue(z, order);
+		if (z->free_pages < water_mark)
+			continue;
+
+		/*
+		 * Reclaim pages from the inactive_clean list up to the
+		 * low water mark.  Free all reclaimed pages to give them
+		 * a chance to merge to higher orders.
+		 */
+		if (direct_reclaim) {
+			/* Our goal for free pages is z->pages_low:
+			 * if there are less, try to free one more than needed;
+			 * when more, free one less.
+			 */
+			pages_to_reclaim = 1 << order; /* pages to try to reclaim at this free_pages level */
+			if (z->free_pages < z->pages_low)
+				pages_to_reclaim++;
+			else if (z->free_pages < z->pages_high)
+				pages_to_reclaim--;
+			else /* free >= high */
+				pages_to_reclaim = 0;
+
+			while (z->inactive_clean_pages &&
			       (z->free_pages < z->pages_min ||
				pages_to_reclaim--)) { /* note: lazy evaluation! decremented only when free >= min */
+				struct page *reclaim = reclaim_page(z);
+				if (reclaim) {
+					__free_page(reclaim);
+				} else {
+					if (z->inactive_clean_pages > 0)
+						printk(KERN_ERR "reclaim_pages failed but there are inactive_clean_pages\n");
+					break;
+				}
+			}
+		}
+
+		/* Always alloc via rmqueue */
+		if (z->free_pages >= free_min)
+		{
+			struct page *page = rmqueue(z, order);
 			if (page)
 				return page;
 		}
@@ -268,6 +331,7 @@
 	return NULL;
 }
 
+
 #ifndef CONFIG_DISCONTIGMEM
 struct page *_alloc_pages(unsigned int gfp_mask, unsigned long order)
 {
@@ -281,7 +345,6 @@
  */
 struct page * __alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist)
 {
-	zone_t **zone;
 	int direct_reclaim = 0;
 	struct page * page;
 
@@ -291,6 +354,14 @@
 	memory_pressure++;
 
 	/*
+	 * To get a hint on who is requesting higher order atomically.
+	 */
+	if (order > 0 && !(gfp_mask & __GFP_WAIT)) {
+		printk("%s; __alloc_pages(gfp=0x%x, order=%ld, ...)\n", current->comm, gfp_mask, order);
+		show_trace(NULL);
+	}
+
+	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
 	 * will sooner or later tripped up by a schedule().)
 	 *
@@ -299,70 +370,69 @@
 	 */
 
 	/*
-	 * Can we take pages directly from the inactive_clean
-	 * list?
-	 */
-	if (order == 0 && (gfp_mask & __GFP_WAIT))
-		direct_reclaim = 1;
-
-try_again:
-	/*
 	 * First, see if we have any zones with lots of free memory.
 	 *
 	 * We allocate free memory first because it doesn't contain
 	 * any data ... DUH!
 	 */
-	zone = zonelist->zones;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		if (!z->size)
-			BUG();
+	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH_FREE, 0);
+	if (page)
+		return page;
 
-		if (z->free_pages >= z->pages_low) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
-		} else if (z->free_pages < z->pages_min &&
-					waitqueue_active(&kreclaimd_wait)) {
-				wake_up_interruptible(&kreclaimd_wait);
-		}
-	}
+	/*
+	 * Can we take pages directly from the inactive_clean
+	 * list?  __alloc_pages_limit now handles any 'order'.
+	 */
+	if (gfp_mask & __GFP_WAIT)
+		direct_reclaim = 1;
+
+	/* Lots of free and inactive memory? i.e. more than target for
+	 * the next second.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_INACTIVE_TARGET, direct_reclaim);
+	if (page)
+		return page;
 
 	/*
-	 * Try to allocate a page from a zone with a HIGH
-	 * amount of free + inactive_clean pages.
+	 * Hmm. Too few pages inactive to reach our inactive_target.
+	 *
+	 * We wake up kswapd, in the hope that kswapd will
+	 * resolve this situation before memory gets tight.
 	 *
-	 * If there is a lot of activity, inactive_target
-	 * will be high and we'll have a good chance of
-	 * finding a page using the HIGH limit.
 	 */
+
+	wakeup_kswapd();
+
 	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
 	if (page)
 		return page;
 
 	/*
-	 * Then try to allocate a page from a zone with more
-	 * than zone->pages_low free + inactive_clean pages.
+	 * Then try to allocate a page from a zone with slightly less
+	 * than zone->pages_low free pages.  Since this is the goal
+	 * of free pages this alloc will dynamically change among
+	 * zones.
 	 *
 	 * When the working set is very large and VM activity
 	 * is low, we're most likely to have our allocation
	 * succeed here.
	 */
-	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+try_again:
+	page = __alloc_pages_limit(zonelist, order, PAGES_NORMAL_FREE, direct_reclaim);
 	if (page)
 		return page;
 
-	/*
-	 * OK, none of the zones on our zonelist has lots
-	 * of pages free.
-	 *
-	 * We wake up kswapd, in the hope that kswapd will
-	 * resolve this situation before memory gets tight.
-	 *
-	 * We also yield the CPU, because that:
-	 *   - gives kswapd a chance to do something
+
+	/* "All" zones have less than NORMAL free, i.e. our reclaiming in
+	 * __alloc_pages_limit has not kept up with demand, possibly too
+	 * few allocs with reclaim.
+	 */
+	if (waitqueue_active(&kreclaimd_wait)) {
+		wake_up_interruptible(&kreclaimd_wait);
+	}
+
+	/* We also yield the CPU, because that:
+	 *   - gives kswapd and kreclaimd a chance to do something
 	 *   - slows down allocations, in particular the
 	 *     allocations from the fast allocator that's
 	 *     causing the problems ...
@@ -371,13 +441,13 @@
 	 *   - if we don't have __GFP_IO set, kswapd may be
 	 *     able to free some memory we can't free ourselves
 	 */
-	wakeup_kswapd();
 	if (gfp_mask & __GFP_WAIT) {
 		__set_current_state(TASK_RUNNING);
 		current->policy |= SCHED_YIELD;
 		schedule();
 	}
 
+
 	/*
 	 * After waking up kswapd, we try to allocate a page
 	 * from any zone which isn't critical yet.
@@ -385,7 +455,7 @@
 	 * Kswapd should, in most situations, bring the situation
 	 * back to normal in no time.
 	 */
-	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+	page = __alloc_pages_limit(zonelist, order, PAGES_MIN_FREE, direct_reclaim);
 	if (page)
 		return page;
 
@@ -398,40 +468,21 @@
 	 * - we're /really/ tight on memory
 	 * 	--> try to free pages ourselves with page_launder
 	 */
-	if (!(current->flags & PF_MEMALLOC)) {
+	if (!(current->flags & PF_MEMALLOC) &&
+	    (gfp_mask & __GFP_WAIT)) { /* implies direct_reclaim==1 */
 		/*
-		 * Are we dealing with a higher order allocation?
-		 *
-		 * Move pages from the inactive_clean to the free list
-		 * in the hope of creating a large, physically contiguous
-		 * piece of free memory.
+		 * Move pages from the inactive_dirty to the inactive_clean.
 		 */
-		if (order > 0 && (gfp_mask & __GFP_WAIT)) {
-			zone = zonelist->zones;
-			/* First, clean some dirty pages. */
-			current->flags |= PF_MEMALLOC;
-			page_launder(gfp_mask, 1);
-			current->flags &= ~PF_MEMALLOC;
-			for (;;) {
-				zone_t *z = *(zone++);
-				if (!z)
-					break;
-				if (!z->size)
-					continue;
-				while (z->inactive_clean_pages) {
-					struct page * page;
-					/* Move one page to the free list. */
-					page = reclaim_page(z);
-					if (!page)
-						break;
-					__free_page(page);
-					/* Try if the allocation succeeds. */
-					page = rmqueue(z, order);
-					if (page)
-						return page;
-				}
-			}
-		}
+
+		/* First, clean some dirty pages. */
+		current->flags |= PF_MEMALLOC;
+		page_launder(gfp_mask, 1);
+		current->flags &= ~PF_MEMALLOC;
+
+		page = __alloc_pages_limit(zonelist, order, PAGES_MIN_FREE, direct_reclaim);
+		if (page)
+			return page;
+
 		/*
 		 * When we arrive here, we are really tight on memory.
 		 * Since kswapd didn't succeed in freeing pages for us,
@@ -447,22 +498,23 @@
 		 * any progress freeing pages, in that case it's better
 		 * to give up than to deadlock the kernel looping here.
 		 */
-		if (gfp_mask & __GFP_WAIT) {
-			if (!order || free_shortage()) {
-				int progress = try_to_free_pages(gfp_mask);
-				if (progress || (gfp_mask & __GFP_FS))
-					goto try_again;
-				/*
-				 * Fail in case no progress was made and the
-				 * allocation may not be able to block on IO.
-				 */
-				return NULL;
-			}
+		if (!order || free_shortage()) {
+			int progress = try_to_free_pages(gfp_mask);
+			if (progress || (gfp_mask & __GFP_FS))
+				goto try_again;
 		}
+
+		/*
+		 * Fail in case no further progress can be made.
+		 */
+		return NULL;
 	}
 
 	/*
-	 * Final phase: allocate anything we can!
+	 * Final phase: atomic and recursive only - allocate anything we can!
+	 *
+	 * Note: very high order allocs are not that important and are unlikely
+	 * to succeed with this anyway.
 	 *
 	 * Higher order allocations, GFP_ATOMIC allocations and
 	 * recursive allocations (PF_MEMALLOC) end up here.
@@ -471,39 +523,18 @@
 	 * in the system, otherwise it would be just too easy to
 	 * deadlock the system...
 	 */
-	zone = zonelist->zones;
-	for (;;) {
-		zone_t *z = *(zone++);
-		struct page * page = NULL;
-		if (!z)
-			break;
-		if (!z->size)
-			BUG();
-
-		/*
-		 * SUBTLE: direct_reclaim is only possible if the task
-		 * becomes PF_MEMALLOC while looping above.  This will
-		 * happen when the OOM killer selects this task for
-		 * instant execution...
-		 */
-		if (direct_reclaim) {
-			page = reclaim_page(z);
-			if (page)
-				return page;
-		}
-
-		/* XXX: is pages_min/4 a good amount to reserve for this? */
-		if (z->free_pages < z->pages_min / 4 &&
-		    !(current->flags & PF_MEMALLOC))
-			continue;
-		page = rmqueue(z, order);
-		if (page)
-			return page;
-	}
-
+	page = __alloc_pages_limit(zonelist, order,
+				   current->flags & PF_MEMALLOC
				   ? PAGES_MEMALLOC : PAGES_CRITICAL,
				   direct_reclaim);
+	if (page)
+		return page;
+
 	/* No luck.. */
-	printk(KERN_ERR "__alloc_pages: %lu-order allocation failed (gfp=0x%x/%i).\n",
-		order, gfp_mask, !!(current->flags & PF_MEMALLOC));
+	printk(KERN_ERR
+	       "%s; __alloc_pages: %lu-order allocation failed. (gfp=0x%x/%d)\n",
+	       current->comm, order, gfp_mask,
+	       !!(current->flags & PF_MEMALLOC));
 	return NULL;
 }