[PATCH] swap 11/13 fix unuse_pmd fixme

Hugh Dickins (hugh@veritas.com)
Tue, 25 Mar 2003 22:21:51 +0000 (GMT)


try_to_unuse drop mmlist_lock across unuse_process (with pretty dance
of atomic_incs and mmputs of various mmlist markers, and a polite new
cond_resched there), so unuse_process can pte_chain_alloc(GFP_KERNEL)
and pass that down and down and down and down to unuse_pte: which
cannot succeed more than once on a given mm (make that explicit by
returning back up once succeeded). Preliminary checks moved up from
unuse_pte to unuse_pmd, and done more efficiently (avoid that extra
pte_file test added recently), swapoff spends far too long in here.
Updated locking comments and references to try_to_swap_out.

--- swap10/mm/swapfile.c Sun Mar 23 10:30:15 2003
+++ swap11/mm/swapfile.c Tue Mar 25 20:44:46 2003
@@ -377,42 +377,34 @@
* share this swap entry, so be cautious and let do_wp_page work out
* what to do if a write is requested later.
*/
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
+/* vma->vm_mm->page_table_lock is held */
static void
unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
- pte_t pte = *dir;
-
- if (pte_file(pte))
- return;
- if (likely(pte_to_swp_entry(pte).val != entry.val))
- return;
- if (unlikely(pte_none(pte) || pte_present(pte)))
- return;
+ vma->vm_mm->rss++;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
SetPageAnon(page);
*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
swap_free(entry);
- ++vma->vm_mm->rss;
}

-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
unsigned long address, unsigned long size, unsigned long offset,
- swp_entry_t entry, struct page* page)
+ swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
pte_t * pte;
unsigned long end;
- struct pte_chain *pte_chain = NULL;
+ pte_t swp_pte = swp_entry_to_pte(entry);

if (pmd_none(*dir))
- return;
+ return 0;
if (pmd_bad(*dir)) {
pmd_ERROR(*dir);
pmd_clear(dir);
- return;
+ return 0;
}
pte = pte_offset_map(dir, address);
offset += address & PMD_MASK;
@@ -422,33 +414,36 @@
end = PMD_SIZE;
do {
/*
- * FIXME: handle pte_chain_alloc() failures
+ * swapoff spends a _lot_ of time in this loop!
+ * Test inline before going to call unuse_pte.
*/
- if (pte_chain == NULL)
- pte_chain = pte_chain_alloc(GFP_ATOMIC);
- unuse_pte(vma, offset+address-vma->vm_start,
- pte, entry, page, &pte_chain);
+ if (unlikely(pte_same(*pte, swp_pte))) {
+ unuse_pte(vma, offset + address, pte,
+ entry, page, pte_chainp);
+ pte_unmap(pte);
+ return 1;
+ }
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
pte_unmap(pte - 1);
- pte_chain_free(pte_chain);
+ return 0;
}

-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
unsigned long address, unsigned long size,
- swp_entry_t entry, struct page* page)
+ swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
pmd_t * pmd;
unsigned long offset, end;

if (pgd_none(*dir))
- return;
+ return 0;
if (pgd_bad(*dir)) {
pgd_ERROR(*dir);
pgd_clear(dir);
- return;
+ return 0;
}
pmd = pmd_offset(dir, address);
offset = address & PGDIR_MASK;
@@ -459,32 +454,42 @@
if (address >= end)
BUG();
do {
- unuse_pmd(vma, pmd, address, end - address, offset, entry,
- page);
+ if (unuse_pmd(vma, pmd, address, end - address,
+ offset, entry, page, pte_chainp))
+ return 1;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
+ return 0;
}

-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
- swp_entry_t entry, struct page* page)
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+ swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
unsigned long start = vma->vm_start, end = vma->vm_end;

if (start >= end)
BUG();
do {
- unuse_pgd(vma, pgdir, start, end - start, entry, page);
+ if (unuse_pgd(vma, pgdir, start, end - start,
+ entry, page, pte_chainp))
+ return 1;
start = (start + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
} while (start && (start < end));
+ return 0;
}

-static void unuse_process(struct mm_struct * mm,
+static int unuse_process(struct mm_struct * mm,
swp_entry_t entry, struct page* page)
{
struct vm_area_struct* vma;
+ struct pte_chain *pte_chain;
+
+ pte_chain = pte_chain_alloc(GFP_KERNEL);
+ if (!pte_chain)
+ return -ENOMEM;

/*
* Go through process' page directory.
@@ -492,10 +497,12 @@
spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
- unuse_vma(vma, pgd, entry, page);
+ if (unuse_vma(vma, pgd, entry, page, &pte_chain))
+ break;
}
spin_unlock(&mm->page_table_lock);
- return;
+ pte_chain_free(pte_chain);
+ return 0;
}

/*
@@ -639,36 +646,54 @@
if (start_mm == &init_mm)
shmem = shmem_unuse(entry, page);
else
- unuse_process(start_mm, entry, page);
+ retval = unuse_process(start_mm, entry, page);
}
if (*swap_map > 1) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
struct mm_struct *new_start_mm = start_mm;
+ struct mm_struct *prev_mm = start_mm;
struct mm_struct *mm;

+ atomic_inc(&new_start_mm->mm_users);
+ atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (*swap_map > 1 &&
+ while (*swap_map > 1 && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
+ atomic_inc(&mm->mm_users);
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+
+ cond_resched();
+
swcount = *swap_map;
- if (mm == &init_mm) {
+ if (swcount <= 1)
+ ;
+ else if (mm == &init_mm) {
set_start_mm = 1;
- spin_unlock(&mmlist_lock);
shmem = shmem_unuse(entry, page);
- spin_lock(&mmlist_lock);
} else
- unuse_process(mm, entry, page);
+ retval = unuse_process(mm, entry, page);
if (set_start_mm && *swap_map < swcount) {
+ mmput(new_start_mm);
+ atomic_inc(&mm->mm_users);
new_start_mm = mm;
set_start_mm = 0;
}
+ spin_lock(&mmlist_lock);
}
- atomic_inc(&new_start_mm->mm_users);
spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
mmput(start_mm);
start_mm = new_start_mm;
}
+ if (retval) {
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }

/*
* How could swap count reach 0x7fff when the maximum
@@ -692,7 +717,7 @@

/*
* If a reference remains (rare), we would like to leave
- * the page in the swap cache; but try_to_swap_out could
+ * the page in the swap cache; but try_to_unmap could
* then re-duplicate the entry once we drop page lock,
* so we might loop indefinitely; also, that page could
* not be swapped out to other storage meanwhile. So:
@@ -728,7 +753,7 @@
/*
* So we could skip searching mms once swap count went
* to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so try_to_swap_out will preserve it.
+ * mark page dirty so shrink_list will preserve it.
*/
SetPageDirty(page);
unlock_page(page);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/