Re: [PATCH 2.5.43-mm2] New shared page table patch

Bill Davidsen (davidsen@tmr.com)
Sat, 19 Oct 2002 15:17:31 -0400 (EDT)


On Fri, 18 Oct 2002, Dave McCracken wrote:

> For reference, one of the tests was TPC-H. My code reduced the number of
> allocated pte_chains from 5 million to 50 thousand.

Don't tease: what did that do for performance? I see that someone has
already posted a possible problem, and the code would pass for complex for
most people, so is the gain worth the pain?

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.

[Attachment: shpte-2.5.43-mm2-4.diff]

Dave McCracken's shared page table (CONFIG_SHAREPTE) patch against
2.5.43-mm2 was included as a quoted-printable text/plain part and is
truncated in this archive copy. The visible portion of the diff touches
fs/exec.c, arch/i386/kernel/vm86.c, arch/i386/Config.help,
arch/i386/config.in, include/linux/mm.h, include/linux/rmap-locking.h,
include/linux/page-flags.h, include/asm-generic/rmap.h,
include/asm-i386/rmap.h, include/asm-i386/pgtable.h, kernel/fork.c,
mm/swapfile.c, mm/msync.c, mm/mprotect.c, mm/memory.c, mm/mremap.c,
mm/mmap.c and mm/rmap.c. Two descriptive pieces of the patch summarize
the change.

The new Config.help entry:

  CONFIG_SHAREPTE
    Normally each address space has its own complete page table for all
    its mappings. This can mean many mappings of a set of shared data
    pages. With this option, the VM will attempt to share the bottom
    level of the page table between address spaces that are sharing data
    pages.

The locking note added to mm/memory.c:

  /*
   * A note on locking of the page table structure:
   *
   * The top level lock that protects the page table is the
   * mm->page_table_lock. This lock protects the pgd and pmd layer.
   * However, with the advent of shared pte pages, this lock is not
   * sufficient. The pte layer is now protected by the pte_page_lock,
   * set in the struct page of the pte page. Note that with this
   * locking scheme, once the pgd and pmd layers have been set in the
   * page fault path and the pte_page_lock has been taken, the
   * page_table_lock can be released.
   */
*/ if (PageSwapCache(page)) { @@ -408,13 +677,15 @@ if (pte_dirty(pte)) set_page_dirty(page); =20 - mm->rss--; page_cache_release(page); ret =3D SWAP_SUCCESS; + goto out; =20 out_unlock: + pte_page_unlock(ptepage); + +out: rmap_ptep_unmap(ptep); - spin_unlock(&mm->page_table_lock); return ret; } =20 @@ -523,6 +794,17 @@ =20 void __init pte_chain_init(void) { + + mm_chain_cache =3D kmem_cache_create( "mm_chain", + sizeof(struct mm_chain), + 0, + 0, + NULL, + NULL); + + if (!mm_chain_cache) + panic("failed to create mm_chain cache!\n"); + pte_chain_cache =3D kmem_cache_create( "pte_chain", sizeof(struct pte_chain), 0, --- 2.5.43-mm2/./mm/fremap.c 2002-10-17 11:13:01.000000000 -0500 +++ 2.5.43-mm2-shpte/./mm/fremap.c 2002-10-18 12:17:30.000000000 -0500 @@ -11,6 +11,8 @@ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swapops.h> +#include <linux/rmap-locking.h> + #include <asm/mmu_context.h> =20 static inline void zap_pte(struct mm_struct *mm, pte_t *ptep) @@ -47,6 +49,7 @@ unsigned long addr, struct page *page, unsigned long prot) { int err =3D -ENOMEM; + struct page *ptepage; pte_t *pte, entry; pgd_t *pgd; pmd_t *pmd; @@ -58,10 +61,25 @@ if (!pmd) goto err_unlock; =20 +#ifdef CONFIG_SHAREPTE + if (pmd_present(*pmd)) { + ptepage =3D pmd_page(*pmd); + if (page_count(ptepage) > 1) { + pte =3D pte_unshare(mm, pmd, addr); + ptepage =3D pmd_page(*pmd); + goto mapped; + } + } +#endif + pte =3D pte_alloc_map(mm, pmd, addr); if (!pte) goto err_unlock; =20 + pte_page_lock(ptepage); +#ifdef CONFIG_SHAREPTE +mapped: +#endif zap_pte(mm, pte); =20 mm->rss++; @@ -75,11 +93,13 @@ pte_unmap(pte); flush_tlb_page(vma, addr); =20 + pte_page_unlock(ptepage); spin_unlock(&mm->page_table_lock); =20 return 0; =20 err_unlock: + pte_page_unlock(ptepage); spin_unlock(&mm->page_table_lock); return err; }
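
For anyone skimming the diff rather than applying it, the heart of the new
mm/rmap.c bookkeeping is the direct-vs-chained record of which mm_structs
are using a given pte page, plus the rss fan-out when a page is mapped
through a shared pte page. The fragment below is only an illustrative
userspace model of that logic, not code from the patch: the structs and the
helpers ptepage_add_mm()/ptepage_inc_rss() are invented stand-ins for the
kernel's struct page, mm_struct and the pgtable_add_rmap_locked() and
increment_rss() paths above.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct mm { int rss; };			/* stand-in for the kernel's mm_struct */

struct mm_chain {
	struct mm_chain *next;
	struct mm *mm;
};

struct ptepage {			/* stand-in for the pte page's struct page */
	int direct;			/* models the PageDirect() flag */
	union {
		struct mm *mmdirect;	  /* one sharer: pointer stored directly */
		struct mm_chain *mmchain; /* several sharers: singly linked chain */
	} pte;
};

/* Mirrors pgtable_add_rmap_locked(): the first mm is stored directly,
 * a second mm forces a conversion to an mm_chain. */
static void ptepage_add_mm(struct ptepage *p, struct mm *mm)
{
	if (p->direct) {
		/* Convert direct -> chain before hooking up the new mm. */
		struct mm_chain *mc = malloc(sizeof(*mc));	/* sketch: no error handling */
		mc->mm = p->pte.mmdirect;
		mc->next = NULL;
		p->pte.mmchain = mc;
		p->direct = 0;
	}
	if (p->pte.mmchain) {
		struct mm_chain *mc = malloc(sizeof(*mc));
		mc->mm = mm;
		mc->next = p->pte.mmchain;
		p->pte.mmchain = mc;
	} else {
		p->pte.mmdirect = mm;	/* common case: pte page not shared */
		p->direct = 1;
	}
}

/* Mirrors increment_rss(): one new mapping may bump several rss counters. */
static void ptepage_inc_rss(struct ptepage *p)
{
	struct mm_chain *mc;

	if (p->direct)
		p->pte.mmdirect->rss++;
	else
		for (mc = p->pte.mmchain; mc; mc = mc->next)
			mc->mm->rss++;
}

int main(void)
{
	struct ptepage p = { .direct = 0, .pte.mmchain = NULL };
	struct mm a = { 0 }, b = { 0 };

	ptepage_add_mm(&p, &a);
	assert(p.direct);		/* one sharer: still direct */
	ptepage_add_mm(&p, &b);
	assert(!p.direct);		/* two sharers: chained */

	ptepage_inc_rss(&p);		/* both mms account for the new page */
	printf("rss: a=%d b=%d\n", a.rss, b.rss);
	return 0;
}

The patch itself does the same representation switch under the pte_page_lock
and additionally keeps a per-pte-page mapping count in page->private, which
pgtable_unmap_one() decrements as it walks the same chain on the swapout side.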

--==========734028336==========--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/