From: Rik van Riel The following experimental patch implements token based thrashing protection, using the algorithm described in: http://www.cs.wm.edu/~sjiang/token.htm When there are pageins going on, a task can grab a token, that protects the task from pageout (except by itself) until it is no longer doing heavy pageins, or until the maximum hold time of the token is over. If the maximum hold time is exceeded, the task isn't eligable to hold the token for a while more, since it wasn't doing it much good anyway. I have run a very unscientific benchmark on my system to test the effectiveness of the patch, timing how a 230MB two-process qsbench run takes, with and without the token thrashing protection present. normal 2.6.8-rc6: 6m45s 2.6.8-rc6 + token: 4m24s This is a quick hack, implemented without having talked to the inventor of the algorithm. He's copied on the mail and I suspect we'll be able to do better than my quick implementation ... Signed-off-by: Andrew Morton --- 25-akpm/include/linux/sched.h | 4 + 25-akpm/include/linux/swap.h | 21 ++++++++ 25-akpm/kernel/fork.c | 2 25-akpm/mm/Makefile | 2 25-akpm/mm/filemap.c | 1 25-akpm/mm/memory.c | 1 25-akpm/mm/rmap.c | 3 + 25-akpm/mm/thrash.c | 100 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 133 insertions(+), 1 deletion(-) diff -puN include/linux/sched.h~token-based-thrashing-control include/linux/sched.h --- 25/include/linux/sched.h~token-based-thrashing-control 2004-08-04 20:40:57.147877384 -0700 +++ 25-akpm/include/linux/sched.h 2004-08-04 20:40:57.164874800 -0700 @@ -246,6 +246,10 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + /* Token based thrashing protection. */ + unsigned long swap_token_time; + char recent_pagein; + /* coredumping support */ int core_waiters; struct completion *core_startup_done, core_done; diff -puN include/linux/swap.h~token-based-thrashing-control include/linux/swap.h --- 25/include/linux/swap.h~token-based-thrashing-control 2004-08-04 20:40:57.149877080 -0700 +++ 25-akpm/include/linux/swap.h 2004-08-04 20:40:57.165874648 -0700 @@ -204,6 +204,27 @@ extern void free_pages_and_swap_cache(st extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, unsigned long addr); +/* linux/mm/thrash.c */ +#ifdef CONFIG_SWAP +extern struct mm_struct * swap_token_mm; +extern void grab_swap_token(void); +extern void __put_swap_token(struct mm_struct *); + +static inline int has_swap_token(struct mm_struct *mm) +{ + return (mm == swap_token_mm); +} + +static inline void put_swap_token(struct mm_struct *mm) +{ + if (has_swap_token(mm)) + __put_swap_token(mm); +} +#else /* CONFIG_SWAP */ +#define put_swap_token(x) do { } while(0) +#define grab_swap_token do { } while(0) +#define has_swap_token 0 +#endif /* CONFIG_SWAP */ /* linux/mm/swapfile.c */ extern long total_swap_pages; diff -puN kernel/fork.c~token-based-thrashing-control kernel/fork.c --- 25/kernel/fork.c~token-based-thrashing-control 2004-08-04 20:40:57.151876776 -0700 +++ 25-akpm/kernel/fork.c 2004-08-04 20:40:57.166874496 -0700 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -467,6 +468,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + put_swap_token(mm); mmdrop(mm); } } diff -puN mm/filemap.c~token-based-thrashing-control mm/filemap.c --- 25/mm/filemap.c~token-based-thrashing-control 2004-08-04 20:40:57.154876320 -0700 +++ 25-akpm/mm/filemap.c 2004-08-04 20:40:57.168874192 -0700 @@ -1196,6 +1196,7 @@ no_cached_page: * effect. */ error = page_cache_read(file, pgoff); + grab_swap_token(); /* * The page we want has now been added to the page cache. diff -puN mm/Makefile~token-based-thrashing-control mm/Makefile --- 25/mm/Makefile~token-based-thrashing-control 2004-08-04 20:40:57.156876016 -0700 +++ 25-akpm/mm/Makefile 2004-08-04 20:40:57.169874040 -0700 @@ -12,6 +12,6 @@ obj-y := bootmem.o filemap.o mempool.o readahead.o slab.o swap.o truncate.o vmscan.o \ $(mmu-y) -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff -puN mm/memory.c~token-based-thrashing-control mm/memory.c --- 25/mm/memory.c~token-based-thrashing-control 2004-08-04 20:40:57.157875864 -0700 +++ 25-akpm/mm/memory.c 2004-08-04 20:40:57.171873736 -0700 @@ -1372,6 +1372,7 @@ static int do_swap_page(struct mm_struct /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; inc_page_state(pgmajfault); + grab_swap_token(); } mark_page_accessed(page); diff -puN mm/rmap.c~token-based-thrashing-control mm/rmap.c --- 25/mm/rmap.c~token-based-thrashing-control 2004-08-04 20:40:57.159875560 -0700 +++ 25-akpm/mm/rmap.c 2004-08-04 20:40:57.172873584 -0700 @@ -230,6 +230,9 @@ static int page_referenced_one(struct pa if (ptep_clear_flush_young(vma, address, pte)) referenced++; + if (mm != current->mm && has_swap_token(mm)) + referenced++; + (*mapcount)--; out_unmap: diff -puN /dev/null mm/thrash.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/mm/thrash.c 2004-08-04 20:40:57.173873432 -0700 @@ -0,0 +1,100 @@ +/* + * mm/thrash.c + * + * Copyright (C) 2004, Red Hat, Inc. + * Copyright (C) 2004, Rik van Riel + * Released under the GPL, see the file COPYING for details. + * + * Simple token based thrashing protection, using the algorithm + * described in: http://www.cs.wm.edu/~sjiang/token.pdf + */ +#include +#include +#include +#include + +static spinlock_t swap_token_lock = SPIN_LOCK_UNLOCKED; +static unsigned long swap_token_timeout; +unsigned long swap_token_check; +struct mm_struct * swap_token_mm = &init_mm; + +#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) +#define SWAP_TOKEN_TIMEOUT (HZ * 300) + +/* + * Take the token away if the process had no page faults + * in the last interval, or if it has held the token for + * too long. + */ +#define SWAP_TOKEN_ENOUGH_RSS 1 +#define SWAP_TOKEN_TIMED_OUT 2 +static int should_release_swap_token(struct mm_struct *mm) +{ + int ret = 0; + if (!mm->recent_pagein) + ret = SWAP_TOKEN_ENOUGH_RSS; + else if (time_after(jiffies, swap_token_timeout)) + ret = SWAP_TOKEN_TIMED_OUT; + mm->recent_pagein = 0; + return ret; +} + +/* + * Try to grab the swapout protection token. We only try to + * grab it once every TOKEN_CHECK_INTERVAL, both to prevent + * SMP lock contention and to check that the process that held + * the token before is no longer thrashing. + */ +void grab_swap_token(void) +{ + struct mm_struct *mm; + int reason; + + /* We have the token. Let others know we still need it. */ + if (has_swap_token(current->mm)) { + current->mm->recent_pagein = 1; + return; + } + + if (time_after(jiffies, swap_token_check)) { + + /* Can't get swapout protection if we exceed our RSS limit. */ + // if (current->mm->rss > current->mm->rlimit_rss) + // return; + + /* ... or if we recently held the token. */ + if (time_before(jiffies, current->mm->swap_token_time)) + return; + + if (!spin_trylock(&swap_token_lock)) + return; + + swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; + + mm = swap_token_mm; + if ((reason = should_release_swap_token(mm))) { + unsigned long eligible = jiffies; + if (reason == SWAP_TOKEN_TIMED_OUT) { + eligible += SWAP_TOKEN_TIMEOUT; + } + mm->swap_token_time = eligible; + swap_token_timeout = jiffies + SWAP_TOKEN_TIMEOUT; + swap_token_mm = current->mm; + printk("Took swap token, pid %d (%s)\n", + current->pid, current->comm); + } + spin_unlock(&swap_token_lock); + } + return; +} + +/* Called on process exit. */ +void __put_swap_token(struct mm_struct *mm) +{ + spin_lock(&swap_token_lock); + if (likely(mm == swap_token_mm)) { + swap_token_mm = &init_mm; + swap_token_check = jiffies; + } + spin_unlock(&swap_token_lock); +} _