From e509bb87e92dd811ef0ae0e91fee90fa6f98c47c Mon Sep 17 00:00:00 2001 From: "Markus F.X.J. Oberhumer" Date: Tue, 29 Apr 2025 17:44:47 +0200 Subject: [PATCH] src/stub: flush datacache before msync --- src/stub/src/arm.v4a-linux.elf-fold.S | 7 ++++ src/stub/src/arm.v4a-linux.elf-so_fold.S | 10 ++++++ src/stub/src/arm64-linux.elf-fold.S | 38 ++++++++++++++++++++ src/stub/src/arm64-linux.elf-so_fold.S | 41 +++++++++++++++++++--- src/stub/src/mipsel.r3000-linux.elf-fold.S | 8 +++++ src/stub/src/powerpc-expand.S | 1 + src/stub/src/powerpc64-expand.S | 1 + 7 files changed, 101 insertions(+), 5 deletions(-) diff --git a/src/stub/src/arm.v4a-linux.elf-fold.S b/src/stub/src/arm.v4a-linux.elf-fold.S index fbe509d7..ed7f29c4 100644 --- a/src/stub/src/arm.v4a-linux.elf-fold.S +++ b/src/stub/src/arm.v4a-linux.elf-fold.S @@ -107,6 +107,13 @@ Psync: .globl Psync bic r12,arg1,r12 // lo frag sub arg1,arg1,r12 // page align lo end add arg2,arg2,r12 + + stmdb sp!,{r0,r1,r2} // save lo, len, r2 across cacheflush (restored by ldmia below, before msync) + add r1,r1,r0 // hi + mov r2,#0 // 3rd cacheflush arg; author note: might be CSSELR_DCACHE from linux/arch/arm/include/asm/cachetype.h; NOTE(review): presumably a flags word that must stay 0 - confirm + do_sys2 __ARM_NR_cacheflush + ldmia sp!,{r0,r1,r2} + do_sys __NR_msync; ret mmap_privanon: .globl mmap_privanon diff --git a/src/stub/src/arm.v4a-linux.elf-so_fold.S b/src/stub/src/arm.v4a-linux.elf-so_fold.S index aa9235ff..9dbb7c1c 100644 --- a/src/stub/src/arm.v4a-linux.elf-so_fold.S +++ b/src/stub/src/arm.v4a-linux.elf-so_fold.S @@ -30,6 +30,9 @@ #define ARM_OLDABI 1 #include "arch/arm/v4a/macros.S" #include "MAX_ELF_HDR.S" +__NR_SYSCALL_BASE = 0 +__ARM_NR_BASE= 0xf0000 + __NR_SYSCALL_BASE +__ARM_NR_cacheflush = 2 + __ARM_NR_BASE NBPW= 4 sz_Elf32_Ehdr = 13*4 @@ -139,6 +142,13 @@ Psync: .globl Psync bic r12,arg1,r12 sub arg1,arg1,r12 add arg2,arg2,r12 + + stmdb sp!,{r0,r1,r2} // save lo, len, r2 across cacheflush (restored by ldmia before the branch to msync)
+ add r1,r1,r0 // hi + mov r2,#0 // 3rd cacheflush arg; author note: might be CSSELR_DCACHE from linux/arch/arm/include/asm/cachetype.h; NOTE(review): presumably a flags word that must stay 0 - confirm + do_sys2 __ARM_NR_cacheflush + ldmia sp!,{r0,r1,r2} + b msync L05: diff --git a/src/stub/src/arm64-linux.elf-fold.S b/src/stub/src/arm64-linux.elf-fold.S index 5e7b446d..a236076a 100644 --- a/src/stub/src/arm64-linux.elf-fold.S +++ b/src/stub/src/arm64-linux.elf-fold.S @@ -128,6 +128,44 @@ Psync: .globl Psync bic x8,x0,x8 sub x0,x0,x8 add x1,x1,x8 + +// Sync contents of data cache into RAM. +// Linux should do this implicitly, but apparently not. +CTR_IDC_SHIFT= 28 +CTR_DIC_SHIFT= 29 + + mov x3,x0 // lo + add x4,x0,x1 // hi +sync_cache_range: // (void *lo= x3, void *const hi= x4) + mrs x6,ctr_el0 + tbnz w6,#CTR_IDC_SHIFT,dc_not_dirty + ubfx x5,x6,#16,#4 // -2+ log2(dline_size) + mov x8,#-4; lsl x8,x8,x5 // x8= -(4<<x5): dline mask, doubles as negated stride + and x5,x8,x3 // round down to dc line +// cmp x5,x4; b.hs dc_done +dc_loop: + dc cvau,x5 // sync dline + sub x5,x5,x8 // next dline (x8 is negative, so sub adds one line) + cmp x5,x4; b.lo dc_loop +//dc_done: +dc_not_dirty: + dsb ish // why here if dc not dirty? + + tbnz w6,#CTR_DIC_SHIFT,ic_not_dirty + and x6,x6,#0xf // -2+ log2(iline_size) + mov x8,#-4; lsl x8,x8,x6 // x8= -(4<<x6): iline mask, doubles as negated stride + and x3,x8,x3 // round down to ic line +// cmp x3,x4; b.hs ic_done +ic_loop: + ic ivau,x3 // sync iline + sub x3,x3,x8 // next iline (x8 is negative, so sub adds one line) + cmp x3,x4; b.lo ic_loop +//ic_done: + dsb ish +ic_not_dirty: + isb +// fall into msync + msync: .globl msync do_sys __NR_msync; ret diff --git a/src/stub/src/arm64-linux.elf-so_fold.S b/src/stub/src/arm64-linux.elf-so_fold.S index 9f5dab42..a2d99560 100644 --- a/src/stub/src/arm64-linux.elf-so_fold.S +++ b/src/stub/src/arm64-linux.elf-so_fold.S @@ -91,6 +91,42 @@ Psync: .globl Psync bic x8,x0,x8 sub x0,x0,x8 add x1,x1,x8 +// Sync contents of data cache into RAM. +// Linux should do this implicitly, but apparently not. 
+CTR_IDC_SHIFT= 28 +CTR_DIC_SHIFT= 29 + + mov x3,x0 // lo + add x4,x0,x1 // hi +sync_cache_range: // (void *lo= x3, void *const hi= x4) + mrs x6,ctr_el0 + tbnz w6,#CTR_IDC_SHIFT,dc_not_dirty + ubfx x5,x6,#16,#4 // -2+ log2(dline_size) + mov x8,#-4; lsl x8,x8,x5 // x8= -(4<<x5): dline mask, doubles as negated stride + and x5,x8,x3 // round down to dc line +// cmp x5,x4; b.hs dc_done +dc_loop: + dc cvau,x5 // sync dline + sub x5,x5,x8 // next dline (x8 is negative, so sub adds one line) + cmp x5,x4; b.lo dc_loop +//dc_done: +dc_not_dirty: + dsb ish // why here if dc not dirty? + + tbnz w6,#CTR_DIC_SHIFT,ic_not_dirty + and x6,x6,#0xf // -2+ log2(iline_size) + mov x8,#-4; lsl x8,x8,x6 // x8= -(4<<x6): iline mask, doubles as negated stride + and x3,x8,x3 // round down to ic line +// cmp x3,x4; b.hs ic_done +ic_loop: + ic ivau,x3 // sync iline + sub x3,x3,x8 // next iline (x8 is negative, so sub adds one line) + cmp x3,x4; b.lo ic_loop +//ic_done: + dsb ish +ic_not_dirty: + isb +// fall into msync do_sys __NR_msync; ret fold: // enter here (x0= &so_info; x1= &{argc,argv,envp,lr} @@ -206,11 +242,6 @@ brk: readlink: do_sys __NR_readlink; ret - .globl __sync_cache_range -__sync_cache_range: // (void *lo, void *hi) -#include "arm64-sync-cache-range.S" - ret - get_sys_munmap: .globl get_sys_munmap // r0= system call instruction #if defined(ARMEL_DARWIN) /*{*/ ldr w0,4*1 + munmap diff --git a/src/stub/src/mipsel.r3000-linux.elf-fold.S b/src/stub/src/mipsel.r3000-linux.elf-fold.S index 806819f0..fe44cbd5 100644 --- a/src/stub/src/mipsel.r3000-linux.elf-fold.S +++ b/src/stub/src/mipsel.r3000-linux.elf-fold.S @@ -334,6 +334,14 @@ Psync: .globl Psync and TMP,a0,v0 sub a0,TMP add a1,TMP + +__NR_cacheflush = 147+ __NR_Linux +/* asm/cachectl.h */ +ICACHE= 1<<0 +DCACHE= 1<<1 + li a2,DCACHE // NOTE(review): a2 is not reloaded before the msync syscall below, so msync presumably sees DCACHE as its 3rd argument - confirm intended + li v0,__NR_cacheflush; syscall // ignore failure + li v0,__NR_msync; syscall jr ra addiu sp,2*NBPW diff --git a/src/stub/src/powerpc-expand.S b/src/stub/src/powerpc-expand.S index ec715d36..aacb0318 100644 --- a/src/stub/src/powerpc-expand.S +++ b/src/stub/src/powerpc-expand.S @@ -86,6 +86,7 @@ no_unf: POP2 a0,a1 // MATCH_81 dst, len add a1,a1,a0 
// lo, hi + addi a1,a1,-1 // a1= hi-1: highest covered addr (hi= dst+len is one past the end) CACHELINE=32 ori a0,a0,-1+ CACHELINE // highest addr on cache line diff --git a/src/stub/src/powerpc64-expand.S b/src/stub/src/powerpc64-expand.S index e7b3f58d..0815112c 100644 --- a/src/stub/src/powerpc64-expand.S +++ b/src/stub/src/powerpc64-expand.S @@ -86,6 +86,7 @@ no_unf: POP2 a0,a1 // MATCH_81 dst, len add a1,a1,a0 // lo, hi + addi a1,a1,-1 // a1= hi-1: highest covered addr (hi= dst+len is one past the end) CACHELINE=32 ori a0,a0,-1+ CACHELINE // highest addr on cache line