upx/src/stub/src/amd64-linux.shlib-init.S
John Reiser 3403b87cb7 More Control Flow Integrity (Intel)
More ENDBRxx for CALL *%RDX for 'getbit'.
amd64: LEA xxx(%rip),%reg and STRCON section,
    instead of CALL; .asciz "..."; POP %reg
(Not for PE due to WINDOWS_BACK binary compatibility.)
i386 string constants still use call-.asciz-pop because no (%rip)
	modified:   ../misc/testsuite/upx_testsuite_1-expected_sha256sums.sh
	modified:   p_lx_elf.cpp
	modified:   stub/src/amd64-linux.elf-entry.S
	modified:   stub/src/amd64-linux.elf-main2.c
	modified:   stub/src/amd64-linux.elf-so_entry.S
	modified:   stub/src/amd64-linux.shlib-init.S
	modified:   stub/src/amd64-win64.pe.S
	modified:   stub/src/i386-linux.elf-entry.S
	modified:   stub/src/i386-linux.elf-so_entry.S
	modified:   stub/src/upxfd_android.c
           plus generated *.h *.map *.dump
2024-09-20 14:48:30 -07:00

419 lines
12 KiB
ArmAsm

/* amd64-linux.shlib-init.S -- Linux program entry point & decompressor (Elf shared lib)
*
* This file is part of the UPX executable compressor.
*
* Copyright (C) 1996-2024 Markus Franz Xaver Johannes Oberhumer
* Copyright (C) 1996-2024 Laszlo Molnar
* Copyright (C) 2000-2024 John F. Reiser
* All Rights Reserved.
*
* UPX and the UCL library are free software; you can redistribute them
* and/or modify them under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING.
* If not, write to the Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Markus F.X.J. Oberhumer Laszlo Molnar
* <markus@oberhumer.com> <ezerotven+github@gmail.com>
*
* John F. Reiser
* <jreiser@users.sourceforge.net>
*/
#include "arch/amd64/macros.S"
#include "arch/amd64/regs.h"
// Structure sizes in bytes.
// NOTE(review): sz_Ehdr/sz_Phdr agree by name and value with the ELF64 header sizes -- confirm.
sz_Ehdr= 64
sz_Phdr= 56
sz_l_info= 12
l_lsize= 8
sz_p_info= 12
sz_b_info= 12
// Byte offsets of the fields of one b_info record. main (below) walks them
// with lodsl: uncompressed size, compressed size, then the method word.
sz_unc= 0
sz_cpr= 4
b_method= 8
// Protection bits; used below as the third argument of the mmap and mprotect syscalls.
PROT_READ= 1
PROT_WRITE= 2
PROT_EXEC= 4
// Mapping flags; used below as the fourth argument of the mmap syscalls.
MAP_PRIVATE= 2
MAP_FIXED= 0x10
MAP_ANONYMOUS= 0x20
// System call numbers; loaded into %rax immediately before each syscall instruction.
__NR_mmap= 9 // 64-bit mode only! /usr/include/asm/unistd_64.h
__NR_mprotect= 10
__NR_munmap= 11
__NR_write= 1
__NR_exit= 60
// Page geometry: PAGE_SHIFT=12 makes PAGE_SIZE 4096; ~PAGE_MASK selects the offset within a page.
PAGE_SHIFT= 12
PAGE_MASK= (~0<<PAGE_SHIFT)
PAGE_SIZE= -PAGE_MASK
// Compression method identifiers.
M_NRV2B_LE32=2 // ../conf.h
M_NRV2D_LE32=5
M_NRV2E_LE32=8
// Four 32-bit words are expected immediately below _start; main reads them
// starting at (_start - 4*4):
// .long offset(.) // detect relocation
// .long offset(user DT_INIT)
// .long offset(escape_hatch)
// .long offset({l_info; p_info; b_info; compressed data})
section ELFMAINX
/* _start -- entry point of the stub.
 * Saves the three incoming argument registers, reserves two slots that are
 * filled in later, and establishes %rbp as the frame pointer that every
 * o_xxx / p_xxx offset in this file is relative to:
 *   5*8(%rbp)  o_uinit : slot for &DT_INIT
 *   4*8(%rbp)          : saved %arg2
 *   3*8(%rbp)          : saved %arg1
 *   2*8(%rbp)  o_hatch : slot for &hatch
 *   1*8(%rbp)          : saved %arg3
 *   0*8(%rbp)          : previous %rbp
 * Control then passes to main via CALL, so that the return address
 * (ret_main, which is the address of decompress) is on the stack for main
 * to pop.  main never returns here.
 */
_start: .globl _start
////    nop; int3; int3
        // The debugging trap above is kept disabled, like its two other copies
        // in this file: a live int3 at the entry point raises a breakpoint trap
        // on every load when no debugger is attached.
        push %rax               // space for &DT_INIT
o_uinit= 5*8
        push %arg2              // save first two args to DT_INIT()
        push %arg1
        push %rax               // space for &hatch
o_hatch= 2*8
        push %arg3              // save third arg to DT_INIT()
        push %rbp               // frame pointer
        mov %rsp,%rbp
        call main               // push &decompress
ret_main:
/* decompress -- C-callable entry of the decompressor.
 * Saves %rbp and %rbx, then pushes ldst, dst and &input_eof (= src + lsrc).
 * The 'eof' epilogue further below pops these three values in the opposite
 * order to produce the return value and the output length.
 * Returns 0 on success; non-zero on failure. */
decompress: // (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)
/* Arguments according to calling convention */
#define src %arg1
#define lsrc %arg2
#define dst %arg3
#define ldst %arg4 /* Out: actually a reference: &len_dst */
#define meth %arg5l
#define methb %arg5b
push %rbp; push %rbx // C callable
push ldst // popped by eof to store the produced length
push dst // popped by eof to compute the produced length
addq src,lsrc; push lsrc // &input_eof
subq src,lsrc // restore the value of lsrc
section NRV_HEAD
/* Working registers */
/* These names stay defined through the included decompressor bodies and are
 * removed again by the #undef lines in section NRV_TAIL. */
#define off %eax /* XXX: 2GB */
#define len %ecx /* XXX: 2GB */
#define lenq %rcx
#define bits %ebx
#define displ %ebp
#define dispq %rbp
movq src,%rsi // hardware src for movsb, lodsb
movq dst,%rdi // hardware dst for movsb
xor bits,bits // empty; force refill
xor len,len // create loop invariant
orq $(~0),dispq // -1: initial displacement
// The CALL pushes the address of ra_setup. No instructions are emitted between
// ra_setup and getbit (only macro definitions and comments), so the pushed
// address is also the address of getbit; setup pops it into %r11.
call setup // push &getbit [TUNED]
ra_setup:
/* AMD64 branch prediction is much worse if there are more than 3 branches
per 16-byte block. The jnextb would suffer unless inlined. getnextb is OK
using closed subroutine to save space, and should be OK on cycles because
CALL+RET should be predicted. getnextb could partially expand, using closed
subroutine only for refill.
*/
/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
/* Prediction omitted for now. */
/* On refill: prefetch next byte, for latency reduction on literals and offsets. */
/* Usage: each jnextbXX macro expands to the bit fetch followed by a bare
   conditional jump mnemonic; the jump target is written after the macro
   name at the point of use. The "n" (unlikely) variants are plain aliases
   of the "y" (likely) variants. */
#define jnextb0np jnextb0yp
#define jnextb0yp GETBITp; jnc
#define jnextb1np jnextb1yp
#define jnextb1yp GETBITp; jc
/* GETBITp: inline fetch of the next bit into Carry, with prefetch.
   "addl bits,bits" shifts the top bit of the bit buffer into Carry; a zero
   result means the buffer is used up, in which case the next 32 bits are
   loaded from (%rsi), %rsi advances by 4, "adcl bits,bits" delivers the
   first new bit, and the following input byte is preloaded into %edx.
   Uses local label 0. */
#define GETBITp \
addl bits,bits; jnz 0f; \
movl (%rsi),bits; subq $-4,%rsi; \
adcl bits,bits; movzbl (%rsi),%edx; \
0:
/* Same, but without prefetch (not useful for length of match.) */
#define jnextb0n jnextb0y
#define jnextb0y GETBIT; jnc
#define jnextb1n jnextb1y
#define jnextb1y GETBIT; jc
/* GETBIT: identical to GETBITp except that %edx is left untouched. */
#define GETBIT \
addl bits,bits; jnz 0f; \
movl (%rsi),bits; subq $-4,%rsi; \
adcl bits,bits; \
0:
/* rotate next bit into bottom bit of reg */
/* Both forms call the closed subroutine whose address is held in %r11
   and then shift the returned Carry into reg. */
#define getnextbp(reg) call *%r11; adcl reg,reg
#define getnextb(reg) getnextbp(reg)
/* getbit -- closed subroutine: deliver the next input bit in Carry.
 * In:  bits = bit buffer, %rsi = input pointer.
 * Out: Carry = next bit; bits updated.  On the refill path %rsi has
 *      advanced by 4 and %edx holds the zero-extended byte at the new %rsi.
 */
getbit:
        endbr64                         // entry is reached by indirect call (call *%r11)
        addl bits,bits                  // Carry= next bit
        jz refill                       // buffer exhausted
        rep; ret
refill:
        movl (%rsi),bits                // next 32 bits
        subq $-4,%rsi                   // step over them; set Carry
        adcl bits,bits                  // LSB= 1 (CarryIn); CarryOut= next bit
        movzbl (%rsi),%edx              // speculate: literal, or bottom 8 bits of offset
        rep; ret
/* copy -- copy a match of len bytes from (%rdi + dispq) to (%rdi).
 * In:      len, %rdi, dispq
 * Out:     0==len, %rdi advanced past the copied bytes, dispq unchanged
 * Trashes: %rax, %rdx
 * Short matches and matches whose source lies closer than 4 bytes behind
 * the destination are copied bytewise; all others go 4 bytes at a time
 * with a bytewise tail.
 */
copy:
        leaq (%rdi,dispq),%rax          // %rax= source of the match
        cmpl $5,len                     // <=3 is forced
        movzbl (%rax),%edx              // first source byte (flags untouched)
        jbe copy1                       // <=5 for better branch predict
        cmpq $-4,dispq
        ja copy1                        // 4-byte chunks would overlap
        subl $4,len                     // adjust for termination cases
copy4:
        movl (%rax),%edx
        addq $4, %rax
        subl $4,len                     // Carry set once fewer than 4 bytes remained
        movl %edx,(%rdi)
        leaq 4(%rdi),%rdi               // lea keeps the flags from subl
        jnc copy4
        addl $4,len                     // undo the bias; Zero if nothing is left
        movzbl (%rax),%edx
        jz copy0
copy1:
        incq %rax
        movb %dl,(%rdi)
        subl $1,len
        movzbl (%rax),%edx              // next source byte (flags untouched)
        leaq 1(%rdi),%rdi
        jnz copy1
copy0:
        rep; ret
// setup -- reached by 'call setup' in section NRV_HEAD. Clears the direction
// flag and pops the return address pushed by that CALL into %r11, which the
// getnextb macros use as the target of 'call *%r11'. Execution then falls
// through into the decompressor bodies included below.
setup:
cld
pop %r11 // addq $ getbit - ra_setup,%r11 # &getbit
// One section per decompression algorithm; each body comes from its own include file.
section NRV2E
#include "arch/amd64/nrv2e_d.S"
section NRV2D
#include "arch/amd64/nrv2d_d.S"
section NRV2B
#include "arch/amd64/nrv2b_d.S"
#include "arch/amd64/lzma_d.S"
section NRV_TAIL
// empty
// Retire the working-register names introduced in section NRV_HEAD.
#undef off
#undef len
#undef lenq
#undef bits
#undef displ
#undef dispq
section ELFMAINY
/* eof -- common epilogue of decompress.
 * Pops, in order, what the decompress prologue pushed: &input_eof, the
 * original dst, &ldst, then the saved %rbx and %rbp.
 * Out: %rax = final %rsi minus &input_eof (0: good; else: bad);
 *      the 32-bit word at &ldst = number of bytes produced.
 */
eof:
        pop %rcx                        // &input_eof
        movq %rsi,%rax
        subq %rcx,%rax                  // src -= eof; // return 0: good; else: bad
        pop %rdx                        // original dst
        subq %rdx,%rdi                  // dst -= original dst
        pop %rcx                        // &ldst
        movl %edi,(%rcx)                // actual length used at dst XXX: 4GB
        pop %rbx
        pop %rbp
        ret
// msg_SELinux -- report a failure on stderr and terminate.
// Loads the message length into %arg3, then uses CALL so that the address of
// the text at L70 is pushed as the return address; L72 pops it as the buffer
// argument of write and then falls through into die.
// The length L71 - L70 also covers the terminating NUL that .asciz appends.
// NOTE(review): no jump or call to msg_SELinux appears in the code shown here.
msg_SELinux:
push $ L71 - L70; pop %arg3 // length
call L72
L70:
.asciz "PROT_EXEC|PROT_WRITE failed.\n"
L71:
// IDENTSTR goes here
section ELFMAINZ
/* L72 -- write a message to stderr.
 * Entered by CALL from msg_SELinux: the word on top of the stack is the
 * address of the text, and %arg3 already holds its length.
 * Does not return; execution continues into die.
 */
L72:
        pop %arg2                       // message text
        push $2
        pop %arg1                       // fd stderr
        push $ __NR_write
        pop %rax
        syscall
/* die -- terminate the process through the exit system call, status 127. */
die:
        push $127
        pop %arg1                       // exit status
        push $ __NR_exit
        pop %rax
        syscall
// main -- entered by 'call main' from _start with %rbp already set up as the
// frame pointer. Relocates the header words stored below _start, then pushes,
// slot by slot, the parameter lists that supervise later pops in order.
// Each 'name= -k*8' equate records the %rbp-relative offset of the slot that
// was pushed just before it. Ends by calling L220, which never returns here.
main:
//// nop; int3; int3
// 1. allocate temporary pages
// 2. copy to temporary pages:
// fragment of page below dst; compressed src;
// decompress+unfilter; supervise
// 3. mmap destination pages for decompressed data
// 4. create escape hatch
// 5. jump to temporary pages
// 6. uncompress
// 7. unfilter
// 8. mprotect decompressed pages
// 9. setup args for unmap of temp pages
// 10. jump to escape hatch
// 11. unmap temporary pages
// 12. goto user DT_INIT
pop %rdx // &decompress
lea _start - decompress - 4*4(%rdx),%rsi // %rsi= address of the four words stored below _start
mov %rsi,%rcx // %rcx= run-time address of the first word
lodsl; sub %rax,%rcx; //mov %rcx,o_reloc(%rbp)
// %rcx is now (run-time address - stored offset), the amount added to every stored offset below
lodsl; add %rcx,%rax; mov %rax,o_uinit(%rbp) // reloc DT_INIT for step 12
lodsl; add %rcx,%rax; mov %rax,o_hatch(%rbp) // reloc &hatch for step 10
lodsl; lea (%rcx,%rax),%rdi // &l_info; also destination for decompress
lea sz_l_info+sz_p_info(%rdi),%rsi // &b_info
push %rax; push %rax // param space: munmap temp pages step 9
// the two slots are placeholders here; L220 stores the real length and address
p_unmap= -2*8
lodsl; lodsl; add %rax,%rsi; lodsl // skip unpack helper block
lodsl // eax=dstlen
mov %rdi,%rcx
and $~PAGE_MASK,%ecx // %ecx= fragment
add %rcx,%rax; push %rax // params: mprotect restored pages step 8
sub %rcx,%rdi; push %rdi // dst rounded down to its page boundary
p_mprot= -4*8
sub %rcx,%rax // restore
add %rcx,%rdi // restore
push %rcx // fragment
o_frag = -5*8
// The CALL pushes the address of the code included from bxx.S (the unfilter
// invoked by supervise) and skips over it.
call L210
#include "arch/amd64/bxx.S"
L210:
o_unflt= -6*8
// %rsi points 4 bytes into the current b_info here, hence the -4 in the offsets
movzbl b_method-4+1(%rsi),%ecx; push %rcx // ftid
movzbl b_method-4+2(%rsi),%ecx; push %rcx // cto8
push %rax; mov %rsp,%rcx // dstlen also for unfilter step 7
push %rdi // dst param for unfilter step 7
p_unflt= -10*8
push %rdx // &decompress
o_uncpr= -11*8
lodsl; mov %eax,%edx // %rdx= srclen
lodsl; push %rax // method,filter,cto,junk
push %rcx // &dstlen
push %rdi // dst
push %rdx // srclen
push %rsi // src; arglist ready for decompress step 6
p_uncpr= -16*8
// -4(label) is the 32-bit word just before a label that was pushed by a CALL.
// NOTE(review): with the rel32 form of CALL that word is the displacement,
// i.e. the byte length of the code the CALL skipped -- confirm the encoding.
mov o_uncpr(%rbp),%rax; add -4(%rax),%edx // l_d_cpr + l_f_unc
mov o_unflt(%rbp),%rax; add -4(%rax),%edx // l_d_cpr + l_f_unc + l_f_unf
call L220 // pushes &supervise; L220 copies everything and jumps to the copy
// supervise -- L220 copies this code to the temporary pages and jumps to the
// copy. It maps fresh pages over the destination, restores the page fragment
// below dst, calls the copied decompress and (when ftid is non-zero) the copied
// unfilter, builds the escape hatch, makes the result read+execute, and leaves
// through the hatch. It consumes the stack slots built by main in push order
// reversed. The results of decompress and of the mprotect syscall are not
// examined: %rax is overwritten before any test.
supervise:
// Allocate pages for result of decompressing.
// These replace the compressed source and the following hole.
push $0; pop %arg6
push $0; pop %arg5
push $MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED; pop %sys4
push $PROT_READ|PROT_WRITE; pop %arg3
movq p_mprot+8(%rbp),%arg2 // dstlen
movq p_mprot (%rbp),%arg1 // dst
push $__NR_mmap; pop %rax; syscall
cmp %arg1,%rax; je 0f; hlt; 0: // anything but the requested address: stop at hlt
// Restore fragment of page below dst
movl o_frag(%rbp),%ecx
mov %rax,%rdi // destination: start of the new mapping
mov p_unmap(%rbp),%rsi // source: start of the temporary pages, where L220 copied the fragment
add $3,%ecx; shr $2,%ecx // FIXME: is this safe?
rep movsl
// p_uncpr: the five arguments of decompress, then the address of its copy
pop %arg1 // copied compressed data
pop %arg2 // srclen
pop %arg3 // dst
pop %arg4 // &dstlen
pop %arg5 // method word
pop %rax; call *%rax // decompress
//p_unflt
pop %arg1 // dst
pop %arg2 // dstlen
lea (%arg1,%arg2),%rax // first byte after the decompressed data
movl $0x5e5f050f, (%rax) // "syscall; pop %rdi; pop %rsi"
movb $0xc3,4(%rax) // "ret"
mov %rax,o_hatch(%rbp) // hatch beyond .text
pop %arg3 // cto8
pop %arg4 // ftid
pop %rax; // address of the copied unfilter
test %arg4,%arg4; je 0f // 0==ftid ==> no filter
call *%rax // unfilter
0:
pop %rcx // toss fragment
//p_mprot
pop %arg1 // dst including fragment
pop %arg2 // dstlen
push $PROT_READ|PROT_EXEC; pop %arg3
push $__NR_mprotect; pop %rax; syscall
//p_unmap
pop %arg1 // &temp pages
pop %arg2 // length
push $__NR_munmap; pop %rax // the syscall instruction itself is the first one in the hatch
//// nop; int3; int3
pop %rbp
pop %arg3 // third arg to DT_INIT()
ret // goto escape hatch
// The remaining frame slots from _start are consumed by the hatch code below.
//hatch:
// syscall // munmap temporary pages
// pop %arg1 // first two args to DT_INIT()
// pop %arg2
// ret // goto user DT_INIT
// L220 -- entered by 'call L220' at the end of main, so &supervise is on top
// of the stack and %edx holds srclen plus the two code lengths added in main.
// Allocates the temporary pages, copies into them (in this order) the page
// fragment, the compressed data, decompress, unfilter and supervise, patches
// the stack slots to point at the copies, and jumps to the copied supervise.
L220:
mov o_frag(%rbp),%arg2l // fragment
add %edx,%arg2l // + l_d_cpr + l_f_unc + l_f_unf
pop %rax; push %rax // &supervise
add -4(%rax),%arg2l // total length to allocate
// Allocate pages to hold temporary copy.
push $0; pop %arg6
push $0; pop %arg5
push $MAP_PRIVATE|MAP_ANONYMOUS; pop %sys4
push $PROT_READ|PROT_WRITE|PROT_EXEC; pop %arg3
mov %arg2,p_unmap+8(%rbp) // length to unmap
push $0; pop %arg1 // addr
push $__NR_mmap; pop %rax; syscall
cmpq $PAGE_MASK,%rax; jb 0f; hlt; 0: // unsigned result >= PAGE_MASK is treated as failure: stop at hlt
mov %rax,p_unmap (%rbp) // addr
mov %rax,%rdi // %rdi= dst
pop %rax // &supervise
mov o_frag(%rbp),%ecx // fragment
//p_uncpr
mov p_mprot(%rbp),%rsi // source: page-aligned start of the destination
add $3,%ecx; shr $2,%ecx // FIXME: is this safe?
rep movsl // copy the fragment
pop %rsi // &src data (after fragment)
pop %rcx; push %rcx // length
push %rdi // &copied data (after fragment)
add $3,%ecx; shr $2,%ecx // byte count rounded up to whole 32-bit words
rep movsl // copy compressed data
mov o_uncpr(%rbp),%rsi // original decompress
mov %rdi,o_uncpr(%rbp) // the slot now holds the address of the copy
mov -4(%rsi),%ecx // byte count taken from the word preceding the code
rep movsb // copy decompressor
mov o_unflt(%rbp),%rsi // original unfilter
mov %rdi,o_unflt(%rbp) // the slot now holds the address of the copy
mov -4(%rsi),%ecx // byte count taken from the word preceding the code
rep movsb // copy unfilter
//o_super
mov %rax,%rsi // %rsi= &supervise
push %rdi // &copied
mov -4(%rsi),%ecx // byte count taken from the word preceding the code
rep movsb // copy supervisor
ret // goto copied supervise:
/*__XTHEENDX__*/
/* vim:set ts=8 sw=8 et: */