More ENDBRxx for CALL *%RDX for 'getbit'.
amd64: use LEA xxx(%rip),%reg with a STRCON section,
instead of CALL; .asciz "..."; POP %reg
(Not for PE due to WINDOWS_BACK binary compatibility.)
i386 string constants still use call-.asciz-pop because i386 has no (%rip)-relative addressing.
modified: ../misc/testsuite/upx_testsuite_1-expected_sha256sums.sh
modified: p_lx_elf.cpp
modified: stub/src/amd64-linux.elf-entry.S
modified: stub/src/amd64-linux.elf-main2.c
modified: stub/src/amd64-linux.elf-so_entry.S
modified: stub/src/amd64-linux.shlib-init.S
modified: stub/src/amd64-win64.pe.S
modified: stub/src/i386-linux.elf-entry.S
modified: stub/src/i386-linux.elf-so_entry.S
modified: stub/src/upxfd_android.c
plus generated *.h *.map *.dump
419 lines
12 KiB
ArmAsm
419 lines
12 KiB
ArmAsm
/* amd64-linux.shlib-init.S -- Linux program entry point & decompressor (Elf shared lib)
|
|
*
|
|
* This file is part of the UPX executable compressor.
|
|
*
|
|
* Copyright (C) 1996-2024 Markus Franz Xaver Johannes Oberhumer
|
|
* Copyright (C) 1996-2024 Laszlo Molnar
|
|
* Copyright (C) 2000-2024 John F. Reiser
|
|
* All Rights Reserved.
|
|
*
|
|
* UPX and the UCL library are free software; you can redistribute them
|
|
* and/or modify them under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation; either version 2 of
|
|
* the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; see the file COPYING.
|
|
* If not, write to the Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*
|
|
* Markus F.X.J. Oberhumer Laszlo Molnar
|
|
* <markus@oberhumer.com> <ezerotven+github@gmail.com>
|
|
*
|
|
* John F. Reiser
|
|
* <jreiser@users.sourceforge.net>
|
|
*/
|
|
|
|
#include "arch/amd64/macros.S"
|
|
#include "arch/amd64/regs.h"
|
|
|
|
// Assembly-time constants used by this stub: header sizes and field offsets,
// mmap/mprotect flag bits, Linux syscall numbers, page geometry, method ids.

// Presumably sizeof(Elf64_Ehdr) and sizeof(Elf64_Phdr); neither name is
// referenced in the visible part of this file -- TODO confirm they are needed.
sz_Ehdr= 64
|
|
sz_Phdr= 56
|
|
|
|
// Size of the l_info header; main adds sz_l_info+sz_p_info to &l_info to
// reach the first b_info.
sz_l_info= 12
|
|
// Presumably the offset of the l_lsize field inside l_info (unused in the visible code).
l_lsize= 8
|
|
|
|
// Size of the p_info header that follows l_info.
sz_p_info= 12
|
|
|
|
// b_info header: total size, then the byte offsets of its fields.
sz_b_info= 12
|
|
sz_unc= 0
|
|
sz_cpr= 4
|
|
b_method= 8
|
|
|
|
// Protection bits passed to mmap and mprotect.
PROT_READ= 1
|
|
PROT_WRITE= 2
|
|
PROT_EXEC= 4
|
|
|
|
// Flag bits passed to mmap.
MAP_PRIVATE= 2
|
|
MAP_FIXED= 0x10
|
|
MAP_ANONYMOUS= 0x20
|
|
|
|
// Linux x86-64 system call numbers.
__NR_mmap= 9 // 64-bit mode only! /usr/include/asm/unistd_64.h
|
|
__NR_mprotect= 10
|
|
__NR_munmap= 11
|
|
|
|
// Used only by the error path (L72 and die).
__NR_write= 1
|
|
__NR_exit= 60
|
|
|
|
// Page geometry: PAGE_MASK clears the offset-within-page bits,
// and PAGE_SIZE = -PAGE_MASK = 1<<PAGE_SHIFT.
PAGE_SHIFT= 12
|
|
PAGE_MASK= (~0<<PAGE_SHIFT)
|
|
PAGE_SIZE= -PAGE_MASK
|
|
|
|
// Compression method ids; not referenced in the visible part of this file.
M_NRV2B_LE32=2 // ../conf.h
|
|
M_NRV2D_LE32=5
|
|
M_NRV2E_LE32=8
|
|
|
|
|
|
// .long offset(.) // detect relocation
|
|
// .long offset(user DT_INIT)
|
|
// .long offset(escape_hatch)
|
|
// .long offset({l_info; p_info; b_info; compressed data})
|
|
// _start: entry point of the stub.
// Builds a frame of save slots, then calls main.  The return address pushed
// by that call is ret_main, which is also the address of decompress
// (main pops it as &decompress).
// Frame layout after "mov %rsp,%rbp" (derived from the push order below):
//   0(%rbp)        saved %rbp
//   8(%rbp)        saved third arg to DT_INIT()
//   o_hatch(%rbp)  slot for &hatch
//   24(%rbp)       saved first arg
//   32(%rbp)       saved second arg
//   o_uinit(%rbp)  slot for &DT_INIT
section ELFMAINX
|
|
_start: .globl _start
|
|
// NOTE(review): int3 raises a breakpoint trap when executed.  The same
// sequence is commented out at main and in supervise -- confirm that leaving
// it active here is intentional.
nop; int3; int3
|
|
push %rax // space for &DT_INIT
|
|
o_uinit= 5*8
|
|
push %arg2; push %arg1 // save first two args to DT_INIT()
|
|
push %rax // space for &hatch
|
|
o_hatch= 2*8
|
|
push %arg3 // save third arg to DT_INIT()
|
|
push %rbp; mov %rsp,%rbp // frame pointer
|
|
// The 4-byte displacement field of this call sits immediately before
// decompress; main and L220 read it via -4(&decompress) as a length.
call main // push &decompress
|
|
ret_main:
|
|
|
|
/* Returns 0 on success; non-zero on failure. */
|
|
// Prologue of the C-callable decompressor.  It saves %rbp and %rbx, then
// pushes ldst, dst and src+lsrc; the epilogue at eof pops those three values
// in the reverse order to compute the return value and the output length.
decompress: // (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)
|
|
|
|
/* Arguments according to calling convention */
|
|
#define src %arg1
|
|
#define lsrc %arg2
|
|
#define dst %arg3
|
|
#define ldst %arg4 /* Out: actually a reference: &len_dst */
|
|
#define meth %arg5l
|
|
#define methb %arg5b
|
|
|
|
push %rbp; push %rbx // C callable
|
|
push ldst
|
|
push dst
|
|
// lsrc is temporarily turned into the end-of-input address so it can be pushed.
addq src,lsrc; push lsrc // &input_eof
|
|
subq src,lsrc // restore the value of lsrc
|
|
|
|
// NRV_HEAD: common register setup shared by the decompressor bodies that
// are included further down.
section NRV_HEAD
|
|
|
|
/* Working registers */
|
|
#define off %eax /* XXX: 2GB */
|
|
#define len %ecx /* XXX: 2GB */
|
|
#define lenq %rcx
|
|
#define bits %ebx
|
|
#define displ %ebp
|
|
#define dispq %rbp
|
|
|
|
movq src,%rsi // hardware src for movsb, lodsb
|
|
movq dst,%rdi // hardware dst for movsb
|
|
xor bits,bits // empty; force refill
|
|
xor len,len // create loop invariant
|
|
orq $(~0),dispq // -1: initial displacement
|
|
// No instructions are emitted between ra_setup and getbit (only comments and
// macro definitions), so the return address pushed here is the address of
// getbit; setup pops it into %r11.
call setup // push &getbit [TUNED]
|
|
ra_setup:
|
|
|
|
/* AMD64 branch prediction is much worse if there are more than 3 branches
|
|
per 16-byte block. The jnextb would suffer unless inlined. getnextb is OK
|
|
using closed subroutine to save space, and should be OK on cycles because
|
|
CALL+RET should be predicted. getnextb could partially expand, using closed
|
|
subroutine only for refill.
|
|
*/
|
|
/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
|
|
/* Prediction omitted for now. */
|
|
/* On refill: prefetch next byte, for latency reduction on literals and offsets. */
|
|
// The "n" (unlikely) names are plain aliases of the "y" (likely) names.
// Each jnextb* macro expands to an inline bit fetch followed by a bare
// jc/jnc, so the branch target is written right after the macro name.
#define jnextb0np jnextb0yp
|
|
#define jnextb0yp GETBITp; jnc
|
|
#define jnextb1np jnextb1yp
|
|
#define jnextb1yp GETBITp; jc
|
|
// GETBITp: shift the next bit of "bits" into Carry.  When the result is zero,
// reload 32 bits from (%rsi), advance %rsi by 4, and preload the following
// byte into %edx.  Local label 0: is where the two paths rejoin.
#define GETBITp \
|
|
addl bits,bits; jnz 0f; \
|
|
movl (%rsi),bits; subq $-4,%rsi; \
|
|
adcl bits,bits; movzbl (%rsi),%edx; \
|
|
0:
|
|
/* Same, but without prefetch (not useful for length of match.) */
|
|
#define jnextb0n jnextb0y
|
|
#define jnextb0y GETBIT; jnc
|
|
#define jnextb1n jnextb1y
|
|
#define jnextb1y GETBIT; jc
|
|
// GETBIT: identical to GETBITp except that %edx is left untouched.
#define GETBIT \
|
|
addl bits,bits; jnz 0f; \
|
|
movl (%rsi),bits; subq $-4,%rsi; \
|
|
adcl bits,bits; \
|
|
0:
|
|
|
|
/* rotate next bit into bottom bit of reg */
|
|
// The out-of-line getbit routine is reached through %r11, which setup loads
// with its address; the adcl then shifts the returned Carry into reg.
#define getnextbp(reg) call *%r11; adcl reg,reg
|
|
#define getnextb(reg) getnextbp(reg)
|
|
|
|
|
|
// getbit: closed subroutine that delivers the next input bit in Carry.
// In:  bits = bit buffer, %rsi = next input position.
// Out: Carry = next bit.  On the refill path %rsi has advanced by 4 and
//      %edx holds the zero-extended byte at the new (%rsi).
// It is invoked indirectly (getnextbp uses "call *%r11"), so the first
// instruction is an endbr64 indirect-branch landing pad.
getbit:
|
|
endbr64
|
|
// A zero result means no buffered bits remain, so take the refill path.
addl bits,bits; jz refill // Carry= next bit
|
|
rep; ret
|
|
refill:
|
|
movl (%rsi),bits; subq $-4,%rsi // next 32 bits; set Carry
|
|
adcl bits,bits // LSB= 1 (CarryIn); CarryOut= next bit
|
|
// movzbl does not modify flags, so Carry still holds the bit on return.
movzbl (%rsi),%edx // speculate: literal, or bottom 8 bits of offset
|
|
rep; ret
|
|
|
|
// copy: copy len bytes of an earlier match from %rdi+dispq to %rdi.
// It uses 4-byte chunks when len > 5 and the displacement is at most -4, and
// single bytes otherwise.  Both loops preload the next source byte into %edx.
copy: // In: len, %rdi, dispq; Out: 0==len, %rdi, dispq; trashes %rax, %rdx
|
|
// %rax = source address; the flags of this cmpl are consumed by the jbe on
// the next line (movzbl leaves flags unchanged).
leaq (%rdi,dispq),%rax; cmpl $5,len // <=3 is forced
|
|
movzbl (%rax),%edx; jbe copy1 // <=5 for better branch predict
|
|
// Unsigned compare: a displacement of -3..-1 takes the byte loop.
cmpq $-4,dispq; ja copy1 // 4-byte chunks would overlap
|
|
subl $4,len // adjust for termination cases
|
|
copy4:
|
|
movl (%rax),%edx; addq $4, %rax; subl $4,len
|
|
// movl and leaq leave flags alone, so jnc tests the borrow of the subl above.
movl %edx,(%rdi); leaq 4(%rdi),%rdi; jnc copy4
|
|
// Undo the bias; zero means nothing is left, otherwise finish bytewise.
addl $4,len; movzbl (%rax),%edx; jz copy0
|
|
copy1:
|
|
incq %rax; movb %dl,(%rdi); subl $1,len
|
|
movzbl (%rax),%edx
|
|
// jnz tests the Zero flag from the subl; movzbl and leaq do not alter flags.
leaq 1(%rdi),%rdi; jnz copy1
|
|
copy0:
|
|
rep; ret
|
|
|
|
// setup: reached by "call setup" from NRV_HEAD and never returns to it.
// It clears the direction flag and pops its own return address into %r11.
// That address is ra_setup, where getbit begins, so %r11 = &getbit for the
// "call *%r11" in getnextbp.  Execution then continues into the decompressor
// section that follows (presumably only the selected one is linked into a
// given stub -- TODO confirm).
setup:
|
|
cld
|
|
pop %r11 // addq $ getbit - ra_setup,%r11 # &getbit
|
|
|
|
section NRV2E
|
|
#include "arch/amd64/nrv2e_d.S"
|
|
|
|
section NRV2D
|
|
#include "arch/amd64/nrv2d_d.S"
|
|
|
|
section NRV2B
|
|
#include "arch/amd64/nrv2b_d.S"
|
|
|
|
#include "arch/amd64/lzma_d.S"
|
|
|
|
section NRV_TAIL
|
|
// empty
|
|
|
|
// Drop the working-register aliases so the names can be reused by the code below.
#undef off
|
|
#undef len
|
|
#undef lenq
|
|
#undef bits
|
|
#undef displ
|
|
#undef dispq
|
|
|
|
// eof: epilogue of decompress.
// It pops the three values pushed by the decompress prologue (&input_eof,
// original dst, &ldst), returns in %rax the distance between the final
// source pointer and the end of input (0 when they coincide), stores the
// number of bytes produced through the ldst pointer, and restores %rbx/%rbp.
section ELFMAINY
|
|
eof:
|
|
pop %rcx // &input_eof
|
|
movq %rsi,%rax; subq %rcx,%rax // src -= eof; // return 0: good; else: bad
|
|
pop %rdx; subq %rdx,%rdi // dst -= original dst
|
|
// Only the low 32 bits of the length are stored.
pop %rcx; movl %edi,(%rcx) // actual length used at dst XXX: 4GB
|
|
pop %rbx; pop %rbp
|
|
ret
|
|
|
|
// msg_SELinux: print a fixed diagnostic and exit.
// "call L72" pushes the address of the string at L70 as its return address;
// L72 pops that as the message pointer and never returns here.
// NOTE(review): nothing in the visible part of this file branches to
// msg_SELinux (the mmap failure checks use hlt) -- confirm whether it is
// still reachable.
msg_SELinux:
|
|
// .asciz appends a terminating NUL, so L71 - L70 includes that byte.
push $ L71 - L70; pop %arg3 // length
|
|
call L72
|
|
L70:
|
|
.asciz "PROT_EXEC|PROT_WRITE failed.\n"
|
|
L71:
|
|
// IDENTSTR goes here
|
|
|
|
// L72: error exit.  Expects the message length already in %arg3 and the
// message address on top of the stack (left there by "call L72").  It issues
// write(2, message, length), then falls through to die, which issues exit(127).
section ELFMAINZ
|
|
L72:
|
|
pop %arg2 // message text
|
|
push $2; pop %arg1 // fd stderr
|
|
push $ __NR_write; pop %rax
|
|
syscall
|
|
// The result of write is ignored; the process exits with status 127.
die:
|
|
push $127; pop %arg1
|
|
push $ __NR_exit; pop %rax
|
|
syscall
|
|
|
|
// main: first stage of the loader, entered by "call main" from _start.
// It relocates the four offsets stored just before _start, then builds the
// stack parameter blocks (p_unmap, p_mprot, p_unflt, p_uncpr) and the code
// pointers (o_unflt, o_uncpr) that L220 and supervise consume, and finally
// calls L220.  All o_* and p_* symbols are offsets from %rbp and depend on
// the exact push order below.
main:
|
|
//// nop; int3; int3
|
|
|
|
// 1. allocate temporary pages
|
|
// 2. copy to temporary pages:
|
|
// fragment of page below dst; compressed src;
|
|
// decompress+unfilter; supervise
|
|
// 3. mmap destination pages for decompressed data
|
|
// 4. create escape hatch
|
|
// 5. jump to temporary pages
|
|
// 6. uncompress
|
|
// 7. unfilter
|
|
// 8. mprotect decompressed pages
|
|
// 9. setup args for unmap of temp pages
|
|
// 10. jump to escape hatch
|
|
// 11. unmap temporary pages
|
|
// 12. goto user DT_INIT
|
|
|
|
// Return address of "call main", which is the address of decompress.
pop %rdx // &decompress
|
|
|
|
// %rsi = address of the four 32-bit offsets located just before _start.
lea _start - decompress - 4*4(%rdx),%rsi
|
|
mov %rsi,%rcx
|
|
// The first offset is that of the word itself, so runtime address minus
// stored offset gives the relocation base in %rcx.
lodsl; sub %rax,%rcx; //mov %rcx,o_reloc(%rbp)
|
|
lodsl; add %rcx,%rax; mov %rax,o_uinit(%rbp) // reloc DT_INIT for step 12
|
|
lodsl; add %rcx,%rax; mov %rax,o_hatch(%rbp) // reloc &hatch for step 10
|
|
lodsl; lea (%rcx,%rax),%rdi // &l_info; also destination for decompress
|
|
lea sz_l_info+sz_p_info(%rdi),%rsi // &b_info
|
|
|
|
// Two uninitialised slots: p_unmap(%rbp) = address and p_unmap+8(%rbp) =
// length of the temporary pages; L220 fills them in.
push %rax; push %rax // param space: munmap temp pages step 9
|
|
p_unmap= -2*8
|
|
|
|
// Read sz_unc and sz_cpr of the first b_info, step over sz_cpr bytes, and
// consume one more word, leaving %rsi at the next b_info.
lodsl; lodsl; add %rax,%rsi; lodsl // skip unpack helper block
|
|
|
|
lodsl // eax=dstlen
|
|
mov %rdi,%rcx
|
|
// Offset of dst within its page.
and $~PAGE_MASK,%ecx // %ecx= fragment
|
|
// p_mprot+8(%rbp) = dstlen + fragment; p_mprot(%rbp) = page-aligned dst.
add %rcx,%rax; push %rax // params: mprotect restored pages step 8
|
|
sub %rcx,%rdi; push %rdi
|
|
p_mprot= -4*8
|
|
sub %rcx,%rax // restore
|
|
add %rcx,%rdi
|
|
push %rcx // fragment
|
|
o_frag = -5*8
|
|
|
|
// The call pushes the address of the included bxx.S code into the o_unflt
// slot; its displacement field, at -4 from that address, is read later as
// the length of that code.
call L210
|
|
#include "arch/amd64/bxx.S"
|
|
L210:
|
|
o_unflt= -6*8
|
|
// %rsi points at the sz_cpr field here, so b_method-4 addresses the method word.
movzbl b_method-4+1(%rsi),%ecx; push %rcx // ftid
|
|
movzbl b_method-4+2(%rsi),%ecx; push %rcx // cto8
|
|
// %rcx = address of the dstlen slot just pushed; it is passed on as &dstlen below.
push %rax; mov %rsp,%rcx // dstlen also for unfilter step 7
|
|
push %rdi // dst param for unfilter step 7
|
|
p_unflt= -10*8
|
|
|
|
push %rdx // &decompress
|
|
o_uncpr= -11*8
|
|
lodsl; mov %eax,%edx // %rdx= srclen
|
|
lodsl; push %rax // method,filter,cto,junk
|
|
push %rcx // &dstlen
|
|
push %rdi // dst
|
|
push %rdx // srclen
|
|
push %rsi // src; arglist ready for decompress step 6
|
|
p_uncpr= -16*8
|
|
|
|
// -4(code address) is the displacement field of the call that pushed that
// address; it is used here as the byte length of the code that follows it.
mov o_uncpr(%rbp),%rax; add -4(%rax),%edx // l_d_cpr + l_f_unc
|
|
mov o_unflt(%rbp),%rax; add -4(%rax),%edx // l_d_cpr + l_f_unc + l_f_unf
|
|
|
|
// Pushes &supervise; L220 copies everything to temporary pages and returns
// into the copied supervise.
call L220
|
|
// supervise: second stage.  L220 copies this code to the temporary pages and
// returns into that copy, with %rsp pointing at the p_uncpr argument list.
// Steps: map fresh pages over the destination, restore the page fragment
// below dst, decompress, write the escape hatch after the decompressed data,
// optionally unfilter, mprotect the result to read+exec, then return through
// the hatch with the munmap arguments already loaded.
supervise:
|
|
// Allocate pages for result of decompressing.
|
|
// These replace the compressed source and the following hole.
|
|
push $0; pop %arg6
|
|
push $0; pop %arg5
|
|
// %sys4 is defined outside this file (presumably the register that carries
// the fourth syscall argument -- TODO confirm in regs.h).
push $MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED; pop %sys4
|
|
push $PROT_READ|PROT_WRITE; pop %arg3
|
|
movq p_mprot+8(%rbp),%arg2 // dstlen
|
|
movq p_mprot (%rbp),%arg1 // dst
|
|
push $__NR_mmap; pop %rax; syscall
|
|
// Halt unless the mapping landed exactly at the requested address.
cmp %arg1,%rax; je 0f; hlt; 0:
|
|
|
|
// Restore fragment of page below dst
|
|
movl o_frag(%rbp),%ecx
|
|
mov %rax,%rdi
|
|
// Source is the start of the temporary pages, where L220 saved the fragment.
mov p_unmap(%rbp),%rsi
|
|
// Rounds the byte count up to whole 32-bit words, so up to 3 extra bytes
// may be copied.
add $3,%ecx; shr $2,%ecx // FIXME: is this safe?
|
|
rep movsl
|
|
|
|
// Pop the p_uncpr list into the first five argument registers, in the order
// src, srclen, dst, &dstlen, method word; then pop the o_uncpr slot, which
// L220 redirected to the copied decompressor.
pop %arg1
|
|
pop %arg2
|
|
pop %arg3
|
|
pop %arg4
|
|
pop %arg5
|
|
pop %rax; call *%rax // decompress
|
|
//p_unflt
|
|
// dst and the dstlen slot whose address was passed to decompress as &dstlen.
pop %arg1
|
|
pop %arg2
|
|
|
|
// Write the 5-byte escape hatch immediately after the decompressed data.
// Bytes in memory order: 0f 05 (syscall), 5f (pop %rdi), 5e (pop %rsi), c3 (ret).
lea (%arg1,%arg2),%rax
|
|
movl $0x5e5f050f, (%rax) // "syscall; pop %rdi; pop %rsi"
|
|
movb $0xc3,4(%rax) // "ret"
|
|
mov %rax,o_hatch(%rbp) // hatch beyond .text
|
|
|
|
// Remaining unfilter parameters: cto8, ftid, then the o_unflt slot (address
// of the copied unfilter code).
pop %arg3
|
|
pop %arg4
|
|
pop %rax;
|
|
test %arg4,%arg4; je 0f // 0==ftid ==> no filter
|
|
call *%rax // unfilter
|
|
0:
|
|
pop %rcx // toss fragment
|
|
//p_mprot
|
|
pop %arg1 // dst including fragment
|
|
pop %arg2 // dstlen
|
|
push $PROT_READ|PROT_EXEC; pop %arg3
|
|
// The result of mprotect is not checked.
push $__NR_mprotect; pop %rax; syscall
|
|
//p_unmap
|
|
// Load the munmap arguments and syscall number; the syscall instruction
// itself is executed inside the hatch, because this code lives in the pages
// being unmapped.
pop %arg1 // &temp pages
|
|
pop %arg2 // length
|
|
push $__NR_munmap; pop %rax
|
|
|
|
//// nop; int3; int3
|
|
|
|
pop %rbp
|
|
pop %arg3 // third arg to DT_INIT()
|
|
// The next stack word is the o_hatch slot, so ret transfers to the hatch.
ret // goto escape hatch
|
|
//hatch:
|
|
// syscall // munmap temporary pages
|
|
// pop %arg1 // first two args to DT_INIT()
|
|
// pop %arg2
|
|
// ret // goto user DT_INIT
|
|
|
|
// L220: reached by "call L220" from main, so the top of stack is &supervise.
// It allocates temporary read+write+exec pages, copies into them (in this
// order) the page fragment below dst, the compressed data, the decompressor,
// the unfilter and the supervisor, redirects the o_uncpr and o_unflt slots to
// the copies, and finally returns into the copied supervise.
// In: %edx = srclen + decompressor length + unfilter length (computed by main).
L220:
|
|
mov o_frag(%rbp),%arg2l // fragment
|
|
add %edx,%arg2l // + l_d_cpr + l_f_unc + l_f_unf
|
|
pop %rax; push %rax // &supervise
|
|
// -4(&supervise) is the displacement field of "call L220"; it is used as the
// byte length of the supervisor code.
add -4(%rax),%arg2l // total length to allocate
|
|
|
|
// Allocate pages to hold temporary copy.
|
|
push $0; pop %arg6
|
|
push $0; pop %arg5
|
|
push $MAP_PRIVATE|MAP_ANONYMOUS; pop %sys4
|
|
push $PROT_READ|PROT_WRITE|PROT_EXEC; pop %arg3
|
|
mov %arg2,p_unmap+8(%rbp) // length to unmap
|
|
push $0; pop %arg1 // addr
|
|
push $__NR_mmap; pop %rax; syscall
|
|
// Unsigned compare: a result at or above PAGE_MASK (-4096) is an error, so halt.
cmpq $PAGE_MASK,%rax; jb 0f; hlt; 0:
|
|
|
|
mov %rax,p_unmap (%rbp) // addr
|
|
mov %rax,%rdi // %rdi= dst
|
|
pop %rax // &supervise
|
|
mov o_frag(%rbp),%ecx // fragment
|
|
//p_uncpr
|
|
// Source is the page-aligned destination address saved in the p_mprot block.
mov p_mprot(%rbp),%rsi
|
|
// Rounds the byte count up to whole 32-bit words, so up to 3 extra bytes
// may be copied.
add $3,%ecx; shr $2,%ecx // FIXME: is this safe?
|
|
rep movsl // copy the fragment
|
|
|
|
// Replace the src entry of the p_uncpr list with the address of the copy,
// keeping srclen in place.
pop %rsi // &src data (after fragment)
|
|
pop %rcx; push %rcx // length
|
|
push %rdi // &copied data (after fragment)
|
|
add $3,%ecx; shr $2,%ecx
|
|
rep movsl // copy compressed data
|
|
|
|
// Copy the decompressor and make the o_uncpr slot point at the copy; the
// byte count is the word stored just before the original code.
mov o_uncpr(%rbp),%rsi
|
|
mov %rdi,o_uncpr(%rbp)
|
|
mov -4(%rsi),%ecx
|
|
rep movsb // copy decompressor
|
|
|
|
// Same for the unfilter code and the o_unflt slot.
mov o_unflt(%rbp),%rsi
|
|
mov %rdi,o_unflt(%rbp)
|
|
mov -4(%rsi),%ecx
|
|
rep movsb // copy unfilter
|
|
|
|
//o_super
|
|
mov %rax,%rsi // %rsi= &supervise
|
|
// Address of the copied supervisor becomes the return address for the ret below.
push %rdi // &copied
|
|
mov -4(%rsi),%ecx
|
|
rep movsb // copy supervisor
|
|
|
|
ret // goto copied supervise:
|
|
|
|
/*__XTHEENDX__*/
|
|
|
|
/* vim:set ts=8 sw=8 et: */
|