diff --git a/src/stub/amd_d_nrv2b.S b/src/stub/amd_d_nrv2b.S
index e4a2f4ab..07982429 100644
--- a/src/stub/amd_d_nrv2b.S
+++ b/src/stub/amd_d_nrv2b.S
@@ -34,7 +34,7 @@ lit_n2b:
         incq %rsi; movb %dl,(%rdi)
         incq %rdi
 top_n2b:
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         jnextb1y lit_n2b
         lea 1(lenq),off  # [len= 0] off= 1
 offmore_n2b:
@@ -42,7 +42,7 @@ offmore_n2b:
         jnextb0n offmore_n2b
 
         subl $ 3,off; jc len_n2b  # use previous offset
-        shll $ 8,off
+        shll $ 8,off; movzbl %dl,%edx
         orl %edx,off; incq %rsi
         xorl $~0,off; jz eof
         movslq off,disp  # XXX: 2GB
diff --git a/src/stub/amd_d_nrv2e.S b/src/stub/amd_d_nrv2e.S
index ef351e69..3cfcd6e5 100644
--- a/src/stub/amd_d_nrv2e.S
+++ b/src/stub/amd_d_nrv2e.S
@@ -34,7 +34,7 @@ lit_n2e:
         incq %rsi; movb %dl,(%rdi)
         incq %rdi
 top_n2e:
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         jnextb1y lit_n2e
         lea 1(lenq),off  # [len= 0] off= 1
         jmp getoff_n2e
@@ -47,7 +47,7 @@ getoff_n2e:
         jnextb0n off_n2e
 
         subl $ 3,off; jc offprev_n2e
-        shll $ 8,off
+        shll $ 8,off; movzbl %dl,%edx
         orl %edx,off; incq %rsi
         xorl $~0,off; jz eof
         sarl off  # Carry= original low bit
diff --git a/src/stub/l_lx_elf64amd.S b/src/stub/l_lx_elf64amd.S
index 54524db5..e57adb68 100644
--- a/src/stub/l_lx_elf64amd.S
+++ b/src/stub/l_lx_elf64amd.S
@@ -91,17 +91,29 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint metho
         xorl bits,bits  # empty; force refill
         xorl len,len  # create loop invariant
         orq $~0,disp  # -1: initial displacement
-        jmp setup
+        call setup  # push &getbit  [TUNED]
+ra_setup:
+/* AMD64 branch prediction is much worse if there are more than 3 branches
+   per 16-byte block.  The jnextb would suffer unless inlined.  getnextb is OK
+   using a closed subroutine to save space, and should be OK on cycles because
+   CALL+RET should be predicted.  getnextb could partially expand, using a
+   closed subroutine only for refill.
+*/
 
 /* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
 /* Prediction omitted for now. */
 #define jnextb0n jnextb0y
-#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc
+#define jnextb0y GETBIT; jnc
 #define jnextb1n jnextb1y
-#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc
+#define jnextb1y GETBIT; jc
+#define GETBIT \
+        addl bits,bits; jnz 0f; \
+        movl (%rsi),bits; subq $-4,%rsi; \
+        adcl bits,bits; movb (%rsi),%dl; \
+0:
 
 /* rotate next bit into bottom bit of reg */
-#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
+#define getnextb(reg) call *%r11; adcl reg,reg
 
 ALIGN(1<<3)
 getbit:
@@ -110,13 +122,13 @@ getbit:
 refill:
         movl (%rsi),bits; subq $-4,%rsi  # next 32 bits; set Carry
         adcl bits,bits  # LSB= 1 (CarryIn); CarryOut= next bit
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         rep; ret
 
 copy:  # In: len, %rdi, disp;  Out: 0==len, %rdi, disp;  trashes %rax, %rdx
-        leaq (%rdi,disp),%rax; movb (%rax),%dl
-        cmpl $ 3,len; jbe copy1  # perhaps extend this to length 5 or less?
-        cmpq $-4,disp; ja copy1  # 4-byte chunks would overlap
+        leaq (%rdi,disp),%rax; cmpl $5,len  # <=3 is forced
+        movb (%rax),%dl; jbe copy1  # <=5 for better branch predict
+        cmpq $-4,disp; ja copy1  # 4-byte chunks would overlap
         subl $4,len  # adjust for termination cases
 copy4:
         movl (%rax),%edx; addq $4, %rax; subl $4,len
@@ -134,7 +146,8 @@ copy0:
 
 setup:
         cld
-        cmpl $ M_NRV2E_LE32,meth; je bot_n2e
+        pop %r11  # addq $ getbit - ra_setup,%r11  # &getbit
+        cmpl $ M_NRV2E_LE32,meth; je top_n2e
         cmpl $ M_NRV2B_LE32,meth; je top_n2b
 eof:
         pop %rcx  # &input_eof
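For reference, below is a minimal C sketch of the bit reader that the GETBIT macro and the refill routine implement, assuming the NRV2B/NRV2E convention of an MSB-first 32-bit shift register with a sentinel 1 bit, refilled from the little-endian input stream when it runs empty. The names struct bitbuf and getbit are illustrative only, not part of the stub.

/* Hypothetical sketch of GETBIT/refill, not part of the patch. */
#include <stdint.h>

struct bitbuf {
    const uint8_t *src;  /* next input byte (%rsi in the stub) */
    uint32_t bits;       /* shift register ("bits"); 0 forces a refill */
};

static unsigned getbit(struct bitbuf *b)
{
    unsigned bit = b->bits >> 31;   /* carry out of "addl bits,bits" */
    b->bits <<= 1;
    if (b->bits == 0) {             /* only the sentinel was left: refill */
        uint32_t w = (uint32_t)b->src[0]       | (uint32_t)b->src[1] << 8
                   | (uint32_t)b->src[2] << 16 | (uint32_t)b->src[3] << 24;
        b->src += 4;                /* "subq $-4,%rsi" */
        bit = w >> 31;              /* MSB is produced immediately (adcl carry out) */
        b->bits = (w << 1) | 1;     /* keep the low 31 bits plus the sentinel */
    }
    return bit;
}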