diff --git a/src/stub/amd_d_nrv2b.S b/src/stub/amd_d_nrv2b.S
index e4a2f4ab..07982429 100644
--- a/src/stub/amd_d_nrv2b.S
+++ b/src/stub/amd_d_nrv2b.S
@@ -34,7 +34,7 @@ lit_n2b:
         incq %rsi; movb %dl,(%rdi)
         incq %rdi
 top_n2b:
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         jnextb1y lit_n2b
         lea 1(lenq),off  # [len= 0] off= 1
 offmore_n2b:
@@ -42,7 +42,7 @@ offmore_n2b:
         jnextb0n offmore_n2b
 
         subl $ 3,off; jc len_n2b  # use previous offset
-        shll $ 8,off
+        shll $ 8,off; movzbl %dl,%edx
         orl %edx,off; incq %rsi
         xorl $~0,off; jz eof
         movslq off,disp  # XXX: 2GB
diff --git a/src/stub/amd_d_nrv2e.S b/src/stub/amd_d_nrv2e.S
index ef351e69..3cfcd6e5 100644
--- a/src/stub/amd_d_nrv2e.S
+++ b/src/stub/amd_d_nrv2e.S
@@ -34,7 +34,7 @@ lit_n2e:
         incq %rsi; movb %dl,(%rdi)
         incq %rdi
 top_n2e:
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         jnextb1y lit_n2e
         lea 1(lenq),off  # [len= 0] off= 1
         jmp getoff_n2e
@@ -47,7 +47,7 @@ getoff_n2e:
         jnextb0n off_n2e
 
         subl $ 3,off; jc offprev_n2e
-        shll $ 8,off
+        shll $ 8,off; movzbl %dl,%edx
         orl %edx,off; incq %rsi
         xorl $~0,off; jz eof
         sarl off  # Carry= original low bit
diff --git a/src/stub/l_lx_elf64amd.S b/src/stub/l_lx_elf64amd.S
index 54524db5..e57adb68 100644
--- a/src/stub/l_lx_elf64amd.S
+++ b/src/stub/l_lx_elf64amd.S
@@ -91,17 +91,29 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint metho
         xorl bits,bits  # empty; force refill
         xorl len,len  # create loop invariant
         orq $~0,disp  # -1: initial displacement
-        jmp setup
+        call setup  # push &getbit  [TUNED]
+ra_setup:
+/* AMD64 branch prediction is much worse if there are more than 3 branches
+   per 16-byte block.  The jnextb would suffer unless inlined.  getnextb is OK
+   using a closed subroutine to save space, and should be OK on cycles because
+   CALL+RET should be predicted.  getnextb could partially expand, using a
+   closed subroutine only for refill.
+*/
 
 /* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
 /* Prediction omitted for now. */
 #define jnextb0n jnextb0y
-#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc
+#define jnextb0y GETBIT; jnc
 #define jnextb1n jnextb1y
-#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc
+#define jnextb1y GETBIT; jc
+#define GETBIT \
+        addl bits,bits; jnz 0f; \
+        movl (%rsi),bits; subq $-4,%rsi; \
+        adcl bits,bits; movb (%rsi),%dl; \
+0:
 
 /* rotate next bit into bottom bit of reg */
-#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
+#define getnextb(reg) call *%r11; adcl reg,reg
 
 ALIGN(1<<3)
 getbit:
@@ -110,13 +122,13 @@ getbit:
 refill:
         movl (%rsi),bits; subq $-4,%rsi  # next 32 bits; set Carry
         adcl bits,bits  # LSB= 1 (CarryIn); CarryOut= next bit
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         rep; ret
 
 copy:  # In: len, %rdi, disp;  Out: 0==len, %rdi, disp;  trashes %rax, %rdx
-        leaq (%rdi,disp),%rax; movb (%rax),%dl
-        cmpl $ 3,len; jbe copy1  # perhaps extend this to length 5 or less?
-        cmpq $-4,disp; ja copy1  # 4-byte chunks would overlap
+        leaq (%rdi,disp),%rax; cmpl $5,len  # <=3 is forced
+        movb (%rax),%dl; jbe copy1  # <=5 for better branch predict
+        cmpq $-4,disp; ja copy1  # 4-byte chunks would overlap
         subl $4,len  # adjust for termination cases
 copy4:
         movl (%rax),%edx; addq $4, %rax; subl $4,len
@@ -134,7 +146,8 @@ copy0:
 
 setup:
         cld
-        cmpl $ M_NRV2E_LE32,meth; je bot_n2e
+        pop %r11  # addq $ getbit - ra_setup,%r11  # &getbit
+        cmpl $ M_NRV2E_LE32,meth; je top_n2e
         cmpl $ M_NRV2B_LE32,meth; je top_n2b
 eof:
         pop %rcx  # &input_eof
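For reference, below is a minimal C sketch of the bit reader that the GETBIT macro and the refill routine implement, assuming the NRV2B/NRV2E convention of an MSB-first 32-bit shift register with a sentinel 1 bit, refilled from the little-endian input stream when it runs empty. The names struct bitbuf and getbit are illustrative only, not part of the stub.

/* Hypothetical sketch of GETBIT/refill, not part of the patch. */
#include <stdint.h>

struct bitbuf {
    const uint8_t *src;  /* next input byte (%rsi in the stub) */
    uint32_t bits;       /* shift register ("bits"); 0 forces a refill */
};

static unsigned getbit(struct bitbuf *b)
{
    unsigned bit = b->bits >> 31;   /* carry out of "addl bits,bits" */
    b->bits <<= 1;
    if (b->bits == 0) {             /* only the sentinel was left: refill */
        uint32_t w = (uint32_t)b->src[0]       | (uint32_t)b->src[1] << 8
                   | (uint32_t)b->src[2] << 16 | (uint32_t)b->src[3] << 24;
        b->src += 4;                /* "subq $-4,%rsi" */
        bit = w >> 31;              /* MSB is produced immediately (adcl carry out) */
        b->bits = (w << 1) | 1;     /* keep the low 31 bits plus the sentinel */
    }
    return bit;
}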