Inline the emitCopy call.

Benchmark results before and after this change:
name old speed new speed delta
WordsEncode1e1-8 701MB/s ± 1% 712MB/s ± 1% +1.64% (p=0.000 n=10+10)
WordsEncode1e2-8 429MB/s ± 0% 467MB/s ± 0% +8.86% (p=0.000 n=9+9)
WordsEncode1e3-8 447MB/s ± 0% 483MB/s ± 0% +8.20% (p=0.000 n=9+9)
WordsEncode1e4-8 322MB/s ± 1% 353MB/s ± 1% +9.76% (p=0.000 n=10+10)
WordsEncode1e5-8 268MB/s ± 0% 293MB/s ± 0% +9.42% (p=0.000 n=9+8)
WordsEncode1e6-8 313MB/s ± 0% 345MB/s ± 0% +10.06% (p=0.000 n=8+9)
RandomEncode-8 14.4GB/s ± 1% 14.4GB/s ± 2% ~ (p=0.829 n=8+10)
_ZFlat0-8 797MB/s ± 2% 863MB/s ± 0% +8.39% (p=0.000 n=9+9)
_ZFlat1-8 435MB/s ± 1% 471MB/s ± 0% +8.34% (p=0.000 n=9+8)
_ZFlat2-8 16.1GB/s ± 2% 16.2GB/s ± 2% ~ (p=0.165 n=10+10)
_ZFlat3-8 633MB/s ± 0% 659MB/s ± 1% +4.12% (p=0.000 n=10+9)
_ZFlat4-8 7.95GB/s ± 1% 8.29GB/s ± 1% +4.22% (p=0.000 n=10+10)
_ZFlat5-8 771MB/s ± 0% 836MB/s ± 1% +8.33% (p=0.000 n=10+9)
_ZFlat6-8 283MB/s ± 0% 315MB/s ± 0% +11.19% (p=0.000 n=10+9)
_ZFlat7-8 265MB/s ± 0% 293MB/s ± 1% +10.73% (p=0.000 n=9+10)
_ZFlat8-8 299MB/s ± 0% 331MB/s ± 1% +10.74% (p=0.000 n=9+10)
_ZFlat9-8 246MB/s ± 1% 273MB/s ± 1% +10.90% (p=0.000 n=10+10)
_ZFlat10-8 1.05GB/s ± 1% 1.12GB/s ± 1% +7.02% (p=0.000 n=10+10)
_ZFlat11-8 411MB/s ± 0% 460MB/s ± 0% +11.79% (p=0.000 n=10+8)
diff --git a/encode_amd64.s b/encode_amd64.s
index a233b59..6a21444 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -472,15 +472,14 @@
MOVQ SI, 32(SP)
// Spill local variables (registers) onto the stack; call; unspill.
- //
- // We don't need to unspill CX or R9 as we are just about to call another
- // function.
MOVQ DI, 80(SP)
MOVQ R11, 96(SP)
MOVQ R12, 104(SP)
CALL ·extendMatch(SB)
+ MOVQ 56(SP), CX
MOVQ 64(SP), DX
MOVQ 80(SP), DI
+ MOVQ 88(SP), R9
MOVQ 96(SP), R11
MOVQ 104(SP), R12
@@ -489,29 +488,69 @@
MOVQ 40(SP), SI
ADDQ DX, SI
- // d += emitCopy(dst[d:], base-candidate, s-base)
+ // ----------------------------------------
+ // Begin inline of the emitCopy call.
//
- // Push args.
- MOVQ DI, 0(SP)
- MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
- MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
- MOVQ R11, 24(SP)
+ // d += emitCopy(dst[d:], base-candidate, s-base)
+
+ // !!! length := s - base
MOVQ SI, AX
SUBQ R12, AX
- MOVQ AX, 32(SP)
- // Spill local variables (registers) onto the stack; call; unspill.
- MOVQ SI, 72(SP)
- MOVQ DI, 80(SP)
- CALL ·emitCopy(SB)
- MOVQ 56(SP), CX
- MOVQ 64(SP), DX
- MOVQ 72(SP), SI
- MOVQ 80(SP), DI
- MOVQ 88(SP), R9
+inlineEmitCopyLoop0:
+ // for length >= 68 { etc }
+ CMPL AX, $68
+ JLT inlineEmitCopyStep1
- // Finish the "d +=" part of "d += emitCopy(etc)".
- ADDQ 40(SP), DI
+ // Emit a length 64 copy, encoded as 3 bytes.
+ MOVB $0xfe, 0(DI)
+ MOVW R11, 1(DI)
+ ADDQ $3, DI
+ SUBL $64, AX
+ JMP inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+ // if length > 64 { etc }
+ CMPL AX, $64
+ JLE inlineEmitCopyStep2
+
+ // Emit a length 60 copy, encoded as 3 bytes.
+ MOVB $0xee, 0(DI)
+ MOVW R11, 1(DI)
+ ADDQ $3, DI
+ SUBL $60, AX
+
+inlineEmitCopyStep2:
+ // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+ CMPL AX, $12
+ JGE inlineEmitCopyStep3
+ CMPL R11, $2048
+ JGE inlineEmitCopyStep3
+
+ // Emit the remaining copy, encoded as 2 bytes.
+ MOVB R11, 1(DI)
+ SHRL $8, R11
+ SHLB $5, R11
+ SUBB $4, AX
+ SHLB $2, AX
+ ORB AX, R11
+ ORB $1, R11
+ MOVB R11, 0(DI)
+ ADDQ $2, DI
+ JMP inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+ // Emit the remaining copy, encoded as 3 bytes.
+ SUBL $1, AX
+ SHLB $2, AX
+ ORB $2, AX
+ MOVB AX, 0(DI)
+ MOVW R11, 1(DI)
+ ADDQ $3, DI
+
+inlineEmitCopyEnd:
+ // End inline of the emitCopy call.
+ // ----------------------------------------
// nextEmit = s
MOVQ SI, R10