Inline the emitCopy call.

name              old speed      new speed      delta
WordsEncode1e1-8   701MB/s ± 1%   712MB/s ± 1%   +1.64%  (p=0.000 n=10+10)
WordsEncode1e2-8   429MB/s ± 0%   467MB/s ± 0%   +8.86%    (p=0.000 n=9+9)
WordsEncode1e3-8   447MB/s ± 0%   483MB/s ± 0%   +8.20%    (p=0.000 n=9+9)
WordsEncode1e4-8   322MB/s ± 1%   353MB/s ± 1%   +9.76%  (p=0.000 n=10+10)
WordsEncode1e5-8   268MB/s ± 0%   293MB/s ± 0%   +9.42%    (p=0.000 n=9+8)
WordsEncode1e6-8   313MB/s ± 0%   345MB/s ± 0%  +10.06%    (p=0.000 n=8+9)
RandomEncode-8    14.4GB/s ± 1%  14.4GB/s ± 2%     ~      (p=0.829 n=8+10)
_ZFlat0-8          797MB/s ± 2%   863MB/s ± 0%   +8.39%    (p=0.000 n=9+9)
_ZFlat1-8          435MB/s ± 1%   471MB/s ± 0%   +8.34%    (p=0.000 n=9+8)
_ZFlat2-8         16.1GB/s ± 2%  16.2GB/s ± 2%     ~     (p=0.165 n=10+10)
_ZFlat3-8          633MB/s ± 0%   659MB/s ± 1%   +4.12%   (p=0.000 n=10+9)
_ZFlat4-8         7.95GB/s ± 1%  8.29GB/s ± 1%   +4.22%  (p=0.000 n=10+10)
_ZFlat5-8          771MB/s ± 0%   836MB/s ± 1%   +8.33%   (p=0.000 n=10+9)
_ZFlat6-8          283MB/s ± 0%   315MB/s ± 0%  +11.19%   (p=0.000 n=10+9)
_ZFlat7-8          265MB/s ± 0%   293MB/s ± 1%  +10.73%   (p=0.000 n=9+10)
_ZFlat8-8          299MB/s ± 0%   331MB/s ± 1%  +10.74%   (p=0.000 n=9+10)
_ZFlat9-8          246MB/s ± 1%   273MB/s ± 1%  +10.90%  (p=0.000 n=10+10)
_ZFlat10-8        1.05GB/s ± 1%  1.12GB/s ± 1%   +7.02%  (p=0.000 n=10+10)
_ZFlat11-8         411MB/s ± 0%   460MB/s ± 0%  +11.79%   (p=0.000 n=10+8)
diff --git a/encode_amd64.s b/encode_amd64.s
index a233b59..6a21444 100644
--- a/encode_amd64.s
+++ b/encode_amd64.s
@@ -472,15 +472,14 @@
 	MOVQ SI, 32(SP)
 
 	// Spill local variables (registers) onto the stack; call; unspill.
-	//
-	// We don't need to unspill CX or R9 as we are just about to call another
-	// function.
 	MOVQ DI, 80(SP)
 	MOVQ R11, 96(SP)
 	MOVQ R12, 104(SP)
 	CALL ·extendMatch(SB)
+	MOVQ 56(SP), CX
 	MOVQ 64(SP), DX
 	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
 	MOVQ 96(SP), R11
 	MOVQ 104(SP), R12
 
@@ -489,29 +488,69 @@
 	MOVQ 40(SP), SI
 	ADDQ DX, SI
 
-	// d += emitCopy(dst[d:], base-candidate, s-base)
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
 	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R11, 24(SP)
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
 	MOVQ SI, AX
 	SUBQ R12, AX
-	MOVQ AX, 32(SP)
 
-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ SI, 72(SP)
-	MOVQ DI, 80(SP)
-	CALL ·emitCopy(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 72(SP), SI
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  inlineEmitCopyStep1
 
-	// Finish the "d +=" part of "d += emitCopy(etc)".
-	ADDQ 40(SP), DI
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	CMPL AX, $12
+	JGE  inlineEmitCopyStep3
+	CMPL R11, $2048
+	JGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
 
 	// nextEmit = s
 	MOVQ SI, R10