open-vault/vendor/github.com/golang/snappy/encode_amd64.s

731 lines
16 KiB
ArmAsm
Raw Normal View History

2016-04-26 00:18:04 +00:00
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
2016-05-03 00:11:05 +00:00
// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
2016-04-26 00:18:04 +00:00
//
2016-05-03 00:11:05 +00:00
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.
2016-04-26 00:18:04 +00:00
// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
// ----------------------------------------------------------------------------
// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
2016-05-03 00:11:05 +00:00
// - AX len(lit)
2016-04-26 00:18:04 +00:00
// - BX n
2016-05-03 00:11:05 +00:00
// - DX return value
2016-04-26 00:18:04 +00:00
// - DI &dst[i]
2016-05-03 00:11:05 +00:00
// - R10 &lit[0]
2016-04-26 00:18:04 +00:00
//
// The 24 bytes of stack space is to call runtime·memmove.
2016-05-03 00:11:05 +00:00
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
2016-04-26 00:18:04 +00:00
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
MOVQ dst_base+0(FP), DI
2016-05-03 00:11:05 +00:00
MOVQ lit_base+24(FP), R10
MOVQ lit_len+32(FP), AX
MOVQ AX, DX
MOVL AX, BX
2016-04-26 00:18:04 +00:00
SUBL $1, BX
CMPL BX, $60
JLT oneByte
CMPL BX, $256
JLT twoBytes
threeBytes:
MOVB $0xf4, 0(DI)
MOVW BX, 1(DI)
ADDQ $3, DI
2016-05-03 00:11:05 +00:00
ADDQ $3, DX
JMP memmove
2016-04-26 00:18:04 +00:00
twoBytes:
MOVB $0xf0, 0(DI)
MOVB BX, 1(DI)
ADDQ $2, DI
2016-05-03 00:11:05 +00:00
ADDQ $2, DX
JMP memmove
2016-04-26 00:18:04 +00:00
oneByte:
SHLB $2, BX
MOVB BX, 0(DI)
ADDQ $1, DI
2016-05-03 00:11:05 +00:00
ADDQ $1, DX
2016-04-26 00:18:04 +00:00
2016-05-03 00:11:05 +00:00
memmove:
MOVQ DX, ret+48(FP)
2016-04-26 00:18:04 +00:00
// copy(dst[i:], lit)
//
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
2016-05-03 00:11:05 +00:00
// DI, R10 and AX as arguments.
2016-04-26 00:18:04 +00:00
MOVQ DI, 0(SP)
2016-05-03 00:11:05 +00:00
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
2016-04-26 00:18:04 +00:00
CALL runtime·memmove(SB)
RET
// ----------------------------------------------------------------------------
// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
2016-05-03 00:11:05 +00:00
// - AX length
2016-04-26 00:18:04 +00:00
// - SI &dst[0]
// - DI &dst[i]
2016-05-03 00:11:05 +00:00
// - R11 offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
2016-04-26 00:18:04 +00:00
TEXT ·emitCopy(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI
MOVQ DI, SI
2016-05-03 00:11:05 +00:00
MOVQ offset+24(FP), R11
MOVQ length+32(FP), AX
2016-04-26 00:18:04 +00:00
loop0:
// for length >= 68 { etc }
2016-05-03 00:11:05 +00:00
CMPL AX, $68
2016-04-26 00:18:04 +00:00
JLT step1
// Emit a length 64 copy, encoded as 3 bytes.
MOVB $0xfe, 0(DI)
2016-05-03 00:11:05 +00:00
MOVW R11, 1(DI)
2016-04-26 00:18:04 +00:00
ADDQ $3, DI
2016-05-03 00:11:05 +00:00
SUBL $64, AX
2016-04-26 00:18:04 +00:00
JMP loop0
step1:
// if length > 64 { etc }
2016-05-03 00:11:05 +00:00
CMPL AX, $64
2016-04-26 00:18:04 +00:00
JLE step2
// Emit a length 60 copy, encoded as 3 bytes.
MOVB $0xee, 0(DI)
2016-05-03 00:11:05 +00:00
MOVW R11, 1(DI)
2016-04-26 00:18:04 +00:00
ADDQ $3, DI
2016-05-03 00:11:05 +00:00
SUBL $60, AX
2016-04-26 00:18:04 +00:00
step2:
// if length >= 12 || offset >= 2048 { goto step3 }
2016-05-03 00:11:05 +00:00
CMPL AX, $12
2016-04-26 00:18:04 +00:00
JGE step3
2016-05-03 00:11:05 +00:00
CMPL R11, $2048
2016-04-26 00:18:04 +00:00
JGE step3
// Emit the remaining copy, encoded as 2 bytes.
2016-05-03 00:11:05 +00:00
MOVB R11, 1(DI)
SHRL $8, R11
SHLB $5, R11
SUBB $4, AX
SHLB $2, AX
ORB AX, R11
ORB $1, R11
MOVB R11, 0(DI)
2016-04-26 00:18:04 +00:00
ADDQ $2, DI
// Return the number of bytes written.
SUBQ SI, DI
MOVQ DI, ret+40(FP)
RET
step3:
// Emit the remaining copy, encoded as 3 bytes.
2016-05-03 00:11:05 +00:00
SUBL $1, AX
SHLB $2, AX
ORB $2, AX
MOVB AX, 0(DI)
MOVW R11, 1(DI)
2016-04-26 00:18:04 +00:00
ADDQ $3, DI
// Return the number of bytes written.
SUBQ SI, DI
MOVQ DI, ret+40(FP)
RET
// ----------------------------------------------------------------------------
// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
2016-05-03 00:11:05 +00:00
// - DX &src[0]
// - SI &src[j]
// - R13 &src[len(src) - 8]
// - R14 &src[len(src)]
// - R15 &src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
2016-04-26 00:18:04 +00:00
TEXT ·extendMatch(SB), NOSPLIT, $0-48
2016-05-03 00:11:05 +00:00
MOVQ src_base+0(FP), DX
MOVQ src_len+8(FP), R14
MOVQ i+24(FP), R15
MOVQ j+32(FP), SI
ADDQ DX, R14
ADDQ DX, R15
ADDQ DX, SI
MOVQ R14, R13
SUBQ $8, R13
2016-04-26 00:18:04 +00:00
cmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
2016-05-03 00:11:05 +00:00
CMPQ SI, R13
2016-04-26 00:18:04 +00:00
JA cmp1
2016-05-03 00:11:05 +00:00
MOVQ (R15), AX
MOVQ (SI), BX
2016-04-26 00:18:04 +00:00
CMPQ AX, BX
JNE bsf
2016-05-03 00:11:05 +00:00
ADDQ $8, R15
2016-04-26 00:18:04 +00:00
ADDQ $8, SI
JMP cmp8
bsf:
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
// the index of the first byte that differs. The BSF instruction finds the
// least significant 1 bit, the amd64 architecture is little-endian, and
// the shift by 3 converts a bit index to a byte index.
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
2016-05-03 00:11:05 +00:00
ADDQ BX, SI
2016-04-26 00:18:04 +00:00
// Convert from &src[ret] to ret.
2016-05-03 00:11:05 +00:00
SUBQ DX, SI
MOVQ SI, ret+40(FP)
2016-04-26 00:18:04 +00:00
RET
cmp1:
// In src's tail, compare 1 byte at a time.
2016-05-03 00:11:05 +00:00
CMPQ SI, R14
2016-04-26 00:18:04 +00:00
JAE extendMatchEnd
2016-05-03 00:11:05 +00:00
MOVB (R15), AX
MOVB (SI), BX
2016-04-26 00:18:04 +00:00
CMPB AX, BX
JNE extendMatchEnd
2016-05-03 00:11:05 +00:00
ADDQ $1, R15
2016-04-26 00:18:04 +00:00
ADDQ $1, SI
JMP cmp1
extendMatchEnd:
// Convert from &src[ret] to ret.
2016-05-03 00:11:05 +00:00
SUBQ DX, SI
MOVQ SI, ret+40(FP)
2016-04-26 00:18:04 +00:00
RET
// ----------------------------------------------------------------------------
// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
// - AX . .
// - BX . .
// - CX 56 shift (note that amd64 shifts by non-immediates must use CX).
// - DX 64 &src[0], tableSize
// - SI 72 &src[s]
// - DI 80 &dst[d]
// - R9 88 sLimit
// - R10 . &src[nextEmit]
// - R11 96 prevHash, currHash, nextHash, offset
// - R12 104 &src[base], skip
2016-05-03 00:11:05 +00:00
// - R13 . &src[nextS], &src[len(src) - 8]
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
2016-04-26 00:18:04 +00:00
// - R15 112 candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
MOVQ dst_base+0(FP), DI
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R14
// shift, tableSize := uint32(32-8), 1<<8
MOVQ $24, CX
MOVQ $256, DX
calcShift:
// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
// shift--
// }
CMPQ DX, $16384
JGE varTable
CMPQ DX, R14
JGE varTable
SUBQ $1, CX
SHLQ $1, DX
JMP calcShift
varTable:
// var table [maxTableSize]uint16
//
// In the asm code, unlike the Go code, we can zero-initialize only the
// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
// writes 16 bytes, so we can do only tableSize/8 writes instead of the
// 2048 writes that would zero-initialize all of table's 32768 bytes.
SHRQ $3, DX
LEAQ table-32768(SP), BX
PXOR X0, X0
memclr:
MOVOU X0, 0(BX)
ADDQ $16, BX
SUBQ $1, DX
JNZ memclr
// !!! DX = &src[0]
MOVQ SI, DX
// sLimit := len(src) - inputMargin
MOVQ R14, R9
SUBQ $15, R9
// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
// change for the rest of the function.
MOVQ CX, 56(SP)
MOVQ DX, 64(SP)
MOVQ R9, 88(SP)
// nextEmit := 0
MOVQ DX, R10
// s := 1
ADDQ $1, SI
// nextHash := hash(load32(src, s), shift)
MOVL 0(SI), R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
outer:
// for { etc }
// skip := 32
MOVQ $32, R12
// nextS := s
MOVQ SI, R13
// candidate := 0
MOVQ $0, R15
inner0:
// for { etc }
// s := nextS
MOVQ R13, SI
// bytesBetweenHashLookups := skip >> 5
MOVQ R12, R14
SHRQ $5, R14
// nextS = s + bytesBetweenHashLookups
ADDQ R14, R13
// skip += bytesBetweenHashLookups
ADDQ R14, R12
// if nextS > sLimit { goto emitRemainder }
MOVQ R13, AX
SUBQ DX, AX
CMPQ AX, R9
JA emitRemainder
// candidate = int(table[nextHash])
// XXX: MOVWQZX table-32768(SP)(R11*2), R15
// XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
BYTE $0x4e
BYTE $0x0f
BYTE $0xb7
BYTE $0x7c
BYTE $0x5c
BYTE $0x78
// table[nextHash] = uint16(s)
MOVQ SI, AX
SUBQ DX, AX
2016-05-03 00:11:05 +00:00
2016-04-26 00:18:04 +00:00
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
BYTE $0x42
BYTE $0x89
BYTE $0x44
BYTE $0x5c
BYTE $0x78
// nextHash = hash(load32(src, nextS), shift)
MOVL 0(R13), R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// if load32(src, s) != load32(src, candidate) { continue } break
MOVL 0(SI), AX
MOVL (DX)(R15*1), BX
CMPL AX, BX
JNE inner0
fourByteMatch:
// As per the encode_other.go code:
//
// A 4-byte match has been found. We'll later see etc.
// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
// on inputMargin in encode.go.
MOVQ SI, AX
SUBQ R10, AX
CMPQ AX, $16
JLE emitLiteralFastPath
2016-05-03 00:11:05 +00:00
// ----------------------------------------
// Begin inline of the emitLiteral call.
2016-04-26 00:18:04 +00:00
//
2016-05-03 00:11:05 +00:00
// d += emitLiteral(dst[d:], src[nextEmit:s])
MOVL AX, BX
SUBL $1, BX
2016-04-26 00:18:04 +00:00
2016-05-03 00:11:05 +00:00
CMPL BX, $60
JLT inlineEmitLiteralOneByte
CMPL BX, $256
JLT inlineEmitLiteralTwoBytes
inlineEmitLiteralThreeBytes:
MOVB $0xf4, 0(DI)
MOVW BX, 1(DI)
ADDQ $3, DI
JMP inlineEmitLiteralMemmove
inlineEmitLiteralTwoBytes:
MOVB $0xf0, 0(DI)
MOVB BX, 1(DI)
ADDQ $2, DI
JMP inlineEmitLiteralMemmove
inlineEmitLiteralOneByte:
SHLB $2, BX
MOVB BX, 0(DI)
ADDQ $1, DI
inlineEmitLiteralMemmove:
2016-04-26 00:18:04 +00:00
// Spill local variables (registers) onto the stack; call; unspill.
2016-05-03 00:11:05 +00:00
//
// copy(dst[i:], lit)
//
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
// DI, R10 and AX as arguments.
MOVQ DI, 0(SP)
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
2016-04-26 00:18:04 +00:00
MOVQ SI, 72(SP)
MOVQ DI, 80(SP)
MOVQ R15, 112(SP)
2016-05-03 00:11:05 +00:00
CALL runtime·memmove(SB)
2016-04-26 00:18:04 +00:00
MOVQ 56(SP), CX
MOVQ 64(SP), DX
MOVQ 72(SP), SI
MOVQ 80(SP), DI
MOVQ 88(SP), R9
MOVQ 112(SP), R15
JMP inner1
2016-05-03 00:11:05 +00:00
inlineEmitLiteralEnd:
// End inline of the emitLiteral call.
// ----------------------------------------
2016-04-26 00:18:04 +00:00
emitLiteralFastPath:
// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
MOVB AX, BX
SUBB $1, BX
SHLB $2, BX
MOVB BX, (DI)
ADDQ $1, DI
// !!! Implement the copy from lit to dst as a 16-byte load and store.
// (Encode's documentation says that dst and src must not overlap.)
//
// This always copies 16 bytes, instead of only len(lit) bytes, but that's
// OK. Subsequent iterations will fix up the overrun.
//
// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
MOVOU 0(R10), X0
MOVOU X0, 0(DI)
ADDQ AX, DI
inner1:
// for { etc }
// base := s
MOVQ SI, R12
// !!! offset := base - candidate
MOVQ R12, R11
SUBQ R15, R11
SUBQ DX, R11
2016-05-03 00:11:05 +00:00
// ----------------------------------------
// Begin inline of the extendMatch call.
2016-04-26 00:18:04 +00:00
//
2016-05-03 00:11:05 +00:00
// s = extendMatch(src, candidate+4, s+4)
// !!! R14 = &src[len(src)]
2016-04-26 00:18:04 +00:00
MOVQ src_len+32(FP), R14
2016-05-03 00:11:05 +00:00
ADDQ DX, R14
// !!! R13 = &src[len(src) - 8]
MOVQ R14, R13
SUBQ $8, R13
// !!! R15 = &src[candidate + 4]
2016-04-26 00:18:04 +00:00
ADDQ $4, R15
2016-05-03 00:11:05 +00:00
ADDQ DX, R15
// !!! s += 4
2016-04-26 00:18:04 +00:00
ADDQ $4, SI
2016-05-03 00:11:05 +00:00
inlineExtendMatchCmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
CMPQ SI, R13
JA inlineExtendMatchCmp1
MOVQ (R15), AX
MOVQ (SI), BX
CMPQ AX, BX
JNE inlineExtendMatchBSF
ADDQ $8, R15
ADDQ $8, SI
JMP inlineExtendMatchCmp8
2016-04-26 00:18:04 +00:00
2016-05-03 00:11:05 +00:00
inlineExtendMatchBSF:
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
// the index of the first byte that differs. The BSF instruction finds the
// least significant 1 bit, the amd64 architecture is little-endian, and
// the shift by 3 converts a bit index to a byte index.
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
ADDQ BX, SI
JMP inlineExtendMatchEnd
2016-04-26 00:18:04 +00:00
2016-05-03 00:11:05 +00:00
inlineExtendMatchCmp1:
// In src's tail, compare 1 byte at a time.
CMPQ SI, R14
JAE inlineExtendMatchEnd
MOVB (R15), AX
MOVB (SI), BX
CMPB AX, BX
JNE inlineExtendMatchEnd
ADDQ $1, R15
ADDQ $1, SI
JMP inlineExtendMatchCmp1
inlineExtendMatchEnd:
// End inline of the extendMatch call.
// ----------------------------------------
// ----------------------------------------
// Begin inline of the emitCopy call.
2016-04-26 00:18:04 +00:00
//
2016-05-03 00:11:05 +00:00
// d += emitCopy(dst[d:], base-candidate, s-base)
// !!! length := s - base
2016-04-26 00:18:04 +00:00
MOVQ SI, AX
SUBQ R12, AX
2016-05-03 00:11:05 +00:00
inlineEmitCopyLoop0:
// for length >= 68 { etc }
CMPL AX, $68
JLT inlineEmitCopyStep1
// Emit a length 64 copy, encoded as 3 bytes.
MOVB $0xfe, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $64, AX
JMP inlineEmitCopyLoop0
inlineEmitCopyStep1:
// if length > 64 { etc }
CMPL AX, $64
JLE inlineEmitCopyStep2
2016-04-26 00:18:04 +00:00
2016-05-03 00:11:05 +00:00
// Emit a length 60 copy, encoded as 3 bytes.
MOVB $0xee, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $60, AX
inlineEmitCopyStep2:
// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
CMPL AX, $12
JGE inlineEmitCopyStep3
CMPL R11, $2048
JGE inlineEmitCopyStep3
// Emit the remaining copy, encoded as 2 bytes.
MOVB R11, 1(DI)
SHRL $8, R11
SHLB $5, R11
SUBB $4, AX
SHLB $2, AX
ORB AX, R11
ORB $1, R11
MOVB R11, 0(DI)
ADDQ $2, DI
JMP inlineEmitCopyEnd
inlineEmitCopyStep3:
// Emit the remaining copy, encoded as 3 bytes.
SUBL $1, AX
SHLB $2, AX
ORB $2, AX
MOVB AX, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
inlineEmitCopyEnd:
// End inline of the emitCopy call.
// ----------------------------------------
2016-04-26 00:18:04 +00:00
// nextEmit = s
MOVQ SI, R10
// if s >= sLimit { goto emitRemainder }
MOVQ SI, AX
SUBQ DX, AX
CMPQ AX, R9
JAE emitRemainder
// As per the encode_other.go code:
//
// We could immediately etc.
// x := load64(src, s-1)
MOVQ -1(SI), R14
// prevHash := hash(uint32(x>>0), shift)
MOVL R14, R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// table[prevHash] = uint16(s-1)
MOVQ SI, AX
SUBQ DX, AX
SUBQ $1, AX
2016-05-03 00:11:05 +00:00
2016-04-26 00:18:04 +00:00
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
BYTE $0x42
BYTE $0x89
BYTE $0x44
BYTE $0x5c
BYTE $0x78
// currHash := hash(uint32(x>>8), shift)
SHRQ $8, R14
MOVL R14, R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// candidate = int(table[currHash])
// XXX: MOVWQZX table-32768(SP)(R11*2), R15
// XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
BYTE $0x4e
BYTE $0x0f
BYTE $0xb7
BYTE $0x7c
BYTE $0x5c
BYTE $0x78
// table[currHash] = uint16(s)
ADDQ $1, AX
2016-05-03 00:11:05 +00:00
2016-04-26 00:18:04 +00:00
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
BYTE $0x42
BYTE $0x89
BYTE $0x44
BYTE $0x5c
BYTE $0x78
// if uint32(x>>8) == load32(src, candidate) { continue }
MOVL (DX)(R15*1), BX
CMPL R14, BX
JEQ inner1
// nextHash = hash(uint32(x>>16), shift)
SHRQ $8, R14
MOVL R14, R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// s++
ADDQ $1, SI
// break out of the inner1 for loop, i.e. continue the outer loop.
JMP outer
emitRemainder:
// if nextEmit < len(src) { etc }
MOVQ src_len+32(FP), AX
ADDQ DX, AX
CMPQ R10, AX
JEQ encodeBlockEnd
// d += emitLiteral(dst[d:], src[nextEmit:])
//
// Push args.
MOVQ DI, 0(SP)
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ R10, 24(SP)
SUBQ R10, AX
MOVQ AX, 32(SP)
MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
// Spill local variables (registers) onto the stack; call; unspill.
MOVQ DI, 80(SP)
CALL ·emitLiteral(SB)
MOVQ 80(SP), DI
// Finish the "d +=" part of "d += emitLiteral(etc)".
ADDQ 48(SP), DI
encodeBlockEnd:
MOVQ dst_base+0(FP), AX
SUBQ AX, DI
MOVQ DI, d+48(FP)
RET