435c0d9fc8
This PR switches the Nomad repository from govendor to Go modules for dependency management. Most aspects of the Nomad workflow remain the same, and the usual Makefile targets should continue to work as they always did. The API submodule simply defers to the parent Nomad version in the repository, keeping the API versioning semantics that currently exist.
450 lines
9.1 KiB
ArmAsm
450 lines
9.1 KiB
ArmAsm
// Copyright 2019 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Based on CRYPTOGAMS code with the following comment:
|
|
// # ====================================================================
|
|
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
// # project. The module is, however, dual licensed under OpenSSL and
|
|
// # CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
// # details see http://www.openssl.org/~appro/cryptogams/.
|
|
// # ====================================================================
|
|
|
|
// Code for the perl script that generates the ppc64 assembler
|
|
// can be found in the cryptogams repository at the link below. It is based on
|
|
// the original from openssl.
|
|
|
|
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
|
|
|
|
// The differences in this and the original implementation are
|
|
// due to the calling conventions and initialization of constants.
|
|
|
|
// +build !gccgo,!purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// Symbolic names for the general purpose registers used by
// chaCha20_ctr32_vsx.

// Destination pointer (the out argument).
#define OUT       R3
// Source pointer (the inp argument).
#define INP       R4
// Number of bytes still to be processed (the len argument).
#define LEN       R5
// Key pointer (the key argument); the tail loop reuses it as byte scratch.
#define KEY       R6
// Pointer to the counter word (the counter argument).
#define CNT       R7
// Byte scratch register used by the tail loop.
#define TMP       R15

// Address inside the consts<> table.
#define CONSTBASE R16
// len >> 6, i.e. the number of whole 64 byte blocks in the request.
#define BLOCKS    R17
|
|
|
|
// consts<> is the 0xa0 byte read-only constant table.
//
// 0x00: loaded into V16 at function entry. Stored little-endian, the sixteen
//       bytes read "expand 32-byte k".
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32

// 0x10 - 0x4f: not referenced by chaCha20_ctr32_vsx in this file.
// NOTE(review): presumably kept so the table layout matches the CRYPTOGAMS
// original - confirm before removing.
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704

// 0x50 - 0x8f: four vectors, each holding one 32-bit word of the 0x00 entry
// repeated in all four lanes. Loaded into V0-V3 at the top of every outer
// loop iteration.
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574

// 0x90: the words 0, 1, 2, 3 (in address order). Added to the splatted
// counter so that each vector lane works on a different block.
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002

GLOBL consts<>(SB), RODATA, $0xa0
|
|
|
|
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
// XORs len bytes read from inp with ChaCha20 keystream and writes the result
// to out. The keystream is produced four 64 byte blocks at a time, one block
// per 32-bit vector lane. On return the word at counter has been advanced by
// len >> 6.
//
// Frame: 64 bytes of locals (scratch for the partial-block tail), 40 bytes of
// arguments.
//
// State established by this setup section:
//   R8, R9, R10   = 16, 32, 48 (index offsets for the vector loads/stores)
//   R11           = 64 (used once below to reach consts<>+0x90)
//   CONSTBASE     = consts<> + 0x50
//   BLOCKS        = len >> 6
//   V16           = the 16 bytes at consts<>+0x00
//   V17, V18      = the 32 bytes at key
//   V19           = the 16 bytes at counter with word 0 replaced by zero
//   V26           = word 0 at counter in every lane, plus {0, 1, 2, 3}
//   CTR           = 10 (iterations of loop_vsx)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS

	// Nothing to do for an empty request. Without this check a zero length
	// reaches tail_vsx, which loads CTR with 0; the decrement-and-branch
	// there then wraps around instead of terminating.
	// BLOCKS is 0 here, so done_vsx leaves the counter unchanged.
	CMP LEN, $0
	BEQ done_vsx

	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80,CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28 = consts<>+0x90 (CONSTBASE was advanced by 0x50 above, R11 is 64)
	LXVW4X (CONSTBASE)(R11), VS60

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	// Shift word 0 out of V19 and shift a zero word (from V27) back in,
	// so V19 keeps words 1-3 in place with a zero in word 0.
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	// Give each lane its own counter value.
	VADDUWM V26, V28, V26

	// loop_vsx runs 10 times per outer iteration.
	MOVD $10, R14
	MOVD R14, CTR
|
|
|
|
loop_outer_vsx:
	// Build the working state for the next four blocks. Every register
	// V0-V15 holds one state word, replicated (or, for V12, varied) across
	// the four lanes.

	// V0, V1, V2, V3: the four pre-splatted constant words at CONSTBASE
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// V4-V7: the four words of V17, one word per register
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7

	// V8-V11: the four words of V18, one word per register
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// V12: copy of the per-lane counters kept in V26
	VOR V26, V26, V12

	// V13-V15: words 1-3 of V19, one word per register
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Rotate amounts for VRLW in loop_vsx. VRLW only uses the low five
	// bits of each word, so the -16 in V27 rotates by 16; V28, V29 and V30
	// rotate by 12, 8 and 7.
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
|
|
|
|
// QROUND4 performs four quarter-rounds at once, the i-th one on the registers
// (a_i, b_i, c_i, d_i). The work is interleaved across the four quarter-rounds
// step by step:
//
//   a += b; d ^= a; d <<<= V27
//   c += d; b ^= c; b <<<= V28
//   a += b; d ^= a; d <<<= V29
//   c += d; b ^= c; b <<<= V30
//
// V27-V30 must hold the rotate amounts set up before the loop is entered.
#define QROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
	VADDUWM a0, b0, a0; \
	VADDUWM a1, b1, a1; \
	VADDUWM a2, b2, a2; \
	VADDUWM a3, b3, a3; \
	VXOR d0, a0, d0; \
	VXOR d1, a1, d1; \
	VXOR d2, a2, d2; \
	VXOR d3, a3, d3; \
	VRLW d0, V27, d0; \
	VRLW d1, V27, d1; \
	VRLW d2, V27, d2; \
	VRLW d3, V27, d3; \
	VADDUWM c0, d0, c0; \
	VADDUWM c1, d1, c1; \
	VADDUWM c2, d2, c2; \
	VADDUWM c3, d3, c3; \
	VXOR b0, c0, b0; \
	VXOR b1, c1, b1; \
	VXOR b2, c2, b2; \
	VXOR b3, c3, b3; \
	VRLW b0, V28, b0; \
	VRLW b1, V28, b1; \
	VRLW b2, V28, b2; \
	VRLW b3, V28, b3; \
	VADDUWM a0, b0, a0; \
	VADDUWM a1, b1, a1; \
	VADDUWM a2, b2, a2; \
	VADDUWM a3, b3, a3; \
	VXOR d0, a0, d0; \
	VXOR d1, a1, d1; \
	VXOR d2, a2, d2; \
	VXOR d3, a3, d3; \
	VRLW d0, V29, d0; \
	VRLW d1, V29, d1; \
	VRLW d2, V29, d2; \
	VRLW d3, V29, d3; \
	VADDUWM c0, d0, c0; \
	VADDUWM c1, d1, c1; \
	VADDUWM c2, d2, c2; \
	VADDUWM c3, d3, c3; \
	VXOR b0, c0, b0; \
	VXOR b1, c1, b1; \
	VXOR b2, c2, b2; \
	VXOR b3, c3, b3; \
	VRLW b0, V30, b0; \
	VRLW b1, V30, b1; \
	VRLW b2, V30, b2; \
	VRLW b3, V30, b3

loop_vsx:
	// Column round: (V0,V4,V8,V12) (V1,V5,V9,V13) (V2,V6,V10,V14) (V3,V7,V11,V15)
	QROUND4(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15)

	// Diagonal round: (V0,V5,V10,V15) (V1,V6,V11,V12) (V2,V7,V8,V13) (V3,V4,V9,V14)
	QROUND4(V0, V1, V2, V3, V5, V6, V7, V4, V10, V11, V8, V9, V15, V12, V13, V14)

	// Decrement CTR and repeat while it is non-zero.
	BC 16, LT, loop_vsx
|
|
|
|
VADDUWM V12, V26, V12
|
|
|
|
WORD $0x13600F8C // VMRGEW V0, V1, V27
|
|
WORD $0x13821F8C // VMRGEW V2, V3, V28
|
|
|
|
WORD $0x10000E8C // VMRGOW V0, V1, V0
|
|
WORD $0x10421E8C // VMRGOW V2, V3, V2
|
|
|
|
WORD $0x13A42F8C // VMRGEW V4, V5, V29
|
|
WORD $0x13C63F8C // VMRGEW V6, V7, V30
|
|
|
|
XXPERMDI VS32, VS34, $0, VS33
|
|
XXPERMDI VS32, VS34, $3, VS35
|
|
XXPERMDI VS59, VS60, $0, VS32
|
|
XXPERMDI VS59, VS60, $3, VS34
|
|
|
|
WORD $0x10842E8C // VMRGOW V4, V5, V4
|
|
WORD $0x10C63E8C // VMRGOW V6, V7, V6
|
|
|
|
WORD $0x13684F8C // VMRGEW V8, V9, V27
|
|
WORD $0x138A5F8C // VMRGEW V10, V11, V28
|
|
|
|
XXPERMDI VS36, VS38, $0, VS37
|
|
XXPERMDI VS36, VS38, $3, VS39
|
|
XXPERMDI VS61, VS62, $0, VS36
|
|
XXPERMDI VS61, VS62, $3, VS38
|
|
|
|
WORD $0x11084E8C // VMRGOW V8, V9, V8
|
|
WORD $0x114A5E8C // VMRGOW V10, V11, V10
|
|
|
|
WORD $0x13AC6F8C // VMRGEW V12, V13, V29
|
|
WORD $0x13CE7F8C // VMRGEW V14, V15, V30
|
|
|
|
XXPERMDI VS40, VS42, $0, VS41
|
|
XXPERMDI VS40, VS42, $3, VS43
|
|
XXPERMDI VS59, VS60, $0, VS40
|
|
XXPERMDI VS59, VS60, $3, VS42
|
|
|
|
WORD $0x118C6E8C // VMRGOW V12, V13, V12
|
|
WORD $0x11CE7E8C // VMRGOW V14, V15, V14
|
|
|
|
VSPLTISW $4, V27
|
|
VADDUWM V26, V27, V26
|
|
|
|
XXPERMDI VS44, VS46, $0, VS45
|
|
XXPERMDI VS44, VS46, $3, VS47
|
|
XXPERMDI VS61, VS62, $0, VS44
|
|
XXPERMDI VS61, VS62, $3, VS46
|
|
|
|
VADDUWM V0, V16, V0
|
|
VADDUWM V4, V17, V4
|
|
VADDUWM V8, V18, V8
|
|
VADDUWM V12, V19, V12
|
|
|
|
CMPU LEN, $64
|
|
BLT tail_vsx
|
|
|
|
// Bottom of loop
|
|
LXVW4X (INP)(R0), VS59
|
|
LXVW4X (INP)(R8), VS60
|
|
LXVW4X (INP)(R9), VS61
|
|
LXVW4X (INP)(R10), VS62
|
|
|
|
VXOR V27, V0, V27
|
|
VXOR V28, V4, V28
|
|
VXOR V29, V8, V29
|
|
VXOR V30, V12, V30
|
|
|
|
STXVW4X VS59, (OUT)(R0)
|
|
STXVW4X VS60, (OUT)(R8)
|
|
ADD $64, INP
|
|
STXVW4X VS61, (OUT)(R9)
|
|
ADD $-64, LEN
|
|
STXVW4X VS62, (OUT)(R10)
|
|
ADD $64, OUT
|
|
BEQ done_vsx
|
|
|
|
VADDUWM V1, V16, V0
|
|
VADDUWM V5, V17, V4
|
|
VADDUWM V9, V18, V8
|
|
VADDUWM V13, V19, V12
|
|
|
|
CMPU LEN, $64
|
|
BLT tail_vsx
|
|
|
|
LXVW4X (INP)(R0), VS59
|
|
LXVW4X (INP)(R8), VS60
|
|
LXVW4X (INP)(R9), VS61
|
|
LXVW4X (INP)(R10), VS62
|
|
VXOR V27, V0, V27
|
|
|
|
VXOR V28, V4, V28
|
|
VXOR V29, V8, V29
|
|
VXOR V30, V12, V30
|
|
|
|
STXVW4X VS59, (OUT)(R0)
|
|
STXVW4X VS60, (OUT)(R8)
|
|
ADD $64, INP
|
|
STXVW4X VS61, (OUT)(R9)
|
|
ADD $-64, LEN
|
|
STXVW4X VS62, (OUT)(V10)
|
|
ADD $64, OUT
|
|
BEQ done_vsx
|
|
|
|
VADDUWM V2, V16, V0
|
|
VADDUWM V6, V17, V4
|
|
VADDUWM V10, V18, V8
|
|
VADDUWM V14, V19, V12
|
|
|
|
CMPU LEN, $64
|
|
BLT tail_vsx
|
|
|
|
LXVW4X (INP)(R0), VS59
|
|
LXVW4X (INP)(R8), VS60
|
|
LXVW4X (INP)(R9), VS61
|
|
LXVW4X (INP)(R10), VS62
|
|
|
|
VXOR V27, V0, V27
|
|
VXOR V28, V4, V28
|
|
VXOR V29, V8, V29
|
|
VXOR V30, V12, V30
|
|
|
|
STXVW4X VS59, (OUT)(R0)
|
|
STXVW4X VS60, (OUT)(R8)
|
|
ADD $64, INP
|
|
STXVW4X VS61, (OUT)(R9)
|
|
ADD $-64, LEN
|
|
STXVW4X VS62, (OUT)(R10)
|
|
ADD $64, OUT
|
|
BEQ done_vsx
|
|
|
|
VADDUWM V3, V16, V0
|
|
VADDUWM V7, V17, V4
|
|
VADDUWM V11, V18, V8
|
|
VADDUWM V15, V19, V12
|
|
|
|
CMPU LEN, $64
|
|
BLT tail_vsx
|
|
|
|
LXVW4X (INP)(R0), VS59
|
|
LXVW4X (INP)(R8), VS60
|
|
LXVW4X (INP)(R9), VS61
|
|
LXVW4X (INP)(R10), VS62
|
|
|
|
VXOR V27, V0, V27
|
|
VXOR V28, V4, V28
|
|
VXOR V29, V8, V29
|
|
VXOR V30, V12, V30
|
|
|
|
STXVW4X VS59, (OUT)(R0)
|
|
STXVW4X VS60, (OUT)(R8)
|
|
ADD $64, INP
|
|
STXVW4X VS61, (OUT)(R9)
|
|
ADD $-64, LEN
|
|
STXVW4X VS62, (OUT)(R10)
|
|
ADD $64, OUT
|
|
|
|
MOVD $10, R14
|
|
MOVD R14, CTR
|
|
BNE loop_outer_vsx
|
|
|
|
done_vsx:
	// Increment counter by number of 64 byte blocks.
	//
	// The counter argument is a *uint32, so it is read and written as a
	// 32-bit word. A 64-bit access would also rewrite the word that follows
	// the counter in memory and let a carry out of the counter spill into it.
	//
	// NOTE(review): BLOCKS is len >> 6, so a trailing partial block is not
	// counted here - confirm the Go caller accounts for it.
	MOVWZ (CNT), R14
	ADD BLOCKS, R14
	MOVWZ R14, (CNT)
	RET
|
|
|
|
tail_vsx:
	// Final partial block: LEN is below 64 here and V0, V4, V8, V12 hold
	// the 64 keystream bytes for it. Spill them to the stack frame and XOR
	// byte by byte.
	ADD $32, R1, R11		// R11 = scratch area at R1+32
	MOVD LEN, CTR			// one loop iteration per remaining byte

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)

	// The loop uses update-form accesses at offset 1, so start every
	// pointer one byte before its first element.
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copying the result to OUT
	// in bytes.
	// KEY (no longer needed as the key pointer) and TMP are byte scratch.
	MOVBZU 1(R12), KEY		// next keystream byte
	MOVBZU 1(INP), TMP		// next input byte
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BC 16, LT, looptail_vsx		// decrement CTR, loop while non-zero

	// Clear the stack values
	// (the spilled keystream is overwritten with the contents of V16)
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx
|