435c0d9fc8
This PR switches the Nomad repository from govendor to Go modules for managing dependencies. Most aspects of the Nomad workflow remain the same: the usual Makefile targets should continue to work as they always did. The API submodule simply defers to the parent Nomad version in the repository, keeping the API versioning semantics that currently exist.
311 lines
9.5 KiB
Go
311 lines
9.5 KiB
Go
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// This file provides the generic implementation of Sum and MAC. Other files
|
|
// might provide optimized assembly implementations of some of this code.
|
|
|
|
package poly1305
|
|
|
|
import "encoding/binary"
|
|
|
|
// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
|
|
// for a 64-byte message is approximately
|
|
//
|
|
// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
|
|
//
|
|
// for some secret r and s. It can be computed sequentially like
|
|
//
|
|
// for len(msg) > 0:
|
|
// h += read(msg, 16)
|
|
// h *= r
|
|
// h %= 2¹³⁰ - 5
|
|
// return h + s
|
|
//
|
|
// All the complexity is about doing performant constant-time math on numbers
|
|
// larger than any available numeric type.
|
|
|
|
func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
|
|
h := newMACGeneric(key)
|
|
h.Write(msg)
|
|
h.Sum(out)
|
|
}
|
|
|
|
func newMACGeneric(key *[32]byte) macGeneric {
|
|
m := macGeneric{}
|
|
initialize(key, &m.macState)
|
|
return m
|
|
}
|
|
|
|
// macState is the arithmetic state of a MAC computation. Numbers are kept as
// saturated 64-bit limbs in little-endian order, so that [x0, x1, x2] stands
// for x0 + x1 * 2⁶⁴ + x2 * 2¹²⁸.
type macState struct {
	// h is the running accumulator. Its value is meaningful modulo 2¹³⁰ - 5.
	// During and after rounds it may exceed the modulus, but it must always
	// stay below 2 * (2¹³⁰ - 5).
	h [3]uint64
	// r and s are the two components of the private key.
	r, s [2]uint64
}
|
|
|
|
type macGeneric struct {
|
|
macState
|
|
|
|
buffer [TagSize]byte
|
|
offset int
|
|
}
|
|
|
|
// Write splits the incoming message into TagSize chunks, and passes them to
|
|
// update. It buffers incomplete chunks.
|
|
func (h *macGeneric) Write(p []byte) (int, error) {
|
|
nn := len(p)
|
|
if h.offset > 0 {
|
|
n := copy(h.buffer[h.offset:], p)
|
|
if h.offset+n < TagSize {
|
|
h.offset += n
|
|
return nn, nil
|
|
}
|
|
p = p[n:]
|
|
h.offset = 0
|
|
updateGeneric(&h.macState, h.buffer[:])
|
|
}
|
|
if n := len(p) - (len(p) % TagSize); n > 0 {
|
|
updateGeneric(&h.macState, p[:n])
|
|
p = p[n:]
|
|
}
|
|
if len(p) > 0 {
|
|
h.offset += copy(h.buffer[h.offset:], p)
|
|
}
|
|
return nn, nil
|
|
}
|
|
|
|
// Sum flushes the last incomplete chunk from the buffer, if any, and generates
|
|
// the MAC output. It does not modify its state, in order to allow for multiple
|
|
// calls to Sum, even if no Write is allowed after Sum.
|
|
func (h *macGeneric) Sum(out *[TagSize]byte) {
|
|
state := h.macState
|
|
if h.offset > 0 {
|
|
updateGeneric(&state, h.buffer[:h.offset])
|
|
}
|
|
finalize(out, &state.h, &state.s)
|
|
}
|
|
|
|
// rMask0 and rMask1 together form the Poly1305 clamping mask, in
// little-endian limb order. Applying it clears some bits of the secret
// coefficient r, which makes the multiplication cheaper to implement.
const (
	// rMask0 clamps the low 64 bits of r.
	rMask0 = 0x0ffffffc0fffffff
	// rMask1 clamps the high 64 bits of r.
	rMask1 = 0x0ffffffc0ffffffc
)
|
|
|
|
// initialize loads the 256-bit key into the two 128-bit secret values r and s.
|
|
func initialize(key *[32]byte, m *macState) {
|
|
m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
|
|
m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
|
|
m.s[0] = binary.LittleEndian.Uint64(key[16:24])
|
|
m.s[1] = binary.LittleEndian.Uint64(key[24:32])
|
|
}
|
|
|
|
// uint128 is a 128-bit unsigned number split into two 64-bit limbs, the shape
// produced and consumed by the bits.Mul64 and bits.Add64 intrinsics.
type uint128 struct {
	lo uint64 // least significant 64 bits
	hi uint64 // most significant 64 bits
}
|
|
|
|
func mul64(a, b uint64) uint128 {
|
|
hi, lo := bitsMul64(a, b)
|
|
return uint128{lo, hi}
|
|
}
|
|
|
|
func add128(a, b uint128) uint128 {
|
|
lo, c := bitsAdd64(a.lo, b.lo, 0)
|
|
hi, c := bitsAdd64(a.hi, b.hi, c)
|
|
if c != 0 {
|
|
panic("poly1305: unexpected overflow")
|
|
}
|
|
return uint128{lo, hi}
|
|
}
|
|
|
|
func shiftRightBy2(a uint128) uint128 {
|
|
a.lo = a.lo>>2 | (a.hi&3)<<62
|
|
a.hi = a.hi >> 2
|
|
return a
|
|
}
|
|
|
|
// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
|
|
// 128 bits of message, it computes
|
|
//
|
|
// h₊ = (h + m) * r mod 2¹³⁰ - 5
|
|
//
|
|
// If the msg length is not a multiple of TagSize, it assumes the last
|
|
// incomplete chunk is the final one.
|
|
func updateGeneric(state *macState, msg []byte) {
|
|
h0, h1, h2 := state.h[0], state.h[1], state.h[2]
|
|
r0, r1 := state.r[0], state.r[1]
|
|
|
|
for len(msg) > 0 {
|
|
var c uint64
|
|
|
|
// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
|
|
// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
|
|
// reduced at the end of the multiplication below.
|
|
//
|
|
// The spec requires us to set a bit just above the message size, not to
|
|
// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
|
|
// add 1 to the most significant (2¹²⁸) limb, h2.
|
|
if len(msg) >= TagSize {
|
|
h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
|
|
h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
|
|
h2 += c + 1
|
|
|
|
msg = msg[TagSize:]
|
|
} else {
|
|
var buf [TagSize]byte
|
|
copy(buf[:], msg)
|
|
buf[len(msg)] = 1
|
|
|
|
h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
|
|
h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
|
|
h2 += c
|
|
|
|
msg = nil
|
|
}
|
|
|
|
// Multiplication of big number limbs is similar to elementary school
|
|
// columnar multiplication. Instead of digits, there are 64-bit limbs.
|
|
//
|
|
// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
|
|
//
|
|
// h2 h1 h0 x
|
|
// r1 r0 =
|
|
// ----------------
|
|
// h2r0 h1r0 h0r0 <-- individual 128-bit products
|
|
// + h2r1 h1r1 h0r1
|
|
// ------------------------
|
|
// m3 m2 m1 m0 <-- result in 128-bit overlapping limbs
|
|
// ------------------------
|
|
// m3.hi m2.hi m1.hi m0.hi <-- carry propagation
|
|
// + m3.lo m2.lo m1.lo m0.lo
|
|
// -------------------------------
|
|
// t4 t3 t2 t1 t0 <-- final result in 64-bit limbs
|
|
//
|
|
// The main difference from pen-and-paper multiplication is that we do
|
|
// carry propagation in a separate step, as if we wrote two digit sums
|
|
// at first (the 128-bit limbs), and then carried the tens all at once.
|
|
|
|
h0r0 := mul64(h0, r0)
|
|
h1r0 := mul64(h1, r0)
|
|
h2r0 := mul64(h2, r0)
|
|
h0r1 := mul64(h0, r1)
|
|
h1r1 := mul64(h1, r1)
|
|
h2r1 := mul64(h2, r1)
|
|
|
|
// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
|
|
// top 4 bits cleared by rMask{0,1}, we know that their product is not going
|
|
// to overflow 64 bits, so we can ignore the high part of the products.
|
|
//
|
|
// This also means that the product doesn't have a fifth limb (t4).
|
|
if h2r0.hi != 0 {
|
|
panic("poly1305: unexpected overflow")
|
|
}
|
|
if h2r1.hi != 0 {
|
|
panic("poly1305: unexpected overflow")
|
|
}
|
|
|
|
m0 := h0r0
|
|
m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
|
|
m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
|
|
m3 := h2r1
|
|
|
|
t0 := m0.lo
|
|
t1, c := bitsAdd64(m1.lo, m0.hi, 0)
|
|
t2, c := bitsAdd64(m2.lo, m1.hi, c)
|
|
t3, _ := bitsAdd64(m3.lo, m2.hi, c)
|
|
|
|
// Now we have the result as 4 64-bit limbs, and we need to reduce it
|
|
// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
|
|
// a cheap partial reduction according to the reduction identity
|
|
//
|
|
// c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5
|
|
//
|
|
// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
|
|
// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
|
|
// assumptions we make about h in the rest of the code.
|
|
//
|
|
// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
|
|
|
|
// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
|
|
// Note that the carry bits are effectively shifted left by 2, in other
|
|
// words, cc = c * 4 for the c in the reduction identity.
|
|
h0, h1, h2 = t0, t1, t2&maskLow2Bits
|
|
cc := uint128{t2 & maskNotLow2Bits, t3}
|
|
|
|
// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.
|
|
|
|
h0, c = bitsAdd64(h0, cc.lo, 0)
|
|
h1, c = bitsAdd64(h1, cc.hi, c)
|
|
h2 += c
|
|
|
|
cc = shiftRightBy2(cc)
|
|
|
|
h0, c = bitsAdd64(h0, cc.lo, 0)
|
|
h1, c = bitsAdd64(h1, cc.hi, c)
|
|
h2 += c
|
|
|
|
// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
|
|
//
|
|
// 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
|
|
}
|
|
|
|
state.h[0], state.h[1], state.h[2] = h0, h1, h2
|
|
}
|
|
|
|
// Masks used to split a 64-bit limb at its second bit: maskLow2Bits keeps the
// two least significant bits, maskNotLow2Bits keeps everything else.
const (
	maskLow2Bits    uint64 = 0x3
	maskNotLow2Bits        = ^maskLow2Bits
)
|
|
|
|
// select64 returns x if v == 1 and y if v == 0, in constant time.
|
|
func select64(v, x, y uint64) uint64 {
	// v - 1 is all zeroes for v == 1 and, by wrap-around, all ones for
	// v == 0, so exactly one of the two operands survives the masking.
	mask := v - 1
	return ^mask&x | mask&y
}
|
|
|
|
// p0, p1 and p2 are the 64-bit limbs of the prime 2¹³⁰ - 5, in little-endian
// order: p0 + p1 * 2⁶⁴ + p2 * 2¹²⁸.
const (
	p0 = 1<<64 - 5 // 0xFFFFFFFFFFFFFFFB
	p1 = 1<<64 - 1 // 0xFFFFFFFFFFFFFFFF
	p2 = 3         // 0x0000000000000003
)
|
|
|
|
// finalize completes the modular reduction of h and computes
|
|
//
|
|
// out = h + s mod 2¹²⁸
|
|
//
|
|
func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
|
|
h0, h1, h2 := h[0], h[1], h[2]
|
|
|
|
// After the partial reduction in updateGeneric, h might be more than
|
|
// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
|
|
// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
|
|
// result if the subtraction underflows, and t otherwise.
|
|
|
|
hMinusP0, b := bitsSub64(h0, p0, 0)
|
|
hMinusP1, b := bitsSub64(h1, p1, b)
|
|
_, b = bitsSub64(h2, p2, b)
|
|
|
|
// h = h if h < p else h - p
|
|
h0 = select64(b, h0, hMinusP0)
|
|
h1 = select64(b, h1, hMinusP1)
|
|
|
|
// Finally, we compute the last Poly1305 step
|
|
//
|
|
// tag = h + s mod 2¹²⁸
|
|
//
|
|
// by just doing a wide addition with the 128 low bits of h and discarding
|
|
// the overflow.
|
|
h0, c := bitsAdd64(h0, s[0], 0)
|
|
h1, _ = bitsAdd64(h1, s[1], c)
|
|
|
|
binary.LittleEndian.PutUint64(out[0:8], h0)
|
|
binary.LittleEndian.PutUint64(out[8:16], h1)
|
|
}
|