open-vault/helper/random/string_generator.go

package random

import (
	"context"
	"crypto/rand"
	"fmt"
	"io"
	"math"
	"sort"
	"time"
	"unicode"

	"github.com/hashicorp/go-multierror"
)

var (
	LowercaseCharset   = sortCharset("abcdefghijklmnopqrstuvwxyz")
	UppercaseCharset   = sortCharset("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
	NumericCharset     = sortCharset("0123456789")
	FullSymbolCharset  = sortCharset("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
	ShortSymbolCharset = sortCharset("-")

	AlphabeticCharset              = sortCharset(UppercaseCharset + LowercaseCharset)
	AlphaNumericCharset            = sortCharset(AlphabeticCharset + NumericCharset)
	AlphaNumericShortSymbolCharset = sortCharset(AlphaNumericCharset + ShortSymbolCharset)
	AlphaNumericFullSymbolCharset  = sortCharset(AlphaNumericCharset + FullSymbolCharset)

	LowercaseRuneset   = []rune(LowercaseCharset)
	UppercaseRuneset   = []rune(UppercaseCharset)
	NumericRuneset     = []rune(NumericCharset)
	FullSymbolRuneset  = []rune(FullSymbolCharset)
	ShortSymbolRuneset = []rune(ShortSymbolCharset)

	AlphabeticRuneset              = []rune(AlphabeticCharset)
	AlphaNumericRuneset            = []rune(AlphaNumericCharset)
	AlphaNumericShortSymbolRuneset = []rune(AlphaNumericShortSymbolCharset)
	AlphaNumericFullSymbolRuneset  = []rune(AlphaNumericFullSymbolCharset)

	// DefaultStringGenerator has reasonable default rules for generating strings
	DefaultStringGenerator = &StringGenerator{
		Length: 20,
		Rules: []Rule{
			CharsetRule{
				Charset:  LowercaseRuneset,
				MinChars: 1,
			},
			CharsetRule{
				Charset:  UppercaseRuneset,
				MinChars: 1,
			},
			CharsetRule{
				Charset:  NumericRuneset,
				MinChars: 1,
			},
			CharsetRule{
				Charset:  ShortSymbolRuneset,
				MinChars: 1,
			},
		},
	}
)

func sortCharset(chars string) string {
	r := runes(chars)
	sort.Sort(r)
	return string(r)
}

// StringGenerator generats random strings from the provided charset & adhering to a set of rules. The set of rules
// are things like CharsetRule which requires a certain number of characters from a sub-charset.
type StringGenerator struct {
	// Length of the string to generate.
	Length int `mapstructure:"length" json:"length"`

	// Rules the generated strings must adhere to.
	Rules serializableRules `mapstructure:"-" json:"rule"` // This is "rule" in JSON so it matches the HCL property type

	// CharsetRule to choose runes from. This is computed from the rules, not directly configurable
	charset runes
}

// Generate a random string from the charset and adhering to the provided rules.
// The io.Reader is optional. If not provided, it will default to the reader from crypto/rand
func (g *StringGenerator) Generate(ctx context.Context, rng io.Reader) (str string, err error) {
	if _, hasTimeout := ctx.Deadline(); !hasTimeout {
		var cancel func()
		ctx, cancel = context.WithTimeout(ctx, 1*time.Second) // Ensure there's a timeout on the context
		defer cancel()
	}

	// Ensure the generator is configured well since it may be manually created rather than parsed from HCL
	err = g.validateConfig()
	if err != nil {
		return "", err
	}

LOOP:
	for {
		select {
		case <-ctx.Done():
			return "", fmt.Errorf("timed out generating string")
		default:
			str, err = g.generate(rng)
			if err != nil {
				return "", err
			}
			if str == "" {
				continue LOOP
			}
			return str, err
		}
	}
}

func (g *StringGenerator) generate(rng io.Reader) (str string, err error) {
	// If performance improvements need to be made, this can be changed to read a batch of
	// potential strings at once rather than one at a time. This will significantly
	// improve performance, but at the cost of added complexity.
	candidate, err := randomRunes(rng, g.charset, g.Length)
	if err != nil {
		return "", fmt.Errorf("unable to generate random characters: %w", err)
	}

	for _, rule := range g.Rules {
		if !rule.Pass(candidate) {
			return "", nil
		}
	}

	// Passed all rules
	return string(candidate), nil
}

const (
	// maxCharsetLen is the maximum length a charset is allowed to be when generating a candidate string.
	// This is the total number of numbers available for selecting an index out of the charset slice.
	maxCharsetLen = 256
)

// randomRunes creates a random string based on the provided charset. The charset is limited to 255 characters, but
// could be expanded if needed. Expanding the maximum charset size will decrease performance because it will need to
// combine bytes into a larger integer using binary.BigEndian.Uint16() function.
func randomRunes(rng io.Reader, charset []rune, length int) (candidate []rune, err error) {
	if len(charset) == 0 {
		return nil, fmt.Errorf("no charset specified")
	}
	if len(charset) > maxCharsetLen {
		return nil, fmt.Errorf("charset is too long: limited to %d characters", math.MaxUint8)
	}
	if length <= 0 {
		return nil, fmt.Errorf("unable to generate a zero or negative length runeset")
	}

	// This can't always select indexes from [0-maxCharsetLen) because it could introduce bias to the character selection.
	// For instance, if the length of the charset is [a-zA-Z0-9-] (length of 63):
	// RNG ranges: [0-62][63-125][126-188][189-251] will equally select from the entirety of the charset. However,
	// the RNG values [252-255] will select the first 4 characters of the charset while ignoring the remaining 59.
	// This results in a bias towards the front of the charset.
	//
	// To avoid this, we determine the largest integer multiplier of the charset length that is <= maxCharsetLen
	// For instance, if the maxCharsetLen is 256 (the size of one byte) and the charset is length 63, the multiplier
	// equals 4:
	//   256/63 => 4.06
	//   Trunc(4.06) => 4
	// Multiply by the charset length
	// Subtract 1 to account for 0-based counting and you get the max index value: 251
	maxAllowedRNGValue := (maxCharsetLen/len(charset))*len(charset) - 1

	// rngBufferMultiplier increases the size of the RNG buffer to account for lost
	// indexes due to the maxAllowedRNGValue
	rngBufferMultiplier := 1.0

	// Don't set a multiplier if we are able to use the entire range of indexes
	if maxAllowedRNGValue < maxCharsetLen {
		// Anything more complicated than an arbitrary percentage appears to have little practical performance benefit
		rngBufferMultiplier = 1.5
	}

	// Default to the standard crypto reader if one isn't provided
	if rng == nil {
		rng = rand.Reader
	}

	charsetLen := byte(len(charset))

	runes := make([]rune, 0, length)

	for len(runes) < length {
		// Generate a bunch of indexes
		data := make([]byte, int(float64(length)*rngBufferMultiplier))
		numBytes, err := rng.Read(data)
		if err != nil {
			return nil, err
		}

		// Append characters until either we're out of indexes or the length is long enough
		for i := 0; i < numBytes; i++ {
			// Be careful to ensure that maxAllowedRNGValue isn't >= 256 as it will overflow and this
			// comparison will prevent characters from being selected from the charset
			if data[i] > byte(maxAllowedRNGValue) {
				continue
			}

			index := data[i]
			if len(charset) != maxCharsetLen {
				index = index % charsetLen
			}
			r := charset[index]
			runes = append(runes, r)

			if len(runes) == length {
				break
			}
		}
	}

	return runes, nil
}

// validateConfig of the generator to ensure that we can successfully generate a string.
func (g *StringGenerator) validateConfig() (err error) {
	merr := &multierror.Error{}

	// Ensure the sum of minimum lengths in the rules doesn't exceed the length specified
	minLen := getMinLength(g.Rules)
	if g.Length <= 0 {
		merr = multierror.Append(merr, fmt.Errorf("length must be > 0"))
	} else if g.Length < minLen {
		merr = multierror.Append(merr, fmt.Errorf("specified rules require at least %d characters but %d is specified", minLen, g.Length))
	}

	// Ensure we have a charset & all characters are printable
	if len(g.charset) == 0 {
		// Yes this is mutating the generator but this is done so we don't have to compute this on every generation
		g.charset = getChars(g.Rules)
	}
	if len(g.charset) == 0 {
		merr = multierror.Append(merr, fmt.Errorf("no charset specified"))
	} else {
		for _, r := range g.charset {
			if !unicode.IsPrint(r) {
				merr = multierror.Append(merr, fmt.Errorf("non-printable character in charset"))
				break
			}
		}
	}
	return merr.ErrorOrNil()
}

// getMinLength from the rules using the optional interface: `MinLength() int`
func getMinLength(rules []Rule) (minLen int) {
	type minLengthProvider interface {
		MinLength() int
	}

	for _, rule := range rules {
		mlp, ok := rule.(minLengthProvider)
		if !ok {
			continue
		}
		minLen += mlp.MinLength()
	}
	return minLen
}

// getChars from the rules using the optional interface: `Chars() []rune`
func getChars(rules []Rule) (chars []rune) {
	type charsetProvider interface {
		Chars() []rune
	}

	for _, rule := range rules {
		cp, ok := rule.(charsetProvider)
		if !ok {
			continue
		}
		chars = append(chars, cp.Chars()...)
	}
	return deduplicateRunes(chars)
}

// deduplicateRunes returns a new slice of sorted & de-duplicated runes
func deduplicateRunes(original []rune) (deduped []rune) {
	if len(original) == 0 {
		return nil
	}

	m := map[rune]bool{}
	dedupedRunes := []rune(nil)

	for _, r := range original {
		if m[r] {
			continue
		}
		m[r] = true
		dedupedRunes = append(dedupedRunes, r)
	}

	// They don't have to be sorted, but this is being done to make the charset easier to visualize
	sort.Sort(runes(dedupedRunes))
	return dedupedRunes
}