rocksdb/util/crc32c_arm64.cc

//  Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).

#include "util/crc32c_arm64.h"

#if defined(__linux__) && defined(HAVE_ARM64_CRC)

#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif
uint32_t crc32c_runtime_check(void) {
  uint64_t auxv = getauxval(AT_HWCAP);
  return (auxv & HWCAP_CRC32) != 0;
}

uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                             unsigned len) {
  const uint8_t *buf8;
  const uint64_t *buf64 = (uint64_t *)data;
  int length = (int)len;
  crc ^= 0xffffffff;

#ifdef HAVE_ARM64_CRYPTO
  /* Crc32c Parallel computation
   *   Algorithm comes from Intel whitepaper:
   *   crc-iscsi-polynomial-crc32-instruction-paper
   *
   * Input data is divided into three equal-sized blocks
   *   Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
   *   One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
   */
  #define BLK_LENGTH 42
  while (length >= 1024) {
    uint64_t t0, t1;
    uint32_t crc0 = 0, crc1 = 0, crc2 = 0;

    /* Parallel Param:
     *   k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
     *   k1 = CRC32(x ^ (42 * 8 * 8 - 1));
     */
    uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;

    /* First 8 bytei for better pipelining */
    crc0 = crc32c_u64(crc, *buf64++);

    /* 3 blocks crc32c parallel computation
     *
     * 42 * 8 * 3 = 1008 (bytes)
     */
    for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
      crc0 = crc32c_u64(crc0, *buf64);
      crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
      crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
    }
    buf64 += (BLK_LENGTH * 2);

    /* Last 8 bytes */
    crc = crc32c_u64(crc2, *buf64++);

    t0 = (uint64_t)vmull_p64(crc0, k0);
    t1 = (uint64_t)vmull_p64(crc1, k1);

    /* Merge (crc0, crc1, crc2) -> crc */
    crc1 = crc32c_u64(0, t1);
    crc ^= crc1;
    crc0 = crc32c_u64(0, t0);
    crc ^= crc0;

    length -= 1024;
  }
#endif
  buf8 = (const uint8_t *)buf64;
  while (length >= 8) {
    crc = crc32c_u64(crc, *(const uint64_t*)buf8);
    buf8 += 8;
    length -= 8;
  }

  /* The following is more efficient than the straight loop */
  if (length >= 4) {
    crc = crc32c_u32(crc, *(const uint32_t*)buf8);
    buf8 += 4;
    length -= 4;
  }

  if (length >= 2) {
    crc = crc32c_u16(crc, *(const uint16_t*)buf8);
    buf8 += 2;
    length -= 2;
  }

  if (length >= 1)
    crc = crc32c_u8(crc, *buf8);

  crc ^= 0xffffffff;
  return crc;
}

#endif
RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221) Summary: 1. Add Arm linear crc32c implemtation for RocksDB. 2. Arm runtime check for crc32 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221 Differential Revision: D15013685 Pulled By: siying fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12 2019-04-30 17:56:06 +00:00			`// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.`
			`// This source code is licensed under both the GPLv2 (found in the`
			`// COPYING file in the root directory) and Apache 2.0 License`
			`// (found in the LICENSE.Apache file in the root directory).`

			`#include "util/crc32c_arm64.h"`

			`#if defined(__linux__) && defined(HAVE_ARM64_CRC)`

			`#include <asm/hwcap.h>`
			`#include <sys/auxv.h>`
			`#ifndef HWCAP_CRC32`
			`#define HWCAP_CRC32 (1 << 7)`
			`#endif`
			`uint32_t crc32c_runtime_check(void) {`
			`uint64_t auxv = getauxval(AT_HWCAP);`
			`return (auxv & HWCAP_CRC32) != 0;`
			`}`

			`uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,`
			`unsigned len) {`
Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494 Differential Revision: D16340806 fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945 2019-07-17 18:19:06 +00:00			`const uint8_t *buf8;`
			`const uint64_t buf64 = (uint64_t )data;`
			`int length = (int)len;`
			`crc ^= 0xffffffff;`
RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221) Summary: 1. Add Arm linear crc32c implemtation for RocksDB. 2. Arm runtime check for crc32 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221 Differential Revision: D15013685 Pulled By: siying fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12 2019-04-30 17:56:06 +00:00
Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494 Differential Revision: D16340806 fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945 2019-07-17 18:19:06 +00:00			`#ifdef HAVE_ARM64_CRYPTO`
			`/* Crc32c Parallel computation`
			`* Algorithm comes from Intel whitepaper:`
			`* crc-iscsi-polynomial-crc32-instruction-paper`
			`*`
			`* Input data is divided into three equal-sized blocks`
			`* Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes`
			`* One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes`
			`*/`
			`#define BLK_LENGTH 42`
			`while (length >= 1024) {`
			`uint64_t t0, t1;`
			`uint32_t crc0 = 0, crc1 = 0, crc2 = 0;`
RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221) Summary: 1. Add Arm linear crc32c implemtation for RocksDB. 2. Arm runtime check for crc32 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221 Differential Revision: D15013685 Pulled By: siying fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12 2019-04-30 17:56:06 +00:00
Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494 Differential Revision: D16340806 fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945 2019-07-17 18:19:06 +00:00			`/* Parallel Param:`
			`* k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));`
			`* k1 = CRC32(x ^ (42 * 8 * 8 - 1));`
			`*/`
			`uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;`

			`/* First 8 bytei for better pipelining */`
			`crc0 = crc32c_u64(crc, *buf64++);`

			`/* 3 blocks crc32c parallel computation`
			`*`
			`* 42 * 8 * 3 = 1008 (bytes)`
			`*/`
			`for (int i = 0; i < BLK_LENGTH; i++, buf64++) {`
			`crc0 = crc32c_u64(crc0, *buf64);`
			`crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));`
			`crc2 = crc32c_u64(crc2, (buf64 + (BLK_LENGTH 2)));`
			`}`
			`buf64 += (BLK_LENGTH * 2);`

			`/* Last 8 bytes */`
			`crc = crc32c_u64(crc2, *buf64++);`

			`t0 = (uint64_t)vmull_p64(crc0, k0);`
			`t1 = (uint64_t)vmull_p64(crc1, k1);`

			`/* Merge (crc0, crc1, crc2) -> crc */`
			`crc1 = crc32c_u64(0, t1);`
			`crc ^= crc1;`
			`crc0 = crc32c_u64(0, t0);`
			`crc ^= crc0;`

			`length -= 1024;`
			`}`
			`#endif`
			`buf8 = (const uint8_t *)buf64;`
			`while (length >= 8) {`
			`crc = crc32c_u64(crc, (const uint64_t)buf8);`
			`buf8 += 8;`
			`length -= 8;`
RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221) Summary: 1. Add Arm linear crc32c implemtation for RocksDB. 2. Arm runtime check for crc32 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221 Differential Revision: D15013685 Pulled By: siying fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12 2019-04-30 17:56:06 +00:00			`}`

			`/* The following is more efficient than the straight loop */`
Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494 Differential Revision: D16340806 fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945 2019-07-17 18:19:06 +00:00			`if (length >= 4) {`
			`crc = crc32c_u32(crc, (const uint32_t)buf8);`
			`buf8 += 4;`
RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221) Summary: 1. Add Arm linear crc32c implemtation for RocksDB. 2. Arm runtime check for crc32 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221 Differential Revision: D15013685 Pulled By: siying fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12 2019-04-30 17:56:06 +00:00			`length -= 4;`
			`}`

Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494 Differential Revision: D16340806 fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945 2019-07-17 18:19:06 +00:00			`if (length >= 2) {`
			`crc = crc32c_u16(crc, (const uint16_t)buf8);`
			`buf8 += 2;`
RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221) Summary: 1. Add Arm linear crc32c implemtation for RocksDB. 2. Arm runtime check for crc32 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221 Differential Revision: D15013685 Pulled By: siying fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12 2019-04-30 17:56:06 +00:00			`length -= 2;`
			`}`

Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494 Differential Revision: D16340806 fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945 2019-07-17 18:19:06 +00:00			`if (length >= 1)`
			`crc = crc32c_u8(crc, *buf8);`
RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221) Summary: 1. Add Arm linear crc32c implemtation for RocksDB. 2. Arm runtime check for crc32 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221 Differential Revision: D15013685 Pulled By: siying fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12 2019-04-30 17:56:06 +00:00
			`crc ^= 0xffffffff;`
			`return crc;`
			`}`

			`#endif`