Update BoringSSL
This commit is contained in: parent d728a4dc8d, commit d2a42aa235

@@ -262,6 +262,8 @@ int BIO_should_io_special(const BIO *bio) {
int BIO_get_retry_reason(const BIO *bio) { return bio->retry_reason; }

void BIO_set_retry_reason(BIO *bio, int reason) { bio->retry_reason = reason; }

void BIO_clear_flags(BIO *bio, int flags) {
  bio->flags &= ~flags;
}
@@ -126,13 +126,7 @@ BIO *BIO_new_fp(FILE *stream, int close_flag) {
  return ret;
}

static int file_new(BIO *bio) { return 1; }

static int file_free(BIO *bio) {
  if (bio == NULL) {
    return 0;
  }

  if (!bio->shutdown) {
    return 1;
  }
@@ -279,7 +273,7 @@ static const BIO_METHOD methods_filep = {
    BIO_TYPE_FILE, "FILE pointer",
    file_write, file_read,
    NULL /* puts */, file_gets,
    file_ctrl, file_new,
    file_ctrl, NULL /* create */,
    file_free, NULL /* callback_ctrl */,
};
@@ -314,4 +308,10 @@ int BIO_rw_filename(BIO *bio, const char *filename) {
                  BIO_CLOSE | BIO_FP_READ | BIO_FP_WRITE, (char *)filename);
}

long BIO_tell(BIO *bio) { return BIO_ctrl(bio, BIO_C_FILE_TELL, 0, NULL); }

long BIO_seek(BIO *bio, long offset) {
  return BIO_ctrl(bio, BIO_C_FILE_SEEK, offset, NULL);
}

#endif  // OPENSSL_TRUSTY
@@ -29,8 +29,10 @@ static const unsigned kMaxDepth = 2048;

// is_string_type returns one if |tag| is a string type and zero otherwise. It
// ignores the constructed bit.
static int is_string_type(unsigned tag) {
  // While BER supports constructed BIT STRINGS, OpenSSL misparses them. To
  // avoid acting on an ambiguous input, we do not support constructed BIT
  // STRINGS. See https://github.com/openssl/openssl/issues/12810.
  switch (tag & ~CBS_ASN1_CONSTRUCTED) {
    case CBS_ASN1_BITSTRING:
    case CBS_ASN1_OCTETSTRING:
    case CBS_ASN1_UTF8STRING:
    case CBS_ASN1_NUMERICSTRING:
@@ -53,7 +55,7 @@ static int is_string_type(unsigned tag) {
// depending on whether an indefinite length element or constructed string was
// found. The value of |orig_in| is not changed. It returns one on success (i.e.
// |*ber_found| was set) and zero on error.
static int cbs_find_ber(const CBS *orig_in, char *ber_found, unsigned depth) {
static int cbs_find_ber(const CBS *orig_in, int *ber_found, unsigned depth) {
  CBS in;

  if (depth > kMaxDepth) {
@@ -68,14 +70,11 @@ static int cbs_find_ber(const CBS *orig_in, char *ber_found, unsigned depth) {
    unsigned tag;
    size_t header_len;

    if (!CBS_get_any_ber_asn1_element(&in, &contents, &tag, &header_len)) {
    if (!CBS_get_any_ber_asn1_element(&in, &contents, &tag, &header_len,
                                      ber_found)) {
      return 0;
    }
    if (CBS_len(&contents) == header_len &&
        header_len > 0 &&
        CBS_data(&contents)[header_len-1] == 0x80) {
      // Found an indefinite-length element.
      *ber_found = 1;
    if (*ber_found) {
      return 1;
    }
    if (tag & CBS_ASN1_CONSTRUCTED) {
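The indefinite-length detection above is what ultimately drives |CBS_asn1_ber_to_der|. A minimal caller-side sketch, assuming BoringSSL's public openssl/bytestring.h API; the byte string is an illustrative BER SEQUENCE with an indefinite length, terminated by the 00 00 end-of-contents octets:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#include <openssl/bytestring.h>
#include <openssl/mem.h>

int main(void) {
  // 0x30 0x80: SEQUENCE with indefinite length (BER only), containing
  // INTEGER 1, terminated by the 0x00 0x00 end-of-contents octets.
  static const uint8_t kBer[] = {0x30, 0x80, 0x02, 0x01, 0x01, 0x00, 0x00};

  CBS in, out;
  uint8_t *storage = NULL;
  CBS_init(&in, kBer, sizeof(kBer));
  if (!CBS_asn1_ber_to_der(&in, &out, &storage)) {
    return 1;
  }
  // |storage| is non-NULL iff a conversion was needed; here the DER
  // equivalent is 0x30 0x03 0x02 0x01 0x01.
  assert(storage != NULL);
  printf("DER length: %zu\n", CBS_len(&out));
  OPENSSL_free(storage);
  return 0;
}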
@@ -120,9 +119,11 @@ static int cbs_convert_ber(CBS *in, CBB *out, unsigned string_tag,
  CBS contents;
  unsigned tag, child_string_tag = string_tag;
  size_t header_len;
  int ber_found;
  CBB *out_contents, out_contents_storage;

  if (!CBS_get_any_ber_asn1_element(in, &contents, &tag, &header_len)) {
  if (!CBS_get_any_ber_asn1_element(in, &contents, &tag, &header_len,
                                    &ber_found)) {
    return 0;
  }
@@ -194,7 +195,7 @@ int CBS_asn1_ber_to_der(CBS *in, CBS *out, uint8_t **out_storage) {

  // First, do a quick walk to find any indefinite-length elements. Most of the
  // time we hope that there aren't any and thus we can quickly return.
  char conversion_needed;
  int conversion_needed;
  if (!cbs_find_ber(in, &conversion_needed, 0)) {
    return 0;
  }
@@ -404,6 +404,15 @@ int CBB_add_bytes(CBB *cbb, const uint8_t *data, size_t len) {
  return 1;
}

int CBB_add_zeros(CBB *cbb, size_t len) {
  uint8_t *out;
  if (!CBB_add_space(cbb, &out, len)) {
    return 0;
  }
  OPENSSL_memset(out, 0, len);
  return 1;
}

int CBB_add_space(CBB *cbb, uint8_t **out_data, size_t len) {
  if (!CBB_flush(cbb) ||
      !cbb_buffer_add(cbb->base, out_data, len)) {
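|CBB_add_zeros| is a small convenience over |CBB_add_space|. A usage sketch, assuming the public openssl/bytestring.h API; the helper name is illustrative, and padding a record out to a fixed size is the typical use:

#include <openssl/bytestring.h>

// Appends |data| and then zero-pads the result to |total| bytes. Returns one
// on success, zero on allocation failure or if |data| is already too long.
static int pad_to_length(CBB *cbb, const uint8_t *data, size_t len,
                         size_t total) {
  if (len > total) {
    return 0;
  }
  return CBB_add_bytes(cbb, data, len) &&
         CBB_add_zeros(cbb, total - len);
}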
@@ -216,6 +216,14 @@ int CBS_get_u24_length_prefixed(CBS *cbs, CBS *out) {
  return cbs_get_length_prefixed(cbs, out, 3);
}

int CBS_get_until_first(CBS *cbs, CBS *out, uint8_t c) {
  const uint8_t *split = OPENSSL_memchr(CBS_data(cbs), c, CBS_len(cbs));
  if (split == NULL) {
    return 0;
  }
  return CBS_get_bytes(cbs, out, split - CBS_data(cbs));
}

// parse_base128_integer reads a big-endian base-128 integer from |cbs| and sets
// |*out| to the result. This is the encoding used in DER for both high tag
// number form and OID components.
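Base-128 works the same way in both contexts the comment names: each byte carries seven value bits and the high bit marks a continuation byte. A hedged sketch of the decoder the comment describes (names are illustrative; the real parser in this file additionally rejects non-minimal encodings and overflow):

#include <stddef.h>
#include <stdint.h>

// Decodes a big-endian base-128 integer from |in|. Returns the number of
// bytes consumed, or 0 on truncation. Overflow checks elided for brevity.
static size_t base128_decode(const uint8_t *in, size_t len, uint64_t *out) {
  uint64_t v = 0;
  for (size_t i = 0; i < len; i++) {
    v = (v << 7) | (in[i] & 0x7f);
    if ((in[i] & 0x80) == 0) {
      *out = v;  // e.g. {0x82, 0x2c} decodes to 300
      return i + 1;
    }
  }
  return 0;  // Ran out of input before the final (high-bit-clear) byte.
}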
@@ -254,8 +262,7 @@ static int parse_asn1_tag(CBS *cbs, unsigned *out) {
  //
  // If the number portion is 31 (0x1f, the largest value that fits in the
  // allotted bits), then the tag is more than one byte long and the
  // continuation bytes contain the tag number. This parser only supports tag
  // numbers less than 31 (and thus single-byte tags).
  // continuation bytes contain the tag number.
  unsigned tag = ((unsigned)tag_byte & 0xe0) << CBS_ASN1_TAG_SHIFT;
  unsigned tag_number = tag_byte & 0x1f;
  if (tag_number == 0x1f) {
@@ -263,7 +270,7 @@ static int parse_asn1_tag(CBS *cbs, unsigned *out) {
    if (!parse_base128_integer(cbs, &v) ||
        // Check the tag number is within our supported bounds.
        v > CBS_ASN1_TAG_NUMBER_MASK ||
        // Small tag numbers should have used low tag number form.
        // Small tag numbers should have used low tag number form, even in BER.
        v < 0x1f) {
      return 0;
    }
@@ -277,13 +284,17 @@ static int parse_asn1_tag(CBS *cbs, unsigned *out) {
}

static int cbs_get_any_asn1_element(CBS *cbs, CBS *out, unsigned *out_tag,
                                    size_t *out_header_len, int ber_ok) {
                                    size_t *out_header_len, int *out_ber_found,
                                    int ber_ok) {
  CBS header = *cbs;
  CBS throwaway;

  if (out == NULL) {
    out = &throwaway;
  }
  if (ber_ok) {
    *out_ber_found = 0;
  }

  unsigned tag;
  if (!parse_asn1_tag(&header, &tag)) {
@@ -321,27 +332,38 @@ static int cbs_get_any_asn1_element(CBS *cbs, CBS *out, unsigned *out_tag,
    if (out_header_len != NULL) {
      *out_header_len = header_len;
    }
    *out_ber_found = 1;
    return CBS_get_bytes(cbs, out, header_len);
  }

  // ITU-T X.690 clause 8.1.3.5.c specifies that the value 0xff shall not be
  // used as the first byte of the length. If this parser encounters that
  // value, num_bytes will be parsed as 127, which will fail the check below.
  // value, num_bytes will be parsed as 127, which will fail this check.
  if (num_bytes == 0 || num_bytes > 4) {
    return 0;
  }
  if (!cbs_get_u(&header, &len64, num_bytes)) {
    return 0;
  }
  // ITU-T X.690 section 10.1 (DER length forms) requires encoding the length
  // with the minimum number of octets.
  // ITU-T X.690 section 10.1 (DER length forms) requires encoding the
  // length with the minimum number of octets. BER could, technically, have
  // 125 superfluous zero bytes. We do not attempt to handle that and still
  // require that the length fit in a |uint32_t| for BER.
  if (len64 < 128) {
    // Length should have used short-form encoding.
    return 0;
    if (ber_ok) {
      *out_ber_found = 1;
    } else {
      return 0;
    }
  }
  if ((len64 >> ((num_bytes-1)*8)) == 0) {
  if ((len64 >> ((num_bytes - 1) * 8)) == 0) {
    // Length should have been at least one byte shorter.
    return 0;
    if (ber_ok) {
      *out_ber_found = 1;
    } else {
      return 0;
    }
  }
  len = len64;
  if (len + header_len + num_bytes < len) {
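For reference, the length forms these checks enforce, as a worked example (the framing follows X.690; the byte strings are illustrative, not from the diff):

// DER length encodings for a few values:
//   len = 5    -> 0x05              (short form, len < 128)
//   len = 128  -> 0x81 0x80         (long form, one length octet)
//   len = 300  -> 0x82 0x01 0x2c    (long form, two length octets)
// Rejected above (in DER mode):
//   0x81 0x05          -- long form where short form was required
//   0x82 0x00 0x80     -- leading zero length octet (non-minimal)
//   0xff ...           -- first length byte 0xff (X.690 clause 8.1.3.5.c)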
@@ -374,13 +396,15 @@ int CBS_get_any_asn1(CBS *cbs, CBS *out, unsigned *out_tag) {
int CBS_get_any_asn1_element(CBS *cbs, CBS *out, unsigned *out_tag,
                             size_t *out_header_len) {
  return cbs_get_any_asn1_element(cbs, out, out_tag, out_header_len,
                                  0 /* DER only */);
                                  NULL, 0 /* DER only */);
}

int CBS_get_any_ber_asn1_element(CBS *cbs, CBS *out, unsigned *out_tag,
                                 size_t *out_header_len) {
  return cbs_get_any_asn1_element(cbs, out, out_tag, out_header_len,
                                  1 /* BER allowed */);
                                 size_t *out_header_len, int *out_ber_found) {
  int ber_found_temp;
  return cbs_get_any_asn1_element(
      cbs, out, out_tag, out_header_len,
      out_ber_found ? out_ber_found : &ber_found_temp, 1 /* BER allowed */);
}

static int cbs_get_asn1(CBS *cbs, CBS *out, unsigned tag_value,
|
|||
|
||||
int CBS_get_asn1_uint64(CBS *cbs, uint64_t *out) {
|
||||
CBS bytes;
|
||||
if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_INTEGER)) {
|
||||
if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_INTEGER) ||
|
||||
!CBS_is_unsigned_asn1_integer(&bytes)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
*out = 0;
|
||||
const uint8_t *data = CBS_data(&bytes);
|
||||
size_t len = CBS_len(&bytes);
|
||||
|
||||
if (len == 0) {
|
||||
// An INTEGER is encoded with at least one octet.
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ((data[0] & 0x80) != 0) {
|
||||
// Negative number.
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (data[0] == 0 && len > 1 && (data[1] & 0x80) == 0) {
|
||||
// Extra leading zeros.
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
if ((*out >> 56) != 0) {
|
||||
// Too large to represent as a uint64_t.
|
||||
|
@@ -462,31 +471,21 @@ int CBS_get_asn1_uint64(CBS *cbs, uint64_t *out) {
}

int CBS_get_asn1_int64(CBS *cbs, int64_t *out) {
  int is_negative;
  CBS bytes;
  if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_INTEGER)) {
  if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_INTEGER) ||
      !CBS_is_valid_asn1_integer(&bytes, &is_negative)) {
    return 0;
  }
  const uint8_t *data = CBS_data(&bytes);
  const size_t len = CBS_len(&bytes);

  if (len == 0 || len > sizeof(int64_t)) {
    // An INTEGER is encoded with at least one octet.
  if (len > sizeof(int64_t)) {
    return 0;
  }
  if (len > 1) {
    if (data[0] == 0 && (data[1] & 0x80) == 0) {
      return 0;  // Extra leading zeros.
    }
    if (data[0] == 0xff && (data[1] & 0x80) != 0) {
      return 0;  // Extra leading 0xff.
    }
  }

  union {
    int64_t i;
    uint8_t bytes[sizeof(int64_t)];
  } u;
  const int is_negative = (data[0] & 0x80);
  memset(u.bytes, is_negative ? 0xff : 0, sizeof(u.bytes));  // Sign-extend.
  for (size_t i = 0; i < len; i++) {
    u.bytes[i] = data[len - i - 1];
@@ -635,6 +634,30 @@ int CBS_asn1_bitstring_has_bit(const CBS *cbs, unsigned bit) {
         (CBS_data(cbs)[byte_num] & (1 << bit_num)) != 0;
}

int CBS_is_valid_asn1_integer(const CBS *cbs, int *out_is_negative) {
  CBS copy = *cbs;
  uint8_t first_byte, second_byte;
  if (!CBS_get_u8(&copy, &first_byte)) {
    return 0;  // INTEGERs may not be empty.
  }
  if (out_is_negative != NULL) {
    *out_is_negative = (first_byte & 0x80) != 0;
  }
  if (!CBS_get_u8(&copy, &second_byte)) {
    return 1;  // One byte INTEGERs are always minimal.
  }
  if ((first_byte == 0x00 && (second_byte & 0x80) == 0) ||
      (first_byte == 0xff && (second_byte & 0x80) != 0)) {
    return 0;  // The value is minimal iff the first 9 bits are not all equal.
  }
  return 1;
}

int CBS_is_unsigned_asn1_integer(const CBS *cbs) {
  int is_negative;
  return CBS_is_valid_asn1_integer(cbs, &is_negative) && !is_negative;
}

static int add_decimal(CBB *out, uint64_t v) {
  char buf[DECIMAL_SIZE(uint64_t) + 1];
  BIO_snprintf(buf, sizeof(buf), "%" PRIu64, v);
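The "first 9 bits" rule above compactly encodes DER's minimal-INTEGER requirement. A few concrete content encodings for reference (worked examples, not from the diff):

// DER INTEGER contents and how the checks above classify them:
//   {0x7f}        ->  127: valid, non-negative
//   {0x00, 0xff}  ->  255: valid (leading zero needed; 0xff alone is -1)
//   {0xff}        ->   -1: valid, negative
//   {0x00, 0x7f}  -> invalid: first 9 bits all zero (redundant 0x00)
//   {0xff, 0x80}  -> invalid: first 9 bits all one (redundant 0xff)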
@@ -0,0 +1,73 @@
/* Copyright (c) 2021, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <CBigNumBoringSSL_cpu.h>

#if defined(OPENSSL_AARCH64) && defined(OPENSSL_APPLE) && \
    !defined(OPENSSL_STATIC_ARMCAP)

#include <sys/sysctl.h>
#include <sys/types.h>

#include <CBigNumBoringSSL_arm_arch.h>

#include "internal.h"


extern uint32_t OPENSSL_armcap_P;

static int has_hw_feature(const char *name) {
  int value;
  size_t len = sizeof(value);
  if (sysctlbyname(name, &value, &len, NULL, 0) != 0) {
    return 0;
  }
  if (len != sizeof(int)) {
    // This should not happen. All the values queried should be integer-valued.
    assert(0);
    return 0;
  }

  // Per sys/sysctl.h:
  //
  //     Selectors that return errors are not support on the system. Supported
  //     features will return 1 if they are recommended or 0 if they are supported
  //     but are not expected to help performance. Future versions of these
  //     selectors may return larger values as necessary so it is best to test for
  //     non zero.
  return value != 0;
}

void OPENSSL_cpuid_setup(void) {
  // Apple ARM64 platforms have NEON and cryptography extensions available
  // statically, so we do not need to query them. In particular, there sometimes
  // are no sysctls corresponding to such features. See below.
#if !defined(__ARM_NEON) || !defined(__ARM_FEATURE_CRYPTO)
#error "NEON and crypto extensions should be statically available."
#endif
  OPENSSL_armcap_P =
      ARMV7_NEON | ARMV8_AES | ARMV8_PMULL | ARMV8_SHA1 | ARMV8_SHA256;

  // macOS has sysctls named both like "hw.optional.arm.FEAT_SHA512" and like
  // "hw.optional.armv8_2_sha512". There does not appear to be documentation on
  // which to use. The "armv8_2_sha512" style omits statically-available
  // features, while the "FEAT_SHA512" style includes them. However, the
  // "FEAT_SHA512" style was added in macOS 12, so we use the older style for
  // better compatibility and handle static features above.
  if (has_hw_feature("hw.optional.armv8_2_sha512")) {
    OPENSSL_armcap_P |= ARMV8_SHA512;
  }
}

#endif  // OPENSSL_AARCH64 && OPENSSL_APPLE && !OPENSSL_STATIC_ARMCAP
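A standalone way to poke at the same selector, assuming a macOS/arm64 host (a hypothetical test program, not part of the diff):

#include <stdio.h>
#include <sys/sysctl.h>

int main(void) {
  int value = 0;
  size_t len = sizeof(value);
  if (sysctlbyname("hw.optional.armv8_2_sha512", &value, &len, NULL, 0) != 0) {
    printf("selector not supported\n");  // Treated as "feature absent" above.
    return 1;
  }
  printf("hw.optional.armv8_2_sha512 = %d\n", value);
  return 0;
}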
@@ -50,6 +50,9 @@ void OPENSSL_cpuid_setup(void) {
  if (hwcap & ZX_ARM64_FEATURE_ISA_SHA2) {
    OPENSSL_armcap_P |= ARMV8_SHA256;
  }
  // As of writing, Fuchsia does not have a flag for ARMv8.2 SHA-512
  // extensions. When it does, add it here. See
  // https://bugs.fuchsia.dev/p/fuchsia/issues/detail?id=90759.
}

#endif  // OPENSSL_AARCH64 && !OPENSSL_STATIC_ARMCAP
#endif  // OPENSSL_AARCH64 && OPENSSL_FUCHSIA && !OPENSSL_STATIC_ARMCAP
@@ -36,6 +36,7 @@ void OPENSSL_cpuid_setup(void) {
  static const unsigned long kPMULL = 1 << 4;
  static const unsigned long kSHA1 = 1 << 5;
  static const unsigned long kSHA256 = 1 << 6;
  static const unsigned long kSHA512 = 1 << 21;

  if ((hwcap & kNEON) == 0) {
    // Matching OpenSSL, if NEON is missing, don't report other features

@@ -57,6 +58,9 @@ void OPENSSL_cpuid_setup(void) {
  if (hwcap & kSHA256) {
    OPENSSL_armcap_P |= ARMV8_SHA256;
  }
  if (hwcap & kSHA512) {
    OPENSSL_armcap_P |= ARMV8_SHA512;
  }
}

#endif  // OPENSSL_AARCH64 && !OPENSSL_STATIC_ARMCAP
#endif  // OPENSSL_AARCH64 && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP
@@ -0,0 +1,43 @@
/* Copyright (c) 2018, Google Inc.
 * Copyright (c) 2020, Arm Ltd.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <CBigNumBoringSSL_cpu.h>

#if defined(OPENSSL_AARCH64) && defined(OPENSSL_WINDOWS) && \
    !defined(OPENSSL_STATIC_ARMCAP)

#include <windows.h>

#include <CBigNumBoringSSL_arm_arch.h>

#include "internal.h"

extern uint32_t OPENSSL_armcap_P;
void OPENSSL_cpuid_setup(void) {
  // We do not need to check for the presence of NEON, as Armv8-A always has it
  OPENSSL_armcap_P |= ARMV7_NEON;

  if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) {
    // These are all covered by one call in Windows
    OPENSSL_armcap_P |= ARMV8_AES;
    OPENSSL_armcap_P |= ARMV8_PMULL;
    OPENSSL_armcap_P |= ARMV8_SHA1;
    OPENSSL_armcap_P |= ARMV8_SHA256;
  }
  // As of writing, Windows does not have a |PF_*| value for ARMv8.2 SHA-512
  // extensions. When it does, add it here.
}

#endif  // OPENSSL_AARCH64 && OPENSSL_WINDOWS && !OPENSSL_STATIC_ARMCAP
@@ -22,15 +22,15 @@

extern uint32_t OPENSSL_armcap_P;

char CRYPTO_is_NEON_capable_at_runtime(void) {
int CRYPTO_is_NEON_capable_at_runtime(void) {
  return (OPENSSL_armcap_P & ARMV7_NEON) != 0;
}

int CRYPTO_is_ARMv8_AES_capable(void) {
int CRYPTO_is_ARMv8_AES_capable_at_runtime(void) {
  return (OPENSSL_armcap_P & ARMV8_AES) != 0;
}

int CRYPTO_is_ARMv8_PMULL_capable(void) {
int CRYPTO_is_ARMv8_PMULL_capable_at_runtime(void) {
  return (OPENSSL_armcap_P & ARMV8_PMULL) != 0;
}
@@ -23,7 +23,7 @@
#include <CBigNumBoringSSL_arm_arch.h>
#include <CBigNumBoringSSL_mem.h>

#include "cpu-arm-linux.h"
#include "cpu_arm_linux.h"

#define AT_HWCAP 16
#define AT_HWCAP2 26
@@ -146,11 +146,13 @@ extern uint32_t OPENSSL_armcap_P;
static int g_has_broken_neon, g_needs_hwcap2_workaround;

void OPENSSL_cpuid_setup(void) {
  char *cpuinfo_data;
  size_t cpuinfo_len;
  if (!read_file(&cpuinfo_data, &cpuinfo_len, "/proc/cpuinfo")) {
    return;
  }
  // We ignore the return value of |read_file| and proceed with an empty
  // /proc/cpuinfo on error. If |getauxval| works, we will still detect
  // capabilities. There may be a false positive due to
  // |crypto_cpuinfo_has_broken_neon|, but this is now rare.
  char *cpuinfo_data = NULL;
  size_t cpuinfo_len = 0;
  read_file(&cpuinfo_data, &cpuinfo_len, "/proc/cpuinfo");
  STRING_PIECE cpuinfo;
  cpuinfo.data = cpuinfo_data;
  cpuinfo.len = cpuinfo_len;
@@ -173,7 +175,13 @@ void OPENSSL_cpuid_setup(void) {
    hwcap = crypto_get_arm_hwcap_from_cpuinfo(&cpuinfo);
  }

  // Clear NEON support if known broken.
  // Clear NEON support if known broken. Note, if NEON is available statically,
  // the non-NEON code is dropped and this workaround is a no-op.
  //
  // TODO(davidben): The Android NDK now builds with NEON statically available
  // by default. Cronet still has some consumers that support NEON-less devices
  // (b/150371744). Get metrics on whether they still see this CPU and, if not,
  // remove this check entirely.
  g_has_broken_neon = crypto_cpuinfo_has_broken_neon(&cpuinfo);
  if (g_has_broken_neon) {
    hwcap &= ~HWCAP_NEON;
@@ -184,7 +192,10 @@ void OPENSSL_cpuid_setup(void) {
    OPENSSL_armcap_P |= ARMV7_NEON;

    // Some ARMv8 Android devices don't expose AT_HWCAP2. Fall back to
    // /proc/cpuinfo. See https://crbug.com/596156.
    // /proc/cpuinfo. See https://crbug.com/boringssl/46. As of February 2021,
    // this is now rare (see Chrome's Net.NeedsHWCAP2Workaround metric), but AES
    // and PMULL extensions are very useful, so we still carry the workaround
    // for now.
    unsigned long hwcap2 = 0;
    if (getauxval != NULL) {
      hwcap2 = getauxval(AT_HWCAP2);
@@ -16,6 +16,8 @@

#include <CBigNumBoringSSL_cpu.h>

#include "fipsmodule/rand/fork_detect.h"
#include "fipsmodule/rand/internal.h"
#include "internal.h"

@@ -102,6 +104,9 @@ HIDDEN uint32_t OPENSSL_armcap_P =
#endif
#if defined(OPENSSL_STATIC_ARMCAP_PMULL) || defined(__ARM_FEATURE_CRYPTO)
    ARMV8_PMULL |
#endif
#if defined(__ARM_FEATURE_SHA512)
    ARMV8_SHA512 |
#endif
    0;
@@ -174,6 +179,15 @@ int CRYPTO_has_asm(void) {
#endif
}

void CRYPTO_pre_sandbox_init(void) {
  // Read from /proc/cpuinfo if needed.
  CRYPTO_library_init();
  // Open /dev/urandom if needed.
  CRYPTO_init_sysrand();
  // Set up MADV_WIPEONFORK state if needed.
  CRYPTO_get_fork_generation();
}

const char *SSLeay_version(int which) { return OpenSSL_version(which); }

const char *OpenSSL_version(int which) {
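|CRYPTO_pre_sandbox_init| exists so a process can warm up every lazily-opened resource before dropping filesystem access. A hedged usage sketch; the sandbox entry function is a placeholder for whatever mechanism the application uses:

#include <openssl/crypto.h>

extern void enter_seccomp_sandbox(void);  // Hypothetical; app-specific.

void start_worker(void) {
  // Touch /proc/cpuinfo, /dev/urandom, and MADV_WIPEONFORK state now,
  // while the filesystem is still reachable.
  CRYPTO_pre_sandbox_init();
  enter_seccomp_sandbox();
  // BoringSSL calls made after this point no longer need those resources.
}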
@@ -368,84 +368,6 @@ void ERR_clear_system_error(void) {
  errno = 0;
}

char *ERR_error_string(uint32_t packed_error, char *ret) {
  static char buf[ERR_ERROR_STRING_BUF_LEN];

  if (ret == NULL) {
    // TODO(fork): remove this.
    ret = buf;
  }

#if !defined(NDEBUG)
  // This is aimed to help catch callers who don't provide
  // |ERR_ERROR_STRING_BUF_LEN| bytes of space.
  OPENSSL_memset(ret, 0, ERR_ERROR_STRING_BUF_LEN);
#endif

  return ERR_error_string_n(packed_error, ret, ERR_ERROR_STRING_BUF_LEN);
}

char *ERR_error_string_n(uint32_t packed_error, char *buf, size_t len) {
  char lib_buf[64], reason_buf[64];
  const char *lib_str, *reason_str;
  unsigned lib, reason;

  if (len == 0) {
    return NULL;
  }

  lib = ERR_GET_LIB(packed_error);
  reason = ERR_GET_REASON(packed_error);

  lib_str = ERR_lib_error_string(packed_error);
  reason_str = ERR_reason_error_string(packed_error);

  if (lib_str == NULL) {
    BIO_snprintf(lib_buf, sizeof(lib_buf), "lib(%u)", lib);
    lib_str = lib_buf;
  }

  if (reason_str == NULL) {
    BIO_snprintf(reason_buf, sizeof(reason_buf), "reason(%u)", reason);
    reason_str = reason_buf;
  }

  BIO_snprintf(buf, len, "error:%08" PRIx32 ":%s:OPENSSL_internal:%s",
               packed_error, lib_str, reason_str);

  if (strlen(buf) == len - 1) {
    // output may be truncated; make sure we always have 5 colon-separated
    // fields, i.e. 4 colons.
    static const unsigned num_colons = 4;
    unsigned i;
    char *s = buf;

    if (len <= num_colons) {
      // In this situation it's not possible to ensure that the correct number
      // of colons are included in the output.
      return buf;
    }

    for (i = 0; i < num_colons; i++) {
      char *colon = strchr(s, ':');
      char *last_pos = &buf[len - 1] - num_colons + i;

      if (colon == NULL || colon > last_pos) {
        // Set colon |i| at the last possible position (buf[len-1] is the
        // terminating 0). If we're setting this colon, then the whole of the
        // rest of the string must be colons in order to have the correct
        // number.
        OPENSSL_memset(last_pos, ':', num_colons - i);
        break;
      }

      s = colon + 1;
    }
  }

  return buf;
}

// err_string_cmp is a compare function for searching error values with
// |bsearch| in |err_string_lookup|.
static int err_string_cmp(const void *a, const void *b) {
@@ -530,7 +452,7 @@ static const char *const kLibraryNames[ERR_NUM_LIBS] = {
    "User defined functions",  // ERR_LIB_USER
};

const char *ERR_lib_error_string(uint32_t packed_error) {
static const char *err_lib_error_string(uint32_t packed_error) {
  const uint32_t lib = ERR_GET_LIB(packed_error);

  if (lib >= ERR_NUM_LIBS) {
@@ -539,11 +461,16 @@ const char *ERR_lib_error_string(uint32_t packed_error) {
  return kLibraryNames[lib];
}

const char *ERR_lib_error_string(uint32_t packed_error) {
  const char *ret = err_lib_error_string(packed_error);
  return ret == NULL ? "unknown library" : ret;
}

const char *ERR_func_error_string(uint32_t packed_error) {
  return "OPENSSL_internal";
}

const char *ERR_reason_error_string(uint32_t packed_error) {
static const char *err_reason_error_string(uint32_t packed_error) {
  const uint32_t lib = ERR_GET_LIB(packed_error);
  const uint32_t reason = ERR_GET_REASON(packed_error);
@@ -579,6 +506,86 @@ const char *ERR_reason_error_string(uint32_t packed_error) {
                           kOpenSSLReasonValuesLen, kOpenSSLReasonStringData);
}

const char *ERR_reason_error_string(uint32_t packed_error) {
  const char *ret = err_reason_error_string(packed_error);
  return ret == NULL ? "unknown error" : ret;
}

char *ERR_error_string(uint32_t packed_error, char *ret) {
  static char buf[ERR_ERROR_STRING_BUF_LEN];

  if (ret == NULL) {
    // TODO(fork): remove this.
    ret = buf;
  }

#if !defined(NDEBUG)
  // This is aimed to help catch callers who don't provide
  // |ERR_ERROR_STRING_BUF_LEN| bytes of space.
  OPENSSL_memset(ret, 0, ERR_ERROR_STRING_BUF_LEN);
#endif

  return ERR_error_string_n(packed_error, ret, ERR_ERROR_STRING_BUF_LEN);
}

char *ERR_error_string_n(uint32_t packed_error, char *buf, size_t len) {
  if (len == 0) {
    return NULL;
  }

  unsigned lib = ERR_GET_LIB(packed_error);
  unsigned reason = ERR_GET_REASON(packed_error);

  const char *lib_str = err_lib_error_string(packed_error);
  const char *reason_str = err_reason_error_string(packed_error);

  char lib_buf[64], reason_buf[64];
  if (lib_str == NULL) {
    BIO_snprintf(lib_buf, sizeof(lib_buf), "lib(%u)", lib);
    lib_str = lib_buf;
  }

  if (reason_str == NULL) {
    BIO_snprintf(reason_buf, sizeof(reason_buf), "reason(%u)", reason);
    reason_str = reason_buf;
  }

  BIO_snprintf(buf, len, "error:%08" PRIx32 ":%s:OPENSSL_internal:%s",
               packed_error, lib_str, reason_str);

  if (strlen(buf) == len - 1) {
    // output may be truncated; make sure we always have 5 colon-separated
    // fields, i.e. 4 colons.
    static const unsigned num_colons = 4;
    unsigned i;
    char *s = buf;

    if (len <= num_colons) {
      // In this situation it's not possible to ensure that the correct number
      // of colons are included in the output.
      return buf;
    }

    for (i = 0; i < num_colons; i++) {
      char *colon = strchr(s, ':');
      char *last_pos = &buf[len - 1] - num_colons + i;

      if (colon == NULL || colon > last_pos) {
        // Set colon |i| at the last possible position (buf[len-1] is the
        // terminating 0). If we're setting this colon, then the whole of the
        // rest of the string must be colons in order to have the correct
        // number.
        OPENSSL_memset(last_pos, ':', num_colons - i);
        break;
      }

      s = colon + 1;
    }
  }

  return buf;
}

void ERR_print_errors_cb(ERR_print_errors_callback_t callback, void *ctx) {
  char buf[ERR_ERROR_STRING_BUF_LEN];
  char buf2[1024];
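The resulting strings have the shape error:<hex>:<library>:OPENSSL_internal:<reason>. A hedged usage sketch, assuming the public openssl/err.h API (the sample error value and output are illustrative):

#include <stdio.h>
#include <openssl/err.h>

void print_last_error(void) {
  char buf[ERR_ERROR_STRING_BUF_LEN];
  uint32_t err = ERR_get_error();
  ERR_error_string_n(err, buf, sizeof(buf));
  // e.g. "error:0900006e:PEM routines:OPENSSL_internal:NO_START_LINE"
  fprintf(stderr, "%s\n", buf);
}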
@@ -738,6 +745,22 @@ void ERR_add_error_dataf(const char *format, ...) {
  err_set_error_data(buf);
}

void ERR_set_error_data(char *data, int flags) {
  if (!(flags & ERR_FLAG_STRING)) {
    // We do not support non-string error data.
    assert(0);
    return;
  }
  if (flags & ERR_FLAG_MALLOCED) {
    err_set_error_data(data);
  } else {
    char *copy = OPENSSL_strdup(data);
    if (copy != NULL) {
      err_set_error_data(copy);
    }
  }
}

int ERR_set_mark(void) {
  ERR_STATE *const state = err_get_state();
(File diff suppressed because it is too large.)
@@ -57,7 +57,23 @@
void AES_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                        const AES_KEY *key, uint8_t ivec[AES_BLOCK_SIZE],
                        uint8_t ecount_buf[AES_BLOCK_SIZE], unsigned int *num) {
  CRYPTO_ctr128_encrypt(in, out, len, key, ivec, ecount_buf, num, AES_encrypt);
  if (hwaes_capable()) {
    CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num,
                                aes_hw_ctr32_encrypt_blocks);
  } else if (vpaes_capable()) {
#if defined(VPAES_CTR32)
    // TODO(davidben): On ARM, where |BSAES| is additionally defined, this could
    // use |vpaes_ctr32_encrypt_blocks_with_bsaes|.
    CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num,
                                vpaes_ctr32_encrypt_blocks);
#else
    CRYPTO_ctr128_encrypt(in, out, len, key, ivec, ecount_buf, num,
                          vpaes_encrypt);
#endif
  } else {
    CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num,
                                aes_nohw_ctr32_encrypt_blocks);
  }
}

void AES_ecb_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key,
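The dispatch above is invisible to callers; the public entry point keeps its OpenSSL-compatible shape. A minimal usage sketch, assuming openssl/aes.h (key and buffers are placeholders):

#include <string.h>
#include <openssl/aes.h>

void ctr_encrypt_demo(const uint8_t key_bytes[16],
                      const uint8_t iv[AES_BLOCK_SIZE],
                      const uint8_t *plaintext, uint8_t *ciphertext,
                      size_t len) {
  AES_KEY key;
  AES_set_encrypt_key(key_bytes, 128, &key);

  uint8_t ivec[AES_BLOCK_SIZE];
  uint8_t ecount_buf[AES_BLOCK_SIZE] = {0};
  unsigned int num = 0;
  memcpy(ivec, iv, AES_BLOCK_SIZE);

  // Internally selects the hardware, vector-permutation, or fallback
  // implementation, per the dispatch above.
  AES_ctr128_encrypt(plaintext, ciphertext, len, &key, ivec, ecount_buf, &num);
}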
@@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
@@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
@@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)
@@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
@@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
@@ -250,6 +250,7 @@ Ldec_key_abort:
#endif
.align 5
_aes_hw_encrypt:
AARCH64_VALID_CALL_TARGET
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
@@ -282,6 +283,7 @@ Loop_enc:
#endif
.align 5
_aes_hw_decrypt:
AARCH64_VALID_CALL_TARGET
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
@@ -630,20 +632,34 @@ _aes_hw_ctr32_encrypt_blocks:
add r7,r3,#32
mov r6,r5
movlo r12,#0

@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
@ affected by silicon errata #1742098 [0] and #1655431 [1],
@ respectively, where the second instruction of an aese/aesmc
@ instruction pair may execute twice if an interrupt is taken right
@ after the first instruction consumes an input register of which a
@ single 32-bit lane has been updated the last time it was modified.
@
@ This function uses a counter in one 32-bit lane. The vmov.32 lines
@ could write to q1 and q10 directly, but that trips this bug.
@ We write to q6 and copy to the final register as a workaround.
@
@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev r8, r8
#endif
vorr q1,q0,q0
add r10, r8, #1
vorr q10,q0,q0
add r8, r8, #2
vorr q6,q0,q0
rev r10, r10
vmov.32 d3[1],r10
vmov.32 d13[1],r10
add r8, r8, #2
vorr q1,q6,q6
bls Lctr32_tail
rev r12, r8
vmov.32 d13[1],r12
sub r2,r2,#3 @ bias
vmov.32 d21[1],r12
vorr q10,q6,q6
b Loop3x_ctr32

.align 4
@@ -670,11 +686,11 @@ Loop3x_ctr32:
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
vld1.8 {q2},[r0]!
vorr q0,q6,q6
add r9,r8,#1
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]!
vorr q1,q6,q6
rev r9,r9
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
@@ -683,8 +699,6 @@ Loop3x_ctr32:
mov r7,r3
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
@@ -699,21 +713,26 @@ Loop3x_ctr32:
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
@ Note the logic to update q0, q1, and q10 is written to work
@ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
@ 32-bit mode. See the comment above.
veor q11,q11,q7
rev r9,r9
vmov.32 d13[1], r9
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
vorr q0,q6,q6
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vmov.32 d13[1], r10
rev r12,r8
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10
rev r12,r8
vorr q1,q6,q6
vmov.32 d13[1], r12
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
vorr q10,q6,q6
subs r2,r2,#3
.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
@@ -245,6 +245,7 @@ aes_hw_set_decrypt_key:
.type aes_hw_encrypt,%function
.align 5
aes_hw_encrypt:
AARCH64_VALID_CALL_TARGET
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
@@ -275,6 +276,7 @@ aes_hw_encrypt:
.type aes_hw_decrypt,%function
.align 5
aes_hw_decrypt:
AARCH64_VALID_CALL_TARGET
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
@@ -619,20 +621,34 @@ aes_hw_ctr32_encrypt_blocks:
add r7,r3,#32
mov r6,r5
movlo r12,#0

@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
@ affected by silicon errata #1742098 [0] and #1655431 [1],
@ respectively, where the second instruction of an aese/aesmc
@ instruction pair may execute twice if an interrupt is taken right
@ after the first instruction consumes an input register of which a
@ single 32-bit lane has been updated the last time it was modified.
@
@ This function uses a counter in one 32-bit lane. The vmov.32 lines
@ could write to q1 and q10 directly, but that trips this bug.
@ We write to q6 and copy to the final register as a workaround.
@
@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev r8, r8
#endif
vorr q1,q0,q0
add r10, r8, #1
vorr q10,q0,q0
add r8, r8, #2
vorr q6,q0,q0
rev r10, r10
vmov.32 d3[1],r10
vmov.32 d13[1],r10
add r8, r8, #2
vorr q1,q6,q6
bls .Lctr32_tail
rev r12, r8
vmov.32 d13[1],r12
sub r2,r2,#3 @ bias
vmov.32 d21[1],r12
vorr q10,q6,q6
b .Loop3x_ctr32

.align 4
@@ -659,11 +675,11 @@ aes_hw_ctr32_encrypt_blocks:
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
vld1.8 {q2},[r0]!
vorr q0,q6,q6
add r9,r8,#1
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]!
vorr q1,q6,q6
rev r9,r9
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
@@ -672,8 +688,6 @@ aes_hw_ctr32_encrypt_blocks:
mov r7,r3
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
@@ -688,21 +702,26 @@ aes_hw_ctr32_encrypt_blocks:
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
@ Note the logic to update q0, q1, and q10 is written to work
@ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
@ 32-bit mode. See the comment above.
veor q11,q11,q7
rev r9,r9
vmov.32 d13[1], r9
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
vorr q0,q6,q6
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vmov.32 d13[1], r10
rev r12,r8
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10
rev r12,r8
vorr q1,q6,q6
vmov.32 d13[1], r12
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
vorr q10,q6,q6
subs r2,r2,#3
.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
@@ -34,6 +34,8 @@ Lrcon:
.align 5
_aes_hw_set_encrypt_key:
Lenc_key:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x3,#-1
@@ -202,6 +204,7 @@ Lenc_key_abort:

.align 5
_aes_hw_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
bl Lenc_key
@@ -235,6 +238,7 @@ Loop_imc:
eor x0,x0,x0 // return value
Ldec_key_abort:
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

.globl _aes_hw_encrypt
@@ -242,6 +246,7 @@ Ldec_key_abort:

.align 5
_aes_hw_encrypt:
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
@@ -272,6 +277,7 @@ Loop_enc:

.align 5
_aes_hw_decrypt:
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
@@ -302,6 +308,8 @@ Loop_dec:

.align 5
_aes_hw_cbc_encrypt:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
subs x2,x2,#16
@@ -593,6 +601,8 @@ Lcbc_abort:

.align 5
_aes_hw_ctr32_encrypt_blocks:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
@@ -612,20 +622,34 @@ _aes_hw_ctr32_encrypt_blocks:
add x7,x3,#32
mov w6,w5
csel x12,xzr,x12,lo

// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips this bug.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev w8, w8
#endif
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
mov v6.s[3],w10
add w8, w8, #2
orr v1.16b,v6.16b,v6.16b
b.ls Lctr32_tail
rev w12, w8
mov v6.s[3],w12
sub x2,x2,#3 // bias
mov v18.s[3],w12
orr v18.16b,v6.16b,v6.16b
b Loop3x_ctr32

.align 4
@@ -652,11 +676,11 @@ Loop3x_ctr32:
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
add w9,w8,#1
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
rev w9,w9
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
@@ -665,8 +689,6 @@ Loop3x_ctr32:
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
@@ -681,21 +703,26 @@ Loop3x_ctr32:
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
eor v19.16b,v19.16b,v7.16b
rev w9,w9
mov v6.s[3], w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
orr v0.16b,v6.16b,v6.16b
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
mov v6.s[3], w10
rev w12,w8
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
orr v1.16b,v6.16b,v6.16b
mov v6.s[3], w12
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
orr v18.16b,v6.16b,v6.16b
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
@@ -35,6 +35,8 @@
.align 5
aes_hw_set_encrypt_key:
.Lenc_key:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x3,#-1
@@ -203,6 +205,7 @@ aes_hw_set_encrypt_key:
.type aes_hw_set_decrypt_key,%function
.align 5
aes_hw_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
bl .Lenc_key
@@ -236,6 +239,7 @@ aes_hw_set_decrypt_key:
eor x0,x0,x0 // return value
.Ldec_key_abort:
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
.globl aes_hw_encrypt
@@ -243,6 +247,7 @@ aes_hw_set_decrypt_key:
.type aes_hw_encrypt,%function
.align 5
aes_hw_encrypt:
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
@@ -273,6 +278,7 @@ aes_hw_encrypt:
.type aes_hw_decrypt,%function
.align 5
aes_hw_decrypt:
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
@@ -303,6 +309,8 @@ aes_hw_decrypt:
.type aes_hw_cbc_encrypt,%function
.align 5
aes_hw_cbc_encrypt:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
subs x2,x2,#16
@@ -594,6 +602,8 @@ aes_hw_cbc_encrypt:
.type aes_hw_ctr32_encrypt_blocks,%function
.align 5
aes_hw_ctr32_encrypt_blocks:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
@@ -613,20 +623,34 @@ aes_hw_ctr32_encrypt_blocks:
add x7,x3,#32
mov w6,w5
csel x12,xzr,x12,lo

// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips this bug.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev w8, w8
#endif
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
mov v6.s[3],w10
add w8, w8, #2
orr v1.16b,v6.16b,v6.16b
b.ls .Lctr32_tail
rev w12, w8
mov v6.s[3],w12
sub x2,x2,#3 // bias
mov v18.s[3],w12
orr v18.16b,v6.16b,v6.16b
b .Loop3x_ctr32

.align 4
@@ -653,11 +677,11 @@ aes_hw_ctr32_encrypt_blocks:
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
add w9,w8,#1
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
rev w9,w9
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
@@ -666,8 +690,6 @@ aes_hw_ctr32_encrypt_blocks:
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
@@ -682,21 +704,26 @@ aes_hw_ctr32_encrypt_blocks:
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
eor v19.16b,v19.16b,v7.16b
rev w9,w9
mov v6.s[3], w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
orr v0.16b,v6.16b,v6.16b
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
mov v6.s[3], w10
rev w12,w8
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
orr v1.16b,v6.16b,v6.16b
mov v6.s[3], w12
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
orr v18.16b,v6.16b,v6.16b
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
@@ -14,6 +14,8 @@
#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
#include <CBigNumBoringSSL_arm_arch.h>

.text

.globl _bn_mul_mont
@@ -21,6 +23,7 @@

.align 5
_bn_mul_mont:
AARCH64_SIGN_LINK_REGISTER
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
@@ -218,11 +221,14 @@ Lcond_copy:
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
ret


.align 5
__bn_sqr8x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
// only from bn_mul_mont which has already signed the return address.
cmp x1,x2
b.ne __bn_mul4x_mont
Lsqr8x_mont:
@@ -976,11 +982,16 @@ Lsqr8x_done:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret


.align 5
__bn_mul4x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
// return address.
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
@@ -1414,6 +1425,8 @@ Lmul4x_done:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret

.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
@@ -15,6 +15,8 @@
#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
#include <CBigNumBoringSSL_arm_arch.h>

.text

.globl bn_mul_mont
@@ -22,6 +24,7 @@
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
AARCH64_SIGN_LINK_REGISTER
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
@@ -219,11 +222,14 @@ bn_mul_mont:
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
ret
.size bn_mul_mont,.-bn_mul_mont
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
// only from bn_mul_mont which has already signed the return address.
cmp x1,x2
b.ne __bn_mul4x_mont
.Lsqr8x_mont:
@@ -977,11 +983,16 @@ __bn_sqr8x_mont:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
// return address.
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
@@ -1415,6 +1426,8 @@ __bn_mul4x_mont:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
@@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)
@@ -101,26 +101,7 @@ void BN_free(BIGNUM *bn) {
}

void BN_clear_free(BIGNUM *bn) {
  char should_free;

  if (bn == NULL) {
    return;
  }

  if (bn->d != NULL) {
    if ((bn->flags & BN_FLG_STATIC_DATA) == 0) {
      OPENSSL_free(bn->d);
    } else {
      OPENSSL_cleanse(bn->d, bn->dmax * sizeof(bn->d[0]));
    }
  }

  should_free = (bn->flags & BN_FLG_MALLOCED) != 0;
  if (should_free) {
    OPENSSL_free(bn);
  } else {
    OPENSSL_cleanse(bn, sizeof(BIGNUM));
  }
  BN_free(bn);
}

BIGNUM *BN_dup(const BIGNUM *src) {
@ -302,6 +283,18 @@ int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
void bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
|
||||
if ((bn->flags & BN_FLG_STATIC_DATA) == 0) {
|
||||
OPENSSL_free(bn->d);
|
||||
}
|
||||
bn->d = (BN_ULONG *)words;
|
||||
|
||||
bn->width = num;
|
||||
bn->dmax = num;
|
||||
bn->neg = 0;
|
||||
bn->flags |= BN_FLG_STATIC_DATA;
|
||||
}
|
||||
|
||||
int bn_fits_in_words(const BIGNUM *bn, size_t num) {
|
||||
// All words beyond |num| must be zero.
|
||||
BN_ULONG mask = 0;
|
||||
|
|
|
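Editor's note: bn_set_static_words, added above, is a module-internal helper. A hedged usage sketch follows; it is not from the commit, the include path for the internal header is an assumption, and use_static_one is an illustrative name.

#include <CBigNumBoringSSL_bn.h>
#include "internal.h"  // declares bn_set_static_words (module-internal; assumption)

static const BN_ULONG kOneWords[] = {1};

void use_static_one(void) {
  BIGNUM one;
  BN_init(&one);
  bn_set_static_words(&one, kOneWords, 1);
  // |one| now aliases kOneWords without copying; BN_FLG_STATIC_DATA keeps
  // BN_free from releasing the constant words.
}
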
@ -64,10 +64,10 @@
#include "internal.h"


#if !defined(BN_CAN_DIVIDE_ULLONG) && !defined(BN_CAN_USE_INLINE_ASM)
// bn_div_words divides a double-width |h|,|l| by |d| and returns the result,
// which must fit in a |BN_ULONG|.
static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
OPENSSL_UNUSED static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l,
                                            BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;

@ -135,7 +135,6 @@ static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  ret |= q;
  return ret;
}
#endif  // !defined(BN_CAN_DIVIDE_ULLONG) && !defined(BN_CAN_USE_INLINE_ASM)

static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
                                    BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) {
@ -286,8 +285,10 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
  // pointer to the 'top' of snum
  wnump = &(snum->d[num_n - 1]);

  // Setup to 'res'
  res->neg = (numerator->neg ^ divisor->neg);
  // Setup |res|. |numerator| and |res| may alias, so we save |numerator->neg|
  // for later.
  const int numerator_neg = numerator->neg;
  res->neg = (numerator_neg ^ divisor->neg);
  if (!bn_wexpand(res, loop + 1)) {
    goto err;
  }
@ -380,14 +381,11 @@ int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
  bn_set_minimal_width(snum);

  if (rem != NULL) {
    // Keep a copy of the neg flag in numerator because if |rem| == |numerator|
    // |BN_rshift| will overwrite it.
    int neg = numerator->neg;
    if (!BN_rshift(rem, snum, norm_shift)) {
      goto err;
    }
    if (!BN_is_zero(rem)) {
      rem->neg = neg;
      rem->neg = numerator_neg;
    }
  }

@ -458,7 +456,7 @@ void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,

int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
                     const BIGNUM *numerator, const BIGNUM *divisor,
                     BN_CTX *ctx) {
                     unsigned divisor_min_bits, BN_CTX *ctx) {
  if (BN_is_negative(numerator) || BN_is_negative(divisor)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
@ -498,8 +496,26 @@ int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
  r->neg = 0;

  // Incorporate |numerator| into |r|, one bit at a time, reducing after each
  // step. At the start of each loop iteration, |r| < |divisor|
  for (int i = numerator->width - 1; i >= 0; i--) {
  // step. We maintain the invariant that |0 <= r < divisor| and
  // |q * divisor + r = n| where |n| is the portion of |numerator| incorporated
  // so far.
  //
  // First, we short-circuit the loop: if we know |divisor| has at least
  // |divisor_min_bits| bits, the top |divisor_min_bits - 1| can be incorporated
  // without reductions. This significantly speeds up |RSA_check_key|. For
  // simplicity, we round down to a whole number of words.
  assert(divisor_min_bits <= BN_num_bits(divisor));
  int initial_words = 0;
  if (divisor_min_bits > 0) {
    initial_words = (divisor_min_bits - 1) / BN_BITS2;
    if (initial_words > numerator->width) {
      initial_words = numerator->width;
    }
    OPENSSL_memcpy(r->d, numerator->d + numerator->width - initial_words,
                   initial_words * sizeof(BN_ULONG));
  }

  for (int i = numerator->width - initial_words - 1; i >= 0; i--) {
    for (int bit = BN_BITS2 - 1; bit >= 0; bit--) {
      // Incorporate the next bit of the numerator, by computing
      // r = 2*r or 2*r + 1. Note the result fits in one more word. We store the
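
Editor's note: the invariant q*divisor + r = n in the hunk above is easiest to see in a scalar model. The following stand-alone sketch (not the BoringSSL code; div_bit_by_bit is an illustrative name) runs the same bit-at-a-time loop on uint64_t, with an ordinary branch where the real code uses constant-time selects.

#include <assert.h>
#include <stdint.h>

static void div_bit_by_bit(uint64_t n, uint64_t d, uint64_t *q, uint64_t *r) {
  assert(d != 0);
  *q = 0;
  *r = 0;
  for (int i = 63; i >= 0; i--) {
    // r = 2*r or 2*r + 1: since r < d on entry, the result fits in one more
    // bit, exactly as the comment in the hunk notes.
    *r = (*r << 1) | ((n >> i) & 1);
    *q <<= 1;
    if (*r >= d) {  // the real code does this with constant-time selects
      *r -= d;
      *q |= 1;
    }
  }
  // Invariant on exit: q*d + r == n and r < d.
}

int main(void) {
  uint64_t q, r;
  div_bit_by_bit(1000003, 97, &q, &r);
  assert(q == 1000003 / 97 && r == 1000003 % 97);
  return 0;
}
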
@ -157,10 +157,11 @@ int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  BN_CTX_start(ctx);
  unsigned shift;
  BIGNUM *gcd = BN_CTX_get(ctx);
  int ret = gcd != NULL &&
  int ret = gcd != NULL &&  //
            bn_mul_consttime(r, a, b, ctx) &&
            bn_gcd_consttime(gcd, &shift, a, b, ctx) &&
            bn_div_consttime(r, NULL, r, gcd, ctx) &&
            // |gcd| has a secret bit width.
            bn_div_consttime(r, NULL, r, gcd, /*divisor_min_bits=*/0, ctx) &&
            bn_rshift_secret_shift(r, r, shift, ctx);
  BN_CTX_end(ctx);
  return ret;

@ -123,7 +123,7 @@
#ifndef OPENSSL_HEADER_BN_INTERNAL_H
#define OPENSSL_HEADER_BN_INTERNAL_H

#include <CBigNumBoringSSL_base.h>
#include <CBigNumBoringSSL_bn.h>

#if defined(OPENSSL_X86_64) && defined(_MSC_VER)
OPENSSL_MSVC_PRAGMA(warning(push, 3))
@ -241,6 +241,14 @@ void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
// least significant word first.
int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);

// bn_set_static_words acts like |bn_set_words|, but doesn't copy the data. A
// flag is set on |bn| so that |BN_free| won't attempt to free the data.
//
// The |STATIC_BIGNUM| macro is probably a better solution for this outside of
// the FIPS module. Inside of the FIPS module that macro generates rel.ro data,
// which doesn't work with FIPS requirements.
void bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num);

// bn_fits_in_words returns one if |bn| may be represented in |num| words, plus
// a sign bit, and zero otherwise.
int bn_fits_in_words(const BIGNUM *bn, size_t num);
@ -289,7 +297,7 @@ void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]);
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]);

// bn_sqr_comba8 sets |r| to |a|^2.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[4]);
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]);

// bn_sqr_comba4 sets |r| to |a|^2.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]);
@ -404,9 +412,19 @@ uint64_t bn_mont_n0(const BIGNUM *n);
int bn_mod_exp_base_2_consttime(BIGNUM *r, unsigned p, const BIGNUM *n,
                                BN_CTX *ctx);

#if defined(OPENSSL_X86_64) && defined(_MSC_VER)
#if defined(_MSC_VER)
#if defined(OPENSSL_X86_64)
#define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
#elif defined(OPENSSL_AARCH64)
#define BN_UMULT_LOHI(low, high, a, b) \
  do {                                 \
    const BN_ULONG _a = (a);           \
    const BN_ULONG _b = (b);           \
    (low) = _a * _b;                   \
    (high) = __umulh(_a, _b);          \
  } while (0)
#endif
#endif  // _MSC_VER

#if !defined(BN_ULLONG) && !defined(BN_UMULT_LOHI)
#error "Either BN_ULLONG or BN_UMULT_LOHI must be defined on every platform."
@ -534,12 +552,15 @@ int bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
// bn_div_consttime behaves like |BN_div|, but it rejects negative inputs and
// treats both inputs, including their magnitudes, as secret. It is, as a
// result, much slower than |BN_div| and should only be used for rare operations
// where Montgomery reduction is not available.
// where Montgomery reduction is not available. |divisor_min_bits| is a
// public lower bound for |BN_num_bits(divisor)|. When |divisor|'s bit width is
// public, this can speed up the operation.
//
// Note that |quotient->width| will be set pessimally to |numerator->width|.
OPENSSL_EXPORT int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
                                    const BIGNUM *numerator,
                                    const BIGNUM *divisor, BN_CTX *ctx);
                                    const BIGNUM *divisor,
                                    unsigned divisor_min_bits, BN_CTX *ctx);

// bn_is_relatively_prime checks whether GCD(|x|, |y|) is one. On success, it
// returns one and sets |*out_relatively_prime| to one if the GCD was one and

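Editor's note: the new MSVC/AArch64 branch of BN_UMULT_LOHI above pairs a plain 64x64 multiply with __umulh for the high half. A hedged portable model of the same contract, using the unsigned __int128 extension available on GCC/Clang (umult_lohi is an illustrative name, not a BoringSSL symbol):

#include <assert.h>
#include <stdint.h>

static void umult_lohi(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high) {
  unsigned __int128 product = (unsigned __int128)a * b;
  *low = (uint64_t)product;
  *high = (uint64_t)(product >> 64);  // this is what __umulh(a, b) returns
}

int main(void) {
  uint64_t lo, hi;
  umult_lohi(0xffffffffffffffffULL, 0xffffffffffffffffULL, &lo, &hi);
  assert(lo == 1 && hi == 0xfffffffffffffffeULL);  // (2^64-1)^2
  return 0;
}
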
@ -115,10 +115,6 @@
#include "../../internal.h"


// The quick sieve algorithm approach to weeding out primes is Philip
// Zimmermann's, as implemented in PGP. I have had a read of his comments and
// implemented my own version.

// kPrimes contains the first 1024 primes.
static const uint16_t kPrimes[] = {
    2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37,
@ -363,6 +359,18 @@ static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add,
                                  const BIGNUM *rem, BN_CTX *ctx);

BN_GENCB *BN_GENCB_new(void) {
  BN_GENCB *callback = OPENSSL_malloc(sizeof(BN_GENCB));
  if (callback == NULL) {
    OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE);
    return NULL;
  }
  OPENSSL_memset(callback, 0, sizeof(BN_GENCB));
  return callback;
}

void BN_GENCB_free(BN_GENCB *callback) { OPENSSL_free(callback); }

void BN_GENCB_set(BN_GENCB *callback,
                  int (*f)(int event, int n, struct bn_gencb_st *),
                  void *arg) {

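Editor's note: BN_GENCB_new/BN_GENCB_free, added above, give callers a heap-allocated progress callback. A hedged usage sketch (not from the commit; on_progress and generate_prime are illustrative names, and the header name follows this vendored tree's prefix convention):

#include <CBigNumBoringSSL_bn.h>

static int on_progress(int event, int n, struct bn_gencb_st *cb) {
  (void)event;
  (void)n;
  (void)cb;
  return 1;  // returning 0 aborts the prime search
}

int generate_prime(BIGNUM *out) {
  BN_GENCB *cb = BN_GENCB_new();
  if (cb == NULL) {
    return 0;
  }
  BN_GENCB_set(cb, on_progress, NULL);
  int ok = BN_generate_prime_ex(out, 2048, /*safe=*/0, NULL, NULL, cb);
  BN_GENCB_free(cb);
  return ok;
}
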
@ -75,10 +75,8 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
    if (ret == NULL) {
      ret = BN_new();
    }
    if (ret == NULL) {
      goto end;
    }
    if (!BN_set_word(ret, BN_is_bit_set(a, 0))) {
    if (ret == NULL ||
        !BN_set_word(ret, BN_is_bit_set(a, 0))) {
      if (ret != in) {
        BN_free(ret);
      }
@ -88,17 +86,15 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
  }

  OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME);
  return (NULL);
  return NULL;
}

if (BN_is_zero(a) || BN_is_one(a)) {
  if (ret == NULL) {
    ret = BN_new();
  }
  if (ret == NULL) {
    goto end;
  }
  if (!BN_set_word(ret, BN_is_one(a))) {
  if (ret == NULL ||
      !BN_set_word(ret, BN_is_one(a))) {
    if (ret != in) {
      BN_free(ret);
    }

@ -57,6 +57,7 @@
#include <CBigNumBoringSSL_cipher.h>

#include <assert.h>
#include <limits.h>
#include <string.h>

#include <CBigNumBoringSSL_err.h>
@ -224,7 +225,6 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,

  ctx->buf_len = 0;
  ctx->final_used = 0;
  ctx->block_mask = ctx->cipher->block_size - 1;
  return 1;
}

@ -238,16 +238,31 @@ int EVP_DecryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,
  return EVP_CipherInit_ex(ctx, cipher, impl, key, iv, 0);
}

// block_remainder returns the number of bytes to remove from |len| to get a
// multiple of |ctx|'s block size.
static int block_remainder(const EVP_CIPHER_CTX *ctx, int len) {
  // |block_size| must be a power of two.
  assert(ctx->cipher->block_size != 0);
  assert((ctx->cipher->block_size & (ctx->cipher->block_size - 1)) == 0);
  return len & (ctx->cipher->block_size - 1);
}

int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len,
                      const uint8_t *in, int in_len) {
  int i, j, bl;
  // Ciphers that use blocks may write up to |bl| extra bytes. Ensure the output
  // does not overflow |*out_len|.
  int bl = ctx->cipher->block_size;
  if (bl > 1 && in_len > INT_MAX - bl) {
    OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW);
    return 0;
  }

  if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) {
    i = ctx->cipher->cipher(ctx, out, in, in_len);
    if (i < 0) {
    int ret = ctx->cipher->cipher(ctx, out, in, in_len);
    if (ret < 0) {
      return 0;
    } else {
      *out_len = i;
      *out_len = ret;
    }
    return 1;
  }
@ -257,7 +272,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len,
    return in_len == 0;
  }

  if (ctx->buf_len == 0 && (in_len & ctx->block_mask) == 0) {
  if (ctx->buf_len == 0 && block_remainder(ctx, in_len) == 0) {
    if (ctx->cipher->cipher(ctx, out, in, in_len)) {
      *out_len = in_len;
      return 1;
@ -267,8 +282,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len,
    }
  }

  i = ctx->buf_len;
  bl = ctx->cipher->block_size;
  int i = ctx->buf_len;
  assert(bl <= (int)sizeof(ctx->buf));
  if (i != 0) {
    if (bl - i > in_len) {
@ -277,7 +291,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len,
      *out_len = 0;
      return 1;
    } else {
      j = bl - i;
      int j = bl - i;
      OPENSSL_memcpy(&ctx->buf[i], in, j);
      if (!ctx->cipher->cipher(ctx, out, ctx->buf, bl)) {
        return 0;
@ -291,7 +305,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len,
    *out_len = 0;
  }

  i = in_len & ctx->block_mask;
  i = block_remainder(ctx, in_len);
  in_len -= i;
  if (in_len > 0) {
    if (!ctx->cipher->cipher(ctx, out, in, in_len)) {
@ -353,8 +367,13 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) {

int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len,
                      const uint8_t *in, int in_len) {
  int fix_len;
  unsigned int b;
  // Ciphers that use blocks may write up to |bl| extra bytes. Ensure the output
  // does not overflow |*out_len|.
  unsigned int b = ctx->cipher->block_size;
  if (b > 1 && in_len > INT_MAX - (int)b) {
    OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW);
    return 0;
  }

  if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) {
    int r = ctx->cipher->cipher(ctx, out, in, in_len);
@ -376,15 +395,12 @@ int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len,
    return EVP_EncryptUpdate(ctx, out, out_len, in, in_len);
  }

  b = ctx->cipher->block_size;
  assert(b <= sizeof(ctx->final));

  int fix_len = 0;
  if (ctx->final_used) {
    OPENSSL_memcpy(out, ctx->final, b);
    out += b;
    fix_len = 1;
  } else {
    fix_len = 0;
  }

  if (!EVP_EncryptUpdate(ctx, out, out_len, in, in_len)) {
@ -613,6 +629,18 @@ int EVP_DecryptInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,
  return EVP_CipherInit(ctx, cipher, key, iv, 0);
}

int EVP_CipherFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) {
  return EVP_CipherFinal_ex(ctx, out, out_len);
}

int EVP_EncryptFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) {
  return EVP_EncryptFinal_ex(ctx, out, out_len);
}

int EVP_DecryptFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) {
  return EVP_DecryptFinal_ex(ctx, out, out_len);
}

int EVP_add_cipher_alias(const char *a, const char *b) {
  return 1;
}

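Editor's note: block_remainder above relies on the identity len % block_size == len & (block_size - 1) for power-of-two block sizes, which is why the asserts are there. A trivial stand-alone self-check of that identity (not from the commit):

#include <assert.h>

int main(void) {
  for (int bs = 1; bs <= 64; bs <<= 1) {
    for (int len = 0; len < 1000; len++) {
      assert((len & (bs - 1)) == len % bs);
    }
  }
  return 0;
}
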
@ -68,6 +68,8 @@
OPENSSL_MSVC_PRAGMA(warning(push))
OPENSSL_MSVC_PRAGMA(warning(disable: 4702))  // Unreachable code.

#define AES_GCM_NONCE_LENGTH 12

#if defined(BSAES)
static void vpaes_ctr32_encrypt_blocks_with_bsaes(const uint8_t *in,
                                                  uint8_t *out, size_t blocks,
@ -139,10 +141,22 @@ typedef struct {

static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
                        const uint8_t *iv, int enc) {
  int ret, mode;
  int ret;
  EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
  const int mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK;

  if (mode == EVP_CIPH_CTR_MODE) {
    switch (ctx->key_len) {
      case 16:
        boringssl_fips_inc_counter(fips_counter_evp_aes_128_ctr);
        break;

      case 32:
        boringssl_fips_inc_counter(fips_counter_evp_aes_256_ctr);
        break;
    }
  }

  mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK;
  if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) {
    if (hwaes_capable()) {
      ret = aes_hw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
@ -351,6 +365,17 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
  if (!iv && !key) {
    return 1;
  }

  switch (ctx->key_len) {
    case 16:
      boringssl_fips_inc_counter(fips_counter_evp_aes_128_gcm);
      break;

    case 32:
      boringssl_fips_inc_counter(fips_counter_evp_aes_256_gcm);
      break;
  }

  if (key) {
    OPENSSL_memset(&gctx->gcm, 0, sizeof(gctx->gcm));
    gctx->ctr = aes_ctr_set_key(&gctx->ks.ks, &gctx->gcm.gcm_key, NULL, key,
@ -630,7 +655,7 @@ DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_gcm_generic) {
  out->nid = NID_aes_128_gcm;
  out->block_size = 1;
  out->key_len = 16;
  out->iv_len = 12;
  out->iv_len = AES_GCM_NONCE_LENGTH;
  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
  out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY |
               EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
@ -698,7 +723,7 @@ DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_gcm_generic) {
  out->nid = NID_aes_192_gcm;
  out->block_size = 1;
  out->key_len = 24;
  out->iv_len = 12;
  out->iv_len = AES_GCM_NONCE_LENGTH;
  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
  out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY |
               EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
@ -766,7 +791,7 @@ DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_gcm_generic) {
  out->nid = NID_aes_256_gcm;
  out->block_size = 1;
  out->key_len = 32;
  out->iv_len = 12;
  out->iv_len = AES_GCM_NONCE_LENGTH;
  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
  out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY |
               EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
@ -886,6 +911,16 @@ static int aead_aes_gcm_init_impl(struct aead_aes_gcm_ctx *gcm_ctx,
                                  size_t key_len, size_t tag_len) {
  const size_t key_bits = key_len * 8;

  switch (key_bits) {
    case 128:
      boringssl_fips_inc_counter(fips_counter_evp_aes_128_gcm);
      break;

    case 256:
      boringssl_fips_inc_counter(fips_counter_evp_aes_256_gcm);
      break;
  }

  if (key_bits != 128 && key_bits != 192 && key_bits != 256) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH);
    return 0;  // EVP_AEAD_CTX_init should catch this.
@ -931,21 +966,19 @@ static int aead_aes_gcm_init(EVP_AEAD_CTX *ctx, const uint8_t *key,

static void aead_aes_gcm_cleanup(EVP_AEAD_CTX *ctx) {}

static int aead_aes_gcm_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out,
                                     uint8_t *out_tag, size_t *out_tag_len,
                                     size_t max_out_tag_len,
                                     const uint8_t *nonce, size_t nonce_len,
                                     const uint8_t *in, size_t in_len,
                                     const uint8_t *extra_in,
                                     size_t extra_in_len,
                                     const uint8_t *ad, size_t ad_len) {
  struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;

  if (extra_in_len + ctx->tag_len < ctx->tag_len) {
static int aead_aes_gcm_seal_scatter_impl(
    const struct aead_aes_gcm_ctx *gcm_ctx,
    uint8_t *out, uint8_t *out_tag, size_t *out_tag_len, size_t max_out_tag_len,
    const uint8_t *nonce, size_t nonce_len,
    const uint8_t *in, size_t in_len,
    const uint8_t *extra_in, size_t extra_in_len,
    const uint8_t *ad, size_t ad_len,
    size_t tag_len) {
  if (extra_in_len + tag_len < tag_len) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
    return 0;
  }
  if (max_out_tag_len < extra_in_len + ctx->tag_len) {
  if (max_out_tag_len < extra_in_len + tag_len) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL);
    return 0;
  }
@ -989,18 +1022,35 @@ static int aead_aes_gcm_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out,
    }
  }

  CRYPTO_gcm128_tag(&gcm, out_tag + extra_in_len, ctx->tag_len);
  *out_tag_len = ctx->tag_len + extra_in_len;
  CRYPTO_gcm128_tag(&gcm, out_tag + extra_in_len, tag_len);
  *out_tag_len = tag_len + extra_in_len;

  return 1;
}

static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
                                    const uint8_t *nonce, size_t nonce_len,
                                    const uint8_t *in, size_t in_len,
                                    const uint8_t *in_tag, size_t in_tag_len,
                                    const uint8_t *ad, size_t ad_len) {
  struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;
static int aead_aes_gcm_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out,
                                     uint8_t *out_tag, size_t *out_tag_len,
                                     size_t max_out_tag_len,
                                     const uint8_t *nonce, size_t nonce_len,
                                     const uint8_t *in, size_t in_len,
                                     const uint8_t *extra_in,
                                     size_t extra_in_len,
                                     const uint8_t *ad, size_t ad_len) {
  const struct aead_aes_gcm_ctx *gcm_ctx =
      (const struct aead_aes_gcm_ctx *)&ctx->state;
  return aead_aes_gcm_seal_scatter_impl(
      gcm_ctx, out, out_tag, out_tag_len, max_out_tag_len, nonce, nonce_len, in,
      in_len, extra_in, extra_in_len, ad, ad_len, ctx->tag_len);
}

static int aead_aes_gcm_open_gather_impl(const struct aead_aes_gcm_ctx *gcm_ctx,
                                         uint8_t *out,
                                         const uint8_t *nonce, size_t nonce_len,
                                         const uint8_t *in, size_t in_len,
                                         const uint8_t *in_tag,
                                         size_t in_tag_len,
                                         const uint8_t *ad, size_t ad_len,
                                         size_t tag_len) {
  uint8_t tag[EVP_AEAD_AES_GCM_TAG_LEN];

  if (nonce_len == 0) {
@ -1008,7 +1058,7 @@ static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
    return 0;
  }

  if (in_tag_len != ctx->tag_len) {
  if (in_tag_len != tag_len) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
    return 0;
  }
@ -1035,8 +1085,8 @@ static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
    }
  }

  CRYPTO_gcm128_tag(&gcm, tag, ctx->tag_len);
  if (CRYPTO_memcmp(tag, in_tag, ctx->tag_len) != 0) {
  CRYPTO_gcm128_tag(&gcm, tag, tag_len);
  if (CRYPTO_memcmp(tag, in_tag, tag_len) != 0) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
    return 0;
  }
@ -1044,11 +1094,22 @@ static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
  return 1;
}

static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
                                    const uint8_t *nonce, size_t nonce_len,
                                    const uint8_t *in, size_t in_len,
                                    const uint8_t *in_tag, size_t in_tag_len,
                                    const uint8_t *ad, size_t ad_len) {
  struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *)&ctx->state;
  return aead_aes_gcm_open_gather_impl(gcm_ctx, out, nonce, nonce_len, in,
                                       in_len, in_tag, in_tag_len, ad, ad_len,
                                       ctx->tag_len);
}

DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 16;
  out->nonce_len = 12;
  out->nonce_len = AES_GCM_NONCE_LENGTH;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
  out->seal_scatter_supports_extra_in = 1;
@ -1063,7 +1124,7 @@ DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_192_gcm) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 24;
  out->nonce_len = 12;
  out->nonce_len = AES_GCM_NONCE_LENGTH;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
  out->seal_scatter_supports_extra_in = 1;
@ -1078,7 +1139,7 @@ DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 32;
  out->nonce_len = 12;
  out->nonce_len = AES_GCM_NONCE_LENGTH;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
  out->seal_scatter_supports_extra_in = 1;
@ -1089,6 +1150,116 @@ DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm) {
  out->open_gather = aead_aes_gcm_open_gather;
}

static int aead_aes_gcm_init_randnonce(EVP_AEAD_CTX *ctx, const uint8_t *key,
                                       size_t key_len,
                                       size_t requested_tag_len) {
  if (requested_tag_len != EVP_AEAD_DEFAULT_TAG_LENGTH) {
    if (requested_tag_len < AES_GCM_NONCE_LENGTH) {
      OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL);
      return 0;
    }
    requested_tag_len -= AES_GCM_NONCE_LENGTH;
  }

  if (!aead_aes_gcm_init(ctx, key, key_len, requested_tag_len)) {
    return 0;
  }

  ctx->tag_len += AES_GCM_NONCE_LENGTH;
  return 1;
}

static int aead_aes_gcm_seal_scatter_randnonce(
    const EVP_AEAD_CTX *ctx,
    uint8_t *out, uint8_t *out_tag, size_t *out_tag_len, size_t max_out_tag_len,
    const uint8_t *external_nonce, size_t external_nonce_len,
    const uint8_t *in, size_t in_len,
    const uint8_t *extra_in, size_t extra_in_len,
    const uint8_t *ad, size_t ad_len) {
  if (external_nonce_len != 0) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
    return 0;
  }

  uint8_t nonce[AES_GCM_NONCE_LENGTH];
  if (max_out_tag_len < sizeof(nonce)) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL);
    return 0;
  }

  RAND_bytes(nonce, sizeof(nonce));
  const struct aead_aes_gcm_ctx *gcm_ctx =
      (const struct aead_aes_gcm_ctx *)&ctx->state;
  if (!aead_aes_gcm_seal_scatter_impl(gcm_ctx, out, out_tag, out_tag_len,
                                      max_out_tag_len - AES_GCM_NONCE_LENGTH,
                                      nonce, sizeof(nonce), in, in_len,
                                      extra_in, extra_in_len, ad, ad_len,
                                      ctx->tag_len - AES_GCM_NONCE_LENGTH)) {
    return 0;
  }

  assert(*out_tag_len + sizeof(nonce) <= max_out_tag_len);
  memcpy(out_tag + *out_tag_len, nonce, sizeof(nonce));
  *out_tag_len += sizeof(nonce);

  return 1;
}

static int aead_aes_gcm_open_gather_randnonce(
    const EVP_AEAD_CTX *ctx, uint8_t *out,
    const uint8_t *external_nonce, size_t external_nonce_len,
    const uint8_t *in, size_t in_len,
    const uint8_t *in_tag, size_t in_tag_len,
    const uint8_t *ad, size_t ad_len) {
  if (external_nonce_len != 0) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
    return 0;
  }

  if (in_tag_len < AES_GCM_NONCE_LENGTH) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
    return 0;
  }
  const uint8_t *nonce = in_tag + in_tag_len - AES_GCM_NONCE_LENGTH;

  const struct aead_aes_gcm_ctx *gcm_ctx =
      (const struct aead_aes_gcm_ctx *)&ctx->state;
  return aead_aes_gcm_open_gather_impl(
      gcm_ctx, out, nonce, AES_GCM_NONCE_LENGTH, in, in_len, in_tag,
      in_tag_len - AES_GCM_NONCE_LENGTH, ad, ad_len,
      ctx->tag_len - AES_GCM_NONCE_LENGTH);
}

DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_randnonce) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 16;
  out->nonce_len = 0;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH;
  out->seal_scatter_supports_extra_in = 1;

  out->init = aead_aes_gcm_init_randnonce;
  out->cleanup = aead_aes_gcm_cleanup;
  out->seal_scatter = aead_aes_gcm_seal_scatter_randnonce;
  out->open_gather = aead_aes_gcm_open_gather_randnonce;
}

DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_randnonce) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 32;
  out->nonce_len = 0;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH;
  out->seal_scatter_supports_extra_in = 1;

  out->init = aead_aes_gcm_init_randnonce;
  out->cleanup = aead_aes_gcm_cleanup;
  out->seal_scatter = aead_aes_gcm_seal_scatter_randnonce;
  out->open_gather = aead_aes_gcm_open_gather_randnonce;
}

struct aead_aes_gcm_tls12_ctx {
  struct aead_aes_gcm_ctx gcm_ctx;
  uint64_t min_next_nonce;
@ -1128,7 +1299,7 @@ static int aead_aes_gcm_tls12_seal_scatter(
  struct aead_aes_gcm_tls12_ctx *gcm_ctx =
      (struct aead_aes_gcm_tls12_ctx *) &ctx->state;

  if (nonce_len != 12) {
  if (nonce_len != AES_GCM_NONCE_LENGTH) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
    return 0;
  }
@ -1155,7 +1326,7 @@ DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls12) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 16;
  out->nonce_len = 12;
  out->nonce_len = AES_GCM_NONCE_LENGTH;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
  out->seal_scatter_supports_extra_in = 1;
@ -1170,7 +1341,7 @@ DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls12) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 32;
  out->nonce_len = 12;
  out->nonce_len = AES_GCM_NONCE_LENGTH;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
  out->seal_scatter_supports_extra_in = 1;
@ -1223,7 +1394,7 @@ static int aead_aes_gcm_tls13_seal_scatter(
  struct aead_aes_gcm_tls13_ctx *gcm_ctx =
      (struct aead_aes_gcm_tls13_ctx *) &ctx->state;

  if (nonce_len != 12) {
  if (nonce_len != AES_GCM_NONCE_LENGTH) {
    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
    return 0;
  }
@ -1261,7 +1432,7 @@ DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls13) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 16;
  out->nonce_len = 12;
  out->nonce_len = AES_GCM_NONCE_LENGTH;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
  out->seal_scatter_supports_extra_in = 1;
@ -1276,7 +1447,7 @@ DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls13) {
  memset(out, 0, sizeof(EVP_AEAD));

  out->key_len = 32;
  out->nonce_len = 12;
  out->nonce_len = AES_GCM_NONCE_LENGTH;
  out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
  out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
  out->seal_scatter_supports_extra_in = 1;

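Editor's note: the randnonce AEADs added above advertise nonce_len of 0, draw the 12-byte nonce internally, and carry it after the GCM tag, so the total overhead is tag + nonce. A hedged usage sketch follows (not from the commit; seal_with_random_nonce is an illustrative name, and the aead header name is assumed from this tree's prefix convention):

#include <CBigNumBoringSSL_aead.h>  // assumed header name in this vendored tree

int seal_with_random_nonce(const uint8_t key[16], const uint8_t *msg,
                           size_t msg_len, uint8_t *out, size_t max_out,
                           size_t *out_len) {
  EVP_AEAD_CTX ctx;
  if (!EVP_AEAD_CTX_init(&ctx, EVP_aead_aes_128_gcm_randnonce(), key, 16,
                         EVP_AEAD_DEFAULT_TAG_LENGTH, NULL)) {
    return 0;
  }
  // nonce_len must be 0; the implementation draws 12 random bytes itself and
  // appends them after the GCM tag, so |max_out| needs msg_len + 16 + 12.
  int ok = EVP_AEAD_CTX_seal(&ctx, out, out_len, max_out,
                             /*nonce=*/NULL, /*nonce_len=*/0, msg, msg_len,
                             /*ad=*/NULL, /*ad_len=*/0);
  EVP_AEAD_CTX_cleanup(&ctx);
  return ok;
}
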
@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)

@ -14,6 +14,8 @@
#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
#include <CBigNumBoringSSL_arm_arch.h>

.text

.globl _gcm_init_neon
@ -21,6 +23,7 @@

.align 4
_gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {v17.2d}, [x1] // load H
movi v19.16b, #0xe1
@ -46,6 +49,7 @@ _gcm_init_neon:

.align 4
_gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v3.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
@ -65,6 +69,7 @@ _gcm_gmult_neon:

.align 4
_gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v0.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]

@ -15,6 +15,8 @@
#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
#include <CBigNumBoringSSL_arm_arch.h>

.text

.globl gcm_init_neon
@ -22,6 +24,7 @@
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {v17.2d}, [x1] // load H
movi v19.16b, #0xe1
@ -47,6 +50,7 @@ gcm_init_neon:
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v3.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
@ -66,6 +70,7 @@ gcm_gmult_neon:
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v0.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -16,6 +16,7 @@
#endif
#include <CBigNumBoringSSL_arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text

.code 32
@ -27,6 +28,7 @@
#endif
.align 4
_gcm_init_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {q9},[r1] @ load input H
vmov.i8 q11,#0xe1
vshl.i64 q11,q11,#57 @ 0xc2.0
@ -69,8 +71,7 @@ _gcm_init_v8:
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13,q14},[r0] @ store Htable[1..2]

vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
bx lr

.globl _gcm_gmult_v8
@ -80,6 +81,7 @@ _gcm_init_v8:
#endif
.align 4
_gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
vld1.64 {q12,q13},[r1] @ load twisted H, ...
@ -124,6 +126,7 @@ _gcm_gmult_v8:
#endif
.align 4
_gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
@ "[rotated]" means that
@ -255,6 +258,7 @@ Ldone_v8:
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM
#endif // defined(__arm__) && defined(__APPLE__)
#if defined(__linux__) && defined(__ELF__)

@ -17,6 +17,7 @@
#endif
#include <CBigNumBoringSSL_arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
.fpu neon
.code 32
@ -26,6 +27,7 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {q9},[r1] @ load input H
vmov.i8 q11,#0xe1
vshl.i64 q11,q11,#57 @ 0xc2.0
@ -68,8 +70,7 @@ gcm_init_v8:
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13,q14},[r0] @ store Htable[1..2]

vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
bx lr
.size gcm_init_v8,.-gcm_init_v8
.globl gcm_gmult_v8
@ -77,6 +78,7 @@ gcm_init_v8:
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
vld1.64 {q12,q13},[r1] @ load twisted H, ...
@ -119,6 +121,7 @@ gcm_gmult_v8:
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
@ "[rotated]" means that
@ -251,6 +254,7 @@ gcm_ghash_v8:
.align 2
.align 2
#endif
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits
#endif // defined(__arm__) && defined(__linux__)

@ -16,6 +16,7 @@
#endif
#include <CBigNumBoringSSL_arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text

.globl _gcm_init_v8
@ -23,6 +24,7 @@

.align 4
_gcm_init_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
@ -65,8 +67,48 @@ _gcm_init_v8:
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d,v22.2d},[x0] //store Htable[1..2]
st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
//calculate H^3 and H^4
pmull v0.1q,v20.1d, v22.1d
pmull v5.1q,v22.1d,v22.1d
pmull2 v2.1q,v20.2d, v22.2d
pmull2 v7.1q,v22.2d,v22.2d
pmull v1.1q,v16.1d,v17.1d
pmull v6.1q,v17.1d,v17.1d

ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d

ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v20.16b, v0.16b,v18.16b //H^3
eor v22.16b,v5.16b,v4.16b //H^4

ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
ext v17.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v20.16b
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
ret

.globl _gcm_gmult_v8
@ -74,6 +116,7 @@ _gcm_init_v8:

.align 4
_gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
@ -116,6 +159,9 @@ _gcm_gmult_v8:

.align 4
_gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
cmp x3,#64
b.hs Lgcm_ghash_v8_4x
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
@ -242,9 +288,290 @@ Ldone_v8:

ret

.align 4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
ld1 {v0.2d},[x0] //load [rotated] Xi
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant

ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
ext v25.16b,v7.16b,v7.16b,#8
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8

pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
pmull2 v31.1q,v20.2d,v25.2d
pmull v30.1q,v21.1d,v7.1d

pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
pmull2 v6.1q,v21.2d,v6.2d

eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b

pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d

eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b

subs x3,x3,#128
b.lo Ltail4x

b Loop4x

.align 4
Loop4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif

pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
ext v25.16b,v7.16b,v7.16b,#8
pmull2 v1.1q,v27.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
ext v24.16b,v6.16b,v6.16b,#8
eor v1.16b,v1.16b,v30.16b
ext v23.16b,v5.16b,v5.16b,#8

ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
eor v1.16b,v1.16b,v17.16b
pmull2 v31.1q,v20.2d,v25.2d
eor v1.16b,v1.16b,v18.16b
pmull v30.1q,v21.1d,v7.1d

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
eor v0.16b,v1.16b,v18.16b
pmull2 v6.1q,v21.2d,v6.2d

eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v18.16b,v18.16b,v2.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d

eor v0.16b,v0.16b,v18.16b
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v30.16b,v30.16b,v5.16b

subs x3,x3,#64
b.hs Loop4x

Ltail4x:
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
pmull2 v1.1q,v27.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b

adds x3,x3,#64
b.eq Ldone4x

cmp x3,#32
b.lo Lone
b.eq Ltwo
Lthree:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b

pmull v29.1q,v20.1d,v24.1d //H·Ii+2
eor v6.16b,v6.16b,v24.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
pmull2 v31.1q,v20.2d,v24.2d
pmull v30.1q,v21.1d,v6.1d
eor v0.16b,v0.16b,v18.16b
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
eor v5.16b,v5.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8

pmull2 v23.1q,v22.2d,v23.2d
eor v16.16b,v4.16b,v0.16b
pmull2 v5.1q,v21.2d,v5.2d
ext v3.16b,v16.16b,v16.16b,#8

eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b

pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v26.2d,v3.2d
pmull v1.1q,v27.1d,v16.1d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b Ldone4x

.align 4
Ltwo:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

pmull v29.1q,v20.1d,v23.1d //H·Ii+1
eor v5.16b,v5.16b,v23.16b

eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull2 v31.1q,v20.2d,v23.2d
pmull v30.1q,v21.1d,v5.1d

pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v22.2d,v3.2d
pmull2 v1.1q,v21.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b Ldone4x

.align 4
Lone:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull v0.1q,v20.1d,v3.1d
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v20.2d,v3.2d
pmull v1.1q,v21.1d,v16.1d

Ldone4x:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi

ret

.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM
#endif // defined(__aarch64__) && defined(__APPLE__)
#if defined(__linux__) && defined(__ELF__)

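Editor's note: the gcm_init_v8 hunk above now precomputes H^3 and H^4 so the new 4x loop can fold four blocks per reduction, using the identity X4 = C1*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H. The following stand-alone C check of that identity is a hedged sketch (not from the commit): gf128_mul uses plain polynomial bit order with reduction polynomial x^128 + x^7 + x^2 + x + 1, not GCM's reflected representation, since the algebraic identity holds either way.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t hi, lo; } u128;

static u128 xor128(u128 a, u128 b) { return (u128){a.hi ^ b.hi, a.lo ^ b.lo}; }

// Multiplication in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.
static u128 gf128_mul(u128 a, u128 b) {
  u128 r = {0, 0};
  for (int i = 0; i < 128; i++) {
    uint64_t bit = i < 64 ? (b.lo >> i) & 1 : (b.hi >> (i - 64)) & 1;
    if (bit) {
      r = xor128(r, a);
    }
    uint64_t carry = a.hi >> 63;  // a = a * x mod p
    a.hi = (a.hi << 1) | (a.lo >> 63);
    a.lo <<= 1;
    if (carry) {
      a.lo ^= 0x87;
    }
  }
  return r;
}

int main(void) {
  u128 h = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
  u128 c[4] = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
  u128 h2 = gf128_mul(h, h), h3 = gf128_mul(h2, h), h4 = gf128_mul(h3, h);

  // Serial GHASH: X <- (X ^ C_i) * H, one block at a time.
  u128 x = {0, 0};
  for (int i = 0; i < 4; i++) {
    x = gf128_mul(xor128(x, c[i]), h);
  }

  // Aggregated form used by the 4x loop above.
  u128 agg = xor128(xor128(gf128_mul(c[0], h4), gf128_mul(c[1], h3)),
                    xor128(gf128_mul(c[2], h2), gf128_mul(c[3], h)));
  assert(agg.hi == x.hi && agg.lo == x.lo);
  printf("4-block aggregated GHASH matches the serial computation\n");
  return 0;
}
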
@ -17,6 +17,7 @@
|
|||
#endif
|
||||
#include <CBigNumBoringSSL_arm_arch.h>
|
||||
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.text
|
||||
.arch armv8-a+crypto
|
||||
.globl gcm_init_v8
|
||||
|
@ -24,6 +25,7 @@
|
|||
.type gcm_init_v8,%function
|
||||
.align 4
|
||||
gcm_init_v8:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ld1 {v17.2d},[x1] //load input H
|
||||
movi v19.16b,#0xe1
|
||||
shl v19.2d,v19.2d,#57 //0xc2.0
|
||||
|
@ -66,8 +68,48 @@ gcm_init_v8:
|
|||
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
|
||||
eor v17.16b,v17.16b,v22.16b
|
||||
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
|
||||
st1 {v21.2d,v22.2d},[x0] //store Htable[1..2]
|
||||
st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
|
||||
//calculate H^3 and H^4
|
||||
pmull v0.1q,v20.1d, v22.1d
|
||||
pmull v5.1q,v22.1d,v22.1d
|
||||
pmull2 v2.1q,v20.2d, v22.2d
|
||||
pmull2 v7.1q,v22.2d,v22.2d
|
||||
pmull v1.1q,v16.1d,v17.1d
|
||||
pmull v6.1q,v17.1d,v17.1d
|
||||
|
||||
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
ext v17.16b,v5.16b,v7.16b,#8
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v16.16b
|
||||
eor v4.16b,v5.16b,v7.16b
|
||||
eor v6.16b,v6.16b,v17.16b
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase
|
||||
eor v6.16b,v6.16b,v4.16b
|
||||
pmull v4.1q,v5.1d,v19.1d
|
||||
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v7.d[0],v6.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
ins v6.d[1],v5.d[0]
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
eor v5.16b,v6.16b,v4.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
|
||||
ext v4.16b,v5.16b,v5.16b,#8
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
pmull v5.1q,v5.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v4.16b,v4.16b,v7.16b
|
||||
eor v20.16b, v0.16b,v18.16b //H^3
|
||||
eor v22.16b,v5.16b,v4.16b //H^4
|
||||
|
||||
ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
|
||||
ext v17.16b,v22.16b,v22.16b,#8
|
||||
eor v16.16b,v16.16b,v20.16b
|
||||
eor v17.16b,v17.16b,v22.16b
|
||||
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
|
||||
st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
|
||||
ret
|
||||
.size gcm_init_v8,.-gcm_init_v8
|
||||
.globl gcm_gmult_v8
|
||||
|
@ -75,6 +117,7 @@ gcm_init_v8:
|
|||
.type gcm_gmult_v8,%function
|
||||
.align 4
|
||||
gcm_gmult_v8:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ld1 {v17.2d},[x0] //load Xi
|
||||
movi v19.16b,#0xe1
|
||||
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
|
||||
|
@ -117,6 +160,9 @@ gcm_gmult_v8:
|
|||
.type gcm_ghash_v8,%function
|
||||
.align 4
|
||||
gcm_ghash_v8:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
cmp x3,#64
|
||||
b.hs .Lgcm_ghash_v8_4x
|
||||
ld1 {v0.2d},[x0] //load [rotated] Xi
|
||||
//"[rotated]" means that
|
||||
//loaded value would have
|
||||
|
@ -243,10 +289,291 @@ gcm_ghash_v8:

ret
.size gcm_ghash_v8,.-gcm_ghash_v8
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
ld1 {v0.2d},[x0] //load [rotated] Xi
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
movi v19.16b,#0xe1
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant

ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
ext v25.16b,v7.16b,v7.16b,#8
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8

pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
pmull2 v31.1q,v20.2d,v25.2d
pmull v30.1q,v21.1d,v7.1d

pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
pmull2 v6.1q,v21.2d,v6.2d

eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b

pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d

eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b

subs x3,x3,#128
b.lo .Ltail4x

b .Loop4x

.align 4
.Loop4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif

pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
ext v25.16b,v7.16b,v7.16b,#8
pmull2 v1.1q,v27.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
ext v24.16b,v6.16b,v6.16b,#8
eor v1.16b,v1.16b,v30.16b
ext v23.16b,v5.16b,v5.16b,#8

ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
eor v1.16b,v1.16b,v17.16b
pmull2 v31.1q,v20.2d,v25.2d
eor v1.16b,v1.16b,v18.16b
pmull v30.1q,v21.1d,v7.1d

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
eor v0.16b,v1.16b,v18.16b
pmull2 v6.1q,v21.2d,v6.2d

eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v18.16b,v18.16b,v2.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d

eor v0.16b,v0.16b,v18.16b
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v30.16b,v30.16b,v5.16b

subs x3,x3,#64
b.hs .Loop4x

.Ltail4x:
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
pmull2 v1.1q,v27.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b

adds x3,x3,#64
b.eq .Ldone4x

cmp x3,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b

pmull v29.1q,v20.1d,v24.1d //H·Ii+2
eor v6.16b,v6.16b,v24.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
pmull2 v31.1q,v20.2d,v24.2d
pmull v30.1q,v21.1d,v6.1d
eor v0.16b,v0.16b,v18.16b
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
eor v5.16b,v5.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8

pmull2 v23.1q,v22.2d,v23.2d
eor v16.16b,v4.16b,v0.16b
pmull2 v5.1q,v21.2d,v5.2d
ext v3.16b,v16.16b,v16.16b,#8

eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b

pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v26.2d,v3.2d
pmull v1.1q,v27.1d,v16.1d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x

.align 4
.Ltwo:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

pmull v29.1q,v20.1d,v23.1d //H·Ii+1
eor v5.16b,v5.16b,v23.16b

eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull2 v31.1q,v20.2d,v23.2d
pmull v30.1q,v21.1d,v5.1d

pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v22.2d,v3.2d
pmull2 v1.1q,v21.2d,v16.2d

eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x

.align 4
.Lone:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
rev64 v4.16b,v4.16b
#endif

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8

pmull v0.1q,v20.1d,v3.1d
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v20.2d,v3.2d
pmull v1.1q,v21.1d,v16.1d

.Ldone4x:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b

pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b

ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi

ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits
#endif // defined(__aarch64__) && defined(__linux__)
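
For reference, the pmull-based code above implements multiplication in GF(2^128) with the GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1 (the 0xe1 and shifted 0xc2 constants are its bit-reversed encodings). Below is a bit-serial C model of that semantics per NIST SP 800-38D; this is an illustrative sketch, the names are not from this tree, and the assembly computes the same products four blocks at a time with Karatsuba splitting.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128_t;  // hi holds bits 127..64

static u128_t gf128_mul(u128_t x, u128_t y) {
  u128_t z = {0, 0}, v = x;
  for (int i = 0; i < 128; i++) {
    // Take bit i of y, most-significant bit first.
    uint64_t bit = i < 64 ? (y.hi >> (63 - i)) & 1 : (y.lo >> (127 - i)) & 1;
    if (bit) {
      z.hi ^= v.hi;
      z.lo ^= v.lo;
    }
    // Multiply v by x in the field: shift right one bit, reduce on wrap.
    uint64_t carry = v.lo & 1;
    v.lo = (v.lo >> 1) | (v.hi << 63);
    v.hi >>= 1;
    if (carry) {
      v.hi ^= UINT64_C(0xe100000000000000);
    }
  }
  return z;
}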
@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
@ -52,20 +52,25 @@
#include <CBigNumBoringSSL_type_check.h>

#include "internal.h"
#include "../../internal.h"


void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block) {
assert(key != NULL && ivec != NULL);
if (len == 0) {
// Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C.
return;
}

assert(in != NULL && out != NULL);
size_t n;
const uint8_t *iv = ivec;

assert(key != NULL && ivec != NULL);
assert(len == 0 || (in != NULL && out != NULL));

while (len >= 16) {
for (n = 0; n < 16; n += sizeof(size_t)) {
store_word_le(out + n, load_word_le(in + n) ^ load_word_le(iv + n));
for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
CRYPTO_store_word_le(
out + n, CRYPTO_load_word_le(in + n) ^ CRYPTO_load_word_le(iv + n));
}
(*block)(out, out, key);
iv = out;
@ -97,30 +102,36 @@ void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block) {
size_t n;
union {
size_t t[16 / sizeof(size_t)];
uint8_t c[16];
} tmp;

assert(key != NULL && ivec != NULL);
assert(len == 0 || (in != NULL && out != NULL));
if (len == 0) {
// Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C.
return;
}

assert(in != NULL && out != NULL);

const uintptr_t inptr = (uintptr_t) in;
const uintptr_t outptr = (uintptr_t) out;
// If |in| and |out| alias, |in| must be ahead.
assert(inptr >= outptr || inptr + len <= outptr);

size_t n;
union {
crypto_word_t t[16 / sizeof(crypto_word_t)];
uint8_t c[16];
} tmp;

if ((inptr >= 32 && outptr <= inptr - 32) || inptr < outptr) {
// If |out| is at least two blocks behind |in| or completely disjoint, there
// is no need to decrypt to a temporary block.
OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
"block cannot be evenly divided into words");
const uint8_t *iv = ivec;
while (len >= 16) {
(*block)(in, out, key);
for (n = 0; n < 16; n += sizeof(size_t)) {
store_word_le(out + n, load_word_le(out + n) ^ load_word_le(iv + n));
for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(out + n) ^
CRYPTO_load_word_le(iv + n));
}
iv = in;
len -= 16;

@ -129,16 +140,16 @@ void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len,
}
OPENSSL_memcpy(ivec, iv, 16);
} else {
OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
"block cannot be evenly divided into words");

while (len >= 16) {
(*block)(in, tmp.c, key);
for (n = 0; n < 16; n += sizeof(size_t)) {
size_t c = load_word_le(in + n);
store_word_le(out + n,
tmp.t[n / sizeof(size_t)] ^ load_word_le(ivec + n));
store_word_le(ivec + n, c);
for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t c = CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(out + n, tmp.t[n / sizeof(crypto_word_t)] ^
CRYPTO_load_word_le(ivec + n));
CRYPTO_store_word_le(ivec + n, c);
}
len -= 16;
in += 16;
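
As a usage sketch (a hypothetical caller, not part of this commit; the header name below follows this tree's CBigNumBoringSSL_ prefix convention): CRYPTO_cbc128_encrypt consumes whole 16-byte blocks and leaves the final ciphertext block in |ivec|, so back-to-back calls continue a single CBC stream. AES_encrypt has the block128_f shape, so it can serve as the block callback.

#include <CBigNumBoringSSL_aes.h>

// Hypothetical demo: CBC-encrypt 32 bytes with AES-128. |iv| is updated in
// place, so a second call would continue the same chain.
static void cbc_demo(const uint8_t key_bytes[16], uint8_t iv[16],
                     const uint8_t plain[32], uint8_t cipher[32]) {
  AES_KEY key;
  if (AES_set_encrypt_key(key_bytes, 128, &key) != 0) {
    return;  // key setup failed
  }
  CRYPTO_cbc128_encrypt(plain, cipher, 32, &key, iv, AES_encrypt);
}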
@ -72,10 +72,11 @@ void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
}
while (len >= 16) {
(*block)(ivec, ivec, key);
for (; n < 16; n += sizeof(size_t)) {
size_t tmp = load_word_le(ivec + n) ^ load_word_le(in + n);
store_word_le(ivec + n, tmp);
store_word_le(out + n, tmp);
for (; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t tmp =
CRYPTO_load_word_le(ivec + n) ^ CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(ivec + n, tmp);
CRYPTO_store_word_le(out + n, tmp);
}
len -= 16;
out += 16;

@ -101,10 +102,10 @@ void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
}
while (len >= 16) {
(*block)(ivec, ivec, key);
for (; n < 16; n += sizeof(size_t)) {
size_t t = load_word_le(in + n);
store_word_le(out + n, load_word_le(ivec + n) ^ t);
store_word_le(ivec + n, t);
for (; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t t = CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(ivec + n) ^ t);
CRYPTO_store_word_le(ivec + n, t);
}
len -= 16;
out += 16;
@ -52,6 +52,7 @@
#include <string.h>

#include "internal.h"
#include "../../internal.h"


// NOTE: the IV/counter CTR mode is big-endian. The code itself

@ -69,8 +70,8 @@ static void ctr128_inc(uint8_t *counter) {
} while (n);
}

OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
"block cannot be divided into size_t");
OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
"block cannot be divided into crypto_word_t");

// The input encrypted as though 128bit counter mode is being used. The extra
// state information to record how much of the 128bit block we have used is

@ -102,9 +103,9 @@ void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
while (len >= 16) {
(*block)(ivec, ecount_buf, key);
ctr128_inc(ivec);
for (n = 0; n < 16; n += sizeof(size_t)) {
store_word_le(out + n,
load_word_le(in + n) ^ load_word_le(ecount_buf + n));
for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(in + n) ^
CRYPTO_load_word_le(ecount_buf + n));
}
len -= 16;
out += 16;

@ -152,7 +153,7 @@ void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len,
n = (n + 1) % 16;
}

ctr32 = GETU32(ivec + 12);
ctr32 = CRYPTO_load_u32_be(ivec + 12);
while (len >= 16) {
size_t blocks = len / 16;
// 1<<28 is just a not-so-small yet not-so-large number...

@ -172,7 +173,7 @@ void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len,
}
(*func)(in, out, blocks, key, ivec);
// (*func) does not update ivec, caller does:
PUTU32(ivec + 12, ctr32);
CRYPTO_store_u32_be(ivec + 12, ctr32);
// ... overflow was detected, propogate carry.
if (ctr32 == 0) {
ctr96_inc(ivec);

@ -186,7 +187,7 @@ void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len,
OPENSSL_memset(ecount_buf, 0, 16);
(*func)(ecount_buf, ecount_buf, 1, key, ivec);
++ctr32;
PUTU32(ivec + 12, ctr32);
CRYPTO_store_u32_be(ivec + 12, ctr32);
if (ctr32 == 0) {
ctr96_inc(ivec);
}
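
The hunk above shows only the closing `} while (n);` of ctr128_inc; for orientation, the usual big-endian increment it performs looks like the following (a reconstructed sketch, so treat it as illustrative rather than a verbatim quote of the file):

// Add one to a 128-bit big-endian counter, carrying from the last byte
// toward the first.
static void ctr128_inc(uint8_t *counter) {
  uint32_t n = 16, c = 1;
  do {
    --n;
    c += counter[n];
    counter[n] = (uint8_t)c;
    c >>= 8;
  } while (n);
}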
@ -73,7 +73,7 @@ static const size_t kSizeTWithoutLower4Bits = (size_t) -16;

#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
static inline void gcm_reduce_1bit(u128 *V) {
if (sizeof(size_t) == 8) {
if (sizeof(crypto_word_t) == 8) {
uint64_t T = UINT64_C(0xe100000000000000) & (0 - (V->hi & 1));
V->hi = (V->lo << 63) | (V->hi >> 1);
V->lo = (V->lo >> 1) ^ T;

@ -377,9 +377,10 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
(*block)(ctx->Yi.c, ctx->EKi.c, key);
++ctr;
ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
for (size_t i = 0; i < 16; i += sizeof(size_t)) {
store_word_le(out + i,
load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
CRYPTO_store_word_le(out + i,
CRYPTO_load_word_le(in + i) ^
ctx->EKi.t[i / sizeof(crypto_word_t)]);
}
out += 16;
in += 16;

@ -394,9 +395,10 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
(*block)(ctx->Yi.c, ctx->EKi.c, key);
++ctr;
ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
for (size_t i = 0; i < 16; i += sizeof(size_t)) {
store_word_le(out + i,
load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
CRYPTO_store_word_le(out + i,
CRYPTO_load_word_le(in + i) ^
ctx->EKi.t[i / sizeof(crypto_word_t)]);
}
out += 16;
in += 16;

@ -468,9 +470,10 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
(*block)(ctx->Yi.c, ctx->EKi.c, key);
++ctr;
ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
for (size_t i = 0; i < 16; i += sizeof(size_t)) {
store_word_le(out + i,
load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
CRYPTO_store_word_le(out + i,
CRYPTO_load_word_le(in + i) ^
ctx->EKi.t[i / sizeof(crypto_word_t)]);
}
out += 16;
in += 16;

@ -485,9 +488,10 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
(*block)(ctx->Yi.c, ctx->EKi.c, key);
++ctr;
ctx->Yi.d[3] = CRYPTO_bswap4(ctr);
for (size_t i = 0; i < 16; i += sizeof(size_t)) {
store_word_le(out + i,
load_word_le(in + i) ^ ctx->EKi.t[i / sizeof(size_t)]);
for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
CRYPTO_store_word_le(out + i,
CRYPTO_load_word_le(in + i) ^
ctx->EKi.t[i / sizeof(crypto_word_t)]);
}
out += 16;
in += 16;
@ -193,7 +193,7 @@ static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
#endif // BORINGSSL_HAS_UINT128

void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) {
// We implement GHASH in terms of POLYVAL, as described in RFC8452. This
// We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
// avoids a shift by 1 in the multiplication, needed to account for bit
// reversal losing a bit after multiplication, that is,
// rev128(X) * rev128(Y) = rev255(X*Y).
@ -64,27 +64,6 @@ extern "C" {
#endif


static inline uint32_t GETU32(const void *in) {
uint32_t v;
OPENSSL_memcpy(&v, in, sizeof(v));
return CRYPTO_bswap4(v);
}

static inline void PUTU32(void *out, uint32_t v) {
v = CRYPTO_bswap4(v);
OPENSSL_memcpy(out, &v, sizeof(v));
}

static inline size_t load_word_le(const void *in) {
size_t v;
OPENSSL_memcpy(&v, in, sizeof(v));
return v;
}

static inline void store_word_le(void *out, size_t v) {
OPENSSL_memcpy(out, &v, sizeof(v));
}

// block128_f is the type of an AES block cipher implementation.
//
// Unlike upstream OpenSSL, it and the other functions in this file hard-code

@ -171,7 +150,7 @@ typedef struct {
uint64_t u[2];
uint32_t d[4];
uint8_t c[16];
size_t t[16 / sizeof(size_t)];
crypto_word_t t[16 / sizeof(crypto_word_t)];
} Yi, EKi, EK0, len, Xi;

// Note that the order of |Xi| and |gcm_key| is fixed by the MOVBE-based,
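
The removed GETU32/PUTU32 and load_word_le/store_word_le helpers are superseded by CRYPTO_load_u32_be, CRYPTO_store_u32_be, CRYPTO_load_word_le and CRYPTO_store_word_le, which live in crypto/internal.h and are not shown in this diff. Their presumed shape, mirroring the deleted helpers but over crypto_word_t, is:

static inline crypto_word_t CRYPTO_load_word_le(const void *in) {
  crypto_word_t v;
  // memcpy keeps the unaligned access well-defined.
  OPENSSL_memcpy(&v, in, sizeof(v));
  return v;
}

static inline void CRYPTO_store_word_le(void *out, crypto_word_t v) {
  OPENSSL_memcpy(out, &v, sizeof(v));
}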
@ -60,7 +60,8 @@ OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], unsigned *num,
block128_f block) {
assert(in && out && key && ivec && num);
assert(key != NULL && ivec != NULL && num != NULL);
assert(len == 0 || (in != NULL && out != NULL));

unsigned n = *num;

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
@ -36,16 +36,45 @@ extern "C" {
void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,
const uint8_t user_additional_data[32]);

#if defined(BORINGSSL_FIPS)

// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to whiten.
#define BORINGSSL_FIPS_OVERREAD 10

// CRYPTO_get_seed_entropy writes |out_entropy_len| bytes of entropy, suitable
// for seeding a DRBG, to |out_entropy|. It sets |*out_used_cpu| to one if the
// entropy came directly from the CPU and zero if it came from the OS. It
// actively obtains entropy from the CPU/OS and so should not be called from
// within the FIPS module.
void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len,
int *out_used_cpu);

// RAND_load_entropy supplies |entropy_len| bytes of entropy to the module. The
// |from_cpu| parameter is true iff the entropy was obtained directly from the
// CPU.
void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len,
int from_cpu);

// RAND_need_entropy is implemented outside of the FIPS module and is called
// when the module has stopped because it has run out of entropy.
void RAND_need_entropy(size_t bytes_needed);

#endif // BORINGSSL_FIPS

// CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating
// system.
void CRYPTO_sysrand(uint8_t *buf, size_t len);

#if defined(OPENSSL_URANDOM)
// CRYPTO_sysrand_for_seed fills |len| bytes at |buf| with entropy from the
// operating system. It may draw from the |GRND_RANDOM| pool on Android,
// depending on the vendor's configuration.
void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len);

#if defined(OPENSSL_URANDOM)
// CRYPTO_init_sysrand initializes long-lived resources needed to draw entropy
// from the operating system.
void CRYPTO_init_sysrand(void);

// CRYPTO_sysrand_if_available fills |len| bytes at |buf| with entropy from the
// operating system, or early /dev/urandom data, and returns 1, _if_ the entropy
// pool is initialized or if getrandom() is not available and not in FIPS mode.

@ -53,9 +82,7 @@ void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len);
// return 0.
int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len);
#else
OPENSSL_INLINE void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len) {
CRYPTO_sysrand(buf, len);
}
OPENSSL_INLINE void CRYPTO_init_sysrand(void) {}

OPENSSL_INLINE int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) {
CRYPTO_sysrand(buf, len);
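
To illustrate the passive-entropy protocol these declarations define, here is a hypothetical out-of-module responder (not code from this commit): when the module runs dry it calls RAND_need_entropy, and the host answers by pushing bytes back in through RAND_load_entropy.

void RAND_need_entropy(size_t bytes_needed) {
  uint8_t buf[256];
  while (bytes_needed > 0) {
    size_t todo = bytes_needed < sizeof(buf) ? bytes_needed : sizeof(buf);
    int used_cpu;
    // Gather entropy outside the module, then hand it to the module.
    CRYPTO_get_seed_entropy(buf, todo, &used_cpu);
    RAND_load_entropy(buf, todo, used_cpu);
    bytes_needed -= todo;
  }
}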
@ -25,6 +25,7 @@
#include <CBigNumBoringSSL_chacha.h>
#include <CBigNumBoringSSL_cpu.h>
#include <CBigNumBoringSSL_mem.h>
#include <CBigNumBoringSSL_type_check.h>

#include "internal.h"
#include "fork_detect.h"

@ -63,11 +64,11 @@ struct rand_thread_state {
// (re)seeded. This is bound by |kReseedInterval|.
unsigned calls;
// last_block_valid is non-zero iff |last_block| contains data from
// |CRYPTO_sysrand_for_seed|.
// |get_seed_entropy|.
int last_block_valid;

#if defined(BORINGSSL_FIPS)
// last_block contains the previous block from |CRYPTO_sysrand_for_seed|.
// last_block contains the previous block from |get_seed_entropy|.
uint8_t last_block[CRNGT_BLOCK_SIZE];
// next and prev form a NULL-terminated, double-linked list of all states in
// a process.

@ -82,16 +83,18 @@ struct rand_thread_state {
// called when the whole process is exiting.
DEFINE_BSS_GET(struct rand_thread_state *, thread_states_list);
DEFINE_STATIC_MUTEX(thread_states_list_lock);
DEFINE_STATIC_MUTEX(state_clear_all_lock);

static void rand_thread_state_clear_all(void) __attribute__((destructor));
static void rand_thread_state_clear_all(void) {
CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get());
CRYPTO_STATIC_MUTEX_lock_write(state_clear_all_lock_bss_get());
for (struct rand_thread_state *cur = *thread_states_list_bss_get();
cur != NULL; cur = cur->next) {
CTR_DRBG_clear(&cur->drbg);
}
// |thread_states_list_lock is deliberately left locked so that any threads
// that are still running will hang if they try to call |RAND_bytes|.
// The locks are deliberately left locked so that any threads that are still
// running will hang if they try to call |RAND_bytes|.
}
#endif

@ -146,12 +149,6 @@ static int rdrand(uint8_t *buf, const size_t len) {
OPENSSL_memcpy(buf + len_multiple8, rand_buf, remainder);
}

#if defined(BORINGSSL_FIPS_BREAK_CRNG)
// This breaks the "continuous random number generator test" defined in FIPS
// 140-2, section 4.9.2, and implemented in rand_get_seed().
OPENSSL_memset(buf, 0, len);
#endif

return 1;
}

@ -165,25 +162,97 @@ static int rdrand(uint8_t *buf, size_t len) {

#if defined(BORINGSSL_FIPS)

void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len,
int *out_used_cpu) {
*out_used_cpu = 0;
if (have_rdrand() && rdrand(out_entropy, out_entropy_len)) {
*out_used_cpu = 1;
} else {
CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len);
}

#if defined(BORINGSSL_FIPS_BREAK_CRNG)
// This breaks the "continuous random number generator test" defined in FIPS
// 140-2, section 4.9.2, and implemented in |rand_get_seed|.
OPENSSL_memset(out_entropy, 0, out_entropy_len);
#endif
}

// In passive entropy mode, entropy is supplied from outside of the module via
// |RAND_load_entropy| and is stored in global instance of the following
// structure.

struct entropy_buffer {
// bytes contains entropy suitable for seeding a DRBG.
uint8_t bytes[CTR_DRBG_ENTROPY_LEN * BORINGSSL_FIPS_OVERREAD];
// bytes_valid indicates the number of bytes of |bytes| that contain valid
// data.
size_t bytes_valid;
// from_cpu is true if any of the contents of |bytes| were obtained directly
// from the CPU.
int from_cpu;
};

DEFINE_BSS_GET(struct entropy_buffer, entropy_buffer);
DEFINE_STATIC_MUTEX(entropy_buffer_lock);

void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len,
int from_cpu) {
struct entropy_buffer *const buffer = entropy_buffer_bss_get();

CRYPTO_STATIC_MUTEX_lock_write(entropy_buffer_lock_bss_get());
const size_t space = sizeof(buffer->bytes) - buffer->bytes_valid;
if (entropy_len > space) {
entropy_len = space;
}

OPENSSL_memcpy(&buffer->bytes[buffer->bytes_valid], entropy, entropy_len);
buffer->bytes_valid += entropy_len;
buffer->from_cpu |= from_cpu && (entropy_len != 0);
CRYPTO_STATIC_MUTEX_unlock_write(entropy_buffer_lock_bss_get());
}

// get_seed_entropy fills |out_entropy_len| bytes of |out_entropy| from the
// global |entropy_buffer|.
static void get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len,
int *out_used_cpu) {
struct entropy_buffer *const buffer = entropy_buffer_bss_get();
if (out_entropy_len > sizeof(buffer->bytes)) {
abort();
}

CRYPTO_STATIC_MUTEX_lock_write(entropy_buffer_lock_bss_get());
while (buffer->bytes_valid < out_entropy_len) {
CRYPTO_STATIC_MUTEX_unlock_write(entropy_buffer_lock_bss_get());
RAND_need_entropy(out_entropy_len - buffer->bytes_valid);
CRYPTO_STATIC_MUTEX_lock_write(entropy_buffer_lock_bss_get());
}

*out_used_cpu = buffer->from_cpu;
OPENSSL_memcpy(out_entropy, buffer->bytes, out_entropy_len);
OPENSSL_memmove(buffer->bytes, &buffer->bytes[out_entropy_len],
buffer->bytes_valid - out_entropy_len);
buffer->bytes_valid -= out_entropy_len;
if (buffer->bytes_valid == 0) {
buffer->from_cpu = 0;
}

CRYPTO_STATIC_MUTEX_unlock_write(entropy_buffer_lock_bss_get());
}

// rand_get_seed fills |seed| with entropy and sets |*out_used_cpu| to one if
// that entropy came directly from the CPU and zero otherwise.
static void rand_get_seed(struct rand_thread_state *state,
uint8_t seed[CTR_DRBG_ENTROPY_LEN]) {
uint8_t seed[CTR_DRBG_ENTROPY_LEN],
int *out_used_cpu) {
if (!state->last_block_valid) {
if (!have_rdrand() ||
!rdrand(state->last_block, sizeof(state->last_block))) {
CRYPTO_sysrand_for_seed(state->last_block, sizeof(state->last_block));
}
int unused;
get_seed_entropy(state->last_block, sizeof(state->last_block), &unused);
state->last_block_valid = 1;
}

// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to
// whiten.
#define FIPS_OVERREAD 10
uint8_t entropy[CTR_DRBG_ENTROPY_LEN * FIPS_OVERREAD];

int used_rdrand = have_rdrand() && rdrand(entropy, sizeof(entropy));
if (!used_rdrand) {
CRYPTO_sysrand_for_seed(entropy, sizeof(entropy));
}
uint8_t entropy[CTR_DRBG_ENTROPY_LEN * BORINGSSL_FIPS_OVERREAD];
get_seed_entropy(entropy, sizeof(entropy), out_used_cpu);

// See FIPS 140-2, section 4.9.2. This is the “continuous random number
// generator test” which causes the program to randomly abort. Hopefully the

@ -193,6 +262,7 @@ static void rand_get_seed(struct rand_thread_state *state,
BORINGSSL_FIPS_abort();
}

OPENSSL_STATIC_ASSERT(sizeof(entropy) % CRNGT_BLOCK_SIZE == 0, "");
for (size_t i = CRNGT_BLOCK_SIZE; i < sizeof(entropy);
i += CRNGT_BLOCK_SIZE) {
if (CRYPTO_memcmp(entropy + i - CRNGT_BLOCK_SIZE, entropy + i,

@ -207,31 +277,24 @@ static void rand_get_seed(struct rand_thread_state *state,

OPENSSL_memcpy(seed, entropy, CTR_DRBG_ENTROPY_LEN);

for (size_t i = 1; i < FIPS_OVERREAD; i++) {
for (size_t i = 1; i < BORINGSSL_FIPS_OVERREAD; i++) {
for (size_t j = 0; j < CTR_DRBG_ENTROPY_LEN; j++) {
seed[j] ^= entropy[CTR_DRBG_ENTROPY_LEN * i + j];
}
}

#if defined(OPENSSL_URANDOM)
// If we used RDRAND, also opportunistically read from the system. This avoids
// solely relying on the hardware once the entropy pool has been initialized.
if (used_rdrand) {
CRYPTO_sysrand_if_available(entropy, CTR_DRBG_ENTROPY_LEN);
for (size_t i = 0; i < CTR_DRBG_ENTROPY_LEN; i++) {
seed[i] ^= entropy[i];
}
}
#endif
}

#else

// rand_get_seed fills |seed| with entropy and sets |*out_used_cpu| to one if
// that entropy came directly from the CPU and zero otherwise.
static void rand_get_seed(struct rand_thread_state *state,
uint8_t seed[CTR_DRBG_ENTROPY_LEN]) {
uint8_t seed[CTR_DRBG_ENTROPY_LEN],
int *out_used_cpu) {
// If not in FIPS mode, we don't overread from the system entropy source and
// we don't depend only on the hardware RDRAND.
CRYPTO_sysrand(seed, CTR_DRBG_ENTROPY_LEN);
CRYPTO_sysrand_for_seed(seed, CTR_DRBG_ENTROPY_LEN);
*out_used_cpu = 0;
}

#endif

@ -290,8 +353,23 @@ void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,

state->last_block_valid = 0;
uint8_t seed[CTR_DRBG_ENTROPY_LEN];
rand_get_seed(state, seed);
if (!CTR_DRBG_init(&state->drbg, seed, NULL, 0)) {
int used_cpu;
rand_get_seed(state, seed, &used_cpu);

uint8_t personalization[CTR_DRBG_ENTROPY_LEN] = {0};
size_t personalization_len = 0;
#if defined(OPENSSL_URANDOM)
// If we used RDRAND, also opportunistically read from the system. This
// avoids solely relying on the hardware once the entropy pool has been
// initialized.
if (used_cpu &&
CRYPTO_sysrand_if_available(personalization, sizeof(personalization))) {
personalization_len = sizeof(personalization);
}
#endif

if (!CTR_DRBG_init(&state->drbg, seed, personalization,
personalization_len)) {
abort();
}
state->calls = 0;

@ -315,7 +393,8 @@ void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,
if (state->calls >= kReseedInterval ||
state->fork_generation != fork_generation) {
uint8_t seed[CTR_DRBG_ENTROPY_LEN];
rand_get_seed(state, seed);
int used_cpu;
rand_get_seed(state, seed, &used_cpu);
#if defined(BORINGSSL_FIPS)
// Take a read lock around accesses to |state->drbg|. This is needed to
// avoid returning bad entropy if we race with

@ -325,7 +404,7 @@ void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,
// bug on ppc64le. glibc may implement pthread locks by wrapping user code
// in a hardware transaction, but, on some older versions of glibc and the
// kernel, syscalls made with |syscall| did not abort the transaction.
CRYPTO_STATIC_MUTEX_lock_read(thread_states_list_lock_bss_get());
CRYPTO_STATIC_MUTEX_lock_read(state_clear_all_lock_bss_get());
#endif
if (!CTR_DRBG_reseed(&state->drbg, seed, NULL, 0)) {
abort();

@ -334,7 +413,7 @@ void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,
state->fork_generation = fork_generation;
} else {
#if defined(BORINGSSL_FIPS)
CRYPTO_STATIC_MUTEX_lock_read(thread_states_list_lock_bss_get());
CRYPTO_STATIC_MUTEX_lock_read(state_clear_all_lock_bss_get());
#endif
}

@ -363,7 +442,7 @@ void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len,
}

#if defined(BORINGSSL_FIPS)
CRYPTO_STATIC_MUTEX_unlock_read(thread_states_list_lock_bss_get());
CRYPTO_STATIC_MUTEX_unlock_read(state_clear_all_lock_bss_get());
#endif
}
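
The overread-and-XOR fold in rand_get_seed condenses BORINGSSL_FIPS_OVERREAD seed-sized reads into one seed; XORing draws together can only add to an attacker's uncertainty, never reduce it. Restated as a standalone helper (an illustrative repackaging of the loop above, not new code in this commit):

static void xor_whiten(uint8_t seed[CTR_DRBG_ENTROPY_LEN],
                       const uint8_t entropy[CTR_DRBG_ENTROPY_LEN *
                                             BORINGSSL_FIPS_OVERREAD]) {
  OPENSSL_memcpy(seed, entropy, CTR_DRBG_ENTROPY_LEN);
  // Fold the remaining overread blocks into the first one.
  for (size_t i = 1; i < BORINGSSL_FIPS_OVERREAD; i++) {
    for (size_t j = 0; j < CTR_DRBG_ENTROPY_LEN; j++) {
      seed[j] ^= entropy[CTR_DRBG_ENTROPY_LEN * i + j];
    }
  }
}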
@ -62,6 +62,15 @@
#include <sys/random.h>
#endif

#if defined(OPENSSL_FREEBSD)
#define URANDOM_BLOCKS_FOR_ENTROPY
#if __FreeBSD__ >= 12
// getrandom is supported in FreeBSD 12 and up.
#define FREEBSD_GETRANDOM
#include <sys/random.h>
#endif
#endif

#include <CBigNumBoringSSL_thread.h>
#include <CBigNumBoringSSL_mem.h>

@ -95,17 +104,10 @@ static ssize_t boringssl_getrandom(void *buf, size_t buf_len, unsigned flags) {

#endif // USE_NR_getrandom

// rand_lock is used to protect the |*_requested| variables.
DEFINE_STATIC_MUTEX(rand_lock)

// The following constants are magic values of |urandom_fd|.
static const int kUnset = 0;
// kHaveGetrandom in |urandom_fd| signals that |getrandom| or |getentropy| is
// available and should be used instead.
static const int kHaveGetrandom = -3;

// urandom_fd_requested is set by |RAND_set_urandom_fd|. It's protected by
// |rand_lock|.
DEFINE_BSS_GET(int, urandom_fd_requested)

// urandom_fd is a file descriptor to /dev/urandom. It's protected by |once|.
DEFINE_BSS_GET(int, urandom_fd)

@ -144,14 +146,9 @@ static void maybe_set_extra_getrandom_flags(void) {
DEFINE_STATIC_ONCE(rand_once)

// init_once initializes the state of this module to values previously
// requested. This is the only function that modifies |urandom_fd| and
// |urandom_buffering|, whose values may be read safely after calling the
// once.
// requested. This is the only function that modifies |urandom_fd|, which may be
// read safely after calling the once.
static void init_once(void) {
CRYPTO_STATIC_MUTEX_lock_read(rand_lock_bss_get());
int fd = *urandom_fd_requested_bss_get();
CRYPTO_STATIC_MUTEX_unlock_read(rand_lock_bss_get());

#if defined(USE_NR_getrandom)
int have_getrandom;
uint8_t dummy;

@ -188,37 +185,27 @@ static void init_once(void) {
}
#endif

#if defined(FREEBSD_GETRANDOM)
*urandom_fd_bss_get() = kHaveGetrandom;
return;
#endif

// Android FIPS builds must support getrandom.
#if defined(BORINGSSL_FIPS) && defined(OPENSSL_ANDROID)
perror("getrandom not found");
abort();
#endif

if (fd == kUnset) {
do {
fd = open("/dev/urandom", O_RDONLY);
} while (fd == -1 && errno == EINTR);
}
int fd;
do {
fd = open("/dev/urandom", O_RDONLY);
} while (fd == -1 && errno == EINTR);

if (fd < 0) {
perror("failed to open /dev/urandom");
abort();
}

assert(kUnset == 0);
if (fd == kUnset) {
// Because we want to keep |urandom_fd| in the BSS, we have to initialise
// it to zero. But zero is a valid file descriptor too. Thus if open
// returns zero for /dev/urandom, we dup it to get a non-zero number.
fd = dup(fd);
close(kUnset);

if (fd <= 0) {
perror("failed to dup /dev/urandom fd");
abort();
}
}

int flags = fcntl(fd, F_GETFD);
if (flags == -1) {
// Native Client doesn't implement |fcntl|.

@ -283,11 +270,11 @@ static void wait_for_entropy(void) {
return;
}

#if defined(BORINGSSL_FIPS)
// In FIPS mode we ensure that the kernel has sufficient entropy before
// continuing. This is automatically handled by getrandom, which requires
// that the entropy pool has been initialised, but for urandom we have to
// poll.
#if defined(BORINGSSL_FIPS) && !defined(URANDOM_BLOCKS_FOR_ENTROPY)
// In FIPS mode on platforms where urandom doesn't block at startup, we ensure
// that the kernel has sufficient entropy before continuing. This is
// automatically handled by getrandom, which requires that the entropy pool
// has been initialised, but for urandom we have to poll.
for (;;) {
int entropy_bits;
if (ioctl(fd, RNDGETENTCNT, &entropy_bits)) {

@ -304,41 +291,7 @@ static void wait_for_entropy(void) {

usleep(250000);
}
#endif // BORINGSSL_FIPS
}

void RAND_set_urandom_fd(int fd) {
fd = dup(fd);
if (fd < 0) {
perror("failed to dup supplied urandom fd");
abort();
}

assert(kUnset == 0);
if (fd == kUnset) {
// Because we want to keep |urandom_fd| in the BSS, we have to initialise
// it to zero. But zero is a valid file descriptor too. Thus if dup
// returned zero we dup it again to get a non-zero number.
fd = dup(fd);
close(kUnset);

if (fd <= 0) {
perror("failed to dup supplied urandom fd");
abort();
}
}

CRYPTO_STATIC_MUTEX_lock_write(rand_lock_bss_get());
*urandom_fd_requested_bss_get() = fd;
CRYPTO_STATIC_MUTEX_unlock_write(rand_lock_bss_get());

CRYPTO_once(rand_once_bss_get(), init_once);
if (*urandom_fd_bss_get() == kHaveGetrandom) {
close(fd);
} else if (*urandom_fd_bss_get() != fd) {
fprintf(stderr, "RAND_set_urandom_fd called after initialisation.\n");
abort();
}
#endif // BORINGSSL_FIPS && !URANDOM_BLOCKS_FOR_ENTROPY
}

// fill_with_entropy writes |len| bytes of entropy into |out|. It returns one

@ -352,17 +305,20 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) {
return 1;
}

#if defined(USE_NR_getrandom)
#if defined(USE_NR_getrandom) || defined(FREEBSD_GETRANDOM)
int getrandom_flags = 0;
if (!block) {
getrandom_flags |= GRND_NONBLOCK;
}
#endif

#if defined (USE_NR_getrandom)
if (seed) {
getrandom_flags |= *extra_getrandom_flags_for_seed_bss_get();
}
#endif

CRYPTO_once(rand_once_bss_get(), init_once);
CRYPTO_init_sysrand();
if (block) {
CRYPTO_once(wait_for_entropy_once_bss_get(), wait_for_entropy);
}

@ -376,6 +332,8 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) {
if (*urandom_fd_bss_get() == kHaveGetrandom) {
#if defined(USE_NR_getrandom)
r = boringssl_getrandom(out, len, getrandom_flags);
#elif defined(FREEBSD_GETRANDOM)
r = getrandom(out, len, getrandom_flags);
#elif defined(OPENSSL_MACOS)
if (__builtin_available(macos 10.12, *)) {
// |getentropy| can only request 256 bytes at a time.

@ -409,6 +367,10 @@ static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) {
return 1;
}

void CRYPTO_init_sysrand(void) {
CRYPTO_once(rand_once_bss_get(), init_once);
}

// CRYPTO_sysrand puts |requested| random bytes into |out|.
void CRYPTO_sysrand(uint8_t *out, size_t requested) {
if (!fill_with_entropy(out, requested, /*block=*/1, /*seed=*/0)) {

@ -417,22 +379,13 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) {
}
}

#if defined(BORINGSSL_FIPS)
void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) {
if (!fill_with_entropy(out, requested, /*block=*/1, /*seed=*/1)) {
perror("entropy fill failed");
abort();
}

#if defined(BORINGSSL_FIPS_BREAK_CRNG)
// This breaks the "continuous random number generator test" defined in FIPS
// 140-2, section 4.9.2, and implemented in rand_get_seed().
OPENSSL_memset(out, 0, requested);
#endif
}

#endif // BORINGSSL_FIPS

int CRYPTO_sysrand_if_available(uint8_t *out, size_t requested) {
if (fill_with_entropy(out, requested, /*block=*/0, /*seed=*/0)) {
return 1;
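
Reads from /dev/urandom in the path above must tolerate both EINTR and short reads. A minimal sketch of such a loop follows (an assumed helper for illustration; the file's actual fill_with_entropy additionally handles getrandom and getentropy):

#include <errno.h>
#include <unistd.h>

static int read_full(int fd, uint8_t *out, size_t len) {
  while (len > 0) {
    ssize_t r;
    do {
      r = read(fd, out, len);
    } while (r == -1 && errno == EINTR);
    if (r <= 0) {
      return 0;  // hard failure or unexpected EOF
    }
    out += (size_t)r;
    len -= (size_t)r;
  }
  return 1;
}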
@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)
@ -19,11 +19,14 @@
.text


.private_extern _OPENSSL_armcap_P
.globl _sha1_block_data_order
.private_extern _sha1_block_data_order

.align 6
_sha1_block_data_order:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
#else

@ -1091,6 +1094,8 @@ Loop:

.align 6
sha1_block_armv8:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -1229,8 +1234,6 @@ Lconst:
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
.comm _OPENSSL_armcap_P,4,4
.private_extern _OPENSSL_armcap_P
#endif // !OPENSSL_NO_ASM
#endif // defined(__aarch64__) && defined(__APPLE__)
#if defined(__linux__) && defined(__ELF__)

@ -20,11 +20,14 @@
.text


.hidden OPENSSL_armcap_P
.globl sha1_block_data_order
.hidden sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
sha1_block_data_order:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
#else

@ -1092,6 +1095,8 @@ sha1_block_data_order:
.type sha1_block_armv8,%function
.align 6
sha1_block_armv8:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -1230,8 +1235,6 @@ sha1_block_armv8:
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)
@ -14,7 +14,7 @@
|
|||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the OpenSSL license (the "License"). You may not use
|
||||
// this file except in compliance with the License. You can obtain a copy
|
||||
|
@ -42,6 +42,7 @@
|
|||
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
|
||||
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
|
||||
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
|
||||
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
|
||||
//
|
||||
// (*) Software SHA256 results are of lesser relevance, presented
|
||||
// mostly for informational purposes.
|
||||
|
@ -50,7 +51,7 @@
|
|||
// on Cortex-A53 (or by 4 cycles per round).
|
||||
// (***) Super-impressive coefficients over gcc-generated code are
|
||||
// indication of some compiler "pathology", most notably code
|
||||
// generated with -mgeneral-regs-only is significanty faster
|
||||
// generated with -mgeneral-regs-only is significantly faster
|
||||
// and the gap is only 40-90%.
|
||||
|
||||
#ifndef __KERNEL__
|
||||
|
@ -60,11 +61,13 @@
|
|||
.text
|
||||
|
||||
|
||||
.private_extern _OPENSSL_armcap_P
|
||||
.globl _sha256_block_data_order
|
||||
.private_extern _sha256_block_data_order
|
||||
|
||||
.align 6
|
||||
_sha256_block_data_order:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
#ifndef __KERNEL__
|
||||
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
|
||||
adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
|
||||
|
@ -75,6 +78,7 @@ _sha256_block_data_order:
|
|||
tst w16,#ARMV8_SHA256
|
||||
b.ne Lv8_entry
|
||||
#endif
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
stp x29,x30,[sp,#-128]!
|
||||
add x29,sp,#0
|
||||
|
||||
|
@ -99,7 +103,7 @@ Loop:
|
|||
ldr w19,[x30],#4 // *K++
|
||||
eor w28,w21,w22 // magic seed
|
||||
str x1,[x29,#112]
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w3,w3 // 0
|
||||
#endif
|
||||
ror w16,w24,#6
|
||||
|
@ -122,7 +126,7 @@ Loop:
|
|||
add w27,w27,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w27,w27,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w4,w4 // 1
|
||||
#endif
|
||||
ldp w5,w6,[x1],#2*4
|
||||
|
@ -147,7 +151,7 @@ Loop:
|
|||
add w26,w26,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w26,w26,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w5,w5 // 2
|
||||
#endif
|
||||
add w26,w26,w17 // h+=Sigma0(a)
|
||||
|
@ -171,7 +175,7 @@ Loop:
|
|||
add w25,w25,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w25,w25,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w6,w6 // 3
|
||||
#endif
|
||||
ldp w7,w8,[x1],#2*4
|
||||
|
@ -196,7 +200,7 @@ Loop:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w7,w7 // 4
#endif
add w24,w24,w17 // h+=Sigma0(a)

@ -220,7 +224,7 @@ Loop:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w8,w8 // 5
#endif
ldp w9,w10,[x1],#2*4

@ -245,7 +249,7 @@ Loop:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w9,w9 // 6
#endif
add w22,w22,w17 // h+=Sigma0(a)

@ -269,7 +273,7 @@ Loop:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w10,w10 // 7
#endif
ldp w11,w12,[x1],#2*4

@ -294,7 +298,7 @@ Loop:
add w20,w20,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w20,w20,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w11,w11 // 8
#endif
add w20,w20,w17 // h+=Sigma0(a)

@ -318,7 +322,7 @@ Loop:
add w27,w27,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w27,w27,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w12,w12 // 9
#endif
ldp w13,w14,[x1],#2*4

@ -343,7 +347,7 @@ Loop:
add w26,w26,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w26,w26,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w13,w13 // 10
#endif
add w26,w26,w17 // h+=Sigma0(a)

@ -367,7 +371,7 @@ Loop:
add w25,w25,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w25,w25,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w14,w14 // 11
#endif
ldp w15,w0,[x1],#2*4

@ -393,7 +397,7 @@ Loop:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w15,w15 // 12
#endif
add w24,w24,w17 // h+=Sigma0(a)

@ -418,7 +422,7 @@ Loop:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w0,w0 // 13
#endif
ldp w1,w2,[x1]

@ -444,7 +448,7 @@ Loop:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w1,w1 // 14
#endif
ldr w6,[sp,#12]

@ -470,7 +474,7 @@ Loop:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w2,w2 // 15
#endif
ldr w7,[sp,#0]

@ -1035,6 +1039,7 @@ Loop_16_xx:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret

@ -1069,6 +1074,7 @@ LK256:
.align 6
sha256_block_armv8:
Lv8_entry:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -1204,10 +1210,6 @@ Loop_hw:
ldr x29,[sp],#16
ret

#endif
#ifndef __KERNEL__
.comm _OPENSSL_armcap_P,4,4
.private_extern _OPENSSL_armcap_P
#endif
#endif // !OPENSSL_NO_ASM
#endif // defined(__aarch64__) && defined(__APPLE__)
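Every hunk above makes the same mechanical fix: the guard around the rev byte-reversal instructions becomes __AARCH64EB__, the predefine that AArch64 compilers actually set for big-endian targets, replacing the 32-bit-era __ARMEB__, which never fires on AArch64 and so left the swaps enabled even on big-endian builds. SHA-256 consumes its message words big-endian, which is what the swaps are for. A minimal C sketch of the same load, assuming a hypothetical load_be32 helper name:

#include <stdint.h>
#include <string.h>

// SHA-256 treats each 4-byte message word as big-endian; on a
// little-endian AArch64 target (__AARCH64EB__ undefined) the bytes
// must be reversed after loading, which is what the rev instructions
// in the hunks above do.
static uint32_t load_be32(const uint8_t *p) {
    uint32_t w;
    memcpy(&w, p, sizeof(w));
#if !defined(__AARCH64EB__)
    w = __builtin_bswap32(w);  // mirrors rev wN,wN
#endif
    return w;
}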
@ -15,7 +15,7 @@
#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy

@ -43,6 +43,7 @@
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.

@ -51,7 +52,7 @@
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significanty faster
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.

#ifndef __KERNEL__

@ -61,11 +62,13 @@
.text

.hidden OPENSSL_armcap_P
.globl sha256_block_data_order
.hidden sha256_block_data_order
.type sha256_block_data_order,%function
.align 6
sha256_block_data_order:
AARCH64_VALID_CALL_TARGET
#ifndef __KERNEL__
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:OPENSSL_armcap_P

@ -76,6 +79,7 @@ sha256_block_data_order:
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
#endif
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
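The lines added to this prologue are BoringSSL's usual runtime dispatch: load the OPENSSL_armcap_P capability word (via adrp, using the :pg_hi21_nc: relocation under HWASan so pointer tag bits cannot break the page computation) and branch to the SHA-256-extension entry point .Lv8_entry when the ARMV8_SHA256 bit is set; AARCH64_SIGN_LINK_REGISTER and the AARCH64_VALIDATE_LINK_REGISTER seen before ret bracket the scalar path for pointer authentication. In C the dispatch amounts to the sketch below; the bit value and the two implementation names are illustrative placeholders, not BoringSSL's:

#include <stddef.h>
#include <stdint.h>

#define ARMV8_SHA256 (1u << 4)        /* capability bit, value assumed */

extern uint32_t OPENSSL_armcap_P;     /* filled in at startup from HWCAP */

/* Placeholder names standing in for the scalar loop and sha256_block_armv8. */
void sha256_block_scalar(uint32_t state[8], const uint8_t *in, size_t num);
void sha256_block_hw(uint32_t state[8], const uint8_t *in, size_t num);

void sha256_block_data_order_sketch(uint32_t state[8], const uint8_t *in,
                                    size_t num) {
    if (OPENSSL_armcap_P & ARMV8_SHA256) {
        sha256_block_hw(state, in, num);      /* b.ne .Lv8_entry */
    } else {
        sha256_block_scalar(state, in, num);
    }
}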
@ -100,7 +104,7 @@ sha256_block_data_order:
ldr w19,[x30],#4 // *K++
eor w28,w21,w22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w3,w3 // 0
#endif
ror w16,w24,#6

@ -123,7 +127,7 @@ sha256_block_data_order:
add w27,w27,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w27,w27,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w4,w4 // 1
#endif
ldp w5,w6,[x1],#2*4

@ -148,7 +152,7 @@ sha256_block_data_order:
add w26,w26,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w26,w26,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w5,w5 // 2
#endif
add w26,w26,w17 // h+=Sigma0(a)

@ -172,7 +176,7 @@ sha256_block_data_order:
add w25,w25,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w25,w25,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w6,w6 // 3
#endif
ldp w7,w8,[x1],#2*4

@ -197,7 +201,7 @@ sha256_block_data_order:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w7,w7 // 4
#endif
add w24,w24,w17 // h+=Sigma0(a)

@ -221,7 +225,7 @@ sha256_block_data_order:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w8,w8 // 5
#endif
ldp w9,w10,[x1],#2*4

@ -246,7 +250,7 @@ sha256_block_data_order:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w9,w9 // 6
#endif
add w22,w22,w17 // h+=Sigma0(a)

@ -270,7 +274,7 @@ sha256_block_data_order:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w10,w10 // 7
#endif
ldp w11,w12,[x1],#2*4

@ -295,7 +299,7 @@ sha256_block_data_order:
add w20,w20,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w20,w20,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w11,w11 // 8
#endif
add w20,w20,w17 // h+=Sigma0(a)

@ -319,7 +323,7 @@ sha256_block_data_order:
add w27,w27,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w27,w27,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w12,w12 // 9
#endif
ldp w13,w14,[x1],#2*4

@ -344,7 +348,7 @@ sha256_block_data_order:
add w26,w26,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w26,w26,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w13,w13 // 10
#endif
add w26,w26,w17 // h+=Sigma0(a)

@ -368,7 +372,7 @@ sha256_block_data_order:
add w25,w25,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w25,w25,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w14,w14 // 11
#endif
ldp w15,w0,[x1],#2*4

@ -394,7 +398,7 @@ sha256_block_data_order:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w15,w15 // 12
#endif
add w24,w24,w17 // h+=Sigma0(a)

@ -419,7 +423,7 @@ sha256_block_data_order:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w0,w0 // 13
#endif
ldp w1,w2,[x1]

@ -445,7 +449,7 @@ sha256_block_data_order:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w1,w1 // 14
#endif
ldr w6,[sp,#12]

@ -471,7 +475,7 @@ sha256_block_data_order:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w2,w2 // 15
#endif
ldr w7,[sp,#0]

@ -1036,6 +1040,7 @@ sha256_block_data_order:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size sha256_block_data_order,.-sha256_block_data_order

@ -1070,6 +1075,7 @@ sha256_block_data_order:
.align 6
sha256_block_armv8:
.Lv8_entry:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -1206,10 +1212,6 @@ sha256_block_armv8:
ret
.size sha256_block_armv8,.-sha256_block_armv8
#endif
#ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@
#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)
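These three hunks apply one fix to the x86_64 and i386 files: the generated-file banner moves from #-style to //-style comments, so after the .S file is run through the C preprocessor the banner cannot be mistaken for a preprocessor directive. The BORINGSSL_PREFIX define next to it drives symbol prefixing: the included *_prefix_symbols_asm.h is, conceptually, one #define per exported symbol, along the lines of the sketch below (the real header is generated; these two lines are assumed, not copied from it):

/* After preprocessing, the assembly defines and calls the prefixed
   names, letting this copy of BoringSSL coexist with another in the
   same binary. */
#define sha256_block_data_order CBigNumBoringSSL_sha256_block_data_order
#define sha512_block_data_order CBigNumBoringSSL_sha512_block_data_order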
@ -14,7 +14,7 @@
#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy

@ -42,6 +42,7 @@
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.

@ -50,7 +51,7 @@
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significanty faster
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.

#ifndef __KERNEL__

@ -60,11 +61,24 @@
.text

.private_extern _OPENSSL_armcap_P
.globl _sha512_block_data_order
.private_extern _sha512_block_data_order

.align 6
_sha512_block_data_order:
AARCH64_VALID_CALL_TARGET
#ifndef __KERNEL__
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
#else
adrp x16,_OPENSSL_armcap_P@PAGE
#endif
ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
tst w16,#ARMV8_SHA512
b.ne Lv8_entry
#endif
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0

@ -89,7 +103,7 @@ Loop:
ldr x19,[x30],#8 // *K++
eor x28,x21,x22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x3,x3 // 0
#endif
ror x16,x24,#14

@ -112,7 +126,7 @@ Loop:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x4,x4 // 1
#endif
ldp x5,x6,[x1],#2*8

@ -137,7 +151,7 @@ Loop:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x5,x5 // 2
#endif
add x26,x26,x17 // h+=Sigma0(a)

@ -161,7 +175,7 @@ Loop:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x6,x6 // 3
#endif
ldp x7,x8,[x1],#2*8

@ -186,7 +200,7 @@ Loop:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x7,x7 // 4
#endif
add x24,x24,x17 // h+=Sigma0(a)

@ -210,7 +224,7 @@ Loop:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x8,x8 // 5
#endif
ldp x9,x10,[x1],#2*8

@ -235,7 +249,7 @@ Loop:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x9,x9 // 6
#endif
add x22,x22,x17 // h+=Sigma0(a)

@ -259,7 +273,7 @@ Loop:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x10,x10 // 7
#endif
ldp x11,x12,[x1],#2*8

@ -284,7 +298,7 @@ Loop:
add x20,x20,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x20,x20,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x11,x11 // 8
#endif
add x20,x20,x17 // h+=Sigma0(a)

@ -308,7 +322,7 @@ Loop:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x12,x12 // 9
#endif
ldp x13,x14,[x1],#2*8

@ -333,7 +347,7 @@ Loop:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x13,x13 // 10
#endif
add x26,x26,x17 // h+=Sigma0(a)

@ -357,7 +371,7 @@ Loop:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x14,x14 // 11
#endif
ldp x15,x0,[x1],#2*8

@ -383,7 +397,7 @@ Loop:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x15,x15 // 12
#endif
add x24,x24,x17 // h+=Sigma0(a)

@ -408,7 +422,7 @@ Loop:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x0,x0 // 13
#endif
ldp x1,x2,[x1]

@ -434,7 +448,7 @@ Loop:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x1,x1 // 14
#endif
ldr x6,[sp,#24]

@ -460,7 +474,7 @@ Loop:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x2,x2 // 15
#endif
ldr x7,[sp,#0]

@ -1025,6 +1039,7 @@ Loop_16_xx:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret

@ -1077,9 +1092,526 @@ LK512:
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
.text
#ifndef __KERNEL__
.comm _OPENSSL_armcap_P,4,4
.private_extern _OPENSSL_armcap_P

.align 6
sha512_block_armv8:
Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0

ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
adrp x3,LK512@PAGE
add x3,x3,LK512@PAGEOFF

rev64 v16.16b,v16.16b
rev64 v17.16b,v17.16b
rev64 v18.16b,v18.16b
rev64 v19.16b,v19.16b
rev64 v20.16b,v20.16b
rev64 v21.16b,v21.16b
rev64 v22.16b,v22.16b
rev64 v23.16b,v23.16b
b Loop_hw

.align 4
Loop_hw:
ld1 {v24.2d},[x3],#16
|
||||
subs x2,x2,#1
|
||||
sub x4,x1,#128
|
||||
orr v26.16b,v0.16b,v0.16b // offload
|
||||
orr v27.16b,v1.16b,v1.16b
|
||||
orr v28.16b,v2.16b,v2.16b
|
||||
orr v29.16b,v3.16b,v3.16b
|
||||
csel x1,x1,x4,ne // conditional rewind
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v16.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
rev64 v16.16b,v16.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
ld1 {v24.2d},[x3],#16
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v17.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
rev64 v17.16b,v17.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v18.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
rev64 v18.16b,v18.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
ld1 {v24.2d},[x3],#16
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v19.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
rev64 v19.16b,v19.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v20.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
rev64 v20.16b,v20.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
ld1 {v24.2d},[x3],#16
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v21.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
rev64 v21.16b,v21.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v22.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
rev64 v22.16b,v22.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
sub x3,x3,#80*8 // rewind
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v23.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
rev64 v23.16b,v23.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v0.2d,v0.2d,v26.2d // accumulate
|
||||
add v1.2d,v1.2d,v27.2d
|
||||
add v2.2d,v2.2d,v28.2d
|
||||
add v3.2d,v3.2d,v29.2d

cbnz x2,Loop_hw

st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context

ldr x29,[sp],#16
ret

#endif
#endif // !OPENSSL_NO_ASM
#endif // defined(__aarch64__) && defined(__APPLE__)
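The Apple file closing above adds sha512_block_armv8, in which every SHA-512 instruction is emitted as a raw .long word with the mnemonic kept in a comment (the ELF file below spells the same words with .inst), so the source assembles even with assemblers that predate the SHA-512 extension. With a current compiler the same operations are reachable from C through the ACLE intrinsics; a rough sketch of one message-schedule update, assuming a toolchain that accepts -march=armv8.2-a+sha3 and with the operand roles simplified:

#include <arm_neon.h>

/* Compile with e.g. -march=armv8.2-a+sha3. Each intrinsic maps to one
   of the instructions hand-encoded above: sha512su0/sha512su1 extend
   the message schedule with the sigma0/sigma1 terms. */
uint64x2_t sha512_schedule_step(uint64x2_t w0_1, uint64x2_t w2_3,
                                uint64x2_t w9_10, uint64x2_t w14_15) {
    uint64x2_t s = vsha512su0q_u64(w0_1, w2_3);  /* sha512su0 */
    return vsha512su1q_u64(s, w14_15, w9_10);    /* sha512su1 */
}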
@ -15,7 +15,7 @@
#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy

@ -43,6 +43,7 @@
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.

@ -51,7 +52,7 @@
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significanty faster
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.

#ifndef __KERNEL__

@ -61,11 +62,24 @@
.text

.hidden OPENSSL_armcap_P
.globl sha512_block_data_order
.hidden sha512_block_data_order
.type sha512_block_data_order,%function
.align 6
sha512_block_data_order:
AARCH64_VALID_CALL_TARGET
#ifndef __KERNEL__
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
#else
adrp x16,OPENSSL_armcap_P
#endif
ldr w16,[x16,:lo12:OPENSSL_armcap_P]
tst w16,#ARMV8_SHA512
b.ne .Lv8_entry
#endif
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0

@ -90,7 +104,7 @@ sha512_block_data_order:
ldr x19,[x30],#8 // *K++
eor x28,x21,x22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x3,x3 // 0
#endif
ror x16,x24,#14

@ -113,7 +127,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x4,x4 // 1
#endif
ldp x5,x6,[x1],#2*8

@ -138,7 +152,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x5,x5 // 2
#endif
add x26,x26,x17 // h+=Sigma0(a)

@ -162,7 +176,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x6,x6 // 3
#endif
ldp x7,x8,[x1],#2*8

@ -187,7 +201,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x7,x7 // 4
#endif
add x24,x24,x17 // h+=Sigma0(a)

@ -211,7 +225,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x8,x8 // 5
#endif
ldp x9,x10,[x1],#2*8

@ -236,7 +250,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x9,x9 // 6
#endif
add x22,x22,x17 // h+=Sigma0(a)

@ -260,7 +274,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x10,x10 // 7
#endif
ldp x11,x12,[x1],#2*8

@ -285,7 +299,7 @@ sha512_block_data_order:
add x20,x20,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x20,x20,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x11,x11 // 8
#endif
add x20,x20,x17 // h+=Sigma0(a)

@ -309,7 +323,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x12,x12 // 9
#endif
ldp x13,x14,[x1],#2*8

@ -334,7 +348,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x13,x13 // 10
#endif
add x26,x26,x17 // h+=Sigma0(a)

@ -358,7 +372,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x14,x14 // 11
#endif
ldp x15,x0,[x1],#2*8

@ -384,7 +398,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x15,x15 // 12
#endif
add x24,x24,x17 // h+=Sigma0(a)

@ -409,7 +423,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x0,x0 // 13
#endif
ldp x1,x2,[x1]

@ -435,7 +449,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x1,x1 // 14
#endif
ldr x6,[sp,#24]

@ -461,7 +475,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x2,x2 // 15
#endif
ldr x7,[sp,#0]

@ -1026,6 +1040,7 @@ sha512_block_data_order:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size sha512_block_data_order,.-sha512_block_data_order

@ -1078,9 +1093,526 @@ sha512_block_data_order:
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
.text
#ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
.type sha512_block_armv8,%function
.align 6
sha512_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0

ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
adrp x3,.LK512
add x3,x3,:lo12:.LK512

rev64 v16.16b,v16.16b
rev64 v17.16b,v17.16b
rev64 v18.16b,v18.16b
rev64 v19.16b,v19.16b
rev64 v20.16b,v20.16b
rev64 v21.16b,v21.16b
rev64 v22.16b,v22.16b
rev64 v23.16b,v23.16b
b .Loop_hw

.align 4
.Loop_hw:
ld1 {v24.2d},[x3],#16
|
||||
subs x2,x2,#1
|
||||
sub x4,x1,#128
|
||||
orr v26.16b,v0.16b,v0.16b // offload
|
||||
orr v27.16b,v1.16b,v1.16b
|
||||
orr v28.16b,v2.16b,v2.16b
|
||||
orr v29.16b,v3.16b,v3.16b
|
||||
csel x1,x1,x4,ne // conditional rewind
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v16.2d
ld1 {v16.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v16.16b,v16.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v17.2d
ld1 {v17.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v17.16b,v17.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v18.2d
ld1 {v18.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v18.16b,v18.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v19.2d
ld1 {v19.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
rev64 v19.16b,v19.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v20.2d
ld1 {v20.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
rev64 v20.16b,v20.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v21.2d
ld1 {v21.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v21.16b,v21.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v22.2d
ld1 {v22.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v22.16b,v22.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
sub x3,x3,#80*8 // rewind
add v25.2d,v25.2d,v23.2d
ld1 {v23.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v23.16b,v23.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v0.2d,v0.2d,v26.2d // accumulate
add v1.2d,v1.2d,v27.2d
add v2.2d,v2.2d,v28.2d
add v3.2d,v3.2d,v29.2d

cbnz x2,.Loop_hw

st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context

ldr x29,[sp],#16
ret
.size sha512_block_armv8,.-sha512_block_armv8
#endif
#endif
#endif // !OPENSSL_NO_ASM

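The .inst words in this listing are raw encodings of the ARMv8 SHA-512 extension instructions (sha512h, sha512h2, sha512su0, sha512su1), emitted as literal words so that assemblers without SHA-512 support can still build the file. For orientation only, here is a minimal portable C sketch of the scalar round those instructions accelerate; the helper names follow FIPS 180-4 and are not part of the generated file.

#include <stdint.h>

// Sketch only: one scalar SHA-512 round. s[0..7] holds the working variables
// a..h; |k| is K512[i] and |w| is the message schedule word W[i].
static inline uint64_t rotr64(uint64_t x, int s) {
  return (x >> s) | (x << (64 - s));
}

static void sha512_round(uint64_t s[8], uint64_t k, uint64_t w) {
  uint64_t S1 = rotr64(s[4], 14) ^ rotr64(s[4], 18) ^ rotr64(s[4], 41);
  uint64_t ch = (s[4] & s[5]) ^ (~s[4] & s[6]);
  uint64_t t1 = s[7] + S1 + ch + k + w;  // the "T1 + H + K512[i]" term above
  uint64_t S0 = rotr64(s[0], 28) ^ rotr64(s[0], 34) ^ rotr64(s[0], 39);
  uint64_t maj = (s[0] & s[1]) ^ (s[0] & s[2]) ^ (s[1] & s[2]);
  s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
  s[4] = s[3] + t1;                      // the "D + T1" term above
  s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
  s[0] = t1 + S0 + maj;
}

The vector code above processes roughly two such rounds per sha512h/sha512h2 pair and folds the message-schedule update into sha512su0/sha512su1.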
@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -14,6 +14,8 @@

#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
#include <CBigNumBoringSSL_arm_arch.h>

.section __TEXT,__const

@ -216,6 +218,7 @@ Lenc_entry:

.align 4
_vpaes_encrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -225,6 +228,7 @@ _vpaes_encrypt:

st1 {v0.16b}, [x1]

ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

@ -453,6 +457,7 @@ Ldec_entry:

.align 4
_vpaes_decrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -462,6 +467,7 @@ _vpaes_decrypt:

st1 {v0.16b}, [x1]

ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

@ -631,6 +637,7 @@ _vpaes_key_preheat:

.align 4
_vpaes_schedule_core:
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp,#-16]!
add x29,sp,#0

@ -800,6 +807,7 @@ Lschedule_mangle_last_dec:

eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
ldp x29, x30, [sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

@ -1002,7 +1010,7 @@ Lschedule_mangle_dec:

Lschedule_mangle_both:
tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
add x8, x8, #64-16 // add $-16, %r8
add x8, x8, #48 // add $-16, %r8
and x8, x8, #~(1<<6) // and $0x30, %r8
st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
ret

@ -1013,6 +1021,7 @@ Lschedule_mangle_both:

.align 4
_vpaes_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1028,6 +1037,7 @@ _vpaes_set_encrypt_key:

ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

@ -1036,6 +1046,7 @@ _vpaes_set_encrypt_key:

.align 4
_vpaes_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1055,6 +1066,7 @@ _vpaes_set_decrypt_key:

ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

.globl _vpaes_cbc_encrypt

@ -1062,6 +1074,7 @@ _vpaes_set_decrypt_key:

.align 4
_vpaes_cbc_encrypt:
AARCH64_SIGN_LINK_REGISTER
cbz x2, Lcbc_abort
cmp w5, #0 // check direction
b.eq vpaes_cbc_decrypt

@ -1089,12 +1102,15 @@ Lcbc_enc_loop:

ldp x29,x30,[sp],#16
Lcbc_abort:
AARCH64_VALIDATE_LINK_REGISTER
ret

.align 4
vpaes_cbc_decrypt:
// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
// only from vpaes_cbc_encrypt which has already signed the return address.
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1136,6 +1152,7 @@ Lcbc_dec_done:

ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

.globl _vpaes_ctr32_encrypt_blocks

@ -1143,6 +1160,7 @@ Lcbc_dec_done:

.align 4
_vpaes_ctr32_encrypt_blocks:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1210,6 +1228,7 @@ Lctr32_done:

ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret

#endif // !OPENSSL_NO_ASM

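The change repeated throughout these vpaes hunks is the insertion of AARCH64_SIGN_LINK_REGISTER and AARCH64_VALIDATE_LINK_REGISTER at function entry and exit, enabling return-address signing (ARMv8.3 pointer authentication) where the toolchain requests it; the new #include of <CBigNumBoringSSL_arm_arch.h> is what pulls those macros in. The other edit, add x8, x8, #64-16 becoming add x8, x8, #48, does not change the immediate at all (64 - 16 = 48); it merely stops relying on assembler constant folding. A rough sketch of the macros' shape (the vendored arm_arch.h may differ in detail) is:

// Sketch, not the verbatim vendored header. The hint encodings are NOPs on
// cores without pointer authentication, so one binary runs everywhere.
#if defined(__ARM_FEATURE_PAC_DEFAULT)
#define AARCH64_SIGN_LINK_REGISTER hint #25      // paciasp: sign x30 with SP
#define AARCH64_VALIDATE_LINK_REGISTER hint #29  // autiasp: authenticate x30
#else
#define AARCH64_SIGN_LINK_REGISTER               // expands to nothing
#define AARCH64_VALIDATE_LINK_REGISTER
#endif

Note that vpaes_cbc_decrypt deliberately omits the signing macro: as its comment says, it is only reached by a branch from vpaes_cbc_encrypt, which has already signed the return address, and signing twice would corrupt it.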
@ -15,6 +15,8 @@

#if defined(BORINGSSL_PREFIX)
#include <CBigNumBoringSSL_boringssl_prefix_symbols_asm.h>
#endif
#include <CBigNumBoringSSL_arm_arch.h>

.section .rodata

.type _vpaes_consts,%object

@ -217,6 +219,7 @@ _vpaes_encrypt_core:

.type vpaes_encrypt,%function
.align 4
vpaes_encrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -226,6 +229,7 @@ vpaes_encrypt:

st1 {v0.16b}, [x1]

ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_encrypt,.-vpaes_encrypt

@ -454,6 +458,7 @@ _vpaes_decrypt_core:

.type vpaes_decrypt,%function
.align 4
vpaes_decrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0

@ -463,6 +468,7 @@ vpaes_decrypt:

st1 {v0.16b}, [x1]

ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_decrypt,.-vpaes_decrypt

@ -632,6 +638,7 @@ _vpaes_key_preheat:

.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp,#-16]!
add x29,sp,#0

@ -801,6 +808,7 @@ _vpaes_schedule_core:

eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
ldp x29, x30, [sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core

@ -1003,7 +1011,7 @@ _vpaes_schedule_mangle:

.Lschedule_mangle_both:
tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
add x8, x8, #64-16 // add $-16, %r8
add x8, x8, #48 // add $-16, %r8
and x8, x8, #~(1<<6) // and $0x30, %r8
st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
ret

@ -1014,6 +1022,7 @@ _vpaes_schedule_mangle:

.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1029,6 +1038,7 @@ vpaes_set_encrypt_key:

ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

@ -1037,6 +1047,7 @@ vpaes_set_encrypt_key:

.type vpaes_set_decrypt_key,%function
.align 4
vpaes_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1056,6 +1067,7 @@ vpaes_set_decrypt_key:

ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl vpaes_cbc_encrypt

@ -1063,6 +1075,7 @@ vpaes_set_decrypt_key:

.type vpaes_cbc_encrypt,%function
.align 4
vpaes_cbc_encrypt:
AARCH64_SIGN_LINK_REGISTER
cbz x2, .Lcbc_abort
cmp w5, #0 // check direction
b.eq vpaes_cbc_decrypt

@ -1090,12 +1103,15 @@ vpaes_cbc_encrypt:

ldp x29,x30,[sp],#16
.Lcbc_abort:
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type vpaes_cbc_decrypt,%function
.align 4
vpaes_cbc_decrypt:
// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
// only from vpaes_cbc_encrypt which has already signed the return address.
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1137,6 +1153,7 @@ vpaes_cbc_decrypt:

ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl vpaes_ctr32_encrypt_blocks

@ -1144,6 +1161,7 @@ vpaes_cbc_decrypt:

.type vpaes_ctr32_encrypt_blocks,%function
.align 4
vpaes_ctr32_encrypt_blocks:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

@ -1211,6 +1229,7 @@ vpaes_ctr32_encrypt_blocks:

ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
#endif

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__i386__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__i386__)
#if defined(BORINGSSL_PREFIX)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__linux__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -1,7 +1,7 @@

#define BORINGSSL_PREFIX CBigNumBoringSSL
#if defined(__x86_64__) && defined(__APPLE__)
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)

@ -109,6 +109,7 @@

#ifndef OPENSSL_HEADER_CRYPTO_INTERNAL_H
#define OPENSSL_HEADER_CRYPTO_INTERNAL_H

#include <CBigNumBoringSSL_crypto.h>
#include <CBigNumBoringSSL_ex_data.h>
#include <CBigNumBoringSSL_stack.h>
#include <CBigNumBoringSSL_thread.h>

@ -208,6 +209,9 @@ typedef __uint128_t uint128_t;

#define OPENSSL_SSE2
#endif


// Pointer utility functions.

// buffers_alias returns one if |a| and |b| alias and zero otherwise.
static inline int buffers_alias(const uint8_t *a, size_t a_len,
const uint8_t *b, size_t b_len) {

@ -220,6 +224,23 @@ static inline int buffers_alias(const uint8_t *a, size_t a_len,

return a_u + a_len > b_u && b_u + b_len > a_u;
}

// align_pointer returns |ptr|, advanced to |alignment|. |alignment| must be a
// power of two, and |ptr| must have at least |alignment - 1| bytes of scratch
// space.
static inline void *align_pointer(void *ptr, size_t alignment) {
// |alignment| must be a power of two.
assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
// Instead of aligning |ptr| as a |uintptr_t| and casting back, compute the
// offset and advance in pointer space. C guarantees that casting from pointer
// to |uintptr_t| and back gives the same pointer, but general
// integer-to-pointer conversions are implementation-defined. GCC does define
// it in the useful way, but this makes fewer assumptions.
uintptr_t offset = (0u - (uintptr_t)ptr) & (alignment - 1);
ptr = (char *)ptr + offset;
assert(((uintptr_t)ptr & (alignment - 1)) == 0);
return ptr;
}
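A usage sketch for align_pointer (hypothetical names, not code from this diff): to hand out a 64-byte-aligned region, over-allocate by alignment - 1 bytes and advance.

// Sketch: the caller keeps |*out_storage| for freeing; the returned pointer
// is the aligned view into the same block.
static uint8_t *alloc_aligned64(size_t len, uint8_t **out_storage) {
  uint8_t *storage = OPENSSL_malloc(len + 63);  // 63 bytes of scratch space
  if (storage == NULL) {
    return NULL;
  }
  *out_storage = storage;
  return align_pointer(storage, 64);
}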

// Constant-time utility functions.
//

@ -470,6 +491,13 @@ OPENSSL_EXPORT void CRYPTO_once(CRYPTO_once_t *once, void (*init)(void));

// Reference counting.

// Automatically enable C11 atomics if implemented.
#if !defined(OPENSSL_C11_ATOMIC) && defined(OPENSSL_THREADS) && \
!defined(__STDC_NO_ATOMICS__) && defined(__STDC_VERSION__) && \
__STDC_VERSION__ >= 201112L
#define OPENSSL_C11_ATOMIC
#endif

// CRYPTO_REFCOUNT_MAX is the value at which the reference count saturates.
#define CRYPTO_REFCOUNT_MAX 0xffffffff

@ -607,6 +635,7 @@ BSSL_NAMESPACE_END

typedef enum {
OPENSSL_THREAD_LOCAL_ERR = 0,
OPENSSL_THREAD_LOCAL_RAND,
OPENSSL_THREAD_LOCAL_FIPS_COUNTERS,
OPENSSL_THREAD_LOCAL_TEST,
NUM_OPENSSL_THREAD_LOCALS,
} thread_local_data_t;

@ -811,6 +840,97 @@ static inline void *OPENSSL_memset(void *dst, int c, size_t n) {

return memset(dst, c, n);
}

// Loads and stores.
//
// The following functions load and store sized integers with the specified
// endianness. They use |memcpy|, and so avoid alignment or strict aliasing
// requirements on the input and output pointers.

static inline uint32_t CRYPTO_load_u32_le(const void *in) {
uint32_t v;
OPENSSL_memcpy(&v, in, sizeof(v));
return v;
}

static inline void CRYPTO_store_u32_le(void *out, uint32_t v) {
OPENSSL_memcpy(out, &v, sizeof(v));
}

static inline uint32_t CRYPTO_load_u32_be(const void *in) {
uint32_t v;
OPENSSL_memcpy(&v, in, sizeof(v));
return CRYPTO_bswap4(v);
}

static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
v = CRYPTO_bswap4(v);
OPENSSL_memcpy(out, &v, sizeof(v));
}

static inline uint64_t CRYPTO_load_u64_be(const void *ptr) {
uint64_t ret;
OPENSSL_memcpy(&ret, ptr, sizeof(ret));
return CRYPTO_bswap8(ret);
}

static inline void CRYPTO_store_u64_be(void *out, uint64_t v) {
v = CRYPTO_bswap8(v);
OPENSSL_memcpy(out, &v, sizeof(v));
}

static inline crypto_word_t CRYPTO_load_word_le(const void *in) {
crypto_word_t v;
OPENSSL_memcpy(&v, in, sizeof(v));
return v;
}

static inline void CRYPTO_store_word_le(void *out, crypto_word_t v) {
OPENSSL_memcpy(out, &v, sizeof(v));
}
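A short sketch of why these helpers exist (the field layout here is hypothetical): reading multi-byte integers out of an untrusted, arbitrarily aligned buffer is undefined behavior with plain pointer casts, but fine through memcpy-based loads.

// Sketch: read a big-endian u32 length then a big-endian u64 sequence number
// from a 12-byte record at any alignment.
static void parse_record(const uint8_t in[12], uint32_t *len, uint64_t *seq) {
  *len = CRYPTO_load_u32_be(in);       // byte-swapped on little-endian targets
  *seq = CRYPTO_load_u64_be(in + 4);
}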

// Bit rotation functions.
//
// Note these functions use |(-shift) & 31|, etc., because shifting by the bit
// width is undefined. Both Clang and GCC recognize this pattern as a rotation,
// but MSVC does not. Instead, we call MSVC's built-in functions.

static inline uint32_t CRYPTO_rotl_u32(uint32_t value, int shift) {
#if defined(_MSC_VER)
return _rotl(value, shift);
#else
return (value << shift) | (value >> ((-shift) & 31));
#endif
}

static inline uint32_t CRYPTO_rotr_u32(uint32_t value, int shift) {
#if defined(_MSC_VER)
return _rotr(value, shift);
#else
return (value >> shift) | (value << ((-shift) & 31));
#endif
}

static inline uint64_t CRYPTO_rotl_u64(uint64_t value, int shift) {
#if defined(_MSC_VER)
return _rotl64(value, shift);
#else
return (value << shift) | (value >> ((-shift) & 63));
#endif
}

static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) {
#if defined(_MSC_VER)
return _rotr64(value, shift);
#else
return (value >> shift) | (value << ((-shift) & 63));
#endif
}
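A usage sketch (SHA-256's Sigma-0 is chosen for illustration; it is not code from this diff). The (-shift) & 31 form matters because the naive value << (32 - shift) would shift by the full bit width at shift = 0, which is undefined.

// Sketch: SHA-256's big Sigma-0, built from fixed-distance rotations that
// compile to single rotate instructions on Clang, GCC, and MSVC.
static uint32_t sha256_Sigma0(uint32_t a) {
  return CRYPTO_rotr_u32(a, 2) ^ CRYPTO_rotr_u32(a, 13) ^
         CRYPTO_rotr_u32(a, 22);
}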

// FIPS functions.

#if defined(BORINGSSL_FIPS)
// BORINGSSL_FIPS_abort is called when a FIPS power-on or continuous test
// fails. It prevents any further cryptographic operations by the current

@ -826,6 +946,11 @@ void BORINGSSL_FIPS_abort(void) __attribute__((noreturn));

int boringssl_fips_self_test(const uint8_t *module_hash,
size_t module_hash_len);

#if defined(BORINGSSL_FIPS_COUNTERS)
void boringssl_fips_inc_counter(enum fips_counter_t counter);
#else
OPENSSL_INLINE void boringssl_fips_inc_counter(enum fips_counter_t counter) {}
#endif

#if defined(__cplusplus)
} // extern C

@ -72,6 +72,8 @@ OPENSSL_MSVC_PRAGMA(warning(pop))

#define OPENSSL_MALLOC_PREFIX 8
OPENSSL_STATIC_ASSERT(OPENSSL_MALLOC_PREFIX >= sizeof(size_t),
"size_t too large");

#if defined(OPENSSL_ASAN)
void __asan_poison_memory_region(const volatile void *addr, size_t size);

@ -101,14 +103,54 @@ static void __asan_unpoison_memory_region(const void *addr, size_t size) {}

// linked. This isn't an ideal result, but its helps in some cases.
WEAK_SYMBOL_FUNC(void, sdallocx, (void *ptr, size_t size, int flags));

// The following two functions are for memory tracking. They are no-ops by
// default but can be overridden at link time if the application needs to
// observe heap operations.
WEAK_SYMBOL_FUNC(void, OPENSSL_track_memory_alloc, (void *ptr, size_t size));
WEAK_SYMBOL_FUNC(void, OPENSSL_track_memory_free, (void *ptr, size_t size));
// The following three functions can be defined to override default heap
// allocation and freeing. If defined, it is the responsibility of
// |OPENSSL_memory_free| to zero out the memory before returning it to the
// system. |OPENSSL_memory_free| will not be passed NULL pointers.
//
// WARNING: These functions are called on every allocation and free in
// BoringSSL across the entire process. They may be called by any code in the
// process which calls BoringSSL, including in process initializers and thread
// destructors. When called, BoringSSL may hold pthreads locks. Any other code
// in the process which, directly or indirectly, calls BoringSSL may be on the
// call stack and may itself be using arbitrary synchronization primitives.
//
// As a result, these functions may not have the usual programming environment
// available to most C or C++ code. In particular, they may not call into
// BoringSSL, or any library which depends on BoringSSL. Any synchronization
// primitives used must tolerate every other synchronization primitive linked
// into the process, including pthreads locks. Failing to meet these constraints
// may result in deadlocks, crashes, or memory corruption.
WEAK_SYMBOL_FUNC(void*, OPENSSL_memory_alloc, (size_t size));
WEAK_SYMBOL_FUNC(void, OPENSSL_memory_free, (void *ptr));
WEAK_SYMBOL_FUNC(size_t, OPENSSL_memory_get_size, (void *ptr));
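Because the three OPENSSL_memory_* symbols are weak, an application can take over BoringSSL's heap by linking strong definitions. A minimal sketch, assuming glibc's malloc_usable_size for the size query (a real override must also honor the constraints in the warning above):

#include <malloc.h>  // malloc_usable_size: a glibc-specific assumption
#include <stdlib.h>
#include <string.h>

void *OPENSSL_memory_alloc(size_t size) { return malloc(size); }

size_t OPENSSL_memory_get_size(void *ptr) { return malloc_usable_size(ptr); }

void OPENSSL_memory_free(void *ptr) {
  // Contract: zero the memory before returning it to the system; |ptr| is
  // never NULL here. Production code should use a zeroing primitive the
  // compiler cannot elide.
  memset(ptr, 0, malloc_usable_size(ptr));
  free(ptr);
}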

// kBoringSSLBinaryTag is a distinctive byte sequence to identify binaries that
// are linking in BoringSSL and, roughly, what version they are using.
static const uint8_t kBoringSSLBinaryTag[18] = {
// 16 bytes of magic tag.
0x8c, 0x62, 0x20, 0x0b, 0xd2, 0xa0, 0x72, 0x58,
0x44, 0xa8, 0x96, 0x69, 0xad, 0x55, 0x7e, 0xec,
// Current source iteration. Incremented ~monthly.
2, 0,
};

void *OPENSSL_malloc(size_t size) {
if (OPENSSL_memory_alloc != NULL) {
assert(OPENSSL_memory_free != NULL);
assert(OPENSSL_memory_get_size != NULL);
return OPENSSL_memory_alloc(size);
}

if (size + OPENSSL_MALLOC_PREFIX < size) {
// |OPENSSL_malloc| is a central function in BoringSSL thus a reference to
// |kBoringSSLBinaryTag| is created here so that the tag isn't discarded by
// the linker. The following is sufficient to stop GCC, Clang, and MSVC
// optimising away the reference at the time of writing. Since this
// probably results in an actual memory reference, it is put in this very
// rare code path.
uint8_t unused = *(volatile uint8_t *)kBoringSSLBinaryTag;
(void) unused;
return NULL;
}

@ -120,9 +162,6 @@ void *OPENSSL_malloc(size_t size) {

*(size_t *)ptr = size;

__asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
if (OPENSSL_track_memory_alloc) {
OPENSSL_track_memory_alloc(ptr, size + OPENSSL_MALLOC_PREFIX);
}
return ((uint8_t *)ptr) + OPENSSL_MALLOC_PREFIX;
}

@ -131,13 +170,15 @@ void OPENSSL_free(void *orig_ptr) {

return;
}

if (OPENSSL_memory_free != NULL) {
OPENSSL_memory_free(orig_ptr);
return;
}

void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX;
__asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);

size_t size = *(size_t *)ptr;
if (OPENSSL_track_memory_free) {
OPENSSL_track_memory_free(ptr, size + OPENSSL_MALLOC_PREFIX);
}
OPENSSL_cleanse(ptr, size + OPENSSL_MALLOC_PREFIX);
if (sdallocx) {
sdallocx(ptr, size + OPENSSL_MALLOC_PREFIX, 0 /* flags */);

@ -151,10 +192,15 @@ void *OPENSSL_realloc(void *orig_ptr, size_t new_size) {

return OPENSSL_malloc(new_size);
}

void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX;
__asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
size_t old_size = *(size_t *)ptr;
__asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
size_t old_size;
if (OPENSSL_memory_get_size != NULL) {
old_size = OPENSSL_memory_get_size(orig_ptr);
} else {
void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX;
__asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
old_size = *(size_t *)ptr;
__asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
}

void *ret = OPENSSL_malloc(new_size);
if (ret == NULL) {

@ -219,6 +265,8 @@ uint32_t OPENSSL_hash32(const void *ptr, size_t len) {

return h;
}

uint32_t OPENSSL_strhash(const char *s) { return OPENSSL_hash32(s, strlen(s)); }

size_t OPENSSL_strnlen(const char *s, size_t len) {
for (size_t i = 0; i < len; i++) {
if (s[i] == 0) {

@ -294,22 +342,15 @@ int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args) {

}

char *OPENSSL_strndup(const char *str, size_t size) {
char *ret;
size_t alloc_size;

if (str == NULL) {
return NULL;
}

size = OPENSSL_strnlen(str, size);

alloc_size = size + 1;
size_t alloc_size = size + 1;
if (alloc_size < size) {
// overflow
OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE);
return NULL;
}
ret = OPENSSL_malloc(alloc_size);
char *ret = OPENSSL_malloc(alloc_size);
if (ret == NULL) {
OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE);
return NULL;

@ -357,3 +398,13 @@ void *OPENSSL_memdup(const void *data, size_t size) {

OPENSSL_memcpy(ret, data, size);
return ret;
}

void *CRYPTO_malloc(size_t size, const char *file, int line) {
return OPENSSL_malloc(size);
}

void *CRYPTO_realloc(void *ptr, size_t new_size, const char *file, int line) {
return OPENSSL_realloc(ptr, new_size);
}

void CRYPTO_free(void *ptr, const char *file, int line) { OPENSSL_free(ptr); }
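The CRYPTO_* wrappers at the end exist for OpenSSL source compatibility: OpenSSL's allocator passes file/line debugging arguments, and BoringSSL accepts and ignores them, so OpenSSL-style call sites compile against either library. A call sketch:

// Sketch: the file/line arguments are simply discarded by the wrappers.
static void compat_example(void) {
  void *buf = CRYPTO_malloc(32, __FILE__, __LINE__);
  CRYPTO_free(buf, __FILE__, __LINE__);
}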

@ -49,4 +49,8 @@ void CRYPTO_sysrand(uint8_t *out, size_t requested) {

CRYPTO_chacha_20(out, out, requested, kZeroKey, nonce, 0);
}

void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) {
CRYPTO_sysrand(out, requested);
}

#endif // BORINGSSL_UNSAFE_DETERMINISTIC_MODE
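In this build mode, the internal CRYPTO_sysrand is a ChaCha20 keystream under an all-zero key with a deterministic nonce, so every run of a program observes the same "random" bytes, which is what makes fuzzing and differential testing reproducible, and why the mode is marked unsafe for anything else. A usage sketch:

#include <stdint.h>
#include <stdio.h>

// Sketch: in deterministic mode, separate runs of this program print the
// same hex string; in a normal build they almost certainly do not.
static void show_rand(void) {
  uint8_t buf[8];
  CRYPTO_sysrand(buf, sizeof(buf));
  for (size_t i = 0; i < sizeof(buf); i++) {
    printf("%02x", buf[i]);
  }
  printf("\n");
}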