[scudo] 32-bit and hardware agnostic support

Summary:
This update introduces i386 support for the Scudo Hardened Allocator, and
offers software alternatives for functions that used to require
hardware-specific instruction sets. This should make porting to new
architectures easier. Among the changes:

- The chunk header has been changed to accommodate the size limitations
  encountered on 32-bit architectures. We now fit everything in 64 bits. This
  was achieved by storing the amount of unused bytes in an allocation rather
  than the size itself, as one can be deduced from the other with the help of
  the GetActuallyAllocatedSize function. As it turns out, this header can be
  used for both 64-bit and 32-bit, and as such we dropped the requirement for
  the 128-bit compare-and-exchange instruction support (cmpxchg16b).
- Add 32-bit support for the checksum and the PRNG functions: if the SSE 4.2
  instruction set is supported, use the 32-bit CRC32 instruction, and in the
  XorShift128, use a 32-bit based state instead of 64-bit.
- Add software support for CRC32: if SSE 4.2 is not supported, fall back on a
  software implementation.
- Modify tests that were not 32-bit compliant, and expand them to cover more
  allocation and alignment sizes. The random shuffle test has been deactivated
  for linux-i386 & linux-i686 as the 32-bit sanitizer allocator doesn't
  currently randomize chunks.

Reviewers: alekseyshl, kcc

Subscribers: filcab, llvm-commits, tberghammer, danalbert, srhines, mgorny, modocache

Differential Revision: https://reviews.llvm.org/D26358

llvm-svn: 288255
This commit is contained in:
parent
2419d670c2
commit
1148dc5274
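Before the diff, an editorial sketch (not part of the commit) of the idea behind the new header: the requested size is no longer stored, only the count of unused bytes, from which the size can be recomputed. The field widths mirror the UnpackedHeader introduced in scudo_allocator.h below; the standalone names here are purely illustrative.

    #include <cstdint>

    // Illustrative copy of the new 64-bit packed header (see scudo_allocator.h).
    struct UnpackedHeader {
      uint64_t Checksum    : 16;
      uint64_t UnusedBytes : 24;  // stored instead of the requested size
      uint64_t State       : 2;
      uint64_t AllocType   : 2;
      uint64_t Offset      : 12;
      uint64_t Salt        : 8;
    };
    static_assert(sizeof(UnpackedHeader) == 8, "header must pack into 64 bits");

    // The requested size is deduced rather than stored: the usable size comes
    // from the backend's GetActuallyAllocatedSize (minus header and offset),
    // and subtracting the stored unused bytes recovers the original request.
    uint64_t requestedSize(uint64_t UsableSize, const UnpackedHeader &H) {
      return UsableSize - H.UnusedBytes;
    }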
@@ -29,6 +29,7 @@ check_cxx_compiler_flag(-std=c++11 COMPILER_RT_HAS_STD_CXX11_FLAG)
 check_cxx_compiler_flag(-ftls-model=initial-exec COMPILER_RT_HAS_FTLS_MODEL_INITIAL_EXEC)
 check_cxx_compiler_flag(-fno-lto COMPILER_RT_HAS_FNO_LTO_FLAG)
 check_cxx_compiler_flag("-Werror -msse3" COMPILER_RT_HAS_MSSE3_FLAG)
+check_cxx_compiler_flag("-Werror -msse4.2" COMPILER_RT_HAS_MSSE4_2_FLAG)
 check_cxx_compiler_flag(--sysroot=. COMPILER_RT_HAS_SYSROOT_FLAG)
 
 if(NOT WIN32 AND NOT CYGWIN)
@@ -178,7 +179,7 @@ set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64}
 set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64})
 set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64})
 set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64})
-set(ALL_SCUDO_SUPPORTED_ARCH ${X86_64})
+set(ALL_SCUDO_SUPPORTED_ARCH ${X86} ${X86_64})
 set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64})
 
 if(APPLE)
@@ -405,8 +406,7 @@ else()
                            ${ALL_SAFESTACK_SUPPORTED_ARCH})
   filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH})
   filter_available_targets(ESAN_SUPPORTED_ARCH ${ALL_ESAN_SUPPORTED_ARCH})
-  filter_available_targets(SCUDO_SUPPORTED_ARCH
-      ${ALL_SCUDO_SUPPORTED_ARCH})
+  filter_available_targets(SCUDO_SUPPORTED_ARCH ${ALL_SCUDO_SUPPORTED_ARCH})
   filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH})
 endif()
 
@@ -4,7 +4,7 @@ include_directories(..)
 
 set(SCUDO_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_rtti_flag(OFF SCUDO_CFLAGS)
-list(APPEND SCUDO_CFLAGS -msse4.2 -mcx16)
+append_list_if(COMPILER_RT_HAS_MSSE4_2_FLAG -msse4.2 SCUDO_CFLAGS)
 
 set(SCUDO_SOURCES
   scudo_allocator.cpp
@@ -22,23 +22,41 @@
 
 #include <limits.h>
 #include <pthread.h>
-#include <smmintrin.h>
 
 #include <cstring>
 
 namespace __scudo {
 
+#if SANITIZER_CAN_USE_ALLOCATOR64
+const uptr AllocatorSpace = ~0ULL;
+const uptr AllocatorSize = 0x40000000000ULL;
+typedef DefaultSizeClassMap SizeClassMap;
 struct AP {
-  static const uptr kSpaceBeg = ~0ULL;
-  static const uptr kSpaceSize = 0x10000000000ULL;
+  static const uptr kSpaceBeg = AllocatorSpace;
+  static const uptr kSpaceSize = AllocatorSize;
   static const uptr kMetadataSize = 0;
-  typedef DefaultSizeClassMap SizeClassMap;
+  typedef __scudo::SizeClassMap SizeClassMap;
   typedef NoOpMapUnmapCallback MapUnmapCallback;
   static const uptr kFlags =
       SizeClassAllocator64FlagMasks::kRandomShuffleChunks;
 };
 typedef SizeClassAllocator64<AP> PrimaryAllocator;
+#else
+// Currently, the 32-bit Sanitizer allocator has not yet benefited from all the
+// security improvements brought to the 64-bit one. This makes the 32-bit
+// version of Scudo slightly less toughened.
+static const uptr RegionSizeLog = 20;
+static const uptr NumRegions = SANITIZER_MMAP_RANGE_SIZE >> RegionSizeLog;
+# if SANITIZER_WORDSIZE == 32
+typedef FlatByteMap<NumRegions> ByteMap;
+# elif SANITIZER_WORDSIZE == 64
+typedef TwoLevelByteMap<(NumRegions >> 12), 1 << 12> ByteMap;
+# endif // SANITIZER_WORDSIZE
+typedef SizeClassMap<3, 4, 8, 16, 64, 14> SizeClassMap;
+typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap,
+    RegionSizeLog, ByteMap> PrimaryAllocator;
+#endif // SANITIZER_CAN_USE_ALLOCATOR64
 
 typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
 typedef ScudoLargeMmapAllocator SecondaryAllocator;
 typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
@@ -48,7 +66,50 @@ static ScudoAllocator &getAllocator();
 
 static thread_local Xorshift128Plus Prng;
 // Global static cookie, initialized at start-up.
-static u64 Cookie;
+static uptr Cookie;
+
+enum : u8 {
+  CRC32Software = 0,
+  CRC32Hardware = 1,
+};
+// We default to software CRC32 if the alternatives are not supported, either
+// at compilation or at runtime.
+static atomic_uint8_t HashAlgorithm = { CRC32Software };
+
+// Hardware CRC32 is supported at compilation via the following:
+// - for i386 & x86_64: -msse4.2
+// - for ARM & AArch64: -march=armv8-a+crc
+// An additional check must be performed at runtime as well to make sure the
+// emitted instructions are valid on the target host.
+#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
+# ifdef __SSE4_2__
+#  include <smmintrin.h>
+#  define HW_CRC32 FIRST_32_SECOND_64(_mm_crc32_u32, _mm_crc32_u64)
+# endif
+# ifdef __ARM_FEATURE_CRC32
+#  include <arm_acle.h>
+#  define HW_CRC32 FIRST_32_SECOND_64(__crc32cw, __crc32cd)
+# endif
+#endif
+
+// Helper function that will compute the chunk checksum, being passed all the
+// needed information as uptrs. It will opt for the hardware version of
+// the checksumming function if available.
+INLINE u32 hashUptrs(uptr Pointer, uptr *Array, uptr ArraySize, u8 HashType) {
+  u32 Crc;
+#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
+  if (HashType == CRC32Hardware) {
+    Crc = HW_CRC32(Cookie, Pointer);
+    for (uptr i = 0; i < ArraySize; i++)
+      Crc = HW_CRC32(Crc, Array[i]);
+    return Crc;
+  }
+#endif
+  Crc = computeCRC32(Cookie, Pointer);
+  for (uptr i = 0; i < ArraySize; i++)
+    Crc = computeCRC32(Crc, Array[i]);
+  return Crc;
+}
+
 struct ScudoChunk : UnpackedHeader {
   // We can't use the offset member of the chunk itself, as we would double
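Editorial aside: the block above is a compile-time guard plus a runtime CPU check; scudo_utils.cpp implements the runtime side with raw CPUID further down. For illustration only (this is not what the commit uses), the same probe can be sketched with the GCC/Clang builtin __builtin_cpu_supports:

    #include <cstdio>

    // Sketch: choose the checksum path once, the way initInternal() does with
    // testCPUFeature(CRC32CPUFeature). Hypothetical standalone code, x86 only.
    static bool hasHardwareCRC32() {
    #if defined(__SSE4_2__)
      // Compiled with -msse4.2: still verify the host actually supports it.
      return __builtin_cpu_supports("sse4.2");
    #else
      return false;  // no compile-time support: software CRC32 only
    #endif
    }

    int main() {
      std::printf("CRC32: %s\n", hasHardwareCRC32() ? "hardware" : "software");
    }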
@@ -59,19 +120,37 @@ struct ScudoChunk : UnpackedHeader {
         reinterpret_cast<uptr>(this) - (Header->Offset << MinAlignmentLog));
   }
 
-  // CRC32 checksum of the Chunk pointer and its ChunkHeader.
-  // It currently uses the Intel Nehalem SSE4.2 crc32 64-bit instruction.
+  // Returns the usable size for a chunk, meaning the amount of bytes from the
+  // beginning of the user data to the end of the backend allocated chunk.
+  uptr getUsableSize(UnpackedHeader *Header) {
+    uptr Size = getAllocator().GetActuallyAllocatedSize(getAllocBeg(Header));
+    if (Size == 0)
+      return Size;
+    return Size - AlignedChunkHeaderSize - (Header->Offset << MinAlignmentLog);
+  }
+
+  // Compute the checksum of the Chunk pointer and its ChunkHeader.
   u16 computeChecksum(UnpackedHeader *Header) const {
-    u64 HeaderHolder[2];
-    memcpy(HeaderHolder, Header, sizeof(HeaderHolder));
-    u64 Crc = _mm_crc32_u64(Cookie, reinterpret_cast<uptr>(this));
-    // This is somewhat of a shortcut. The checksum is stored in the 16 least
-    // significant bits of the first 8 bytes of the header, hence zero-ing
-    // those bits out. It would be more valid to zero the checksum field of the
-    // UnpackedHeader, but would require holding an additional copy of it.
-    Crc = _mm_crc32_u64(Crc, HeaderHolder[0] & 0xffffffffffff0000ULL);
-    Crc = _mm_crc32_u64(Crc, HeaderHolder[1]);
-    return static_cast<u16>(Crc);
+    UnpackedHeader ZeroChecksumHeader = *Header;
+    ZeroChecksumHeader.Checksum = 0;
+    uptr HeaderHolder[sizeof(UnpackedHeader) / sizeof(uptr)];
+    memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));
+    u32 Hash = hashUptrs(reinterpret_cast<uptr>(this),
+                         HeaderHolder,
+                         ARRAY_SIZE(HeaderHolder),
+                         atomic_load_relaxed(&HashAlgorithm));
+    return static_cast<u16>(Hash);
+  }
+
+  // Checks the validity of a chunk by verifying its checksum.
+  bool isValid() {
+    UnpackedHeader NewUnpackedHeader;
+    const AtomicPackedHeader *AtomicHeader =
+        reinterpret_cast<const AtomicPackedHeader *>(this);
+    PackedHeader NewPackedHeader =
+        AtomicHeader->load(std::memory_order_relaxed);
+    NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
+    return (NewUnpackedHeader.Checksum == computeChecksum(&NewUnpackedHeader));
   }
 
   // Loads and unpacks the header, verifying the checksum in the process.
@@ -81,9 +160,7 @@ struct ScudoChunk : UnpackedHeader {
     PackedHeader NewPackedHeader =
         AtomicHeader->load(std::memory_order_relaxed);
     *NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
-    if ((NewUnpackedHeader->Unused_0_ != 0) ||
-        (NewUnpackedHeader->Unused_1_ != 0) ||
-        (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader))) {
+    if (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader)) {
       dieWithMessage("ERROR: corrupted chunk header at address %p\n", this);
     }
   }
@@ -119,7 +196,7 @@ struct ScudoChunk : UnpackedHeader {
 static bool ScudoInitIsRunning = false;
 
 static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT;
-static pthread_key_t pkey;
+static pthread_key_t PThreadKey;
 
 static thread_local bool ThreadInited = false;
 static thread_local bool ThreadTornDown = false;
@@ -133,7 +210,7 @@ static void teardownThread(void *p) {
   // like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the
   // quarantine and swallowing the cache.
   if (v < PTHREAD_DESTRUCTOR_ITERATIONS) {
-    pthread_setspecific(pkey, reinterpret_cast<void *>(v + 1));
+    pthread_setspecific(PThreadKey, reinterpret_cast<void *>(v + 1));
     return;
   }
   drainQuarantine();
@@ -146,6 +223,11 @@ static void initInternal() {
   CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");
   ScudoInitIsRunning = true;
 
+  // Check if SSE4.2 is supported, if so, opt for the CRC32 hardware version.
+  if (testCPUFeature(CRC32CPUFeature)) {
+    atomic_store_relaxed(&HashAlgorithm, CRC32Hardware);
+  }
+
   initFlags();
 
   AllocatorOptions Options;
@@ -158,13 +240,13 @@ static void initInternal() {
 }
 
 static void initGlobal() {
-  pthread_key_create(&pkey, teardownThread);
+  pthread_key_create(&PThreadKey, teardownThread);
   initInternal();
 }
 
 static void NOINLINE initThread() {
   pthread_once(&GlobalInited, initGlobal);
-  pthread_setspecific(pkey, reinterpret_cast<void *>(1));
+  pthread_setspecific(PThreadKey, reinterpret_cast<void *>(1));
   getAllocator().InitCache(&Cache);
   ThreadInited = true;
 }
@@ -253,9 +335,6 @@ struct Allocator {
       FallbackQuarantineCache(LINKER_INITIALIZED) {}
 
   void init(const AllocatorOptions &Options) {
-    // Currently SSE 4.2 support is required. This might change later.
-    CHECK(testCPUFeature(SSE4_2)); // for crc32
-
     // Verify that the header offset field can hold the maximum offset. In the
     // case of the Secondary allocator, it takes care of alignment and the
     // offset will always be 0. In the case of the Primary, the worst case
@@ -266,14 +345,25 @@ struct Allocator {
     // last size class minus the header size, in multiples of MinAlignment.
     UnpackedHeader Header = {};
     uptr MaxPrimaryAlignment = 1 << MostSignificantSetBitIndex(
-        PrimaryAllocator::SizeClassMap::kMaxSize - MinAlignment);
-    uptr MaximumOffset = (MaxPrimaryAlignment - ChunkHeaderSize) >>
+        SizeClassMap::kMaxSize - MinAlignment);
+    uptr MaxOffset = (MaxPrimaryAlignment - AlignedChunkHeaderSize) >>
         MinAlignmentLog;
-    Header.Offset = MaximumOffset;
-    if (Header.Offset != MaximumOffset) {
+    Header.Offset = MaxOffset;
+    if (Header.Offset != MaxOffset) {
       dieWithMessage("ERROR: the maximum possible offset doesn't fit in the "
                      "header\n");
     }
+    // Verify that we can fit the maximum amount of unused bytes in the header.
+    // The worst case scenario would be when allocating 1 byte on a MaxAlignment
+    // alignment. Since the combined allocator currently rounds the size up to
+    // the alignment before passing it to the secondary, we end up with
+    // MaxAlignment - 1 extra bytes.
+    uptr MaxUnusedBytes = MaxAlignment - 1;
+    Header.UnusedBytes = MaxUnusedBytes;
+    if (Header.UnusedBytes != MaxUnusedBytes) {
+      dieWithMessage("ERROR: the maximum possible unused bytes doesn't fit in "
+                     "the header\n");
+    }
 
     DeallocationTypeMismatch = Options.DeallocationTypeMismatch;
     DeleteSizeMismatch = Options.DeleteSizeMismatch;
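A quick editorial check of the bound verified above (not part of the diff): UnusedBytes is a 24-bit field, and the worst case is MaxAlignment - 1 bytes of slack, which just fits.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t MaxAlignmentLog = 24;  // 16 MB, as in scudo_allocator.h
      const uint64_t MaxAlignment = 1ULL << MaxAlignmentLog;
      // Allocating 1 byte at MaxAlignment can leave MaxAlignment - 1 spare bytes.
      const uint64_t MaxUnusedBytes = MaxAlignment - 1;  // 0xffffff
      const uint64_t FieldMax = (1ULL << 24) - 1;        // 24-bit UnusedBytes
      assert(MaxUnusedBytes <= FieldMax);  // the check in init() passes
    }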
@@ -286,6 +376,17 @@ struct Allocator {
     Cookie = Prng.Next();
   }
 
+  // Helper function that checks for a valid Scudo chunk.
+  bool isValidPointer(const void *UserPtr) {
+    uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr);
+    if (!IsAligned(ChunkBeg, MinAlignment)) {
+      return false;
+    }
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    return Chunk->isValid();
+  }
+
   // Allocates a chunk.
   void *allocate(uptr Size, uptr Alignment, AllocType Type) {
     if (UNLIKELY(!ThreadInited))
@@ -302,7 +403,7 @@ struct Allocator {
     if (Size >= MaxAllowedMallocSize)
       return BackendAllocator.ReturnNullOrDieOnBadRequest();
     uptr RoundedSize = RoundUpTo(Size, MinAlignment);
-    uptr NeededSize = RoundedSize + ChunkHeaderSize;
+    uptr NeededSize = RoundedSize + AlignedChunkHeaderSize;
     if (Alignment > MinAlignment)
       NeededSize += Alignment;
     if (NeededSize >= MaxAllowedMallocSize)
@@ -321,28 +422,33 @@ struct Allocator {
     if (!Ptr)
       return BackendAllocator.ReturnNullOrDieOnOOM();
 
-    // If requested, we will zero out the entire contents of the returned chunk.
-    if (ZeroContents && BackendAllocator.FromPrimary(Ptr))
-      memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr));
-
     uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
     // If the allocation was serviced by the secondary, the returned pointer
     // accounts for ChunkHeaderSize to pass the alignment check of the combined
     // allocator. Adjust it here.
     if (!FromPrimary)
-      AllocBeg -= ChunkHeaderSize;
-    uptr ChunkBeg = AllocBeg + ChunkHeaderSize;
+      AllocBeg -= AlignedChunkHeaderSize;
+
+    uptr ActuallyAllocatedSize = BackendAllocator.GetActuallyAllocatedSize(
+        reinterpret_cast<void *>(AllocBeg));
+    // If requested, we will zero out the entire contents of the returned chunk.
+    if (ZeroContents && FromPrimary)
+      memset(Ptr, 0, ActuallyAllocatedSize);
+
+    uptr ChunkBeg = AllocBeg + AlignedChunkHeaderSize;
     if (!IsAligned(ChunkBeg, Alignment))
      ChunkBeg = RoundUpTo(ChunkBeg, Alignment);
     CHECK_LE(ChunkBeg + Size, AllocBeg + NeededSize);
     ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
     UnpackedHeader Header = {};
     Header.State = ChunkAllocated;
-    Header.Offset = (ChunkBeg - ChunkHeaderSize - AllocBeg) >> MinAlignmentLog;
+    uptr Offset = ChunkBeg - AlignedChunkHeaderSize - AllocBeg;
+    Header.Offset = Offset >> MinAlignmentLog;
     Header.AllocType = Type;
-    Header.RequestedSize = Size;
-    Header.Salt = static_cast<u16>(Prng.Next());
+    Header.UnusedBytes = ActuallyAllocatedSize - Offset -
+        AlignedChunkHeaderSize - Size;
+    Header.Salt = static_cast<u8>(Prng.Next());
     Chunk->storeHeader(&Header);
     void *UserPtr = reinterpret_cast<void *>(ChunkBeg);
     // TODO(kostyak): hooks sound like a terrible idea security wise but might
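To make the new Offset/UnusedBytes arithmetic concrete, here is an editorial walk-through with made-up numbers (a 100-byte, 64-byte-aligned request served by a hypothetical 256-byte backend block on x86_64, where AlignedChunkHeaderSize is 16):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical values for illustration only.
      const uint64_t AllocBeg = 0x1000;            // start of backend block
      const uint64_t ActuallyAllocatedSize = 256;  // backend block size
      const uint64_t AlignedChunkHeaderSize = 16;
      const uint64_t Size = 100, Alignment = 64;

      // Mirror allocate(): place the user pointer after the header, then align.
      uint64_t ChunkBeg = AllocBeg + AlignedChunkHeaderSize;       // 0x1010
      if (ChunkBeg & (Alignment - 1))
        ChunkBeg = (ChunkBeg + Alignment - 1) & ~(Alignment - 1);  // 0x1040
      uint64_t Offset = ChunkBeg - AlignedChunkHeaderSize - AllocBeg;      // 48
      uint64_t UnusedBytes =
          ActuallyAllocatedSize - Offset - AlignedChunkHeaderSize - Size;  // 92
      // The usable size (per getUsableSize) minus UnusedBytes recovers Size.
      uint64_t UsableSize =
          ActuallyAllocatedSize - AlignedChunkHeaderSize - Offset;         // 192
      assert(UsableSize - UnusedBytes == Size);
    }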
@@ -366,13 +472,14 @@ struct Allocator {
                      "aligned at address %p\n", UserPtr);
     }
     ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
     UnpackedHeader OldHeader;
     Chunk->loadHeader(&OldHeader);
     if (OldHeader.State != ChunkAllocated) {
       dieWithMessage("ERROR: invalid chunk state when deallocating address "
-                     "%p\n", Chunk);
+                     "%p\n", UserPtr);
     }
+    uptr UsableSize = Chunk->getUsableSize(&OldHeader);
     UnpackedHeader NewHeader = OldHeader;
     NewHeader.State = ChunkQuarantine;
     Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
@@ -386,69 +493,40 @@ struct Allocator {
         }
       }
     }
-    uptr Size = NewHeader.RequestedSize;
+    uptr Size = UsableSize - OldHeader.UnusedBytes;
     if (DeleteSizeMismatch) {
       if (DeleteSize && DeleteSize != Size) {
         dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n",
                        Chunk);
       }
     }
 
     if (LIKELY(!ThreadTornDown)) {
       AllocatorQuarantine.Put(&ThreadQuarantineCache,
-                              QuarantineCallback(&Cache), Chunk, Size);
+                              QuarantineCallback(&Cache), Chunk, UsableSize);
     } else {
       SpinMutexLock l(&FallbackMutex);
       AllocatorQuarantine.Put(&FallbackQuarantineCache,
                               QuarantineCallback(&FallbackAllocatorCache),
-                              Chunk, Size);
+                              Chunk, UsableSize);
     }
   }
 
-  // Returns the actual usable size of a chunk. Since this requires loading the
-  // header, we will return it in the second parameter, as it can be required
-  // by the caller to perform additional processing.
-  uptr getUsableSize(const void *Ptr, UnpackedHeader *Header) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    if (!Ptr)
-      return 0;
-    uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
-    ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
-    Chunk->loadHeader(Header);
-    // Getting the usable size of a chunk only makes sense if it's allocated.
-    if (Header->State != ChunkAllocated) {
-      dieWithMessage("ERROR: attempted to size a non-allocated chunk at "
-                     "address %p\n", Chunk);
-    }
-    uptr Size =
-        BackendAllocator.GetActuallyAllocatedSize(Chunk->getAllocBeg(Header));
-    // UsableSize works as malloc_usable_size, which is also what (AFAIU)
-    // tcmalloc's MallocExtension::GetAllocatedSize aims at providing. This
-    // means we will return the size of the chunk from the user beginning to
-    // the end of the 'user' allocation, hence us subtracting the header size
-    // and the offset from the size.
-    if (Size == 0)
-      return Size;
-    return Size - ChunkHeaderSize - (Header->Offset << MinAlignmentLog);
-  }
-
-  // Helper function that doesn't care about the header.
-  uptr getUsableSize(const void *Ptr) {
-    UnpackedHeader Header;
-    return getUsableSize(Ptr, &Header);
-  }
-
   // Reallocates a chunk. We can save on a new allocation if the new requested
   // size still fits in the chunk.
   void *reallocate(void *OldPtr, uptr NewSize) {
     if (UNLIKELY(!ThreadInited))
       initThread();
-    UnpackedHeader OldHeader;
-    uptr Size = getUsableSize(OldPtr, &OldHeader);
     uptr ChunkBeg = reinterpret_cast<uptr>(OldPtr);
     ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    UnpackedHeader OldHeader;
+    Chunk->loadHeader(&OldHeader);
+    if (OldHeader.State != ChunkAllocated) {
+      dieWithMessage("ERROR: invalid chunk state when reallocating address "
+                     "%p\n", OldPtr);
+    }
+    uptr Size = Chunk->getUsableSize(&OldHeader);
     if (OldHeader.AllocType != FromMalloc) {
       dieWithMessage("ERROR: invalid chunk type when reallocating address %p\n",
                      Chunk);
@@ -456,7 +534,7 @@ struct Allocator {
     UnpackedHeader NewHeader = OldHeader;
     // The new size still fits in the current chunk.
     if (NewSize <= Size) {
-      NewHeader.RequestedSize = NewSize;
+      NewHeader.UnusedBytes = Size - NewSize;
       Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
       return OldPtr;
     }
@@ -464,23 +542,42 @@ struct Allocator {
     // old one.
     void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc);
     if (NewPtr) {
-      uptr OldSize = OldHeader.RequestedSize;
+      uptr OldSize = Size - OldHeader.UnusedBytes;
       memcpy(NewPtr, OldPtr, Min(NewSize, OldSize));
       NewHeader.State = ChunkQuarantine;
       Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
       if (LIKELY(!ThreadTornDown)) {
         AllocatorQuarantine.Put(&ThreadQuarantineCache,
-                                QuarantineCallback(&Cache), Chunk, OldSize);
+                                QuarantineCallback(&Cache), Chunk, Size);
       } else {
         SpinMutexLock l(&FallbackMutex);
         AllocatorQuarantine.Put(&FallbackQuarantineCache,
                                 QuarantineCallback(&FallbackAllocatorCache),
-                                Chunk, OldSize);
+                                Chunk, Size);
       }
     }
     return NewPtr;
   }
 
+  // Helper function that returns the actual usable size of a chunk.
+  uptr getUsableSize(const void *Ptr) {
+    if (UNLIKELY(!ThreadInited))
+      initThread();
+    if (!Ptr)
+      return 0;
+    uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    UnpackedHeader Header;
+    Chunk->loadHeader(&Header);
+    // Getting the usable size of a chunk only makes sense if it's allocated.
+    if (Header.State != ChunkAllocated) {
+      dieWithMessage("ERROR: invalid chunk state when sizing address %p\n",
+                     Ptr);
+    }
+    return Chunk->getUsableSize(&Header);
+  }
+
   void *calloc(uptr NMemB, uptr Size) {
     if (UNLIKELY(!ThreadInited))
       initThread();
@@ -575,7 +672,7 @@ uptr scudoMallocUsableSize(void *Ptr) {
   return Instance.getUsableSize(Ptr);
 }
 
-} // namespace __scudo
+}  // namespace __scudo
 
 using namespace __scudo;
 
@@ -605,10 +702,10 @@ uptr __sanitizer_get_estimated_allocated_size(uptr size) {
   return size;
 }
 
-int __sanitizer_get_ownership(const void *p) {
-  return Instance.getUsableSize(p) != 0;
+int __sanitizer_get_ownership(const void *Ptr) {
+  return Instance.isValidPointer(Ptr);
 }
 
-uptr __sanitizer_get_allocated_size(const void *p) {
-  return Instance.getUsableSize(p);
+uptr __sanitizer_get_allocated_size(const void *Ptr) {
+  return Instance.getUsableSize(Ptr);
 }
@@ -14,10 +14,6 @@
 #ifndef SCUDO_ALLOCATOR_H_
 #define SCUDO_ALLOCATOR_H_
 
-#ifndef __x86_64__
-# error "The Scudo hardened allocator currently only supports x86_64."
-#endif
-
 #include "scudo_flags.h"
 
 #include "sanitizer_common/sanitizer_allocator.h"
@@ -39,57 +35,38 @@ enum ChunkState : u8 {
   ChunkQuarantine = 2
 };
 
-#if SANITIZER_WORDSIZE == 64
-// Our header requires 128 bits of storage on 64-bit platforms, which fits
-// nicely with the alignment requirements. Having the offset saves us from
+// Our header requires 64 bits of storage. Having the offset saves us from
 // using functions such as GetBlockBegin, that is fairly costly. Our first
 // implementation used the MetaData as well, which offers the advantage of
 // being stored away from the chunk itself, but accessing it was costly as
 // well. The header will be atomically loaded and stored using the 16-byte
 // primitives offered by the platform (likely requires cmpxchg16b support).
-typedef unsigned __int128 PackedHeader;
-struct UnpackedHeader {
-  u16 Checksum : 16;
-  uptr RequestedSize : 40; // Needed for reallocation purposes.
-  u8 State : 2;            // available, allocated, or quarantined
-  u8 AllocType : 2;        // malloc, new, new[], or memalign
-  u8 Unused_0_ : 4;
-  uptr Offset : 12;        // Offset from the beginning of the backend
-                           // allocation to the beginning of the chunk itself,
-                           // in multiples of MinAlignment. See comment about
-                           // its maximum value and test in init().
-  u64 Unused_1_ : 36;
-  u16 Salt : 16;
-};
-#elif SANITIZER_WORDSIZE == 32
-// On 32-bit platforms, our header requires 64 bits.
 typedef u64 PackedHeader;
 struct UnpackedHeader {
-  u16 Checksum : 12;
-  uptr RequestedSize : 32; // Needed for reallocation purposes.
-  u8 State : 2;            // available, allocated, or quarantined
-  u8 AllocType : 2;        // malloc, new, new[], or memalign
-  uptr Offset : 12;        // Offset from the beginning of the backend
-                           // allocation to the beginning of the chunk itself,
-                           // in multiples of MinAlignment. See comment about
-                           // its maximum value and test in Allocator::init().
-  u16 Salt : 4;
+  u64 Checksum : 16;
+  u64 UnusedBytes : 24; // Needed for reallocation purposes.
+  u64 State : 2;        // available, allocated, or quarantined
+  u64 AllocType : 2;    // malloc, new, new[], or memalign
+  u64 Offset : 12;      // Offset from the beginning of the backend
+                        // allocation to the beginning of the chunk itself,
+                        // in multiples of MinAlignment. See comment about
+                        // its maximum value and test in init().
+  u64 Salt : 8;
 };
-#else
-# error "Unsupported SANITIZER_WORDSIZE."
-#endif // SANITIZER_WORDSIZE
 
 typedef std::atomic<PackedHeader> AtomicPackedHeader;
 COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader));
 
-const uptr ChunkHeaderSize = sizeof(PackedHeader);
-
 // Minimum alignment of 8 bytes for 32-bit, 16 for 64-bit
 const uptr MinAlignmentLog = FIRST_32_SECOND_64(3, 4);
 const uptr MaxAlignmentLog = 24; // 16 MB
 const uptr MinAlignment = 1 << MinAlignmentLog;
 const uptr MaxAlignment = 1 << MaxAlignmentLog;
 
+const uptr ChunkHeaderSize = sizeof(PackedHeader);
+const uptr AlignedChunkHeaderSize =
+    (ChunkHeaderSize + MinAlignment - 1) & ~(MinAlignment - 1);
+
 struct AllocatorOptions {
   u32 QuarantineSizeMb;
   u32 ThreadLocalQuarantineSizeKb;
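As an editorial aside, the AlignedChunkHeaderSize expression above is the usual round-up-to-alignment trick; evaluating it for both word sizes:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t ChunkHeaderSize = 8;  // sizeof(PackedHeader) == sizeof(u64)
      for (uint64_t MinAlignment : {8ull, 16ull}) {  // 32-bit and 64-bit cases
        uint64_t Aligned =
            (ChunkHeaderSize + MinAlignment - 1) & ~(MinAlignment - 1);
        // 8-byte header: stays 8 under 8-byte alignment, grows to 16 under 16.
        assert(Aligned == (MinAlignment == 8 ? 8 : 16));
      }
    }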
@@ -120,6 +97,6 @@ uptr scudoMallocUsableSize(void *Ptr);
 
 #include "scudo_allocator_secondary.h"
 
-} // namespace __scudo
+}  // namespace __scudo
 
 #endif // SCUDO_ALLOCATOR_H_
@@ -32,7 +32,7 @@ class ScudoLargeMmapAllocator {
   void *Allocate(AllocatorStats *Stats, uptr Size, uptr Alignment) {
     // The Scudo frontend prevents us from allocating more than
     // MaxAllowedMallocSize, so integer overflow checks would be superfluous.
-    uptr HeadersSize = sizeof(SecondaryHeader) + ChunkHeaderSize;
+    uptr HeadersSize = sizeof(SecondaryHeader) + AlignedChunkHeaderSize;
     uptr MapSize = RoundUpTo(Size + sizeof(SecondaryHeader), PageSize);
     // Account for 2 guard pages, one before and one after the chunk.
     MapSize += 2 * PageSize;
@@ -52,27 +52,36 @@ class ScudoLargeMmapAllocator {
       UserBeg += Alignment - (UserBeg & (Alignment - 1));
       CHECK_GE(UserBeg, MapBeg);
       uptr NewMapBeg = UserBeg - HeadersSize;
-      NewMapBeg = (NewMapBeg & ~(PageSize - 1)) - PageSize;
+      NewMapBeg = RoundDownTo(NewMapBeg, PageSize) - PageSize;
       CHECK_GE(NewMapBeg, MapBeg);
-      uptr NewMapSize = MapEnd - NewMapBeg;
-      uptr Diff = NewMapBeg - MapBeg;
+      uptr NewMapSize = RoundUpTo(MapSize - Alignment, PageSize);
+      uptr NewMapEnd = NewMapBeg + NewMapSize;
+      CHECK_LE(NewMapEnd, MapEnd);
+      // Unmap the extra memory if it's large enough.
+      uptr Diff = NewMapBeg - MapBeg;
       if (Diff > PageSize)
         UnmapOrDie(reinterpret_cast<void *>(MapBeg), Diff);
+      Diff = MapEnd - NewMapEnd;
+      if (Diff > PageSize)
+        UnmapOrDie(reinterpret_cast<void *>(NewMapEnd), Diff);
       MapBeg = NewMapBeg;
       MapSize = NewMapSize;
+      MapEnd = NewMapEnd;
     }
-    uptr UserEnd = UserBeg - ChunkHeaderSize + Size;
+    uptr UserEnd = UserBeg - AlignedChunkHeaderSize + Size;
+    // For larger alignments, Alignment was added by the frontend to Size.
+    if (Alignment > MinAlignment)
+      UserEnd -= Alignment;
     CHECK_LE(UserEnd, MapEnd - PageSize);
     CHECK_EQ(MapBeg + PageSize, reinterpret_cast<uptr>(
         MmapFixedOrDie(MapBeg + PageSize, MapSize - 2 * PageSize)));
-    uptr Ptr = UserBeg - ChunkHeaderSize;
+    uptr Ptr = UserBeg - AlignedChunkHeaderSize;
     SecondaryHeader *Header = getHeader(Ptr);
     Header->MapBeg = MapBeg;
     Header->MapSize = MapSize;
     // The primary adds the whole class size to the stats when allocating a
     // chunk, so we will do something similar here. But we will not account for
     // the guard pages.
     Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize);
     Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize);
     CHECK(IsAligned(UserBeg, Alignment));
@@ -97,8 +106,8 @@ class ScudoLargeMmapAllocator {
 
   void Deallocate(AllocatorStats *Stats, void *Ptr) {
     SecondaryHeader *Header = getHeader(Ptr);
-    Stats->Sub(AllocatorStatAllocated, Header->MapSize);
-    Stats->Sub(AllocatorStatMapped, Header->MapSize);
+    Stats->Sub(AllocatorStatAllocated, Header->MapSize - 2 * PageSize);
+    Stats->Sub(AllocatorStatMapped, Header->MapSize - 2 * PageSize);
     UnmapOrDie(reinterpret_cast<void *>(Header->MapBeg), Header->MapSize);
   }
 
@@ -154,8 +163,8 @@ class ScudoLargeMmapAllocator {
     uptr MapBeg;
     uptr MapSize;
   };
-  // Check that sizeof(SecondaryHeader) is a multiple of 16.
-  COMPILER_CHECK((sizeof(SecondaryHeader) & 0xf) == 0);
+  // Check that sizeof(SecondaryHeader) is a multiple of MinAlignment.
+  COMPILER_CHECK((sizeof(SecondaryHeader) & (MinAlignment - 1)) == 0);
 
   SecondaryHeader *getHeader(uptr Ptr) {
     return reinterpret_cast<SecondaryHeader*>(Ptr - sizeof(SecondaryHeader));
@@ -90,4 +90,4 @@ Flags *getFlags() {
   return &ScudoFlags;
 }
 
-}
+}  // namespace __scudo

@@ -28,6 +28,6 @@ Flags *getFlags();
 
 void initFlags();
 
-} // namespace __scudo
+}  // namespace __scudo
 
 #endif // SCUDO_FLAGS_H_

@@ -72,4 +72,4 @@ INTERCEPTOR(int, mallopt, int cmd, int value) {
   return -1;
 }
 
-#endif // SANITIZER_LINUX
+#endif  // SANITIZER_LINUX

@@ -24,7 +24,7 @@ using namespace __scudo;
 // Fake std::nothrow_t to avoid including <new>.
 namespace std {
 struct nothrow_t {};
-} // namespace std
+}  // namespace std
 
 CXX_OPERATOR_ATTRIBUTE
 void *operator new(size_t size) {

@@ -39,4 +39,4 @@ void NORETURN CheckFailed(const char *File, int Line, const char *Condition,
              File, Line, Condition, Value1, Value2);
 }
 
-} // namespace __sanitizer
+}  // namespace __sanitizer
@@ -17,6 +17,7 @@
 #include <fcntl.h>
 #include <stdarg.h>
 #include <unistd.h>
+#include <cpuid.h>
 
 #include <cstring>
 
@@ -28,7 +29,7 @@ namespace __sanitizer {
 extern int VSNPrintf(char *buff, int buff_length, const char *format,
                      va_list args);
 
-} // namespace __sanitizer
+}  // namespace __sanitizer
 
 namespace __scudo {
 
@@ -44,60 +45,61 @@ void NORETURN dieWithMessage(const char *Format, ...) {
   Die();
 }
 
+#if defined(__x86_64__) || defined(__i386__)
+// i386 and x86_64 specific code to detect CRC32 hardware support via CPUID.
+// CRC32 requires the SSE 4.2 instruction set.
 typedef struct {
   u32 Eax;
   u32 Ebx;
   u32 Ecx;
   u32 Edx;
-} CPUIDInfo;
+} CPUIDRegs;
 
-static void getCPUID(CPUIDInfo *info, u32 leaf, u32 subleaf)
+static void getCPUID(CPUIDRegs *Regs, u32 Level)
 {
-  asm volatile("cpuid"
-      : "=a" (info->Eax), "=b" (info->Ebx), "=c" (info->Ecx), "=d" (info->Edx)
-      : "a" (leaf), "c" (subleaf)
-  );
+  __get_cpuid(Level, &Regs->Eax, &Regs->Ebx, &Regs->Ecx, &Regs->Edx);
 }
 
-// Returns true is the CPU is a "GenuineIntel" or "AuthenticAMD"
-static bool isSupportedCPU()
-{
-  CPUIDInfo Info;
-
-  getCPUID(&Info, 0, 0);
-  if (memcmp(reinterpret_cast<char *>(&Info.Ebx), "Genu", 4) == 0 &&
-      memcmp(reinterpret_cast<char *>(&Info.Edx), "ineI", 4) == 0 &&
-      memcmp(reinterpret_cast<char *>(&Info.Ecx), "ntel", 4) == 0) {
-    return true;
+CPUIDRegs getCPUFeatures() {
+  CPUIDRegs VendorRegs = {};
+  getCPUID(&VendorRegs, 0);
+  bool IsIntel =
+      (VendorRegs.Ebx == signature_INTEL_ebx) &&
+      (VendorRegs.Edx == signature_INTEL_edx) &&
+      (VendorRegs.Ecx == signature_INTEL_ecx);
+  bool IsAMD =
+      (VendorRegs.Ebx == signature_AMD_ebx) &&
+      (VendorRegs.Edx == signature_AMD_edx) &&
+      (VendorRegs.Ecx == signature_AMD_ecx);
+  // Default to an empty feature set if not on a supported CPU.
+  CPUIDRegs FeaturesRegs = {};
+  if (IsIntel || IsAMD) {
+    getCPUID(&FeaturesRegs, 1);
   }
-  if (memcmp(reinterpret_cast<char *>(&Info.Ebx), "Auth", 4) == 0 &&
-      memcmp(reinterpret_cast<char *>(&Info.Edx), "enti", 4) == 0 &&
-      memcmp(reinterpret_cast<char *>(&Info.Ecx), "cAMD", 4) == 0) {
-    return true;
-  }
-  return false;
+  return FeaturesRegs;
 }
 
-bool testCPUFeature(CPUFeature feature)
-{
-  static bool InfoInitialized = false;
-  static CPUIDInfo CPUInfo = {};
+#ifndef bit_SSE4_2
+#define bit_SSE4_2 bit_SSE42 // clang and gcc have different defines.
+#endif
 
-  if (InfoInitialized == false) {
-    if (isSupportedCPU() == true)
-      getCPUID(&CPUInfo, 1, 0);
-    else
-      UNIMPLEMENTED();
-    InfoInitialized = true;
-  }
-  switch (feature) {
-    case SSE4_2:
-      return ((CPUInfo.Ecx >> 20) & 0x1) != 0;
+bool testCPUFeature(CPUFeature Feature)
+{
+  static CPUIDRegs FeaturesRegs = getCPUFeatures();
+
+  switch (Feature) {
+    case CRC32CPUFeature: // CRC32 is provided by SSE 4.2.
+      return !!(FeaturesRegs.Ecx & bit_SSE4_2);
     default:
      break;
  }
  return false;
 }
+#else
+bool testCPUFeature(CPUFeature Feature) {
+  return false;
+}
+#endif // defined(__x86_64__) || defined(__i386__)
 
 // readRetry will attempt to read Count bytes from the Fd specified, and if
 // interrupted will retry to read additional bytes to reach Count.
@@ -117,17 +119,77 @@ static ssize_t readRetry(int Fd, u8 *Buffer, size_t Count) {
   return AmountRead;
 }
 
-// Default constructor for Xorshift128Plus seeds the state with /dev/urandom
-Xorshift128Plus::Xorshift128Plus() {
+static void fillRandom(u8 *Data, ssize_t Size) {
   int Fd = open("/dev/urandom", O_RDONLY);
-  bool Success = readRetry(Fd, reinterpret_cast<u8 *>(&State_0_),
-                           sizeof(State_0_)) == sizeof(State_0_);
-  Success &= readRetry(Fd, reinterpret_cast<u8 *>(&State_1_),
-                       sizeof(State_1_)) == sizeof(State_1_);
+  if (Fd < 0) {
+    dieWithMessage("ERROR: failed to open /dev/urandom.\n");
+  }
+  bool Success = readRetry(Fd, Data, Size) == Size;
   close(Fd);
   if (!Success) {
     dieWithMessage("ERROR: failed to read enough data from /dev/urandom.\n");
   }
 }
 
-} // namespace __scudo
+// Default constructor for Xorshift128Plus seeds the state with /dev/urandom.
+// TODO(kostyak): investigate using getrandom() if available.
+Xorshift128Plus::Xorshift128Plus() {
+  fillRandom(reinterpret_cast<u8 *>(State), sizeof(State));
+}
+
+const static u32 CRC32Table[] = {
+  0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
+  0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+  0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
+  0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+  0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+  0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+  0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
+  0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+  0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
+  0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+  0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
+  0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+  0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
+  0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+  0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+  0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+  0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
+  0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+  0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
+  0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+  0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+  0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+  0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
+  0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+  0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+  0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+  0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
+  0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+  0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
+  0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+  0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
+  0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+  0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
+  0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+  0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+  0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+  0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
+  0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+  0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
+  0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+  0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
+  0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+  0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+u32 computeCRC32(u32 Crc, uptr Data)
+{
+  for (uptr i = 0; i < sizeof(Data); i++) {
+    Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
+    Data >>= 8;
+  }
+  return Crc;
+}
+
+}  // namespace __scudo
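For illustration (editorial, not part of the commit), the software path chains computeCRC32 over the pointer and each header word exactly as hashUptrs does in scudo_allocator.cpp. The bitwise routine below is a self-contained stand-in equivalent to the table-driven one above (same 0xEDB88320 polynomial); all values are invented.

    #include <cstdint>
    #include <cstdio>

    using u32 = uint32_t;
    using uptr = uintptr_t;

    // Bitwise CRC32, equivalent to the CRC32Table version per byte.
    static u32 computeCRC32(u32 Crc, uptr Data) {
      for (uptr i = 0; i < sizeof(Data); i++) {
        Crc ^= Data & 0xff;
        for (int b = 0; b < 8; b++)
          Crc = (Crc >> 1) ^ ((Crc & 1) ? 0xEDB88320u : 0u);
        Data >>= 8;
      }
      return Crc;
    }

    int main() {
      // Sketch: seed with a cookie and the chunk pointer, fold in the header
      // word(s); the chunk header keeps only the low 16 bits of the result.
      const u32 Cookie = 0xdeadbeef;
      const uptr Ptr = 0x1000;
      const uptr HeaderWords[] = {0x12345678};
      u32 Crc = computeCRC32(Cookie, Ptr);
      for (uptr W : HeaderWords)
        Crc = computeCRC32(Crc, W);
      std::printf("checksum: 0x%04x\n", static_cast<uint16_t>(Crc));
    }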
@@ -30,9 +30,9 @@ inline Dest bit_cast(const Source& source) {
 
 void NORETURN dieWithMessage(const char *Format, ...);
 
-enum CPUFeature {
-  SSE4_2 = 0,
-  ENUM_CPUFEATURE_MAX
+enum CPUFeature {
+  CRC32CPUFeature = 0,
+  MaxCPUFeature,
 };
 bool testCPUFeature(CPUFeature feature);
 
@@ -42,18 +42,20 @@ struct Xorshift128Plus {
  public:
   Xorshift128Plus();
   u64 Next() {
-    u64 x = State_0_;
-    const u64 y = State_1_;
-    State_0_ = y;
+    u64 x = State[0];
+    const u64 y = State[1];
+    State[0] = y;
     x ^= x << 23;
-    State_1_ = x ^ y ^ (x >> 17) ^ (y >> 26);
-    return State_1_ + y;
+    State[1] = x ^ y ^ (x >> 17) ^ (y >> 26);
+    return State[1] + y;
   }
  private:
-  u64 State_0_;
-  u64 State_1_;
+  u64 State[2];
 };
 
-} // namespace __scudo
+// Software CRC32 functions, to be used when SSE 4.2 support is not detected.
+u32 computeCRC32(u32 Crc, uptr Data);
+
+}  // namespace __scudo
 
 #endif // SCUDO_UTILS_H_
@@ -1,6 +1,7 @@
 set(SCUDO_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(SCUDO_LIT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
+set(SCUDO_TESTSUITES)
 
 set(SCUDO_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
 if(NOT COMPILER_RT_STANDALONE_BUILD)
@@ -12,17 +13,30 @@ configure_lit_site_cfg(
   ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
   )
 
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO)
-  STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO})
-  STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE)
-endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
+set(SCUDO_TEST_ARCH ${SCUDO_SUPPORTED_ARCH})
+foreach(arch ${SCUDO_TEST_ARCH})
+  set(SCUDO_TEST_TARGET_ARCH ${arch})
+  string(TOLOWER "-${arch}" SCUDO_TEST_CONFIG_SUFFIX)
+
+  if(ANDROID OR ${arch} MATCHES "arm|aarch64")
+    # This is only true if we are cross-compiling.
+    # Build all tests with host compiler and use host tools.
+    set(SCUDO_TEST_TARGET_CFLAGS ${COMPILER_RT_TEST_COMPILER_CFLAGS})
+  else()
+    get_target_flags_for_arch(${arch} SCUDO_TEST_TARGET_CFLAGS)
+    string(REPLACE ";" " " SCUDO_TEST_TARGET_CFLAGS "${SCUDO_TEST_TARGET_CFLAGS}")
+  endif()
 
-if (SSE42_TRUE AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-  add_lit_testsuite(check-scudo
-    "Running the Scudo Hardened Allocator tests"
-    ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS ${SCUDO_TEST_DEPS})
-  set_target_properties(check-scudo PROPERTIES FOLDER
-    "Compiler-RT Misc")
-endif(SSE42_TRUE AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+  string(TOUPPER ${arch} ARCH_UPPER_CASE)
+  set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg)
+  list(APPEND SCUDO_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endforeach()
+
+add_lit_testsuite(check-scudo "Running the Scudo Hardened Allocator tests"
+  ${SCUDO_TESTSUITES}
+  DEPENDS ${SCUDO_TEST_DEPS})
+set_target_properties(check-scudo PROPERTIES FOLDER "Compiler-RT Misc")
@@ -1,11 +1,10 @@
 // RUN: %clang_scudo %s -o %t
 // RUN: not %run %t pointers 2>&1 | FileCheck %s
 
-// Tests that a non-16-byte aligned pointer will trigger the associated error
-// on deallocation.
+// Tests that a non MinAlignment aligned pointer will trigger the associated
+// error on deallocation.
 
 #include <assert.h>
-#include <malloc.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@@ -17,7 +16,7 @@ int main(int argc, char **argv)
     void *p = malloc(1U << 16);
     if (!p)
       return 1;
-    free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p) | 8));
+    free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p) | 1));
   }
   return 0;
 }

@@ -46,4 +46,4 @@ int main(int argc, char **argv)
   return 0;
 }
 
-// CHECK: ERROR: invalid chunk state when deallocating address
+// CHECK: ERROR: invalid chunk state

@@ -0,0 +1,28 @@
+// RUN: %clang_scudo %s -o %t
+// RUN: %run %t 2>&1
+
+// Tests that the sanitizer interface functions behave appropriately.
+
+#include <stdlib.h>
+
+#include <vector>
+
+#include <sanitizer/allocator_interface.h>
+
+int main(int argc, char **argv)
+{
+  void *p;
+  std::vector<ssize_t> sizes{1, 8, 16, 32, 1024, 32768,
+                             1 << 16, 1 << 17, 1 << 20, 1 << 24};
+  for (size_t size : sizes) {
+    p = malloc(size);
+    if (!p)
+      return 1;
+    if (!__sanitizer_get_ownership(p))
+      return 1;
+    if (__sanitizer_get_allocated_size(p) < size)
+      return 1;
+    free(p);
+  }
+  return 0;
+}
@@ -3,7 +3,7 @@
 import os
 
 # Setup config name.
-config.name = 'Scudo'
+config.name = 'Scudo' + config.name_suffix
 
 # Setup source root.
 config.test_source_root = os.path.dirname(__file__)
@@ -14,18 +14,19 @@ base_lib = os.path.join(config.compiler_rt_libdir,
 whole_archive = "-Wl,-whole-archive %s -Wl,-no-whole-archive " % base_lib
 
 # Test suffixes.
-config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.ll', '.test']
+config.suffixes = ['.c', '.cc', '.cpp']
 
 # C flags.
-c_flags = ["-std=c++11",
+c_flags = ([config.target_cflags] +
+           ["-std=c++11",
            "-lstdc++",
-           "-ldl",
-           "-lrt",
-           "-pthread",
+           "-latomic",
+           "-ldl",
+           "-pthread",
            "-fPIE",
            "-pie",
-           "-O0"]
+           "-O0"])
 
 def build_invocation(compile_flags):
   return " " + " ".join([config.clang] + compile_flags) + " "

@@ -1,5 +1,9 @@
 @LIT_SITE_CFG_IN_HEADER@
 
+config.name_suffix = "@SCUDO_TEST_CONFIG_SUFFIX@"
+config.target_arch = "@SCUDO_TEST_TARGET_ARCH@"
+config.target_cflags = "@SCUDO_TEST_TARGET_CFLAGS@"
+
 # Load common config for all compiler-rt lit tests.
 lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
 
@@ -2,9 +2,9 @@
 // RUN: %run %t 2>&1
 
 // Tests that a regular workflow of allocation, memory fill and free works as
-// intended. Also tests that a zero-sized allocation succeeds.
+// intended. Tests various sizes serviced by the primary and secondary
+// allocators.
 
 #include <malloc.h>
 #include <stdlib.h>
 #include <string.h>
@@ -13,18 +13,25 @@
 int main(int argc, char **argv)
 {
   void *p;
-  std::vector<size_t> sizes{1, 1 << 5, 1 << 10, 1 << 15, 1 << 20};
+  std::vector<ssize_t> sizes{1, 8, 16, 32, 1024, 32768,
+                             1 << 16, 1 << 17, 1 << 20, 1 << 24};
+  std::vector<int> offsets{1, 0, -1, -7, -8, -15, -16, -31, -32};
 
   p = malloc(0);
   if (!p)
     return 1;
   free(p);
-  for (size_t size : sizes) {
-    p = malloc(size);
-    if (!p)
-      return 1;
-    memset(p, 'A', size);
-    free(p);
+  for (ssize_t size : sizes) {
+    for (int offset: offsets) {
+      ssize_t actual_size = size + offset;
+      if (actual_size <= 0)
+        continue;
+      p = malloc(actual_size);
+      if (!p)
+        return 1;
+      memset(p, 0xff, actual_size);
+      free(p);
+    }
   }
 
   return 0;

@@ -31,7 +31,7 @@ int main(int argc, char **argv)
     return 1;
   free(p);
   // Tests various combinations of alignment and sizes
-  for (int i = 4; i < 20; i++) {
+  for (int i = (sizeof(void *) == 4) ? 3 : 4; i <= 24; i++) {
     alignment = 1U << i;
     for (int j = 1; j < 33; j++) {
       size = 0x800 * j;

@@ -30,7 +30,7 @@ int main(int argc, char **argv)
     free((void *)p);
   }
   if (!strcmp(argv[1], "memaligndel")) {
-    int *p = (int *)memalign(0x10, 0x10);
+    int *p = (int *)memalign(16, 16);
    if (!p)
      return 1;
    delete p;

@@ -11,12 +11,13 @@
 int main(int argc, char **argv)
 {
   assert(argc == 2);
+  ssize_t offset = sizeof(void *) == 8 ? 8 : 0;
   if (!strcmp(argv[1], "malloc")) {
     // Simulate a header corruption of an allocated chunk (1-bit)
     void *p = malloc(1U << 4);
     if (!p)
       return 1;
-    ((char *)p)[-1] ^= 1;
+    ((char *)p)[-(offset + 1)] ^= 1;
     free(p);
   }
   if (!strcmp(argv[1], "quarantine")) {
@@ -25,7 +26,7 @@ int main(int argc, char **argv)
       return 1;
     free(p);
     // Simulate a header corruption of a quarantined chunk
-    ((char *)p)[-2] ^= 1;
+    ((char *)p)[-(offset + 2)] ^= 1;
     // Trigger the quarantine recycle
     for (int i = 0; i < 0x100; i++) {
       p = malloc(1U << 16);

@@ -4,7 +4,6 @@
 // Verifies that calling malloc in a preinit_array function succeeds, and that
 // the resulting pointer can be freed at program termination.
 
-#include <malloc.h>
 #include <stdlib.h>
 #include <string.h>
 

@@ -7,6 +7,7 @@
 // RUN: %run %t 10000 > %T/random_shuffle_tmp_dir/out2
 // RUN: not diff %T/random_shuffle_tmp_dir/out?
 // RUN: rm -rf %T/random_shuffle_tmp_dir
+// UNSUPPORTED: i386-linux,i686-linux
 
 // Tests that the allocator shuffles the chunks before returning to the user.
 

@@ -20,7 +20,7 @@ int main(int argc, char **argv)
 {
   void *p, *old_p;
   // Those sizes will exercise both allocators (Primary & Secondary).
-  std::vector<size_t> sizes{1, 1 << 5, 1 << 10, 1 << 15, 1 << 20};
+  std::vector<size_t> sizes{1, 16, 1024, 32768, 1 << 16, 1 << 17, 1 << 20};
 
   assert(argc == 2);
   for (size_t size : sizes) {