forked from OSchip/llvm-project
69 lines
2.8 KiB
C
69 lines
2.8 KiB
C
|
//===-- runtime/utf.h -----------------------------------------------------===//
|
||
|
//
|
||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
//
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
|
||
|
// UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)
|
||
|
// code points.
|
||
|
//
|
||
|
// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
|
||
|
// 7-bit ASCII is also valid UTF-8.
|
||
|
//
|
||
|
// Larger values are encoded with a start byte in [C0 .. FE] that carries
|
||
|
// the length of the encoding and some of the upper bits of the value, followed
|
||
|
// by one or more bytes in the range [80 .. BF].
|
||
|
//
|
||
|
// Specifically, the first byte holds two or more uppermost set bits,
|
||
|
// a zero bit, and some payload; the second and later bytes each start with
|
||
|
// their uppermost bit set, the next bit clear, and six bits of payload.
|
||
|
// Payload parcels are in big-endian order. All bytes must be present in a
|
||
|
// valid sequence; i.e., low-order sezo bits must be explicit. UTF-8 is
|
||
|
// self-synchronizing on input as any byte value cannot be both a valid
|
||
|
// first byte or trailing byte.
|
||
|
//
|
||
|
// 0xxxxxxx - 7 bit ASCII
|
||
|
// 110xxxxx 10xxxxxx - 11-bit value
|
||
|
// 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
|
||
|
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
|
||
|
// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
|
||
|
// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
|
||
|
// 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
|
||
|
//
|
||
|
// Canonical UTF-8 sequences should be minimal, and our output is so, but
|
||
|
// we do not reject non-minimal sequences on input. Unicode only defines
|
||
|
// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
|
||
|
// standard maximum. However, we support extended forms up to 32 bits so that
|
||
|
// CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.
|
||
|
|
||
|
#ifndef FORTRAN_RUNTIME_UTF_H_
|
||
|
#define FORTRAN_RUNTIME_UTF_H_
|
||
|
|
||
|
#include <cstddef>
|
||
|
#include <cstdint>
|
||
|
#include <optional>
|
||
|
|
||
|
namespace Fortran::runtime {
|
||
|
|
||
|
// Derive the length of a UTF-8 character encoding from its first byte.
|
||
|
// A zero result signifies an invalid encoding.
|
||
|
extern const std::uint8_t UTF8FirstByteTable[256];
|
||
|
static inline std::size_t MeasureUTF8Bytes(char first) {
|
||
|
return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
|
||
|
}
|
||
|
|
||
|
static constexpr std::size_t maxUTF8Bytes{7};
|
||
|
|
||
|
// Ensure that all bytes are present in sequence in the input buffer
|
||
|
// before calling; use MeasureUTF8Bytes(first byte) to count them.
|
||
|
std::optional<char32_t> DecodeUTF8(const char *);
|
||
|
|
||
|
// Ensure that at least maxUTF8Bytes remain in the output
|
||
|
// buffer before calling.
|
||
|
std::size_t EncodeUTF8(char *, char32_t);
|
||
|
|
||
|
} // namespace Fortran::runtime
|
||
|
#endif // FORTRAN_RUNTIME_UTF_H_
|