llvm-project/flang/runtime/utf.h

//===-- runtime/utf.h -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)
// code points.
//
// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
// 7-bit ASCII is also valid UTF-8.
//
// Larger values are encoded with a start byte in [C0 .. FE] that carries
// the length of the encoding and some of the upper bits of the value, followed
// by one or more bytes in the range [80 .. BF].
//
// Specifically, the first byte holds two or more uppermost set bits,
// a zero bit, and some payload; the second and later bytes each start with
// their uppermost bit set, the next bit clear, and six bits of payload.
// Payload parcels are in big-endian order.  All bytes must be present in a
// valid sequence; i.e., low-order sezo bits must be explicit.  UTF-8 is
// self-synchronizing on input as any byte value cannot be both a valid
// first byte or trailing byte.
//
// 0xxxxxxx - 7 bit ASCII
// 110xxxxx 10xxxxxx - 11-bit value
// 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
// 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
//
// Canonical UTF-8 sequences should be minimal, and our output is so, but
// we do not reject non-minimal sequences on input.  Unicode only defines
// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
// standard maximum.  However, we support extended forms up to 32 bits so that
// CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.

#ifndef FORTRAN_RUNTIME_UTF_H_
#define FORTRAN_RUNTIME_UTF_H_

#include <cstddef>
#include <cstdint>
#include <optional>

namespace Fortran::runtime {

// Derive the length of a UTF-8 character encoding from its first byte.
// A zero result signifies an invalid encoding.
extern const std::uint8_t UTF8FirstByteTable[256];
static inline std::size_t MeasureUTF8Bytes(char first) {
  return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
}

static constexpr std::size_t maxUTF8Bytes{7};

// Ensure that all bytes are present in sequence in the input buffer
// before calling; use MeasureUTF8Bytes(first byte) to count them.
std::optional<char32_t> DecodeUTF8(const char *);

// Ensure that at least maxUTF8Bytes remain in the output
// buffer before calling.
std::size_t EncodeUTF8(char *, char32_t);

} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_UTF_H_
[flang] Initial UTF-8 support in runtime I/O Implements UTF-8 encoding and decoding for external units with OPEN(ENCODING='UTF-8'). This encoding applies to default CHARACTER values that are not 7-bit ASCII as well as to the wide CHARACTER kinds 2 and 4. Basic testing is in place via direct calls to the runtime I/O APIs, but serious checkout awaits lowering support of the wide CHARACTER kinds. Differential Revision: https://reviews.llvm.org/D122038 2022-03-17 03:32:03 +08:00			`//===-- runtime/utf.h -----------------------------------------------------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`

			`// UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)`
			`// code points.`
			`//`
			`// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true`
			`// 7-bit ASCII is also valid UTF-8.`
			`//`
			`// Larger values are encoded with a start byte in [C0 .. FE] that carries`
			`// the length of the encoding and some of the upper bits of the value, followed`
			`// by one or more bytes in the range [80 .. BF].`
			`//`
			`// Specifically, the first byte holds two or more uppermost set bits,`
			`// a zero bit, and some payload; the second and later bytes each start with`
			`// their uppermost bit set, the next bit clear, and six bits of payload.`
			`// Payload parcels are in big-endian order. All bytes must be present in a`
			`// valid sequence; i.e., low-order sezo bits must be explicit. UTF-8 is`
			`// self-synchronizing on input as any byte value cannot be both a valid`
			`// first byte or trailing byte.`
			`//`
			`// 0xxxxxxx - 7 bit ASCII`
			`// 110xxxxx 10xxxxxx - 11-bit value`
			`// 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value`
			`// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value`
			`// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value`
			`// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value`
			`// 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value`
			`//`
			`// Canonical UTF-8 sequences should be minimal, and our output is so, but`
			`// we do not reject non-minimal sequences on input. Unicode only defines`
			`// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual`
			`// standard maximum. However, we support extended forms up to 32 bits so that`
			`// CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.`

			`#ifndef FORTRAN_RUNTIME_UTF_H_`
			`#define FORTRAN_RUNTIME_UTF_H_`

			`#include <cstddef>`
			`#include <cstdint>`
			`#include <optional>`

			`namespace Fortran::runtime {`

			`// Derive the length of a UTF-8 character encoding from its first byte.`
			`// A zero result signifies an invalid encoding.`
			`extern const std::uint8_t UTF8FirstByteTable[256];`
			`static inline std::size_t MeasureUTF8Bytes(char first) {`
			`return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];`
			`}`

			`static constexpr std::size_t maxUTF8Bytes{7};`

			`// Ensure that all bytes are present in sequence in the input buffer`
			`// before calling; use MeasureUTF8Bytes(first byte) to count them.`
			`std::optional<char32_t> DecodeUTF8(const char *);`

			`// Ensure that at least maxUTF8Bytes remain in the output`
			`// buffer before calling.`
			`std::size_t EncodeUTF8(char *, char32_t);`

			`} // namespace Fortran::runtime`
			`#endif // FORTRAN_RUNTIME_UTF_H_`