[flang] Extirpate NCHARACTER type, NC"" literals, and EUC-JP Hollerith

Original-commit: flang-compiler/f18@10a592a591
Reviewed-on: https://github.com/flang-compiler/f18/pull/535
Tree-same-pre-rewrite: false
This commit is contained in:
peter klausler 2019-06-28 11:16:37 -07:00
parent d4119baffc
commit dbb202c5be
14 changed files with 38 additions and 161 deletions

View File

@ -23,7 +23,6 @@ Extensions, deletions, and legacy features supported by default
* Signed complex literal constants
* DEC `STRUCTURE`, `RECORD`, `UNION`, and `MAP`
* Structure field access with `.field`
* `NCHARACTER` type and `NC` Kanji character literals
* `BYTE` as synonym for `INTEGER(KIND=1)`
* Quad precision REAL literals with `Q`
* `X` prefix/suffix as synonym for `Z` on hexadecimal literals
@ -111,3 +110,4 @@ Extensions and legacy features deliberately not supported
* Comparsion of LOGICAL with ==/.EQ. rather than .EQV. (also .NEQV.) (PGI/Intel)
* Procedure pointers in COMMON blocks (PGI/Intel)
* Underindexing multi-dimensional arrays (e.g., A(1) rather than A(1,1)) (PGI only)
* Legacy PGI `NCHARACTER` type and `NC` Kanji character literals

View File

@ -1,4 +1,4 @@
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -36,7 +36,6 @@ struct SetOfChars {
// character literals. We repurpose '^' and '?' for newline and unknown
// characters (resp.), leaving the others alone in case this code might
// be useful in preprocssing.
// TODO: EBCDIC?
if (c == '\n') {
// map newline to '^'
c = '^';

View File

@ -119,40 +119,9 @@ template<> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) {
return result;
}
// These are placeholders; the actual mapping is complicated.
static char32_t JIS_0208ToUCS(char32_t jis) { return jis | 0x80000; }
static char32_t JIS_0212ToUCS(char32_t jis) { return jis | 0x90000; }
static bool IsUCSJIS_0212(char32_t ucs) { return (ucs & 0x90000) == 0x90000; }
static char32_t UCSToJIS(char32_t ucs) { return ucs & 0xffff; }
template<> EncodedCharacter EncodeCharacter<Encoding::EUC_JP>(char32_t ucs) {
EncodedCharacter result;
if (ucs <= 0x7f) {
result.buffer[0] = ucs;
result.bytes = 1;
} else if (ucs <= 0xff) {
result.buffer[0] = '\x8e'; // JIS X 0201
result.buffer[1] = ucs;
result.bytes = 2;
} else if (IsUCSJIS_0212(ucs)) { // JIS X 0212
char32_t jis{UCSToJIS(ucs)};
result.buffer[0] = '\x8f';
result.buffer[1] = 0x80 ^ (jis >> 8);
result.buffer[2] = 0x80 ^ jis;
result.bytes = 3;
} else { // JIS X 0208
char32_t jis{UCSToJIS(ucs)};
result.buffer[0] = 0x80 ^ (jis >> 8);
result.buffer[1] = 0x80 ^ jis;
result.bytes = 2;
}
return result;
}
EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
switch (encoding) {
case Encoding::LATIN_1: return EncodeCharacter<Encoding::LATIN_1>(ucs);
case Encoding::EUC_JP: return EncodeCharacter<Encoding::EUC_JP>(ucs);
case Encoding::UTF_8: return EncodeCharacter<Encoding::UTF_8>(ucs);
default: CRASH_NO_CASE;
}
@ -171,7 +140,7 @@ std::string EncodeString(const STRING &str) {
template std::string EncodeString<Encoding::LATIN_1, std::string>(
const std::string &);
template std::string EncodeString<Encoding::EUC_JP, std::u16string>(
template std::string EncodeString<Encoding::UTF_8, std::u16string>(
const std::u16string &);
template std::string EncodeString<Encoding::UTF_8, std::u32string>(
const std::u32string &);
@ -213,27 +182,6 @@ DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
}
}
template<>
DecodedCharacter DecodeRawCharacter<Encoding::EUC_JP>(
const char *cp, std::size_t bytes) {
auto p{reinterpret_cast<const std::uint8_t *>(cp)};
char32_t ch{*p};
if (ch <= 0x7f) {
return {ch, 1};
} else if (ch == 0x8e) {
if (bytes >= 2) {
return {p[1], 2}; // JIS X 0201
}
} else if (ch == 0x8f) { // JIS X 0212
if (bytes >= 3) {
return {JIS_0212ToUCS(((p[1] << 8) | p[2]) ^ 0x8080), 3};
}
} else if (bytes >= 2) { // assume JIS X 0208
return {JIS_0208ToUCS(((ch << 8) | p[1]) ^ 0x8080), 2};
}
return {};
}
static DecodedCharacter DecodeEscapedCharacter(
const char *cp, std::size_t bytes) {
if (cp[0] == '\\' && bytes >= 2) {
@ -297,8 +245,6 @@ DecodedCharacter DecodeCharacter(
template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
const char *, std::size_t, bool);
template DecodedCharacter DecodeCharacter<Encoding::EUC_JP>(
const char *, std::size_t, bool);
template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
const char *, std::size_t, bool);
@ -307,17 +253,15 @@ DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,
switch (encoding) {
case Encoding::LATIN_1:
return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes);
case Encoding::EUC_JP:
return DecodeCharacter<Encoding::EUC_JP>(cp, bytes, backslashEscapes);
case Encoding::UTF_8:
return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes);
default: CRASH_NO_CASE;
}
}
template<Encoding ENCODING>
StringFor<ENCODING> DecodeString(const std::string &s, bool backslashEscapes) {
StringFor<ENCODING> result;
template<typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &s, bool backslashEscapes) {
RESULT result;
const char *p{s.c_str()};
for (auto bytes{s.size()}; bytes != 0;) {
DecodedCharacter decoded{
@ -337,9 +281,10 @@ StringFor<ENCODING> DecodeString(const std::string &s, bool backslashEscapes) {
return result;
}
template std::string DecodeString<Encoding::LATIN_1>(const std::string &, bool);
template std::u16string DecodeString<Encoding::EUC_JP>(
template std::string DecodeString<std::string, Encoding::LATIN_1>(
const std::string &, bool);
template std::u32string DecodeString<Encoding::UTF_8>(
template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
const std::string &, bool);
template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
const std::string &, bool);
}

View File

@ -18,7 +18,6 @@
// Define some character classification predicates and
// conversions here to avoid dependences upon <cctype> and
// also to accomodate Fortran tokenization.
// TODO: EBCDIC?
#include <cstddef>
#include <optional>
@ -31,8 +30,7 @@ namespace Fortran::parser {
// The specific encodings that we can handle include:
// LATIN_1: ISO 8859-1 Latin-1
// UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
// EUC_JP: 1-3 byte encoding of JIS X 0208 / 0212
enum class Encoding { LATIN_1, UTF_8, EUC_JP };
enum class Encoding { LATIN_1, UTF_8 };
inline constexpr bool IsUpperCaseLetter(char ch) {
return ch >= 'A' && ch <= 'Z';
@ -147,7 +145,6 @@ struct EncodedCharacter {
template<Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
template<> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
template<> EncodedCharacter EncodeCharacter<Encoding::EUC_JP>(char32_t);
template<> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);
EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
@ -156,8 +153,6 @@ template<Encoding ENCODING, typename STRING>
std::string EncodeString(const STRING &);
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
const std::string &);
extern template std::string EncodeString<Encoding::EUC_JP, std::u16string>(
const std::u16string &);
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
const std::u32string &);
@ -194,7 +189,7 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
std::string QuoteCharacterLiteral(const std::string &,
bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
bool backslashEscapes = true, Encoding = Encoding::EUC_JP);
bool backslashEscapes = true, Encoding = Encoding::UTF_8);
std::string QuoteCharacterLiteral(const std::u32string &,
bool backslashEscapes = true, Encoding = Encoding::UTF_8);
@ -210,9 +205,7 @@ DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
template<>
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
const char *, std::size_t);
template<>
DecodedCharacter DecodeRawCharacter<Encoding::EUC_JP>(
const char *, std::size_t);
template<>
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
@ -222,34 +215,19 @@ DecodedCharacter DecodeCharacter(
const char *, std::size_t, bool backslashEscapes);
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::EUC_JP>(
const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
const char *, std::size_t, bool);
DecodedCharacter DecodeCharacter(
Encoding, const char *, std::size_t, bool backslashEscapes);
template<Encoding ENCODING> struct StringForEncoding;
template<> struct StringForEncoding<Encoding::LATIN_1> {
using type = std::string;
};
template<> struct StringForEncoding<Encoding::EUC_JP> {
using type = std::u16string;
};
template<> struct StringForEncoding<Encoding::UTF_8> {
using type = std::u32string;
};
template<Encoding ENCODING>
using StringFor = typename StringForEncoding<ENCODING>::type;
template<Encoding ENCODING>
StringFor<ENCODING> DecodeString(const std::string &, bool backslashEscapes);
extern template std::string DecodeString<Encoding::LATIN_1>(
template<typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &, bool backslashEscapes);
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
const std::string &, bool);
extern template std::u16string DecodeString<Encoding::EUC_JP>(
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
const std::string &, bool);
extern template std::u32string DecodeString<Encoding::UTF_8>(
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
const std::string &, bool);
}
#endif // FORTRAN_PARSER_CHARACTERS_H_

View File

@ -352,7 +352,6 @@ public:
NODE(parser::IntrinsicTypeSpec, DoubleComplex)
NODE(parser::IntrinsicTypeSpec, DoublePrecision)
NODE(parser::IntrinsicTypeSpec, Logical)
NODE(parser::IntrinsicTypeSpec, NCharacter)
NODE(parser::IntrinsicTypeSpec, Real)
NODE(parser, IoControlSpec)
NODE(parser::IoControlSpec, Asynchronous)
@ -364,7 +363,6 @@ public:
NODE(parser, IoUnit)
NODE(parser, Keyword)
NODE(parser, KindParam)
NODE(parser::KindParam, Kanji)
NODE(parser, KindSelector)
NODE(parser::KindSelector, StarSize)
NODE(parser, LabelDoStmt)

View File

@ -24,7 +24,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
FixedFormContinuationWithColumn1Ampersand, LogicalAbbreviations,
XOROperator, PunctuationInNames, OptionalFreeFormSpace, BOZExtensions,
EmptyStatement, AlternativeNE, ExecutionPartNamelist, DECStructures,
DoubleComplex, Kanji, Byte, StarKind, QuadPrecision, SlashInitialization,
DoubleComplex, Byte, StarKind, QuadPrecision, SlashInitialization,
TripletInArrayConstructor, MissingColons, SignedComplexLiteral,
OldStyleParameter, ComplexConstructor, PercentLOC, SignedPrimary, FileName,
Convert, Dispose, IOListLeadingComma, AbbreviatedEditDescriptor,

View File

@ -530,7 +530,7 @@ TYPE_CONTEXT_PARSER("declaration type spec"_en_US,
// integer-type-spec | REAL [kind-selector] | DOUBLE PRECISION |
// COMPLEX [kind-selector] | CHARACTER [char-selector] |
// LOGICAL [kind-selector]
// Extensions: DOUBLE COMPLEX, NCHARACTER, BYTE
// Extensions: DOUBLE COMPLEX, BYTE
TYPE_CONTEXT_PARSER("intrinsic type spec"_en_US,
first(construct<IntrinsicTypeSpec>(integerTypeSpec),
construct<IntrinsicTypeSpec>(
@ -546,9 +546,6 @@ TYPE_CONTEXT_PARSER("intrinsic type spec"_en_US,
construct<IntrinsicTypeSpec>("DOUBLE COMPLEX" >>
extension<LanguageFeature::DoubleComplex>(
construct<IntrinsicTypeSpec::DoubleComplex>())),
construct<IntrinsicTypeSpec>(extension<LanguageFeature::Kanji>(
construct<IntrinsicTypeSpec::NCharacter>(
"NCHARACTER" >> maybe(Parser<LengthSelector>{})))),
extension<LanguageFeature::Byte>(
construct<IntrinsicTypeSpec>(construct<IntegerTypeSpec>(
"BYTE" >> construct<std::optional<KindSelector>>(pure(1)))))))
@ -661,7 +658,6 @@ TYPE_PARSER(construct<CharLength>(parenthesized(typeParamValue)) ||
// [kind-param _] " [rep-char]... "
// "rep-char" is any non-control character. Doubled interior quotes are
// combined. Backslash escapes can be enabled.
// PGI extension: nc'...' is Kanji.
// N.B. charLiteralConstantWithoutKind does not skip preceding space.
// N.B. the parsing of "name" takes care to not consume the '_'.
constexpr auto charLiteralConstantWithoutKind{
@ -671,11 +667,7 @@ TYPE_CONTEXT_PARSER("CHARACTER literal constant"_en_US,
construct<CharLiteralConstant>(
kindParam / underscore, charLiteralConstantWithoutKind) ||
construct<CharLiteralConstant>(construct<std::optional<KindParam>>(),
space >> charLiteralConstantWithoutKind) ||
construct<CharLiteralConstant>(
construct<std::optional<KindParam>>(
construct<KindParam>(construct<KindParam::Kanji>("NC"_tok))),
charLiteralConstantWithoutKind))
space >> charLiteralConstantWithoutKind))
// deprecated: Hollerith literals
constexpr auto rawHollerithLiteral{

View File

@ -660,7 +660,7 @@ struct CharSelector {
// integer-type-spec | REAL [kind-selector] | DOUBLE PRECISION |
// COMPLEX [kind-selector] | CHARACTER [char-selector] |
// LOGICAL [kind-selector]
// Extensions: DOUBLE COMPLEX, NCHARACTER (Kanji)
// Extensions: DOUBLE COMPLEX
struct IntrinsicTypeSpec {
UNION_CLASS_BOILERPLATE(IntrinsicTypeSpec);
struct Real {
@ -685,9 +685,8 @@ struct IntrinsicTypeSpec {
std::optional<KindSelector> kind;
};
EMPTY_CLASS(DoubleComplex);
WRAPPER_CLASS(NCharacter, std::optional<LengthSelector>);
std::variant<IntegerTypeSpec, Real, DoublePrecision, Complex, Character,
Logical, DoubleComplex, NCharacter>
Logical, DoubleComplex>
u;
};
@ -737,8 +736,7 @@ struct DeclarationTypeSpec {
// R709 kind-param -> digit-string | scalar-int-constant-name
struct KindParam {
UNION_CLASS_BOILERPLATE(KindParam);
EMPTY_CLASS(Kanji);
std::variant<std::uint64_t, Scalar<Integer<Constant<Name>>>, Kanji> u;
std::variant<std::uint64_t, Scalar<Integer<Constant<Name>>>> u;
};
// R707 signed-int-literal-constant -> [sign] int-literal-constant

View File

@ -166,8 +166,7 @@ TokenSequence &TokenSequence::ToLowerCase() {
*p = ToLowerCaseLetter(*p);
}
} else {
// Kanji NC'...' character literal or literal with kind-param prefix
// name (e.g., K_"ABC").
// Literal with kind-param prefix name (e.g., K_"ABC").
for (; *p != limit[-1]; ++p) {
*p = ToLowerCaseLetter(*p);
}

View File

@ -132,7 +132,6 @@ public:
void Post(const IntrinsicTypeSpec::DoubleComplex &) {
Word("DOUBLE COMPLEX");
}
void Before(const IntrinsicTypeSpec::NCharacter &x) { Word("NCHARACTER"); }
void Before(const IntegerTypeSpec &x) { // R705
Word("INTEGER");
}
@ -188,21 +187,12 @@ public:
void Unparse(const CharLiteralConstant &x) { // R724
const auto &str{std::get<std::string>(x.t)};
if (const auto &k{std::get<std::optional<KindParam>>(x.t)}) {
if (std::holds_alternative<KindParam::Kanji>(k->u)) {
Word("NC");
std::u16string decoded{DecodeString<Encoding::EUC_JP>(str, true)};
std::string encoded{EncodeString<Encoding::EUC_JP>(decoded)};
Put(QuoteCharacterLiteral(encoded, backslashEscapes_));
} else {
Walk(*k), Put('_');
PutNormalized(str);
}
} else {
PutNormalized(str);
Walk(*k), Put('_');
}
PutNormalized(str);
}
void Unparse(const HollerithLiteralConstant &x) {
std::u32string ucs{DecodeString<Encoding::UTF_8>(x.v, false)};
auto ucs{DecodeString<std::u32string, Encoding::UTF_8>(x.v, false)};
Unparse(ucs.size());
Put('H');
for (char32_t ch : ucs) {
@ -2634,7 +2624,7 @@ void UnparseVisitor::Put(const std::string &str) {
}
void UnparseVisitor::PutNormalized(const std::string &str) {
std::string decoded{DecodeString<Encoding::LATIN_1>(str, true)};
auto decoded{DecodeString<std::string, Encoding::LATIN_1>(str, true)};
std::string encoded{EncodeString<Encoding::LATIN_1>(decoded)};
Put(QuoteCharacterLiteral(encoded, backslashEscapes_));
}

View File

@ -321,8 +321,7 @@ template<typename A> MaybeExpr AsMaybeExpr(std::optional<A> &&x) {
// Type kind parameter values for literal constants.
int ExpressionAnalyzer::AnalyzeKindParam(
const std::optional<parser::KindParam> &kindParam, int defaultKind,
int kanjiKind /* = -1 */) {
const std::optional<parser::KindParam> &kindParam, int defaultKind) {
if (!kindParam.has_value()) {
return defaultKind;
}
@ -341,13 +340,6 @@ int ExpressionAnalyzer::AnalyzeKindParam(
}
return defaultKind;
},
[&](parser::KindParam::Kanji) {
if (kanjiKind >= 0) {
return kanjiKind;
}
Say("Kanji not allowed here"_err_en_US);
return defaultKind;
},
},
kindParam->u);
}
@ -522,13 +514,16 @@ MaybeExpr ExpressionAnalyzer::AnalyzeString(std::string &&string, int kind) {
switch (kind) {
case 1:
return AsGenericExpr(Constant<Type<TypeCategory::Character, 1>>{
parser::DecodeString<parser::Encoding::LATIN_1>(string, true)});
parser::DecodeString<std::string, parser::Encoding::LATIN_1>(
string, true)});
case 2:
return AsGenericExpr(Constant<Type<TypeCategory::Character, 2>>{
parser::DecodeString<parser::Encoding::EUC_JP>(string, true)});
parser::DecodeString<std::u16string, parser::Encoding::UTF_8>(
string, true)});
case 4:
return AsGenericExpr(Constant<Type<TypeCategory::Character, 4>>{
parser::DecodeString<parser::Encoding::UTF_8>(string, true)});
parser::DecodeString<std::u32string, parser::Encoding::UTF_8>(
string, true)});
default: CRASH_NO_CASE;
}
}

View File

@ -290,8 +290,8 @@ private:
}
// Analysis subroutines
int AnalyzeKindParam(const std::optional<parser::KindParam> &,
int defaultKind, int kanjiKind = -1);
int AnalyzeKindParam(
const std::optional<parser::KindParam> &, int defaultKind);
template<typename PARSED> MaybeExpr ExprOrVariable(const PARSED &);
template<typename PARSED> MaybeExpr IntLiteralConstant(const PARSED &);
MaybeExpr AnalyzeString(std::string &&, int kind);

View File

@ -693,7 +693,6 @@ public:
void Post(const parser::IntrinsicTypeSpec::Complex &);
void Post(const parser::IntrinsicTypeSpec::Logical &);
void Post(const parser::IntrinsicTypeSpec::Character &);
void Post(const parser::IntrinsicTypeSpec::NCharacter &);
void Post(const parser::CharSelector::LengthAndKind &);
void Post(const parser::CharLength &);
void Post(const parser::LengthSelector &);
@ -2764,15 +2763,6 @@ void DeclarationVisitor::Post(const parser::IntrinsicTypeSpec::Character &x) {
std::move(*charInfo_.length), std::move(*charInfo_.kind)));
charInfo_ = {};
}
void DeclarationVisitor::Post(const parser::IntrinsicTypeSpec::NCharacter &x) {
if (!charInfo_.length) {
charInfo_.length = ParamValue{1};
}
CHECK(!charInfo_.kind.has_value());
SetDeclTypeSpec(currScope().MakeCharacterType(
std::move(*charInfo_.length), KindExpr{2 /* EUC_JP */}));
charInfo_ = {};
}
void DeclarationVisitor::Post(const parser::CharSelector::LengthAndKind &x) {
charInfo_.kind = EvaluateSubscriptIntExpr(x.kind);
if (x.length) {
@ -3149,9 +3139,6 @@ bool DeclarationVisitor::Pre(const parser::ProcInterface &x) {
} else if (name->source == "doublecomplex") {
proc.u = parser::IntrinsicTypeSpec{
parser::IntrinsicTypeSpec::DoubleComplex{}};
} else if (name->source == "ncharacter") {
proc.u = parser::IntrinsicTypeSpec{
parser::IntrinsicTypeSpec::NCharacter{std::nullopt}};
}
}
}

View File

@ -465,8 +465,6 @@ int main(int argc, char *const argv[]) {
driver.encoding = Fortran::parser::Encoding::UTF_8;
} else if (arg == "-flatin") {
driver.encoding = Fortran::parser::Encoding::LATIN_1;
} else if (arg == "-fkanji" || arg == "-Mx,125,4") {
driver.encoding = Fortran::parser::Encoding::EUC_JP;
} else if (arg == "-help" || arg == "--help" || arg == "-?") {
std::cerr
<< "f18 options:\n"
@ -481,8 +479,6 @@ int main(int argc, char *const argv[]) {
<< " -ed enable fixed form D lines\n"
<< " -E prescan & preprocess only\n"
<< " -module dir module output directory (default .)\n"
<< " -fkanji interpret source as EUC_JP rather than "
"UTF-8\n"
<< " -flatin interpret source as Latin-1 (ISO 8859-1) "
"rather than UTF-8\n"
<< " -fparse-only parse only, no output except messages\n"