Implement comment saving mode: the -C and -CC options.

llvm-svn: 38783
Chris Lattner 2006-07-29 06:30:25 +00:00
parent 2be4115465
commit 457fc15bc5
8 changed files with 125 additions and 44 deletions

View File

@@ -99,6 +99,11 @@ static void OutputString(const char *Ptr, unsigned Size) {
static cl::opt<bool>
DisableLineMarkers("P", cl::desc("Disable linemarker output in -E mode"));
static cl::opt<bool>
EnableCommentOutput("C", cl::desc("Enable comment output in -E mode"));
static cl::opt<bool>
EnableMacroCommentOutput("CC", cl::desc("Enable comment output in -E mode, "
"even from macro expansions"));
static unsigned EModeCurLine;
static std::string EModeCurFilename;
@@ -357,7 +362,12 @@ static bool AvoidConcat(const LexerToken &PrevTok, const LexerToken &Tok,
/// DoPrintPreprocessedInput - This implements -E mode.
///
void clang::DoPrintPreprocessedInput(Preprocessor &PP) {
void clang::DoPrintPreprocessedInput(Preprocessor &PP, LangOptions &Options) {
if (EnableCommentOutput) // -C specified?
Options.KeepComments = 1;
if (EnableMacroCommentOutput) // -CC specified?
Options.KeepComments = Options.KeepMacroComments = 1;
InitOutputBuffer();
LexerToken Tok, PrevTok;
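
For orientation, this is the behavior the two new flags are meant to produce in -E mode. The sample below illustrates the intended, GCC-compatible semantics of -C and -CC; it is not output captured from this revision:

// illustration.c (hypothetical input)
#define TWICE(x) ((x) /* body comment */ + (x))

/* file comment */
int n = TWICE(1 /* argument comment */);  // trailing comment

// Roughly:
//   -E      drops all four comments.
//   -E -C   keeps "/* file comment */" and "// trailing comment"; the comments
//           tied to the macro body and the macro argument are still discarded.
//   -E -CC  keeps all four, including through the expansion of TWICE.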

View File

@@ -703,7 +703,7 @@ int main(int argc, char **argv) {
}
case PrintPreprocessedInput: // -E mode.
DoPrintPreprocessedInput(PP);
DoPrintPreprocessedInput(PP, Options);
break;
case DumpTokens: { // Token dump mode.

View File

@@ -16,10 +16,11 @@
namespace llvm {
namespace clang {
class Preprocessor;
class Preprocessor;
class LangOptions;
/// DoPrintPreprocessedInput - Implement -E mode.
void DoPrintPreprocessedInput(Preprocessor &PP);
void DoPrintPreprocessedInput(Preprocessor &PP, LangOptions &Options);
} // end namespace clang
} // end namespace llvm

View File

@@ -65,6 +65,9 @@ Lexer::Lexer(const SourceBuffer *File, unsigned fileid, Preprocessor &pp,
// to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
// or otherwise skipping over tokens.
LexingRawMode = false;
// Default to keeping comments if requested.
KeepCommentMode = Features.KeepComments;
}
/// Stringify - Convert the specified string into a C string, with surrounding
@@ -587,13 +590,15 @@ void Lexer::SkipWhitespace(LexerToken &Result, const char *CurPtr) {
// If the next token is obviously a // or /* */ comment, skip it efficiently
// too (without going through the big switch stmt).
if (Char == '/' && CurPtr[1] == '/') {
if (Char == '/' && CurPtr[1] == '/' && !KeepCommentMode) {
BufferPtr = CurPtr;
return SkipBCPLComment(Result, CurPtr+1);
SkipBCPLComment(Result, CurPtr+1);
return;
}
if (Char == '/' && CurPtr[1] == '*') {
if (Char == '/' && CurPtr[1] == '*' && !KeepCommentMode) {
BufferPtr = CurPtr;
return SkipBlockComment(Result, CurPtr+2);
SkipBlockComment(Result, CurPtr+2);
return;
}
BufferPtr = CurPtr;
}
@@ -601,7 +606,7 @@ void Lexer::SkipWhitespace(LexerToken &Result, const char *CurPtr) {
// SkipBCPLComment - We have just read the // characters from input. Skip until
// we find the newline character that terminates the comment. Then update
/// BufferPtr and return.
void Lexer::SkipBCPLComment(LexerToken &Result, const char *CurPtr) {
bool Lexer::SkipBCPLComment(LexerToken &Result, const char *CurPtr) {
// If BCPL comments aren't explicitly enabled for this language, emit an
// extension warning.
if (!Features.BCPLComment) {
@@ -648,16 +653,20 @@ void Lexer::SkipBCPLComment(LexerToken &Result, const char *CurPtr) {
}
}
if (CurPtr == BufferEnd+1) goto FoundEOF;
if (CurPtr == BufferEnd+1) { --CurPtr; break; }
} while (C != '\n' && C != '\r');
// Found and did not consume a newline.
// Found but did not consume the newline.
// If we are returning comments as tokens, return this comment as a token.
if (KeepCommentMode)
return SaveBCPLComment(Result, CurPtr);
// If we are inside a preprocessor directive and we see the end of line,
// return immediately, so that the lexer can return this as an EOM token.
if (ParsingPreprocessorDirective) {
if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
BufferPtr = CurPtr;
return;
return true;
}
// Otherwise, eat the \n character. We don't care if this is a \n\r or
@@ -674,15 +683,33 @@ void Lexer::SkipBCPLComment(LexerToken &Result, const char *CurPtr) {
// big switch, handle it efficiently now.
if (isWhitespace(*CurPtr)) {
Result.SetFlag(LexerToken::LeadingSpace);
return SkipWhitespace(Result, CurPtr+1);
SkipWhitespace(Result, CurPtr+1);
return true;
}
BufferPtr = CurPtr;
return;
return true;
}
FoundEOF: // If we ran off the end of the buffer, return EOF.
BufferPtr = CurPtr-1;
return;
/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
/// an appropriate way and return it.
bool Lexer::SaveBCPLComment(LexerToken &Result, const char *CurPtr) {
Result.SetKind(tok::comment);
FormTokenWithChars(Result, CurPtr);
// If this BCPL-style comment is in a macro definition, transmogrify it into
// a C-style block comment.
if (ParsingPreprocessorDirective) {
std::string Spelling = PP.getSpelling(Result);
assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
Spelling[1] = '*'; // Change prefix to "/*".
Spelling += "*/"; // add suffix.
Result.SetLocation(PP.CreateString(&Spelling[0], Spelling.size(),
Result.getLocation()));
Result.SetLength(Spelling.size());
}
return false;
}
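
SaveBCPLComment rewrites a // comment lexed inside a directive into /* ... */ form so that, when the macro expansion is later re-emitted on a single line, the comment cannot swallow the tokens that follow it. Here is the same rewrite as a standalone sketch on a plain std::string (illustration only; it stands in for the lexer code above, which goes through PP.getSpelling and PP.CreateString):

#include <cassert>
#include <string>

// Turn "// text" into "/* text*/", the same edit the diff applies to Spelling.
std::string BCPLToBlockComment(std::string Spelling) {
  assert(Spelling.size() >= 2 && Spelling[0] == '/' && Spelling[1] == '/' &&
         "not a BCPL comment");
  Spelling[1] = '*';  // change the "//" prefix to "/*"
  Spelling += "*/";   // add the closing delimiter
  return Spelling;
}

// BCPLToBlockComment("// pi, roughly") == "/* pi, roughly*/"
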
/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
@@ -748,7 +775,7 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
/// because they cannot cause the comment to end. The only thing that can
/// happen is that the comment could end with an escaped newline between the
/// '*' and the '/' that close the comment.
void Lexer::SkipBlockComment(LexerToken &Result, const char *CurPtr) {
bool Lexer::SkipBlockComment(LexerToken &Result, const char *CurPtr) {
// Scan one character past where we should, looking for a '/' character. Once
// we find it, check to see if it was preceded by a *. This common
// optimization helps people who like to put a lot of * characters in their
@@ -757,7 +784,7 @@ void Lexer::SkipBlockComment(LexerToken &Result, const char *CurPtr) {
if (C == 0 && CurPtr == BufferEnd+1) {
Diag(BufferPtr, diag::err_unterminated_block_comment);
BufferPtr = CurPtr-1;
return;
return true;
}
while (1) {
@@ -789,22 +816,31 @@ void Lexer::SkipBlockComment(LexerToken &Result, const char *CurPtr) {
// after the /*, but this would involve lexing a lot of what really is the
// comment, which surely would confuse the parser.
BufferPtr = CurPtr-1;
return;
return true;
}
C = *CurPtr++;
}
// If we are returning comments as tokens, return this comment as a token.
if (KeepCommentMode) {
Result.SetKind(tok::comment);
FormTokenWithChars(Result, CurPtr);
return false;
}
// It is common for the tokens immediately after a /**/ comment to be
// whitespace. Instead of going through the big switch, handle it
// efficiently now.
if (isHorizontalWhitespace(*CurPtr)) {
Result.SetFlag(LexerToken::LeadingSpace);
return SkipWhitespace(Result, CurPtr+1);
SkipWhitespace(Result, CurPtr+1);
return true;
}
// Otherwise, just return so that the next character will be lexed as a token.
BufferPtr = CurPtr;
Result.SetFlag(LexerToken::LeadingSpace);
return true;
}
//===----------------------------------------------------------------------===//
@@ -920,6 +956,9 @@ bool Lexer::LexEndOfFile(LexerToken &Result, const char *CurPtr) {
Result.SetKind(tok::eom);
// Update the location of token as well as BufferPtr.
FormTokenWithChars(Result, CurPtr);
// Restore comment saving mode, in case it was disabled for a directive.
KeepCommentMode = Features.KeepComments;
return true; // Have a token.
}
@@ -1035,6 +1074,9 @@ LexNextToken:
// Done parsing the "line".
ParsingPreprocessorDirective = false;
// Restore comment saving mode, in case it was disabled for a directive.
KeepCommentMode = Features.KeepComments;
// Since we consumed a newline, we are back at the start of a line.
IsAtStartOfLine = true;
@@ -1211,13 +1253,13 @@ LexNextToken:
// 6.4.9: Comments
Char = getCharAndSize(CurPtr, SizeTmp);
if (Char == '/') { // BCPL comment.
Result.SetFlag(LexerToken::LeadingSpace);
SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result));
goto LexNextToken; // GCC isn't tail call eliminating.
if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
goto LexNextToken; // GCC isn't tail call eliminating.
return; // KeepCommentMode
} else if (Char == '*') { // /**/ comment.
Result.SetFlag(LexerToken::LeadingSpace);
SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result));
goto LexNextToken; // GCC isn't tail call eliminating.
if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
goto LexNextToken; // GCC isn't tail call eliminating.
return; // KeepCommentMode
} else if (Char == '=') {
Result.SetKind(tok::slashequal);
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
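
Taken together, the Lexer changes establish a new contract: SkipBCPLComment and SkipBlockComment return true when the comment was consumed as whitespace (so the caller keeps lexing via the goto) and false when KeepCommentMode turned it into a tok::comment that the caller must return. A self-contained sketch of that idea, far simpler than the real lexer and using only made-up names:

#include <cstdio>
#include <initializer_list>
#include <string>

enum TokKind { TokEOF, TokComment, TokOther };

struct Tok {
  TokKind Kind;
  std::string Text;
};

struct MiniScanner {
  const char *Ptr;
  bool KeepComments;

  Tok next() {
    while (*Ptr == ' ' || *Ptr == '\t' || *Ptr == '\n') ++Ptr;
    if (!*Ptr) return {TokEOF, ""};

    if (Ptr[0] == '/' && Ptr[1] == '/') {            // BCPL comment
      const char *Start = Ptr;
      while (*Ptr && *Ptr != '\n') ++Ptr;
      if (KeepComments) return {TokComment, std::string(Start, Ptr)};
      return next();                                  // skipped: lex what follows
    }
    if (Ptr[0] == '/' && Ptr[1] == '*') {            // block comment
      const char *Start = Ptr;
      Ptr += 2;
      while (*Ptr && !(Ptr[0] == '*' && Ptr[1] == '/')) ++Ptr;
      if (*Ptr) Ptr += 2;                             // consume "*/" if present
      if (KeepComments) return {TokComment, std::string(Start, Ptr)};
      return next();
    }
    // Anything else: one non-space character per token, for brevity.
    return {TokOther, std::string(1, *Ptr++)};
  }
};

int main() {
  const char *Src = "a // one\n/* two */ b";
  for (bool Keep : {false, true}) {
    MiniScanner S{Src, Keep};
    std::printf("keep=%d:", Keep);
    for (Tok T = S.next(); T.Kind != TokEOF; T = S.next())
      std::printf(" [%s]", T.Text.c_str());
    std::printf("\n");
  }
}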

View File

@@ -755,6 +755,10 @@ MacroArgs *Preprocessor::ReadFunctionLikeMacroArgs(LexerToken &MacroName,
return 0;
}
// Otherwise, continue to add the tokens to this variable argument.
} else if (Tok.getKind() == tok::comment && !Features.KeepMacroComments) {
// If this is a comment token in the argument list and we're just in
// -C mode (not -CC mode), discard the comment.
continue;
}
ArgTokens.push_back(Tok);
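
The new branch above drops tok::comment tokens from a function-like macro's argument list unless KeepMacroComments is set, so under plain -C a comment written between a call's arguments does not make it into the expansion. Expected behavior, sketched for illustration:

#define ID(x) x
int v = ID(1 /* argument comment */ + 2);
// Roughly: with -E -C the expansion is "1 + 2" (comment discarded);
// with -E -CC it is "1 /* argument comment */ + 2".
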
@@ -1221,6 +1225,8 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
// directive mode. Tell the lexer this so any newlines we see will be
// converted into an EOM token (this terminates the macro).
CurLexer->ParsingPreprocessorDirective = true;
CurLexer->KeepCommentMode = false;
// Read the next token, the directive flavor.
LexUnexpandedToken(Tok);
@@ -1229,6 +1235,8 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
// something bogus), skip it.
if (Tok.getKind() != tok::identifier) {
CurLexer->ParsingPreprocessorDirective = false;
// Restore comment saving mode.
CurLexer->KeepCommentMode = Features.KeepComments;
continue;
}
@@ -1242,6 +1250,8 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
if (FirstChar >= 'a' && FirstChar <= 'z' &&
FirstChar != 'i' && FirstChar != 'e') {
CurLexer->ParsingPreprocessorDirective = false;
// Restore comment saving mode.
CurLexer->KeepCommentMode = Features.KeepComments;
continue;
}
@@ -1261,6 +1271,8 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
IdLen = DirectiveStr.size();
if (IdLen >= 20) {
CurLexer->ParsingPreprocessorDirective = false;
// Restore comment saving mode.
CurLexer->KeepCommentMode = Features.KeepComments;
continue;
}
memcpy(Directive, &DirectiveStr[0], IdLen);
@@ -1339,6 +1351,8 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
}
CurLexer->ParsingPreprocessorDirective = false;
// Restore comment saving mode.
CurLexer->KeepCommentMode = Features.KeepComments;
}
// Finally, if we are out of the conditional (saw an #endif or ran off the end
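
The commit restores CurLexer->KeepCommentMode by hand before each early continue above and again at the bottom of the loop. Purely as an illustration of an alternative (not what this commit does), a small scope guard could centralize that restore; the FlagRestorer name below is made up:

// Restores a flag to its previous value on every exit path of a scope.
template <typename T>
class FlagRestorer {
  T &Ref;
  T Saved;
public:
  FlagRestorer(T &Flag, T NewValue) : Ref(Flag), Saved(Flag) { Ref = NewValue; }
  ~FlagRestorer() { Ref = Saved; }
};

// Usage sketch:
//   FlagRestorer<bool> Keep(CurLexer->KeepCommentMode, false);
//   ...every 'continue' or early return then restores the flag automatically.
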
@@ -1698,6 +1712,10 @@ void Preprocessor::HandleDefineDirective(LexerToken &DefineTok) {
if (MacroNameTok.getKind() == tok::eom)
return;
// If we are supposed to keep comments in #defines, reenable comment saving
// mode.
CurLexer->KeepCommentMode = Features.KeepMacroComments;
MacroInfo *MI = new MacroInfo(MacroNameTok.getLocation());
LexerToken Tok;
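
HandleDefineDirective re-enables comment saving only when KeepMacroComments is set, so a comment inside a #define body is recorded in the macro (and later re-emitted, in block-comment form) only under -CC. Expected behavior, sketched for illustration:

#define SQUARE(x) ((x) * (x))  // squares its argument
int s = SQUARE(3);
// Roughly: -E -C keeps ordinary comments but not this one, because it is lexed
// as part of the #define; -E -CC records it in the macro and re-emits it as a
// /* ... */ block comment wherever SQUARE expands.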

View File

@@ -67,7 +67,6 @@ Lexer:
Preprocessor:
* #assert/#unassert
* #line / #file directives
* -C output mode in -E mode.
* MSExtension: "L#param" stringizes to a wide string literal.
Traditional Preprocessor:

View File

@@ -28,6 +28,9 @@ TOK(unknown) // Not a token.
TOK(eof) // End of file.
TOK(eom) // End of macro (end of line inside a macro).
// C99 6.4.9: Comments.
TOK(comment) // Comment (only in -E -C[C] mode)
// C99 6.4.2: Identifiers.
TOK(identifier) // abcde123
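
Adding TOK(comment) here is enough to give every consumer a tok::comment kind, because TokenKinds.def is an X-macro file: whoever includes it defines TOK to expand each entry as needed. A standalone sketch of that pattern (the MY_* names are mine, not clang's):

#include <cstdio>

#define MY_TOKEN_KINDS \
  MY_TOK(eof)          \
  MY_TOK(eom)          \
  MY_TOK(comment)      \
  MY_TOK(identifier)

// Expansion 1: build the enum of token kinds.
enum MyTokKind {
#define MY_TOK(x) my_##x,
  MY_TOKEN_KINDS
#undef MY_TOK
};

// Expansion 2: build a name table for dumping/debugging.
static const char *const MyTokNames[] = {
#define MY_TOK(x) #x,
  MY_TOKEN_KINDS
#undef MY_TOK
};

int main() {
  std::printf("comment has kind %d (%s)\n", my_comment, MyTokNames[my_comment]);
}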

View File

@@ -26,23 +26,27 @@ class Preprocessor;
class SourceBuffer;
struct LangOptions {
unsigned Trigraphs : 1; // Trigraphs in source files.
unsigned BCPLComment : 1; // BCPL-style // comments.
unsigned DollarIdents : 1; // '$' allowed in identifiers.
unsigned Digraphs : 1; // When added to C? C99?
unsigned HexFloats : 1; // C99 Hexadecimal float constants.
unsigned C99 : 1; // C99 Support
unsigned Microsoft : 1; // Microsoft extensions.
unsigned CPlusPlus : 1; // C++ Support
unsigned CPPMinMax : 1; // C++ <?=, >?= tokens.
unsigned NoExtensions : 1; // All extensions are disabled, strict mode.
unsigned Trigraphs : 1; // Trigraphs in source files.
unsigned BCPLComment : 1; // BCPL-style // comments.
unsigned DollarIdents : 1; // '$' allowed in identifiers.
unsigned Digraphs : 1; // When added to C? C99?
unsigned HexFloats : 1; // C99 Hexadecimal float constants.
unsigned C99 : 1; // C99 Support
unsigned Microsoft : 1; // Microsoft extensions.
unsigned CPlusPlus : 1; // C++ Support
unsigned CPPMinMax : 1; // C++ <?=, >?= tokens.
unsigned NoExtensions : 1; // All extensions are disabled, strict mode.
unsigned ObjC1 : 1; // Objective C 1 support enabled.
unsigned ObjC2 : 1; // Objective C 2 support enabled (implies ObjC1).
unsigned ObjC1 : 1; // Objective C 1 support enabled.
unsigned ObjC2 : 1; // Objective C 2 support enabled.
unsigned KeepComments : 1; // Keep comments ("-C") mode.
unsigned KeepMacroComments : 1; // Keep macro-exp comments ("-CC") mode.
LangOptions() {
Trigraphs = BCPLComment = DollarIdents = Digraphs = ObjC1 = ObjC2 = 0;
C99 = Microsoft = CPlusPlus = CPPMinMax = NoExtensions = 0;
KeepComments = KeepMacroComments = 0;
}
};
@@ -87,6 +91,10 @@ class Lexer {
/// on an unterminated '/*' comment.
bool LexingRawMode;
/// KeepCommentMode - The lexer can optionally keep C & BCPL-style comments,
/// and return them as tokens. This is used for -C and -CC modes.
bool KeepCommentMode;
//===--------------------------------------------------------------------===//
// Context that changes as the file is lexed.
// NOTE: any state that mutates when in raw mode must have save/restore code
@@ -353,9 +361,9 @@ private:
bool LexEndOfFile (LexerToken &Result, const char *CurPtr);
void SkipWhitespace (LexerToken &Result, const char *CurPtr);
void SkipBCPLComment (LexerToken &Result, const char *CurPtr);
void SkipBlockComment (LexerToken &Result, const char *CurPtr);
bool SkipBCPLComment (LexerToken &Result, const char *CurPtr);
bool SkipBlockComment (LexerToken &Result, const char *CurPtr);
bool SaveBCPLComment (LexerToken &Result, const char *CurPtr);
/// LexIncludeFilename - After the preprocessor has parsed a #include, lex and
/// (potentially) macro expand the filename. If the sequence parsed is not
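
LangOptions keeps every switch in a one-bit bitfield, so KeepComments and KeepMacroComments typically cost no storage beyond the unsigned allocation unit already in use. A standalone sketch of the same pattern (the struct below is mine, not clang's, and the object size is implementation-defined):

#include <cstdio>

struct Options {
  unsigned KeepComments      : 1;  // -C: keep comments in -E output.
  unsigned KeepMacroComments : 1;  // -CC: also keep comments from macros.
  unsigned Trigraphs         : 1;  // ...and so on for the other switches.
  Options() : KeepComments(0), KeepMacroComments(0), Trigraphs(0) {}
};

int main() {
  Options Opts;
  Opts.KeepComments = 1;                           // what -C sets
  Opts.KeepComments = Opts.KeepMacroComments = 1;  // what -CC sets
  std::printf("sizeof(Options) = %zu\n", sizeof(Options));
}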