mirror of https://github.com/oracle/graal.git
Support atomic groups and possessive quantifiers in Python regexps
This commit is contained in:
parent
db8c304107
commit
bc0d740da2
|
@ -2,6 +2,10 @@
|
|||
|
||||
This changelog summarizes major changes between TRegex versions relevant to language implementors integrating TRegex into their language. This document will focus on API changes relevant to integrators of TRegex.
|
||||
|
||||
## Version 24.0.0
|
||||
|
||||
* Added support for atomic groups and possessive quantifiers in Python regular expressions.
|
||||
|
||||
## Version 23.1.0
|
||||
|
||||
* Added support for Unicode sets mode (`v` flag) in ECMAScript regular expressions.
|
||||
|
|
|
@ -96,6 +96,11 @@ public final class JSRegexLexer extends RegexLexer {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Possessive quantifiers are never enabled by this lexer: the method unconditionally returns
 * {@code false}.
 */
@Override
|
||||
protected boolean featureEnabledPossessiveQuantifiers() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
|
||||
return false;
|
||||
|
|
|
@ -714,20 +714,32 @@ public final class RegexASTBuilder {
|
|||
} else {
|
||||
if (quantifier.getMin() == 0 && (curTerm.isLookAroundAssertion() || curTermIsZeroWidthGroup ||
|
||||
curTerm.isCharacterClass() && curTerm.asCharacterClass().getCharSet().matchesNothing())) {
|
||||
// NB: If JavaScript ever gets possessive quantifiers, we might have to adjust this.
|
||||
removeCurTerm();
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (quantifier.getMin() > 0 && (curTerm.isLookAroundAssertion() || curTermIsZeroWidthGroup)) {
|
||||
// quantifying LookAroundAssertions doesn't do anything if quantifier.getMin() > 0, so
|
||||
// ignore.
|
||||
// Quantifying LookAroundAssertions doesn't do anything if quantifier.getMin() > 0, so
|
||||
// ignore. A possessive quantifier would still result in atomicity.
|
||||
if (quantifier.isPossessive()) {
|
||||
wrapCurTermInAtomicGroup();
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (quantifier.getMin() == 1 && quantifier.getMax() == 1) {
|
||||
// x{1,1} -> x
|
||||
if (quantifier.isPossessive()) {
|
||||
wrapCurTermInAtomicGroup();
|
||||
}
|
||||
return;
|
||||
}
|
||||
curTerm = addQuantifier(curTerm, quantifier);
|
||||
if (quantifier.isPossessive()) {
|
||||
wrapCurTermInAtomicGroup();
|
||||
// do not attempt to merge quantifiers when possessive quantifiers are present
|
||||
return;
|
||||
}
|
||||
// merge equal successive quantified terms
|
||||
if (curSequence.size() > 1) {
|
||||
Term prevTerm = curSequence.getTerms().get(curSequence.size() - 2);
|
||||
|
|
|
@ -123,6 +123,11 @@ public abstract class RegexLexer {
|
|||
*/
|
||||
protected abstract boolean featureEnabledBoundedQuantifierEmptyMin();
|
||||
|
||||
/**
|
||||
* Returns {@code true} if possessive quantifiers ({@code +} suffix) are allowed. When enabled,
* a {@code +} that directly follows a quantifier which is not marked lazy by a {@code ?}
* suffix makes that quantifier possessive; when disabled, the lexer does not consume that
* {@code +} as a quantifier suffix.
|
||||
*/
|
||||
protected abstract boolean featureEnabledPossessiveQuantifiers();
|
||||
|
||||
/**
|
||||
* Returns {@code true} if the first character in a character class must be interpreted as part
|
||||
* of the character set, even if it is the closing bracket {@code ']'}.
|
||||
|
@ -1012,7 +1017,14 @@ public abstract class RegexLexer {
|
|||
min = c == '+' ? 1 : 0;
|
||||
max = c == '?' ? 1 : -1;
|
||||
}
|
||||
return Token.createQuantifier((int) min, (int) max, !consumingLookahead("?"));
|
||||
boolean greedy = true;
|
||||
boolean possessive = false;
|
||||
if (consumingLookahead('?')) {
|
||||
greedy = false;
|
||||
} else if (featureEnabledPossessiveQuantifiers() && consumingLookahead('+')) {
|
||||
possessive = true;
|
||||
}
|
||||
return Token.createQuantifier((int) min, (int) max, greedy, possessive);
|
||||
}
|
||||
|
||||
private boolean isQuantifierOutOfOrder(long parsedMin, long parsedMax, int startMin, int lengthMin, int lengthMax) {
|
||||
|
|
|
@ -68,6 +68,7 @@ public class Token implements JsonConvertible {
|
|||
alternation,
|
||||
captureGroupBegin,
|
||||
nonCaptureGroupBegin,
|
||||
atomicGroupBegin,
|
||||
lookAheadAssertionBegin,
|
||||
lookBehindAssertionBegin,
|
||||
groupEnd,
|
||||
|
@ -93,6 +94,7 @@ public class Token implements JsonConvertible {
|
|||
private static final Token NON_CAPTURE_GROUP_BEGIN = new Token(Kind.nonCaptureGroupBegin);
|
||||
private static final Token CHAR_CLASS_BEGIN = new Token(Kind.charClassBegin);
|
||||
private static final Token CHAR_CLASS_END = new Token(Kind.charClassEnd);
|
||||
private static final Token ATOMIC_GROUP_BEGIN = new Token(Kind.atomicGroupBegin);
|
||||
private static final Token LOOK_AHEAD_ASSERTION_BEGIN = new LookAheadAssertionBegin(false);
|
||||
private static final Token NEGATIVE_LOOK_AHEAD_ASSERTION_BEGIN = new LookAheadAssertionBegin(true);
|
||||
private static final Token LOOK_BEHIND_ASSERTION_BEGIN = new LookBehindAssertionBegin(false);
|
||||
|
@ -139,6 +141,10 @@ public class Token implements JsonConvertible {
|
|||
return NON_CAPTURE_GROUP_BEGIN;
|
||||
}
|
||||
|
||||
/**
 * Returns the token that marks the beginning of an atomic group. No new object is allocated:
 * every call returns the same shared {@code ATOMIC_GROUP_BEGIN} instance.
 */
public static Token createAtomicGroupBegin() {
|
||||
return ATOMIC_GROUP_BEGIN;
|
||||
}
|
||||
|
||||
public static Token createLookAheadAssertionBegin() {
|
||||
return LOOK_AHEAD_ASSERTION_BEGIN;
|
||||
}
|
||||
|
@ -163,6 +169,10 @@ public class Token implements JsonConvertible {
|
|||
return new Quantifier(min, max, greedy);
|
||||
}
|
||||
|
||||
/**
 * Creates a new quantifier token, with explicit control over possessiveness.
 *
 * @param min minimum number of repetitions.
 * @param max maximum number of repetitions. NOTE(review): the lexer passes {@code -1} for the
 *            quantifiers without an upper bound, so {@code -1} presumably means "unbounded" -
 *            confirm against {@code Quantifier}.
 * @param greedy {@code false} if the quantifier is lazy ({@code ?} suffix).
 * @param possessive {@code true} if the quantifier is possessive ({@code +} suffix).
 * @return a freshly allocated {@code Quantifier} holding the given values.
 */
public static Quantifier createQuantifier(int min, int max, boolean greedy, boolean possessive) {
|
||||
return new Quantifier(min, max, greedy, possessive);
|
||||
}
|
||||
|
||||
public static LiteralCharacter createLiteralCharacter(int codePoint) {
|
||||
return new LiteralCharacter(codePoint);
|
||||
}
|
||||
|
@ -244,14 +254,20 @@ public class Token implements JsonConvertible {
|
|||
private final int min;
|
||||
private final int max;
|
||||
private final boolean greedy;
|
||||
private final boolean possessive;
|
||||
@CompilationFinal private int index = -1;
|
||||
@CompilationFinal private int zeroWidthIndex = -1;
|
||||
|
||||
public Quantifier(int min, int max, boolean greedy) {
|
||||
public Quantifier(int min, int max, boolean greedy, boolean possessive) {
|
||||
super(Kind.quantifier);
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
this.greedy = greedy;
|
||||
this.possessive = possessive;
|
||||
}
|
||||
|
||||
/**
 * Creates a non-possessive quantifier. Delegates to the four-argument constructor, passing
 * {@code false} for {@code possessive}.
 */
public Quantifier(int min, int max, boolean greedy) {
|
||||
this(min, max, greedy, false);
|
||||
}
|
||||
|
||||
public boolean isInfiniteLoop() {
|
||||
|
@ -276,6 +292,10 @@ public class Token implements JsonConvertible {
|
|||
return greedy;
|
||||
}
|
||||
|
||||
/**
 * Returns the {@code possessive} flag this quantifier was constructed with. Possessive
 * quantifiers are rendered with a trailing {@code +} by {@code toString()}.
 */
public boolean isPossessive() {
|
||||
return possessive;
|
||||
}
|
||||
|
||||
public boolean hasIndex() {
|
||||
return index >= 0;
|
||||
}
|
||||
|
@ -332,11 +352,11 @@ public class Token implements JsonConvertible {
|
|||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(min, max, greedy, index, zeroWidthIndex);
|
||||
return Objects.hash(min, max, greedy, possessive, index, zeroWidthIndex);
|
||||
}
|
||||
|
||||
public boolean equalsSemantic(Quantifier o) {
|
||||
return min == o.min && max == o.max && greedy == o.greedy;
|
||||
return min == o.min && max == o.max && greedy == o.greedy && possessive == o.possessive;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -348,14 +368,14 @@ public class Token implements JsonConvertible {
|
|||
return false;
|
||||
}
|
||||
Quantifier o = (Quantifier) obj;
|
||||
return min == o.min && max == o.max && greedy == o.greedy && index == o.index && zeroWidthIndex == o.zeroWidthIndex;
|
||||
return min == o.min && max == o.max && greedy == o.greedy && possessive == o.possessive && index == o.index && zeroWidthIndex == o.zeroWidthIndex;
|
||||
}
|
||||
|
||||
@TruffleBoundary
|
||||
@Override
|
||||
public String toString() {
|
||||
String ret = minMaxToString();
|
||||
return isGreedy() ? ret : ret + "?";
|
||||
return isPossessive() ? ret + "+" : isGreedy() ? ret : ret + "?";
|
||||
}
|
||||
|
||||
private String minMaxToString() {
|
||||
|
@ -377,7 +397,8 @@ public class Token implements JsonConvertible {
|
|||
return super.toJson().append(
|
||||
Json.prop("min", getMin()),
|
||||
Json.prop("max", getMax()),
|
||||
Json.prop("greedy", isGreedy()));
|
||||
Json.prop("greedy", isGreedy()),
|
||||
Json.prop("possessive", isPossessive()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -106,6 +106,11 @@ public final class OracleDBRegexLexer extends RegexLexer {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Possessive quantifiers are never enabled by this lexer: the method unconditionally returns
 * {@code false}.
 */
@Override
|
||||
protected boolean featureEnabledPossessiveQuantifiers() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
|
||||
return true;
|
||||
|
|
|
@ -329,6 +329,11 @@ public final class PythonRegexLexer extends RegexLexer {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
 * Possessive quantifiers are always enabled by this lexer: the method unconditionally returns
 * {@code true}.
 */
@Override
|
||||
protected boolean featureEnabledPossessiveQuantifiers() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
|
||||
return true;
|
||||
|
@ -747,6 +752,8 @@ public final class PythonRegexLexer extends RegexLexer {
|
|||
throw syntaxErrorAtRel(PyErrorMessages.unknownExtensionP(ch2), 3);
|
||||
}
|
||||
}
|
||||
case '>':
|
||||
return Token.createAtomicGroupBegin();
|
||||
case '(':
|
||||
return parseConditionalBackReference();
|
||||
case '-':
|
||||
|
|
|
@ -211,6 +211,9 @@ public final class PythonRegexParser implements RegexParser {
|
|||
case nonCaptureGroupBegin:
|
||||
astBuilder.pushGroup(token);
|
||||
break;
|
||||
case atomicGroupBegin:
|
||||
astBuilder.pushAtomicGroup(token);
|
||||
break;
|
||||
case lookAheadAssertionBegin:
|
||||
astBuilder.pushLookAheadAssertion(token, ((Token.LookAheadAssertionBegin) token).isNegated());
|
||||
break;
|
||||
|
|
|
@ -616,12 +616,6 @@ public final class RubyRegexParser implements RegexValidator, RegexParser {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Asks the AST builder to wrap the current term in an atomic group. Does nothing when the
 * {@code silent} flag is set.
 */
private void wrapCurTermInAtomicGroup() {
|
||||
// Only touch the AST builder when not running in silent mode.
if (!silent) {
|
||||
astBuilder.wrapCurTermInAtomicGroup();
|
||||
}
|
||||
}
|
||||
|
||||
// Error reporting
|
||||
|
||||
private RegexSyntaxException syntaxErrorAtEnd(String message) {
|
||||
|
@ -2014,10 +2008,7 @@ public final class RubyRegexParser implements RegexValidator, RegexParser {
|
|||
Quantifier quantifier = parseQuantifier(ch);
|
||||
if (quantifier != null) {
|
||||
if (canHaveQuantifier) {
|
||||
addQuantifier(Token.createQuantifier(quantifier.lower, quantifier.upper, quantifier.greedy));
|
||||
if (quantifier.possessive) {
|
||||
wrapCurTermInAtomicGroup();
|
||||
}
|
||||
addQuantifier(Token.createQuantifier(quantifier.lower, quantifier.upper, quantifier.greedy, quantifier.possessive));
|
||||
} else {
|
||||
throw syntaxErrorAt(RbErrorMessages.NOTHING_TO_REPEAT, start);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue