Support atomic groups and possessive quantifiers in Python regexps

This commit is contained in:
Jirka Marsik 2023-10-10 13:12:26 +02:00
parent db8c304107
commit bc0d740da2
9 changed files with 79 additions and 19 deletions

View File

@ -2,6 +2,10 @@
This changelog summarizes major changes between TRegex versions relevant to language implementors integrating TRegex into their language. This document will focus on API changes relevant to integrators of TRegex.
## Version 24.0.0
* Added support for atomic groups and possessive quantifiers in Python regular expressions.
## Version 23.1.0
* Added support for Unicode sets mode (`v` flag) in ECMAScript regular expressions.

View File

@ -96,6 +96,11 @@ public final class JSRegexLexer extends RegexLexer {
return false;
}
/**
* Possessive quantifiers are disabled for this lexer: a {@code +} that follows a quantifier is
* not consumed as a possessive marker.
*/
@Override
protected boolean featureEnabledPossessiveQuantifiers() {
return false;
}
@Override
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
return false;

View File

@ -714,20 +714,32 @@ public final class RegexASTBuilder {
} else {
if (quantifier.getMin() == 0 && (curTerm.isLookAroundAssertion() || curTermIsZeroWidthGroup ||
curTerm.isCharacterClass() && curTerm.asCharacterClass().getCharSet().matchesNothing())) {
// NB: If JavaScript ever gets possessive quantifiers, we might have to adjust this.
removeCurTerm();
return;
}
}
if (quantifier.getMin() > 0 && (curTerm.isLookAroundAssertion() || curTermIsZeroWidthGroup)) {
// quantifying LookAroundAssertions doesn't do anything if quantifier.getMin() > 0, so
// ignore.
// Quantifying LookAroundAssertions doesn't do anything if quantifier.getMin() > 0, so
// ignore. A possessive quantifier would still result in atomicity.
if (quantifier.isPossessive()) {
wrapCurTermInAtomicGroup();
}
return;
}
if (quantifier.getMin() == 1 && quantifier.getMax() == 1) {
// x{1,1} -> x
if (quantifier.isPossessive()) {
wrapCurTermInAtomicGroup();
}
return;
}
curTerm = addQuantifier(curTerm, quantifier);
if (quantifier.isPossessive()) {
wrapCurTermInAtomicGroup();
// do not attempt to merge quantifiers when possessive quantifiers are present
return;
}
// merge equal successive quantified terms
if (curSequence.size() > 1) {
Term prevTerm = curSequence.getTerms().get(curSequence.size() - 2);

View File

@ -123,6 +123,11 @@ public abstract class RegexLexer {
*/
protected abstract boolean featureEnabledBoundedQuantifierEmptyMin();
/**
* Returns {@code true} if possessive quantifiers ({@code +} suffix) are allowed.
* <p>
* When this returns {@code true}, the lexer consumes a {@code +} that directly follows a
* quantifier and marks the resulting quantifier token as possessive. A {@code ?} suffix is
* checked first and yields a non-greedy, non-possessive quantifier instead. When this returns
* {@code false}, the {@code +} is left in the input.
*/
protected abstract boolean featureEnabledPossessiveQuantifiers();
/**
* Returns {@code true} if the first character in a character class must be interpreted as part
* of the character set, even if it is the closing bracket {@code ']'}.
@ -1012,7 +1017,14 @@ public abstract class RegexLexer {
min = c == '+' ? 1 : 0;
max = c == '?' ? 1 : -1;
}
return Token.createQuantifier((int) min, (int) max, !consumingLookahead("?"));
boolean greedy = true;
boolean possessive = false;
if (consumingLookahead('?')) {
greedy = false;
} else if (featureEnabledPossessiveQuantifiers() && consumingLookahead('+')) {
possessive = true;
}
return Token.createQuantifier((int) min, (int) max, greedy, possessive);
}
private boolean isQuantifierOutOfOrder(long parsedMin, long parsedMax, int startMin, int lengthMin, int lengthMax) {

View File

@ -68,6 +68,7 @@ public class Token implements JsonConvertible {
alternation,
captureGroupBegin,
nonCaptureGroupBegin,
atomicGroupBegin,
lookAheadAssertionBegin,
lookBehindAssertionBegin,
groupEnd,
@ -93,6 +94,7 @@ public class Token implements JsonConvertible {
private static final Token NON_CAPTURE_GROUP_BEGIN = new Token(Kind.nonCaptureGroupBegin);
private static final Token CHAR_CLASS_BEGIN = new Token(Kind.charClassBegin);
private static final Token CHAR_CLASS_END = new Token(Kind.charClassEnd);
private static final Token ATOMIC_GROUP_BEGIN = new Token(Kind.atomicGroupBegin);
private static final Token LOOK_AHEAD_ASSERTION_BEGIN = new LookAheadAssertionBegin(false);
private static final Token NEGATIVE_LOOK_AHEAD_ASSERTION_BEGIN = new LookAheadAssertionBegin(true);
private static final Token LOOK_BEHIND_ASSERTION_BEGIN = new LookBehindAssertionBegin(false);
@ -139,6 +141,10 @@ public class Token implements JsonConvertible {
return NON_CAPTURE_GROUP_BEGIN;
}
/**
* Returns the shared token of kind {@link Kind#atomicGroupBegin}. The same instance is returned
* on every call.
*/
public static Token createAtomicGroupBegin() {
return ATOMIC_GROUP_BEGIN;
}
public static Token createLookAheadAssertionBegin() {
return LOOK_AHEAD_ASSERTION_BEGIN;
}
@ -163,6 +169,10 @@ public class Token implements JsonConvertible {
return new Quantifier(min, max, greedy);
}
/**
* Creates a new quantifier token with explicit greediness and possessiveness.
*
* @param min lower bound of the repetition count
* @param max upper bound of the repetition count
* @param greedy {@code false} for a lazy ({@code ?}-suffixed) quantifier
* @param possessive {@code true} for a possessive ({@code +}-suffixed) quantifier
* @return a fresh {@link Quantifier} carrying the given values
*/
public static Quantifier createQuantifier(int min, int max, boolean greedy, boolean possessive) {
return new Quantifier(min, max, greedy, possessive);
}
public static LiteralCharacter createLiteralCharacter(int codePoint) {
return new LiteralCharacter(codePoint);
}
@ -244,14 +254,20 @@ public class Token implements JsonConvertible {
private final int min;
private final int max;
private final boolean greedy;
private final boolean possessive;
@CompilationFinal private int index = -1;
@CompilationFinal private int zeroWidthIndex = -1;
public Quantifier(int min, int max, boolean greedy) {
/**
* Creates a quantifier token of kind {@link Kind#quantifier}.
*
* @param min lower bound of the repetition count
* @param max upper bound of the repetition count
* @param greedy whether the quantifier is greedy
* @param possessive whether the quantifier is possessive
*/
public Quantifier(int min, int max, boolean greedy, boolean possessive) {
super(Kind.quantifier);
// Matching behaviour flags first, then the repetition bounds; the assignments are independent.
this.possessive = possessive;
this.greedy = greedy;
this.max = max;
this.min = min;
}
/**
* Creates a non-possessive quantifier token. Equivalent to calling the four-argument
* constructor with {@code possessive == false}.
*/
public Quantifier(int min, int max, boolean greedy) {
this(min, max, greedy, false);
}
public boolean isInfiniteLoop() {
@ -276,6 +292,10 @@ public class Token implements JsonConvertible {
return greedy;
}
/**
* Returns the {@code possessive} flag this quantifier was constructed with.
*/
public boolean isPossessive() {
return possessive;
}
public boolean hasIndex() {
return index >= 0;
}
@ -332,11 +352,11 @@ public class Token implements JsonConvertible {
@Override
public int hashCode() {
return Objects.hash(min, max, greedy, index, zeroWidthIndex);
return Objects.hash(min, max, greedy, possessive, index, zeroWidthIndex);
}
public boolean equalsSemantic(Quantifier o) {
return min == o.min && max == o.max && greedy == o.greedy;
return min == o.min && max == o.max && greedy == o.greedy && possessive == o.possessive;
}
@Override
@ -348,14 +368,14 @@ public class Token implements JsonConvertible {
return false;
}
Quantifier o = (Quantifier) obj;
return min == o.min && max == o.max && greedy == o.greedy && index == o.index && zeroWidthIndex == o.zeroWidthIndex;
return min == o.min && max == o.max && greedy == o.greedy && possessive == o.possessive && index == o.index && zeroWidthIndex == o.zeroWidthIndex;
}
@TruffleBoundary
@Override
public String toString() {
String ret = minMaxToString();
return isGreedy() ? ret : ret + "?";
return isPossessive() ? ret + "+" : isGreedy() ? ret : ret + "?";
}
private String minMaxToString() {
@ -377,7 +397,8 @@ public class Token implements JsonConvertible {
return super.toJson().append(
Json.prop("min", getMin()),
Json.prop("max", getMax()),
Json.prop("greedy", isGreedy()));
Json.prop("greedy", isGreedy()),
Json.prop("possessive", isPossessive()));
}
}

View File

@ -106,6 +106,11 @@ public final class OracleDBRegexLexer extends RegexLexer {
return false;
}
/**
* Possessive quantifiers are disabled for this lexer: a {@code +} that follows a quantifier is
* not consumed as a possessive marker.
*/
@Override
protected boolean featureEnabledPossessiveQuantifiers() {
return false;
}
@Override
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
return true;

View File

@ -329,6 +329,11 @@ public final class PythonRegexLexer extends RegexLexer {
return true;
}
/**
* Possessive quantifiers are enabled for this lexer: a {@code +} that directly follows a
* quantifier marks that quantifier as possessive.
*/
@Override
protected boolean featureEnabledPossessiveQuantifiers() {
return true;
}
@Override
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
return true;
@ -747,6 +752,8 @@ public final class PythonRegexLexer extends RegexLexer {
throw syntaxErrorAtRel(PyErrorMessages.unknownExtensionP(ch2), 3);
}
}
case '>':
return Token.createAtomicGroupBegin();
case '(':
return parseConditionalBackReference();
case '-':

View File

@ -211,6 +211,9 @@ public final class PythonRegexParser implements RegexParser {
case nonCaptureGroupBegin:
astBuilder.pushGroup(token);
break;
case atomicGroupBegin:
astBuilder.pushAtomicGroup(token);
break;
case lookAheadAssertionBegin:
astBuilder.pushLookAheadAssertion(token, ((Token.LookAheadAssertionBegin) token).isNegated());
break;

View File

@ -616,12 +616,6 @@ public final class RubyRegexParser implements RegexValidator, RegexParser {
}
}
private void wrapCurTermInAtomicGroup() {
if (!silent) {
astBuilder.wrapCurTermInAtomicGroup();
}
}
// Error reporting
private RegexSyntaxException syntaxErrorAtEnd(String message) {
@ -2014,10 +2008,7 @@ public final class RubyRegexParser implements RegexValidator, RegexParser {
Quantifier quantifier = parseQuantifier(ch);
if (quantifier != null) {
if (canHaveQuantifier) {
addQuantifier(Token.createQuantifier(quantifier.lower, quantifier.upper, quantifier.greedy));
if (quantifier.possessive) {
wrapCurTermInAtomicGroup();
}
addQuantifier(Token.createQuantifier(quantifier.lower, quantifier.upper, quantifier.greedy, quantifier.possessive));
} else {
throw syntaxErrorAt(RbErrorMessages.NOTHING_TO_REPEAT, start);
}