Re-implemented document splitters (#130)

Authored by LangChain4j on 2023-08-28 21:33:48 +02:00, committed by GitHub
parent 88b56778f4
commit bebfc78ee1
22 changed files with 1143 additions and 253 deletions

View File

@@ -26,4 +26,11 @@ public class Utils {
}
return "\"" + string + "\"";
}
public static String firstChars(String string, int numberOfChars) {
if (string == null) {
return null;
}
return string.length() > numberOfChars ? string.substring(0, numberOfChars) : string;
}
}

View File

@@ -118,6 +118,12 @@
<version>2.7.14</version>
</dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.9.4</version>
</dependency>
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-embeddings-all-minilm-l6-v2-q</artifactId>
@@ -278,8 +284,7 @@
<scopes>compile,runtime,provided,test</scopes>
<acceptableLicenses>
<license>
<name>(The )?(Apache License, Version 2\.0)|(Apache-2\.0)|(The Apache Software License, Version 2\.0)
</name>
<name>(The )?(Apache License, Version 2\.0)|(Apache-2\.0)|(The Apache Software License, Version 2\.0)</name>
<url>https?://www\.apache\.org/licenses/LICENSE-2\.0</url>
</license>
<license>

View File

@@ -37,6 +37,11 @@
<artifactId>jtokkit</artifactId>
</dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>

View File

@@ -1,49 +0,0 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.data.document.DocumentSplitter;
import java.util.ArrayList;
import java.util.List;
public class CharacterSplitter implements DocumentSplitter {
private final int segmentLength;
private final int segmentOverlap;
public CharacterSplitter(int segmentLength, int segmentOverlap) {
this.segmentLength = segmentLength;
this.segmentOverlap = segmentOverlap;
}
@Override
public List<TextSegment> split(Document document) {
if (document.text() == null || document.text().isEmpty()) {
throw new IllegalArgumentException("Document text should not be null or empty");
}
String text = document.text();
int textLength = text.length();
if (segmentLength <= 0 || segmentOverlap < 0 || segmentLength <= segmentOverlap) {
throw new IllegalArgumentException(String.format("Invalid segmentLength (%s) or segmentOverlap (%s)", segmentLength, segmentOverlap));
}
List<TextSegment> segments = new ArrayList<>();
if (textLength <= segmentLength) {
segments.add(document.toTextSegment());
} else {
for (int i = 0; i < textLength - segmentOverlap; i += segmentLength - segmentOverlap) {
int endIndex = Math.min(i + segmentLength, textLength);
String segment = text.substring(i, endIndex);
segments.add(TextSegment.from(segment, document.metadata()));
if (endIndex == textLength) {
break;
}
}
}
return segments;
}
}

View File

@@ -0,0 +1,52 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
/**
* Splits the provided {@link Document} into characters and attempts to fit as many characters as possible
* into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
* <p>
* The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
* For token-based limit, a {@link Tokenizer} must be provided.
* <p>
* If multiple characters fit within {@code maxSegmentSize}, they are joined together without delimiters.
* <p>
* Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
* representing its position within the document (starting from 0).
*/
public class DocumentByCharacterSplitter extends HierarchicalDocumentSplitter {
public DocumentByCharacterSplitter(int maxSegmentSizeInChars) {
super(maxSegmentSizeInChars, null, null);
}
public DocumentByCharacterSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
super(maxSegmentSizeInChars, null, subSplitter);
}
public DocumentByCharacterSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
super(maxSegmentSizeInTokens, tokenizer, null);
}
public DocumentByCharacterSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
super(maxSegmentSizeInTokens, tokenizer, subSplitter);
}
@Override
public String[] split(String text) {
return text.split("");
}
@Override
public String joinDelimiter() {
return "";
}
@Override
protected DocumentSplitter defaultSubSplitter() {
return null;
}
}
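Below is a minimal usage sketch for this new splitter. It is illustrative only and not part of the diff; the sample text and the 20-character limit are assumptions.

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentByCharacterSplitter;
import dev.langchain4j.data.segment.TextSegment;

import java.util.List;

public class CharacterSplitterExample {

    public static void main(String[] args) {
        // Each segment holds at most 20 characters; characters are joined without a delimiter.
        DocumentSplitter splitter = new DocumentByCharacterSplitter(20);
        List<TextSegment> segments = splitter.split(Document.from("A short text used only for illustration."));
        segments.forEach(segment -> System.out.println(segment.text()));
    }
}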

View File

@@ -0,0 +1,61 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
/**
* Splits the provided {@link Document} into lines and attempts to fit as many lines as possible
* into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
* <p>
* The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
* For token-based limit, a {@link Tokenizer} must be provided.
* <p>
* Line boundaries are detected by a minimum of one newline character ("\n").
* Any additional whitespace before or after is ignored.
* So, the following are all valid line separators: "\n", "\n\n", " \n", "\n ", and so on.
* <p>
* If multiple lines fit within {@code maxSegmentSize}, they are joined together using a newline ("\n").
* <p>
* If a single line is too long and exceeds {@code maxSegmentSize},
* the {@code subSplitter} ({@link DocumentBySentenceSplitter} by default) is used to split it into smaller parts and
* place them into multiple segments.
* Such segments contain only the parts of the split long line.
* <p>
* Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
* representing its position within the document (starting from 0).
*/
public class DocumentByLineSplitter extends HierarchicalDocumentSplitter {
public DocumentByLineSplitter(int maxSegmentSizeInChars) {
super(maxSegmentSizeInChars, null, null);
}
public DocumentByLineSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
super(maxSegmentSizeInChars, null, subSplitter);
}
public DocumentByLineSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
super(maxSegmentSizeInTokens, tokenizer, null);
}
public DocumentByLineSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
super(maxSegmentSizeInTokens, tokenizer, subSplitter);
}
@Override
public String[] split(String text) {
return text.split("\\s*\\R\\s*"); // additional whitespaces are ignored
}
@Override
public String joinDelimiter() {
return "\n";
}
@Override
protected DocumentSplitter defaultSubSplitter() {
return new DocumentBySentenceSplitter(maxSegmentSize, tokenizer);
}
}
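A similar sketch for the line splitter, this time with a token-based limit. The 25-token limit and the text are assumptions, and OpenAiTokenizer is used here only because this module's tests use it.

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentByLineSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.openai.OpenAiTokenizer;

import java.util.List;

import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;

public class LineSplitterExample {

    public static void main(String[] args) {
        // Each segment holds at most 25 tokens; lines that fit together are joined with "\n".
        DocumentSplitter splitter = new DocumentByLineSplitter(25, new OpenAiTokenizer(GPT_3_5_TURBO));
        List<TextSegment> segments = splitter.split(Document.from("first line\nsecond line\n\nthird line"));
        segments.forEach(segment -> System.out.println(segment.text() + "\n---"));
    }
}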

View File

@@ -0,0 +1,61 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
/**
* Splits the provided {@link Document} into paragraphs and attempts to fit as many paragraphs as possible
* into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
* <p>
* The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
* For token-based limit, a {@link Tokenizer} must be provided.
* <p>
* Paragraph boundaries are detected by a minimum of two newline characters ("\n\n").
* Any additional whitespace before, between, or after is ignored.
* So, the following are all valid paragraph separators: "\n\n", "\n\n\n", "\n \n", " \n \n ", and so on.
* <p>
* If multiple paragraphs fit within {@code maxSegmentSize}, they are joined together using a double newline ("\n\n").
* <p>
* If a single paragraph is too long and exceeds {@code maxSegmentSize},
* the {@code subSplitter} ({@link DocumentBySentenceSplitter} by default) is used to split it into smaller parts and
* place them into multiple segments.
* Such segments contain only the parts of the split long paragraph.
* <p>
* Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
* representing its position within the document (starting from 0).
*/
public class DocumentByParagraphSplitter extends HierarchicalDocumentSplitter {
public DocumentByParagraphSplitter(int maxSegmentSizeInChars) {
super(maxSegmentSizeInChars, null, null);
}
public DocumentByParagraphSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
super(maxSegmentSizeInChars, null, subSplitter);
}
public DocumentByParagraphSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
super(maxSegmentSizeInTokens, tokenizer, null);
}
public DocumentByParagraphSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
super(maxSegmentSizeInTokens, tokenizer, subSplitter);
}
@Override
public String[] split(String text) {
return text.split("\\s*\\R\\s*\\R\\s*"); // additional whitespaces are ignored
}
@Override
public String joinDelimiter() {
return "\n\n";
}
@Override
protected DocumentSplitter defaultSubSplitter() {
return new DocumentBySentenceSplitter(maxSegmentSize, tokenizer);
}
}
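A character-based sketch for the paragraph splitter. The 40-character limit and the text are assumptions, chosen so that the second paragraph is handed to the default sentence sub-splitter (which in turn needs the OpenNLP sentence model on the classpath).

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentByParagraphSplitter;
import dev.langchain4j.data.segment.TextSegment;

import java.util.List;

public class ParagraphSplitterExample {

    public static void main(String[] args) {
        String text = "A short paragraph.\n\nA much longer second paragraph. It has two short sentences.";
        // Segments hold at most 40 characters; paragraphs that fit together are joined with "\n\n".
        // The second paragraph exceeds 40 characters, so it is broken into sentences instead.
        DocumentSplitter splitter = new DocumentByParagraphSplitter(40);
        List<TextSegment> segments = splitter.split(Document.from(text));
        segments.forEach(segment -> System.out.println(segment.text() + "\n---"));
    }
}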

View File

@@ -0,0 +1,81 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
/**
* Splits the provided {@link Document} into parts using the provided {@code regex} and attempts to fit as many parts
* as possible into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
* <p>
* The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
* For token-based limit, a {@link Tokenizer} must be provided.
* <p>
* If multiple parts fit within {@code maxSegmentSize}, they are joined together using the provided {@code joinDelimiter}.
* <p>
* If a single part is too long and exceeds {@code maxSegmentSize}, the {@code subSplitter} (which should be provided)
* is used to split it into sub-parts and place them into multiple segments.
* Such segments contain only the sub-parts of the split long part.
* <p>
* Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
* representing its position within the document (starting from 0).
*/
public class DocumentByRegexSplitter extends HierarchicalDocumentSplitter {
private final String regex;
private final String joinDelimiter;
public DocumentByRegexSplitter(String regex,
String joinDelimiter,
int maxSegmentSizeInChars) {
super(maxSegmentSizeInChars, null, null);
this.regex = ensureNotNull(regex, "regex");
this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
}
public DocumentByRegexSplitter(String regex,
String joinDelimiter,
int maxSegmentSizeInChars,
DocumentSplitter subSplitter) {
super(maxSegmentSizeInChars, null, subSplitter);
this.regex = ensureNotNull(regex, "regex");
this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
}
public DocumentByRegexSplitter(String regex,
String joinDelimiter,
int maxSegmentSizeInTokens,
Tokenizer tokenizer) {
super(maxSegmentSizeInTokens, tokenizer, null);
this.regex = ensureNotNull(regex, "regex");
this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
}
public DocumentByRegexSplitter(String regex,
String joinDelimiter,
int maxSegmentSizeInTokens,
Tokenizer tokenizer,
DocumentSplitter subSplitter) {
super(maxSegmentSizeInTokens, tokenizer, subSplitter);
this.regex = ensureNotNull(regex, "regex");
this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
}
@Override
public String[] split(String text) {
return text.split(regex);
}
@Override
public String joinDelimiter() {
return joinDelimiter;
}
@Override
protected DocumentSplitter defaultSubSplitter() {
return null;
}
}
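A sketch of the regex-based splitter; the semicolon pattern, the "; " join delimiter, the 30-character limit, and the text are all assumed values.

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentByRegexSplitter;
import dev.langchain4j.data.segment.TextSegment;

import java.util.List;

public class RegexSplitterExample {

    public static void main(String[] args) {
        // Split on ";" and rejoin fitting parts with "; "; each segment holds at most 30 characters.
        DocumentSplitter splitter = new DocumentByRegexSplitter(";", "; ", 30);
        List<TextSegment> segments = splitter.split(Document.from("alpha;beta;gamma;delta"));
        segments.forEach(segment -> System.out.println(segment.text()));
    }
}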

View File

@@ -0,0 +1,78 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import java.io.InputStream;
/**
* Splits the provided {@link Document} into sentences and attempts to fit as many sentences as possible
* into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
* <p>
* The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
* For token-based limit, a {@link Tokenizer} must be provided.
* <p>
* Sentence boundaries are detected using the Apache OpenNLP library with the English sentence model.
* <p>
* If multiple sentences fit within {@code maxSegmentSize}, they are joined together using a space (" ").
* <p>
* If a single sentence is too long and exceeds {@code maxSegmentSize},
* the {@code subSplitter} ({@link DocumentByWordSplitter} by default) is used to split it into smaller parts and
* place them into multiple segments.
* Such segments contain only the parts of the split long sentence.
* <p>
* Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
* representing its position within the document (starting from 0).
*/
public class DocumentBySentenceSplitter extends HierarchicalDocumentSplitter {
private final SentenceDetectorME sentenceDetector;
public DocumentBySentenceSplitter(int maxSegmentSizeInChars) {
super(maxSegmentSizeInChars, null, null);
this.sentenceDetector = createSentenceDetector();
}
public DocumentBySentenceSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
super(maxSegmentSizeInChars, null, subSplitter);
this.sentenceDetector = createSentenceDetector();
}
public DocumentBySentenceSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
super(maxSegmentSizeInTokens, tokenizer, null);
this.sentenceDetector = createSentenceDetector();
}
public DocumentBySentenceSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
super(maxSegmentSizeInTokens, tokenizer, subSplitter);
this.sentenceDetector = createSentenceDetector();
}
private SentenceDetectorME createSentenceDetector() {
String sentenceModelFilePath = "/opennlp/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin";
try (InputStream is = getClass().getResourceAsStream(sentenceModelFilePath)) {
return new SentenceDetectorME(new SentenceModel(is));
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
public String[] split(String text) {
return sentenceDetector.sentDetect(text);
}
@Override
public String joinDelimiter() {
return " ";
}
@Override
protected DocumentSplitter defaultSubSplitter() {
return new DocumentByWordSplitter(maxSegmentSize, tokenizer);
}
}
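A sketch for the sentence splitter. The 60-character limit and the text are assumptions; note the splitter loads the OpenNLP English sentence model resource from the classpath, so both the opennlp-tools dependency added in this commit and the bundled model file must be available.

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentBySentenceSplitter;
import dev.langchain4j.data.segment.TextSegment;

import java.util.List;

public class SentenceSplitterExample {

    public static void main(String[] args) {
        // Segments hold at most 60 characters; sentences that fit together are joined with " ".
        DocumentSplitter splitter = new DocumentBySentenceSplitter(60);
        List<TextSegment> segments = splitter.split(Document.from("This is the first sentence. This is the second one."));
        segments.forEach(segment -> System.out.println(segment.text()));
    }
}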

View File

@@ -0,0 +1,61 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
/**
* Splits the provided {@link Document} into words and attempts to fit as many words as possible
* into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
* <p>
* The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
* For token-based limit, a {@link Tokenizer} must be provided.
* <p>
* Word boundaries are detected by a minimum of one space (" ").
* Any additional whitespace before or after is ignored.
* So, the following are all valid word separators: " ", "  " (two spaces), "\n", and so on.
* <p>
* If multiple words fit within {@code maxSegmentSize}, they are joined together using a space (" ").
* <p>
* Although this should not happen, if a single word is too long and exceeds {@code maxSegmentSize},
* the {@code subSplitter} ({@link DocumentByCharacterSplitter} by default) is used to split it into smaller parts and
* place them into multiple segments.
* Such segments contain only the parts of the split long word.
* <p>
* Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
* representing its position within the document (starting from 0).
*/
public class DocumentByWordSplitter extends HierarchicalDocumentSplitter {
public DocumentByWordSplitter(int maxSegmentSizeInChars) {
super(maxSegmentSizeInChars, null, null);
}
public DocumentByWordSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
super(maxSegmentSizeInChars, null, subSplitter);
}
public DocumentByWordSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
super(maxSegmentSizeInTokens, tokenizer, null);
}
public DocumentByWordSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
super(maxSegmentSizeInTokens, tokenizer, subSplitter);
}
@Override
public String[] split(String text) {
return text.split("\\s+"); // additional whitespaces are ignored
}
@Override
public String joinDelimiter() {
return " ";
}
@Override
protected DocumentSplitter defaultSubSplitter() {
return new DocumentByCharacterSplitter(maxSegmentSize, tokenizer);
}
}
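The word splitter follows the same pattern; a two-line sketch (values assumed), to be placed inside any method with the same imports as the examples above.

DocumentSplitter splitter = new DocumentByWordSplitter(15); // at most 15 characters per segment
List<TextSegment> segments = splitter.split(Document.from("one two three four five")); // fitting words re-joined with " "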

View File

@@ -0,0 +1,39 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.model.Tokenizer;
public class DocumentSplitters {
/**
* This is a recommended {@link DocumentSplitter} for generic text.
* It tries to split the document into paragraphs first and fits
* as many paragraphs into a single {@link dev.langchain4j.data.segment.TextSegment} as possible.
* If some paragraphs are too long, they are recursively split into lines, then sentences,
* then words, and then characters until they fit into a segment.
*
* @param maxSegmentSizeInTokens The maximum size of the segment, defined in tokens.
* @param tokenizer The tokenizer that is used to count tokens in the text.
* @return recursive document splitter
*/
public static DocumentSplitter recursive(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
return new DocumentByParagraphSplitter(maxSegmentSizeInTokens, tokenizer,
new DocumentByLineSplitter(maxSegmentSizeInTokens, tokenizer,
new DocumentBySentenceSplitter(maxSegmentSizeInTokens, tokenizer,
new DocumentByWordSplitter(maxSegmentSizeInTokens, tokenizer))));
}
/**
* This is a recommended {@link DocumentSplitter} for generic text.
* It tries to split the document into paragraphs first and fits
* as many paragraphs into a single {@link dev.langchain4j.data.segment.TextSegment} as possible.
* If some paragraphs are too long, they are recursively split into lines, then sentences,
* then words, and then characters until they fit into a segment.
*
* @param maxSegmentSizeInChars The maximum size of the segment, defined in characters.
* @return recursive document splitter
*/
public static DocumentSplitter recursive(int maxSegmentSizeInChars) {
return recursive(maxSegmentSizeInChars, null);
}
}
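This factory is the intended entry point. A usage sketch follows; the 300-token limit and the text are assumptions, and the tokenizer is the same OpenAiTokenizer used in this module's tests.

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.openai.OpenAiTokenizer;

import java.util.List;

import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;

public class RecursiveSplitterExample {

    public static void main(String[] args) {
        // Splits into paragraphs first, then recursively into lines, sentences, and words
        // as needed so that each segment fits into 300 tokens.
        DocumentSplitter splitter = DocumentSplitters.recursive(300, new OpenAiTokenizer(GPT_3_5_TURBO));
        List<TextSegment> segments = splitter.split(Document.from("Some long document text..."));
        segments.forEach(segment -> System.out.println(segment.text()));
    }
}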

View File

@@ -0,0 +1,113 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static dev.langchain4j.internal.Utils.firstChars;
import static dev.langchain4j.internal.ValidationUtils.ensureGreaterThanZero;
import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
import static java.lang.String.format;
public abstract class HierarchicalDocumentSplitter implements DocumentSplitter {
private static final String INDEX = "index";
protected final int maxSegmentSize;
protected final Tokenizer tokenizer;
protected final DocumentSplitter subSplitter;
protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars) {
this(maxSegmentSizeInChars, null, null);
}
protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars,
DocumentSplitter subSplitter) {
this(maxSegmentSizeInChars, null, subSplitter);
}
protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens,
Tokenizer tokenizer) {
this(maxSegmentSizeInTokens, tokenizer, null);
}
protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens,
Tokenizer tokenizer,
DocumentSplitter subSplitter) {
this.maxSegmentSize = ensureGreaterThanZero(maxSegmentSizeInTokens, "maxSegmentSize");
this.tokenizer = tokenizer;
this.subSplitter = subSplitter == null ? defaultSubSplitter() : subSplitter;
}
protected abstract String[] split(String text);
protected abstract String joinDelimiter();
protected abstract DocumentSplitter defaultSubSplitter();
@Override
public List<TextSegment> split(Document document) {
ensureNotNull(document, "document");
List<TextSegment> segments = new ArrayList<>();
SegmentBuilder segmentBuilder = new SegmentBuilder(maxSegmentSize, this::sizeOf, joinDelimiter());
AtomicInteger index = new AtomicInteger(0);
String[] parts = split(document.text());
for (String part : parts) {
if (segmentBuilder.hasSpaceFor(part)) {
segmentBuilder.append(part);
} else {
if (segmentBuilder.isNotEmpty()) {
segments.add(createSegment(segmentBuilder.build(), document, index.getAndIncrement()));
segmentBuilder.reset();
}
if (segmentBuilder.hasSpaceFor(part)) {
segmentBuilder.append(part);
} else {
if (subSplitter == null) {
throw new RuntimeException(format(
"The text \"%s...\" (%s %s long) doesn't fit into the maximum segment size (%s %s), " +
"and there is no subSplitter defined to split it further.",
firstChars(part, 30),
sizeOf(part), tokenizer == null ? "characters" : "tokens",
maxSegmentSize, tokenizer == null ? "characters" : "tokens"
));
}
for (TextSegment segment : subSplitter.split(Document.from(part))) {
segments.add(createSegment(segment.text(), document, index.getAndIncrement()));
segmentBuilder.reset();
}
}
}
}
if (segmentBuilder.isNotEmpty()) {
segments.add(createSegment(segmentBuilder.build(), document, index.getAndIncrement()));
segmentBuilder.reset();
}
return segments;
}
private int sizeOf(String text) {
if (tokenizer != null) {
return tokenizer.estimateTokenCountInText(text);
} else {
return text.length();
}
}
private static TextSegment createSegment(String text, Document document, int index) {
Metadata metadata = document.metadata().copy().add(INDEX, index);
return TextSegment.from(text, metadata);
}
}
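A hypothetical subclass, sketched to show the three extension points (split, joinDelimiter, defaultSubSplitter). The class name and the semicolon rule are assumptions, not part of the library.

import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentByWordSplitter;
import dev.langchain4j.data.document.splitter.HierarchicalDocumentSplitter;

public class DocumentBySemicolonSplitter extends HierarchicalDocumentSplitter {

    public DocumentBySemicolonSplitter(int maxSegmentSizeInChars) {
        super(maxSegmentSizeInChars);
    }

    @Override
    protected String[] split(String text) {
        return text.split(";"); // one part per ";"-separated clause
    }

    @Override
    protected String joinDelimiter() {
        return "; "; // how fitting parts are glued back together
    }

    @Override
    protected DocumentSplitter defaultSubSplitter() {
        // fall back to word splitting when a single clause exceeds maxSegmentSize
        return new DocumentByWordSplitter(maxSegmentSize, tokenizer);
    }
}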

View File

@@ -1,27 +0,0 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.data.document.DocumentSplitter;
import java.util.List;
import static java.util.Arrays.stream;
import static java.util.stream.Collectors.toList;
public class ParagraphSplitter implements DocumentSplitter {
@Override
public List<TextSegment> split(Document document) {
String text = document.text();
if (text == null || text.isEmpty()) {
throw new IllegalArgumentException("Document text should not be null or empty");
}
String[] paragraphs = text.split("\\R\\R");
return stream(paragraphs)
.map(paragraph -> TextSegment.from(paragraph.trim(), document.metadata()))
.collect(toList());
}
}

View File

@@ -1,33 +0,0 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.data.document.DocumentSplitter;
import java.util.List;
import static java.util.Arrays.stream;
import static java.util.stream.Collectors.toList;
public class RegexSplitter implements DocumentSplitter {
private final String regex;
public RegexSplitter(String regex) {
this.regex = regex;
}
@Override
public List<TextSegment> split(Document document) {
String text = document.text();
if (text == null || text.isEmpty()) {
throw new IllegalArgumentException("Document text should not be null or empty");
}
String[] segments = text.split(regex);
return stream(segments)
.map(segment -> TextSegment.from(segment, document.metadata()))
.collect(toList());
}
}

View File

@@ -0,0 +1,62 @@
package dev.langchain4j.data.document.splitter;
import java.util.function.Function;
import static dev.langchain4j.internal.ValidationUtils.ensureGreaterThanZero;
import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
class SegmentBuilder {
private StringBuilder segmentBuilder;
private final int maxSegmentSize;
private final Function<String, Integer> sizeFunction;
private final String joinSeparator;
SegmentBuilder(int maxSegmentSize, Function<String, Integer> sizeFunction, String joinSeparator) {
this.segmentBuilder = new StringBuilder();
this.maxSegmentSize = ensureGreaterThanZero(maxSegmentSize, "maxSegmentSize");
this.sizeFunction = ensureNotNull(sizeFunction, "sizeFunction");
this.joinSeparator = ensureNotNull(joinSeparator, "joinSeparator");
}
boolean hasSpaceFor(String text) {
return hasSpaceFor(text, joinSeparator);
}
boolean hasSpaceFor(String text, String separator) {
if (isNotEmpty()) {
return sizeOf(segmentBuilder.toString()) + sizeOf(separator) + sizeOf(text) <= maxSegmentSize;
} else {
return sizeOf(text) <= maxSegmentSize;
}
}
private int sizeOf(String text) {
return sizeFunction.apply(text);
}
void append(String text) {
append(text, joinSeparator);
}
void append(String text, String separator) {
if (segmentBuilder.length() > 0) {
segmentBuilder.append(separator);
}
segmentBuilder.append(text);
}
boolean isNotEmpty() {
return segmentBuilder.length() > 0;
}
String build() {
return segmentBuilder.toString().trim();
}
void reset() {
segmentBuilder = new StringBuilder();
}
}
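SegmentBuilder is package-private, so the following sketch only illustrates how the hierarchical splitter drives it conceptually; the size limit and strings are assumptions, and the snippet would compile only inside the dev.langchain4j.data.document.splitter package.

SegmentBuilder builder = new SegmentBuilder(12, String::length, " ");

if (builder.hasSpaceFor("Hello")) {
    builder.append("Hello");          // builder now holds "Hello"
}
if (builder.hasSpaceFor("world!")) {
    builder.append("world!");         // "Hello" + " " + "world!" = 12 chars, still fits
}

String segment = builder.build();     // "Hello world!"
builder.reset();                      // start collecting the next segment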

View File

@@ -1,45 +0,0 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.data.document.DocumentSplitter;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import static java.util.stream.Collectors.toList;
public class SentenceSplitter implements DocumentSplitter {
@Override
public List<TextSegment> split(Document document) {
String text = document.text();
if (text == null || text.isEmpty()) {
throw new IllegalArgumentException("Document text should not be null or empty");
}
List<String> sentences = splitIntoSentences(text);
return sentences.stream()
.map(sentence -> TextSegment.from(sentence.trim(), document.metadata()))
.collect(toList());
}
private List<String> splitIntoSentences(String text) {
List<String> sentences = new ArrayList<>();
BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.getDefault());
iterator.setText(text);
int start = iterator.first();
for (int end = iterator.next();
end != BreakIterator.DONE;
start = end, end = iterator.next()) {
sentences.add(text.substring(start, end).trim());
}
return sentences;
}
}

View File

@@ -1,65 +0,0 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.NullAndEmptySource;
import java.util.List;
import static dev.langchain4j.data.document.Document.document;
import static dev.langchain4j.data.segment.TextSegment.textSegment;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
class CharacterSplitterTest {
@Test
void should_split_with_overlap() {
DocumentSplitter splitter = new CharacterSplitter(4, 2);
List<TextSegment> segments = splitter.split(document("1234567890"));
assertThat(segments).containsExactly(
textSegment("1234"),
textSegment("3456"),
textSegment("5678"),
textSegment("7890")
);
}
@Test
void should_split_without_overlap() {
DocumentSplitter splitter = new CharacterSplitter(4, 0);
List<TextSegment> segments = splitter.split(document("1234567890"));
assertThat(segments).containsExactly(
textSegment("1234"),
textSegment("5678"),
textSegment("90")
);
}
@ParameterizedTest
@CsvSource({"0,-1", "-1,-1", "-1,0", "0,0", "0,1", "1,-1", "1,1", "1,2"})
void should_fail_on_invalid_length_or_overlap(int segmentLength, int segmentOverlap) {
DocumentSplitter splitter = new CharacterSplitter(segmentLength, segmentOverlap);
assertThatThrownBy(() -> splitter.split(document("any")))
.isExactlyInstanceOf(IllegalArgumentException.class)
.hasMessage("Invalid segmentLength (%s) or segmentOverlap (%s)", segmentLength, segmentOverlap);
}
@ParameterizedTest
@NullAndEmptySource
void testNullCase(String documentText) {
DocumentSplitter splitter = new CharacterSplitter(4, 2);
assertThatThrownBy(() -> splitter.split(document(documentText)))
.isExactlyInstanceOf(IllegalArgumentException.class)
.hasMessage("text cannot be null or blank");
}
}

View File

@@ -0,0 +1,257 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.Tokenizer;
import dev.langchain4j.model.openai.OpenAiTokenizer;
import org.junit.jupiter.api.Test;
import java.util.List;
import static dev.langchain4j.data.document.Metadata.metadata;
import static dev.langchain4j.data.segment.TextSegment.textSegment;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;
class DocumentByParagraphSplitterTest {
@Test
void should_split_into_segments_with_one_paragraph_per_segment() {
int maxSegmentSize = 30;
String firstParagraph = "This is a first paragraph.";
assertThat(firstParagraph).hasSizeLessThan(maxSegmentSize);
String secondParagraph = "This is a second paragraph.";
assertThat(secondParagraph).hasSizeLessThan(maxSegmentSize);
assertThat(firstParagraph + "\n \n" + secondParagraph).hasSizeGreaterThan(maxSegmentSize);
Document document = Document.from(
format(" %s \n \n %s ", firstParagraph, secondParagraph),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(firstParagraph, metadata("index", "0").add("document", "0")),
textSegment(secondParagraph, metadata("index", "1").add("document", "0"))
);
}
@Test
void should_split_into_segments_with_multiple_paragraphs_per_segment() {
int maxSegmentSize = 60;
String firstParagraph = "This is a first paragraph.";
String secondParagraph = "This is a second paragraph.";
assertThat(firstParagraph + secondParagraph).hasSizeLessThan(maxSegmentSize);
String thirdParagraph = "This is a third paragraph.";
assertThat(thirdParagraph).hasSizeLessThan(maxSegmentSize);
assertThat(firstParagraph + secondParagraph + thirdParagraph)
.hasSizeGreaterThan(maxSegmentSize);
Document document = Document.from(
format(" %s \n \n %s \n \n %s ", firstParagraph, secondParagraph, thirdParagraph),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(firstParagraph + "\n\n" + secondParagraph, metadata("index", "0").add("document", "0")),
textSegment(thirdParagraph, metadata("index", "1").add("document", "0"))
);
}
@Test
void should_split_paragraph_into_sentences_if_it_does_not_fit_into_segment() {
int maxSegmentSize = 50;
String firstParagraph = "This is a first paragraph.";
assertThat(firstParagraph).hasSizeLessThan(maxSegmentSize);
String firstSentenceOfSecondParagraph = "This is a fist sentence of a second paragraph.";
assertThat(firstSentenceOfSecondParagraph).hasSizeLessThan(maxSegmentSize);
String secondSentenceOfSecondParagraph = "This is a second sentence of a second paragraph.";
assertThat(secondSentenceOfSecondParagraph).hasSizeLessThan(maxSegmentSize);
String secondParagraph = firstSentenceOfSecondParagraph + " " + secondSentenceOfSecondParagraph;
assertThat(secondParagraph).hasSizeGreaterThan(maxSegmentSize);
String thirdParagraph = "This is a third paragraph.";
assertThat(thirdParagraph).hasSizeLessThan(maxSegmentSize);
Document document = Document.from(
format(" %s \n \n %s \n \n %s ", firstParagraph, secondParagraph, thirdParagraph),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(firstParagraph, metadata("index", "0").add("document", "0")),
textSegment(firstSentenceOfSecondParagraph, metadata("index", "1").add("document", "0")),
textSegment(secondSentenceOfSecondParagraph, metadata("index", "2").add("document", "0")),
textSegment(thirdParagraph, metadata("index", "3").add("document", "0"))
);
}
@Test
void should_split_sample_text_containing_multiple_paragraphs() {
int maxSegmentSize = 65;
Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
String p1 = "In a small town nestled between two vast mountains, there was a shop unlike any other. " +
"A unique haven. " +
"Visitors would often comment on its peculiar charm, always slightly different from what they " +
"remembered on their previous visits. " +
"The store stood as a testament to the passage of time and the ever-changing landscape of tales.";
assertThat(tokenizer.estimateTokenCountInText(p1)).isEqualTo(62);
String p2p1 = "Upon entering, the first thing to strike you was the enormity of it all. " +
"Every inch of space was occupied with books. " +
"Some stood tall and regal on the highest shelves, looking as if they had witnessed epochs come and go. " +
"They were leather-bound, with pages yellowed by age.";
assertThat(tokenizer.estimateTokenCountInText(p2p1)).isEqualTo(60);
String p2p2 = "Others, smaller and brightly adorned, were reminiscent of summer days and childhood laughter. " +
"But these physical objects were mere vessels. " +
"It was the stories inside that held power.";
assertThat(tokenizer.estimateTokenCountInText(p2p2)).isEqualTo(33);
String p3 = "Mrs. Jenkins ran the shop. " +
"A mystery in her own right. " +
"Her silver hair cascaded like a waterfall, and her eyes seemed to see more than most. " +
"With just a glance, she'd find the perfect story for you.";
assertThat(tokenizer.estimateTokenCountInText(p3)).isEqualTo(47);
String p4p1 = "One wet afternoon, Eli entered. " +
"He was just a boy, lost in the vastness of the store. " +
"Between the aisles, his small fingers danced on the spines of books, feeling the heartbeat of " +
"countless tales. " +
"Then, a simple brown-covered book whispered to him.";
assertThat(tokenizer.estimateTokenCountInText(p4p1)).isEqualTo(56);
String p4p2 = "Without grandeur or pretense, it beckoned. " +
"And he listened.";
assertThat(tokenizer.estimateTokenCountInText(p4p2)).isEqualTo(15);
String p5 = "He read. " +
"And read. " +
"The world around him melted.";
assertThat(tokenizer.estimateTokenCountInText(p5)).isEqualTo(12);
String p6 = "When Mrs. Jenkins approached, night had fallen. " +
"She gently remarked, \"Books have a way of finding their reader.\" " +
"Eli simply nodded, understanding the profound truth in her words.";
assertThat(tokenizer.estimateTokenCountInText(p6)).isEqualTo(36);
String p7 = "Some places and stories remain etched in our souls, offering lessons and moments of sheer wonder. " +
"They defy definition.";
assertThat(tokenizer.estimateTokenCountInText(p7)).isEqualTo(23);
Document document = Document.from(
format("%s\n\n%s %s\n\n%s\n\n%s %s\n\n%s\n\n%s\n\n%s", p1, p2p1, p2p2, p3, p4p1, p4p2, p5, p6, p7),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize, tokenizer);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(tokenizer.estimateTokenCountInText(segment.text())).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(p1, metadata("index", "0").add("document", "0")),
textSegment(p2p1, metadata("index", "1").add("document", "0")),
textSegment(p2p2, metadata("index", "2").add("document", "0")),
textSegment(p3, metadata("index", "3").add("document", "0")),
textSegment(p4p1, metadata("index", "4").add("document", "0")),
textSegment(p4p2, metadata("index", "5").add("document", "0")),
textSegment(p5 + "\n\n" + p6, metadata("index", "6").add("document", "0")),
textSegment(p7, metadata("index", "7").add("document", "0"))
);
}
@Test
void should_split_sample_text_without_paragraphs() {
int maxSegmentSize = 100;
Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
String segment1 = "In a small town nestled between two vast mountains, there was a shop unlike any other. " +
"A unique haven. " +
"Visitors would often comment on its peculiar charm, always slightly different from what they " +
"remembered on their previous visits. " +
"The store stood as a testament to the passage of time and the ever-changing landscape of tales. " +
"Upon entering, the first thing to strike you was the enormity of it all. " +
"Every inch of space was occupied with books.";
String segment2 = "Some stood tall and regal on the highest shelves, " +
"looking as if they had witnessed epochs come and go. " +
"They were leather-bound, with pages yellowed by age. " +
"Others, smaller and brightly adorned, were reminiscent of summer days and childhood laughter. " +
"But these physical objects were mere vessels. " +
"It was the stories inside that held power. " +
"Mrs. Jenkins ran the shop. " +
"A mystery in her own right.";
String segment3 = "Her silver hair cascaded like a waterfall, and her eyes seemed to see more than most. " +
"With just a glance, she'd find the perfect story for you. " +
"One wet afternoon, Eli entered. " +
"He was just a boy, lost in the vastness of the store. " +
"Between the aisles, his small fingers danced on the spines of books, feeling the heartbeat of " +
"countless tales. " +
"Then, a simple brown-covered book whispered to him.";
String segment4 = "Without grandeur or pretense, it beckoned. " +
"And he listened. " +
"He read. " +
"And read. " +
"The world around him melted. " +
"When Mrs. Jenkins approached, night had fallen. " +
"She gently remarked, \"Books have a way of finding their reader.\" " +
"Eli simply nodded, understanding the profound truth in her words. " +
"Some places and stories remain etched in our souls, offering lessons and moments of sheer wonder. " +
"They defy definition.";
Document document = Document.from(
format("%s %s %s %s", segment1, segment2, segment3, segment4),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize, tokenizer);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(tokenizer.estimateTokenCountInText(segment.text())).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(segment1, metadata("index", "0").add("document", "0")),
textSegment(segment2, metadata("index", "1").add("document", "0")),
textSegment(segment3, metadata("index", "2").add("document", "0")),
textSegment(segment4, metadata("index", "3").add("document", "0"))
);
}
}

View File

@@ -0,0 +1,83 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.util.List;
import static dev.langchain4j.data.document.Metadata.metadata;
import static dev.langchain4j.data.segment.TextSegment.textSegment;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;
class DocumentByRegexSplitterTest {
@ParameterizedTest
@ValueSource(strings = {" ", ",", "\n", "\n\n"})
void should_split_by(String separator) {
String text = format("one%stwo%sthree", separator, separator);
Document document = Document.from(text, metadata("document", "0"));
int maxSegmentSize = 5;
DocumentSplitter splitter = new DocumentByRegexSplitter(separator, separator, maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment("one", metadata("index", "0").add("document", "0")),
textSegment("two", metadata("index", "1").add("document", "0")),
textSegment("three", metadata("index", "2").add("document", "0"))
);
}
@Test
void should_fit_multiple_parts_into_the_same_segment() {
Document document = Document.from("one two three", metadata("document", "0"));
int maxSegmentSize = 10;
DocumentSplitter splitter = new DocumentByRegexSplitter(" ", "\n", maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment("one\ntwo", metadata("index", "0").add("document", "0")),
textSegment("three", metadata("index", "1").add("document", "0"))
);
}
@Test
void should_split_part_into_sub_parts_if_it_does_not_fit_into_segment() {
Document document = Document.from(
"This is a first line.\nThis is a second line.\n\nThis is a third line.",
metadata("document", "0")
);
int maxSegmentSize = 15;
DocumentSplitter subSplitter = new DocumentByWordSplitter(maxSegmentSize);
DocumentSplitter splitter = new DocumentByRegexSplitter("\n", "\n", maxSegmentSize, subSplitter);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment("This is a first", metadata("index", "0").add("document", "0")),
textSegment("line.", metadata("index", "1").add("document", "0")),
textSegment("This is a", metadata("index", "2").add("document", "0")),
textSegment("second line.", metadata("index", "3").add("document", "0")),
textSegment("This is a third", metadata("index", "4").add("document", "0")),
textSegment("line.", metadata("index", "5").add("document", "0"))
);
}
}

View File

@@ -0,0 +1,176 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.openai.OpenAiTokenizer;
import org.junit.jupiter.api.Test;
import java.util.List;
import static dev.langchain4j.data.document.Metadata.metadata;
import static dev.langchain4j.data.segment.TextSegment.textSegment;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;
class DocumentBySentenceSplitterTest {
@Test
void should_split_into_segments_with_one_sentence_per_segment() {
int maxSegmentSize = 30;
String firstSentence = "This is a first sentence.";
assertThat(firstSentence).hasSizeLessThan(maxSegmentSize);
String secondSentence = "This is a second sentence.";
assertThat(secondSentence).hasSizeLessThan(maxSegmentSize);
assertThat(firstSentence + " " + secondSentence).hasSizeGreaterThan(maxSegmentSize);
Document document = Document.from(
format(" %s %s ", firstSentence, secondSentence),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(firstSentence, metadata("index", "0").add("document", "0")),
textSegment(secondSentence, metadata("index", "1").add("document", "0"))
);
}
@Test
void should_split_into_segments_with_multiple_sentences_per_segment() {
int maxSegmentSize = 60;
String firstSentence = "This is a first sentence.";
String secondSentence = "This is a second sentence.";
assertThat(firstSentence + " " + secondSentence).hasSizeLessThan(maxSegmentSize);
String thirdSentence = "This is a third sentence.";
assertThat(firstSentence + " " + secondSentence + " " + thirdSentence)
.hasSizeGreaterThan(maxSegmentSize);
Document document = Document.from(
format(" %s %s %s ", firstSentence, secondSentence, thirdSentence),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(firstSentence + " " + secondSentence, metadata("index", "0").add("document", "0")),
textSegment(thirdSentence, metadata("index", "1").add("document", "0"))
);
}
@Test
void should_split_sentence_if_it_does_not_fit_into_segment() {
int maxSegmentSize = 40;
String firstSentence = "This is a short sentence.";
assertThat(firstSentence).hasSizeLessThan(maxSegmentSize);
String secondSentence = "This is a very long sentence that does not fit into segment.";
assertThat(secondSentence).hasSizeGreaterThan(maxSegmentSize);
String thirdSentence = "This is another short sentence.";
assertThat(thirdSentence).hasSizeLessThan(maxSegmentSize);
Document document = Document.from(
format(" %s %s %s ", firstSentence, secondSentence, thirdSentence),
metadata("document", "0")
);
DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(firstSentence, metadata("index", "0").add("document", "0")),
textSegment("This is a very long sentence that does", metadata("index", "1").add("document", "0")),
textSegment("not fit into segment.", metadata("index", "2").add("document", "0")),
textSegment(thirdSentence, metadata("index", "3").add("document", "0"))
);
}
@Test
void should_split_sample_text() {
String s1 = "In a sleepy hamlet, where the trees towered high, there lived a young boy named Elias.";
String s2 = "He loved exploring.";
String s3 = "Fields of gold stretched as far as the eye could see, punctuated by tiny blossoms.";
String s4 = "The wind whispered.";
String s5p1 = "Sometimes, it would carry fragrances from the neighboring towns, which included chocolate, " +
"freshly baked bread, and the salty tang of";
String s5p2 = "the sea.";
String s6 = "In the middle of the town, a single lamppost stood.";
String s7 = "Cats lounged beneath it, stretching languidly in the dappled sunlight.";
String s8 = "Elias had a dream: to build a flying machine.";
String s9 = "Some days, it felt impossible.";
String s10 = "Yet, every evening, he would pull out his sketches, tinkering and toiling away.";
String s11 = "There was a resilience in his spirit.";
String s12 = "Birds often stopped to watch.";
String s13 = "Curiosity is the spark of invention.";
String s14 = "He believed.";
String s15 = "And one day, with the town gathered around him, Elias soared.";
String s16 = "The horizon awaited.";
String s17 = "Life is full of surprises.";
String s18 = "Embrace them.";
Document document = Document.from(
format(
"%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s",
s1, s2, s3, s4, s5p1, s5p2, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18
),
metadata("document", "0")
);
int maxSegmentSize = 26;
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize, tokenizer);
List<TextSegment> segments = splitter.split(document);
segments.forEach(segment ->
assertThat(tokenizer.estimateTokenCountInText(segment.text())).isLessThanOrEqualTo(maxSegmentSize));
assertThat(segments).containsExactly(
textSegment(s1 + " " + s2, metadata("index", 0).add("document", "0")),
textSegment(s3 + " " + s4, metadata("index", 1).add("document", "0")),
textSegment(s5p1, metadata("index", 2).add("document", "0")),
textSegment(s5p2, metadata("index", 3).add("document", "0")),
textSegment(s6, metadata("index", 4).add("document", "0")),
textSegment(s7, metadata("index", 5).add("document", "0")),
textSegment(s8 + " " + s9, metadata("index", 6).add("document", "0")),
textSegment(s10, metadata("index", 7).add("document", "0")),
textSegment(s11 + " " + s12 + " " + s13 + " " + s14, metadata("index", 8).add("document", "0")),
textSegment(s15 + " " + s16 + " " + s17, metadata("index", 9).add("document", "0")),
textSegment(s18, metadata("index", 10).add("document", "0"))
);
}
}

View File

@@ -1,32 +0,0 @@
package dev.langchain4j.data.document.splitter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segment.TextSegment;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ParagraphSplitterTest {
ParagraphSplitter splitter = new ParagraphSplitter();
@ParameterizedTest
@ValueSource(strings = {
"first\r\rsecond\r\rthird\r\rcr\r",
"first\n\nsecond\n\nthird\n\nlf\n",
"first\r\n\r\nsecond\r\n\r\nthird\r\n\r\ncrlf\r\n"
})
void test_split_by_paragraph(String text) {
Document document = Document.from(text);
List<TextSegment> segments = splitter.split(document);
assertEquals(4, segments.size());
assertEquals("first", segments.get(0).text());
assertEquals("second", segments.get(1).text());
assertEquals("third", segments.get(2).text());
}
}