Re-implemented document splitters (#130)

parent 88b56778f4
commit bebfc78ee1

@@ -26,4 +26,11 @@ public class Utils {
         }
         return "\"" + string + "\"";
     }
+
+    public static String firstChars(String string, int numberOfChars) {
+        if (string == null) {
+            return null;
+        }
+        return string.length() > numberOfChars ? string.substring(0, numberOfChars) : string;
+    }
 }

@@ -118,6 +118,12 @@
             <version>2.7.14</version>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.opennlp</groupId>
+            <artifactId>opennlp-tools</artifactId>
+            <version>1.9.4</version>
+        </dependency>
+
         <dependency>
             <groupId>dev.langchain4j</groupId>
             <artifactId>langchain4j-embeddings-all-minilm-l6-v2-q</artifactId>

@@ -278,8 +284,7 @@
         <scopes>compile,runtime,provided,test</scopes>
         <acceptableLicenses>
             <license>
-                <name>(The )?(Apache License, Version 2\.0)|(Apache-2\.0)|(The Apache Software License, Version 2\.0)
-                </name>
+                <name>(The )?(Apache License, Version 2\.0)|(Apache-2\.0)|(The Apache Software License, Version 2\.0)</name>
                 <url>https?://www\.apache\.org/licenses/LICENSE-2\.0</url>
             </license>
             <license>

@@ -37,6 +37,11 @@
             <artifactId>jtokkit</artifactId>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.opennlp</groupId>
+            <artifactId>opennlp-tools</artifactId>
+        </dependency>
+
         <dependency>
             <groupId>org.projectlombok</groupId>
             <artifactId>lombok</artifactId>

@@ -1,49 +0,0 @@
-package dev.langchain4j.data.document.splitter;
-
-import dev.langchain4j.data.document.Document;
-import dev.langchain4j.data.segment.TextSegment;
-import dev.langchain4j.data.document.DocumentSplitter;
-
-import java.util.ArrayList;
-import java.util.List;
-
-public class CharacterSplitter implements DocumentSplitter {
-
-    private final int segmentLength;
-    private final int segmentOverlap;
-
-    public CharacterSplitter(int segmentLength, int segmentOverlap) {
-        this.segmentLength = segmentLength;
-        this.segmentOverlap = segmentOverlap;
-    }
-
-    @Override
-    public List<TextSegment> split(Document document) {
-        if (document.text() == null || document.text().isEmpty()) {
-            throw new IllegalArgumentException("Document text should not be null or empty");
-        }
-
-        String text = document.text();
-        int textLength = text.length();
-
-        if (segmentLength <= 0 || segmentOverlap < 0 || segmentLength <= segmentOverlap) {
-            throw new IllegalArgumentException(String.format("Invalid segmentLength (%s) or segmentOverlap (%s)", segmentLength, segmentOverlap));
-        }
-
-        List<TextSegment> segments = new ArrayList<>();
-        if (textLength <= segmentLength) {
-            segments.add(document.toTextSegment());
-        } else {
-            for (int i = 0; i < textLength - segmentOverlap; i += segmentLength - segmentOverlap) {
-                int endIndex = Math.min(i + segmentLength, textLength);
-                String segment = text.substring(i, endIndex);
-                segments.add(TextSegment.from(segment, document.metadata()));
-                if (endIndex == textLength) {
-                    break;
-                }
-            }
-        }
-
-        return segments;
-    }
-}

@@ -0,0 +1,52 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+
+/**
+ * Splits the provided {@link Document} into characters and attempts to fit as many characters as possible
+ * into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
+ * <p>
+ * The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
+ * For token-based limit, a {@link Tokenizer} must be provided.
+ * <p>
+ * If multiple characters fit within {@code maxSegmentSize}, they are joined together without delimiters.
+ * <p>
+ * Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
+ * representing its position within the document (starting from 0).
+ */
+public class DocumentByCharacterSplitter extends HierarchicalDocumentSplitter {
+
+    public DocumentByCharacterSplitter(int maxSegmentSizeInChars) {
+        super(maxSegmentSizeInChars, null, null);
+    }
+
+    public DocumentByCharacterSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInChars, null, subSplitter);
+    }
+
+    public DocumentByCharacterSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
+        super(maxSegmentSizeInTokens, tokenizer, null);
+    }
+
+    public DocumentByCharacterSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInTokens, tokenizer, subSplitter);
+    }
+
+    @Override
+    public String[] split(String text) {
+        return text.split("");
+    }
+
+    @Override
+    public String joinDelimiter() {
+        return "";
+    }
+
+    @Override
+    protected DocumentSplitter defaultSubSplitter() {
+        return null;
+    }
+}

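
Editor's note: a minimal usage sketch of the class added above, not part of the commit. The class name CharacterSplitterExample, the sample text, and the size of 10 are invented for illustration; the constructors and Document/TextSegment calls are taken from this diff.

    import dev.langchain4j.data.document.Document;
    import dev.langchain4j.data.document.splitter.DocumentByCharacterSplitter;
    import dev.langchain4j.data.segment.TextSegment;

    import java.util.List;

    class CharacterSplitterExample {
        public static void main(String[] args) {
            // Each segment holds at most 10 characters, joined without delimiters.
            DocumentByCharacterSplitter splitter = new DocumentByCharacterSplitter(10);
            List<TextSegment> segments = splitter.split(Document.from("This text is split into 10-char pieces."));
            segments.forEach(segment -> System.out.println(segment.text()));
        }
    }
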
@@ -0,0 +1,61 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+
+/**
+ * Splits the provided {@link Document} into lines and attempts to fit as many lines as possible
+ * into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
+ * <p>
+ * The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
+ * For token-based limit, a {@link Tokenizer} must be provided.
+ * <p>
+ * Line boundaries are detected by a minimum of one newline character ("\n").
+ * Any additional whitespaces before or after are ignored.
+ * So, the following examples are all valid line separators: "\n", "\n\n", " \n", "\n " and so on.
+ * <p>
+ * If multiple lines fit within {@code maxSegmentSize}, they are joined together using a newline ("\n").
+ * <p>
+ * If a single line is too long and exceeds {@code maxSegmentSize},
+ * the {@code subSplitter} ({@link DocumentBySentenceSplitter} by default) is used to split it into smaller parts and
+ * place them into multiple segments.
+ * Such segments contain only the parts of the split long line.
+ * <p>
+ * Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
+ * representing its position within the document (starting from 0).
+ */
+public class DocumentByLineSplitter extends HierarchicalDocumentSplitter {
+
+    public DocumentByLineSplitter(int maxSegmentSizeInChars) {
+        super(maxSegmentSizeInChars, null, null);
+    }
+
+    public DocumentByLineSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInChars, null, subSplitter);
+    }
+
+    public DocumentByLineSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
+        super(maxSegmentSizeInTokens, tokenizer, null);
+    }
+
+    public DocumentByLineSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInTokens, tokenizer, subSplitter);
+    }
+
+    @Override
+    public String[] split(String text) {
+        return text.split("\\s*\\R\\s*"); // additional whitespaces are ignored
+    }
+
+    @Override
+    public String joinDelimiter() {
+        return "\n";
+    }
+
+    @Override
+    protected DocumentSplitter defaultSubSplitter() {
+        return new DocumentBySentenceSplitter(maxSegmentSize, tokenizer);
+    }
+}

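
Editor's note: a minimal usage sketch, not part of the commit. LineSplitterExample, the 40-character budget, and the sample text are invented; per the defaultSubSplitter above, a line that exceeds the budget falls back to the sentence sub-splitter.

    import dev.langchain4j.data.document.Document;
    import dev.langchain4j.data.document.DocumentSplitter;
    import dev.langchain4j.data.document.splitter.DocumentByLineSplitter;
    import dev.langchain4j.data.segment.TextSegment;

    import java.util.List;

    class LineSplitterExample {
        public static void main(String[] args) {
            // At most 40 characters per segment; whole lines are packed together with "\n".
            DocumentSplitter splitter = new DocumentByLineSplitter(40);
            List<TextSegment> segments = splitter.split(Document.from("First line.\nSecond line.\nThird line."));
            segments.forEach(segment -> System.out.println(segment.text()));
        }
    }
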
@@ -0,0 +1,61 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+
+/**
+ * Splits the provided {@link Document} into paragraphs and attempts to fit as many paragraphs as possible
+ * into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
+ * <p>
+ * The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
+ * For token-based limit, a {@link Tokenizer} must be provided.
+ * <p>
+ * Paragraph boundaries are detected by a minimum of two newline characters ("\n\n").
+ * Any additional whitespaces before, between, or after are ignored.
+ * So, the following examples are all valid paragraph separators: "\n\n", "\n\n\n", "\n \n", " \n \n ", and so on.
+ * <p>
+ * If multiple paragraphs fit within {@code maxSegmentSize}, they are joined together using a double newline ("\n\n").
+ * <p>
+ * If a single paragraph is too long and exceeds {@code maxSegmentSize},
+ * the {@code subSplitter} ({@link DocumentBySentenceSplitter} by default) is used to split it into smaller parts and
+ * place them into multiple segments.
+ * Such segments contain only the parts of the split long paragraph.
+ * <p>
+ * Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
+ * representing its position within the document (starting from 0).
+ */
+public class DocumentByParagraphSplitter extends HierarchicalDocumentSplitter {
+
+    public DocumentByParagraphSplitter(int maxSegmentSizeInChars) {
+        super(maxSegmentSizeInChars, null, null);
+    }
+
+    public DocumentByParagraphSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInChars, null, subSplitter);
+    }
+
+    public DocumentByParagraphSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
+        super(maxSegmentSizeInTokens, tokenizer, null);
+    }
+
+    public DocumentByParagraphSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInTokens, tokenizer, subSplitter);
+    }
+
+    @Override
+    public String[] split(String text) {
+        return text.split("\\s*\\R\\s*\\R\\s*"); // additional whitespaces are ignored
+    }
+
+    @Override
+    public String joinDelimiter() {
+        return "\n\n";
+    }
+
+    @Override
+    protected DocumentSplitter defaultSubSplitter() {
+        return new DocumentBySentenceSplitter(maxSegmentSize, tokenizer);
+    }
+}

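
Editor's note: a token-based usage sketch mirroring the tests later in this commit; ParagraphSplitterExample and the sample text are invented, while the 65-token budget and OpenAiTokenizer(GPT_3_5_TURBO) come straight from DocumentByParagraphSplitterTest.

    import dev.langchain4j.data.document.Document;
    import dev.langchain4j.data.document.DocumentSplitter;
    import dev.langchain4j.data.document.splitter.DocumentByParagraphSplitter;
    import dev.langchain4j.data.segment.TextSegment;
    import dev.langchain4j.model.openai.OpenAiTokenizer;

    import java.util.List;

    import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;

    class ParagraphSplitterExample {
        public static void main(String[] args) {
            // Token-based budget: at most 65 tokens per segment, counted with the OpenAI tokenizer.
            DocumentSplitter splitter = new DocumentByParagraphSplitter(65, new OpenAiTokenizer(GPT_3_5_TURBO));
            List<TextSegment> segments = splitter.split(Document.from("First paragraph.\n\nSecond paragraph."));
            segments.forEach(segment -> System.out.println(segment.text()));
        }
    }
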
@@ -0,0 +1,81 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+
+import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
+
+/**
+ * Splits the provided {@link Document} into parts using the provided {@code regex} and attempts to fit as many parts
+ * as possible into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
+ * <p>
+ * The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
+ * For token-based limit, a {@link Tokenizer} must be provided.
+ * <p>
+ * If multiple parts fit within {@code maxSegmentSize}, they are joined together using the provided {@code joinDelimiter}.
+ * <p>
+ * If a single part is too long and exceeds {@code maxSegmentSize}, the {@code subSplitter} (which should be provided)
+ * is used to split it into sub-parts and place them into multiple segments.
+ * Such segments contain only the sub-parts of the split long part.
+ * <p>
+ * Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
+ * representing its position within the document (starting from 0).
+ */
+public class DocumentByRegexSplitter extends HierarchicalDocumentSplitter {
+
+    private final String regex;
+    private final String joinDelimiter;
+
+    public DocumentByRegexSplitter(String regex,
+                                   String joinDelimiter,
+                                   int maxSegmentSizeInChars) {
+        super(maxSegmentSizeInChars, null, null);
+        this.regex = ensureNotNull(regex, "regex");
+        this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
+    }
+
+    public DocumentByRegexSplitter(String regex,
+                                   String joinDelimiter,
+                                   int maxSegmentSizeInChars,
+                                   DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInChars, null, subSplitter);
+        this.regex = ensureNotNull(regex, "regex");
+        this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
+    }
+
+    public DocumentByRegexSplitter(String regex,
+                                   String joinDelimiter,
+                                   int maxSegmentSizeInTokens,
+                                   Tokenizer tokenizer) {
+        super(maxSegmentSizeInTokens, tokenizer, null);
+        this.regex = ensureNotNull(regex, "regex");
+        this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
+    }
+
+    public DocumentByRegexSplitter(String regex,
+                                   String joinDelimiter,
+                                   int maxSegmentSizeInTokens,
+                                   Tokenizer tokenizer,
+                                   DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInTokens, tokenizer, subSplitter);
+        this.regex = ensureNotNull(regex, "regex");
+        this.joinDelimiter = ensureNotNull(joinDelimiter, "joinDelimiter");
+    }
+
+    @Override
+    public String[] split(String text) {
+        return text.split(regex);
+    }
+
+    @Override
+    public String joinDelimiter() {
+        return joinDelimiter;
+    }
+
+    @Override
+    protected DocumentSplitter defaultSubSplitter() {
+        return null;
+    }
+}

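
Editor's note: an illustrative sketch, not part of the commit. RegexSplitterExample, the comma regex, and the 20-character budget are invented; supplying an explicit word sub-splitter follows the pattern in DocumentByRegexSplitterTest, since this class has no default sub-splitter.

    import dev.langchain4j.data.document.Document;
    import dev.langchain4j.data.document.DocumentSplitter;
    import dev.langchain4j.data.document.splitter.DocumentByRegexSplitter;
    import dev.langchain4j.data.document.splitter.DocumentByWordSplitter;
    import dev.langchain4j.data.segment.TextSegment;

    import java.util.List;

    class RegexSplitterExample {
        public static void main(String[] args) {
            // Split on commas, re-join fitting parts with ", ", and hand over-long
            // parts to a word sub-splitter (defaultSubSplitter() returns null here).
            DocumentSplitter subSplitter = new DocumentByWordSplitter(20);
            DocumentSplitter splitter = new DocumentByRegexSplitter(",\\s*", ", ", 20, subSplitter);
            List<TextSegment> segments = splitter.split(Document.from("alpha, beta, gamma, delta"));
            segments.forEach(segment -> System.out.println(segment.text()));
        }
    }
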
@@ -0,0 +1,78 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+
+import java.io.InputStream;
+
+/**
+ * Splits the provided {@link Document} into sentences and attempts to fit as many sentences as possible
+ * into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
+ * <p>
+ * The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
+ * For token-based limit, a {@link Tokenizer} must be provided.
+ * <p>
+ * Sentence boundaries are detected using the Apache OpenNLP library with the English sentence model.
+ * <p>
+ * If multiple sentences fit within {@code maxSegmentSize}, they are joined together using a space (" ").
+ * <p>
+ * If a single sentence is too long and exceeds {@code maxSegmentSize},
+ * the {@code subSplitter} ({@link DocumentByWordSplitter} by default) is used to split it into smaller parts and
+ * place them into multiple segments.
+ * Such segments contain only the parts of the split long sentence.
+ * <p>
+ * Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
+ * representing its position within the document (starting from 0).
+ */
+public class DocumentBySentenceSplitter extends HierarchicalDocumentSplitter {
+
+    private final SentenceDetectorME sentenceDetector;
+
+    public DocumentBySentenceSplitter(int maxSegmentSizeInChars) {
+        super(maxSegmentSizeInChars, null, null);
+        this.sentenceDetector = createSentenceDetector();
+    }
+
+    public DocumentBySentenceSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInChars, null, subSplitter);
+        this.sentenceDetector = createSentenceDetector();
+    }
+
+    public DocumentBySentenceSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
+        super(maxSegmentSizeInTokens, tokenizer, null);
+        this.sentenceDetector = createSentenceDetector();
+    }
+
+    public DocumentBySentenceSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInTokens, tokenizer, subSplitter);
+        this.sentenceDetector = createSentenceDetector();
+    }
+
+    private SentenceDetectorME createSentenceDetector() {
+        String sentenceModelFilePath = "/opennlp/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin";
+        try (InputStream is = getClass().getResourceAsStream(sentenceModelFilePath)) {
+            return new SentenceDetectorME(new SentenceModel(is));
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public String[] split(String text) {
+        return sentenceDetector.sentDetect(text);
+    }
+
+    @Override
+    public String joinDelimiter() {
+        return " ";
+    }
+
+    @Override
+    protected DocumentSplitter defaultSubSplitter() {
+        return new DocumentByWordSplitter(maxSegmentSize, tokenizer);
+    }
+}

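
Editor's note: a minimal usage sketch, not part of the commit. SentenceSplitterExample, the budget, and the text are invented; per createSentenceDetector() above, the bundled OpenNLP English sentence model must be on the classpath or construction throws.

    import dev.langchain4j.data.document.Document;
    import dev.langchain4j.data.document.DocumentSplitter;
    import dev.langchain4j.data.document.splitter.DocumentBySentenceSplitter;
    import dev.langchain4j.data.segment.TextSegment;

    import java.util.List;

    class SentenceSplitterExample {
        public static void main(String[] args) {
            // At most 60 characters per segment; whole sentences are packed together with " ".
            DocumentSplitter splitter = new DocumentBySentenceSplitter(60);
            List<TextSegment> segments = splitter.split(Document.from("This is one sentence. This is another one."));
            segments.forEach(segment -> System.out.println(segment.text()));
        }
    }
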
@@ -0,0 +1,61 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+
+/**
+ * Splits the provided {@link Document} into words and attempts to fit as many words as possible
+ * into a single {@link TextSegment}, adhering to the limit set by {@code maxSegmentSize}.
+ * <p>
+ * The {@code maxSegmentSize} can be defined in terms of characters (default) or tokens.
+ * For token-based limit, a {@link Tokenizer} must be provided.
+ * <p>
+ * Word boundaries are detected by a minimum of one space (" ").
+ * Any additional whitespaces before or after are ignored.
+ * So, the following examples are all valid word separators: " ", "  ", "\n", and so on.
+ * <p>
+ * If multiple words fit within {@code maxSegmentSize}, they are joined together using a space (" ").
+ * <p>
+ * Although this should not happen, if a single word is too long and exceeds {@code maxSegmentSize},
+ * the {@code subSplitter} ({@link DocumentByCharacterSplitter} by default) is used to split it into smaller parts and
+ * place them into multiple segments.
+ * Such segments contain only the parts of the split long word.
+ * <p>
+ * Each {@link TextSegment} inherits all metadata from the {@link Document} and includes an "index" metadata key
+ * representing its position within the document (starting from 0).
+ */
+public class DocumentByWordSplitter extends HierarchicalDocumentSplitter {
+
+    public DocumentByWordSplitter(int maxSegmentSizeInChars) {
+        super(maxSegmentSizeInChars, null, null);
+    }
+
+    public DocumentByWordSplitter(int maxSegmentSizeInChars, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInChars, null, subSplitter);
+    }
+
+    public DocumentByWordSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
+        super(maxSegmentSizeInTokens, tokenizer, null);
+    }
+
+    public DocumentByWordSplitter(int maxSegmentSizeInTokens, Tokenizer tokenizer, DocumentSplitter subSplitter) {
+        super(maxSegmentSizeInTokens, tokenizer, subSplitter);
+    }
+
+    @Override
+    public String[] split(String text) {
+        return text.split("\\s+"); // additional whitespaces are ignored
+    }
+
+    @Override
+    public String joinDelimiter() {
+        return " ";
+    }
+
+    @Override
+    protected DocumentSplitter defaultSubSplitter() {
+        return new DocumentByCharacterSplitter(maxSegmentSize, tokenizer);
+    }
+}

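
Editor's note: a minimal usage sketch, not part of the commit. WordSplitterExample is invented; the 15-character budget, input line, and expected output come from the DocumentByRegexSplitterTest below, which uses this class as a sub-splitter on the same input.

    import dev.langchain4j.data.document.Document;
    import dev.langchain4j.data.document.DocumentSplitter;
    import dev.langchain4j.data.document.splitter.DocumentByWordSplitter;
    import dev.langchain4j.data.segment.TextSegment;

    import java.util.List;

    class WordSplitterExample {
        public static void main(String[] args) {
            // Packs whole words into segments of at most 15 characters.
            DocumentSplitter splitter = new DocumentByWordSplitter(15);
            List<TextSegment> segments = splitter.split(Document.from("This is a first line."));
            segments.forEach(segment -> System.out.println(segment.text())); // "This is a first" then "line."
        }
    }
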
@@ -0,0 +1,39 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.model.Tokenizer;
+
+public class DocumentSplitters {
+
+    /**
+     * This is a recommended {@link DocumentSplitter} for generic text.
+     * It tries to split the document into paragraphs first and fits
+     * as many paragraphs into a single {@link dev.langchain4j.data.segment.TextSegment} as possible.
+     * If some paragraphs are too long, they are recursively split into lines, then sentences,
+     * then words, and then characters until they fit into a segment.
+     *
+     * @param maxSegmentSizeInTokens The maximum size of the segment, defined in tokens.
+     * @param tokenizer              The tokenizer that is used to count tokens in the text.
+     * @return recursive document splitter
+     */
+    public static DocumentSplitter recursive(int maxSegmentSizeInTokens, Tokenizer tokenizer) {
+        return new DocumentByParagraphSplitter(maxSegmentSizeInTokens, tokenizer,
+                new DocumentByLineSplitter(maxSegmentSizeInTokens, tokenizer,
+                        new DocumentBySentenceSplitter(maxSegmentSizeInTokens, tokenizer,
+                                new DocumentByWordSplitter(maxSegmentSizeInTokens, tokenizer))));
+    }
+
+    /**
+     * This is a recommended {@link DocumentSplitter} for generic text.
+     * It tries to split the document into paragraphs first and fits
+     * as many paragraphs into a single {@link dev.langchain4j.data.segment.TextSegment} as possible.
+     * If some paragraphs are too long, they are recursively split into lines, then sentences,
+     * then words, and then characters until they fit into a segment.
+     *
+     * @param maxSegmentSizeInChars The maximum size of the segment, defined in characters.
+     * @return recursive document splitter
+     */
+    public static DocumentSplitter recursive(int maxSegmentSizeInChars) {
+        return recursive(maxSegmentSizeInChars, null);
+    }
+}

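
Editor's note: a minimal usage sketch of the recommended entry point above, not part of the commit. RecursiveSplitterExample, the 100-token budget, and the sample text are invented; the tokenizer construction matches the tests in this commit.

    import dev.langchain4j.data.document.Document;
    import dev.langchain4j.data.document.DocumentSplitter;
    import dev.langchain4j.data.document.splitter.DocumentSplitters;
    import dev.langchain4j.data.segment.TextSegment;
    import dev.langchain4j.model.openai.OpenAiTokenizer;

    import java.util.List;

    import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;

    class RecursiveSplitterExample {
        public static void main(String[] args) {
            // 100-token budget per segment: paragraphs first, then lines,
            // sentences, and words if something is still too large.
            DocumentSplitter splitter = DocumentSplitters.recursive(100, new OpenAiTokenizer(GPT_3_5_TURBO));
            List<TextSegment> segments = splitter.split(Document.from("First paragraph.\n\nSecond paragraph."));
            segments.forEach(segment -> System.out.println(segment.text()));
        }
    }
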
@@ -0,0 +1,113 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.document.Metadata;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static dev.langchain4j.internal.Utils.firstChars;
+import static dev.langchain4j.internal.ValidationUtils.ensureGreaterThanZero;
+import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
+import static java.lang.String.format;
+
+public abstract class HierarchicalDocumentSplitter implements DocumentSplitter {
+
+    private static final String INDEX = "index";
+
+    protected final int maxSegmentSize;
+    protected final Tokenizer tokenizer;
+
+    protected final DocumentSplitter subSplitter;
+
+    protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars) {
+        this(maxSegmentSizeInChars, null, null);
+    }
+
+    protected HierarchicalDocumentSplitter(int maxSegmentSizeInChars,
+                                           DocumentSplitter subSplitter) {
+        this(maxSegmentSizeInChars, null, subSplitter);
+    }
+
+    protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens,
+                                           Tokenizer tokenizer) {
+        this(maxSegmentSizeInTokens, tokenizer, null);
+    }
+
+    protected HierarchicalDocumentSplitter(int maxSegmentSizeInTokens,
+                                           Tokenizer tokenizer,
+                                           DocumentSplitter subSplitter) {
+        this.maxSegmentSize = ensureGreaterThanZero(maxSegmentSizeInTokens, "maxSegmentSize");
+        this.tokenizer = tokenizer;
+        this.subSplitter = subSplitter == null ? defaultSubSplitter() : subSplitter;
+    }
+
+    protected abstract String[] split(String text);
+
+    protected abstract String joinDelimiter();
+
+    protected abstract DocumentSplitter defaultSubSplitter();
+
+    @Override
+    public List<TextSegment> split(Document document) {
+        ensureNotNull(document, "document");
+
+        List<TextSegment> segments = new ArrayList<>();
+        SegmentBuilder segmentBuilder = new SegmentBuilder(maxSegmentSize, this::sizeOf, joinDelimiter());
+        AtomicInteger index = new AtomicInteger(0);
+
+        String[] parts = split(document.text());
+        for (String part : parts) {
+            if (segmentBuilder.hasSpaceFor(part)) {
+                segmentBuilder.append(part);
+            } else {
+                if (segmentBuilder.isNotEmpty()) {
+                    segments.add(createSegment(segmentBuilder.build(), document, index.getAndIncrement()));
+                    segmentBuilder.reset();
+                }
+                if (segmentBuilder.hasSpaceFor(part)) {
+                    segmentBuilder.append(part);
+                } else {
+                    if (subSplitter == null) {
+                        throw new RuntimeException(format(
+                                "The text \"%s...\" (%s %s long) doesn't fit into the maximum segment size (%s %s), " +
+                                        "and there is no subSplitter defined to split it further.",
+                                firstChars(part, 30),
+                                sizeOf(part), tokenizer == null ? "characters" : "tokens",
+                                maxSegmentSize, tokenizer == null ? "characters" : "tokens"
+
+                        ));
+                    }
+                    for (TextSegment segment : subSplitter.split(Document.from(part))) {
+                        segments.add(createSegment(segment.text(), document, index.getAndIncrement()));
+                        segmentBuilder.reset();
+                    }
+                }
+            }
+        }
+
+        if (segmentBuilder.isNotEmpty()) {
+            segments.add(createSegment(segmentBuilder.build(), document, index.getAndIncrement()));
+            segmentBuilder.reset();
+        }
+
+        return segments;
+    }
+
+    private int sizeOf(String text) {
+        if (tokenizer != null) {
+            return tokenizer.estimateTokenCountInText(text);
+        } else {
+            return text.length();
+        }
+    }
+
+    private static TextSegment createSegment(String text, Document document, int index) {
+        Metadata metadata = document.metadata().copy().add(INDEX, index);
+        return TextSegment.from(text, metadata);
+    }
+}

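
Editor's note: a hypothetical subclass, not part of the commit, showing the three extension points the abstract class defines (split, joinDelimiter, defaultSubSplitter). The semicolon-based splitter and its name are invented; the structure mirrors the concrete splitters above.

    package dev.langchain4j.data.document.splitter;

    import dev.langchain4j.data.document.DocumentSplitter;

    // Hypothetical example: splits text into clauses on semicolons and falls back
    // to the word splitter when a single clause exceeds maxSegmentSize.
    public class DocumentBySemicolonSplitter extends HierarchicalDocumentSplitter {

        public DocumentBySemicolonSplitter(int maxSegmentSizeInChars) {
            super(maxSegmentSizeInChars, null, null);
        }

        @Override
        public String[] split(String text) {
            return text.split(";\\s*");
        }

        @Override
        public String joinDelimiter() {
            return "; ";
        }

        @Override
        protected DocumentSplitter defaultSubSplitter() {
            return new DocumentByWordSplitter(maxSegmentSize, tokenizer);
        }
    }
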
@@ -1,27 +0,0 @@
-package dev.langchain4j.data.document.splitter;
-
-import dev.langchain4j.data.document.Document;
-import dev.langchain4j.data.segment.TextSegment;
-import dev.langchain4j.data.document.DocumentSplitter;
-
-import java.util.List;
-
-import static java.util.Arrays.stream;
-import static java.util.stream.Collectors.toList;
-
-public class ParagraphSplitter implements DocumentSplitter {
-
-    @Override
-    public List<TextSegment> split(Document document) {
-        String text = document.text();
-        if (text == null || text.isEmpty()) {
-            throw new IllegalArgumentException("Document text should not be null or empty");
-        }
-
-        String[] paragraphs = text.split("\\R\\R");
-
-        return stream(paragraphs)
-                .map(paragraph -> TextSegment.from(paragraph.trim(), document.metadata()))
-                .collect(toList());
-    }
-}

@@ -1,33 +0,0 @@
-package dev.langchain4j.data.document.splitter;
-
-import dev.langchain4j.data.document.Document;
-import dev.langchain4j.data.segment.TextSegment;
-import dev.langchain4j.data.document.DocumentSplitter;
-
-import java.util.List;
-
-import static java.util.Arrays.stream;
-import static java.util.stream.Collectors.toList;
-
-public class RegexSplitter implements DocumentSplitter {
-
-    private final String regex;
-
-    public RegexSplitter(String regex) {
-        this.regex = regex;
-    }
-
-    @Override
-    public List<TextSegment> split(Document document) {
-        String text = document.text();
-        if (text == null || text.isEmpty()) {
-            throw new IllegalArgumentException("Document text should not be null or empty");
-        }
-
-        String[] segments = text.split(regex);
-
-        return stream(segments)
-                .map(segment -> TextSegment.from(segment, document.metadata()))
-                .collect(toList());
-    }
-}

@@ -0,0 +1,62 @@
+package dev.langchain4j.data.document.splitter;
+
+import java.util.function.Function;
+
+import static dev.langchain4j.internal.ValidationUtils.ensureGreaterThanZero;
+import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
+
+class SegmentBuilder {
+
+    private StringBuilder segmentBuilder;
+
+    private final int maxSegmentSize;
+    private final Function<String, Integer> sizeFunction;
+
+    private final String joinSeparator;
+
+    SegmentBuilder(int maxSegmentSize, Function<String, Integer> sizeFunction, String joinSeparator) {
+        this.segmentBuilder = new StringBuilder();
+        this.maxSegmentSize = ensureGreaterThanZero(maxSegmentSize, "maxSegmentSize");
+        this.sizeFunction = ensureNotNull(sizeFunction, "sizeFunction");
+        this.joinSeparator = ensureNotNull(joinSeparator, "joinSeparator");
+    }
+
+    boolean hasSpaceFor(String text) {
+        return hasSpaceFor(text, joinSeparator);
+    }
+
+    boolean hasSpaceFor(String text, String separator) {
+        if (isNotEmpty()) {
+            return sizeOf(segmentBuilder.toString()) + sizeOf(separator) + sizeOf(text) <= maxSegmentSize;
+        } else {
+            return sizeOf(text) <= maxSegmentSize;
+        }
+    }
+
+    private int sizeOf(String text) {
+        return sizeFunction.apply(text);
+    }
+
+    void append(String text) {
+        append(text, joinSeparator);
+    }
+
+    void append(String text, String separator) {
+        if (segmentBuilder.length() > 0) {
+            segmentBuilder.append(separator);
+        }
+        segmentBuilder.append(text);
+    }
+
+    boolean isNotEmpty() {
+        return segmentBuilder.length() > 0;
+    }
+
+    String build() {
+        return segmentBuilder.toString().trim();
+    }
+
+    void reset() {
+        segmentBuilder = new StringBuilder();
+    }
+}

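
Editor's note: an invented demo, not part of the commit, of the accumulate/flush pattern HierarchicalDocumentSplitter drives: append while parts fit, flush and reset when they no longer do. Because SegmentBuilder is package-private, this sketch assumes it lives in the same package; the 12-character budget and words are arbitrary.

    package dev.langchain4j.data.document.splitter;

    import java.util.ArrayList;
    import java.util.List;

    class SegmentBuilderDemo {
        public static void main(String[] args) {
            // Character-based sizing via String::length, words joined with " ".
            SegmentBuilder builder = new SegmentBuilder(12, String::length, " ");
            List<String> segments = new ArrayList<>();
            for (String word : new String[]{"one", "two", "three", "four"}) {
                if (!builder.hasSpaceFor(word) && builder.isNotEmpty()) {
                    segments.add(builder.build()); // flush the full segment
                    builder.reset();
                }
                builder.append(word);
            }
            if (builder.isNotEmpty()) {
                segments.add(builder.build());
            }
            System.out.println(segments); // [one two, three four]
        }
    }
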
@@ -1,45 +0,0 @@
-package dev.langchain4j.data.document.splitter;
-
-import dev.langchain4j.data.document.Document;
-import dev.langchain4j.data.segment.TextSegment;
-import dev.langchain4j.data.document.DocumentSplitter;
-
-import java.text.BreakIterator;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-import static java.util.stream.Collectors.toList;
-
-public class SentenceSplitter implements DocumentSplitter {
-
-    @Override
-    public List<TextSegment> split(Document document) {
-        String text = document.text();
-        if (text == null || text.isEmpty()) {
-            throw new IllegalArgumentException("Document text should not be null or empty");
-        }
-
-        List<String> sentences = splitIntoSentences(text);
-
-        return sentences.stream()
-                .map(sentence -> TextSegment.from(sentence.trim(), document.metadata()))
-                .collect(toList());
-    }
-
-    private List<String> splitIntoSentences(String text) {
-        List<String> sentences = new ArrayList<>();
-
-        BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.getDefault());
-        iterator.setText(text);
-
-        int start = iterator.first();
-        for (int end = iterator.next();
-             end != BreakIterator.DONE;
-             start = end, end = iterator.next()) {
-            sentences.add(text.substring(start, end).trim());
-        }
-
-        return sentences;
-    }
-}

Binary file not shown.

@@ -1,65 +0,0 @@
-package dev.langchain4j.data.document.splitter;
-
-import dev.langchain4j.data.document.DocumentSplitter;
-import dev.langchain4j.data.segment.TextSegment;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.CsvSource;
-import org.junit.jupiter.params.provider.NullAndEmptySource;
-
-import java.util.List;
-
-import static dev.langchain4j.data.document.Document.document;
-import static dev.langchain4j.data.segment.TextSegment.textSegment;
-import static org.assertj.core.api.Assertions.assertThat;
-import static org.assertj.core.api.Assertions.assertThatThrownBy;
-
-class CharacterSplitterTest {
-
-    @Test
-    void should_split_with_overlap() {
-        DocumentSplitter splitter = new CharacterSplitter(4, 2);
-
-        List<TextSegment> segments = splitter.split(document("1234567890"));
-
-        assertThat(segments).containsExactly(
-                textSegment("1234"),
-                textSegment("3456"),
-                textSegment("5678"),
-                textSegment("7890")
-        );
-    }
-
-    @Test
-    void should_split_without_overlap() {
-        DocumentSplitter splitter = new CharacterSplitter(4, 0);
-
-        List<TextSegment> segments = splitter.split(document("1234567890"));
-
-        assertThat(segments).containsExactly(
-                textSegment("1234"),
-                textSegment("5678"),
-                textSegment("90")
-        );
-    }
-
-    @ParameterizedTest
-    @CsvSource({"0,-1", "-1,-1", "-1,0", "0,0", "0,1", "1,-1", "1,1", "1,2"})
-    void should_fail_on_invalid_length_or_overlap(int segmentLength, int segmentOverlap) {
-        DocumentSplitter splitter = new CharacterSplitter(segmentLength, segmentOverlap);
-
-        assertThatThrownBy(() -> splitter.split(document("any")))
-                .isExactlyInstanceOf(IllegalArgumentException.class)
-                .hasMessage("Invalid segmentLength (%s) or segmentOverlap (%s)", segmentLength, segmentOverlap);
-    }
-
-    @ParameterizedTest
-    @NullAndEmptySource
-    void testNullCase(String documentText) {
-        DocumentSplitter splitter = new CharacterSplitter(4, 2);
-
-        assertThatThrownBy(() -> splitter.split(document(documentText)))
-                .isExactlyInstanceOf(IllegalArgumentException.class)
-                .hasMessage("text cannot be null or blank");
-    }
-}

@@ -0,0 +1,257 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.Tokenizer;
+import dev.langchain4j.model.openai.OpenAiTokenizer;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+import static dev.langchain4j.data.document.Metadata.metadata;
+import static dev.langchain4j.data.segment.TextSegment.textSegment;
+import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
+import static java.lang.String.format;
+import static org.assertj.core.api.Assertions.assertThat;
+
+class DocumentByParagraphSplitterTest {
+
+    @Test
+    void should_split_into_segments_with_one_paragraph_per_segment() {
+
+        int maxSegmentSize = 30;
+
+        String firstParagraph = "This is a first paragraph.";
+        assertThat(firstParagraph).hasSizeLessThan(maxSegmentSize);
+
+        String secondParagraph = "This is a second paragraph.";
+        assertThat(secondParagraph).hasSizeLessThan(maxSegmentSize);
+
+        assertThat(firstParagraph + "\n \n" + secondParagraph).hasSizeGreaterThan(maxSegmentSize);
+
+        Document document = Document.from(
+                format(" %s \n \n %s ", firstParagraph, secondParagraph),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(firstParagraph, metadata("index", "0").add("document", "0")),
+                textSegment(secondParagraph, metadata("index", "1").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_into_segments_with_multiple_paragraphs_per_segment() {
+
+        int maxSegmentSize = 60;
+
+        String firstParagraph = "This is a first paragraph.";
+        String secondParagraph = "This is a second paragraph.";
+        assertThat(firstParagraph + secondParagraph).hasSizeLessThan(maxSegmentSize);
+
+        String thirdParagraph = "This is a third paragraph.";
+        assertThat(thirdParagraph).hasSizeLessThan(maxSegmentSize);
+
+        assertThat(firstParagraph + secondParagraph + thirdParagraph)
+                .hasSizeGreaterThan(maxSegmentSize);
+
+        Document document = Document.from(
+                format(" %s \n \n %s \n \n %s ", firstParagraph, secondParagraph, thirdParagraph),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(firstParagraph + "\n\n" + secondParagraph, metadata("index", "0").add("document", "0")),
+                textSegment(thirdParagraph, metadata("index", "1").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_paragraph_into_sentences_if_it_does_not_fit_into_segment() {
+
+        int maxSegmentSize = 50;
+
+        String firstParagraph = "This is a first paragraph.";
+        assertThat(firstParagraph).hasSizeLessThan(maxSegmentSize);
+
+        String firstSentenceOfSecondParagraph = "This is a fist sentence of a second paragraph.";
+        assertThat(firstSentenceOfSecondParagraph).hasSizeLessThan(maxSegmentSize);
+
+        String secondSentenceOfSecondParagraph = "This is a second sentence of a second paragraph.";
+        assertThat(secondSentenceOfSecondParagraph).hasSizeLessThan(maxSegmentSize);
+
+        String secondParagraph = firstSentenceOfSecondParagraph + " " + secondSentenceOfSecondParagraph;
+        assertThat(secondParagraph).hasSizeGreaterThan(maxSegmentSize);
+
+        String thirdParagraph = "This is a third paragraph.";
+        assertThat(thirdParagraph).hasSizeLessThan(maxSegmentSize);
+
+        Document document = Document.from(
+                format(" %s \n \n %s \n \n %s ", firstParagraph, secondParagraph, thirdParagraph),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(firstParagraph, metadata("index", "0").add("document", "0")),
+                textSegment(firstSentenceOfSecondParagraph, metadata("index", "1").add("document", "0")),
+                textSegment(secondSentenceOfSecondParagraph, metadata("index", "2").add("document", "0")),
+                textSegment(thirdParagraph, metadata("index", "3").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_sample_text_containing_multiple_paragraphs() {
+
+        int maxSegmentSize = 65;
+        Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
+
+        String p1 = "In a small town nestled between two vast mountains, there was a shop unlike any other. " +
+                "A unique haven. " +
+                "Visitors would often comment on its peculiar charm, always slightly different from what they " +
+                "remembered on their previous visits. " +
+                "The store stood as a testament to the passage of time and the ever-changing landscape of tales.";
+        assertThat(tokenizer.estimateTokenCountInText(p1)).isEqualTo(62);
+
+        String p2p1 = "Upon entering, the first thing to strike you was the enormity of it all. " +
+                "Every inch of space was occupied with books. " +
+                "Some stood tall and regal on the highest shelves, looking as if they had witnessed epochs come and go. " +
+                "They were leather-bound, with pages yellowed by age.";
+        assertThat(tokenizer.estimateTokenCountInText(p2p1)).isEqualTo(60);
+        String p2p2 = "Others, smaller and brightly adorned, were reminiscent of summer days and childhood laughter. " +
+                "But these physical objects were mere vessels. " +
+                "It was the stories inside that held power.";
+        assertThat(tokenizer.estimateTokenCountInText(p2p2)).isEqualTo(33);
+
+        String p3 = "Mrs. Jenkins ran the shop. " +
+                "A mystery in her own right. " +
+                "Her silver hair cascaded like a waterfall, and her eyes seemed to see more than most. " +
+                "With just a glance, she'd find the perfect story for you.";
+        assertThat(tokenizer.estimateTokenCountInText(p3)).isEqualTo(47);
+
+        String p4p1 = "One wet afternoon, Eli entered. " +
+                "He was just a boy, lost in the vastness of the store. " +
+                "Between the aisles, his small fingers danced on the spines of books, feeling the heartbeat of " +
+                "countless tales. " +
+                "Then, a simple brown-covered book whispered to him.";
+        assertThat(tokenizer.estimateTokenCountInText(p4p1)).isEqualTo(56);
+        String p4p2 = "Without grandeur or pretense, it beckoned. " +
+                "And he listened.";
+        assertThat(tokenizer.estimateTokenCountInText(p4p2)).isEqualTo(15);
+
+        String p5 = "He read. " +
+                "And read. " +
+                "The world around him melted.";
+        assertThat(tokenizer.estimateTokenCountInText(p5)).isEqualTo(12);
+
+        String p6 = "When Mrs. Jenkins approached, night had fallen. " +
+                "She gently remarked, \"Books have a way of finding their reader.\" " +
+                "Eli simply nodded, understanding the profound truth in her words.";
+        assertThat(tokenizer.estimateTokenCountInText(p6)).isEqualTo(36);
+
+        String p7 = "Some places and stories remain etched in our souls, offering lessons and moments of sheer wonder. " +
+                "They defy definition.";
+        assertThat(tokenizer.estimateTokenCountInText(p7)).isEqualTo(23);
+
+        Document document = Document.from(
+                format("%s\n\n%s %s\n\n%s\n\n%s %s\n\n%s\n\n%s\n\n%s", p1, p2p1, p2p2, p3, p4p1, p4p2, p5, p6, p7),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize, tokenizer);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(tokenizer.estimateTokenCountInText(segment.text())).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(p1, metadata("index", "0").add("document", "0")),
+                textSegment(p2p1, metadata("index", "1").add("document", "0")),
+                textSegment(p2p2, metadata("index", "2").add("document", "0")),
+                textSegment(p3, metadata("index", "3").add("document", "0")),
+                textSegment(p4p1, metadata("index", "4").add("document", "0")),
+                textSegment(p4p2, metadata("index", "5").add("document", "0")),
+                textSegment(p5 + "\n\n" + p6, metadata("index", "6").add("document", "0")),
+                textSegment(p7, metadata("index", "7").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_sample_text_without_paragraphs() {
+
+        int maxSegmentSize = 100;
+        Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
+
+        String segment1 = "In a small town nestled between two vast mountains, there was a shop unlike any other. " +
+                "A unique haven. " +
+                "Visitors would often comment on its peculiar charm, always slightly different from what they " +
+                "remembered on their previous visits. " +
+                "The store stood as a testament to the passage of time and the ever-changing landscape of tales. " +
+                "Upon entering, the first thing to strike you was the enormity of it all. " +
+                "Every inch of space was occupied with books.";
+
+        String segment2 = "Some stood tall and regal on the highest shelves, " +
+                "looking as if they had witnessed epochs come and go. " +
+                "They were leather-bound, with pages yellowed by age. " +
+                "Others, smaller and brightly adorned, were reminiscent of summer days and childhood laughter. " +
+                "But these physical objects were mere vessels. " +
+                "It was the stories inside that held power. " +
+                "Mrs. Jenkins ran the shop. " +
+                "A mystery in her own right.";
+
+        String segment3 = "Her silver hair cascaded like a waterfall, and her eyes seemed to see more than most. " +
+                "With just a glance, she'd find the perfect story for you. " +
+                "One wet afternoon, Eli entered. " +
+                "He was just a boy, lost in the vastness of the store. " +
+                "Between the aisles, his small fingers danced on the spines of books, feeling the heartbeat of " +
+                "countless tales. " +
+                "Then, a simple brown-covered book whispered to him.";
+
+        String segment4 = "Without grandeur or pretense, it beckoned. " +
+                "And he listened. " +
+                "He read. " +
+                "And read. " +
+                "The world around him melted. " +
+                "When Mrs. Jenkins approached, night had fallen. " +
+                "She gently remarked, \"Books have a way of finding their reader.\" " +
+                "Eli simply nodded, understanding the profound truth in her words. " +
+                "Some places and stories remain etched in our souls, offering lessons and moments of sheer wonder. " +
+                "They defy definition.";
+
+        Document document = Document.from(
+                format("%s %s %s %s", segment1, segment2, segment3, segment4),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize, tokenizer);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(tokenizer.estimateTokenCountInText(segment.text())).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(segment1, metadata("index", "0").add("document", "0")),
+                textSegment(segment2, metadata("index", "1").add("document", "0")),
+                textSegment(segment3, metadata("index", "2").add("document", "0")),
+                textSegment(segment4, metadata("index", "3").add("document", "0"))
+        );
+    }
+}

@@ -0,0 +1,83 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.util.List;
+
+import static dev.langchain4j.data.document.Metadata.metadata;
+import static dev.langchain4j.data.segment.TextSegment.textSegment;
+import static java.lang.String.format;
+import static org.assertj.core.api.Assertions.assertThat;
+
+class DocumentByRegexSplitterTest {
+
+    @ParameterizedTest
+    @ValueSource(strings = {" ", ",", "\n", "\n\n"})
+    void should_split_by(String separator) {
+
+        String text = format("one%stwo%sthree", separator, separator);
+        Document document = Document.from(text, metadata("document", "0"));
+
+        int maxSegmentSize = 5;
+        DocumentSplitter splitter = new DocumentByRegexSplitter(separator, separator, maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment("one", metadata("index", "0").add("document", "0")),
+                textSegment("two", metadata("index", "1").add("document", "0")),
+                textSegment("three", metadata("index", "2").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_fit_multiple_parts_into_the_same_segment() {
+
+        Document document = Document.from("one two three", metadata("document", "0"));
+
+        int maxSegmentSize = 10;
+        DocumentSplitter splitter = new DocumentByRegexSplitter(" ", "\n", maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment("one\ntwo", metadata("index", "0").add("document", "0")),
+                textSegment("three", metadata("index", "1").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_part_into_sub_parts_if_it_does_not_fit_into_segment() {
+
+        Document document = Document.from(
+                "This is a first line.\nThis is a second line.\n\nThis is a third line.",
+                metadata("document", "0")
+        );
+
+        int maxSegmentSize = 15;
+        DocumentSplitter subSplitter = new DocumentByWordSplitter(maxSegmentSize);
+        DocumentSplitter splitter = new DocumentByRegexSplitter("\n", "\n", maxSegmentSize, subSplitter);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment("This is a first", metadata("index", "0").add("document", "0")),
+                textSegment("line.", metadata("index", "1").add("document", "0")),
+                textSegment("This is a", metadata("index", "2").add("document", "0")),
+                textSegment("second line.", metadata("index", "3").add("document", "0")),
+                textSegment("This is a third", metadata("index", "4").add("document", "0")),
+                textSegment("line.", metadata("index", "5").add("document", "0"))
+        );
+    }
+}

@@ -0,0 +1,176 @@
+package dev.langchain4j.data.document.splitter;
+
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.DocumentSplitter;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.openai.OpenAiTokenizer;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+import static dev.langchain4j.data.document.Metadata.metadata;
+import static dev.langchain4j.data.segment.TextSegment.textSegment;
+import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
+import static java.lang.String.format;
+import static org.assertj.core.api.Assertions.assertThat;
+
+class DocumentBySentenceSplitterTest {
+
+    @Test
+    void should_split_into_segments_with_one_sentence_per_segment() {
+
+        int maxSegmentSize = 30;
+
+        String firstSentence = "This is a first sentence.";
+        assertThat(firstSentence).hasSizeLessThan(maxSegmentSize);
+
+        String secondSentence = "This is a second sentence.";
+        assertThat(secondSentence).hasSizeLessThan(maxSegmentSize);
+
+        assertThat(firstSentence + " " + secondSentence).hasSizeGreaterThan(maxSegmentSize);
+
+        Document document = Document.from(
+                format(" %s %s ", firstSentence, secondSentence),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(firstSentence, metadata("index", "0").add("document", "0")),
+                textSegment(secondSentence, metadata("index", "1").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_into_segments_with_multiple_sentences_per_segment() {
+
+        int maxSegmentSize = 60;
+
+        String firstSentence = "This is a first sentence.";
+        String secondSentence = "This is a second sentence.";
+        assertThat(firstSentence + " " + secondSentence).hasSizeLessThan(maxSegmentSize);
+
+        String thirdSentence = "This is a third sentence.";
+        assertThat(firstSentence + " " + secondSentence + " " + thirdSentence)
+                .hasSizeGreaterThan(maxSegmentSize);
+
+        Document document = Document.from(
+                format(" %s %s %s ", firstSentence, secondSentence, thirdSentence),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(firstSentence + " " + secondSentence, metadata("index", "0").add("document", "0")),
+                textSegment(thirdSentence, metadata("index", "1").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_sentence_if_it_does_not_fit_into_segment() {
+
+        int maxSegmentSize = 40;
+
+        String firstSentence = "This is a short sentence.";
+        assertThat(firstSentence).hasSizeLessThan(maxSegmentSize);
+
+        String secondSentence = "This is a very long sentence that does not fit into segment.";
+        assertThat(secondSentence).hasSizeGreaterThan(maxSegmentSize);
+
+        String thirdSentence = "This is another short sentence.";
+        assertThat(thirdSentence).hasSizeLessThan(maxSegmentSize);
+
+        Document document = Document.from(
+                format(" %s %s %s ", firstSentence, secondSentence, thirdSentence),
+                metadata("document", "0")
+        );
+
+        DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(segment.text().length()).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(firstSentence, metadata("index", "0").add("document", "0")),
+                textSegment("This is a very long sentence that does", metadata("index", "1").add("document", "0")),
+                textSegment("not fit into segment.", metadata("index", "2").add("document", "0")),
+                textSegment(thirdSentence, metadata("index", "3").add("document", "0"))
+        );
+    }
+
+    @Test
+    void should_split_sample_text() {
+
+        String s1 = "In a sleepy hamlet, where the trees towered high, there lived a young boy named Elias.";
+        String s2 = "He loved exploring.";
+
+        String s3 = "Fields of gold stretched as far as the eye could see, punctuated by tiny blossoms.";
+        String s4 = "The wind whispered.";
+
+        String s5p1 = "Sometimes, it would carry fragrances from the neighboring towns, which included chocolate, " +
+                "freshly baked bread, and the salty tang of";
+
+        String s5p2 = "the sea.";
+
+        String s6 = "In the middle of the town, a single lamppost stood.";
+
+        String s7 = "Cats lounged beneath it, stretching languidly in the dappled sunlight.";
+
+        String s8 = "Elias had a dream: to build a flying machine.";
+        String s9 = "Some days, it felt impossible.";
+
+        String s10 = "Yet, every evening, he would pull out his sketches, tinkering and toiling away.";
+
+        String s11 = "There was a resilience in his spirit.";
+        String s12 = "Birds often stopped to watch.";
+        String s13 = "Curiosity is the spark of invention.";
+        String s14 = "He believed.";
+
+        String s15 = "And one day, with the town gathered around him, Elias soared.";
+        String s16 = "The horizon awaited.";
+        String s17 = "Life is full of surprises.";
+
+        String s18 = "Embrace them.";
+
+        Document document = Document.from(
+                format(
+                        "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s",
+                        s1, s2, s3, s4, s5p1, s5p2, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18
+                ),
+                metadata("document", "0")
+        );
+
+        int maxSegmentSize = 26;
+        OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
+        DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize, tokenizer);
+
+        List<TextSegment> segments = splitter.split(document);
+
+        segments.forEach(segment ->
+                assertThat(tokenizer.estimateTokenCountInText(segment.text())).isLessThanOrEqualTo(maxSegmentSize));
+        assertThat(segments).containsExactly(
+                textSegment(s1 + " " + s2, metadata("index", 0).add("document", "0")),
+                textSegment(s3 + " " + s4, metadata("index", 1).add("document", "0")),
+                textSegment(s5p1, metadata("index", 2).add("document", "0")),
+                textSegment(s5p2, metadata("index", 3).add("document", "0")),
+                textSegment(s6, metadata("index", 4).add("document", "0")),
+                textSegment(s7, metadata("index", 5).add("document", "0")),
+                textSegment(s8 + " " + s9, metadata("index", 6).add("document", "0")),
+                textSegment(s10, metadata("index", 7).add("document", "0")),
+                textSegment(s11 + " " + s12 + " " + s13 + " " + s14, metadata("index", 8).add("document", "0")),
+                textSegment(s15 + " " + s16 + " " + s17, metadata("index", 9).add("document", "0")),
+                textSegment(s18, metadata("index", 10).add("document", "0"))
+        );
+    }
+}

@@ -1,32 +0,0 @@
-package dev.langchain4j.data.document.splitter;
-
-import dev.langchain4j.data.document.Document;
-import dev.langchain4j.data.segment.TextSegment;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.ValueSource;
-
-import java.util.List;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-class ParagraphSplitterTest {
-
-    ParagraphSplitter splitter = new ParagraphSplitter();
-
-    @ParameterizedTest
-    @ValueSource(strings = {
-            "first\r\rsecond\r\rthird\r\rcr\r",
-            "first\n\nsecond\n\nthird\n\nlf\n",
-            "first\r\n\r\nsecond\r\n\r\nthird\r\n\r\ncrlf\r\n"
-    })
-    void test_split_by_paragraph(String text) {
-        Document document = Document.from(text);
-
-        List<TextSegment> segments = splitter.split(document);
-
-        assertEquals(4, segments.size());
-        assertEquals("first", segments.get(0).text());
-        assertEquals("second", segments.get(1).text());
-        assertEquals("third", segments.get(2).text());
-    }
-}