Fix #913: FileSystemDocumentLoader: ignore empty/blank documents, improved error/warn messages (#920)

## Context
When loading with `FileSystemDocumentLoader` and it encounters an empty
file, a WARN message in the logs looks like an exception and is
confusing.
See https://github.com/langchain4j/langchain4j/issues/913

## Change
- Ignore empty/blank documents when loading multiple files with
`DocumentIsBlankException`
- Changed log message to look not like an exception

## Checklist
Before submitting this PR, please check the following points:
- [X] I have added unit and integration tests for my change
- [X] All unit and integration tests in the module I have added/changed
are green
- [X] All unit and integration tests in the
[core](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-core)
and
[main](https://github.com/langchain4j/langchain4j/tree/main/langchain4j)
modules are green
- [ ] I have added/updated the
[documentation](https://github.com/langchain4j/langchain4j/tree/main/docs/docs)
- [ ] I have added an example in the [examples
repo](https://github.com/langchain4j/langchain4j-examples) (only for
"big" features)
- [ ] I have added my new module in the
[BOM](https://github.com/langchain4j/langchain4j/blob/main/langchain4j-bom/pom.xml)
(only when a new module is added)

## Checklist for adding new embedding store integration
- [ ] I have added a {NameOfIntegration}EmbeddingStoreIT that extends
from either EmbeddingStoreIT or EmbeddingStoreWithFilteringIT
This commit is contained in:
LangChain4j 2024-04-16 08:58:50 +02:00 committed by GitHub
parent 62f2a1193e
commit 03528af8a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 176 additions and 36 deletions

View File

@ -1,6 +1,7 @@
package dev.langchain4j.data.document.parser.apache.pdfbox;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
@ -8,6 +9,8 @@ import org.apache.pdfbox.text.PDFTextStripper;
import java.io.IOException;
import java.io.InputStream;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
/**
* Parses PDF file into a {@link Document} using Apache PDFBox library
*/
@ -20,6 +23,11 @@ public class ApachePdfBoxDocumentParser implements DocumentParser {
PDFTextStripper stripper = new PDFTextStripper();
String text = stripper.getText(pdfDocument);
pdfDocument.close();
if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}
return Document.from(text);
} catch (IOException e) {
throw new RuntimeException(e);

View File

@ -1,12 +1,14 @@
package dev.langchain4j.data.document.parser.apache.pdfbox;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.junit.jupiter.api.Test;
import java.io.InputStream;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
class ApachePdfBoxDocumentParserTest {
@ -21,4 +23,14 @@ class ApachePdfBoxDocumentParserTest {
assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
assertThat(document.metadata().asMap()).isEmpty();
}
@Test
void should_throw_BlankDocumentException() {
DocumentParser parser = new ApachePdfBoxDocumentParser();
InputStream inputStream = getClass().getClassLoader().getResourceAsStream("blank-file.pdf");
assertThatThrownBy(() -> parser.parse(inputStream))
.isExactlyInstanceOf(BlankDocumentException.class);
}
}

View File

@ -1,13 +1,17 @@
package dev.langchain4j.data.document.parser.apache.poi;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.poi.EmptyFileException;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;
import java.io.IOException;
import java.io.InputStream;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
/**
* Parses Microsoft Office file into a {@link Document} using Apache POI library.
* This parser supports various file formats, including doc, docx, ppt, pptx, xls, and xlsx.
@ -20,7 +24,14 @@ public class ApachePoiDocumentParser implements DocumentParser {
public Document parse(InputStream inputStream) {
try (POITextExtractor extractor = ExtractorFactory.createExtractor(inputStream)) {
String text = extractor.getText();
if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}
return Document.from(text);
} catch (EmptyFileException e) {
throw new BlankDocumentException();
} catch (IOException e) {
throw new RuntimeException(e);
}

View File

@ -1,6 +1,7 @@
package dev.langchain4j.data.document.parser.apache.poi;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
@ -8,6 +9,7 @@ import org.junit.jupiter.params.provider.ValueSource;
import java.io.InputStream;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
public class ApachePoiDocumentParserTest {
@ -45,4 +47,21 @@ public class ApachePoiDocumentParserTest {
.isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content");
assertThat(document.metadata().asMap()).isEmpty();
}
@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
"blank-file.txt",
"blank-file.docx",
"blank-file.pptx"
// "blank-file.xlsx" TODO
})
void should_throw_BlankDocumentException(String fileName) {
DocumentParser parser = new ApachePoiDocumentParser();
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);
assertThatThrownBy(() -> parser.parse(inputStream))
.isExactlyInstanceOf(BlankDocumentException.class);
}
}

View File

@ -1,7 +1,9 @@
package dev.langchain4j.data.document.parser.apache.tika;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@ -12,6 +14,7 @@ import org.xml.sax.ContentHandler;
import java.io.InputStream;
import static dev.langchain4j.internal.Utils.getOrDefault;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
/**
* Parses files into {@link Document}s using Apache Tika library, automatically detecting the file format.
@ -63,7 +66,16 @@ public class ApacheTikaDocumentParser implements DocumentParser {
try {
parser.parse(inputStream, contentHandler, metadata, parseContext);
String text = contentHandler.toString();
if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}
return Document.from(text);
} catch (BlankDocumentException e) {
throw e;
} catch (ZeroByteFileException e) {
throw new BlankDocumentException();
} catch (Exception e) {
throw new RuntimeException(e);
}

View File

@ -1,6 +1,7 @@
package dev.langchain4j.data.document.parser.apache.tika;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.parser.AutoDetectParser;
import org.junit.jupiter.params.ParameterizedTest;
@ -9,6 +10,7 @@ import org.junit.jupiter.params.provider.ValueSource;
import java.io.InputStream;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
class ApacheTikaDocumentParserTest {
@ -47,4 +49,21 @@ class ApacheTikaDocumentParserTest {
.isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content");
assertThat(document.metadata().asMap()).isEmpty();
}
@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
"blank-file.txt",
"blank-file.docx",
"blank-file.pptx"
// "blank-file.xlsx" TODO
})
void should_throw_BlankDocumentException(String fileName) {
DocumentParser parser = new ApacheTikaDocumentParser();
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);
assertThatThrownBy(() -> parser.parse(inputStream))
.isExactlyInstanceOf(BlankDocumentException.class);
}
}

View File

@ -0,0 +1,4 @@
package dev.langchain4j.data.document;
public class BlankDocumentException extends RuntimeException {
}

View File

@ -6,7 +6,9 @@ import java.io.InputStream;
* Utility class for loading documents.
*/
public class DocumentLoader {
private DocumentLoader() {}
private DocumentLoader() {
}
/**
* Loads a document from the given source using the given parser.
@ -16,12 +18,15 @@ public class DocumentLoader {
* @param source The source from which the document will be loaded.
* @param parser The parser that will be used to parse the document.
* @return The loaded document.
* @throws BlankDocumentException when the parsed {@link Document} is blank/empty.
*/
public static Document load(DocumentSource source, DocumentParser parser) {
try (InputStream inputStream = source.inputStream()) {
Document document = parser.parse(inputStream);
source.metadata().asMap().forEach((key, value) -> document.metadata().add(key, value));
return document;
} catch (BlankDocumentException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException("Failed to load document", e);
}

View File

@ -3,17 +3,18 @@ package dev.langchain4j.data.document;
import java.io.InputStream;
/**
* Defines the interface for parsing an InputStream into a Document.
* Defines the interface for parsing an {@link InputStream} into a {@link Document}.
* Different document types require specialized parsing logic.
*/
public interface DocumentParser {
/**
* Parses an InputStream into a Document.
* Parses a given {@link InputStream} into a {@link Document}.
* The specific implementation of this method will depend on the type of the document being parsed.
*
* @param inputStream The InputStream that contains the content of the document.
* @return The parsed Document.
* @param inputStream The {@link InputStream} that contains the content of the {@link Document}.
* @return The parsed {@link Document}.
* @throws BlankDocumentException when the parsed {@link Document} is blank/empty.
*/
Document parse(InputStream inputStream);
}

View File

@ -1,6 +1,7 @@
package dev.langchain4j.data.document.loader;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentLoader;
import dev.langchain4j.data.document.DocumentParser;
import dev.langchain4j.data.document.parser.TextDocumentParser;
@ -48,7 +49,7 @@ public class FileSystemDocumentLoader {
*/
public static Document loadDocument(Path filePath, DocumentParser documentParser) {
if (!isRegularFile(filePath)) {
throw illegalArgument("%s is not a file", filePath);
throw illegalArgument("'%s' is not a file", filePath);
}
return DocumentLoader.load(from(filePath), documentParser);
@ -118,7 +119,7 @@ public class FileSystemDocumentLoader {
*/
public static List<Document> loadDocuments(Path directoryPath, DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}
try (Stream<Path> pathStream = Files.list(directoryPath)) {
@ -200,7 +201,7 @@ public class FileSystemDocumentLoader {
PathMatcher pathMatcher,
DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}
try (Stream<Path> pathStream = Files.list(directoryPath)) {
@ -294,7 +295,7 @@ public class FileSystemDocumentLoader {
*/
public static List<Document> loadDocumentsRecursively(Path directoryPath, DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}
try (Stream<Path> pathStream = Files.walk(directoryPath)) {
@ -379,7 +380,7 @@ public class FileSystemDocumentLoader {
PathMatcher pathMatcher,
DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}
try (Stream<Path> pathStream = Files.walk(directoryPath)) {
@ -486,8 +487,11 @@ public class FileSystemDocumentLoader {
try {
Document document = loadDocument(file, documentParser);
documents.add(document);
} catch (BlankDocumentException ignored) {
// blank/empty documents are ignored
} catch (Exception e) {
log.warn("Failed to load document from " + file, e);
String message = e.getCause() != null ? e.getCause().getMessage() : e.getMessage();
log.warn("Failed to load '{}': {}", file, message);
}
});

View File

@ -1,12 +1,14 @@
package dev.langchain4j.data.document.parser;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
import static java.nio.charset.StandardCharsets.UTF_8;
@ -36,7 +38,13 @@ public class TextDocumentParser implements DocumentParser {
String text = new String(buffer.toByteArray(), charset);
if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}
return Document.from(text);
} catch (BlankDocumentException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException(e);
}

View File

@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.file.*;
@ -22,11 +23,11 @@ class FileSystemDocumentLoaderTest implements WithAssertions {
void load_bad_file() {
assertThatExceptionOfType(IllegalArgumentException.class)
.isThrownBy(() -> loadDocument(Paths.get("bad_file"), new TextDocumentParser()))
.withMessageContaining("bad_file is not a file");
.withMessageContaining("'bad_file' is not a file");
assertThatExceptionOfType(IllegalArgumentException.class)
.isThrownBy(() -> loadDocument(Paths.get("bad_file")))
.withMessageContaining("bad_file is not a file");
.withMessageContaining("'bad_file' is not a file");
assertThatExceptionOfType(IllegalArgumentException.class)
.isThrownBy(() -> loadDocument(Paths.get("/"), new TextDocumentParser()))
@ -57,12 +58,12 @@ class FileSystemDocumentLoaderTest implements WithAssertions {
assertThatExceptionOfType(IllegalArgumentException.class)
.isThrownBy(() -> loadDocuments(
Paths.get("bad_directory"), new TextDocumentParser()))
.withMessageContaining("bad_directory is not a directory");
.withMessageContaining("'bad_directory' is not a directory");
assertThatExceptionOfType(IllegalArgumentException.class)
.isThrownBy(() -> loadDocuments(
Paths.get("bad_directory")))
.withMessageContaining("bad_directory is not a directory");
.withMessageContaining("'bad_directory' is not a directory");
}
@Test
@ -92,24 +93,32 @@ class FileSystemDocumentLoaderTest implements WithAssertions {
assertThat(loadDocuments(resourceDirectory)).isEqualTo(documents);
assertThat(loadDocuments(resourceDirectory.toString())).isEqualTo(documents);
// Silently skips documents that fail to load.
DocumentParser failFirstParser = new DocumentParser() {
DocumentParser parserThatFailsOnFirstNonBlankDocument = new DocumentParser() {
private boolean first = true;
private final DocumentParser parser = new TextDocumentParser();
@Override
public Document parse(InputStream inputStream) {
if (first) {
if (first && isNotBlank(inputStream)) {
first = false;
throw new RuntimeException("fail first");
}
return parser.parse(inputStream);
}
private boolean isNotBlank(InputStream inputStream) {
try {
return inputStream.available() > 10; // rough approximation
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
// when-then
assertThat(loadDocuments(resourceDirectory, failFirstParser))
.hasSize(documents.size() - 1);
assertThat(loadDocuments(resourceDirectory, parserThatFailsOnFirstNonBlankDocument))
.hasSize(documents.size() - 1); // -1 because first document fails
}
@ParameterizedTest

View File

@ -1,14 +1,17 @@
package dev.langchain4j.data.document.parser;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
import java.io.InputStream;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatExceptionOfType;
import static org.assertj.core.api.Assertions.*;
class TextDocumentParserTest {
@ -34,6 +37,20 @@ class TextDocumentParserTest {
assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
}
@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
"blank-file.txt"
})
void should_throw_BlankDocumentException(String fileName) {
DocumentParser parser = new TextDocumentParser();
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);
assertThatThrownBy(() -> parser.parse(inputStream))
.isExactlyInstanceOf(BlankDocumentException.class);
}
@Test
void should_wrap_input_stream_errors() {
InputStream badStream = new InputStream() {
@ -46,8 +63,8 @@ class TextDocumentParserTest {
TextDocumentParser parser = new TextDocumentParser();
assertThatExceptionOfType(RuntimeException.class)
.isThrownBy(() -> parser.parse(badStream))
.withCauseInstanceOf(IOException.class)
.withMessageContaining("test exception");
.isThrownBy(() -> parser.parse(badStream))
.withCauseInstanceOf(IOException.class)
.withMessageContaining("test exception");
}
}

View File

@ -375,30 +375,41 @@ public class AiServicesIT {
}
interface BadChef {
public static final String CHEFS_PROMPT_DOES_NOT_EXIST_TXT = "chefs-prompt-does-not-exist.txt";
public static final String CHEFS_PROMPT_IS_EMPTY_TXT = "chefs-prompt-is-empty.txt";
String CHEFS_PROMPT_DOES_NOT_EXIST_TXT = "chefs-prompt-does-not-exist.txt";
@UserMessage(fromResource = CHEFS_PROMPT_DOES_NOT_EXIST_TXT)
Recipe createRecipeFromNonExistingResource(String... ingredients);
@UserMessage(fromResource = "chefs-prompt-does-not-exist.txt")
Recipe createRecipeWithNonExistingResource(String... ingredients);
@UserMessage(fromResource = CHEFS_PROMPT_IS_EMPTY_TXT)
Recipe createRecipeFromEmptyResource(String... ingredients);
@UserMessage(fromResource = "chefs-prompt-is-empty.txt")
Recipe createRecipeWithEmptyResource(String... ingredients);
@UserMessage(fromResource = "chefs-prompt-is-blank.txt")
Recipe createRecipeWithBlankResource(String... ingredients);
}
@Test
void test_call_model_with_missing_resource() {
void should_fail_when_user_message_resource_is_not_found() {
BadChef badChef = AiServices.create(BadChef.class, chatLanguageModel);
assertThatThrownBy(() -> badChef.createRecipeFromNonExistingResource("cucumber", "tomato", "feta", "onion", "olives"))
assertThatThrownBy(() -> badChef.createRecipeWithNonExistingResource("cucumber", "tomato", "feta", "onion", "olives"))
.isInstanceOf(IllegalConfigurationException.class)
.hasMessage("@UserMessage's resource '" + BadChef.CHEFS_PROMPT_DOES_NOT_EXIST_TXT + "' not found");
}
@Test
void test_call_model_with_empty_resource() {
void should_fail_when_user_message_resource_is_empty() {
BadChef badChef = AiServices.create(BadChef.class, chatLanguageModel);
assertThatThrownBy(() -> badChef.createRecipeFromEmptyResource("cucumber", "tomato", "feta", "onion", "olives"))
assertThatThrownBy(() -> badChef.createRecipeWithEmptyResource("cucumber", "tomato", "feta", "onion", "olives"))
.isInstanceOf(IllegalConfigurationException.class)
.hasMessage("@UserMessage's template cannot be empty");
}
@Test
void should_fail_when_user_message_resource_is_blank() {
BadChef badChef = AiServices.create(BadChef.class, chatLanguageModel);
assertThatThrownBy(() -> badChef.createRecipeWithBlankResource("cucumber", "tomato", "feta", "onion", "olives"))
.isInstanceOf(IllegalConfigurationException.class)
.hasMessage("@UserMessage's template cannot be empty");
}