From 5a18f1baeb130661da6732927fbe16d96c479b9d Mon Sep 17 00:00:00 2001 From: LangChain4j Date: Mon, 9 Sep 2024 10:58:17 +0200 Subject: [PATCH] #1718: Have another constructor on HtmlTextExtractor that only takes the cssSelector --- .../document/transformer/HtmlTextExtractor.java | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/langchain4j/src/main/java/dev/langchain4j/data/document/transformer/HtmlTextExtractor.java b/langchain4j/src/main/java/dev/langchain4j/data/document/transformer/HtmlTextExtractor.java index 206b14757..4d70f3492 100644 --- a/langchain4j/src/main/java/dev/langchain4j/data/document/transformer/HtmlTextExtractor.java +++ b/langchain4j/src/main/java/dev/langchain4j/data/document/transformer/HtmlTextExtractor.java @@ -8,11 +8,11 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeVisitor; - -import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Map; + import static dev.langchain4j.data.document.Document.URL; import static java.lang.String.format; import static java.util.stream.Collectors.joining; @@ -40,7 +40,17 @@ public class HtmlTextExtractor implements DocumentTransformer { } /** - * Constructs an instance of HtmlToTextTransformer that extracts text from HTML elements matching the provided CSS selector. + * Constructs an instance of HtmlTextExtractor that extracts text from HTML elements matching the specified CSS selector. + * + * @param cssSelector A CSS selector. + * For example, "#page-content" will extract text from the HTML element with the id "page-content". + */ + public HtmlTextExtractor(String cssSelector) { + this(cssSelector, null, false); + } + + /** + * Constructs an instance of HtmlTextExtractor that extracts text from HTML elements matching the specified CSS selector. * * @param cssSelector A CSS selector. 
* For example, "#page-content" will extract text from the HTML element with the id "page-content".