#1718: Have another constructor on HtmlTextExtractor that only takes the cssSelector

This commit is contained in:
LangChain4j 2024-09-09 10:58:17 +02:00
parent 421b4cd048
commit 5a18f1baeb
1 changed file with 13 additions and 3 deletions

View File

@ -8,11 +8,11 @@ import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import static dev.langchain4j.data.document.Document.URL;
import static java.lang.String.format;
import static java.util.stream.Collectors.joining;
@ -40,7 +40,17 @@ public class HtmlTextExtractor implements DocumentTransformer {
}
/**
* Constructs an instance of HtmlTextExtractor that extracts text from HTML elements matching the specified CSS selector.
* <p>
* Convenience overload: delegates to the three-argument constructor, passing {@code null} and {@code false}
* for the two remaining arguments.
*
* @param cssSelector A CSS selector.
* For example, "#page-content" will extract text from the HTML element with the id "page-content".
*/
public HtmlTextExtractor(String cssSelector) {
this(cssSelector, null, false);
}
/**
* Constructs an instance of HtmlToTextTransformer that extracts text from HTML elements matching the specified CSS selector.
*
* @param cssSelector A CSS selector.
* For example, "#page-content" will extract text from the HTML element with the id "page-content".