OpenAiTokenizer: add no-arg constructor that defaults to GPT_3_5_TURBO

This commit is contained in:
LangChain4j 2024-01-31 10:11:33 +01:00
parent a51f455e33
commit 8ceb6ba3f7
10 changed files with 30 additions and 23 deletions

View File

@@ -59,7 +59,7 @@ class SampleDocumentLoaderAndRagWithAstraTest {
Path path = new File(getClass().getResource("/story-about-happy-carrot.txt").getFile()).toPath();
Document document = FileSystemDocumentLoader.loadDocument(path, new TextDocumentParser());
DocumentSplitter splitter = DocumentSplitters
.recursive(100, 10, new OpenAiTokenizer(GPT_3_5_TURBO));
.recursive(100, 10, new OpenAiTokenizer());
// Embedding model (OpenAI)
EmbeddingModel embeddingModel = OpenAiEmbeddingModel.builder()

View File

@@ -16,7 +16,6 @@ import java.util.UUID;
import static com.dtsx.astra.sdk.utils.TestUtils.*;
import static dev.langchain4j.data.message.AiMessage.aiMessage;
import static dev.langchain4j.data.message.UserMessage.userMessage;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -53,7 +52,7 @@ public class ChatMemoryStoreAstraTest {
ChatMemory chatMemory = TokenWindowChatMemory.builder()
.chatMemoryStore(chatMemoryStore)
.id(chatSessionId)
.maxTokens(300, new OpenAiTokenizer(GPT_3_5_TURBO))
.maxTokens(300, new OpenAiTokenizer())
.build();
// When

View File

@@ -30,6 +30,18 @@ public class OpenAiTokenizer implements Tokenizer {
private final String modelName;
private final Optional<Encoding> encoding;
// No-arg convenience constructor: defaults this tokenizer to the
// GPT_3_5_TURBO model (presumably OpenAiChatModelName.GPT_3_5_TURBO,
// per the callers migrated in this commit — confirm the static import).
public OpenAiTokenizer() {
this(GPT_3_5_TURBO);
}
// Creates a tokenizer for the given OpenAI chat model; delegates to the
// String-based constructor via the enum's string form.
public OpenAiTokenizer(OpenAiChatModelName modelName) {
this(modelName.toString());
}
// Creates a tokenizer for the given OpenAI language (completion) model;
// delegates to the String-based constructor via the enum's string form.
public OpenAiTokenizer(OpenAiLanguageModelName modelName) {
this(modelName.toString());
}
public OpenAiTokenizer(String modelName) {
this.modelName = ensureNotBlank(modelName, "modelName");
// If the model is unknown, we should NOT fail fast during the creation of OpenAiTokenizer.

View File

@@ -25,6 +25,7 @@ import static dev.langchain4j.data.message.AiMessage.aiMessage;
import static dev.langchain4j.data.message.SystemMessage.systemMessage;
import static dev.langchain4j.data.message.ToolExecutionResultMessage.toolExecutionResultMessage;
import static dev.langchain4j.data.message.UserMessage.userMessage;
import static dev.langchain4j.model.openai.OpenAiChatModelName.GPT_3_5_TURBO;
import static java.util.Arrays.asList;
import static java.util.Arrays.stream;
import static java.util.Collections.singletonList;
@@ -1088,7 +1089,7 @@ class OpenAiTokenizerIT {
// TODO remove once they fix it
e.printStackTrace();
// there is some pattern to it, so we are going to check if this is really the case or our calculation is wrong
Tokenizer tokenizer2 = new OpenAiTokenizer(GPT_3_5_TURBO.toString());
Tokenizer tokenizer2 = new OpenAiTokenizer(GPT_3_5_TURBO);
int tokenCount2 = tokenizer2.estimateTokenCountInToolExecutionRequests(toolExecutionRequests);
assertThat(tokenCount2).isEqualTo(expectedTokenCount - 3);
} else {

View File

@@ -9,13 +9,12 @@ import org.junit.jupiter.params.provider.EnumSource;
import java.util.ArrayList;
import java.util.List;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static dev.langchain4j.model.openai.OpenAiTokenizer.countArguments;
import static org.assertj.core.api.Assertions.assertThat;
class OpenAiTokenizerTest {
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
OpenAiTokenizer tokenizer = new OpenAiTokenizer();
@Test
void should_encode_and_decode_text() {

View File

@@ -12,7 +12,6 @@ import java.util.List;
import static dev.langchain4j.data.document.Metadata.metadata;
import static dev.langchain4j.data.segment.TextSegment.textSegment;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;
@@ -123,7 +122,7 @@ class DocumentByParagraphSplitterTest {
void should_split_sample_text_containing_multiple_paragraphs() {
int maxSegmentSize = 65;
Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
Tokenizer tokenizer = new OpenAiTokenizer();
String p1 = "In a small town nestled between two vast mountains, there was a shop unlike any other. " +
"A unique haven. " +
@@ -200,7 +199,7 @@
int maxSegmentSize = 65;
int maxOverlapSize = 15;
Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
Tokenizer tokenizer = new OpenAiTokenizer();
String s1 = "In a small town nestled between two vast mountains, there was a shop unlike any other.";
String s2 = "A unique haven.";
@@ -269,7 +268,7 @@
void should_split_sample_text_without_paragraphs() {
int maxSegmentSize = 100;
Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
Tokenizer tokenizer = new OpenAiTokenizer();
String segment1 = "In a small town nestled between two vast mountains, there was a shop unlike any other. " +
"A unique haven. " +
@@ -332,7 +331,7 @@
// given
int maxSegmentSize = 100;
int maxOverlapSize = 25;
Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
Tokenizer tokenizer = new OpenAiTokenizer();
DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize, maxOverlapSize, tokenizer);
Document document = Document.from(sentences(0, 28), Metadata.from("document", "0"));
@@ -364,7 +363,7 @@ class DocumentByParagraphSplitterTest {
// given
int maxSegmentSize = 100;
int maxOverlapSize = 80;
Tokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
Tokenizer tokenizer = new OpenAiTokenizer();
DocumentSplitter splitter = new DocumentByParagraphSplitter(maxSegmentSize, maxOverlapSize, tokenizer);
Document document = Document.from(sentences(0, 28), Metadata.from("document", "0"));

View File

@@ -10,7 +10,6 @@ import java.util.List;
import static dev.langchain4j.data.document.Metadata.metadata;
import static dev.langchain4j.data.segment.TextSegment.textSegment;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static java.lang.String.format;
import static org.assertj.core.api.Assertions.assertThat;
@@ -152,7 +151,7 @@ class DocumentBySentenceSplitterTest {
);
int maxSegmentSize = 26;
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
OpenAiTokenizer tokenizer = new OpenAiTokenizer();
DocumentSplitter splitter = new DocumentBySentenceSplitter(maxSegmentSize, 0, tokenizer);
List<TextSegment> segments = splitter.split(document);

View File

@@ -15,14 +15,13 @@ import java.util.List;
import static dev.langchain4j.data.message.AiMessage.aiMessage;
import static dev.langchain4j.data.message.SystemMessage.systemMessage;
import static dev.langchain4j.data.message.UserMessage.userMessage;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static org.assertj.core.api.Assertions.assertThat;
public class TestUtils {
private static final int EXTRA_TOKENS_PER_EACH_MESSAGE =
3 /* extra tokens for each message */ + 1 /* extra token for 'role' */;
private static final OpenAiTokenizer TOKENIZER = new OpenAiTokenizer(GPT_3_5_TURBO);
private static final OpenAiTokenizer TOKENIZER = new OpenAiTokenizer();
@ParameterizedTest
@ValueSource(ints = {5, 10, 25, 50, 100, 250, 500, 1000})

View File

@@ -9,7 +9,6 @@ import org.junit.jupiter.api.Test;
import static dev.langchain4j.data.message.SystemMessage.systemMessage;
import static dev.langchain4j.internal.TestUtils.*;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static org.assertj.core.api.Assertions.assertThat;
class TokenWindowChatMemoryTest {
@@ -17,7 +16,7 @@ class TokenWindowChatMemoryTest {
@Test
void should_keep_specified_number_of_tokens_in_chat_memory() {
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
OpenAiTokenizer tokenizer = new OpenAiTokenizer();
ChatMemory chatMemory = TokenWindowChatMemory.withMaxTokens(33, tokenizer);
UserMessage firstUserMessage = userMessageWithTokens(10);
@@ -78,7 +77,7 @@ class TokenWindowChatMemoryTest {
@Test
void should_not_remove_system_message_from_chat_memory() {
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
OpenAiTokenizer tokenizer = new OpenAiTokenizer();
ChatMemory chatMemory = TokenWindowChatMemory.withMaxTokens(33, tokenizer);
SystemMessage systemMessage = systemMessageWithTokens(10);
@@ -118,7 +117,7 @@
@Test
void should_keep_only_the_latest_system_message_in_chat_memory() {
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
OpenAiTokenizer tokenizer = new OpenAiTokenizer();
ChatMemory chatMemory = TokenWindowChatMemory.withMaxTokens(40, tokenizer);
SystemMessage firstSystemMessage = systemMessage("You are a helpful assistant");
@@ -149,7 +148,7 @@
@Test
void should_not_add_the_same_system_message_to_chat_memory_if_it_is_already_there() {
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
OpenAiTokenizer tokenizer = new OpenAiTokenizer();
ChatMemory chatMemory = TokenWindowChatMemory.withMaxTokens(33, tokenizer);
SystemMessage systemMessage = systemMessageWithTokens(10);

View File

@@ -8,6 +8,7 @@ import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.data.message.UserMessage;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.memory.chat.MessageWindowChatMemory;
import dev.langchain4j.model.Tokenizer;
import dev.langchain4j.model.chat.ChatLanguageModel;
import dev.langchain4j.model.embedding.AllMiniLmL6V2QuantizedEmbeddingModel;
import dev.langchain4j.model.embedding.EmbeddingModel;
@@ -44,7 +45,6 @@ import java.util.Map;
import java.util.stream.Stream;
import static dev.langchain4j.data.document.loader.FileSystemDocumentLoader.loadDocument;
import static dev.langchain4j.model.openai.OpenAiModelName.GPT_3_5_TURBO;
import static java.util.Arrays.asList;
import static java.util.Collections.emptyList;
import static org.assertj.core.api.Assertions.assertThat;
@@ -329,7 +329,7 @@ class AiServicesWithRagIT {
}
private void ingest(EmbeddingStore<TextSegment> embeddingStore, EmbeddingModel embeddingModel) {
OpenAiTokenizer tokenizer = new OpenAiTokenizer(GPT_3_5_TURBO);
Tokenizer tokenizer = new OpenAiTokenizer();
DocumentSplitter splitter = DocumentSplitters.recursive(100, 0, tokenizer);
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.documentSplitter(splitter)