add jieba c++ code

xulei2020 2020-04-30 11:02:47 +08:00
parent 93e7c97a96
commit da8e095bd3
2 changed files with 12 additions and 9 deletions


@@ -3,8 +3,8 @@ set(cppjieba_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
 mindspore_add_pkg(cppjieba
     VER 5.0.3
     HEAD_ONLY ./
-    URL https://codeload.github.com/yanyiwu/cppjieba/zip/v5.0.3
-    MD5 0dfef44bd32328c221f128b401e1a45c
+    URL https://codeload.github.com/yanyiwu/cppjieba/tar.gz/v5.0.3
+    MD5 b8b3f7a73032c9ce9daafa4f67196c8c
     PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/cppjieba/cppjieba.patch001)
 include_directories(${cppjieba_INC}include)
 include_directories(${cppjieba_INC}deps)
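
For context on the URL/MD5 switch above, a minimal sketch (not MindSpore build code) of what that checksum guards: verifying a manually downloaded v5.0.3 archive against the value recorded in the cmake snippet. The local file name is an assumption.

    # Sketch only: checks a downloaded cppjieba archive against the MD5 recorded above.
    import hashlib

    EXPECTED_MD5 = "b8b3f7a73032c9ce9daafa4f67196c8c"  # value from the cmake change above

    def file_md5(path):
        h = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 16), b""):
                h.update(chunk)
        return h.hexdigest()

    # The local file name is an assumption; use whatever name the archive was saved under.
    assert file_md5("cppjieba-5.0.3.tar.gz") == EXPECTED_MD5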


@@ -33,9 +33,13 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
     Tokenize Chinese string into words based on dictionary.
 
     Args:
-        mode (Enum): [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, "HMM" mode will
-            tokenize with Hiddel Markov Model Segment algorithm, "MIX" model will tokenize with a mix of MPSegment and
-            HMMSegment algorithm.
+        hmm_path (str): the dictionary file used by the HMMSegment algorithm;
+            the dictionary can be obtained from the official cppjieba website.
+        mp_path (str): the dictionary file used by the MPSegment algorithm;
+            the dictionary can be obtained from the official cppjieba website.
+        mode (Enum): [Default "MIX"], "MP" mode will tokenize with the MPSegment algorithm,
+            "HMM" mode will tokenize with the Hidden Markov Model Segment algorithm,
+            "MIX" mode will tokenize with a mix of the MPSegment and HMMSegment algorithms.
     """
     @check_jieba_init
     def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX):
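
A hedged usage sketch of the constructor documented above: the import path and the dictionary file locations are assumptions (the module path depends on the release, and the dictionary files come from the cppjieba project), while the hmm_path/mp_path/mode signature is taken from the docstring.

    # Sketch only: module path and dictionary locations are assumptions.
    from mindspore.dataset.text import JiebaTokenizer, JiebaMode

    HMM_PATH = "/path/to/hmm_model.utf8"   # HMMSegment dictionary from the cppjieba project
    MP_PATH = "/path/to/jieba.dict.utf8"   # MPSegment dictionary from the cppjieba project

    # "MIX" (the default) combines MPSegment and HMMSegment.
    tokenizer = JiebaTokenizer(HMM_PATH, MP_PATH, mode=JiebaMode.MIX)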
@@ -52,9 +56,8 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
         Args:
             word(required, string): The word to be added to the JiebaTokenizer instance.
                 The added word will not be written into the built-in dictionary on disk.
-            freq(optional, int): The frequency of the word to be added,
-                The higher the frequency, the better change the word will be tokenized(default None,
-                use default frequency)
+            freq(optional, int): The frequency of the word to be added. The higher the frequency,
+                the better chance the word will be tokenized (default None, use default frequency).
         """
         if freq is None:
             super().add_word(word, 0)
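
Continuing the hypothetical tokenizer from the previous sketch, add_word as documented above could be used like this; the words and the frequency value are arbitrary examples.

    # Sketch only, continuing the `tokenizer` object from the sketch above.
    tokenizer.add_word("MindSpore")          # freq omitted -> default frequency (0 is passed down)
    tokenizer.add_word("深度学习", freq=10)  # higher freq -> better chance the word stays whole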
@@ -67,7 +70,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
         Add user defined word to JiebaTokenizer's dictionary
         Args:
             user_dict(path/dict):Dictionary to be added, file path or Python dictionary,
-                Python Dict format is {word1:freq1, word2:freq2,...}
+                Python Dict format: {word1:freq1, word2:freq2,...}
                 Jieba dictionary format : word(required), freq(optional), such as:
                     word1 freq1
                     word2
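
Finally, a sketch of adding a user dictionary in both accepted forms. The method name add_dict is inferred from the user_dict docstring above, and the file path is hypothetical.

    # Sketch only: method name inferred from the docstring, file path hypothetical.
    tokenizer.add_dict({"语义分割": 10, "目标检测": 5})  # Python dict form: {word: freq, ...}
    tokenizer.add_dict("/path/to/user_dict.txt")         # file form: "word [freq]" per line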