From da8e095bd3229d2c0971d1eb44ddd13d69099176 Mon Sep 17 00:00:00 2001 From: xulei2020 <“xulei83@huawei.com”> Date: Thu, 30 Apr 2020 11:02:47 +0800 Subject: [PATCH] add jieba c++ code --- cmake/external_libs/cppjieba.cmake | 4 ++-- .../dataset/transforms/text/c_transforms.py | 17 ++++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cmake/external_libs/cppjieba.cmake b/cmake/external_libs/cppjieba.cmake index 80f34f6ade9..7de0e8b4972 100644 --- a/cmake/external_libs/cppjieba.cmake +++ b/cmake/external_libs/cppjieba.cmake @@ -3,8 +3,8 @@ set(cppjieba_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") mindspore_add_pkg(cppjieba VER 5.0.3 HEAD_ONLY ./ - URL https://codeload.github.com/yanyiwu/cppjieba/zip/v5.0.3 - MD5 0dfef44bd32328c221f128b401e1a45c + URL https://codeload.github.com/yanyiwu/cppjieba/tar.gz/v5.0.3 + MD5 b8b3f7a73032c9ce9daafa4f67196c8c PATCHES ${CMAKE_SOURCE_DIR}/third_party/patch/cppjieba/cppjieba.patch001) include_directories(${cppjieba_INC}include) include_directories(${cppjieba_INC}deps) diff --git a/mindspore/dataset/transforms/text/c_transforms.py b/mindspore/dataset/transforms/text/c_transforms.py index f17def79bbc..24795edb146 100644 --- a/mindspore/dataset/transforms/text/c_transforms.py +++ b/mindspore/dataset/transforms/text/c_transforms.py @@ -33,9 +33,13 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): Tokenize Chinese string into words based on dictionary. Args: - mode (Enum): [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, "HMM" mode will - tokenize with Hiddel Markov Model Segment algorithm, "MIX" model will tokenize with a mix of MPSegment and - HMMSegment algorithm. + hmm_path (str): the dictionary file is used by HMMSegment algorithm, + the dictionary can be obtained on the official website of cppjieba. + mp_path(str): the dictionary file is used by MPSegment algorithm, + the dictionary can be obtained on the official website of cppjieba. 
+ mode (Enum): [Default "MIX"], "MP" mode will tokenize with MPSegment algorithm, + "HMM" mode will tokenize with Hidden Markov Model Segment algorithm, + "MIX" mode will tokenize with a mix of MPSegment and HMMSegment algorithm. """ @check_jieba_init def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX): @@ -52,9 +56,8 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): Args: word(required, string): The word to be added to the JiebaTokenizer instance. The added word will not be written into the built-in dictionary on disk. - freq(optional, int): The frequency of the word to be added, - The higher the frequency, the better change the word will be tokenized(default None, - use default frequency) + freq(optional, int): The frequency of the word to be added. The higher the frequency, + the better chance the word will be tokenized(default None, use default frequency). """ if freq is None: super().add_word(word, 0) @@ -67,7 +70,7 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): Add user defined word to JiebaTokenizer's dictionary Args: user_dict(path/dict):Dictionary to be added, file path or Python dictionary, - Python Dict format is {word1:freq1, word2:freq2,...} + Python Dict format: {word1:freq1, word2:freq2,...} Jieba dictionary format : word(required), freq(optional), such as: word1 freq1 word2