# Source file: mindspore/model_zoo/mass/apply_bpe_encoding.py
# (85 lines, 3.7 KiB, Python)

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Apply bpe script."""
import os
import argparse
from multiprocessing import Pool, cpu_count
from src.utils import Dictionary
from src.utils import bpe_encode
def _build_parser():
    """Create the command-line parser for the BPE application script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--codes``, ``--src_folder``,
        ``--output_folder``, ``--prefix``, ``--vocab_path``, ``--threshold``
        and ``--processes``.
    """
    cli = argparse.ArgumentParser(description='Apply BPE.')

    # Mandatory string options: (flag, help text), registered in this order.
    mandatory_paths = (
        ("--codes", "bpe codes path."),
        ("--src_folder", "raw corpus folder."),
        ("--output_folder", "encoded corpus output path."),
    )
    for flag, description in mandatory_paths:
        cli.add_argument(flag, type=str, default="", required=True,
                         help=description)

    # Optional file-name filter; an empty prefix keeps every file.
    cli.add_argument("--prefix", type=str, default="", required=False,
                     help="Prefix of text file.")
    cli.add_argument("--vocab_path", type=str, default="", required=True,
                     help="Generated vocabulary output path.")

    # Optional numeric tuning knobs.
    cli.add_argument("--threshold", type=int, default=None, required=False,
                     help="Filter out words that frequency is lower than threshold.")
    cli.add_argument("--processes", type=int, default=2, required=False,
                     help="Number of processes to use.")
    return cli


parser = _build_parser()
def collect_encode_tasks(codes, source_folder, output_folder, prefix=""):
    """Pair every matching ``.txt`` file of a folder with its output paths.

    Args:
        codes (str): Path of the BPE codes file, forwarded unchanged in each task.
        source_folder (str): Folder whose direct entries are scanned.
        output_folder (str): Folder in which the derived output paths are placed.
        prefix (str): When non-empty, only entries whose name starts with it
            are kept.

    Returns:
        tuple: ``(dict_paths, tasks)``. ``dict_paths`` lists the ``.dict`` path
        derived for every selected file; ``tasks`` holds the matching
        ``(codes, source_path, output_path, dict_path)`` argument tuples, in
        the same order.
    """
    suffix = ".txt"
    dict_paths = []
    tasks = []
    # os.listdir returns entries in arbitrary order; sort so that the task and
    # dictionary order is reproducible from run to run.
    for file_name in sorted(os.listdir(source_folder)):
        if prefix and not file_name.startswith(prefix):
            continue
        if not file_name.endswith(suffix):
            continue
        # Strip only the trailing extension: str.replace would also rewrite a
        # ".txt" that appears earlier in the name (e.g. "a.txt.old.txt").
        stem = file_name[:-len(suffix)]
        output_path = os.path.join(output_folder, stem + "_bpe.txt")
        dict_path = os.path.join(output_folder, stem + ".dict")
        dict_paths.append(dict_path)
        tasks.append((codes, os.path.join(source_folder, file_name),
                      output_path, dict_path))
    return dict_paths, tasks


if __name__ == '__main__':
    args, _ = parser.parse_known_args()

    # Fail fast on empty values (argparse accepts `--codes ""`), including the
    # vocabulary path, so nothing is encoded before an unusable argument is
    # noticed.
    if not (args.codes and args.src_folder and args.output_folder and args.vocab_path):
        raise ValueError("Please enter required params.")

    source_folder = args.src_folder
    output_folder = args.output_folder
    codes = args.codes

    if not os.path.exists(codes):
        raise FileNotFoundError("`--codes` is not existed.")
    if not os.path.isdir(source_folder):
        raise ValueError("`--src_folder` must be a dir and existed.")
    if not os.path.isdir(output_folder):
        raise ValueError("`--output_folder` must be a dir and existed.")
    if not isinstance(args.prefix, str) or len(args.prefix) > 128:
        raise ValueError("`--prefix` must be a str and len <= 128.")
    if not isinstance(args.processes, int):
        raise TypeError("`--processes` must be an integer.")

    available_dict, args_groups = collect_encode_tasks(
        codes, source_folder, output_folder, args.prefix)

    # Use at least one worker and never more than the machine has CPUs.
    kernel_size = min(max(1, args.processes), cpu_count())

    # The context manager releases the workers even if scheduling fails.
    with Pool(kernel_size) as pool:
        pending = [pool.apply_async(bpe_encode, args=arg) for arg in args_groups]
        # AsyncResult.get() blocks until the task is done and re-raises any
        # exception raised in the worker; without it a failed encoding would
        # go unnoticed and the vocabulary would be built from partial output.
        for result in pending:
            result.get()

    # Build the vocabulary from the per-file dictionary paths, optionally drop
    # low-frequency entries, then save it.
    vocab = Dictionary.load_from_text(available_dict)
    if args.threshold is not None:
        vocab = vocab.shrink(args.threshold)
    vocab.persistence(args.vocab_path)
    print(f" | Vocabulary Size: {len(vocab)}")