From 203e864425a25d4fcadee581cf9f927b21c1497c Mon Sep 17 00:00:00 2001 From: jonyguo Date: Tue, 1 Sep 2020 09:58:38 +0800 Subject: [PATCH] fix: model zoo aclIMDB preprocess format --- .../aclImdb_preprocess/README.md | 94 +++++++++++-------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/model_zoo/utils/nlp_to_mindrecord/aclImdb_preprocess/README.md b/model_zoo/utils/nlp_to_mindrecord/aclImdb_preprocess/README.md index 5b5eb949aca..66bcf24756f 100644 --- a/model_zoo/utils/nlp_to_mindrecord/aclImdb_preprocess/README.md +++ b/model_zoo/utils/nlp_to_mindrecord/aclImdb_preprocess/README.md @@ -84,44 +84,62 @@ Download aclImdb dataset, transfer it to mindrecord, use MindDataset to read min 2. Output like this: ``` - example 24992: {'input_ids': array([ -1, -1, 65, 0, 89, 0, 367, 0, -1, - -1, -1, -1, 488, 0, 0, 0, 206, 0, - 816, 0, -1, -1, 16, 0, -1, -1, 11998, - 0, 0, 0, 852, 0, 1, 0, 111, 0, - -1, -1, -1, -1, 765, 0, 9, 0, 17, - 0, 35, 0, 72, 0, -1, -1, -1, -1, - 40, 0, 895, 0, 41, 0, 0, 0, 6952, - 0, 170, 0, -1, -1, -1, -1, 3, 0, - 28, 0, -1, -1, 0, 0, 111, 0, 58, - 0, 110, 0, 569, 0, -1, -1, -1, -1, - -1, -1, 0, 0, 24512, 0, 3, 0, 0, - 0], dtype=int32), 'id': array(8045, dtype=int32), 'input_mask': array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32), 'segment_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), 'score': array(1, dtype=int32), 'label': array(1, dtype=int32)} - example 24993: {'input_ids': array([ -1, -1, 11, 0, 7400, 0, 189, 0, 4, 0, 1247, - 0, 9, 0, 17, 0, 29, 0, 0, 0, -1, -1, - -1, -1, -1, -1, 1, 0, -1, -1, 218, 0, 131, - 0, 10, 0, -1, -1, 52, 0, 72, 0, 488, 0, - 6, 0, -1, -1, -1, -1, -1, -1, 1749, 0, 0, - 0, -1, -1, 42, 0, 21, 0, 65, 0, 6895, 0, - -1, -1, -1, -1, -1, -1, 11, 0, 52, 0, 72, - 0, 1498, 0, 10, 0, 21, 0, 65, 0, 19, 0, - -1, -1, -1, -1, 36, 0, 130, 0, 88, 0, 210, - 0], dtype=int32), 'id': array(9903, dtype=int32), 'input_mask': array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32), 'segment_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), 'score': array(7, dtype=int32), 'label': array(0, dtype=int32)} + example 24992: { + 'input_ids': array( + [ -1, -1, 65, 0, 89, 0, 367, 0, -1, + -1, -1, -1, 488, 0, 0, 0, 206, 0, + 816, 0, -1, -1, 16, 0, -1, -1, 11998, + 0, 0, 0, 852, 0, 1, 0, 111, 0, + -1, -1, -1, -1, 765, 0, 9, 0, 17, + 0, 35, 0, 72, 0, -1, -1, -1, -1, + 40, 0, 895, 0, 41, 0, 0, 0, 6952, + 0, 170, 0, -1, -1, -1, -1, 3, 0, + 28, 0, -1, -1, 0, 0, 111, 0, 58, + 0, 110, 0, 569, 0, -1, -1, -1, -1, + -1, -1, 0, 0, 24512, 0, 3, 0, 0, + 0], dtype=int32), + 'id': array(8045, dtype=int32), + 'input_mask': array( + [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32), + 'segment_ids': array( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), + 'score': array(1, dtype=int32), + 'label': array(1, dtype=int32)} + example 24993: { + 'input_ids': array( + [ -1, -1, 11, 0, 7400, 0, 189, 0, 4, 0, 1247, + 0, 9, 0, 17, 0, 29, 0, 0, 0, -1, -1, + -1, -1, -1, -1, 1, 0, -1, -1, 218, 0, 131, + 0, 10, 0, -1, -1, 52, 0, 72, 0, 488, 0, + 6, 0, -1, -1, -1, -1, -1, -1, 1749, 0, 0, + 0, -1, -1, 42, 0, 21, 0, 65, 0, 6895, 0, + -1, -1, -1, -1, -1, -1, 11, 0, 52, 0, 72, + 0, 1498, 0, 10, 0, 21, 0, 65, 0, 19, 0, + -1, -1, -1, -1, 36, 0, 130, 0, 88, 0, 210, + 0], dtype=int32), + 'id': array(9903, dtype=int32), + 'input_mask': array( + [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32), + 'segment_ids': array( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), + 'score': array(7, dtype=int32), + 'label': array(0, dtype=int32)} ``` - id : the id "3219" is from review docs like **3219**_10.txt. - label : indicates whether the review is positive or negative, positive: 0, negative: 1.