fix: model zoo aclIMDB preprocess format

This commit is contained in:
jonyguo 2020-09-01 09:58:38 +08:00
parent d5e02cf474
commit 203e864425
1 changed files with 56 additions and 38 deletions

View File

@ -84,44 +84,62 @@ Download aclImdb dataset, transfer it to mindrecord, use MindDataset to read min
2. The output looks like this:
```
example 24992: {'input_ids': array([ -1, -1, 65, 0, 89, 0, 367, 0, -1,
-1, -1, -1, 488, 0, 0, 0, 206, 0,
816, 0, -1, -1, 16, 0, -1, -1, 11998,
0, 0, 0, 852, 0, 1, 0, 111, 0,
-1, -1, -1, -1, 765, 0, 9, 0, 17,
0, 35, 0, 72, 0, -1, -1, -1, -1,
40, 0, 895, 0, 41, 0, 0, 0, 6952,
0, 170, 0, -1, -1, -1, -1, 3, 0,
28, 0, -1, -1, 0, 0, 111, 0, 58,
0, 110, 0, 569, 0, -1, -1, -1, -1,
-1, -1, 0, 0, 24512, 0, 3, 0, 0,
0], dtype=int32), 'id': array(8045, dtype=int32), 'input_mask': array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32), 'segment_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), 'score': array(1, dtype=int32), 'label': array(1, dtype=int32)}
example 24993: {'input_ids': array([ -1, -1, 11, 0, 7400, 0, 189, 0, 4, 0, 1247,
0, 9, 0, 17, 0, 29, 0, 0, 0, -1, -1,
-1, -1, -1, -1, 1, 0, -1, -1, 218, 0, 131,
0, 10, 0, -1, -1, 52, 0, 72, 0, 488, 0,
6, 0, -1, -1, -1, -1, -1, -1, 1749, 0, 0,
0, -1, -1, 42, 0, 21, 0, 65, 0, 6895, 0,
-1, -1, -1, -1, -1, -1, 11, 0, 52, 0, 72,
0, 1498, 0, 10, 0, 21, 0, 65, 0, 19, 0,
-1, -1, -1, -1, 36, 0, 130, 0, 88, 0, 210,
0], dtype=int32), 'id': array(9903, dtype=int32), 'input_mask': array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32), 'segment_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), 'score': array(7, dtype=int32), 'label': array(0, dtype=int32)}
example 24992: {
'input_ids': array(
[ -1, -1, 65, 0, 89, 0, 367, 0, -1,
-1, -1, -1, 488, 0, 0, 0, 206, 0,
816, 0, -1, -1, 16, 0, -1, -1, 11998,
0, 0, 0, 852, 0, 1, 0, 111, 0,
-1, -1, -1, -1, 765, 0, 9, 0, 17,
0, 35, 0, 72, 0, -1, -1, -1, -1,
40, 0, 895, 0, 41, 0, 0, 0, 6952,
0, 170, 0, -1, -1, -1, -1, 3, 0,
28, 0, -1, -1, 0, 0, 111, 0, 58,
0, 110, 0, 569, 0, -1, -1, -1, -1,
-1, -1, 0, 0, 24512, 0, 3, 0, 0,
0], dtype=int32),
'id': array(8045, dtype=int32),
'input_mask': array(
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32),
'segment_ids': array(
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
'score': array(1, dtype=int32),
'label': array(1, dtype=int32)}
example 24993: {
'input_ids': array(
[ -1, -1, 11, 0, 7400, 0, 189, 0, 4, 0, 1247,
0, 9, 0, 17, 0, 29, 0, 0, 0, -1, -1,
-1, -1, -1, -1, 1, 0, -1, -1, 218, 0, 131,
0, 10, 0, -1, -1, 52, 0, 72, 0, 488, 0,
6, 0, -1, -1, -1, -1, -1, -1, 1749, 0, 0,
0, -1, -1, 42, 0, 21, 0, 65, 0, 6895, 0,
-1, -1, -1, -1, -1, -1, 11, 0, 52, 0, 72,
0, 1498, 0, 10, 0, 21, 0, 65, 0, 19, 0,
-1, -1, -1, -1, 36, 0, 130, 0, 88, 0, 210,
0], dtype=int32),
'id': array(9903, dtype=int32),
'input_mask': array(
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int32),
'segment_ids': array(
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
'score': array(7, dtype=int32),
'label': array(0, dtype=int32)}
```
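- id : the numeric id of the review, taken from the leading part of the review file name; for example, the id "3219" comes from a review file named **3219**_10.txt.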
- label : indicates whether the review is positive or negative (positive: 0, negative: 1).