!7878 Remove generate_hccn_file from cnnctc and psenet.

Merge pull request !7878 from linqingke/cnnctc
This commit is contained in:
mindspore-ci-bot 2020-10-28 21:01:47 +08:00 committed by Gitee
commit b1f88ad439
6 changed files with 23 additions and 195 deletions

View File

@ -150,7 +150,6 @@ The entire code structure is as following:
|---callback.py // loss callback file
|---dataset.py // process dataset
|---util.py // routine operation
|---generate_hccn_file.py // generate distribute json file
|---preprocess_dataset.py // preprocess dataset
```

View File

@ -31,7 +31,6 @@ echo $PATH1
PATH2=$(get_real_path $2)
echo $PATH2
python ${current_exec_path}/src/generate_hccn_file.py --rank_file=$PATH1
export RANK_TABLE_FILE=$PATH1
export RANK_SIZE=8
ulimit -u unlimited

View File

@ -1,88 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""generate ascend rank file"""
import os
import socket
import argparse
parser = argparse.ArgumentParser(description="ascend distribute rank.")
parser.add_argument("--rank_file", type=str, default="scripts/rank_table_8p.json", help="rank_table_file_path.")


def main(rank_table_file, hccn_conf_path='/etc/hccn.conf'):
    """Generate an 8-device Ascend HCCL rank table and dump it as JSON.

    Args:
        rank_table_file (str): destination path for the generated JSON file.
        hccn_conf_path (str): HCCN configuration file containing
            ``address_<device_id>=<device_ip>`` lines. Defaults to the
            standard system location ``/etc/hccn.conf``.
    """
    import json  # kept function-scoped, as in the original script
    nproc_per_node = 8
    visible_devices = ['0', '1', '2', '3', '4', '5', '6', '7']
    # NOTE(review): assumes the host name resolves to the server's IP —
    # confirm this holds on the deployment machines.
    server_id = socket.gethostbyname(socket.gethostname())
    # Use a context manager so the handle is always closed
    # (the original leaked the open file object).
    with open(hccn_conf_path, 'r') as hccn_fp:
        hccn_configs = hccn_fp.readlines()
    device_ips = {}
    for hccn_item in hccn_configs:
        hccn_item = hccn_item.strip()
        if hccn_item.startswith('address_'):
            device_id, device_ip = hccn_item.split('=')
            device_id = device_id.split('_')[1]
            device_ips[device_id] = device_ip
            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
    hccn_table = {}
    hccn_table['board_id'] = '0x002f'  # A+K
    # hccn_table['board_id'] = '0x0000'  # A+X
    hccn_table['chip_info'] = '910'
    hccn_table['deploy_mode'] = 'lab'
    hccn_table['group_count'] = '1'
    hccn_table['group_list'] = []
    # One rank (instance) per visible device, all on a single server.
    instance_list = []
    for instance_id in range(nproc_per_node):
        device_id = visible_devices[instance_id]
        instance = {
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(instance_id),
            'server_id': server_id,
        }
        instance_list.append(instance)
    hccn_table['group_list'].append({
        'device_num': str(nproc_per_node),
        'server_num': '1',
        'group_name': '',
        'instance_count': str(nproc_per_node),
        'instance_list': instance_list,
    })
    hccn_table['para_plane_nic_location'] = 'device'
    hccn_table['para_plane_nic_name'] = [
        'eth{}'.format(visible_devices[i]) for i in range(nproc_per_node)
    ]
    hccn_table['para_plane_nic_num'] = str(nproc_per_node)
    hccn_table['status'] = 'completed'
    with open(rank_table_file, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
if __name__ == '__main__':
    # CLI entry point: parse the target path and generate the rank table
    # only when it is not already present on disk.
    cli_args = parser.parse_args()
    table_path = cli_args.rank_file
    if not os.path.exists(table_path):
        print('Generating rank table file.')
        main(table_path)
        print('Rank table file generated')
    else:
        print('Rank table file exists.')

View File

@ -58,7 +58,7 @@ A testing set containing about 2000 readable words
After installing MindSpore via the official website, you can start training and evaluation as follows:
```python
# run distributed training example
sh scripts/run_distribute_train.sh pretrained_model.ckpt
sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt
#download opencv library
download pyblind11, opencv3.4
@ -91,7 +91,6 @@ sh scripts/run_eval_ascend.sh
└── run_eval_ascend.sh // shell script for evaluation
├── src
├── __init__.py
├── generate_hccn_file.py // creating rank.json
├── ETSNET
├── __init__.py
├── base.py // convolution and BN operator
@ -130,7 +129,7 @@ Major parameters in train.py and config.py are:
### Distributed Training
```
sh scripts/run_distribute_train.sh pretrained_model.ckpt
sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt
```
The above shell script will run distribute training in the background. You can view the results through the file
@ -169,18 +168,18 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean
| Parameters | PSENet |
| -------------------------- | ----------------------------------------------------------- |
| Model Version | Inception V1 |
| Model Version | V1 |
| Resource | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G |
| uploaded Date | 09/15/2020 (month/day/year) |
| MindSpore Version | 1.0-alpha |
| uploaded Date | 09/30/2020 (month/day/year) |
| MindSpore Version | 1.0.0 |
| Dataset | ICDAR2015 |
| Training Parameters | start_lr=0.1; lr_scale=0.1 |
| Optimizer | SGD |
| Loss Function | LossCallBack |
| outputs | probability |
| Loss | 0.35 |
| Speed | 1pc: 444 ms/step; 4pcs: 446 ms/step |
| Total time | 1pc: 75.48 h; 4pcs: 18.87 h |
| Speed | 1pc: 444 ms/step; 8pcs: 446 ms/step |
| Total time | 1pc: 75.48 h; 8pcs: 10.01 h |
| Parameters (M) | 27.36 |
| Checkpoint for Fine tuning | 109.44M (.ckpt file) |
| Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet |
@ -190,13 +189,13 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean
| Parameters | PSENet |
| ------------------- | --------------------------- |
| Model Version | Inception V1 |
| Model Version | V1 |
| Resource | Ascend 910 |
| Uploaded Date | 09/15/2020 (month/day/year) |
| MindSpore Version | 1.0-alpha |
| Uploaded Date | 09/30/2020 (month/day/year) |
| MindSpore Version | 1.0.0 |
| Dataset | ICDAR2015 |
| outputs | probability |
| Accuracy | 1pc: 81%; 4pcs: 81% |
| Accuracy | 1pc: 81%; 8pcs: 81% |
## [How to use](#contents)

View File

@ -17,9 +17,9 @@
current_exec_path=$(pwd)
echo 'current_exec_path: '${current_exec_path}
if [ $# != 1 ]
if [ $# != 2 ]
then
echo "Usage: sh run_distribute_train.sh [PRETRAINED_PATH]"
echo "Usage: sh run_distribute_train.sh [RANK_FILE] [PRETRAINED_PATH]"
exit 1
fi
@ -30,20 +30,24 @@ get_real_path(){
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -f $PATH1 ]
then
echo "error: PRETRAINED_PATH=$PATH1 is not a file"
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
exit 1
fi
python ${current_exec_path}/src/generate_hccn_file.py
PATH2=$(get_real_path $2)
if [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_PATH=$PATH2 is not a file"
exit 1
fi
export DEVICE_NUM=8
export RANK_SIZE=8
export RANK_TABLE_FILE=${current_exec_path}/rank_table_8p.json
export RANK_TABLE_FILE=$PATH1
for((i=0; i<${DEVICE_NUM}; i++))
do
@ -70,7 +74,7 @@ do
cd ${current_exec_path}/device_$i || exit
export RANK_ID=$i
export DEVICE_ID=$i
python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH1 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 &
python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH2 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 &
cd ${current_exec_path} || exit
done

View File

@ -1,85 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import socket
RANK_TABLE_SAVE_PATH = './rank_table_8p.json'


def main(hccn_conf_path='/etc/hccn.conf', rank_table_file=RANK_TABLE_SAVE_PATH):
    """Generate a 4-device Ascend HCCL rank table and dump it as JSON.

    NOTE(review): this variant builds a 4-device table although the default
    output name says "8p" — confirm the intended device count.

    Args:
        hccn_conf_path (str): HCCN configuration file containing
            ``address_<device_id>=<device_ip>`` lines. Defaults to the
            standard system location ``/etc/hccn.conf``.
        rank_table_file (str): destination path for the generated JSON file.
            Defaults to ``RANK_TABLE_SAVE_PATH``.
    """
    import json  # kept function-scoped, as in the original script
    nproc_per_node = 4
    visible_devices = ['0', '1', '2', '3']
    # NOTE(review): assumes the host name resolves to the server's IP —
    # confirm this holds on the deployment machines.
    server_id = socket.gethostbyname(socket.gethostname())
    # Use a context manager so the handle is always closed
    # (the original leaked the open file object).
    with open(hccn_conf_path, 'r') as hccn_fp:
        hccn_configs = hccn_fp.readlines()
    device_ips = {}
    for hccn_item in hccn_configs:
        hccn_item = hccn_item.strip()
        if hccn_item.startswith('address_'):
            device_id, device_ip = hccn_item.split('=')
            device_id = device_id.split('_')[1]
            device_ips[device_id] = device_ip
            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
    hccn_table = {}
    hccn_table['board_id'] = '0x002f'  # A+K
    hccn_table['chip_info'] = '910'
    hccn_table['deploy_mode'] = 'lab'
    hccn_table['group_count'] = '1'
    hccn_table['group_list'] = []
    # One rank (instance) per visible device, all on a single server.
    instance_list = []
    for instance_id in range(nproc_per_node):
        device_id = visible_devices[instance_id]
        instance = {
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(instance_id),
            'server_id': server_id,
        }
        instance_list.append(instance)
    hccn_table['group_list'].append({
        'device_num': str(nproc_per_node),
        'server_num': '1',
        'group_name': '',
        'instance_count': str(nproc_per_node),
        'instance_list': instance_list,
    })
    hccn_table['para_plane_nic_location'] = 'device'
    hccn_table['para_plane_nic_name'] = [
        'eth{}'.format(visible_devices[i]) for i in range(nproc_per_node)
    ]
    hccn_table['para_plane_nic_num'] = str(nproc_per_node)
    hccn_table['status'] = 'completed'
    with open(rank_table_file, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
if __name__ == '__main__':
    # Script entry point: skip generation when the table already exists.
    if not os.path.exists(RANK_TABLE_SAVE_PATH):
        print('Generating rank table file.')
        main()
        print('Rank table file generated')
    else:
        print('Rank table file exists.')