forked from mindspore-Ecosystem/mindspore
!7878 Remove generate_hccn_file from cnnctc and psenet.
Merge pull request !7878 from linqingke/cnnctc
This commit is contained in:
commit
b1f88ad439
|
@ -150,7 +150,6 @@ The entire code structure is as following:
|
|||
|---callback.py // loss callback file
|
||||
|---dataset.py // process dataset
|
||||
|---util.py // routine operation
|
||||
|---generate_hccn_file.py // generate distribute json file
|
||||
|---preprocess_dataset.py // preprocess dataset
|
||||
|
||||
```
|
||||
|
|
|
@ -31,7 +31,6 @@ echo $PATH1
|
|||
PATH2=$(get_real_path $2)
|
||||
echo $PATH2
|
||||
|
||||
python ${current_exec_path}/src/generate_hccn_file.py --rank_file=$PATH1
|
||||
export RANK_TABLE_FILE=$PATH1
|
||||
export RANK_SIZE=8
|
||||
ulimit -u unlimited
|
||||
|
|
|
@ -1,88 +0,0 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""generate ascend rank file"""
|
||||
|
||||
import os
|
||||
import socket
|
||||
import argparse
|
||||
|
||||
# Command-line interface: one option naming the rank table file to create.
parser = argparse.ArgumentParser(description="ascend distribute rank.")
parser.add_argument(
    "--rank_file",
    type=str,
    default="scripts/rank_table_8p.json",
    help="Path of the rank table file to generate.",
)
|
||||
|
||||
def main(rank_table_file, nproc_per_node=8, hccn_conf_path='/etc/hccn.conf', server_id=None):
    """Build a one-server rank table from the HCCN config and save it as JSON.

    Every ``address_<id>=<ip>`` line of ``hccn_conf_path`` supplies the IP of
    device ``<id>``. Devices ``0 .. nproc_per_node - 1`` are then listed in a
    single group, one instance per device, with ``rank_id`` equal to the device
    index. The resulting table is written to ``rank_table_file``.

    Args:
        rank_table_file (str): Path of the JSON file to write. An existing
            file is overwritten.
        nproc_per_node (int): Number of devices to list. Default: 8.
        hccn_conf_path (str): Config file the device IPs are read from.
            Default: '/etc/hccn.conf'.
        server_id (str): Value stored as ``server_id`` of every instance. When
            None, the address the local host name resolves to is used.

    Raises:
        KeyError: If a listed device has no ``address_<id>`` line in the config.
        OSError: If the config cannot be read, the host name cannot be
            resolved, or the table cannot be written.
    """
    import json

    visible_devices = [str(device) for device in range(nproc_per_node)]

    if server_id is None:
        server_id = socket.gethostbyname(socket.gethostname())

    # Collect device_id -> device_ip; the context manager closes the config
    # file even when parsing fails.
    device_ips = {}
    with open(hccn_conf_path, 'r') as hccn_fp:
        for hccn_item in hccn_fp:
            hccn_item = hccn_item.strip()
            if not hccn_item.startswith('address_'):
                continue
            # Split on the first '=' only, and tolerate spaces around it.
            device_key, separator, device_ip = hccn_item.partition('=')
            if not separator:
                continue
            device_id = device_key.strip().split('_')[1]
            device_ip = device_ip.strip()
            device_ips[device_id] = device_ip
            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))

    # Fail before anything is written, naming every device that is missing.
    missing = [device_id for device_id in visible_devices if device_id not in device_ips]
    if missing:
        raise KeyError('no address_<id> entry in {} for device(s): {}'.format(
            hccn_conf_path, ', '.join(missing)))

    instance_list = [
        {
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(rank_id),
            'server_id': server_id,
        }
        for rank_id, device_id in enumerate(visible_devices)
    ]

    # Key order matches the order in which the table was originally filled,
    # so the dumped JSON keeps the same layout.
    hccn_table = {
        # '0x002f' was labelled "A+K" here; '0x0000' was the commented-out
        # alternative labelled "A+X".
        'board_id': '0x002f',
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': str(nproc_per_node),
            'server_num': '1',
            'group_name': '',
            'instance_count': str(nproc_per_node),
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(eth_id) for eth_id in visible_devices],
        'para_plane_nic_num': str(nproc_per_node),
        'status': 'completed',
    }

    with open(rank_table_file, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
|
||||
|
||||
if __name__ == '__main__':
    # Generate the rank table only when nothing exists at the requested path.
    rank_table = parser.parse_args().rank_file
    if not os.path.exists(rank_table):
        print('Generating rank table file.')
        main(rank_table)
        print('Rank table file generated')
    else:
        print('Rank table file exists.')
|
|
@ -58,7 +58,7 @@ A testing set containing about 2000 readable words
|
|||
After installing MindSpore via the official website, you can start training and evaluation as follows:
|
||||
```python
|
||||
# run distributed training example
|
||||
sh scripts/run_distribute_train.sh pretrained_model.ckpt
|
||||
sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt
|
||||
|
||||
#download opencv library
|
||||
download pybind11, opencv3.4
|
||||
|
@ -91,7 +91,6 @@ sh scripts/run_eval_ascend.sh
|
|||
└── run_eval_ascend.sh // shell script for evaluation
|
||||
├── src
|
||||
├── __init__.py
|
||||
├── generate_hccn_file.py // creating rank.json
|
||||
├── ETSNET
|
||||
├── __init__.py
|
||||
├── base.py // convolution and BN operator
|
||||
|
@ -130,7 +129,7 @@ Major parameters in train.py and config.py are:
|
|||
|
||||
### Distributed Training
|
||||
```
|
||||
sh scripts/run_distribute_train.sh pretrained_model.ckpt
|
||||
sh scripts/run_distribute_train.sh rank_table_file pretrained_model.ckpt
|
||||
```
|
||||
|
||||
The above shell script will run distributed training in the background. You can view the results through the file
|
||||
|
@ -169,18 +168,18 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean
|
|||
|
||||
| Parameters | PSENet |
|
||||
| -------------------------- | ----------------------------------------------------------- |
|
||||
| Model Version | Inception V1 |
|
||||
| Model Version | V1 |
|
||||
| Resource                   | Ascend 910; CPU 2.60GHz, 192 cores; Memory 755G             |
|
||||
| uploaded Date | 09/15/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0-alpha |
|
||||
| uploaded Date | 09/30/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0.0 |
|
||||
| Dataset | ICDAR2015 |
|
||||
| Training Parameters | start_lr=0.1; lr_scale=0.1 |
|
||||
| Optimizer | SGD |
|
||||
| Loss Function | LossCallBack |
|
||||
| outputs | probability |
|
||||
| Loss | 0.35 |
|
||||
| Speed | 1pc: 444 ms/step; 4pcs: 446 ms/step |
|
||||
| Total time | 1pc: 75.48 h; 4pcs: 18.87 h |
|
||||
| Speed | 1pc: 444 ms/step; 8pcs: 446 ms/step |
|
||||
| Total time | 1pc: 75.48 h; 8pcs: 10.01 h |
|
||||
| Parameters (M) | 27.36 |
|
||||
| Checkpoint for Fine tuning | 109.44M (.ckpt file) |
|
||||
| Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/psenet |
|
||||
|
@ -190,13 +189,13 @@ Calculated!{"precision": 0.814796668299853, "recall": 0.8006740491092923, "hmean
|
|||
|
||||
| Parameters | PSENet |
|
||||
| ------------------- | --------------------------- |
|
||||
| Model Version | Inception V1 |
|
||||
| Model Version | V1 |
|
||||
| Resource | Ascend 910 |
|
||||
| Uploaded Date | 09/15/2020 (month/day/year) |
|
||||
| MindSpore Version | 1.0-alpha |
|
||||
| Uploaded Date | 09/30/2020 (month/day/year) |
|
||||
| MindSpore Version   | 1.0.0                       |
|
||||
| Dataset | ICDAR2015 |
|
||||
| outputs | probability |
|
||||
| Accuracy | 1pc: 81%; 4pcs: 81% |
|
||||
| Accuracy | 1pc: 81%; 8pcs: 81% |
|
||||
|
||||
## [How to use](#contents)
|
||||
|
||||
|
|
|
@ -17,9 +17,9 @@
|
|||
current_exec_path=$(pwd)
|
||||
echo 'current_exec_path: '${current_exec_path}
|
||||
|
||||
if [ $# != 1 ]
|
||||
if [ $# != 2 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [PRETRAINED_PATH]"
|
||||
echo "Usage: sh run_distribute_train.sh [RANK_FILE] [PRETRAINED_PATH]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -30,20 +30,24 @@ get_real_path(){
|
|||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
PATH1=$(get_real_path $1)
|
||||
|
||||
|
||||
if [ ! -f $PATH1 ]
|
||||
then
|
||||
echo "error: PRETRAINED_PATH=$PATH1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python ${current_exec_path}/src/generate_hccn_file.py
|
||||
PATH2=$(get_real_path $2)
|
||||
if [ ! -f $PATH2 ]
|
||||
then
|
||||
echo "error: PRETRAINED_PATH=$PATH2 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
export RANK_TABLE_FILE=${current_exec_path}/rank_table_8p.json
|
||||
export RANK_TABLE_FILE=$PATH1
|
||||
|
||||
for((i=0; i<${DEVICE_NUM}; i++))
|
||||
do
|
||||
|
@ -70,7 +74,7 @@ do
|
|||
cd ${current_exec_path}/device_$i || exit
|
||||
export RANK_ID=$i
|
||||
export DEVICE_ID=$i
|
||||
python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH1 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 &
|
||||
python ${current_exec_path}/train.py --run_distribute --device_id $i --pre_trained $PATH2 --device_num ${DEVICE_NUM} >test_deep$i.log 2>&1 &
|
||||
cd ${current_exec_path} || exit
|
||||
done
|
||||
|
||||
|
|
|
@ -1,85 +0,0 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
|
||||
import os
|
||||
import socket
|
||||
|
||||
# Path the generated rank table is written to; also the path checked for an
# existing table when this file is run as a script. Relative to the current
# working directory.
RANK_TABLE_SAVE_PATH = './rank_table_8p.json'
|
||||
|
||||
|
||||
def main(rank_table_file=None, nproc_per_node=4, hccn_conf_path='/etc/hccn.conf', server_id=None):
    """Build a one-server rank table from the HCCN config and save it as JSON.

    Every ``address_<id>=<ip>`` line of ``hccn_conf_path`` supplies the IP of
    device ``<id>``. Devices ``0 .. nproc_per_node - 1`` are then listed in a
    single group, one instance per device, with ``rank_id`` equal to the device
    index. The resulting table is written to ``rank_table_file``.

    Called without arguments, it behaves as before: four devices, the config
    at '/etc/hccn.conf', and output to ``RANK_TABLE_SAVE_PATH``.

    Args:
        rank_table_file (str): Path of the JSON file to write. An existing
            file is overwritten. When None, ``RANK_TABLE_SAVE_PATH`` is used.
        nproc_per_node (int): Number of devices to list. Default: 4.
        hccn_conf_path (str): Config file the device IPs are read from.
            Default: '/etc/hccn.conf'.
        server_id (str): Value stored as ``server_id`` of every instance. When
            None, the address the local host name resolves to is used.

    Raises:
        KeyError: If a listed device has no ``address_<id>`` line in the config.
        OSError: If the config cannot be read, the host name cannot be
            resolved, or the table cannot be written.
    """
    import json

    # NOTE(review): the default of 4 devices is kept from the original, yet the
    # default output file is named rank_table_8p.json - confirm which is intended.
    visible_devices = [str(device) for device in range(nproc_per_node)]

    if rank_table_file is None:
        rank_table_file = RANK_TABLE_SAVE_PATH
    if server_id is None:
        server_id = socket.gethostbyname(socket.gethostname())

    # Collect device_id -> device_ip; the context manager closes the config
    # file even when parsing fails.
    device_ips = {}
    with open(hccn_conf_path, 'r') as hccn_fp:
        for hccn_item in hccn_fp:
            hccn_item = hccn_item.strip()
            if not hccn_item.startswith('address_'):
                continue
            # Split on the first '=' only, and tolerate spaces around it.
            device_key, separator, device_ip = hccn_item.partition('=')
            if not separator:
                continue
            device_id = device_key.strip().split('_')[1]
            device_ip = device_ip.strip()
            device_ips[device_id] = device_ip
            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))

    # Fail before anything is written, naming every device that is missing.
    missing = [device_id for device_id in visible_devices if device_id not in device_ips]
    if missing:
        raise KeyError('no address_<id> entry in {} for device(s): {}'.format(
            hccn_conf_path, ', '.join(missing)))

    instance_list = [
        {
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ips[device_id],
            }],
            'rank_id': str(rank_id),
            'server_id': server_id,
        }
        for rank_id, device_id in enumerate(visible_devices)
    ]

    # Key order matches the order in which the table was originally filled,
    # so the dumped JSON keeps the same layout.
    hccn_table = {
        'board_id': '0x002f',  # labelled "A+K" in the original
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': str(nproc_per_node),
            'server_num': '1',
            'group_name': '',
            'instance_count': str(nproc_per_node),
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(eth_id) for eth_id in visible_devices],
        'para_plane_nic_num': str(nproc_per_node),
        'status': 'completed',
    }

    with open(rank_table_file, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Generate the rank table only when no file exists at the default path.
    if not os.path.exists(RANK_TABLE_SAVE_PATH):
        print('Generating rank table file.')
        main()
        print('Rank table file generated')
    else:
        print('Rank table file exists.')
|
Loading…
Reference in New Issue