Compare commits

...

173 Commits
master ... r0.5

Author SHA1 Message Date
mindspore-ci-bot 0e3a39c223 !3357 modify device id
Merge pull request !3357 from changzherui/mod_device_id
2020-07-23 15:10:28 +08:00
changzherui bdc67ee2ca modify device id 2020-07-23 12:29:03 +08:00
mindspore-ci-bot 3f916bddd3 !3340 modify device id
Merge pull request !3340 from changzherui/mod_device_id
2020-07-22 23:24:19 +08:00
changzherui f689648872 modify device id 2020-07-22 21:21:43 +08:00
mindspore-ci-bot aef097d312 !3293 Add environment variable ASCEND_CUSTOM_FWK_PATH to fit specific ascend software structures
Merge pull request !3293 from yanghaoran/r0.5
2020-07-22 11:36:16 +08:00
yanghaoran d946b61a88 add environment variable ASCEND_CUSTOM_FWK_PATH to support specific ascend software structutres 2020-07-21 21:58:35 +08:00
mindspore-ci-bot 6ed3347962 !3170 fix avg_pool operator, adding filter input
Merge pull request !3170 from yanghaoran/r0.5
2020-07-17 21:54:39 +08:00
yanghaoran 86efd33682 fix avg_pool operator, adding filter input 2020-07-17 18:18:33 +08:00
mindspore-ci-bot 370de14557 !3122 update akg commit id to newest in branch r0.5
Merge pull request !3122 from looop5/update_r0.5_akg
2020-07-16 20:14:08 +08:00
looop5 c96fa96ac9 update akg commit id to newest in branch r0.5 2020-07-16 16:34:38 +08:00
mindspore-ci-bot 8e7ae18d0e !3044 delete thirdpraty/icu4c/filter.json
Merge pull request !3044 from qianlong21st/change_icu_compile_r0.5
2020-07-14 17:29:06 +08:00
qianlong 6458bef341 change icu4c compile way 2020-07-14 10:20:42 +08:00
mindspore-ci-bot 8d32dc74a4 !2932 delete package to_mindrecord
Merge pull request !2932 from shenwei41/sw_r0.5
2020-07-09 10:42:02 +08:00
shenwei41 4d3f556f8e delete package to_mindrecord 2020-07-09 09:22:43 +08:00
mindspore-ci-bot 988876c744 !2933 Update sqlite patch
Merge pull request !2933 from luoyang/son_r0.5
2020-07-08 15:08:19 +08:00
YangLuo 1b5ce7ae50 Update sqlite patch 2020-07-08 10:40:33 +08:00
mindspore-ci-bot e4bed3bf3d !2827 add libtiff notice info to r0.5
Merge pull request !2827 from xulei/r0.5
2020-07-02 21:31:48 +08:00
xulei2020 3450388471 add libtiff notice info to r0.5 2020-07-02 21:21:21 +08:00
mindspore-ci-bot 16711f3e62 !2797 update link of readme
Merge pull request !2797 from leiyuning/r0.5
2020-07-01 20:02:03 +08:00
leiyuning 17281bb339 update readme 2020-07-01 19:21:36 +08:00
mindspore-ci-bot 0295928983 !2778 gpu lstm network modify
Merge pull request !2778 from chenweifeng/r0.5_lstm
2020-07-01 17:21:08 +08:00
wilfChen 4c14c085aa gpu lstm performace 2020-07-01 09:50:04 +08:00
zhangzhenghai 8c8e5d99d0 update serving/ms_service.proto.
add copyright
2020-06-30 19:36:27 +08:00
mindspore-ci-bot 1aedd6cee1 !2578 Add case for precision of bert network
Merge pull request !2578 from ddwolf/add_case_for_precisoin_of_bert_to_r0.5
2020-06-30 18:50:26 +08:00
mindspore-ci-bot cf366aa0e7 !2758 update release notes - sync
Merge pull request !2758 from guozhijian/sync_update_release_notes
2020-06-30 18:38:52 +08:00
jonyguo b5e4174868 update release notes 2020-06-30 18:00:08 +08:00
mindspore-ci-bot 9b547e20f4 !2751 fix mix target device id
Merge pull request !2751 from kisnwang/fix-mix-target-device-id-r0.5
2020-06-30 17:18:39 +08:00
kswang d290c07a0f fix mix target device id 2020-06-30 15:19:25 +08:00
duxiutao dcf40c262b add case to guard precision and modify get const tensor value in composite kernel 2020-06-30 14:46:10 +08:00
mindspore-ci-bot ae82f743da !2739 rebase master to r0.5 for quantization aware training
Merge pull request !2739 from chenzhongming/diff_0.5_master
2020-06-30 14:13:41 +08:00
chenzomi 684ecac927 rebase master to r0.5 for quantizaiton aware training 2020-06-30 11:44:57 +08:00
mindspore-ci-bot 256bedf469 !2740 unroll print loss
Merge pull request !2740 from chenzhongming/unroll
2020-06-30 11:32:01 +08:00
mindspore-ci-bot 3c324e1031 !2729 Add resnext50 network
Merge pull request !2729 from z00378171/r0.5
2020-06-30 11:15:52 +08:00
mindspore-ci-bot 3b23d8e45a !2727 Add YOLOV3-DarkNet53 to Model Zoo
Merge pull request !2727 from yangyongjie/r0.5
2020-06-30 11:14:05 +08:00
chenzomi ff042b92ce unroll print loss 2020-06-30 10:42:37 +08:00
yangyongjie 123226c283 Add YOLOV3-DarkNet53 to Model Zoo 2020-06-30 09:41:03 +08:00
mindspore-ci-bot 412e4580d4 !2721 add thirdparty notice for bert/tokenization
Merge pull request !2721 from yuchaojie/r0.5
2020-06-29 22:13:47 +08:00
mindspore-ci-bot 0201001a41 !2731 remove export lite model api
Merge pull request !2731 from yangjie159/r0.5
2020-06-29 22:00:21 +08:00
yangjie159 46da06b3f8 remove export lite model api 2020-06-29 21:37:45 +08:00
zhouyaqiang 66d2690f95 add resnext50 network 2020-06-29 21:22:41 +08:00
yuchaojie a5fd01fd40 add thirdparty notice for bert/tokenization 2020-06-29 21:06:27 +08:00
mindspore-ci-bot f8fa03d732 !2712 Revert "Make assign-node to be before jump-node, ensure child graph can get its inputs"
Merge pull request !2712 from zhoufeng/revert-assign-node-order
2020-06-29 19:53:39 +08:00
mindspore-ci-bot 5b46b05b50 !2714 add notice
Merge pull request !2714 from jinyaohui/print
2020-06-29 19:29:29 +08:00
mindspore-ci-bot bb3e05c317 !2709 [r0.5]Fix bug of paramter is the second input of control depend
Merge pull request !2709 from chenfei_mindspore/parameter-should-not-act-as-the-second-input-of-control-depend-when-depend-mode-is-0
2020-06-29 19:11:50 +08:00
mindspore-ci-bot 7891b53fa8 !2704 fix quantization aware training auto create graph bug
Merge pull request !2704 from chenzhongming/r0.5
2020-06-29 19:08:28 +08:00
chenzomi c831d3eb60 fix quantization aware training auto create graph bug 2020-06-29 17:59:08 +08:00
jinyaohui 5db6852c75 add notice 2020-06-29 17:43:16 +08:00
zhoufeng 30001d68e2 Revert "Make assign-node to be before jump-node, ensure child graph can get its"
This reverts commit 8628b898e1.
2020-06-29 17:33:24 +08:00
chenfei 92aaf297c4 if parameter is the second input of control depend and depend mode is 0,this control relation is invalid 2020-06-29 17:17:51 +08:00
mindspore-ci-bot 2daa1b33c2 !2693 Move resnet_thor from example to model_zoo
Merge pull request !2693 from panbingao/r0.5
2020-06-29 16:21:30 +08:00
mindspore-ci-bot da9ba5e84d !2695 Update akg to r0.5 branch
Merge pull request !2695 from TronZhang/akg_r0.5
2020-06-29 12:49:32 +08:00
mindspore-ci-bot 381bbc4db5 !2691 use two condition, false branch caculate error
Merge pull request !2691 from hexia/deal-switch-input
2020-06-29 11:57:38 +08:00
mindspore-ci-bot bf52346e65 !2678 Add typeid to type conversion scene
Merge pull request !2678 from zhangbuxue/Add_typeid_to_type_conversion_scene
2020-06-29 11:36:43 +08:00
mindspore-ci-bot 39d171ce17 !2689 fix cast kernel build in pynative mode
Merge pull request !2689 from limingqi107/cast_pynative
2020-06-29 11:33:19 +08:00
tronzhang 2848ad1f82 update akg to r0.5 branch 2020-06-29 11:28:18 +08:00
panbingao 70bc8ed529 move resnet_thor series from example to model_zoo 2020-06-29 11:16:51 +08:00
hexia 9794b13c10 fix switch input 2020-06-29 10:59:16 +08:00
buxue fe8f47dc45 add typeid to type conversion scene 2020-06-29 10:27:42 +08:00
mindspore-ci-bot 84cdb9f4ed !2646 Fix grad value is wrong when register hook in pynative
Merge pull request !2646 from JoyLvliang/r0.5
2020-06-29 10:06:52 +08:00
mindspore-ci-bot 51dd49c176 !2628 move resnet series from example to model_zoo
Merge pull request !2628 from gengdongjie/r0.5
2020-06-29 09:45:09 +08:00
limingqi107 7fca26c201 fix cast kernel build in pynative mode 2020-06-29 09:39:43 +08:00
mindspore-ci-bot e70a8c840a !2670 Change comment for akg compilation option
Merge pull request !2670 from TronZhang/akg_default_r0.5
2020-06-29 09:33:26 +08:00
mindspore-ci-bot acbbe52984 !2685 GPU mul support int
Merge pull request !2685 from VectorSL/r0.5
2020-06-29 09:18:33 +08:00
mindspore-ci-bot f8b608cb1c !2686 update MindSpore version from r0.3 to r0.5
Merge pull request !2686 from yanghaoran/r0.5
2020-06-29 09:12:30 +08:00
mindspore-ci-bot 48b4a10f39 !2679 fix issue [controlflow] if Cascad an if, raise error
Merge pull request !2679 from wenchunjiang/fix_if_cascad_if_bug
2020-06-29 09:00:34 +08:00
mindspore-ci-bot 7ec0b35fac !2653 Mass text summarization update.
Merge pull request !2653 from linqingke/mass_r0.5
2020-06-28 22:26:52 +08:00
mindspore-ci-bot 2613d76a96 !2656 fix: tfrecord to mindrecord para check - sync
Merge pull request !2656 from guozhijian/sync_fix_tfrecord_to_mr_para_check
2020-06-28 22:06:27 +08:00
mindspore-ci-bot 778fdf6e49 !2672 auto paralle for sparse tensor gradient
Merge pull request !2672 from lirongzhen1/r0.5
2020-06-28 22:03:46 +08:00
yanghaoran 5b6a59e13e update version to 0.5 2020-06-28 22:03:39 +08:00
VectorSL 79ec5f7398 gpu mul support int 2020-06-28 22:03:25 +08:00
mindspore-ci-bot 25a34c0d13 !2666 fix python abort bug
Merge pull request !2666 from jinyaohui/backend
2020-06-28 21:56:19 +08:00
mindspore-ci-bot 2108f72cd3 !2682 [quant]The top level add op prefix_name check error r0.5
Merge pull request !2682 from vlne-v1/I1LJMR-quant-the-top-level-add-op-prefix_name-check-error-r0.5
2020-06-28 21:55:26 +08:00
gengdongjie 6f13315d90 move resnet series from example to model_zoo 2020-06-28 21:44:14 +08:00
Wei Luning f398495a88 fix bug in quant deploy export 2020-06-28 21:37:25 +08:00
mindspore-ci-bot abb7c40315 !2665 handle RecurseCompileGraph when one branch is Assign
Merge pull request !2665 from Margaret_wangrui/r0.5
2020-06-28 21:32:39 +08:00
mindspore-ci-bot 9b1b34d7ed !2647 Synchronize Ascend software suite 28 Jun 2020
Merge pull request !2647 from yanghaoran/r0.5
2020-06-28 21:21:46 +08:00
mindspore-ci-bot b5d8134682 !2649 Add group params check method and fix print comment
Merge pull request !2649 from ghzl/fix-group-params-type-r0.5
2020-06-28 20:47:40 +08:00
mindspore-ci-bot 269b514b05 !2650 get monitor sampling interval from json file
Merge pull request !2650 from yanghaitao/yht_sampling_interval_r0.5
2020-06-28 20:05:11 +08:00
mindspore-ci-bot 5a6988bc94 !2663 optimize is all nop node detect in mem reuse
Merge pull request !2663 from laiyongqiang/gpu_opt_r05
2020-06-28 20:00:00 +08:00
mindspore-ci-bot f6148c7e39 !2657 [CT][MS][Auto-Parallel]Double recursion does not support the gatherv2 operator
Merge pull request !2657 from Chong/r5
2020-06-28 19:50:06 +08:00
lvliang 24c0a8256f fix-grad-value-is-wrong-in-pynative-hook 2020-06-28 19:22:28 +08:00
wenchunjiang b3da4d9b97 fix bug of labelswitch generate task failed when if cascad if 2020-06-28 18:52:48 +08:00
lirongzhen1 c3a9f1455e auto parallel for sparse gradients 2020-06-28 18:42:08 +08:00
tronzhang 6259db4d5d change comment for akg option 2020-06-28 17:51:40 +08:00
jonyguo b3346a98b9 fix: tfrecord to mindrecord parameter check 2020-06-28 17:43:32 +08:00
jinyaohui 79b2fe28d5 fix bug 2020-06-28 17:38:45 +08:00
mindspore-ci-bot 16a75779be !2645 fix vgg16 accuracy lower then 92
Merge pull request !2645 from caojian05/ms_r0.5
2020-06-28 17:18:08 +08:00
Margaret_wangrui 390efd1207 handle RecurseCompileGraph when one branch is Assign 2020-06-28 17:11:08 +08:00
laiyongqiang 4799131e18 optimize is all nop node detect in mem reuse 2020-06-28 16:51:02 +08:00
hongxing eed1881f2d fix GatherV2/GatherV2P 2020-06-28 10:10:26 +02:00
linqingke c4d8c8aec0 Mass text summarization fix bug. 2020-06-28 16:07:34 +08:00
yanghaitao f3423208c4 set monitor sampling interval through json file 2020-06-28 15:30:37 +08:00
yanghaoran f44a0fd3df synchronize lastest Ascend software suite 28 Jun 2020 2020-06-28 15:30:01 +08:00
guohongzilong 9409f83245 fix params KeyError in group params 2020-06-28 15:29:54 +08:00
caojian05 0a261aba50 fix accurancy lower then 92 2020-06-28 15:19:27 +08:00
mindspore-ci-bot 6ef1a731db !2637 add wide&deep standalone training script for gpu in model zoo
Merge pull request !2637 from zyli2020/add_wide_deep_standalone_training_script
2020-06-28 14:37:37 +08:00
mindspore-ci-bot 40a0cd4a57 !2627 Modify long description format of whl package
Merge pull request !2627 from zhoufeng/modify-long-description-format-r0.5
2020-06-28 14:10:52 +08:00
lizhenyu 3231c4ab13 add wide&deep stanalone training script for gpu in model zoo 2020-06-28 12:02:30 +08:00
mindspore-ci-bot c74b16857a !2546 Fix some mistakes of ConfusionTransposeD vm ops
Merge pull request !2546 from liuwenhao/r0.5
2020-06-28 10:43:05 +08:00
mindspore-ci-bot 375f2bfa61 !2515 Avoid extra recording of summary when end called
Merge pull request !2515 from 李鸿章/no_step_zero
2020-06-28 10:28:02 +08:00
mindspore-ci-bot 231ef6bd74 !2539 1. fix infer value bug 2. tensor init support numpy number
Merge pull request !2539 from geekun/yjk_r0.5
2020-06-28 10:25:21 +08:00
mindspore-ci-bot 80ab1c0ab2 !2538 bugfix(transform): relax the exception of control depend on value node
Merge pull request !2538 from xianwz/r0.5
2020-06-28 10:25:04 +08:00
mindspore-ci-bot f5580ce722 !2617 Decide whether to collect data by dataset sink mode and current step in SummaryCollector
Merge pull request !2617 from ougongchang/fix_collect_freq_r0.5
2020-06-28 10:11:38 +08:00
zhoufeng 6f720a6cd8 Modify long description format of whl package
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2020-06-28 09:59:19 +08:00
liuwenhao4 89654580dd Fix some mistakes of ConfusionTransposeD vm ops 2020-06-28 09:53:44 +08:00
mindspore-ci-bot 4f377f2ab4 !2596 Make assign-node to be before jump-node, ensure child graph can get its input
Merge pull request !2596 from zhoufeng/assign-order-before-jump-r0.5
2020-06-28 09:40:58 +08:00
mindspore-ci-bot 5488268648 !2613 optimize fastrcnn training script
Merge pull request !2613 from yanghaitao/yht_fastrcnn_0.5
2020-06-27 17:40:51 +08:00
ougongchang 0934281adc Decide whether to collect data by dataset sink mode and current step in SummaryCollector.
Before, we only decide whether to collect data by current step,
it will not work well in dataset sink mode, so we check to see
if it's a dataset sink mode, and decide whether to collect data.
2020-06-27 16:29:58 +08:00
mindspore-ci-bot 1cadea12f0 !2469 add pretrain for lstm & vgg16 and remove lstm/vgg16/googlenet from directory 'mindspore/model_zoo'
Merge pull request !2469 from caojian05/ms_master_dev
2020-06-27 16:25:44 +08:00
yanghaitao1 0c519882b8 optimize fastrcnn training process 2020-06-27 03:36:09 -04:00
mindspore-ci-bot 122a931090 !2577 [AutoPar] copy Master commits to r0.5
Merge pull request !2577 from Chong/r5
2020-06-27 11:52:09 +08:00
mindspore-ci-bot 6d5ea0ee4d !2581 Add ut case test_lamb to r0.5
Merge pull request !2581 from TronZhang/add_test_lamb_r0.5
2020-06-27 10:36:50 +08:00
hongxing 300dd2971c merge master code to r0.5 2020-06-26 13:59:16 +02:00
mindspore-ci-bot ed22908c99 !2597 GraphKernel support akg batchmatmul
Merge pull request !2597 from DeshiChen/0624_akg_batmatmul_r0.5
2020-06-26 17:22:40 +08:00
dayschan cfe9c35659 GraphKernel support akg batchmatmul 2020-06-26 13:53:28 +08:00
zhoufeng 8628b898e1 Make assign-node to be before jump-node, ensure child graph can get its
input

Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2020-06-26 12:38:14 +08:00
mindspore-ci-bot 12a359b9de !2588 fix checkpoint evaliaction.
Merge pull request !2588 from chenzhongming/r0.5
2020-06-25 11:17:59 +08:00
chenzomi bed6332688 fix checkpoint evaliaction. 2020-06-25 09:44:34 +08:00
Tron Zhang 43d8e6af1d add ut case test_lamb 2020-06-24 21:29:14 +08:00
mindspore-ci-bot 3c48de8262 !2573 fix print file bug
Merge pull request !2573 from jinyaohui/print
2020-06-24 20:27:04 +08:00
mindspore-ci-bot dd75ebfae3 !2575 dataset: repair bug in GetTensor that access NullPtr
Merge pull request !2575 from ms_yan/r0.5_GetTensor
2020-06-24 19:50:50 +08:00
mindspore-ci-bot d90e43a23c !2496 Enhance callback module and strongly check if callbacks is list or not
Merge pull request !2496 from 李鸿章/callback
2020-06-24 17:55:41 +08:00
ms_yan 7d2fe8c279 change GetTensor into GetRow to avoid NullPtr 2020-06-24 17:45:43 +08:00
jinyaohui e893c70164 fix bug 2020-06-24 17:25:47 +08:00
mindspore-ci-bot 9ea10a0022 !2571 add ENABLE_GE
Merge pull request !2571 from jinyaohui/backend
2020-06-24 17:21:04 +08:00
jinyaohui bb17bc4081 add ENABLE_GE 2020-06-24 17:08:05 +08:00
Xian Weizhao 01f228b0d5 relax the exception of control depend on value node 2020-06-24 16:55:22 +08:00
Li Hongzhang 2f33c76d7b warn when values duplicate and set mode to 'eval' to avoid extra recording 2020-06-24 16:40:49 +08:00
mindspore-ci-bot fe1d4ca3bd !2555 checkpoint add model_type
Merge pull request !2555 from chenzhongming/r0.5
2020-06-24 15:18:37 +08:00
mindspore-ci-bot 2e76c9fb82 !2536 [Control sink]Fix bug of get call real outputs
Merge pull request !2536 from chenfei_mindspore/stop-in-tuple-get-item-of-update-call-real-input
2020-06-24 14:11:33 +08:00
chenzomi 3b632eac46 checkpoint add model_type 2020-06-24 13:48:36 +08:00
mindspore-ci-bot f1106a18aa !2486 Make sure record the first step data, and catch the ValueError when the loss is not a Scalar
Merge pull request !2486 from ougongchang/fix_summary_optimizer_r0.5
2020-06-24 11:10:27 +08:00
mindspore-ci-bot 0857f43e0e !2516 !2482 fix a bug :serialization.export parameter "file_name" doesn't work
Merge pull request !2516 from lilei/save_file
2020-06-24 11:10:25 +08:00
mindspore-ci-bot 9b65782e1b !2522 modify alexnet dataset.py
Merge pull request !2522 from wukesong/wks-r0.5
2020-06-24 10:25:38 +08:00
mindspore-ci-bot 572236bdd7 !2507 fix misspell and check parameters on graphdata
Merge pull request !2507 from heleiwang/r0.5_fix_misspell
2020-06-24 10:10:39 +08:00
geekun e4b3b72ebf fix infer value bug 2020-06-24 10:01:24 +08:00
mindspore-ci-bot 3f8a7920d5 !2513 fix bug to remove reshape when reshape is depend's input
Merge pull request !2513 from laiyongqiang/r0.5
2020-06-24 09:59:49 +08:00
chenfei 7ede538d6a visit stop if tuple getitem and maketuple of function GetCallRealOutputs 2020-06-24 09:23:38 +08:00
mindspore-ci-bot 49ef6b89dd !2525 Fix some mistakes of TransData vm ops
Merge pull request !2525 from liuwenhao/r0.5
2020-06-24 09:21:16 +08:00
heleiwang 5f61b83812 fix misspell and check parameters 2020-06-24 09:17:04 +08:00
mindspore-ci-bot 23771def82 !2523 Disable cuda9.2
Merge pull request !2523 from zhoufeng/disable-cuda9-r0.5
2020-06-24 09:12:52 +08:00
ougongchang 54af354597 Make sure record the first step data in SummaryCollector, and catch the ValueError when the loss is not a Scalar. 2020-06-24 09:00:17 +08:00
zhoufeng 5520a23f7c Disable cuda9.2, use cuda10.1 as default
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2020-06-23 22:31:37 +08:00
liuwenhao4 01789e1aa1 Fix some mistakes of TransData vm ops 2020-06-23 22:24:46 +08:00
wukesong 16544b37d6 modify 2020-06-23 22:14:38 +08:00
mindspore-ci-bot ab83bf18d9 !2509 Graph kernel use control sink
Merge pull request !2509 from zhoufeng/graph-kernel-use-control-sink-r0.5
2020-06-23 21:35:43 +08:00
lilei 5d4099704f fix a bug:save .pb file 2020-06-23 21:30:25 +08:00
mindspore-ci-bot 43871e45dd !2493 Add an output to apply_proximal_adagrad op register
Merge pull request !2493 from YuJianfeng/r0.5
2020-06-23 21:10:33 +08:00
laiyongqiang 21770e7b6f fix bug to remove reshape when reshape is depend's input 2020-06-23 20:33:43 +08:00
zhoufeng dd22792344 Graph kernel use control sink
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2020-06-23 19:17:44 +08:00
mindspore-ci-bot 166d886501 !2476 fix the summary operator is not work in constant folding scene
Merge pull request !2476 from ougongchang/fix_folding_constant_r0.5
2020-06-23 15:06:42 +08:00
Li Hongzhang 9532e53337 enhance callback module and strongly check callbacks is list or not 2020-06-23 14:52:26 +08:00
mindspore-ci-bot f76096333e !2488 delete ENABLE_GE
Merge pull request !2488 from mindspore_ding/backend
2020-06-23 14:37:26 +08:00
mindspore-ci-bot 19ff5002ab !2467 check control mode of control depend
Merge pull request !2467 from chenfei_mindspore/check-depend-mode-of-control-depend
2020-06-23 14:32:21 +08:00
mindspore-ci-bot 73839591f5 !2481 add bert inference example in serving
Merge pull request !2481 from dinghao/r0.5
2020-06-23 12:19:58 +08:00
yujianfeng 34407391e6 Add an output to apply_proximal_adagrad op register 2020-06-23 11:56:45 +08:00
jinyaohui 29a2458596 delete ENABLE_GE 2020-06-23 11:42:54 +08:00
caojian05 a88e6ea270 add pretrain for lstm & vgg16 and remove lstm/vgg16/googlenet from directory 'mindspore/model_zoo' 2020-06-23 11:12:00 +08:00
chenfei 0e6752fa6a check control mode of control depend 2020-06-23 10:29:10 +08:00
dinghao 4ddb00b996 add bert example 2020-06-23 10:14:08 +08:00
mindspore-ci-bot 35ab95bfae !2461 Add multiple process for computation of optimizer in cpu
Merge pull request !2461 from YuJianfeng/r0.5
2020-06-23 09:12:58 +08:00
ougongchang 20a164e9cf fix the summary operator is not work in constant folding scene
The summary operator will be optimized when it return the origin value
in constant folding scene. So I return a None value to avoid this.
2020-06-23 09:11:18 +08:00
mindspore-ci-bot 1b52753fd7 !2462 optimize cpu reduce gradient
Merge pull request !2462 from kisnwang/optimize-reduce-sparse-gradient
2020-06-22 23:11:42 +08:00
kswang e74b02f460 optimize cpu reduce sparse gradient 2020-06-22 21:26:07 +08:00
mindspore-ci-bot 56b6191db5 !2449 Fix bug of ascend control parser
Merge pull request !2449 from chenfei_mindspore/fix-bug-ascend-control-parser
2020-06-22 21:14:35 +08:00
mindspore-ci-bot eef762e58a !2456 Fix BackendCommonOptimization order
Merge pull request !2456 from zhoufeng/xiu-ba-ge
2020-06-22 20:23:20 +08:00
yujianfeng 794ed3a291 Add multiple process for computation of sparse optimizers 2020-06-22 20:17:39 +08:00
mindspore-ci-bot a420c667c9 !2453 Change the dataset attribute in SummaryCollector
Merge pull request !2453 from ougongchang/r0.5
2020-06-22 19:07:01 +08:00
ougongchang 3c08137904 Change the attribute to children, becuase the attribute has beed changed in dataset 2020-06-22 17:28:23 +08:00
mindspore-ci-bot e726680e38 !2444 fix mix target entry
Merge pull request !2444 from kisnwang/fix-mix-target-entry-for-r0.5
2020-06-22 17:24:36 +08:00
chenfei 144aca43c3 fix bug of control parser 2020-06-22 16:53:42 +08:00
kswang ae3db6d4de fix mix target entry 2020-06-22 16:09:25 +08:00
zhoufeng d4de0c5af1 fix BackendCommonOptimization order
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2020-06-22 11:45:07 +08:00
322 changed files with 24217 additions and 48935 deletions

View File

@ -29,7 +29,7 @@ enrichment of the AI software/hardware application ecosystem.
<img src="docs/MindSpore-architecture.png" alt="MindSpore Architecture" width="600"/>
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/0.3.0-alpha/architecture.html).
For more details please check out our [Architecture Guide](https://www.mindspore.cn/docs/en/r0.5/architecture.html).
### Automatic Differentiation
@ -66,7 +66,6 @@ MindSpore offers build options across multiple backends:
| Ascend910 | Ubuntu-x86 | ✔️ |
| | EulerOS-x86 | ✔️ |
| | EulerOS-aarch64 | ✔️ |
| GPU CUDA 9.2 | Ubuntu-x86 | ✔️ |
| GPU CUDA 10.1 | Ubuntu-x86 | ✔️ |
| CPU | Ubuntu-x86 | ✔️ |
| | Windows-x86 | ✔️ |
@ -76,7 +75,7 @@ For installation using `pip`, take `CPU` and `Ubuntu-x86` build version as an ex
1. Download whl from [MindSpore download page](https://www.mindspore.cn/versions/en), and install the package.
```
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.3.0-alpha/MindSpore/cpu/ubuntu_x86/mindspore-0.3.0-cp37-cp37m-linux_x86_64.whl
pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl
```
2. Run the following command to verify the install.
@ -133,8 +132,8 @@ currently the containerized build options are supported as follows:
For `CPU` backend, you can directly pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-cpu:0.3.0-alpha
docker run -it mindspore/mindspore-cpu:0.3.0-alpha /bin/bash
docker pull mindspore/mindspore-cpu:0.5.0-beta
docker run -it mindspore/mindspore-cpu:0.5.0-beta /bin/bash
```
* GPU
@ -151,8 +150,8 @@ currently the containerized build options are supported as follows:
Then you can pull and run the latest stable image using the below command:
```
docker pull mindspore/mindspore-gpu:0.3.0-alpha
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.3.0-alpha /bin/bash
docker pull mindspore/mindspore-gpu:0.5.0-beta
docker run -it --runtime=nvidia --privileged=true mindspore/mindspore-gpu:0.5.0-beta /bin/bash
```
To test if the docker image works, please execute the python code below and check the output:
@ -187,7 +186,7 @@ please check out [docker](docker/README.md) repo for the details.
## Quickstart
See the [Quick Start](https://www.mindspore.cn/tutorial/en/0.3.0-alpha/quick_start/quick_start.html)
See the [Quick Start](https://www.mindspore.cn/tutorial/en/r0.5/quick_start/quick_start.html)
to implement the image classification.
## Docs

View File

@ -1,3 +1,75 @@
# Release 0.5.0-beta
## Major Features and Improvements
### Ascend 910 Training and Inference Framework
* New models
* ResNext50: a simple, highly modularized network architecture using aggregated resdiual transformations for image classification on ImageNet 2012 dataset.
* MASS: a pre-training method for sequence to sequence based language generation tasks on Text Summarization and Conversational Response Generation using News Crawls 2007-2017 dataset, Gigaword corpus and Cornell movie dialog corpus.
* Transformer: a neural network architecture for language understanding on WMT 2014 English-German dataset.
* GCNGraph Convolutional Networks for the task of classification of nodes in a graph on Cora and Citeseer datasets.
* GATan attention-based graph neural network for node classification on Cora and CiteSeer dataset.
* Frontend and user interface
* Support tensor value and assignment of mixed tensor index in graph mode.
* Support tensor comparison, len operator, constexpr syntax, value and assignment of tensor index in pynative mode.
* Support converting MindSpore IR to pb format for infer model.
* Support print operator to write data directly on the hard disk.
* Add the double recursive programming solution for very high speed parallel strategy search in automatic parallel.
* User interfaces change log
* Allow the learning rate of AdamWeightDecayDynamicLR and Lamb to be 0([!1826](https://gitee.com/mindspore/mindspore/pulls/1826))
* Restricting the entire network input parameter is Tensor([!1967](https://gitee.com/mindspore/mindspore/pulls/1967))
* Turn shape and dtype into attributes instead of interfaces([!1919](https://gitee.com/mindspore/mindspore/pulls/1919))
* Delete multitypefungraph([!2116](https://gitee.com/mindspore/mindspore/pulls/2116))
* Refactor the callback module in an encapsulated way, use _CallbackManager instead of _build_callbacks([!2236](https://gitee.com/mindspore/mindspore/pulls/2236))
* Delete EmbeddingLookup([!2163](https://gitee.com/mindspore/mindspore/pulls/2163))
* Checkpoint add model_type([!2517](https://gitee.com/mindspore/mindspore/pulls/2517))
* Executor and performance optimization
* Heterogeneous execution on CPU and Ascend devices supported, and is verified in Wide&Deep model.
* Quantitative training of MobileNetV2, Lenet and Resnet50 on Ascend-910 are supported.
* Support new fusion architecture, which can do fusion optimization across graphs and kernels to improve execution speed.
* Data processing, augmentation, and save format
* Support data processing pipeline performance profiling.
* Support public dataset loading, such as CLUE and Coco.
* Support more text processing, such as more tokenizers and vocab data.
* Support MindRecord padded data.
### Other Hardware Support
* GPU platform
* New model supported: Bert / Wide&Deep.
* Support setting max device memory.
* CPU platform
* New model supported: LSTM.
## Bugfixes
* Models
* Bert, Move Bert from `example` to `model_zoo`, optimize network for better performance. ([!1902](https://gitee.com/mindspore/mindspore/pulls/1902))
* VGG16, Move VGG16 from `example` to `model_zoo`, optimize network for better accuracy. ([!2645](https://gitee.com/mindspore/mindspore/pulls/2645))
* Alexnet, modify parameter setting to improve accuracy ([!1364](https://gitee.com/mindspore/mindspore/pulls/2370))
* Wide&Deep, Move Wide&Deep from `example` to `model_zoo`, optimize network for better performance. ([!2221](https://gitee.com/mindspore/mindspore/pulls/2221))
* Python API
* Fix bug in auto cast([!1766](https://gitee.com/mindspore/mindspore/pulls/1766))
* Fix bug of register_backward_hook([!2148](https://gitee.com/mindspore/mindspore/pulls/2148))
* Fix bug of tuple args in pynative mode([!1878](https://gitee.com/mindspore/mindspore/pulls/1878))
* Fix bug of checking numbers of arguments and graph parameters([!1701](https://gitee.com/mindspore/mindspore/pulls/1701))
* Executor
* Fix bug of loading input data repeatedly in pynative mode([!1966](https://gitee.com/mindspore/mindspore/pulls/1966))
* Fix bug of list cannot be used as input in pynative mode([!1765](https://gitee.com/mindspore/mindspore/pulls/1765))
* Fix bug of kernel select ([!2103](https://gitee.com/mindspore/mindspore/pulls/2103))
* Fix bug of pattern matching for batchnorm fusion in the case of auto mix precision.([!1851](https://gitee.com/mindspore/mindspore/pulls/1851))
* Fix bug of generate hccl's kernel info.([!2393](https://gitee.com/mindspore/mindspore/mindspore/pulls/2393))
* GPU platform
* Fix bug of summary feature invalid([!2173](https://gitee.com/mindspore/mindspore/pulls/2173))
* Data processing
* Fix bug of Cifar dataset reading([!2096](https://gitee.com/mindspore/mindspore/pulls/2096))
* Fix bug of C++ behavior in RandomCropAndResize([!2026](https://gitee.com/mindspore/mindspore/pulls/2026))
* Fix the bug of mindrecord shuffle([!2420](https://gitee.com/mindspore/mindspore/pulls/2420))
## Contributors
Thanks goes to these wonderful people:
Alexey Shevlyakov, avakh, baihuawei, BowenK, buxue, caifubi, caojian05, Cathy Wong, changzherui, chenfei, chengxianbin, chenhaozhe, chenjianping, chentingting, chenzomi, chujinjin, Danish Farid, dayschan, dengwentao, dinghao, etone-chan, fangzehua, fary86, geekun, Giancarlo Colmenares, gong chen, gukecai, guohongzilong, hangangqiang, heleiwang, hesham, He Wei, hexia, hongxing, huangdongrun, huanghui, islam_amin, Jamie Nisbet, Jesse Lee, jiangjinsheng, jiangzhiwen, jinyaohui, jjfeing, jojobugfree, Jonathan Yan, jonyguo, Junhan Hu, Kang, kingfo, kouzhenzhong, kpy, kswang, laiyongqiang, leopz, liangzelang, lichenever, lihongkang, Li Hongzhang, lilei, limingqi107, lirongzhen1, liubuyu, liuchongming74, liuwenhao4, liuxiao, Lixia Chen, liyanliu, liyong, lizhenyu, lvliang, Mahdi, Margaret_wangrui, meixiaowei, ms_yan, nhussain, ougongchang, panfengfeng, panyifeng, peilinwang, Peilin Wang, pkuliuliu, qianlong, rick_sanchez, shibeiji, Shida He, shijianning, simson, sunsuodong, suteng, Tinazhang, Tron Zhang, unknown, VectorSL, wandongdong, wangcong, wangdongxu, wangdongxu6, wanghua, wangnan39, Wei Luning, wenchunjiang, wenkai, wilfChen, WilliamLian, wukesong, Xian Weizhao, Xiaoda Zhang, xiefangqi, xulei2020, xunxue, xutianchun, Yang, yanghaitao, yanghaitao1, yanghaoran, yangjie, yangjie159, YangLuo, Yanjun Peng, yankai, yanzhenxiang2020, yao_yf, Yi Huaijie, yoonlee666, yuchaojie, yujianfeng, zhangzhongpeng, zhangdengcheng, Zhang Qinghua, zhangyinxia, zhangz0911gm, zhaojichen, zhaoting, zhaozhenlong, zhoufeng, zhouneng, zhousiyi, Zirui Wu, Ziyan, zjun, ZPaC, lihongzhang, wangdongxu
Contributions of any kind are welcome!
# Release 0.3.0-alpha
## Major Features and Improvements

View File

@ -3638,6 +3638,61 @@ Copyright (C) 2001-2005, International Business Machines Corporation and others.
Copyright (c) 1996-2016, International Business Machines Corporation
Copyright (C) 1997-2010, International Business Machines
Software: libtiff 4.1.0
Copyright notice:
Copyright © 2015 Open Microscopy Environment / University of Dundee
Copyright (c) 2004, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 1990-1997 Sam Leffler
Copyright (c) 1991-1997 Silicon Graphics, Inc.
Copyright (c) 1988-1997 Sam Leffler
Copyright (c) 1991-1997 Sam Leffler
Use and Copyright
Copyright (C) 1990, 1995 Frank D. Cringle.
Copyright (c) 1994-1997 Sam Leffler
Copyright (c) 1994-1997 Silicon Graphics, Inc.
Copyright (c) 1997 Greg Ward Larson
Copyright (c) 1997 Silicon Graphics, Inc.
Copyright (c) 2010, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) Joris Van Damme <info@awaresystems.be>
Copyright (c) AWare Systems <http:www.awaresystems.be/>
Copyright (c) 1996-1997 Sam Leffler
Copyright (c) 1996 Pixar
Copyright (c) 1995-1997 Sam Leffler
Copyright (c) 1995-1997 Silicon Graphics, Inc.
Copyright (c) 1988-1996 Sam Leffler
Copyright (c) 1991-1996 Silicon Graphics, Inc.
Copyright (c) 1992-1997 Sam Leffler
Copyright (c) 1992-1997 Silicon Graphics, Inc.
Copyright (c) 2018, Mapbox
Copyright (c) 2017, Planet Labs
Copyright (c) 1990 by Sun Microsystems, Inc.
Copyright 1990 by Digital Equipment Corporation, Maynard, Massachusetts.
Copyright 1991 by Digital Equipment Corporation, Maynard, Massachusetts.
Copyright (c) 2002, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2003 Ross Finlayson
Additions (c) Richard Nolde 2006-2010
Copyright (c) 2003, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2000, Frank Warmerdam
Copyright (c) 1987, 1993, 1994
Copyright (c) 1989, 1993
Copyright (c) 2009 Frank Warmerdam
Copyright (c) 1987, 1993
Copyright (c) 2005 The DragonFly Project. All rights reserved.
Copyright (c) 2003 Citrus Project,
All rights reserved.
Copyright (c) 1990, 1993
Copyright (c) 1996 Mike Johnson
Copyright (c) 1996 BancTec AB
Copyright (c) 2004, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 2012, Frank Warmerdam <warmerdam@pobox.com>
Copyright (c) 2019, Even Rouault <even.rouault at spatialys.com>
Copyright (c) 2007, Frank Warmerdam <warmerdam@pobox.com>
Copyright (c) 2019, Thomas Bernard <miniupnp@free.fr>
Copyright (c) 2008, Andrey Kiselev <dron@ak4719.spb.edu>
Copyright (c) 1999, Frank Warmerdam
Copyright (c) 1991-1996 Sam Leffler
Copyright (c) 1996 USAF Phillips Laboratory
Software: opencv 4.2.0
Copyright notice:
Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
@ -4095,3 +4150,11 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Software: bert eedf5716ce1268e56f0a50264a88cafad334ac61
MindSpore only used file bert/tokenization.py
Copyright notice:
Copyright 2018 The Google AI Language Team Authors.
Apache License, Version 2.0

2
akg

@ -1 +1 @@
Subproject commit c460176523d039c8995f1d71089753725ebc0792
Subproject commit 7c462a5d5acd073dfeff4a49b28e01af55c31c55

View File

@ -50,9 +50,9 @@ usage()
echo " -D Enable dumping of function graph ir, default on"
echo " -z Compile dataset & mindrecord, default on"
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
echo " -V Specify the minimum required cuda version, default CUDA 9.2"
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
echo " -I Compile predict, default off"
echo " -K Compile with AKG, default off"
echo " -K Compile with AKG, default on"
echo " -s Enable serving module, default off"
}
@ -88,7 +88,7 @@ checkopts()
ENABLE_DUMP_IR="on"
COMPILE_MINDDATA="on"
ENABLE_MPI="off"
CUDA_VERSION="9.2"
CUDA_VERSION="10.1"
COMPILE_PREDICT="off"
USE_GLOG="on"
PREDICT_PLATFORM=""
@ -191,6 +191,10 @@ checkopts()
usage
exit 1
fi
if [[ "X$OPTARG" == "X9.2" ]]; then
echo "Unsupported CUDA version 9.2"
exit 1
fi
CUDA_VERSION="$OPTARG"
;;
P)
@ -248,7 +252,7 @@ checkopts()
done
}
checkopts "$@"
echo "---------------- mindspore: build start ----------------"
echo "---------------- mindSpore: build start ----------------"
mkdir -pv "${BUILD_PATH}/package/mindspore/lib"
git submodule update --init graphengine
if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" ]]; then
@ -446,9 +450,9 @@ build_predict()
cd "${BASEPATH}/predict/output/"
if [[ "$PREDICT_PLATFORM" == "x86_64" ]]; then
tar -cf MSPredict-0.3.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.5.0-linux_x86_64.tar.gz include/ lib/ --warning=no-file-changed
elif [[ "$PREDICT_PLATFORM" == "arm64" ]]; then
tar -cf MSPredict-0.3.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
tar -cf MSPredict-0.5.0-linux_aarch64.tar.gz include/ lib/ --warning=no-file-changed
fi
echo "success to build predict project!"
}

View File

@ -45,7 +45,11 @@ else()
set(ASCEND_PATH /usr/local/Ascend)
endif()
set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common)
set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64)
if (DEFINED ENV{ASCEND_CUSTOM_FWK_PATH})
set(ASCEND_RUNTIME_PATH $ENV{ASCEND_CUSTOM_FWK_PATH}/fwkacllib/lib64)
else ()
set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64)
endif ()
find_library(c_sec libc_sec.so ${ASCEND_DRIVER_PATH})
find_library(slog libslog.so ${ASCEND_DRIVER_PATH})
find_library(mmpa libmmpa.so ${ASCEND_DRIVER_PATH})

View File

@ -9,11 +9,11 @@ else()
LIBS ${LIB_ICU_COMMON} ${LIB_ICU_DATA} ${LIB_ICU_I18N}
URL https://github.com/unicode-org/icu/archive/release-67-1.tar.gz
MD5 0c2662a2b0bc80b0eb56495205247c8f
CONFIGURE_COMMAND ./icu4c/source/runConfigureICU Linux --enable-rpath --disable-tests --disable-samples --disable-icuio --disable-extras ICU_DATA_FILTER_FILE=${CMAKE_SOURCE_DIR}/third_party/icu4c/filter.json
CONFIGURE_COMMAND ${CMAKE_SOURCE_DIR}/scripts/build_icu4c.sh
)
include_directories(${icu4c_INC})
add_library(mindspore::icuuc ALIAS icu4c::${LIB_ICU_COMMON})
add_library(mindspore::icudata ALIAS icu4c::${LIB_ICU_DATA})
add_library(mindspore::icui18n ALIAS icu4c::${LIB_ICU_I18N})
add_definitions(-D ENABLE_ICU4C)
endif()
endif()

View File

@ -0,0 +1,67 @@
FROM ubuntu:18.04
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV PATH /usr/local/bin:$PATH
# Install base tools
RUN apt update \
&& DEBIAN_FRONTEND=noninteractive apt install -y \
vim \
wget \
curl \
xz-utils \
net-tools \
openssh-client \
git \
ntpdate \
tzdata \
tcl \
sudo \
bash-completion
# Install compile tools
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
gcc \
g++ \
zlibc \
make \
libgmp-dev \
patch \
autoconf \
libtool \
automake \
flex
# Set bash
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash
# Install python (v3.7.5)
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
&& cd /tmp \
&& wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz
# Set pip source
RUN mkdir -pv /root/.pip \
&& echo "[global]" > /root/.pip/pip.conf \
&& echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
&& echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf
# Install MindSpore cpu whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/cpu/ubuntu_x86/mindspore-0.5.0-cp37-cp37m-linux_x86_64.whl

View File

@ -0,0 +1,83 @@
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
MAINTAINER leonwanghui <leon.wanghui@huawei.com>
# Set env
ENV PYTHON_ROOT_PATH /usr/local/python-3.7.5
ENV OMPI_ROOT_PATH /usr/local/openmpi-3.1.5
ENV PATH ${OMPI_ROOT_PATH}/bin:/usr/local/bin:$PATH
ENV LD_LIBRARY_PATH ${OMPI_ROOT_PATH}/lib:$LD_LIBRARY_PATH
# Install base tools
RUN apt update \
&& DEBIAN_FRONTEND=noninteractive apt install -y \
vim \
wget \
curl \
xz-utils \
net-tools \
openssh-client \
git \
ntpdate \
tzdata \
tcl \
sudo \
bash-completion
# Install compile tools
RUN DEBIAN_FRONTEND=noninteractive apt install -y \
gcc \
g++ \
zlibc \
make \
libgmp-dev \
patch \
autoconf \
libtool \
automake \
flex \
libnccl2=2.4.8-1+cuda10.1 \
libnccl-dev=2.4.8-1+cuda10.1
# Set bash
RUN echo "dash dash/sh boolean false" | debconf-set-selections
RUN DEBIAN_FRONTEND=noninteractive dpkg-reconfigure dash
# Install python (v3.7.5)
RUN apt install -y libffi-dev libssl-dev zlib1g-dev libbz2-dev libncurses5-dev \
libgdbm-dev libgdbm-compat-dev liblzma-dev libreadline-dev libsqlite3-dev \
&& cd /tmp \
&& wget https://github.com/python/cpython/archive/v3.7.5.tar.gz \
&& tar -xvf v3.7.5.tar.gz \
&& cd /tmp/cpython-3.7.5 \
&& mkdir -p ${PYTHON_ROOT_PATH} \
&& ./configure --prefix=${PYTHON_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -f /usr/local/bin/python \
&& rm -f /usr/local/bin/pip \
&& ln -s ${PYTHON_ROOT_PATH}/bin/python3.7 /usr/local/bin/python \
&& ln -s ${PYTHON_ROOT_PATH}/bin/pip3.7 /usr/local/bin/pip \
&& rm -rf /tmp/cpython-3.7.5 \
&& rm -f /tmp/v3.7.5.tar.gz
# Set pip source
RUN mkdir -pv /root/.pip \
&& echo "[global]" > /root/.pip/pip.conf \
&& echo "trusted-host=mirrors.aliyun.com" >> /root/.pip/pip.conf \
&& echo "index-url=http://mirrors.aliyun.com/pypi/simple/" >> /root/.pip/pip.conf
# Install openmpi (v3.1.5)
RUN cd /tmp \
&& wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.5.tar.gz \
&& tar -xvf openmpi-3.1.5.tar.gz \
&& cd /tmp/openmpi-3.1.5 \
&& mkdir -p ${OMPI_ROOT_PATH} \
&& ./configure --prefix=${OMPI_ROOT_PATH} \
&& make -j4 \
&& make install -j4 \
&& rm -rf /tmp/openmpi-3.1.5 \
&& rm -f /tmp/openmpi-3.1.5.tar.gz
# Install MindSpore cuda-10.1 whl package
RUN pip install --no-cache-dir https://ms-release.obs.cn-north-4.myhuaweicloud.com/0.5.0-beta/MindSpore/gpu/ubuntu_x86/cuda-10.1/mindspore_gpu-0.5.0-cp37-cp37m-linux_x86_64.whl

View File

@ -1,82 +0,0 @@
# Guideline to Convert Training Data CLUERNER2020 to MindRecord For Bert Fine Tuning
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [How to use the example to process CLUERNER2020](#how-to-use-the-example-to-process-cluerner2020)
- [Download CLUERNER2020 and unzip](#download-cluerner2020-and-unzip)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example is based on [CLUERNER2020](https://www.cluebenchmarks.com/introduce.html) training data, generating MindRecord file, and finally used for Bert Fine Tuning progress.
1. run.sh: generate MindRecord entry script
2. run_read.py: create MindDataset by MindRecord entry script.
- create_dataset.py: use MindDataset to read MindRecord to generate dataset.
## How to use the example to process CLUERNER2020
Download CLUERNER2020, convert it to MindRecord, use MindDataset to read MindRecord.
### Download CLUERNER2020 and unzip
1. Download the training data zip.
> [CLUERNER2020 dataset download address](https://www.cluebenchmarks.com/introduce.html) **-> 任务介绍 -> CLUENER 细粒度命名实体识别 -> cluener下载链接**
2. Unzip the training data to dir example/nlp_to_mindrecord/CLUERNER2020/cluener_public.
```
unzip -d {your-mindspore}/example/nlp_to_mindrecord/CLUERNER2020/data/cluener_public cluener_public.zip
```
### Generate MindRecord
1. Run the run.sh script.
```bash
bash run.sh
```
2. Output like this:
```
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:12.498.235 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/train.mindrecord'], and the list of index files are: ['data/train.mindrecord.db']
...
[INFO] ME(17603,python):2020-04-28-16:56:13.400.175 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.400.863 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.401.534 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.179 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
[INFO] ME(17603,python):2020-04-28-16:56:13.402.702 [mindspore/ccsrc/mindrecord/io/shard_writer.cc:667] WriteRawData] Write 1 records successfully.
...
[INFO] ME(17603:139620983514944,MainProcess):2020-04-28-16:56:13.431.208 [mindspore/mindrecord/filewriter.py:313] The list of mindrecord files created are: ['data/dev.mindrecord'], and the list of index files are: ['data/dev.mindrecord.db']
```
3. Generate files like this:
```bash
$ ls output/
dev.mindrecord dev.mindrecord.db README.md train.mindrecord train.mindrecord.db
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh
```
2. Output like this:
```
...
example 1340: input_ids: [ 101 3173 1290 4852 7676 3949 122 3299 123 126 3189 4510 8020 6381 5442 7357 2590 3636 8021 7676 3949 4294 1166 6121 3124 1277 6121 3124 7270 2135 3295 5789 3326 123 126 3189 1355 6134 1093 1325 3173 2399 6590 6791 8024 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1340: label_ids: [ 0 18 19 20 2 4 0 0 0 0 0 0 0 34 36 26 27 28 0 34 35 35 35 35 35 35 35 35 35 36 26 27 28 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_ids: [ 101 1728 711 4293 3868 1168 2190 2150 3791 934 3633 3428 4638 6237 7025 8024 3297 1400 5310 3362 6206 5023 5401 1744 3297 7770 3791 7368 976 1139 1104 2137 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 1341: label_ids: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 19 19 19 19 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
...
```

View File

@ -1,36 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import mindspore.dataset as ds
def create_dataset(data_file):
"""create MindDataset"""
num_readers = 4
data_set = ds.MindDataset(dataset_file=data_file, num_parallel_workers=num_readers, shuffle=True)
index = 0
for item in data_set.create_dict_iterator():
# print("example {}: {}".format(index, item))
print("example {}: input_ids: {}".format(index, item['input_ids']))
print("example {}: input_mask: {}".format(index, item['input_mask']))
print("example {}: segment_ids: {}".format(index, item['segment_ids']))
print("example {}: label_ids: {}".format(index, item['label_ids']))
index += 1
if index % 1000 == 0:
print("read rows: {}".format(index))
print("total rows: {}".format(index))
if __name__ == '__main__':
create_dataset('output/train.mindrecord')
create_dataset('output/dev.mindrecord')

View File

@ -1 +0,0 @@
cluener_public

View File

@ -1 +0,0 @@
## The input dataset

View File

@ -1 +0,0 @@
## output dir

View File

@ -1,40 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
rm -f output/train.mindrecord*
rm -f output/dev.mindrecord*
if [ ! -d "../../../third_party/to_mindrecord/CLUERNER2020" ]; then
echo "The patch base dir ../../../third_party/to_mindrecord/CLUERNER2020 is not exist."
exit 1
fi
if [ ! -f "../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch" ]; then
echo "The patch file ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch is not exist."
exit 1
fi
# patch for data_processor_seq.py
patch -p0 -d ../../../third_party/to_mindrecord/CLUERNER2020/ -o data_processor_seq_patched.py < ../../../third_party/patch/to_mindrecord/CLUERNER2020/data_processor_seq.patch
if [ $? -ne 0 ]; then
echo "Patch ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq.py failed"
exit 1
fi
# use patched script
python ../../../third_party/to_mindrecord/CLUERNER2020/data_processor_seq_patched.py \
--vocab_file=../../../third_party/to_mindrecord/CLUERNER2020/vocab.txt \
--label2id_file=../../../third_party/to_mindrecord/CLUERNER2020/label2id.json

View File

@ -1,173 +0,0 @@
# Guideline to Convert Training Data enwiki to MindRecord For Bert Pre Training
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [How to use the example to process enwiki](#how-to-use-the-example-to-process-enwiki)
- [Download enwiki training data](#download-enwiki-training-data)
- [Process the enwiki](#process-the-enwiki)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example is based on [enwiki](https://dumps.wikimedia.org/enwiki) training data, generating MindRecord file, and finally used for Bert network training.
1. run.sh: generate MindRecord entry script.
2. run_read.py: create MindDataset by MindRecord entry script.
- create_dataset.py: use MindDataset to read MindRecord to generate dataset.
## How to use the example to process enwiki
Download enwiki data, process it, convert it to MindRecord, use MindDataset to read MindRecord.
### Download enwiki training data
> [enwiki dataset download address](https://dumps.wikimedia.org/enwiki) **-> 20200501 -> enwiki-20200501-pages-articles-multistream.xml.bz2**
### Process the enwiki
1. Please follow the steps in [process enwiki](https://github.com/mlperf/training/tree/master/language_model/tensorflow/bert)
- All permissions of this step belong to the link address website.
### Generate MindRecord
1. Run the run.sh script.
```
bash run.sh input_dir output_dir vocab_file
```
- input_dir: the directory which contains files like 'part-00251-of-00500'.
- output_dir: which will store the output mindrecord files.
- vocab_file: the vocab file which you can download from other opensource project.
2. The output like this:
```
...
Begin preprocess Wed Jun 10 09:21:23 CST 2020
Begin preprocess input file: /mnt/data/results/part-00000-of-00500
Begin output file: part-00000-of-00500.mindrecord
Total task: 510, processing: 1
Begin preprocess input file: /mnt/data/results/part-00001-of-00500
Begin output file: part-00001-of-00500.mindrecord
Total task: 510, processing: 2
Begin preprocess input file: /mnt/data/results/part-00002-of-00500
Begin output file: part-00002-of-00500.mindrecord
Total task: 510, processing: 3
Begin preprocess input file: /mnt/data/results/part-00003-of-00500
Begin output file: part-00003-of-00500.mindrecord
Total task: 510, processing: 4
Begin preprocess input file: /mnt/data/results/part-00004-of-00500
Begin output file: part-00004-of-00500.mindrecord
Total task: 510, processing: 4
...
```
3. Generate files like this:
```bash
$ ls {your_output_dir}/
part-00000-of-00500.mindrecord part-00000-of-00500.mindrecord.db part-00001-of-00500.mindrecord part-00001-of-00500.mindrecord.db part-00002-of-00500.mindrecord part-00002-of-00500.mindrecord.db ...
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh input_dir
```
- input_dir: the directory which contains mindrecord files.
2. The output like this:
```
...
example 633: input_ids: [ 101 2043 19781 4305 2140 4520 2041 1010 103 2034 2455 2002
7879 2003 1996 2455 1997 103 26378 4160 1012 102 7291 2001
1996 103 1011 2343 1997 6327 1010 3423 1998 103 4262 2005
1996 2118 1997 2329 3996 103 102 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0]
example 633: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
example 633: masked_lm_positions: [ 8 17 20 25 33 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_ids: [ 1996 16137 1012 3580 2451 1012 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
example 633: masked_lm_weights: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0.]
example 633: next_sentence_labels: [1]
...
```

View File

@ -1,43 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import argparse
import mindspore.dataset as ds
def create_dataset(data_file):
"""create MindDataset"""
num_readers = 4
data_set = ds.MindDataset(dataset_file=data_file, num_parallel_workers=num_readers, shuffle=True)
index = 0
for item in data_set.create_dict_iterator():
# print("example {}: {}".format(index, item))
print("example {}: input_ids: {}".format(index, item['input_ids']))
print("example {}: input_mask: {}".format(index, item['input_mask']))
print("example {}: segment_ids: {}".format(index, item['segment_ids']))
print("example {}: masked_lm_positions: {}".format(index, item['masked_lm_positions']))
print("example {}: masked_lm_ids: {}".format(index, item['masked_lm_ids']))
print("example {}: masked_lm_weights: {}".format(index, item['masked_lm_weights']))
print("example {}: next_sentence_labels: {}".format(index, item['next_sentence_labels']))
index += 1
if index % 1000 == 0:
print("read rows: {}".format(index))
print("total rows: {}".format(index))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file", nargs='+', type=str, help='Input mindreord file')
args = parser.parse_args()
create_dataset(args.input_file)

View File

@ -1,133 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# -ne 3 ]; then
echo "Usage: $0 input_dir output_dir vocab_file"
exit 1
fi
if [ ! -d $1 ]; then
echo "The input dir: $1 is not exist."
exit 1
fi
if [ ! -d $2 ]; then
echo "The output dir: $2 is not exist."
exit 1
fi
rm -fr $2/*.mindrecord*
if [ ! -f $3 ]; then
echo "The vocab file: $3 is not exist."
exit 1
fi
data_dir=$1
output_dir=$2
vocab_file=$3
file_list=()
output_filename=()
file_index=0
function getdir() {
elements=`ls $1`
for element in ${elements[*]};
do
dir_or_file=$1"/"$element
if [ -d $dir_or_file ];
then
getdir $dir_or_file
else
file_list[$file_index]=$dir_or_file
echo "${dir_or_file}" | tr '/' '\n' > dir_file_list.txt # dir dir file to mapfile
mapfile parent_dir < dir_file_list.txt
rm dir_file_list.txt >/dev/null 2>&1
tmp_output_filename=${parent_dir[${#parent_dir[@]}-1]}".mindrecord"
output_filename[$file_index]=`echo ${tmp_output_filename} | sed 's/ //g'`
file_index=`expr $file_index + 1`
fi
done
}
getdir "${data_dir}"
# echo "The input files: "${file_list[@]}
# echo "The output files: "${output_filename[@]}
if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then
echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist."
exit 1
fi
if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then
echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist."
exit 1
fi
# patch for create_pretraining_data.py
patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch
if [ $? -ne 0 ]; then
echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed"
exit 1
fi
# get the cpu core count
num_cpu_core=`cat /proc/cpuinfo | grep "processor" | wc -l`
avaiable_core_size=`expr $num_cpu_core / 3 \* 2`
echo "Begin preprocess `date`"
# using patched script to generate mindrecord
file_list_len=`expr ${#file_list[*]} - 1`
for index in $(seq 0 $file_list_len); do
echo "Begin preprocess input file: ${file_list[$index]}"
echo "Begin output file: ${output_filename[$index]}"
python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \
--input_file=${file_list[$index]} \
--output_file=${output_dir}/${output_filename[$index]} \
--partition_number=1 \
--vocab_file=${vocab_file} \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=10 >/tmp/${output_filename[$index]}.log 2>&1 &
process_count=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
echo "Total task: ${#file_list[*]}, processing: ${process_count}"
if [ $process_count -ge $avaiable_core_size ]; then
while [ 1 ]; do
process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
if [ $process_count -gt $process_num ]; then
process_count=$process_num
break;
fi
sleep 2
done
fi
done
process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
while [ 1 ]; do
if [ $process_num -eq 0 ]; then
break;
fi
echo "There are still ${process_num} preprocess running ..."
sleep 2
process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
done
echo "Preprocess all the data success."
echo "End preprocess `date`"

View File

@ -1,113 +0,0 @@
# Guideline to Convert Training Data zhwiki to MindRecord For Bert Pre Training
<!-- TOC -->
- [What does the example do](#what-does-the-example-do)
- [Run simple test](#run-simple-test)
- [How to use the example to process zhwiki](#how-to-use-the-example-to-process-zhwiki)
- [Download zhwiki training data](#download-zhwiki-training-data)
- [Extract the zhwiki](#extract-the-zhwiki)
- [Generate MindRecord](#generate-mindrecord)
- [Create MindDataset By MindRecord](#create-minddataset-by-mindrecord)
<!-- /TOC -->
## What does the example do
This example is based on [zhwiki](https://dumps.wikimedia.org/zhwiki) training data, generating MindRecord file, and finally used for Bert network training.
1. run.sh: generate MindRecord entry script.
2. run_read.py: create MindDataset by MindRecord entry script.
- create_dataset.py: use MindDataset to read MindRecord to generate dataset.
## Run simple test
Follow the step:
```bash
bash run_simple.sh # generate output/simple.mindrecord* by ../../../third_party/to_mindrecord/zhwiki/sample_text.txt
bash run_read_simple.sh # use MindDataset to read output/simple.mindrecord*
```
## How to use the example to process zhwiki
Download zhwiki data, extract it, convert it to MindRecord, use MindDataset to read MindRecord.
### Download zhwiki training data
> [zhwiki dataset download address](https://dumps.wikimedia.org/zhwiki) **-> 20200401 -> zhwiki-20200401-pages-articles-multistream.xml.bz2**
- put the zhwiki-20200401-pages-articles-multistream.xml.bz2 in {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory.
### Extract the zhwiki
1. Download [wikiextractor](https://github.com/attardi/wikiextractor) script to {your-mindspore}/example/nlp_to_mindrecord/zhwiki/data directory.
```
$ ls data/
README.md wikiextractor zhwiki-20200401-pages-articles-multistream.xml.bz2
```
2. Extract the zhwiki.
```python
python data/wikiextractor/WikiExtractor.py data/zhwiki-20200401-pages-articles-multistream.xml.bz2 --processes 4 --templates data/template --bytes 8M --min_text_length 0 --filter_disambig_pages --output data/extract
```
3. Generate like this:
```
$ ls data/extract
AA AB
```
### Generate MindRecord
1. Run the run.sh script.
```
bash run.sh
```
> Caution: This process maybe slow, please wait patiently. If you do not have a machine with enough memory and cpu, it is recommended that you modify the script to generate mindrecord in step by step.
2. The output like this:
```
patching file create_pretraining_data_patched.py (read from create_pretraining_data.py)
Begin preprocess input file: ./data/extract/AA/wiki_00
Begin output file: AAwiki_00.mindrecord
Total task: 5, processing: 1
Begin preprocess input file: ./data/extract/AA/wiki_01
Begin output file: AAwiki_01.mindrecord
Total task: 5, processing: 2
Begin preprocess input file: ./data/extract/AA/wiki_02
Begin output file: AAwiki_02.mindrecord
Total task: 5, processing: 3
Begin preprocess input file: ./data/extract/AB/wiki_02
Begin output file: ABwiki_02.mindrecord
Total task: 5, processing: 4
...
```
3. Generate files like this:
```bash
$ ls output/
AAwiki_00.mindrecord AAwiki_00.mindrecord.db AAwiki_01.mindrecord AAwiki_01.mindrecord.db AAwiki_02.mindrecord AAwiki_02.mindrecord.db ... ABwiki_00.mindrecord ABwiki_00.mindrecord.db ...
```
### Create MindDataset By MindRecord
1. Run the run_read.sh script.
```bash
bash run_read.sh
```
2. The output like this:
```
...
example 74: input_ids: [ 101 8168 118 12847 8783 9977 15908 117 8256 9245 11643 8168 8847 8588 11575 8154 8228 143 8384 8376 9197 10241 103 10564 11421 8199 12268 112 161 8228 11541 9586 8436 8174 8363 9864 9702 103 103 119 103 9947 10564 103 8436 8806 11479 103 8912 119 103 103 103 12209 8303 103 8757 8824 117 8256 103 8619 8168 11541 102 11684 8196 103 8228 8847 11523 117 9059 9064 12410 8358 8181 10764 117 11167 11706 9920 148 8332 11390 8936 8205 10951 11997 103 8154 117 103 8670 10467 112 161 10951 13139 12413 117 10288 143 10425 8205 152 10795 8472 8196 103 161 12126 9172 13129 12106 8217 8174 12244 8205 143 103 8461 8277 10628 160 8221 119 102]
example 74: input_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
example 74: segment_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
example 74: masked_lm_positions: [ 6 22 37 38 40 43 47 50 51 52 55 60 67 76 89 92 98 109 120 0]
example 74: masked_lm_ids: [ 8118 8165 8329 8890 8554 8458 119 8850 8565 10392 8174 11467 10291 8181 8549 12718 13139 112 158 0]
example 74: masked_lm_weights: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
example 74: next_sentence_labels: [0]
...
```

View File

@ -1,43 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""create MindDataset by MindRecord"""
import argparse
import mindspore.dataset as ds
def create_dataset(data_file):
"""create MindDataset"""
num_readers = 4
data_set = ds.MindDataset(dataset_file=data_file, num_parallel_workers=num_readers, shuffle=True)
index = 0
for item in data_set.create_dict_iterator():
# print("example {}: {}".format(index, item))
print("example {}: input_ids: {}".format(index, item['input_ids']))
print("example {}: input_mask: {}".format(index, item['input_mask']))
print("example {}: segment_ids: {}".format(index, item['segment_ids']))
print("example {}: masked_lm_positions: {}".format(index, item['masked_lm_positions']))
print("example {}: masked_lm_ids: {}".format(index, item['masked_lm_ids']))
print("example {}: masked_lm_weights: {}".format(index, item['masked_lm_weights']))
print("example {}: next_sentence_labels: {}".format(index, item['next_sentence_labels']))
index += 1
if index % 1000 == 0:
print("read rows: {}".format(index))
print("total rows: {}".format(index))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file", nargs='+', type=str, help='Input mindreord file')
args = parser.parse_args()
create_dataset(args.input_file)

View File

@ -1,3 +0,0 @@
wikiextractor/
zhwiki-20200401-pages-articles-multistream.xml.bz2
extract/

View File

@ -1 +0,0 @@
## The input dataset

View File

@ -1 +0,0 @@
## Output the mindrecord

View File

@ -1,112 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
rm -f output/*.mindrecord*
data_dir="./data/extract"
file_list=()
output_filename=()
file_index=0
function getdir() {
elements=`ls $1`
for element in ${elements[*]};
do
dir_or_file=$1"/"$element
if [ -d $dir_or_file ];
then
getdir $dir_or_file
else
file_list[$file_index]=$dir_or_file
echo "${dir_or_file}" | tr '/' '\n' > dir_file_list.txt # dir dir file to mapfile
mapfile parent_dir < dir_file_list.txt
rm dir_file_list.txt >/dev/null 2>&1
tmp_output_filename=${parent_dir[${#parent_dir[@]}-2]}${parent_dir[${#parent_dir[@]}-1]}".mindrecord"
output_filename[$file_index]=`echo ${tmp_output_filename} | sed 's/ //g'`
file_index=`expr $file_index + 1`
fi
done
}
getdir "${data_dir}"
# echo "The input files: "${file_list[@]}
# echo "The output files: "${output_filename[@]}
if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then
echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist."
exit 1
fi
if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then
echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist."
exit 1
fi
# patch for create_pretraining_data.py
patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch
if [ $? -ne 0 ]; then
echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed"
exit 1
fi
# get the cpu core count
num_cpu_core=`cat /proc/cpuinfo | grep "processor" | wc -l`
avaiable_core_size=`expr $num_cpu_core / 3 \* 2`
echo "Begin preprocess `date`"
# using patched script to generate mindrecord
file_list_len=`expr ${#file_list[*]} - 1`
for index in $(seq 0 $file_list_len); do
echo "Begin preprocess input file: ${file_list[$index]}"
echo "Begin output file: ${output_filename[$index]}"
python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \
--input_file=${file_list[$index]} \
--output_file=output/${output_filename[$index]} \
--partition_number=1 \
--vocab_file=../../../third_party/to_mindrecord/zhwiki/vocab.txt \
--do_lower_case=True \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=10 >/tmp/${output_filename[$index]}.log 2>&1 & # user defined
process_count=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
echo "Total task: ${#file_list[*]}, processing: ${process_count}"
if [ $process_count -ge $avaiable_core_size ]; then
while [ 1 ]; do
process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
if [ $process_count -gt $process_num ]; then
process_count=$process_num
break;
fi
sleep 2
done
fi
done
process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
while [ 1 ]; do
if [ $process_num -eq 0 ]; then
break;
fi
echo "There are still ${process_num} preprocess running ..."
sleep 2
process_num=`ps -ef | grep create_pretraining_data_patched | grep -v grep | wc -l`
done
echo "Preprocess all the data success."
echo "End preprocess `date`"

View File

@ -1,47 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
rm -f output/simple.mindrecord*
if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then
echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist."
exit 1
fi
if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then
echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist."
exit 1
fi
# patch for create_pretraining_data.py
patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch
if [ $? -ne 0 ]; then
echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed"
exit 1
fi
# using patched script to generate mindrecord
python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \
--input_file=../../../third_party/to_mindrecord/zhwiki/sample_text.txt \
--output_file=output/simple.mindrecord \
--partition_number=4 \
--vocab_file=../../../third_party/to_mindrecord/zhwiki/vocab.txt \
--do_lower_case=True \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=10 # user defined

View File

@ -1,137 +0,0 @@
# ResNet-50 Example
## Description
This is an example of training ResNet-50 with CIFAR-10 dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset CIFAR-10
> Unzip the CIFAR-10 dataset to any path you want and the folder structure should include train and eval dataset as follows:
> ```
> .
> ├── cifar-10-batches-bin # train dataset
> └── cifar-10-verify-bin # infer dataset
> ```
## Example structure
```shell
.
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval.py # infer script
├── lr_generator.py # generate learning rate for each step
├── run_distribute_train.sh # launch distributed training(8 pcs)
├── run_infer.sh # launch infering
├── run_standalone_train.sh # launch standalone training(1 pcs)
└── train.py # train script
```
## Parameter configuration
Parameters for both training and inference can be set in config.py.
```
"class_num": 10, # dataset class num
"batch_size": 32, # batch size of input tensor
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum
"weight_decay": 1e-4, # weight decay
"epoch_size": 90, # only valid for taining, which is always 1 for inference
"buffer_size": 100, # number of queue size in data preprocessing
"image_height": 224, # image height
"image_width": 224, # image width
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_steps": 195, # the step interval between two checkpoints. By default, the last checkpoint will be saved after the last step
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint
"warmup_epochs": 5, # number of warmup epoch
"lr_decay_mode": "poly" # decay mode can be selected in steps, ploy and default
"lr_init": 0.01, # initial learning rate
"lr_end": 0.00001, # final learning rate
"lr_max": 0.1, # maximum learning rate
```
## Running the example
### Train
#### Usage
```
# distributed training
Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
# standalone training
Usage: sh run_standalone_train.sh [DATASET_PATH]
```
#### Launch
```
# distribute training example
sh run_distribute_train.sh rank_table.json ~/cifar-10-batches-bin
# standalone training example
sh run_standalone_train.sh ~/cifar-10-batches-bin
```
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
```
# distribute training result(8 pcs)
epoch: 1 step: 195, loss is 1.9601055
epoch: 2 step: 195, loss is 1.8555021
epoch: 3 step: 195, loss is 1.6707983
epoch: 4 step: 195, loss is 1.8162166
epoch: 5 step: 195, loss is 1.393667
```
### Infer
#### Usage
```
# infer
Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```
# infer example
sh run_infer.sh ~/cifar10-10-verify-bin ~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
```
> checkpoint can be produced in training process.
#### Result
Inference result will be stored in the example path, whose folder name is "infer". Under this, you can find result like the followings in log.
```
result: {'acc': 0.91446314102564111} ckpt=~/resnet50_cifar10/train_parallel0/resnet-90_195.ckpt
```
### Running on GPU
```
# distributed training example
mpirun -n 8 python train.py --dataset_path=~/cifar-10-batches-bin --device_target="GPU" --run_distribute=True
# standalone training example
python train.py --dataset_path=~/cifar-10-batches-bin --device_target="GPU"
# infer example
python eval.py --dataset_path=~/cifar10-10-verify-bin --device_target="GPU" --checkpoint_path=resnet-90_195.ckpt
```

View File

@ -1,81 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
from config import config
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
"""
create a train or eval dataset
Args:
dataset_path(string): the path of dataset.
do_train(bool): whether dataset is used for train or eval.
repeat_num(int): the repeat times of dataset. Default: 1
batch_size(int): the batch size of dataset. Default: 32
target(str): the device target. Default: Ascend
Returns:
dataset
"""
if target == "Ascend":
device_num = int(os.getenv("DEVICE_NUM"))
rank_id = int(os.getenv("RANK_ID"))
else:
init("nccl")
rank_id = get_rank()
device_num = get_group_size()
if device_num == 1:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
# define map operations
trans = []
if do_train:
trans += [
C.RandomCrop((32, 32), (4, 4, 4, 4)),
C.RandomHorizontalFlip(prob=0.5)
]
trans += [
C.Resize((config.image_height, config.image_width)),
C.Rescale(1.0 / 255.0, 0.0),
C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
C.HWC2CHW()
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)
ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds

View File

@ -1,72 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
eval.
"""
import os
import argparse
from dataset import create_dataset
from config import config
from mindspore import context
from mindspore.model_zoo.resnet import resnet50
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model, ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_group_size
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=bool, default=False, help='Do train or not.')
parser.add_argument('--do_eval', type=bool, default=True, help='Do eval or not.')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
args_opt = parser.parse_args()
if __name__ == '__main__':
target = args_opt.device_target
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
if not args_opt.do_eval and args_opt.run_distribute:
if target == "Ascend":
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(device_id=device_id)
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([140])
init()
elif target == "GPU":
init("nccl")
context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
epoch_size = config.epoch_size
net = resnet50(class_num=config.class_num)
loss = SoftmaxCrossEntropyWithLogits(sparse=True)
if args_opt.do_eval:
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
target=target)
step_size = dataset.get_dataset_size()
if args_opt.checkpoint_path:
param_dict = load_checkpoint(args_opt.checkpoint_path)
load_param_into_net(net, param_dict)
net.set_train(False)
model = Model(net, loss_fn=loss, metrics={'acc'})
res = model.eval(dataset)
print("result:", res, "ckpt=", args_opt.checkpoint_path)

View File

@ -1,77 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""learning rate generator"""
import numpy as np
def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
"""
generate learning rate array
Args:
global_step(int): total steps of the training
lr_init(float): init learning rate
lr_end(float): end learning rate
lr_max(float): max learning rate
warmup_epochs(int): number of warmup epochs
total_epochs(int): total epoch of training
steps_per_epoch(int): steps of one epoch
lr_decay_mode(string): learning rate decay mode, including steps, poly or default
Returns:
np.array, learning rate array
"""
lr_each_step = []
total_steps = steps_per_epoch * total_epochs
warmup_steps = steps_per_epoch * warmup_epochs
if lr_decay_mode == 'steps':
decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps]
for i in range(total_steps):
if i < decay_epoch_index[0]:
lr = lr_max
elif i < decay_epoch_index[1]:
lr = lr_max * 0.1
elif i < decay_epoch_index[2]:
lr = lr_max * 0.01
else:
lr = lr_max * 0.001
lr_each_step.append(lr)
elif lr_decay_mode == 'poly':
if warmup_steps != 0:
inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
else:
inc_each_step = 0
for i in range(total_steps):
if i < warmup_steps:
lr = float(lr_init) + inc_each_step * float(i)
else:
base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
lr = float(lr_max) * base * base
if lr < 0.0:
lr = 0.0
lr_each_step.append(lr)
else:
for i in range(total_steps):
if i < warmup_steps:
lr = lr_init + (lr_max - lr_init) * i / warmup_steps
else:
lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
lr_each_step.append(lr)
current_step = global_step
lr_each_step = np.array(lr_each_step).astype(np.float32)
learning_rate = lr_each_step[current_step:]
return learning_rate

View File

@ -1,64 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]
then
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
if [ ! -f "$PATH1" ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
exit 1
fi
if [ ! -d "$PATH2" ]
then
echo "error: DATASET_PATH=$PATH2 is not a directory"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
for((i=0; i<${DEVICE_NUM}; i++))
do
export DEVICE_ID=$i
export RANK_ID=$i
rm -rf ./train_parallel$i
mkdir ./train_parallel$i
cp *.py ./train_parallel$i
cp *.sh ./train_parallel$i
cd ./train_parallel$i || exit
echo "start training for rank $RANK_ID, device $DEVICE_ID"
env > env.log
python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
cd ..
done

View File

@ -1,64 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]
then
echo "Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$1 is not a directory"
exit 1
fi
if [ ! -f $PATH2 ]
then
echo "error: CHECKPOINT_PATH=$2 is not a file"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_SIZE=$DEVICE_NUM
export RANK_ID=0
if [ -d "infer" ];
then
rm -rf ./infer
fi
mkdir ./infer
cp *.py ./infer
cp *.sh ./infer
cd ./infer || exit
env > env.log
echo "start infering for device $DEVICE_ID"
python eval.py --do_eval=True --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
cd ..

View File

@ -1,55 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ]
then
echo "Usage: sh run_standalone_train.sh [DATASET_PATH]"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d "$PATH1" ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_ID=0
if [ -d "train" ];
then
rm -rf ./train
fi
mkdir ./train
cp *.py ./train
cp *.sh ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
python train.py --do_train=True --dataset_path=$PATH1 &> log &
cd ..

View File

@ -1,97 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_imagenet."""
import os
import argparse
import numpy as np
from dataset import create_dataset
from lr_generator import get_lr
from config import config
from mindspore import context
from mindspore import Tensor
from mindspore.model_zoo.resnet import resnet50
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init, get_rank, get_group_size
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
args_opt = parser.parse_args()
if __name__ == '__main__':
target = args_opt.device_target
ckpt_save_dir = config.save_checkpoint_path
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
np.random.seed(1)
if not args_opt.do_eval and args_opt.run_distribute:
if target == "Ascend":
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id,
enable_auto_mixed_precision=True)
init()
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
ckpt_save_dir = config.save_checkpoint_path
elif target == "GPU":
context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
init("nccl")
context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
epoch_size = config.epoch_size
net = resnet50(class_num=config.class_num)
if args_opt.do_train:
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
repeat_num=epoch_size, batch_size=config.batch_size, target=target)
step_size = dataset.get_dataset_size()
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size,
lr_decay_mode='poly'))
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
config.weight_decay, config.loss_scale)
if target == 'GPU':
loss = SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean')
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
else:
loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False)
time_cb = TimeMonitor(data_size=step_size)
loss_cb = LossMonitor()
cb = [time_cb, loss_cb]
if config.save_checkpoint:
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs*step_size,
keep_checkpoint_max=config.keep_checkpoint_max)
ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
cb += [ckpt_cb]
model.train(epoch_size, dataset, callbacks=cb)

View File

@ -1,150 +0,0 @@
# ResNet-50 Example
## Description
This is an example of training ResNet-50 with ImageNet2012 dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset ImageNet2012
> Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows:
> ```
> .
> ├── ilsvrc # train dataset
> └── ilsvrc_eval # infer dataset
> ```
## Example structure
```shell
.
├── crossentropy.py # CrossEntropy loss function
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── eval.py # infer script
├── lr_generator.py # generate learning rate for each step
├── run_distribute_train.sh # launch distributed training(8 pcs)
├── run_infer.sh # launch infering
├── run_standalone_train.sh # launch standalone training(1 pcs)
└── train.py # train script
```
## Parameter configuration
Parameters for both training and inference can be set in config.py.
```
"class_num": 1001, # dataset class number
"batch_size": 32, # batch size of input tensor
"loss_scale": 1024, # loss scale
"momentum": 0.9, # momentum optimizer
"weight_decay": 1e-4, # weight decay
"epoch_size": 90, # only valid for taining, which is always 1 for inference
"pretrained_epoch_size": 1, # epoch size that model has been trained before load pretrained checkpoint
"buffer_size": 1000, # number of queue size in data preprocessing
"image_height": 224, # image height
"image_width": 224, # image width
"save_checkpoint": True, # whether save checkpoint or not
"save_checkpoint_epochs": 1, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint
"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path
"warmup_epochs": 0, # number of warmup epoch
"lr_decay_mode": "cosine", # decay mode for generating learning rate
"label_smooth": True, # label smooth
"label_smooth_factor": 0.1, # label smooth factor
"lr_init": 0, # initial learning rate
"lr_max": 0.1, # maximum learning rate
```
## Running the example
### Train
#### Usage
```
# distributed training
Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Launch
```bash
# distributed training example(8 pcs)
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
# If you want to load pretrained ckpt file
sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./pretrained.ckpt
# standalone training example(1 pcs)
sh run_standalone_train.sh dataset/ilsvrc
# If you want to load pretrained ckpt file
sh run_standalone_train.sh dataset/ilsvrc ./pretrained.ckpt
```
> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
```
# distribute training result(8 pcs)
epoch: 1 step: 5004, loss is 4.8995576
epoch: 2 step: 5004, loss is 3.9235563
epoch: 3 step: 5004, loss is 3.833077
epoch: 4 step: 5004, loss is 3.2795618
epoch: 5 step: 5004, loss is 3.1978393
```
### Infer
#### Usage
```
# infer
Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```bash
# infer with checkpoint
sh run_infer.sh dataset/ilsvrc_eval train_parallel0/resnet-90_5004.ckpt
```
> checkpoint can be produced in training process.
#### Result
Inference result will be stored in the example path, whose folder name is "infer". Under this, you can find result like the followings in log.
```
result: {'acc': 0.7671054737516005} ckpt=train_parallel0/resnet-90_5004.ckpt
```
### Running on GPU
```
# distributed training example
mpirun -n 8 python train.py --dataset_path=dataset/ilsvrc/train --device_target="GPU" --run_distribute=True
# standalone training example
python train.py --dataset_path=dataset/ilsvrc/train --device_target="GPU"
# standalone training example with pretrained checkpoint
python train.py --dataset_path=dataset/ilsvrc/train --device_target="GPU" --pre_trained=pretrained.ckpt
# infer example
python eval.py --dataset_path=dataset/ilsvrc/val --device_target="GPU" --checkpoint_path=resnet-90_5004ss.ckpt
```

View File

@ -1,85 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from mindspore.communication.management import init, get_rank, get_group_size
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
"""
create a train or eval dataset
Args:
dataset_path(string): the path of dataset.
do_train(bool): whether dataset is used for train or eval.
repeat_num(int): the repeat times of dataset. Default: 1
batch_size(int): the batch size of dataset. Default: 32
target(str): the device target. Default: Ascend
Returns:
dataset
"""
if target == "Ascend":
device_num = int(os.getenv("DEVICE_NUM"))
rank_id = int(os.getenv("RANK_ID"))
else:
init("nccl")
rank_id = get_rank()
device_num = get_group_size()
if device_num == 1:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
# define map operations
if do_train:
trans = [
C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
C.RandomHorizontalFlip(prob=0.5),
C.Normalize(mean=mean, std=std),
C.HWC2CHW()
]
else:
trans = [
C.Decode(),
C.Resize((256, 256)),
C.CenterCrop(image_size),
C.Normalize(mean=mean, std=std),
C.HWC2CHW()
]
type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
# apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds

View File

@ -1,62 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
eval.
"""
import os
import argparse
from dataset import create_dataset
from config import config
from mindspore import context
from mindspore.model_zoo.resnet import resnet50
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from crossentropy import CrossEntropy
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=bool, default=False, help='Do train or not.')
parser.add_argument('--do_eval', type=bool, default=True, help='Do eval or not.')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
args_opt = parser.parse_args()
target = args_opt.device_target
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
if target == "Ascend":
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(device_id=device_id)
if __name__ == '__main__':
net = resnet50(class_num=config.class_num)
if not config.use_label_smooth:
config.label_smooth_factor = 0.0
loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
if args_opt.do_eval:
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
target=target)
step_size = dataset.get_dataset_size()
if args_opt.checkpoint_path:
param_dict = load_checkpoint(args_opt.checkpoint_path)
load_param_into_net(net, param_dict)
net.set_train(False)
model = Model(net, loss_fn=loss, metrics={'acc'})
res = model.eval(dataset)
print("result:", res, "ckpt=", args_opt.checkpoint_path)

View File

@ -1,122 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_imagenet."""
import os
import argparse
import numpy as np
from dataset import create_dataset
from lr_generator import get_lr
from config import config
from mindspore import context
from mindspore import Tensor
from mindspore.model_zoo.resnet import resnet50
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_rank, get_group_size
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
from crossentropy import CrossEntropy
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
args_opt = parser.parse_args()
if __name__ == '__main__':
target = args_opt.device_target
ckpt_save_dir = config.save_checkpoint_path
context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
np.random.seed(1)
if not args_opt.do_eval and args_opt.run_distribute:
if target == "Ascend":
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id,
enable_auto_mixed_precision=True)
init()
context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
ckpt_save_dir = config.save_checkpoint_path
elif target == "GPU":
context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
init("nccl")
context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
epoch_size = config.epoch_size
net = resnet50(class_num=config.class_num)
# weight init
if args_opt.pre_trained:
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(net, param_dict)
epoch_size = config.epoch_size - config.pretrained_epoch_size
else:
for _, cell in net.cells_and_names():
if isinstance(cell, nn.Conv2d):
cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
cell.weight.default_input.shape,
cell.weight.default_input.dtype).to_tensor()
if isinstance(cell, nn.Dense):
cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
cell.weight.default_input.shape,
cell.weight.default_input.dtype).to_tensor()
if not config.use_label_smooth:
config.label_smooth_factor = 0.0
loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
if args_opt.do_train:
dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
repeat_num=epoch_size, batch_size=config.batch_size, target=target)
step_size = dataset.get_dataset_size()
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
lr = get_lr(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs,
total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode='cosine')
if args_opt.pre_trained:
lr = lr[config.pretrained_epoch_size * step_size:]
lr = Tensor(lr)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
config.weight_decay, config.loss_scale)
if target == "Ascend":
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False)
elif target == "GPU":
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})
time_cb = TimeMonitor(data_size=step_size)
loss_cb = LossMonitor()
cb = [time_cb, loss_cb]
if config.save_checkpoint:
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs*step_size,
keep_checkpoint_max=config.keep_checkpoint_max)
ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
cb += [ckpt_cb]
model.train(epoch_size, dataset, callbacks=cb)

@ -1 +1 @@
Subproject commit dda72a48c7e0033389bd377c5804d485fdf3112d
Subproject commit 1c2672868fda8b1d012c99e5aca73725ac869ba9

View File

@ -593,6 +593,17 @@ def check_bool(input_param):
raise TypeError("Input type must be bool!")
def check_string(input_param, valid_values):
"""String type judgment."""
if isinstance(input_param, str) and input_param in valid_values:
return input_param
if len(valid_values) == 1:
raise ValueError(f'Input should be str and must be {valid_values[0]},'
f' but got {input_param}.')
raise ValueError(f'Input should be str and must be one of {valid_values},'
f' but got {input_param}.')
def check_input_format(input_param):
"""Judge input format."""
if input_param == "NCHW":

View File

@ -164,7 +164,11 @@ if (ENABLE_D)
endif ()
set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common)
set(ASCEND_DRIVER_BACK_PATH ${ASCEND_PATH}/driver/lib64/driver)
set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64)
if (DEFINED ENV{ASCEND_CUSTOM_FWK_PATH})
set(ASCEND_RUNTIME_PATH $ENV{ASCEND_CUSTOM_FWK_PATH}/fwkacllib/lib64)
else ()
set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64)
endif ()
endif()
MESSAGE("USE DAV LIB PATH: ${ASCEND_PATH}")
@ -247,7 +251,7 @@ add_library(inference SHARED
${CMAKE_CURRENT_SOURCE_DIR}/session/session.cc
${LOAD_ONNX_SRC}
)
target_link_libraries(inference PRIVATE ${PYTHON_LIBRARY} ${SECUREC_LIBRARY}
target_link_libraries(inference PRIVATE ${PYTHON_LIBRARIES} ${SECUREC_LIBRARY}
-Wl,--whole-archive mindspore -Wl,--no-whole-archive mindspore_gvar mindspore::protobuf)
if (ENABLE_CPU)

View File

@ -41,6 +41,7 @@ Status ConfigManager::FromJson(const nlohmann::json &j) {
set_worker_connector_size(j.value("workerConnectorSize", worker_connector_size_));
set_op_connector_size(j.value("opConnectorSize", op_connector_size_));
set_seed(j.value("seed", seed_));
set_monitor_sampling_interval(j.value("monitorSamplingInterval", monitor_sampling_interval_));
return Status::OK();
}

View File

@ -91,11 +91,14 @@ void Sampler::Print(std::ostream &out, bool show_all) const {
Status Sampler::GetAllIdsThenReset(py::array *data) {
std::unique_ptr<DataBuffer> db;
std::shared_ptr<Tensor> sample_ids;
TensorRow sample_row;
// A call to derived class to get sample ids wrapped inside a buffer
RETURN_IF_NOT_OK(GetNextSample(&db));
// Get the only tensor inside the buffer that contains the actual SampleIds for the entire epoch
RETURN_IF_NOT_OK(db->GetTensor(&sample_ids, 0, 0));
RETURN_IF_NOT_OK(db->GetRow(0, &sample_row));
sample_ids = sample_row[0];
// check this buffer is not a ctrl buffer
CHECK_FAIL_RETURN_UNEXPECTED(db->buffer_flags() == DataBuffer::kDeBFlagNone, "ERROR ctrl buffer received");
{

View File

@ -149,14 +149,37 @@ Status Graph::GetAllNeighbors(const std::vector<NodeIdType> &node_list, NodeType
return Status::OK();
}
Status Graph::CheckSamplesNum(NodeIdType samples_num) {
NodeIdType all_nodes_number =
std::accumulate(node_type_map_.begin(), node_type_map_.end(), 0,
[](NodeIdType t1, const auto &t2) -> NodeIdType { return t1 + t2.second.size(); });
if ((samples_num < 1) || (samples_num > all_nodes_number)) {
std::string err_msg = "Wrong samples number, should be between 1 and " + std::to_string(all_nodes_number) +
", got " + std::to_string(samples_num);
RETURN_STATUS_UNEXPECTED(err_msg);
}
return Status::OK();
}
Status Graph::GetSampledNeighbors(const std::vector<NodeIdType> &node_list,
const std::vector<NodeIdType> &neighbor_nums,
const std::vector<NodeType> &neighbor_types, std::shared_ptr<Tensor> *out) {
CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty.");
CHECK_FAIL_RETURN_UNEXPECTED(neighbor_nums.size() == neighbor_types.size(),
"The sizes of neighbor_nums and neighbor_types are inconsistent.");
for (const auto &num : neighbor_nums) {
RETURN_IF_NOT_OK(CheckSamplesNum(num));
}
for (const auto &type : neighbor_types) {
if (node_type_map_.find(type) == node_type_map_.end()) {
std::string err_msg = "Invalid neighbor type:" + std::to_string(type);
RETURN_STATUS_UNEXPECTED(err_msg);
}
}
std::vector<std::vector<NodeIdType>> neighbors_vec(node_list.size());
for (size_t node_idx = 0; node_idx < node_list.size(); ++node_idx) {
std::shared_ptr<Node> input_node;
RETURN_IF_NOT_OK(GetNodeByNodeId(node_list[node_idx], &input_node));
neighbors_vec[node_idx].emplace_back(node_list[node_idx]);
std::vector<NodeIdType> input_list = {node_list[node_idx]};
for (size_t i = 0; i < neighbor_nums.size(); ++i) {
@ -204,6 +227,12 @@ Status Graph::NegativeSample(const std::vector<NodeIdType> &data, const std::uno
Status Graph::GetNegSampledNeighbors(const std::vector<NodeIdType> &node_list, NodeIdType samples_num,
NodeType neg_neighbor_type, std::shared_ptr<Tensor> *out) {
CHECK_FAIL_RETURN_UNEXPECTED(!node_list.empty(), "Input node_list is empty.");
RETURN_IF_NOT_OK(CheckSamplesNum(samples_num));
if (node_type_map_.find(neg_neighbor_type) == node_type_map_.end()) {
std::string err_msg = "Invalid neighbor type:" + std::to_string(neg_neighbor_type);
RETURN_STATUS_UNEXPECTED(err_msg);
}
std::vector<std::vector<NodeIdType>> neighbors_vec;
neighbors_vec.resize(node_list.size());
for (size_t node_idx = 0; node_idx < node_list.size(); ++node_idx) {
@ -266,7 +295,7 @@ Status Graph::GetNodeFeature(const std::shared_ptr<Tensor> &nodes, const std::ve
if (!nodes || nodes->Size() == 0) {
RETURN_STATUS_UNEXPECTED("Input nodes is empty");
}
CHECK_FAIL_RETURN_UNEXPECTED(!feature_types.empty(), "Inpude feature_types is empty");
CHECK_FAIL_RETURN_UNEXPECTED(!feature_types.empty(), "Input feature_types is empty");
TensorRow tensors;
for (const auto &f_type : feature_types) {
std::shared_ptr<Feature> default_feature;

View File

@ -226,6 +226,8 @@ class Graph {
Status NegativeSample(const std::vector<NodeIdType> &input_data, const std::unordered_set<NodeIdType> &exclude_data,
int32_t samples_num, std::vector<NodeIdType> *out_samples);
Status CheckSamplesNum(NodeIdType samples_num);
std::string dataset_file_;
int32_t num_workers_; // The number of worker threads
std::mt19937 rnd_;

View File

@ -400,7 +400,7 @@ std::string AnfExporter::GetValueText(const FuncGraphPtr &func_graph, const Valu
} else if (value->isa<tensor::Tensor>()) {
auto tensor_ptr = dyn_cast<tensor::Tensor>(value);
oss << value->DumpText() << "@" << DumpObject(tensor_ptr->data(), "T");
} else if (value->isa<parse::Symbol>() || value->isa<None>() || value->isa<NullObj>()) {
} else if (value->isa<parse::Symbol>() || value->isa<None>() || value->isa<Null>()) {
oss << value->DumpText();
} else if (value->isa<ValueSequeue>()) {
oss << GetSequenceText(func_graph, value);

View File

@ -327,19 +327,16 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
vector<std::shared_ptr<TaskInfo>> task_info_list;
auto anf_node_list = graph->execution_order();
TaskGenerator::GenTasks(anf_node_list, &task_info_list, graph->graph_id());
// Store the task_info_list
auto insert_ret = task_map_.insert(std::make_pair(graph->graph_id(), task_info_list));
if (!insert_ret.second) {
MS_LOG(EXCEPTION) << "Duplicate GraphId! Please check in ascend_session.";
}
// Graph may have no compute node, such TensorAddGrad.
if (task_info_list.empty()) {
MS_LOG(WARNING) << "graph " << graph->graph_id() << " have no compute node";
return true;
}
AscendStreamAssign &assign_instance = AscendStreamAssign::GetInstance();
AscendStreamMng &stream_manager = AscendStreamMng::GetInstance();
AscendLabelAssign &label_assign_instance = AscendLabelAssign::GetInstance();
@ -348,19 +345,16 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
assign_instance.GetWaitStreams(&wait_active_stream_list);
std::vector<uint32_t> force_copy_stream_list;
assign_instance.GetHcomStreams(&force_copy_stream_list);
MS_LOG(INFO) << "call DavinciModel total stream num:" << stream_manager.GetCurAllocStreamNum()
<< ", total event num:" << assign_instance.total_event_num()
<< ", total label num:" << label_assign_instance.GetLabelNum(NOT_NULL(graph))
<< ", wait_active_stream_list size:" << wait_active_stream_list.size()
<< ", force_copy_stream_list size:" << force_copy_stream_list.size();
std::vector<std::shared_ptr<ge::model_runner::OpInfo>> empty_list;
std::shared_ptr<ge::model_runner::DavinciModel> model = std::make_shared<ge::model_runner::DavinciModel>(
task_info_list, empty_list, empty_list, empty_list, empty_list, wait_active_stream_list, force_copy_stream_list, 0,
0, 0, 0, 0, 0, stream_manager.GetCurAllocStreamNum(), label_assign_instance.GetLabelNum(NOT_NULL(graph)),
assign_instance.total_event_num(), 0);
auto ret = graph_model_map_.insert(std::make_pair(graph->graph_id(), model));
if (!ret.second) {
MS_LOG(EXCEPTION) << "Duplicate GraphId! Please check in ascend_session.";

View File

@ -562,10 +562,17 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kern
MS_LOG(WARNING) << "kernel [" << (kernel_info_list.size() + index)
<< "] :" << aicpu_kernel_info_list[index]->ToString();
}
MS_LOG(WARNING) << " <<<";
MS_EXCEPTION(TypeError) << "The node [" << kernel_node->DebugString()
<< "] cannot find valid kernel info, not supported the type:" << buffer.str()
<< ", please refer to the supported dtypes in candidates kernel info list";
if (IsPrimitiveCNode(kernel_node, prim::kPrimLabelSwitch)) {
auto selected_kernel_info = ChooseMatchedKernelInfo(kernel_node, kernel_info_list);
AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_info, kernel_node.get());
// Set format and data type for input tensor.
SetTensorDeviceInfo(*selected_kernel_info, kernel_node);
} else {
MS_LOG(WARNING) << " <<<";
MS_EXCEPTION(TypeError) << "The node [" << kernel_node->DebugString()
<< "] cannot find valid kernel info, not supported the type:" << buffer.str()
<< ", please refer to the supported dtypes in candidates kernel info list";
}
}
return select_status;
}

View File

@ -147,20 +147,18 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &k
auto &input_node = kernel_with_index.first;
auto index = kernel_with_index.second;
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<CNode>() && AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) {
auto cnode = input_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
VectorRef ret;
for (size_t i = 1; i < cnode->inputs().size(); i++) {
auto item_with_index = AnfAlgo::VisitKernelWithReturnType(cnode->input(i), 0);
auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs);
ret.push_back(out);
}
return ret;
}
if (input_node->isa<CNode>()) {
auto node = input_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(node);
if (AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) {
VectorRef ret;
for (size_t i = 1; i < node->inputs().size(); i++) {
auto item_with_index = AnfAlgo::VisitKernelWithReturnType(node->input(i), 0);
auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs);
ret.push_back(out);
}
return ret;
}
size_t output_size = AnfAlgo::GetOutputTensorNum(node);
if (index >= output_size) {
MS_LOG(EXCEPTION) << "Invalid input index " << index;

View File

@ -275,6 +275,11 @@ extern const TypePtr kTypeExternal;
extern const TypePtr kTypeEnv;
extern const TypePtr kTypeType;
extern const TypePtr kString;
extern const TypePtr kList;
extern const TypePtr kTuple;
extern const TypePtr kDict;
extern const TypePtr kSlice;
extern const TypePtr kKeyword;
extern const TypePtr kTensorType;
} // namespace mindspore

View File

@ -18,5 +18,7 @@
namespace mindspore {
const TypePtr kTypeNone = std::make_shared<TypeNone>();
const TypePtr kTypeNull = std::make_shared<TypeNull>();
const TypePtr kTypeEllipsis = std::make_shared<TypeEllipsis>();
const TypePtr kAnyType = std::make_shared<TypeAnything>();
} // namespace mindspore

View File

@ -71,20 +71,22 @@ class TypeNull : public Type {
};
using TypeNullPtr = std::shared_ptr<TypeNull>;
class Ellipsis : public Type {
class TypeEllipsis : public Type {
public:
Ellipsis() : Type(kMetaTypeEllipsis) {}
~Ellipsis() override {}
MS_DECLARE_PARENT(Ellipsis, Type)
TypeEllipsis() : Type(kMetaTypeEllipsis) {}
~TypeEllipsis() override {}
MS_DECLARE_PARENT(TypeEllipsis, Type)
TypeId generic_type_id() const override { return kMetaTypeEllipsis; }
TypePtr DeepCopy() const override { return std::make_shared<Ellipsis>(); }
TypePtr DeepCopy() const override { return std::make_shared<TypeEllipsis>(); }
std::string ToReprString() const override { return "Ellipsis"; }
std::string DumpText() const override { return "Ellipsis"; }
};
using EllipsisPtr = std::shared_ptr<Ellipsis>;
using TypeEllipsisPtr = std::shared_ptr<TypeEllipsis>;
extern const TypePtr kTypeNone;
extern const TypePtr kTypeNull;
extern const TypePtr kTypeEllipsis;
extern const TypePtr kAnyType;
} // namespace mindspore

View File

@ -95,12 +95,30 @@ TypePtr TypeIdToType(TypeId id) {
return kAnyType;
case kMetaTypeNone:
return kTypeNone;
case kMetaTypeNull:
return kTypeNull;
case kMetaTypeEllipsis:
return kTypeEllipsis;
case kObjectTypeEnvType:
return kTypeEnv;
case kObjectTypeRefKey:
return kRefKeyType;
case kObjectTypeRef:
return kRefType;
case kMetaTypeTypeType:
return kTypeType;
case kObjectTypeString:
return kString;
case kObjectTypeList:
return kList;
case kObjectTypeTuple:
return kTuple;
case kObjectTypeDictionary:
return kDict;
case kObjectTypeSlice:
return kSlice;
case kObjectTypeKeyword:
return kKeyword;
case kTypeUnknown:
return kTypeNone;
default:
@ -274,7 +292,7 @@ TypePtr StringToType(const std::string &type_name) {
if (type_name.compare("None") == 0) {
type = std::make_shared<TypeNone>();
} else if (type_name.compare("Ellipsis") == 0) {
type = std::make_shared<Ellipsis>();
type = std::make_shared<TypeEllipsis>();
} else if (type_name.compare("TypeType") == 0) {
type = std::make_shared<TypeType>();
} else if (type_name.compare("SymbolicKeyType") == 0) {
@ -476,7 +494,7 @@ REGISTER_PYBIND_DEFINE(
(void)py::class_<RefType, Type, std::shared_ptr<RefType>>(m_sub, "RefType").def(py::init());
(void)py::class_<TypeAnything, Type, std::shared_ptr<TypeAnything>>(m_sub, "TypeAnything").def(py::init());
(void)py::class_<Slice, Type, std::shared_ptr<Slice>>(m_sub, "Slice").def(py::init());
(void)py::class_<Ellipsis, Type, std::shared_ptr<Ellipsis>>(m_sub, "Ellipsis").def(py::init());
(void)py::class_<TypeEllipsis, Type, std::shared_ptr<TypeEllipsis>>(m_sub, "TypeEllipsis").def(py::init());
}));
const TypePtr kTypeExternal = std::make_shared<External>();
@ -484,4 +502,9 @@ const TypePtr kTypeEnv = std::make_shared<EnvType>();
const TypePtr kTypeType = std::make_shared<TypeType>();
const TypePtr kTensorType = std::make_shared<TensorType>();
const TypePtr kString = std::make_shared<String>();
const TypePtr kList = std::make_shared<List>();
const TypePtr kTuple = std::make_shared<Tuple>();
const TypePtr kDict = std::make_shared<Dictionary>();
const TypePtr kSlice = std::make_shared<Slice>();
const TypePtr kKeyword = std::make_shared<Keyword>();
} // namespace mindspore

View File

@ -432,7 +432,7 @@ AnfNodePtr FuncGraph::GetDefaultValueByName(const std::string &name) {
if (default_value == nullptr) {
MS_LOG(EXCEPTION) << "Graph parameter " << name << " not exist";
}
if (IsValueNode<NullObj>(default_value)) {
if (IsValueNode<Null>(default_value)) {
return nullptr;
}
return default_value;
@ -440,8 +440,8 @@ AnfNodePtr FuncGraph::GetDefaultValueByName(const std::string &name) {
// set the default values
void FuncGraph::SetDefaultValues(const std::vector<std::string> &name_list, const std::vector<AnfNodePtr> &value_list) {
auto all_is_null = std::all_of(value_list.begin(), value_list.end(),
[](const AnfNodePtr &node) { return IsValueNode<NullObj>(node); });
auto all_is_null =
std::all_of(value_list.begin(), value_list.end(), [](const AnfNodePtr &node) { return IsValueNode<Null>(node); });
if (value_list.empty()) {
all_is_null = true;
}
@ -457,7 +457,7 @@ void FuncGraph::ClearDefaultValues() { parameter_default_value_.clear(); }
size_t FuncGraph::GetDefaultValueCount() {
int null_count =
std::count_if(parameter_default_value_.begin(), parameter_default_value_.end(),
[](const std::pair<std::string, AnfNodePtr> &pair) { return IsValueNode<NullObj>(pair.second); });
[](const std::pair<std::string, AnfNodePtr> &pair) { return IsValueNode<Null>(pair.second); });
return parameter_default_value_.size() - IntToSize(null_count);
}

View File

@ -30,9 +30,9 @@ bool Named::operator==(const Value &other) const {
abstract::AbstractBasePtr None::ToAbstract() { return std::make_shared<abstract::AbstractNone>(); }
const NamedPtr kNone = std::make_shared<None>();
abstract::AbstractBasePtr NullObj::ToAbstract() { return std::make_shared<abstract::AbstractNull>(); }
const NamedPtr kNull = std::make_shared<NullObj>();
abstract::AbstractBasePtr Null::ToAbstract() { return std::make_shared<abstract::AbstractNull>(); }
const NamedPtr kNull = std::make_shared<Null>();
abstract::AbstractBasePtr EllipsisObj::ToAbstract() { return std::make_shared<abstract::AbstractEllipsis>(); }
const NamedPtr kEllipsis = std::make_shared<EllipsisObj>();
abstract::AbstractBasePtr Ellipsis::ToAbstract() { return std::make_shared<abstract::AbstractEllipsis>(); }
const NamedPtr kEllipsis = std::make_shared<Ellipsis>();
} // namespace mindspore

View File

@ -71,20 +71,20 @@ class None : public Named {
};
extern const NamedPtr kNone;
class NullObj : public Named {
class Null : public Named {
public:
NullObj() : Named("Null") {}
~NullObj() override = default;
MS_DECLARE_PARENT(NullObj, Named);
Null() : Named("Null") {}
~Null() override = default;
MS_DECLARE_PARENT(Null, Named);
abstract::AbstractBasePtr ToAbstract() override;
};
extern const NamedPtr kNull;
class EllipsisObj : public Named {
class Ellipsis : public Named {
public:
EllipsisObj() : Named("Ellipsis") {}
~EllipsisObj() override = default;
MS_DECLARE_PARENT(EllipsisObj, Named);
Ellipsis() : Named("Ellipsis") {}
~Ellipsis() override = default;
MS_DECLARE_PARENT(Ellipsis, Named);
abstract::AbstractBasePtr ToAbstract() override;
};
extern const NamedPtr kEllipsis;

View File

@ -68,10 +68,10 @@ void SetAkgAttrsForFive2Four(const AnfNodePtr &anf_node) {
void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
MS_EXCEPTION_IF_NULL(anf_node);
// The x and output are akg op input and output param.
std::vector<std::string> input_names = {"x"};
std::vector<std::string> input_names = {"x", "dst_type"};
std::vector<std::string> output_names = {"output"};
AnfAlgo::SetNodeAttr("input_names", MakeValue(input_names), anf_node);
AnfAlgo::SetNodeAttr("output_names", MakeValue(output_names), anf_node);
AnfAlgo::SetNodeAttr(kAttrInputNames, MakeValue(input_names), anf_node);
AnfAlgo::SetNodeAttr(kAttrOutputNames, MakeValue(output_names), anf_node);
std::string dst_type;
TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);

View File

@ -276,11 +276,11 @@ bool AkgKernelBuild::CreateInputDescJson(const AnfNodePtr &anf_node, nlohmann::j
input_desc_json[kName] = op_input_name;
input_desc_json[kTensorName] = "input_" + std::to_string(GetInputTensorIdxInc(anf_node, real_input_index));
auto input_shape = AnfAlgo::GetInputDeviceShape(anf_node, real_input_index);
if (GetInputTensorValue(anf_node, real_input_index, &input_desc_json)) {
if (anf_node->func_graph() != nullptr && anf_node->func_graph()->has_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL) &&
GetInputTensorValue(anf_node, real_input_index, &input_desc_json)) {
MS_LOG(WARNING) << "we take input[" << real_input_index << "] of [" << anf_node->DebugString(2)
<< "] as const tensor, shape: [" << Vector2Str(input_shape)
<< "], value: " << input_desc_json[kValue];
input_shape.clear();
}
if (input_shape.empty()) {

View File

@ -20,6 +20,7 @@
#include <iostream>
#include <utility>
#include <fstream>
#include <thread>
#include "nlohmann/json.hpp"
#include "session/anf_runtime_algorithm.h"
#include "common/utils.h"
@ -576,6 +577,52 @@ void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGr
unique_grad->indices_size_ = unique_indices_size;
}
struct WorkerParamsForReduceSparseGradient {
size_t slice_start_{0};
size_t slice_end_{0};
size_t max_length_{0};
size_t outer_dim_{0};
std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
std::vector<size_t> *slice_positions_{nullptr};
float *src_value_{nullptr};
SparseGradient *unique_grad_{nullptr};
};
void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
MS_EXCEPTION_IF_NULL(param.sorted_indices_);
MS_EXCEPTION_IF_NULL(param.slice_positions_);
MS_EXCEPTION_IF_NULL(param.src_value_);
MS_EXCEPTION_IF_NULL(param.unique_grad_);
auto outer_dim = param.outer_dim_;
auto &sorted_indices = *(param.sorted_indices_);
auto &slice_positions = *(param.slice_positions_);
auto unique_grad = param.unique_grad_;
for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
size_t cur_pos = slice_positions[slice_id];
int index = sorted_indices[cur_pos].first;
unique_grad->indices_[slice_id] = index;
size_t start_index = slice_id * outer_dim;
auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
cur_pos++;
size_t end_pos;
if (slice_id + 1 < slice_positions.size()) {
end_pos = slice_positions[slice_id + 1];
} else {
end_pos = sorted_indices.size();
}
while (cur_pos < end_pos) {
for (size_t i = 0; i < outer_dim; ++i) {
unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
}
cur_pos++;
}
}
}
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim) {
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
@ -583,47 +630,50 @@ void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradie
MS_EXCEPTION_IF_NULL(unique_grad);
MS_EXCEPTION_IF_NULL(unique_grad->value_);
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
size_t unique_indices_size = 0;
std::vector<std::pair<int, size_t>> sorted_indices;
sorted_indices.reserve(origin_sparse_grad.indices_size_);
for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
int index = origin_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= first_dim) {
continue;
if (index >= 0 && IntToSize(index) < first_dim) {
sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
}
sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
}
std::sort(
sorted_indices.begin(), sorted_indices.end(),
[](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
int last_index = 0;
size_t indices_size = sorted_indices.size();
size_t start_index = 0;
size_t end_index = outer_dim;
size_t dst_len = indices_size * outer_dim;
for (size_t i = 0; i < indices_size; ++i) {
int index = sorted_indices[i].first;
if (i == 0 || last_index != index) {
if (i > 0 && last_index != index) {
unique_indices_size++;
start_index += outer_dim;
end_index += outer_dim;
}
unique_grad->indices_[unique_indices_size] = index;
auto ret_code = memcpy_s(unique_grad->value_ + start_index, dst_len - start_index,
origin_sparse_grad.value_ + sorted_indices[i].second, outer_dim);
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
} else {
for (size_t j = start_index, k = sorted_indices[i].second; j < end_index; ++j, ++k) {
unique_grad->value_[j] += origin_sparse_grad.value_[k];
}
std::vector<size_t> slice_positions;
for (size_t i = 0; i < sorted_indices.size(); ++i) {
if (i == 0 || last_index != sorted_indices[i].first) {
slice_positions.emplace_back(i);
}
last_index = index;
last_index = sorted_indices[i].first;
}
unique_grad->indices_size_ = unique_indices_size + 1;
size_t thread_num = 8;
if (slice_positions.size() < thread_num) {
thread_num = slice_positions.size();
}
size_t stride = (slice_positions.size() + thread_num - 1) / thread_num;
thread_num = (slice_positions.size() + stride - 1) / stride;
std::vector<std::thread> threads;
size_t max_length = sorted_indices.size() * outer_dim;
for (size_t i = 0; i < thread_num; ++i) {
size_t slice_start = i * stride;
size_t slice_end = 0;
if (i == thread_num - 1) {
slice_end = slice_positions.size();
} else {
slice_end = slice_start + stride;
}
WorkerParamsForReduceSparseGradient params{
slice_start, slice_end, max_length, outer_dim, &sorted_indices, &slice_positions, origin_sparse_grad.value_,
unique_grad};
threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
unique_grad->indices_size_ = slice_positions.size();
}
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index) {
@ -876,5 +926,21 @@ bool IsWeightBoundary(const AnfNodePtr &node) {
}
return false;
}
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params, size_t thread_num,
size_t total_compute_size) {
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
size_t once_compute_size = (total_compute_size + thread_num - 1) / thread_num;
while (start < total_compute_size) {
size_t end = (start + once_compute_size) > total_compute_size ? total_compute_size : (start + once_compute_size);
threads.emplace_back(std::thread(func, params, start, end));
start += once_compute_size;
}
for (size_t i = 0; i < threads.size(); ++i) {
threads[i].join();
}
}
} // namespace kernel
} // namespace mindspore

View File

@ -25,6 +25,7 @@
#include <string>
#include <vector>
#include <utility>
#include <thread>
#include <nlohmann/json.hpp>
#include "kernel/kernel.h"
#include "kernel/oplib/opinfo.h"
@ -78,6 +79,27 @@ struct SparseGradient {
size_t indices_size_;
};
struct MultiThreadComputeParams {
float *var_;
float *accum_;
float *linear_;
float *m_;
float *m_t_;
float *v_;
float lr_;
float l1_;
float l2_;
float lr_power_;
float beta1_;
float beta2_;
float epsilon_;
SparseGradient sparse_grad_;
size_t var_first_dim_size_;
size_t var_outer_dim_size_;
bool use_nesterov_;
};
using MultiThreadComputeFunc = std::function<void(MultiThreadComputeParams *param, size_t start, size_t end)>;
bool CheckCache(const std::string &kernel_name);
KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &processor);
KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &processor);
@ -107,6 +129,8 @@ void GetValidKernelNodes(const FuncGraphPtr &func_graph, std::vector<AnfNodePtr>
bool GetInputTensorValue(const AnfNodePtr &anf_node, size_t input_idx, nlohmann::json *const node_json);
void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<AnfNodePtr, size_t>> *node_list);
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params, size_t thread_num,
size_t total_compute_size);
} // namespace kernel
} // namespace mindspore

View File

@ -14,12 +14,66 @@
* limitations under the License.
*/
#include "kernel/cpu/sparse_apply_adam_cpu_kernel.h"
#include "kernel/common_utils.h"
#include "device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSparseApplyAdamInputSize = 11;
void ComputeAdam(MultiThreadComputeParams *input_params, size_t start, size_t end) {
MS_EXCEPTION_IF_NULL(input_params);
auto m = input_params->m_;
auto m_t = input_params->m_t_;
auto v = input_params->v_;
auto beta1 = input_params->beta1_;
auto beta2 = input_params->beta2_;
auto use_nesterov = input_params->use_nesterov_;
auto unique_sparse_grad = input_params->sparse_grad_;
auto var_first_dim_size = input_params->var_first_dim_size_;
auto var_outer_dim_size = input_params->var_outer_dim_size_;
for (size_t i = start; i < end; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process";
}
size_t start_index = var_outer_dim_size * index;
size_t end_index = start_index + var_outer_dim_size;
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
m[j] += (1 - beta1) * summed_grad;
v[j] += (1 - beta2) * summed_grad * summed_grad;
if (use_nesterov) {
m_t[j] = m[j] * beta1 + (1 - beta1) * summed_grad;
}
}
}
}
void ComputeMomentum(MultiThreadComputeParams *input_params, size_t start, size_t end) {
MS_EXCEPTION_IF_NULL(input_params);
auto m = input_params->m_;
auto v = input_params->v_;
auto beta1 = input_params->beta1_;
auto beta2 = input_params->beta2_;
for (size_t i = start; i < end; ++i) {
m[i] *= beta1;
v[i] *= beta2;
}
}
void ComputeWeight(MultiThreadComputeParams *input_params, size_t start, size_t end) {
MS_EXCEPTION_IF_NULL(input_params);
auto var = input_params->var_;
auto m = input_params->m_;
auto v = input_params->v_;
auto lr = input_params->lr_;
auto epsilon = input_params->epsilon_;
for (size_t i = start; i < end; ++i) {
var[i] -= lr * m[i] / (std::sqrt(v[i]) + epsilon);
}
}
} // namespace
void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
@ -64,29 +118,6 @@ void SparseApplyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) {
}
}
void SparseApplyAdamCPUKernel::UpdateSparseMomentum(const SparseGradient &unique_sparse_grad, float *m, float *m_t,
float *v, float beta1, float beta2) const {
MS_EXCEPTION_IF_NULL(m);
MS_EXCEPTION_IF_NULL(m_t);
MS_EXCEPTION_IF_NULL(v);
for (size_t i = 0; i < unique_sparse_grad.indices_size_; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size_) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process";
}
size_t start_index = var_outer_dim_size_ * index;
size_t end_index = start_index + var_outer_dim_size_;
for (size_t j = start_index, k = var_outer_dim_size_ * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
m[j] += (1 - beta1) * summed_grad;
v[j] += (1 - beta2) * summed_grad * summed_grad;
if (use_nesterov_) {
m_t[j] = m[j] * beta1 + (1 - beta1) * summed_grad;
}
}
}
}
bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> & /*outputs*/) {
@ -115,21 +146,31 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
// Update momentum
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
for (size_t i = 0; i < total_dim_size; ++i) {
m[i] *= beta1;
v[i] *= beta2;
}
MultiThreadComputeParams input_params;
input_params.m_ = m;
input_params.v_ = v;
input_params.beta1_ = beta1;
input_params.beta2_ = beta2;
const size_t kThreadNum = 16;
MultiThreadCompute(ComputeMomentum, &input_params, kThreadNum, total_dim_size);
std::vector<float> m_t(m, m + total_dim_size);
UpdateSparseMomentum(unique_sparse_grad, m, m_t.data(), v, beta1, beta2);
// Update weight
input_params.m_t_ = m_t.data();
input_params.use_nesterov_ = use_nesterov_;
input_params.sparse_grad_ = unique_sparse_grad;
input_params.var_first_dim_size_ = var_first_dim_size_;
input_params.var_outer_dim_size_ = var_outer_dim_size_;
MultiThreadCompute(ComputeAdam, &input_params, kThreadNum, unique_sparse_grad.indices_size_);
if (use_nesterov_) {
m = m_t.data();
}
for (size_t i = 0; i < total_dim_size; ++i) {
var[i] -= lr * m[i] / (std::sqrt(v[i]) + epsilon);
input_params.m_ = input_params.m_t_;
}
input_params.var_ = var;
input_params.lr_ = lr;
input_params.epsilon_ = epsilon;
MultiThreadCompute(ComputeWeight, &input_params, kThreadNum, total_dim_size);
return true;
}
} // namespace kernel

View File

@ -20,7 +20,6 @@
#include <memory>
#include "kernel/cpu/cpu_kernel.h"
#include "kernel/cpu/cpu_kernel_factory.h"
#include "kernel/common_utils.h"
namespace mindspore {
namespace kernel {
@ -35,8 +34,6 @@ class SparseApplyAdamCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;
private:
void UpdateSparseMomentum(const SparseGradient &unique_sparse_grad, float *m, float *m_t, float *v, float beta1,
float beta2) const;
size_t indices_size_{0};
size_t var_first_dim_size_{0};
size_t var_outer_dim_size_{1};

View File

@ -21,6 +21,47 @@ namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSparseApplyFtrlInputSize = 5;
void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t end) {
MS_EXCEPTION_IF_NULL(input_params);
auto var = input_params->var_;
auto accum = input_params->accum_;
auto linear = input_params->linear_;
auto lr = input_params->lr_;
auto l1 = input_params->l1_;
auto l2 = input_params->l2_;
auto lr_power = input_params->lr_power_;
auto unique_sparse_grad = input_params->sparse_grad_;
auto var_first_dim_size = input_params->var_first_dim_size_;
auto var_outer_dim_size = input_params->var_outer_dim_size_;
for (size_t i = start; i < end; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process";
}
size_t start_index = var_outer_dim_size * index;
size_t end_index = start_index + var_outer_dim_size;
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
auto accum_new = accum[j] + summed_grad * summed_grad;
if (lr_power == -0.5) {
linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j];
} else {
linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * var[j];
}
auto x = Sign(linear[j]) * l1 - linear[j];
float y;
if (lr_power == -0.5) {
y = std::sqrt(accum_new) / lr + 2 * l2;
} else {
y = std::pow(accum_new, -lr_power) / lr + 2 * l2;
}
auto pre_shrink = x / y;
var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0;
accum[j] = accum_new;
}
}
}
} // namespace
void SparseApplyFtrlCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
@ -96,33 +137,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
for (size_t i = 0; i < unique_sparse_grad.indices_size_; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size_) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process";
}
size_t start_index = var_outer_dim_size_ * index;
size_t end_index = start_index + var_outer_dim_size_;
for (size_t j = start_index, k = var_outer_dim_size_ * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
auto accum_new = accum[j] + summed_grad * summed_grad;
if (lr_power_ == -0.5) {
linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr_ * var[j];
} else {
linear[j] += summed_grad - (std::pow(accum_new, -lr_power_) - std::pow(accum[j], -lr_power_)) / lr_ * var[j];
}
auto x = Sign(linear[j]) * l1_ - linear[j];
float y;
if (lr_power_ == -0.5) {
y = std::sqrt(accum_new) / lr_ + 2 * l2_;
} else {
y = std::pow(accum_new, -lr_power_) / lr_ + 2 * l2_;
}
auto pre_shrink = x / y;
var[j] = std::fabs(linear[j]) > l1_ ? pre_shrink : 0;
accum[j] = accum_new;
}
}
MultiThreadComputeParams input_params;
input_params.var_ = var;
input_params.accum_ = accum;
input_params.linear_ = linear;
input_params.lr_ = lr_;
input_params.l1_ = l1_;
input_params.l2_ = l2_;
input_params.lr_power_ = lr_power_;
input_params.sparse_grad_ = unique_sparse_grad;
input_params.var_first_dim_size_ = var_first_dim_size_;
input_params.var_outer_dim_size_ = var_outer_dim_size_;
const size_t kThreadNum = 16;
MultiThreadCompute(ComputeFtrl, &input_params, kThreadNum, unique_sparse_grad.indices_size_);
return true;
}
} // namespace kernel

View File

@ -21,6 +21,39 @@ namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSparseApplyLazyAdamInputSize = 11;
void ComputeLazyAdam(MultiThreadComputeParams *input_params, size_t start, size_t end) {
MS_EXCEPTION_IF_NULL(input_params);
auto var = input_params->var_;
auto m = input_params->m_;
auto v = input_params->v_;
auto lr = input_params->lr_;
auto beta1 = input_params->beta1_;
auto beta2 = input_params->beta2_;
auto epsilon = input_params->epsilon_;
auto use_nesterov = input_params->use_nesterov_;
auto unique_sparse_grad = input_params->sparse_grad_;
auto var_first_dim_size = input_params->var_first_dim_size_;
auto var_outer_dim_size = input_params->var_outer_dim_size_;
for (size_t i = start; i < end; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range";
}
size_t start_index = var_outer_dim_size * index;
size_t end_index = start_index + var_outer_dim_size;
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
m[j] = beta1 * m[j] + (1 - beta1) * summed_grad;
v[j] = beta2 * v[j] + (1 - beta2) * summed_grad * summed_grad;
if (use_nesterov) {
var[j] -= lr * (m[j] * beta1 + (1 - beta1) * summed_grad) / (std::sqrt(v[j]) + epsilon);
} else {
var[j] -= lr * m[j] / (std::sqrt(v[j]) + epsilon);
}
}
}
}
} // namespace
void SparseApplyLazyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
@ -94,24 +127,20 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
var_outer_dim_size_);
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
for (size_t i = 0; i < unique_sparse_grad.indices_size_; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size_) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range";
}
size_t start_index = var_outer_dim_size_ * index;
size_t end_index = start_index + var_outer_dim_size_;
for (size_t j = start_index, k = var_outer_dim_size_ * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
m[j] = beta1 * m[j] + (1 - beta1) * summed_grad;
v[j] = beta2 * v[j] + (1 - beta2) * summed_grad * summed_grad;
if (use_nesterov_) {
var[j] -= lr * (m[j] * beta1 + (1 - beta1) * summed_grad) / (std::sqrt(v[j]) + epsilon);
} else {
var[j] -= lr * m[j] / (std::sqrt(v[j]) + epsilon);
}
}
}
MultiThreadComputeParams input_params;
input_params.var_ = var;
input_params.m_ = m;
input_params.v_ = v;
input_params.lr_ = lr;
input_params.beta1_ = beta1;
input_params.beta2_ = beta2;
input_params.epsilon_ = epsilon;
input_params.use_nesterov_ = use_nesterov_;
input_params.sparse_grad_ = unique_sparse_grad;
input_params.var_first_dim_size_ = var_first_dim_size_;
input_params.var_outer_dim_size_ = var_outer_dim_size_;
const size_t kThreadNum = 16;
MultiThreadCompute(ComputeLazyAdam, &input_params, kThreadNum, unique_sparse_grad.indices_size_);
return true;
}
} // namespace kernel

View File

@ -21,6 +21,39 @@ namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSparseApplyProximalAdagradInputSize = 7;
void ComputeProximalAdagrad(MultiThreadComputeParams *input_params, size_t start, size_t end) {
MS_EXCEPTION_IF_NULL(input_params);
auto var = input_params->var_;
auto accum = input_params->accum_;
auto lr = input_params->lr_;
auto l1 = input_params->l1_;
auto l2 = input_params->l2_;
auto unique_sparse_grad = input_params->sparse_grad_;
auto var_first_dim_size = input_params->var_first_dim_size_;
auto var_outer_dim_size = input_params->var_outer_dim_size_;
for (size_t i = start; i < end; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process";
}
size_t start_index = var_outer_dim_size * index;
size_t end_index = start_index + var_outer_dim_size;
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
accum[j] += summed_grad * summed_grad;
auto learning_rate = lr * (1 / std::sqrt(accum[j]));
auto prox_v = var[j];
prox_v -= summed_grad * learning_rate;
if (l1 > 0) {
var[j] = Sign(prox_v) * std::fmax(std::fabs(prox_v) - learning_rate * l1, static_cast<float>(0.0)) /
(1 + l2 * learning_rate);
} else {
var[j] = prox_v / (1 + l2 * learning_rate);
}
}
}
}
} // namespace
void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
@ -90,27 +123,17 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
for (size_t i = 0; i < unique_sparse_grad.indices_size_; ++i) {
int index = unique_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= var_first_dim_size_) {
MS_LOG(EXCEPTION) << "Index " << index << " in indices is out of range after unique process";
}
size_t start_index = var_outer_dim_size_ * index;
size_t end_index = start_index + var_outer_dim_size_;
for (size_t j = start_index, k = var_outer_dim_size_ * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
accum[j] += summed_grad * summed_grad;
auto learning_rate = lr * (1 / std::sqrt(accum[j]));
auto prox_v = var[j];
prox_v -= summed_grad * learning_rate;
if (l1 > 0) {
var[j] = Sign(prox_v) * std::fmax(std::fabs(prox_v) - learning_rate * l1, static_cast<float>(0.0)) /
(1 + l2 * learning_rate);
} else {
var[j] = prox_v / (1 + l2 * learning_rate);
}
}
}
MultiThreadComputeParams input_params;
input_params.var_ = var;
input_params.accum_ = accum;
input_params.lr_ = lr;
input_params.l1_ = l1;
input_params.l2_ = l2;
input_params.sparse_grad_ = unique_sparse_grad;
input_params.var_first_dim_size_ = var_first_dim_size_;
input_params.var_outer_dim_size_ = var_outer_dim_size_;
const size_t kThreadNum = 16;
MultiThreadCompute(ComputeProximalAdagrad, &input_params, kThreadNum, unique_sparse_grad.indices_size_);
return true;
}
} // namespace kernel

View File

@ -48,6 +48,7 @@ MS_REG_CPU_KERNEL(SparseApplyProximalAdagrad,
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
SparseApplyProximalAdagradCPUKernel);
} // namespace kernel

View File

@ -0,0 +1,138 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thrust/extrema.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <thrust/pair.h>
#include "fake_quant_perchannel_impl.cuh"
/**
* Find the nudge min, max and scale value as output.
* @param input_min array
* @param input_max array
* @param quant_min 1 << bit -1
* @param quant_max 0
* @param nudge_min array
* @param nudge_max array
* @param scale array
* @param channel_num
* @return
*/
__global__ void NudgeMinMaxPerChannel(float *input_min, float *input_max, const float quant_min, const float quant_max,
float *nudge_min, float *nudge_max, float *scale, int channel_num,
const bool symmetric) {
float zp_from_min = 0.f;
float nudge_zp = 0.f;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < channel_num; i += blockDim.x * gridDim.x) {
if (symmetric) {
input_max[i] = abs(input_min[0]) < input_max[i] ? input_max[i] : -input_min[i];
input_min[i] = abs(input_min[i]) < input_max[i] ? -input_max[i] : input_min[i];
}
if ((quant_max - quant_min) == 0 || (input_max[i] - input_min[i]) == 0) {
scale[i] = 0.f;
zp_from_min = 0.f;
} else {
scale[i] = (input_max[i] - input_min[i]) / (quant_max - quant_min);
zp_from_min = quant_min - input_min[i] / scale[i];
}
if (zp_from_min <= quant_min) {
nudge_zp = quant_min;
} else if (zp_from_min >= quant_max) {
nudge_zp = quant_max;
} else {
nudge_zp = round(zp_from_min);
}
nudge_min[i] = (quant_min - nudge_zp) * (scale[i]);
nudge_max[i] = (quant_max - nudge_zp) * (scale[i]);
}
}
void CalNudgePerChannel(float *input_min, float *input_max, const float quant_min, const float quant_max,
float *nudge_min, float *nudge_max, float *scale, const int channel_num, const bool symmetric,
cudaStream_t cuda_stream) {
NudgeMinMaxPerChannel<<<GET_BLOCKS(channel_num), GET_THREADS, 0, cuda_stream>>>(
input_min, input_max, quant_min, quant_max, nudge_min, nudge_max, scale, channel_num, symmetric);
}
/**
* Calulate fake quant output accroding by nudge min, nudge max, nudge scale.
* @param input - array
* @param output - array
* @param total_size - int, purpose for cal the per chanel number in filters
* @param channel_size - int, purpose for cal the per channel number in filters
* @param nudge_min - array
* @param nudge_max - array
* @param scale - array
* @return
*/
__global__ void FakeQuantPerChannel(const float *input, float *output, const int total_size, const int channel_size,
const float *nudge_min, const float *nudge_max, const float *scale) {
float input_x = 0.f;
int nudge_input = 0;
int channel_idx = 0;
int per_channel_num = total_size / channel_size;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total_size; i += blockDim.x * gridDim.x) {
input_x = input[i];
channel_idx = floor(static_cast<double>(i) / static_cast<double>(per_channel_num));
// clamp input x
if (input_x < nudge_min[channel_idx]) {
input_x = nudge_min[channel_idx];
}
if (input_x > nudge_max[channel_idx]) {
input_x = nudge_max[channel_idx];
}
// clamp shift
nudge_input = floor((input_x - nudge_min[channel_idx]) / scale[channel_idx] + 0.5f);
// quantize
output[i] = nudge_input * scale[channel_idx] + nudge_min[channel_idx];
}
}
void CalFakeQuantPerChannel(const float *input, float *output, const int total_size, const int channel_size,
const float *nudge_min, const float *nudge_max, const float *scale,
cudaStream_t cuda_stream) {
FakeQuantPerChannel<<<GET_BLOCKS(total_size), GET_THREADS, 0, cuda_stream>>>(input, output, total_size, channel_size,
nudge_min, nudge_max, scale);
}
__global__ void FakeQuantPerChannelGrad(const float *input, const float *gradient, float *output, const int total_size,
const int channel_size, const float *nudge_min, const float *nudge_max) {
int channel_idx = 0;
int per_channel_num = total_size / channel_size;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total_size; i += blockDim.x * gridDim.x) {
channel_idx = floor(static_cast<double>(i) / static_cast<double>(per_channel_num));
if (input[i] < nudge_min[channel_idx] || input[i] > nudge_max[channel_idx]) {
output[i] = 0;
} else {
output[i] = gradient[i];
}
}
}
void CalFakeQuantPerChannelGrad(const float *input, const float *gradient, float *output, const int total_num,
const int channel_num, const float *nudge_min, const float *nudge_max,
cudaStream_t cuda_stream) {
FakeQuantPerChannelGrad<<<GET_BLOCKS(channel_num), GET_THREADS, 0, cuda_stream>>>(input, gradient, output, total_num,
channel_num, nudge_min, nudge_max);
}

View File

@ -0,0 +1,34 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_
#include "device/gpu/cuda_common.h"
void CalNudgePerChannel(float *input_min, float *input_max, const float quant_min, const float quant_max,
float *nudge_min, float *nudge_max, float *scale, const int channel_num, const bool symmetric,
cudaStream_t cuda_stream);
void CalFakeQuantPerChannel(const float *input, float *output, const int total_num, const int channel_num,
const float *nudge_min, const float *nudge_max, const float *scale,
cudaStream_t cuda_stream);
void CalFakeQuantPerChannelGrad(const float *input, const float *gradient, float *output, const int total_num,
const int channel_num, const float *nudge_min, const float *nudge_max,
cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERCHANNEL_H_

View File

@ -0,0 +1,111 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thrust/extrema.h>
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include "fake_quant_perlayer_impl.cuh"
__global__ void FakeQuantPerLayer(const float *input, float *output, const int size, const float *nudge_min,
const float *nudge_max, const float *scale) {
float input_x = 0.f;
int nudge_input = 0;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
input_x = input[i];
// clamp input x
if (input_x < nudge_min[0]) {
input_x = nudge_min[0];
}
if (input_x > nudge_max[0]) {
input_x = nudge_max[0];
}
// clamp shift
nudge_input = round((input_x - nudge_min[0]) / scale[0]);
// quantize
output[i] = nudge_input * scale[0] + nudge_min[0];
}
return;
}
__global__ void FakeQuantPerLayerGrad(const float *input, const float *gradient, float *output, const int size,
const float *nudge_min, const float *nudge_max) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
if (input[i] < nudge_min[0] || input[i] > nudge_max[0]) {
output[i] = 0;
} else {
output[i] = gradient[i];
}
}
return;
}
__global__ void NudgeMinMaxPerLayer(float *input_min, float *input_max, const float quant_min, const float quant_max,
float *nudge_min, float *nudge_max, float *scale, const bool symmetric) {
float zp_from_min = 0.f;
scale[0] = 0.f;
nudge_max[0] = 0.f;
nudge_min[0] = 0.f;
if (symmetric) {
input_max[0] = abs(input_min[0]) < input_max[0] ? input_max[0] : -input_min[0];
input_min[0] = abs(input_min[0]) < input_max[0] ? -input_max[0] : input_min[0];
}
if ((quant_max - quant_min) == 0 || (input_max[0] - input_min[0]) == 0) {
scale[0] = 0.f;
zp_from_min = 0.f;
} else {
scale[0] = (input_max[0] - input_min[0]) / (quant_max - quant_min);
zp_from_min = quant_min - input_min[0] / scale[0];
}
float nudge_zp = 0.f;
if (zp_from_min <= quant_min) {
nudge_zp = quant_min;
} else if (zp_from_min >= quant_max) {
nudge_zp = quant_max;
} else {
nudge_zp = round(zp_from_min);
}
nudge_min[0] = (quant_min - nudge_zp) * (scale[0]);
nudge_max[0] = (quant_max - nudge_zp) * (scale[0]);
return;
}
void CalFakeQuantPerLayer(const float *input, float *output, const int size, const float *nudge_min,
const float *nudge_max, const float *scale, cudaStream_t cuda_stream) {
FakeQuantPerLayer<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, output, size, nudge_min, nudge_max,
scale);
return;
}
void CalFakeQuantPerLayerGrad(const float *input, const float *gradient, float *output, const int size,
const float *nudge_min, const float *nudge_max, cudaStream_t cuda_stream) {
FakeQuantPerLayerGrad<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(input, gradient, output, size, nudge_min,
nudge_max);
return;
}
void CalNudgePerLayer(float *input_min, float *input_max, const float quant_min, const float quant_max,
float *nudge_min, float *nudge_max, float *scale, const bool symmetric,
cudaStream_t cuda_stream) {
NudgeMinMaxPerLayer<<<1, 1, 0, cuda_stream>>>(input_min, input_max, quant_min, quant_max, nudge_min, nudge_max, scale,
symmetric);
return;
}

View File

@ -0,0 +1,31 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_
#include "device/gpu/cuda_common.h"
void CalNudgePerLayer(float *input_min, float *input_max, const float quant_min, const float quant_max,
float *nudge_min, float *nudge_max, float *scale, const bool symmetric, cudaStream_t cuda_stream);
void CalFakeQuantPerLayer(const float *input, float *output, const int size, const float *nudge_min,
const float *nudge_max, const float *scale, cudaStream_t cuda_stream);
void CalFakeQuantPerLayerGrad(const float *input, const float *gradient, float *output, const int size,
const float *nudge_min, const float *nudge_max, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_FAKE_QUANT_PERLAYER_H_

View File

@ -0,0 +1,87 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thrust/extrema.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <thrust/pair.h>
#include "minmax_update_impl.cuh"
#include "device/gpu/cuda_common.h"
__global__ void UpdateInputMinMaxPerLayerWithEMA(const float *input_min, const float *input_max, float *output_min,
float *output_max, const float min, const float max,
const float decay) {
output_min[0] = decay * (min) + (1 - decay) * (input_min[0]);
output_min[0] = input_min[0] > 0 ? 0 : input_min[0];
output_max[0] = decay * (max) + (1 - decay) * (input_max[0]);
output_max[0] = input_max[0] < 0 ? 0 : input_max[0];
return;
}
__global__ void UpdateInputMinMaxPerLayer(float *output_min, float *output_max, const float min, const float max) {
output_min[0] = min > 0 ? 0 : min;
output_max[0] = max < 0 ? 0 : max;
return;
}
__global__ void UpdateInputMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min,
float *output_max, int channels, int per_channel_nums, bool ema,
float ema_decay) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < channels; i += blockDim.x * gridDim.x) {
thrust::pair<float *, float *> sum =
thrust::minmax_element(thrust::device, input + i * per_channel_nums, input + per_channel_nums * (i + 1));
if (ema) {
output_min[i] = ema_decay * sum.first[0] + (1 - ema_decay) * input_min[i];
output_max[i] = ema_decay * sum.second[0] + (1 - ema_decay) * input_max[i];
} else {
output_min[i] = sum.first[0];
output_max[i] = sum.second[0];
}
output_min[i] = input_min[i] > 0 ? 0 : input_min[i];
output_max[i] = input_max[i] < 0 ? 0 : input_max[i];
}
return;
}
void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, float *output_max,
const int total_num, const int channel_num, const float ema_decay, const bool ema,
cudaStream_t cuda_stream) {
int per_channel_num = total_num / channel_num;
UpdateInputMinMaxPerChannel<<<GET_BLOCKS(channel_num), GET_THREADS, 0, cuda_stream>>>(
input, input_min, input_max, output_min, output_max, channel_num, per_channel_num, ema, ema_decay);
return;
}
void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min, float *output_max,
const int total_num, const float ema_decay, const bool ema, cudaStream_t cuda_stream) {
float minel = 0.f;
float maxel = 0.f;
auto policy = thrust::cuda::par.on(cuda_stream);
thrust::pair<thrust::device_ptr<float>, thrust::device_ptr<float>> tuple;
tuple =
thrust::minmax_element(policy, thrust::device_pointer_cast(input), thrust::device_pointer_cast(input) + total_num);
minel = tuple.first[0];
maxel = tuple.second[0];
if (ema) {
UpdateInputMinMaxPerLayerWithEMA<<<1, 1, 0, cuda_stream>>>(input_min, input_max, output_min, output_max, minel,
maxel, ema_decay);
} else {
UpdateInputMinMaxPerLayer<<<1, 1, 0, cuda_stream>>>(output_min, output_max, minel, maxel);
}
return;
}

View File

@ -0,0 +1,29 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_
#include "device/gpu/cuda_common.h"
void CalMinMaxPerChannel(float *input, float *input_min, float *input_max, float *output_min, float *output_max,
const int total_num, const int channel_num, const float ema_decay, const bool ema,
cudaStream_t cuda_stream);
void CalMinMaxPerLayer(float *input, float *input_min, float *input_max, float *output_min, float *output_max,
const int size, const float ema_decay, const bool ema, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_MIN_MAX_UPDATE_IMPL_H_

View File

@ -96,5 +96,8 @@ MS_REG_GPU_KERNEL_TWO(
MS_REG_GPU_KERNEL_TWO(
Maximum, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastOpGpuKernel, int, int)
MS_REG_GPU_KERNEL_TWO(
Mul, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastOpGpuKernel, int, int)
} // namespace kernel
} // namespace mindspore

View File

@ -1,176 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/gpu/quant/fake_quant_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
namespace mindspore {
namespace kernel {
FakeQuantGpuKernel::FakeQuantGpuKernel()
: input_size_(0),
min_size_(0),
max_size_(0),
output_size_(0),
workspace_size_(0),
num_bits_(0),
quant_min_(0),
quant_max_(0),
quant_num_(0),
quant_delay_(0),
ema_(false),
ema_decay_(0),
global_step_(0),
training_(false),
narrow_range_(false),
symmetric_(false) {}
const std::vector<size_t> &FakeQuantGpuKernel::GetInputSizeList() const { return input_size_list_; }
const std::vector<size_t> &FakeQuantGpuKernel::GetOutputSizeList() const { return output_size_list_; }
const std::vector<size_t> &FakeQuantGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool FakeQuantGpuKernel::Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuant GpuKernel OP needs 1 output.";
}
num_bits_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("num_bits"));
ema_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema"));
ema_decay_ = GetValue<float>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay"));
training_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training"));
if (num_bits_ <= 2 || num_bits_ >= 16) {
MS_LOG(EXCEPTION) << "Attr \'num_bits\' " << num_bits_ << " is out of range, expected between 2 and 16.";
}
quant_delay_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay"));
if (quant_delay_ < 0) {
MS_LOG(EXCEPTION) << "Attr \'quant_delay\' " << num_bits_ << "is less then 0, require larger than 0.";
}
symmetric_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric"));
if (symmetric_) {
quant_min_ = 0 - (1 << (num_bits_ - 1));
quant_max_ = (1 << (num_bits_ - 1)) - 1;
} else {
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
}
narrow_range_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range"));
if (narrow_range_) {
quant_min_++;
}
if (quant_num_ == 0) {
quant_num_ = 1;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= SizeToInt(input_shape[i]);
}
input_size_ = sizeof(float);
min_size_ = sizeof(float);
max_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
output_size_ = input_size_;
InitSizeLists();
return true;
}
void FakeQuantGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // input
input_size_list_.push_back(min_size_); // min
input_size_list_.push_back(max_size_); // max
output_size_list_.push_back(output_size_);
workspace_size_list_.push_back(workspace_size_);
}
bool FakeQuantGpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *output = GetDeviceAddress<float>(outputs, 0);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
if (input == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantGpuKernel input x is null.";
}
if (input_min == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantGpuKernel input min is null.";
}
if (input_max == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantGpuKernel input max is null.";
}
// Allocate space for device copies
int size = sizeof(float);
float *d_scale = nullptr;
float *d_nudge_min = nullptr;
float *d_nudge_max = nullptr;
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_scale), size), "Malloc gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_nudge_min), size), "Malloc gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_nudge_max), size), "Malloc gpu memory failed");
if (training_) {
// calculate the input min and max according by the parameter ema and ema_decay.
CalMinMax(input, input_min, input_max, quant_num_, ema_decay_, ema_, reinterpret_cast<cudaStream_t>(stream_ptr));
// control flow for quant_delay
if (global_step_ >= quant_delay_) {
// real launch
CalNudge(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantize(input, output, quant_num_, d_nudge_min, d_nudge_max, d_scale, symmetric_,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed");
}
global_step_++;
} else {
// real launch
CalNudge(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantize(input, output, quant_num_, d_nudge_min, d_nudge_max, d_scale, symmetric_,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
// Cleanup
CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_scale), "Free gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_min), "Free gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_max), "Free gpu memory failed");
return true;
}
MS_REG_GPU_KERNEL(FakeQuantPerLayer, FakeQuantGpuKernel)
} // namespace kernel
} // namespace mindspore

View File

@ -1,157 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/gpu/quant/fake_quant_grad_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_impl.cuh"
namespace mindspore {
namespace kernel {
FakeQuantGradGpuKernel::FakeQuantGradGpuKernel()
: input_size_(0),
min_size_(0),
max_size_(0),
output_size_(0),
workspace_size_(0),
num_bits_(0),
quant_min_(0),
quant_max_(0),
quant_size_(0),
quant_delay_(0),
global_step_(0),
narrow_range_(false),
symmetric_(false) {}
const std::vector<size_t> &FakeQuantGradGpuKernel::GetInputSizeList() const { return input_size_list_; }
const std::vector<size_t> &FakeQuantGradGpuKernel::GetOutputSizeList() const { return output_size_list_; }
const std::vector<size_t> &FakeQuantGradGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool FakeQuantGradGpuKernel::Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuantGrad GpuKernel OP needs 4 output.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuantGrad GpuKernel OP needs 1 output.";
}
num_bits_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("num_bits"));
if (num_bits_ <= 2 || num_bits_ >= 16) {
MS_LOG(EXCEPTION) << "Attr \'num_bits\' " << num_bits_ << " is out of range, expected between 2 and 16.";
}
quant_delay_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay"));
if (quant_delay_ < 0) {
MS_LOG(EXCEPTION) << "Attr \'quant_delay_\' " << quant_delay_ << " is less then 0, require larger than 0.";
}
symmetric_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric"));
if (symmetric_) {
quant_min_ = 0 - (1 << (num_bits_ - 1));
quant_max_ = (1 << (num_bits_ - 1)) - 1;
} else {
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
}
narrow_range_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range"));
if (narrow_range_) {
quant_min_++;
}
if (quant_size_ == 0) {
quant_size_ = 1;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_size_ *= SizeToInt(input_shape[i]);
}
input_size_ = sizeof(float);
min_size_ = sizeof(float);
max_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
output_size_ = input_size_;
InitSizeLists();
return true;
}
void FakeQuantGradGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // gradient
input_size_list_.push_back(input_size_); // input
input_size_list_.push_back(min_size_); // min
input_size_list_.push_back(max_size_); // max
output_size_list_.push_back(output_size_);
}
bool FakeQuantGradGpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *output = GetDeviceAddress<float>(outputs, 0);
float *gradient = GetDeviceAddress<float>(inputs, 0);
float *input = GetDeviceAddress<float>(inputs, 1);
float *input_min = GetDeviceAddress<float>(inputs, 2);
float *input_max = GetDeviceAddress<float>(inputs, 3);
if (gradient == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel gradient is null";
}
if (input == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel input is null.";
}
if (input_min == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel input min is null.";
}
if (input_max == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantGradGpuKernel input max is null.";
}
if (global_step_ >= quant_delay_) {
float *d_scale = nullptr;
float *d_nudge_min = nullptr;
float *d_nudge_max = nullptr;
int size = sizeof(float);
// Allocate space for device copies
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_scale), size), "Malloc gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_nudge_min), size), "Malloc gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_nudge_max), size), "Malloc gpu memory failed");
CalNudge(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantizeGrad(input, gradient, output, quant_size_, d_nudge_min, d_nudge_max,
reinterpret_cast<cudaStream_t>(stream_ptr));
// Cleanup
CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_scale), "Free gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_min), "Free gpu memory failed");
CHECK_CUDA_RET_WITH_ERROR(cudaFree(d_nudge_max), "Free gpu memory failed");
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed");
}
global_step_++;
return true;
}
MS_REG_GPU_KERNEL(FakeQuantPerLayerGrad, FakeQuantGradGpuKernel)
} // namespace kernel
} // namespace mindspore

View File

@ -14,8 +14,8 @@
* limitations under the License.
*/
#include "kernel/gpu/quant/fake_quant_per_channel_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cuh"
#include "kernel/gpu/quant/fake_quant_perchannel_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
@ -25,21 +25,15 @@ namespace mindspore {
namespace kernel {
FakeQuantPerChannelGpuKernel::FakeQuantPerChannelGpuKernel()
: input_size_(0),
min_size_(0),
max_size_(0),
output_size_(0),
workspace_size_(0),
num_channels_(0),
num_bits_(0),
training_(false),
symmetric_(false),
narrow_range_(false),
quant_delay_(0),
quant_min_(0),
quant_max_(0),
quant_delay_(0),
ema_(false),
ema_decay_(0),
global_step_(0),
training_(false),
channel_out_(0),
narrow_range_(false),
symmetric_(false) {}
global_step_(0) {}
const std::vector<size_t> &FakeQuantPerChannelGpuKernel::GetInputSizeList() const { return input_size_list_; }
@ -60,91 +54,57 @@ bool FakeQuantPerChannelGpuKernel::Init(const CNodePtr &kernel_node) {
return false;
}
// get attribute
num_bits_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("num_bits"));
ema_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema"));
ema_decay_ = 1.0 - GetValue<float>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay"));
training_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training"));
symmetric_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric"));
narrow_range_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range"));
quant_delay_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay"));
if (num_bits_ <= 2 || num_bits_ >= 16) {
MS_LOG(EXCEPTION) << "Attr \'num_bits\' " << num_bits_ << "is out of range, expected between 2 and 16.";
return false;
}
quant_delay_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay"));
if (quant_delay_ < 0) {
MS_LOG(EXCEPTION) << "Attr \'quant_delay\' " << num_bits_ << " is less then 0, require larger than 0.";
return false;
}
training_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training"));
symmetric_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric"));
if (symmetric_) {
quant_min_ = 0 - (1 << (num_bits_ - 1));
quant_max_ = (1 << (num_bits_ - 1)) - 1;
} else {
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
}
narrow_range_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range"));
// quant min and max value
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
if (narrow_range_) {
quant_min_++;
}
// shape info for gpu
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
channel_out_ = SizeToInt(input_shape[0]);
min_size_ = sizeof(float) * channel_out_;
max_size_ = sizeof(float) * channel_out_;
num_channels_ = SizeToInt(input_shape[0]);
input_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
output_size_ = input_size_;
InitSizeLists();
return true;
}
void FakeQuantPerChannelGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // input in tensor
input_size_list_.push_back(min_size_); // min one scalar
input_size_list_.push_back(max_size_); // max on scalar
output_size_list_.push_back(output_size_); // output in tensor
workspace_size_list_.push_back(sizeof(float) * channel_out_); // scale in channel
workspace_size_list_.push_back(sizeof(float) * channel_out_); // min in channel
workspace_size_list_.push_back(sizeof(float) * channel_out_); // max in channel
input_size_list_.push_back(input_size_); // input in tensor
input_size_list_.push_back(sizeof(float) * num_channels_); // min one scalar
input_size_list_.push_back(sizeof(float) * num_channels_); // max on scalar
output_size_list_.push_back(input_size_); // output in tensor
workspace_size_list_.push_back(sizeof(float) * num_channels_); // scale in channel
workspace_size_list_.push_back(sizeof(float) * num_channels_); // min in channel
workspace_size_list_.push_back(sizeof(float) * num_channels_); // max in channel
}
void FakeQuantPerChannelGpuKernel::CalFakeQuantizeForTraining(float *input, float *output, float *input_min,
float *input_max, float *d_nudge_min, float *d_nudge_max,
float *d_scale, void *stream_ptr) {
// calculate the input min and max according by the parameter ema and ema_decay.
CalMinMaxPerChannel(input, input_min, input_max, input_size_ / sizeof(float), channel_out_, ema_decay_, ema_,
reinterpret_cast<cudaStream_t>(stream_ptr));
// control flow for quant_delay
if (global_step_ >= quant_delay_) {
// real launch
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max,
d_scale, symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(
cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed.");
}
global_step_++;
}
void FakeQuantPerChannelGpuKernel::CalFakeQuantizeForInfer(float *input, float *output, float *input_min,
float *input_max, float *d_nudge_min, float *d_nudge_max,
float *d_scale, void *stream_ptr) {
// real launch
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max, d_scale,
symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
void FakeQuantPerChannelGpuKernel::CalFakeQuantize(float *input, float *output, float *input_min, float *input_max,
float *nudge_min, float *nudge_max, float *scale, void *stream_ptr) {
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, num_channels_,
symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantPerChannel(input, output, input_size_ / sizeof(float), num_channels_, nudge_min, nudge_max, scale,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
@ -155,9 +115,9 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
float *d_scale = GetDeviceAddress<float>(workspace, 0);
float *d_nudge_min = GetDeviceAddress<float>(workspace, 1);
float *d_nudge_max = GetDeviceAddress<float>(workspace, 2);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
if (input == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input is null.";
@ -167,9 +127,16 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
}
if (training_) {
CalFakeQuantizeForTraining(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr);
if (global_step_ >= quant_delay_) {
CalFakeQuantize(input, output, input_min, input_max, nudge_min, nudge_max, scale, stream_ptr);
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed.");
}
global_step_++;
} else {
CalFakeQuantizeForInfer(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr);
CalFakeQuantize(input, output, input_min, input_max, nudge_min, nudge_max, scale, stream_ptr);
}
return true;

View File

@ -39,31 +39,23 @@ class FakeQuantPerChannelGpuKernel : public GpuKernel {
void InitSizeLists() override;
private:
void CalFakeQuantizeForTraining(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min,
float *d_nudge_max, float *d_scale, void *stream_ptr);
void CalFakeQuantizeForInfer(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min,
float *d_nudge_max, float *d_scale, void *stream_ptr);
void CalFakeQuantize(float *input, float *output, float *input_min, float *input_max, float *nudge_min,
float *nudge_max, float *scale, void *stream_ptr);
size_t input_size_;
size_t min_size_;
size_t max_size_;
size_t output_size_;
size_t workspace_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int num_channels_;
int num_bits_;
bool training_;
bool symmetric_;
bool narrow_range_;
int quant_delay_;
float quant_min_;
float quant_max_;
int quant_delay_;
bool ema_;
float ema_decay_;
int global_step_;
bool training_;
int channel_out_;
bool narrow_range_;
bool symmetric_;
};
} // namespace kernel
} // namespace mindspore

View File

@ -14,21 +14,17 @@
* limitations under the License.
*/
#include "kernel/gpu/quant/fake_quant_per_channel_grad_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_per_channel_impl.cuh"
#include "kernel/gpu/quant/fake_quant_perchannel_grad_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_perchannel_impl.cuh"
namespace mindspore {
namespace kernel {
FakeQuantPerChannelGradGpuKernel::FakeQuantPerChannelGradGpuKernel()
: input_size_(0),
min_size_(0),
max_size_(0),
output_size_(0),
workspace_size_(0),
num_bits_(0),
quant_min_(0),
quant_max_(0),
channel_out_(0),
num_channels_(0),
quant_delay_(0),
global_step_(0),
narrow_range_(false),
@ -64,42 +60,34 @@ bool FakeQuantPerChannelGradGpuKernel::Init(const CNodePtr &kernel_node) {
}
symmetric_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric"));
if (symmetric_) {
quant_min_ = 0 - (1 << (num_bits_ - 1));
quant_max_ = (1 << (num_bits_ - 1)) - 1;
} else {
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
}
narrow_range_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range"));
// quant min and max value
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
if (narrow_range_) {
quant_min_++;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
channel_out_ = SizeToInt(input_shape[0]);
min_size_ = sizeof(float) * channel_out_;
max_size_ = sizeof(float) * channel_out_;
num_channels_ = SizeToInt(input_shape[0]);
input_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
output_size_ = input_size_;
InitSizeLists();
return true;
}
void FakeQuantPerChannelGradGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // gradient
input_size_list_.push_back(input_size_); // input
input_size_list_.push_back(min_size_); // min
input_size_list_.push_back(max_size_); // max
output_size_list_.push_back(output_size_);
workspace_size_list_.push_back(sizeof(float) * channel_out_); // scale in channel
workspace_size_list_.push_back(sizeof(float) * channel_out_); // min in channel
workspace_size_list_.push_back(sizeof(float) * channel_out_); // max in channel
input_size_list_.push_back(input_size_); // gradient
input_size_list_.push_back(input_size_); // input
input_size_list_.push_back(sizeof(float) * num_channels_); // min
input_size_list_.push_back(sizeof(float) * num_channels_); // max
output_size_list_.push_back(input_size_); // output
workspace_size_list_.push_back(sizeof(float) * num_channels_); // scale in channel
workspace_size_list_.push_back(sizeof(float) * num_channels_); // min in channel
workspace_size_list_.push_back(sizeof(float) * num_channels_); // max in channel
}
bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
@ -111,9 +99,9 @@ bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector<AddressPtr> &inp
float *input = GetDeviceAddress<float>(inputs, 1);
float *input_min = GetDeviceAddress<float>(inputs, 2);
float *input_max = GetDeviceAddress<float>(inputs, 3);
float *d_scale = GetDeviceAddress<float>(workspace, 0);
float *d_nudge_min = GetDeviceAddress<float>(workspace, 1);
float *d_nudge_max = GetDeviceAddress<float>(workspace, 2);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
if (gradient == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantPerChannelGradGpuKernel gradient is null";
@ -130,10 +118,10 @@ bool FakeQuantPerChannelGradGpuKernel::Launch(const std::vector<AddressPtr> &inp
int total_size = input_size_ / sizeof(float);
if (global_step_ >= quant_delay_) {
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantizePerChannelGrad(input, gradient, output, total_size, channel_out_, d_nudge_min, d_nudge_max,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, num_channels_,
symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantPerChannelGrad(input, gradient, output, total_size, num_channels_, nudge_min, nudge_max,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),

View File

@ -40,10 +40,6 @@ class FakeQuantPerChannelGradGpuKernel : public GpuKernel {
private:
size_t input_size_;
size_t min_size_;
size_t max_size_;
size_t output_size_;
size_t workspace_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
@ -51,7 +47,7 @@ class FakeQuantPerChannelGradGpuKernel : public GpuKernel {
int num_bits_;
float quant_min_;
float quant_max_;
int channel_out_;
int num_channels_;
int quant_delay_;
int global_step_;
bool narrow_range_;

View File

@ -0,0 +1,143 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/gpu/quant/fake_quant_perlayer_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
namespace mindspore {
namespace kernel {
FakeQuantPerLayerGpuKernel::FakeQuantPerLayerGpuKernel()
: input_size_(0),
quant_min_(0),
quant_max_(0),
quant_num_(1),
global_step_(0),
num_bits_(0),
quant_delay_(0),
training_(false),
narrow_range_(false),
symmetric_(false) {}
const std::vector<size_t> &FakeQuantPerLayerGpuKernel::GetInputSizeList() const { return input_size_list_; }
const std::vector<size_t> &FakeQuantPerLayerGpuKernel::GetOutputSizeList() const { return output_size_list_; }
const std::vector<size_t> &FakeQuantPerLayerGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool FakeQuantPerLayerGpuKernel::Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuant GpuKernel OP needs 1 output.";
}
num_bits_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("num_bits"));
quant_delay_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay"));
training_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("training"));
symmetric_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric"));
narrow_range_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range"));
if (num_bits_ <= 2 || num_bits_ >= 16) {
MS_LOG(EXCEPTION) << "Attr \'num_bits\' " << num_bits_ << " is out of range, expected between 2 and 16.";
}
if (quant_delay_ < 0) {
MS_LOG(EXCEPTION) << "Attr \'quant_delay\' " << num_bits_ << "is less then 0, require larger than 0.";
}
// quant min and max value
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
if (narrow_range_) {
quant_min_++;
}
// init size
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= SizeToInt(input_shape[i]);
}
input_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
InitSizeLists();
return true;
}
void FakeQuantPerLayerGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // x
input_size_list_.push_back(sizeof(float)); // min
input_size_list_.push_back(sizeof(float)); // max
output_size_list_.push_back(input_size_); // y
workspace_size_list_.push_back(sizeof(float)); // scale
workspace_size_list_.push_back(sizeof(float)); // nudge_min
workspace_size_list_.push_back(sizeof(float)); // nudge_max
}
bool FakeQuantPerLayerGpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *output = GetDeviceAddress<float>(outputs, 0);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
if (input == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantPerLayerGpuKernel input x is null.";
}
if (input_min == nullptr || input_max == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantPerLayerGpuKernel input min or input max is null.";
}
if (training_) {
// control flow for quant_delay
if (global_step_ >= quant_delay_) {
// real launch
CalNudgePerLayer(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, symmetric_,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantPerLayer(input, output, quant_num_, nudge_min, nudge_max, scale,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, input, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed");
}
global_step_++;
} else {
// real launch
CalNudgePerLayer(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, symmetric_,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantPerLayer(input, output, quant_num_, nudge_min, nudge_max, scale,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
MS_REG_GPU_KERNEL(FakeQuantPerLayer, FakeQuantPerLayerGpuKernel)
} // namespace kernel
} // namespace mindspore

View File

@ -14,8 +14,8 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GPUKERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GPUKERNEL_H_
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GPUKERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GPUKERNEL_H_
#include <vector>
#include "kernel/gpu/gpu_kernel.h"
@ -23,10 +23,10 @@
namespace mindspore {
namespace kernel {
class FakeQuantGpuKernel : public GpuKernel {
class FakeQuantPerLayerGpuKernel : public GpuKernel {
public:
FakeQuantGpuKernel();
~FakeQuantGpuKernel() = default;
FakeQuantPerLayerGpuKernel();
~FakeQuantPerLayerGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override;
const std::vector<size_t> &GetOutputSizeList() const override;
@ -40,22 +40,16 @@ class FakeQuantGpuKernel : public GpuKernel {
private:
size_t input_size_;
size_t min_size_;
size_t max_size_;
size_t output_size_;
size_t workspace_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int num_bits_;
float quant_min_;
float quant_max_;
int quant_num_;
int quant_delay_;
bool ema_;
float ema_decay_;
int global_step_;
int num_bits_;
int quant_delay_;
bool training_;
bool narrow_range_;
bool symmetric_;
@ -63,4 +57,4 @@ class FakeQuantGpuKernel : public GpuKernel {
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GPUKERNEL_H_
#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GPUKERNEL_H_

View File

@ -0,0 +1,133 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/gpu/quant/fake_quant_perlayer_grad_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/fake_quant_perlayer_impl.cuh"
namespace mindspore {
namespace kernel {
FakeQuantPerLayerGradGpuKernel::FakeQuantPerLayerGradGpuKernel()
: input_size_(0),
workspace_size_(0),
num_bits_(0),
quant_min_(0),
quant_max_(0),
quant_num_(1),
quant_delay_(0),
global_step_(0),
narrow_range_(false),
symmetric_(false) {}
const std::vector<size_t> &FakeQuantPerLayerGradGpuKernel::GetInputSizeList() const { return input_size_list_; }
const std::vector<size_t> &FakeQuantPerLayerGradGpuKernel::GetOutputSizeList() const { return output_size_list_; }
const std::vector<size_t> &FakeQuantPerLayerGradGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool FakeQuantPerLayerGradGpuKernel::Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuantGrad GpuKernel OP needs 4 output.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuantGrad GpuKernel OP needs 1 output.";
}
num_bits_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("num_bits"));
if (num_bits_ <= 2 || num_bits_ >= 16) {
MS_LOG(EXCEPTION) << "Attr \'num_bits\' " << num_bits_ << " is out of range, expected between 2 and 16.";
}
quant_delay_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay"));
if (quant_delay_ < 0) {
MS_LOG(EXCEPTION) << "Attr \'quant_delay_\' " << quant_delay_ << " is less then 0, require larger than 0.";
}
symmetric_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("symmetric"));
narrow_range_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("narrow_range"));
// quant min and max value
quant_min_ = 0;
quant_max_ = (1 << num_bits_) - 1;
if (narrow_range_) {
quant_min_++;
}
// init size
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= SizeToInt(input_shape[i]);
}
input_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
InitSizeLists();
return true;
}
void FakeQuantPerLayerGradGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // gradient
input_size_list_.push_back(input_size_); // input
input_size_list_.push_back(sizeof(float)); // min
input_size_list_.push_back(sizeof(float)); // max
output_size_list_.push_back(input_size_); // output
workspace_size_list_.push_back(sizeof(float)); // scale
workspace_size_list_.push_back(sizeof(float)); // nudge_min
workspace_size_list_.push_back(sizeof(float)); // nudge_max
}
bool FakeQuantPerLayerGradGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *output = GetDeviceAddress<float>(outputs, 0);
float *gradient = GetDeviceAddress<float>(inputs, 0);
float *input = GetDeviceAddress<float>(inputs, 1);
float *input_min = GetDeviceAddress<float>(inputs, 2);
float *input_max = GetDeviceAddress<float>(inputs, 3);
float *scale = GetDeviceAddress<float>(workspace, 0);
float *nudge_min = GetDeviceAddress<float>(workspace, 1);
float *nudge_max = GetDeviceAddress<float>(workspace, 2);
if (gradient == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantPerLayerGradGpuKernel gradient is null";
}
if (input == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantPerLayerGradGpuKernel input is null.";
}
if (input_min == nullptr || input_max == nullptr) {
MS_LOG(EXCEPTION) << "FakeQuantPerLayerGradGpuKernel input min or max is null.";
}
if (global_step_ >= quant_delay_) {
CalNudgePerLayer(input_min, input_max, quant_min_, quant_max_, nudge_min, nudge_max, scale, symmetric_,
reinterpret_cast<cudaStream_t>(stream_ptr));
CalFakeQuantPerLayerGrad(input, gradient, output, quant_num_, nudge_min, nudge_max,
reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(output, gradient, input_size_, cudaMemcpyDeviceToDevice,
reinterpret_cast<cudaStream_t>(stream_ptr)),
"Copy gpu memory failed");
}
global_step_++;
return true;
}
MS_REG_GPU_KERNEL(FakeQuantPerLayerGrad, FakeQuantPerLayerGradGpuKernel)
} // namespace kernel
} // namespace mindspore

View File

@ -14,8 +14,8 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GRAD_GPUKERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GRAD_GPUKERNEL_H_
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GRAD_GPUKERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GRAD_GPUKERNEL_H_
#include <vector>
#include "kernel/gpu/gpu_kernel.h"
@ -23,10 +23,10 @@
namespace mindspore {
namespace kernel {
class FakeQuantGradGpuKernel : public GpuKernel {
class FakeQuantPerLayerGradGpuKernel : public GpuKernel {
public:
FakeQuantGradGpuKernel();
~FakeQuantGradGpuKernel() = default;
FakeQuantPerLayerGradGpuKernel();
~FakeQuantPerLayerGradGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override;
const std::vector<size_t> &GetOutputSizeList() const override;
@ -40,9 +40,6 @@ class FakeQuantGradGpuKernel : public GpuKernel {
private:
size_t input_size_;
size_t min_size_;
size_t max_size_;
size_t output_size_;
size_t workspace_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
@ -51,7 +48,7 @@ class FakeQuantGradGpuKernel : public GpuKernel {
int num_bits_;
float quant_min_;
float quant_max_;
int quant_size_;
int quant_num_;
int quant_delay_;
int global_step_;
bool narrow_range_;
@ -60,4 +57,4 @@ class FakeQuantGradGpuKernel : public GpuKernel {
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_GRAD_GPUKERNEL_H_
#endif // MINDSPORE_CCSRC_KERNEL_GPU_FAKEQUANT_PERLAYER_GRAD_GPUKERNEL_H_

View File

@ -0,0 +1,96 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/gpu/quant/minmax_update_perchannel_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/minmax_update_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
namespace mindspore {
namespace kernel {
MinMaxUpdatePerChannelGpuKernel::MinMaxUpdatePerChannelGpuKernel()
: input_size_(0), quant_num_(1), ema_(false), ema_decay_(0), num_channels_(0) {}
const std::vector<size_t> &MinMaxUpdatePerChannelGpuKernel::GetInputSizeList() const { return input_size_list_; }
const std::vector<size_t> &MinMaxUpdatePerChannelGpuKernel::GetOutputSizeList() const { return output_size_list_; }
const std::vector<size_t> &MinMaxUpdatePerChannelGpuKernel::GetWorkspaceSizeList() const {
return workspace_size_list_;
}
bool MinMaxUpdatePerChannelGpuKernel::Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuant GpuKernel OP needs 1 output.";
}
ema_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema"));
ema_decay_ = GetValue<float>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay"));
// init size
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
num_channels_ = SizeToInt(input_shape[0]);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= SizeToInt(input_shape[i]);
}
input_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
InitSizeLists();
return true;
}
void MinMaxUpdatePerChannelGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // input
input_size_list_.push_back(sizeof(float) * num_channels_); // min
input_size_list_.push_back(sizeof(float) * num_channels_); // max
output_size_list_.push_back(sizeof(float) * num_channels_); // output min
output_size_list_.push_back(sizeof(float) * num_channels_); // output max
}
bool MinMaxUpdatePerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *output_min = GetDeviceAddress<float>(outputs, 0);
float *output_max = GetDeviceAddress<float>(outputs, 1);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
if (input == nullptr) {
MS_LOG(EXCEPTION) << "MinMaxUpdatePerChannelGpuKernel input x is null.";
}
if (input_min == nullptr || input_max == nullptr) {
MS_LOG(EXCEPTION) << "MinMaxUpdatePerChannelGpuKernel input min or input max is null.";
}
// calculate the input min and max according by the parameter ema and ema_decay.
CalMinMaxPerChannel(input, input_min, input_max, output_min, output_max, input_size_ / sizeof(float), num_channels_,
ema_decay_, ema_, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
MS_REG_GPU_KERNEL(MinMaxUpdatePerChannel, MinMaxUpdatePerChannelGpuKernel)
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,55 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERCHANNEL_GPUKERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERCHANNEL_GPUKERNEL_H_
#include <vector>
#include "kernel/gpu/gpu_kernel.h"
#include "kernel/gpu/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
class MinMaxUpdatePerChannelGpuKernel : public GpuKernel {
public:
MinMaxUpdatePerChannelGpuKernel();
~MinMaxUpdatePerChannelGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override;
const std::vector<size_t> &GetOutputSizeList() const override;
const std::vector<size_t> &GetWorkspaceSizeList() const override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
bool Init(const CNodePtr &kernel) override;
protected:
void InitSizeLists() override;
private:
size_t input_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int quant_num_;
bool ema_;
float ema_decay_;
int num_channels_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERCHANNEL_GPUKERNEL_H_

View File

@ -0,0 +1,93 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/gpu/quant/minmax_update_perlayer_gpu_kernel.h"
#include "kernel/gpu/cuda_impl/minmax_update_impl.cuh"
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
namespace mindspore {
namespace kernel {
MinMaxUpdatePerLayerGpuKernel::MinMaxUpdatePerLayerGpuKernel()
: input_size_(0), quant_num_(1), ema_(false), ema_decay_(0) {}
const std::vector<size_t> &MinMaxUpdatePerLayerGpuKernel::GetInputSizeList() const { return input_size_list_; }
const std::vector<size_t> &MinMaxUpdatePerLayerGpuKernel::GetOutputSizeList() const { return output_size_list_; }
const std::vector<size_t> &MinMaxUpdatePerLayerGpuKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
bool MinMaxUpdatePerLayerGpuKernel::Init(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but FakeQuant GpuKernel OP needs 3 output.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but FakeQuant GpuKernel OP needs 1 output.";
}
ema_ = GetValue<bool>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema"));
ema_decay_ = GetValue<float>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ema_decay"));
// init size
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shape.size(); ++i) {
quant_num_ *= SizeToInt(input_shape[i]);
}
input_size_ = sizeof(float);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
InitSizeLists();
return true;
}
void MinMaxUpdatePerLayerGpuKernel::InitSizeLists() {
input_size_list_.push_back(input_size_); // input
input_size_list_.push_back(sizeof(float)); // input min
input_size_list_.push_back(sizeof(float)); // input max
output_size_list_.push_back(sizeof(float)); // output min
output_size_list_.push_back(sizeof(float)); // output max
}
bool MinMaxUpdatePerLayerGpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
float *output_min = GetDeviceAddress<float>(outputs, 0);
float *output_max = GetDeviceAddress<float>(outputs, 1);
float *input = GetDeviceAddress<float>(inputs, 0);
float *input_min = GetDeviceAddress<float>(inputs, 1);
float *input_max = GetDeviceAddress<float>(inputs, 2);
if (input == nullptr) {
MS_LOG(EXCEPTION) << "MinMaxUpdatePerLayerGpuKernel input x is null.";
}
if (input_min == nullptr || input_max == nullptr) {
MS_LOG(EXCEPTION) << "MinMaxUpdatePerLayerGpuKernel input min or input max is null.";
}
CalMinMaxPerLayer(input, input_min, input_max, output_min, output_max, quant_num_, ema_decay_, ema_,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
MS_REG_GPU_KERNEL(MinMaxUpdatePerLayer, MinMaxUpdatePerLayerGpuKernel)
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERLAYER_GPUKERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERLAYER_GPUKERNEL_H_
#include <vector>
#include "kernel/gpu/gpu_kernel.h"
#include "kernel/gpu/gpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
class MinMaxUpdatePerLayerGpuKernel : public GpuKernel {
public:
MinMaxUpdatePerLayerGpuKernel();
~MinMaxUpdatePerLayerGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override;
const std::vector<size_t> &GetOutputSizeList() const override;
const std::vector<size_t> &GetWorkspaceSizeList() const override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
bool Init(const CNodePtr &kernel) override;
protected:
void InitSizeLists() override;
private:
size_t input_size_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
int quant_num_;
bool ema_;
float ema_decay_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_MINMAX_UPDATE_PERLAYER_GPUKERNEL_H_

View File

@ -23,6 +23,7 @@
#include "kernel/tbe/tbe_kernel_select/tbe_kernel_select.h"
#include "kernel/akg/akg_kernel_metadata.h"
#include "session/anf_runtime_algorithm.h"
#include "utils/context/ms_context.h"
namespace mindspore {
namespace kernel {
@ -96,6 +97,12 @@ void KernelQuery(const CNodePtr &kernel_node, std::vector<std::shared_ptr<kernel
std::string op_name = AnfAlgo::GetCNodeName(kernel_node);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (context_ptr->enable_graph_kernel() && IsPrimitiveCNode(kernel_node, prim::kPrimBatchMatMul)) {
kernel_type = KernelType::AKG_KERNEL;
}
switch (kernel_type) {
case KernelType::AKG_KERNEL:
AkgMetadataInfo(kernel_node, kernel_info_list);

View File

@ -75,8 +75,8 @@ std::vector<TaskInfoPtr> LabelSwitchKernel::GenTask(const std::vector<AddressPtr
std::vector<std::shared_ptr<kernel::KernelBuildInfo>> LabelSwitchDesc::GetKernelInfo() {
std::vector<std::shared_ptr<kernel::KernelBuildInfo>> label_switch_build_info{};
vector<string> input_format{kOpFormat_DEFAULT, kOpFormat_DEFAULT};
vector<TypeId> input_type{kNumberTypeUInt32, kNumberTypeBool};
vector<string> input_format{kOpFormat_DEFAULT};
vector<TypeId> input_type{kNumberTypeInt32};
if (input_format.size() != input_type.size()) {
MS_LOG(EXCEPTION) << "Invalid param num, input_format size " << input_format.size() << " input_type size "
<< input_type.size();

View File

@ -28,7 +28,6 @@
namespace mindspore {
namespace parallel {
#define DOUBLE_MAX (std::numeric_limits<double>::max)()
// Compute redistributed cost
double CostRedis(const Graph::NodeType &node,
@ -621,75 +620,50 @@ StrategyRec CostCommon::ChoseStr(const std::vector<double> &cost_op, StrategyRec
break;
default:
MS_LOG(EXCEPTION) << "Failure: CostBiasAdd failed.";
MS_LOG(EXCEPTION) << "Failure: Common failed.";
}
return str;
}
// Get weight for BN
double CostBatchNorm::GetMinCostIn(const OperatorRec &op) {
int tensor = static_cast<int>(op.arguments[0].tensor_shape.shape_h * op.arguments[0].tensor_str.str_h) *
static_cast<int>(op.arguments[0].tensor_shape.shape_n * op.arguments[0].tensor_str.str_n) *
static_cast<int>(op.arguments[0].tensor_shape.shape_w * op.arguments[0].tensor_str.str_w) *
static_cast<int>(op.arguments[0].tensor_shape.shape_c * op.arguments[0].tensor_str.str_c);
std::vector<double> cost_in;
cost_in.push_back(StrDimB(tensor) * 1.2);
cost_in.push_back(DOUBLE_MAX);
cost_in.push_back(StrDimH(tensor) * 1.2);
cost_in.push_back(StrDimW(tensor) * 1.2);
return *min_element(cost_in.begin(), cost_in.end());
}
// Get optimal strategy for BN
StrategyRec CostBatchNorm::GetOptimalStr(const Graph::NodeType &node,
const std::vector<std::pair<std::string, StrategyRec>> &node_name_to_strategy,
const Graph &graph) {
// Get optimal strategy for BatchParallel OPs
StrategyRec CostBatchParallel::GetOptimalStr(const Graph::NodeType &node) {
const OperatorRec &op = node.apply;
int tensor_filter_n = static_cast<int>(op.arguments[1].tensor_shape.shape_n * op.arguments[1].tensor_str.str_n);
int tensor_filter_c = static_cast<int>(op.arguments[1].tensor_shape.shape_c * op.arguments[1].tensor_str.str_c);
int tensor_filter_h = static_cast<int>(op.arguments[1].tensor_shape.shape_h * op.arguments[1].tensor_str.str_h);
int tensor_filter_w = static_cast<int>(op.arguments[1].tensor_shape.shape_w * op.arguments[1].tensor_str.str_w);
int tensor_filter = tensor_filter_h * tensor_filter_w * tensor_filter_n * tensor_filter_c;
int output_tensor_h = static_cast<int>(node.tensor_parm.tensor_shape.shape_h * node.tensor_parm.tensor_str.str_h);
int output_tensor_w = static_cast<int>(node.tensor_parm.tensor_shape.shape_w * node.tensor_parm.tensor_str.str_w);
int output_tensor_n = static_cast<int>(node.tensor_parm.tensor_shape.shape_n * node.tensor_parm.tensor_str.str_n);
int tensor_n = static_cast<int>(op.arguments[0].tensor_shape.shape_n * op.arguments[0].tensor_str.str_n);
int tensor_c = static_cast<int>(op.arguments[0].tensor_shape.shape_c * op.arguments[0].tensor_str.str_c);
int tensor_h = static_cast<int>(op.arguments[0].tensor_shape.shape_h * op.arguments[0].tensor_str.str_h);
int tensor_w = static_cast<int>(op.arguments[0].tensor_shape.shape_w * op.arguments[0].tensor_str.str_w);
std::vector<double> cost_op;
std::vector<std::vector<float>> mode;
if (output_tensor_n < 2 || output_tensor_n % 2 != 0) {
if (tensor_n < 2 || tensor_n % 2 != 0) {
cost_op.push_back(DOUBLE_MAX);
} else {
cost_op.push_back(StrDimB(tensor_filter) + CostRedis(node, node_name_to_strategy,
mode = {{0.5, 1, 1, 1}, {1, 1, 1, 1}, {0.5, 1, 1, 1}}, graph));
cost_op.push_back(cost_in_);
}
cost_op.push_back(DOUBLE_MAX);
if (output_tensor_h < 2 || output_tensor_h % 2 != 0) {
if (tensor_c < 2 || tensor_c % 2 != 0) {
cost_op.push_back(DOUBLE_MAX);
} else {
cost_op.push_back(StrDimH(tensor_filter) + CostRedis(node, node_name_to_strategy,
mode = {{1, 1, 0.5, 1}, {1, 1, 1, 1}, {1, 1, 0.5, 1}}, graph));
cost_op.push_back(cost_in_);
}
if (output_tensor_w < 2 || output_tensor_w % 2 != 0) {
if (tensor_h < 2 || tensor_h % 2 != 0) {
cost_op.push_back(DOUBLE_MAX);
} else {
cost_op.push_back(StrDimW(tensor_filter) + CostRedis(node, node_name_to_strategy,
mode = {{1, 1, 1, 0.5}, {1, 1, 1, 1}, {1, 1, 1, 0.5}}, graph));
cost_op.push_back(cost_in_);
}
if (tensor_w < 2 || tensor_w % 2 != 0) {
cost_op.push_back(DOUBLE_MAX);
} else {
cost_op.push_back(cost_in_);
}
return ChoseStr(cost_op, node.apply.str);
}
// Chose strategy for BatchNorm
StrategyRec CostBatchNorm::ChoseStr(const std::vector<double> &cost_op, StrategyRec str) {
// Chose strategy for BatchParallel op
StrategyRec CostBatchParallel::ChoseStr(const std::vector<double> &cost_op, StrategyRec str) {
uint64_t min_position = min_element(cost_op.begin(), cost_op.end()) - cost_op.begin();
if (cost_op[min_position] > (DOUBLE_MAX - 0.1)) {
return str;
@ -700,36 +674,75 @@ StrategyRec CostBatchNorm::ChoseStr(const std::vector<double> &cost_op, Strategy
str.inputTensor[0].str_n /= 2.0;
str.outputTensor.str_n /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_b_;
str.cost = str.cost + cost_in_;
break;
case 1:
str.inputTensor[0].str_c /= 2.0;
str.inputTensor[1].str_c /= 2.0;
str.inputTensor[2].str_c /= 2.0;
str.inputTensor[3].str_c /= 2.0;
str.inputTensor[4].str_c /= 2.0;
str.outputTensor.str_c /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_c_;
str.cost = str.cost + cost_in_;
break;
case 2:
str.inputTensor[0].str_h /= 2.0;
str.outputTensor.str_h /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_h_;
str.cost = str.cost + cost_in_;
break;
case 3:
str.inputTensor[0].str_w /= 2.0;
str.outputTensor.str_w /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_w_;
str.cost = str.cost + cost_in_;
break;
default:
MS_LOG(EXCEPTION) << "Failure: CostBatchNorm failed.";
MS_LOG(EXCEPTION) << "Failure: CostBatchParallel failed.";
}
return str;
}
// Chose strategy for CostSoftmaxCrossEntropyWithLogits
StrategyRec CostSoftmaxCrossEntropyWithLogits::ChoseStr(const std::vector<double> &cost_op, StrategyRec str) {
uint64_t min_position = min_element(cost_op.begin(), cost_op.end()) - cost_op.begin();
if (cost_op[min_position] > (DOUBLE_MAX - 0.1)) {
return str;
}
switch (min_position) {
case 0:
str.inputTensor[0].str_n /= 2.0;
str.inputTensor[1].str_n /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_;
break;
case 1:
str.inputTensor[0].str_c /= 2.0;
str.inputTensor[1].str_c /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_;
break;
case 2:
str.inputTensor[0].str_h /= 2.0;
str.inputTensor[1].str_h /= 2.0;
str.outputTensor.str_w /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_;
break;
case 3:
str.inputTensor[0].str_w /= 2.0;
str.inputTensor[1].str_w /= 2.0;
str.cut_counter += 1;
str.cost = str.cost + cost_in_;
break;
default:
MS_LOG(EXCEPTION) << "Failure: CostSoftmax failed.";
}
return str;
}

View File

@ -28,6 +28,8 @@
namespace mindspore {
namespace parallel {
#define DOUBLE_MAX (std::numeric_limits<double>::max)()
double CostRedis(const Graph::NodeType &node,
const std::vector<std::pair<std::string, StrategyRec>> &node_name_to_strategy,
const std::vector<std::vector<float>> &mode, const Graph &graph);
@ -195,7 +197,6 @@ class CostTensorAdd : public CostCommon {
};
// all the following operation are element-wise and have the same cost
class CostOneHot : public CostCommon {};
class CostReLU : public CostCommon {};
class CostLog : public CostCommon {};
class CostExp : public CostCommon {};
@ -206,50 +207,27 @@ class CostDiv : public CostCommon {};
class CostSqueeze : public CostCommon {};
class CostCast : public CostCommon {};
// class BatchNorm is used to compute the cost of BatchNorm operator.
class CostBatchNorm {
// class BatchParallel is used to compute the cost of BatchParallel operator.
class CostBatchParallel {
public:
StrategyRec GetOptimalStr(const Graph::NodeType &node,
const std::vector<std::pair<std::string, StrategyRec>> &node_name_to_strategy,
const Graph &graph);
virtual StrategyRec GetOptimalStr(const Graph::NodeType &node);
double GetMinCostIn(const OperatorRec &op);
virtual double GetMaxCostIn() const { return DOUBLE_MAX; }
private:
double StrDimB(int32_t Tensor) {
cost_in_b_ = (static_cast<double>(Tensor) * 4.0) / 2.0;
protected:
virtual StrategyRec ChoseStr(const std::vector<double> &cost_op, StrategyRec str);
return cost_in_b_;
}
double cost_in_ = 0;
}; // class BatchParallel is used to compute the cost of BatchParallel operator.
double StrDimC() {
cost_in_c_ = 0.0;
return cost_in_c_;
}
double StrDimH(int32_t Tensor) {
cost_in_h_ = (static_cast<double>(Tensor) * 4.0) / 2.0;
return cost_in_h_;
}
double StrDimW(int32_t Tensor) {
cost_in_w_ = (static_cast<double>(Tensor) * 4.0) / 2.0;
return cost_in_w_;
}
class CostBatchNorm : public CostBatchParallel {};
class CostOneHot : public CostBatchParallel {};
class CostPRelu : public CostBatchParallel {};
class CostSoftmax : public CostBatchParallel {};
class CostSoftmaxCrossEntropyWithLogits : public CostBatchParallel {
StrategyRec ChoseStr(const std::vector<double> &cost_op, StrategyRec str);
double cost_in_b_ = 0;
double cost_in_c_ = 0;
double cost_in_h_ = 0;
double cost_in_w_ = 0;
}; // class BatchNorm is used to compute the cost of BatchNorm operator.
};
} // namespace parallel
} // namespace mindspore
#endif // PARALLEL_AUTO_PARALLEL_REC_COST_H_

View File

@ -28,10 +28,10 @@
namespace mindspore {
namespace parallel {
void GenerateStrategy(std::shared_ptr<Graph> graph, const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::shared_ptr<std::vector<std::vector<size_t>>> eli_list,
void GenerateStrategy(const std::shared_ptr<Graph> &graph, const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::shared_ptr<std::vector<std::vector<size_t>>> &eli_list,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> index_list) {
const std::shared_ptr<std::vector<size_t>> &index_list) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(eli_list);
MS_EXCEPTION_IF_NULL(index_list);
@ -127,25 +127,6 @@ std::vector<std::vector<int32_t>> PrepareMatMul(const std::shared_ptr<Graph> &gr
return strategies;
}
std::vector<std::vector<int32_t>> PreparePReLU(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops) {
std::vector<std::vector<int32_t>> strategies = MakeDataParallelStrategy(graph, ops, iter_graph, iter_ops);
strategies[1][0] = 1;
return strategies;
}
std::vector<std::vector<int32_t>> PrepareBatchNorm(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops) {
std::vector<std::vector<int32_t>> strategies = MakeDataParallelStrategy(graph, ops, iter_graph, iter_ops);
for (size_t i = 1; i < strategies.size(); i++) {
strategies[i][0] = strategies[0][1];
}
strategies[1][0] = 1;
return strategies;
}
std::vector<std::vector<int32_t>> PrepareBiasAdd(const std::shared_ptr<std::vector<int32_t>> &s) {
std::vector<std::vector<int32_t>> strategies;
strategies.push_back(*s);
@ -155,18 +136,88 @@ std::vector<std::vector<int32_t>> PrepareBiasAdd(const std::shared_ptr<std::vect
return strategies;
}
std::vector<std::vector<int32_t>> PrepareOneHot(const std::shared_ptr<std::vector<int32_t>> &s) {
std::vector<std::vector<int32_t>> strategies;
std::vector<std::vector<int32_t>> PrepareOneHot(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops) {
std::vector<std::vector<int32_t>> strategies = MakeRecSearchStrategy(graph, ops, iter_graph, iter_ops);
int32_t axis = -1;
auto iter = ops[iter_ops]->attrs().find(AXIS);
if (iter != ops[iter_ops]->attrs().end()) {
MS_EXCEPTION_IF_NULL(iter->second);
if (iter->second->isa<Int32Imm>()) {
axis = iter->second->cast<Int32ImmPtr>()->value();
} else {
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << ": The value of axis is not int.";
}
}
if (axis == -1) {
strategies[0][0] = strategies[0][1];
strategies[0][1] = 1;
graph->nodes[iter_graph].tensor_parm.tensor_str.str_h = graph->nodes[iter_graph].tensor_parm.tensor_str.str_w;
graph->nodes[iter_graph].tensor_parm.tensor_str.str_w = 1.0;
}
std::vector<int32_t> s_empty = {};
strategies.push_back(*s);
strategies.push_back(s_empty);
strategies.push_back(s_empty);
return strategies;
}
std::vector<std::vector<int32_t>> PrepareGatherV2(const std::shared_ptr<std::vector<int32_t>> &s) {
std::vector<std::vector<int32_t>> PrepareGatherV2(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, std::vector<int32_t> s) {
std::vector<std::vector<int32_t>> strategies;
strategies.push_back(*s);
int32_t axis = 0;
auto axis_input = GetValue<int>(ops[iter_ops]->input_value().at(2));
if (axis_input < 0) {
axis_input += SizeToInt(ops[iter_ops]->inputs_tensor_info()[0].shape().size());
}
axis = axis_input;
if (axis >= SizeToInt(s.size())) {
MS_LOG(EXCEPTION) << "Failure: GatherV2' axis out of range.";
}
s[axis] = 1;
strategies.push_back(s);
auto pos = ops[iter_ops]->name().find("Info");
auto name = ops[iter_ops]->name().substr(0, pos);
if (name == "GatherV2") {
return strategies;
}
std::vector<int32_t> s_indices;
for (size_t i = 0; i < ops[iter_ops]->inputs_tensor_info()[1].shape().size(); i++) {
s_indices.push_back(1);
}
strategies.push_back(s_indices);
return strategies;
}
std::vector<std::vector<int32_t>> PrepareL2Normalize(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, std::vector<int32_t> s) {
int32_t axis = 0;
auto iter = ops[iter_ops]->attrs().find(AXIS);
if (iter != ops[iter_ops]->attrs().end()) {
MS_EXCEPTION_IF_NULL(iter->second);
if (iter->second->isa<Int32Imm>()) {
axis = iter->second->cast<Int32ImmPtr>()->value();
} else {
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << " : The value of axis is not int.";
}
}
int32_t axis_index = axis;
if (axis < 0) {
size_t input_dim = ops[iter_ops]->inputs_tensor_info()[0].shape().size();
axis_index = static_cast<int32_t>(input_dim) + axis;
}
s[IntToSize(axis_index)] = 1;
std::vector<std::vector<int32_t>> strategies;
strategies.push_back(s);
return strategies;
}
@ -209,7 +260,7 @@ std::vector<std::vector<int32_t>> MakeRecSearchStrategy(const std::shared_ptr<Gr
} else if (output_size == 0) {
s = {};
} else {
MS_LOG(ERROR) << "Tensor's output size is unexcepted.";
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << ": Tensor's output size is unexcepted.";
}
strategies.push_back(s);
}
@ -229,7 +280,7 @@ std::vector<std::vector<int32_t>> MakeDataParallelStrategy(const std::shared_ptr
StrategyPtr origin_strategy = ops[iter_ops]->strategy();
std::vector<std::vector<int32_t>> strategies;
size_t max_device_num = g_device_manager->DeviceNum();
size_t target_tensor_batch = ops[iter_ops]->outputs_tensor_info()[0].shape()[0];
size_t target_tensor_batch = ops[iter_ops]->inputs_tensor_info()[0].shape()[0];
for (size_t iter_op_inputs = 0; iter_op_inputs < ops[iter_ops]->inputs_tensor_info().size(); iter_op_inputs++) {
if (iter_op_inputs >= origin_strategy->GetInputDim().size()) {
MS_LOG(EXCEPTION) << "Failure: Strategy's InputDim out of range.";
@ -244,8 +295,10 @@ std::vector<std::vector<int32_t>> MakeDataParallelStrategy(const std::shared_ptr
} else {
s.push_back(1);
}
} else if (input_size == 0) {
s = {};
} else {
MS_LOG(ERROR) << "Tensor's shape is unknown.";
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << ": Tensor's shape is unknown.";
}
}
strategies.push_back(s);
@ -285,25 +338,20 @@ std::vector<std::vector<int32_t>> PrepareStrategy(const std::shared_ptr<Graph> &
if (type == MATMUL) {
return PrepareMatMul(graph, ops, iter_graph, iter_ops);
} else if (type == PRELU) {
return PreparePReLU(graph, ops, iter_graph, iter_ops);
} else if (type == BATCH_NORM) {
return PrepareBatchNorm(graph, ops, iter_graph, iter_ops);
} else if (type == SOFTMAX || type == LOG_SOFTMAX || type == SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS ||
type == SOFTMAX_CROSS_ENTROPY_WITH_LOGITS) {
return MakeDataParallelStrategy(graph, ops, iter_graph, iter_ops);
} else if (type == ONEHOT) {
return PrepareOneHot(graph, ops, iter_graph, iter_ops);
} else {
return MakeRecSearchStrategy(graph, ops, iter_graph, iter_ops);
}
}
void GeneratePartitionedOperatorStrategy(const std::shared_ptr<Graph> graph,
void GeneratePartitionedOperatorStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::shared_ptr<std::vector<size_t>> index_list) {
const std::shared_ptr<std::vector<size_t>> &index_list) {
for (size_t iter_ops = 0; iter_ops < (size_t)index_list->size(); iter_ops++) {
std::vector<std::vector<int32_t>> strategies;
size_t iter_graph = index_list->at(iter_ops);
if (iter_graph != SIZE_MAX) {
if (iter_graph != SIZE_MAX && ops[iter_ops]->type() != GET_NEXT) {
strategies = PrepareStrategy(graph, ops, iter_graph, iter_ops);
}
StrategyPtr sp = std::make_shared<Strategy>(0, strategies);
@ -328,7 +376,7 @@ size_t FindIndexOfOperatorIncoming(const std::vector<std::vector<std::string>> &
return incoming_op_index;
}
std::vector<int32_t> CopyIncomingOperatorOutputStrategy(const std::shared_ptr<Graph> graph,
std::vector<int32_t> CopyIncomingOperatorOutputStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, const size_t iter_graph) {
std::vector<int32_t> s;
@ -348,7 +396,7 @@ std::vector<int32_t> CopyIncomingOperatorOutputStrategy(const std::shared_ptr<Gr
s.push_back(1 / graph->nodes[iter_graph].tensor_parm.tensor_str.str_h);
s.push_back(1 / graph->nodes[iter_graph].tensor_parm.tensor_str.str_w);
} else {
MS_LOG(ERROR) << "Tensor's shape is unknown.";
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << ": Tensor's shape is unknown.";
}
break;
}
@ -358,7 +406,8 @@ std::vector<int32_t> CopyIncomingOperatorOutputStrategy(const std::shared_ptr<Gr
std::vector<int32_t> PrepareIncomingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t incoming_op_index) {
std::vector<int32_t> s;
if (ops[incoming_op_index]->type() == RESHAPE || ops[incoming_op_index]->type() == GATHERV2) {
if (ops[incoming_op_index]->type() == RESHAPE || ops[incoming_op_index]->type() == GATHERV2 ||
ops[incoming_op_index]->type() == TRANSPOSE) {
return s;
}
auto strategy = ops[incoming_op_index]->selected_strategy();
@ -426,13 +475,23 @@ std::vector<int32_t> ModifyStrategyIfSqueezeIncoming(const std::vector<std::shar
return s_Squeeze;
}
bool GetKeepDims(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops) {
bool keepdims = false;
auto keep_dims_iter = ops[iter_ops]->attrs().find(KEEP_DIMS);
if (keep_dims_iter == ops[iter_ops]->attrs().end()) {
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << ": Don't have attr keep_dims.";
}
MS_EXCEPTION_IF_NULL(keep_dims_iter->second);
if (!keep_dims_iter->second->isa<BoolImm>()) {
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << ": Keep_dims is not a bool.";
}
keepdims = keep_dims_iter->second->cast<BoolImmPtr>()->value();
return keepdims;
}
std::vector<int32_t> GetDimList(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops) {
std::vector<int32_t> dim_list;
bool keep_dims;
if (!ops[iter_ops]->attrs().find(KEEP_DIMS)->second->isa<BoolImm>()) {
MS_LOG(EXCEPTION) << "Failure: Parameter keep_dims is not a boolean value." << std::endl;
}
keep_dims = ops[iter_ops]->attrs().find(KEEP_DIMS)->second->cast<BoolImmPtr>()->value();
bool keep_dims = GetKeepDims(ops, iter_ops);
if (keep_dims != false) {
return dim_list;
}
@ -478,6 +537,62 @@ std::vector<int32_t> ModifyStrategyIfReduceIncoming(const std::vector<std::share
return s_Reduce;
}
std::vector<int32_t> GetDimListFromAttrs(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops) {
std::vector<int32_t> dim_list;
auto iter = ops[iter_ops]->attrs().find(AXIS);
if (iter == ops[iter_ops]->attrs().end()) {
MS_LOG(EXCEPTION) << ops[iter_ops]->name() << ": Don't have attr axis.";
}
auto input_dim = ops[iter_ops]->inputs_tensor_info()[0].shape().size();
MS_EXCEPTION_IF_NULL(iter->second);
if (iter->second->isa<ValueTuple>()) {
auto attr_axis = GetValue<std::vector<int>>(iter->second);
if (attr_axis.empty()) {
for (size_t i = 0; i < input_dim; ++i) {
dim_list.push_back(SizeToInt(i));
}
} else {
for (auto &axis : attr_axis) {
axis < 0 ? dim_list.push_back(axis + SizeToInt(input_dim)) : dim_list.push_back(axis);
}
}
} else if (iter->second->isa<Int32Imm>()) {
int axis = GetValue<int>(iter->second);
axis < 0 ? dim_list.push_back(axis + SizeToInt(input_dim)) : dim_list.push_back(axis);
} else {
MS_LOG(EXCEPTION) << "Axis type is invalid.";
}
return dim_list;
}
std::vector<int32_t> ModifyStrategyIfArgIncoming(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t incoming_op_index, std::vector<int32_t> s) {
bool keepdims = GetKeepDims(ops, incoming_op_index);
if (keepdims) {
return s;
}
std::vector<int32_t> s_Arg;
std::vector<int32_t> axis_list;
for (size_t i = 0; i < s.size(); i++) {
axis_list.push_back(i);
}
auto dim_list = GetDimListFromAttrs(ops, incoming_op_index);
for (auto axis : dim_list) {
auto it = find(axis_list.begin(), axis_list.end(), axis);
if (it == axis_list.end()) {
MS_LOG(EXCEPTION) << "Failure: Can not find dimension indexes in Axis." << std::endl;
}
axis_list.erase(it);
}
for (size_t i = 0; i < (size_t)axis_list.size(); i++) {
s_Arg.push_back(s[axis_list[i]]);
}
return s_Arg;
}
std::vector<int32_t> CopyIncomingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, const size_t incoming_op_index) {
std::vector<int32_t> s;
@ -490,6 +605,9 @@ std::vector<int32_t> CopyIncomingOperatorInputStrategy(const std::vector<std::sh
ops[incoming_op_index]->type() == REDUCE_MIN || ops[incoming_op_index]->type() == REDUCE_MEAN) {
s = ModifyStrategyIfReduceIncoming(ops, incoming_op_index, s);
}
if (ops[incoming_op_index]->type() == ARGMAXWITHVALUE || ops[incoming_op_index]->type() == ARGMINWITHVALUE) {
s = ModifyStrategyIfArgIncoming(ops, incoming_op_index, s);
}
}
return s;
}
@ -513,11 +631,11 @@ std::vector<std::vector<int32_t>> GenerateStrategiesFromStrategy(const std::vect
if (ops[iter_ops]->type() == BIAS_ADD) {
return PrepareBiasAdd(s_ptr);
}
if (ops[iter_ops]->type() == ONEHOT) {
return PrepareOneHot(s_ptr);
}
if (ops[iter_ops]->type() == GATHERV2) {
return PrepareGatherV2(s_ptr);
return PrepareGatherV2(ops, iter_ops, basic_stra);
}
if (ops[iter_ops]->type() == L2_NORMALIZE) {
return PrepareL2Normalize(ops, iter_ops, basic_stra);
}
for (size_t iter_op_inputs = 0; iter_op_inputs < (size_t)ops[iter_ops]->inputs_tensor_info().size();
@ -544,11 +662,11 @@ std::vector<std::vector<int32_t>> GenerateStrategiesFromStrategy(const std::vect
return stra;
}
void GenerateEliminatedOperatorStrategyForward(const std::shared_ptr<Graph> graph,
void GenerateEliminatedOperatorStrategyForward(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> index_list,
const std::shared_ptr<std::vector<size_t>> no_stra_op_list) {
const std::shared_ptr<std::vector<size_t>> &index_list,
const std::shared_ptr<std::vector<size_t>> &no_stra_op_list) {
if (no_stra_op_list->size() == 0) {
return;
}
@ -559,7 +677,7 @@ void GenerateEliminatedOperatorStrategyForward(const std::shared_ptr<Graph> grap
std::vector<std::vector<int32_t>> stra;
std::vector<int32_t> s;
size_t incoming_op_index = FindIndexOfOperatorIncoming(input_tensor_names, iter_ops);
if (incoming_op_index != SIZE_MAX && ops[iter_ops]->type() != ONEHOT) {
if (incoming_op_index != SIZE_MAX) {
auto iter_graph = index_list->at(incoming_op_index);
if (iter_graph != SIZE_MAX) {
s = CopyIncomingOperatorOutputStrategy(graph, ops, iter_ops, iter_graph);
@ -617,7 +735,8 @@ std::vector<int32_t> CopyOutgoingOperatorInputStrategy(const std::vector<std::sh
std::vector<int32_t> s;
if (ops[iter_ops]->type() == REDUCE_MAX || ops[iter_ops]->type() == REDUCE_MIN ||
ops[iter_ops]->type() == REDUCE_SUM || ops[iter_ops]->type() == REDUCE_MEAN || ops[iter_ops]->type() == RESHAPE ||
ops[iter_ops]->type() == GATHERV2) {
ops[iter_ops]->type() == GATHERV2 || ops[iter_ops]->type() == TRANSPOSE ||
ops[iter_ops]->type() == ARGMAXWITHVALUE || ops[iter_ops]->type() == ARGMINWITHVALUE) {
return s;
}
@ -640,7 +759,7 @@ std::vector<int32_t> CopyOutgoingOperatorInputStrategy(const std::vector<std::sh
}
if (outgoing_op_index != SIZE_MAX && iter_op_inputs != SIZE_MAX) {
for (size_t k = 0; k < ops[outgoing_op_index]->selected_strategy()->GetInputDim()[iter_op_inputs].size(); ++k) {
for (size_t k = 0; k < ops[iter_ops]->outputs_tensor_info()[0].shape().size(); ++k) {
s.push_back(ops[outgoing_op_index]->selected_strategy()->GetInputDim()[iter_op_inputs][k]);
}
}
@ -649,7 +768,7 @@ std::vector<int32_t> CopyOutgoingOperatorInputStrategy(const std::vector<std::sh
void GenerateEliminatedOperatorStrategyBackward(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> no_stra_op_list) {
const std::shared_ptr<std::vector<size_t>> &no_stra_op_list) {
if (no_stra_op_list->size() == 0) {
return;
}
@ -679,16 +798,16 @@ void GenerateEliminatedOperatorStrategyBackward(const std::vector<std::shared_pt
}
}
void GenerateRemainingOperatorStrategy(const std::shared_ptr<Graph> graph,
void GenerateRemainingOperatorStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> index_list,
const std::shared_ptr<std::vector<size_t>> no_stra_op_list) {
const std::shared_ptr<std::vector<size_t>> &index_list,
const std::shared_ptr<std::vector<size_t>> &no_stra_op_list) {
if (no_stra_op_list->size() == 0) {
return;
}
size_t no_stra_op_list_size;
size_t no_stra_op_list_size = no_stra_op_list->size();
do {
no_stra_op_list_size = no_stra_op_list->size();
GenerateEliminatedOperatorStrategyForward(graph, ops, input_tensor_names, index_list, no_stra_op_list);

View File

@ -27,22 +27,21 @@
namespace mindspore {
namespace parallel {
void GenerateStrategy(std::shared_ptr<Graph> graph, const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::shared_ptr<std::vector<std::vector<size_t>>> eli_list,
void GenerateStrategy(const std::shared_ptr<Graph> &graph, const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::shared_ptr<std::vector<std::vector<size_t>>> &eli_list,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> index_list);
const std::shared_ptr<std::vector<size_t>> &index_list);
std::vector<std::vector<int32_t>> PrepareMatMul(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
std::vector<std::vector<int32_t>> PreparePReLU(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
std::vector<std::vector<int32_t>> PrepareBatchNorm(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
std::vector<std::vector<int32_t>> PrepareBiasAdd(const std::shared_ptr<std::vector<int32_t>> &s);
std::vector<std::vector<int32_t>> PrepareOneHot(const std::shared_ptr<std::vector<int32_t>> &s);
std::vector<std::vector<int32_t>> PrepareGatherV2(const std::shared_ptr<std::vector<int32_t>> &s);
std::vector<std::vector<int32_t>> PrepareOneHot(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
std::vector<std::vector<int32_t>> PrepareGatherV2(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, std::vector<int32_t> s);
std::vector<std::vector<int32_t>> PrepareL2Normalize(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, std::vector<int32_t> s);
std::vector<std::vector<int32_t>> MakeRecSearchStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
@ -52,12 +51,12 @@ std::vector<std::vector<int32_t>> MakeDataParallelStrategy(const std::shared_ptr
std::vector<std::vector<int32_t>> PrepareStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_graph, const size_t iter_ops);
void GeneratePartitionedOperatorStrategy(const std::shared_ptr<Graph> graph,
void GeneratePartitionedOperatorStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::shared_ptr<std::vector<size_t>> index_list);
const std::shared_ptr<std::vector<size_t>> &index_list);
size_t FindIndexOfOperatorIncoming(const std::vector<std::vector<std::string>> &input_tensor_names,
const size_t iter_ops);
std::vector<int32_t> CopyIncomingOperatorOutputStrategy(const std::shared_ptr<Graph> graph,
std::vector<int32_t> CopyIncomingOperatorOutputStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, const size_t iter_graph);
std::vector<int32_t> PrepareIncomingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
@ -65,19 +64,23 @@ std::vector<int32_t> PrepareIncomingOperatorInputStrategy(const std::vector<std:
std::vector<int32_t> GetAxisList(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const int iter_ops);
std::vector<int32_t> ModifyStrategyIfSqueezeIncoming(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t incoming_op_index, std::vector<int32_t> s);
bool GetKeepDims(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops);
std::vector<int32_t> GetDimList(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops);
std::vector<int32_t> ModifyStrategyIfReduceIncoming(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t incoming_op_index, std::vector<int32_t> s);
std::vector<int32_t> GetDimListFromAttrs(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops);
std::vector<int32_t> ModifyStrategyIfArgIncoming(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t incoming_op_index, std::vector<int32_t> s);
std::vector<int32_t> CopyIncomingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, const size_t incoming_op_index);
std::vector<std::vector<int32_t>> GenerateStrategiesFromStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops,
std::vector<int32_t> basic_stra);
void GenerateEliminatedOperatorStrategyForward(std::shared_ptr<Graph> graph,
void GenerateEliminatedOperatorStrategyForward(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> index_list,
const std::shared_ptr<std::vector<size_t>> no_stra_op_list);
const std::shared_ptr<std::vector<size_t>> &index_list,
const std::shared_ptr<std::vector<size_t>> &no_stra_op_list);
std::vector<int32_t> ModifyStrategyIfSqueezeOutgoing(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const size_t iter_ops, std::vector<int32_t> s);
std::vector<int32_t> CopyOutgoingOperatorInputStrategy(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
@ -85,12 +88,12 @@ std::vector<int32_t> CopyOutgoingOperatorInputStrategy(const std::vector<std::sh
const size_t iter_ops);
void GenerateEliminatedOperatorStrategyBackward(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> no_stra_op_list);
void GenerateRemainingOperatorStrategy(const std::shared_ptr<Graph> graph,
const std::shared_ptr<std::vector<size_t>> &no_stra_op_list);
void GenerateRemainingOperatorStrategy(const std::shared_ptr<Graph> &graph,
const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names,
const std::shared_ptr<std::vector<size_t>> index_list,
const std::shared_ptr<std::vector<size_t>> no_stra_op_list);
const std::shared_ptr<std::vector<size_t>> &index_list,
const std::shared_ptr<std::vector<size_t>> &no_stra_op_list);
} // namespace parallel
} // namespace mindspore
#endif // PARALLEL_AUTO_PARALLEL_REC_GENERATE_STRATEGY_H_

View File

@ -38,6 +38,7 @@ enum OperatorType {
kRecBiasAdd,
kRecSoftmax,
kRecSparseSoftmaxCrossEntropyWithLogits,
kRecSoftmaxCrossEntropyWithLogits,
kRecOneHot,
kRecLog,
kRecExp,
@ -49,7 +50,8 @@ enum OperatorType {
kRecCast,
kRecReduce,
kRecPReLU,
kRecGatherV2
kRecGatherV2,
kRecArgWithValue
};
enum InfoType { kApplication, kConstant };

View File

@ -40,7 +40,7 @@ const TensorParam MakeTensor(int n, int c, int h, int w) {
return tensor;
}
Graph::NodeType MakeNewOperator(std::vector<std::shared_ptr<OperatorInfo>> ops, size_t iter_ops) {
Graph::NodeType MakeNewOperator(const std::vector<std::shared_ptr<OperatorInfo>> &ops, size_t iter_ops) {
Graph::NodeType NewOp;
NewOp.name = ops[iter_ops]->name();
NewOp.info = InfoType::kApplication;
@ -140,7 +140,7 @@ std::shared_ptr<Graph> ParseGraph(const std::vector<std::shared_ptr<OperatorInfo
return graph;
}
void MakeEdge(const std::vector<std::vector<std::string>> &input_tensor_names, std::shared_ptr<Graph> graph) {
void MakeEdge(const std::vector<std::vector<std::string>> &input_tensor_names, const std::shared_ptr<Graph> &graph) {
for (size_t iter_i = 0; iter_i < input_tensor_names.size(); iter_i++) {
for (size_t iter_j = 1; iter_j < input_tensor_names[iter_i].size(); iter_j++) {
size_t head_node_index = GetIndexInInputTensorNames(input_tensor_names, input_tensor_names[iter_i][iter_j]);
@ -163,8 +163,8 @@ size_t GetIndexInInputTensorNames(const std::vector<std::vector<std::string>> &i
return SIZE_MAX;
}
void Eliminate_Aux(const size_t node_index, const std::shared_ptr<Graph> graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> eli_list) {
void Eliminate_Aux(const size_t node_index, const std::shared_ptr<Graph> &graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> &eli_list) {
std::vector<size_t> eli;
eli.push_back(node_index);
for (size_t i = 0; i < (size_t)graph->nodes[node_index].node_out.size(); i++) {
@ -211,18 +211,18 @@ void Eliminate_Aux(const size_t node_index, const std::shared_ptr<Graph> graph,
}
}
std::shared_ptr<Graph> EliminateGraph(const std::shared_ptr<Graph> graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> eli_list,
const std::shared_ptr<std::vector<size_t>> index_list) {
std::shared_ptr<Graph> EliminateGraph(const std::shared_ptr<Graph> &graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> &eli_list,
const std::shared_ptr<std::vector<size_t>> &index_list) {
MS_EXCEPTION_IF_NULL(graph);
const std::set<OperatorType> type_list = {
OperatorType::kRecOneHot, OperatorType::kRecReLU, OperatorType::kRecLog, OperatorType::kRecExp,
OperatorType::kRecAdd, OperatorType::kRecElmWiseOp, OperatorType::kRecBiasAdd, OperatorType::kRecSub,
OperatorType::kRecMul, OperatorType::kRecDiv, OperatorType::kRecSqueeze, OperatorType::kRecReduce,
OperatorType::kRecCast, OperatorType::kRecReshape, OperatorType::kRecGatherV2};
static const std::set<OperatorType> elementwise_type = {
OperatorType::kRecReLU, OperatorType::kRecLog, OperatorType::kRecExp, OperatorType::kRecAdd,
OperatorType::kRecElmWiseOp, OperatorType::kRecBiasAdd, OperatorType::kRecSub, OperatorType::kRecMul,
OperatorType::kRecDiv, OperatorType::kRecSqueeze, OperatorType::kRecReduce, OperatorType::kRecCast,
OperatorType::kRecReshape, OperatorType::kRecGatherV2, OperatorType::kRecArgWithValue};
for (size_t node_index = 0; node_index < (size_t)graph->nodes.size(); node_index++) {
auto type = graph->nodes[node_index].apply.op_type;
if (type_list.find(type) != type_list.end()) {
if (elementwise_type.find(type) != elementwise_type.end()) {
Eliminate_Aux(node_index, graph, eli_list);
}
}
@ -250,12 +250,22 @@ std::shared_ptr<Graph> EliminateGraph(const std::shared_ptr<Graph> graph,
new_graph->nodes.push_back(graph->nodes[i]);
auto *node_in = &new_graph->nodes[index_list->at(i)].node_in;
for (size_t j = 0; j < node_in->size(); j++) {
node_in->at(j) = index_list->at(node_in->at(j));
for (size_t j = node_in->size(); j > 0; j--) {
bool IsEliminated = (index_list->at(node_in->at(j - 1)) == SIZE_MAX);
if (IsEliminated) {
node_in->erase(node_in->begin() + j - 1);
} else {
node_in->at(j - 1) = index_list->at(node_in->at(j - 1));
}
}
auto *node_out = &new_graph->nodes[index_list->at(i)].node_out;
for (size_t j = 0; j < node_out->size(); j++) {
node_out->at(j) = index_list->at(node_out->at(j));
for (size_t j = node_out->size(); j > 0; j--) {
bool IsEliminated = (index_list->at(node_out->at(j - 1)) == SIZE_MAX);
if (IsEliminated) {
node_out->erase(node_out->begin() + j - 1);
} else {
node_out->at(j - 1) = index_list->at(node_out->at(j - 1));
}
}
}
return new_graph;

View File

@ -47,6 +47,8 @@ const std::map<std::string, OperatorType> DictOpType{
{REDUCE_MIN, OperatorType::kRecReduce},
{REDUCE_MEAN, OperatorType::kRecReduce},
{GATHERV2, OperatorType::kRecGatherV2},
{ARGMAXWITHVALUE, OperatorType::kRecArgWithValue},
{ARGMINWITHVALUE, OperatorType::kRecArgWithValue},
{RELU, OperatorType::kRecReLU},
{"ReLU6", OperatorType::kRecReLU},
@ -59,6 +61,7 @@ const std::map<std::string, OperatorType> DictOpType{
{PRELU, OperatorType::kRecPReLU},
{TRANSPOSE, OperatorType::kRecElmWiseOp},
{L2_NORMALIZE, OperatorType::kRecElmWiseOp},
{TENSOR_ADD, OperatorType::kRecElmWiseOp},
{SUB, OperatorType::kRecElmWiseOp},
@ -67,7 +70,7 @@ const std::map<std::string, OperatorType> DictOpType{
{REAL_DIV, OperatorType::kRecElmWiseOp},
{SOFTMAX, OperatorType::kRecSoftmax},
{LOG_SOFTMAX, OperatorType::kRecSoftmax},
{SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, OperatorType::kRecSoftmax},
{SOFTMAX_CROSS_ENTROPY_WITH_LOGITS, OperatorType::kRecSoftmaxCrossEntropyWithLogits},
{SQRT, OperatorType::kRecElmWiseOp},
{NEG, OperatorType::kRecElmWiseOp},
{POW, OperatorType::kRecElmWiseOp},
@ -107,7 +110,7 @@ const std::map<std::string, OperatorType> DictOpType{
const TensorParam MakeTensor(int n, int c, int h, int w);
Graph::NodeType MakeNewOperator(std::vector<std::shared_ptr<OperatorInfo>> ops, size_t iter_ops);
Graph::NodeType MakeNewOperator(const std::vector<std::shared_ptr<OperatorInfo>> &ops, size_t iter_ops);
OperatorRec CompleteOperatorInputs(const std::vector<std::shared_ptr<OperatorInfo>> &ops, const size_t iter_ops,
Graph::NodeType NewTensor);
@ -118,17 +121,17 @@ TensorParam Complete2DInputs(const std::vector<std::shared_ptr<OperatorInfo>> &o
std::shared_ptr<Graph> ParseGraph(const std::vector<std::shared_ptr<OperatorInfo>> &ops,
const std::vector<std::vector<std::string>> &input_tensor_names);
void MakeEdge(const std::vector<std::vector<std::string>> &input_tensor_names, std::shared_ptr<Graph> graph);
void MakeEdge(const std::vector<std::vector<std::string>> &input_tensor_names, const std::shared_ptr<Graph> &graph);
size_t GetIndexInInputTensorNames(const std::vector<std::vector<std::string>> &input_tensor_names,
const std::string &input_name);
void Eliminate_Aux(const size_t node_index, const std::shared_ptr<Graph> graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> eli_list);
void Eliminate_Aux(const size_t node_index, const std::shared_ptr<Graph> &graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> &eli_list);
std::shared_ptr<Graph> EliminateGraph(const std::shared_ptr<Graph> graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> eli_list,
const std::shared_ptr<std::vector<size_t>> index_list);
std::shared_ptr<Graph> EliminateGraph(const std::shared_ptr<Graph> &graph,
const std::shared_ptr<std::vector<std::vector<size_t>>> &eli_list,
const std::shared_ptr<std::vector<size_t>> &index_list);
} // namespace parallel
} // namespace mindspore
#endif // PARALLEL_AUTO_PARALLEL_REC_PARSE_GRAPH_H_

View File

@ -68,19 +68,24 @@ double GetWeights(const Graph::NodeType &node) {
auto cost_ptr = std::make_shared<CostBiasAdd>();
return cost_ptr->GetMinCostIn();
} else if (op.op_type == OperatorType::kRecOneHot || op.op_type == OperatorType::kRecLog ||
op.op_type == OperatorType::kRecExp || op.op_type == OperatorType::kRecAdd ||
op.op_type == OperatorType::kRecSub || op.op_type == OperatorType::kRecMul ||
op.op_type == OperatorType::kRecDiv || op.op_type == OperatorType::kRecSqueeze ||
op.op_type == OperatorType::kRecCast) {
} else if (op.op_type == OperatorType::kRecLog || op.op_type == OperatorType::kRecExp ||
op.op_type == OperatorType::kRecAdd || op.op_type == OperatorType::kRecSub ||
op.op_type == OperatorType::kRecMul || op.op_type == OperatorType::kRecDiv ||
op.op_type == OperatorType::kRecSqueeze || op.op_type == OperatorType::kRecCast) {
// For element-wise op
auto cost_ptr = std::make_shared<CostCommon>();
return cost_ptr->GetMinCostIn();
} else if (op.op_type == OperatorType::kRecUnkownType || op.op_type == OperatorType::kRecPReLU ||
op.op_type == OperatorType::kRecBatchNorm || op.op_type == OperatorType::kRecSoftmax ||
op.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) {
// For unprocessed type
} else if (op.op_type == OperatorType::kRecBatchNorm || op.op_type == OperatorType::kRecOneHot ||
op.op_type == OperatorType::kRecPReLU || op.op_type == OperatorType::kRecSoftmax ||
op.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits ||
op.op_type == OperatorType::kRecSoftmaxCrossEntropyWithLogits) {
// For BatchParallel op
auto cost_ptr = std::make_shared<CostBatchParallel>();
return cost_ptr->GetMaxCostIn();
} else if (op.op_type == OperatorType::kRecUnkownType) {
// For Unkown type
return 0.0;
} else {
MS_LOG(EXCEPTION) << "Failure: GetOperatorWeight failed.";
@ -88,7 +93,7 @@ double GetWeights(const Graph::NodeType &node) {
}
// Sort all the nodes by their weights
std::vector<size_t> SortByWeight(const std::shared_ptr<Graph> graph) {
std::vector<size_t> SortByWeight(const std::shared_ptr<Graph> &graph) {
MS_EXCEPTION_IF_NULL(graph);
std::vector<std::pair<double, size_t>> weight_to_node_index;
@ -119,7 +124,7 @@ std::vector<size_t> SortByWeight(const std::shared_ptr<Graph> graph) {
// Get optimal strategy to partition the target node
StrategyRec PartitionNode(const Graph::NodeType &node,
const std::vector<std::pair<std::string, StrategyRec>> &node_name_to_strategy,
std::shared_ptr<Graph> graph) {
const std::shared_ptr<Graph> &graph) {
bool enable_conv_chw_partition = false;
MS_EXCEPTION_IF_NULL(graph);
@ -158,19 +163,26 @@ StrategyRec PartitionNode(const Graph::NodeType &node,
auto cost_ptr = std::make_shared<CostBiasAdd>();
return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph);
} else if (node.apply.op_type == OperatorType::kRecOneHot || node.apply.op_type == OperatorType::kRecLog ||
node.apply.op_type == OperatorType::kRecExp || node.apply.op_type == OperatorType::kRecAdd ||
node.apply.op_type == OperatorType::kRecSub || node.apply.op_type == OperatorType::kRecMul ||
node.apply.op_type == OperatorType::kRecDiv || node.apply.op_type == OperatorType::kRecSqueeze ||
node.apply.op_type == OperatorType::kRecCast) {
} else if (node.apply.op_type == OperatorType::kRecLog || node.apply.op_type == OperatorType::kRecExp ||
node.apply.op_type == OperatorType::kRecAdd || node.apply.op_type == OperatorType::kRecSub ||
node.apply.op_type == OperatorType::kRecMul || node.apply.op_type == OperatorType::kRecDiv ||
node.apply.op_type == OperatorType::kRecSqueeze || node.apply.op_type == OperatorType::kRecCast) {
// For element-wise op
auto cost_ptr = std::make_shared<CostCommon>();
return cost_ptr->GetOptimalStr(node, node_name_to_strategy, *graph);
} else if (node.apply.op_type == OperatorType::kRecUnkownType || node.apply.op_type == OperatorType::kRecPReLU ||
node.apply.op_type == OperatorType::kRecBatchNorm || node.apply.op_type == OperatorType::kRecSoftmax ||
} else if (node.apply.op_type == OperatorType::kRecBatchNorm || node.apply.op_type == OperatorType::kRecOneHot ||
node.apply.op_type == OperatorType::kRecPReLU || node.apply.op_type == kRecSoftmax ||
node.apply.op_type == OperatorType::kRecSparseSoftmaxCrossEntropyWithLogits) {
// For unprocessed type
// For BatchParallel type
auto cost_ptr = std::make_shared<CostBatchParallel>();
return cost_ptr->GetOptimalStr(node);
} else if (node.apply.op_type == OperatorType::kRecSoftmaxCrossEntropyWithLogits) {
// For SoftmaxCrossEntropyWithLogits type
auto cost_ptr = std::make_shared<CostSoftmaxCrossEntropyWithLogits>();
return cost_ptr->GetOptimalStr(node);
} else if (node.apply.op_type == OperatorType::kRecUnkownType) {
// For Unkown type
StrategyRec default_strategy;
return default_strategy;
} else {
@ -179,7 +191,8 @@ StrategyRec PartitionNode(const Graph::NodeType &node,
}
// Parttion graph into all devices.
Status PartitionForAllDevices(const size_t num_device, const double device_memory, std::shared_ptr<Graph> graph) {
Status PartitionForAllDevices(const size_t num_device, const double device_memory,
const std::shared_ptr<Graph> &graph) {
if (num_device < 1) {
MS_LOG(EXCEPTION) << "ERROR: Number of devices can't be " << num_device << ".";
}
@ -249,7 +262,7 @@ Graph::NodeType ApplyStrToTensor(Graph::NodeType Node) {
return Node;
}
Status DevicesMemoryControl(const size_t num_device, const double device_memory, std::shared_ptr<Graph> graph) {
Status DevicesMemoryControl(const size_t num_device, const double device_memory, const std::shared_ptr<Graph> &graph) {
MS_EXCEPTION_IF_NULL(graph);
if (num_device == 0) {
MS_LOG(EXCEPTION) << "Failure: device number is 0.";

View File

@ -32,19 +32,19 @@
namespace mindspore {
namespace parallel {
std::vector<size_t> SortByWeight(const std::shared_ptr<Graph> graph);
std::vector<size_t> SortByWeight(const std::shared_ptr<Graph> &graph);
double GetWeights(const Graph::NodeType &node);
StrategyRec PartitionNode(const Graph::NodeType &node,
const std::vector<std::pair<std::string, StrategyRec>> &node_name_to_strategy,
std::shared_ptr<Graph> graph);
const std::shared_ptr<Graph> &graph);
Status PartitionForAllDevices(const size_t num_device, const double device_memory, std::shared_ptr<Graph> graph);
Status PartitionForAllDevices(const size_t num_device, const double device_memory, const std::shared_ptr<Graph> &graph);
Graph::NodeType ApplyStrToTensor(Graph::NodeType Node);
Status DevicesMemoryControl(const size_t num_device, const double device_memory, std::shared_ptr<Graph> graph);
Status DevicesMemoryControl(const size_t num_device, const double device_memory, const std::shared_ptr<Graph> &graph);
size_t GetDataTypeSize(const TensorType &type);
} // namespace parallel

View File

@ -282,7 +282,7 @@ bool VmOptimizeAction(const ResourcePtr &res) { return OptimizeAction(res, kVmPa
bool PynativeOptimizeAction(const ResourcePtr &res) { return OptimizeAction(res, kPynativePasses); }
static bool IsCtrlSink(const FuncGraphPtr &graph) {
static bool IsCtrlSink() {
auto ms_ctx = MsContext::GetInstance();
if (ms_ctx->execution_mode() != kGraphMode) {
return false;
@ -297,10 +297,9 @@ static bool IsCtrlSink(const FuncGraphPtr &graph) {
return false;
}
if (graph != nullptr && CompileGraphs::ContainMixedTarget(graph)) {
if (!ms_ctx->is_multi_graph_sink()) {
return false;
}
return true;
}
@ -310,7 +309,21 @@ bool TaskEmitAction(const ResourcePtr &res) {
}
FuncGraphPtr func_graph = res->func_graph();
auto bc_ptr = res->results()[kBackend].cast<compile::BackendPtr>();
if (IsCtrlSink(func_graph)) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (CompileGraphs::ContainMixedTarget(func_graph)) {
bc_ptr->set_is_multi_graph_sink(false);
context_ptr->set_is_multi_graph_sink(false);
context_ptr->set_loop_sink_flag(false);
} else if (context_ptr->execution_mode() != kPynativeMode) {
std::string device_target = context_ptr->device_target();
if (device_target == kAscendDevice) {
bc_ptr->set_is_multi_graph_sink(true);
context_ptr->set_is_multi_graph_sink(true);
}
}
if (IsCtrlSink()) {
res->results()[kOutput] = bc_ptr->CompileGraph(NOT_NULL(func_graph));
return true;
}
@ -318,19 +331,7 @@ bool TaskEmitAction(const ResourcePtr &res) {
if (bc_ptr->name() == kMsConvert) {
cut_list = compile::GetMsNonlinearOps();
}
std::shared_ptr<CompileGraphs> compile = std::make_shared<CompileGraphs>(bc_ptr, cut_list);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (CompileGraphs::ContainMixedTarget(func_graph)) {
bc_ptr->set_is_multi_graph_sink(false);
context_ptr->set_loop_sink_flag(false);
} else if (context_ptr->execution_mode() != kPynativeMode) {
std::string device_target = context_ptr->device_target();
if (device_target == kAscendDevice) {
bc_ptr->set_is_multi_graph_sink(true);
}
}
res->results()[kOutput] = compile->CompileAndLink(func_graph);
return true;
}
@ -340,11 +341,10 @@ bool ExecuteAction(const ResourcePtr &res) {
MS_LOG(EXCEPTION) << "Execute args error";
}
if (IsCtrlSink(nullptr)) {
if (IsCtrlSink()) {
if (!res->results()[kOutput].is<GraphId>()) {
MS_LOG(EXCEPTION) << "Execute args error";
}
auto graph_id = res->results()[kOutput].cast<GraphId>();
std::shared_ptr<compile::Backend> bc_ptr = res->results()[kBackend].cast<std::shared_ptr<compile::Backend>>();
std::shared_ptr<compile::MsBackend> msbc_ptr = std::dynamic_pointer_cast<compile::MsBackend>(bc_ptr);

View File

@ -515,11 +515,11 @@ using AbstractNullPtr = std::shared_ptr<AbstractNull>;
class AbstractEllipsis : public AbstractBase {
public:
AbstractEllipsis() : AbstractBase(kEllipsis) { set_type(std::make_shared<Ellipsis>()); }
AbstractEllipsis() : AbstractBase(kEllipsis) { set_type(std::make_shared<TypeEllipsis>()); }
~AbstractEllipsis() override = default;
MS_DECLARE_PARENT(AbstractEllipsis, AbstractBase)
TypePtr BuildType() const override { return std::make_shared<Ellipsis>(); }
TypePtr BuildType() const override { return std::make_shared<TypeEllipsis>(); }
bool operator==(const AbstractEllipsis &other) const;
bool operator==(const AbstractBase &other) const override;
AbstractBasePtr Clone() const override { return std::make_shared<AbstractEllipsis>(); }

View File

@ -103,6 +103,7 @@ bool MemReuseUtil::InitDynamicWorkspaceKernelRef() {
bool MemReuseUtil::InitDynamicKernelRef(const KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
graph_ = graph;
is_all_nop_node_ = opt::IsAllNopNode(graph);
if (!InitDynamicOutputKernelRef()) {
MS_LOG(INFO) << "InitDynamicOutputKernelRef fail";
return false;
@ -229,7 +230,14 @@ KernelRefCountPtr MemReuseUtil::GetKernelInputRef(const CNodePtr &kernel, size_t
}
auto input_node = kernel->input(input_idx + 1);
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
auto kernel_input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false);
session::KernelWithIndex kernel_input;
if (is_all_nop_node_) {
// The graph does not remove the nop node.
kernel_input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false);
} else {
// The graph removes the nop node.
kernel_input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, true);
}
if (IsPrimitive(kernel_input.first, prim::kPrimMakeTuple)) {
MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << input_idx << " is MakeTuple";
}
@ -272,7 +280,14 @@ void MemReuseUtil::SetKernelDefInputs() {
// set the inputs of this kernel_def
auto input_node = AnfAlgo::GetInputNode(kernel, i);
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
auto input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false);
session::KernelWithIndex input;
if (is_all_nop_node_) {
// The graph does not remove the nop node.
input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false);
} else {
// The graph removes the nop node.
input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, true);
}
if (IsPrimitive(input.first, prim::kPrimMakeTuple)) {
MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << i << " is MakeTuple";
}
@ -333,11 +348,10 @@ void MemReuseUtil::SetSummaryNodesRefCount() {
}
void MemReuseUtil::SetGraphOutputRefCount() {
auto is_all_nop_node = opt::IsAllNopNode(graph_);
auto nodes = AnfAlgo::GetAllOutput(graph_->output(), {prim::kPrimTupleGetItem});
for (const auto &node : nodes) {
session::KernelWithIndex kernel_input;
if (is_all_nop_node) {
if (is_all_nop_node_) {
// The graph does not remove the nop node.
kernel_input = AnfAlgo::VisitKernelWithReturnType(node, 0, false);
} else {

Some files were not shown because too many files have changed in this diff Show More