Compare commits

...

547 Commits

Author SHA1 Message Date
i-robot cb7b8eac13 !25423 Fix deepspeech link
Merge pull request !25423 from wanyiming/r1.3_patch_dp2
2021-10-28 04:43:40 +00:00
wanyiming ebb699593a fix deepspeech link 2021-10-26 14:54:33 +08:00
i-robot 1da40208b3 !25180 fix dynamic_rnn_grad_fission_v2 dw output shape
Merge pull request !25180 from yuchaojie/r1.3-patch
2021-10-21 01:41:53 +00:00
yuchaojie a9e8058e9d fix dynamic_rnn_grad_fission_v2 output shape 2021-10-20 15:00:05 +08:00
i-robot bfb901e5cc !24558 Fix Windows import mindspore._c_dataengine failure problem for the r1.3 patch
Merge pull request !24558 from xiefangqi/md_fix_windows_dll_r1.3_patch
2021-10-13 01:07:41 +00:00
zhoufeng dda1bcc7e2 update cmake/external_libs/sentencepiece.cmake to solve windows problem
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2021-10-12 18:55:39 +08:00
i-robot a21f471716 !23182 Raise error information for the case that the Profiler output path does not exist.
Merge pull request !23182 from casgj/r1.3-patch
2021-09-14 06:45:14 +00:00
casgj fe56a764b8 update mindspore/profiler/profiling.py.
Raise error information for the case that the Profiler output path does not exist.
2021-09-09 07:49:39 -04:00
i-robot 1e0e5333d7 !23037 Profiler: deal with the case that CPU utilization data may not have information for each pipeline op.
Merge pull request !23037 from casgj/r1.3-patch
2021-09-09 01:14:27 +00:00
gaojing a26b89b323 Profiler: deal with the case that the CPU utilization data may not have information for each pipeline op. 2021-09-07 09:02:49 -04:00
yelihua 934cd74dbe get rank id when set hccl env for single card train 2021-08-17 19:29:41 +08:00
i-robot cfbb0d0dfa !20226 Fix for multigraph wp issue for r1.3
Merge pull request !20226 from parastooashtari/mg_wp_1.3
2021-07-14 01:41:35 +00:00
i-robot f0ae03f60d !20201 Update bert config, add large_acc config
Merge pull request !20201 from chenhaozhe/code_docs_modify_bert_config_1.3
2021-07-14 01:25:36 +00:00
i-robot 0ffdf65db0 !20217 [MS][LITE][TOD] Train Lenet Java code example -- also allow a relative mnist path
Merge pull request !20217 from ehaleva/r1_3_additions
2021-07-14 01:17:02 +00:00
i-robot 4548cd4cda !19955 [MS][LITE][r1.3]fix memory leak in mindrt
Merge pull request !19955 from XianglongZeng/r1.3_
2021-07-14 01:15:07 +00:00
i-robot b60027b8d3 !19774 Use destMax in memcpy_s r1.3
Merge pull request !19774 from h.farahat/tensor_safe_memcpy
2021-07-14 01:13:07 +00:00
i-robot d9493456c8 !20194 [MSLITE] mindrt bug : thread pool error
Merge pull request !20194 from ling/core_1.3
2021-07-14 01:11:14 +00:00
i-robot 99f0a79623 !20181 [MS][LITE] Fixed double declaration of MSTensor::Impl
Merge pull request !20181 from ehaleva/fix_examples
2021-07-14 01:09:06 +00:00
zhangzhenghai 9f3cfb8a4b !20203 Revert "Ensure the correct execution order of the users of parameter or load"
Merge pull request !20203 from Margaret_wangrui/revert_order_enforce_r1.3
2021-07-14 01:03:43 +00:00
zhangzhenghai 46b4878b44 !20150 fix bug of output none
Merge pull request !20150 from limingqi107/r1.3
2021-07-14 01:02:58 +00:00
i-robot c1bc8cc363 !20014 ci format checking issues fix
Merge pull request !20014 from zhangzhaoju/r1.3_ci_format
2021-07-14 00:57:02 +00:00
i-robot 4e54fcb63c !20223 [Debugger] Offline debugger view tensor failed in the async_dump directory
Merge pull request !20223 from TinaMengtingZhang/async_view_tensor_r1_3
2021-07-13 23:49:01 +00:00
Parastoo Ashtari ad277da867 fixed multigraph wp issue 2021-07-13 19:38:24 -04:00
TinaMengtingZhang 9f206de31a fix async view tensor bug 2021-07-13 14:16:29 -04:00
i-robot c38b1ac0ca !20169 fix bug prelu grad
Merge pull request !20169 from zhangbuxue/fix_prelu_grad_bug
2021-07-13 16:47:22 +00:00
Emir Haleva c96a5f8146 fix absolute path in train_lenet_java example 2021-07-13 19:15:45 +03:00
i-robot e9edd3217b !19893 fix online debug overflow wp
Merge pull request !19893 from john_tzanakakis/1.3
2021-07-13 15:10:42 +00:00
Margaret_wangrui 46d1e3cbec Revert "Ensure the correct execution order of the users of parameter or load"
This reverts commit d3b947dfb7.
2021-07-13 22:41:39 +08:00
i-robot 5f61e369b0 !20212 r1.3 Update RELEASE.md
Merge pull request !20212 from luoyang/code_docs_1.3
2021-07-13 14:03:24 +00:00
limingqi107 8d126cb95b fix bug of output none and share weight 2021-07-13 21:59:22 +08:00
YangLuo d9183b2eb5 Update RELEASE.md 2021-07-13 21:45:16 +08:00
i-robot 20174d046e !19883 change kMaliGPU to kGPU
Merge pull request !19883 from yeyunpeng2020/r1.3
2021-07-13 13:39:22 +00:00
i-robot 03aef856c8 !20155 fix naml code compile error
Merge pull request !20155 from 于振华/fix_naml_r1.3
2021-07-13 13:17:27 +00:00
chenhaozhe 76610b48c4 update bert config, add large_acc config 2021-07-13 20:52:29 +08:00
i-robot 21b8579c05 !20195 fix doc
Merge pull request !20195 from lianliguang/code_docs_r1.3
2021-07-13 12:51:19 +00:00
lianliguang 30f3603bea fix doc 2021-07-13 20:37:06 +08:00
i-robot c9115874df !20176 fix the constant fold failure bug of random_standard op
Merge pull request !20176 from wangyanling/r1.3
2021-07-13 12:36:02 +00:00
i-robot 83943b024c !20121 InTopK gpu kernel bugfix
Merge pull request !20121 from Peilin/topk-bugfix-index-r1.3
2021-07-13 12:23:56 +00:00
ling 05060250b7 [MSLITE] mindrt bug 2021-07-13 20:20:43 +08:00
i-robot 47041491ec !20189 Fix Doc for the example
Merge pull request !20189 from huangxinjing/code_docs_fix_indent
2021-07-13 12:19:56 +00:00
zengxianglong bd40673e16 fix memory leak in mindrt 2021-07-13 20:10:15 +08:00
i-robot 65f01f5164 !20100 fix bug of albert task in case PWEcrypt for r1.3
Merge pull request !20100 from zhoushan33/flclient0712_om_r1.3
2021-07-13 11:56:00 +00:00
huangxinjing 3979b8c3ea Fix indentation space 2021-07-13 19:40:00 +08:00
Emir Haleva 400cdbc4b3 Fixed double declaration of MSTensor::Impl 2021-07-13 14:03:34 +03:00
i-robot ae6c95082f !20146 [lite]open test and demo
Merge pull request !20146 from 徐安越/r1.3
2021-07-13 10:51:50 +00:00
yeyunpeng2020 eddde7eac9 new api use gpu 2021-07-13 18:46:08 +08:00
i-robot 1dcfee0ef0 !20075 [MSLITE] isolate input tensor set replace copy
Merge pull request !20075 from ling/r1.3
2021-07-13 09:36:46 +00:00
i-robot 8cad7f2b09 !20160 [MS][LITE] modify nnie scripts
Merge pull request !20160 from jianghui58/fix_NNIE
2021-07-13 09:28:53 +00:00
wangyanling fc210973e5 fix the constant fold failure bug 2021-07-13 17:21:08 +08:00
i-robot a934361e61 !20156 [MS][LITE]Fix himix200 build
Merge pull request !20156 from gongdaguo/r13_fix_himix200
2021-07-13 09:18:43 +00:00
buxue d81f83f7bb fix bug prelu grad 2021-07-13 17:18:32 +08:00
i-robot cb1bce579a !20149 Remove Graph Kernel
Merge pull request !20149 from huangxinjing/code_docs_fix_graph_kernel
2021-07-13 09:05:27 +00:00
i-robot 7f423b9cb1 !20124 CI code warning fixes [r1.3]
Merge pull request !20124 from cathwong/ckw_r1.3_ci_q3_lint2_analyzer
2021-07-13 08:53:25 +00:00
zhangzhenghai 7001ae7921 !20147 code_docs_0713_protobuf
Merge pull request !20147 from code_docs_0713
2021-07-13 07:32:40 +00:00
i-robot a4e0dc936c !20136 fix cmake's bug about normal_standard object library
Merge pull request !20136 from wangyanling/r1.3
2021-07-13 07:21:29 +00:00
ling fc8f0a04e4 [MSLITE] isolate input tensor set replace copy 2021-07-13 15:13:01 +08:00
jianghui58 6e2000c4a7 modify nnie scripts 2021-07-13 15:09:34 +08:00
i-robot fe08086f5d !20151 Add context setting in doc examples
Merge pull request !20151 from lixiaohui33/code_docs_ci_r1.3
2021-07-13 07:02:08 +00:00
i-robot 931b203977 !20050 modify for bert 310infer on r1.3
Merge pull request !20050 from 郑彬/bert0708_r1.3
2021-07-13 06:46:05 +00:00
i-robot e8ef43af98 !20061 add parameter 'config_path' in tinybert on r1.3
Merge pull request !20061 from 郑彬/tinybert0712_r1.3
2021-07-13 06:44:05 +00:00
zhangzhenghai f3204e98d7 !20107 Export IR with FV information.
Merge pull request !20107 from 张清华/r1.3
2021-07-13 06:40:53 +00:00
gongdaguo 382c1a42cb fix himix200 build 2021-07-13 14:39:55 +08:00
i-robot 3230fc8e3e !20109 fix scripts bug
Merge pull request !20109 from jianghui58/fix_NNIE
2021-07-13 06:36:12 +00:00
yuzhenhua c9c5e26063 fix naml code compile error 2021-07-13 14:23:39 +08:00
xuanyue 44650394b9 open test and demo 2021-07-13 14:22:12 +08:00
lixiaohui 4dde80fc7e add context setting in examples due to the default mode shift in mindspore 1.3 2021-07-13 14:20:37 +08:00
i-robot 8729b60a90 !20087 [LITE][TRAIN]fix reduce infer
Merge pull request !20087 from yefeng/131-fix_infer_1.3
2021-07-13 06:12:07 +00:00
i-robot c57e9ab3ec !20013 [MS][LITE]fix bug of control model
Merge pull request !20013 from mengyuanli/sync_r1.3
2021-07-13 06:10:13 +00:00
jianghui58 41d19f60da fix_nnie 2021-07-13 13:13:00 +08:00
huangxinjing c7835b5646 remove graph kernel 2021-07-13 12:21:55 +08:00
dingpeifei f6e1f76dad protobuf 2021-07-13 12:05:33 +08:00
Zhang Qinghua 50a216ea6c Export IR with FV information. 2021-07-13 12:04:55 +08:00
i-robot 1cd02eab24 !20135 update the example results of the Split, AdaptiveAvgPool2D and ReduceMean operators.
Merge pull request !20135 from wangshuide/code_docs_wsd_r1.3
2021-07-13 03:37:20 +00:00
i-robot 074b6c590e !20086 [AutoParallel]fix_Pipeline_parallel_bug
Merge pull request !20086 from lichen/fix_Pipeline_parallel_bug
2021-07-13 02:57:21 +00:00
i-robot f90797f50b !20026 fix tuple_slice bounds
Merge pull request !20026 from huangmengxi/numpy_fix
2021-07-13 02:39:21 +00:00
mengyuanli 45e621388b fix bug of fp16 2021-07-13 10:38:35 +08:00
i-robot 32553e2758 !20125 Fixing core dump issue for bert net in r1.3
Merge pull request !20125 from parastooashtari/bert_issue
2021-07-13 02:29:25 +00:00
i-robot 6216e651fb !20096 modify resnet 310 infer
Merge pull request !20096 from lilei/modify_model_zoo_readme_R1.3
2021-07-13 02:21:22 +00:00
wangyanling a155ec7ee8 fix miss bug of normal_standard op 2021-07-13 10:10:33 +08:00
wangshuide2020 b7a56e1e89 update the example results of the Split, AdaptiveAvgPool2D and ReduceMean operators. 2021-07-13 10:07:37 +08:00
i-robot bda4c5b473 !20004 Add autoaugment to r1.3
Merge pull request !20004 from Luke/r1.3
2021-07-13 01:59:59 +00:00
i-robot 508f015652 !19868 Add dynamic shape info to opinfo
Merge pull request !19868 from zjun/fix_r1.3
2021-07-13 01:21:54 +00:00
i-robot 828970a452 !20057 upgrade_ascend_0710_mindspore
Merge pull request !20057 from mindspore_ding/upgrade_ascend_0710
2021-07-13 01:16:00 +00:00
i-robot 8726b51eae !20076 Handle the backend parameter mismatch caused by maketuple Abstract setting error
Merge pull request !20076 from Margaret_wangrui/order_enforce_enforce_r1.3
2021-07-13 00:35:53 +00:00
Parastoo Ashtari df425aa027 fixed core dump issue in bert net 2021-07-12 17:55:44 -04:00
Cathy Wong de4c3c182e CI code warning fixes. Update file permissions for CSV file. 2021-07-12 16:28:04 -04:00
Peilin Wang cba7b2ed2d initial commit: fix topk index 2021-07-12 16:05:14 -04:00
John Tzanakakis 3f92e23f72 fix online debug overflow wp 2021-07-12 14:11:48 -04:00
dingpeifei 552ba3f6b1 upgrade_ascend_0710_mindspore 2021-07-13 00:24:20 +08:00
i-robot 273d54a4ee !19653 update submodule AKG-r1.3
Merge pull request !19653 from zhengzuohe/mamindspore_master
2021-07-12 14:43:37 +00:00
i-robot ee5ff9d273 !20005 disable mindRT in control flow
Merge pull request !20005 from limingqi107/r1.3
2021-07-12 13:27:42 +00:00
i-robot 7703c631b3 !20025 codex & support windows
Merge pull request !20025 from lanzhineng/r1.3
2021-07-12 13:15:54 +00:00
i-robot e56fb99d67 !19890 MD CI code warning fixes [r1.3]
Merge pull request !19890 from cathwong/ckw_r1.3_ci_q3_codedex1
2021-07-12 12:56:25 +00:00
i-robot c3cb98fca6 !20102 optimize Resnet training speed
Merge pull request !20102 from zhouneng/code_docs_fix_issue_I3W1H1_r1.3
2021-07-12 12:54:06 +00:00
lichenever f0f3086f66 fix_Pipeline_parallel_bug 2021-07-12 20:48:49 +08:00
zhouneng2 8d7e3b216c optimize training speed 2021-07-12 20:28:16 +08:00
lilei 83c741fa2c modify resnet 310 infer 2021-07-12 20:01:05 +08:00
i-robot 93ae1bffcd !20048 [MS][RDR] add trigger function of RDR in CPU backend
Merge pull request !20048 from louie5/r1.3
2021-07-12 11:50:10 +00:00
zhoushan 6d150b139a fix bug of albert task in case PWEcrypt for r1.3 2021-07-12 19:49:28 +08:00
i-robot be411bed0d !20079 update the example of batchnorm and dropout operators.
Merge pull request !20079 from wangshuide/code_docs_wsd_r1.3
2021-07-12 11:48:11 +00:00
i-robot 03c744371a !20033 fix numpy docstrings
Merge pull request !20033 from wangrao124/code_docs_numpy_0712_r1.3
2021-07-12 11:32:25 +00:00
i-robot e271af104b !20089 Fix resnet multi server accuracy.
Merge pull request !20089 from linqingke/code_docs_r1.3
2021-07-12 11:30:08 +00:00
i-robot 36c7d685a1 !20054 Fix the output format problem in the example of Flatten and OneHot operators.
Merge pull request !20054 from dinglinhe/code_docs_dlh_r1.3_I4023Z
2021-07-12 11:24:10 +00:00
i-robot 8f3a3222e0 !20006 Fix an issue of input type check
Merge pull request !20006 from jxlang910/push-tor1.3
2021-07-12 11:22:10 +00:00
i-robot a0f24b6f96 !20082 delete note info in comment of lr_callback
Merge pull request !20082 from wangnan39/code_docs_delete_note_at_lr_callback
2021-07-12 11:21:27 +00:00
i-robot be32cf94c8 !20028 [MS][LITE] Add build function for model path
Merge pull request !20028 from cjh9368/r1.3
2021-07-12 11:02:47 +00:00
yefeng bc72adb691 fix infer 2021-07-12 18:42:27 +08:00
limingqi107 516122c52c disable mindRT in control flow 2021-07-12 18:38:18 +08:00
i-robot f799fa80ea !20017 fix type bug in fp16 PReLU grad
Merge pull request !20017 from zhangbuxue/fix_type_bug_in_fp16_PReLU_grad_r1.3
2021-07-12 09:54:53 +00:00
i-robot 300e23e573 !19799 modify hccl dlopen
Merge pull request !19799 from changzherui/mod_dlopen_1.3
2021-07-12 09:52:47 +00:00
王南 3b721532df delete note in lr_callback 2021-07-12 17:37:07 +08:00
wangshuide2020 12428e1957 update the example of batchnorm and dropout operators. 2021-07-12 17:27:42 +08:00
Margaret_wangrui 01ae9db58e Handle the backend parameter mismatch caused by maketuple Abstract setting error 2021-07-12 17:18:38 +08:00
zhangzhenghai fe9809b03f !20060 run0710
Merge pull request !20060 from run_0710
2021-07-12 09:13:10 +00:00
i-robot b9fc89f032 !20068 Add Device Matrix from 1024 to 4096
Merge pull request !20068 from huangxinjing/code_docs_fix_device_matrix
2021-07-12 09:00:53 +00:00
i-robot 672883b785 !20065 Fix the file format problem of the operator
Merge pull request !20065 from dinglinhe/code_docs_dlh_r1.3_I408LD
2021-07-12 08:36:47 +00:00
linqingke dc5e10c02a fix resnet multi-rank accuracy. 2021-07-12 16:31:27 +08:00
huangxinjing e9a50de9fe Change Device Number to 4096 2021-07-12 16:19:29 +08:00
i-robot 27b0debd23 !20039 Add Ascend Memory For PanGuModel
Merge pull request !20039 from huangxinjing/code_docs_add_mem_for_pangu
2021-07-12 08:15:10 +00:00
dinglinhe 46bf65408f fix the docs problems of some operators 2021-07-12 16:10:52 +08:00
i-robot e21781ce55 !19915 fix_pipeline_opt_shard
Merge pull request !19915 from yao_yf/fix_pipeline_opt_shard
2021-07-12 08:05:14 +00:00
郑彬 da2b9779e0 add parameter 'config_path' in tinybert on r1.3 2021-07-12 16:02:38 +08:00
dingpeifei e772f5afcd upgrade_run 2021-07-12 16:01:42 +08:00
i-robot d6e3de5f93 !20037 r1.3 Export header files of lite_cv when MSLITE_ENABLE_TRAIN=on
Merge pull request !20037 from luoyang/son_r1.3
2021-07-12 07:45:12 +00:00
dinglinhe 69575e8ea5 fix the wrong output in the Flatten and OneHot examples 2021-07-12 15:38:19 +08:00
i-robot ec052373d1 !20018 [MSLITE][Develop] fix bug of tensor_rt
Merge pull request !20018 from yangruoqi713/r1.3
2021-07-12 07:17:27 +00:00
i-robot 810f424bfd !20012 [MS][LITE][CPU] fill fp16 operator optimization
Merge pull request !20012 from liuzhongkai/fill2
2021-07-12 07:15:10 +00:00
i-robot 54c93a227f !20003 [lite]1.3 add header file for converter
Merge pull request !20003 from 徐安越/r1.3
2021-07-12 07:13:12 +00:00
i-robot 5a1dbac549 !20009 move format.h to api
Merge pull request !20009 from zhoufeng/move-format-h-file-1.3
2021-07-12 07:11:15 +00:00
郑彬 3309c6159d modify for bert 310infer on r1.3 2021-07-12 14:40:24 +08:00
louie5 2081fbacb5 adding recording of kernel graph & graph execute orders in RDR
adding trigger function of RDR in CPU backend pipeline
2021-07-12 14:27:21 +08:00
lanzhineng f14cdf5a16 infer_opt support windows & codex 2021-07-12 14:22:03 +08:00
zhangzhenghai 30f19265ee !20036 upgrade_ascend_0712_mindspore
Merge pull request !20036 from upgrade_ascend_0712
2021-07-12 06:07:06 +00:00
i-robot 3bf1381e4b !19926 [MS][LITE]Fix matmul and convolution delegate
Merge pull request !19926 from gongdaguo/r1.3_fix_matmul
2021-07-12 05:05:08 +00:00
cjh9368 88b670611e [MS][LITE] Add build function for model_path 2021-07-12 12:53:00 +08:00
jin-xiulang 03328ee86d Fix an issue of input type check 2021-07-12 12:07:40 +08:00
xuanyue c7543a437e fix nnie compile 2021-07-12 11:47:13 +08:00
i-robot 1300fbd697 !20023 [lite]1.3 add header files for converter
Merge pull request !20023 from 徐安越/code_docs_r1.3
2021-07-12 03:37:55 +00:00
huangxinjing 3211e527ad Add ascend memory for pangu model 2021-07-12 11:35:27 +08:00
dingpeifei 470c16af1d upgrade_ascend_0712_mindspore 2021-07-12 11:35:26 +08:00
YangLuo d0491237b4 Export header files of lite_cv when MSLITE_ENABLE_TRAIN=on 2021-07-12 11:35:11 +08:00
lzk 1d076fd311 fp16 op 2021-07-11 20:32:39 -07:00
huangmengxi f7618f422d fix numpy docstrings 2021-07-12 11:06:37 +08:00
i-robot 5491b2c982 !19921 [MS][LITE][Develop] lite nnie release package
Merge pull request !19921 from sunsuodong/lite_release_1.3
2021-07-12 02:56:00 +00:00
huangmengxi 840162eb0d fix tuple bounds 2021-07-12 10:49:21 +08:00
yao_yf bb94a9e811 fix_pipeline_opt_shard 2021-07-12 10:47:08 +08:00
i-robot 7a49005969 !19838 tensor setitem bug fix
Merge pull request !19838 from huangmengxi/setitem_pr
2021-07-12 02:02:49 +00:00
i-robot 88a84a6fbb !19843 fix naming collision for gpu op cumsum and cumprod [r1.3]
Merge pull request !19843 from 杨林枫/fix_cumsum_cumprod_naming_collision_r1.3
2021-07-12 02:00:50 +00:00
buxue 156afadb28 fix type bug in fp16 PReLU grad 2021-07-12 09:58:17 +08:00
yangruoqi713 492940d6e2 [MSLITE][Develop] fix bug of tensor_rt 2021-07-12 09:58:15 +08:00
xuanyue c409a0a14e add header file for converter 2021-07-12 09:45:24 +08:00
i-robot 6c7982f18a !19901 [MS][LITE] optimize nlu model
Merge pull request !19901 from cjh9368/r1.3
2021-07-12 01:40:49 +00:00
zhoufeng 3fc1e8bc1e move format.h to api
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2021-07-12 09:40:03 +08:00
i-robot ec2ec77666 !19987 [MS][LITE]fix bug and add new api benchmark
Merge pull request !19987 from 张学同/api13
2021-07-12 01:36:58 +00:00
zhangzhaoju a6795ffb72 ci_format 2021-07-12 09:32:20 +08:00
lukelook d2e64d4d97 Add autoaugment 2021-07-11 22:40:49 +08:00
i-robot cd2d2bef0a !20000 [MSLITE][Develop] move lite api: delegate, kernel to mindspore/include
Merge pull request !20000 from yangruoqi713/r1.3
2021-07-11 13:46:57 +00:00
zhangxuetong 3ff778ac53 add some new api and testcase 2021-07-11 21:33:22 +08:00
i-robot a53de5f992 !19991 [MS][LITE]fix bug of control model
Merge pull request !19991 from mengyuanli/sync_r1.3_2
2021-07-11 12:00:54 +00:00
yangruoqi713 b6630f1c14 [MSLITE][Develop] move lite api: delegate, kernel to mindspore/include 2021-07-11 18:53:27 +08:00
cjh9368 93a68b30ae [MS][LITE] debug layernorm 2021-07-11 18:50:13 +08:00
i-robot d177daa609 !19989 Fix empty device context.
Merge pull request !19989 from gaoyong10/runtime1.36
2021-07-11 10:49:35 +00:00
i-robot 3e73ab942d !19982 [MSLITE][Develop] new api to r1.3
Merge pull request !19982 from yangruoqi713/r1.3
2021-07-11 10:25:33 +00:00
mengyuanli c1f45ccef3 fix bug of memleak 2021-07-11 18:00:12 +08:00
i-robot 4431aec128 !19946 [MSLITE] memory leak r1.3
Merge pull request !19946 from ling/r1.3
2021-07-11 09:43:34 +00:00
yangruoqi713 0a7bc34f7e [MSLITE][Develop] new api to r1.3 2021-07-11 17:24:23 +08:00
i-robot eb87a3e5f1 !19990 [MS][LITE]fix bug of control model
Merge pull request !19990 from mengyuanli/sync_r1.3
2021-07-11 09:15:33 +00:00
i-robot 37b64f0b82 !19967 Fix high grad
Merge pull request !19967 from gaoyong10/runtime1.34
2021-07-11 09:13:35 +00:00
ling a385c0a87a [MSLITE] control-flow memory leak bug
[MSLITE] partial node runs fp16 subgraph
2021-07-11 16:34:34 +08:00
mengyuanli 172137fbb5 fix bug of mem leak of control model 2021-07-11 15:59:46 +08:00
gaoyong10 6a8d62d04a Fix invalid device context 2021-07-11 15:57:45 +08:00
i-robot b06c924ee3 !19980 [MS][LITE][TOD] Add ToD to unified MS API (R1.3)
Merge pull request !19980 from ehaleva/export
2021-07-11 07:49:33 +00:00
i-robot e1c9fcd424 !19983 Fix invalid actor set in forward.
Merge pull request !19983 from gaoyong10/runtime1.36
2021-07-11 07:27:33 +00:00
i-robot e68fd44e15 !19872 [MS][LITE][r1.3] allow reduce op to have only one input when the axes param is not given
Merge pull request !19872 from XianglongZeng/r1.3
2021-07-11 06:42:16 +00:00
Emir Haleva 3f6426cf1e Add Train API to CXX_API in r1.3 2021-07-11 08:46:04 +03:00
i-robot 0448ec8c20 !19944 modify model_zoo bug for config for R1.3
Merge pull request !19944 from lilei/modify_model_zoo_readme_R1.3
2021-07-11 04:30:49 +00:00
gaoyong10 975211be94 Fix invalid actor set in forward. 2021-07-11 12:28:02 +08:00
i-robot 3bd8532493 !19953 Lift fv core dump fix
Merge pull request !19953 from chenfei_mindspore/lift-fv-core-dump-r1.3
2021-07-11 03:36:50 +00:00
gaoyong10 c92d6d5fb5 Fix cannot find backend node. 2021-07-11 11:27:04 +08:00
i-robot 4ce8de0dda !19975 bugfix output contain tuple and tuple
Merge pull request !19975 from limingqi107/r1.3
2021-07-11 03:18:51 +00:00
i-robot 58659457b5 !19805 fix bug of control flow together with host and device
Merge pull request !19805 from limingqi107/r1.3_3
2021-07-11 03:14:50 +00:00
i-robot 4377909962 !19962 [MSLITE] fix ram increase
Merge pull request !19962 from zhaozhenlong/lite/issue/fix_ram_increase
2021-07-11 03:00:53 +00:00
i-robot bdd5762b34 !19952 [MS][LITE] fix print bug
Merge pull request !19952 from jianghui58/fix_r1.3_dlopen
2021-07-11 02:58:58 +00:00
i-robot 8b8201f44a !19942 [MSLITE][Develop] new api to r1.3
Merge pull request !19942 from yangruoqi713/r1.3
2021-07-11 01:10:51 +00:00
i-robot d61f2b3162 !19970 Fix cannot find backend node.
Merge pull request !19970 from gaoyong10/runtime1.36
2021-07-10 19:05:59 +00:00
i-robot d50df065d5 !19951 Refine unify runtime context
Merge pull request !19951 from zyli2020/r1.3_debug
2021-07-10 15:58:07 +00:00
lizhenyu 86579bcee9 bugfix output contain tuple 2021-07-10 23:50:07 +08:00
i-robot e273cb9e94 !19758 [MS][LITE][CPU] x86 conv shape optimization
Merge pull request !19758 from liuzhongkai/thread7
2021-07-10 15:22:45 +00:00
gaoyong10 04a8f18772 Fix cannot find backend node. 2021-07-10 22:22:22 +08:00
i-robot 6dca76d4eb !19947 fix tinydarknet
Merge pull request !19947 from wanglin/fix_1.3_wlte
2021-07-10 14:12:43 +00:00
i-robot 92e6c7740e !19777 fix efficientnet imagenet accuracy
Merge pull request !19777 from panfengfeng/fix_efficient_imagenet_accuracy_r1.3
2021-07-10 13:04:51 +00:00
i-robot 10d5dff4a5 !19922 [bugfix]CPU version core dump when destroying resources of a net which includes the ops BatchNorm and BatchNormGrad
Merge pull request !19922 from zyli2020/r1.3_mindrt
2021-07-10 12:35:01 +00:00
yangruoqi713 cc49daf074 [MSLITE][Develop] new api to r1.3 2021-07-10 20:25:10 +08:00
i-robot 8bdb5dad41 !19881 [MS][LITE]support predict callback and build model with model buf
Merge pull request !19881 from 张学同/api_13
2021-07-10 11:55:03 +00:00
zhaozhenlong a42251225d fix ram increase 2021-07-10 19:45:02 +08:00
lizhenyu c6e92fc24b Refine unify runtime context 2021-07-10 18:58:23 +08:00
i-robot 7ec38002ec !19648 [MS][LITE] r1.3 move create train session to trainsession.h
Merge pull request !19648 from zhengjun10/r1.3
2021-07-10 10:11:00 +00:00
chenfei a2811af5c2 fix lift_fv_before_grad cond 2021-07-10 18:05:26 +08:00
jianghui58 18974ce57d fix print bug 2021-07-10 18:05:23 +08:00
limingqi107 ac068d329a fix bug of control flow together with host and device 2021-07-10 18:01:31 +08:00
lilei 7e3b77933f modify model_zoo bug for config 2021-07-10 17:51:53 +08:00
i-robot a7af3bf197 !19933 [MS][LITE][Develop] optimize common convolution kernel
Merge pull request !19933 from sunsuodong/optimize_kernel_1.3
2021-07-10 09:47:03 +00:00
i-robot 5b47320462 !19900 Reuse rank id
Merge pull request !19900 from anancds/r1.3
2021-07-10 09:43:11 +00:00
i-robot 2c5fe50b37 !19898 Add value check
Merge pull request !19898 from ZPaC/1.3-change-dir
2021-07-10 09:37:03 +00:00
l_emon 12b78e2a9b fix tinydarknet 2021-07-10 17:32:43 +08:00
i-robot a07a5c9afe !19895 [MS][LITE] fix dlopen bug
Merge pull request !19895 from jianghui58/fix_r1.3_dlopen
2021-07-10 09:29:00 +00:00
gongdaguo eddb27b303 fix matmul and convolution delegate 2021-07-10 17:21:42 +08:00
i-robot bcaafbf68d !19909 [MS][LITE][r1.3] evade a random convert error caused by a compile param
Merge pull request !19909 from XianglongZeng/r1.3_
2021-07-10 08:55:03 +00:00
i-robot 7a199d5aa4 !19919 [MS][LITE]control flow fp16 not run fp16 in partial node
Merge pull request !19919 from mengyuanli/sync_r1.3
2021-07-10 08:52:21 +00:00
i-robot 9e4cd7121f !19742 add cpu masked_select type
Merge pull request !19742 from baihuawei/mask1.3
2021-07-10 08:21:00 +00:00
sunsuodong 3ebda6b70c optimize common convolution kernel 2021-07-10 16:19:20 +08:00
i-robot d285339aae !19912 dataset: modify some comment and log
Merge pull request !19912 from ms_yan/code_docs_log
2021-07-10 08:10:56 +00:00
i-robot fb9f7a5c1f !19770 Fix param check and default value of center in Rotate and RandomRotation Ops
Merge pull request !19770 from xiaotianci/fix_rotate_center_1.3
2021-07-10 08:07:00 +00:00
zengxianglong fc3721d0cd evade a random convert error caused by a compile param 2021-07-10 16:05:15 +08:00
i-robot 13b94934c5 !19866 compression warning clean
Merge pull request !19866 from zhang_sss/r1.3_warnning_clean
2021-07-10 07:54:55 +00:00
lizhenyu ab972ddb6b [bugfix]CPU version core dump when destroying resources of a net which includes the ops BatchNorm and BatchNormGrad 2021-07-10 15:53:51 +08:00
sunsuodong b463a7759f nnie package 2021-07-10 15:45:13 +08:00
i-robot e1987cffda !19869 Ensure the correct execution order of the users of parameter or load
Merge pull request !19869 from Margaret_wangrui/order_enforce_2_r1.3
2021-07-10 07:36:55 +00:00
mengyuanli 88eae72be2 fix bug of partial not able to run fp16 2021-07-10 15:35:08 +08:00
i-robot 33b03dd39a !19871 Bugfix for gpu profiler in callback mode missing step trace data
Merge pull request !19871 from gzhcv/r1.3Profiler
2021-07-10 07:32:56 +00:00
i-robot c592b4bcb2 !19914 Update loss value in bert precision test case
Merge pull request !19914 from chenhaozhe/code_docs_update_bert_loss_r1.3
2021-07-10 07:24:55 +00:00
i-robot 070307634b !19894 topk and intopk float16 bug workaround
Merge pull request !19894 from Peilin/topk-float16-cast-r1.3-fix
2021-07-10 07:10:56 +00:00
ms_yan a58e64b27c modify log and API comment 2021-07-10 15:02:56 +08:00
chenhaozhe 8a6ad41721 update loss value in bert precision test case 2021-07-10 14:57:29 +08:00
i-robot 9c868ca486 !19802 add path check for fl_client in r1.3
Merge pull request !19802 from zhoushan33/flclient0709_om_r1.3
2021-07-10 06:56:57 +00:00
i-robot 2d11b5ea43 !19720 [lite]1.3 add activation attr for gelu
Merge pull request !19720 from 徐安越/r1.3
2021-07-10 06:54:55 +00:00
i-robot fd7ec5ed01 !19822 Update Lenet's readme & maskrcnn & transformer & fasterrcnn
Merge pull request !19822 from huchunmei/r1.3
2021-07-10 06:51:00 +00:00
i-robot 24d2d7df20 !19903 resnet dataset bug fix r1.3
Merge pull request !19903 from 张晓晓/code_docs_resnet_r1.3
2021-07-10 06:40:57 +00:00
i-robot 5a3cff4586 !19813 Clean code for r1.3
Merge pull request !19813 from liangzelang/clean_code_for_r1.3
2021-07-10 06:27:05 +00:00
zengxianglong 881c320ddc evade a random convert error caused by a compile param 2021-07-10 14:26:42 +08:00
i-robot b463418c99 !19834 adjust hyperparameters for bert thor
Merge pull request !19834 from wangshuangling/r1.3
2021-07-10 06:24:57 +00:00
i-robot 952ba93c6f !19793 Fix shellCheck for cpm
Merge pull request !19793 from casgj/r1.3_0709
2021-07-10 06:12:57 +00:00
i-robot ce9580ebd0 !19779 Change all fullname_with_scope call in dump and debugger in r1.3
Merge pull request !19779 from TinaMengtingZhang/dump_data_diff_r1_3
2021-07-10 05:58:56 +00:00
i-robot e535c79a36 !19830 [MECompiler]Add switch simplify to a2
Merge pull request !19830 from chenfei_mindspore/r1.3
2021-07-10 04:26:59 +00:00
i-robot ff5c7b6057 !19878 Clone graph if it is used by 2 partial
Merge pull request !19878 from chenfei_mindspore/partial_eliminate_err_r1.3
2021-07-10 04:21:04 +00:00
i-robot 509cbe72ed !19825 fix bug of CPU not supporting more than 2G copy
Merge pull request !19825 from limingqi107/r1.3_4
2021-07-10 03:52:57 +00:00
i-robot 2eaaa6a7c4 !19786 [MS][LITE]Fix convert log
Merge pull request !19786 from gongdaguo/fix_r1.3_log
2021-07-10 03:42:57 +00:00
i-robot bfce9e226a !19818 [Wide&Deep] Add the sparse configuration from the host_device flag
Merge pull request !19818 from Xiaoda/74-fix-the-host-device-problem
2021-07-10 03:40:56 +00:00
i-robot d8aa259d6c !19837 [AutoParallel] fix auto parallel bug
Merge pull request !19837 from lichen/fix_auto_parallel_bug
2021-07-10 03:34:57 +00:00
zhangxiaoxiao 5abaa8569c resnet bug fix r1.3
modified:   official/cv/resnet/scripts/run_infer_310.sh
2021-07-10 11:03:19 +08:00
i-robot 821de715c3 !19839 improve PReLU forward and implement backward on GPU
Merge pull request !19839 from zhangbuxue/improve_prelu_forward_and_implement_backward_on_gpu
2021-07-10 02:56:57 +00:00
chendongsheng a4b0c29a6f Reuse rank id 2021-07-10 10:50:41 +08:00
i-robot 104782f257 !19859 preload gomp when importing mindspore
Merge pull request !19859 from ms_yan/r1.3_ctype_preload
2021-07-10 02:38:59 +00:00
i-robot c72bb20727 !19880 Dump func graph where the node is called.
Merge pull request !19880 from 张清华/r1.3
2021-07-10 02:30:58 +00:00
i-robot 7294a5a9dc !19852 fixed heartbeat timeout
Merge pull request !19852 from anancds/r1.3
2021-07-10 02:24:57 +00:00
i-robot aa7c152435 !19828 Fix the unchanged iteration id on cpu
Merge pull request !19828 from maning202007/r1.3
2021-07-10 02:20:59 +00:00
i-robot d13a20b3e7 !19815 Add obj id to the phase as key of compile cache
Merge pull request !19815 from YuJianfeng/r1.3
2021-07-10 02:12:58 +00:00
ZPaC 4e9e363e11 Add value check 2021-07-10 10:08:38 +08:00
i-robot 00672fa133 !19723 add reset op info for tune
Merge pull request !19723 from laiyongqiang/r1.3_tune_reset
2021-07-10 02:07:01 +00:00
i-robot 6b5fb2cb67 !19833 Fix an issue of mindspore federated
Merge pull request !19833 from jxlang910/push-tor1.3
2021-07-10 01:42:57 +00:00
i-robot 824d03090e !19733 update docs for its wrong output
Merge pull request !19733 from dinglinhe/code_docs_dlh_r1.3_I3ZJRT
2021-07-10 01:25:06 +00:00
jianghui58 7a08494b57 fix dlopen bug 2021-07-10 09:11:57 +08:00
Peilin Wang c2071ea2c4 initial commit: fix topk and intopk 2021-07-09 20:27:32 -04:00
zhang__sss fc2f78a403 code clean 2021-07-10 08:15:42 +08:00
i-robot d141ee0c72 !19855 [bugfix] Reinitialize for cpu kernels which do not support multi-thread
Merge pull request !19855 from zyli2020/r1.3_mindrt
2021-07-09 21:42:58 +00:00
i-robot fe33f37a2b !19844 fix getfuncgraphspecializer inner error
Merge pull request !19844 from xychow/fix-getfuncgraphspecializer-innfer-error-1.3
2021-07-09 19:32:08 +00:00
Cathy Wong 9a6b7a5937 MD CI code warning fixes 2021-07-09 15:13:03 -04:00
TinaMengtingZhang 3b909b7b45 Change fullname_with_scope in dump and online debugger 2021-07-09 14:45:11 -04:00
i-robot 8100e6f969 !19826 Fix input arrow of gather actor.
Merge pull request !19826 from gaoyong10/runtime1.3
2021-07-09 15:53:28 +00:00
Zhang Qinghua cbb2d17efb Dump func graph where the node is called. 2021-07-09 21:40:03 +08:00
Margaret_wangrui d3b947dfb7 Ensure the correct execution order of the users of parameter or load 2021-07-09 21:35:15 +08:00
zhangxuetong 74e63a5201 optimize new api 2021-07-09 21:26:47 +08:00
i-robot 607ffdf63a !19761 Fix func graph stack and CNode call stack issues during infer routine.
Merge pull request !19761 from 张清华/r1.3
2021-07-09 12:32:10 +00:00
chenfei 6d90feb0bd clone graph if it is used by two partial 2021-07-09 20:31:06 +08:00
i-robot b36f16485e !19782 modify_model_zoo_readme_R1.3
Merge pull request !19782 from lilei/modify_model_zoo_readme_R1.3
2021-07-09 11:32:12 +00:00
i-robot da9957ef58 !19864 [MS] error links in .py files for r1.3
Merge pull request !19864 from lvmingfu/code_docs_r1.3
2021-07-09 10:52:53 +00:00
gzhcv df6baac62e Bugfix for gpu profiler in callback mode missing step trace data 2021-07-09 18:43:54 +08:00
zjun e9be186dbe Fix dynamic shape to opinfo
Signed-off-by: zjun <zhangjun0@huawei.com>
2021-07-09 18:32:45 +08:00
buxue b9d454c6f5 improve PReLU forward and implement backward on GPU 2021-07-09 18:27:17 +08:00
lvmingfu 0cb19e5cad fix the error links for r1.3 2021-07-09 17:53:57 +08:00
ms_yan 83522df058 add gomp loadLibrary 2021-07-09 17:50:49 +08:00
i-robot f0d33532bb !19767 fix bug of CPU kernel address must not be nullptr
Merge pull request !19767 from limingqi107/r1.3
2021-07-09 09:44:52 +00:00
lizhenyu 436a90cbac bugfix: reinitialize for cpu kernels which do not support multi-thread 2021-07-09 17:36:38 +08:00
zhoushan 3ed4045736 add path check for fl_client in r1.3 2021-07-09 17:23:22 +08:00
i-robot 09dbb1be13 !19703 Add Example for shard and set_stage API
Merge pull request !19703 from huangxinjing/code_docs_fix_doc
2021-07-09 09:22:51 +00:00
limingqi107 9c40bb268e fix bug of CPU not supporting more than 2G copy 2021-07-09 17:11:46 +08:00
chendongsheng 060ca81c91 fixed heartbeat timeout 2021-07-09 17:09:27 +08:00
i-robot b02dbcffaf !19631 modify inc load mindir r1.3
Merge pull request !19631 from changzherui/mod_inc_load
2021-07-09 09:06:52 +00:00
jin-xiulang 5c02e1dc62 Fix an issue of mindspore federated 2021-07-09 17:02:54 +08:00
i-robot 0602fa5e8d !19708 fix maxpool log
Merge pull request !19708 from gziyan/fix_maxpool_1.3
2021-07-09 09:00:59 +00:00
chenfei 3af1cd2b78 add switch_simplify pass to a2 2021-07-09 16:57:16 +08:00
zhousiyi fd1754421a run static_analysis with broadened args if all inputs are tensors 2021-07-09 08:48:29 +00:00
lichenever f067d61bc2 fix_auto_parallel_bug 2021-07-09 16:44:32 +08:00
yanglf1121 2feefbdaf8 fix naming collision 2021-07-09 16:39:22 +08:00
i-robot 69a0120187 !19668 fixed state
Merge pull request !19668 from anancds/r1.3
2021-07-09 08:38:50 +00:00
i-robot 184c138471 !19801 fix docs error of probability of r1.3
Merge pull request !19801 from byweng/code_docs_r1.3
2021-07-09 08:36:57 +00:00
i-robot ebeacb11c8 !19676 [MD] fix mindrecord api doc in r1.3
Merge pull request !19676 from liyong126/fix_api_doc_r1.3
2021-07-09 08:34:50 +00:00
huchunmei 1396ec53ad cloud 2021-07-09 16:33:28 +08:00
yujianfeng 805e29e00f Add obj id to the phase as key of compile cache 2021-07-09 16:20:45 +08:00
huangmengxi 017809eba1 fix 2021-07-09 16:15:45 +08:00
i-robot de832c989a !19664 add ResizeBilinear ops for aicpu
Merge pull request !19664 from yanzhenxiang2020/r1.3_resize_bilinear
2021-07-09 08:10:58 +00:00
sl_wang 8fb0956e7e adjust hyperparameters for bert thor 2021-07-09 16:00:15 +08:00
i-robot 42901b1e83 !19819 fix doc bug on 1.3
Merge pull request !19819 from shenwei41/code_docs_docbugr1.3
2021-07-09 07:54:49 +00:00
i-robot 8e60044136 !19817 Fix docs issues
Merge pull request !19817 from luoyang/code_docs_1.3
2021-07-09 07:48:49 +00:00
i-robot 5a15c5c2c2 !19773 seg fault and existing dump fix
Merge pull request !19773 from john_tzanakakis/1.3
2021-07-09 07:42:48 +00:00
i-robot b137e8a812 !19820 Fix Dataset Import Error in DataParallel Mode
Merge pull request !19820 from huangxinjing/code_docs_fix_dataset_import
2021-07-09 07:36:57 +00:00
gaoyong10 0d3792dcef start the large bug 2021-07-09 15:23:02 +08:00
Xiao Tianci 1ed4196d20 fix the param check of center in Rotate, and change the default value of center from {-1, -1} to {} 2021-07-09 15:19:45 +08:00
maning202007 a6fb7f0790 Fix the unchanged iteration id on cpu 2021-07-09 15:17:17 +08:00
i-robot f9e1a284a1 !19739 [MSLITE] add opencl gpu compile support for arm32
Merge pull request !19739 from Liu_Xuu/r1.3
2021-07-09 07:12:50 +00:00
huangxinjing 458920c46c Add pipeline dataset imports 2021-07-09 15:05:10 +08:00
shenwei41 2c5c8b325d fix docs bug 2021-07-09 15:03:07 +08:00
YangLuo e3bdee5093 Fix docs issues 2021-07-09 14:54:24 +08:00
Xiaoda Zhang ddfcff6de2 adjust the sparse config from the host_device_mix flag 2021-07-09 14:52:33 +08:00
i-robot 1f1065eab0 !19787 [MSLITE] update model obfuscation tool
Merge pull request !19787 from yyuse/model_obf_bin_r1.3
2021-07-09 06:40:09 +00:00
liangzelang d2af03b8f6 clean code. 2021-07-09 14:39:38 +08:00
i-robot 809ab27c4d !19652 [r1.3]fix python parameter del coredump if an error is raised before init finishes
Merge pull request !19652 from huanghui/r1.3-fix-parameter
2021-07-09 06:36:08 +00:00
dinglinhe e2082b9267 update docs for its wrong output 2021-07-09 14:26:18 +08:00
i-robot 5b95f76198 !19743 fix issue: Resolve Python parser exception can't exit
Merge pull request !19743 from lanzhineng/r1.3
2021-07-09 06:14:20 +00:00
i-robot c8776384e9 !19593 fix matmul bug
Merge pull request !19593 from yeyunpeng2020/r1.3
2021-07-09 06:08:07 +00:00
i-robot fad679a7c0 !19744 [MS][LITE] r1.3 remove redundant load api
Merge pull request !19744 from zhengjun10/api1.3
2021-07-09 05:04:07 +00:00
huangxinjing 1fd90d1f37 Fix document 2021-07-09 11:54:43 +08:00
qianjiahong 6f9b1b656f fix docs error of probability 2021-07-09 11:44:40 +08:00
changzherui 71d5e90c99 modify hccl dlopen 2021-07-09 11:43:12 +08:00
i-robot a7dfad423e !19693 fix switch layer call to call
Merge pull request !19693 from gaoyong10/runtime1.3
2021-07-09 03:18:09 +00:00
gaojing a39eb70ae6 fix shellcheck cpm 2021-07-08 23:04:55 -04:00
i-robot e2b8ec6738 !19713 Fix a problem that if kill -9 dataset subprocesses, these subprocesses will turn into zombie processes
Merge pull request !19713 from xiefangqi/md_fix_zombie_subprocesses_r1.3
2021-07-09 03:04:07 +00:00
lilei c635c424cd modify ssd bug for R1.3 2021-07-09 10:55:06 +08:00
LaiYongqiang b61868b8bf add reset op info for tune 2021-07-09 10:45:52 +08:00
i-robot 4bcf62b171 !19728 add avx512 compile option
Merge pull request !19728 from zhaosida/code_clean_r1.3
2021-07-09 02:18:39 +00:00
i-robot bc5a526cb3 !19769 modify callback comment for r1.3
Merge pull request !19769 from changzherui/code_docs_callback_r1.3
2021-07-09 02:14:39 +00:00
i-robot f613af7515 !19687 fix unet to r1.3
Merge pull request !19687 from jiangzhenguang/fix_unet_to_r1.3
2021-07-09 02:06:38 +00:00
i-robot b45a455c70 !19759 fix example result of graphcell
Merge pull request !19759 from wangnan39/code_docs_fix_result_of_graphcell_example
2021-07-09 02:02:44 +00:00
gongdaguo 6e2e8f3b0f fix convert log 2021-07-09 09:57:26 +08:00
chendongsheng a0df28357f fixed state 2021-07-09 09:53:47 +08:00
yangyuan 91c1aed1aa update model obfuscation tool 2021-07-09 09:53:04 +08:00
i-robot 5ff573c71f !19750 Change some config, NCF doesn't support GPU yet.
Merge pull request !19750 from chenhaozhe/code_docs_chang_config_1.3
2021-07-09 01:44:50 +00:00
i-robot fd18b146bb !19760 Fix fl namespace issue.
Merge pull request !19760 from ZPaC/1.3-change-dir
2021-07-09 01:28:38 +00:00
i-robot a452c541d5 !19735 sponge ops update
Merge pull request !19735 from jiahongQian/code_docs_1.3
2021-07-09 01:26:42 +00:00
i-robot c116292b74 !19642 move irpass.cast_eliminate to a_2
Merge pull request !19642 from huangbingjian/move_cast_1.3
2021-07-09 01:24:39 +00:00
i-robot 5196b66781 !19754 fix error in albert script
Merge pull request !19754 from wtcheng/r1.3
2021-07-09 01:20:46 +00:00
baihuawei c0156d695f add cpu masked_select type 2021-07-09 09:19:33 +08:00
i-robot 1328bc5fba !19639 [r1.3]Dump failed when there are duplicate fullname_with_scope in graph
Merge pull request !19639 from caifubi/r1.3
2021-07-09 01:18:49 +00:00
i-robot 2115fb981f !19567 [MS][LITE][develop] enable npu option default off
Merge pull request !19567 from sunsuodong/npu_default_off_1.3
2021-07-09 01:16:47 +00:00
lzk 9c633d5897 conv common 2021-07-08 18:09:29 -07:00
i-robot 45d318991b !19749 [bugfix] The output addr of MaskedSelectGrad hasn't been memset
Merge pull request !19749 from zyli2020/r1.3
2021-07-08 23:06:39 +00:00
panfengfeng a1c6e7f1f6 fix efficientnet imagenet accuracy 2021-07-09 05:12:54 +08:00
hesham 31569abf64 use destMax in memcpy_s of tensor.cc 2021-07-08 16:51:50 -04:00
John Tzanakakis 1253989a55 seg fault and existing dump fix 2021-07-08 16:50:07 -04:00
i-robot 26628ff1aa !19704 Remove nop node optimizer for cpu
Merge pull request !19704 from zyli2020/r1.3_mindrt
2021-07-08 18:55:05 +00:00
changzherui 945af494f4 modify callback comment 2021-07-09 00:08:58 +08:00
changzherui 5fb5db054f mod inc load mindir 2021-07-08 23:54:29 +08:00
limingqi107 863e25913e fix bug of CPU workspace address must not be nullptr 2021-07-08 23:09:55 +08:00
i-robot 912ce8f0fb !19730 upgrade_ascend_0708_mindspore
Merge pull request !19730 from mindspore_ding/upgrade_ascend_0708
2021-07-08 14:10:45 +00:00
Zhang Qinghua e64a519c4b Record and Dump func graph call stack with context info. 2021-07-08 22:01:06 +08:00
Zhang Qinghua 0a766ae5b7 Modify context searching and creating routine. 2021-07-08 22:00:51 +08:00
ZPaC 4708bf9ddf Fix fl namespace issue. 2021-07-08 22:00:32 +08:00
王南 258102b4ec fix example result of graphcell 2021-07-08 21:44:02 +08:00
w00517672 de333a34de fix error in albert script 2021-07-08 21:41:28 +08:00
chenhaozhe ab7a4879e2 Modify config, NCF doesn't support GPU yet 2021-07-08 21:30:14 +08:00
i-robot dd26853335 !19679 Fix bugs of MindSpore federated's PW_ENCRYPT
Merge pull request !19679 from jxlang910/push-tor1.3
2021-07-08 13:28:40 +00:00
lizhenyu ac50a4115c [bugfix] The output of MaskedSelectGrad is wrong because the output addr hasn't been memset 2021-07-08 21:21:16 +08:00
zhengjun10 770fbce559 remove redundant load api 2021-07-08 21:05:08 +08:00
i-robot c6c2da44b3 !19717 [MS][LITE][CPU] x86 thread partitioning optimization
Merge pull request !19717 from liuzhongkai/thread7
2021-07-08 13:02:39 +00:00
lanzhineng 13060cec45 fix infer exception hup 2021-07-08 20:53:02 +08:00
Liu_Xuu cbed4c3ef6 [MSLITE] add opencl gpu compile support for arm32 2021-07-08 20:31:35 +08:00
i-robot 3d1f533dcc !19721 fix doc
Merge pull request !19721 from lianliguang/code_docs_r1.3
2021-07-08 12:31:04 +00:00
i-robot 6d26ab6a4e !19552 Fix dataset log error that dataset doesn't print error log under some scenarios
Merge pull request !19552 from xiefangqi/md_fix_log_print_r1.3
2021-07-08 12:29:04 +00:00
i-robot 5edc2f3c0b !19732 added api
Merge pull request !19732 from anancds/code_docs
2021-07-08 12:27:03 +00:00
i-robot 381ef66d2f !19731 Fix the problem of missing operator output description documents and the inconsistency between the output on some documents and the actual situation
Merge pull request !19731 from dinglinhe/code_docs_dlh_r1.3_I3ZBXW
2021-07-08 12:25:04 +00:00
i-robot d9ac1a3935 !19729 fix the docs of operators like Pad, MatrixDiag, CellList and DenseThor
Merge pull request !19729 from dinglinhe/code_docs_dlh_r1.3_I3ZEBG
2021-07-08 12:23:05 +00:00
qianjiahong 68ded0d5d5 0706 ops comments 2021-07-08 20:18:22 +08:00
i-robot 514f75ff0b !19698 [MSLITE] Fix bug of nnie compilation.
Merge pull request !19698 from wangshaocong/nnie_to_r1.3
2021-07-08 11:49:03 +00:00
i-robot 25acf48655 !19674 fix coding mistakes for yolov4 and inceptionv4
Merge pull request !19674 from zhouneng/code_docs_fix_coding_errors_r1.3
2021-07-08 11:47:03 +00:00
i-robot b047943c4b !19615 fix gpu op cumprod for r1.3
Merge pull request !19615 from 杨林枫/fix_cumprod_1.3
2021-07-08 11:45:11 +00:00
dingpeifei cbc689f389 upgrade_ascend_0708_mindspore 2021-07-08 19:45:05 +08:00
zhaosida 1606c96cd8 add avx512 compile option 2021-07-08 19:44:15 +08:00
dinglinhe 02cdc53c23 fix the docs of operators like Pad, MatrixDiag, CellList and DenseThor 2021-07-08 19:37:27 +08:00
chendongsheng 3fb5e1423b added api 2021-07-08 19:36:10 +08:00
dinglinhe 37d6cbeaa7 fix the wrong output of some operators 2021-07-08 19:35:54 +08:00
i-robot 2b5612d06f !19697 fix acc bug of albert for fl_client in r1.3
Merge pull request !19697 from zhoushan33/flclient0708_2_om_r1.3
2021-07-08 11:35:02 +00:00
i-robot 5f8f6e379b !19623 Serving, pangu alpha modelzoo
Merge pull request !19623 from 徐永飞/r1.3
2021-07-08 11:34:23 +00:00
i-robot dd4045b520 !19684 [MS][LITE]Fix x86_linux inference time
Merge pull request !19684 from gongdaguo/fix_x86_runtime4
2021-07-08 11:13:43 +00:00
xuanyue 94d7c38c6f add activation attr for gelu 2021-07-08 19:09:51 +08:00
i-robot b8fc23681a !19658 [MS][LITE][Develop] lite version 1.3.0
Merge pull request !19658 from sunsuodong/version_1.3
2021-07-08 11:09:48 +00:00
i-robot f457c355f5 !19663 [lite]1.3 add gelu to v0
Merge pull request !19663 from 徐安越/r1.3
2021-07-08 11:07:41 +00:00
i-robot dde39211c2 !19710 unet postprocess bug fix r1.3
Merge pull request !19710 from 张晓晓/code_docs_unet_r1.3
2021-07-08 10:47:44 +00:00
lzk 8c6168e5a9 maskmatmuladd 2021-07-08 03:42:20 -07:00
lizhenyu 1d81102fd0 Remove nop node optimizer for cpu 2021-07-08 17:59:42 +08:00
xiefangqi ec929d4cf6 fix zombie subprocesses problem 2021-07-08 17:52:47 +08:00
lianliguang 8d3317685f fix doc 2021-07-08 17:51:14 +08:00
wang_shaocong 0d0265bd7c [MSLITE] Fix bug of nnie compilation 2021-07-08 17:29:41 +08:00
zhangxiaoxiao 93dfd52ac6 unet bug fix r1.3
modified:   official/cv/unet/postprocess.py
2021-07-08 17:28:03 +08:00
xuyongfei 53ba992bfe Serving, pangu alpha modelzoo 2021-07-08 17:19:26 +08:00
Ziyan f64a4e5159 fix maxpool log 2021-07-08 17:14:59 +08:00
i-robot 1ac696d044 !19699 fix docs
Merge pull request !19699 from lianliguang/code_docs_r1.3
2021-07-08 08:43:42 +00:00
lianliguang e5b0288076 fix doc 2021-07-08 16:23:10 +08:00
i-robot 7d47b12e3b !19682 Add README For PanGu Alpha Model
Merge pull request !19682 from huangxinjing/code_docs_pangu_r13
2021-07-08 08:19:43 +00:00
zhoushan 2488333828 fix acc bug of albert for fl_client in r1.3 2021-07-08 16:16:29 +08:00
i-robot 48f0daa9a9 !19614 Move optional_helper from abi_helper to dataset
Merge pull request !19614 from luoyang/son_r1.3
2021-07-08 08:15:42 +00:00
i-robot f8d9f035d2 !19650 add code of albert for fl_client in r1.3
Merge pull request !19650 from zhoushan33/flclient0708_om_r1.3
2021-07-08 08:13:41 +00:00
gaoyong10 e51ddced63 Fix switch layer call to call 2021-07-08 16:10:21 +08:00
sunsuodong 0d18ba9c8b npu_default_off 2021-07-08 16:05:42 +08:00
jiangzhenguang 0c1b8a8fae fix unet 2021-07-08 15:36:17 +08:00
huangxinjing e6e5ce4b81 Add readme.md 2021-07-08 15:29:38 +08:00
gongdaguo b9434c3255 fix x86_linux run time 2021-07-08 15:13:25 +08:00
i-robot e5dcdbd366 !19641 save group info before create group
Merge pull request !19641 from yangzhenzhang/group-info-ckpt-r1.3
2021-07-08 06:51:39 +00:00
jin-xiulang d2b42fd12a Add secure parameters for mindspore federated learning.
fix prime initialization and build key bug

fix reconstruct access bug and kernels' retcode.

Fix init issue

Add fl secure parameter cipher_time_window
2021-07-08 14:43:41 +08:00
i-robot e372634d16 !19647 Optimize armour directory
Merge pull request !19647 from yyuse/r1.3_change_armour_dir
2021-07-08 06:25:39 +00:00
i-robot cb98d34145 !19506 [ME][Compiler]Add PartialBroaden interface
Merge pull request !19506 from chenfei_mindspore/namespace-err
2021-07-08 06:23:42 +00:00
i-robot f03493f179 !19645 inceptionv4 310 infer bug fix r1.3
Merge pull request !19645 from 张晓晓/code_docs_inceptionv4_r1.3
2021-07-08 06:14:09 +00:00
liyong b2db9aa79b fix api doc 2021-07-08 14:12:26 +08:00
zhouneng fe2058ab42 fix coding mistakes 2021-07-08 12:31:41 +08:00
i-robot 968e5b120e !19505 fix empty lic
Merge pull request !19505 from zhoufeng/fix-empty-lic-1.3
2021-07-08 04:07:44 +00:00
i-robot 58befda3ca !19551 Fix gnmt_v2 train 1p fail bug with r1.3
Merge pull request !19551 from zhanghuiyao/fix_gnmt_v2_r1.3
2021-07-08 03:49:39 +00:00
i-robot f5dff92ebc !19638 down vecmatmul fp32 fp16
Merge pull request !19638 from zhaozhenlong/lite/r1.3/down_matmul
2021-07-08 03:37:40 +00:00
i-robot 7558e03042 !19617 fix numpy native docs r1.3
Merge pull request !19617 from 杨林枫/code_docs_fix_numpy_native
2021-07-08 03:25:41 +00:00
zhoushan 1d3098f44b add code of albert for fl_client in r1.3 2021-07-08 11:22:13 +08:00
yanzhenxiang2020 e033175309 add ResizeBilinear ops for aicpu 2021-07-08 11:10:38 +08:00
i-robot 3e9c1373ab !19654 Optimize mindspore.dataset.vision.c_transforms api doc
Merge pull request !19654 from xiefangqi/code_docs_fix_ctransforms_comments_r1.3
2021-07-08 03:07:45 +00:00
i-robot 8f09d53e03 !19570 [MSLITE] graph isolate input bug
Merge pull request !19570 from ling/r1.3
2021-07-08 03:05:46 +00:00
xuanyue d6daba98f3 add gelu to v0 2021-07-08 11:04:15 +08:00
sunsuodong 0d8ae80498 lite version 1.3 2021-07-08 10:51:30 +08:00
huanghui 3cdcb80809 fix python parameter del coredump if an error is raised before init finishes 2021-07-08 10:25:20 +08:00
xiefangqi d730ad9a73 fix c_transforms comments 2021-07-08 10:25:18 +08:00
zhengzuohe 228edc0c56 update submodule AKG-r1.3 2021-07-08 10:23:21 +08:00
i-robot 2d0fdf3904 !19603 update acc_mode to improve resnet performance.
Merge pull request !19603 from linqingke/r1.3
2021-07-08 02:15:39 +00:00
i-robot b04728f4ec !19561 infer_opt: support switchlayer
Merge pull request !19561 from lanzhineng/r1.3
2021-07-08 02:13:48 +00:00
caifubi 6b3b1f8423 Add uniqueName for data dump 2021-07-08 10:08:41 +08:00
i-robot bf1ee2db87 !19632 [bugfix] custom bprop got unexpected input
Merge pull request !19632 from zyli2020/r1.3_mindrt
2021-07-08 02:03:41 +00:00
i-robot 97c1ae4bec !19607 add federated learning albert r1.3
Merge pull request !19607 from wtcheng/r1.3
2021-07-08 02:01:45 +00:00
zhengjun10 dcb564e5f5 unify lite session train and infer session 2021-07-08 10:00:47 +08:00
yangyuan 43337c1543 Optimize armour directory 2021-07-08 09:53:15 +08:00
i-robot aaa80736a6 !19599 [MS][LITE] r1.3 modify adbert to emotion classification demo
Merge pull request !19599 from zhengjun10/r1.3
2021-07-08 01:51:50 +00:00
zhangxiaoxiao ea77c477c5 inceptionv4 bug fix r1.3 2021-07-08 09:43:27 +08:00
i-robot 418f9eb903 !19597 [AutoParallel]fix pipeline split bug
Merge pull request !19597 from lichen/fix_pipeline_split_bug
2021-07-08 01:41:41 +00:00
i-robot ebb7684b55 !19534 Optimize test scripts of mindspore federated
Merge pull request !19534 from jxlang910/myr1.3
2021-07-08 01:39:42 +00:00
i-robot 835161dc18 !19619 fixed heartbeat timeout
Merge pull request !19619 from anancds/timeout
2021-07-08 01:35:40 +00:00
i-robot fbf2569bc0 !19600 change input_size of Conv3dTranspose from INPUT_ATTR_MAP to ATTR_MAP.
Merge pull request !19600 from wangshuide/wsd_r1.3
2021-07-08 01:33:39 +00:00
i-robot 984a1fa1ad !19542 Support negative tuple index during TraceTupleCNodeEffectInfo
Merge pull request !19542 from Margaret_wangrui/tuple_index_is_negative_r1.3
2021-07-08 01:31:42 +00:00
i-robot 8e79145083 !19590 Optimize federated learning directory
Merge pull request !19590 from ZPaC/1.3-change-dir
2021-07-08 01:29:46 +00:00
huangbingjian 1c61911a96 move irpass.cast_eliminate to a_2 2021-07-08 09:28:35 +08:00
i-robot 24df7fc466 !19583 fix some network problem
Merge pull request !19583 from jiangzhenguang/fix_someme_problem
2021-07-08 01:27:44 +00:00
i-robot ccc7293780 !19577 bugfix for reducescatter about format selection
Merge pull request !19577 from wangjun/rs_r1.3
2021-07-08 01:25:43 +00:00
i-robot e541f959ea !19555 fixed core dump
Merge pull request !19555 from anancds/r1.3
2021-07-08 01:23:44 +00:00
i-robot 5adf07b218 !19545 fix coredump caused by WriteResult func in infer utils.cc
Merge pull request !19545 from ZeyangGAO/coredumpfix
2021-07-08 01:21:40 +00:00
i-robot e9beb7978f !19544 Fit Gpu LoopCount for profiler module (add patch)
Merge pull request !19544 from gzhcv/r1.3Profiler
2021-07-08 01:19:45 +00:00
i-robot 10ce8d1b8b !19535 fix bnupdate_eltwise_eltwise_fusion_pass
Merge pull request !19535 from yuchaojie/r1.3
2021-07-08 01:17:44 +00:00
i-robot 2f31ab4e49 !19538 [LITE][ios][1.3] version
Merge pull request !19538 from yefeng/130-fix_version_1.3
2021-07-08 01:15:45 +00:00
zhaozhenlong 305e058612 down vec matmul fp16 fp32 2021-07-08 09:13:21 +08:00
yangzhenzhang b6003e6043 group info ckpt 2021-07-08 09:12:20 +08:00
i-robot 5770ee2921 !19517 fix device bug of the input empty
Merge pull request !19517 from limingqi107/r1.3
2021-07-07 18:11:38 +00:00
lizhenyu 293267ea48 [bugfix] custom bprop got unexpected input 2021-07-07 23:27:38 +08:00
i-robot 89644e41bb !19589 [MS][LITE][Develop] optimize common convolution kernel
Merge pull request !19589 from sunsuodong/optimize_kernel_1.3
2021-07-07 14:19:57 +00:00
ZPaC 834b698607 Optimize federated learning directory 2021-07-07 19:07:38 +08:00
chendongsheng d1148e02e3 fixed heartbeat 2021-07-07 18:49:06 +08:00
yanglf1121 bc0ce14b3d fix numbers.Number 2021-07-07 18:46:06 +08:00
YangLuo f1f8e6d855 Move optional_helper from abi_helper to dataset 2021-07-07 18:30:43 +08:00
yanglf1121 8c2b0fc7a5 fix gpu cumprod 2021-07-07 18:24:17 +08:00
i-robot 3d04d0362b !19518 Enable mindRT on CPU device
Merge pull request !19518 from limingqi107/r1.3_2
2021-07-07 10:16:02 +00:00
w00517672 623152a56c add federated learning albert 2021-07-07 18:03:31 +08:00
i-robot a6ceef0651 !19587 remove the redundant symbol of TypeError.
Merge pull request !19587 from wangshuide/code_docs_wsd_r1.3
2021-07-07 09:30:04 +00:00
i-robot 409144e9c0 !19540 strategy_ckpy_add_opt_param_r1.3
Merge pull request !19540 from yao_yf/strategy_ckpy_add_opt_param_r1.3
2021-07-07 09:22:09 +00:00
linqingke 9975c6a3a8 update resnet network performance.
less_bn pattern update.

fix clang-format

!19495 add allreduce operators by parameters for lessBN and add group parameters generator for lessBN, gc and grad freeze
Merge pull request !19495 from jinjiali-kali/less_bn

update resnet acc_mode scripts.
2021-07-07 17:07:43 +08:00
i-robot 0305441854 !19541 modify model_zoo resnext and resnet bug
Merge pull request !19541 from lilei/modify_model_zoo_bug
2021-07-07 08:52:05 +00:00
i-robot c6c056985c !19532 Fix cell return funcgraph
Merge pull request !19532 from zjun/r1.3
2021-07-07 08:44:06 +00:00
sunsuodong d492b39bd5 optimize common conv 2021-07-07 16:42:15 +08:00
wangshuide2020 3bc7e787d0 change input_size of Conv3dTranspose from INPUT_ATTR_MAP to ATTR_MAP. 2021-07-07 16:35:02 +08:00
zhengjun10 d37dbd3592 modify adbert to emotion classification demo 2021-07-07 16:29:20 +08:00
lichenever 51238243b4 fix_pipeline_split_bug 2021-07-07 16:16:53 +08:00
i-robot f8acf372af !19557 fix parallel case
Merge pull request !19557 from gziyan/fix_parallel_case
2021-07-07 08:12:04 +00:00
i-robot e7762b44c0 !19578 fix api
Merge pull request !19578 from lianliguang/code_docs_r1.3
2021-07-07 08:10:01 +00:00
yeyunpeng2020 6e8a0e33c1 fix matmul bug 2021-07-07 16:09:15 +08:00
i-robot 45db837f96 !19562 Fixing Dump iteration="1" issue in GPU
Merge pull request !19562 from jiangshuqiang/fix_dump_issue
2021-07-07 07:54:04 +00:00
wangshuide2020 528b735f73 remove the redundant symbol of TypeError. 2021-07-07 15:47:19 +08:00
i-robot 401b4d38ae !19533 [MS][LITE]fix weight quantize bug of control flow graph
Merge pull request !19533 from mengyuanli/sync_r1.3
2021-07-07 07:42:03 +00:00
i-robot b4ed7172dc !19531 workaround for bprop specialize before fprop is specialized
Merge pull request !19531 from xychow/workaround-for-bprop-specialize
2021-07-07 07:30:05 +00:00
lianliguang 2c836f0dec add example for reg ops 2021-07-07 15:25:14 +08:00
jiangzhenguang 41d6a684ce fix some network problem 2021-07-07 15:23:20 +08:00
i-robot ecb7293e6e !19564 Fix bug of TensorCopySlices
Merge pull request !19564 from caifubi/r1.3
2021-07-07 07:10:03 +00:00
i-robot 7ff4909f61 !19556 Add hyperlink to number in comment of mindspore.nn
Merge pull request !19556 from wangnan39/code_docs_1.3add_number_link
2021-07-07 06:22:09 +00:00
ling 6a3453fe82 graph-input isolate bug 2021-07-07 14:15:50 +08:00
i-robot f03091b5a9 !19537 fix allgather op select bug
Merge pull request !19537 from yuchaojie/r1.3_op_select
2021-07-07 06:14:04 +00:00
wangjun 7eac6a590c bugfix for reducescatter 2021-07-07 14:12:56 +08:00
mengyuanli 9c76220fd3 fix bug of weight quant of control flow graph 2021-07-07 11:53:23 +08:00
Parastoo Ashtari 3dce741864 fixing dump iteration 1 issue in gpu 2021-07-07 10:49:29 +08:00
lanzhineng e32c63f8a7 infer: support switchlayer 2021-07-07 10:45:52 +08:00
王南 2d15d11446 add Number link for api docs 2021-07-07 10:40:58 +08:00
i-robot 3ceaf633e5 !19509 [bugfix] Heterogeneous scenario in a ms function of PyNative Mode causes core dump
Merge pull request !19509 from zyli2020/r1.3_mindrt
2021-07-07 02:38:03 +00:00
Ziyan cba798ef39 fix parallel testcase 2021-07-07 10:33:31 +08:00
chendongsheng 31928cee94 fixed core dump 2021-07-07 10:20:23 +08:00
xiefangqi 9bcf663533 fix gpu allocate error without printing log 2021-07-07 10:14:13 +08:00
zhanghuiyao c96e67fe40 fix gnmt_v2 train 1p fail bug on r1.3. 2021-07-07 10:13:16 +08:00
lilei cc3138cd81 modify model_zoo resnext and resnet bug 2021-07-07 10:03:08 +08:00
gzhcv 94f28a2da3 Fit Gpu LoopCount for profiler module v2 2021-07-07 09:36:34 +08:00
Margaret_wangrui 587e361f14 Support tuple index is negative during TraceTupleCNodeEffectInfo 2021-07-07 09:32:21 +08:00
yao_yf 0cf808e164 strategy_ckpy_add_opt_param_r1.3 2021-07-07 09:25:55 +08:00
yefeng 237028fba1 version 2021-07-07 09:24:14 +08:00
yuchaojie a760c4db82 fix allgather op select bug 2021-07-07 09:22:14 +08:00
yuchaojie a26a3cfaeb fix bnupdate_eltwise_eltwise_fusion_pass 2021-07-07 09:20:59 +08:00
jin-xiulang b4791b9f89 Optimize test scripts of mindspore federated 2021-07-07 09:15:40 +08:00
limingqi107 86a835e720 Enable mindRT on CPU device 2021-07-07 09:13:38 +08:00
i-robot 00149771ae !19502 demo add resource file
Merge pull request !19502 from yeyunpeng2020/r1.3
2021-07-07 01:10:05 +00:00
zjun cdee64b079 Ignore funcgraph grad
Signed-off-by: zjun <zhangjun0@huawei.com>
2021-07-07 08:58:16 +08:00
ZeyangGao b4573cba1c writeresult coredump fix 20210706 2021-07-07 08:45:05 +08:00
limingqi107 04956a776c fix device bug of the input empty 2021-07-06 22:44:20 +08:00
caifubi ab611b293d fix TensorCopySlices bprop 2021-07-06 21:35:10 +08:00
lizhenyu dadfe54ced [bugfix] Heterogeneous scenario in a ms function of PyNative Mode causes a core dump 2021-07-06 21:30:33 +08:00
zhoufeng d5e36f5a46 fix empty lic
Signed-off-by: zhoufeng <zhoufeng54@huawei.com>
2021-07-06 21:11:11 +08:00
yeyunpeng2020 7c22c9a1f3 demo add resource file 2021-07-06 21:06:48 +08:00
i-robot 83d6ab79e1 !19497 add parameter of lite
Merge pull request !19497 from shenwei41/code_docs_litecv
2021-07-06 12:42:28 +00:00
i-robot 2119ff68cc !19479 wgan export bug fix master
Merge pull request !19479 from 张晓晓/code_docs_wgan
2021-07-06 12:27:18 +00:00
shenwei41 f8e89cf9f8 add parameter of lite 2021-07-06 20:22:57 +08:00
i-robot b698b41b51 !19470 [MS][LITE]add models for control_flow and delete some useless files
Merge pull request !19470 from mengyuanli/mindRT_control_flow
2021-07-06 12:15:21 +00:00
i-robot b85b4f3bdd !19478 code_docs_pangu_add_gpu
Merge pull request !19478 from yao_yf/code_docs_pangu_add_gpu
2021-07-06 11:29:54 +00:00
i-robot e31f46c3f4 !19415 Fix the decode error in debugger when send multigraphs
Merge pull request !19415 from maning202007/master
2021-07-06 11:15:55 +00:00
i-robot 967a3b8104 !19205 ctc loss v2 and ctc loss v2 grad
Merge pull request !19205 from liubuyu/ctcloss
2021-07-06 11:09:56 +00:00
i-robot bc38627deb !19452 [lite] adjust model name
Merge pull request !19452 from 徐安越/master_core
2021-07-06 11:01:56 +00:00
i-robot ff9fa4f34d !19460 support user set graph-output-tensor data
Merge pull request !19460 from ling/fix
2021-07-06 10:59:58 +00:00
i-robot a6a65ed8fa !19447 disable benchmark in runtest
Merge pull request !19447 from ling/pr
2021-07-06 10:55:54 +00:00
i-robot 24b9b69b50 !19429 fix fp32 vecmatmul
Merge pull request !19429 from zhaozhenlong/lite/issue/up_fp32_vecmatmul
2021-07-06 10:53:52 +00:00
i-robot 6a6543b181 !19430 Fix the constant folding bug of shape
Merge pull request !19430 from yeyunpeng2020/converter
2021-07-06 10:44:01 +00:00
zhangxiaoxiao 0e98f233d9 wgan export bug fix master 2021-07-06 18:17:30 +08:00
yao_yf da67a91c14 add pangu gpu 2021-07-06 17:31:19 +08:00
i-robot e2408e77ae !19432 add DeepBSDE
Merge pull request !19432 from zhaoting/deepbsde
2021-07-06 09:29:14 +00:00
zhaozhenlong 7cc485f2de fix vec matmul fp32 2021-07-06 17:27:01 +08:00
i-robot bbd6235546 !19426 Fixing load async tensor issue on ascend
Merge pull request !19426 from parastooashtari/async_dump
2021-07-06 09:21:13 +00:00
mengyuanli 8237479e38 add control-flow model
delete some files
2021-07-06 17:06:48 +08:00
i-robot 7458b4a099 !19018 fix split memory leak
Merge pull request !19018 from 范吉斌/split
2021-07-06 08:37:15 +00:00
i-robot 70d1344249 !19446 [MS][LITE]fix bug of control flow
Merge pull request !19446 from mengyuanli/mindRT_control_flow
2021-07-06 08:23:16 +00:00
ling d027619b82 support user set graph-output-tensor data 2021-07-06 16:19:25 +08:00
i-robot 8e4cd2ccdc !19442 adapt variable_memory_max_size range check
Merge pull request !19442 from laiyongqiang/variable_max_size
2021-07-06 07:57:13 +00:00
i-robot 45d3f32f9d !19421 amend yolov5 and centerface Readme
Merge pull request !19421 from chenweitao_295/centerface_yolov5_amend
2021-07-06 07:21:12 +00:00
mengyuanli 78ca53e468 fix bug of control flow 2021-07-06 15:09:49 +08:00
i-robot fb680638d8 !18683 add some function to unet and yolov4
Merge pull request !18683 from jiangzhenguang/unet_and_yolov4
2021-07-06 07:03:12 +00:00
i-robot 8a45765a19 !19395 Add AbstractTuple Comparison In Parameter Reuse.
Merge pull request !19395 from liangzelang/fix_multicall_bug
2021-07-06 06:59:15 +00:00
i-robot c6f576f625 !19445 textrcnn bug fix
Merge pull request !19445 from 张晓晓/master
2021-07-06 06:49:16 +00:00
i-robot faab555824 !19411 down fp32 vecmatmul
Merge pull request !19411 from zhaozhenlong/lite/issue/down_fp32_vecmatmul
2021-07-06 06:47:18 +00:00
i-robot 4564f37262 !18702 Pix2Pix pull request
Merge pull request !18702 from vvvvvvPeng/master
2021-07-06 06:45:18 +00:00
zhousiyi dfce41e027 workaround for bprop func graph specialization happening before the fprop func graph is specialized. 2021-07-06 06:43:25 +00:00
i-robot 3a195af6c0 !19450 Modify comments about LossBase.get_loss()
Merge pull request !19450 from chenhaozhe/code_docs_loss_comments
2021-07-06 06:33:45 +00:00
xuanyue 7a94671370 adjust model name 2021-07-06 14:31:28 +08:00
liubuyu 25df9faabd add ctclossv2 and ctclossv2grad 2021-07-06 14:19:30 +08:00
ling ac7716fa9a disable benchmark in runtest 2021-07-06 11:57:49 +08:00
zhaoting 25e3a360d3 add DeepBSDE 2021-07-06 11:39:36 +08:00
chenhaozhe 554ceb2492 Change comments about LossBase.get_loss() 2021-07-06 11:32:54 +08:00
zhangxiaoxiao ace707951c textrcnn bug fix 2021-07-06 11:32:04 +08:00
LaiYongqiang 8f4c6692b9 add variable_memory_max_size range check 2021-07-06 10:57:36 +08:00
yeyunpeng2020 498479ed5e Fix the constant folding bug of shape 2021-07-06 10:47:03 +08:00
liangzelang d229e53266 Add AbstractTuple comparison. 2021-07-06 09:51:32 +08:00
chenweitao_295 f84cf543d7 amend yolov5 and centerface Readme 2021-07-06 09:17:10 +08:00
jiangzhenguang f4851d2244 add some function to unet and yolov4 2021-07-06 09:06:06 +08:00
Parastoo Ashtari 81a289d084 Fix for load async tensor on ascend issue 2021-07-05 19:02:04 -04:00
maning202007 ffeb929a0d Fix the decode error in debugger when send multigraphs 2021-07-05 21:39:13 +08:00
zhaozhenlong db661ee8c1 down fp32 vec matmul 2021-07-05 20:49:15 +08:00
fanjibin 34f77fe20c fix split memory leak 2021-06-28 18:25:00 +08:00
chenweipeng 28071fd4a7 Update_6-22-15:08 2021-06-23 10:59:35 +08:00
1346 changed files with 28718 additions and 9692 deletions

View File

@ -1,3 +1,357 @@
# MindSpore 1.3.0
## MindSpore 1.3.0 Release Notes
### Major Features and Improvements
#### NewModels
- [STABLE] Add CV models on Ascend: CPM, CSPDarknet53.
- [STABLE] Add NLP models on Ascend: NAML, Fasttext, GRU, LSTM.
- [STABLE] Add CV models on GPU: Faster-RCNN.
- [BETA] Add CV models on Ascend:
- [BETA] Add EPP-MVSNet on GPU: a novel deep learning network for 3D reconstruction from multi-view stereo, which won first place on the Tanks & Temples leaderboard (as of April 1, 2021).
#### FrontEnd
- [STABLE] Support interface `run_check` to check whether MindSpore is working properly or not.
- [STABLE] Support saving custom information in checkpoint file.
- [STABLE] Add `mean` parameter to the `Normal` class.
- [STABLE] Support exporting YOLOv3-DarkNet53 and YOLOv4 to ONNX models.
- [STABLE] Support exporting 40+ operators to ONNX.
- [STABLE] The Metric module supports `set_indexes` to select the inputs of `update` in the specified order.
- [STABLE] Switch `_Loss` to an external API `LossBase` as the base class of losses.
#### Auto Parallel
- [STABLE] Add distributed operators: Select/GatherNd/ScatterUpdate/TopK.
- [STABLE] Support basic pipeline parallelism.
- [STABLE] Optimize sharding strategy setting of `Gather`.
- [STABLE] Optimize mixed-precision and shared-parameter scenarios.
- [STABLE] Optimize distributed prediction scenarios.
#### Executor
- [STABLE] Support unified runtime in GPU and CPU backend.
- [STABLE] MindSpore GPU supports CUDA 11 with cuDNN 8.
- [STABLE] Optimize MindSpore GPU inference performance by integrating TensorRT.
- [STABLE] MindSpore built on one Linux distribution can now be used on multiple Linux distributions with the same CPU architecture (e.g. EulerOS, Ubuntu, CentOS).
- [STABLE] MindSpore now supports Ascend310 and Ascend910 environments with a single wheel package, and provides an alternate binary package specifically for Ascend310.
- [STABLE] MindSpore Ascend supports group convolution.
#### DataSet
- [STABLE] Support caching over MindRecord dataset.
- [STABLE] Support new shuffle mode for MindRecord dataset.
- [STABLE] Support a cropper tool for MindSpore Lite to allow the user to customize MindData binary file according to their script.
- [STABLE] Support a shared-memory mechanism to optimize the multi-processing efficiency of GeneratorDataset/Map/Batch.
- [STABLE] Add features for the GNN dataset to support molecular dynamics simulation scenarios.
#### FederatedLearning
- [STABLE] Support a cross-device federated learning framework.
- [STABLE] Support FL-Server distributed networking including TCP and HTTP communication.
- [STABLE] Support FL-Server distributed federated aggregation, with support for autoscaling and fault tolerance.
- [STABLE] Develop the FL-Client framework.
- [STABLE] Support local differential privacy algorithms.
- [STABLE] Support an MPC-based secure aggregation algorithm.
- [STABLE] Interconnect MindSpore Lite device-side inference and training with FL-Client.
#### Running Data Recorder
- [STABLE] Provide records of multi-stage computational graphs, memory allocation information and graph execution order when a "Launch kernel failed" error occurs. (CPU)
#### GraphKernel Fusion
- [STABLE] Add options to control the optimization level.
- [STABLE] Enhance the generalization ability on GPU. GraphKernel is enabled by default in 40+ networks covering NLP, CV, recommender, NAS and audio fields. Results show that their throughput is significantly improved, so enabling GraphKernel in your own network is recommended.
### API Change
#### Backwards Incompatible Change
##### Python API
###### `mindspore.dataset.Dataset.device_que` interface removes unused parameter `prefetch_size`([!18973](https://gitee.com/mindspore/mindspore/pulls/18973))
Previously, we had a parameter `prefetch_size` in `device_que` to define the number of records to prefetch ahead of the user's request. But this parameter was in fact never used, so it had no effect. Therefore, we removed this parameter in 1.3.0, and users can set this configuration via [mindspore.dataset.config.set_prefetch_size](https://www.mindspore.cn/doc/api_python/en/r1.3/mindspore/mindspore.dataset.config.html#mindspore.dataset.config.set_prefetch_size).
<table>
<tr>
<td style="text-align:center"> 1.2.1 </td> <td style="text-align:center"> 1.3.0 </td>
</tr>
<tr>
<td>
```python
device_que(prefetch_size=None, send_epoch_end=True, create_data_info_queue=False)
```
</td>
<td>
```python
device_que(send_epoch_end=True, create_data_info_queue=False)
```
</td>
</tr>
</table>
### Bug fixes
#### FrontEnd
- Fix the exception raised when using an imported module such as 'F.xxx' in a while body.([!17635](https://e.gitee.com/mind_spore/repos/mindspore/mindspore/pulls/17635))
- Fix the 'exceeding limit call depth' exception during graph compilation when using a while expression with a grad operation. ([!18662](https://e.gitee.com/mind_spore/repos/mindspore/mindspore/pulls/18662))
#### Executor
- Fix reallocate memory bug for communication op.([!14492](https://gitee.com/mindspore/mindspore/pulls/14492))
- Replace memcpy_async op with tensor_move op.([!15204](https://gitee.com/mindspore/mindspore/pulls/15204))
- Fix the build error when multiple python versions are installed in the environment.([!19165](https://gitee.com/mindspore/mindspore/pulls/19165))
- Optimize the warning shown when the te/topi/hccl versions do not match, and fix the repeated warning.([!18704](https://gitee.com/mindspore/mindspore/pulls/18704))
- Fix the error in a cluster with more than 8 devices in PyNative mode.([!16376](https://gitee.com/mindspore/mindspore/pulls/16376))
- Fix the graph ring problem in UB fusion.([!16109](https://gitee.com/mindspore/mindspore/pulls/16109))
- Fix AllGather op select problem when shape is not divisible by 16.([!18878](https://gitee.com/mindspore/mindspore/pulls/18878))
#### Dataset
- Fix an out-of-memory error when ImagefolderDataset gets an illegal directory. ([!16196](https://gitee.com/mindspore/mindspore/pulls/16196))
- Fix bugs of vision transformations in lite mode. ([!14722](https://gitee.com/mindspore/mindspore/pulls/14722),[!14774](https://gitee.com/mindspore/mindspore/pulls/14774),[!15050](https://gitee.com/mindspore/mindspore/pulls/15050))
- Fix default numbers of parallel workers of MindData for those CPUs with fewer cores. ([!15921](https://gitee.com/mindspore/mindspore/pulls/15921))
- Fix cache client status reset and final flush at abnormal termination. ([!15038](https://gitee.com/mindspore/mindspore/pulls/15038))
- Fix MindRecord writes failing probabilistically in multiprocessing. ([!15242](https://gitee.com/mindspore/mindspore/pulls/15242))
## MindSpore Lite
### Major Features and Improvements
#### Converter and runtime
1. Support Caffe model running on Hi3516D.
2. Support the delegate mechanism to run your models (in part or in whole) on a user-specified executor.
3. Support control flow models.
4. Support cross-compiling for iOS, so that models can be run on iOS devices.
#### x86 backend optimization
1. Optimize kernels for x86 using Advanced Vector Extensions (AVX).
#### ARM backend optimization
1. Optimize fp16 kernels.
2. Support arm32 fp16 instruction acceleration on ARMv8.2.
#### Cuda backend optimization
1. Support the NVIDIA GPU backend based on the delegate mechanism (using TensorRT as the delegate).
#### OpenCL backend
1. Optimize the strategy of workgroup and blocksize to improve performance.
2. Support OpenCL dynamic shape inference.
3. Support INT32 type ops.
#### Post quantization
1. Support converting fp32 training models to quantization-aware training models.
#### Training on Device
1. Support exporting fp32 training models to quantized models after the training process ends.
2. Unify the APIs and the output package name of training and inference.
3. Simplify the implementation of Train Session.
4. Optimize train and infer compilation, reducing the memory footprint of libmindspore-lite-train.so.
5. Training memory optimization: memory reduced by 10-50% compared with r1.2.
6. Training performance optimization: optimize the Cov2DGradInput and SparseSoftmaxCrossEntropyWithLogits operators for the special 1*1 input shape, improving performance by 10%-20%.
7. Support more networks (transformer, albert).
#### Codegen
1. Support deployment on HarmonyOS devices.
### API Change
#### API Incompatible Change
##### C++ API
###### Unify LiteSession and TrainSession: merge TrainSession into LiteSession.([!17356](https://gitee.com/mindspore/mindspore/pulls/17356))
Previously, Training on Device used TrainSession while Inference on Device used LiteSession. To simplify the implementation, we moved the TrainSession functions into LiteSession as virtual functions, and moved the APIs previously defined in train_session.h to lite_session.h.
```cpp
class MS_API LiteSession {
...
static LiteSession *CreateTrainSession(const std::string &filename, const lite::Context *context,
bool train_mode = false, const lite::TrainCfg *cfg = nullptr);
static LiteSession *CreateTransferSession(const std::string &filename_backbone, const std::string &filename_head,
const lite::Context *context, bool train_mode = false,
const lite::TrainCfg *cfg = nullptr);
virtual int Train() { return mindspore::lite::RET_ERROR; }
virtual int Eval() { return mindspore::lite::RET_OK; }
virtual int SetupVirtualBatch(int virtual_batch_multiplier, float lr = -1.0f, float momentum = -1.0f) {
return mindspore::lite::RET_ERROR;
}
virtual std::vector<tensor::MSTensor *> GetPredictions() const {
std::vector<tensor::MSTensor *> outputs;
return outputs;
}
...
```
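As an illustration of the unified API, here is a minimal sketch (not from the release notes) of driving training through LiteSession alone; the model file name, the default CPU context, and the `mindspore::session` namespace are assumptions for the example:
```cpp
#include <iostream>
#include "include/context.h"       // lite::Context
#include "include/lite_session.h"  // unified LiteSession, including train APIs

int main() {
  mindspore::lite::Context context;  // assumed default CPU context
  auto *session = mindspore::session::LiteSession::CreateTrainSession(
      "lenet_tod.ms", &context, /*train_mode=*/true);  // placeholder model file
  if (session == nullptr) {
    std::cerr << "create train session failed" << std::endl;
    return 1;
  }
  session->Train();  // switch kernels to training mode
  // ... bind input data and run training steps with RunGraph() here ...
  session->Eval();   // switch back to inference mode
  delete session;
  return 0;
}
```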
###### Add Export API for Training on Device; deprecate the SaveToFile API.([!17356](https://gitee.com/mindspore/mindspore/pulls/17356))
Previously, Training on Device used the SaveToFile API to save the training model to a file. The Export API was added in this release to support more formats and more model types (the train or inference part of the model), and to save weight-quantized versions of the trained model.
```cpp
virtual int Export(const std::string &file_name, lite::ModelType model_type = lite::MT_TRAIN,
lite::QuantizationType quant_type = lite::QT_DEFAULT, lite::FormatType = lite::FT_FLATBUFFERS) {
return mindspore::lite::RET_ERROR;
}
```
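As a hedged usage sketch (assuming a `session` created as in the previous example; `MT_TRAIN`, `QT_DEFAULT` and `FT_FLATBUFFERS` come from the declaration above, while `MT_INFERENCE` is an assumed member of `lite::ModelType`):
```cpp
#include "include/errorcode.h"     // lite::RET_OK
#include "include/lite_session.h"

void SaveModels(mindspore::session::LiteSession *session) {
  // Save the whole training graph as a FlatBuffer model (defaults written out).
  if (session->Export("trained.ms", mindspore::lite::MT_TRAIN, mindspore::lite::QT_DEFAULT,
                      mindspore::lite::FT_FLATBUFFERS) != mindspore::lite::RET_OK) {
    // handle export failure ...
  }
  // Export only the inference part of the model for deployment (assumed enum value).
  session->Export("deploy.ms", mindspore::lite::MT_INFERENCE);
}
```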
###### Add GetFeatureMaps and UpdateFeatureMaps interfaces for Training on Device.([!18344](https://gitee.com/mindspore/mindspore/pulls/18344))
When Training on Device, we may need to fetch and update the model feature maps, particularly in the MindSpore Federated scenario.
```cpp
virtual std::vector<tensor::MSTensor *> GetFeatureMaps() const {
std::vector<tensor::MSTensor *> features;
return features;
}
virtual int UpdateFeatureMaps(const std::vector<tensor::MSTensor *> &features) { return mindspore::lite::RET_ERROR; }
```
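A hedged sketch of the intended round trip in a federated setting follows; the server exchange is a placeholder comment, not a MindSpore API:
```cpp
#include <vector>
#include "include/errorcode.h"     // lite::RET_OK
#include "include/lite_session.h"
#include "include/ms_tensor.h"     // tensor::MSTensor

void SyncFeatureMaps(mindspore::session::LiteSession *session) {
  // Fetch the current trainable feature maps (weights) from the model.
  std::vector<mindspore::tensor::MSTensor *> features = session->GetFeatureMaps();
  // ... send `features` to the FL-Server and receive aggregated values here ...
  // Write the (aggregated) feature maps back into the model.
  if (session->UpdateFeatureMaps(features) != mindspore::lite::RET_OK) {
    // handle update failure ...
  }
}
```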
#### New features
##### Java API
###### new static method for creating LiteSession by MSConfig in LiteSession.class
Previously, if we wanted to create a LiteSession object, we needed to call two APIs:
```java
MSConfig config;
// config options ...
LiteSession liteSession = new LiteSession();
boolean ret = liteSession.init(config);
if (!ret) {
// handle init LiteSession failed ...
}
```
Now we can create a LiteSession object with the new API:
```java
MSConfig config;
// config options ...
LiteSession liteSession = createSession(config);
if (liteSession == null) {
// handle create LiteSession failed ...
}
```
###### new static method for creating LiteSession by ModelBuffer and MSConfig in LiteSession.class
Previously, if we wanted to run inference on a model, we needed to call APIs like:
```java
MSConfig config;
// config options ...
LiteSession liteSession = new LiteSession();
boolean initSessionRet = liteSession.init(config);
if (!initSessionRet) {
// handle init LiteSession failed and return ...
}
Model model = new Model();
boolean loadModelRet = model.loadModel(modelMappedByteBuffer);
if (!loadModelRet) {
// handle load model failed and return ...
}
boolean compileModelRet = liteSession.compileGraph(model);
if (!compileModelRet) {
// handle compile model failed and return ...
}
model.free();
// liteSession is ready to inference model, call runGraph in LiteSession.class ...
```
Now we can use the new API:
```java
MSConfig config;
// config options ...
LiteSession liteSession = createSession(modelMappedByteBuffer, config);
if (liteSession == null) {
// handle init LiteSession failed and return ...
}
// liteSession is ready to inference model, call runGraph in LiteSession.class ...
```
The new createSession method integrates four old APIs: LiteSession.init, Model.loadModel, LiteSession.compileGraph and Model.free. It is simple and efficient, as it saves one modelBuffer copy operation.
###### new methods getFeaturesMap and updateFeatures in LiteSession.class
Along with the new C++ APIs in the LiteSession class, we correspondingly added new Java APIs in LiteSession.java.
```java
public List<MSTensor> getFeaturesMap() {
List<Long> ret = this.getFeaturesMap(this.sessionPtr);
ArrayList<MSTensor> tensors = new ArrayList<MSTensor>();
for (Long msTensorAddr : ret) {
MSTensor msTensor = new MSTensor(msTensorAddr);
tensors.add(msTensor);
}
return tensors;
}
public boolean updateFeatures(List<MSTensor> features) {
long[] inputsArray = new long[features.size()];
for (int i = 0; i < features.size(); i++) {
inputsArray[i] = features.get(i).getMSTensorPtr();
}
return this.updateFeatures(this.sessionPtr, inputsArray);
}
```
###### new method export to replace the saveToFile API in LiteSession.class
Along with the new C++ Export API in the LiteSession class, we correspondingly added a new Java API in LiteSession.java.
```java
public boolean export(String modelFileName, int modelType, int quantizationType) {
return this.export(this.sessionPtr, modelFileName, modelType, quantizationType);
}
```
###### new train-related APIs moved from TrainSession.class to LiteSession.class
To align with the C++ API updates in the LiteSession class, the corresponding new Java APIs were added to LiteSession.java.
```java
public class LiteSession {
...
public static LiteSession createTrainSession(String modelName, final MSConfig config, boolean trainMode){...}
public boolean train() {...}
public boolean eval() {...}
...
```
### Bug fixes
1. Fix the bug that the train session did not release memory because of a refcount bug.
#### Deprecations
### Contributors
Thanks goes to these wonderful people:
Adel, AGroupofProbiotocs, anthonyaje, anzhengqi, askmiao, baihuawei, baiyangfan, bai-yangfan, bingyaweng, BowenK, buxue, caifubi, CaoJian, caojian05, caozhou, Cathy, changzherui, chenbo116, chenfei, chengxianbin, chenhaozhe, chenjianping, chenzomi, chenzupeng, chujinjin, cj, cjh9368, Corleone, damon0626, danish, Danish, davidmc, dayschan, doitH, dong-li001, eric, Eric, fary86, fuzhiye, Gaoxiong, GAO_HYP_XYJ, gengdongjie, Gogery, gongdaguo, gray0v0, gukecai, guoqi, gzhcv, hangq, hanhuifeng2020, Harshvardhan, He, heleiwang, hexia, Hoai, HuangBingjian, huangdongrun, huanghui, huangxinjing, huqi, huzhifeng, hwjiaorui, Islam Amin, Jesse, Jiabin Liu, jianghui58, jiangzhiwen, Jiaqi, jin-xiulang, jinyaohui, jjfeing, John, Jonathan, jonyguo, JulyAi, jzg, kai00, kingfo, kingxian, kpy, kswang, laiyongqiang, leonwanghui, Li, liangchenghui, liangzelang, lichen_101010, lichenever, lihongkang, lilei, limingqi107, ling, linqingke, Lin Xh, liubuyu, liuwenhao4, liuxiao78, liuxiao93, liuyang_655, liuzhongkai, Lixia, lixian, liyanliu, liyong, lizhenyu, luopengting, luoyang, lvchangquan, lvliang, lz, mahdi, Mahdi, maning202007, Margaret_wangrui, mayang, mengyuanli, Ming_blue, nhussain, ougongchang, panfengfeng, panyifeng, Payne, Peilin, peixu_ren, Pengyongrong, qianlong, qianjiahong, r1chardf1d0, riemann_penn, rmdyh, Sheng, shenwei41, simson, Simson, Su, sunsuodong, tao_yunhao, tinazhang, VectorSL, Wan, wandongdong, wangdongxu, wangmin, wangnan39@huawei.com, wangyue01, wangzhe, wanyiming, Wei, wenchunjiang, wilfChen, WilliamLian, wsc, wudenggang, wukesong, wuweikang, wuxuejian, Xiao Tianci, Xiaoda, xiefangqi, xinyunfan, xuanyue, xulei2020, Xun, xuyongfei, yanghaitao, yanghaitao1, yanghaoran, YangLuo, yangruoqi713, yankai, yanzhenxiang2020, yao_yf, yepei6, yeyunpeng, Yi, yoni, yoonlee666, yuchaojie, yujianfeng, yuximiao, zengzitao, Zhang, zhanghaibo5@huawei.com, zhanghuiyao, zhanghui_china, zhangxinfeng3, zhangyihui, zhangz0911gm, zhanke, zhanyuan, zhaodezan, zhaojichen, zhaoting, zhaozhenlong, zhengjun10, Zhenglong Li, zhiqwang, zhoufeng, zhousiyi, zhouyaqiang, zhouyifengCode, Zichun, Zirui, Ziyan, zjun, ZPaC, wangfengwfwf, zymaa, gerayking.
Contributions of any kind are welcome!
# MindSpore 1.2.1
## MindSpore 1.2.1 Release Notes

View File

@ -3968,37 +3968,33 @@ OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org>
Software: google/protobuf 3.8.0
Copyright notice:
Copyright 2008, Google Inc.
Copyright 2008 Google Inc. All Rights Reserved.
Copyright [2007] Neal Norwitz
Portions Copyright [2007] Google Inc.
Copyright 2008 Google Inc. All rights reserved.
Software: google/protobuf 3.13.0
Copyright 2008 Google Inc. All rights reserved.
Copyright 2008 Google Inc. All rights reserved.
Copyright 2007-2010 Baptiste Lepilleur Distributed under MIT license, or public domain if desired and recognized in your jurisdiction.
Copyright 2007 Google Inc. All Rights Reserved.
Copyright 2007, Google Inc.
Copyright 2013, Google Inc.
Copyright 2009, Google Inc.
Copyright 2006, Google Inc.
Copyright 2009 Google Inc. All rights reserved.
Copyright 2005, Google Inc.
Copyright 2012 Google Inc. All rights reserved.
Copyright 2014 Google Inc. All rights reserved.
Copyright 2019 Google Inc. All rights reserved.
Copyright 2008 Google Inc. All Rights Reserved.
copyright = u"2008, Google LLC"
Copyright 2017 Google Inc. All rights reserved.
Copyright 2008 Google Inc.
Copyright 2015, Google Inc.
Copyright (C) 1996-2015 Free Software Foundation, Inc.
Copyright (c) 2007-2010 Baptiste Lepilleur
Copyright 2007 Neal Norwitz
Copyright 2007 Google Inc.
Copyright 2008 Google Inc. All Rights Reserved.
Copyright 2014 Google Inc. All rights reserved.
Copyright 2015 Google Inc. All rights reserved.
Copyright 2015 Google Inc. All rights reserved.
Copyright 2019 Google Inc. All rights reserved.
Copyright (c) 2006, Google Inc.
Copyright 2012 Google Inc. All rights reserved.
Copyright 2005 Google Inc.
Copyright 2010 Google Inc. All Rights Reserved.
Copyright 2010, Google Inc.
Copyright 2005 Google Inc. All Rights Reserved.
Copyright (c) 2007-2010 Baptiste Lepilleur
Copyright 2017 Google Inc. All rights reserved.
Copyright 2015 Google Inc. All rights reserved.
Copyright 2018 Google Inc. All rights reserved.
Copyright 2009 Google Inc. All rights reserved.
Copyright 2009 Google Inc. All Rights Reserved.
Copyright 2007-2011 Baptiste Lepilleur Distributed under MIT license, or public domain if desired and recognized in your jurisdiction.
Copyright 2011 Baptiste Lepilleur Distributed under MIT license, or public domain if desired and recognized in your jurisdiction.
<Copyright>Copyright 2015, Google Inc.</Copyright>
Copyright 2019 Google LLC. All rights reserved.
Copyright 2016 Google Inc. All rights reserved.
Copyright 2005 Google Inc.
Copyright 2016 Google Inc. All rights reserved.
License: BSD 3-Clause License

akg

@ -1 +1 @@
Subproject commit 97dc7e96c2ffedf2e6e38310a903ffa205a6e656
Subproject commit 796260887e9c87964aad87ab8154211060870fec

View File

@ -26,7 +26,7 @@ usage()
echo " [-a on|off] [-p on|off] [-i] [-R] [-D on|off] [-j[n]] [-e gpu|ascend|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 10.1|11.1|310|910] [-I arm64|arm32|x86_64] [-K] \\"
echo " [-B on|off] [-E] [-l on|off] [-n full|lite|off] [-H on|off] \\"
echo " [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|off] \\"
echo " [-A on|off] [-S on|off] [-k on|off] [-W sse|neon|avx|avx512|off] \\"
echo " [-L Tensor-RT path] \\"
echo ""
echo "Options:"
@ -60,7 +60,7 @@ usage()
echo " -l Compile with python dependency, default on"
echo " -S Enable enable download cmake compile dependency from gitee , default off"
echo " -k Enable make clean, clean up compilation generated cache "
echo " -W Enable x86_64 SSE or AVX instruction set, use [sse|avx|neon|off], default off for lite and avx for CPU"
echo " -W Enable x86_64 SSE or AVX instruction set, use [sse|neon|avx|avx512|off], default off for lite and avx for CPU"
echo " -H Enable hidden"
echo " -L Link and specify Tensor-RT library path, default disable Tensor-RT lib linking"
}
@ -284,12 +284,12 @@ checkopts()
fi
;;
W)
if [[ "$OPTARG" != "sse" && "$OPTARG" != "off" && "$OPTARG" != "avx" && "$OPTARG" != "neon" ]]; then
echo "Invalid value ${OPTARG} for option -W, -W parameter must be sse|neon|avx|off"
if [[ "$OPTARG" != "sse" && "$OPTARG" != "off" && "$OPTARG" != "avx" && "$OPTARG" != "avx512" && "$OPTARG" != "neon" ]]; then
echo "Invalid value ${OPTARG} for option -W, -W parameter must be sse|neon|avx|avx512|off"
usage
exit 1
fi
if [[ "$OPTARG" == "sse" || "$OPTARG" == "avx" ]]; then
if [[ "$OPTARG" == "sse" || "$OPTARG" == "avx" || "$OPTARG" == "avx512" ]]; then
X86_64_SIMD="$OPTARG"
fi
if [[ "$OPTARG" == "neon" ]]; then
@ -588,6 +588,7 @@ build_lite()
COMPILE_MINDDATA_LITE="off"
CMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE}
CMAKE_TOOLCHAIN_NAME=${TOOLCHAIN_NAME}
CMAKE_BUILD_TYPE=${LITE_BUILD_TYPE}
else
CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake
ANDROID_NATIVE_API_LEVEL="19"

View File

@ -15,7 +15,7 @@ if(WIN32)
VER 0.1.92
LIBS sentencepiece sentencepiece_train
URL ${REQ_URL}
CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DSPM_USE_BUILTIN_PROTOBUF=ON
CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DSPM_USE_BUILTIN_PROTOBUF=ON -DSPM_ENABLE_SHARED=OFF
MD5 ${MD5}
)
else()

View File

@ -23,7 +23,9 @@ set(MINDSPORE_LITE_TRAIN_LIB_NAME libmindspore-lite-train)
set(BENCHMARK_TRAIN_NAME benchmark_train)
set(BENCHMARK_TRAIN_ROOT_DIR ${RUNTIME_PKG_NAME}/tools/benchmark_train)
# full mode will also package the files of lite_cv mode.
if(BUILD_MINDDATA STREQUAL "full")
# full header files
install(FILES
${TOP_DIR}/mindspore/ccsrc/minddata/dataset/include/dataset/constants.h
${TOP_DIR}/mindspore/ccsrc/minddata/dataset/include/dataset/data_helper.h
@ -65,6 +67,10 @@ if(BUILD_MINDDATA STREQUAL "full")
install(FILES ${TOP_DIR}/mindspore/lite/build/securec/src/libsecurec.a
DESTINATION ${SECUREC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
endif()
# lite_cv header files
install(DIRECTORY ${TOP_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image/lite_cv
DESTINATION ${MIND_DATA_INC_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
endif()
if(BUILD_MINDDATA STREQUAL "wrapper")
@ -210,8 +216,6 @@ if(PLATFORM_ARM64)
endif()
install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/core/ir/format.h DESTINATION ${RUNTIME_INC_DIR}/ir
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
__install_micro_wrapper()
@ -263,8 +267,6 @@ elseif(PLATFORM_ARM32)
endif()
install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/core/ir/format.h DESTINATION ${RUNTIME_INC_DIR}/ir
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
__install_micro_wrapper()
@ -274,7 +276,7 @@ elseif(PLATFORM_ARM32)
install(FILES ${TOP_DIR}/mindspore/lite/tools/providers/NNIE/Hi3516D/libnnie_proposal.so
DESTINATION ${RUNTIME_PKG_NAME}/providers/Hi3516D COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/lite/tools/providers/NNIE/Hi3516D/benchmark
DESTINATION ${RUNTIME_PKG_NAME}/providers/Hi3516D COMPONENT ${RUNTIME_COMPONENT_NAME})
DESTINATION ${BENCHMARK_ROOT_DIR} COMPONENT ${RUNTIME_COMPONENT_NAME})
else()
if(MSLITE_ENABLE_TOOLS)
install(TARGETS ${BENCHMARK_NAME} RUNTIME DESTINATION ${BENCHMARK_ROOT_DIR}
@ -310,6 +312,8 @@ elseif(WIN32)
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${TOP_DIR}/mindspore/core/utils/ DESTINATION ${CONVERTER_ROOT_DIR}/include/core/utils
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${CONVERTER_ROOT_DIR}/include/api
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
install(FILES ${TOP_DIR}/mindspore/ccsrc/backend/optimizer/common/pass.h
DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/model_parser.h
@ -366,8 +370,6 @@ elseif(WIN32)
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/core/ir/format.h DESTINATION ${RUNTIME_INC_DIR}/ir
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
install(FILES ${TOP_DIR}/build/mindspore/src/${MINDSPORE_LITE_LIB_NAME}.a DESTINATION ${RUNTIME_LIB_DIR}
@ -398,8 +400,6 @@ else()
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${RUNTIME_INC_DIR}/ir/dtype
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/core/ir/format.h DESTINATION ${RUNTIME_INC_DIR}/ir
COMPONENT ${RUNTIME_COMPONENT_NAME})
install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
install(FILES ${TOP_DIR}/mindspore/lite/build/src/${MINDSPORE_LITE_LIB_NAME}.so DESTINATION ${RUNTIME_LIB_DIR}
@ -427,6 +427,8 @@ else()
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${TOP_DIR}/mindspore/core/utils/ DESTINATION ${CONVERTER_ROOT_DIR}/include/core/utils
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${CONVERTER_ROOT_DIR}/include/api
COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
install(FILES ${TOP_DIR}/mindspore/ccsrc/backend/optimizer/common/pass.h
DESTINATION ${CONVERTER_ROOT_DIR}/include COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/lite/tools/converter/model_parser.h
@ -489,7 +491,7 @@ else()
endif()
if(NOT SUPPORT_TRAIN AND MSLITE_ENABLE_NNIE)
install(FILES ${TOP_DIR}/mindspore/lite/tools/providers/NNIE/converter.cfg
DESTINATION ${CONVERTER_ROOT_DIR}/ COMPONENT ${RUNTIME_COMPONENT_NAME})
DESTINATION ${CONVERTER_ROOT_DIR}/converter COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/lite/tools/providers/NNIE/Hi3516D/libmslite_nnie_converter.so
DESTINATION ${CONVERTER_ROOT_DIR}/providers/Hi3516D/ COMPONENT ${RUNTIME_COMPONENT_NAME})
install(FILES ${TOP_DIR}/mindspore/lite/tools/providers/NNIE/Hi3516D/libmslite_nnie_data_process.so

View File

@ -7,7 +7,7 @@ file(REMOVE_RECURSE ${CMAKE_SOURCE_DIR}/output)
file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/output)
# cpack variables
file(READ ${CMAKE_SOURCE_DIR}/MS_VERSION VERSION_NUMBER)
file(READ ${CMAKE_SOURCE_DIR}/version.txt VERSION_NUMBER)
string(TOLOWER linux_${CMAKE_HOST_SYSTEM_PROCESSOR} PLATFORM_NAME)
set(CPACK_PACKAGE_FILE_NAME mindspore_ascend-${VERSION_NUMBER}-${PLATFORM_NAME})
set(CPACK_GENERATOR "TGZ")

View File

@ -3,5 +3,6 @@ approvers:
- hangangqiang
- xu-yfei
- wilfchen
- zhang_xue_tong
reviewers:
- lx0095

View File

@ -14,11 +14,11 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_INCLUDE_ALLOCATOR_H_
#define MINDSPORE_LITE_INCLUDE_ALLOCATOR_H_
#ifndef MINDSPORE_INCLUDE_API_ALLOCATOR_H
#define MINDSPORE_INCLUDE_API_ALLOCATOR_H
#include <memory>
#include "include/lite_utils.h"
#include "include/api/types.h"
namespace mindspore {
/// \brief Allocator defined a memory pool for malloc memory and free memory dynamically.
@ -85,4 +85,4 @@ class MS_API Allocator {
size_t aligned_size_ = 32;
};
} // namespace mindspore
#endif // MINDSPORE_LITE_INCLUDE_ALLOCATOR_H_
#endif // MINDSPORE_INCLUDE_API_ALLOCATOR_H

View File

@ -0,0 +1,100 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_CALLBACK_CALLBACK_H
#define MINDSPORE_INCLUDE_API_CALLBACK_CALLBACK_H
#include <cstddef>
#include <string>
#include <vector>
#include <memory>
#include "include/api/data_type.h"
#include "include/api/dual_abi_helper.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
#else
#define MS_API __attribute__((visibility("default")))
#endif
namespace mindspore {
class Model;
class ModelImpl;
class CallbackImpl;
struct TrainCallBackData {
TrainCallBackData(bool train_mode, int epoch, int step, Model *model): train_mode_(train_mode), epoch_(epoch),
step_(step), model_(model) {}
bool train_mode_; /**< training mode of LiteSession object */
unsigned int epoch_; /**< the current training epoch (starts at 0) */
unsigned int step_ = 0; /**< the current step within the epoch */
Model *model_; /**< pointer to the Model object */
};
enum CallbackRetValue : uint32_t {
kContinue = 0,
kStopTraining = 1,
kExit = 2,
kUnknownRetValue = 0xFFFFFFFF
};
class TrainCallBack {
public:
virtual ~TrainCallBack() = default;
/// \brief This method is called once before the network executing
///
/// \param[in] cb_data info about current execution
virtual void Begin(const TrainCallBackData &cb_data) {}
/// \brief This method is called once following the network execution
///
/// \param[in] cb_data info about current execution
virtual void End(const TrainCallBackData &cb_data) {}
/// \brief This method is called at the beginning of each epoch
///
/// \param[in] cb_data info about current execution
virtual void EpochBegin(const TrainCallBackData &cb_data) {}
/// \brief This method is called after the run of each epoch
///
/// \param[in] cb_data info about current execution
///
/// \return indication if to continue in the train loop:
/// RET_CONTINUE -- continue training
/// RET_STOP_TRAINING -- stop training (e.g., due to achieved accuracy)
/// RET_EXIT -- Exit training (due to error of some sort)
virtual CallbackRetValue EpochEnd(const TrainCallBackData &cb_data) { return kContinue; }
/// \brief This method is called at the beginning of each step
///
/// \param[in] cb_data info about current execution
virtual void StepBegin(const TrainCallBackData &cb_data) {}
/// \brief This method is called after each step is run
///
/// \param[in] cb_data info about current execution
virtual void StepEnd(const TrainCallBackData &cb_data) {}
protected:
friend class Model;
friend class ModelImpl;
CallbackImpl* callback_impl_ = nullptr;
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_CALLBACK_CALLBACK_H

View File

@ -0,0 +1,39 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_CALLBACK_CKPT_SAVER_H
#define MINDSPORE_INCLUDE_API_CALLBACK_CKPT_SAVER_H
#include <cstddef>
#include <string>
#include <vector>
#include <memory>
#include "include/api/callback/callback.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
#else
#define MS_API __attribute__((visibility("default")))
#endif
namespace mindspore {
class CkptSaver: public TrainCallBack {
public:
explicit CkptSaver(int save_every_n, const std::string &filename_prefix);
virtual ~CkptSaver();
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_CALLBACK_CKPT_SAVER_H

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_CALLBACK_LOSS_MONITOR_H
#define MINDSPORE_INCLUDE_API_CALLBACK_LOSS_MONITOR_H
#include <cstddef>
#include <vector>
#include <utility>
#include "include/api/callback/callback.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
#else
#define MS_API __attribute__((visibility("default")))
#endif
using GraphPoint = std::pair<int, float>;
namespace mindspore {
class LossMonitor: public TrainCallBack {
public:
explicit LossMonitor(int print_every_n_steps = INT_MAX);
virtual ~LossMonitor();
const std::vector<GraphPoint> &GetLossPoints();
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_CALLBACK_LOSS_MONITOR_H

View File

@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_CALLBACK_LR_SCHEDULER_H
#define MINDSPORE_INCLUDE_API_CALLBACK_LR_SCHEDULER_H
#include <cstddef>
#include <string>
#include <vector>
#include <memory>
#include "include/api/callback/callback.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
#else
#define MS_API __attribute__((visibility("default")))
#endif
namespace mindspore {
constexpr int DONT_UPDATE_LR = 0;
constexpr int UPDATE_LR = 1;
using LR_Lambda = std::function<int(float *lr, int epoch, void *cb_data)>;
/// \brief Multiply the LR by a factor of gamma every epoch
int MultiplicativeLRLambda(float *lr, int epoch, void *multiplication);
/// \brief Multiply the LR by a factor of gamma every step_size
int StepLRLambda(float *lr, int epoch, void *step_size);
struct StepLRLambda {
StepLRLambda(int step, float g) : step_size(step), gamma(g) {}
int step_size; // period of LR decay
float gamma; // LR decay factor
};
class LRScheduler: public TrainCallBack {
public:
explicit LRScheduler(LR_Lambda lambda_func, void *lr_cb_data = nullptr, int step = 1);
virtual ~LRScheduler();
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_CALLBACK_LR_SCHEDULER_H

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_CALLBACK_TIME_MONITOR_H
#define MINDSPORE_INCLUDE_API_CALLBACK_TIME_MONITOR_H
#include <cstddef>
#include <string>
#include <vector>
#include <memory>
#include "include/api/callback/callback.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
#else
#define MS_API __attribute__((visibility("default")))
#endif
namespace mindspore {
class TimeMonitor: public TrainCallBack {
public:
virtual ~TimeMonitor() = default;
void EpochBegin(const TrainCallBackData &cb_data) override;
CallbackRetValue EpochEnd(const TrainCallBackData &cb_data) override;
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_CALLBACK_TIME_MONITOR_H

View File

@ -0,0 +1,47 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_CALLBACK_TRAIN_ACCURACY_H
#define MINDSPORE_INCLUDE_API_CALLBACK_TRAIN_ACCURACY_H
#include <cstddef>
#include <string>
#include <vector>
#include <memory>
#include <utility>
#include "include/api/callback/callback.h"
#include "include/api/metrics/accuracy.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
#else
#define MS_API __attribute__((visibility("default")))
#endif
using GraphPoint = std::pair<int, float>;
namespace mindspore {
class TrainAccuracy: public TrainCallBack {
public:
explicit TrainAccuracy(int print_every_n = INT_MAX,
int accuracy_metrics = METRICS_CLASSIFICATION,
const std::vector<int> &input_indexes = {1},
const std::vector<int> &output_indexes = {0});
virtual ~TrainAccuracy();
const std::vector<GraphPoint> &GetAccuracyPoints();
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_CALLBACK_TRAIN_ACCURACY_H

include/api/cfg.h Normal file
View File

@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_CFG_H
#define MINDSPORE_INCLUDE_API_CFG_H
#include <cstddef>
#include <string>
#include <vector>
#include <memory>
#include "include/api/data_type.h"
#include "include/api/dual_abi_helper.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
#else
#define MS_API __attribute__((visibility("default")))
#endif
namespace mindspore {
class MixPrecisionCfg {
public:
MixPrecisionCfg() {
this->dynamic_loss_scale_ = false;
this->loss_scale_ = 128.0f;
this->num_of_not_nan_iter_th_ = 1000;
}
bool dynamic_loss_scale_ = false; /**< Enable\disable dynamic loss scale during mix precision training */
float loss_scale_; /**< Initial loss scale factor */
uint32_t num_of_not_nan_iter_th_; /**< a threshold for modifying loss scale when dynamic loss scale is enabled */
};
class TrainCfg {
public:
TrainCfg() { this->loss_name_ = "_loss_fn"; }
OptimizationLevel optimization_level_ = kO0;
std::string loss_name_; /**< Set part of the name that identify a loss kernel */
MixPrecisionCfg mix_precision_cfg_; /**< Mix precision configuration */
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_CFG_H

View File

@ -26,8 +26,7 @@
namespace mindspore {
enum DeviceType {
kCPU = 0,
kMaliGPU,
kNvidiaGPU,
kGPU,
kKirinNPU,
kAscend910,
kAscend310,
@ -36,6 +35,7 @@ enum DeviceType {
};
class Allocator;
class Delegate;
class DeviceInfoContext;
class MS_API Context {
@ -46,8 +46,19 @@ class MS_API Context {
void SetThreadNum(int32_t thread_num);
int32_t GetThreadNum() const;
void SetAllocator(const std::shared_ptr<Allocator> &allocator);
std::shared_ptr<Allocator> GetAllocator() const;
/// \brief Set the thread affinity to CPU cores.
///
/// \param mode: 0: no affinities, 1: big cores first, 2: little cores first
void SetThreadAffinity(int mode);
int GetThreadAffinityMode() const;
void SetThreadAffinity(const std::vector<int> &core_list);
std::vector<int32_t> GetThreadAffinityCoreList() const;
void SetEnableParallel(bool is_parallel);
bool GetEnableParallel() const;
void SetDelegate(const std::shared_ptr<Delegate> &delegate);
std::shared_ptr<Delegate> GetDelegate() const;
std::vector<std::shared_ptr<DeviceInfoContext>> &MutableDeviceInfo();
@ -91,19 +102,6 @@ class MS_API CPUDeviceInfo : public DeviceInfoContext {
public:
enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; };
/// \brief Set the thread affinity to CPU cores.
///
/// \param mode: 0: no affinities, 1: big cores first, 2: little cores first
void SetThreadAffinity(int mode);
int GetThreadAffinity() const;
void SetEnableFP16(bool is_fp16);
bool GetEnableFP16() const;
};
class MS_API MaliGPUDeviceInfo : public DeviceInfoContext {
public:
enum DeviceType GetDeviceType() const override { return DeviceType::kMaliGPU; };
void SetEnableFP16(bool is_fp16);
bool GetEnableFP16() const;
};
@ -116,9 +114,9 @@ class MS_API KirinNPUDeviceInfo : public DeviceInfoContext {
int GetFrequency() const;
};
class MS_API NvidiaGPUDeviceInfo : public DeviceInfoContext {
class MS_API GPUDeviceInfo : public DeviceInfoContext {
public:
enum DeviceType GetDeviceType() const override { return DeviceType::kNvidiaGPU; };
enum DeviceType GetDeviceType() const override { return DeviceType::kGPU; };
void SetDeviceID(uint32_t device_id);
uint32_t GetDeviceID() const;
@ -129,15 +127,18 @@ class MS_API NvidiaGPUDeviceInfo : public DeviceInfoContext {
inline void SetPrecisionMode(const std::string &precison_mode);
inline std::string GetPrecisionMode() const;
void SetEnableFP16(bool is_fp16);
bool GetEnableFP16() const;
private:
void SetPrecisionMode(const std::vector<char> &precision_mode);
std::vector<char> GetPrecisionModeChar() const;
};
void NvidiaGPUDeviceInfo::SetPrecisionMode(const std::string &precision_mode) {
void GPUDeviceInfo::SetPrecisionMode(const std::string &precision_mode) {
SetPrecisionMode(StringToChar(precision_mode));
}
std::string NvidiaGPUDeviceInfo::GetPrecisionMode() const { return CharToString(GetPrecisionModeChar()); }
std::string GPUDeviceInfo::GetPrecisionMode() const { return CharToString(GetPrecisionModeChar()); }
class MS_API Ascend910DeviceInfo : public DeviceInfoContext {
public:

View File

@ -23,6 +23,7 @@ enum class DataType : int {
kObjectTypeList = 13,
kObjectTypeTuple = 14,
kObjectTypeTensorType = 17,
kNumberTypeBegin = 29,
kNumberTypeBool = 30,
kNumberTypeInt8 = 32,
kNumberTypeInt16 = 33,

View File

@ -14,24 +14,30 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_DELEGATE_DELEGATE_H_
#define MINDSPORE_LITE_DELEGATE_DELEGATE_H_
#ifndef MINDSPORE_INCLUDE_API_DELEGATE_H
#define MINDSPORE_INCLUDE_API_DELEGATE_H
#include <map>
#include <vector>
#include <memory>
#include "include/ms_tensor.h"
#include "include/context.h"
#include "include/kernel.h"
#include "schema/model_generated.h"
#include "include/api/kernel.h"
namespace mindspore {
typedef enum {
SCHEMA_INVALID = -1, /**< invalid version */
SCHEMA_CUR, /**< current version for ms model defined in model.fbs*/
SCHEMA_V0, /**< previous version for ms model defined in model_v0.fbs*/
} SchemaVersion;
using KernelIter = std::vector<kernel::Kernel *>::iterator;
class DelegateModel {
class MS_API DelegateModel {
public:
/// \brief Constructor of MindSpore Lite DelegateModel.
DelegateModel(std::vector<kernel::Kernel *> *kernels,
const std::map<kernel::Kernel *, const schema::Primitive *> primitives)
: kernels_(kernels), primitives_(primitives) {}
DelegateModel(std::vector<kernel::Kernel *> *kernels, const std::vector<MSTensor> &inputs,
const std::vector<MSTensor> &outputs,
const std::map<kernel::Kernel *, const schema::Primitive *> &primitives, SchemaVersion version)
: kernels_(kernels), inputs_(inputs), outputs_(outputs), primitives_(primitives), version_(version) {}
/// \brief Destructor of MindSpore Lite DelegateModel.
~DelegateModel() = default;
@ -61,14 +67,30 @@ class DelegateModel {
/// \return The next iterator after graph_kernel, point to the next kernel that is not visited.
KernelIter Replace(KernelIter from, KernelIter end, kernel::Kernel *graph_kernel);
/// \brief Get the input tensors of DelegateModel.
///
/// \return The input tensor vector of DelegateModel.
const std::vector<mindspore::MSTensor> &inputs() { return this->inputs_; }
/// \brief Get the output tensors of DelegateModel.
///
/// \return The output tensor vector of DelegateModel.
const std::vector<mindspore::MSTensor> &outputs() { return this->outputs_; }
/// \brief Get the ms model version.
///
/// \return The schema version for the primitives map.
const SchemaVersion GetVersion() { return version_; }
protected:
std::vector<kernel::Kernel *> *kernels_;
const std::map<kernel::Kernel *, const schema::Primitive *> primitives_;
const std::vector<mindspore::MSTensor> &inputs_;
const std::vector<mindspore::MSTensor> &outputs_;
const std::map<kernel::Kernel *, const schema::Primitive *> &primitives_;
SchemaVersion version_;
};
typedef void (*DelegateHook)(std::shared_ptr<Delegate> delegate);
static void HookNullFuc(std::shared_ptr<Delegate> delegate) {}
class Delegate {
class MS_API Delegate {
public:
/// \brief Constructor of MindSpore Lite Delegate.
Delegate() = default;
@ -87,10 +109,6 @@ class Delegate {
///
/// \param[in] model Define the delegate model to be built.
virtual int Build(DelegateModel *model) = 0;
DelegateHook init_hook_ = HookNullFuc;
DelegateHook build_hook_ = HookNullFuc;
DelegateHook run_hook_ = HookNullFuc;
};
} // namespace mindspore
#endif // MINDSPORE_LITE_DELEGATE_DELEGATE_H_
#endif // MINDSPORE_INCLUDE_API_DELEGATE_H

View File

@ -20,7 +20,6 @@
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <set>
#include <unordered_map>
@ -32,18 +31,6 @@ inline std::vector<char> StringToChar(const std::string &s) { return std::vector
inline std::string CharToString(const std::vector<char> &c) { return std::string(c.begin(), c.end()); }
inline std::optional<std::vector<char>> OptionalStringToChar(const std::optional<std::string> &s) {
if (s == std::nullopt) return std::nullopt;
std::optional<std::vector<char>> ret = std::vector<char>(s->begin(), s->end());
return ret;
}
inline std::optional<std::string> OptionalCharToString(const std::optional<std::vector<char>> &c) {
if (c == std::nullopt) return std::nullopt;
std::optional<std::string> ret = std::string(c->begin(), c->end());
return ret;
}
inline std::pair<std::vector<char>, int32_t> PairStringToChar(const std::pair<std::string, int32_t> &s) {
return std::pair<std::vector<char>, int32_t>(std::vector<char>(s.first.begin(), s.first.end()), s.second);
}

View File

@ -15,9 +15,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CORE_IR_FORMAT_H_
#define MINDSPORE_CORE_IR_FORMAT_H_
#ifndef MINDSPORE_INCLUDE_API_FORMAT_H_
#define MINDSPORE_INCLUDE_API_FORMAT_H_
#include <cstdint>
@ -43,4 +42,4 @@ enum Format : int64_t {
NCW = 17
};
} // namespace mindspore
#endif // MINDSPORE_CORE_IR_FORMAT_H_
#endif // MINDSPORE_INCLUDE_API_FORMAT_H_

View File

@ -14,22 +14,22 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_KERNEL_H_
#define MINDSPORE_LITE_SRC_KERNEL_H_
#ifndef MINDSPORE_INCLUDE_API_KERNEL_H
#define MINDSPORE_INCLUDE_API_KERNEL_H
#include <vector>
#include <string>
#include <utility>
#include "schema/model_generated.h"
#include "include/lite_utils.h"
#include "include/context.h"
#include "include/api/types.h"
#include "include/api/context.h"
namespace mindspore::kernel {
class Kernel {
public:
Kernel() = default;
Kernel(const std::vector<tensor::MSTensor *> &inputs, const std::vector<tensor::MSTensor *> &outputs,
const schema::Primitive *primitive, const lite::Context *ctx)
Kernel(const std::vector<mindspore::MSTensor> &inputs, const std::vector<mindspore::MSTensor> &outputs,
const schema::Primitive *primitive, const mindspore::Context *ctx)
: inputs_(std::move(inputs)), outputs_(std::move(outputs)), primitive_(primitive), context_(ctx) {
if (primitive != nullptr) {
type_ = primitive->value_type();
@ -46,33 +46,34 @@ class Kernel {
virtual schema::PrimitiveType type() const { return type_; }
virtual void set_inputs(const std::vector<mindspore::tensor::MSTensor *> &in_tensors) { this->inputs_ = in_tensors; }
virtual void set_input(mindspore::tensor::MSTensor *in_tensor, int index) { this->inputs_[index] = in_tensor; }
virtual void set_inputs(const std::vector<mindspore::MSTensor> &in_tensors) { this->inputs_ = in_tensors; }
virtual void set_outputs(const std::vector<mindspore::tensor::MSTensor *> &out_tensors) {
this->outputs_ = out_tensors;
}
virtual void set_input(mindspore::MSTensor in_tensor, int index) { this->inputs_[index] = in_tensor; }
virtual void set_output(mindspore::tensor::MSTensor *out_tensor, int index) { this->outputs_[index] = out_tensor; }
virtual void set_outputs(const std::vector<mindspore::MSTensor> &out_tensors) { this->outputs_ = out_tensors; }
virtual const std::vector<mindspore::tensor::MSTensor *> &inputs() { return this->inputs_; }
virtual void set_output(mindspore::MSTensor out_tensor, int index) { this->outputs_[index] = out_tensor; }
virtual const std::vector<mindspore::tensor::MSTensor *> &outputs() { return this->outputs_; }
virtual const std::vector<mindspore::MSTensor> &inputs() { return this->inputs_; }
virtual const std::vector<mindspore::MSTensor> &outputs() { return this->outputs_; }
std::string name() const { return this->name_; }
void set_name(const std::string &name) { this->name_ = name; }
const lite::Context *context() const { return this->context_; }
const mindspore::Context *context() const { return this->context_; }
const schema::Primitive *primitive() const { return this->primitive_; }
protected:
std::vector<mindspore::tensor::MSTensor *> inputs_;
std::vector<mindspore::tensor::MSTensor *> outputs_;
std::vector<mindspore::MSTensor> inputs_;
std::vector<mindspore::MSTensor> outputs_;
schema::PrimitiveType type_ = schema::PrimitiveType_NONE;
std::string name_;
const schema::Primitive *primitive_ = nullptr;
const lite::Context *context_ = nullptr;
const mindspore::Context *context_ = nullptr;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_KERNEL_H_
#endif // MINDSPORE_INCLUDE_API_KERNEL_H
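This hunk migrates the custom-kernel API from tensor::MSTensor pointers to mindspore::MSTensor values and from lite::Context to mindspore::Context. A minimal sketch of code written against the new surface, using only the members visible above; the include path is assumed from the new include guard, and MSTensor copies are presumably cheap handles given the shared Impl seen in types.h later in this diff:

    #include <iostream>
    #include "include/api/kernel.h"  // path assumed from MINDSPORE_INCLUDE_API_KERNEL_H
    // Inspect a custom kernel through the migrated, value-based accessors.
    void DescribeKernel(mindspore::kernel::Kernel *k) {
      std::cout << k->name() << ": "
                << k->inputs().size() << " inputs, "   // std::vector<MSTensor> held by value now
                << k->outputs().size() << " outputs\n";
    }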

View File

@ -0,0 +1,36 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_METRICS_ACCURACY_H
#define MINDSPORE_INCLUDE_API_METRICS_ACCURACY_H
#include <vector>
#include "include/api/metrics/metrics.h"
namespace mindspore {
constexpr int METRICS_CLASSIFICATION = 0;
constexpr int METRICS_MULTILABEL = 1;
class AccuracyMetrics : public Metrics {
public:
explicit AccuracyMetrics(int accuracy_metrics = METRICS_CLASSIFICATION, const std::vector<int> &input_indexes = {1},
const std::vector<int> &output_indexes = {0});
virtual ~AccuracyMetrics();
void Clear() override;
float Eval() override;
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_METRICS_ACCURACY_H

View File

@ -0,0 +1,40 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_INCLUDE_API_METRICS_METRICS_H
#define MINDSPORE_INCLUDE_API_METRICS_METRICS_H
#include <vector>
#include "include/api/model.h"
namespace mindspore {
class MetricsImpl;
class ModelImpl;
class MSTensor;
class Metrics {
public:
virtual ~Metrics() = default;
virtual void Clear() {}
virtual float Eval() { return 0.0; }
virtual void Update(std::vector<MSTensor *> inputs, std::vector<MSTensor *> outputs) {}
protected:
friend class Model;
friend class ModelImpl;
MetricsImpl* metrics_impl_;
};
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_METRICS_METRICS_H

View File

@ -25,11 +25,18 @@
#include "include/api/types.h"
#include "include/api/graph.h"
#include "include/api/context.h"
#include "include/api/callback/callback.h"
#include "include/api/cell.h"
#include "include/api/cfg.h"
#include "include/api/dual_abi_helper.h"
namespace mindspore {
class ModelImpl;
class Metrics;
namespace dataset {
class Dataset;
} // namespace dataset
class MS_API Model {
public:
@ -38,14 +45,19 @@ class MS_API Model {
Model(const Model &) = delete;
void operator=(const Model &) = delete;
Status Build(GraphCell graph, const std::shared_ptr<Context> &model_context = nullptr);
Status Build(GraphCell graph, const std::shared_ptr<Context> &model_context = nullptr,
const std::shared_ptr<TrainCfg> &train_cfg = nullptr);
Status Resize(const std::vector<MSTensor> &inputs, const std::vector<std::vector<int64_t>> &dims);
Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs);
Status Predict(const std::vector<MSTensor> &inputs, std::vector<MSTensor> *outputs,
const MSKernelCallBack &before = nullptr, const MSKernelCallBack &after = nullptr);
std::vector<MSTensor> GetInputs();
inline MSTensor GetInputByTensorName(const std::string &tensor_name);
Status InitMetrics(std::vector<Metrics *> metrics);
std::vector<Metrics *> GetMetrics();
std::vector<MSTensor> GetOutputs();
inline std::vector<std::string> GetOutputTensorNames();
inline MSTensor GetOutputByTensorName(const std::string &tensor_name);
@ -53,7 +65,19 @@ class MS_API Model {
static bool CheckModelSupport(enum DeviceType device_type, ModelType model_type);
Status SetTrainMode(bool train);
bool GetTrainMode() const;
Status Train(int epochs, std::shared_ptr<dataset::Dataset> ds, std::vector<TrainCallBack *> cbs);
Status Evaluate(std::shared_ptr<dataset::Dataset> ds, std::vector<TrainCallBack *> cbs);
Status Build(const void *model_data, size_t data_size, ModelType model_type,
const std::shared_ptr<Context> &model_context = nullptr, const Key &dec_key = {},
const std::string &dec_mode = kDecModeAesGcm);
Status Build(const std::string &model_path, ModelType model_type,
const std::shared_ptr<Context> &model_context = nullptr, const Key &dec_key = {},
const std::string &dec_mode = kDecModeAesGcm);
private:
friend class Serialization;
// api without std::string
MSTensor GetInputByTensorName(const std::vector<char> &tensor_name);
std::vector<std::vector<char>> GetOutputTensorNamesChar();
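This hunk grows Model from a pure inference class into a train/eval entry point: Build gains a TrainCfg, Predict gains per-node callbacks, and InitMetrics/Train/Evaluate tie in Metrics and dataset::Dataset. A hedged sketch of how the pieces compose, assuming only the declarations above plus the AccuracyMetrics header added earlier in this diff; kSuccess is assumed from the Status API, and graph/dataset construction is elided because it is not part of this change:

    #include "include/api/model.h"
    #include "include/api/metrics/accuracy.h"
    using namespace mindspore;
    // Train one epoch, then evaluate with an accuracy metric (sketch, not a full program).
    Status TrainAndScore(GraphCell graph, std::shared_ptr<dataset::Dataset> train_ds,
                         std::shared_ptr<dataset::Dataset> eval_ds, float *accuracy) {
      Model model;
      AccuracyMetrics acc;  // defaults: METRICS_CLASSIFICATION, input index {1}, output index {0}
      Status ret = model.Build(graph, std::make_shared<Context>(), std::make_shared<TrainCfg>());
      if (ret != kSuccess) return ret;    // kSuccess assumed from status.h
      model.InitMetrics({&acc});          // Model::InitMetrics(std::vector<Metrics *>)
      model.Train(1, train_ds, {});       // callbacks omitted
      model.Evaluate(eval_ds, {});
      *accuracy = acc.Eval();
      return kSuccess;
    }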

View File

@ -27,27 +27,18 @@
#include "include/api/dual_abi_helper.h"
namespace mindspore {
using Key = struct Key {
const size_t max_key_len = 32;
size_t len;
unsigned char key[32];
Key() : len(0) {}
};
class MS_API Serialization {
public:
static Status Load(const void *model_data, size_t data_size, ModelType model_type, Graph *graph);
inline static Status Load(const void *model_data, size_t data_size, ModelType model_type, Graph *graph,
const Key &dec_key, const std::string &dec_mode);
inline static Status Load(const std::string &file, ModelType model_type, Graph *graph);
inline static Status Load(const std::string &file, ModelType model_type, Graph *graph, const Key &dec_key,
const std::string &dec_mode);
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
inline static Status Load(const std::string &file, ModelType model_type, Graph *graph, const Key &dec_key = {},
const std::string &dec_mode = kDecModeAesGcm);
inline static Status Load(const std::vector<std::string> &files, ModelType model_type, std::vector<Graph> *graphs,
const Key &dec_key = {}, const std::string &dec_mode = "AES-GCM");
static Status LoadCheckPoint(const std::string &ckpt_file, std::map<std::string, Buffer> *parameters);
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
static Status SetParameters(const std::map<std::string, Buffer> &parameters, Model *model);
static Status ExportModel(const Model &model, ModelType model_type, Buffer *model_data);
static Status ExportModel(const Model &model, ModelType model_type, const std::string &model_file);
static Status ExportModel(const Model &model, ModelType model_type, const std::string &model_file,
QuantizationType quantization_type = kNoQuant, bool export_inference_only = true);
private:
static Status Load(const void *model_data, size_t data_size, ModelType model_type, Graph *graph, const Key &dec_key,
@ -64,10 +55,6 @@ Status Serialization::Load(const void *model_data, size_t data_size, ModelType m
return Load(model_data, data_size, model_type, graph, dec_key, StringToChar(dec_mode));
}
Status Serialization::Load(const std::string &file, ModelType model_type, Graph *graph) {
return Load(StringToChar(file), model_type, graph);
}
Status Serialization::Load(const std::string &file, ModelType model_type, Graph *graph, const Key &dec_key,
const std::string &dec_mode) {
return Load(StringToChar(file), model_type, graph, dec_key, StringToChar(dec_mode));
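The Key struct moves out of this header (it reappears in types.h below) and the non-decrypting Load overloads collapse into the decrypting ones via default arguments, so plain and encrypted models now share one entry point. A minimal sketch, assuming kMindIR as the ModelType value and kSuccess from the Status API, and assuming an empty (len == 0) Key means no decryption:

    #include "include/api/serialization.h"
    // Load a plain model through the defaulted overload, then an encrypted one explicitly.
    mindspore::Status LoadBoth(mindspore::Graph *plain, mindspore::Graph *encrypted) {
      mindspore::Status ret =
          mindspore::Serialization::Load("model.mindir", mindspore::kMindIR, plain);
      if (ret != mindspore::kSuccess) return ret;
      mindspore::Key key("0123456789abcdef", 16);  // illustrative bytes only
      return mindspore::Serialization::Load("model_enc.mindir", mindspore::kMindIR,
                                            encrypted, key, mindspore::kDecModeAesGcm);
    }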

View File

@ -78,6 +78,7 @@ enum StatusCode : uint32_t {
kLiteMemoryFailed = kLite | (0x0FFFFFFF & -6), /**< Fail to create memory. */
kLiteNotSupport = kLite | (0x0FFFFFFF & -7), /**< Fail to support. */
kLiteThreadPoolError = kLite | (0x0FFFFFFF & -8), /**< Error occur in thread pool. */
kLiteUninitializedObj = kLite | (0x0FFFFFFF & -9), /**< Object is not initialized. */
// Executor error code, range: [-100,-200)
kLiteOutOfTensorRange = kLite | (0x0FFFFFFF & -100), /**< Failed to check range. */

View File

@ -20,8 +20,10 @@
#include <string>
#include <vector>
#include <memory>
#include <functional>
#include "include/api/data_type.h"
#include "include/api/dual_abi_helper.h"
#include "include/api/format.h"
#ifdef _WIN32
#define MS_API __declspec(dllexport)
@ -35,10 +37,27 @@ enum ModelType : uint32_t {
kAIR = 1,
kOM = 2,
kONNX = 3,
kFlatBuffer = 4,
// insert new data type here
kUnknownType = 0xFFFFFFFF
};
enum QuantizationType : uint32_t {
kNoQuant = 0,
kWeightQuant = 1,
kFullQuant = 2,
kUnknownQuantType = 0xFFFFFFFF
};
enum OptimizationLevel : uint32_t {
kO0 = 0, // Do not change
kO2 = 2, // Cast network to float16, keep batchnorm and loss in float32,
kO3 = 3,  // Cast network to float16, including batchnorm
kAuto = 4, // Choose optimization based on device
kOptimizationType = 0xFFFFFFFF
};
class Allocator;
class MS_API MSTensor {
public:
class Impl;
@ -74,6 +93,17 @@ class MS_API MSTensor {
MSTensor *Clone() const;
bool operator==(std::nullptr_t) const;
bool operator!=(std::nullptr_t) const;
bool operator==(const MSTensor &tensor) const;
void SetShape(const std::vector<int64_t> &shape);
void SetDataType(enum DataType data_type);
void SetTensorName(const std::string &name);
void SetAllocator(std::shared_ptr<Allocator> allocator);
std::shared_ptr<Allocator> allocator() const;
void SetFormat(mindspore::Format format);
mindspore::Format format() const;
void SetData(void *data);
const std::shared_ptr<Impl> impl() const { return impl_; }
private:
// api without std::string
@ -142,5 +172,29 @@ MSTensor::MSTensor(const std::string &name, enum DataType type, const std::vecto
: MSTensor(StringToChar(name), type, shape, data, data_len) {}
std::string MSTensor::Name() const { return CharToString(CharName()); }
struct MS_API Key {
const size_t max_key_len = 32;
size_t len;
unsigned char key[32];
Key() : len(0) {}
explicit Key(const char *dec_key, size_t key_len);
};
constexpr char kDecModeAesGcm[] = "AES-GCM";
/// \brief CallBackParam defines the input arguments for the callback function.
struct MSCallBackParam {
std::string node_name_; /**< node name argument */
std::string node_type_; /**< node type argument */
};
/// \brief KernelCallBack defines the function pointer for the callback.
using MSKernelCallBack = std::function<bool(const std::vector<MSTensor> &inputs, const std::vector<MSTensor> &outputs,
const MSCallBackParam &opInfo)>;
std::vector<char> CharVersion();
inline std::string Version() { return CharToString(CharVersion()); }
} // namespace mindspore
#endif // MINDSPORE_INCLUDE_API_TYPES_H
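MSKernelCallBack is the hook type taken by the new Predict(inputs, outputs, before, after) overload in model.h above. A small sketch of a tracing callback; the convention that returning false aborts execution is an assumption, not something this hunk states:

    #include <iostream>
    #include <vector>
    #include "include/api/types.h"
    // Log every node as it is about to run.
    mindspore::MSKernelCallBack before_hook =
        [](const std::vector<mindspore::MSTensor> &inputs,
           const std::vector<mindspore::MSTensor> &outputs,
           const mindspore::MSCallBackParam &info) {
          std::cout << info.node_type_ << " " << info.node_name_ << " ("
                    << inputs.size() << " in, " << outputs.size() << " out)\n";
          return true;  // false is assumed to stop the run
        };
    // Usage: model.Predict(in_tensors, &out_tensors, before_hook, nullptr);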

View File

@ -487,7 +487,7 @@ class Validator:
addition_error_info = ''
type_str = (type(type_).__name__ if isinstance(type_, (tuple, list)) else "") + str(type_)
raise TypeError(f'For \'{prim_name}\', the type of `{arg_name}` should be subclass'
f' of {", ".join((str(x) for x in template_types))}, but got {type_str}.'
f' of {", ".join((str(x) for x in template_types))}, but got {type_str}'
f' {addition_error_info}. This message is only for reference. The supported data types'
f' depend on the hardware that executes the operator'
f' and it is a subset of the data types above.')

View File

@ -29,6 +29,7 @@ from mindspore import log
from .tbe_common import check_kernel_info, TBEException
from .helper import _op_select_format, _check_supported
# tune type
NO_TUNE = "NO_TUNE"
GA_TUNE = "GA"
@ -355,7 +356,10 @@ class TbeProcess:
log.error("Auto tune init failed, place check your hardware config or go back to normal compile!")
self.tune_init = False
return error_id
self.__reset_op_info = self.get_reset_op_info()
self.__tuner.tune_init = True
json_info["reset_op_info"] = self.__reset_op_info
op_json = json.dumps(json_info)
self.__all_tune_tasks.append(task_id)
self.__running_tune_tasks.append(task_id)

View File

@ -18,8 +18,8 @@ import datetime
import json
import sys
import traceback
from tbe.common.rl_bank.bank_manager import set_current_op_name
from te.platform.cce_conf import te_set_version
from te_fusion.fusion_manager import set_current_op_name
from te_fusion.fusion_util import fusion_op, dump_fusion_json
from te_fusion.parallel_compilation import init_multi_process_env, get_finished_compilation_task, \
deinit_multi_process_env, start_ga_multi_process
@ -331,13 +331,14 @@ class TbeTuner:
raise ValueError("Json string Errors, key:fusion_op not found.")
kernel_name = json_info["fusion_op"]["fusion_op_name"]
full_name = json_info["fusion_op"]["full_name"]
reset_op_info = json_info["reset_op_info"]
set_current_op_name(kernel_name)
converted_json = fusion_to_fusion(json.dumps(json_info), tune_mode="RL")
job_type = RL_COMPILE
base_kernel = './kernel_meta/' + kernel_name + '.o'
compile_info = None
try:
fusion_op(converted_json)
fusion_op(converted_json, reset_op_info=reset_op_info)
# pylint: disable=broad-except
except Exception:
exc_type, exc_value, _ = sys.exc_info()

View File

@ -224,14 +224,9 @@ set(SUB_COMP
frontend/operator
pipeline/jit
pipeline/pynative
common debug pybind_api utils vm profiler ps
common debug pybind_api utils vm profiler ps fl
)
if(ENABLE_CPU AND NOT WIN32)
add_compile_definitions(ENABLE_ARMOUR)
list(APPEND SUB_COMP "armour")
endif()
foreach(_comp ${SUB_COMP})
add_subdirectory(${_comp})
string(REPLACE "/" "_" sub ${_comp})

View File

@ -1,12 +0,0 @@
file(GLOB_RECURSE ARMOUR_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set(SERVER_FLATBUFFER_OUTPUT "${CMAKE_BINARY_DIR}/schema")
set(FBS_FILES
${CMAKE_CURRENT_SOURCE_DIR}/../../schema/cipher.fbs
${CMAKE_CURRENT_SOURCE_DIR}/../../schema/fl_job.fbs
)
set_property(SOURCE ${ARMOUR_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ARMOUR)
add_library(_mindspore_armour_obj OBJECT ${ARMOUR_FILES})
add_dependencies(_mindspore_armour_obj generated_fbs_files)
target_link_libraries(_mindspore_armour_obj mindspore::flatbuffers)

View File

@ -194,7 +194,7 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr>
}
AicpuTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::AicpuTaskInfo>(
kernel_name_, stream_id, node_so_, node_name_, node_def_str_, ext_info_, input_data_addrs, output_data_addrs,
unique_name_, stream_id, node_so_, node_name_, node_def_str_, ext_info_, input_data_addrs, output_data_addrs,
NeedDump());
MS_LOG(INFO) << "AicpuOpKernelMod GenTask end";

View File

@ -62,8 +62,10 @@ constexpr auto kMaskedSelect = "MaskedSelect";
constexpr auto kMaskedSelectGrad = "MaskedSelectGrad";
constexpr auto kDynamicStitch = "DynamicStitch";
constexpr auto kSearchSorted = "SearchSorted";
const std::set<std::string> kCustAiCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch,
kSearchSorted};
constexpr auto kResizeBilinear = "ResizeBilinear";
constexpr auto kResizeBilinearGrad = "ResizeBilinearGrad";
const std::set<std::string> kCustAiCpuKernelOps{kIdentity, kMaskedSelect, kMaskedSelectGrad, kDynamicStitch,
kSearchSorted, kResizeBilinear, kResizeBilinearGrad};
const std::set<std::string> kCacheKernelOps{kUpdateCache, kCacheSwapTable, kSubAndFilter,
kPadAndShift, kDropout3D, kDropout2D};

View File

@ -119,7 +119,7 @@ std::vector<TaskInfoPtr> AkgKernelMod::GenTask(const std::vector<AddressPtr> &in
MS_LOG(DEBUG) << "The block_dim is:" << block_dim;
TbeTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::TbeTaskInfo>(
kernel_name_, stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data,
unique_name_, stream_id, stub_func, block_dim, args, args_size, sm_desc, binary, binary_size, meta_data,
input_data_addrs, output_data_addrs, workspace_addrs, NeedDump());
return {task_info_ptr};
}

View File

@ -34,7 +34,8 @@ class AscendKernelMod : public KernelMod {
uint32_t stream_id() { return stream_id_; }
virtual bool NeedDump() {
const auto &dump_json = DumpJsonParser::GetInstance();
return dump_json.NeedDump(kernel_name_) && dump_json.async_dump_enabled() && dump_json.op_debug_mode() == 0;
return dump_json.NeedDump(fullname_) && dump_json.async_dump_enabled() && dump_json.op_debug_mode() == 0 &&
!is_monad_;
}
protected:

View File

@ -26,7 +26,7 @@
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "schema/fl_job_generated.h"
#include "ps/ps_context.h"
#include "ps/worker/fl_worker.h"
#include "fl/worker/fl_worker.h"
namespace mindspore {
namespace kernel {
@ -47,13 +47,13 @@ class FusedPullWeightKernel : public CPUKernel {
return false;
}
std::shared_ptr<ps::FBBuilder> fbb = std::make_shared<ps::FBBuilder>();
std::shared_ptr<fl::FBBuilder> fbb = std::make_shared<fl::FBBuilder>();
MS_EXCEPTION_IF_NULL(fbb);
total_iteration_++;
// The worker has to train kWorkerTrainStepNum standalone iterations before it communicates with the server.
if (total_iteration_ % ps::worker::FLWorker::GetInstance().worker_step_num_per_iteration() !=
ps::kTrainBeginStepNum) {
if (total_iteration_ % fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration() !=
fl::kTrainBeginStepNum) {
return true;
}
@ -63,6 +63,7 @@ class FusedPullWeightKernel : public CPUKernel {
fl_iteration_ = 1;
}
MS_LOG(INFO) << "Start pulling weight for federated learning iteration " << fl_iteration_;
if (!BuildPullWeightReq(fbb)) {
MS_LOG(EXCEPTION) << "Building request for FusedPullWeight failed.";
return false;
@ -72,10 +73,10 @@ class FusedPullWeightKernel : public CPUKernel {
const schema::ResponsePullWeight *pull_weight_rsp = nullptr;
int retcode = schema::ResponseCode_SucNotReady;
while (retcode == schema::ResponseCode_SucNotReady) {
if (!ps::worker::FLWorker::GetInstance().SendToServer(
if (!fl::worker::FLWorker::GetInstance().SendToServer(
0, fbb->GetBufferPointer(), fbb->GetSize(), ps::core::TcpUserCommand::kPullWeight, &pull_weight_rsp_msg)) {
MS_LOG(WARNING) << "Sending request for FusedPullWeight to server 0 failed. This iteration is dropped.";
ps::worker::FLWorker::GetInstance().SetIterationRunning();
fl::worker::FLWorker::GetInstance().SetIterationRunning();
return true;
}
MS_EXCEPTION_IF_NULL(pull_weight_rsp_msg);
@ -116,7 +117,7 @@ class FusedPullWeightKernel : public CPUKernel {
}
}
MS_LOG(INFO) << "Pull weights for " << weight_full_names_ << " succeed. Iteration: " << fl_iteration_;
ps::worker::FLWorker::GetInstance().SetIterationRunning();
fl::worker::FLWorker::GetInstance().SetIterationRunning();
return true;
}
@ -154,7 +155,7 @@ class FusedPullWeightKernel : public CPUKernel {
void InitSizeLists() { return; }
private:
bool BuildPullWeightReq(std::shared_ptr<ps::FBBuilder> fbb) {
bool BuildPullWeightReq(std::shared_ptr<fl::FBBuilder> fbb) {
MS_EXCEPTION_IF_NULL(fbb);
std::vector<flatbuffers::Offset<flatbuffers::String>> fbs_weight_names;
for (const std::string &weight_name : weight_full_names_) {

View File

@ -24,7 +24,7 @@
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "ps/ps_context.h"
#include "ps/worker/fl_worker.h"
#include "fl/worker/fl_worker.h"
namespace mindspore {
namespace kernel {
@ -45,13 +45,13 @@ class FusedPushWeightKernel : public CPUKernel {
return false;
}
std::shared_ptr<ps::FBBuilder> fbb = std::make_shared<ps::FBBuilder>();
std::shared_ptr<fl::FBBuilder> fbb = std::make_shared<fl::FBBuilder>();
MS_EXCEPTION_IF_NULL(fbb);
total_iteration_++;
// The worker has to train kWorkerTrainStepNum standalone iterations before it communicates with the server.
if (total_iteration_ % ps::worker::FLWorker::GetInstance().worker_step_num_per_iteration() !=
ps::kTrainBeginStepNum) {
if (total_iteration_ % fl::worker::FLWorker::GetInstance().worker_step_num_per_iteration() !=
fl::kTrainBeginStepNum) {
return true;
}
@ -61,23 +61,24 @@ class FusedPushWeightKernel : public CPUKernel {
fl_iteration_ = 1;
}
MS_LOG(INFO) << "Start pushing weight for federated learning iteration " << fl_iteration_;
if (!BuildPushWeightReq(fbb, inputs)) {
MS_LOG(EXCEPTION) << "Building request for FusedPushWeight failed.";
return false;
}
// The server number may change after scaling in/out.
for (uint32_t i = 0; i < ps::worker::FLWorker::GetInstance().server_num(); i++) {
for (uint32_t i = 0; i < fl::worker::FLWorker::GetInstance().server_num(); i++) {
std::shared_ptr<std::vector<unsigned char>> push_weight_rsp_msg = nullptr;
const schema::ResponsePushWeight *push_weight_rsp = nullptr;
int retcode = schema::ResponseCode_SucNotReady;
while (retcode == schema::ResponseCode_SucNotReady) {
if (!ps::worker::FLWorker::GetInstance().SendToServer(i, fbb->GetBufferPointer(), fbb->GetSize(),
if (!fl::worker::FLWorker::GetInstance().SendToServer(i, fbb->GetBufferPointer(), fbb->GetSize(),
ps::core::TcpUserCommand::kPushWeight,
&push_weight_rsp_msg)) {
MS_LOG(WARNING) << "Sending request for FusedPushWeight to server " << i
<< " failed. This iteration is dropped.";
ps::worker::FLWorker::GetInstance().SetIterationCompleted();
fl::worker::FLWorker::GetInstance().SetIterationCompleted();
return true;
}
MS_EXCEPTION_IF_NULL(push_weight_rsp_msg);
@ -105,7 +106,7 @@ class FusedPushWeightKernel : public CPUKernel {
}
MS_LOG(INFO) << "Push weights for " << weight_full_names_ << " succeed. Iteration: " << fl_iteration_;
ps::worker::FLWorker::GetInstance().SetIterationCompleted();
fl::worker::FLWorker::GetInstance().SetIterationCompleted();
return true;
}
@ -143,7 +144,7 @@ class FusedPushWeightKernel : public CPUKernel {
void InitSizeLists() { return; }
private:
bool BuildPushWeightReq(std::shared_ptr<ps::FBBuilder> fbb, const std::vector<AddressPtr> &weights) {
bool BuildPushWeightReq(std::shared_ptr<fl::FBBuilder> fbb, const std::vector<AddressPtr> &weights) {
std::vector<flatbuffers::Offset<schema::FeatureMap>> fbs_feature_maps;
for (size_t i = 0; i < weight_full_names_.size(); i++) {
const std::string &weight_name = weight_full_names_[i];

View File

@ -48,10 +48,30 @@ MS_REG_CPU_KERNEL_T(
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat32),
MaskedSelectCPUKernel, float);
MS_REG_CPU_KERNEL_T(
MaskedSelect,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat16),
MaskedSelectCPUKernel, float16);
MS_REG_CPU_KERNEL_T(
MaskedSelect,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat64),
MaskedSelectCPUKernel, double);
MS_REG_CPU_KERNEL_T(
MaskedSelect,
KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt32),
MaskedSelectCPUKernel, int);
MS_REG_CPU_KERNEL_T(
MaskedSelect,
KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt16),
MaskedSelectCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(
MaskedSelect,
KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt64),
MaskedSelectCPUKernel, int64_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_

View File

@ -46,6 +46,12 @@ bool MaskedSelectGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr>
auto mask = reinterpret_cast<bool *>(inputs[1]->addr);
auto grad = reinterpret_cast<T *>(inputs[2]->addr);
auto dx = reinterpret_cast<T *>(outputs[0]->addr);
auto ret = memset_s(dx, outputs[0]->size, 0, outputs[0]->size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Memset output[0] of kernel MaskedSelectGrad failed, ret: " << ret;
}
uint64_t output_size = outputs[0]->size / sizeof(T);
uint64_t j = 0;
if (input_shape_a_ == input_shape_b_) {
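The added memset_s is the substance of this fix: the gradient kernel scatters grad values only into positions where the mask is true, so without zero-initialization every unmasked element of dx would keep stale allocator memory. The write pattern in miniature, with plain C++ standing in for the kernel types:

    #include <vector>
    // Toy MaskedSelectGrad: only masked slots are written, so dx must start at zero.
    std::vector<float> MaskedSelectGradToy(const std::vector<bool> &mask,
                                           const std::vector<float> &grad) {
      std::vector<float> dx(mask.size(), 0.0f);  // plays the role of the new memset_s
      size_t j = 0;
      for (size_t i = 0; i < mask.size(); ++i) {
        if (mask[i]) dx[i] = grad[j++];
      }
      return dx;
    }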

View File

@ -51,6 +51,22 @@ MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
.AddOutputAttr(kNumberTypeFloat32),
MaskedSelectGradCPUKernel, float);
MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat64)
.AddInputAttr(kNumberTypeBool)
.AddInputAttr(kNumberTypeFloat64)
.AddOutputAttr(kNumberTypeFloat64),
MaskedSelectGradCPUKernel, double);
MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeBool)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
MaskedSelectGradCPUKernel, float16);
MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
KernelAttr()
.AddInputAttr(kNumberTypeInt32)
@ -58,6 +74,22 @@ MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeInt32),
MaskedSelectGradCPUKernel, int);
MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
KernelAttr()
.AddInputAttr(kNumberTypeInt64)
.AddInputAttr(kNumberTypeBool)
.AddInputAttr(kNumberTypeInt64)
.AddOutputAttr(kNumberTypeInt64),
MaskedSelectGradCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(MaskedSelectGrad,
KernelAttr()
.AddInputAttr(kNumberTypeInt16)
.AddInputAttr(kNumberTypeBool)
.AddInputAttr(kNumberTypeInt16)
.AddOutputAttr(kNumberTypeInt16),
MaskedSelectGradCPUKernel, int16_t);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_

View File

@ -72,7 +72,9 @@ if(ENABLE_CPU)
elseif("${X86_64_SIMD}" STREQUAL "sse")
target_compile_definitions(nnacl_mid PRIVATE ENABLE_SSE)
elseif("${X86_64_SIMD}" STREQUAL "avx")
target_compile_definitions(nnacl_mid PRIVATE ENABLE_SSE ENABLE_AVX ENABLE_AVX512)
target_compile_definitions(nnacl_mid PRIVATE ENABLE_SSE ENABLE_AVX)
elseif("${X86_64_SIMD}" STREQUAL "avx512")
target_compile_definitions(nnacl_mid PRIVATE ENABLE_AVX512)
target_compile_options(nnacl_mid PRIVATE -mavx512f)
endif()
target_compile_options(nnacl_mid PRIVATE -fPIC)

View File

@ -27,32 +27,34 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
#else
const int tile_n = 12;
#endif
int out_channel = conv_param->output_channel_;
int output_count = conv_param->output_h_ * conv_param->output_w_;
int output_tile_count = UP_DIV(output_count, tile_n);
int output_hw = conv_param->output_h_ * conv_param->output_w_;
int block_per_thread = UP_DIV(UP_DIV(output_hw, tile_n), conv_param->thread_num_);
int start_block = block_per_thread * task_id;
int start_hw = start_block * tile_n;
int end_hw = MSMIN(output_hw, (start_block + block_per_thread) * tile_n);
if (start_hw >= end_hw) {
return;
}
int out_stride = conv_param->output_channel_ * tile_n;
int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
packed_input += task_id * deep * tile_n;
col_major_input += task_id * deep * tile_n;
size_t input_size = deep * tile_n * sizeof(float16_t);
for (int b = 0; b < conv_param->input_batch_; b++) {
int in_batch_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
int out_batch_offset = b * out_channel * output_count;
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += conv_param->thread_num_) {
int start_index = thread_id * tile_n;
int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
float16_t *gemm_input = packed_input + task_id * deep * tile_n;
float16_t *col_major_gemm_input = col_major_input + task_id * deep * tile_n;
size_t packed_input_size = deep * tile_n * sizeof(float16_t);
memset(gemm_input, 0, packed_input_size);
memset(col_major_gemm_input, 0, packed_input_size);
Im2ColPackUnitFp16(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
int in_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
int out_offset = b * conv_param->output_channel_ * output_hw + start_hw * conv_param->output_channel_;
for (int i = start_hw; i < end_hw; i += tile_n, out_offset += out_stride) {
int real_cal_row = MSMIN(output_hw - i, tile_n);
memset(packed_input, 0, input_size);
Im2ColPackUnitFp16(input_data + in_offset, conv_param, packed_input, real_cal_row, i);
#ifdef ENABLE_ARM64
RowMajor2Col16MajorFp16Opt(gemm_input, col_major_gemm_input, tile_n, deep);
RowMajor2Col16MajorFp16Opt(packed_input, col_major_input, tile_n, deep);
#else
RowMajor2Col12MajorFp16Opt(gemm_input, col_major_gemm_input, tile_n, deep);
RowMajor2Col12MajorFp16Opt(packed_input, col_major_input, tile_n, deep);
#endif
MatMulFp16(col_major_gemm_input, packed_weight, output_data + out_offset, bias_data, conv_param->act_type_, deep,
real_cal_num, out_channel, out_channel, OutType_Nhwc);
MatMulFp16(col_major_input, packed_weight, output_data + out_offset, bias_data, conv_param->act_type_, deep,
real_cal_row, conv_param->output_channel_, conv_param->output_channel_, OutType_Nhwc);
}
}
}
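The restructuring here swaps the old strided schedule (thread t takes tiles t, t + thread_num, ...) for one contiguous block of tiles per thread, with an early return for threads left without work; the ConvFp32 hunk further down adopts the identical scheme. The partition arithmetic in isolation, with a worked case in the comments:

    // Block-per-thread split used by ConvFp16/ConvFp32 after this change.
    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))
    // Example: output_hw = 100, tile_n = 12, thread_num = 4
    //   tiles            = UP_DIV(100, 12) = 9
    //   block_per_thread = UP_DIV(9, 4)    = 3
    //   thread 0 -> hw [0, 36), 1 -> [36, 72), 2 -> [72, 100), 3 -> start 108 >= 100, returns early
    int StartHw(int output_hw, int tile_n, int thread_num, int task_id) {
      int block_per_thread = UP_DIV(UP_DIV(output_hw, tile_n), thread_num);
      return block_per_thread * task_id * tile_n;  // callers clamp end_hw with MSMIN(output_hw, ...)
    }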

View File

@ -0,0 +1,24 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp16/fill_fp16.h"
int FillFp16(float16_t *output, int size, float16_t data) {
for (int i = 0; i < size; ++i) {
output[i] = data;
}
return NNACL_OK;
}

View File

@ -0,0 +1,34 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_NNACL_FILL_BASE_H_
#define MINDSPORE_NNACL_FILL_BASE_H_
#include "nnacl/op_base.h"
#include "nnacl/errorcode.h"
#include "nnacl/fill_parameter.h"
#ifdef ENABLE_ARM
#include <arm_neon.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
int FillFp16(float16_t *output, int size, float16_t data);
#ifdef __cplusplus
}
#endif
#endif // MINDSPORE_NNACL_FILL_BASE_H_

View File

@ -454,7 +454,7 @@ void VecMatmulFp16(const float16_t *a, const float16_t *b, float16_t *c, const f
vst1q_f16(c, acc_0);
if (ci + C16NUM > col) {
int c_remain = col - ci;
int c_remain = col - ci - C8NUM;
for (int i = 0; i < c_remain; ++i) {
if (act_type == ActType_Relu) {
c[C8NUM + i] = MSMAX(acc_1[i], (float16_t)0.0);
@ -792,15 +792,19 @@ void RowMajor2Row12MajorFp16(const void *src, float16_t *dst, int row, int col,
}
void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
int down_c8 = col / C8NUM;
int stride = C8NUM * row;
for (int r = 0; r < row; r++) {
for (int c = 0; c < col; c++) {
int c = 0;
for (; c < down_c8; c++) {
MS_FLOAT16X8 src_data = MS_LDQ_F16((const float16_t *)src + r * col + c * C8NUM);
MS_STQ_F16(dst + c * stride + r * C8NUM, src_data);
}
c *= C8NUM;
for (; c < col; c++) {
int c_div8 = c / 8;
int c_mod8 = c % 8;
if (is_fp32_src) {
dst[c_div8 * 8 * row + r * 8 + c_mod8] = (float16_t)(((const float *)src)[r * col + c]);
} else {
dst[c_div8 * 8 * row + r * 8 + c_mod8] = ((const float16_t *)src)[r * col + c];
}
dst[c_div8 * stride + r * 8 + c_mod8] = ((const float16_t *)src)[r * col + c];
}
}
}

View File

@ -20,6 +20,15 @@
// sliding window to compute 1x1 conv on x86
void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
int ohw = output_h * output_w;
int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
int ohw_start = ohw_step * task_id;
int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
if (ohw_start >= ohw_end) {
return;
}
int oc_tile_ = C8NUM; // oc aligned to C8NUM in x86_64_avx
int act_type = 0;
if (conv_param->act_type_ == ActType_Relu6) {
@ -28,8 +37,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
if (conv_param->act_type_ == ActType_Relu || conv_param->act_type_ == ActType_Relu6) {
act_type += 2;
}
int output_w = conv_param->output_w_;
int output_h = conv_param->output_h_;
int pad_d = conv_param->pad_d_;
int pad_l = conv_param->pad_l_;
int pad_r = conv_param->pad_r_;
@ -43,10 +50,6 @@ void Conv1x1SWFp32(const float *input_data, const float *packed_weight, const fl
int oc_num = sw_param->c_block_;
int in_step = sw_param->in_step_;
int out_step = sw_param->out_step_;
int ohw = output_h * output_w;
int ohw_step = UP_DIV(ohw, conv_param->thread_num_);
int ohw_start = ohw_step * task_id;
int ohw_end = MSMIN(ohw_start + ohw_step, ohw);
const int ow_block_num[4] = {12, 6, 4, 3};
const Conv1x1SWKernel kernel[4][2] = {{Conv1x1SW1x8Kernel, Conv1x1SW12x8Kernel},
{Conv1x1SW1x16Kernel, Conv1x1SW6x16Kernel},

View File

@ -24,12 +24,10 @@
// fp32 conv common
void ConvFp32(const float *input_data, float *packed_input, const float *packed_weight, const float *bias_data,
float *col_major_input, float *output_data, int task_id, const ConvParameter *conv_param) {
int out_channel = conv_param->output_channel_;
int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
int output_count = conv_param->output_h_ * conv_param->output_w_;
if (conv_param->thread_num_ == 0) {
return;
}
int output_hw = conv_param->output_h_ * conv_param->output_w_;
Row2ColMajorFuncPtr Row2ColMajor = NULL;
#ifdef ENABLE_AVX
const int cal_num = C6NUM;
@ -40,11 +38,11 @@ void ConvFp32(const float *input_data, float *packed_input, const float *packed_
#elif defined(ENABLE_ARM64)
int cal_num = 0;
MatmulFloatOptFuncPtr MatmulFloatOpt = NULL;
if (output_count <= C4NUM) {
if (output_hw <= C4NUM) {
cal_num = C4NUM;
Row2ColMajor = RowMajor2Col4Major;
MatmulFloatOpt = MatmulFloatNeon64OptRow4;
} else if (output_count <= C8NUM) {
} else if (output_hw <= C8NUM) {
cal_num = C8NUM;
Row2ColMajor = RowMajor2Col8Major;
MatmulFloatOpt = MatmulFloatNeon64OptRow8;
@ -60,44 +58,46 @@ void ConvFp32(const float *input_data, float *packed_input, const float *packed_
const int cal_num = C12NUM;
Row2ColMajor = RowMajor2Col12Major;
#endif
int output_tile_count = UP_DIV(output_count, cal_num);
int block_per_thread = UP_DIV(UP_DIV(output_hw, cal_num), conv_param->thread_num_);
int start_block = block_per_thread * task_id;
int start_hw = start_block * cal_num;
int end_hw = MSMIN(output_hw, (start_block + block_per_thread) * cal_num);
if (start_hw >= end_hw) {
return;
}
int out_stride = conv_param->output_channel_ * cal_num;
int deep = conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
packed_input += task_id * deep * cal_num;
col_major_input += task_id * deep * cal_num;
size_t input_size = deep * cal_num * sizeof(float);
for (int b = 0; b < conv_param->input_batch_; b++) {
int in_batch_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
int out_batch_offset = b * out_channel * output_count;
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += conv_param->thread_num_) {
int start_index = thread_id * cal_num;
int real_cal_num = (output_count - start_index) < cal_num ? (output_count - start_index) : cal_num;
if (real_cal_num <= 0) {
return;
}
float *gemm_input = packed_input + task_id * deep * cal_num;
float *col_major_gemm_input = col_major_input + task_id * deep * cal_num;
size_t packed_input_size = deep * cal_num * sizeof(float);
memset(gemm_input, 0, packed_input_size);
memset(col_major_gemm_input, 0, packed_input_size);
Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
int out_offset = thread_id * cal_num * out_channel + out_batch_offset;
int out_channel = conv_param->output_channel_;
int in_offset = b * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
int out_offset = b * out_channel * output_hw + start_hw * out_channel;
for (int i = start_hw; i < end_hw; i += cal_num, out_offset += out_stride) {
int real_cal_row = MSMIN(output_hw - i, cal_num);
memset(packed_input, 0, input_size);
Im2ColPackUnitFp32(input_data + in_offset, conv_param, packed_input, real_cal_row, i);
Row2ColMajor(packed_input, col_major_input, cal_num, deep);
float *gemm_output = output_data + out_offset;
Row2ColMajor(gemm_input, col_major_gemm_input, cal_num, deep);
// x86 func param types are different
#if ENABLE_AVX
MatmulFloatAvxOpt(col_major_gemm_input, packed_weight, gemm_output, bias_data, (size_t)conv_param->act_type_,
deep, real_cal_num, out_channel, (size_t)out_channel, (size_t)OutType_Nhwc);
MatmulFloatAvxOpt(col_major_input, packed_weight, gemm_output, bias_data, (size_t)conv_param->act_type_, deep,
real_cal_row, out_channel, (size_t)out_channel, (size_t)OutType_Nhwc);
#elif ENABLE_SSE
MatmulFloatSse64Opt(col_major_gemm_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_, deep,
real_cal_num, out_channel, (size_t)out_channel, (int)OutType_Nhwc);
MatmulFloatSse64Opt(col_major_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_, deep,
real_cal_row, out_channel, (size_t)out_channel, (int)OutType_Nhwc);
#elif ENABLE_ARM32
MatmulFloatNeon32Opt12x4(col_major_gemm_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_,
deep, real_cal_num, out_channel, out_channel, OutType_Nhwc);
MatmulFloatNeon32Opt12x4(col_major_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_, deep,
real_cal_row, out_channel, out_channel, OutType_Nhwc);
#elif ENABLE_ARM64
MatmulFloatOpt(col_major_gemm_input, packed_weight, gemm_output, bias_data, conv_param->act_type_, deep,
real_cal_num, out_channel, out_channel, OutType_Nhwc);
MatmulFloatOpt(col_major_input, packed_weight, gemm_output, bias_data, conv_param->act_type_, deep, real_cal_row,
out_channel, out_channel, OutType_Nhwc);
#else
MatMul12x8(col_major_gemm_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_, deep,
real_cal_num, out_channel, out_channel, OutType_Nhwc);
MatMul12x8(col_major_input, packed_weight, gemm_output, bias_data, (int)conv_param->act_type_, deep, real_cal_row,
out_channel, out_channel, OutType_Nhwc);
#endif
}
}
@ -134,6 +134,13 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
// fp32 sliding window common conv
void ConvSWFp32(const float *input_data, const float *packed_weight, const float *bias_data, float *output_data,
int task_id, ConvParameter *conv_param, SlidingWindowParam *sw_param) {
int out_h = conv_param->output_h_;
int oh_step = UP_DIV(out_h, conv_param->thread_num_);
int oh_start = oh_step * task_id;
int oh_end = MSMIN(oh_start + oh_step, out_h);
if (oh_start >= oh_end) {
return;
}
int oc_tile_ = C8NUM; // oc aligned to C8NUM in x86_64_avx
int act_type = 0;
if (conv_param->act_type_ == ActType_Relu6) {
@ -149,56 +156,75 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
int in_sw_step = sw_param->in_sw_step_;
int in_kw_step = sw_param->in_kw_step_;
int in_kh_step = sw_param->in_kh_step_;
int in_sh_step = sw_param->in_sh_step_;
int out_h_step = sw_param->out_h_step_;
int kernel_step = sw_param->kernel_step_;
int in_step = sw_param->in_step_;
int out_step = sw_param->out_step_;
int c_block = sw_param->c_block_;
int top = sw_param->top_;
int left = sw_param->left_;
int right = sw_param->right_;
int bottom = sw_param->bottom_;
int block_channel = sw_param->block_channel_;
int stride_h = conv_param->stride_h_;
int stride_w = conv_param->stride_w_;
int out_w = conv_param->output_w_;
int pad_u = conv_param->pad_u_;
int pad_l = conv_param->pad_l_;
int in_h_step = sw_param->in_h_step_;
int out_batch = conv_param->output_batch_;
int in_h_start = top * stride_h - pad_u;
int in_w_start = left * stride_w - pad_l;
int center_step = in_h_start * in_h_step + in_w_start * ic_algin;
const int ow_block_num[4] = {12, 6, 4, 3};
const SWConvKernel kernel[4][2] = {{SWConv1x8Kernel, SWConv12x8Kernel},
{SWConv1x16Kernel, SWConv6x16Kernel},
{SWConv1x24Kernel, SWConv4x24Kernel},
{SWConv1x32Kernel, SWConv3x32Kernel}};
for (int b = 0; b < conv_param->output_batch_; b++) {
for (int oh = task_id; oh < conv_param->output_h_; oh += conv_param->thread_num_) {
float *dst_oh = output_data + oh * sw_param->out_h_step_;
int in_h_start = sw_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
int in_w_start = sw_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
const float *src_h = input_data + in_h_start * sw_param->in_h_step_ + in_w_start * sw_param->ic_align_;
for (int b = 0; b < out_batch; b++) {
for (int oh = oh_start; oh < oh_end; oh += 1) {
float *dst_oh = output_data + oh * out_h_step;
const float *src_h = input_data + center_step;
int oc_block = 0;
const float *bias = bias_data;
for (int oc = 0; oc < sw_param->c_block_; oc += oc_block) {
oc_block = MSMIN(C4NUM, sw_param->c_block_ - oc); // 4 3 2 1
const float *weight = packed_weight + oc * sw_param->kernel_step_;
for (int oc = 0; oc < c_block; oc += oc_block) {
oc_block = MSMIN(C4NUM, c_block - oc); // 4 3 2 1
const float *weight = packed_weight + oc * kernel_step;
if (bias != NULL) {
bias = bias_data + oc * oc_tile_;
}
float *dst_w = dst_oh + oc * oc_tile_;
const SWConvKernel kernel_border = kernel[oc_block - 1][0];
if (oh < sw_param->top_ || oh >= sw_param->bottom_) { // oh in up or down border
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, conv_param->output_w_, conv_param, sw_param,
kernel_border, act_type, 1, oc_block);
if (oh < top || oh >= bottom) { // oh in up or down border
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, out_w, conv_param, sw_param, kernel_border, act_type,
1, oc_block);
} else { // oh in center
// ow in right
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, sw_param->left_, conv_param, sw_param, kernel_border,
act_type, 1, oc_block);
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, 0, left, conv_param, sw_param, kernel_border, act_type,
1, oc_block);
// ow in center
const float *src_w = src_h + (oh - sw_param->top_) * sw_param->in_sh_step_;
int ow_block = ow_block_num[oc_block - 1]; // 12 6 4 3
for (int ow = sw_param->left_; ow < sw_param->right_; ow += ow_block) { // left ~ right
ow_block = MSMIN(ow_block, sw_param->right_ - ow);
const float *src_w = src_h + (oh - top) * in_sh_step;
int ow_block = ow_block_num[oc_block - 1]; // 12 6 4 3
for (int ow = left; ow < right; ow += ow_block) { // left ~ right
ow_block = MSMIN(ow_block, right - ow);
if (ow_block < ow_block_num[oc_block - 1]) { // remaining ow is less than a full block; process one ow at a time
ow_block = 1;
}
kernel[oc_block - 1][ow_block / ow_block_num[oc_block - 1]](
dst_w + ow * sw_param->block_channel_, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block,
oc_block, oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
dst_w + ow * block_channel, src_w, weight, bias, kernel_h, kernel_w, act_type, ow_block, oc_block,
oc_algin, ic_algin, in_kw_step, in_kh_step, in_sw_step, 0);
src_w += ow_block * in_sw_step;
}
// ow in left
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, sw_param->right_, conv_param->output_w_, conv_param,
sw_param, kernel_border, act_type, 1, oc_block);
SWBorder(dst_w, input_data, weight, bias, oh, oh + 1, right, out_w, conv_param, sw_param, kernel_border,
act_type, 1, oc_block);
}
}
} // output h loop
input_data += sw_param->in_step_;
output_data += sw_param->out_step_;
input_data += in_step;
output_data += out_step;
} // batch loop
}

View File

@ -1040,6 +1040,12 @@ void DepthwiseBorderAvxFp32(float *dst, const float *src, const float *weight, c
void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
const ConvParameter *conv_param, const SlidingWindowParam *sw_param, int task_id) {
int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
int oh_start = oh_step * task_id;
int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
if (oh_start >= oh_end) {
return;
}
// depthwise sw in x86 avx instructions
int oc_tile_ = C8NUM; // oc aligned to C8NUM in x86_64_avx
int act_type = 0;
@ -1064,6 +1070,8 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
int out_left = sw_param->left_;
int out_top = sw_param->top_;
int out_bottom = sw_param->bottom_;
int kernel_step = sw_param->kernel_step_;
int out_h_step = sw_param->out_h_step_;
int in_h_start = out_top * conv_param->stride_h_ - conv_param->pad_u_;
int in_w_start = out_left * conv_param->stride_w_ - conv_param->pad_l_;
int in_start = in_h_start * sw_param->in_h_step_ + in_w_start * oc_algin;
@ -1072,19 +1080,16 @@ void DepthwiseSWAvxFp32(float *output_data, const float *input_data, const float
{DepthwiseSW1x16Kernel, DepthwiseSW4x16Kernel},
{DepthwiseSW1x24Kernel, DepthwiseSW4x24Kernel},
{DepthwiseSW1x32Kernel, DepthwiseSW3x32Kernel}};
int oh_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
int oh_start = oh_step * task_id;
int oh_end = MSMIN(oh_start + oh_step, conv_param->output_h_);
for (int b = 0; b < conv_param->output_batch_; b++) {
for (int oh = oh_start; oh < oh_end; ++oh) {
float *dst_oh = output_data + oh * sw_param->out_h_step_;
float *dst_oh = output_data + oh * out_h_step;
const float *src_h = input_data + in_start + (oh - out_top) * in_sh_step;
int oc_block = 0;
const float *bias = bias_data;
for (int oc = 0; oc < oc_num; oc += oc_block) {
oc_block = MSMIN(C4NUM, oc_num - oc); // 4 3 2 1
int oc_step = oc * oc_tile_;
const float *weight = weight_data + oc * sw_param->kernel_step_;
const float *weight = weight_data + oc * kernel_step;
if (bias != NULL) {
bias = bias_data + oc_step;
}

View File

@ -955,7 +955,7 @@ void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *
}
vst1q_f32(c, acc_0);
if (ci + C8NUM - 1 >= col) {
int c_remain = col - ci;
int c_remain = col - ci - C4NUM;
for (int i = 0; i < c_remain; ++i) {
if (act_type == ActType_Relu) {
c[C4NUM + i] = MSMAX(acc_1[i], 0.0f);
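This mirrors the c_remain fix in the fp16 matmul earlier in this diff (col - ci - C8NUM there): the preceding vst1q already stored the first C4NUM (respectively C8NUM) lanes of the tail, so the scalar cleanup must exclude them or it writes past the buffer. Checked for a concrete case:

    // col = 7, ci = 0: vst1q_f32 has already stored c[0..3] (C4NUM lanes) from acc_0.
    constexpr int col = 7, ci = 0, C4NUM = 4;
    static_assert(col - ci == 7, "old count: scalar tail would write c[4..10], 4 past the end");
    static_assert(col - ci - C4NUM == 3, "fixed count: scalar tail writes c[4..6], the true remainder");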

View File

@ -52,7 +52,7 @@ int ReduceOnSelectedAxes(const TensorC *input, size_t num_axes, const int *actua
int ReduceInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
OpParameter *parameter) {
int check_ret = CheckAugmentNullSize(inputs, inputs_size, outputs, outputs_size, parameter, 2, 1);
int check_ret = CheckAugmentNullSizeInputTwo(inputs, inputs_size, outputs, outputs_size, parameter, 1, 2, 1);
if (check_ret != NNACL_OK) {
return check_ret;
}
@ -70,11 +70,11 @@ int ReduceInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC *
bool keep_dims = param->keep_dims_;
int out_shape[MAX_SHAPE_SIZE] = {0};
const size_t out_shape_size = 0;
// get axes from input tensor
const TensorC *axes_input = inputs[1];
if (axes_input->shape_size_ == 1 && axes_input->shape_[0] == 0) {
if (inputs_size == 1 || (inputs_size == 2 && inputs[1]->shape_size_ == 1 && inputs[1]->shape_[0] == 0)) {
return ReduceOnAllAxes(input, output, out_shape, out_shape_size, keep_dims);
}
// get axes from input tensor
const TensorC *axes_input = inputs[1];
int *axes = (int *)axes_input->data_;
if (axes == NULL) {
return NNACL_NULL_PTR;
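Two behaviors change together here: the size check is relaxed so the axes tensor may be absent (one or two inputs, via CheckAugmentNullSizeInputTwo with min 1 and max 2), and inputs[1] is no longer touched until both the missing-axes and empty-axes (shape [0]) cases have been routed to ReduceOnAllAxes. For example, a ReduceSum over a [2, 3] tensor with no axes input now reduces every axis, yielding [] (or [1, 1] with keep_dims), instead of reading a nonexistent second input.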

View File

@ -34,7 +34,7 @@ static bool CheckInputsDataType(const TensorC *const *inputs, size_t inputs_size
int SliceInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
OpParameter *parameter) {
int ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 1, 1);
int ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 3, 1);
if (ret != NNACL_OK) {
return ret;
}

View File

@ -52,7 +52,7 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
reduce_type_ = kReduceAny;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; };
} else {
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name_ << " for bool.";
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool.";
}
} else {
if (kernel_name == "ReduceMax") {

View File

@ -24,7 +24,9 @@ template <typename T>
void SplitCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
axis_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis");
output_num_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "output_num");
input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
(void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(input_shape_),
[](const int &value) { return static_cast<int>(value); });
CheckParam(kernel_node);
}
@ -44,8 +46,6 @@ bool SplitCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
template <typename T>
void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t size) {
(void)std::transform(input_shape_.begin(), input_shape_.end(), std::back_inserter(input_shape_int_),
[](const int &value) { return static_cast<int>(value); });
SplitParameter param;
param.num_split_ = output_num_;
param.split_dim_ = axis_;
@ -64,7 +64,7 @@ void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t size) {
param.split_count_ *= input_shape_[i];
}
auto task = [&](size_t start, size_t end) {
DoSplit(input, reinterpret_cast<void **>(output), &input_shape_int_[0], start, end - start, &param, sizeof(T));
(void)DoSplit(input, reinterpret_cast<void **>(output), &input_shape_[0], start, end - start, &param, sizeof(T));
};
CPUKernelUtils::ParallelFor(task, param.split_count_ * param.num_split_);
return;

View File

@ -52,8 +52,7 @@ class SplitCPUKernel : public CPUKernel {
size_t dims_current_after_axis_;
std::vector<std::vector<size_t>> output_shape_list_;
std::vector<size_t> input_shape_;
std::vector<int> input_shape_int_;
std::vector<int> input_shape_;
TypeId dtype_{kTypeUnknown};
};

View File

@ -22,6 +22,7 @@
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/cast_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
@ -47,12 +48,29 @@ class InTopKGpuKernel : public GpuKernel {
T *top_k_output_device = GetDeviceAddress<T>(workspace, 0);
int32_t *top_k_indices_device = GetDeviceAddress<int32_t>(workspace, 1);
// topk sorts the input along the last dimension
FastTopK(outer_size_, inner_size_, predictions_device, static_cast<int32_t>(k_), top_k_output_device,
top_k_indices_device, top_k_init_, reinterpret_cast<cudaStream_t>(stream_ptr));
if (std::is_same<T, half>::value) {
// remove later! urgent fix for bug: topk has incorrect output for float16
float top_k_init = std::numeric_limits<float>::lowest();
CalInTopK(predictions_device, targets_device, output_device, top_k_output_device, input_shape_[0], k_,
reinterpret_cast<cudaStream_t>(stream_ptr));
// cast to float32
float *casted_float32_input = GetDeviceAddress<float>(workspace, 2);
float *top_k_output_device_float32 = GetDeviceAddress<float>(workspace, 3);
Cast(input_size_, predictions_device, casted_float32_input, reinterpret_cast<cudaStream_t>(stream_ptr));
FastTopK(outer_size_, inner_size_, casted_float32_input, static_cast<int32_t>(k_), top_k_output_device_float32,
top_k_indices_device, top_k_init, reinterpret_cast<cudaStream_t>(stream_ptr));
CalInTopK(casted_float32_input, targets_device, output_device, top_k_output_device_float32, input_shape_[0],
input_shape_[1], k_, reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
// topk sorts the input along the last dimension
FastTopK(outer_size_, inner_size_, predictions_device, static_cast<int32_t>(k_), top_k_output_device,
top_k_indices_device, top_k_init_, reinterpret_cast<cudaStream_t>(stream_ptr));
CalInTopK(predictions_device, targets_device, output_device, top_k_output_device, input_shape_[0],
input_shape_[1], k_, reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
@ -114,6 +132,12 @@ class InTopKGpuKernel : public GpuKernel {
output_size_list_.push_back(input_shape_[0] * sizeof(bool));
workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(T));
workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(int32_t));
// remove later! urgent fix for bug: topk has incorrect output for float16
if (std::is_same<T, half>::value) {
workspace_size_list_.push_back(input_size_ * sizeof(float));
workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(float));
}
}
private:
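The fp16 branch is the same cast-compute workaround used in the TopK kernel below: predictions are copied to fp32 in workspace, FastTopK and CalInTopK run on the fp32 copies, and the two extra workspace buffers registered here (input_size_ floats for the cast input, input_shape_[0] * k_ floats for the top-k values) back exactly those temporaries. The repeated "remove later" comments mark this as a stopgap until the underlying fp16 TopK bug is fixed.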

View File

@ -21,6 +21,7 @@
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/cast_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
namespace mindspore {
@ -42,20 +43,38 @@ class TopKGpuKernel : public GpuKernel {
T *output_addr = GetDeviceAddress<T>(outputs, 0);
S *indices = GetDeviceAddress<S>(outputs, 1);
T init_k = std::numeric_limits<T>::lowest();
if (std::is_same<T, half>::value) {
// min value representable by float16, std::numeric_limits doesn't support half
init_k = static_cast<half>(-65504.);
}
S k_cut = 0;
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(&k_cut, k, sizeof(S), cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync k_cut failed");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed - TopK");
FastTopK(outer_size_, inner_size_, input_addr, k_cut, output_addr, indices, init_k,
reinterpret_cast<cudaStream_t>(stream_ptr));
if (std::is_same<T, half>::value) {
// remove later! urgent fix for bug: topk has incorrect output for float16
float init_k = std::numeric_limits<float>::lowest();
// cast to float32
float *casted_float32_input = GetDeviceAddress<float>(workspaces, 0);
float *casted_float32_top_k_output = GetDeviceAddress<float>(workspaces, 1);
Cast(outer_size_ * inner_size_, input_addr, casted_float32_input, reinterpret_cast<cudaStream_t>(stream_ptr));
// call FastTopK with workspace[n], workspace[n+1] as input, output
FastTopK(outer_size_, inner_size_, casted_float32_input, k_cut, casted_float32_top_k_output, indices, init_k,
reinterpret_cast<cudaStream_t>(stream_ptr));
// cast workspace[n+1] back to float16
Cast(outer_size_ * k_, casted_float32_top_k_output, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
} else {
T init_k = std::numeric_limits<T>::lowest();
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_,
cudaMemcpyAsync(&k_cut, k, sizeof(S), cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr)),
"cudaMemcpyAsync k_cut failed");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed - TopK");
FastTopK(outer_size_, inner_size_, input_addr, k_cut, output_addr, indices, init_k,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
@ -82,6 +101,12 @@ class TopKGpuKernel : public GpuKernel {
input_size_list_.push_back(sizeof(S));
output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
// remove later! urgent fix for bug: topk has incorrect output for float16
if (std::is_same<T, half>::value) {
workspace_size_list_.push_back(outer_size_ * inner_size_ * sizeof(float));
workspace_size_list_.push_back(outer_size_ * k_ * sizeof(float));
}
}
private:

View File

@ -26,7 +26,7 @@ __global__ void Copy(T *input, T *output, size_t size) {
}
template <typename T>
__global__ void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
__global__ void LeftMoveProd(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
@ -38,7 +38,7 @@ __global__ void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, si
for (size_t j = 0; j < dim1; ++j) {
size_t read_index = j * stride2 + offset;
if (j == 0) {
output[read_index] = 0;
output[read_index] = 1;
} else {
size_t read_index2 = (j - 1) * stride2 + offset;
output[read_index] = input[read_index2];
@ -48,7 +48,7 @@ __global__ void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, si
}
template <typename T>
__global__ void RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
__global__ void RightMoveProd(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
@ -60,7 +60,7 @@ __global__ void RightMove(const T *input, T *output, size_t dim0, size_t dim1, s
for (int j = dim1 - 1; j >= 0; --j) {
size_t read_index = j * stride2 + offset;
if (j == dim1 - 1) {
output[read_index] = 0;
output[read_index] = 1;
} else {
size_t read_index2 = (j + 1) * stride2 + offset;
output[read_index] = input[read_index2];
@ -117,12 +117,12 @@ void CumProd(const T *input, T *output, T *workspace, size_t dim0, size_t dim1,
int size = dim0 * dim2;
if (exclusive_) {
if (reverse_) {
RightMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
RightMoveProd<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1);
CumProdKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride,
stride2);
} else {
LeftMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
LeftMoveProd<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1);
CumProdKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride, stride2);
}
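The two one-character changes above fix the seed of the exclusive scan: an exclusive cumulative product must start from the multiplicative identity 1; seeding with 0 (the additive identity, correct only for CumSum) would zero every element downstream. A quick CPU check of the corrected recurrence:

#include <cstdio>
#include <vector>

// Exclusive cumprod: out[0] = 1, out[j] = in[0] * ... * in[j-1].
int main() {
  std::vector<int> in = {2, 3, 4}, out(in.size());
  int acc = 1;  // seeding with 0 here would make every output 0
  for (size_t j = 0; j < in.size(); ++j) { out[j] = acc; acc *= in[j]; }
  for (int v : out) std::printf("%d ", v);  // prints: 1 2 6
  return 0;
}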

View File

@ -26,7 +26,7 @@ __global__ void Copy(T *input, T *output, size_t size) {
}
template <typename T>
__global__ void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
__global__ void LeftMoveSum(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
@ -48,7 +48,7 @@ __global__ void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, si
}
template <typename T>
__global__ void RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
__global__ void RightMoveSum(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2) {
size_t num = dim0 * dim2;
size_t i, k, offset;
@ -117,12 +117,12 @@ void CumSum(const T *input, T *output, T *workspace, size_t dim0, size_t dim1, s
int size = dim0 * dim2;
if (exclusive_) {
if (reverse_) {
RightMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
RightMoveSum<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1);
CumSumKernelReverse<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride,
stride2);
} else {
LeftMove<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
LeftMoveSum<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, output, dim0, dim1, dim2, stride, stride2);
Copy<<<GET_BLOCKS(size * dim1), GET_THREADS, 0, stream>>>(workspace, output, size * dim1);
CumSumKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(workspace, output, dim0, dim1, dim2, stride, stride2);
}
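For contrast, the exclusive cumulative sum correctly keeps 0 as its seed, since 0 is the additive identity (exclusive cumsum of {2, 3, 4} is {0, 2, 5}); this file only renames LeftMove/RightMove to LeftMoveSum/RightMoveSum so the symbols no longer collide with their CumProd counterparts.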

View File

@ -21,26 +21,27 @@
template <typename T>
__global__ void InTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output,
size_t class_id_count, int64_t k) {
size_t batch_size, size_t class_id_count, int64_t k) {
size_t gt_id = blockIdx.x * blockDim.x + threadIdx.x;
for (; gt_id < class_id_count; gt_id += blockDim.x * gridDim.x) {
for (; gt_id < batch_size; gt_id += blockDim.x * gridDim.x) {
int32_t target_index = targets[gt_id];
T predicted_value = predictions[gt_id * class_id_count + target_index];
T top_k_smallest_value = top_k_output[k - 1];
T top_k_smallest_value = top_k_output[gt_id * k + k - 1];
output[gt_id] = predicted_value >= top_k_smallest_value;
}
}
template <typename T>
void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t class_id_count,
int64_t k, cudaStream_t cuda_stream) {
void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size,
size_t class_id_count, int64_t k, cudaStream_t cuda_stream) {
InTopK<<<GET_BLOCKS(class_id_count), GET_THREADS, 0, cuda_stream>>>(predictions, targets, output, top_k_output,
class_id_count, k);
batch_size, class_id_count, k);
}
template void CalInTopK<half>(const half *predictions, const int32_t *targets, bool *output, const half *top_k_output,
size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
size_t batch_size, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
template void CalInTopK<float>(const float *predictions, const int32_t *targets, bool *output,
const float *top_k_output, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
const float *top_k_output, size_t batch_size, size_t class_id_count, int64_t k,
cudaStream_t cuda_stream);
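Besides threading batch_size through so the grid-stride loop covers every sample, the key indexing fix is top_k_output[gt_id * k + k - 1]: each sample now compares its target's score against the k-th largest value of its own row, rather than every sample reading row 0. A small CPU model of the corrected check:

#include <cstdio>

// CPU model of the fixed InTopK indexing: sample b reads the k-th largest
// value of its own row, top_k[b * k + k - 1].
int main() {
  const int batch = 2, k = 2;
  // Per-row top-k values in descending order: row 0 = {9, 7}, row 1 = {5, 4}.
  float top_k[batch * k] = {9.f, 7.f, 5.f, 4.f};
  float predicted[batch] = {8.f, 3.f};  // score at each sample's target class
  for (int b = 0; b < batch; ++b) {
    bool in_top_k = predicted[b] >= top_k[b * k + k - 1];
    std::printf("sample %d: %s\n", b, in_top_k ? "true" : "false");
  }
  // sample 0: 8 >= 7 -> true; sample 1: 3 >= 4 -> false
  return 0;
}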

View File

@ -20,7 +20,7 @@
#include <cuda_runtime.h>
template <typename T>
void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t class_id_count,
int64_t k, cudaStream_t cuda_stream);
void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size,
size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_

View File

@ -0,0 +1,68 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/prelu_grad_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
__global__ void CalPReLUGradKernel(size_t size, size_t weight_size, size_t per_channel_size,
const T *dy, const T *x, const T *w, T *dx, float *dw_array) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
size_t channel_id = weight_size == 1 ? 0 : (pos / per_channel_size) % weight_size;
size_t index = channel_id * blockDim.x * gridDim.x + thread_id;
T threshold = static_cast<T>(0);
dx[pos] = x[pos] <= threshold ? w[channel_id] * dy[pos] : dy[pos];
if (x[pos] < threshold) {
dw_array[index] += static_cast<float>(x[pos] * dy[pos]);
}
}
}
__global__ void InitDwArrayData(size_t dw_array_size, float *dw_array) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < dw_array_size; i += blockDim.x * gridDim.x) {
dw_array[i] = 0.0;
}
}
template <typename T>
__global__ void ComputeDwData(size_t weight_size, size_t thread_num, const float *dw_array, T *dw) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < weight_size; i += blockDim.x * gridDim.x) {
float value = 0.0;
for (size_t j = 0; j < thread_num; j++) {
value += dw_array[i * thread_num + j];
}
dw[i] = static_cast<T>(value);
}
}
template <typename T>
void CalPReLUGrad(size_t size, size_t weight_size, size_t per_channel_size,
const T *dy, const T *x, const T *w, T *dx, T *dw, float *dw_array, cudaStream_t cuda_stream) {
size_t thread_num = static_cast<size_t>(GET_BLOCKS(size) * GET_THREADS);
size_t dw_array_size = weight_size * thread_num;
InitDwArrayData<<<GET_BLOCKS(dw_array_size), GET_THREADS, 0, cuda_stream>>>(dw_array_size, dw_array);
CalPReLUGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, weight_size, per_channel_size,
dy, x, w, dx, dw_array);
ComputeDwData<<<GET_BLOCKS(weight_size), GET_THREADS, 0, cuda_stream>>>(weight_size, thread_num, dw_array, dw);
return;
}
template void CalPReLUGrad(size_t, size_t, size_t, const float *, const float *, const float *,
float *, float *, float *, cudaStream_t);
template void CalPReLUGrad(size_t, size_t, size_t, const half *, const half *, const half *,
half *, half *, float *, cudaStream_t);
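The dw computation is a two-pass reduction that avoids atomics: dw_array holds one float slot per (channel, thread) pair, CalPReLUGradKernel accumulates each thread's partial gradient into its private slot, and ComputeDwData then folds the thread_num slots of every channel into dw[i]. A CPU model of the layout and the final fold, with made-up partial values:

#include <cstdio>
#include <vector>

// Two-pass dw reduction: private per-thread slots, then a per-channel fold.
int main() {
  const size_t channels = 2, threads = 4;
  std::vector<float> dw_array(channels * threads, 0.f);  // InitDwArrayData
  // Pretend each thread accumulated some partial gradient for its channel.
  for (size_t c = 0; c < channels; ++c)
    for (size_t t = 0; t < threads; ++t) dw_array[c * threads + t] = 0.25f * (c + 1);
  std::vector<float> dw(channels, 0.f);
  for (size_t c = 0; c < channels; ++c)                  // ComputeDwData
    for (size_t t = 0; t < threads; ++t) dw[c] += dw_array[c * threads + t];
  std::printf("dw = %g %g\n", dw[0], dw[1]);  // dw = 1 2
  return 0;
}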

View File

@ -0,0 +1,25 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void CalPReLUGrad(size_t input_size, size_t weight_size, size_t per_channel_size,
const T *dy, const T *x, const T *w, T *dx, T *dw, float *dw_array, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_GRAD_H_

View File

@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/prelu_impl.cuh"
template <typename T>
__global__ void CalPReLUKernel(size_t size, size_t weight_size, size_t per_channel_size,
const T *input, const T *weight, T *output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
size_t channel_id = weight_size == 1 ? 0 : (pos / per_channel_size) % weight_size;
output[pos] = input[pos] < static_cast<T>(0) ? weight[channel_id] * input[pos] : input[pos];
}
}
template <typename T>
void CalPReLU(size_t size, size_t weight_size, size_t per_channel_size,
const T *input, const T *weight, T *output, cudaStream_t cuda_stream) {
CalPReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, weight_size, per_channel_size,
input, weight, output);
return;
}
template void CalPReLU(size_t, size_t, size_t, const float *, const float *, float *, cudaStream_t);
template void CalPReLU(size_t, size_t, size_t, const half *, const half *, half *, cudaStream_t);
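The expression (pos / per_channel_size) % weight_size recovers the channel of a flat NCHW offset: dividing by H*W strips the spatial coordinates and the modulo strips the batch index. A worked check for shape (N=2, C=3, H=2, W=2), so per_channel_size = 4:

#include <cstdio>

// Flat NCHW offset -> channel id, as in CalPReLUKernel: (pos / (H*W)) % C.
int main() {
  const size_t C = 3, per_channel = 2 * 2;  // H * W of a 2x2 feature map
  for (size_t pos : {0, 5, 11, 17})
    std::printf("pos %zu -> channel %zu\n", pos, (pos / per_channel) % C);
  // pos 0 -> 0, pos 5 -> 1, pos 11 -> 2, pos 17 -> 1
  return 0;
}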

View File

@ -0,0 +1,25 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void CalPReLU(size_t input_size, size_t weight_size, size_t per_channel_size,
const T *input, const T *weight, T *output, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_PRELU_H_

View File

@ -30,26 +30,6 @@ void CalReLUGrad(int size, T *dy, T *y, T *dx, cudaStream_t cuda_stream) {
return;
}
template <typename T>
__global__ void PReluChannelSharedGradKernel(size_t size, T *dy_addr, T *x_addr, T *w_addr, T *dx_addr, T *dwc_addr) {
T zero = static_cast<T>(0);
T w = w_addr[0];
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
T dy = dy_addr[pos];
T x = x_addr[pos];
dx_addr[pos] = x > zero ? dy : w * dy;
dwc_addr[pos] = x > zero ? zero : x * dy;
}
}
template <typename T>
void PReluChannelSharedGrad(size_t input_size, T *dy_addr, T *x_addr, T *w_addr, T *dx_addr, T *dwc_addr,
cudaStream_t cuda_stream) {
PReluChannelSharedGradKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, cuda_stream>>>(input_size, dy_addr, x_addr,
w_addr, dx_addr, dwc_addr);
return;
}
template void CalReLUGrad(int size, double *dy, double *y, double *dx, cudaStream_t cuda_stream);
template void CalReLUGrad(int size, float *dy, float *y, float *dx, cudaStream_t cuda_stream);
template void CalReLUGrad(int size, half *dy, half *y, half *dx, cudaStream_t cuda_stream);
@ -58,7 +38,3 @@ template void CalReLUGrad(int size, int16_t *dy, int16_t *y, int16_t *dx, cudaSt
template void CalReLUGrad(int size, int32_t *dy, int32_t *y, int32_t *dx, cudaStream_t cuda_stream);
template void CalReLUGrad(int size, int64_t *dy, int64_t *y, int64_t *dx, cudaStream_t cuda_stream);
template void CalReLUGrad(int size, uint8_t *dy, uint8_t *y, uint8_t *dx, cudaStream_t cuda_stream);
template void PReluChannelSharedGrad(size_t input_size, float *dy_addr, float *x_addr, float *w_addr, float *dx_addr,
float *dwc_addr, cudaStream_t cuda_stream);
template void PReluChannelSharedGrad(size_t input_size, half *dy_addr, half *x_addr, half *w_addr, half *dx_addr,
half *dwc_addr, cudaStream_t cuda_stream);

View File

@ -20,8 +20,4 @@
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void CalReLUGrad(int input_size, T *dy, T *y, T *dx, cudaStream_t cuda_stream);
template <typename T>
void PReluChannelSharedGrad(size_t input_size, T *dy_addr, T *x_addr, T *w_addr, T *dx_addr, T *dwc_addr,
cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_GRAD_H_

View File

@ -96,18 +96,3 @@ template void ReluGradV2(const size_t num, const int64_t *dy, const uint32_t *ma
template void ReluGradV2(const size_t num, const uint8_t *dy, const uint32_t *mask, uint8_t *dx,
cudaStream_t cuda_stream);
template <typename T>
__global__ void CalPReLUKernel(int size, T *input_addr, T *weight_addr, T *output_addr) {
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
output_addr[pos] = input_addr[pos] > static_cast<T>(0) ? input_addr[pos] : *weight_addr * input_addr[pos];
}
}
template <typename T>
void CalPReLU(int size, T *input_addr, T *weight_addr, T *output_addr, cudaStream_t cuda_stream) {
CalPReLUKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_addr, weight_addr, output_addr);
return;
}
template void CalPReLU(int size, float *input_addr, float *weight_addr, float *output_addr, cudaStream_t cuda_stream);
template void CalPReLU(int size, half *input_addr, half *weight_addr, half *output_addr, cudaStream_t cuda_stream);

View File

@ -25,7 +25,4 @@ template <typename T>
void ReluV2(const size_t num, const T *x, T *y, uint32_t *mask, cudaStream_t cuda_stream);
template <typename T>
void ReluGradV2(const size_t num, const T *dy, const uint32_t *mask, T *dx, cudaStream_t cuda_stream);
template <typename T>
void CalPReLU(int input_size, T *input_addr, T *weight_addr, T *output_addr, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_RELU_H_

View File

@ -19,93 +19,97 @@
#include <vector>
#include <map>
#include <string>
#include <functional>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/relu_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/prelu_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class PReLUGpuKernel : public GpuKernel {
public:
PReLUGpuKernel() { ResetResource(); }
~PReLUGpuKernel() override {}
PReLUGpuKernel() = default;
~PReLUGpuKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
if (is_null_input_) {
return true;
}
T *input = GetDeviceAddress<T>(inputs, 0);
T *weight = GetDeviceAddress<T>(inputs, 1);
T *output = GetDeviceAddress<T>(outputs, 0);
auto *input = GetDeviceAddress<T>(inputs, 0);
auto *weight = GetDeviceAddress<T>(inputs, 1);
auto *output = GetDeviceAddress<T>(outputs, 0);
const int size = input_size_ / sizeof(T);
CalPReLU(size, input, weight, output, reinterpret_cast<cudaStream_t>(stream_ptr));
CalPReLU(input_length_, weight_length_, per_channel_length_, input, weight, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
ResetResource();
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(ERROR) << "Argument number is " << input_num << ", but ReLUGpuFwdKernel needs 2.";
MS_LOG(ERROR) << "PReLU needs 2 inputs, but got " << input_num;
return false;
}
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(input_shape);
if (is_null_input_) {
MS_LOG(ERROR) << "PReLUGpuFwdKernel input is null.";
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "ReLU should have 1 output, but got " << input_num;
return false;
}
size_t size = 1;
for (size_t i = 0; i < input_shape.size(); i++) {
size *= input_shape[i];
}
input_size_ = size * sizeof(T);
auto weight_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
is_null_input_ = CHECK_NULL_INPUT(weight_shape);
if (is_null_input_) {
MS_LOG(ERROR) << "PReLUGpuFwdKernel weight is null.";
return false;
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_length_ = std::accumulate(input_shape.begin(), input_shape.end(), size_t(1), std::multiplies<>());
size_t input_rank = input_shape.size();
size_t channel_num;
if (input_rank == 0) {
channel_num = 1;
per_channel_length_ = 1;
} else if (input_rank == 1) {
channel_num = 1;
per_channel_length_ = input_shape[0];
} else {
channel_num = input_shape[1];
per_channel_length_ = std::accumulate(input_shape.begin() + 2, input_shape.end(), size_t(1), std::multiplies<>());
}
size = 1;
for (size_t i = 0; i < weight_shape.size(); i++) {
size *= weight_shape[i];
}
weight_size_ = size * sizeof(T);
auto weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
if (weight_shape.size() != 1 || (weight_shape[0] != 1 && weight_shape[0] != channel_num)) {
MS_LOG(EXCEPTION) << "PReLU requires weight to be a 1-D tensor whose element number is 1 or the channel number "
<< channel_num << ", but got weight shape " << weight_shape;
}
weight_length_ = weight_shape[0];
InitSizeLists();
return true;
}
void ResetResource() noexcept override {
is_null_input_ = false;
input_length_ = 0;
weight_length_ = 0;
per_channel_length_ = 0;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
input_size_ = 0;
workspace_size_ = 0;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(input_size_);
workspace_size_list_.push_back(workspace_size_);
size_t data_size = sizeof(T);
input_size_list_.push_back(input_length_ * data_size);
input_size_list_.push_back(weight_length_ * data_size);
output_size_list_.push_back(input_length_ * data_size);
}
private:
bool is_null_input_;
size_t input_length_{0};
size_t weight_length_{0};
size_t per_channel_length_{0};
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
size_t input_size_;
size_t weight_size_;
size_t workspace_size_;
};
} // namespace kernel
} // namespace mindspore

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/nn/prelu_grad_kernel.h"
#include "backend/kernel_compiler/gpu/nn/prelu_grad_gpu_kernel.h"
namespace mindspore {
namespace kernel {
@ -25,7 +25,7 @@ MS_REG_GPU_KERNEL_ONE(PReLUGrad,
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
PReLUGpuGradKernel, float)
PReLUGradGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(PReLUGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
@ -33,6 +33,6 @@ MS_REG_GPU_KERNEL_ONE(PReLUGrad,
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
PReLUGpuGradKernel, half)
PReLUGradGpuKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,125 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_PRELU_GRAD_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_PRELU_GRAD_GPU_KERNEL_H_
#include <vector>
#include <map>
#include <functional>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/prelu_grad_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class PReLUGradGpuKernel : public GpuKernel {
public:
PReLUGradGpuKernel() = default;
~PReLUGradGpuKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
auto *dy = GetDeviceAddress<T>(inputs, 0);
auto *x = GetDeviceAddress<T>(inputs, 1);
auto *w = GetDeviceAddress<T>(inputs, 2);
auto *dx = GetDeviceAddress<T>(outputs, 0);
auto *dw = GetDeviceAddress<T>(outputs, 1);
auto *dw_array = GetDeviceAddress<float>(workspace, 0);
CalPReLUGrad(input_length_, weight_length_, per_channel_length_, dy, x, w, dx, dw, dw_array,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
ResetResource();
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(ERROR) << "ReLUGrad needs 3 inputs, but got " << input_num;
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(ERROR) << "ReLUGrad should have 2 outputs, but got " << input_num;
return false;
}
auto x_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
input_length_ = std::accumulate(x_shape.begin(), x_shape.end(), size_t(1), std::multiplies<>());
size_t x_rank = x_shape.size();
size_t channel_num;
if (x_rank == 0) {
channel_num = 1;
per_channel_length_ = 1;
} else if (x_rank == 1) {
channel_num = 1;
per_channel_length_ = x_shape[0];
} else {
channel_num = x_shape[1];
per_channel_length_ = std::accumulate(x_shape.begin() + 2, x_shape.end(), size_t(1), std::multiplies<>());
}
auto weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
if (weight_shape.size() != 1 || (weight_shape[0] != 1 && weight_shape[0] != channel_num)) {
MS_LOG(EXCEPTION) << "PReLUGrad requires weight to be a 1-D tensor whose element number is 1 or the channel number "
<< channel_num << ", but got weight shape " << weight_shape;
}
weight_length_ = weight_shape[0];
workspace_size_ = weight_length_ * IntToSize(GET_BLOCKS(input_length_) * GET_THREADS) * sizeof(float);
InitSizeLists();
return true;
}
void ResetResource() noexcept override {
input_length_ = 0;
weight_length_ = 0;
per_channel_length_ = 0;
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
}
protected:
void InitSizeLists() override {
size_t data_size = sizeof(T);
input_size_list_.push_back(input_length_ * data_size);
input_size_list_.push_back(input_length_ * data_size);
input_size_list_.push_back(weight_length_ * data_size);
output_size_list_.push_back(input_length_ * data_size);
output_size_list_.push_back(weight_length_ * data_size);
workspace_size_list_.push_back(workspace_size_);
}
private:
size_t input_length_{0};
size_t weight_length_{0};
size_t per_channel_length_{0};
size_t workspace_size_{0};
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_PRELU_GRAD_GPU_KERNEL_H_
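workspace_size_ reserves the dw_array used by the two-pass reduction in prelu_grad_impl.cu: one float per (channel, thread) slot, i.e. weight_length_ * GET_BLOCKS(input_length_) * GET_THREADS * sizeof(float). A rough size check, assuming 256 threads per block and a plain ceil-div block count (the real GET_* macros may cap the grid, so treat this as an upper-bound sketch):

#include <cstdio>

// Hypothetical workspace sizing; 256 threads/block and an uncapped ceil-div
// block count are assumptions, not MindSpore's exact GET_BLOCKS/GET_THREADS.
int main() {
  const size_t input_length = 1 << 20, weight_length = 64, threads = 256;
  size_t blocks = (input_length + threads - 1) / threads;
  size_t bytes = weight_length * blocks * threads * sizeof(float);
  std::printf("%zu MiB\n", bytes >> 20);  // 256 MiB for this configuration
  return 0;
}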

View File

@ -1,196 +0,0 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_PRELU_GRAD_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_PRELU_GRAD_KERNEL_H_
#include <vector>
#include <string>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/kernel_constants.h"
#include "backend/kernel_compiler/gpu/cuda_impl/relu_grad_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class PReLUGpuGradKernel : public GpuKernel {
public:
PReLUGpuGradKernel()
: data_format_(kOpFormat_NCDHW),
input_size_(0),
weight_size_(0),
reduce_workspace_size_(0),
spatial_count_(1),
is_null_input_(false),
channel_shared_(false),
channel_last_(false) {}
~PReLUGpuGradKernel() override { DestroyResource(); }
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *dy_addr = GetDeviceAddress<T>(inputs, 0);
T *x_addr = GetDeviceAddress<T>(inputs, 1);
T *w_addr = GetDeviceAddress<T>(inputs, 2);
T *dx_addr = GetDeviceAddress<T>(outputs, 0);
T *dw_addr = GetDeviceAddress<T>(outputs, 1);
T *dw_collector_addr = GetDeviceAddress<T>(workspace, 0);
T *reduce_workspace_addr = GetDeviceAddress<T>(workspace, 1);
PReluChannelSharedGrad(input_size_ / sizeof(T), dy_addr, x_addr, w_addr, dx_addr, dw_collector_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
if (data_type_ == CUDNN_DATA_DOUBLE) {
T alpha = static_cast<T>(1.0f);
T beta = static_cast<T>(0.0f);
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, reduce_workspace_addr,
reduce_workspace_size_, &alpha, grad_weight_collector_descriptor_, dw_collector_addr, &beta,
grad_weight_descriptor_, dw_addr),
"cudnnReduceTensor failed.");
} else {
const float alphaf = static_cast<float>(1.0f);
const float betaf = static_cast<float>(0.0f);
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, reduce_workspace_addr,
reduce_workspace_size_, &alphaf, grad_weight_collector_descriptor_, dw_collector_addr, &betaf,
grad_weight_descriptor_, dw_addr),
"cudnnReduceTensor failed.");
}
return true;
}
void InitResource() override {
cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
"cudnnCreateReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&grad_weight_collector_descriptor_),
"cudnnCreateTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&grad_weight_descriptor_),
"cudnnCreateTensorDescriptor failed.");
}
void DestroyResource() noexcept override {
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
"cudnnDestroyReduceTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(grad_weight_collector_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(grad_weight_descriptor_),
"cudnnDestroyTensorDescriptor failed.");
}
bool Init(const CNodePtr &kernel_node) override {
kernel_node_ = kernel_node;
input_size_ = sizeof(T);
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
is_null_input_ = CHECK_NULL_INPUT(input_shape);
if (is_null_input_) {
MS_LOG(WARNING) << "PReLUGpuBwdKernel input is null.";
}
for (size_t i = 0; i < input_shape.size(); ++i) {
input_size_ *= input_shape[i];
}
weight_size_ = sizeof(T);
auto weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
is_null_input_ = CHECK_NULL_INPUT(weight_shape);
if (is_null_input_) {
MS_LOG(WARNING) << "PReLUGpuBwdKernel input is null.";
}
for (auto dim : weight_shape) {
weight_size_ *= dim;
}
channel_shared_ = (weight_shape[0] == 1);
if (!channel_shared_) {
MS_LOG(WARNING)
<< "PReLUGpuBwdKernel shares weight for all channels, but the given weight tensor has more than one element.";
}
spatial_count_ = 1;
if (channel_last_) {
for (size_t i = 1; i < input_shape.size() - 1; ++i) {
spatial_count_ *= input_shape[i];
}
} else {
for (size_t i = 2; i < input_shape.size(); ++i) {
spatial_count_ *= input_shape[i];
}
}
data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
int input_dim_length = input_shape.size();
std::vector<size_t> reduce_out_shape(input_dim_length, 1);
if (channel_last_) {
reduce_out_shape[input_dim_length - 1] = weight_shape[0];
} else {
reduce_out_shape[1] = weight_shape[0];
}
InitResource();
CudnnSetTensorNdDescriptor(reduce_out_shape, grad_weight_descriptor_, data_type_, kernel_node_);
CudnnSetTensorNdDescriptor(input_shape, grad_weight_collector_descriptor_, data_type_, kernel_node_);
cudnnDataType_t comp_type = (data_type_ == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, CUDNN_REDUCE_TENSOR_ADD, comp_type,
CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES),
"cudnnSetReduceTensorDescriptor failed");
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
input_size_list_.push_back(input_size_);
input_size_list_.push_back(weight_size_);
output_size_list_.push_back(input_size_);
output_size_list_.push_back(weight_size_);
CHECK_CUDNN_RET_WITH_EXCEPT(
kernel_node_,
cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, grad_weight_collector_descriptor_,
grad_weight_descriptor_, &reduce_workspace_size_),
"cudnnGetReductionWorkspaceSize failed.");
workspace_size_list_.push_back(input_size_);
workspace_size_list_.push_back(reduce_workspace_size_);
}
private:
cudnnHandle_t cudnn_handle_;
cudnnDataType_t data_type_;
cudnnReduceTensorDescriptor_t reduce_tensor_descriptor_;
cudnnTensorDescriptor_t grad_weight_collector_descriptor_;
cudnnTensorDescriptor_t grad_weight_descriptor_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
std::string data_format_ = kOpFormat_NCDHW;
size_t input_size_;
size_t weight_size_;
size_t reduce_workspace_size_;
size_t spatial_count_;
bool is_null_input_ = false;
bool channel_shared_ = false;
bool channel_last_ = false;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_PRELU_GRAD_KERNEL_H_

View File

@ -234,7 +234,7 @@ std::vector<TaskInfoPtr> HcclKernel::GenTask(const std::vector<AddressPtr> &inpu
}
results.emplace_back(
std::make_shared<HcclTaskInfo>(kernel_name_, stream_id, hccl::HcclAdapter::GetHcclType(anf_node), input_data_addr,
std::make_shared<HcclTaskInfo>(unique_name_, stream_id, hccl::HcclAdapter::GetHcclType(anf_node), input_data_addr,
output_data_addr, workspace_addr, task.workspace_size, task.stream_num,
private_def, hccl::HcclAdapter::GetInstance().GetHcclOpsKernelInfoStore(),
hccl_count_, root_id_, op_type_, data_type, group_, NeedDump()));

View File

@ -45,7 +45,7 @@ std::string GetKernelFormat(const CNodePtr &kernel_node, size_t index) {
return format;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index);
if (op_name == kAllGatherOpName) {
if (op_name == kAllGatherOpName && input_shape.size() <= kShape4dDims) {
auto pad_shape = trans::PaddingShapeTo4dDefault(input_shape);
if (pad_shape[N_nchw] % kCubeSize != 0 || pad_shape[C_nchw] % kCubeSize != 0) {
return kOpFormat_DEFAULT;
@ -93,11 +93,7 @@ void HcclMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<K
std::vector<TypeId> outputs_type;
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
for (size_t output_index = 0; output_index < output_num; ++output_index) {
if (op_name == kReduceScatter && AnfAlgo::GetNodeAttr<int64_t>(kernel_node, kAttrFusion) > 0) {
outputs_format.emplace_back(GetKernelFormat(kernel_node, 0));
} else {
outputs_format.emplace_back(GetKernelFormat(kernel_node, output_index));
}
outputs_format.emplace_back(GetKernelFormat(kernel_node, output_index));
if (op_name == kReceive) {
outputs_type.push_back(recv_type);
} else {

View File

@ -142,10 +142,14 @@ class KernelMod {
virtual void ReleaseResource() {}
virtual ~KernelMod() = default;
void set_kernel_name(const std::string &kernel_name) { kernel_name_ = kernel_name; }
void set_unique_name(const std::string &unique_name) { unique_name_ = unique_name; }
void set_fullname(const std::string &fullname) { fullname_ = fullname; }
void set_is_monad(bool is_monad) { is_monad_ = is_monad; }
protected:
std::string kernel_name_;
std::string unique_name_;
std::string fullname_;
bool is_monad_{false};
};
using KernelModPtr = std::shared_ptr<KernelMod>;
} // namespace kernel
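These setters exist so the surrounding GenTask diffs can label each runtime task with a per-node name: kernel_name_ is only the op type, so two MemcpyAsync nodes in one graph would produce indistinguishable TaskInfo entries, while unique_name_ carries an instance-specific identifier. A toy illustration of the distinction; the name values here are hypothetical, not MindSpore's actual format:

#include <cstdio>
#include <string>

// kernel_name_ names the op type; unique_name_ disambiguates instances.
struct KernelModLite {
  std::string kernel_name_, unique_name_;
};

int main() {
  KernelModLite a{"MemcpyAsync", "MemcpyAsync-op12"};  // hypothetical names
  KernelModLite b{"MemcpyAsync", "MemcpyAsync-op47"};
  std::printf("%s vs %s\n", a.unique_name_.c_str(), b.unique_name_.c_str());
  return 0;
}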

View File

@ -55,7 +55,7 @@ std::vector<TaskInfoPtr> AssignKernel::GenTask(const std::vector<AddressPtr> &in
stream_id_ = stream_id;
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr =
std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, inputs[0]->addr, inputs[0]->size, inputs[1]->addr,
std::make_shared<MemcpyAsyncTaskInfo>(unique_name_, stream_id, inputs[0]->addr, inputs[0]->size, inputs[1]->addr,
inputs[1]->size, RT_MEMCPY_DEVICE_TO_DEVICE, false);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};

View File

@ -54,7 +54,7 @@ std::vector<TaskInfoPtr> LabelGotoKernel::GenTask(const std::vector<AddressPtr>
MS_LOG(INFO) << "LabelGotoKernel GenTask label:" << label_ << ", stream id:" << stream_id;
std::vector<TaskInfoPtr> task_info_list;
std::shared_ptr<LabelGotoTaskInfo> task_info_ptr =
std::make_shared<LabelGotoTaskInfo>(kernel_name_, stream_id, label_);
std::make_shared<LabelGotoTaskInfo>(unique_name_, stream_id, label_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
return task_info_list;

View File

@ -53,7 +53,7 @@ std::vector<TaskInfoPtr> LabelSetKernel::GenTask(const std::vector<AddressPtr> &
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "LabelSetKernel GenTask label:" << label_ << ", stream id:" << stream_id;
std::vector<TaskInfoPtr> task_info_list;
std::shared_ptr<LabelSetTaskInfo> task_info_ptr = std::make_shared<LabelSetTaskInfo>(kernel_name_, stream_id, label_);
std::shared_ptr<LabelSetTaskInfo> task_info_ptr = std::make_shared<LabelSetTaskInfo>(unique_name_, stream_id, label_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
return task_info_list;

View File

@ -64,7 +64,7 @@ std::vector<TaskInfoPtr> LabelSwitchKernel::GenTask(const std::vector<AddressPtr
MS_LOG(INFO) << "LabelSwitchKernel GenTask label size:" << label_size_ << ", stream id:" << stream_id;
std::vector<TaskInfoPtr> task_info_list;
cond_ = inputs[0]->addr;
auto task_info_ptr = std::make_shared<LabelSwitchTaskInfo>(kernel_name_, stream_id, label_size_, label_list_, cond_);
auto task_info_ptr = std::make_shared<LabelSwitchTaskInfo>(unique_name_, stream_id, label_size_, label_list_, cond_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
return task_info_list;

View File

@ -122,7 +122,7 @@ std::vector<TaskInfoPtr> MemCpyAsyncKernel::GenTask(const std::vector<AddressPtr
stream_id_ = stream_id;
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr =
std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
std::make_shared<MemcpyAsyncTaskInfo>(unique_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
inputs[0]->size, RT_MEMCPY_DEVICE_TO_DEVICE, NeedDump());
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};

View File

@ -63,7 +63,7 @@ std::vector<TaskInfoPtr> ProfilingKernelMod::GenTask(const std::vector<AddressPt
<< ", outputs size:" << outputs.size();
stream_id_ = stream_id;
std::shared_ptr<ProfilerTraceTaskInfo> task_info_ptr =
std::make_shared<ProfilerTraceTaskInfo>(kernel_name_, stream_id, log_id_, notify_, flags_);
std::make_shared<ProfilerTraceTaskInfo>(unique_name_, stream_id, log_id_, notify_, flags_);
return {task_info_ptr};
}

View File

@ -57,7 +57,7 @@ std::vector<TaskInfoPtr> RecvKernel::GenTask(const std::vector<AddressPtr> &, co
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "RecvKernel GenTask event_id_:" << event_id_ << ", stream_id_:" << stream_id;
stream_id_ = stream_id;
EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(kernel_name_, stream_id, event_id_);
EventWaitTaskInfoPtr task_info_ptr = std::make_shared<EventWaitTaskInfo>(unique_name_, stream_id, event_id_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}

View File

@ -56,7 +56,7 @@ std::vector<TaskInfoPtr> SendKernel::GenTask(const std::vector<AddressPtr> &, co
const std::vector<AddressPtr> &, uint32_t stream_id) {
MS_LOG(INFO) << "SendKernel GenTask event id:" << event_id_ << ", stream id:" << stream_id;
stream_id_ = stream_id;
EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(kernel_name_, stream_id, event_id_);
EventRecordTaskInfoPtr task_info_ptr = std::make_shared<EventRecordTaskInfo>(unique_name_, stream_id, event_id_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}

View File

@ -71,7 +71,7 @@ std::vector<TaskInfoPtr> StreamActiveKernel::GenTask(const std::vector<AddressPt
std::vector<TaskInfoPtr> task_info_list;
for (auto &index : active_streams_index_) {
std::shared_ptr<StreamActiveTaskInfo> task_info_ptr =
std::make_shared<StreamActiveTaskInfo>(kernel_name_, stream_id, index);
std::make_shared<StreamActiveTaskInfo>(unique_name_, stream_id, index);
MS_EXCEPTION_IF_NULL(task_info_ptr);
task_info_list.emplace_back(task_info_ptr);
MS_LOG(INFO) << "StreamActiveKernel GenTask: streamId:" << stream_id << ", Active streamId:" << index;

View File

@ -91,7 +91,7 @@ std::vector<TaskInfoPtr> StreamSwitchKernel::GenTask(const std::vector<AddressPt
MS_LOG(INFO) << "cond_:" << static_cast<int>(cond_) << ", true_stream_index_:" << true_stream_index_
<< ", stream_id:" << stream_id;
std::shared_ptr<StreamSwitchTaskInfo> task_info_ptr = std::make_shared<StreamSwitchTaskInfo>(
kernel_name_, stream_id, true_stream_index_, loop_cnt, ites_per_loop, cond_, data_type_);
unique_name_, stream_id, true_stream_index_, loop_cnt, ites_per_loop, cond_, data_type_);
MS_EXCEPTION_IF_NULL(task_info_ptr);
return {task_info_ptr};
}

View File

@ -142,10 +142,10 @@ std::vector<TaskInfoPtr> TensorCopySlices::GenTask(const std::vector<AddressPtr>
stream_id_ = stream_id;
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr1 =
std::make_shared<MemcpyAsyncTaskInfo>(kernel_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
std::make_shared<MemcpyAsyncTaskInfo>(unique_name_, stream_id, outputs[0]->addr, outputs[0]->size, inputs[0]->addr,
inputs[0]->size, RT_MEMCPY_DEVICE_TO_DEVICE, NeedDump());
std::shared_ptr<MemcpyAsyncTaskInfo> task_info_ptr2 = std::make_shared<MemcpyAsyncTaskInfo>(
kernel_name_, stream_id, VoidPointerOffset(outputs[0]->addr, offset_), copy_size_, inputs[1]->addr, copy_size_,
unique_name_, stream_id, VoidPointerOffset(outputs[0]->addr, offset_), copy_size_, inputs[1]->addr, copy_size_,
RT_MEMCPY_DEVICE_TO_DEVICE, NeedDump());
return {task_info_ptr1, task_info_ptr2};
}

View File

@ -103,7 +103,7 @@ std::vector<TaskInfoPtr> TbeKernelMod::GenTask(const std::vector<AddressPtr> &in
MS_LOG(INFO) << "block_dim is:" << block_dim_;
TbeTaskInfoPtr task_info_ptr = std::make_shared<mindspore::ge::model_runner::TbeTaskInfo>(
kernel_name_, stream_id, stub_func, block_dim_, args, 0, sm_desc, nullptr, 0, meta_data, input_data_addrs,
unique_name_, stream_id, stub_func, block_dim_, args, 0, sm_desc, nullptr, 0, meta_data, input_data_addrs,
output_data_addrs, workspace_addrs, NeedDump());
return {task_info_ptr};
}

View File

@ -40,7 +40,7 @@ static bool CheckStridedSlice(const CNodePtr &cnode) {
}
}
// check reduction on the last dimension
if (AnfAlgo::HasNodeAttr(kAttrShrinkAxisMask, cnode)) {
if (GetCNodeFuncName(cnode) == kStridedSliceOpName && AnfAlgo::HasNodeAttr(kAttrShrinkAxisMask, cnode)) {
auto shrink_axis_mask = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(cnode, kAttrShrinkAxisMask));
AnfNodePtr input = cnode->input(1);
int input_dims = 0;

View File

@ -28,20 +28,6 @@
namespace mindspore {
namespace opt {
namespace {
constexpr size_t kEltwiseInputSize = 2;
constexpr size_t kEltwiseOutputSize = 2;
bool CheckEltwiseInputAndOutputSize(const AnfNodePtr &node) {
if (AnfAlgo::GetInputTensorNum(node) == kEltwiseInputSize) {
return true;
}
if (AnfAlgo::GetOutputTensorNum(node) == kEltwiseOutputSize) {
return true;
}
return false;
}
} // namespace
void BnupdateEltwiseEltwiseFusionPass::MatchBnupdateAddRelu(const CNodePtr &cnode, const AnfNodePtr &relu_input,
const session::KernelGraph &kernel_graph,
FusedNodeRecord *candidate_fusion) {
@ -82,7 +68,8 @@ void BnupdateEltwiseEltwiseFusionPass::MatchSingleFusionPattern(const session::K
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
if (AnfAlgo::GetKernelType(cnode) == KernelType::TBE_KERNEL &&
AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE && CheckEltwiseInputAndOutputSize(cnode)) {
AnfAlgo::GetFusionType(cnode) == kernel::FusionType::ELEMWISE &&
AnfAlgo::GetOutputTensorNum(cnode) == ELTWISE_DOUBLE_OUTPUT_SIZE) {
auto eltwise_input = cnode->input(kIndex1);
MS_EXCEPTION_IF_NULL(eltwise_input);
if (eltwise_input->isa<CNode>() && AnfAlgo::CheckPrimitiveType(eltwise_input, prim::kPrimAdd)) {

View File

@ -456,6 +456,20 @@ AnfNodePtr CreateDwReduceSum(const FuncGraphPtr &func_graph, const CNodePtr &dyn
return reduce_sum;
}
AnfNodePtr CreateDwReshape(const FuncGraphPtr &func_graph, const CNodePtr &dynamic_rnn_grad_cnode,
const AnfNodePtr &batch_matmul) {
MS_EXCEPTION_IF_NULL(func_graph);
// Create node
std::vector<AnfNodePtr> reshape_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimReshape->name())),
batch_matmul};
auto reshape = func_graph->NewCNode(reshape_inputs);
// Set infer data type and shape
AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(dynamic_rnn_grad_cnode, 0)},
{AnfAlgo::GetOutputInferShape(dynamic_rnn_grad_cnode, 0)}, reshape.get());
AnfAlgo::SetNodeAttr("is_backend_insert", MakeValue(true), reshape);
return reshape;
}
AnfNodePtr CreateValueNode(const FuncGraphPtr &func_graph, const CNodePtr &dynamic_rnn_grad_cnode) {
auto origin_input7 = dynamic_rnn_grad_cnode->input(kIndex8);
auto origin_input7_shape = AnfAlgo::GetOutputInferShape(origin_input7, 0);
@ -528,7 +542,8 @@ const AnfNodePtr DynamicRnnGradFissionV2::Process(const FuncGraphPtr &func_graph
auto dw_reduce_sum = CreateDwReduceSum(func_graph, dynamic_rnn_grad_cnode, batch_matmul);
make_tuple_inputs.emplace_back(dw_reduce_sum);
} else {
make_tuple_inputs.emplace_back(batch_matmul);
auto dw_reshape = CreateDwReshape(func_graph, dynamic_rnn_grad_cnode, batch_matmul);
make_tuple_inputs.emplace_back(dw_reshape);
}
auto value_node = CreateValueNode(func_graph, dynamic_rnn_grad_cnode);

View File

@ -140,15 +140,13 @@ AnfNodePtr AnfRuntimeAlgorithm::MakeMonadValueNode(const KernelGraphPtr &kg) {
return kg->NewValueNode(kUMonad->ToAbstract(), kUMonad);
}
// Convert:
// a = former(xxx)
// b = latter(x, xxx)
// To:
// a = former(xxx)
// d1 = Depend(x, a)
// b = latter(d1, xxx)
// ...
// out = Depend(out, latter)
// Convert: a = former(xxx)
// b = latter(x, xxx)
// To: a = former(xxx)
// d1 = Depend(x, a)
// b = latter(d1, xxx)
// ...
// out = Depend(out, latter)
void AnfRuntimeAlgorithm::KeepOrder(const KernelGraphPtr &kg, const AnfNodePtr &former, const AnfNodePtr &latter) {
if (latter->isa<CNode>()) {
auto latter_cnode = latter->cast<CNodePtr>();
@ -321,19 +319,33 @@ std::vector<KernelWithIndex> AnfRuntimeAlgorithm::GetAllOutputWithIndex(const An
return ret;
}
// A value node needs to enumerate all its elements.
if (node->isa<ValueNode>()) {
auto value = node->cast<ValueNodePtr>()->value();
MS_EXCEPTION_IF_NULL(value);
if (value->isa<None>()) {
return ret;
} else if (value->isa<ValueTuple>()) {
auto value_tuple = value->cast<ValueTuplePtr>();
auto value_tuple_size = CountValueNum(value_tuple);
for (size_t i = 0; i < value_tuple_size; ++i) {
ret.push_back({node, i});
}
} else {
ret.push_back({node, 0});
}
return ret;
}
const std::vector<PrimitivePtr> return_types = {prim::kPrimDepend, prim::kPrimMakeTuple};
size_t outputs_num = 1;
// Value node may be tuple which has multi outputs.
if (IsRealCNodeKernel(node) || node->isa<ValueNode>()) {
if (IsRealCNodeKernel(node)) {
outputs_num = AnfAlgo::GetOutputTensorNum(node);
}
// The output may be a tuple, so visit all the outputs of the node.
for (size_t i = 0; i < outputs_num; ++i) {
auto output_with_index = AnfAlgo::VisitKernelWithReturnType(node, i, false, return_types);
MS_EXCEPTION_IF_NULL(output_with_index.first);
if (node->isa<ValueNode>()) {
output_with_index.second = i;
}
// Depend and MakeTuple nodes need to be visited recursively.
if (AnfAlgo::CheckPrimitiveType(output_with_index.first, prim::kPrimDepend) ||
@ -343,15 +355,6 @@ std::vector<KernelWithIndex> AnfRuntimeAlgorithm::GetAllOutputWithIndex(const An
continue;
}
// Skip the empty value node.
if (output_with_index.first->isa<ValueNode>()) {
auto value = output_with_index.first->cast<ValueNodePtr>()->value();
MS_EXCEPTION_IF_NULL(value);
if (value->isa<ValueTuple>() && (value->cast<ValueTuplePtr>()->size() == 0)) {
continue;
}
}
// Ignore the output of front call node.
if (output_with_index.first->isa<CNode>()) {
auto cnode = output_with_index.first->cast<CNodePtr>();
@ -2163,5 +2166,16 @@ bool AnfRuntimeAlgorithm::IsControlOpExecInBackend(const AnfNodePtr &node) {
static std::set<std::string> control_ops_exec_in_backend = {kBpropCutOpName};
return control_ops_exec_in_backend.find(AnfAlgo::GetCNodeName(node)) != control_ops_exec_in_backend.end();
}
bool AnfRuntimeAlgorithm::IsNodeInputContainMonad(const AnfNodePtr &node) {
auto input_size = GetInputTensorNum(node);
for (size_t i = 0; i < input_size; ++i) {
auto input_with_index = GetPrevNodeOutput(node, i);
if (HasAbstractMonad(input_with_index.first)) {
return true;
}
}
return false;
}
} // namespace session
} // namespace mindspore

View File

@ -300,6 +300,8 @@ class AnfRuntimeAlgorithm {
// executed in vm. For example, the operator "bprop_cut" will be compiled into kernel graph and be launch
// in backend in PyNative mode.
static bool IsControlOpExecInBackend(const AnfNodePtr &node);
static bool IsNodeInputContainMonad(const AnfNodePtr &node);
};
} // namespace session
using AnfAlgo = session::AnfRuntimeAlgorithm;

View File

@ -42,6 +42,9 @@ using GraphArgPair = std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>;
// We start label id from 0, and use 0xFFFFFFFF to indicate label not set.
constexpr uint32_t kNoLabel = 0xFFFFFFFF;
// We start the input index from 2 for AssignOp, since inputs[1] is the output and inputs[2] is the input;
constexpr size_t kInputIndex = 2;
// Primitive attribute for argument link assign.
const char LINK[] = "link";
@ -151,6 +154,25 @@ bool IsCompatible(const abstract::AbstractBasePtr &a1, const abstract::AbstractB
if (a1 == a2) {
return true;
}
// Check AbstractTuple.
if (a1->isa<abstract::AbstractTuple>() && a2->isa<abstract::AbstractTuple>()) {
auto &a1_tuple = static_cast<abstract::AbstractTuple &>(*a1);
auto &a2_tuple = static_cast<abstract::AbstractTuple &>(*a2);
auto &a1_elements = a1_tuple.elements();
auto &a2_elements = a2_tuple.elements();
if (a1_elements.size() != a2_elements.size()) {
return false;
}
for (size_t i = 0; i < a1_elements.size(); i++) {
MS_EXCEPTION_IF_NULL(a1_elements[i]);
MS_EXCEPTION_IF_NULL(a2_elements[i]);
if (!IsCompatible(a1_elements[i], a2_elements[i])) {
return false;
}
}
return true;
}
// Check AbstractTensor and AbstractRef.
auto type1 = a1->BuildType();
auto type2 = a2->BuildType();
if (type1 != type2 && *type1 != *type2) {
@ -855,9 +877,8 @@ class AscendAutoMonadConverter {
std::vector<CNodePtr> *stack_pushs) {
uint32_t start_index = 1;
if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimAssign)) {
start_index = 2;
start_index = kInputIndex;
}
// auto node_inputs = node->inputs();
for (uint32_t i = start_index; i < node->inputs().size(); i++) {
auto node_input = node->input(i);
// No need to save monad.

View File

@ -841,9 +841,10 @@ void AscendSession::InitRuntimeResource() {
if (!runtime_instance->Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto env_rank_id = common::GetEnv("RANK_ID");
if (!(env_table_file.empty() || env_rank_id.empty())) {
if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
// get actual rank id if it's distribution training case.
rank_id_ = GetRankId();
}

View File

@ -94,6 +94,7 @@ namespace gpu {
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
using CollectiveInitializer = device::gpu::CollectiveInitializer;
using GetLocalRankId = device::gpu::GetLocalRankId;
using InitNCCLComm = device::gpu::InitNCCLComm;
void GPUSession::Init(uint32_t device_id) {
const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
@ -112,8 +113,15 @@ void GPUSession::Init(uint32_t device_id) {
MS_EXCEPTION_IF_NULL(ms_context);
ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id);
if (collective_inited) {
rank_id_ = GetRankId();
if (collective_handle_ != nullptr) {
auto init_nccl_comm_funcptr =
reinterpret_cast<InitNCCLComm>(dlsym(const_cast<void *>(collective_handle_), "InitNCCLComm"));
MS_EXCEPTION_IF_NULL(init_nccl_comm_funcptr);
(*init_nccl_comm_funcptr)();
rank_id_ = GetRankId();
}
}
auto &json_parser = DumpJsonParser::GetInstance();
// Dump json config file if dump is enabled
json_parser.CopyJsonToDir(rank_id_);

View File

@ -1162,6 +1162,19 @@ void KernelGraph::CacheGraphOutputToFrontNodeWithIndex(const AnfNodePtr &backend
auto backend_outputs = AnfAlgo::GetAllOutputWithIndex(backend_graph_output);
auto front_outputs = AnfAlgo::GetAllOutputWithIndex(front_node);
if (backend_outputs.size() != front_outputs.size()) {
for (const auto &backend_output : backend_outputs) {
const auto &graph = backend_output.first->func_graph();
if (graph != nullptr) {
const auto &kernel_graph = dynamic_cast<KernelGraph *>(graph.get());
MS_EXCEPTION_IF_NULL(kernel_graph);
const auto &front_node = kernel_graph->GetFrontAnfByBackendAnf(backend_output.first);
if (front_node != nullptr) {
graph_output_to_front_node_map_[backend_output] = {front_node, backend_output.second};
MS_LOG(INFO) << "Backend output:" << backend_output.first->DebugString()
<< " map to front node:" << front_node->DebugString();
}
}
}
MS_LOG(INFO) << "The size(" << backend_outputs.size()
<< ") of backend output: " << backend_graph_output->DebugString() << " is not equal to the size("
<< front_outputs.size() << ") of front output: " << front_node->DebugString();

View File

@ -1192,6 +1192,7 @@ void SessionBasic::GetParameterIndex(const KernelGraph *graph, const std::vector
<< ", input size: " << inputs.size();
}
const auto &input = inputs[index];
MS_EXCEPTION_IF_NULL(input);
// Check shape of input and parameter
const auto &input_shape = input->shape();
const auto &param_shape = AnfAlgo::GetOutputInferShape(param, 0);
@ -1405,6 +1406,36 @@ void SessionBasic::GetOpInputTensors(const CNodePtr &cnode,
}
}
tensor::TensorPtr SessionBasic::GetOpInputTensorByIndex(const CNodePtr &cnode,
const std::map<KernelWithIndex, tensor::TensorPtr> &op_output,
const std::map<AnfNodePtr, size_t> &parameter_index,
const std::vector<tensor::TensorPtr> &graph_inputs,
InputTensorInfo *input_tensor_info, size_t input_index) {
MS_EXCEPTION_IF_NULL(cnode);
MS_EXCEPTION_IF_NULL(input_tensor_info);
if (input_index >= cnode->inputs().size() - 1) {
MS_LOG(EXCEPTION) << "Input index is out of range:" << cnode->inputs().size() << ",cnode:" << cnode->DebugString();
}
const auto &input = cnode->input(input_index + 1);
auto kernel_with_index = AnfAlgo::VisitKernel(input, 0);
auto real_input = kernel_with_index.first;
MS_EXCEPTION_IF_NULL(real_input);
if (real_input->isa<Parameter>()) {
return GetParameterOutputTensor(real_input, parameter_index, graph_inputs);
} else if (real_input->isa<CNode>()) {
tensor::TensorPtr tensor = GetCNodeOutputTensor(kernel_with_index, op_output);
if (AnfAlgo::IsControlOpExecInBackend(real_input)) {
CheckInputTensorShape(tensor, cnode, input_index);
}
input_tensor_info->input_kernel.insert(kernel_with_index);
return tensor;
} else {
MS_LOG(EXCEPTION) << "Invalid input node, node = " << real_input->DebugString();
}
}
bool SessionBasic::CreateCNodeOfKernelGraph(const AnfNodePtr &node, KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(graph);
@ -2614,6 +2645,7 @@ uint32_t GetRankId() {
world_group = kNcclWorldGroup;
} else {
MS_LOG(ERROR) << "Invalid backend: " << backend;
return rank_id;
}
if (!CommManager::GetInstance().GetRankID(world_group, &rank_id)) {
MS_LOG(INFO) << "Failed to get rank id.";

View File

@ -258,6 +258,12 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
void GetOpInputTensors(const CNodePtr &cnode, const std::map<KernelWithIndex, tensor::TensorPtr> &op_output,
const std::map<AnfNodePtr, size_t> &parameter_index,
const std::vector<tensor::TensorPtr> &graph_inputs, InputTensorInfo *input_tensor_info);
tensor::TensorPtr GetOpInputTensorByIndex(const CNodePtr &cnode,
const std::map<KernelWithIndex, tensor::TensorPtr> &op_output,
const std::map<AnfNodePtr, size_t> &parameter_index,
const std::vector<tensor::TensorPtr> &graph_inputs,
InputTensorInfo *input_tensor_info, size_t input_index);
// create a new kernel graph and update the graph sum
KernelGraphPtr NewKernelGraph();
AnfNodePtr CreateParameterFromTuple(const AnfNodePtr &node, KernelGraph *graph);

Some files were not shown because too many files have changed in this diff.