diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.cc
deleted file mode 100644
index 4ca1f8e505b..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "frontend/parallel/ops_info/matmul_ffn_info.h"
-#include "frontend/parallel/dynamic_creator.h"
-
-namespace mindspore {
-namespace parallel {
-// MatMulQkv has 3 inputs and 2 outputs
-// x: (batch * seq_len (inc is 1), hidden_size)
-// weight_1: (weight_1_hidden_size, hidden_size)
-// weight_2: (weight_2_hidden_size, hidden_size)
-// ------------------------------
-// output_1: (batch * seq_len (inc is 1), weight_1_hidden_size)
-// output_2: (batch * seq_len (inc is 1), weight_2_hidden_size)
-
-constexpr size_t kMatMulFfnOutputSize = 2;
-
-Status MatmulFfnInfo::CheckStrategy(const StrategyPtr &strategy) {
-  if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
-    return FAILED;
-  }
-
-  // TODO
-
-  return SUCCESS;
-}
-
-Status MatmulFfnInfo::InferDevMatrixShape() {
-  auto input_strategies = strategy()->GetInputDim();
-  auto x = input_strategies.at(0);  // (batch * seq_len, hidden_size)
-  auto weight_1 = input_strategies.at(1);
-  // dp  mp
-  //  1   0
-  dev_matrix_shape_ = {x.at(0), weight_1.at(0)};
-
-  return SUCCESS;
-}
-
-Status MatmulFfnInfo::InferTensorMap() {
-  Shape x_tensor_map{1, -1};
-  Shape weight_1_tensor_map{0, -1};
-  Shape weight_2_tensor_map{0, -1};
-  inputs_tensor_map_.emplace_back(x_tensor_map);
-  inputs_tensor_map_.emplace_back(weight_1_tensor_map);
-  inputs_tensor_map_.emplace_back(weight_2_tensor_map);
-
-  Shape output_q_tensor_map{1, 0};
-  Shape output_k_tensor_map{1, 0};
-  outputs_tensor_map_.emplace_back(output_q_tensor_map);
-  outputs_tensor_map_.emplace_back(output_k_tensor_map);
-
-  return SUCCESS;
-}
-
-Status MatmulFfnInfo::InferAsLossDivisor() {
-  if (outputs_tensor_map_.size() != kMatMulFfnOutputSize) {
-    MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 2, but got " << outputs_tensor_map_.size();
-    return FAILED;
-  }
-  as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
-  MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
-               << ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
-               << ", as_loss_divisor_ is " << as_loss_divisor_;
-  return SUCCESS;
-}
-REGISTER(MatmulFfnInfo);
-}  // namespace parallel
-}  // namespace mindspore
\ No newline at end of file
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.h
deleted file mode 100644
index 1b800b363b5..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
-#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "utils/hash_map.h"
-#include "ir/value.h"
-#include "frontend/parallel/auto_parallel/operator_costmodel.h"
-#include "frontend/parallel/ops_info/operator_info.h"
-#include "frontend/parallel/strategy.h"
-
-namespace mindspore {
-namespace parallel {
-class MatmulFfnInfo : public OperatorInfo {
- public:
-  MatmulFfnInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
-                const PrimitiveAttrs &attrs)
-      : OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<MatMulCost>()) {}
-  ~MatmulFfnInfo() override = default;
-  Status CheckStrategy(const StrategyPtr &strategy) override;
-  std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
-  Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }
-
- protected:
-  Status GetAttrs() override { return SUCCESS; }
-  Status InferForwardCommunication() { return SUCCESS; }
-  Status InferTensorMap() override;
-  Status InferDevMatrixShape() override;
-  Status InferAsLossDivisor() override;
-};
-using MatmulFfnInfoPtr = std::shared_ptr<MatmulFfnInfo>;
-}  // namespace parallel
-}  // namespace mindspore
-
-#endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.cc
deleted file mode 100644
index 0334fcc6bcb..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "frontend/parallel/ops_info/matmul_qkv_info.h"
-#include "frontend/parallel/dynamic_creator.h"
-
-namespace mindspore {
-namespace parallel {
-// MatMulQkv has 3 inputs and 3 outputs
-// x: (batch * seq_len (inc is 1), query_hidden_size)
-// q: (query_hidden_size, query_hidden_size)
-// k: (key_hidden_size, query_hidden_size)
-// v: (value_hidden_size, query_hidden_size)
-// ------------------------------
-// output_q: (batch * seq_len (inc is 1), query_hidden_size)
-// output_k: (batch * seq_len (inc is 1), key_hidden_size)
-// output_v: (batch * seq_len (inc is 1), value_hidden_size)
-
-// split strategy
-// batch is not able to split
-// seq_len is not able to split
-// query_hidden_size is able to split
-// key_hidden_size is able to split
-// value_hidden_size is able to split
-constexpr size_t kMatMulQkvOutputSize = 3;
-
-Status MatmulQkvInfo::CheckStrategy(const StrategyPtr &strategy) {
-  if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
-    return FAILED;
-  }
-
-  // TODO
-
-  return SUCCESS;
-}
-
-Status MatmulQkvInfo::InferDevMatrixShape() {
-  auto input_strategies = strategy()->GetInputDim();
-  auto x = input_strategies.at(0);  // (batch * seq_len, q_hidden_size)
-  auto q = input_strategies.at(1);
-  // dp  mp
-  //  1   0
-  dev_matrix_shape_ = {x.at(0), q.at(0)};
-
-  return SUCCESS;
-}
-
-Status MatmulQkvInfo::InferTensorMap() {
-  Shape x_tensor_map{1, -1};
-  Shape q_tensor_map{0, -1};
-  Shape k_tensor_map{0, -1};
-  Shape v_tensor_map{0, -1};
-
-  inputs_tensor_map_.emplace_back(x_tensor_map);
-  inputs_tensor_map_.emplace_back(q_tensor_map);
-  inputs_tensor_map_.emplace_back(k_tensor_map);
-  inputs_tensor_map_.emplace_back(v_tensor_map);
-
-  Shape output_q_tensor_map{1, 0};
-  Shape output_k_tensor_map{1, 0};
-  Shape output_v_tensor_map{1, 0};
-  outputs_tensor_map_.emplace_back(output_q_tensor_map);
-  outputs_tensor_map_.emplace_back(output_k_tensor_map);
-  outputs_tensor_map_.emplace_back(output_v_tensor_map);
-
-  return SUCCESS;
-}
-
-Status MatmulQkvInfo::InferAsLossDivisor() {
-  if (outputs_tensor_map_.size() != kMatMulQkvOutputSize) {
-    MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 3, but got " << outputs_tensor_map_.size();
-    return FAILED;
-  }
-  as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
-  MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
-               << ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
-               << ", as_loss_divisor_ is " << as_loss_divisor_;
-  return SUCCESS;
-}
-REGISTER(MatmulQkvInfo);
-}  // namespace parallel
-}  // namespace mindspore
\ No newline at end of file
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.h
deleted file mode 100644
index 43e6e8c33e1..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
-#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "utils/hash_map.h"
-#include "ir/value.h"
-#include "frontend/parallel/auto_parallel/operator_costmodel.h"
-#include "frontend/parallel/ops_info/operator_info.h"
-#include "frontend/parallel/strategy.h"
-
-namespace mindspore {
-namespace parallel {
-class MatmulQkvInfo : public OperatorInfo {
- public:
-  MatmulQkvInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
-                const PrimitiveAttrs &attrs)
-      : OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<MatMulCost>()) {}
-  ~MatmulQkvInfo() override = default;
-  Status CheckStrategy(const StrategyPtr &strategy) override;
-  std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
-  Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }
-
- protected:
-  Status GetAttrs() override { return SUCCESS; }
-  Status InferForwardCommunication() { return SUCCESS; }
-  Status InferTensorMap() override;
-  Status InferDevMatrixShape() override;
-  Status InferAsLossDivisor() override;
-};
-using MatMulQkvInfoPtr = std::shared_ptr<MatmulQkvInfo>;
-}  // namespace parallel
-}  // namespace mindspore
-
-#endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
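The tensor maps in the deleted files read right-to-left against a two-axis {dp, mp} device matrix: the activation splits its first dimension over dp, each weight splits its first dimension over mp, and the outputs end up sharded on both. A minimal sketch of the per-rank shapes the deleted comments describe (all sizes below are illustrative, not from the source):

# Hedged sketch of the (dp, mp) layout from the deleted MatmulQkv comments.
dp, mp = 2, 4                                # dev_matrix_shape_ = {dp, mp}
batch_seq, hidden = 16, 4096                 # x: (batch * seq_len, query_hidden_size)

x_shard = (batch_seq // dp, hidden)          # x map {1, -1}: dim 0 on dp, dim 1 replicated
w_shard = (hidden // mp, hidden)             # q/k/v maps {0, -1}: dim 0 on mp
out_shard = (batch_seq // dp, hidden // mp)  # output maps {1, 0}: sharded on both axes

print(x_shard, w_shard, out_shard)           # (8, 4096) (1024, 4096) (8, 1024)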
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc
index 29cd499ee1a..4fdbc0feb09 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc
@@ -91,10 +91,8 @@ Status PagedAttentionInfo::CheckStrategy(const StrategyPtr &strategy) {
 
 Status PagedAttentionInfo::InferDevMatrixShape() {
   auto input_strategies = strategy()->GetInputDim();
-  auto query = input_strategies.at(0);         // (batch, seq_len, hidden_size)
-  auto cache = input_strategies.at(1);         // (block_size, num_blocks, hidden_size)
-  auto block_tables = input_strategies.at(3);  // (batch, max_num_block_per_batch)
-  auto context_lens = input_strategies.at(4);  // (context_lens)
+  auto query = input_strategies.at(0);  // (batch, seq_len, hidden_size)
+  auto cache = input_strategies.at(1);  // (block_size, num_blocks, hidden_size)
 
   // batch  block_size  num_blocks  seq_len  hidden_size
   //   4        3           2          1          0
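The two unused strategies are dropped because query and cache alone populate the five-axis device matrix named in the comment above. A hedged reading of how those strategies could map onto the axes (the construction below is an assumption; only the axis ordering comes from the comment, and the strategy values are illustrative):

# Axes per the comment: batch block_size num_blocks seq_len hidden_size,
# indexed 4 3 2 1 0.  Strategy tuples and the mapping itself are assumptions.
query_strategy = (2, 1, 4)  # (batch, seq_len, hidden_size)
cache_strategy = (1, 1, 4)  # (block_size, num_blocks, hidden_size)
dev_matrix = (query_strategy[0], cache_strategy[0], cache_strategy[1],
              query_strategy[1], query_strategy[2])
print(dev_matrix)           # (2, 1, 1, 1, 4)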
diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
index 49c9ed6212d..c3a658d25fe 100644
--- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
@@ -834,12 +834,7 @@ void InsertVirtualOutput(const FuncGraphPtr &root, const std::vector<AnfNodePtr> &all_nodes) {
     OperatorAttrs attrs;
     OperatorArgs args = std::make_pair(attrs, params);
     Operator op = std::make_pair(VIRTUAL_OUTPUT, args);
-    // Temporarily circumvent the MatmulQkv problem, and then modify it
-    auto cnode = dyn_cast_ptr<CNode>(out_node);
-    const auto &input = cnode->input(0);
-    MS_EXCEPTION_IF_NULL(input);
-    auto prim = GetValuePtr<Primitive>(input);
-    if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple) || prim->name() == "MatmulQkv") {
+    if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple)) {
       auto tuple = out_node->cast<CNodePtr>();
       MS_EXCEPTION_IF_NULL(tuple);
       for (size_t i = 1; i < tuple->size(); ++i) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc
index 771483f8977..311263c8ee5 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc
@@ -763,9 +763,7 @@ bool AscendDeviceAddress::AsyncDeviceToDevice(const ShapeVector & /* shape */, size_t size,
   return ret;
 }
 
-
-bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
-                                            const void *host_ptr) const {
+bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */, const void *host_ptr) const {
   MS_ERROR_IF_NULL(host_ptr);
   BindDevice();
   if (!MoveToDevice(false)) {
@@ -779,7 +777,8 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
   MS_EXCEPTION_IF_NULL(runtime_instance);
 
-  auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE, runtime_instance->compute_stream());
+  auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE,
+                             runtime_instance->compute_stream());
   if (ret != ACL_ERROR_NONE) {
     MS_LOG(ERROR) << "Call aclrtMemcpyAsync host to device failed, the error num[" << ret << "]";
     return false;
@@ -787,7 +786,6 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
   return true;
 }
 
-
 bool AscendDeviceAddress::AsyncHostToDevice(const ShapeVector & /* shape */, size_t size, TypeId /* type */,
                                             const void *host_ptr, size_t stream_id) const {
   MS_ERROR_IF_NULL(host_ptr);
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt
index cc0fa4c4a69..700987c6606 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt
@@ -17,7 +17,7 @@ if(DEFINED ENV{MS_INTERNAL_KERNEL_HOME})
         ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/collective_communication_lib.cc
         ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/communication_group.cc)
     set_property(SOURCE ${LOWLATENCY_COLLECTIVE_SRCS} PROPERTY COMPILE_DEFINITIONS
-        SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
+                 SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
     add_library(lowlatency_collective SHARED ${LOWLATENCY_COLLECTIVE_SRCS})
     target_link_libraries(lowlatency_collective PRIVATE lcal)
 endif()
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc
index 3bc396d539f..ef1dfa7037a 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc
@@ -246,12 +246,13 @@ void GeDeviceResManager::CreateSessionAndGraphRunner() {
 }
 
 bool GeDeviceResManager::LoadCollectiveCommLib() {
-  // If this is simulation, don't load any collective communication library.
+  // If this is a simulation, load the dummy collective communication library.
   if (!common::GetEnv(kSimulationLevel).empty()) {
+    collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
    return true;
   }
 
   // Ascend backend supports HCCL and LCCL collective communication libraries.
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     std::string lowlatency_comm_lib_name = "liblowlatency_collective.so";
     auto loader = std::make_shared<CollectiveCommLibLoader>(lowlatency_comm_lib_name);
     MS_EXCEPTION_IF_NULL(loader);
@@ -265,7 +266,7 @@ bool GeDeviceResManager::LoadCollectiveCommLib() {
     auto instance_func = DlsymFuncObj(communication_lib_instance, collective_comm_lib_handle);
     collective_comm_lib_ = instance_func();
     MS_EXCEPTION_IF_NULL(collective_comm_lib_);
-    MS_LOG(WARNING) << "Loading LCCL because env ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
+    MS_LOG(WARNING) << "Loading LCCL because env MS_ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
                        "single-node-multi-card mode in KernelByKernel for now.";
   } else {
     collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h
index 5a315db6ba2..9f331919768 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h
@@ -73,18 +73,7 @@ class GeDeviceResManager : public DeviceResManager {
 
   static void CreateSessionAndGraphRunner();
 
-<<<<<<< HEAD
-  bool LoadCollectiveCommLib() override {
-    if (common::GetEnv(kSimulationLevel).empty()) {
-      collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
-    } else {
-      collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
-    }
-    return true;
-  }
-=======
   bool LoadCollectiveCommLib() override;
->>>>>>> Add lccl so.
 
   void ResetStreamAndCtx() override;
 
   bool BindDeviceToCurrentThread(bool force_bind) const override;
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h
index 9d032133cd3..64bed21e6cc 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h
@@ -44,7 +44,7 @@ class LowlatencyCommunicationGroup : public CommunicationGroup {
 
   void *GenerateRootInfo(size_t *root_info_size) override;
 
-  // Return communicator for collective communcation ops.
+  // Return communicator for collective communication ops.
   const LcclPtr &lccl_communicator() const;
 
   // Return communicator of lcal.
   const LcalCommPtr &lcal_comm() const;
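The opt-in switch is renamed from ENABLE_LCCL to MS_ENABLE_LCCL here and in every kernel below, so launchers must export the new name. A minimal sketch mirroring the updated st tests (the test script name is a placeholder):

import os

# MS_ENABLE_LCCL replaces ENABLE_LCCL; per the warning above, LCCL currently
# supports only single-node-multi-card KernelByKernel execution.
os.environ['MS_ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
# "test_my_lccl_case.py" is a hypothetical script, following tests/st/lccl.
os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_my_lccl_case.py")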
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc
index b61676946b3..77873bbb357 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc
@@ -135,7 +135,7 @@ bool HcclKernel::Init(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {
 
   if (common::GetEnv(kSimulationLevel).empty() && !common::IsNeedProfileMemory()) {
 #ifdef ENABLE_INTERNAL_KERNELS
-    if (!common::GetEnv("ENABLE_LCCL").empty()) {
+    if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
       LoadLcclLibrary();
     } else {
       LoadHcclLibrary();
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc
index fe84cf9ff4e..162520518f9 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc
@@ -31,7 +31,7 @@ bool HcomAllBroadCastKernel::Launch(const std::vector<KernelTensor *> &inputs, c
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result =
       lccl_comm_->Broadcast(inputs[0]->device_ptr(), hccl_count_, hccl_data_type_list_[0], root_id_, stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc
index e4c52b2c59c..9061fa4e93e 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc
@@ -32,7 +32,7 @@ bool HcomAllGatherKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result = lccl_comm_->AllGather(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                              hccl_data_type_list_[0], stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc
index c51ae577988..c0ad808303e 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc
@@ -38,7 +38,7 @@ bool HcomAllReduceKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result = lccl_comm_->AllReduce(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                              hccl_data_type_list_[0], op_type_, stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc
index c47383773e6..c65e2d03564 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc
@@ -32,7 +32,7 @@ bool HcomAllReduceScatterKernel::Launch(const std::vector<KernelTensor *> &inputs,
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result = lccl_comm_->ReduceScatter(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                                  hccl_data_type_list_[0], op_type_, stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc
index 232d34c0c09..03f1b37b0b1 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc
@@ -62,7 +62,7 @@ int HcomMatMulAllReduceKernel::Resize(const std::vector<KernelTensor *> &inputs,
   // The dimensions of left and right matrices.
   matmul_info_.m = hccl_kernel_input_shape_list_[0][0];
   matmul_info_.k = hccl_kernel_input_shape_list_[0][1];
-  matmul_info_.n = hccl_kernel_input_shape_list_[1][0];
+  matmul_info_.n = hccl_kernel_input_shape_list_[1][1];
   matmul_info_.transA = transpose_a_;
   matmul_info_.transB = transpose_b_;
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h
index 5ecb10224fc..4cd287222de 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h
@@ -26,7 +26,6 @@ namespace kernel {
 constexpr uint32_t kMatMulAllReduceInputNum = 2;
 constexpr uint32_t kMatMulAllReduceOutputNum = 1;
 constexpr char kAttrNameTransposeA[] = "transpose_a";
-;
 constexpr char kAttrNameTransposeB[] = "transpose_b";
 
 class HcomMatMulAllReduceKernel : public HcclKernel {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc
index 7f812caab12..947365f97e3 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc
@@ -85,7 +85,7 @@ class InternalSub : public ElewiseBinary {
     param_ptr->input2_dtype_ = InternalKernelUtils::ToInternalDType(inputs[kIndex1]->dtype_id());
     param_ptr->input1_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex0]->GetShapeVector());
     param_ptr->input2_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex1]->GetShapeVector());
-    
+
     return std::static_pointer_cast<internal::OpParam>(param_ptr);
   }
 };
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc
index 35432c4416f..78929f430f3 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc
@@ -105,7 +105,6 @@ int InternalKernelMod::Resize(const std::vector<KernelTensor *> &inputs, const s
     }
   }
   std::vector input_shapes(inputs_.size());
-  std::vector output_shapes;
   for (auto iter = inputsIdxMap_.begin(); iter != inputsIdxMap_.end(); iter++) {
     InternalKernelUtils::ToInternalTensor(inputs_[iter->second], inputs[iter->first]);
     input_shapes[iter->second] = inputs_[iter->second]->desc.dims;
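The [1][0] → [1][1] change in hcom_matmul_all_reduce.cc above fixes which axis of the weight is taken as n: for C = A(m, k) x B(k, n), n is the weight's second dimension, not its first. A quick check of the shape arithmetic, assuming transpose_b is false (sizes are illustrative):

import numpy as np

m, k, n = 8, 32, 16                      # illustrative sizes
a, b = np.ones((m, k)), np.ones((k, n))  # b is the weight, shape (k, n)
# The old code read b.shape[0], which is k; the fix reads b.shape[1], which is n.
assert a.dot(b).shape == (m, b.shape[1])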
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc
index 71cce9ec90e..830bb2ac49f 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc
@@ -136,13 +136,9 @@ void GetBackendCommonUnifyMindIRPassManager(PassManagerPtr *unify_mindir_pm) {
 #ifdef ENABLE_INTERNAL_KERNELS
   (*unify_mindir_pm)->AddPass(std::make_shared());
   (*unify_mindir_pm)->AddPass(std::make_shared());
-  if (common::GetEnv("MS_ENABLE_INTERNAL_KERNELS") == "on") {
-    (*unify_mindir_pm)->AddPass(std::make_shared());
-  }
+  (*unify_mindir_pm)->AddPass(std::make_shared());
   (*unify_mindir_pm)->AddPass(std::make_shared());
-  if (common::GetEnv("ENABLE_MATMUL_ALLREDUCE") == "on") {
-    (*unify_mindir_pm)->AddPass(std::make_shared());
-  }
+  (*unify_mindir_pm)->AddPass(std::make_shared());
 #endif  // ENABLE_INTERNAL_KERNELS
 }
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc
index 17a430139fa..d1f591e1bc7 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc
@@ -15,11 +15,13 @@
  */
 
 #include "plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h"
+#include <string>
 #include "mindspore/core/ops/nn_ops.h"
 #include "mindspore/core/ops/math_ops.h"
 #include "mindspore/core/ops/other_ops.h"
 #include "mindspore/core/ops/lite_ops.h"
+#include "mindspore/core/utils/ms_context.h"
 #include "include/backend/optimizer/helper.h"
 #include "include/backend/anf_runtime_algorithm.h"
 #include "include/common/utils/anfalgo.h"
@@ -101,6 +103,16 @@ AnfNodePtr MatMulAllReduceFusion::CreateMatMulAllReduceNode(const FuncGraphPtr &
 const AnfNodePtr MatMulAllReduceFusion::Process(const mindspore::FuncGraphPtr &func_graph,
                                                 const mindspore::AnfNodePtr &node,
                                                 const mindspore::EquivPtr &equiv) const {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if (!ms_context->IsEnableInferBoost()) {
+    return nullptr;
+  }
+
+  if (common::GetEnv("DISABLE_MATMULALLREDUCE_FUSION") == "True") {
+    return nullptr;
+  }
+
   if (func_graph == nullptr || node == nullptr) {
     return nullptr;
   }
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h
index 7e4c8e8037d..0f12f996fca 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h
@@ -16,6 +16,7 @@
 #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
 #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
 
+#include <string>
 #include <memory>
 
 #include "include/backend/optimizer/optimizer.h"
@@ -24,7 +25,7 @@ namespace opt {
 class MatMulAllReduceFusion : public PatternProcessPass {
  public:
   explicit MatMulAllReduceFusion(bool multigraph = true, const string &pass_name = "MatMulAllReduce")
-      : PatternProcessPass(pass_name, multigraph){};
+      : PatternProcessPass(pass_name, multigraph) {}
   ~MatMulAllReduceFusion() override = default;
   const BaseRef DefinePattern() const override;
   const AnfNodePtr Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &equiv) const override;
@@ -46,4 +47,4 @@ class MatMulAllReduceFusion : public PatternProcessPass {
 }  // namespace opt
 }  // namespace mindspore
 
-#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
\ No newline at end of file
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
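The pass is no longer toggled by the ENABLE_MATMUL_ALLREDUCE env var: it now requires the infer-boost runtime flag and honors a per-process opt-out. A hedged Python restatement of the two early returns added above (both names are taken from the hunk):

import os

def should_run_matmul_allreduce_fusion(infer_boost_enabled):
    # Mirrors the early returns in MatMulAllReduceFusion::Process above.
    if not infer_boost_enabled:
        return False
    if os.environ.get("DISABLE_MATMULALLREDUCE_FUSION") == "True":
        return False
    return True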
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc
index e54392bba0c..70a88644ab0 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc
@@ -15,13 +15,19 @@
  */
 
 #include "plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.h"
+#include "mindspore/core/utils/ms_context.h"
+
 namespace mindspore {
 namespace opt {
 bool MultiMatmulsFusion::Run(const FuncGraphPtr &graph) {
   bool changed = false;
-  if (common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {
+
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if (!ms_context->IsEnableInferBoost() || common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {
     return changed;
   }
+
   auto mng = graph->manager();
   MS_EXCEPTION_IF_NULL(mng);
   const auto &node_users_map = mng->node_users();
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc
index 6bd6cd24338..5cfb7c1a4c5 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc
@@ -22,6 +22,7 @@
 #include "mindspore/core/ops/reshape_ext.h"
 #include "mindspore/core/ops/scalar_graph_holder.h"
 #include "mindspore/core/ops/array_ops.h"
+#include "mindspore/core/utils/ms_context.h"
 #include "include/common/utils/anfalgo.h"
 #include "mindspore/ccsrc/include/common/utils/utils.h"
 #include "plugin/device/ascend/optimizer/get_value_helper.h"
@@ -200,6 +201,12 @@ const BaseRef ShapeReshapeFusion::DefinePattern() const {
 
 const AnfNodePtr ShapeReshapeFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                              const EquivPtr &equiv) const {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if (!ms_context->IsEnableInferBoost()) {
+    return nullptr;
+  }
+
   MS_EXCEPTION_IF_NULL(func_graph);
   MS_EXCEPTION_IF_NULL(equiv);
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc
index 6e5f39a44eb..c89a758ba9a 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc
@@ -825,9 +825,9 @@ void DataPrepareActor::PrepareDataForHostTensorQueueNew(const VectorRef &args, OpContext<DeviceTensor> *const
                   << " for input parameter:" << origin_parameter->fullname_with_scope();
 
     if (!isDyn) {
-      if(host_tensors_[tensor_position] != input_tensor->shape()) {
-          isDyn = true;
-      }
+      if (host_tensors_[tensor_position] != input_tensor->shape()) {
+        isDyn = true;
+      }
     }
     host_tensors_[tensor_position] = input_tensor->shape();
     host_tensors[tensor_position] = input_tensor;
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc
index 627dd4c2d39..24da68b61ff 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc
@@ -23,6 +23,7 @@
 #include "mindrt/include/async/async.h"
 #include "utils/log_adapter.h"
 #include "kernel/common_utils.h"
+#include "mindspore/core/utils/ms_context.h"
 
 namespace mindspore {
 namespace runtime {
@@ -278,6 +279,8 @@ void HostQueueDataSourceActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const context) {
 }
 
 void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
   MS_EXCEPTION_IF_NULL(context);
   if (IsRunningFailed(context)) {
     return;
@@ -328,11 +331,20 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons
       continue;
     }
 
-
-    if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
-                                          host_tensor->data_ptr()->data())) {
-      SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
+    if (ms_context->IsEnableInferBoost()) {
+      if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
+                                            host_tensor->data_ptr()->data())) {
+        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
+      }
+    } else {
+      if (!device_tensor->SyncHostToDevice(
+            trans::GetRuntimePaddingShape(data_node_with_indexs_[i].first, data_node_with_indexs_[i].second),
+            LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
+            host_tensor->device_info().host_format_, host_tensor->data_ptr())) {
+        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
+      }
     }
+
     if (IsDynamic(device_tensor->host_shape())) {
       device_tensor->set_host_shape(host_tensor->shape());
     }
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc
index 4d3f4eea34c..613f6e2ab82 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc
@@ -27,7 +27,6 @@ void KernelInferActor::Init() {
   if (memory_free_list_.size() > input_num) {
     memory_free_list_.erase(memory_free_list_.begin() + input_num, memory_free_list_.end());
   }
-
 }
 
 void KernelInferActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<DeviceTensor> *const context) {
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc
index 57d61aa340a..21c7fc9e444 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc
@@ -211,8 +211,6 @@ void SuperKernelActor::Run(OpContext<DeviceTensor> *const context) {
     return RunGraphKernelByKernel(context);
   }
 
-  device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(AddTask, GetAID().Name(), "", graph_->ToString());
-
   if (device_contexts_.empty() || device_contexts_[0] == nullptr) {
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Invalid device context for super kernel actor:" + GetAID().Name());
   }
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc b/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
index 061181d3496..434ecd79c74 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
@@ -662,14 +662,14 @@ void GraphScheduler::SpawnMultiPipelineActor(ActorSet *const actor_set, ActorThreadPool *const thread_pool) {
   }
 
   // If enable runtime multi pipeline, async launch kernel will be enabled.
-<<<<<<< HEAD
-  ActorDispatcher::set_enable_runtime_multi_pipeline(EnableRuntimePipeline() && actor_set->has_dynamic_shape_ &&
-                                                     !actor_set->kernel_actors_.empty() &&
-                                                     default_actor_thread_num_ > kMultiPipelineThreadNum);
->>>>>>> 80d1685cc13... lzy: Support run graph without kernel actor for kbyk mode
+  ActorDispatcher::set_enable_runtime_multi_pipeline(
+    enable_runtime_pipeline && actor_set->has_dynamic_shape_ &&
+    (EnableKbkSubGraphExecute() || !actor_set->kernel_actors_.empty()) &&
+    default_actor_thread_num_ > kMultiPipelineThreadNum);
 
   if (ActorDispatcher::enable_runtime_multi_pipeline() && !already_spawn_kernel_async_infer_resize_actor_) {
     size_t current_actor_thread_num = thread_pool->GetActorThreadNum();
     MS_LOG(INFO) << "Enable runtime multi pipeline, default actor thread num: " << default_actor_thread_num_
+                 << ", current actor thread num: " << current_actor_thread_num;
     if (current_actor_thread_num != default_actor_thread_num_) {
       thread_pool->SetActorThreadNum(default_actor_thread_num_);
       MS_LOG(DEBUG) << "Reset actor thread number to: " << default_actor_thread_num_;
diff --git a/mindspore/core/ops/fusion/matmul_allreduce.cc b/mindspore/core/ops/fusion/matmul_allreduce.cc
index e7176bc2940..d47bd5a6318 100644
--- a/mindspore/core/ops/fusion/matmul_allreduce.cc
+++ b/mindspore/core/ops/fusion/matmul_allreduce.cc
@@ -26,6 +26,5 @@
 namespace mindspore {
 namespace ops {
 MIND_API_OPERATOR_IMPL(MatMulAllReduce, MatMul);
-REGISTER_PRIMITIVE_C(kMatMulAllReduce, MatMulAllReduce);
 }  // namespace ops
 }  // namespace mindspore
diff --git a/mindspore/core/ops/mat_mul.cc b/mindspore/core/ops/mat_mul.cc
index 62066ddeec6..ce4046d56ec 100644
--- a/mindspore/core/ops/mat_mul.cc
+++ b/mindspore/core/ops/mat_mul.cc
@@ -34,8 +34,10 @@
 #include "mindapi/src/helper.h"
 #include "mindspore/core/ops/math_ops.h"
 #include "ops/mat_mul.h"
+#include "ops/fusion/matmul_allreduce.h"
 #include "ops/op_name.h"
 #include "ops/primitive_c.h"
+#include "ops/lite_ops.h"
 #include "utils/check_convert_utils.h"
 #include "utils/convert_utils_base.h"
 #include "utils/log_adapter.h"
@@ -169,5 +171,6 @@ class MatMulInfer : public abstract::OpInferBase {
 };
 
 REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMul, prim::kPrimMatMul, MatMulInfer, false);
+REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMulAllReduce, prim::kPrimMatMulAllReduce, MatMulInfer, false);
 }  // namespace ops
 }  // namespace mindspore
diff --git a/mindspore/core/ops/math_ops.h b/mindspore/core/ops/math_ops.h
index f9e316ace95..cf9c64e3426 100644
--- a/mindspore/core/ops/math_ops.h
+++ b/mindspore/core/ops/math_ops.h
@@ -62,8 +62,10 @@ GVAR_DEF(PrimitivePtr, kPrimTensorAdd, std::make_shared<Primitive>("TensorAdd"))
 GVAR_DEF(PrimitivePtr, kPrimAddV2, std::make_shared<Primitive>(kAddV2OpName));
 GVAR_DEF(PrimitivePtr, kPrimAddLayerNorm, std::make_shared<Primitive>("AddLayerNorm"));
 GVAR_DEF(PrimitivePtr, kPrimAddRmsNorm, std::make_shared<Primitive>("AddRmsNorm"));
+GVAR_DEF(PrimitivePtr, kPrimMatMul, std::make_shared<Primitive>("MatMul"));
 GVAR_DEF(PrimitivePtr, kPrimMatMulV2, std::make_shared<Primitive>("MatMulV2"));
 GVAR_DEF(PrimitivePtr, kPrimMatrixDiag, std::make_shared<Primitive>("MatrixDiag"));
+GVAR_DEF(PrimitivePtr, kPrimBatchMatMul, std::make_shared<Primitive>("BatchMatMul"));
 GVAR_DEF(PrimitivePtr, kPrimBatchMatMulV2, std::make_shared<Primitive>("BatchMatMulV2"));
 GVAR_DEF(PrimitivePtr, kPrimFusedMatMulBiasAdd, std::make_shared<Primitive>("FusedMatMulBiasAdd"));
 GVAR_DEF(PrimitivePtr, kPrimMinimumGradGrad, std::make_shared<Primitive>("MinimumGradGrad"));
diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h
index 1fc46b3a758..6beb5160a7a 100644
--- a/mindspore/core/utils/ms_context.h
+++ b/mindspore/core/utils/ms_context.h
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "utils/log_adapter.h"
 #include "utils/ms_utils.h"
@@ -402,7 +403,7 @@ inline void MsContext::increase_param<uint32_t>(MsCtxParam param) {
   uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]++;
 }
 
-// decreate method implementation for type uint32_t
+// decrease method implementation for type uint32_t
 template <>
 inline void MsContext::decrease_param<uint32_t>(MsCtxParam param) {
   uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]--;
diff --git a/mindspore/python/mindspore/common/api.py b/mindspore/python/mindspore/common/api.py
index fcf0c7583a1..861b5b6de4b 100644
--- a/mindspore/python/mindspore/common/api.py
+++ b/mindspore/python/mindspore/common/api.py
@@ -302,12 +302,13 @@ def _handle_arg_predict(obj, arg, compile_arg):
         return None
 
     if isinstance(arg, (list, tuple)):
-        if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and getattr(compile_arg, "__ms_mutable__"):
+        if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and \
+                getattr(compile_arg, "__ms_mutable__"):
             # mutable([]) will be eliminated by FuncGraphSpecializer, and empty list is not supported by backend.
             if isinstance(arg, list) and not arg:
                 return None
             return arg
-        elif hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
+        if hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
                 _check_all_tensor(arg):
             return arg
     return None
diff --git a/mindspore/python/mindspore/common/jit_config.py b/mindspore/python/mindspore/common/jit_config.py
index 1631ab38d7c..0f6db062f5d 100644
--- a/mindspore/python/mindspore/common/jit_config.py
+++ b/mindspore/python/mindspore/common/jit_config.py
@@ -60,7 +60,7 @@ class JitConfig:
         infer_boost (str, optional): enable infer boost mode. The value must be ``"on"`` , ``"off"``.
             Default to an "off", which means that disable infer boost.
-            when infer boost mode is enabled, mindspore will use high perf kernel lib, use faster runtime make 
+            when infer boost mode is enabled, mindspore will use high perf kernel lib, use faster runtime make
             infer speed is best.
             Note: current infer boost only support jit_level == O0 and device is Ascend910B
             Default: ``"off"`` .
@@ -78,16 +78,17 @@ class JitConfig:
         >>>
         >>> net.set_jit_config(jitconfig)
     """
-    def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE", infer_boost="off", **kwargs):
+    def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE",
+                 infer_boost="off", **kwargs):
         if jit_level not in ["", "O0", "O1", "O2"]:
             raise ValueError("For 'jit_level' must be one of ['O0', 'O1', 'O2'].")
-        if exc_mode not in ['auto', 'sink', 'no_sink']:
+        if exc_mode not in ["auto", "sink", "no_sink"]:
            raise ValueError("For 'exc_mode' must be one of '['auto', 'sink', 'no_sink']'.")
-        if jit_syntax_level != "" and jit_syntax_level not in ['STRICT', 'COMPATIBLE', 'LAX']:
+        if jit_syntax_level != "" and jit_syntax_level not in ["STRICT", "COMPATIBLE", "LAX"]:
             raise ValueError("For 'jit_syntax_level' must be one of '['STRICT', 'LAX']'.")
-        if debug_level not in ['RELEASE', 'DEBUG']:
+        if debug_level not in ["RELEASE", "DEBUG"]:
             raise ValueError("For 'debug_level' must be one of '['RELEASE', 'DEBUG']'.")
-        if infer_boost != "" and infer_boost not in ['on', 'off']:
+        if infer_boost != "" and infer_boost not in ["on", "off"]:
             raise ValueError("For 'infer_boost' must be one of '['on', 'off']'.")
         self.jit_config_dict = kwargs
         self.jit_config_dict["jit_level"] = jit_level
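With the validation above in place, the new flag is driven from Python as below; per the docstring, infer boost currently requires jit_level "O0" on Ascend910B. A minimal usage sketch (the network class is a hypothetical stand-in, and it assumes a MindSpore build that includes this patch):

from mindspore import nn, JitConfig

class TinyNet(nn.Cell):  # hypothetical stand-in for a predict network
    def construct(self, x):
        return x

net = TinyNet()
net.set_jit_config(JitConfig(jit_level="O0", infer_boost="on"))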
diff --git a/mindspore/python/mindspore/nn/cell.py b/mindspore/python/mindspore/nn/cell.py
index 87cea23061c..41c8df35c73 100755
--- a/mindspore/python/mindspore/nn/cell.py
+++ b/mindspore/python/mindspore/nn/cell.py
@@ -667,6 +667,8 @@ class Cell(Cell_):
 
     def _predict(self, *args, **kwargs):
+        if not hasattr(self, "phase"):
+            return False, None
         if (self.phase == "prefill" or self.phase == 'increment') and self.phase in self.phase_cache:
             new_args = _get_args_for_run_predict(self, args, kwargs, self._compile_args)
             res = _cell_graph_executor._graph_executor(tuple(new_args), self.phase_cache[self.phase])
@@ -687,7 +689,7 @@ class Cell(Cell_):
         if predict_compiled:
             return res
         self._check_construct_args(*args)
-        
+
         if self._hook_fn_registered():
             logger.warning(f"For 'Cell', it's not support hook function in graph mode. If you want to use hook "
                            f"function, please use context.set_context to set pynative mode.")
@@ -993,9 +995,9 @@ class Cell(Cell_):
             kwargs (dict): Kwargs of the Cell object.
         """
         if self.phase == "prefill":
-            os.environ["ENABLE_MATMUL_ALLREDUCE"] = "on"
+            os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "False"
         else:
-            os.environ["ENABLE_MATMUL_ALLREDUCE"] = ""
+            os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "True"
         self._compile_args = self._get_compile_args(args)
         _cell_graph_executor.compile(self, *self._compile_args, phase=self.phase,
                                      jit_config_dict=self._jit_config_dict, **kwargs)
diff --git a/mindspore/python/mindspore/ops/operations/__init__.py b/mindspore/python/mindspore/ops/operations/__init__.py
index bf78db7aacf..60e17aa02aa 100644
--- a/mindspore/python/mindspore/ops/operations/__init__.py
+++ b/mindspore/python/mindspore/ops/operations/__init__.py
@@ -118,7 +118,7 @@ from .nn_ops import (LSTM, SGD, Adam, AdamWeightDecay, FusedSparseAdam, FusedSpa
                      FractionalMaxPool, FractionalMaxPool3DWithFixedKsize, FractionalMaxPoolWithFixedKsize,
                      GridSampler2D, TripletMarginLoss, UpsampleNearest3D, UpsampleTrilinear3D, PadV3,
                      ChannelShuffle, GLU, MaxUnpool3D, Pdist, RmsNorm, PagedAttention, PagedAttentionMask, ReshapeAndCache,
-                     ApplyRotaryPosEmb, MatmulQkv, MatmulFfn)
+                     ApplyRotaryPosEmb)
 from .other_ops import (Assign, IOU, BoundingBoxDecode, BoundingBoxEncode,
                         ConfusionMatrix, UpdateState, Load, StopGradient, Reusing,
                         CheckValid, Partial, Depend, Push, Pull, PyExecute, PyFunc, _DynamicLossScale,
@@ -695,8 +695,6 @@ __all__ = [
     "ReshapeAndCache",
     "ApplyRotaryPosEmb",
     "RmsNorm",
-    "MatmulQkv",
-    "MatmulFfn",
 ]
 
 __custom__ = [
diff --git a/mindspore/python/mindspore/ops/operations/nn_ops.py b/mindspore/python/mindspore/ops/operations/nn_ops.py
index 13303bce8b8..fa26de101a9 100644
--- a/mindspore/python/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/python/mindspore/ops/operations/nn_ops.py
@@ -10196,27 +10196,3 @@ class RmsNorm(Primitive):
         """Initialize Dense."""
         validator.check_value_type("epsilon", epsilon, [float], self.name)
         self.init_prim_io_names(inputs=['x', 'gamma'], outputs=["y", "rstd"])
-
-
-class MatmulQkv(Primitive):
-    r"""
-    Fuse three matmul ops for q k v attention into one
-    """
-
-    @prim_attr_register
-    def __init__(self):
-        """Initialize"""
-        self.init_prim_io_names(inputs=['hidden_states', 'weight_q', 'weight_k', 'weight_v'],
-                                outputs=["output_q", "output_k", "output_v"])
-
-
-class MatmulFfn(Primitive):
-    r"""
-    Fuse two matmul ops for feed forward into one
-    """
-
-    @prim_attr_register
-    def __init__(self):
-        """Initialize"""
-        self.init_prim_io_names(inputs=['hidden_states', 'weight_gate', 'weight_up'],
-                                outputs=["output_gate", "output_up"])
diff --git a/scripts/build/check_and_build_ms_kernels_internal.sh b/scripts/build/check_and_build_ms_kernels_internal.sh
index e38689250ac..8bf33778f72 100644
--- a/scripts/build/check_and_build_ms_kernels_internal.sh
+++ b/scripts/build/check_and_build_ms_kernels_internal.sh
@@ -17,26 +17,26 @@ if [ "$(uname)" == Linux ]; then
   if [ -n "${MS_INTERNAL_KERNEL_HOME}" ]; then
     echo "Use local MS_INTERNAL_KERNEL_HOME : ${MS_INTERNAL_KERNEL_HOME}"
-  else
-    file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
-    lib_file=${file_path}/ms_kernels_internal.tar.gz
-    if [ -f "${lib_file}" ]; then
-      file_lines=`cat "${lib_file}" | wc -l`
-      if [ ${file_lines} -ne 3 ]; then
-        tar -zxf ${lib_file} -C ${file_path}
-        if [ $? -eq 0 ]; then
-          echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
-          export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
-          echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
-        else
-          echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
-        fi
-      else
-        echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
-        echo "[WARNING] `git lfs install` and retry downloading using `git lfs pull`."
-      fi
-    else
-      echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
-    fi
+  # else
+  #   file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
+  #   lib_file=${file_path}/ms_kernels_internal.tar.gz
+  #   if [ -f "${lib_file}" ]; then
+  #     file_lines=`cat "${lib_file}" | wc -l`
+  #     if [ ${file_lines} -ne 3 ]; then
+  #       tar -zxf ${lib_file} -C ${file_path}
+  #       if [ $? -eq 0 ]; then
+  #         echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
+  #         export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
+  #         echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
+  #       else
+  #         echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
+  #       fi
+  #     else
+  #       echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
+  #       echo "[WARNING] 'git lfs install' and retry downloading using 'git lfs pull'."
+  #     fi
+  #   else
+  #     echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
+  #   fi
   fi
 fi
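With the unpack fallback commented out, the prebuilt internal-kernel package must be supplied explicitly via MS_INTERNAL_KERNEL_HOME before building. A hedged sketch of the manual setup (the path is illustrative, and invoking the script directly is an assumption about the build flow):

import os
import subprocess

# The script above now only honors a pre-set MS_INTERNAL_KERNEL_HOME.
os.environ["MS_INTERNAL_KERNEL_HOME"] = "/path/to/ms_kernels_internal"
subprocess.run(["bash", "scripts/build/check_and_build_ms_kernels_internal.sh"], check=True)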
diff --git a/tests/st/lccl/test_all.py b/tests/st/lccl/test_all.py
index 4c55289b88d..94b3b1058d6 100644
--- a/tests/st/lccl/test_all.py
+++ b/tests/st/lccl/test_all.py
@@ -27,7 +27,7 @@ def test_lccl_allreduce():
     Description: msrun lccl all_reduce 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allreduce.py")
@@ -44,7 +44,7 @@ def test_lccl_allgather():
     Description: msrun lccl all_gather 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allgather.py")
@@ -61,7 +61,7 @@ def test_lccl_reducescatter():
     Description: msrun lccl reduce_scatter 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True "
                             "pytest -s test_lccl_reducescatter.py")
@@ -79,7 +79,7 @@ def test_lccl_broadcast():
     Description: msrun lccl broadcast 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_broadcast.py")
diff --git a/tests/st/lccl/test_lccl_allreduce.py b/tests/st/lccl/test_lccl_allreduce.py
index 8d60fc0a622..d239b6aa2c8 100644
--- a/tests/st/lccl/test_lccl_allreduce.py
+++ b/tests/st/lccl/test_lccl_allreduce.py
@@ -24,7 +24,6 @@ from mindspore.common.initializer import initializer
 from mindspore.common.parameter import Parameter
 from mindspore.communication.management import init, HCCL_WORLD_COMM_GROUP, get_rank, get_group_size
 from mindspore.ops import operations as P
-from mindspore.ops.operations import _inner_ops as inner
 
 context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
diff --git a/tests/st/msrun/test_entry_msrun.py b/tests/st/msrun/test_entry_msrun.py
index c26dd33e010..5ef228678be 100644
--- a/tests/st/msrun/test_entry_msrun.py
+++ b/tests/st/msrun/test_entry_msrun.py
@@ -27,7 +27,6 @@ def test_msrun():
     Description: Launch distributed training job with dynamic cluster using msrun.
     Expectation: All workers are successfully spawned and running training.
     """
-    os.environ['ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     return_code = os.system(
         "msrun --worker_num=4 --local_worker_num=4 --master_addr=127.0.0.1 "\
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index bec86c14f3b..4ff8412bb3a 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -263,6 +263,8 @@
 list(REMOVE_ITEM MINDSPORE_SRC_LIST
      "../../../mindspore/ccsrc/runtime/graph_scheduler/actor/embedding_cache/embedding_cache_prefetch_actor.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST
      "../../../mindspore/ccsrc/plugin/device/ascend/hal/profiler/parallel_strategy_profiling.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST
+     "../../../mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc")
 
 add_library(_ut_mindspore_obj STATIC ${MINDSPORE_SRC_LIST} $ $ $ $