From 800dd5abaa732336b4e8c7db7a3867b6332aaa20 Mon Sep 17 00:00:00 2001
From: donghufeng <xuxusheng2@huawei.com>
Date: Fri, 30 Apr 2021 17:59:12 +0800
Subject: [PATCH] clean code, quantum state to array, projector

---
 .../cpu/quantum/evolution_cpu_kernel.cc       |  16 +-
 .../cpu/quantum/evolution_cpu_kernel.h        |   1 -
 .../cpu/quantum/pqc_cpu_kernel.cc             |  30 +-
 .../cpu/quantum/pqc_cpu_kernel.h              |   6 +-
 .../quantum_simulator/gates/basic_gates.h     |   1 +
 .../quantum/quantum_simulator/gates/gates.h   |   1 -
 .../gates/intrinsic_one_para_gate.h           |   1 +
 .../quantum_simulator/gates/parameter_gate.h  |   1 +
 .../quantum_simulator/parameter_resolver.h    |   1 -
 .../quantum_simulator/pqc_simulator.cc        |  60 +-
 .../quantum/quantum_simulator/pqc_simulator.h |   7 +-
 .../quantum/quantum_simulator/projector.cc    |  41 +
 .../cpu/quantum/quantum_simulator/projector.h |  40 +
 .../cpu/quantum/quantum_simulator/sparse.cc   |   1 -
 .../cpu/quantum/quantum_simulator/sparse.h    |   4 +-
 .../quantum/quantum_simulator/transformer.cc  |  10 +
 .../quantum/quantum_simulator/transformer.h   |   3 +
 .../cpu/quantum/quantum_simulator/utils.cc    |  15 +-
 .../cpu/quantum/quantum_simulator/utils.h     |   2 +
 third_party/patch/projectq/projectq.patch001  | 948 +++++++++++++++++-
 20 files changed, 1117 insertions(+), 72 deletions(-)
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc
 create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc
index 0b434baec4e..d5046d5490b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc
@@ -72,22 +72,22 @@ bool EvolutionCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
   auto output = reinterpret_cast<float *>(outputs[0]->addr);
   MS_EXCEPTION_IF_NULL(param_data);
   MS_EXCEPTION_IF_NULL(output);
-  sim_ = mindquantum::PQCSimulator(1, n_qubits_);
+  auto sim = mindquantum::PQCSimulator(1, n_qubits_);
   mindquantum::ParameterResolver pr;
   for (size_t i = 0; i < param_names_.size(); i++) {
     pr.SetData(param_names_.at(i), param_data[i]);
   }
-  sim_.Evolution(circ_, pr);
+  sim.Evolution(circ_, pr);
   if (hams_.size() == 1) {
-    sim_.ApplyHamiltonian(hams_[0]);
+    sim.ApplyHamiltonian(hams_[0]);
   }
-  if (state_len_ != sim_.vec_.size()) {
+  if (state_len_ != (1UL << n_qubits_)) {
     MS_LOG(EXCEPTION) << "simulation error number of quantum qubit!";
   }
-  size_t poi = 0;
-  for (auto &v : sim_.vec_) {
-    output[poi++] = v.real();
-    output[poi++] = v.imag();
+  auto size = state_len_ * 2;
+#pragma omp parallel for schedule(static)
+  for (size_t i = 0; i < size; i++) {
+    output[i] = sim.vec_[i];
   }
   return true;
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h
index 5499ae60340..b17e60d93f0 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h
@@ -41,7 +41,6 @@ class EvolutionCPUKernel : public CPUKernel {
  private:
   int64_t n_qubits_;
   size_t state_len_;
-  mindquantum::PQCSimulator sim_;
   mindquantum::BasicCircuit circ_;
   mindquantum::transformer::Hamiltonians hams_;
   mindquantum::transformer::NamesType param_names_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc
index 6b75bf4756f..55b9e551388 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc
@@ -36,10 +36,12 @@ struct ComputeParam {
   mindquantum::BasicCircuit *circ_cp;
   mindquantum::BasicCircuit *herm_circ_cp;
   mindquantum::transformer::Hamiltonians *hams_cp;
+  mindquantum::transformer::Projectors *projectors_cp;
   mindquantum::transformer::NamesType *encoder_params_names_cp;
   mindquantum::transformer::NamesType *ansatz_params_names_cp;
   std::vector<std::vector<std::shared_ptr<mindquantum::PQCSimulator>>> *tmp_sims_cp;
   bool dummy_circuit_cp{false};
+  bool is_projector_cp{false};
   size_t result_len_cp{0};
   size_t encoder_g_len_cp{0};
   size_t ansatz_g_len_cp{0};
@@ -60,6 +62,8 @@ void ComputerForwardBackward(const std::shared_ptr<ComputeParam> &input_params,
   auto circ = input_params->circ_cp;
   auto herm_circ = input_params->herm_circ_cp;
   auto hams = input_params->hams_cp;
+  auto projectors = input_params->projectors_cp;
+  auto is_projector = input_params->is_projector_cp;
   auto encoder_params_names = input_params->encoder_params_names_cp;
   auto ansatz_params_names = input_params->ansatz_params_names_cp;
   auto tmp_sims = input_params->tmp_sims_cp;
@@ -91,6 +95,8 @@ void ComputerForwardBackward(const std::shared_ptr<ComputeParam> &input_params,
     calc_gradient_param->circuit_cp = circ;
     calc_gradient_param->circuit_hermitian_cp = herm_circ;
     calc_gradient_param->hamiltonians_cp = hams;
+    calc_gradient_param->projectors_cp = projectors;
+    calc_gradient_param->is_projector_cp = is_projector;
     calc_gradient_param->paras_cp = &pr;
     calc_gradient_param->encoder_params_names_cp = encoder_params_names;
     calc_gradient_param->ansatz_params_names_cp = ansatz_params_names;
@@ -142,6 +148,8 @@ void PQCCPUKernel::InitPQCStructure(const CNodePtr &kernel_node) {
     AnfAlgo::GetNodeAttr<mindquantum::transformer::PaulisWordsType>(kernel_node, mindquantum::kHamsPauliWord);
   hams_pauli_qubit_ =
     AnfAlgo::GetNodeAttr<mindquantum::transformer::PaulisQubitsType>(kernel_node, mindquantum::kHamsPauliQubit);
+  projector_strs_ = AnfAlgo::GetNodeAttr<mindquantum::transformer::NamesType>(kernel_node, mindquantum::kProjectors);
+  is_projector_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, mindquantum::kIsProjector);
 }
 
 void PQCCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@@ -174,18 +182,12 @@ void PQCCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   herm_circ_ = circs[1];
 
   hams_ = mindquantum::transformer::HamiltoniansTransfor(hams_pauli_coeff_, hams_pauli_word_, hams_pauli_qubit_);
+  projectors_ = mindquantum::transformer::ProjectorsTransfor(projector_strs_);
 
   n_threads_user_ = std::min(n_threads_user_, common::ThreadPool::GetInstance().GetSyncRunThreadNum());
   if (n_samples_ < n_threads_user_) {
     n_threads_user_ = n_samples_;
   }
-  for (size_t i = 0; i < n_threads_user_; i++) {
-    tmp_sims_.push_back({});
-    for (size_t j = 0; j < 4; j++) {
-      auto tmp = std::make_shared<mindquantum::PQCSimulator>(1, n_qubits_);
-      tmp_sims_.back().push_back(tmp);
-    }
-  }
 }
 
 bool PQCCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
@@ -205,6 +207,16 @@ bool PQCCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
   MS_EXCEPTION_IF_NULL(gradient_encoder);
   MS_EXCEPTION_IF_NULL(gradient_ansatz);
 
+  std::vector<std::vector<std::shared_ptr<mindquantum::PQCSimulator>>> tmp_sims(
+    n_threads_user_, std::vector<std::shared_ptr<mindquantum::PQCSimulator>>(4, nullptr));
+#pragma omp parallel for collapse(2) schedule(static)
+  for (size_t i = 0; i < n_threads_user_; i++) {
+    for (size_t j = 0; j < 4; j++) {
+      auto tmp = std::make_shared<mindquantum::PQCSimulator>(1, n_qubits_);
+      tmp_sims[i][j] = tmp;
+    }
+  }
+
   std::vector<common::Task> tasks;
   std::vector<std::shared_ptr<ComputeParam>> thread_params;
   tasks.reserve(n_threads_user_);
@@ -222,9 +234,11 @@ bool PQCCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
     params->circ_cp = &circ_;
     params->herm_circ_cp = &herm_circ_;
     params->hams_cp = &hams_;
+    params->projectors_cp = &projectors_;
+    params->is_projector_cp = is_projector_;
     params->encoder_params_names_cp = &encoder_params_names_;
     params->ansatz_params_names_cp = &ansatz_params_names_;
-    params->tmp_sims_cp = &tmp_sims_;
+    params->tmp_sims_cp = &tmp_sims;
     params->dummy_circuit_cp = dummy_circuit_;
     params->result_len_cp = result_len_;
     params->encoder_g_len_cp = encoder_g_len_;
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h
index 41e49ba3d3a..aa5320c602d 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h
@@ -52,7 +52,7 @@ class PQCCPUKernel : public CPUKernel {
   mindquantum::BasicCircuit circ_;
   mindquantum::BasicCircuit herm_circ_;
   mindquantum::transformer::Hamiltonians hams_;
-  std::vector<std::vector<std::shared_ptr<mindquantum::PQCSimulator>>> tmp_sims_;
+  mindquantum::transformer::Projectors projectors_;
 
   // parameters
   mindquantum::transformer::NamesType encoder_params_names_;
@@ -71,6 +71,10 @@ class PQCCPUKernel : public CPUKernel {
   mindquantum::transformer::PaulisCoeffsType hams_pauli_coeff_;
   mindquantum::transformer::PaulisWordsType hams_pauli_word_;
   mindquantum::transformer::PaulisQubitsType hams_pauli_qubit_;
+
+  // subspace measurement ops
+  mindquantum::transformer::NamesType projector_strs_;
+  bool is_projector_;
 };
 
 MS_REG_CPU_KERNEL(PQC,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h
index bec1f967748..e36022a9a62 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h
@@ -42,6 +42,7 @@ class BasicGate {
   bool IsParameterGate();
   Indexes GetObjQubits();
   Indexes GetCtrlQubits();
+  virtual ~BasicGate() {}
 };
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h
index 5d5b69d4963..ac0849f7d93 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h
@@ -77,7 +77,6 @@ class ZZGate : public IntrinsicOneParaGate {
  public:
   ZZGate(const Indexes &, const Indexes &, const ParameterResolver &);
 };
-
 }  // namespace mindquantum
 }  // namespace mindspore
 #endif  // MINDQUANTUM_ENGINE_GATES_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h
index c998e56d601..f58ac888cd9 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h
@@ -32,6 +32,7 @@ class IntrinsicOneParaGate : public ParameterGate {
   CalcType LinearCombination(const ParameterResolver &, const ParameterResolver &);
   Matrix GetMatrix(const ParameterResolver &) override;
   Matrix GetDiffMatrix(const ParameterResolver &) override;
+  virtual ~IntrinsicOneParaGate() {}
 };
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h
index 321816bba47..a71b3fcfd06 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h
@@ -26,6 +26,7 @@ class ParameterGate : public BasicGate {
  public:
   ParameterGate();
   ParameterGate(const std::string &, const Indexes &, const Indexes &, const ParameterResolver &);
+  virtual ~ParameterGate() {}
 };
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h
index 08569ba9ae7..abe37aee6d4 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h
@@ -23,7 +23,6 @@
 
 namespace mindspore {
 namespace mindquantum {
-
 class ParameterResolver {
  public:
   ParameterResolver();
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc
index 16fe9c88278..8f6a19da808 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc
@@ -20,20 +20,18 @@
 
 namespace mindspore {
 namespace mindquantum {
-PQCSimulator::PQCSimulator() : Simulator(1), n_qubits_(1) {
-  PQCSimulator::AllocateAll();
+PQCSimulator::PQCSimulator() : Simulator(1, 1), n_qubits_(1) {
   for (Index i = 0; i < n_qubits_; i++) {
     ordering_.push_back(i);
   }
-  len_ = (1UL << n_qubits_);
+  len_ = (1UL << (n_qubits_ + 1));
 }
 
-PQCSimulator::PQCSimulator(Index seed = 1, Index N = 1) : Simulator(seed), n_qubits_(N) {
-  PQCSimulator::AllocateAll();
+PQCSimulator::PQCSimulator(Index seed = 1, Index N = 1) : Simulator(seed, N), n_qubits_(N) {
   for (Index i = 0; i < n_qubits_; i++) {
     ordering_.push_back(i);
   }
-  len_ = (1UL << n_qubits_);
+  len_ = (1UL << (n_qubits_ + 1));
 }
 
 void PQCSimulator::ApplyGate(std::shared_ptr<BasicGate> g, const ParameterResolver &paras, bool diff) {
@@ -65,14 +63,15 @@ void PQCSimulator::Evolution(BasicCircuit const &circuit, ParameterResolver cons
   PQCSimulator::ApplyBlocks(circuit.GetGateBlocks(), paras);
 }
 
-CalcType PQCSimulator::Measure(Index mask1, Index mask2, bool apply) {
+CalcType PQCSimulator::Measure(const Indexes &masks, bool apply) {
   CalcType out = 0;
 #pragma omp parallel for reduction(+ : out) schedule(static)
   for (unsigned i = 0; i < (1UL << n_qubits_); i++) {
-    if (((i & mask1) == mask1) && ((i | mask2) == mask2)) {
-      out = out + std::real(vec_[i]) * std::real(vec_[i]) + std::imag(vec_[i]) * std::imag(vec_[i]);
+    if (((i & masks[0]) == masks[0]) && ((i | masks[1]) == masks[1])) {
+      out = out + vec_[2 * i] * vec_[2 * i] + vec_[2 * i + 1] * vec_[2 * i + 1];
     } else if (apply) {
-      vec_[i] = 0;
+      vec_[2 * i] = 0;
+      vec_[2 * i + 1] = 0;
     }
   }
   return out;
@@ -85,6 +84,8 @@ std::vector<std::vector<float>> PQCSimulator::CalcGradient(const std::shared_ptr
   auto circuit = input_params->circuit_cp;
   auto circuit_hermitian = input_params->circuit_hermitian_cp;
   auto hamiltonians = input_params->hamiltonians_cp;
+  auto projectors = input_params->projectors_cp;
+  auto is_projector = input_params->is_projector_cp;
   auto paras = input_params->paras_cp;
   auto encoder_params_names = input_params->encoder_params_names_cp;
   auto ansatz_params_names = input_params->ansatz_params_names_cp;
@@ -102,16 +103,27 @@ std::vector<std::vector<float>> PQCSimulator::CalcGradient(const std::shared_ptr
     MS_LOG(EXCEPTION) << "Empty quantum circuit!";
   }
   unsigned len = circ_gate_blocks.at(0).size();
-  std::vector<float> grad(hamiltonians->size() * poi.size(), 0);
-  std::vector<float> e0(hamiltonians->size(), 0);
+  size_t mea_size = 0;
+  if (is_projector) {
+    mea_size = projectors->size();
+  } else {
+    mea_size = hamiltonians->size();
+  }
+  std::vector<float> grad(mea_size * poi.size(), 0);
+  std::vector<float> e0(mea_size, 0);
 
   // #pragma omp parallel for
-  for (size_t h_index = 0; h_index < hamiltonians->size(); h_index++) {
-    auto &hamiltonian = hamiltonians->at(h_index);
+  for (size_t h_index = 0; h_index < mea_size; h_index++) {
     s_right.set_wavefunction(vec_, ordering_);
     s_left.set_wavefunction(s_right.vec_, ordering_);
-    s_left.apply_qubit_operator(hamiltonian.GetCTD(), ordering_);
-    e0[h_index] = static_cast<float>(ComplexInnerProduct(vec_, s_left.vec_, len_).real());
+    if (is_projector) {
+      auto &proj = projectors->at(h_index);
+      e0[h_index] = s_left.Measure(proj.GetMasks(), true);
+    } else {
+      auto &hamiltonian = hamiltonians->at(h_index);
+      s_left.apply_qubit_operator(hamiltonian.GetCTD(), ordering_);
+      e0[h_index] = static_cast<float>(ComplexInnerProduct(vec_, s_left.vec_, len_).real());
+    }
     if (dummy_circuit_) {
       continue;
     }
@@ -144,7 +156,7 @@ std::vector<std::vector<float>> PQCSimulator::CalcGradient(const std::shared_ptr
   }
   std::vector<float> grad1;
   std::vector<float> grad2;
-  for (size_t i = 0; i < hamiltonians->size(); i++) {
+  for (size_t i = 0; i < mea_size; i++) {
     for (size_t j = 0; j < poi.size(); j++) {
       if (j < encoder_params_names->size()) {
         grad1.push_back(grad[i * poi.size() + j]);
@@ -156,16 +168,6 @@ std::vector<std::vector<float>> PQCSimulator::CalcGradient(const std::shared_ptr
   return {e0, grad1, grad2};
 }
 
-void PQCSimulator::AllocateAll() {
-  for (unsigned i = 0; i < n_qubits_; i++) {
-    Simulator::allocate_qubit(i);
-  }
-}
-void PQCSimulator::DeallocateAll() {
-  for (unsigned i = 0; i < n_qubits_; i++) {
-    Simulator::deallocate_qubit(i);
-  }
-}
 void PQCSimulator::SetState(const StateVector &wavefunction) { Simulator::set_wavefunction(wavefunction, ordering_); }
 
 std::size_t PQCSimulator::GetControlMask(Indexes const &ctrls) {
@@ -184,9 +186,9 @@ CalcType PQCSimulator::GetExpectationValue(const Hamiltonian &ham) {
 void PQCSimulator::SetZeroState() {
 #pragma omp parallel for schedule(static)
   for (size_t i = 0; i < len_; i++) {
-    vec_[i] = {0, 0};
+    vec_[i] = 0;
   }
-  vec_[0] = {1, 0};
+  vec_[0] = 1;
 }
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h
index 134b86a03a8..ff65621e1a0 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h
@@ -28,6 +28,7 @@
 #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/hamiltonian.h"
 #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h"
 #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h"
+#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h"
 
 namespace mindspore {
 namespace mindquantum {
@@ -35,10 +36,12 @@ struct CalcGradientParam {
   BasicCircuit *circuit_cp;
   BasicCircuit *circuit_hermitian_cp;
   transformer::Hamiltonians *hamiltonians_cp;
+  transformer::Projectors *projectors_cp;
   ParameterResolver *paras_cp;
   transformer::NamesType *encoder_params_names_cp;
   transformer::NamesType *ansatz_params_names_cp;
   bool dummy_circuit_cp{false};
+  bool is_projector_cp{false};
 };
 class PQCSimulator : public Simulator {
  private:
@@ -53,13 +56,11 @@ class PQCSimulator : public Simulator {
   void ApplyBlock(const GateBlock &, const ParameterResolver &);
   void ApplyBlocks(const GateBlocks &, const ParameterResolver &);
   void Evolution(const BasicCircuit &, const ParameterResolver &);
-  CalcType Measure(Index, Index, bool);
+  CalcType Measure(const Indexes &, bool);
   void ApplyHamiltonian(const Hamiltonian &);
   CalcType GetExpectationValue(const Hamiltonian &);
   std::vector<std::vector<float>> CalcGradient(const std::shared_ptr<CalcGradientParam> &, PQCSimulator &,
                                                PQCSimulator &, PQCSimulator &);
-  void AllocateAll();
-  void DeallocateAll();
   void SetState(const StateVector &);
   std::size_t GetControlMask(Indexes const &);
   void SetZeroState();
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc
new file mode 100644
index 00000000000..6379c7da01f
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h"
+
+namespace mindspore {
+namespace mindquantum {
+Projector::Projector() {}
+Projector::Projector(const NameType &proj_str) : proj_str_(proj_str), n_qubits_(proj_str.length()) {}
+void Projector::HandleMask() {
+  mask1_ = 0;
+  mask2_ = 0;
+  for (auto i : proj_str_) {
+    if (i == '1') {
+      mask1_ = mask1_ * 2 + 1;
+    } else {
+      mask1_ = mask1_ * 2;
+    }
+    if (i == '0') {
+      mask2_ = mask2_ * 2;
+    } else {
+      mask2_ = mask2_ * 2 + 1;
+    }
+  }
+}
+Indexes Projector::GetMasks() { return {mask1_, mask2_}; }
+}  // namespace mindquantum
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h
new file mode 100644
index 00000000000..ff019e47745
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDQUANTUM_ENGINE_PROJECTOR_H_
+#define MINDQUANTUM_ENGINE_PROJECTOR_H_
+#include <string>
+#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h"
+
+namespace mindspore {
+namespace mindquantum {
+using NameType = std::string;
+class Projector {
+ public:
+  Projector();
+  explicit Projector(const NameType &);
+  void HandleMask();
+  Indexes GetMasks();
+
+ private:
+  NameType proj_str_;
+  Index n_qubits_;
+  Index mask1_;
+  Index mask2_;
+};
+}  // namespace mindquantum
+}  // namespace mindspore
+#endif  // MINDQUANTUM_ENGINE_PROJECTOR_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc
index d8e5635171b..4f0f7bd6b61 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc
@@ -118,7 +118,6 @@ SparseMatrix GoodTerm2Sparse(const GoodTerm &gt, Index n_qubits) {
   out = KroneckerProductSparse(IdentitySparse(n_qubits - gt.first.second.second - 1), out).eval();
   return out;
 }
-
 }  // namespace sparse
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h
index 15f8cc4abce..e20c18320f2 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h
@@ -33,9 +33,9 @@ using PauliWord = std::pair<Index, char>;
 using PauliTerm = std::pair<std::vector<PauliWord>, int>;
 using GoodTerm = std::pair<std::pair<CalcType, std::pair<Index, Index>>, std::vector<PauliTerm>>;
 using GoodHamilt = std::vector<GoodTerm>;
-typedef Eigen::VectorXcd EigenComplexVector;
+using EigenComplexVector = Eigen::VectorXcd;
 using Eigen::VectorXi;
-typedef Eigen::SparseMatrix<ComplexType, Eigen::RowMajor, int64_t> SparseMatrix;
+using SparseMatrix = Eigen::SparseMatrix<ComplexType, Eigen::RowMajor, int64_t>;
 using DequeSparseHam = std::deque<SparseMatrix>;
 using KroneckerProductSparse = Eigen::KroneckerProductSparse<SparseMatrix, SparseMatrix>;
 SparseMatrix BasiGateSparse(char);
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc
index 76419d9e3ee..adaac0fd76a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc
@@ -109,6 +109,16 @@ Hamiltonians HamiltoniansTransfor(const PaulisCoeffsType &paulis_coeffs, const P
   }
   return hams;
 }
+
+Projectors ProjectorsTransfor(const NamesType &proj_strs) {
+  Projectors projs;
+  for (Index n = 0; n < proj_strs.size(); n++) {
+    Projector proj = Projector(proj_strs[n]);
+    proj.HandleMask();
+    projs.push_back(proj);
+  }
+  return projs;
+}
 }  // namespace transformer
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h
index 43d334a6ac0..c74947fe302 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h
@@ -23,6 +23,7 @@
 #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h"
 #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h"
 #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/hamiltonian.h"
+#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h"
 
 namespace mindspore {
 namespace mindquantum {
@@ -49,6 +50,7 @@ using PauliQubitType = std::vector<int64_t>;
 using PauliQubitsType = std::vector<PauliQubitType>;
 using PaulisQubitsType = std::vector<PauliQubitsType>;
 using Hamiltonians = std::vector<Hamiltonian>;
+using Projectors = std::vector<Projector>;
 
 Hamiltonians HamiltoniansTransfor(const PaulisCoeffsType &, const PaulisWordsType &, const PaulisQubitsType &);
 
@@ -59,6 +61,7 @@ std::vector<BasicCircuit> CircuitTransfor(const NamesType &, const ComplexMatrix
 Matrix MatrixConverter(const MatrixType &, const MatrixType &, bool);
 
 ParameterResolver ParameterResolverConverter(const ParaNameType &, const CoeffType &, const RequireType &, bool);
+Projectors ProjectorsTransfor(const NamesType &);
 }  // namespace transformer
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc
index 2b3c03928e7..d16a17be9ec 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc
@@ -21,10 +21,11 @@ namespace mindquantum {
 ComplexType ComplexInnerProduct(const Simulator::StateVector &v1, const Simulator::StateVector &v2, unsigned len) {
   CalcType real_part = 0;
   CalcType imag_part = 0;
+  auto size = len / 2;
 #pragma omp parallel for reduction(+ : real_part, imag_part)
-  for (Index i = 0; i < len; i++) {
-    real_part += v1[i].real() * v2[i].real() + v1[i].imag() * v2[i].imag();
-    imag_part += v1[i].real() * v2[i].imag() - v1[i].imag() * v2[i].real();
+  for (Index i = 0; i < size; i++) {
+    real_part += v1[2 * i] * v2[2 * i] + v1[2 * i + 1] * v2[2 * i + 1];
+    imag_part += v1[2 * i] * v2[2 * i + 1] - v1[2 * i + 1] * v2[2 * i];
   }
 
   ComplexType result = {real_part, imag_part};
@@ -35,16 +36,16 @@ ComplexType ComplexInnerProductWithControl(const Simulator::StateVector &v1, con
                                            Index len, std::size_t ctrlmask) {
   CalcType real_part = 0;
   CalcType imag_part = 0;
+  auto size = len / 2;
 #pragma omp parallel for reduction(+ : real_part, imag_part)
-  for (std::size_t i = 0; i < len; i++) {
+  for (std::size_t i = 0; i < size; i++) {
     if ((i & ctrlmask) == ctrlmask) {
-      real_part += v1[i].real() * v2[i].real() + v1[i].imag() * v2[i].imag();
-      imag_part += v1[i].real() * v2[i].imag() - v1[i].imag() * v2[i].real();
+      real_part += v1[2 * i] * v2[2 * i] + v1[2 * i + 1] * v2[2 * i + 1];
+      imag_part += v1[2 * i] * v2[2 * i + 1] - v1[2 * i + 1] * v2[2 * i];
     }
   }
   ComplexType result = {real_part, imag_part};
   return result;
 }
-
 }  // namespace mindquantum
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h
index 326d32afa78..24930a5eda1 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h
@@ -52,6 +52,8 @@ const char kGateRequiresGrad[] = "gate_requires_grad";
 const char kHamsPauliCoeff[] = "hams_pauli_coeff";
 const char kHamsPauliWord[] = "hams_pauli_word";
 const char kHamsPauliQubit[] = "hams_pauli_qubit";
+const char kIsProjector[] = "is_projector";
+const char kProjectors[] = "projectors";
 }  // namespace mindquantum
 }  // namespace mindspore
 #endif  // MINDQUANTUM_ENGINE_UTILS_H_
diff --git a/third_party/patch/projectq/projectq.patch001 b/third_party/patch/projectq/projectq.patch001
index e9f95b57c47..89c810c84ce 100644
--- a/third_party/patch/projectq/projectq.patch001
+++ b/third_party/patch/projectq/projectq.patch001
@@ -1,38 +1,966 @@
+diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp
+--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp	2020-06-05 21:07:57.000000000 +0800
++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp	2021-04-19 18:04:05.541802882 +0800
+@@ -17,18 +17,18 @@
+ {
+     __m256d v[2];
+ 
+-    v[0] = load2(&psi[I]);
+-    v[1] = load2(&psi[I + d0]);
++    v[0] = load2(psi + 2 * I);
++    v[1] = load2(psi + 2 * (I + d0));
+ 
+-    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1])));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1])));
+ 
+ }
+ 
+ // bit indices id[.] are given from high to low (e.g. control first for CNOT)
+ template <class V, class M>
+-void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
++void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
+ {
+-    std::size_t n = psi.size();
++    std::size_t n = len;
+     std::size_t d0 = 1UL << id0;
+ 
+     __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1])};
+diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp
+--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp	2020-06-05 21:07:57.000000000 +0800
++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp	2021-04-19 18:04:05.541802882 +0800
+@@ -17,21 +17,21 @@
+ {
+     __m256d v[4];
+ 
+-    v[0] = load2(&psi[I]);
+-    v[1] = load2(&psi[I + d0]);
+-    v[2] = load2(&psi[I + d1]);
+-    v[3] = load2(&psi[I + d0 + d1]);
++    v[0] = load2(psi + 2 * I);
++    v[1] = load2(psi + 2 * (I + d0));
++    v[2] = load2(psi + 2 * (I + d1));
++    v[3] = load2(psi + 2 * (I + d0 + d1));
+ 
+-    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3])))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7])))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3])))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7])))));
+ 
+ }
+ 
+ // bit indices id[.] are given from high to low (e.g. control first for CNOT)
+ template <class V, class M>
+-void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
++void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
+ {
+-    std::size_t n = psi.size();
++    std::size_t n = len;
+     std::size_t d0 = 1UL << id0;
+     std::size_t d1 = 1UL << id1;
+ 
+diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp
+--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp	2020-06-05 21:07:57.000000000 +0800
++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp	2021-04-19 18:04:05.541802882 +0800
+@@ -17,10 +17,10 @@
+ {
+     __m256d v[4];
+ 
+-    v[0] = load2(&psi[I]);
+-    v[1] = load2(&psi[I + d0]);
+-    v[2] = load2(&psi[I + d1]);
+-    v[3] = load2(&psi[I + d0 + d1]);
++    v[0] = load2(psi + 2 * I);
++    v[1] = load2(psi + 2 * (I + d0));
++    v[2] = load2(psi + 2 * (I + d1));
++    v[3] = load2(psi + 2 * (I + d0 + d1));
+ 
+     __m256d tmp[4];
+ 
+@@ -29,23 +29,23 @@
+     tmp[2] = add(mul(v[0], m[8], mt[8]), add(mul(v[1], m[9], mt[9]), add(mul(v[2], m[10], mt[10]), mul(v[3], m[11], mt[11]))));
+     tmp[3] = add(mul(v[0], m[12], mt[12]), add(mul(v[1], m[13], mt[13]), add(mul(v[2], m[14], mt[14]), mul(v[3], m[15], mt[15]))));
+ 
+-    v[0] = load2(&psi[I + d2]);
+-    v[1] = load2(&psi[I + d0 + d2]);
+-    v[2] = load2(&psi[I + d1 + d2]);
+-    v[3] = load2(&psi[I + d0 + d1 + d2]);
+-
+-    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))))));
++    v[0] = load2(psi + 2 * (I + d2));
++    v[1] = load2(psi + 2 * (I + d0 + d2));
++    v[2] = load2(psi + 2 * (I + d1 + d2));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d2));
++
++    _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))))));
+ 
+ }
+ 
+ // bit indices id[.] are given from high to low (e.g. control first for CNOT)
+ template <class V, class M>
+-void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
++void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
+ {
+-    std::size_t n = psi.size();
++    std::size_t n = len;
+     std::size_t d0 = 1UL << id0;
+     std::size_t d1 = 1UL << id1;
+     std::size_t d2 = 1UL << id2;
+diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp
+--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp	2020-06-05 21:07:57.000000000 +0800
++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp	2021-04-19 18:04:05.541802882 +0800
+@@ -17,10 +17,10 @@
+ {
+     __m256d v[4];
+ 
+-    v[0] = load2(&psi[I]);
+-    v[1] = load2(&psi[I + d0]);
+-    v[2] = load2(&psi[I + d1]);
+-    v[3] = load2(&psi[I + d0 + d1]);
++    v[0] = load2(psi + 2 * I);
++    v[1] = load2(psi + 2 * (I + d0));
++    v[2] = load2(psi + 2 * (I + d1));
++    v[3] = load2(psi + 2 * (I + d0 + d1));
+ 
+     __m256d tmp[8];
+ 
+@@ -33,10 +33,10 @@
+     tmp[6] = add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))));
+     tmp[7] = add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))));
+ 
+-    v[0] = load2(&psi[I + d2]);
+-    v[1] = load2(&psi[I + d0 + d2]);
+-    v[2] = load2(&psi[I + d1 + d2]);
+-    v[3] = load2(&psi[I + d0 + d1 + d2]);
++    v[0] = load2(psi + 2 * (I + d2));
++    v[1] = load2(psi + 2 * (I + d0 + d2));
++    v[2] = load2(psi + 2 * (I + d1 + d2));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d2));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[32], mt[32]), add(mul(v[1], m[33], mt[33]), add(mul(v[2], m[34], mt[34]), mul(v[3], m[35], mt[35])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[36], mt[36]), add(mul(v[1], m[37], mt[37]), add(mul(v[2], m[38], mt[38]), mul(v[3], m[39], mt[39])))));
+@@ -47,10 +47,10 @@
+     tmp[6] = add(tmp[6], add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59])))));
+     tmp[7] = add(tmp[7], add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63])))));
+ 
+-    v[0] = load2(&psi[I + d3]);
+-    v[1] = load2(&psi[I + d0 + d3]);
+-    v[2] = load2(&psi[I + d1 + d3]);
+-    v[3] = load2(&psi[I + d0 + d1 + d3]);
++    v[0] = load2(psi + 2 * (I + d3));
++    v[1] = load2(psi + 2 * (I + d0 + d3));
++    v[2] = load2(psi + 2 * (I + d1 + d3));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d3));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71])))));
+@@ -61,27 +61,27 @@
+     tmp[6] = add(tmp[6], add(mul(v[0], m[88], mt[88]), add(mul(v[1], m[89], mt[89]), add(mul(v[2], m[90], mt[90]), mul(v[3], m[91], mt[91])))));
+     tmp[7] = add(tmp[7], add(mul(v[0], m[92], mt[92]), add(mul(v[1], m[93], mt[93]), add(mul(v[2], m[94], mt[94]), mul(v[3], m[95], mt[95])))));
+ 
+-    v[0] = load2(&psi[I + d2 + d3]);
+-    v[1] = load2(&psi[I + d0 + d2 + d3]);
+-    v[2] = load2(&psi[I + d1 + d2 + d3]);
+-    v[3] = load2(&psi[I + d0 + d1 + d2 + d3]);
+-
+-    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))));
++    v[0] = load2(psi + 2 * (I + d2 + d3));
++    v[1] = load2(psi + 2 * (I + d0 + d2 + d3));
++    v[2] = load2(psi + 2 * (I + d1 + d2 + d3));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3));
++
++    _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))));
+ 
+ }
+ 
+ // bit indices id[.] are given from high to low (e.g. control first for CNOT)
+ template <class V, class M>
+-void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
++void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
+ {
+-    std::size_t n = psi.size();
++    std::size_t n = len;
+     std::size_t d0 = 1UL << id0;
+     std::size_t d1 = 1UL << id1;
+     std::size_t d2 = 1UL << id2;
+diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp
+--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp	2020-06-05 21:07:57.000000000 +0800
++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp	2021-04-19 18:04:05.541802882 +0800
+@@ -17,10 +17,10 @@
+ {
+     __m256d v[4];
+ 
+-    v[0] = load2(&psi[I]);
+-    v[1] = load2(&psi[I + d0]);
+-    v[2] = load2(&psi[I + d1]);
+-    v[3] = load2(&psi[I + d0 + d1]);
++    v[0] = load2(psi + 2 * I);
++    v[1] = load2(psi + 2 * (I + d0));
++    v[2] = load2(psi + 2 * (I + d1));
++    v[3] = load2(psi + 2 * (I + d0 + d1));
+ 
+     __m256d tmp[16];
+ 
+@@ -41,10 +41,10 @@
+     tmp[14] = add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59]))));
+     tmp[15] = add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63]))));
+ 
+-    v[0] = load2(&psi[I + d2]);
+-    v[1] = load2(&psi[I + d0 + d2]);
+-    v[2] = load2(&psi[I + d1 + d2]);
+-    v[3] = load2(&psi[I + d0 + d1 + d2]);
++    v[0] = load2(psi + 2 * (I + d2));
++    v[1] = load2(psi + 2 * (I + d0 + d2));
++    v[2] = load2(psi + 2 * (I + d1 + d2));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d2));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71])))));
+@@ -63,10 +63,10 @@
+     tmp[14] = add(tmp[14], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123])))));
+     tmp[15] = add(tmp[15], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127])))));
+ 
+-    v[0] = load2(&psi[I + d3]);
+-    v[1] = load2(&psi[I + d0 + d3]);
+-    v[2] = load2(&psi[I + d1 + d3]);
+-    v[3] = load2(&psi[I + d0 + d1 + d3]);
++    v[0] = load2(psi + 2 * (I + d3));
++    v[1] = load2(psi + 2 * (I + d0 + d3));
++    v[2] = load2(psi + 2 * (I + d1 + d3));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d3));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[128], mt[128]), add(mul(v[1], m[129], mt[129]), add(mul(v[2], m[130], mt[130]), mul(v[3], m[131], mt[131])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[132], mt[132]), add(mul(v[1], m[133], mt[133]), add(mul(v[2], m[134], mt[134]), mul(v[3], m[135], mt[135])))));
+@@ -85,10 +85,10 @@
+     tmp[14] = add(tmp[14], add(mul(v[0], m[184], mt[184]), add(mul(v[1], m[185], mt[185]), add(mul(v[2], m[186], mt[186]), mul(v[3], m[187], mt[187])))));
+     tmp[15] = add(tmp[15], add(mul(v[0], m[188], mt[188]), add(mul(v[1], m[189], mt[189]), add(mul(v[2], m[190], mt[190]), mul(v[3], m[191], mt[191])))));
+ 
+-    v[0] = load2(&psi[I + d2 + d3]);
+-    v[1] = load2(&psi[I + d0 + d2 + d3]);
+-    v[2] = load2(&psi[I + d1 + d2 + d3]);
+-    v[3] = load2(&psi[I + d0 + d1 + d2 + d3]);
++    v[0] = load2(psi + 2 * (I + d2 + d3));
++    v[1] = load2(psi + 2 * (I + d0 + d2 + d3));
++    v[2] = load2(psi + 2 * (I + d1 + d2 + d3));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[192], mt[192]), add(mul(v[1], m[193], mt[193]), add(mul(v[2], m[194], mt[194]), mul(v[3], m[195], mt[195])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[196], mt[196]), add(mul(v[1], m[197], mt[197]), add(mul(v[2], m[198], mt[198]), mul(v[3], m[199], mt[199])))));
+@@ -107,10 +107,10 @@
+     tmp[14] = add(tmp[14], add(mul(v[0], m[248], mt[248]), add(mul(v[1], m[249], mt[249]), add(mul(v[2], m[250], mt[250]), mul(v[3], m[251], mt[251])))));
+     tmp[15] = add(tmp[15], add(mul(v[0], m[252], mt[252]), add(mul(v[1], m[253], mt[253]), add(mul(v[2], m[254], mt[254]), mul(v[3], m[255], mt[255])))));
+ 
+-    v[0] = load2(&psi[I + d4]);
+-    v[1] = load2(&psi[I + d0 + d4]);
+-    v[2] = load2(&psi[I + d1 + d4]);
+-    v[3] = load2(&psi[I + d0 + d1 + d4]);
++    v[0] = load2(psi + 2 * (I + d4));
++    v[1] = load2(psi + 2 * (I + d0 + d4));
++    v[2] = load2(psi + 2 * (I + d1 + d4));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d4));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[256], mt[256]), add(mul(v[1], m[257], mt[257]), add(mul(v[2], m[258], mt[258]), mul(v[3], m[259], mt[259])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[260], mt[260]), add(mul(v[1], m[261], mt[261]), add(mul(v[2], m[262], mt[262]), mul(v[3], m[263], mt[263])))));
+@@ -129,10 +129,10 @@
+     tmp[14] = add(tmp[14], add(mul(v[0], m[312], mt[312]), add(mul(v[1], m[313], mt[313]), add(mul(v[2], m[314], mt[314]), mul(v[3], m[315], mt[315])))));
+     tmp[15] = add(tmp[15], add(mul(v[0], m[316], mt[316]), add(mul(v[1], m[317], mt[317]), add(mul(v[2], m[318], mt[318]), mul(v[3], m[319], mt[319])))));
+ 
+-    v[0] = load2(&psi[I + d2 + d4]);
+-    v[1] = load2(&psi[I + d0 + d2 + d4]);
+-    v[2] = load2(&psi[I + d1 + d2 + d4]);
+-    v[3] = load2(&psi[I + d0 + d1 + d2 + d4]);
++    v[0] = load2(psi + 2 * (I + d2 + d4));
++    v[1] = load2(psi + 2 * (I + d0 + d2 + d4));
++    v[2] = load2(psi + 2 * (I + d1 + d2 + d4));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d4));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[320], mt[320]), add(mul(v[1], m[321], mt[321]), add(mul(v[2], m[322], mt[322]), mul(v[3], m[323], mt[323])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[324], mt[324]), add(mul(v[1], m[325], mt[325]), add(mul(v[2], m[326], mt[326]), mul(v[3], m[327], mt[327])))));
+@@ -151,10 +151,10 @@
+     tmp[14] = add(tmp[14], add(mul(v[0], m[376], mt[376]), add(mul(v[1], m[377], mt[377]), add(mul(v[2], m[378], mt[378]), mul(v[3], m[379], mt[379])))));
+     tmp[15] = add(tmp[15], add(mul(v[0], m[380], mt[380]), add(mul(v[1], m[381], mt[381]), add(mul(v[2], m[382], mt[382]), mul(v[3], m[383], mt[383])))));
+ 
+-    v[0] = load2(&psi[I + d3 + d4]);
+-    v[1] = load2(&psi[I + d0 + d3 + d4]);
+-    v[2] = load2(&psi[I + d1 + d3 + d4]);
+-    v[3] = load2(&psi[I + d0 + d1 + d3 + d4]);
++    v[0] = load2(psi + 2 * (I + d3 + d4));
++    v[1] = load2(psi + 2 * (I + d0 + d3 + d4));
++    v[2] = load2(psi + 2 * (I + d1 + d3 + d4));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d3 + d4));
+ 
+     tmp[0] = add(tmp[0], add(mul(v[0], m[384], mt[384]), add(mul(v[1], m[385], mt[385]), add(mul(v[2], m[386], mt[386]), mul(v[3], m[387], mt[387])))));
+     tmp[1] = add(tmp[1], add(mul(v[0], m[388], mt[388]), add(mul(v[1], m[389], mt[389]), add(mul(v[2], m[390], mt[390]), mul(v[3], m[391], mt[391])))));
+@@ -173,35 +173,35 @@
+     tmp[14] = add(tmp[14], add(mul(v[0], m[440], mt[440]), add(mul(v[1], m[441], mt[441]), add(mul(v[2], m[442], mt[442]), mul(v[3], m[443], mt[443])))));
+     tmp[15] = add(tmp[15], add(mul(v[0], m[444], mt[444]), add(mul(v[1], m[445], mt[445]), add(mul(v[2], m[446], mt[446]), mul(v[3], m[447], mt[447])))));
+ 
+-    v[0] = load2(&psi[I + d2 + d3 + d4]);
+-    v[1] = load2(&psi[I + d0 + d2 + d3 + d4]);
+-    v[2] = load2(&psi[I + d1 + d2 + d3 + d4]);
+-    v[3] = load2(&psi[I + d0 + d1 + d2 + d3 + d4]);
+-
+-    _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d4], (double*)&psi[I + d4], add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507]))))));
+-    _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511]))))));
++    v[0] = load2(psi + 2 * (I + d2 + d3 + d4));
++    v[1] = load2(psi + 2 * (I + d0 + d2 + d3 + d4));
++    v[2] = load2(psi + 2 * (I + d1 + d2 + d3 + d4));
++    v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3 + d4));
++
++    _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d4), psi + 2 * (I + d4), add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d4), psi + 2 * (I + d1 + d4), add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d4), psi + 2 * (I + d2 + d4), add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d4), psi + 2 * (I + d1 + d2 + d4), add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3 + d4), psi + 2 * (I + d3 + d4), add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3 + d4), psi + 2 * (I + d1 + d3 + d4), add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3 + d4), psi + 2 * (I + d2 + d3 + d4), add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507]))))));
++    _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3 + d4), psi + 2 * (I + d1 + d2 + d3 + d4), add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511]))))));
+ 
+ }
+ 
+ // bit indices id[.] are given from high to low (e.g. control first for CNOT)
+ template <class V, class M>
+-void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
++void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
+ {
+-    std::size_t n = psi.size();
++    std::size_t n = len;
+     std::size_t d0 = 1UL << id0;
+     std::size_t d1 = 1UL << id1;
+     std::size_t d2 = 1UL << id2;
+diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp
 --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp	2020-06-05 21:07:57.000000000 +0800
-+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp	2021-01-14 10:52:24.822039389 +0800
-@@ -33,7 +33,6 @@
++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp	2021-04-20 11:46:27.115554725 +0800
+@@ -18,11 +18,7 @@
+ #include <vector>
+ #include <complex>
+ 
+-#if defined(NOINTRIN) || !defined(INTRIN)
+-#include "nointrin/kernels.hpp"
+-#else
+ #include "intrin/kernels.hpp"
+-#endif
+ 
+ #include "intrin/alignedallocator.hpp"
+ #include "fusion.hpp"
+@@ -32,173 +28,29 @@
+ #include <tuple>
  #include <random>
  #include <functional>
- 
 -
++#include <cstring>
+ 
  class Simulator{
  public:
      using calc_type = double;
-@@ -44,8 +43,9 @@ public:
+     using complex_type = std::complex<calc_type>;
+-    using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512>>;
++    using StateVector = calc_type *;
+     using Map = std::map<unsigned, unsigned>;
+     using RndEngine = std::mt19937;
      using Term = std::vector<std::pair<unsigned, char>>;
      using TermsDict = std::vector<std::pair<Term, calc_type>>;
      using ComplexTermsDict = std::vector<std::pair<Term, complex_type>>;
 +    StateVector vec_;
  
 -    Simulator(unsigned seed = 1) : N_(0), vec_(1,0.), fusion_qubits_min_(4),
-+    Simulator(unsigned seed = 1) : vec_(1,0.), N_(0), fusion_qubits_min_(4),
++    Simulator(unsigned seed = 1, unsigned N = 0) : N_(N), fusion_qubits_min_(4),
                                     fusion_qubits_max_(5), rnd_eng_(seed) {
++        len_ = 1UL << (N_ + 1);
++        vec_ = (StateVector)calloc(len_, sizeof(calc_type));
          vec_[0]=1.; // all-zero initial state
          std::uniform_real_distribution<double> dist(0., 1.);
-@@ -562,7 +562,6 @@ private:
+         rng_ = std::bind(dist, std::ref(rnd_eng_));
+-    }
+-
+-    void allocate_qubit(unsigned id){
+-        if (map_.count(id) == 0){
+-            map_[id] = N_++;
+-            StateVector newvec; // avoid large memory allocations
+-            if( tmpBuff1_.capacity() >= (1UL << N_) )
+-              std::swap(newvec, tmpBuff1_);
+-            newvec.resize(1UL << N_);
+-#pragma omp parallel for schedule(static)
+-            for (std::size_t i = 0; i < newvec.size(); ++i)
+-                newvec[i] = (i < vec_.size())?vec_[i]:0.;
+-            std::swap(vec_, newvec);
+-            // recycle large memory
+-            std::swap(tmpBuff1_, newvec);
+-            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+-              std::swap(tmpBuff1_, tmpBuff2_);
+-        }
+-        else
+-            throw(std::runtime_error(
+-                "AllocateQubit: ID already exists. Qubit IDs should be unique."));
+-    }
+-
+-    bool get_classical_value(unsigned id, calc_type tol = 1.e-12){
+-        run();
+-        unsigned pos = map_[id];
+-        std::size_t delta = (1UL << pos);
+-
+-        for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
+-            for (std::size_t j = 0; j < delta; ++j){
+-                if (std::norm(vec_[i+j]) > tol)
+-                    return false;
+-                if (std::norm(vec_[i+j+delta]) > tol)
+-                    return true;
+-            }
+-        }
+-        assert(false); // this will never happen
+-        return false; // suppress 'control reaches end of non-void...'
+-    }
+-
+-    bool is_classical(unsigned id, calc_type tol = 1.e-12){
+-        run();
+-        unsigned pos = map_[id];
+-        std::size_t delta = (1UL << pos);
+-
+-        short up = 0, down = 0;
+-        #pragma omp parallel for schedule(static) reduction(|:up,down)
+-        for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
+-            for (std::size_t j = 0; j < delta; ++j){
+-                up = up | ((std::norm(vec_[i+j]) > tol)&1);
+-                down = down | ((std::norm(vec_[i+j+delta]) > tol)&1);
+-            }
+-        }
+-
+-        return 1 == (up^down);
+-    }
+-
+-    void collapse_vector(unsigned id, bool value = false, bool shrink = false){
+-        run();
+-        unsigned pos = map_[id];
+-        std::size_t delta = (1UL << pos);
+-
+-        if (!shrink){
+-            #pragma omp parallel for schedule(static)
+-            for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
+-                for (std::size_t j = 0; j < delta; ++j)
+-                    vec_[i+j+static_cast<std::size_t>(!value)*delta] = 0.;
+-            }
+-        }
+-        else{
+-            StateVector newvec; // avoid costly memory reallocations
+-            if( tmpBuff1_.capacity() >= (1UL << (N_-1)) )
+-              std::swap(tmpBuff1_, newvec);
+-            newvec.resize((1UL << (N_-1)));
+-            #pragma omp parallel for schedule(static) if(0)
+-            for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
+-                std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
+-                            delta, &newvec[i/2]);
+-            std::swap(vec_, newvec);
+-            std::swap(tmpBuff1_, newvec);
+-            if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
+-              std::swap(tmpBuff1_, tmpBuff2_);
+-
+-            for (auto& p : map_){
+-                if (p.second > pos)
+-                    p.second--;
+-            }
+-            map_.erase(id);
+-            N_--;
+-        }
+-    }
+-
+-    void measure_qubits(std::vector<unsigned> const& ids, std::vector<bool> &res){
+-        run();
+-
+-        std::vector<unsigned> positions(ids.size());
+-        for (unsigned i = 0; i < ids.size(); ++i)
+-            positions[i] = map_[ids[i]];
+-
+-        calc_type P = 0.;
+-        calc_type rnd = rng_();
+-
+-        // pick entry at random with probability |entry|^2
+-        std::size_t pick = 0;
+-        while (P < rnd && pick < vec_.size())
+-            P += std::norm(vec_[pick++]);
+-
+-        pick--;
+-        // determine result vector (boolean values for each qubit)
+-        // and create mask to detect bad entries (i.e., entries that don't agree with measurement)
+-        res = std::vector<bool>(ids.size());
+-        std::size_t mask = 0;
+-        std::size_t val = 0;
+-        for (unsigned i = 0; i < ids.size(); ++i){
+-            bool r = ((pick >> positions[i]) & 1) == 1;
+-            res[i] = r;
+-            mask |= (1UL << positions[i]);
+-            val |= (static_cast<std::size_t>(r&1) << positions[i]);
+-        }
+-        // set bad entries to 0
+-        calc_type N = 0.;
+-        #pragma omp parallel for reduction(+:N) schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); ++i){
+-            if ((i & mask) != val)
+-                vec_[i] = 0.;
+-            else
+-                N += std::norm(vec_[i]);
+-        }
+-        // re-normalize
+-        N = 1./std::sqrt(N);
+-        #pragma omp parallel for schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); ++i)
+-            vec_[i] *= N;
+-    }
+-
+-    std::vector<bool> measure_qubits_return(std::vector<unsigned> const& ids){
+-        std::vector<bool> ret;
+-        measure_qubits(ids, ret);
+-        return ret;
+-    }
+-
+-    void deallocate_qubit(unsigned id){
+-        run();
+-        assert(map_.count(id) == 1);
+-        if (!is_classical(id))
+-            throw(std::runtime_error("Error: Qubit has not been measured / uncomputed! There is most likely a bug in your code."));
+-
+-        bool value = get_classical_value(id);
+-        collapse_vector(id, value, true);
++        for (unsigned i = 0; i < N_; i++)
++            map_[i] = i;
+     }
+ 
+     template <class M>
+@@ -221,84 +73,13 @@
+             fused_gates_ = fused_gates;
+     }
+ 
+-    template <class F, class QuReg>
+-    void emulate_math(F const& f, QuReg quregs, const std::vector<unsigned>& ctrl,
+-                      bool parallelize = false){
+-        run();
+-        auto ctrlmask = get_control_mask(ctrl);
+-
+-        for (unsigned i = 0; i < quregs.size(); ++i)
+-            for (unsigned j = 0; j < quregs[i].size(); ++j)
+-                quregs[i][j] = map_[quregs[i][j]];
+-
+-        StateVector newvec; // avoid costly memory reallocations
+-        if( tmpBuff1_.capacity() >= vec_.size() )
+-          std::swap(newvec, tmpBuff1_);
+-        newvec.resize(vec_.size());
+-#pragma omp parallel for schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); i++)
+-          newvec[i] = 0;
+-
+-//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
+-        {
+-          std::vector<int> res(quregs.size());
+-          //#pragma omp for schedule(static)
+-          for (std::size_t i = 0; i < vec_.size(); ++i){
+-              if ((ctrlmask&i) == ctrlmask){
+-                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+-                      res[qr_i] = 0;
+-                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
+-                          res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
+-                  }
+-                  f(res);
+-                  auto new_i = i;
+-                  for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
+-                      for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
+-                          if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
+-                              new_i ^= (1UL << quregs[qr_i][qb_i]);
+-                      }
+-                  }
+-                  newvec[new_i] += vec_[i];
+-              }
+-              else
+-                  newvec[i] += vec_[i];
+-          }
+-        }
+-        std::swap(vec_, newvec);
+-        std::swap(tmpBuff1_, newvec);
+-    }
+-
+-    // faster version without calling python 
+-    template<class QuReg>
+-    inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+-    {
+-      emulate_math([a](std::vector<int> &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true);
+-    }
+-
+-    // faster version without calling python 
+-    template<class QuReg>
+-    inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+-    {
+-      emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true);
+-    }
+-
+-    // faster version without calling python 
+-    template<class QuReg>
+-    inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
+-    {
+-      emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true);
+-    }
+-
+     calc_type get_expectation_value(TermsDict const& td, std::vector<unsigned> const& ids){
+         run();
+         calc_type expectation = 0.;
+ 
+-        StateVector current_state; // avoid costly memory reallocations
+-        if( tmpBuff1_.capacity() >= vec_.size() )
+-          std::swap(tmpBuff1_, current_state);
+-        current_state.resize(vec_.size());
++        StateVector current_state = (StateVector)malloc(len_ *sizeof(calc_type));
+ #pragma omp parallel for schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); ++i)
++        for (std::size_t i = 0; i < len_; ++i)
+           current_state[i] = vec_[i];
+ 
+         for (auto const& term : td){
+@@ -306,81 +87,53 @@
+             apply_term(term.first, ids, {});
+             calc_type delta = 0.;
+             #pragma omp parallel for reduction(+:delta) schedule(static)
+-            for (std::size_t i = 0; i < vec_.size(); ++i){
+-                auto const a1 = std::real(current_state[i]);
+-                auto const b1 = -std::imag(current_state[i]);
+-                auto const a2 = std::real(vec_[i]);
+-                auto const b2 = std::imag(vec_[i]);
++            for (std::size_t i = 0; i < (len_ >> 1); ++i){
++                auto const a1 = current_state[2 * i];
++                auto const b1 = -current_state[2 * i + 1];
++                auto const a2 = vec_[2 * i];
++                auto const b2 = vec_[2 * i + 1];
+                 delta += a1 * a2 - b1 * b2;
+                 // reset vec_
+-                vec_[i] = current_state[i];
++                vec_[2 * i] = current_state[2 * i];
++                vec_[2 * i + 1] = current_state[2 * i + 1];
+             }
+             expectation += coefficient * delta;
+         }
+-        std::swap(current_state, tmpBuff1_);
++        if (NULL != current_state){
++            free(current_state);
++            current_state = NULL;
++        }
+         return expectation;
+     }
+ 
+     void apply_qubit_operator(ComplexTermsDict const& td, std::vector<unsigned> const& ids){
+         run();
+-        StateVector new_state, current_state; // avoid costly memory reallocations
+-        if( tmpBuff1_.capacity() >= vec_.size() )
+-          std::swap(tmpBuff1_, new_state);
+-        if( tmpBuff2_.capacity() >= vec_.size() )
+-          std::swap(tmpBuff2_, current_state);
+-        new_state.resize(vec_.size());
+-        current_state.resize(vec_.size());
++        StateVector new_state = (StateVector)calloc(len_, sizeof(calc_type));
++        StateVector current_state = (StateVector)malloc(len_ * sizeof(calc_type));
+ #pragma omp parallel for schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); ++i){
+-          new_state[i] = 0;
++        for (std::size_t i = 0; i < len_; ++i){
+           current_state[i] = vec_[i];
+         }
+         for (auto const& term : td){
+             auto const& coefficient = term.second;
+             apply_term(term.first, ids, {});
+             #pragma omp parallel for schedule(static)
+-            for (std::size_t i = 0; i < vec_.size(); ++i){
+-                new_state[i] += coefficient * vec_[i];
+-                vec_[i] = current_state[i];
++            for (std::size_t i = 0; i < (len_ >> 1); ++i){
++                new_state[2 * i] += coefficient.real() * vec_[2 * i] - coefficient.imag() * vec_[2 * i + 1];
++                new_state[2 * i + 1] += coefficient.real() * vec_[2 * i + 1] + coefficient.imag() * vec_[2 * i];
++                vec_[2 * i] = current_state[2 * i];
++                vec_[2 * i + 1] = current_state[2 * i + 1];
+             }
+         }
+-        std::swap(vec_, new_state);
+-        std::swap(tmpBuff1_, new_state);
+-        std::swap(tmpBuff2_, current_state);
+-    }
+-
+-    calc_type get_probability(std::vector<bool> const& bit_string,
+-                              std::vector<unsigned> const& ids){
+-        run();
+-        if (!check_ids(ids))
+-            throw(std::runtime_error("get_probability(): Unknown qubit id. Please make sure you have called eng.flush()."));
+-        std::size_t mask = 0, bit_str = 0;
+-        for (unsigned i = 0; i < ids.size(); ++i){
+-            mask |= 1UL << map_[ids[i]];
+-            bit_str |= (bit_string[i]?1UL:0UL) << map_[ids[i]];
+-        }
+-        calc_type probability = 0.;
+-        #pragma omp parallel for reduction(+:probability) schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); ++i)
+-            if ((i & mask) == bit_str)
+-                probability += std::norm(vec_[i]);
+-        return probability;
+-    }
+-
+-    complex_type const& get_amplitude(std::vector<bool> const& bit_string,
+-                                      std::vector<unsigned> const& ids){
+-        run();
+-        std::size_t chk = 0;
+-        std::size_t index = 0;
+-        for (unsigned i = 0; i < ids.size(); ++i){
+-            if (map_.count(ids[i]) == 0)
+-                break;
+-            chk |= 1UL << map_[ids[i]];
+-            index |= (bit_string[i]?1UL:0UL) << map_[ids[i]];
++        if (NULL != vec_)
++            free(vec_);
++        vec_ = new_state;
++        if (NULL != new_state)
++            new_state = NULL;
++        if (NULL != current_state){
++            free(current_state);
++            current_state = NULL;
+         }
+-        if (chk + 1 != vec_.size())
+-            throw(std::runtime_error("The second argument to get_amplitude() must be a permutation of all allocated qubits. Please make sure you have called eng.flush()."));
+-        return vec_[index];
+     }
+ 
+     void emulate_time_evolution(TermsDict const& tdict, calc_type const& time,
+@@ -400,87 +153,71 @@
+         }
+         unsigned s = std::abs(time) * op_nrm + 1.;
+         complex_type correction = std::exp(-time * I * tr / (double)s);
+-        auto output_state = vec_;
++        auto output_state = copy(vec_, len_);
+         auto ctrlmask = get_control_mask(ctrl);
+         for (unsigned i = 0; i < s; ++i){
+             calc_type nrm_change = 1.;
+             for (unsigned k = 0; nrm_change > 1.e-12; ++k){
+                 auto coeff = (-time * I) / double(s * (k + 1));
+-                auto current_state = vec_;
+-                auto update = StateVector(vec_.size(), 0.);
++                auto current_state = copy(vec_, len_);
++                auto update = (StateVector)calloc(len_, sizeof(calc_type));
+                 for (auto const& tup : td){
+                     apply_term(tup.first, ids, {});
+                     #pragma omp parallel for schedule(static)
+-                    for (std::size_t j = 0; j < vec_.size(); ++j){
++                    for (std::size_t j = 0; j < len_; ++j){
+                         update[j] += vec_[j] * tup.second;
+                         vec_[j] = current_state[j];
+                     }
+                 }
+                 nrm_change = 0.;
+                 #pragma omp parallel for reduction(+:nrm_change) schedule(static)
+-                for (std::size_t j = 0; j < vec_.size(); ++j){
+-                    update[j] *= coeff;
+-                    vec_[j] = update[j];
++                for (std::size_t j = 0; j < (len_ >> 1); ++j){
++                    complex_type tmp(update[2 * j], update[2 * j + 1]);
++                    tmp *= coeff;
++                    update[2 * j] *= std::real(tmp);
++                    update[2 * j + 1] *= std::imag(tmp);
++                    vec_[2 * j] = update[2 * j];
++                    vec_[2 * j + 1] = update[2 * j + 1];
+                     if ((j & ctrlmask) == ctrlmask){
+-                        output_state[j] += update[j];
+-                        nrm_change += std::norm(update[j]);
++                        output_state[2 * j] += update[2 * j];
++                        output_state[2 * j + 1] += update[2 * j + 1];
++                        nrm_change += std::sqrt(update[2 * j] * update[2 * j] + update[2 * j + 1] * update[2 * j + 1]);
+                     }
+                 }
+                 nrm_change = std::sqrt(nrm_change);
++                if (NULL != current_state){
++                    free(current_state);
++                    current_state = NULL;
++                }
++                if (NULL != update){
++                    free(update);
++                    update = NULL;
++                }
+             }
+             #pragma omp parallel for schedule(static)
+-            for (std::size_t j = 0; j < vec_.size(); ++j){
+-                if ((j & ctrlmask) == ctrlmask)
+-                    output_state[j] *= correction;
+-                vec_[j] = output_state[j];
++            for (std::size_t j = 0; j < (len_ >>1); ++j){
++                if ((j & ctrlmask) == ctrlmask){
++                    complex_type tmp(output_state[2 * j], output_state[2 * j + 1]);
++                    tmp *= correction;
++                    output_state[2 * j] = std::real(tmp);
++                    output_state[2 * j + 1] =std::imag(tmp);
++                }
++                vec_[2 * j] = output_state[2 * j];
++                vec_[2 * j + 1] = output_state[2 * j + 1];
+             }
+         }
++        if (NULL != output_state){
++            free(output_state);
++            output_state = NULL;
++        }
+     }
+ 
+     void set_wavefunction(StateVector const& wavefunction, std::vector<unsigned> const& ordering){
+         run();
+-        // make sure there are 2^n amplitudes for n qubits
+-        assert(wavefunction.size() == (1UL << ordering.size()));
+-        // check that all qubits have been allocated previously
+-        if (map_.size() != ordering.size() || !check_ids(ordering))
+-            throw(std::runtime_error("set_wavefunction(): Invalid mapping provided. Please make sure all qubits have been allocated previously (call eng.flush())."));
+-
+-        // set mapping and wavefunction
+-        for (unsigned i = 0; i < ordering.size(); ++i)
+-            map_[ordering[i]] = i;
+-        #pragma omp parallel for schedule(static)
+-        for (std::size_t i = 0; i < wavefunction.size(); ++i)
+-            vec_[i] = wavefunction[i];
+-    }
+-
+-    void collapse_wavefunction(std::vector<unsigned> const& ids, std::vector<bool> const& values){
+-        run();
+-        assert(ids.size() == values.size());
+-        if (!check_ids(ids))
+-            throw(std::runtime_error("collapse_wavefunction(): Unknown qubit id(s) provided. Try calling eng.flush() before invoking this function."));
+-        std::size_t mask = 0, val = 0;
+-        for (unsigned i = 0; i < ids.size(); ++i){
+-            mask |= (1UL << map_[ids[i]]);
+-            val |= ((values[i]?1UL:0UL) << map_[ids[i]]);
+-        }
+-        // set bad entries to 0 and compute probability of outcome to renormalize
+-        calc_type N = 0.;
+-        #pragma omp parallel for reduction(+:N) schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); ++i){
+-            if ((i & mask) == val)
+-                N += std::norm(vec_[i]);
+-        }
+-        if (N < 1.e-12)
+-            throw(std::runtime_error("collapse_wavefunction(): Invalid collapse! Probability is ~0."));
+-        // re-normalize (if possible)
+-        N = 1./std::sqrt(N);
+-        #pragma omp parallel for schedule(static)
+-        for (std::size_t i = 0; i < vec_.size(); ++i){
+-            if ((i & mask) != val)
+-                vec_[i] = 0.;
+-            else
+-                vec_[i] *= N;
++        if (NULL != vec_){
++          free(vec_);
+         }
++        vec_ = copy(wavefunction, len_);
+     }
+ 
+     void run(){
+@@ -500,23 +237,23 @@
+         switch (ids.size()){
+             case 1:
+                 #pragma omp parallel
+-                kernel(vec_, ids[0], m, ctrlmask);
++                kernel(vec_, ids[0], m, ctrlmask, len_ >> 1);
+                 break;
+             case 2:
+                 #pragma omp parallel
+-                kernel(vec_, ids[1], ids[0], m, ctrlmask);
++                kernel(vec_, ids[1], ids[0], m, ctrlmask, len_ >> 1);
+                 break;
+             case 3:
+                 #pragma omp parallel
+-                kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask);
++                kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1);
+                 break;
+             case 4:
+                 #pragma omp parallel
+-                kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
++                kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1);
+                 break;
+             case 5:
+                 #pragma omp parallel
+-                kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
++                kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1);
+                 break;
+             default:
+                 throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
+@@ -525,12 +262,27 @@
+         fused_gates_ = Fusion();
+     }
+ 
+-    std::tuple<Map, StateVector&> cheat(){
++    std::vector<complex_type> cheat(){
+         run();
+-        return make_tuple(map_, std::ref(vec_));
++        std::vector<complex_type> result;
++        for (unsigned int i = 0; i < (len_ >> 1); i++){
++            result.push_back({vec_[2 * i], vec_[2 * i + 1]});
++        }
++        return result;
++    }
++
++    inline StateVector copy(StateVector source, unsigned len){
++    StateVector result = (StateVector)malloc(len * sizeof(calc_type));
++#pragma omp parallel for schedule(static)
++    for (std::size_t i = 0; i < len; ++i) {
++      result[i] = source[i];
+     }
++    return result;
++}
+ 
+     ~Simulator(){
++        if (NULL != vec_)
++            free(vec_);
+     }
+ 
+ private:
+@@ -562,18 +314,13 @@
      }
  
      unsigned N_; // #qubits
 -    StateVector vec_;
++    unsigned len_;
      Map map_;
      Fusion fused_gates_;
      unsigned fusion_qubits_min_, fusion_qubits_max_;
-@@ -570,10 +569,8 @@ private:
+     RndEngine rnd_eng_;
      std::function<double()> rng_;
- 
-     // large array buffers to avoid costly reallocations
+-
+-    // large array buffers to avoid costly reallocations
 -    static StateVector tmpBuff1_, tmpBuff2_;
-+    StateVector tmpBuff1_, tmpBuff2_;
  };
  
 -Simulator::StateVector Simulator::tmpBuff1_;