From 800dd5abaa732336b4e8c7db7a3867b6332aaa20 Mon Sep 17 00:00:00 2001 From: donghufeng Date: Fri, 30 Apr 2021 17:59:12 +0800 Subject: [PATCH] clean code, quantum state to array, projector --- .../cpu/quantum/evolution_cpu_kernel.cc | 16 +- .../cpu/quantum/evolution_cpu_kernel.h | 1 - .../cpu/quantum/pqc_cpu_kernel.cc | 30 +- .../cpu/quantum/pqc_cpu_kernel.h | 6 +- .../quantum_simulator/gates/basic_gates.h | 1 + .../quantum/quantum_simulator/gates/gates.h | 1 - .../gates/intrinsic_one_para_gate.h | 1 + .../quantum_simulator/gates/parameter_gate.h | 1 + .../quantum_simulator/parameter_resolver.h | 1 - .../quantum_simulator/pqc_simulator.cc | 60 +- .../quantum/quantum_simulator/pqc_simulator.h | 7 +- .../quantum/quantum_simulator/projector.cc | 41 + .../cpu/quantum/quantum_simulator/projector.h | 40 + .../cpu/quantum/quantum_simulator/sparse.cc | 1 - .../cpu/quantum/quantum_simulator/sparse.h | 4 +- .../quantum/quantum_simulator/transformer.cc | 10 + .../quantum/quantum_simulator/transformer.h | 3 + .../cpu/quantum/quantum_simulator/utils.cc | 15 +- .../cpu/quantum/quantum_simulator/utils.h | 2 + third_party/patch/projectq/projectq.patch001 | 948 +++++++++++++++++- 20 files changed, 1117 insertions(+), 72 deletions(-) create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc create mode 100644 mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc index 0b434baec4e..d5046d5490b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.cc @@ -72,22 +72,22 @@ bool EvolutionCPUKernel::Launch(const std::vector &inputs, auto output = reinterpret_cast(outputs[0]->addr); MS_EXCEPTION_IF_NULL(param_data); MS_EXCEPTION_IF_NULL(output); - sim_ = mindquantum::PQCSimulator(1, n_qubits_); + auto sim = mindquantum::PQCSimulator(1, n_qubits_); mindquantum::ParameterResolver pr; for (size_t i = 0; i < param_names_.size(); i++) { pr.SetData(param_names_.at(i), param_data[i]); } - sim_.Evolution(circ_, pr); + sim.Evolution(circ_, pr); if (hams_.size() == 1) { - sim_.ApplyHamiltonian(hams_[0]); + sim.ApplyHamiltonian(hams_[0]); } - if (state_len_ != sim_.vec_.size()) { + if (state_len_ != (1UL << n_qubits_)) { MS_LOG(EXCEPTION) << "simulation error number of quantum qubit!"; } - size_t poi = 0; - for (auto &v : sim_.vec_) { - output[poi++] = v.real(); - output[poi++] = v.imag(); + auto size = state_len_ * 2; +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < size; i++) { + output[i] = sim.vec_[i]; } return true; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h index 5499ae60340..b17e60d93f0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/evolution_cpu_kernel.h @@ -41,7 +41,6 @@ class EvolutionCPUKernel : public CPUKernel { private: int64_t n_qubits_; size_t state_len_; - mindquantum::PQCSimulator sim_; mindquantum::BasicCircuit circ_; mindquantum::transformer::Hamiltonians hams_; mindquantum::transformer::NamesType param_names_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc index 6b75bf4756f..55b9e551388 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.cc @@ -36,10 +36,12 @@ struct ComputeParam { mindquantum::BasicCircuit *circ_cp; mindquantum::BasicCircuit *herm_circ_cp; mindquantum::transformer::Hamiltonians *hams_cp; + mindquantum::transformer::Projectors *projectors_cp; mindquantum::transformer::NamesType *encoder_params_names_cp; mindquantum::transformer::NamesType *ansatz_params_names_cp; std::vector>> *tmp_sims_cp; bool dummy_circuit_cp{false}; + bool is_projector_cp{false}; size_t result_len_cp{0}; size_t encoder_g_len_cp{0}; size_t ansatz_g_len_cp{0}; @@ -60,6 +62,8 @@ void ComputerForwardBackward(const std::shared_ptr &input_params, auto circ = input_params->circ_cp; auto herm_circ = input_params->herm_circ_cp; auto hams = input_params->hams_cp; + auto projectors = input_params->projectors_cp; + auto is_projector = input_params->is_projector_cp; auto encoder_params_names = input_params->encoder_params_names_cp; auto ansatz_params_names = input_params->ansatz_params_names_cp; auto tmp_sims = input_params->tmp_sims_cp; @@ -91,6 +95,8 @@ void ComputerForwardBackward(const std::shared_ptr &input_params, calc_gradient_param->circuit_cp = circ; calc_gradient_param->circuit_hermitian_cp = herm_circ; calc_gradient_param->hamiltonians_cp = hams; + calc_gradient_param->projectors_cp = projectors; + calc_gradient_param->is_projector_cp = is_projector; calc_gradient_param->paras_cp = ≺ calc_gradient_param->encoder_params_names_cp = encoder_params_names; calc_gradient_param->ansatz_params_names_cp = ansatz_params_names; @@ -142,6 +148,8 @@ void PQCCPUKernel::InitPQCStructure(const CNodePtr &kernel_node) { AnfAlgo::GetNodeAttr(kernel_node, mindquantum::kHamsPauliWord); hams_pauli_qubit_ = AnfAlgo::GetNodeAttr(kernel_node, mindquantum::kHamsPauliQubit); + projector_strs_ = AnfAlgo::GetNodeAttr(kernel_node, mindquantum::kProjectors); + is_projector_ = AnfAlgo::GetNodeAttr(kernel_node, mindquantum::kIsProjector); } void PQCCPUKernel::InitKernel(const CNodePtr &kernel_node) { @@ -174,18 +182,12 @@ void PQCCPUKernel::InitKernel(const CNodePtr &kernel_node) { herm_circ_ = circs[1]; hams_ = mindquantum::transformer::HamiltoniansTransfor(hams_pauli_coeff_, hams_pauli_word_, hams_pauli_qubit_); + projectors_ = mindquantum::transformer::ProjectorsTransfor(projector_strs_); n_threads_user_ = std::min(n_threads_user_, common::ThreadPool::GetInstance().GetSyncRunThreadNum()); if (n_samples_ < n_threads_user_) { n_threads_user_ = n_samples_; } - for (size_t i = 0; i < n_threads_user_; i++) { - tmp_sims_.push_back({}); - for (size_t j = 0; j < 4; j++) { - auto tmp = std::make_shared(1, n_qubits_); - tmp_sims_.back().push_back(tmp); - } - } } bool PQCCPUKernel::Launch(const std::vector &inputs, @@ -205,6 +207,16 @@ bool PQCCPUKernel::Launch(const std::vector &inputs, MS_EXCEPTION_IF_NULL(gradient_encoder); MS_EXCEPTION_IF_NULL(gradient_ansatz); + std::vector>> tmp_sims( + n_threads_user_, std::vector>(4, nullptr)); +#pragma omp parallel for collapse(2) schedule(static) + for (size_t i = 0; i < n_threads_user_; i++) { + for (size_t j = 0; j < 4; j++) { + auto tmp = std::make_shared(1, n_qubits_); + tmp_sims[i][j] = tmp; + } + } + std::vector tasks; std::vector> thread_params; tasks.reserve(n_threads_user_); @@ -222,9 +234,11 @@ bool PQCCPUKernel::Launch(const std::vector &inputs, params->circ_cp = &circ_; params->herm_circ_cp = &herm_circ_; params->hams_cp = &hams_; + params->projectors_cp = &projectors_; + params->is_projector_cp = is_projector_; params->encoder_params_names_cp = &encoder_params_names_; params->ansatz_params_names_cp = &ansatz_params_names_; - params->tmp_sims_cp = &tmp_sims_; + params->tmp_sims_cp = &tmp_sims; params->dummy_circuit_cp = dummy_circuit_; params->result_len_cp = result_len_; params->encoder_g_len_cp = encoder_g_len_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h index 41e49ba3d3a..aa5320c602d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/pqc_cpu_kernel.h @@ -52,7 +52,7 @@ class PQCCPUKernel : public CPUKernel { mindquantum::BasicCircuit circ_; mindquantum::BasicCircuit herm_circ_; mindquantum::transformer::Hamiltonians hams_; - std::vector>> tmp_sims_; + mindquantum::transformer::Projectors projectors_; // parameters mindquantum::transformer::NamesType encoder_params_names_; @@ -71,6 +71,10 @@ class PQCCPUKernel : public CPUKernel { mindquantum::transformer::PaulisCoeffsType hams_pauli_coeff_; mindquantum::transformer::PaulisWordsType hams_pauli_word_; mindquantum::transformer::PaulisQubitsType hams_pauli_qubit_; + + // subspace measurement ops + mindquantum::transformer::NamesType projector_strs_; + bool is_projector_; }; MS_REG_CPU_KERNEL(PQC, diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h index bec1f967748..e36022a9a62 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/basic_gates.h @@ -42,6 +42,7 @@ class BasicGate { bool IsParameterGate(); Indexes GetObjQubits(); Indexes GetCtrlQubits(); + virtual ~BasicGate() {} }; } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h index 5d5b69d4963..ac0849f7d93 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/gates.h @@ -77,7 +77,6 @@ class ZZGate : public IntrinsicOneParaGate { public: ZZGate(const Indexes &, const Indexes &, const ParameterResolver &); }; - } // namespace mindquantum } // namespace mindspore #endif // MINDQUANTUM_ENGINE_GATES_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h index c998e56d601..f58ac888cd9 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/intrinsic_one_para_gate.h @@ -32,6 +32,7 @@ class IntrinsicOneParaGate : public ParameterGate { CalcType LinearCombination(const ParameterResolver &, const ParameterResolver &); Matrix GetMatrix(const ParameterResolver &) override; Matrix GetDiffMatrix(const ParameterResolver &) override; + virtual ~IntrinsicOneParaGate() {} }; } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h index 321816bba47..a71b3fcfd06 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/gates/parameter_gate.h @@ -26,6 +26,7 @@ class ParameterGate : public BasicGate { public: ParameterGate(); ParameterGate(const std::string &, const Indexes &, const Indexes &, const ParameterResolver &); + virtual ~ParameterGate() {} }; } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h index 08569ba9ae7..abe37aee6d4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h @@ -23,7 +23,6 @@ namespace mindspore { namespace mindquantum { - class ParameterResolver { public: ParameterResolver(); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc index 16fe9c88278..8f6a19da808 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.cc @@ -20,20 +20,18 @@ namespace mindspore { namespace mindquantum { -PQCSimulator::PQCSimulator() : Simulator(1), n_qubits_(1) { - PQCSimulator::AllocateAll(); +PQCSimulator::PQCSimulator() : Simulator(1, 1), n_qubits_(1) { for (Index i = 0; i < n_qubits_; i++) { ordering_.push_back(i); } - len_ = (1UL << n_qubits_); + len_ = (1UL << (n_qubits_ + 1)); } -PQCSimulator::PQCSimulator(Index seed = 1, Index N = 1) : Simulator(seed), n_qubits_(N) { - PQCSimulator::AllocateAll(); +PQCSimulator::PQCSimulator(Index seed = 1, Index N = 1) : Simulator(seed, N), n_qubits_(N) { for (Index i = 0; i < n_qubits_; i++) { ordering_.push_back(i); } - len_ = (1UL << n_qubits_); + len_ = (1UL << (n_qubits_ + 1)); } void PQCSimulator::ApplyGate(std::shared_ptr g, const ParameterResolver ¶s, bool diff) { @@ -65,14 +63,15 @@ void PQCSimulator::Evolution(BasicCircuit const &circuit, ParameterResolver cons PQCSimulator::ApplyBlocks(circuit.GetGateBlocks(), paras); } -CalcType PQCSimulator::Measure(Index mask1, Index mask2, bool apply) { +CalcType PQCSimulator::Measure(const Indexes &masks, bool apply) { CalcType out = 0; #pragma omp parallel for reduction(+ : out) schedule(static) for (unsigned i = 0; i < (1UL << n_qubits_); i++) { - if (((i & mask1) == mask1) && ((i | mask2) == mask2)) { - out = out + std::real(vec_[i]) * std::real(vec_[i]) + std::imag(vec_[i]) * std::imag(vec_[i]); + if (((i & masks[0]) == masks[0]) && ((i | masks[1]) == masks[1])) { + out = out + vec_[2 * i] * vec_[2 * i] + vec_[2 * i + 1] * vec_[2 * i + 1]; } else if (apply) { - vec_[i] = 0; + vec_[2 * i] = 0; + vec_[2 * i + 1] = 0; } } return out; @@ -85,6 +84,8 @@ std::vector> PQCSimulator::CalcGradient(const std::shared_ptr auto circuit = input_params->circuit_cp; auto circuit_hermitian = input_params->circuit_hermitian_cp; auto hamiltonians = input_params->hamiltonians_cp; + auto projectors = input_params->projectors_cp; + auto is_projector = input_params->is_projector_cp; auto paras = input_params->paras_cp; auto encoder_params_names = input_params->encoder_params_names_cp; auto ansatz_params_names = input_params->ansatz_params_names_cp; @@ -102,16 +103,27 @@ std::vector> PQCSimulator::CalcGradient(const std::shared_ptr MS_LOG(EXCEPTION) << "Empty quantum circuit!"; } unsigned len = circ_gate_blocks.at(0).size(); - std::vector grad(hamiltonians->size() * poi.size(), 0); - std::vector e0(hamiltonians->size(), 0); + size_t mea_size = 0; + if (is_projector) { + mea_size = projectors->size(); + } else { + mea_size = hamiltonians->size(); + } + std::vector grad(mea_size * poi.size(), 0); + std::vector e0(mea_size, 0); // #pragma omp parallel for - for (size_t h_index = 0; h_index < hamiltonians->size(); h_index++) { - auto &hamiltonian = hamiltonians->at(h_index); + for (size_t h_index = 0; h_index < mea_size; h_index++) { s_right.set_wavefunction(vec_, ordering_); s_left.set_wavefunction(s_right.vec_, ordering_); - s_left.apply_qubit_operator(hamiltonian.GetCTD(), ordering_); - e0[h_index] = static_cast(ComplexInnerProduct(vec_, s_left.vec_, len_).real()); + if (is_projector) { + auto &proj = projectors->at(h_index); + e0[h_index] = s_left.Measure(proj.GetMasks(), true); + } else { + auto &hamiltonian = hamiltonians->at(h_index); + s_left.apply_qubit_operator(hamiltonian.GetCTD(), ordering_); + e0[h_index] = static_cast(ComplexInnerProduct(vec_, s_left.vec_, len_).real()); + } if (dummy_circuit_) { continue; } @@ -144,7 +156,7 @@ std::vector> PQCSimulator::CalcGradient(const std::shared_ptr } std::vector grad1; std::vector grad2; - for (size_t i = 0; i < hamiltonians->size(); i++) { + for (size_t i = 0; i < mea_size; i++) { for (size_t j = 0; j < poi.size(); j++) { if (j < encoder_params_names->size()) { grad1.push_back(grad[i * poi.size() + j]); @@ -156,16 +168,6 @@ std::vector> PQCSimulator::CalcGradient(const std::shared_ptr return {e0, grad1, grad2}; } -void PQCSimulator::AllocateAll() { - for (unsigned i = 0; i < n_qubits_; i++) { - Simulator::allocate_qubit(i); - } -} -void PQCSimulator::DeallocateAll() { - for (unsigned i = 0; i < n_qubits_; i++) { - Simulator::deallocate_qubit(i); - } -} void PQCSimulator::SetState(const StateVector &wavefunction) { Simulator::set_wavefunction(wavefunction, ordering_); } std::size_t PQCSimulator::GetControlMask(Indexes const &ctrls) { @@ -184,9 +186,9 @@ CalcType PQCSimulator::GetExpectationValue(const Hamiltonian &ham) { void PQCSimulator::SetZeroState() { #pragma omp parallel for schedule(static) for (size_t i = 0; i < len_; i++) { - vec_[i] = {0, 0}; + vec_[i] = 0; } - vec_[0] = {1, 0}; + vec_[0] = 1; } } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h index 134b86a03a8..ff65621e1a0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/pqc_simulator.h @@ -28,6 +28,7 @@ #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/hamiltonian.h" #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h" #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h" +#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h" namespace mindspore { namespace mindquantum { @@ -35,10 +36,12 @@ struct CalcGradientParam { BasicCircuit *circuit_cp; BasicCircuit *circuit_hermitian_cp; transformer::Hamiltonians *hamiltonians_cp; + transformer::Projectors *projectors_cp; ParameterResolver *paras_cp; transformer::NamesType *encoder_params_names_cp; transformer::NamesType *ansatz_params_names_cp; bool dummy_circuit_cp{false}; + bool is_projector_cp{false}; }; class PQCSimulator : public Simulator { private: @@ -53,13 +56,11 @@ class PQCSimulator : public Simulator { void ApplyBlock(const GateBlock &, const ParameterResolver &); void ApplyBlocks(const GateBlocks &, const ParameterResolver &); void Evolution(const BasicCircuit &, const ParameterResolver &); - CalcType Measure(Index, Index, bool); + CalcType Measure(const Indexes &, bool); void ApplyHamiltonian(const Hamiltonian &); CalcType GetExpectationValue(const Hamiltonian &); std::vector> CalcGradient(const std::shared_ptr &, PQCSimulator &, PQCSimulator &, PQCSimulator &); - void AllocateAll(); - void DeallocateAll(); void SetState(const StateVector &); std::size_t GetControlMask(Indexes const &); void SetZeroState(); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc new file mode 100644 index 00000000000..6379c7da01f --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.cc @@ -0,0 +1,41 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h" + +namespace mindspore { +namespace mindquantum { +Projector::Projector() {} +Projector::Projector(const NameType &proj_str) : proj_str_(proj_str), n_qubits_(proj_str.length()) {} +void Projector::HandleMask() { + mask1_ = 0; + mask2_ = 0; + for (auto i : proj_str_) { + if (i == '1') { + mask1_ = mask1_ * 2 + 1; + } else { + mask1_ = mask1_ * 2; + } + if (i == '0') { + mask2_ = mask2_ * 2; + } else { + mask2_ = mask2_ * 2 + 1; + } + } +} +Indexes Projector::GetMasks() { return {mask1_, mask2_}; } +} // namespace mindquantum +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h new file mode 100644 index 00000000000..ff019e47745 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h @@ -0,0 +1,40 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDQUANTUM_ENGINE_PROJECTOR_H_ +#define MINDQUANTUM_ENGINE_PROJECTOR_H_ +#include +#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h" + +namespace mindspore { +namespace mindquantum { +using NameType = std::string; +class Projector { + public: + Projector(); + explicit Projector(const NameType &); + void HandleMask(); + Indexes GetMasks(); + + private: + NameType proj_str_; + Index n_qubits_; + Index mask1_; + Index mask2_; +}; +} // namespace mindquantum +} // namespace mindspore +#endif // MINDQUANTUM_ENGINE_PROJECTOR_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc index d8e5635171b..4f0f7bd6b61 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.cc @@ -118,7 +118,6 @@ SparseMatrix GoodTerm2Sparse(const GoodTerm >, Index n_qubits) { out = KroneckerProductSparse(IdentitySparse(n_qubits - gt.first.second.second - 1), out).eval(); return out; } - } // namespace sparse } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h index 15f8cc4abce..e20c18320f2 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/sparse.h @@ -33,9 +33,9 @@ using PauliWord = std::pair; using PauliTerm = std::pair, int>; using GoodTerm = std::pair>, std::vector>; using GoodHamilt = std::vector; -typedef Eigen::VectorXcd EigenComplexVector; +using EigenComplexVector = Eigen::VectorXcd; using Eigen::VectorXi; -typedef Eigen::SparseMatrix SparseMatrix; +using SparseMatrix = Eigen::SparseMatrix; using DequeSparseHam = std::deque; using KroneckerProductSparse = Eigen::KroneckerProductSparse; SparseMatrix BasiGateSparse(char); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc index 76419d9e3ee..adaac0fd76a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.cc @@ -109,6 +109,16 @@ Hamiltonians HamiltoniansTransfor(const PaulisCoeffsType &paulis_coeffs, const P } return hams; } + +Projectors ProjectorsTransfor(const NamesType &proj_strs) { + Projectors projs; + for (Index n = 0; n < proj_strs.size(); n++) { + Projector proj = Projector(proj_strs[n]); + proj.HandleMask(); + projs.push_back(proj); + } + return projs; +} } // namespace transformer } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h index 43d334a6ac0..c74947fe302 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/transformer.h @@ -23,6 +23,7 @@ #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h" #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/parameter_resolver.h" #include "backend/kernel_compiler/cpu/quantum/quantum_simulator/hamiltonian.h" +#include "backend/kernel_compiler/cpu/quantum/quantum_simulator/projector.h" namespace mindspore { namespace mindquantum { @@ -49,6 +50,7 @@ using PauliQubitType = std::vector; using PauliQubitsType = std::vector; using PaulisQubitsType = std::vector; using Hamiltonians = std::vector; +using Projectors = std::vector; Hamiltonians HamiltoniansTransfor(const PaulisCoeffsType &, const PaulisWordsType &, const PaulisQubitsType &); @@ -59,6 +61,7 @@ std::vector CircuitTransfor(const NamesType &, const ComplexMatrix Matrix MatrixConverter(const MatrixType &, const MatrixType &, bool); ParameterResolver ParameterResolverConverter(const ParaNameType &, const CoeffType &, const RequireType &, bool); +Projectors ProjectorsTransfor(const NamesType &); } // namespace transformer } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc index 2b3c03928e7..d16a17be9ec 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.cc @@ -21,10 +21,11 @@ namespace mindquantum { ComplexType ComplexInnerProduct(const Simulator::StateVector &v1, const Simulator::StateVector &v2, unsigned len) { CalcType real_part = 0; CalcType imag_part = 0; + auto size = len / 2; #pragma omp parallel for reduction(+ : real_part, imag_part) - for (Index i = 0; i < len; i++) { - real_part += v1[i].real() * v2[i].real() + v1[i].imag() * v2[i].imag(); - imag_part += v1[i].real() * v2[i].imag() - v1[i].imag() * v2[i].real(); + for (Index i = 0; i < size; i++) { + real_part += v1[2 * i] * v2[2 * i] + v1[2 * i + 1] * v2[2 * i + 1]; + imag_part += v1[2 * i] * v2[2 * i + 1] - v1[2 * i + 1] * v2[2 * i]; } ComplexType result = {real_part, imag_part}; @@ -35,16 +36,16 @@ ComplexType ComplexInnerProductWithControl(const Simulator::StateVector &v1, con Index len, std::size_t ctrlmask) { CalcType real_part = 0; CalcType imag_part = 0; + auto size = len / 2; #pragma omp parallel for reduction(+ : real_part, imag_part) - for (std::size_t i = 0; i < len; i++) { + for (std::size_t i = 0; i < size; i++) { if ((i & ctrlmask) == ctrlmask) { - real_part += v1[i].real() * v2[i].real() + v1[i].imag() * v2[i].imag(); - imag_part += v1[i].real() * v2[i].imag() - v1[i].imag() * v2[i].real(); + real_part += v1[2 * i] * v2[2 * i] + v1[2 * i + 1] * v2[2 * i + 1]; + imag_part += v1[2 * i] * v2[2 * i + 1] - v1[2 * i + 1] * v2[2 * i]; } } ComplexType result = {real_part, imag_part}; return result; } - } // namespace mindquantum } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h index 326d32afa78..24930a5eda1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/quantum/quantum_simulator/utils.h @@ -52,6 +52,8 @@ const char kGateRequiresGrad[] = "gate_requires_grad"; const char kHamsPauliCoeff[] = "hams_pauli_coeff"; const char kHamsPauliWord[] = "hams_pauli_word"; const char kHamsPauliQubit[] = "hams_pauli_qubit"; +const char kIsProjector[] = "is_projector"; +const char kProjectors[] = "projectors"; } // namespace mindquantum } // namespace mindspore #endif // MINDQUANTUM_ENGINE_UTILS_H_ diff --git a/third_party/patch/projectq/projectq.patch001 b/third_party/patch/projectq/projectq.patch001 index e9f95b57c47..89c810c84ce 100644 --- a/third_party/patch/projectq/projectq.patch001 +++ b/third_party/patch/projectq/projectq.patch001 @@ -1,38 +1,966 @@ +diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp +--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp 2020-06-05 21:07:57.000000000 +0800 ++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp 2021-04-19 18:04:05.541802882 +0800 +@@ -17,18 +17,18 @@ + { + __m256d v[2]; + +- v[0] = load2(&psi[I]); +- v[1] = load2(&psi[I + d0]); ++ v[0] = load2(psi + 2 * I); ++ v[1] = load2(psi + 2 * (I + d0)); + +- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1]))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1]))); + + } + + // bit indices id[.] are given from high to low (e.g. control first for CNOT) + template +-void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask) ++void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) + { +- std::size_t n = psi.size(); ++ std::size_t n = len; + std::size_t d0 = 1UL << id0; + + __m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1])}; +diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp +--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp 2020-06-05 21:07:57.000000000 +0800 ++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp 2021-04-19 18:04:05.541802882 +0800 +@@ -17,21 +17,21 @@ + { + __m256d v[4]; + +- v[0] = load2(&psi[I]); +- v[1] = load2(&psi[I + d0]); +- v[2] = load2(&psi[I + d1]); +- v[3] = load2(&psi[I + d0 + d1]); ++ v[0] = load2(psi + 2 * I); ++ v[1] = load2(psi + 2 * (I + d0)); ++ v[2] = load2(psi + 2 * (I + d1)); ++ v[3] = load2(psi + 2 * (I + d0 + d1)); + +- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3]))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7]))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3]))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7]))))); + + } + + // bit indices id[.] are given from high to low (e.g. control first for CNOT) + template +-void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) ++void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) + { +- std::size_t n = psi.size(); ++ std::size_t n = len; + std::size_t d0 = 1UL << id0; + std::size_t d1 = 1UL << id1; + +diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp +--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp 2020-06-05 21:07:57.000000000 +0800 ++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp 2021-04-19 18:04:05.541802882 +0800 +@@ -17,10 +17,10 @@ + { + __m256d v[4]; + +- v[0] = load2(&psi[I]); +- v[1] = load2(&psi[I + d0]); +- v[2] = load2(&psi[I + d1]); +- v[3] = load2(&psi[I + d0 + d1]); ++ v[0] = load2(psi + 2 * I); ++ v[1] = load2(psi + 2 * (I + d0)); ++ v[2] = load2(psi + 2 * (I + d1)); ++ v[3] = load2(psi + 2 * (I + d0 + d1)); + + __m256d tmp[4]; + +@@ -29,23 +29,23 @@ + tmp[2] = add(mul(v[0], m[8], mt[8]), add(mul(v[1], m[9], mt[9]), add(mul(v[2], m[10], mt[10]), mul(v[3], m[11], mt[11])))); + tmp[3] = add(mul(v[0], m[12], mt[12]), add(mul(v[1], m[13], mt[13]), add(mul(v[2], m[14], mt[14]), mul(v[3], m[15], mt[15])))); + +- v[0] = load2(&psi[I + d2]); +- v[1] = load2(&psi[I + d0 + d2]); +- v[2] = load2(&psi[I + d1 + d2]); +- v[3] = load2(&psi[I + d0 + d1 + d2]); +- +- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31])))))); ++ v[0] = load2(psi + 2 * (I + d2)); ++ v[1] = load2(psi + 2 * (I + d0 + d2)); ++ v[2] = load2(psi + 2 * (I + d1 + d2)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d2)); ++ ++ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31])))))); + + } + + // bit indices id[.] are given from high to low (e.g. control first for CNOT) + template +-void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) ++void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) + { +- std::size_t n = psi.size(); ++ std::size_t n = len; + std::size_t d0 = 1UL << id0; + std::size_t d1 = 1UL << id1; + std::size_t d2 = 1UL << id2; +diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp +--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp 2020-06-05 21:07:57.000000000 +0800 ++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp 2021-04-19 18:04:05.541802882 +0800 +@@ -17,10 +17,10 @@ + { + __m256d v[4]; + +- v[0] = load2(&psi[I]); +- v[1] = load2(&psi[I + d0]); +- v[2] = load2(&psi[I + d1]); +- v[3] = load2(&psi[I + d0 + d1]); ++ v[0] = load2(psi + 2 * I); ++ v[1] = load2(psi + 2 * (I + d0)); ++ v[2] = load2(psi + 2 * (I + d1)); ++ v[3] = load2(psi + 2 * (I + d0 + d1)); + + __m256d tmp[8]; + +@@ -33,10 +33,10 @@ + tmp[6] = add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27])))); + tmp[7] = add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31])))); + +- v[0] = load2(&psi[I + d2]); +- v[1] = load2(&psi[I + d0 + d2]); +- v[2] = load2(&psi[I + d1 + d2]); +- v[3] = load2(&psi[I + d0 + d1 + d2]); ++ v[0] = load2(psi + 2 * (I + d2)); ++ v[1] = load2(psi + 2 * (I + d0 + d2)); ++ v[2] = load2(psi + 2 * (I + d1 + d2)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d2)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[32], mt[32]), add(mul(v[1], m[33], mt[33]), add(mul(v[2], m[34], mt[34]), mul(v[3], m[35], mt[35]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[36], mt[36]), add(mul(v[1], m[37], mt[37]), add(mul(v[2], m[38], mt[38]), mul(v[3], m[39], mt[39]))))); +@@ -47,10 +47,10 @@ + tmp[6] = add(tmp[6], add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63]))))); + +- v[0] = load2(&psi[I + d3]); +- v[1] = load2(&psi[I + d0 + d3]); +- v[2] = load2(&psi[I + d1 + d3]); +- v[3] = load2(&psi[I + d0 + d1 + d3]); ++ v[0] = load2(psi + 2 * (I + d3)); ++ v[1] = load2(psi + 2 * (I + d0 + d3)); ++ v[2] = load2(psi + 2 * (I + d1 + d3)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d3)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71]))))); +@@ -61,27 +61,27 @@ + tmp[6] = add(tmp[6], add(mul(v[0], m[88], mt[88]), add(mul(v[1], m[89], mt[89]), add(mul(v[2], m[90], mt[90]), mul(v[3], m[91], mt[91]))))); + tmp[7] = add(tmp[7], add(mul(v[0], m[92], mt[92]), add(mul(v[1], m[93], mt[93]), add(mul(v[2], m[94], mt[94]), mul(v[3], m[95], mt[95]))))); + +- v[0] = load2(&psi[I + d2 + d3]); +- v[1] = load2(&psi[I + d0 + d2 + d3]); +- v[2] = load2(&psi[I + d1 + d2 + d3]); +- v[3] = load2(&psi[I + d0 + d1 + d2 + d3]); +- +- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127])))))); ++ v[0] = load2(psi + 2 * (I + d2 + d3)); ++ v[1] = load2(psi + 2 * (I + d0 + d2 + d3)); ++ v[2] = load2(psi + 2 * (I + d1 + d2 + d3)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3)); ++ ++ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127])))))); + + } + + // bit indices id[.] are given from high to low (e.g. control first for CNOT) + template +-void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) ++void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) + { +- std::size_t n = psi.size(); ++ std::size_t n = len; + std::size_t d0 = 1UL << id0; + std::size_t d1 = 1UL << id1; + std::size_t d2 = 1UL << id2; +diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp +--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp 2020-06-05 21:07:57.000000000 +0800 ++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp 2021-04-19 18:04:05.541802882 +0800 +@@ -17,10 +17,10 @@ + { + __m256d v[4]; + +- v[0] = load2(&psi[I]); +- v[1] = load2(&psi[I + d0]); +- v[2] = load2(&psi[I + d1]); +- v[3] = load2(&psi[I + d0 + d1]); ++ v[0] = load2(psi + 2 * I); ++ v[1] = load2(psi + 2 * (I + d0)); ++ v[2] = load2(psi + 2 * (I + d1)); ++ v[3] = load2(psi + 2 * (I + d0 + d1)); + + __m256d tmp[16]; + +@@ -41,10 +41,10 @@ + tmp[14] = add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59])))); + tmp[15] = add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63])))); + +- v[0] = load2(&psi[I + d2]); +- v[1] = load2(&psi[I + d0 + d2]); +- v[2] = load2(&psi[I + d1 + d2]); +- v[3] = load2(&psi[I + d0 + d1 + d2]); ++ v[0] = load2(psi + 2 * (I + d2)); ++ v[1] = load2(psi + 2 * (I + d0 + d2)); ++ v[2] = load2(psi + 2 * (I + d1 + d2)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d2)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71]))))); +@@ -63,10 +63,10 @@ + tmp[14] = add(tmp[14], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))); + +- v[0] = load2(&psi[I + d3]); +- v[1] = load2(&psi[I + d0 + d3]); +- v[2] = load2(&psi[I + d1 + d3]); +- v[3] = load2(&psi[I + d0 + d1 + d3]); ++ v[0] = load2(psi + 2 * (I + d3)); ++ v[1] = load2(psi + 2 * (I + d0 + d3)); ++ v[2] = load2(psi + 2 * (I + d1 + d3)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d3)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[128], mt[128]), add(mul(v[1], m[129], mt[129]), add(mul(v[2], m[130], mt[130]), mul(v[3], m[131], mt[131]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[132], mt[132]), add(mul(v[1], m[133], mt[133]), add(mul(v[2], m[134], mt[134]), mul(v[3], m[135], mt[135]))))); +@@ -85,10 +85,10 @@ + tmp[14] = add(tmp[14], add(mul(v[0], m[184], mt[184]), add(mul(v[1], m[185], mt[185]), add(mul(v[2], m[186], mt[186]), mul(v[3], m[187], mt[187]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[188], mt[188]), add(mul(v[1], m[189], mt[189]), add(mul(v[2], m[190], mt[190]), mul(v[3], m[191], mt[191]))))); + +- v[0] = load2(&psi[I + d2 + d3]); +- v[1] = load2(&psi[I + d0 + d2 + d3]); +- v[2] = load2(&psi[I + d1 + d2 + d3]); +- v[3] = load2(&psi[I + d0 + d1 + d2 + d3]); ++ v[0] = load2(psi + 2 * (I + d2 + d3)); ++ v[1] = load2(psi + 2 * (I + d0 + d2 + d3)); ++ v[2] = load2(psi + 2 * (I + d1 + d2 + d3)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[192], mt[192]), add(mul(v[1], m[193], mt[193]), add(mul(v[2], m[194], mt[194]), mul(v[3], m[195], mt[195]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[196], mt[196]), add(mul(v[1], m[197], mt[197]), add(mul(v[2], m[198], mt[198]), mul(v[3], m[199], mt[199]))))); +@@ -107,10 +107,10 @@ + tmp[14] = add(tmp[14], add(mul(v[0], m[248], mt[248]), add(mul(v[1], m[249], mt[249]), add(mul(v[2], m[250], mt[250]), mul(v[3], m[251], mt[251]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[252], mt[252]), add(mul(v[1], m[253], mt[253]), add(mul(v[2], m[254], mt[254]), mul(v[3], m[255], mt[255]))))); + +- v[0] = load2(&psi[I + d4]); +- v[1] = load2(&psi[I + d0 + d4]); +- v[2] = load2(&psi[I + d1 + d4]); +- v[3] = load2(&psi[I + d0 + d1 + d4]); ++ v[0] = load2(psi + 2 * (I + d4)); ++ v[1] = load2(psi + 2 * (I + d0 + d4)); ++ v[2] = load2(psi + 2 * (I + d1 + d4)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d4)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[256], mt[256]), add(mul(v[1], m[257], mt[257]), add(mul(v[2], m[258], mt[258]), mul(v[3], m[259], mt[259]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[260], mt[260]), add(mul(v[1], m[261], mt[261]), add(mul(v[2], m[262], mt[262]), mul(v[3], m[263], mt[263]))))); +@@ -129,10 +129,10 @@ + tmp[14] = add(tmp[14], add(mul(v[0], m[312], mt[312]), add(mul(v[1], m[313], mt[313]), add(mul(v[2], m[314], mt[314]), mul(v[3], m[315], mt[315]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[316], mt[316]), add(mul(v[1], m[317], mt[317]), add(mul(v[2], m[318], mt[318]), mul(v[3], m[319], mt[319]))))); + +- v[0] = load2(&psi[I + d2 + d4]); +- v[1] = load2(&psi[I + d0 + d2 + d4]); +- v[2] = load2(&psi[I + d1 + d2 + d4]); +- v[3] = load2(&psi[I + d0 + d1 + d2 + d4]); ++ v[0] = load2(psi + 2 * (I + d2 + d4)); ++ v[1] = load2(psi + 2 * (I + d0 + d2 + d4)); ++ v[2] = load2(psi + 2 * (I + d1 + d2 + d4)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d4)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[320], mt[320]), add(mul(v[1], m[321], mt[321]), add(mul(v[2], m[322], mt[322]), mul(v[3], m[323], mt[323]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[324], mt[324]), add(mul(v[1], m[325], mt[325]), add(mul(v[2], m[326], mt[326]), mul(v[3], m[327], mt[327]))))); +@@ -151,10 +151,10 @@ + tmp[14] = add(tmp[14], add(mul(v[0], m[376], mt[376]), add(mul(v[1], m[377], mt[377]), add(mul(v[2], m[378], mt[378]), mul(v[3], m[379], mt[379]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[380], mt[380]), add(mul(v[1], m[381], mt[381]), add(mul(v[2], m[382], mt[382]), mul(v[3], m[383], mt[383]))))); + +- v[0] = load2(&psi[I + d3 + d4]); +- v[1] = load2(&psi[I + d0 + d3 + d4]); +- v[2] = load2(&psi[I + d1 + d3 + d4]); +- v[3] = load2(&psi[I + d0 + d1 + d3 + d4]); ++ v[0] = load2(psi + 2 * (I + d3 + d4)); ++ v[1] = load2(psi + 2 * (I + d0 + d3 + d4)); ++ v[2] = load2(psi + 2 * (I + d1 + d3 + d4)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d3 + d4)); + + tmp[0] = add(tmp[0], add(mul(v[0], m[384], mt[384]), add(mul(v[1], m[385], mt[385]), add(mul(v[2], m[386], mt[386]), mul(v[3], m[387], mt[387]))))); + tmp[1] = add(tmp[1], add(mul(v[0], m[388], mt[388]), add(mul(v[1], m[389], mt[389]), add(mul(v[2], m[390], mt[390]), mul(v[3], m[391], mt[391]))))); +@@ -173,35 +173,35 @@ + tmp[14] = add(tmp[14], add(mul(v[0], m[440], mt[440]), add(mul(v[1], m[441], mt[441]), add(mul(v[2], m[442], mt[442]), mul(v[3], m[443], mt[443]))))); + tmp[15] = add(tmp[15], add(mul(v[0], m[444], mt[444]), add(mul(v[1], m[445], mt[445]), add(mul(v[2], m[446], mt[446]), mul(v[3], m[447], mt[447]))))); + +- v[0] = load2(&psi[I + d2 + d3 + d4]); +- v[1] = load2(&psi[I + d0 + d2 + d3 + d4]); +- v[2] = load2(&psi[I + d1 + d2 + d3 + d4]); +- v[3] = load2(&psi[I + d0 + d1 + d2 + d3 + d4]); +- +- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d4], (double*)&psi[I + d4], add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507])))))); +- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511])))))); ++ v[0] = load2(psi + 2 * (I + d2 + d3 + d4)); ++ v[1] = load2(psi + 2 * (I + d0 + d2 + d3 + d4)); ++ v[2] = load2(psi + 2 * (I + d1 + d2 + d3 + d4)); ++ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3 + d4)); ++ ++ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d4), psi + 2 * (I + d4), add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d4), psi + 2 * (I + d1 + d4), add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d4), psi + 2 * (I + d2 + d4), add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d4), psi + 2 * (I + d1 + d2 + d4), add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3 + d4), psi + 2 * (I + d3 + d4), add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3 + d4), psi + 2 * (I + d1 + d3 + d4), add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3 + d4), psi + 2 * (I + d2 + d3 + d4), add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507])))))); ++ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3 + d4), psi + 2 * (I + d1 + d2 + d3 + d4), add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511])))))); + + } + + // bit indices id[.] are given from high to low (e.g. control first for CNOT) + template +-void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask) ++void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len) + { +- std::size_t n = psi.size(); ++ std::size_t n = len; + std::size_t d0 = 1UL << id0; + std::size_t d1 = 1UL << id1; + std::size_t d2 = 1UL << id2; +diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp --- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp 2020-06-05 21:07:57.000000000 +0800 -+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp 2021-01-14 10:52:24.822039389 +0800 -@@ -33,7 +33,6 @@ ++++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp 2021-04-20 11:46:27.115554725 +0800 +@@ -18,11 +18,7 @@ + #include + #include + +-#if defined(NOINTRIN) || !defined(INTRIN) +-#include "nointrin/kernels.hpp" +-#else + #include "intrin/kernels.hpp" +-#endif + + #include "intrin/alignedallocator.hpp" + #include "fusion.hpp" +@@ -32,173 +28,29 @@ + #include #include #include - - ++#include + class Simulator{ public: using calc_type = double; -@@ -44,8 +43,9 @@ public: + using complex_type = std::complex; +- using StateVector = std::vector>; ++ using StateVector = calc_type *; + using Map = std::map; + using RndEngine = std::mt19937; using Term = std::vector>; using TermsDict = std::vector>; using ComplexTermsDict = std::vector>; + StateVector vec_; - Simulator(unsigned seed = 1) : N_(0), vec_(1,0.), fusion_qubits_min_(4), -+ Simulator(unsigned seed = 1) : vec_(1,0.), N_(0), fusion_qubits_min_(4), ++ Simulator(unsigned seed = 1, unsigned N = 0) : N_(N), fusion_qubits_min_(4), fusion_qubits_max_(5), rnd_eng_(seed) { ++ len_ = 1UL << (N_ + 1); ++ vec_ = (StateVector)calloc(len_, sizeof(calc_type)); vec_[0]=1.; // all-zero initial state std::uniform_real_distribution dist(0., 1.); -@@ -562,7 +562,6 @@ private: + rng_ = std::bind(dist, std::ref(rnd_eng_)); +- } +- +- void allocate_qubit(unsigned id){ +- if (map_.count(id) == 0){ +- map_[id] = N_++; +- StateVector newvec; // avoid large memory allocations +- if( tmpBuff1_.capacity() >= (1UL << N_) ) +- std::swap(newvec, tmpBuff1_); +- newvec.resize(1UL << N_); +-#pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < newvec.size(); ++i) +- newvec[i] = (i < vec_.size())?vec_[i]:0.; +- std::swap(vec_, newvec); +- // recycle large memory +- std::swap(tmpBuff1_, newvec); +- if( tmpBuff1_.capacity() < tmpBuff2_.capacity() ) +- std::swap(tmpBuff1_, tmpBuff2_); +- } +- else +- throw(std::runtime_error( +- "AllocateQubit: ID already exists. Qubit IDs should be unique.")); +- } +- +- bool get_classical_value(unsigned id, calc_type tol = 1.e-12){ +- run(); +- unsigned pos = map_[id]; +- std::size_t delta = (1UL << pos); +- +- for (std::size_t i = 0; i < vec_.size(); i += 2*delta){ +- for (std::size_t j = 0; j < delta; ++j){ +- if (std::norm(vec_[i+j]) > tol) +- return false; +- if (std::norm(vec_[i+j+delta]) > tol) +- return true; +- } +- } +- assert(false); // this will never happen +- return false; // suppress 'control reaches end of non-void...' +- } +- +- bool is_classical(unsigned id, calc_type tol = 1.e-12){ +- run(); +- unsigned pos = map_[id]; +- std::size_t delta = (1UL << pos); +- +- short up = 0, down = 0; +- #pragma omp parallel for schedule(static) reduction(|:up,down) +- for (std::size_t i = 0; i < vec_.size(); i += 2*delta){ +- for (std::size_t j = 0; j < delta; ++j){ +- up = up | ((std::norm(vec_[i+j]) > tol)&1); +- down = down | ((std::norm(vec_[i+j+delta]) > tol)&1); +- } +- } +- +- return 1 == (up^down); +- } +- +- void collapse_vector(unsigned id, bool value = false, bool shrink = false){ +- run(); +- unsigned pos = map_[id]; +- std::size_t delta = (1UL << pos); +- +- if (!shrink){ +- #pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); i += 2*delta){ +- for (std::size_t j = 0; j < delta; ++j) +- vec_[i+j+static_cast(!value)*delta] = 0.; +- } +- } +- else{ +- StateVector newvec; // avoid costly memory reallocations +- if( tmpBuff1_.capacity() >= (1UL << (N_-1)) ) +- std::swap(tmpBuff1_, newvec); +- newvec.resize((1UL << (N_-1))); +- #pragma omp parallel for schedule(static) if(0) +- for (std::size_t i = 0; i < vec_.size(); i += 2*delta) +- std::copy_n(&vec_[i + static_cast(value)*delta], +- delta, &newvec[i/2]); +- std::swap(vec_, newvec); +- std::swap(tmpBuff1_, newvec); +- if( tmpBuff1_.capacity() < tmpBuff2_.capacity() ) +- std::swap(tmpBuff1_, tmpBuff2_); +- +- for (auto& p : map_){ +- if (p.second > pos) +- p.second--; +- } +- map_.erase(id); +- N_--; +- } +- } +- +- void measure_qubits(std::vector const& ids, std::vector &res){ +- run(); +- +- std::vector positions(ids.size()); +- for (unsigned i = 0; i < ids.size(); ++i) +- positions[i] = map_[ids[i]]; +- +- calc_type P = 0.; +- calc_type rnd = rng_(); +- +- // pick entry at random with probability |entry|^2 +- std::size_t pick = 0; +- while (P < rnd && pick < vec_.size()) +- P += std::norm(vec_[pick++]); +- +- pick--; +- // determine result vector (boolean values for each qubit) +- // and create mask to detect bad entries (i.e., entries that don't agree with measurement) +- res = std::vector(ids.size()); +- std::size_t mask = 0; +- std::size_t val = 0; +- for (unsigned i = 0; i < ids.size(); ++i){ +- bool r = ((pick >> positions[i]) & 1) == 1; +- res[i] = r; +- mask |= (1UL << positions[i]); +- val |= (static_cast(r&1) << positions[i]); +- } +- // set bad entries to 0 +- calc_type N = 0.; +- #pragma omp parallel for reduction(+:N) schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i){ +- if ((i & mask) != val) +- vec_[i] = 0.; +- else +- N += std::norm(vec_[i]); +- } +- // re-normalize +- N = 1./std::sqrt(N); +- #pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i) +- vec_[i] *= N; +- } +- +- std::vector measure_qubits_return(std::vector const& ids){ +- std::vector ret; +- measure_qubits(ids, ret); +- return ret; +- } +- +- void deallocate_qubit(unsigned id){ +- run(); +- assert(map_.count(id) == 1); +- if (!is_classical(id)) +- throw(std::runtime_error("Error: Qubit has not been measured / uncomputed! There is most likely a bug in your code.")); +- +- bool value = get_classical_value(id); +- collapse_vector(id, value, true); ++ for (unsigned i = 0; i < N_; i++) ++ map_[i] = i; + } + + template +@@ -221,84 +73,13 @@ + fused_gates_ = fused_gates; + } + +- template +- void emulate_math(F const& f, QuReg quregs, const std::vector& ctrl, +- bool parallelize = false){ +- run(); +- auto ctrlmask = get_control_mask(ctrl); +- +- for (unsigned i = 0; i < quregs.size(); ++i) +- for (unsigned j = 0; j < quregs[i].size(); ++j) +- quregs[i][j] = map_[quregs[i][j]]; +- +- StateVector newvec; // avoid costly memory reallocations +- if( tmpBuff1_.capacity() >= vec_.size() ) +- std::swap(newvec, tmpBuff1_); +- newvec.resize(vec_.size()); +-#pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); i++) +- newvec[i] = 0; +- +-//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5 +- { +- std::vector res(quregs.size()); +- //#pragma omp for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i){ +- if ((ctrlmask&i) == ctrlmask){ +- for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){ +- res[qr_i] = 0; +- for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i) +- res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i; +- } +- f(res); +- auto new_i = i; +- for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){ +- for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){ +- if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1))) +- new_i ^= (1UL << quregs[qr_i][qb_i]); +- } +- } +- newvec[new_i] += vec_[i]; +- } +- else +- newvec[i] += vec_[i]; +- } +- } +- std::swap(vec_, newvec); +- std::swap(tmpBuff1_, newvec); +- } +- +- // faster version without calling python +- template +- inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector& ctrl) +- { +- emulate_math([a](std::vector &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true); +- } +- +- // faster version without calling python +- template +- inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector& ctrl) +- { +- emulate_math([a,N](std::vector &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true); +- } +- +- // faster version without calling python +- template +- inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector& ctrl) +- { +- emulate_math([a,N](std::vector &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true); +- } +- + calc_type get_expectation_value(TermsDict const& td, std::vector const& ids){ + run(); + calc_type expectation = 0.; + +- StateVector current_state; // avoid costly memory reallocations +- if( tmpBuff1_.capacity() >= vec_.size() ) +- std::swap(tmpBuff1_, current_state); +- current_state.resize(vec_.size()); ++ StateVector current_state = (StateVector)malloc(len_ *sizeof(calc_type)); + #pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i) ++ for (std::size_t i = 0; i < len_; ++i) + current_state[i] = vec_[i]; + + for (auto const& term : td){ +@@ -306,81 +87,53 @@ + apply_term(term.first, ids, {}); + calc_type delta = 0.; + #pragma omp parallel for reduction(+:delta) schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i){ +- auto const a1 = std::real(current_state[i]); +- auto const b1 = -std::imag(current_state[i]); +- auto const a2 = std::real(vec_[i]); +- auto const b2 = std::imag(vec_[i]); ++ for (std::size_t i = 0; i < (len_ >> 1); ++i){ ++ auto const a1 = current_state[2 * i]; ++ auto const b1 = -current_state[2 * i + 1]; ++ auto const a2 = vec_[2 * i]; ++ auto const b2 = vec_[2 * i + 1]; + delta += a1 * a2 - b1 * b2; + // reset vec_ +- vec_[i] = current_state[i]; ++ vec_[2 * i] = current_state[2 * i]; ++ vec_[2 * i + 1] = current_state[2 * i + 1]; + } + expectation += coefficient * delta; + } +- std::swap(current_state, tmpBuff1_); ++ if (NULL != current_state){ ++ free(current_state); ++ current_state = NULL; ++ } + return expectation; + } + + void apply_qubit_operator(ComplexTermsDict const& td, std::vector const& ids){ + run(); +- StateVector new_state, current_state; // avoid costly memory reallocations +- if( tmpBuff1_.capacity() >= vec_.size() ) +- std::swap(tmpBuff1_, new_state); +- if( tmpBuff2_.capacity() >= vec_.size() ) +- std::swap(tmpBuff2_, current_state); +- new_state.resize(vec_.size()); +- current_state.resize(vec_.size()); ++ StateVector new_state = (StateVector)calloc(len_, sizeof(calc_type)); ++ StateVector current_state = (StateVector)malloc(len_ * sizeof(calc_type)); + #pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i){ +- new_state[i] = 0; ++ for (std::size_t i = 0; i < len_; ++i){ + current_state[i] = vec_[i]; + } + for (auto const& term : td){ + auto const& coefficient = term.second; + apply_term(term.first, ids, {}); + #pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i){ +- new_state[i] += coefficient * vec_[i]; +- vec_[i] = current_state[i]; ++ for (std::size_t i = 0; i < (len_ >> 1); ++i){ ++ new_state[2 * i] += coefficient.real() * vec_[2 * i] - coefficient.imag() * vec_[2 * i + 1]; ++ new_state[2 * i + 1] += coefficient.real() * vec_[2 * i + 1] + coefficient.imag() * vec_[2 * i]; ++ vec_[2 * i] = current_state[2 * i]; ++ vec_[2 * i + 1] = current_state[2 * i + 1]; + } + } +- std::swap(vec_, new_state); +- std::swap(tmpBuff1_, new_state); +- std::swap(tmpBuff2_, current_state); +- } +- +- calc_type get_probability(std::vector const& bit_string, +- std::vector const& ids){ +- run(); +- if (!check_ids(ids)) +- throw(std::runtime_error("get_probability(): Unknown qubit id. Please make sure you have called eng.flush().")); +- std::size_t mask = 0, bit_str = 0; +- for (unsigned i = 0; i < ids.size(); ++i){ +- mask |= 1UL << map_[ids[i]]; +- bit_str |= (bit_string[i]?1UL:0UL) << map_[ids[i]]; +- } +- calc_type probability = 0.; +- #pragma omp parallel for reduction(+:probability) schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i) +- if ((i & mask) == bit_str) +- probability += std::norm(vec_[i]); +- return probability; +- } +- +- complex_type const& get_amplitude(std::vector const& bit_string, +- std::vector const& ids){ +- run(); +- std::size_t chk = 0; +- std::size_t index = 0; +- for (unsigned i = 0; i < ids.size(); ++i){ +- if (map_.count(ids[i]) == 0) +- break; +- chk |= 1UL << map_[ids[i]]; +- index |= (bit_string[i]?1UL:0UL) << map_[ids[i]]; ++ if (NULL != vec_) ++ free(vec_); ++ vec_ = new_state; ++ if (NULL != new_state) ++ new_state = NULL; ++ if (NULL != current_state){ ++ free(current_state); ++ current_state = NULL; + } +- if (chk + 1 != vec_.size()) +- throw(std::runtime_error("The second argument to get_amplitude() must be a permutation of all allocated qubits. Please make sure you have called eng.flush().")); +- return vec_[index]; + } + + void emulate_time_evolution(TermsDict const& tdict, calc_type const& time, +@@ -400,87 +153,71 @@ + } + unsigned s = std::abs(time) * op_nrm + 1.; + complex_type correction = std::exp(-time * I * tr / (double)s); +- auto output_state = vec_; ++ auto output_state = copy(vec_, len_); + auto ctrlmask = get_control_mask(ctrl); + for (unsigned i = 0; i < s; ++i){ + calc_type nrm_change = 1.; + for (unsigned k = 0; nrm_change > 1.e-12; ++k){ + auto coeff = (-time * I) / double(s * (k + 1)); +- auto current_state = vec_; +- auto update = StateVector(vec_.size(), 0.); ++ auto current_state = copy(vec_, len_); ++ auto update = (StateVector)calloc(len_, sizeof(calc_type)); + for (auto const& tup : td){ + apply_term(tup.first, ids, {}); + #pragma omp parallel for schedule(static) +- for (std::size_t j = 0; j < vec_.size(); ++j){ ++ for (std::size_t j = 0; j < len_; ++j){ + update[j] += vec_[j] * tup.second; + vec_[j] = current_state[j]; + } + } + nrm_change = 0.; + #pragma omp parallel for reduction(+:nrm_change) schedule(static) +- for (std::size_t j = 0; j < vec_.size(); ++j){ +- update[j] *= coeff; +- vec_[j] = update[j]; ++ for (std::size_t j = 0; j < (len_ >> 1); ++j){ ++ complex_type tmp(update[2 * j], update[2 * j + 1]); ++ tmp *= coeff; ++ update[2 * j] *= std::real(tmp); ++ update[2 * j + 1] *= std::imag(tmp); ++ vec_[2 * j] = update[2 * j]; ++ vec_[2 * j + 1] = update[2 * j + 1]; + if ((j & ctrlmask) == ctrlmask){ +- output_state[j] += update[j]; +- nrm_change += std::norm(update[j]); ++ output_state[2 * j] += update[2 * j]; ++ output_state[2 * j + 1] += update[2 * j + 1]; ++ nrm_change += std::sqrt(update[2 * j] * update[2 * j] + update[2 * j + 1] * update[2 * j + 1]); + } + } + nrm_change = std::sqrt(nrm_change); ++ if (NULL != current_state){ ++ free(current_state); ++ current_state = NULL; ++ } ++ if (NULL != update){ ++ free(update); ++ update = NULL; ++ } + } + #pragma omp parallel for schedule(static) +- for (std::size_t j = 0; j < vec_.size(); ++j){ +- if ((j & ctrlmask) == ctrlmask) +- output_state[j] *= correction; +- vec_[j] = output_state[j]; ++ for (std::size_t j = 0; j < (len_ >>1); ++j){ ++ if ((j & ctrlmask) == ctrlmask){ ++ complex_type tmp(output_state[2 * j], output_state[2 * j + 1]); ++ tmp *= correction; ++ output_state[2 * j] = std::real(tmp); ++ output_state[2 * j + 1] =std::imag(tmp); ++ } ++ vec_[2 * j] = output_state[2 * j]; ++ vec_[2 * j + 1] = output_state[2 * j + 1]; + } + } ++ if (NULL != output_state){ ++ free(output_state); ++ output_state = NULL; ++ } + } + + void set_wavefunction(StateVector const& wavefunction, std::vector const& ordering){ + run(); +- // make sure there are 2^n amplitudes for n qubits +- assert(wavefunction.size() == (1UL << ordering.size())); +- // check that all qubits have been allocated previously +- if (map_.size() != ordering.size() || !check_ids(ordering)) +- throw(std::runtime_error("set_wavefunction(): Invalid mapping provided. Please make sure all qubits have been allocated previously (call eng.flush()).")); +- +- // set mapping and wavefunction +- for (unsigned i = 0; i < ordering.size(); ++i) +- map_[ordering[i]] = i; +- #pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < wavefunction.size(); ++i) +- vec_[i] = wavefunction[i]; +- } +- +- void collapse_wavefunction(std::vector const& ids, std::vector const& values){ +- run(); +- assert(ids.size() == values.size()); +- if (!check_ids(ids)) +- throw(std::runtime_error("collapse_wavefunction(): Unknown qubit id(s) provided. Try calling eng.flush() before invoking this function.")); +- std::size_t mask = 0, val = 0; +- for (unsigned i = 0; i < ids.size(); ++i){ +- mask |= (1UL << map_[ids[i]]); +- val |= ((values[i]?1UL:0UL) << map_[ids[i]]); +- } +- // set bad entries to 0 and compute probability of outcome to renormalize +- calc_type N = 0.; +- #pragma omp parallel for reduction(+:N) schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i){ +- if ((i & mask) == val) +- N += std::norm(vec_[i]); +- } +- if (N < 1.e-12) +- throw(std::runtime_error("collapse_wavefunction(): Invalid collapse! Probability is ~0.")); +- // re-normalize (if possible) +- N = 1./std::sqrt(N); +- #pragma omp parallel for schedule(static) +- for (std::size_t i = 0; i < vec_.size(); ++i){ +- if ((i & mask) != val) +- vec_[i] = 0.; +- else +- vec_[i] *= N; ++ if (NULL != vec_){ ++ free(vec_); + } ++ vec_ = copy(wavefunction, len_); + } + + void run(){ +@@ -500,23 +237,23 @@ + switch (ids.size()){ + case 1: + #pragma omp parallel +- kernel(vec_, ids[0], m, ctrlmask); ++ kernel(vec_, ids[0], m, ctrlmask, len_ >> 1); + break; + case 2: + #pragma omp parallel +- kernel(vec_, ids[1], ids[0], m, ctrlmask); ++ kernel(vec_, ids[1], ids[0], m, ctrlmask, len_ >> 1); + break; + case 3: + #pragma omp parallel +- kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask); ++ kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1); + break; + case 4: + #pragma omp parallel +- kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask); ++ kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1); + break; + case 5: + #pragma omp parallel +- kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask); ++ kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1); + break; + default: + throw std::invalid_argument("Gates with more than 5 qubits are not supported!"); +@@ -525,12 +262,27 @@ + fused_gates_ = Fusion(); + } + +- std::tuple cheat(){ ++ std::vector cheat(){ + run(); +- return make_tuple(map_, std::ref(vec_)); ++ std::vector result; ++ for (unsigned int i = 0; i < (len_ >> 1); i++){ ++ result.push_back({vec_[2 * i], vec_[2 * i + 1]}); ++ } ++ return result; ++ } ++ ++ inline StateVector copy(StateVector source, unsigned len){ ++ StateVector result = (StateVector)malloc(len * sizeof(calc_type)); ++#pragma omp parallel for schedule(static) ++ for (std::size_t i = 0; i < len; ++i) { ++ result[i] = source[i]; + } ++ return result; ++} + + ~Simulator(){ ++ if (NULL != vec_) ++ free(vec_); + } + + private: +@@ -562,18 +314,13 @@ } unsigned N_; // #qubits - StateVector vec_; ++ unsigned len_; Map map_; Fusion fused_gates_; unsigned fusion_qubits_min_, fusion_qubits_max_; -@@ -570,10 +569,8 @@ private: + RndEngine rnd_eng_; std::function rng_; - - // large array buffers to avoid costly reallocations +- +- // large array buffers to avoid costly reallocations - static StateVector tmpBuff1_, tmpBuff2_; -+ StateVector tmpBuff1_, tmpBuff2_; }; -Simulator::StateVector Simulator::tmpBuff1_;