|
|
|
@ -1,38 +1,966 @@
|
|
|
|
|
diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp
|
|
|
|
|
--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp 2020-06-05 21:07:57.000000000 +0800
|
|
|
|
|
+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel1.hpp 2021-04-19 18:04:05.541802882 +0800
|
|
|
|
|
@@ -17,18 +17,18 @@
|
|
|
|
|
{
|
|
|
|
|
__m256d v[2];
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * I);
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0));
|
|
|
|
|
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1])));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), mul(v[1], m[1], mt[1])));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bit indices id[.] are given from high to low (e.g. control first for CNOT)
|
|
|
|
|
template <class V, class M>
|
|
|
|
|
-void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask)
|
|
|
|
|
+void kernel(V &psi, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
|
|
|
|
|
{
|
|
|
|
|
- std::size_t n = psi.size();
|
|
|
|
|
+ std::size_t n = len;
|
|
|
|
|
std::size_t d0 = 1UL << id0;
|
|
|
|
|
|
|
|
|
|
__m256d mm[] = {load(&m[0][0], &m[1][0]), load(&m[0][1], &m[1][1])};
|
|
|
|
|
diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp
|
|
|
|
|
--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp 2020-06-05 21:07:57.000000000 +0800
|
|
|
|
|
+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel2.hpp 2021-04-19 18:04:05.541802882 +0800
|
|
|
|
|
@@ -17,21 +17,21 @@
|
|
|
|
|
{
|
|
|
|
|
__m256d v[4];
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * I);
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1));
|
|
|
|
|
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3])))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7])))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(mul(v[0], m[0], mt[0]), add(mul(v[1], m[1], mt[1]), add(mul(v[2], m[2], mt[2]), mul(v[3], m[3], mt[3])))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(mul(v[0], m[4], mt[4]), add(mul(v[1], m[5], mt[5]), add(mul(v[2], m[6], mt[6]), mul(v[3], m[7], mt[7])))));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bit indices id[.] are given from high to low (e.g. control first for CNOT)
|
|
|
|
|
template <class V, class M>
|
|
|
|
|
-void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
|
|
|
|
|
+void kernel(V &psi, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
|
|
|
|
|
{
|
|
|
|
|
- std::size_t n = psi.size();
|
|
|
|
|
+ std::size_t n = len;
|
|
|
|
|
std::size_t d0 = 1UL << id0;
|
|
|
|
|
std::size_t d1 = 1UL << id1;
|
|
|
|
|
|
|
|
|
|
diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp
|
|
|
|
|
--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp 2020-06-05 21:07:57.000000000 +0800
|
|
|
|
|
+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel3.hpp 2021-04-19 18:04:05.541802882 +0800
|
|
|
|
|
@@ -17,10 +17,10 @@
|
|
|
|
|
{
|
|
|
|
|
__m256d v[4];
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * I);
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1));
|
|
|
|
|
|
|
|
|
|
__m256d tmp[4];
|
|
|
|
|
|
|
|
|
|
@@ -29,23 +29,23 @@
|
|
|
|
|
tmp[2] = add(mul(v[0], m[8], mt[8]), add(mul(v[1], m[9], mt[9]), add(mul(v[2], m[10], mt[10]), mul(v[3], m[11], mt[11]))));
|
|
|
|
|
tmp[3] = add(mul(v[0], m[12], mt[12]), add(mul(v[1], m[13], mt[13]), add(mul(v[2], m[14], mt[14]), mul(v[3], m[15], mt[15]))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d2]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d2]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d2]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d2]);
|
|
|
|
|
-
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))))));
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d2));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d2));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d2));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d2));
|
|
|
|
|
+
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[16], mt[16]), add(mul(v[1], m[17], mt[17]), add(mul(v[2], m[18], mt[18]), mul(v[3], m[19], mt[19]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[20], mt[20]), add(mul(v[1], m[21], mt[21]), add(mul(v[2], m[22], mt[22]), mul(v[3], m[23], mt[23]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))))));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bit indices id[.] are given from high to low (e.g. control first for CNOT)
|
|
|
|
|
template <class V, class M>
|
|
|
|
|
-void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
|
|
|
|
|
+void kernel(V &psi, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
|
|
|
|
|
{
|
|
|
|
|
- std::size_t n = psi.size();
|
|
|
|
|
+ std::size_t n = len;
|
|
|
|
|
std::size_t d0 = 1UL << id0;
|
|
|
|
|
std::size_t d1 = 1UL << id1;
|
|
|
|
|
std::size_t d2 = 1UL << id2;
|
|
|
|
|
diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp
|
|
|
|
|
--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp 2020-06-05 21:07:57.000000000 +0800
|
|
|
|
|
+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel4.hpp 2021-04-19 18:04:05.541802882 +0800
|
|
|
|
|
@@ -17,10 +17,10 @@
|
|
|
|
|
{
|
|
|
|
|
__m256d v[4];
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * I);
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1));
|
|
|
|
|
|
|
|
|
|
__m256d tmp[8];
|
|
|
|
|
|
|
|
|
|
@@ -33,10 +33,10 @@
|
|
|
|
|
tmp[6] = add(mul(v[0], m[24], mt[24]), add(mul(v[1], m[25], mt[25]), add(mul(v[2], m[26], mt[26]), mul(v[3], m[27], mt[27]))));
|
|
|
|
|
tmp[7] = add(mul(v[0], m[28], mt[28]), add(mul(v[1], m[29], mt[29]), add(mul(v[2], m[30], mt[30]), mul(v[3], m[31], mt[31]))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d2]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d2]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d2]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d2]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d2));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d2));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d2));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d2));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[32], mt[32]), add(mul(v[1], m[33], mt[33]), add(mul(v[2], m[34], mt[34]), mul(v[3], m[35], mt[35])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[36], mt[36]), add(mul(v[1], m[37], mt[37]), add(mul(v[2], m[38], mt[38]), mul(v[3], m[39], mt[39])))));
|
|
|
|
|
@@ -47,10 +47,10 @@
|
|
|
|
|
tmp[6] = add(tmp[6], add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59])))));
|
|
|
|
|
tmp[7] = add(tmp[7], add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d3]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d3]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d3]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d3]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d3));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d3));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d3));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d3));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71])))));
|
|
|
|
|
@@ -61,27 +61,27 @@
|
|
|
|
|
tmp[6] = add(tmp[6], add(mul(v[0], m[88], mt[88]), add(mul(v[1], m[89], mt[89]), add(mul(v[2], m[90], mt[90]), mul(v[3], m[91], mt[91])))));
|
|
|
|
|
tmp[7] = add(tmp[7], add(mul(v[0], m[92], mt[92]), add(mul(v[1], m[93], mt[93]), add(mul(v[2], m[94], mt[94]), mul(v[3], m[95], mt[95])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d2 + d3]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d2 + d3]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d2 + d3]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d2 + d3]);
|
|
|
|
|
-
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))));
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d2 + d3));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d2 + d3));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d2 + d3));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3));
|
|
|
|
|
+
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[96], mt[96]), add(mul(v[1], m[97], mt[97]), add(mul(v[2], m[98], mt[98]), mul(v[3], m[99], mt[99]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[100], mt[100]), add(mul(v[1], m[101], mt[101]), add(mul(v[2], m[102], mt[102]), mul(v[3], m[103], mt[103]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[104], mt[104]), add(mul(v[1], m[105], mt[105]), add(mul(v[2], m[106], mt[106]), mul(v[3], m[107], mt[107]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[108], mt[108]), add(mul(v[1], m[109], mt[109]), add(mul(v[2], m[110], mt[110]), mul(v[3], m[111], mt[111]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[112], mt[112]), add(mul(v[1], m[113], mt[113]), add(mul(v[2], m[114], mt[114]), mul(v[3], m[115], mt[115]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[116], mt[116]), add(mul(v[1], m[117], mt[117]), add(mul(v[2], m[118], mt[118]), mul(v[3], m[119], mt[119]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127]))))));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bit indices id[.] are given from high to low (e.g. control first for CNOT)
|
|
|
|
|
template <class V, class M>
|
|
|
|
|
-void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
|
|
|
|
|
+void kernel(V &psi, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
|
|
|
|
|
{
|
|
|
|
|
- std::size_t n = psi.size();
|
|
|
|
|
+ std::size_t n = len;
|
|
|
|
|
std::size_t d0 = 1UL << id0;
|
|
|
|
|
std::size_t d1 = 1UL << id1;
|
|
|
|
|
std::size_t d2 = 1UL << id2;
|
|
|
|
|
diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp
|
|
|
|
|
--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp 2020-06-05 21:07:57.000000000 +0800
|
|
|
|
|
+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/intrin/kernel5.hpp 2021-04-19 18:04:05.541802882 +0800
|
|
|
|
|
@@ -17,10 +17,10 @@
|
|
|
|
|
{
|
|
|
|
|
__m256d v[4];
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * I);
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1));
|
|
|
|
|
|
|
|
|
|
__m256d tmp[16];
|
|
|
|
|
|
|
|
|
|
@@ -41,10 +41,10 @@
|
|
|
|
|
tmp[14] = add(mul(v[0], m[56], mt[56]), add(mul(v[1], m[57], mt[57]), add(mul(v[2], m[58], mt[58]), mul(v[3], m[59], mt[59]))));
|
|
|
|
|
tmp[15] = add(mul(v[0], m[60], mt[60]), add(mul(v[1], m[61], mt[61]), add(mul(v[2], m[62], mt[62]), mul(v[3], m[63], mt[63]))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d2]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d2]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d2]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d2]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d2));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d2));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d2));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d2));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[64], mt[64]), add(mul(v[1], m[65], mt[65]), add(mul(v[2], m[66], mt[66]), mul(v[3], m[67], mt[67])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[68], mt[68]), add(mul(v[1], m[69], mt[69]), add(mul(v[2], m[70], mt[70]), mul(v[3], m[71], mt[71])))));
|
|
|
|
|
@@ -63,10 +63,10 @@
|
|
|
|
|
tmp[14] = add(tmp[14], add(mul(v[0], m[120], mt[120]), add(mul(v[1], m[121], mt[121]), add(mul(v[2], m[122], mt[122]), mul(v[3], m[123], mt[123])))));
|
|
|
|
|
tmp[15] = add(tmp[15], add(mul(v[0], m[124], mt[124]), add(mul(v[1], m[125], mt[125]), add(mul(v[2], m[126], mt[126]), mul(v[3], m[127], mt[127])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d3]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d3]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d3]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d3]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d3));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d3));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d3));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d3));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[128], mt[128]), add(mul(v[1], m[129], mt[129]), add(mul(v[2], m[130], mt[130]), mul(v[3], m[131], mt[131])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[132], mt[132]), add(mul(v[1], m[133], mt[133]), add(mul(v[2], m[134], mt[134]), mul(v[3], m[135], mt[135])))));
|
|
|
|
|
@@ -85,10 +85,10 @@
|
|
|
|
|
tmp[14] = add(tmp[14], add(mul(v[0], m[184], mt[184]), add(mul(v[1], m[185], mt[185]), add(mul(v[2], m[186], mt[186]), mul(v[3], m[187], mt[187])))));
|
|
|
|
|
tmp[15] = add(tmp[15], add(mul(v[0], m[188], mt[188]), add(mul(v[1], m[189], mt[189]), add(mul(v[2], m[190], mt[190]), mul(v[3], m[191], mt[191])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d2 + d3]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d2 + d3]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d2 + d3]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d2 + d3]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d2 + d3));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d2 + d3));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d2 + d3));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[192], mt[192]), add(mul(v[1], m[193], mt[193]), add(mul(v[2], m[194], mt[194]), mul(v[3], m[195], mt[195])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[196], mt[196]), add(mul(v[1], m[197], mt[197]), add(mul(v[2], m[198], mt[198]), mul(v[3], m[199], mt[199])))));
|
|
|
|
|
@@ -107,10 +107,10 @@
|
|
|
|
|
tmp[14] = add(tmp[14], add(mul(v[0], m[248], mt[248]), add(mul(v[1], m[249], mt[249]), add(mul(v[2], m[250], mt[250]), mul(v[3], m[251], mt[251])))));
|
|
|
|
|
tmp[15] = add(tmp[15], add(mul(v[0], m[252], mt[252]), add(mul(v[1], m[253], mt[253]), add(mul(v[2], m[254], mt[254]), mul(v[3], m[255], mt[255])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d4]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d4]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d4]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d4]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d4));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d4));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d4));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d4));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[256], mt[256]), add(mul(v[1], m[257], mt[257]), add(mul(v[2], m[258], mt[258]), mul(v[3], m[259], mt[259])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[260], mt[260]), add(mul(v[1], m[261], mt[261]), add(mul(v[2], m[262], mt[262]), mul(v[3], m[263], mt[263])))));
|
|
|
|
|
@@ -129,10 +129,10 @@
|
|
|
|
|
tmp[14] = add(tmp[14], add(mul(v[0], m[312], mt[312]), add(mul(v[1], m[313], mt[313]), add(mul(v[2], m[314], mt[314]), mul(v[3], m[315], mt[315])))));
|
|
|
|
|
tmp[15] = add(tmp[15], add(mul(v[0], m[316], mt[316]), add(mul(v[1], m[317], mt[317]), add(mul(v[2], m[318], mt[318]), mul(v[3], m[319], mt[319])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d2 + d4]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d2 + d4]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d2 + d4]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d2 + d4]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d2 + d4));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d2 + d4));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d2 + d4));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d4));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[320], mt[320]), add(mul(v[1], m[321], mt[321]), add(mul(v[2], m[322], mt[322]), mul(v[3], m[323], mt[323])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[324], mt[324]), add(mul(v[1], m[325], mt[325]), add(mul(v[2], m[326], mt[326]), mul(v[3], m[327], mt[327])))));
|
|
|
|
|
@@ -151,10 +151,10 @@
|
|
|
|
|
tmp[14] = add(tmp[14], add(mul(v[0], m[376], mt[376]), add(mul(v[1], m[377], mt[377]), add(mul(v[2], m[378], mt[378]), mul(v[3], m[379], mt[379])))));
|
|
|
|
|
tmp[15] = add(tmp[15], add(mul(v[0], m[380], mt[380]), add(mul(v[1], m[381], mt[381]), add(mul(v[2], m[382], mt[382]), mul(v[3], m[383], mt[383])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d3 + d4]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d3 + d4]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d3 + d4]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d3 + d4]);
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d3 + d4));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d3 + d4));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d3 + d4));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d3 + d4));
|
|
|
|
|
|
|
|
|
|
tmp[0] = add(tmp[0], add(mul(v[0], m[384], mt[384]), add(mul(v[1], m[385], mt[385]), add(mul(v[2], m[386], mt[386]), mul(v[3], m[387], mt[387])))));
|
|
|
|
|
tmp[1] = add(tmp[1], add(mul(v[0], m[388], mt[388]), add(mul(v[1], m[389], mt[389]), add(mul(v[2], m[390], mt[390]), mul(v[3], m[391], mt[391])))));
|
|
|
|
|
@@ -173,35 +173,35 @@
|
|
|
|
|
tmp[14] = add(tmp[14], add(mul(v[0], m[440], mt[440]), add(mul(v[1], m[441], mt[441]), add(mul(v[2], m[442], mt[442]), mul(v[3], m[443], mt[443])))));
|
|
|
|
|
tmp[15] = add(tmp[15], add(mul(v[0], m[444], mt[444]), add(mul(v[1], m[445], mt[445]), add(mul(v[2], m[446], mt[446]), mul(v[3], m[447], mt[447])))));
|
|
|
|
|
|
|
|
|
|
- v[0] = load2(&psi[I + d2 + d3 + d4]);
|
|
|
|
|
- v[1] = load2(&psi[I + d0 + d2 + d3 + d4]);
|
|
|
|
|
- v[2] = load2(&psi[I + d1 + d2 + d3 + d4]);
|
|
|
|
|
- v[3] = load2(&psi[I + d0 + d1 + d2 + d3 + d4]);
|
|
|
|
|
-
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0], (double*)&psi[I], add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1], (double*)&psi[I + d1], add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2], (double*)&psi[I + d2], add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2], (double*)&psi[I + d1 + d2], add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d3], (double*)&psi[I + d3], add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3], (double*)&psi[I + d1 + d3], add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3], (double*)&psi[I + d2 + d3], add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3], (double*)&psi[I + d1 + d2 + d3], add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d4], (double*)&psi[I + d4], add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d4], (double*)&psi[I + d1 + d4], add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d4], (double*)&psi[I + d2 + d4], add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d4], (double*)&psi[I + d1 + d2 + d4], add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d3 + d4], (double*)&psi[I + d3 + d4], add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d3 + d4], (double*)&psi[I + d1 + d3 + d4], add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d2 + d3 + d4], (double*)&psi[I + d2 + d3 + d4], add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507]))))));
|
|
|
|
|
- _mm256_storeu2_m128d((double*)&psi[I + d0 + d1 + d2 + d3 + d4], (double*)&psi[I + d1 + d2 + d3 + d4], add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511]))))));
|
|
|
|
|
+ v[0] = load2(psi + 2 * (I + d2 + d3 + d4));
|
|
|
|
|
+ v[1] = load2(psi + 2 * (I + d0 + d2 + d3 + d4));
|
|
|
|
|
+ v[2] = load2(psi + 2 * (I + d1 + d2 + d3 + d4));
|
|
|
|
|
+ v[3] = load2(psi + 2 * (I + d0 + d1 + d2 + d3 + d4));
|
|
|
|
|
+
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0), psi + 2 * I, add(tmp[0], add(mul(v[0], m[448], mt[448]), add(mul(v[1], m[449], mt[449]), add(mul(v[2], m[450], mt[450]), mul(v[3], m[451], mt[451]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1), psi + 2 * (I + d1), add(tmp[1], add(mul(v[0], m[452], mt[452]), add(mul(v[1], m[453], mt[453]), add(mul(v[2], m[454], mt[454]), mul(v[3], m[455], mt[455]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2), psi + 2 * (I + d2), add(tmp[2], add(mul(v[0], m[456], mt[456]), add(mul(v[1], m[457], mt[457]), add(mul(v[2], m[458], mt[458]), mul(v[3], m[459], mt[459]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2), psi + 2 * (I + d1 + d2), add(tmp[3], add(mul(v[0], m[460], mt[460]), add(mul(v[1], m[461], mt[461]), add(mul(v[2], m[462], mt[462]), mul(v[3], m[463], mt[463]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3), psi + 2 * (I + d3), add(tmp[4], add(mul(v[0], m[464], mt[464]), add(mul(v[1], m[465], mt[465]), add(mul(v[2], m[466], mt[466]), mul(v[3], m[467], mt[467]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3), psi + 2 * (I + d1 + d3), add(tmp[5], add(mul(v[0], m[468], mt[468]), add(mul(v[1], m[469], mt[469]), add(mul(v[2], m[470], mt[470]), mul(v[3], m[471], mt[471]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3), psi + 2 * (I + d2 + d3), add(tmp[6], add(mul(v[0], m[472], mt[472]), add(mul(v[1], m[473], mt[473]), add(mul(v[2], m[474], mt[474]), mul(v[3], m[475], mt[475]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3), psi + 2 * (I + d1 + d2 + d3), add(tmp[7], add(mul(v[0], m[476], mt[476]), add(mul(v[1], m[477], mt[477]), add(mul(v[2], m[478], mt[478]), mul(v[3], m[479], mt[479]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d4), psi + 2 * (I + d4), add(tmp[8], add(mul(v[0], m[480], mt[480]), add(mul(v[1], m[481], mt[481]), add(mul(v[2], m[482], mt[482]), mul(v[3], m[483], mt[483]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d4), psi + 2 * (I + d1 + d4), add(tmp[9], add(mul(v[0], m[484], mt[484]), add(mul(v[1], m[485], mt[485]), add(mul(v[2], m[486], mt[486]), mul(v[3], m[487], mt[487]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d4), psi + 2 * (I + d2 + d4), add(tmp[10], add(mul(v[0], m[488], mt[488]), add(mul(v[1], m[489], mt[489]), add(mul(v[2], m[490], mt[490]), mul(v[3], m[491], mt[491]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d4), psi + 2 * (I + d1 + d2 + d4), add(tmp[11], add(mul(v[0], m[492], mt[492]), add(mul(v[1], m[493], mt[493]), add(mul(v[2], m[494], mt[494]), mul(v[3], m[495], mt[495]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d3 + d4), psi + 2 * (I + d3 + d4), add(tmp[12], add(mul(v[0], m[496], mt[496]), add(mul(v[1], m[497], mt[497]), add(mul(v[2], m[498], mt[498]), mul(v[3], m[499], mt[499]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d3 + d4), psi + 2 * (I + d1 + d3 + d4), add(tmp[13], add(mul(v[0], m[500], mt[500]), add(mul(v[1], m[501], mt[501]), add(mul(v[2], m[502], mt[502]), mul(v[3], m[503], mt[503]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d2 + d3 + d4), psi + 2 * (I + d2 + d3 + d4), add(tmp[14], add(mul(v[0], m[504], mt[504]), add(mul(v[1], m[505], mt[505]), add(mul(v[2], m[506], mt[506]), mul(v[3], m[507], mt[507]))))));
|
|
|
|
|
+ _mm256_storeu2_m128d(psi + 2 * (I + d0 + d1 + d2 + d3 + d4), psi + 2 * (I + d1 + d2 + d3 + d4), add(tmp[15], add(mul(v[0], m[508], mt[508]), add(mul(v[1], m[509], mt[509]), add(mul(v[2], m[510], mt[510]), mul(v[3], m[511], mt[511]))))));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// bit indices id[.] are given from high to low (e.g. control first for CNOT)
|
|
|
|
|
template <class V, class M>
|
|
|
|
|
-void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask)
|
|
|
|
|
+void kernel(V &psi, unsigned id4, unsigned id3, unsigned id2, unsigned id1, unsigned id0, M const& m, std::size_t ctrlmask, unsigned len)
|
|
|
|
|
{
|
|
|
|
|
- std::size_t n = psi.size();
|
|
|
|
|
+ std::size_t n = len;
|
|
|
|
|
std::size_t d0 = 1UL << id0;
|
|
|
|
|
std::size_t d1 = 1UL << id1;
|
|
|
|
|
std::size_t d2 = 1UL << id2;
|
|
|
|
|
diff --color -aur ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp
|
|
|
|
|
--- ProjectQ-0.5.1/projectq/backends/_sim/_cppkernels/simulator.hpp 2020-06-05 21:07:57.000000000 +0800
|
|
|
|
|
+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp 2021-01-14 10:52:24.822039389 +0800
|
|
|
|
|
@@ -33,7 +33,6 @@
|
|
|
|
|
+++ ProjectQ-0.5.1_new/projectq/backends/_sim/_cppkernels/simulator.hpp 2021-04-20 11:46:27.115554725 +0800
|
|
|
|
|
@@ -18,11 +18,7 @@
|
|
|
|
|
#include <vector>
|
|
|
|
|
#include <complex>
|
|
|
|
|
|
|
|
|
|
-#if defined(NOINTRIN) || !defined(INTRIN)
|
|
|
|
|
-#include "nointrin/kernels.hpp"
|
|
|
|
|
-#else
|
|
|
|
|
#include "intrin/kernels.hpp"
|
|
|
|
|
-#endif
|
|
|
|
|
|
|
|
|
|
#include "intrin/alignedallocator.hpp"
|
|
|
|
|
#include "fusion.hpp"
|
|
|
|
|
@@ -32,173 +28,29 @@
|
|
|
|
|
#include <tuple>
|
|
|
|
|
#include <random>
|
|
|
|
|
#include <functional>
|
|
|
|
|
|
|
|
|
|
-
|
|
|
|
|
+#include <cstring>
|
|
|
|
|
|
|
|
|
|
class Simulator{
|
|
|
|
|
public:
|
|
|
|
|
using calc_type = double;
|
|
|
|
|
@@ -44,8 +43,9 @@ public:
|
|
|
|
|
using complex_type = std::complex<calc_type>;
|
|
|
|
|
- using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512>>;
|
|
|
|
|
+ using StateVector = calc_type *;
|
|
|
|
|
using Map = std::map<unsigned, unsigned>;
|
|
|
|
|
using RndEngine = std::mt19937;
|
|
|
|
|
using Term = std::vector<std::pair<unsigned, char>>;
|
|
|
|
|
using TermsDict = std::vector<std::pair<Term, calc_type>>;
|
|
|
|
|
using ComplexTermsDict = std::vector<std::pair<Term, complex_type>>;
|
|
|
|
|
+ StateVector vec_;
|
|
|
|
|
|
|
|
|
|
- Simulator(unsigned seed = 1) : N_(0), vec_(1,0.), fusion_qubits_min_(4),
|
|
|
|
|
+ Simulator(unsigned seed = 1) : vec_(1,0.), N_(0), fusion_qubits_min_(4),
|
|
|
|
|
+ Simulator(unsigned seed = 1, unsigned N = 0) : N_(N), fusion_qubits_min_(4),
|
|
|
|
|
fusion_qubits_max_(5), rnd_eng_(seed) {
|
|
|
|
|
+ len_ = 1UL << (N_ + 1);
|
|
|
|
|
+ vec_ = (StateVector)calloc(len_, sizeof(calc_type));
|
|
|
|
|
vec_[0]=1.; // all-zero initial state
|
|
|
|
|
std::uniform_real_distribution<double> dist(0., 1.);
|
|
|
|
|
@@ -562,7 +562,6 @@ private:
|
|
|
|
|
rng_ = std::bind(dist, std::ref(rnd_eng_));
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- void allocate_qubit(unsigned id){
|
|
|
|
|
- if (map_.count(id) == 0){
|
|
|
|
|
- map_[id] = N_++;
|
|
|
|
|
- StateVector newvec; // avoid large memory allocations
|
|
|
|
|
- if( tmpBuff1_.capacity() >= (1UL << N_) )
|
|
|
|
|
- std::swap(newvec, tmpBuff1_);
|
|
|
|
|
- newvec.resize(1UL << N_);
|
|
|
|
|
-#pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < newvec.size(); ++i)
|
|
|
|
|
- newvec[i] = (i < vec_.size())?vec_[i]:0.;
|
|
|
|
|
- std::swap(vec_, newvec);
|
|
|
|
|
- // recycle large memory
|
|
|
|
|
- std::swap(tmpBuff1_, newvec);
|
|
|
|
|
- if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
|
|
|
|
|
- std::swap(tmpBuff1_, tmpBuff2_);
|
|
|
|
|
- }
|
|
|
|
|
- else
|
|
|
|
|
- throw(std::runtime_error(
|
|
|
|
|
- "AllocateQubit: ID already exists. Qubit IDs should be unique."));
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- bool get_classical_value(unsigned id, calc_type tol = 1.e-12){
|
|
|
|
|
- run();
|
|
|
|
|
- unsigned pos = map_[id];
|
|
|
|
|
- std::size_t delta = (1UL << pos);
|
|
|
|
|
-
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
|
|
|
|
|
- for (std::size_t j = 0; j < delta; ++j){
|
|
|
|
|
- if (std::norm(vec_[i+j]) > tol)
|
|
|
|
|
- return false;
|
|
|
|
|
- if (std::norm(vec_[i+j+delta]) > tol)
|
|
|
|
|
- return true;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- assert(false); // this will never happen
|
|
|
|
|
- return false; // suppress 'control reaches end of non-void...'
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- bool is_classical(unsigned id, calc_type tol = 1.e-12){
|
|
|
|
|
- run();
|
|
|
|
|
- unsigned pos = map_[id];
|
|
|
|
|
- std::size_t delta = (1UL << pos);
|
|
|
|
|
-
|
|
|
|
|
- short up = 0, down = 0;
|
|
|
|
|
- #pragma omp parallel for schedule(static) reduction(|:up,down)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
|
|
|
|
|
- for (std::size_t j = 0; j < delta; ++j){
|
|
|
|
|
- up = up | ((std::norm(vec_[i+j]) > tol)&1);
|
|
|
|
|
- down = down | ((std::norm(vec_[i+j+delta]) > tol)&1);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- return 1 == (up^down);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- void collapse_vector(unsigned id, bool value = false, bool shrink = false){
|
|
|
|
|
- run();
|
|
|
|
|
- unsigned pos = map_[id];
|
|
|
|
|
- std::size_t delta = (1UL << pos);
|
|
|
|
|
-
|
|
|
|
|
- if (!shrink){
|
|
|
|
|
- #pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); i += 2*delta){
|
|
|
|
|
- for (std::size_t j = 0; j < delta; ++j)
|
|
|
|
|
- vec_[i+j+static_cast<std::size_t>(!value)*delta] = 0.;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- else{
|
|
|
|
|
- StateVector newvec; // avoid costly memory reallocations
|
|
|
|
|
- if( tmpBuff1_.capacity() >= (1UL << (N_-1)) )
|
|
|
|
|
- std::swap(tmpBuff1_, newvec);
|
|
|
|
|
- newvec.resize((1UL << (N_-1)));
|
|
|
|
|
- #pragma omp parallel for schedule(static) if(0)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
|
|
|
|
|
- std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
|
|
|
|
|
- delta, &newvec[i/2]);
|
|
|
|
|
- std::swap(vec_, newvec);
|
|
|
|
|
- std::swap(tmpBuff1_, newvec);
|
|
|
|
|
- if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
|
|
|
|
|
- std::swap(tmpBuff1_, tmpBuff2_);
|
|
|
|
|
-
|
|
|
|
|
- for (auto& p : map_){
|
|
|
|
|
- if (p.second > pos)
|
|
|
|
|
- p.second--;
|
|
|
|
|
- }
|
|
|
|
|
- map_.erase(id);
|
|
|
|
|
- N_--;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- void measure_qubits(std::vector<unsigned> const& ids, std::vector<bool> &res){
|
|
|
|
|
- run();
|
|
|
|
|
-
|
|
|
|
|
- std::vector<unsigned> positions(ids.size());
|
|
|
|
|
- for (unsigned i = 0; i < ids.size(); ++i)
|
|
|
|
|
- positions[i] = map_[ids[i]];
|
|
|
|
|
-
|
|
|
|
|
- calc_type P = 0.;
|
|
|
|
|
- calc_type rnd = rng_();
|
|
|
|
|
-
|
|
|
|
|
- // pick entry at random with probability |entry|^2
|
|
|
|
|
- std::size_t pick = 0;
|
|
|
|
|
- while (P < rnd && pick < vec_.size())
|
|
|
|
|
- P += std::norm(vec_[pick++]);
|
|
|
|
|
-
|
|
|
|
|
- pick--;
|
|
|
|
|
- // determine result vector (boolean values for each qubit)
|
|
|
|
|
- // and create mask to detect bad entries (i.e., entries that don't agree with measurement)
|
|
|
|
|
- res = std::vector<bool>(ids.size());
|
|
|
|
|
- std::size_t mask = 0;
|
|
|
|
|
- std::size_t val = 0;
|
|
|
|
|
- for (unsigned i = 0; i < ids.size(); ++i){
|
|
|
|
|
- bool r = ((pick >> positions[i]) & 1) == 1;
|
|
|
|
|
- res[i] = r;
|
|
|
|
|
- mask |= (1UL << positions[i]);
|
|
|
|
|
- val |= (static_cast<std::size_t>(r&1) << positions[i]);
|
|
|
|
|
- }
|
|
|
|
|
- // set bad entries to 0
|
|
|
|
|
- calc_type N = 0.;
|
|
|
|
|
- #pragma omp parallel for reduction(+:N) schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i){
|
|
|
|
|
- if ((i & mask) != val)
|
|
|
|
|
- vec_[i] = 0.;
|
|
|
|
|
- else
|
|
|
|
|
- N += std::norm(vec_[i]);
|
|
|
|
|
- }
|
|
|
|
|
- // re-normalize
|
|
|
|
|
- N = 1./std::sqrt(N);
|
|
|
|
|
- #pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i)
|
|
|
|
|
- vec_[i] *= N;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- std::vector<bool> measure_qubits_return(std::vector<unsigned> const& ids){
|
|
|
|
|
- std::vector<bool> ret;
|
|
|
|
|
- measure_qubits(ids, ret);
|
|
|
|
|
- return ret;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- void deallocate_qubit(unsigned id){
|
|
|
|
|
- run();
|
|
|
|
|
- assert(map_.count(id) == 1);
|
|
|
|
|
- if (!is_classical(id))
|
|
|
|
|
- throw(std::runtime_error("Error: Qubit has not been measured / uncomputed! There is most likely a bug in your code."));
|
|
|
|
|
-
|
|
|
|
|
- bool value = get_classical_value(id);
|
|
|
|
|
- collapse_vector(id, value, true);
|
|
|
|
|
+ for (unsigned i = 0; i < N_; i++)
|
|
|
|
|
+ map_[i] = i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class M>
|
|
|
|
|
@@ -221,84 +73,13 @@
|
|
|
|
|
fused_gates_ = fused_gates;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
- template <class F, class QuReg>
|
|
|
|
|
- void emulate_math(F const& f, QuReg quregs, const std::vector<unsigned>& ctrl,
|
|
|
|
|
- bool parallelize = false){
|
|
|
|
|
- run();
|
|
|
|
|
- auto ctrlmask = get_control_mask(ctrl);
|
|
|
|
|
-
|
|
|
|
|
- for (unsigned i = 0; i < quregs.size(); ++i)
|
|
|
|
|
- for (unsigned j = 0; j < quregs[i].size(); ++j)
|
|
|
|
|
- quregs[i][j] = map_[quregs[i][j]];
|
|
|
|
|
-
|
|
|
|
|
- StateVector newvec; // avoid costly memory reallocations
|
|
|
|
|
- if( tmpBuff1_.capacity() >= vec_.size() )
|
|
|
|
|
- std::swap(newvec, tmpBuff1_);
|
|
|
|
|
- newvec.resize(vec_.size());
|
|
|
|
|
-#pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); i++)
|
|
|
|
|
- newvec[i] = 0;
|
|
|
|
|
-
|
|
|
|
|
-//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
|
|
|
|
|
- {
|
|
|
|
|
- std::vector<int> res(quregs.size());
|
|
|
|
|
- //#pragma omp for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i){
|
|
|
|
|
- if ((ctrlmask&i) == ctrlmask){
|
|
|
|
|
- for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
|
|
|
|
|
- res[qr_i] = 0;
|
|
|
|
|
- for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
|
|
|
|
|
- res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
|
|
|
|
|
- }
|
|
|
|
|
- f(res);
|
|
|
|
|
- auto new_i = i;
|
|
|
|
|
- for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
|
|
|
|
|
- for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
|
|
|
|
|
- if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
|
|
|
|
|
- new_i ^= (1UL << quregs[qr_i][qb_i]);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- newvec[new_i] += vec_[i];
|
|
|
|
|
- }
|
|
|
|
|
- else
|
|
|
|
|
- newvec[i] += vec_[i];
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- std::swap(vec_, newvec);
|
|
|
|
|
- std::swap(tmpBuff1_, newvec);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // faster version without calling python
|
|
|
|
|
- template<class QuReg>
|
|
|
|
|
- inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector<unsigned>& ctrl)
|
|
|
|
|
- {
|
|
|
|
|
- emulate_math([a](std::vector<int> &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // faster version without calling python
|
|
|
|
|
- template<class QuReg>
|
|
|
|
|
- inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
|
|
|
|
|
- {
|
|
|
|
|
- emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // faster version without calling python
|
|
|
|
|
- template<class QuReg>
|
|
|
|
|
- inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
|
|
|
|
|
- {
|
|
|
|
|
- emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
calc_type get_expectation_value(TermsDict const& td, std::vector<unsigned> const& ids){
|
|
|
|
|
run();
|
|
|
|
|
calc_type expectation = 0.;
|
|
|
|
|
|
|
|
|
|
- StateVector current_state; // avoid costly memory reallocations
|
|
|
|
|
- if( tmpBuff1_.capacity() >= vec_.size() )
|
|
|
|
|
- std::swap(tmpBuff1_, current_state);
|
|
|
|
|
- current_state.resize(vec_.size());
|
|
|
|
|
+ StateVector current_state = (StateVector)malloc(len_ *sizeof(calc_type));
|
|
|
|
|
#pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i)
|
|
|
|
|
+ for (std::size_t i = 0; i < len_; ++i)
|
|
|
|
|
current_state[i] = vec_[i];
|
|
|
|
|
|
|
|
|
|
for (auto const& term : td){
|
|
|
|
|
@@ -306,81 +87,53 @@
|
|
|
|
|
apply_term(term.first, ids, {});
|
|
|
|
|
calc_type delta = 0.;
|
|
|
|
|
#pragma omp parallel for reduction(+:delta) schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i){
|
|
|
|
|
- auto const a1 = std::real(current_state[i]);
|
|
|
|
|
- auto const b1 = -std::imag(current_state[i]);
|
|
|
|
|
- auto const a2 = std::real(vec_[i]);
|
|
|
|
|
- auto const b2 = std::imag(vec_[i]);
|
|
|
|
|
+ for (std::size_t i = 0; i < (len_ >> 1); ++i){
|
|
|
|
|
+ auto const a1 = current_state[2 * i];
|
|
|
|
|
+ auto const b1 = -current_state[2 * i + 1];
|
|
|
|
|
+ auto const a2 = vec_[2 * i];
|
|
|
|
|
+ auto const b2 = vec_[2 * i + 1];
|
|
|
|
|
delta += a1 * a2 - b1 * b2;
|
|
|
|
|
// reset vec_
|
|
|
|
|
- vec_[i] = current_state[i];
|
|
|
|
|
+ vec_[2 * i] = current_state[2 * i];
|
|
|
|
|
+ vec_[2 * i + 1] = current_state[2 * i + 1];
|
|
|
|
|
}
|
|
|
|
|
expectation += coefficient * delta;
|
|
|
|
|
}
|
|
|
|
|
- std::swap(current_state, tmpBuff1_);
|
|
|
|
|
+ if (NULL != current_state){
|
|
|
|
|
+ free(current_state);
|
|
|
|
|
+ current_state = NULL;
|
|
|
|
|
+ }
|
|
|
|
|
return expectation;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void apply_qubit_operator(ComplexTermsDict const& td, std::vector<unsigned> const& ids){
|
|
|
|
|
run();
|
|
|
|
|
- StateVector new_state, current_state; // avoid costly memory reallocations
|
|
|
|
|
- if( tmpBuff1_.capacity() >= vec_.size() )
|
|
|
|
|
- std::swap(tmpBuff1_, new_state);
|
|
|
|
|
- if( tmpBuff2_.capacity() >= vec_.size() )
|
|
|
|
|
- std::swap(tmpBuff2_, current_state);
|
|
|
|
|
- new_state.resize(vec_.size());
|
|
|
|
|
- current_state.resize(vec_.size());
|
|
|
|
|
+ StateVector new_state = (StateVector)calloc(len_, sizeof(calc_type));
|
|
|
|
|
+ StateVector current_state = (StateVector)malloc(len_ * sizeof(calc_type));
|
|
|
|
|
#pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i){
|
|
|
|
|
- new_state[i] = 0;
|
|
|
|
|
+ for (std::size_t i = 0; i < len_; ++i){
|
|
|
|
|
current_state[i] = vec_[i];
|
|
|
|
|
}
|
|
|
|
|
for (auto const& term : td){
|
|
|
|
|
auto const& coefficient = term.second;
|
|
|
|
|
apply_term(term.first, ids, {});
|
|
|
|
|
#pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i){
|
|
|
|
|
- new_state[i] += coefficient * vec_[i];
|
|
|
|
|
- vec_[i] = current_state[i];
|
|
|
|
|
+ for (std::size_t i = 0; i < (len_ >> 1); ++i){
|
|
|
|
|
+ new_state[2 * i] += coefficient.real() * vec_[2 * i] - coefficient.imag() * vec_[2 * i + 1];
|
|
|
|
|
+ new_state[2 * i + 1] += coefficient.real() * vec_[2 * i + 1] + coefficient.imag() * vec_[2 * i];
|
|
|
|
|
+ vec_[2 * i] = current_state[2 * i];
|
|
|
|
|
+ vec_[2 * i + 1] = current_state[2 * i + 1];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
- std::swap(vec_, new_state);
|
|
|
|
|
- std::swap(tmpBuff1_, new_state);
|
|
|
|
|
- std::swap(tmpBuff2_, current_state);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- calc_type get_probability(std::vector<bool> const& bit_string,
|
|
|
|
|
- std::vector<unsigned> const& ids){
|
|
|
|
|
- run();
|
|
|
|
|
- if (!check_ids(ids))
|
|
|
|
|
- throw(std::runtime_error("get_probability(): Unknown qubit id. Please make sure you have called eng.flush()."));
|
|
|
|
|
- std::size_t mask = 0, bit_str = 0;
|
|
|
|
|
- for (unsigned i = 0; i < ids.size(); ++i){
|
|
|
|
|
- mask |= 1UL << map_[ids[i]];
|
|
|
|
|
- bit_str |= (bit_string[i]?1UL:0UL) << map_[ids[i]];
|
|
|
|
|
- }
|
|
|
|
|
- calc_type probability = 0.;
|
|
|
|
|
- #pragma omp parallel for reduction(+:probability) schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i)
|
|
|
|
|
- if ((i & mask) == bit_str)
|
|
|
|
|
- probability += std::norm(vec_[i]);
|
|
|
|
|
- return probability;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- complex_type const& get_amplitude(std::vector<bool> const& bit_string,
|
|
|
|
|
- std::vector<unsigned> const& ids){
|
|
|
|
|
- run();
|
|
|
|
|
- std::size_t chk = 0;
|
|
|
|
|
- std::size_t index = 0;
|
|
|
|
|
- for (unsigned i = 0; i < ids.size(); ++i){
|
|
|
|
|
- if (map_.count(ids[i]) == 0)
|
|
|
|
|
- break;
|
|
|
|
|
- chk |= 1UL << map_[ids[i]];
|
|
|
|
|
- index |= (bit_string[i]?1UL:0UL) << map_[ids[i]];
|
|
|
|
|
+ if (NULL != vec_)
|
|
|
|
|
+ free(vec_);
|
|
|
|
|
+ vec_ = new_state;
|
|
|
|
|
+ if (NULL != new_state)
|
|
|
|
|
+ new_state = NULL;
|
|
|
|
|
+ if (NULL != current_state){
|
|
|
|
|
+ free(current_state);
|
|
|
|
|
+ current_state = NULL;
|
|
|
|
|
}
|
|
|
|
|
- if (chk + 1 != vec_.size())
|
|
|
|
|
- throw(std::runtime_error("The second argument to get_amplitude() must be a permutation of all allocated qubits. Please make sure you have called eng.flush()."));
|
|
|
|
|
- return vec_[index];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void emulate_time_evolution(TermsDict const& tdict, calc_type const& time,
|
|
|
|
|
@@ -400,87 +153,71 @@
|
|
|
|
|
}
|
|
|
|
|
unsigned s = std::abs(time) * op_nrm + 1.;
|
|
|
|
|
complex_type correction = std::exp(-time * I * tr / (double)s);
|
|
|
|
|
- auto output_state = vec_;
|
|
|
|
|
+ auto output_state = copy(vec_, len_);
|
|
|
|
|
auto ctrlmask = get_control_mask(ctrl);
|
|
|
|
|
for (unsigned i = 0; i < s; ++i){
|
|
|
|
|
calc_type nrm_change = 1.;
|
|
|
|
|
for (unsigned k = 0; nrm_change > 1.e-12; ++k){
|
|
|
|
|
auto coeff = (-time * I) / double(s * (k + 1));
|
|
|
|
|
- auto current_state = vec_;
|
|
|
|
|
- auto update = StateVector(vec_.size(), 0.);
|
|
|
|
|
+ auto current_state = copy(vec_, len_);
|
|
|
|
|
+ auto update = (StateVector)calloc(len_, sizeof(calc_type));
|
|
|
|
|
for (auto const& tup : td){
|
|
|
|
|
apply_term(tup.first, ids, {});
|
|
|
|
|
#pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t j = 0; j < vec_.size(); ++j){
|
|
|
|
|
+ for (std::size_t j = 0; j < len_; ++j){
|
|
|
|
|
update[j] += vec_[j] * tup.second;
|
|
|
|
|
vec_[j] = current_state[j];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
nrm_change = 0.;
|
|
|
|
|
#pragma omp parallel for reduction(+:nrm_change) schedule(static)
|
|
|
|
|
- for (std::size_t j = 0; j < vec_.size(); ++j){
|
|
|
|
|
- update[j] *= coeff;
|
|
|
|
|
- vec_[j] = update[j];
|
|
|
|
|
+ for (std::size_t j = 0; j < (len_ >> 1); ++j){
|
|
|
|
|
+ complex_type tmp(update[2 * j], update[2 * j + 1]);
|
|
|
|
|
+ tmp *= coeff;
|
|
|
|
|
+ update[2 * j] *= std::real(tmp);
|
|
|
|
|
+ update[2 * j + 1] *= std::imag(tmp);
|
|
|
|
|
+ vec_[2 * j] = update[2 * j];
|
|
|
|
|
+ vec_[2 * j + 1] = update[2 * j + 1];
|
|
|
|
|
if ((j & ctrlmask) == ctrlmask){
|
|
|
|
|
- output_state[j] += update[j];
|
|
|
|
|
- nrm_change += std::norm(update[j]);
|
|
|
|
|
+ output_state[2 * j] += update[2 * j];
|
|
|
|
|
+ output_state[2 * j + 1] += update[2 * j + 1];
|
|
|
|
|
+ nrm_change += std::sqrt(update[2 * j] * update[2 * j] + update[2 * j + 1] * update[2 * j + 1]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
nrm_change = std::sqrt(nrm_change);
|
|
|
|
|
+ if (NULL != current_state){
|
|
|
|
|
+ free(current_state);
|
|
|
|
|
+ current_state = NULL;
|
|
|
|
|
+ }
|
|
|
|
|
+ if (NULL != update){
|
|
|
|
|
+ free(update);
|
|
|
|
|
+ update = NULL;
|
|
|
|
|
+ }
|
|
|
|
|
}
|
|
|
|
|
#pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t j = 0; j < vec_.size(); ++j){
|
|
|
|
|
- if ((j & ctrlmask) == ctrlmask)
|
|
|
|
|
- output_state[j] *= correction;
|
|
|
|
|
- vec_[j] = output_state[j];
|
|
|
|
|
+ for (std::size_t j = 0; j < (len_ >>1); ++j){
|
|
|
|
|
+ if ((j & ctrlmask) == ctrlmask){
|
|
|
|
|
+ complex_type tmp(output_state[2 * j], output_state[2 * j + 1]);
|
|
|
|
|
+ tmp *= correction;
|
|
|
|
|
+ output_state[2 * j] = std::real(tmp);
|
|
|
|
|
+ output_state[2 * j + 1] =std::imag(tmp);
|
|
|
|
|
+ }
|
|
|
|
|
+ vec_[2 * j] = output_state[2 * j];
|
|
|
|
|
+ vec_[2 * j + 1] = output_state[2 * j + 1];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
+ if (NULL != output_state){
|
|
|
|
|
+ free(output_state);
|
|
|
|
|
+ output_state = NULL;
|
|
|
|
|
+ }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void set_wavefunction(StateVector const& wavefunction, std::vector<unsigned> const& ordering){
|
|
|
|
|
run();
|
|
|
|
|
- // make sure there are 2^n amplitudes for n qubits
|
|
|
|
|
- assert(wavefunction.size() == (1UL << ordering.size()));
|
|
|
|
|
- // check that all qubits have been allocated previously
|
|
|
|
|
- if (map_.size() != ordering.size() || !check_ids(ordering))
|
|
|
|
|
- throw(std::runtime_error("set_wavefunction(): Invalid mapping provided. Please make sure all qubits have been allocated previously (call eng.flush())."));
|
|
|
|
|
-
|
|
|
|
|
- // set mapping and wavefunction
|
|
|
|
|
- for (unsigned i = 0; i < ordering.size(); ++i)
|
|
|
|
|
- map_[ordering[i]] = i;
|
|
|
|
|
- #pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < wavefunction.size(); ++i)
|
|
|
|
|
- vec_[i] = wavefunction[i];
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- void collapse_wavefunction(std::vector<unsigned> const& ids, std::vector<bool> const& values){
|
|
|
|
|
- run();
|
|
|
|
|
- assert(ids.size() == values.size());
|
|
|
|
|
- if (!check_ids(ids))
|
|
|
|
|
- throw(std::runtime_error("collapse_wavefunction(): Unknown qubit id(s) provided. Try calling eng.flush() before invoking this function."));
|
|
|
|
|
- std::size_t mask = 0, val = 0;
|
|
|
|
|
- for (unsigned i = 0; i < ids.size(); ++i){
|
|
|
|
|
- mask |= (1UL << map_[ids[i]]);
|
|
|
|
|
- val |= ((values[i]?1UL:0UL) << map_[ids[i]]);
|
|
|
|
|
- }
|
|
|
|
|
- // set bad entries to 0 and compute probability of outcome to renormalize
|
|
|
|
|
- calc_type N = 0.;
|
|
|
|
|
- #pragma omp parallel for reduction(+:N) schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i){
|
|
|
|
|
- if ((i & mask) == val)
|
|
|
|
|
- N += std::norm(vec_[i]);
|
|
|
|
|
- }
|
|
|
|
|
- if (N < 1.e-12)
|
|
|
|
|
- throw(std::runtime_error("collapse_wavefunction(): Invalid collapse! Probability is ~0."));
|
|
|
|
|
- // re-normalize (if possible)
|
|
|
|
|
- N = 1./std::sqrt(N);
|
|
|
|
|
- #pragma omp parallel for schedule(static)
|
|
|
|
|
- for (std::size_t i = 0; i < vec_.size(); ++i){
|
|
|
|
|
- if ((i & mask) != val)
|
|
|
|
|
- vec_[i] = 0.;
|
|
|
|
|
- else
|
|
|
|
|
- vec_[i] *= N;
|
|
|
|
|
+ if (NULL != vec_){
|
|
|
|
|
+ free(vec_);
|
|
|
|
|
}
|
|
|
|
|
+ vec_ = copy(wavefunction, len_);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void run(){
|
|
|
|
|
@@ -500,23 +237,23 @@
|
|
|
|
|
switch (ids.size()){
|
|
|
|
|
case 1:
|
|
|
|
|
#pragma omp parallel
|
|
|
|
|
- kernel(vec_, ids[0], m, ctrlmask);
|
|
|
|
|
+ kernel(vec_, ids[0], m, ctrlmask, len_ >> 1);
|
|
|
|
|
break;
|
|
|
|
|
case 2:
|
|
|
|
|
#pragma omp parallel
|
|
|
|
|
- kernel(vec_, ids[1], ids[0], m, ctrlmask);
|
|
|
|
|
+ kernel(vec_, ids[1], ids[0], m, ctrlmask, len_ >> 1);
|
|
|
|
|
break;
|
|
|
|
|
case 3:
|
|
|
|
|
#pragma omp parallel
|
|
|
|
|
- kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask);
|
|
|
|
|
+ kernel(vec_, ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1);
|
|
|
|
|
break;
|
|
|
|
|
case 4:
|
|
|
|
|
#pragma omp parallel
|
|
|
|
|
- kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
|
|
|
|
|
+ kernel(vec_, ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1);
|
|
|
|
|
break;
|
|
|
|
|
case 5:
|
|
|
|
|
#pragma omp parallel
|
|
|
|
|
- kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
|
|
|
|
|
+ kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask, len_ >> 1);
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
|
|
|
|
|
@@ -525,12 +262,27 @@
|
|
|
|
|
fused_gates_ = Fusion();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
- std::tuple<Map, StateVector&> cheat(){
|
|
|
|
|
+ std::vector<complex_type> cheat(){
|
|
|
|
|
run();
|
|
|
|
|
- return make_tuple(map_, std::ref(vec_));
|
|
|
|
|
+ std::vector<complex_type> result;
|
|
|
|
|
+ for (unsigned int i = 0; i < (len_ >> 1); i++){
|
|
|
|
|
+ result.push_back({vec_[2 * i], vec_[2 * i + 1]});
|
|
|
|
|
+ }
|
|
|
|
|
+ return result;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ inline StateVector copy(StateVector source, unsigned len){
|
|
|
|
|
+ StateVector result = (StateVector)malloc(len * sizeof(calc_type));
|
|
|
|
|
+#pragma omp parallel for schedule(static)
|
|
|
|
|
+ for (std::size_t i = 0; i < len; ++i) {
|
|
|
|
|
+ result[i] = source[i];
|
|
|
|
|
}
|
|
|
|
|
+ return result;
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
|
|
~Simulator(){
|
|
|
|
|
+ if (NULL != vec_)
|
|
|
|
|
+ free(vec_);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
@@ -562,18 +314,13 @@
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned N_; // #qubits
|
|
|
|
|
- StateVector vec_;
|
|
|
|
|
+ unsigned len_;
|
|
|
|
|
Map map_;
|
|
|
|
|
Fusion fused_gates_;
|
|
|
|
|
unsigned fusion_qubits_min_, fusion_qubits_max_;
|
|
|
|
|
@@ -570,10 +569,8 @@ private:
|
|
|
|
|
RndEngine rnd_eng_;
|
|
|
|
|
std::function<double()> rng_;
|
|
|
|
|
|
|
|
|
|
// large array buffers to avoid costly reallocations
|
|
|
|
|
-
|
|
|
|
|
- // large array buffers to avoid costly reallocations
|
|
|
|
|
- static StateVector tmpBuff1_, tmpBuff2_;
|
|
|
|
|
+ StateVector tmpBuff1_, tmpBuff2_;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
-Simulator::StateVector Simulator::tmpBuff1_;
|
|
|
|
|