OpenCloudOS-Kernel/lib/raid6/neon.uc

/* -----------------------------------------------------------------------
 *
 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
 *
 *   Copyright (C) 2012 Rob Herring
 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 *   Based on altivec.uc:
 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * neon$#.c
 *
 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 *
 * This file is postprocessed using unroll.awk
 */

#include <arm_neon.h>

/* Native working unit: one NEON vector of sixteen unsigned bytes. */
typedef uint8x16_t unative_t;

/* NBYTES(x): vector constant holding the value x in each of its 16 byte lanes. */
#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
/* NSIZE: size in bytes of one unative_t vector. */
#define NSIZE	sizeof(unative_t)

/*
 * SHLBYTE() moves every byte of the vector one bit to the left.
 * Each byte lane is shifted on its own: the bit that falls off the
 * top of a byte is dropped and never carried into the neighbouring
 * byte.
 */
static inline unative_t SHLBYTE(unative_t v)
{
	const unative_t shifted = vshlq_n_u8(v, 1);

	return shifted;
}

/*
 * MASK() turns every byte into an all-or-nothing flag: a byte whose
 * top bit is set becomes 0xFF, a byte whose top bit is clear becomes
 * 0x00.
 */
static inline unative_t MASK(unative_t v)
{
	int8x16_t lanes = (int8x16_t)v;

	/* signed shift right by 7 smears each byte's top bit over the byte */
	lanes = vshrq_n_s8(lanes, 7);
	return (unative_t)lanes;
}

/*
 * PMUL() multiplies the corresponding bytes of v and u as polynomials
 * (vmulq_p8), producing one result byte per byte lane.
 */
static inline unative_t PMUL(unative_t v, unative_t u)
{
	const poly8x16_t lhs = (poly8x16_t)v;
	const poly8x16_t rhs = (poly8x16_t)u;

	return (unative_t)vmulq_p8(lhs, rhs);
}

/*
 * Compute the P (XOR parity) and Q (RS syndrome) blocks from scratch.
 *
 * disks: number of block pointers in ptrs.  ptrs[0] .. ptrs[disks-3]
 *        are read as data, ptrs[disks-2] receives P and ptrs[disks-1]
 *        receives Q.
 * bytes: length of each block in bytes.  Every outer iteration handles
 *        NSIZE*$# bytes, so bytes is expected to be a multiple of that;
 *        otherwise the last iteration runs past the end of the blocks.
 * ptrs:  array of block pointers, accessed here as byte arrays.
 *
 * Previous contents of P and Q are not read; both are overwritten.
 *
 * NOTE(review): this is template source for unroll.awk.  Presumably
 * every line carrying the double-dollar lane marker is emitted once per
 * unrolled lane, so keep that marker out of comment lines and keep one
 * statement per line - confirm against unroll.awk.
 * NOTE(review): d is an int but is compared against the unsigned long
 * bytes; assumes block sizes stay far below INT_MAX - confirm.
 */
void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	int d, z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	/* 0x1d in every byte: XORed in below whenever a byte's top bit is shifted out */
	const unative_t x1d = NBYTES(0x1d);

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		/* Seed both accumulators with the highest data disk */
		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		/* Fold in the remaining data disks, from high index to low */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			/* P accumulates the plain XOR of all data */
			wp$$ = veorq_u8(wp$$, wd$$);
			/*
			 * Q = Q * x + data, byte by byte: shift each byte
			 * left by one, XOR 0x1d into the bytes whose top bit
			 * was set before the shift, then XOR in the data.
			 */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		/* Write the finished parity and syndrome for this chunk */
		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}

/*
 * Update existing P and Q blocks in place by XORing in the contribution
 * of the data blocks ptrs[start] .. ptrs[stop].
 *
 * disks: number of block pointers in ptrs; P is ptrs[disks-2] and Q is
 *        ptrs[disks-1].
 * start: lowest data disk index whose block is read.
 * stop:  highest data disk index whose block is read.
 * bytes: length of each block in bytes.  Every outer iteration handles
 *        NSIZE*$# bytes, so bytes is expected to be a multiple of that;
 *        otherwise the last iteration runs past the end of the blocks.
 * ptrs:  array of block pointers, accessed here as byte arrays.
 *
 * Disks above stop are skipped entirely.  Disks below start are not
 * read either, but each of them still costs one multiplication by x of
 * the running Q value, which the code batches up to four at a time.
 *
 * NOTE(review): presumably the blocks in start..stop hold the
 * difference between old and new data - confirm against the caller.
 * NOTE(review): this is template source for unroll.awk.  Presumably
 * every line carrying the double-dollar lane marker is emitted once per
 * unrolled lane, so keep that marker out of comment lines and keep one
 * statement per line - confirm against unroll.awk.
 * NOTE(review): d is an int but is compared against the unsigned long
 * bytes; assumes block sizes stay far below INT_MAX - confirm.
 */
void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
				    unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	int d, z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	/* 0x1d in every byte: folded back in for bits shifted out of a byte */
	const unative_t x1d = NBYTES(0x1d);

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		/* Start Q from the highest touched disk; P from old P XOR that disk */
		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);

		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);
			/*
			 * Q = Q * x + data, byte by byte: shift each byte
			 * left by one, XOR 0x1d into the bytes whose top bit
			 * was set before the shift, then XOR in the data.
			 */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		/* P/Q left side optimization */
		/*
		 * No data is loaded below start.  Four disks are handled per
		 * pass: the low nibble is shifted up by four, and the high
		 * nibble that fell off is polynomial-multiplied by 0x1d and
		 * XORed back in.
		 */
		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
			w2$$ = vshrq_n_u8(wq$$, 4);
			w1$$ = vshlq_n_u8(wq$$, 4);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}

		/*
		 * z + 1 disks are still outstanding here (0 to 3, given
		 * start >= 0).  Each case applies that many multiplications
		 * by x in one go; z == -1 matches no case because nothing
		 * is left to do.
		 */
		switch (z) {
		case 2:
			/* three remaining disks: shift by 3, fold back the top 3 bits */
			w2$$ = vshrq_n_u8(wq$$, 5);
			w1$$ = vshlq_n_u8(wq$$, 3);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 1:
			/* two remaining disks: shift by 2, fold back the top 2 bits */
			w2$$ = vshrq_n_u8(wq$$, 6);
			w1$$ = vshlq_n_u8(wq$$, 2);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 0:
			/* one remaining disk: same single step as in the data loop, minus the data */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}
		/* Merge the computed Q contribution into the existing Q block */
		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
		wq$$ = veorq_u8(wq$$, w1$$);

		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 23:20:32 +08:00			`/* -----------------------------------------------------------------------`
			`*`
			`* neon.uc - RAID-6 syndrome calculation using ARM NEON instructions`
			`*`
			`* Copyright (C) 2012 Rob Herring`
md/raid6: delta syndrome for ARM NEON This implements XOR syndrome calculation using NEON intrinsics. As before, the module can be built for ARM and arm64 from the same source. Relative performance on a Cortex-A57 based system: raid6: int64x1 gen() 905 MB/s raid6: int64x1 xor() 881 MB/s raid6: int64x2 gen() 1343 MB/s raid6: int64x2 xor() 1286 MB/s raid6: int64x4 gen() 1896 MB/s raid6: int64x4 xor() 1321 MB/s raid6: int64x8 gen() 1773 MB/s raid6: int64x8 xor() 1165 MB/s raid6: neonx1 gen() 1834 MB/s raid6: neonx1 xor() 1278 MB/s raid6: neonx2 gen() 2528 MB/s raid6: neonx2 xor() 1942 MB/s raid6: neonx4 gen() 2888 MB/s raid6: neonx4 xor() 2334 MB/s raid6: neonx8 gen() 2957 MB/s raid6: neonx8 xor() 2232 MB/s raid6: using algorithm neonx8 gen() 2957 MB/s raid6: .... xor() 2232 MB/s, rmw enabled Cc: Markus Stockhausen <stockhausen@collogia.de> Cc: Neil Brown <neilb@suse.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: NeilBrown <neilb@suse.com> 2015-07-01 10:19:56 +08:00			`* Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>`
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 23:20:32 +08:00			`*`
			`* Based on altivec.uc:`
			`* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved`
			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, Inc., 53 Temple Place Ste 330,`
			`* Boston MA 02111-1307, USA; either version 2 of the License, or`
			`* (at your option) any later version; incorporated herein by reference.`
			`*`
			`* ----------------------------------------------------------------------- */`

			`/*`
			`* neon$#.c`
			`*`
			`* $#-way unrolled NEON intrinsics math RAID-6 instruction set`
			`*`
			`* This file is postprocessed using unroll.awk`
			`*/`

			`#include <arm_neon.h>`

			`typedef uint8x16_t unative_t;`

			`#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})`
			`#define NSIZE sizeof(unative_t)`

			`/*`
			`* The SHLBYTE() operation shifts each byte left by 1, not`
			`* rolling over into the next byte`
			`*/`
			`static inline unative_t SHLBYTE(unative_t v)`
			`{`
			`return vshlq_n_u8(v, 1);`
			`}`

			`/*`
			`* The MASK() operation returns 0xFF in any byte for which the high`
			`* bit is 1, 0x00 for any byte for which the high bit is 0.`
			`*/`
			`static inline unative_t MASK(unative_t v)`
			`{`
md/raid6: use faster multiplication for ARM NEON delta syndrome The P/Q left side optimization in the delta syndrome simply involves repeatedly multiplying a value by polynomial 'x' in GF(2^8). Given that 'x * x * x * x' equals 'x^4' even in the polynomial world, we can accelerate this substantially by performing up to 4 such operations at once, using the NEON instructions for polynomial multiplication. Results on a Cortex-A57 running in 64-bit mode: Before: ------- raid6: neonx1 xor() 1680 MB/s raid6: neonx2 xor() 2286 MB/s raid6: neonx4 xor() 3162 MB/s raid6: neonx8 xor() 3389 MB/s After: ------ raid6: neonx1 xor() 2281 MB/s raid6: neonx2 xor() 3362 MB/s raid6: neonx4 xor() 3787 MB/s raid6: neonx8 xor() 4239 MB/s While we're at it, simplify MASK() by using a signed shift rather than a vector compare involving a temp register. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-14 01:16:00 +08:00			`return (unative_t)vshrq_n_s8((int8x16_t)v, 7);`
			`}`

			`static inline unative_t PMUL(unative_t v, unative_t u)`
			`{`
			`return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);`
lib/raid6: add ARM-NEON accelerated syndrome calculation Rebased/reworked a patch contributed by Rob Herring that uses NEON intrinsics to perform the RAID-6 syndrome calculations. It uses the existing unroll.awk code to generate several unrolled versions of which the best performing one is selected at boot time. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Acked-by: Nicolas Pitre <nico@linaro.org> Cc: hpa@linux.intel.com 2013-05-16 23:20:32 +08:00			`}`

			`void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)`
			`{`
			`uint8_t dptr = (uint8_t )ptrs;`
			`uint8_t p, q;`
			`int d, z, z0;`

			`register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;`
			`const unative_t x1d = NBYTES(0x1d);`

			`z0 = disks - 3; /* Highest data disk */`
			`p = dptr[z0+1]; /* XOR parity */`
			`q = dptr[z0+2]; /* RS syndrome */`

			`for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {`
			`wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);`
			`for ( z = z0-1 ; z >= 0 ; z-- ) {`
			`wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);`
			`wp$$ = veorq_u8(wp$$, wd$$);`
			`w2$$ = MASK(wq$$);`
			`w1$$ = SHLBYTE(wq$$);`

			`w2$$ = vandq_u8(w2$$, x1d);`
			`w1$$ = veorq_u8(w1$$, w2$$);`
			`wq$$ = veorq_u8(w1$$, wd$$);`
			`}`
			`vst1q_u8(&p[d+NSIZE*$$], wp$$);`
			`vst1q_u8(&q[d+NSIZE*$$], wq$$);`
			`}`
			`}`
md/raid6: delta syndrome for ARM NEON This implements XOR syndrome calculation using NEON intrinsics. As before, the module can be built for ARM and arm64 from the same source. Relative performance on a Cortex-A57 based system: raid6: int64x1 gen() 905 MB/s raid6: int64x1 xor() 881 MB/s raid6: int64x2 gen() 1343 MB/s raid6: int64x2 xor() 1286 MB/s raid6: int64x4 gen() 1896 MB/s raid6: int64x4 xor() 1321 MB/s raid6: int64x8 gen() 1773 MB/s raid6: int64x8 xor() 1165 MB/s raid6: neonx1 gen() 1834 MB/s raid6: neonx1 xor() 1278 MB/s raid6: neonx2 gen() 2528 MB/s raid6: neonx2 xor() 1942 MB/s raid6: neonx4 gen() 2888 MB/s raid6: neonx4 xor() 2334 MB/s raid6: neonx8 gen() 2957 MB/s raid6: neonx8 xor() 2232 MB/s raid6: using algorithm neonx8 gen() 2957 MB/s raid6: .... xor() 2232 MB/s, rmw enabled Cc: Markus Stockhausen <stockhausen@collogia.de> Cc: Neil Brown <neilb@suse.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: NeilBrown <neilb@suse.com> 2015-07-01 10:19:56 +08:00
			`void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,`
			`unsigned long bytes, void **ptrs)`
			`{`
			`uint8_t dptr = (uint8_t )ptrs;`
			`uint8_t p, q;`
			`int d, z, z0;`

			`register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;`
			`const unative_t x1d = NBYTES(0x1d);`

			`z0 = stop; /* P/Q right side optimization */`
			`p = dptr[disks-2]; /* XOR parity */`
			`q = dptr[disks-1]; /* RS syndrome */`

			`for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {`
			`wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);`
			`wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);`

			`/* P/Q data pages */`
			`for ( z = z0-1 ; z >= start ; z-- ) {`
			`wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);`
			`wp$$ = veorq_u8(wp$$, wd$$);`
			`w2$$ = MASK(wq$$);`
			`w1$$ = SHLBYTE(wq$$);`

			`w2$$ = vandq_u8(w2$$, x1d);`
			`w1$$ = veorq_u8(w1$$, w2$$);`
			`wq$$ = veorq_u8(w1$$, wd$$);`
			`}`
			`/* P/Q left side optimization */`
md/raid6: use faster multiplication for ARM NEON delta syndrome The P/Q left side optimization in the delta syndrome simply involves repeatedly multiplying a value by polynomial 'x' in GF(2^8). Given that 'x * x * x * x' equals 'x^4' even in the polynomial world, we can accelerate this substantially by performing up to 4 such operations at once, using the NEON instructions for polynomial multiplication. Results on a Cortex-A57 running in 64-bit mode: Before: ------- raid6: neonx1 xor() 1680 MB/s raid6: neonx2 xor() 2286 MB/s raid6: neonx4 xor() 3162 MB/s raid6: neonx8 xor() 3389 MB/s After: ------ raid6: neonx1 xor() 2281 MB/s raid6: neonx2 xor() 3362 MB/s raid6: neonx4 xor() 3787 MB/s raid6: neonx8 xor() 4239 MB/s While we're at it, simplify MASK() by using a signed shift rather than a vector compare involving a temp register. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-14 01:16:00 +08:00			`for ( z = start-1 ; z >= 3 ; z -= 4 ) {`
			`w2$$ = vshrq_n_u8(wq$$, 4);`
			`w1$$ = vshlq_n_u8(wq$$, 4);`

			`w2$$ = PMUL(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`}`

			`switch (z) {`
			`case 2:`
			`w2$$ = vshrq_n_u8(wq$$, 5);`
			`w1$$ = vshlq_n_u8(wq$$, 3);`

			`w2$$ = PMUL(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`break;`
			`case 1:`
			`w2$$ = vshrq_n_u8(wq$$, 6);`
			`w1$$ = vshlq_n_u8(wq$$, 2);`

			`w2$$ = PMUL(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`break;`
			`case 0:`
md/raid6: delta syndrome for ARM NEON This implements XOR syndrome calculation using NEON intrinsics. As before, the module can be built for ARM and arm64 from the same source. Relative performance on a Cortex-A57 based system: raid6: int64x1 gen() 905 MB/s raid6: int64x1 xor() 881 MB/s raid6: int64x2 gen() 1343 MB/s raid6: int64x2 xor() 1286 MB/s raid6: int64x4 gen() 1896 MB/s raid6: int64x4 xor() 1321 MB/s raid6: int64x8 gen() 1773 MB/s raid6: int64x8 xor() 1165 MB/s raid6: neonx1 gen() 1834 MB/s raid6: neonx1 xor() 1278 MB/s raid6: neonx2 gen() 2528 MB/s raid6: neonx2 xor() 1942 MB/s raid6: neonx4 gen() 2888 MB/s raid6: neonx4 xor() 2334 MB/s raid6: neonx8 gen() 2957 MB/s raid6: neonx8 xor() 2232 MB/s raid6: using algorithm neonx8 gen() 2957 MB/s raid6: .... xor() 2232 MB/s, rmw enabled Cc: Markus Stockhausen <stockhausen@collogia.de> Cc: Neil Brown <neilb@suse.de> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: NeilBrown <neilb@suse.com> 2015-07-01 10:19:56 +08:00			`w2$$ = MASK(wq$$);`
			`w1$$ = SHLBYTE(wq$$);`

			`w2$$ = vandq_u8(w2$$, x1d);`
			`wq$$ = veorq_u8(w1$$, w2$$);`
			`}`
			`w1$$ = vld1q_u8(&q[d+NSIZE*$$]);`
			`wq$$ = veorq_u8(wq$$, w1$$);`

			`vst1q_u8(&p[d+NSIZE*$$], wp$$);`
			`vst1q_u8(&q[d+NSIZE*$$], wq$$);`
			`}`
			`}`