llvm-project/compiler-rt/lib/builtins/hexagon/dfsqrt.S

407 lines
8.4 KiB
ArmAsm

//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/* Double Precision square root */
#define EXP r28
#define A r1:0
#define AH r1
#define AL r0
#define SFSH r3:2
#define SF_S r3
#define SF_H r2
#define SFHALF_SONE r5:4
#define S_ONE r4
#define SFHALF r5
#define SF_D r6
#define SF_E r7
#define RECIPEST r8
#define SFRAD r9
#define FRACRAD r11:10
#define FRACRADH r11
#define FRACRADL r10
#define ROOT r13:12
#define ROOTHI r13
#define ROOTLO r12
#define PROD r15:14
#define PRODHI r15
#define PRODLO r14
#define P_TMP p0
#define P_EXP1 p1
#define NORMAL p2
#define SF_EXPBITS 8
#define SF_MANTBITS 23
#define DF_EXPBITS 11
#define DF_MANTBITS 52
#define DF_BIAS 0x3ff
#define DFCLASS_ZERO 0x01
#define DFCLASS_NORMAL 0x02
#define DFCLASS_DENORMAL 0x02
#define DFCLASS_INFINITE 0x08
#define DFCLASS_NAN 0x10
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
#define END(TAG) .size TAG,.-TAG
.text
.global __hexagon_sqrtdf2
.type __hexagon_sqrtdf2,@function
.global __hexagon_sqrt
.type __hexagon_sqrt,@function
Q6_ALIAS(sqrtdf2)
Q6_ALIAS(sqrt)
FAST_ALIAS(sqrtdf2)
FAST_ALIAS(sqrt)
FAST2_ALIAS(sqrtdf2)
FAST2_ALIAS(sqrt)
.type sqrt,@function
.p2align 5
__hexagon_sqrtdf2:
__hexagon_sqrt:
{
PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
SFHALF_SONE = combine(##0x3f000004,#1)
}
{
NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
NORMAL = cmp.gt(AH,#-1) // and positive?
if (!NORMAL.new) jump:nt .Lsqrt_abnormal
SFRAD = or(SFHALF,PRODLO)
}
#undef NORMAL
.Ldenormal_restart:
{
FRACRAD = A
SF_E,P_TMP = sfinvsqrta(SFRAD)
SFHALF = and(SFHALF,#-16)
SFSH = #0
}
#undef A
#undef AH
#undef AL
#define ERROR r1:0
#define ERRORHI r1
#define ERRORLO r0
// SF_E : reciprocal square root
// SF_H : half rsqrt
// sf_S : square root
// SF_D : error term
// SFHALF: 0.5
{
SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent...
SF_D = SFHALF
#undef SFRAD
#define SHIFTAMT r9
SHIFTAMT = and(EXP,#1)
}
{
SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1
FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden
P_EXP1 = cmp.gtu(SHIFTAMT,#0)
}
{
SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
SF_D = SFHALF
SHIFTAMT = mux(P_EXP1,#8,#9)
}
{
SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
SHIFTAMT = mux(P_EXP1,#3,#2)
}
{
SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
// cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
}
{
SF_H = and(SF_H,##0x007fffff)
}
{
SF_H = add(SF_H,##0x00800000 - 3)
SHIFTAMT = mux(P_EXP1,#7,#8)
}
{
RECIPEST = asl(SF_H,SHIFTAMT)
SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
}
{
ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
}
#undef SFSH // r3:2
#undef SF_H // r2
#undef SF_S // r3
#undef S_ONE // r4
#undef SFHALF // r5
#undef SFHALF_SONE // r5:4
#undef SF_D // r6
#undef SF_E // r7
#define HL r3:2
#define LL r5:4
#define HH r7:6
#undef P_EXP1
#define P_CARRY0 p1
#define P_CARRY1 p2
#define P_CARRY2 p3
/* Iteration 0 */
/* Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply */
/* We can shift and subtract instead of shift and add? */
{
ERROR = asl(FRACRAD,#15)
PROD = mpyu(ROOTHI,ROOTHI)
P_CARRY0 = cmp.eq(r0,r0)
}
{
ERROR -= asl(PROD,#15)
PROD = mpyu(ROOTHI,ROOTLO)
P_CARRY1 = cmp.eq(r0,r0)
}
{
ERROR -= lsr(PROD,#16)
P_CARRY2 = cmp.eq(r0,r0)
}
{
ERROR = mpyu(ERRORHI,RECIPEST)
}
{
ROOT += lsr(ERROR,SHIFTAMT)
SHIFTAMT = add(SHIFTAMT,#16)
ERROR = asl(FRACRAD,#31) // for next iter
}
/* Iteration 1 */
{
PROD = mpyu(ROOTHI,ROOTHI)
ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
}
{
ERROR -= asl(PROD,#31)
PROD = mpyu(ROOTLO,ROOTLO)
}
{
ERROR -= lsr(PROD,#33)
}
{
ERROR = mpyu(ERRORHI,RECIPEST)
}
{
ROOT += lsr(ERROR,SHIFTAMT)
SHIFTAMT = add(SHIFTAMT,#16)
ERROR = asl(FRACRAD,#47) // for next iter
}
/* Iteration 2 */
{
PROD = mpyu(ROOTHI,ROOTHI)
}
{
ERROR -= asl(PROD,#47)
PROD = mpyu(ROOTHI,ROOTLO)
}
{
ERROR -= asl(PROD,#16) // bidir shr 31-47
PROD = mpyu(ROOTLO,ROOTLO)
}
{
ERROR -= lsr(PROD,#17) // 64-47
}
{
ERROR = mpyu(ERRORHI,RECIPEST)
}
{
ROOT += lsr(ERROR,SHIFTAMT)
}
#undef ERROR
#undef PROD
#undef PRODHI
#undef PRODLO
#define REM_HI r15:14
#define REM_HI_HI r15
#define REM_LO r1:0
#undef RECIPEST
#undef SHIFTAMT
#define TWOROOT_LO r9:8
/* Adjust Root */
{
HL = mpyu(ROOTHI,ROOTLO)
LL = mpyu(ROOTLO,ROOTLO)
REM_HI = #0
REM_LO = #0
}
{
HL += lsr(LL,#33)
LL += asl(HL,#33)
P_CARRY0 = cmp.eq(r0,r0)
}
{
HH = mpyu(ROOTHI,ROOTHI)
REM_LO = sub(REM_LO,LL,P_CARRY0):carry
TWOROOT_LO = #1
}
{
HH += lsr(HL,#31)
TWOROOT_LO += asl(ROOT,#1)
}
#undef HL
#undef LL
#define REM_HI_TMP r3:2
#define REM_HI_TMP_HI r3
#define REM_LO_TMP r5:4
{
REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
#undef FRACRAD
#undef HH
#define ZERO r11:10
#define ONE r7:6
ONE = #1
ZERO = #0
}
{
REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
ONE = add(ROOT,ONE)
EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
}
{
// If carry set, no borrow: result was still positive
if (P_CARRY1) ROOT = ONE
if (P_CARRY1) REM_LO = REM_LO_TMP
if (P_CARRY1) REM_HI = REM_HI_TMP
}
{
REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
ONE = #1
EXP = asr(EXP,#1) // divide signed exp by 2
}
{
REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
ONE = add(ROOT,ONE)
}
{
if (P_CARRY2) ROOT = ONE
if (P_CARRY2) REM_LO = REM_LO_TMP
// since tworoot <= 2^32, remhi must be zero
#undef REM_HI_TMP
#undef REM_HI_TMP_HI
#define S_ONE r2
#define ADJ r3
S_ONE = #1
}
{
P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully
ADJ = cl0(ROOT)
EXP = add(EXP,#-63)
}
#undef REM_LO
#define RET r1:0
#define RETHI r1
{
RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
EXP = add(EXP,ADJ) // add back bias
}
{
RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust
jumpr r31
}
#undef REM_LO_TMP
#undef REM_HI_TMP
#undef REM_HI_TMP_HI
#undef REM_LO
#undef REM_HI
#undef TWOROOT_LO
#undef RET
#define A r1:0
#define AH r1
#define AL r1
#undef S_ONE
#define TMP r3:2
#define TMPHI r3
#define TMPLO r2
#undef P_CARRY0
#define P_NEG p1
#define SFHALF r5
#define SFRAD r9
.Lsqrt_abnormal:
{
P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
if (P_TMP.new) jumpr:t r31
}
{
P_TMP = dfclass(A,#DFCLASS_NAN)
if (P_TMP.new) jump:nt .Lsqrt_nan
}
{
P_TMP = cmp.gt(AH,#-1)
if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
if (!P_TMP.new) EXP = ##0x7F800001 // sNaN
}
{
P_TMP = dfclass(A,#DFCLASS_INFINITE)
if (P_TMP.new) jumpr:nt r31
}
// If we got here, we're denormal
// prepare to restart
{
A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa
}
{
EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
}
{
A = asl(A,EXP) // Shift mantissa
EXP = sub(#1,EXP) // Form exponent
}
{
AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent
}
{
TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
SFHALF = ##0x3f000004 // form half constant
}
{
SFRAD = or(SFHALF,TMPLO) // form sf value
SFHALF = and(SFHALF,#-16)
jump .Ldenormal_restart // restart
}
.Lsqrt_nan:
{
EXP = convert_df2sf(A) // if sNaN, get invalid
A = #-1 // qNaN
jumpr r31
}
.Lsqrt_invalid_neg:
{
A = convert_sf2df(EXP) // Invalid,NaNval
jumpr r31
}
END(__hexagon_sqrt)
END(__hexagon_sqrtdf2)