forked from OSchip/llvm-project
2234 lines
90 KiB
LLVM
2234 lines
90 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
|
; i16 multiply with both operands marked inreg in an amdgpu_ps function.
; The gfx700 checks expect a bare s_mul_i32 on s0/s1; the gfx801 and gfx900
; checks expect both operands to be masked with 0xffff before the s_mul_i32.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply in a default-calling-convention function (operands in v0/v1).
; The gfx700 checks expect both operands masked with 0xffff and then
; v_mul_u32_u24; the gfx801 and gfx900 checks expect a single v_mul_lo_u16.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; Same multiply as the plain inreg i16 case, but arguments and return value
; carry the zeroext attribute. Every target's checks expect the product to be
; masked with 0xffff after the s_mul_i32; gfx801/gfx900 also mask the inputs.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply with zeroext arguments and return value, default calling
; convention. The gfx700 checks expect an extra v_and_b32 with 0xffff on the
; product; the gfx801/gfx900 checks expect only v_mul_lo_u16.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; inreg i16 multiply with signext arguments and return value. Every target's
; checks expect s_sext_i32_i16 on the product after the s_mul_i32;
; gfx801/gfx900 also mask both inputs with 0xffff first.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_sext_i32_i16 s0, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply with signext arguments and return value, default calling
; convention. Every target's checks expect v_bfe_i32 v0, v0, 0, 16 on the
; product after the multiply.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i32 multiply with inreg operands in an amdgpu_ps function. The shared
; checks expect a single s_mul_i32 on all three targets.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
  %result = mul i32 %num, %den
  ret i32 %result
}
|
|
|
|
; i32 multiply in a default-calling-convention function. The shared checks
; expect a single v_mul_lo_u32 on all three targets.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
  %result = mul i32 %num, %den
  ret i32 %result
}
|
|
|
|
; <2 x i32> multiply with inreg operands in an amdgpu_ps function. The shared
; checks expect one s_mul_i32 per element (s0*s2 and s1*s3).
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s2
; GCN-NEXT: s_mul_i32 s1, s1, s3
; GCN-NEXT: ; return to shader part epilog
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}
|
|
|
|
; <2 x i32> multiply in a default-calling-convention function. The shared
; checks expect one v_mul_lo_u32 per element (v0*v2 and v1*v3).
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
; GCN-NEXT: v_mul_lo_u32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}
|
|
|
|
; i64 multiply with inreg operands in an amdgpu_ps function. The gfx700 and
; gfx801 checks compute the high half of the low product with v_mul_hi_u32
; and move the result back with v_readfirstlane_b32; the gfx900 checks use
; s_mul_hi_u32 and stay entirely in s-registers.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_add_i32 s1, s1, s0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s1, s1, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s3, s0, s3
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
  %result = mul i64 %num, %den
  ret i64 %result
}
|
|
|
|
; i64 multiply in a default-calling-convention function. All targets expect
; three v_mul_lo_u32 plus one v_mul_hi_u32; the gfx700/gfx801 checks sum the
; high-half terms with two adds, the gfx900 checks with a single v_add3_u32.
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX7-LABEL: v_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX7-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX8-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v1, v1, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = mul i64 %num, %den
  ret i64 %result
}
|
|
|
|
; i96 multiply with inreg operands in an amdgpu_ps function; the product is
; bitcast to <3 x i32> for the return. The gfx700/gfx801 checks mix s- and
; v-register instructions (v_mul_hi_u32, v_readfirstlane_b32); the gfx900
; checks use s_mul_hi_u32 and s-register instructions only.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_mul_i32 s7, s1, s3
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_add_u32 s7, s7, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s4
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s7, v0
; GFX7-NEXT: s_mul_i32 s7, s1, s4
; GFX7-NEXT: s_mul_i32 s2, s2, s3
; GFX7-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX7-NEXT: s_cselect_b32 s8, 1, 0
; GFX7-NEXT: s_mul_i32 s6, s0, s3
; GFX7-NEXT: s_mul_i32 s5, s0, s5
; GFX7-NEXT: s_add_i32 s0, s2, s7
; GFX7-NEXT: s_add_i32 s0, s0, s5
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: s_and_b32 s8, s8, 1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: s_mov_b32 s0, s6
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_mul_i32 s7, s1, s3
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_add_u32 s7, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s7, v0
; GFX8-NEXT: s_mul_i32 s7, s1, s4
; GFX8-NEXT: s_mul_i32 s2, s2, s3
; GFX8-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
; GFX8-NEXT: s_mul_i32 s6, s0, s3
; GFX8-NEXT: s_mul_i32 s5, s0, s5
; GFX8-NEXT: s_add_i32 s0, s2, s7
; GFX8-NEXT: s_add_i32 s0, s0, s5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: s_and_b32 s8, s8, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: s_mov_b32 s0, s6
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s7, s1, s3
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3
; GFX9-NEXT: s_and_b32 s8, s8, 1
; GFX9-NEXT: s_add_u32 s7, s7, s9
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
; GFX9-NEXT: s_and_b32 s9, s9, 1
; GFX9-NEXT: s_add_i32 s8, s8, s9
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_mul_i32 s5, s0, s5
; GFX9-NEXT: s_add_i32 s2, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s2, s2, s5
; GFX9-NEXT: s_mul_i32 s6, s0, s3
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s2, s0, s8
; GFX9-NEXT: s_mov_b32 s0, s6
; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: ; return to shader part epilog
  %result = mul i96 %num, %den
  %cast = bitcast i96 %result to <3 x i32>
  ret <3 x i32> %cast
}
|
|
|
|
; i96 multiply in a default-calling-convention function. The checks expect
; partial products from v_mul_lo_u32 / v_mul_hi_u32, with carries out of the
; middle word materialized via v_cndmask_b32 on vcc. The gfx900 checks fold
; the final additions into v_add3_u32.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX7-LABEL: v_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX7-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v9
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v8
; GFX7-NEXT: v_mov_b32_e32 v0, v6
; GFX7-NEXT: v_mov_b32_e32 v1, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v8
; GFX8-NEXT: v_mov_b32_e32 v0, v6
; GFX8-NEXT: v_mov_b32_e32 v1, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX9-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v2, v2, v10
; GFX9-NEXT: v_add_u32_e32 v3, v8, v9
; GFX9-NEXT: v_add3_u32 v1, v2, v5, v1
; GFX9-NEXT: v_add3_u32 v2, v1, v0, v3
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = mul i96 %num, %den
  ret i96 %result
}
|
|
|
|
; i128 multiply with inreg operands in an amdgpu_ps function; the product is
; bitcast to <4 x i32> for the return. The gfx700/gfx801 checks mix s- and
; v-register instructions and finish with three v_readfirstlane_b32; the
; gfx900 checks use s_mul_hi_u32 and s-register instructions only.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s9, s1, s4
; GFX7-NEXT: s_mul_i32 s10, s0, s5
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s9, v0
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s10, v1
; GFX7-NEXT: s_mul_i32 s9, s2, s4
; GFX7-NEXT: s_mul_i32 s10, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT: s_mul_i32 s11, s0, s6
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: s_add_u32 s9, s9, s11
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: s_cselect_b32 s11, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2
; GFX7-NEXT: s_and_b32 s11, s11, 1
; GFX7-NEXT: s_add_i32 s10, s10, s11
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s10, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: s_mul_i32 s5, s2, s5
; GFX7-NEXT: s_mul_i32 s3, s3, s4
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_mul_i32 s9, s1, s6
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT: s_mul_i32 s7, s0, s7
; GFX7-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX7-NEXT: s_add_i32 s0, s3, s5
; GFX7-NEXT: s_add_i32 s0, s0, s9
; GFX7-NEXT: s_add_i32 s0, s0, s7
; GFX7-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
; GFX7-NEXT: s_mov_b32 s0, s8
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s9, s1, s4
; GFX8-NEXT: s_mul_i32 s10, s0, s5
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s9, v0
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v1
; GFX8-NEXT: s_mul_i32 s9, s2, s4
; GFX8-NEXT: s_mul_i32 s10, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT: s_mul_i32 s11, s0, s6
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: s_add_u32 s9, s9, s11
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_cselect_b32 s11, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2
; GFX8-NEXT: s_and_b32 s11, s11, 1
; GFX8-NEXT: s_add_i32 s10, s10, s11
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s10, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_mul_i32 s5, s2, s5
; GFX8-NEXT: s_mul_i32 s3, s3, s4
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_mul_i32 s9, s1, s6
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT: s_mul_i32 s7, s0, s7
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX8-NEXT: s_add_i32 s0, s3, s5
; GFX8-NEXT: s_add_i32 s0, s0, s9
; GFX8-NEXT: s_add_i32 s0, s0, s7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: s_mov_b32 s0, s8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s10, s0, s5
; GFX9-NEXT: s_add_u32 s9, s9, s10
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4
; GFX9-NEXT: s_and_b32 s10, s10, 1
; GFX9-NEXT: s_add_u32 s9, s9, s11
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s10, s10, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s4
; GFX9-NEXT: s_mul_i32 s12, s1, s5
; GFX9-NEXT: s_add_u32 s11, s11, s12
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
; GFX9-NEXT: s_mul_i32 s13, s0, s6
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_add_u32 s11, s11, s13
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s15, s0, s5
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s15
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s12, s12, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s5
; GFX9-NEXT: s_mul_i32 s3, s3, s4
; GFX9-NEXT: s_mul_i32 s13, s1, s6
; GFX9-NEXT: s_add_i32 s3, s3, s11
; GFX9-NEXT: s_mul_i32 s7, s0, s7
; GFX9-NEXT: s_add_i32 s3, s3, s13
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX9-NEXT: s_add_i32 s3, s3, s7
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s6
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s3, s0, s12
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_mov_b32 s2, s10
; GFX9-NEXT: ; return to shader part epilog
  %result = mul i128 %num, %den
  %cast = bitcast i128 %result to <4 x i32>
  ret <4 x i32> %cast
}
|
|
|
|
; i128 multiply in a default-calling-convention function. The checks expect
; partial products from v_mul_lo_u32 / v_mul_hi_u32, with carries out of the
; lower words materialized via v_cndmask_b32 on vcc. The gfx900 checks fold
; several of the additions into v_add3_u32.
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX7-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX7-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX7-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v15
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GFX7-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX7-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX7-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v12
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v13
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v0, v11
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX8-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX8-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v11
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX8-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v12
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v14
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v15
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v12, v11
; GFX8-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX8-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX8-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v13
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v11
; GFX8-NEXT: v_mov_b32_e32 v0, v8
; GFX8-NEXT: v_mov_b32_e32 v1, v9
; GFX8-NEXT: v_mov_b32_e32 v2, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v10
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v10, v10, v11
; GFX9-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX9-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v15
; GFX9-NEXT: v_add3_u32 v12, v12, v13, v14
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v11, v12, v13, v11
; GFX9-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX9-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX9-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v12
; GFX9-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX9-NEXT: v_add3_u32 v3, v1, v0, v11
; GFX9-NEXT: v_mov_b32_e32 v0, v8
; GFX9-NEXT: v_mov_b32_e32 v1, v9
; GFX9-NEXT: v_mov_b32_e32 v2, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = mul i128 %num, %den
  ret i128 %result
}
|
|
|
|
; Scalar (inreg, amdgpu_ps) 256-bit multiply. The IR body is a single
; `mul i256`; the product is bitcast to <8 x i32> for the return value.
;
; What the expected code below shows:
;  - The checks read the two operands from s0-s7 and s8-s15 (presumably %num
;    and %den in that order -- confirm against the calling convention) and
;    leave the eight result dwords in s0-s7.
;  - The lowest result dword is a single s_mul_i32. Every dword above it is a
;    sum of 32x32 partial products (low halves via s_mul_i32, high halves via
;    a mul_hi instruction).
;  - For all but the most significant dword, the carry-out of each add is
;    captured as 0/1 (s_cselect_b32 + s_and_b32 on the scalar side,
;    v_cndmask_b32 from vcc on the vector side) and added into the next dword.
;  - The most significant dword is accumulated with plain adds and no carry
;    capture.
;  - For gfx700 and gfx801 the high halves come from v_mul_hi_u32 on VGPRs and
;    the results are moved back with v_readfirstlane_b32; for gfx900 the whole
;    sequence stays on SGPRs and uses s_mul_hi_u32.
;
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
|
|
; GFX7-LABEL: s_mul_i256:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s8
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
|
|
; GFX7-NEXT: s_mul_i32 s17, s1, s8
|
|
; GFX7-NEXT: s_mul_i32 s18, s0, s9
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0
|
|
; GFX7-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s18, v1
|
|
; GFX7-NEXT: s_mul_i32 s17, s2, s8
|
|
; GFX7-NEXT: s_mul_i32 s18, s1, s9
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8
|
|
; GFX7-NEXT: s_mul_i32 s19, s0, s10
|
|
; GFX7-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s9
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
|
|
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5
|
|
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
|
|
; GFX7-NEXT: s_mul_i32 s17, s3, s8
|
|
; GFX7-NEXT: s_mul_i32 s18, s2, s9
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
|
|
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
|
|
; GFX7-NEXT: s_mul_i32 s19, s1, s10
|
|
; GFX7-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s2
|
|
; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: s_mul_i32 s20, s0, s11
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, s10
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8
|
|
; GFX7-NEXT: v_mul_hi_u32 v7, s0, v6
|
|
; GFX7-NEXT: s_mul_i32 s17, s4, s8
|
|
; GFX7-NEXT: s_mul_i32 s18, s3, s9
|
|
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
|
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v8, v5
|
|
; GFX7-NEXT: s_mul_i32 s19, s2, s10
|
|
; GFX7-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
|
|
; GFX7-NEXT: s_mul_i32 s20, s1, s11
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s3
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8
|
|
; GFX7-NEXT: s_mul_i32 s21, s0, s12
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s21
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11
|
|
; GFX7-NEXT: s_mul_i32 s17, s5, s8
|
|
; GFX7-NEXT: s_mul_i32 s18, s4, s9
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6
|
|
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, s11
|
|
; GFX7-NEXT: s_mul_i32 s19, s3, s10
|
|
; GFX7-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX7-NEXT: v_mul_hi_u32 v10, s0, v9
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
|
; GFX7-NEXT: s_mul_i32 s20, s2, s11
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
|
|
; GFX7-NEXT: s_mul_i32 s21, s1, s12
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s21
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, s4
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8
|
|
; GFX7-NEXT: s_mul_i32 s22, s0, s13
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s22
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14
|
|
; GFX7-NEXT: s_mul_i32 s17, s6, s8
|
|
; GFX7-NEXT: s_mul_i32 s18, s5, s9
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
|
; GFX7-NEXT: s_mul_i32 s19, s4, s10
|
|
; GFX7-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10
|
|
; GFX7-NEXT: v_mul_hi_u32 v11, s1, v9
|
|
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX7-NEXT: v_mov_b32_e32 v12, s12
|
|
; GFX7-NEXT: s_mul_i32 s20, s3, s11
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX7-NEXT: v_mul_hi_u32 v13, s0, v12
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
|
; GFX7-NEXT: s_mul_i32 s21, s2, s12
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s21
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4
|
|
; GFX7-NEXT: s_mul_i32 s22, s1, s13
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s22
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, s5
|
|
; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: s_mul_i32 s23, s0, s14
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: s_add_u32 s17, s17, s23
|
|
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10
|
|
; GFX7-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX7-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v17, vcc, s18, v17
|
|
; GFX7-NEXT: v_mul_hi_u32 v5, v5, s10
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX7-NEXT: v_mul_hi_u32 v13, s2, v9
|
|
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v17, v11
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v10, v5
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
|
|
; GFX7-NEXT: v_mul_hi_u32 v14, s1, v12
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v13
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX7-NEXT: v_mov_b32_e32 v15, s13
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
|
|
; GFX7-NEXT: v_mul_hi_u32 v16, s0, v15
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v14
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v16
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
|
|
; GFX7-NEXT: v_mov_b32_e32 v13, s14
|
|
; GFX7-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX7-NEXT: s_mul_i32 s17, s6, s9
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6
|
|
; GFX7-NEXT: s_mul_i32 s16, s0, s8
|
|
; GFX7-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX7-NEXT: s_mul_i32 s15, s0, s15
|
|
; GFX7-NEXT: v_mul_hi_u32 v13, s0, v13
|
|
; GFX7-NEXT: s_add_i32 s0, s7, s17
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
|
; GFX7-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX7-NEXT: s_add_i32 s0, s0, s5
|
|
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v10, v6
|
|
; GFX7-NEXT: v_mov_b32_e32 v10, s6
|
|
; GFX7-NEXT: s_mul_i32 s11, s3, s12
|
|
; GFX7-NEXT: s_add_i32 s0, s0, s4
|
|
; GFX7-NEXT: s_mul_i32 s12, s2, s13
|
|
; GFX7-NEXT: s_add_i32 s0, s0, s11
|
|
; GFX7-NEXT: v_mul_hi_u32 v10, v10, s8
|
|
; GFX7-NEXT: s_mul_i32 s13, s1, s14
|
|
; GFX7-NEXT: s_add_i32 s0, s0, s12
|
|
; GFX7-NEXT: v_mul_hi_u32 v8, v8, s9
|
|
; GFX7-NEXT: s_add_i32 s0, s0, s13
|
|
; GFX7-NEXT: v_mul_hi_u32 v7, v7, s10
|
|
; GFX7-NEXT: v_mul_hi_u32 v9, s3, v9
|
|
; GFX7-NEXT: s_add_i32 s0, s0, s15
|
|
; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12
|
|
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s0, v10
|
|
; GFX7-NEXT: v_mul_hi_u32 v12, s1, v15
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v8, v7
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v11
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v12
|
|
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v13
|
|
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
|
|
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
|
|
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
|
|
; GFX7-NEXT: v_readfirstlane_b32 s4, v3
|
|
; GFX7-NEXT: v_readfirstlane_b32 s5, v4
|
|
; GFX7-NEXT: v_readfirstlane_b32 s6, v5
|
|
; GFX7-NEXT: v_readfirstlane_b32 s7, v6
|
|
; GFX7-NEXT: s_mov_b32 s0, s16
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX8-LABEL: s_mul_i256:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
|
|
; GFX8-NEXT: s_mul_i32 s17, s1, s8
|
|
; GFX8-NEXT: s_mul_i32 s18, s0, s9
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0
|
|
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s18, v1
|
|
; GFX8-NEXT: s_mul_i32 s17, s2, s8
|
|
; GFX8-NEXT: s_mul_i32 s18, s1, s9
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8
|
|
; GFX8-NEXT: s_mul_i32 s19, s0, s10
|
|
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s9
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
|
|
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5
|
|
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
|
|
; GFX8-NEXT: s_mul_i32 s17, s3, s8
|
|
; GFX8-NEXT: s_mul_i32 s18, s2, s9
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
|
|
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
|
|
; GFX8-NEXT: s_mul_i32 s19, s1, s10
|
|
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s2
|
|
; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: s_mul_i32 s20, s0, s11
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, s10
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8
|
|
; GFX8-NEXT: v_mul_hi_u32 v7, s0, v6
|
|
; GFX8-NEXT: s_mul_i32 s17, s4, s8
|
|
; GFX8-NEXT: s_mul_i32 s18, s3, s9
|
|
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
|
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5
|
|
; GFX8-NEXT: s_mul_i32 s19, s2, s10
|
|
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
|
|
; GFX8-NEXT: s_mul_i32 s20, s1, s11
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s3
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8
|
|
; GFX8-NEXT: s_mul_i32 s21, s0, s12
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s21
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11
|
|
; GFX8-NEXT: s_mul_i32 s17, s5, s8
|
|
; GFX8-NEXT: s_mul_i32 s18, s4, s9
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6
|
|
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, s11
|
|
; GFX8-NEXT: s_mul_i32 s19, s3, s10
|
|
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX8-NEXT: v_mul_hi_u32 v10, s0, v9
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
|
|
; GFX8-NEXT: s_mul_i32 s20, s2, s11
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
|
|
; GFX8-NEXT: s_mul_i32 s21, s1, s12
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s21
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, s4
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8
|
|
; GFX8-NEXT: s_mul_i32 s22, s0, s13
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s22
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14
|
|
; GFX8-NEXT: s_mul_i32 s17, s6, s8
|
|
; GFX8-NEXT: s_mul_i32 s18, s5, s9
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
|
|
; GFX8-NEXT: s_mul_i32 s19, s4, s10
|
|
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10
|
|
; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9
|
|
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
|
|
; GFX8-NEXT: v_mov_b32_e32 v12, s12
|
|
; GFX8-NEXT: s_mul_i32 s20, s3, s11
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s20
|
|
; GFX8-NEXT: v_mul_hi_u32 v13, s0, v12
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
|
|
; GFX8-NEXT: s_mul_i32 s21, s2, s12
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s21
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
|
|
; GFX8-NEXT: s_mul_i32 s22, s1, s13
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s22
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, s5
|
|
; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: s_mul_i32 s23, s0, s14
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: s_add_u32 s17, s17, s23
|
|
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10
|
|
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX8-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s18, v17
|
|
; GFX8-NEXT: v_mul_hi_u32 v5, v5, s10
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX8-NEXT: v_mul_hi_u32 v13, s2, v9
|
|
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v17, v11
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
|
|
; GFX8-NEXT: v_mul_hi_u32 v14, s1, v12
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX8-NEXT: v_mov_b32_e32 v15, s13
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
|
|
; GFX8-NEXT: v_mul_hi_u32 v16, s0, v15
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
|
|
; GFX8-NEXT: v_mov_b32_e32 v13, s14
|
|
; GFX8-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX8-NEXT: s_mul_i32 s17, s6, s9
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
|
|
; GFX8-NEXT: s_mul_i32 s16, s0, s8
|
|
; GFX8-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX8-NEXT: s_mul_i32 s15, s0, s15
|
|
; GFX8-NEXT: v_mul_hi_u32 v13, s0, v13
|
|
; GFX8-NEXT: s_add_i32 s0, s7, s17
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
|
; GFX8-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX8-NEXT: s_add_i32 s0, s0, s5
|
|
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6
|
|
; GFX8-NEXT: v_mov_b32_e32 v10, s6
|
|
; GFX8-NEXT: s_mul_i32 s11, s3, s12
|
|
; GFX8-NEXT: s_add_i32 s0, s0, s4
|
|
; GFX8-NEXT: s_mul_i32 s12, s2, s13
|
|
; GFX8-NEXT: s_add_i32 s0, s0, s11
|
|
; GFX8-NEXT: v_mul_hi_u32 v10, v10, s8
|
|
; GFX8-NEXT: s_mul_i32 s13, s1, s14
|
|
; GFX8-NEXT: s_add_i32 s0, s0, s12
|
|
; GFX8-NEXT: v_mul_hi_u32 v8, v8, s9
|
|
; GFX8-NEXT: s_add_i32 s0, s0, s13
|
|
; GFX8-NEXT: v_mul_hi_u32 v7, v7, s10
|
|
; GFX8-NEXT: v_mul_hi_u32 v9, s3, v9
|
|
; GFX8-NEXT: s_add_i32 s0, s0, s15
|
|
; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12
|
|
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v10
|
|
; GFX8-NEXT: v_mul_hi_u32 v12, s1, v15
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v12
|
|
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v13
|
|
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
|
|
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
|
|
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
|
|
; GFX8-NEXT: v_readfirstlane_b32 s4, v3
|
|
; GFX8-NEXT: v_readfirstlane_b32 s5, v4
|
|
; GFX8-NEXT: v_readfirstlane_b32 s6, v5
|
|
; GFX8-NEXT: v_readfirstlane_b32 s7, v6
|
|
; GFX8-NEXT: s_mov_b32 s0, s16
|
|
; GFX8-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_mul_i256:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_mul_i32 s17, s1, s8
|
|
; GFX9-NEXT: s_mul_i32 s18, s0, s9
|
|
; GFX9-NEXT: s_add_u32 s17, s17, s18
|
|
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s19, s0, s8
|
|
; GFX9-NEXT: s_and_b32 s18, s18, 1
|
|
; GFX9-NEXT: s_add_u32 s17, s17, s19
|
|
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX9-NEXT: s_add_i32 s18, s18, s19
|
|
; GFX9-NEXT: s_mul_i32 s19, s2, s8
|
|
; GFX9-NEXT: s_mul_i32 s20, s1, s9
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s20
|
|
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX9-NEXT: s_mul_i32 s21, s0, s10
|
|
; GFX9-NEXT: s_and_b32 s20, s20, 1
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s21
|
|
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s21, s21, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s22, s1, s8
|
|
; GFX9-NEXT: s_add_i32 s20, s20, s21
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s22
|
|
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s21, s21, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9
|
|
; GFX9-NEXT: s_add_i32 s20, s20, s21
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s23
|
|
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s21, s21, 1
|
|
; GFX9-NEXT: s_add_i32 s20, s20, s21
|
|
; GFX9-NEXT: s_add_u32 s18, s19, s18
|
|
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s19, s19, 1
|
|
; GFX9-NEXT: s_add_i32 s20, s20, s19
|
|
; GFX9-NEXT: s_mul_i32 s19, s3, s8
|
|
; GFX9-NEXT: s_mul_i32 s21, s2, s9
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s21
|
|
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX9-NEXT: s_mul_i32 s22, s1, s10
|
|
; GFX9-NEXT: s_and_b32 s21, s21, 1
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s22
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s22, s22, 1
|
|
; GFX9-NEXT: s_mul_i32 s23, s0, s11
|
|
; GFX9-NEXT: s_add_i32 s21, s21, s22
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s23
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s22, s22, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s24, s2, s8
|
|
; GFX9-NEXT: s_add_i32 s21, s21, s22
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s24
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s22, s22, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s25, s1, s9
|
|
; GFX9-NEXT: s_add_i32 s21, s21, s22
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s25
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s22, s22, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s26, s0, s10
|
|
; GFX9-NEXT: s_add_i32 s21, s21, s22
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s26
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s22, s22, 1
|
|
; GFX9-NEXT: s_add_i32 s21, s21, s22
|
|
; GFX9-NEXT: s_add_u32 s19, s19, s20
|
|
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s20, s20, 1
|
|
; GFX9-NEXT: s_add_i32 s21, s21, s20
|
|
; GFX9-NEXT: s_mul_i32 s20, s4, s8
|
|
; GFX9-NEXT: s_mul_i32 s22, s3, s9
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s22
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_mul_i32 s23, s2, s10
|
|
; GFX9-NEXT: s_and_b32 s22, s22, 1
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s23
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_mul_i32 s24, s1, s11
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s23
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s24
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_mul_i32 s25, s0, s12
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s23
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s25
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s26, s3, s8
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s23
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s26
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s27, s2, s9
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s23
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s27
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s28, s1, s10
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s23
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s28
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s29, s0, s11
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s23
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s29
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s23
|
|
; GFX9-NEXT: s_add_u32 s20, s20, s21
|
|
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s21, s21, 1
|
|
; GFX9-NEXT: s_add_i32 s22, s22, s21
|
|
; GFX9-NEXT: s_mul_i32 s21, s5, s8
|
|
; GFX9-NEXT: s_mul_i32 s23, s4, s9
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s23
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_mul_i32 s24, s3, s10
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s24
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_i32 s25, s2, s11
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s25
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_i32 s26, s1, s12
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s26
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_i32 s27, s0, s13
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s27
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s28, s4, s8
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s28
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s29
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s30, s2, s10
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s30
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s31, s1, s11
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s31
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s33, s0, s12
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s33
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s24
|
|
; GFX9-NEXT: s_add_u32 s21, s21, s22
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s22, s22, 1
|
|
; GFX9-NEXT: s_add_i32 s23, s23, s22
|
|
; GFX9-NEXT: s_mul_i32 s22, s6, s8
|
|
; GFX9-NEXT: s_mul_i32 s24, s5, s9
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s24
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_mul_i32 s25, s4, s10
|
|
; GFX9-NEXT: s_and_b32 s24, s24, 1
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s25
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_i32 s26, s3, s11
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s26
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_i32 s27, s2, s12
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s27
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_i32 s28, s1, s13
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s28
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_i32 s29, s0, s14
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s29
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s30, s5, s8
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s30
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s31, s4, s9
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s31
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s33
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s34
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s35
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_mul_hi_u32 s36, s0, s13
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s36
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s25, s25, 1
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s25
|
|
; GFX9-NEXT: s_add_u32 s22, s22, s23
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_and_b32 s23, s23, 1
|
|
; GFX9-NEXT: s_add_i32 s24, s24, s23
|
|
; GFX9-NEXT: s_mul_i32 s23, s6, s9
|
|
; GFX9-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX9-NEXT: s_mul_i32 s25, s5, s10
|
|
; GFX9-NEXT: s_add_i32 s7, s7, s23
|
|
; GFX9-NEXT: s_mul_i32 s26, s4, s11
|
|
; GFX9-NEXT: s_add_i32 s7, s7, s25
|
|
; GFX9-NEXT: s_mul_i32 s27, s3, s12
|
|
; GFX9-NEXT: s_add_i32 s7, s7, s26
|
|
; GFX9-NEXT: s_mul_i32 s28, s2, s13
|
|
; GFX9-NEXT: s_add_i32 s7, s7, s27
|
|
; GFX9-NEXT: s_mul_i32 s29, s1, s14
|
|
; GFX9-NEXT: s_add_i32 s7, s7, s28
|
|
; GFX9-NEXT: s_mul_i32 s15, s0, s15
|
|
; GFX9-NEXT: s_add_i32 s7, s7, s29
|
|
; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
|
|
; GFX9-NEXT: s_add_i32 s7, s7, s15
|
|
; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9
|
|
; GFX9-NEXT: s_add_i32 s6, s7, s6
|
|
; GFX9-NEXT: s_add_i32 s5, s6, s5
|
|
; GFX9-NEXT: s_mul_hi_u32 s4, s4, s10
|
|
; GFX9-NEXT: s_add_i32 s4, s5, s4
|
|
; GFX9-NEXT: s_mul_hi_u32 s3, s3, s11
|
|
; GFX9-NEXT: s_add_i32 s3, s4, s3
|
|
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s12
|
|
; GFX9-NEXT: s_add_i32 s2, s3, s2
|
|
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s13
|
|
; GFX9-NEXT: s_mul_i32 s16, s0, s8
|
|
; GFX9-NEXT: s_add_i32 s1, s2, s1
|
|
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s14
|
|
; GFX9-NEXT: s_add_i32 s0, s1, s0
|
|
; GFX9-NEXT: s_add_i32 s7, s0, s24
|
|
; GFX9-NEXT: s_mov_b32 s0, s16
|
|
; GFX9-NEXT: s_mov_b32 s1, s17
|
|
; GFX9-NEXT: s_mov_b32 s2, s18
|
|
; GFX9-NEXT: s_mov_b32 s3, s19
|
|
; GFX9-NEXT: s_mov_b32 s4, s20
|
|
; GFX9-NEXT: s_mov_b32 s5, s21
|
|
; GFX9-NEXT: s_mov_b32 s6, s22
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
%result = mul i256 %num, %den
|
|
%cast = bitcast i256 %result to <8 x i32>
|
|
ret <8 x i32> %cast
|
|
}
|
|
|
|
; 256-bit integer multiply in a regular (non-shader) function: the expected
; code returns with s_setpc_b64 instead of the shader-part epilog used by the
; s_mul_* tests. Going by the expected output below, the eight dwords of %num
; are read from v0-v7, those of %den from v8-v15, and the product is handed
; back in v0-v7.
;
; The assertion lines in the body are generated by
; utils/update_llc_test_checks.py for the gfx700, gfx801 and gfx900 runs at the
; top of the file; regenerate them with that script instead of editing by hand.
define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX7-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX7-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX7-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX7-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v17
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v18
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v17, v18
; GFX7-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v18, v17
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX7-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX7-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v24, v23
; GFX7-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX7-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX7-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX7-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX7-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX7-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX7-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX7-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GFX7-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v15
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v8
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v12
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v13
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v0, v23
; GFX7-NEXT: v_mov_b32_e32 v0, v22
; GFX7-NEXT: v_mov_b32_e32 v1, v16
; GFX7-NEXT: v_mov_b32_e32 v2, v17
; GFX7-NEXT: v_mov_b32_e32 v3, v18
; GFX7-NEXT: v_mov_b32_e32 v4, v19
; GFX7-NEXT: v_mov_b32_e32 v5, v20
; GFX7-NEXT: v_mov_b32_e32 v6, v21
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX8-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX8-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX8-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX8-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v17
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v18
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v18
; GFX8-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v18, v17
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX8-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX8-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v24, v23
; GFX8-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX8-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX8-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX8-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX8-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX8-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX8-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
; GFX8-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v15
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v8
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v10
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v11
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v12
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v13
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v23
; GFX8-NEXT: v_mov_b32_e32 v0, v22
; GFX8-NEXT: v_mov_b32_e32 v1, v16
; GFX8-NEXT: v_mov_b32_e32 v2, v17
; GFX8-NEXT: v_mov_b32_e32 v3, v18
; GFX8-NEXT: v_mov_b32_e32 v4, v19
; GFX8-NEXT: v_mov_b32_e32 v5, v20
; GFX8-NEXT: v_mov_b32_e32 v6, v21
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v16, v2, v8
; GFX9-NEXT: v_mul_lo_u32 v17, v1, v9
; GFX9-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX9-NEXT: v_mul_hi_u32 v19, v1, v8
; GFX9-NEXT: v_mul_lo_u32 v20, v1, v8
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v17
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v18
; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v16, v19
; GFX9-NEXT: v_mul_lo_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v17, v18, v16
; GFX9-NEXT: v_mul_hi_u32 v16, v0, v8
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v20, v21
; GFX9-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_add_u32_e32 v17, v20, v17
; GFX9-NEXT: v_mul_lo_u32 v21, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v19, v17
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v20, v19
; GFX9-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v22
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v22
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v8
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v20, v22
; GFX9-NEXT: v_mul_hi_u32 v21, v1, v9
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v22, v21
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, v19, v18
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v19, v20, v21, v19
; GFX9-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v0, v11
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v22, v20
; GFX9-NEXT: v_mul_lo_u32 v21, v5, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v23, v21
; GFX9-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v11
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v24, v25, v23
; GFX9-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v21
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v23, v24, v22
; GFX9-NEXT: v_mul_lo_u32 v23, v6, v9
; GFX9-NEXT: v_mul_lo_u32 v24, v4, v11
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX9-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX9-NEXT: v_add_u32_e32 v7, v7, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
; GFX9-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX9-NEXT: v_mul_hi_u32 v9, v3, v11
; GFX9-NEXT: v_mul_hi_u32 v10, v2, v12
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX9-NEXT: v_mul_hi_u32 v11, v1, v13
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v14
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v15
; GFX9-NEXT: v_add3_u32 v7, v7, v23, v24
; GFX9-NEXT: v_add3_u32 v2, v7, v3, v2
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v8
; GFX9-NEXT: v_add3_u32 v2, v2, v12, v13
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX9-NEXT: v_add3_u32 v2, v2, v6, v5
; GFX9-NEXT: v_add3_u32 v2, v2, v4, v9
; GFX9-NEXT: v_add3_u32 v2, v2, v10, v11
; GFX9-NEXT: v_add3_u32 v7, v2, v0, v22
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v16
; GFX9-NEXT: v_mov_b32_e32 v2, v17
; GFX9-NEXT: v_mov_b32_e32 v3, v18
; GFX9-NEXT: v_mov_b32_e32 v4, v19
; GFX9-NEXT: v_mov_b32_e32 v5, v20
; GFX9-NEXT: v_mov_b32_e32 v6, v21
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = mul i256 %num, %den
  ret i256 %result
}
|