743 lines
23 KiB
NASM
743 lines
23 KiB
NASM
/*
|
|
Copyright (c) 2019-2020 SChernykh
|
|
|
|
This file is part of RandomX OpenCL.
|
|
|
|
RandomX OpenCL is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
RandomX OpenCL is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
# Output format and target selection for the whole file:
# ROCm binary format, GFX1010 device, architecture minor 1 / stepping 0.
.rocm
|
|
.gpu GFX1010
|
|
.arch_minor 1
|
|
.arch_stepping 0
|
|
.eflags 53
|
|
.llvm10binfmt
|
|
.metadatav3
|
|
.md_version 1, 0
|
|
# Global data section: 64 entries of 1 byte each, all set to 0
.globaldata
|
|
.fill 64, 1, 0
|
|
# Kernel descriptor and metadata for randomx_run.
# Summary of what the directives below declare:
#   - one-dimensional dispatch (.dims x), wave32, work-group of exactly 32 work-items
#   - 96 SGPRs / 128 VGPRs, 256 bytes of group (LDS) memory, no private memory
#   - 104-byte kernel argument segment whose layout is given by the .arg table
.kernel randomx_run
|
|
.config
|
|
.dims x
|
|
.sgprsnum 96
|
|
.vgprsnum 128
|
|
.shared_vgprs 0
|
|
.dx10clamp
|
|
.ieeemode
|
|
# The same value 0xf0 appears in bits 12-19 of .pgmrsrc1 below
.floatmode 0xf0
|
|
.priority 0
|
|
.exceptions 0
|
|
# 6 user-data SGPRs; matches bits 1:5 of .pgmrsrc2 below (0x8c >> 1 & 0x1f = 6)
.userdatanum 6
|
|
|
|
# https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc1-gfx6-gfx10-table
|
|
# https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc2-gfx6-gfx10-table
|
|
# https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc3-gfx10-table
|
|
.pgmrsrc1 0x40af0105
|
|
.pgmrsrc2 0x0000008c
|
|
.pgmrsrc3 0x00000000
|
|
|
|
# 256 bytes of LDS: the kernel stores one 8-byte value per work-item (32 * 8)
.group_segment_fixed_size 256
|
|
.private_segment_fixed_size 0
|
|
.kernel_code_entry_offset 0x10c0
|
|
.use_private_segment_buffer
|
|
# The kernel reads its arguments through s[4:5]
.use_kernarg_segment_ptr
|
|
.use_wave32
|
|
# Second .config section: metadata (.md_*) values mirroring the settings above
.config
|
|
.md_symname "randomx_run.kd"
|
|
.md_language "OpenCL C", 1, 2
|
|
.reqd_work_group_size 32, 1, 1
|
|
# 104 = offset of the last argument (96) + its size (8)
.md_kernarg_segment_size 104
|
|
.md_kernarg_segment_align 8
|
|
.md_group_segment_fixed_size 256
|
|
.md_private_segment_fixed_size 0
|
|
.md_wavefront_size 32
|
|
.md_sgprsnum 96
|
|
.md_vgprsnum 128
|
|
.spilledsgprs 0
|
|
.spilledvgprs 0
|
|
.max_flat_work_group_size 32
|
|
# Kernel arguments: name, type name, size, offset, kind, value type, ...
# Offsets 0x0 (dataset, scratchpad), 0x10 (registers, rounding_modes),
# 0x20 (programs) and 0x2c (rx_parameters) are the ones loaded by the kernel code.
# batch_size (offset 40) is not loaded anywhere in this file.
.arg dataset, "uchar*", 8, 0, globalbuf, u8, global, default const
|
|
.arg scratchpad, "uchar*", 8, 8, globalbuf, u8, global, default
|
|
.arg registers, "ulong*", 8, 16, globalbuf, u64, global, default
|
|
.arg rounding_modes, "uint*", 8, 24, globalbuf, u32, global, default
|
|
.arg programs, "uint*", 8, 32, globalbuf, u32, global, default
|
|
.arg batch_size, "uint", 4, 40, value, u32
|
|
.arg rx_parameters, "uint", 4, 44, value, u32
|
|
# Unnamed trailing arguments (global offsets and padding); not read by the kernel code
.arg , "", 8, 48, gox, i64
|
|
.arg , "", 8, 56, goy, i64
|
|
.arg , "", 8, 64, goz, i64
|
|
.arg , "", 8, 72, none, i8
|
|
.arg , "", 8, 80, none, i8
|
|
.arg , "", 8, 88, none, i8
|
|
.arg , "", 8, 96, multigridsyncarg, i8
|
.text
|
|
# Kernel entry point.
# Writes back the scalar data cache, waits until all outstanding memory
# counters (vmcnt, lgkmcnt, vscnt) reach zero, invalidates the instruction
# cache and then branches to "begin".
# NOTE(review): the instruction cache invalidate is presumably needed because
# main_loop jumps into code located in the "programs" buffer - confirm.
randomx_run:
|
|
# clear all caches
|
|
s_dcache_wb
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0)
|
|
s_waitcnt_vscnt null, 0x0
|
|
s_icache_inv
|
|
s_branch begin
|
|
|
|
# Kernel setup. Runs once per wave before main_loop:
#   1. loads the rounding mode of this work-group and programs it into the MODE register
#   2. copies 32 x 8 bytes of this work-group's "registers" data into LDS
#   3. disables all lanes with sub >= 8 (the wave ends if none remain)
#   4. loads the remaining kernel arguments and derives per-group pointers,
#      sizes, masks and constants used by main_loop and the subroutines
#
# pgmrsrc2 = 0x0000008c, bits 1:5 = 6, so first 6 SGPRs (s0-s5) contain user data
|
|
# s6 contains group id
|
|
# v0 contains local id
|
|
begin:
|
|
# s[0:1] - pointer to registers
|
|
# s[2:3] - pointer to rounding modes
|
|
# 16 bytes from kernarg offset 0x10 (arguments "registers" and "rounding_modes")
s_load_dwordx4 s[0:3], s[4:5], 0x10
|
|
|
|
# s[8:9] - group_id*group_size
|
|
# 64-bit value: high dword 0, low dword = group_id << 5 (group size is 32)
s_mov_b32 s9, 0
|
|
s_lshl_b32 s8, s6, 5
|
|
|
|
# v0 - local id (sub)
|
|
# v39 - R[sub]
|
|
# v39 = sub * 8 = byte offset of this lane's 8-byte slot
v_lshlrev_b32 v39, 3, v0
|
|
|
|
# NOTE(review): s12 is only copied to s32 below and later overwritten by
# s_swappc_b64 in main_loop; the purpose of saving s7 is not visible in this file.
s_mov_b32 s12, s7
|
|
|
|
# vcc_lo = "if (sub < 8)"
|
|
v_cmp_gt_u32 vcc_lo, 8, v0
|
|
|
|
# wait for the s_load_dwordx4 above
s_waitcnt lgkmcnt(0)
|
|
|
|
# load rounding mode
|
|
# s[64:65] = rounding_modes + group_id * 4 (kept for the store in main_loop_end)
s_lshl_b32 s16, s6, 2
|
|
s_add_u32 s64, s2, s16
|
|
s_addc_u32 s65, s3, 0
|
|
v_mov_b32 v1, 0
|
|
global_load_dword v1, v1, s[64:65]
|
|
s_waitcnt vmcnt(0)
|
|
# s66 = loaded rounding mode (taken from lane 0)
v_readlane_b32 s66, v1, 0
|
|
# write s66 into the 2-bit field at bit offset 2 of the MODE hardware register
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# s67 = 0: value the FSQRT_R/FDIV_M subroutines write to the same MODE field
# while computing intermediate results
s_mov_b32 s67, 0
|
|
|
|
# ((__local ulong*) R)[sub] = ((__global ulong*) registers)[sub];
|
|
# s[0:1] = registers + (group_id * 32) * 8, v[1:2] = s[0:1] + sub * 8
# v[1:2] is reused in main_loop_end to store the results back
s_lshl_b64 s[2:3], s[8:9], 3
|
|
s_mov_b32 s32, s12
|
|
s_add_u32 s0, s0, s2
|
|
s_addc_u32 s1, s1, s3
|
|
v_add_co_u32 v1, s0, s0, v39
|
|
v_add_co_ci_u32 v2, s0, s1, 0, s0
|
|
global_load_dwordx2 v[4:5], v[1:2], off
|
|
s_waitcnt vmcnt(0)
|
|
# all 32 lanes write their 8 bytes, filling LDS bytes 0..255
ds_write_b64 v39, v[4:5]
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0)
|
|
s_waitcnt_vscnt null, 0x0
|
|
|
|
# "if (sub >= 8) return"
|
|
# exec &= vcc_lo (previous exec saved in s0); only lanes 0-7 stay active
s_and_saveexec_b32 s0, vcc_lo
|
|
s_cbranch_execz program_end
|
|
|
|
# s[8:9] - pointer to dataset
|
|
# s[10:11] - pointer to scratchpads
|
|
# s[0:1] - pointer to programs
|
|
s_load_dwordx4 s[8:11], s[4:5], 0x0
|
|
s_load_dwordx2 s[0:1], s[4:5], 0x20
|
|
|
|
# rx_parameters
|
|
s_load_dword s20, s[4:5], 0x2c
|
|
|
|
v_mov_b32 v5, 0
|
|
v_mov_b32 v10, 0
|
|
s_waitcnt_vscnt null, 0x0
|
|
# v[8:9] = R[sub] (LDS bytes sub*8 .. sub*8+7)
ds_read_b64 v[8:9], v39
|
|
# vcc_lo = (sub < 4)
v_cmp_gt_u32 vcc_lo, 4, v0
|
|
# v0 = sub * 8 from here on
v_lshlrev_b32 v0, 3, v0
|
|
# Fixed-address LDS reads (v5 = 0, ds_read2_b64 offsets are in units of 8 bytes):
#   v[25:26] = LDS[128..135], v[27:28] = LDS[136..143]
#   v11      = LDS[152..155]
#   v[35:36] = LDS[168..175]
#   v[20:21] = LDS[144..151], v[22:23] = LDS[160..167]
ds_read2_b64 v[25:28], v5 offset0:16 offset1:17
|
|
ds_read_b32 v11, v5 offset:152
|
|
ds_read_b64 v[35:36], v5 offset:168
|
|
ds_read2_b64 v[20:23], v5 offset0:18 offset1:20
|
|
# v4 = (sub < 4) ? 0xffffffff : 0x00ffffff
# (AND mask applied in main_loop to the high dword of converted doubles)
v_cndmask_b32 v4, 0xffffff, -1, vcc_lo
|
|
# v5 = sub * 16
v_add_nc_u32 v5, v39, v0
|
|
s_waitcnt lgkmcnt(0)
|
|
v_mov_b32 v13, s11
|
|
v_mov_b32 v7, s1
|
|
v_mov_b32 v6, s0
|
|
|
|
# Scratchpad L1 size
|
|
# s21 = 1 << (5-bit field at bit 0 of rx_parameters)
s_bfe_u32 s21, s20, 0x050000
|
|
s_lshl_b32 s21, 1, s21
|
|
|
|
# Scratchpad L2 size
|
|
# s22 = 1 << (5-bit field at bit 5 of rx_parameters)
s_bfe_u32 s22, s20, 0x050005
|
|
s_lshl_b32 s22, 1, s22
|
|
|
|
# Scratchpad L3 size
|
|
# s23 = 1 << (5-bit field at bit 10 of rx_parameters); the raw field stays in s0
s_bfe_u32 s0, s20, 0x05000A
|
|
s_lshl_b32 s23, 1, s0
|
|
|
|
# program iterations
|
|
# s24 = 1 << (4-bit field at bit 15 of rx_parameters)
s_bfe_u32 s24, s20, 0x04000F
|
|
s_lshl_b32 s24, 1, s24
|
|
|
|
v_mov_b32 v12, s10
|
|
# v[6:7] = programs + group_id * 10048 (10048-byte stride between programs)
v_mad_u64_u32 v[6:7], s2, 10048, s6, v[6:7]
|
|
|
|
# s[4:5] - pointer to current program
|
|
v_readlane_b32 s4, v6, 0
|
|
v_readlane_b32 s5, v7, 0
|
|
|
|
# s2 = 1 << s0 (same value as s23), s3 = s2 + 64 (computed a few lines below)
s_lshl_b32 s2, 1, s0
|
|
# v14:v29 = dataset pointer + v11 (dword read from LDS[152]), with carry
v_add_co_u32 v14, s0, s8, v11
|
|
# For lanes with sub < 4 the OR masks v34, v24, v3, v35 become 0;
# the other lanes keep the values read from LDS[160..175]
v_cndmask_b32 v34, v36, 0, vcc_lo
|
|
v_cndmask_b32 v24, v23, 0, vcc_lo
|
|
v_cndmask_b32 v3, v22, 0, vcc_lo
|
|
s_add_i32 s3, s2, 64
|
|
v_add_co_ci_u32 v29, s0, s9, v10, s0
|
|
v_cndmask_b32 v35, v35, 0, vcc_lo
|
|
# v[22:23] = v14:v29 + sub * 8 (high dword is completed by the add-with-carry below)
v_add_co_u32 v22, vcc_lo, v14, v0
|
|
|
|
# v[12:13] - pointer to current scratchpad
|
|
# = scratchpad + group_id * (L3 size + 64)
v_mad_u64_u32 v[12:13], s2, s3, s6, v[12:13]
|
|
# v10 = high dword, v11 = low dword of LDS[128..135]
# (main_loop's comments refer to v10 as "mx")
v_mov_b32 v10, v26
|
|
v_mov_b32 v11, v25
|
|
# convert the four values read from LDS[136..151] into byte offsets (* 8)
v_lshlrev_b32 v36, 3, v27
|
|
v_lshlrev_b32 v37, 3, v28
|
|
v_lshlrev_b32 v20, 3, v20
|
|
v_lshlrev_b32 v21, 3, v21
|
|
v_add_co_ci_u32 v23, vcc_lo, 0, v29, vcc_lo
|
|
|
|
# rename registers
|
|
# v6 - R[sub]
|
|
# (v39 is reused for a scratchpad mask below)
v_mov_b32 v6, v39
|
|
|
|
# loop counter
|
|
# main_loop decrements s2 and exits on borrow, so it runs s24 times
s_sub_u32 s2, s24, 1
|
|
|
|
# used in IXOR_R instruction
|
|
s_mov_b32 s63, -1
|
|
|
|
# used in CBRANCH instruction
|
|
# s70..s85 = 0xFF shifted left by 8..23 bits
s_mov_b32 s70, (0xFF << 8)
|
|
s_mov_b32 s71, (0xFF << 9)
|
|
s_mov_b32 s72, (0xFF << 10)
|
|
s_mov_b32 s73, (0xFF << 11)
|
|
s_mov_b32 s74, (0xFF << 12)
|
|
s_mov_b32 s75, (0xFF << 13)
|
|
s_mov_b32 s76, (0xFF << 14)
|
|
s_mov_b32 s77, (0xFF << 15)
|
|
s_mov_b32 s78, (0xFF << 16)
|
|
s_mov_b32 s79, (0xFF << 17)
|
|
s_mov_b32 s80, (0xFF << 18)
|
|
s_mov_b32 s81, (0xFF << 19)
|
|
s_mov_b32 s82, (0xFF << 20)
|
|
s_mov_b32 s83, (0xFF << 21)
|
|
s_mov_b32 s84, (0xFF << 22)
|
|
s_mov_b32 s85, (0xFF << 23)
|
|
|
|
# ScratchpadL3Mask64
|
|
# s86 = L3 size - 64
s_sub_u32 s86, s23, 64
|
|
|
|
# Scratchpad masks for scratchpads
|
|
# v38 = L1 size - 8, v39 = L2 size - 8, v50 = L3 size - 8
# (not referenced again in this file; presumably used by the generated program)
v_sub_nc_u32 v38, s21, 8
|
|
v_sub_nc_u32 v39, s22, 8
|
|
v_sub_nc_u32 v50, s23, 8
|
|
|
|
# mask for FSCAL_R
|
|
v_mov_b32 v51, 0x80F00000
|
|
|
|
# load scratchpad base address
|
|
# s[0:1] = lane 0 of v[12:13]
v_readlane_b32 s0, v12, 0
|
|
v_readlane_b32 s1, v13, 0
|
|
|
|
# v41, v44 = 0
|
|
v_mov_b32 v41, 0
|
|
v_mov_b32 v44, 0
|
|
|
|
# v41 = 0 on lane 0, set it to 8 on lane 1
|
|
# v44 = 0 on lane 0, set it to 4 on lane 1
|
|
# exec = 2 selects lane 1 only
s_mov_b64 exec, 2
|
|
v_mov_b32 v41, 8
|
|
v_mov_b32 v44, 4
|
|
|
|
# load group A registers
|
|
# Read low 8 bytes into lane 0 and high 8 bytes into lane 1
|
|
# With v41 = 0/8 the reads cover the 16-byte slots at LDS bytes 192, 208, 224, 240
s_mov_b64 exec, 3
|
|
ds_read2_b64 v[52:55], v41 offset0:24 offset1:26
|
|
ds_read2_b64 v[56:59], v41 offset0:28 offset1:30
|
|
|
|
# xmantissaMask
|
|
v_mov_b32 v77, (1 << 24) - 1
|
|
|
|
# xexponentMask
|
|
# v[78:79] = LDS[160 + v41 ..]
ds_read_b64 v[78:79], v41 offset:160
|
|
|
|
# Restore execution mask
|
|
# 255 = lanes 0-7
s_mov_b64 exec, 255
|
|
|
|
# sign mask (used in FSQRT_R)
|
|
v_mov_b32 v82, 0x80000000
|
|
|
|
# used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64)
|
|
s_mov_b32 s68, 256
|
|
s_mov_b32 s69, 0
|
|
|
|
# High 32 bits of "1.0" constant (used in FDIV_M)
|
|
v_mov_b32 v83, (1023 << 20)
|
|
|
|
# Used to multiply FP64 values by 0.5
|
|
# (subtracted from the high dword of a double in the FSQRT_R subroutines)
v_mov_b32 v84, (1 << 20)
|
|
|
|
# Build the table of subroutine addresses.
# s_getpc_b64 yields the address of the instruction that follows it, i.e. the
# address of cur_addr; each entry is then s[14:15] + (label - cur_addr).
# Resulting table:
#   s[40:41] fsqrt_r_sub0   s[42:43] fsqrt_r_sub1   s[44:45] fsqrt_r_sub2   s[46:47] fsqrt_r_sub3
#   s[48:49] fdiv_m_sub0    s[50:51] fdiv_m_sub1    s[52:53] fdiv_m_sub2    s[54:55] fdiv_m_sub3
#   s[56:57] ismulh_r_sub   s[58:59] imulh_r_sub
# None of these registers is used as a jump target inside this file; presumably
# the program called from main_loop jumps through them - confirm against the generator.
s_getpc_b64 s[14:15]
|
|
cur_addr:
|
|
|
|
# get addresses of FSQRT_R subroutines
|
|
s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr
|
|
s_addc_u32 s41, s15, 0
|
|
s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr
|
|
s_addc_u32 s43, s15, 0
|
|
s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr
|
|
s_addc_u32 s45, s15, 0
|
|
s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr
|
|
s_addc_u32 s47, s15, 0
|
|
|
|
# get addresses of FDIV_M subroutines
|
|
s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr
|
|
s_addc_u32 s49, s15, 0
|
|
s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr
|
|
s_addc_u32 s51, s15, 0
|
|
s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr
|
|
s_addc_u32 s53, s15, 0
|
|
s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr
|
|
s_addc_u32 s55, s15, 0
|
|
|
|
# get address for ISMULH_R subroutine
|
|
s_add_u32 s56, s14, ismulh_r_sub - cur_addr
|
|
s_addc_u32 s57, s15, 0
|
|
|
|
# get address for IMULH_R subroutine
|
|
s_add_u32 s58, s14, imulh_r_sub - cur_addr
|
|
s_addc_u32 s59, s15, 0
|
|
|
|
/*
|
|
used: v0-v6, v8-v37
|
|
not used: v7
|
|
*/
|
|
# NOTE(review): the register list above looks incomplete - the loop body also
# touches v41 and v60-v75; confirm before relying on it.
#
# One iteration of the VM loop. Executed s24 times (s2 counts down from s24 - 1
# and the loop exits on borrow). Per iteration:
#   1. compute two scratchpad addresses (spAddr0 / spAddr1) and load 8 bytes from each
#   2. R[sub] ^= value from spAddr0; the two dwords from spAddr1 are converted
#      to doubles and written to the LDS area at byte 64 + sub*16
#   3. start the dataset load, move the VM state into registers and call the
#      program at s[4:5] with only lanes 0-1 enabled
#   4. write the VM state back to LDS, update v10/v11 and store the results
#      to the scratchpad
main_loop:
|
|
# wait for the scratchpad stores of the previous iteration
s_waitcnt_vscnt null, 0x0
|
|
|
|
# v[27:28] = R[readReg0]
|
|
# v[29:30] = R[readReg1]
|
|
ds_read_b64 v[27:28], v37
|
|
ds_read_b64 v[29:30], v36
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
# R[readReg0] ^ R[readReg1] (high 32 bits)
|
|
v_xor_b32 v28, v28, v30
|
|
|
|
# spAddr1
|
|
# v25 holds the value loaded in "begin" on the first iteration and 0 afterwards
# (it is cleared further down in this loop)
v_xor_b32 v25, v28, v25
|
|
v_and_b32 v25, s86, v25
|
|
v_add_nc_u32 v25, v25, v0
|
|
|
|
# v[16:17] = scratchpad base (s[0:1]) + masked offset + sub*8
v_add_co_u32 v16, vcc_lo, s0, v25
|
|
|
|
# R[readReg0] ^ R[readReg1] (low 32 bits)
|
|
v_xor_b32 v25, v27, v29
|
|
|
|
v_mov_b32 v29, v11
|
|
v_add_co_ci_u32 v17, vcc_lo, 0, s1, vcc_lo
|
|
v_xor_b32 v25, v25, v26
|
|
|
|
# load from spAddr1
|
|
global_load_dwordx2 v[27:28], v[16:17], off
|
|
|
|
# spAddr0
|
|
v_and_b32 v25, s86, v25
|
|
v_add_nc_u32 v25, v25, v0
|
|
|
|
# v[31:32] = scratchpad base + masked offset + sub*8
v_add_co_u32 v31, vcc_lo, s0, v25
|
|
v_add_co_ci_u32 v32, vcc_lo, 0, s1, vcc_lo
|
|
# v[29:30] = v[22:23] + v11 = dataset address for this iteration
v_add_co_u32 v29, vcc_lo, v22, v29
|
|
|
|
# load from spAddr0
|
|
global_load_dwordx2 v[25:26], v[31:32], off
|
|
|
|
v_add_co_ci_u32 v30, vcc_lo, 0, v23, vcc_lo
|
|
# keep the current v11 in v33; it becomes v10 at the end of the iteration
v_mov_b32 v33, v11
|
|
s_and_b32 vcc_lo, exec_lo, vcc_lo
|
|
# first load (spAddr1) has completed
s_waitcnt vmcnt(1)
|
|
# convert the two signed dwords from spAddr1 to doubles
v_cvt_f64_i32 v[14:15], v28
|
|
v_cvt_f64_i32 v[12:13], v27
|
|
v_or_b32 v14, v14, v35
|
|
# second load (spAddr0) has completed
s_waitcnt vmcnt(0)
|
|
|
|
# R[sub] ^= *p0;
|
|
v_xor_b32 v8, v25, v8
|
|
v_xor_b32 v9, v26, v9
|
|
|
|
# high dwords: AND with v4, then OR with the per-lane masks prepared in "begin"
# (v4 = all ones and the OR masks = 0 for sub < 4, so those lanes are unchanged)
v_and_b32 v26, v4, v15
|
|
|
|
v_and_b32 v19, v4, v13
|
|
v_or_b32 v15, v26, v34
|
|
v_or_b32 v18, v12, v3
|
|
# v25 = v26 = 0 for the address computation of the next iteration
v_mov_b32 v26, 0
|
|
v_or_b32 v19, v19, v24
|
|
v_mov_b32 v25, v26
|
|
# v5 = sub*16: the two doubles go to LDS bytes 64 + sub*16 and 72 + sub*16
ds_write2_b64 v5, v[18:19], v[14:15] offset0:8 offset1:9
|
|
|
|
# load from dataset
|
|
# result (v[18:19]) is consumed after the program call, see "next_r" below
global_load_dwordx2 v[18:19], v[29:30], off
|
|
|
|
# load group F,E registers
|
|
# Read low 8 bytes into lane 0 and high 8 bytes into lane 1
|
|
s_mov_b64 exec, 3
|
|
s_waitcnt lgkmcnt(0)
|
|
# v41 = 0 on lane 0 and 8 on lane 1: v[60:67] <- LDS 64..127, v[68:75] <- LDS 128..191
ds_read2_b64 v[60:63], v41 offset0:8 offset1:10
|
|
ds_read2_b64 v[64:67], v41 offset0:12 offset1:14
|
|
ds_read2_b64 v[68:71], v41 offset0:16 offset1:18
|
|
ds_read2_b64 v[72:75], v41 offset0:20 offset1:22
|
|
|
|
# load VM integer registers
|
|
# lane N of v[8:9] -> s[16+2N : 17+2N] for N = 0..7
v_readlane_b32 s16, v8, 0
|
|
v_readlane_b32 s17, v9, 0
|
|
v_readlane_b32 s18, v8, 1
|
|
v_readlane_b32 s19, v9, 1
|
|
v_readlane_b32 s20, v8, 2
|
|
v_readlane_b32 s21, v9, 2
|
|
v_readlane_b32 s22, v8, 3
|
|
v_readlane_b32 s23, v9, 3
|
|
v_readlane_b32 s24, v8, 4
|
|
v_readlane_b32 s25, v9, 4
|
|
v_readlane_b32 s26, v8, 5
|
|
v_readlane_b32 s27, v9, 5
|
|
v_readlane_b32 s28, v8, 6
|
|
v_readlane_b32 s29, v9, 6
|
|
v_readlane_b32 s30, v8, 7
|
|
v_readlane_b32 s31, v9, 7
|
|
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
# Use only first 2 lanes for the program
|
|
s_mov_b64 exec, 3
|
|
|
|
# call JIT code
|
|
# jumps to s[4:5]; the return address is left in s[12:13]
s_swappc_b64 s[12:13], s[4:5]
|
|
|
|
# Write out group F,E registers
|
|
# Write low 8 bytes from lane 0 and high 8 bytes from lane 1
|
|
ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
|
|
ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
|
|
ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
|
|
ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22
|
|
|
|
# store VM integer registers
|
|
# s[16+2N : 17+2N] -> lane N of v[8:9] for N = 0..7
v_writelane_b32 v8, s16, 0
|
|
v_writelane_b32 v9, s17, 0
|
|
v_writelane_b32 v8, s18, 1
|
|
v_writelane_b32 v9, s19, 1
|
|
v_writelane_b32 v8, s20, 2
|
|
v_writelane_b32 v9, s21, 2
|
|
v_writelane_b32 v8, s22, 3
|
|
v_writelane_b32 v9, s23, 3
|
|
v_writelane_b32 v8, s24, 4
|
|
v_writelane_b32 v9, s25, 4
|
|
v_writelane_b32 v8, s26, 5
|
|
v_writelane_b32 v9, s27, 5
|
|
v_writelane_b32 v8, s28, 6
|
|
v_writelane_b32 v9, s29, 6
|
|
v_writelane_b32 v8, s30, 7
|
|
v_writelane_b32 v9, s31, 7
|
|
|
|
# Turn back on 8 execution lanes
|
|
s_mov_b64 exec, 255
|
|
|
|
# Write out VM integer registers
|
|
ds_write_b64 v6, v[8:9]
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
# R[readReg2], R[readReg3]
|
|
# only the low dword of each register is read
ds_read_b32 v11, v21
|
|
ds_read_b32 v27, v20
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
# mx ^= R[readReg2] ^ R[readReg3];
|
|
v_xor_b32 v11, v11, v27
|
|
v_xor_b32 v10, v10, v11
|
|
|
|
# v[27:28] = R[sub]
|
|
# v[29:30] = F[sub]
|
|
# offset1:8 is in units of 8 bytes, i.e. LDS[v6 + 64]
ds_read2_b64 v[27:30], v6 offset1:8
|
|
|
|
# mx &= CacheLineAlignMask;
|
|
# the masked value goes to v11 and the previous v11 (saved in v33) goes to v10,
# so the two values swap roles for the next iteration
v_and_b32 v11, 0x7fffffc0, v10
|
|
v_mov_b32 v10, v33
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
# const ulong next_r = R[sub] ^ data;
|
|
# v[18:19] comes from the dataset global_load issued before the program call.
# This is the only wait in this file that covers that load: without vmcnt(0)
# the loop would depend on the called program having waited for it.
s_waitcnt vmcnt(0) & lgkmcnt(0)
|
|
v_xor_b32 v8, v27, v18
|
|
v_xor_b32 v9, v28, v19
|
|
|
|
# *p1 = next_r;
|
|
global_store_dwordx2 v[16:17], v[8:9], off
|
|
|
|
# v[27:28] = E[sub]
|
|
ds_read_b64 v[27:28], v6 offset:128
|
|
|
|
# R[sub] = next_r;
|
|
ds_write_b64 v6, v[8:9]
|
|
# lgkmcnt(1): the E[sub] read above is done, the R[sub] write may still be pending
s_waitcnt lgkmcnt(1)
|
|
|
|
# *p0 = as_ulong(F[sub]) ^ as_ulong(E[sub]);
|
|
v_xor_b32 v29, v27, v29
|
|
v_xor_b32 v30, v28, v30
|
|
global_store_dwordx2 v[31:32], v[29:30], off
|
|
|
|
# SCC is set when the subtraction borrows, i.e. when s2 was 0 before it
s_sub_u32 s2, s2, 1
|
|
s_cbranch_scc0 main_loop
|
|
# Final write-back after the last iteration.
# v[1:2] is the per-lane address computed in "begin"
# (registers + (group_id*32 + sub) * 8). Three 8-byte values are stored:
#   +0   : v[8:9]   (last R[sub])
#   +64  : v[29:30] (last F[sub] ^ E[sub])
#   +128 : v[27:28] (last E[sub])
main_loop_end:
|
|
|
|
global_store_dwordx2 v[1:2], v[8:9], off
|
|
global_store_dwordx2 v[1:2], v[29:30], off inst_offset:64
|
|
global_store_dwordx2 v[1:2], v[27:28], off inst_offset:128
|
|
|
|
# store rounding mode
|
|
# writes s66 back to rounding_modes[group_id] (address s[64:65] from "begin")
# NOTE(review): s66 is never modified in this file; presumably the called
# program updates it - confirm.
v_mov_b32 v0, 0
|
|
v_mov_b32 v1, s66
|
|
global_store_dword v0, v1, s[64:65]
|
|
|
|
# End of the kernel. Reached by fall-through from main_loop_end and directly
# from "begin" when no lane of the wave has sub < 8.
program_end:
|
|
s_endpgm
|
|
|
|
# FSQRT_R subroutine 0: v[68:69] = sqrt(v[68:69]) on lanes 0-1.
# In:      v[68:69] operand, s[60:61] return address
# Uses:    v82 (sign mask), v84 (1 << 20), s66/s67 (MODE field values), s[68:69] (class mask)
# Clobbers v[28:29], v[42:43], v[46:49]; exec is 3 on return
# With x = v[68:69], y = rsq(x), g = x*y, h = y/2:
#   r = 0.5 - h*g;  g' = g + g*r;  h' = h + h*r;  d = x - g'*g';  result = g' + d*h'
# All steps except the last run with MODE bits [3:2] = s67 (0); the last fma
# runs with s66 so that only the final rounding uses the mode loaded in "begin".
fsqrt_r_sub0:
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rsq_f64 v[28:29], v[68:69]
|
|
|
|
# Improve initial approximation (can be skipped)
|
|
#v_mul_f64 v[42:43], v[28:29], v[68:69]
|
|
#v_mul_f64 v[48:49], v[28:29], -0.5
|
|
#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
|
|
#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
|
|
|
|
# g = x * y
v_mul_f64 v[42:43], v[28:29], v[68:69]
|
|
# h = y * 0.5: same low dword, high dword minus (1 << 20) lowers the exponent by 1
v_mov_b32 v48, v28
|
|
v_sub_nc_u32 v49, v29, v84
|
|
# v[46:47] = -h (sign bit flipped)
v_mov_b32 v46, v28
|
|
v_xor_b32 v47, v49, v82
|
|
# r = 0.5 - h*g
v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
|
|
# g' = g + g*r
v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
|
|
# h' = h + h*r
v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
|
|
# d = x - g'*g'
v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# result = g' + d*h'
v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
|
|
# keep only lanes whose operand is a positive normal value;
# other lanes leave v[68:69] untouched
v_cmpx_class_f64 v[68:69], s[68:69]
|
|
v_mov_b32 v68, v42
|
|
v_mov_b32 v69, v43
|
|
s_mov_b64 exec, 3
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# FSQRT_R subroutine 1: v[70:71] = sqrt(v[70:71]) on lanes 0-1.
# In:      v[70:71] operand, s[60:61] return address
# Uses:    v82 (sign mask), v84 (1 << 20), s66/s67 (MODE field values), s[68:69] (class mask)
# Clobbers v[28:29], v[42:43], v[46:49]; exec is 3 on return
# With x = v[70:71], y = rsq(x), g = x*y, h = y/2:
#   r = 0.5 - h*g;  g' = g + g*r;  h' = h + h*r;  d = x - g'*g';  result = g' + d*h'
# All steps except the last run with MODE bits [3:2] = s67 (0); the last fma
# runs with s66 so that only the final rounding uses the mode loaded in "begin".
fsqrt_r_sub1:
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rsq_f64 v[28:29], v[70:71]
|
|
|
|
# Improve initial approximation (can be skipped)
|
|
#v_mul_f64 v[42:43], v[28:29], v[70:71]
|
|
#v_mul_f64 v[48:49], v[28:29], -0.5
|
|
#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
|
|
#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
|
|
|
|
# g = x * y
v_mul_f64 v[42:43], v[28:29], v[70:71]
|
|
# h = y * 0.5: same low dword, high dword minus (1 << 20) lowers the exponent by 1
v_mov_b32 v48, v28
|
|
v_sub_nc_u32 v49, v29, v84
|
|
# v[46:47] = -h (sign bit flipped)
v_mov_b32 v46, v28
|
|
v_xor_b32 v47, v49, v82
|
|
# r = 0.5 - h*g
v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
|
|
# g' = g + g*r
v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
|
|
# h' = h + h*r
v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
|
|
# d = x - g'*g'
v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# result = g' + d*h'
v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
|
|
# keep only lanes whose operand is a positive normal value;
# other lanes leave v[70:71] untouched
v_cmpx_class_f64 v[70:71], s[68:69]
|
|
v_mov_b32 v70, v42
|
|
v_mov_b32 v71, v43
|
|
s_mov_b64 exec, 3
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# FSQRT_R subroutine 2: v[72:73] = sqrt(v[72:73]) on lanes 0-1.
# In:      v[72:73] operand, s[60:61] return address
# Uses:    v82 (sign mask), v84 (1 << 20), s66/s67 (MODE field values), s[68:69] (class mask)
# Clobbers v[28:29], v[42:43], v[46:49]; exec is 3 on return
# With x = v[72:73], y = rsq(x), g = x*y, h = y/2:
#   r = 0.5 - h*g;  g' = g + g*r;  h' = h + h*r;  d = x - g'*g';  result = g' + d*h'
# All steps except the last run with MODE bits [3:2] = s67 (0); the last fma
# runs with s66 so that only the final rounding uses the mode loaded in "begin".
fsqrt_r_sub2:
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rsq_f64 v[28:29], v[72:73]
|
|
|
|
# Improve initial approximation (can be skipped)
|
|
#v_mul_f64 v[42:43], v[28:29], v[72:73]
|
|
#v_mul_f64 v[48:49], v[28:29], -0.5
|
|
#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
|
|
#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
|
|
|
|
# g = x * y
v_mul_f64 v[42:43], v[28:29], v[72:73]
|
|
# h = y * 0.5: same low dword, high dword minus (1 << 20) lowers the exponent by 1
v_mov_b32 v48, v28
|
|
v_sub_nc_u32 v49, v29, v84
|
|
# v[46:47] = -h (sign bit flipped)
v_mov_b32 v46, v28
|
|
v_xor_b32 v47, v49, v82
|
|
# r = 0.5 - h*g
v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
|
|
# g' = g + g*r
v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
|
|
# h' = h + h*r
v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
|
|
# d = x - g'*g'
v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# result = g' + d*h'
v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
|
|
# keep only lanes whose operand is a positive normal value;
# other lanes leave v[72:73] untouched
v_cmpx_class_f64 v[72:73], s[68:69]
|
|
v_mov_b32 v72, v42
|
|
v_mov_b32 v73, v43
|
|
s_mov_b64 exec, 3
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# FSQRT_R subroutine 3: v[74:75] = sqrt(v[74:75]) on lanes 0-1.
# In:      v[74:75] operand, s[60:61] return address
# Uses:    v82 (sign mask), v84 (1 << 20), s66/s67 (MODE field values), s[68:69] (class mask)
# Clobbers v[28:29], v[42:43], v[46:49]; exec is 3 on return
# With x = v[74:75], y = rsq(x), g = x*y, h = y/2:
#   r = 0.5 - h*g;  g' = g + g*r;  h' = h + h*r;  d = x - g'*g';  result = g' + d*h'
# All steps except the last run with MODE bits [3:2] = s67 (0); the last fma
# runs with s66 so that only the final rounding uses the mode loaded in "begin".
fsqrt_r_sub3:
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rsq_f64 v[28:29], v[74:75]
|
|
|
|
# Improve initial approximation (can be skipped)
|
|
#v_mul_f64 v[42:43], v[28:29], v[74:75]
|
|
#v_mul_f64 v[48:49], v[28:29], -0.5
|
|
#v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
|
|
#v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]
|
|
|
|
# g = x * y
v_mul_f64 v[42:43], v[28:29], v[74:75]
|
|
# h = y * 0.5: same low dword, high dword minus (1 << 20) lowers the exponent by 1
v_mov_b32 v48, v28
|
|
v_sub_nc_u32 v49, v29, v84
|
|
# v[46:47] = -h (sign bit flipped)
v_mov_b32 v46, v28
|
|
v_xor_b32 v47, v49, v82
|
|
# r = 0.5 - h*g
v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
|
|
# g' = g + g*r
v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
|
|
# h' = h + h*r
v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
|
|
# d = x - g'*g'
v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# result = g' + d*h'
v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
|
|
# keep only lanes whose operand is a positive normal value;
# other lanes leave v[74:75] untouched
v_cmpx_class_f64 v[74:75], s[68:69]
|
|
v_mov_b32 v74, v42
|
|
v_mov_b32 v75, v43
|
|
s_mov_b64 exec, 3
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# FDIV_M subroutine 0: v[68:69] = v[68:69] / b on lanes 0-1, where b is
# v[28:29] after masking with v77 (xmantissaMask) and OR-ing v[78:79] (xexponentMask).
# In:      v[68:69] dividend, v[28:29] raw divisor (set by the caller), s[60:61] return address
# Clobbers v[28:29], v[42:43], v[48:49], v[80:81]; exec is 3 on return
# With a = v[68:69]: r = rcp(b); e = 1 - b*r; r' = r + r*e; q = a*r';
#   rem = a - b*q; q' = rem*r' + q; result = div_fixup(q', b, a)
# All steps up to "rem" run with MODE bits [3:2] = s67 (0); the final fma runs with s66.
fdiv_m_sub0:
|
|
v_or_b32 v28, v28, v78
|
|
# high dword: (v29 & v77) | v79
v_and_or_b32 v29, v29, v77, v79
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rcp_f64 v[48:49], v[28:29]
|
|
# e = 1 - b*r
v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
|
|
# r' = r + r*e
v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
|
|
# q = a * r'
v_mul_f64 v[80:81], v[68:69], v[48:49]
|
|
# rem = a - b*q
v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# q' = rem*r' + q
v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
|
|
v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]
|
|
# on lanes where a == b the result is forced to exactly 1.0 (0x3FF00000:00000000)
v_cmpx_eq_f64 v[68:69], v[28:29]
|
|
v_mov_b32 v80, 0
|
|
v_mov_b32 v81, v83
|
|
s_mov_b64 exec, 3
|
|
v_mov_b32 v68, v80
|
|
v_mov_b32 v69, v81
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# FDIV_M subroutine 1: v[70:71] = v[70:71] / b on lanes 0-1, where b is
# v[28:29] after masking with v77 (xmantissaMask) and OR-ing v[78:79] (xexponentMask).
# In:      v[70:71] dividend, v[28:29] raw divisor (set by the caller), s[60:61] return address
# Clobbers v[28:29], v[42:43], v[48:49], v[80:81]; exec is 3 on return
# With a = v[70:71]: r = rcp(b); e = 1 - b*r; r' = r + r*e; q = a*r';
#   rem = a - b*q; q' = rem*r' + q; result = div_fixup(q', b, a)
# All steps up to "rem" run with MODE bits [3:2] = s67 (0); the final fma runs with s66.
fdiv_m_sub1:
|
|
v_or_b32 v28, v28, v78
|
|
# high dword: (v29 & v77) | v79
v_and_or_b32 v29, v29, v77, v79
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rcp_f64 v[48:49], v[28:29]
|
|
# e = 1 - b*r
v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
|
|
# r' = r + r*e
v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
|
|
# q = a * r'
v_mul_f64 v[80:81], v[70:71], v[48:49]
|
|
# rem = a - b*q
v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# q' = rem*r' + q
v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
|
|
v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]
|
|
# on lanes where a == b the result is forced to exactly 1.0 (0x3FF00000:00000000)
v_cmpx_eq_f64 v[70:71], v[28:29]
|
|
v_mov_b32 v80, 0
|
|
v_mov_b32 v81, v83
|
|
s_mov_b64 exec, 3
|
|
v_mov_b32 v70, v80
|
|
v_mov_b32 v71, v81
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# FDIV_M subroutine 2: v[72:73] = v[72:73] / b on lanes 0-1, where b is
# v[28:29] after masking with v77 (xmantissaMask) and OR-ing v[78:79] (xexponentMask).
# In:      v[72:73] dividend, v[28:29] raw divisor (set by the caller), s[60:61] return address
# Clobbers v[28:29], v[42:43], v[48:49], v[80:81]; exec is 3 on return
# With a = v[72:73]: r = rcp(b); e = 1 - b*r; r' = r + r*e; q = a*r';
#   rem = a - b*q; q' = rem*r' + q; result = div_fixup(q', b, a)
# All steps up to "rem" run with MODE bits [3:2] = s67 (0); the final fma runs with s66.
fdiv_m_sub2:
|
|
v_or_b32 v28, v28, v78
|
|
# high dword: (v29 & v77) | v79
v_and_or_b32 v29, v29, v77, v79
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rcp_f64 v[48:49], v[28:29]
|
|
# e = 1 - b*r
v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
|
|
# r' = r + r*e
v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
|
|
# q = a * r'
v_mul_f64 v[80:81], v[72:73], v[48:49]
|
|
# rem = a - b*q
v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# q' = rem*r' + q
v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
|
|
v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]
|
|
# on lanes where a == b the result is forced to exactly 1.0 (0x3FF00000:00000000)
v_cmpx_eq_f64 v[72:73], v[28:29]
|
|
v_mov_b32 v80, 0
|
|
v_mov_b32 v81, v83
|
|
s_mov_b64 exec, 3
|
|
v_mov_b32 v72, v80
|
|
v_mov_b32 v73, v81
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# FDIV_M subroutine 3: v[74:75] = v[74:75] / b on lanes 0-1, where b is
# v[28:29] after masking with v77 (xmantissaMask) and OR-ing v[78:79] (xexponentMask).
# In:      v[74:75] dividend, v[28:29] raw divisor (set by the caller), s[60:61] return address
# Clobbers v[28:29], v[42:43], v[48:49], v[80:81]; exec is 3 on return
# With a = v[74:75]: r = rcp(b); e = 1 - b*r; r' = r + r*e; q = a*r';
#   rem = a - b*q; q' = rem*r' + q; result = div_fixup(q', b, a)
# All steps up to "rem" run with MODE bits [3:2] = s67 (0); the final fma runs with s66.
fdiv_m_sub3:
|
|
v_or_b32 v28, v28, v78
|
|
# high dword: (v29 & v77) | v79
v_and_or_b32 v29, v29, v77, v79
|
|
s_setreg_b32 hwreg(mode, 2, 2), s67
|
|
v_rcp_f64 v[48:49], v[28:29]
|
|
# e = 1 - b*r
v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
|
|
# r' = r + r*e
v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
|
|
# q = a * r'
v_mul_f64 v[80:81], v[74:75], v[48:49]
|
|
# rem = a - b*q
v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75]
|
|
s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
# q' = rem*r' + q
v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
|
|
v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]
|
|
# on lanes where a == b the result is forced to exactly 1.0 (0x3FF00000:00000000)
v_cmpx_eq_f64 v[74:75], v[28:29]
|
|
v_mov_b32 v80, 0
|
|
v_mov_b32 v81, v83
|
|
s_mov_b64 exec, 3
|
|
v_mov_b32 v74, v80
|
|
v_mov_b32 v75, v81
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# ISMULH_R subroutine: s[14:15] = high 64 bits of the signed 128-bit product
# s[14:15] * s[38:39].
# In:      s[14:15] = a, s[38:39] = b, s[60:61] return address
# Out:     s[14:15]
# Clobbers v40, v[42:43], v[45:47], vcc_lo, s[32:35]; exec is 3 on return
# Relies on v41 and v44 being 0 on lane 0 (set in "begin"), so that v[40:41]
# and v[43:44] act as zero-extended 32-bit addends.
# The unsigned high part is computed first from the four 32x32 partial
# products, then corrected: subtract b if a < 0, subtract a if b < 0.
ismulh_r_sub:
|
|
# lane 0 only
s_mov_b64 exec, 1
|
|
v_mov_b32 v45, s14
|
|
# v40 = hi32(b.lo * a.lo)
v_mul_hi_u32 v40, s38, v45
|
|
v_mov_b32 v47, s15
|
|
# v[42:43] = b.lo * a.hi + v40
v_mad_u64_u32 v[42:43], s32, s38, v47, v[40:41]
|
|
v_mov_b32 v40, v42
|
|
# v[45:46] = b.hi * a.lo + low dword of the previous sum; v46 carries into the high part
v_mad_u64_u32 v[45:46], s32, s39, v45, v[40:41]
|
|
# v[42:43] = b.hi * a.hi + v43
v_mad_u64_u32 v[42:43], s32, s39, v47, v[43:44]
|
|
v_add_co_u32 v42, vcc_lo, v42, v46
|
|
v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo
|
|
# s[32:33] = unsigned high 64 bits
v_readlane_b32 s32, v42, 0
|
|
v_readlane_b32 s33, v43, 0
|
|
# if a < 0: s[32:33] -= b
s_cmp_lt_i32 s15, 0
|
|
s_cselect_b64 s[34:35], s[38:39], 0
|
|
s_sub_u32 s32, s32, s34
|
|
s_subb_u32 s33, s33, s35
|
|
# if b < 0: result = s[32:33] - a, written to s[14:15]
s_cmp_lt_i32 s39, 0
|
|
s_cselect_b64 s[34:35], s[14:15], 0
|
|
s_sub_u32 s14, s32, s34
|
|
s_subb_u32 s15, s33, s35
|
|
s_mov_b64 exec, 3
|
|
s_setpc_b64 s[60:61]
|
|
|
|
# IMULH_R subroutine: s[14:15] = high 64 bits of the unsigned 128-bit product
# s[14:15] * s[38:39].
# In:      s[14:15] = a, s[38:39] = b, s[60:61] return address
# Out:     s[14:15]
# Clobbers v40, v[42:43], v[45:47], vcc_lo, s32; exec is 3 on return
# Relies on v41 and v44 being 0 on lane 0 (set in "begin"), so that v[40:41]
# and v[43:44] act as zero-extended 32-bit addends.
imulh_r_sub:
|
|
# lane 0 only
s_mov_b64 exec, 1
|
|
v_mov_b32 v45, s38
|
|
# v40 = hi32(a.lo * b.lo)
v_mul_hi_u32 v40, s14, v45
|
|
v_mov_b32 v47, s39
|
|
# v[42:43] = a.lo * b.hi + v40
v_mad_u64_u32 v[42:43], s32, s14, v47, v[40:41]
|
|
v_mov_b32 v40, v42
|
|
# v[45:46] = a.hi * b.lo + low dword of the previous sum; v46 carries into the high part
v_mad_u64_u32 v[45:46], s32, s15, v45, v[40:41]
|
|
# v[42:43] = a.hi * b.hi + v43
v_mad_u64_u32 v[42:43], s32, s15, v47, v[43:44]
|
|
v_add_co_u32 v42, vcc_lo, v42, v46
|
|
v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo
|
|
# result from lane 0 back into s[14:15]
v_readlane_b32 s14, v42, 0
|
|
v_readlane_b32 s15, v43, 0
|
|
s_mov_b64 exec, 3
|
|
s_setpc_b64 s[60:61]
|