/*
Copyright (c) 2019-2020 SChernykh

This file is part of RandomX OpenCL.

RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
*/

.rocm
.gpu GFX1010
.arch_minor 1
.arch_stepping 0
.eflags 53
.llvm10binfmt
.metadatav3
.md_version 1, 0
.globaldata
.fill 64, 1, 0
.kernel randomx_run
	.config
		.dims x
		.sgprsnum 96
		.vgprsnum 128
		.shared_vgprs 0
		.dx10clamp
		.ieeemode
		.floatmode 0xf0
		.priority 0
		.exceptions 0
		.userdatanum 6

		# https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc1-gfx6-gfx10-table
		# https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc2-gfx6-gfx10-table
		# https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc3-gfx10-table
		.pgmrsrc1 0x40af0105
		.pgmrsrc2 0x0000008c
		.pgmrsrc3 0x00000000

		.group_segment_fixed_size 256
		.private_segment_fixed_size 0
		.kernel_code_entry_offset 0x10c0
		.use_private_segment_buffer
		.use_kernarg_segment_ptr
		.use_wave32
	.config
		.md_symname "randomx_run.kd"
		.md_language "OpenCL C", 1, 2
		.reqd_work_group_size 32, 1, 1
		.md_kernarg_segment_size 104
		.md_kernarg_segment_align 8
		.md_group_segment_fixed_size 256
		.md_private_segment_fixed_size 0
		.md_wavefront_size 32
		.md_sgprsnum 96
		.md_vgprsnum 128
		.spilledsgprs 0
		.spilledvgprs 0
		.max_flat_work_group_size 32
		.arg dataset, "uchar*", 8, 0, globalbuf, u8, global, default const
		.arg scratchpad, "uchar*", 8, 8, globalbuf, u8, global, default
		.arg registers, "ulong*", 8, 16, globalbuf, u64, global, default
		.arg rounding_modes, "uint*", 8, 24, globalbuf, u32, global, default
		.arg programs, "uint*", 8, 32, globalbuf, u32, global, default
		.arg batch_size, "uint", 4, 40, value, u32
		.arg rx_parameters, "uint", 4, 44, value, u32
		.arg , "", 8, 48, gox, i64
		.arg , "", 8, 56, goy, i64
		.arg , "", 8, 64, goz, i64
		.arg , "", 8, 72, none, i8
		.arg , "", 8, 80, none, i8
		.arg , "", 8, 88, none, i8
		.arg , "", 8, 96, multigridsyncarg, i8
.text

/*
randomx_run kernel entry point (wave32, work group of 32 lanes).

Inputs (see .config above):
	s[4:5] - kernel argument segment pointer
	s6     - group id
	v0     - local id ("sub")

Outline:
	1. Load the rounding mode for this group and program it into the MODE register.
	2. Copy registers[group][sub] from global memory into LDS (R).
	3. Lanes with sub >= 8 exit; the remaining 8 lanes run the main loop.
	4. Each loop iteration mixes scratchpad/dataset data into the VM registers
	   and calls the JIT-compiled program through s[4:5] with only lanes 0-1 enabled.
	5. After the last iteration the registers and the rounding mode are written back.

Long-lived registers set up before main_loop (the JIT code is not part of this
file; presumably it relies on these assignments - verify against the JIT compiler):
	s[0:1]   - scratchpad base address of this group
	s2       - loop counter
	s[4:5]   - address of the JIT program of this group
	s[12:13] - return address written by s_swappc_b64
	s[16:31] - VM integer registers during the JIT call
	s[40:47] - addresses of fsqrt_r_sub0..3
	s[48:55] - addresses of fdiv_m_sub0..3
	s[56:57] - address of ismulh_r_sub
	s[58:59] - address of imulh_r_sub
	s63      - all ones (IXOR_R)
	s[64:65] - address of this group's rounding mode
	s66      - current rounding mode, s67 - zero (default rounding mode)
	s[68:69] - class mask for v_cmpx_class_f64 (FSQRT_R)
	s70-s85  - CBRANCH masks
	s86      - ScratchpadL3Mask64
	v[52:59] - group A registers, v[60:75] - group F,E registers (lanes 0-1)
*/
randomx_run:
	# clear all caches
	s_dcache_wb
	s_waitcnt       vmcnt(0) & lgkmcnt(0)
	s_waitcnt_vscnt null, 0x0
	s_icache_inv
	s_branch        begin

	# pgmrsrc2 = 0x0000008c, bits 1:5 = 6, so first 6 SGPRs (s0-s5) contain user data
	# s6 contains group id
	# v0 contains local id
begin:
	# s[0:1] - pointer to registers
	# s[2:3] - pointer to rounding modes
	s_load_dwordx4  s[0:3], s[4:5], 0x10

	# s[8:9] - group_id*group_size
	s_mov_b32       s9, 0
	s_lshl_b32      s8, s6, 5

	# v0 - local id (sub)
	# v39 - R[sub]
	v_lshlrev_b32   v39, 3, v0

	s_mov_b32       s12, s7

	# vcc_lo = "if (sub < 8)"
	v_cmp_gt_u32    vcc_lo, 8, v0

	s_waitcnt       lgkmcnt(0)

	# load rounding mode
	s_lshl_b32      s16, s6, 2
	s_add_u32       s64, s2, s16
	s_addc_u32      s65, s3, 0
	v_mov_b32       v1, 0
	global_load_dword v1, v1, s[64:65]
	s_waitcnt       vmcnt(0)
	v_readlane_b32  s66, v1, 0
	s_setreg_b32    hwreg(mode, 2, 2), s66
	s_mov_b32       s67, 0

	# ((__local ulong*) R)[sub] = ((__global ulong*) registers)[sub];
	s_lshl_b64      s[2:3], s[8:9], 3
	s_mov_b32       s32, s12
	s_add_u32       s0, s0, s2
	s_addc_u32      s1, s1, s3
	v_add_co_u32    v1, s0, s0, v39
	v_add_co_ci_u32 v2, s0, s1, 0, s0
	global_load_dwordx2 v[4:5], v[1:2], off
	s_waitcnt       vmcnt(0)
	ds_write_b64    v39, v[4:5]
	s_waitcnt       vmcnt(0) & lgkmcnt(0)
	s_waitcnt_vscnt null, 0x0

	# "if (sub >= 8) return"
	s_and_saveexec_b32 s0, vcc_lo
	s_cbranch_execz program_end

	# s[8:9] - pointer to dataset
	# s[10:11] - pointer to scratchpads
	# s[0:1] - pointer to programs
	s_load_dwordx4  s[8:11], s[4:5], 0x0
	s_load_dwordx2  s[0:1], s[4:5], 0x20

	# rx_parameters
	s_load_dword    s20, s[4:5], 0x2c

	v_mov_b32       v5, 0
	v_mov_b32       v10, 0
	s_waitcnt_vscnt null, 0x0
	ds_read_b64     v[8:9], v39
	v_cmp_gt_u32    vcc_lo, 4, v0
	v_lshlrev_b32   v0, 3, v0
	ds_read2_b64    v[25:28], v5 offset0:16 offset1:17
	ds_read_b32     v11, v5 offset:152
	ds_read_b64     v[35:36], v5 offset:168
	ds_read2_b64    v[20:23], v5 offset0:18 offset1:20
	v_cndmask_b32   v4, 0xffffff, -1, vcc_lo
	v_add_nc_u32    v5, v39, v0
	s_waitcnt       lgkmcnt(0)
	v_mov_b32       v13, s11
	v_mov_b32       v7, s1
	v_mov_b32       v6, s0

	# Scratchpad L1 size (1 << bits 0-4 of rx_parameters)
	s_bfe_u32       s21, s20, 0x050000
	s_lshl_b32      s21, 1, s21

	# Scratchpad L2 size (1 << bits 5-9 of rx_parameters)
	s_bfe_u32       s22, s20, 0x050005
	s_lshl_b32      s22, 1, s22

	# Scratchpad L3 size (1 << bits 10-14 of rx_parameters)
	s_bfe_u32       s0, s20, 0x05000A
	s_lshl_b32      s23, 1, s0

	# program iterations (1 << bits 15-18 of rx_parameters)
	s_bfe_u32       s24, s20, 0x04000F
	s_lshl_b32      s24, 1, s24

	v_mov_b32       v12, s10

	# v[6:7] = programs + group_id * 10048
	v_mad_u64_u32   v[6:7], s2, 10048, s6, v[6:7]

	# s[4:5] - pointer to current program
	v_readlane_b32  s4, v6, 0
	v_readlane_b32  s5, v7, 0

	# s2 = Scratchpad L3 size (s0 still holds its log2), s3 = s2 + 64
	s_lshl_b32      s2, 1, s0
	v_add_co_u32    v14, s0, s8, v11
	v_cndmask_b32   v34, v36, 0, vcc_lo
	v_cndmask_b32   v24, v23, 0, vcc_lo
	v_cndmask_b32   v3, v22, 0, vcc_lo
	s_add_i32       s3, s2, 64
	v_add_co_ci_u32 v29, s0, s9, v10, s0
	v_cndmask_b32   v35, v35, 0, vcc_lo
	v_add_co_u32    v22, vcc_lo, v14, v0

	# v[12:13] - pointer to current scratchpad
	v_mad_u64_u32   v[12:13], s2, s3, s6, v[12:13]
	v_mov_b32       v10, v26
	v_mov_b32       v11, v25
	v_lshlrev_b32   v36, 3, v27
	v_lshlrev_b32   v37, 3, v28
	v_lshlrev_b32   v20, 3, v20
	v_lshlrev_b32   v21, 3, v21
	v_add_co_ci_u32 v23, vcc_lo, 0, v29, vcc_lo

	# rename registers
	# v6 - R[sub]
	v_mov_b32       v6, v39

	# loop counter
	s_sub_u32       s2, s24, 1

	# used in IXOR_R instruction
	s_mov_b32       s63, -1

	# used in CBRANCH instruction
	s_mov_b32       s70, (0xFF << 8)
	s_mov_b32       s71, (0xFF << 9)
	s_mov_b32       s72, (0xFF << 10)
	s_mov_b32       s73, (0xFF << 11)
	s_mov_b32       s74, (0xFF << 12)
	s_mov_b32       s75, (0xFF << 13)
	s_mov_b32       s76, (0xFF << 14)
	s_mov_b32       s77, (0xFF << 15)
	s_mov_b32       s78, (0xFF << 16)
	s_mov_b32       s79, (0xFF << 17)
	s_mov_b32       s80, (0xFF << 18)
	s_mov_b32       s81, (0xFF << 19)
	s_mov_b32       s82, (0xFF << 20)
	s_mov_b32       s83, (0xFF << 21)
	s_mov_b32       s84, (0xFF << 22)
	s_mov_b32       s85, (0xFF << 23)

	# ScratchpadL3Mask64
	s_sub_u32       s86, s23, 64

	# Address masks for scratchpad L1, L2 and L3 (size - 8)
	v_sub_nc_u32    v38, s21, 8
	v_sub_nc_u32    v39, s22, 8
	v_sub_nc_u32    v50, s23, 8

	# mask for FSCAL_R
	v_mov_b32       v51, 0x80F00000

	# load scratchpad base address
	v_readlane_b32  s0, v12, 0
	v_readlane_b32  s1, v13, 0

	# v41, v44 = 0
	v_mov_b32       v41, 0
	v_mov_b32       v44, 0

	# v41 = 0 on lane 0, set it to 8 on lane 1
	# v44 = 0 on lane 0, set it to 4 on lane 1
	s_mov_b64       exec, 2
	v_mov_b32       v41, 8
	v_mov_b32       v44, 4

	# load group A registers
	# Read low 8 bytes into lane 0 and high 8 bytes into lane 1
	s_mov_b64       exec, 3
	ds_read2_b64    v[52:55], v41 offset0:24 offset1:26
	ds_read2_b64    v[56:59], v41 offset0:28 offset1:30

	# xmantissaMask
	v_mov_b32       v77, (1 << 24) - 1

	# xexponentMask
	ds_read_b64     v[78:79], v41 offset:160

	# Restore execution mask
	s_mov_b64       exec, 255

	# sign mask (used in FSQRT_R)
	v_mov_b32       v82, 0x80000000

	# used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64)
	s_mov_b32       s68, 256
	s_mov_b32       s69, 0

	# High 32 bits of "1.0" constant (used in FDIV_M)
	v_mov_b32       v83, (1023 << 20)

	# Used to multiply FP64 values by 0.5
	v_mov_b32       v84, (1 << 20)

	# s[14:15] = address of cur_addr; subroutine addresses are computed relative to it
	s_getpc_b64     s[14:15]
cur_addr:

	# get addresses of FSQRT_R subroutines
	s_add_u32       s40, s14, fsqrt_r_sub0 - cur_addr
	s_addc_u32      s41, s15, 0
	s_add_u32       s42, s14, fsqrt_r_sub1 - cur_addr
	s_addc_u32      s43, s15, 0
	s_add_u32       s44, s14, fsqrt_r_sub2 - cur_addr
	s_addc_u32      s45, s15, 0
	s_add_u32       s46, s14, fsqrt_r_sub3 - cur_addr
	s_addc_u32      s47, s15, 0

	# get addresses of FDIV_M subroutines
	s_add_u32       s48, s14, fdiv_m_sub0 - cur_addr
	s_addc_u32      s49, s15, 0
	s_add_u32       s50, s14, fdiv_m_sub1 - cur_addr
	s_addc_u32      s51, s15, 0
	s_add_u32       s52, s14, fdiv_m_sub2 - cur_addr
	s_addc_u32      s53, s15, 0
	s_add_u32       s54, s14, fdiv_m_sub3 - cur_addr
	s_addc_u32      s55, s15, 0

	# get address for ISMULH_R subroutine
	s_add_u32       s56, s14, ismulh_r_sub - cur_addr
	s_addc_u32      s57, s15, 0

	# get address for IMULH_R subroutine
	s_add_u32       s58, s14, imulh_r_sub - cur_addr
	s_addc_u32      s59, s15, 0

	/*
	used: v0-v6, v8-v37
	not used: v7
	*/
main_loop:
	s_waitcnt_vscnt null, 0x0

	# v[27:28] = R[readReg0]
	# v[29:30] = R[readReg1]
	# (the two values are only XOR-ed together, so their order does not matter)
	ds_read_b64     v[27:28], v37
	ds_read_b64     v[29:30], v36
	s_waitcnt       lgkmcnt(0)

	# R[readReg0] ^ R[readReg1] (high 32 bits)
	v_xor_b32       v28, v28, v30

	# spAddr1
	v_xor_b32       v25, v28, v25
	v_and_b32       v25, s86, v25
	v_add_nc_u32    v25, v25, v0

	# v[16:17] = scratchpad base + spAddr1 (p1)
	v_add_co_u32    v16, vcc_lo, s0, v25

	# R[readReg0] ^ R[readReg1] (low 32 bits)
	v_xor_b32       v25, v27, v29

	v_mov_b32       v29, v11
	v_add_co_ci_u32 v17, vcc_lo, 0, s1, vcc_lo
	v_xor_b32       v25, v25, v26

	# load from spAddr1
	global_load_dwordx2 v[27:28], v[16:17], off

	# spAddr0
	v_and_b32       v25, s86, v25
	v_add_nc_u32    v25, v25, v0

	# v[31:32] = scratchpad base + spAddr0 (p0)
	v_add_co_u32    v31, vcc_lo, s0, v25
	v_add_co_ci_u32 v32, vcc_lo, 0, s1, vcc_lo

	# v[29:30] = dataset address for this lane (v[22:23]) + v11
	v_add_co_u32    v29, vcc_lo, v22, v29

	# load from spAddr0
	global_load_dwordx2 v[25:26], v[31:32], off
	v_add_co_ci_u32 v30, vcc_lo, 0, v23, vcc_lo

	# v33 keeps the current v11; it is moved into v10 after the JIT call
	v_mov_b32       v33, v11
	s_and_b32       vcc_lo, exec_lo, vcc_lo

	# wait for the spAddr1 load only, then convert both dwords to FP64
	s_waitcnt       vmcnt(1)
	v_cvt_f64_i32   v[14:15], v28
	v_cvt_f64_i32   v[12:13], v27
	v_or_b32        v14, v14, v35
	s_waitcnt       vmcnt(0)

	# R[sub] ^= *p0;
	v_xor_b32       v8, v25, v8
	v_xor_b32       v9, v26, v9

	# apply the per-lane AND mask (v4) and OR masks (v3, v24, v34, v35),
	# then store both converted values to LDS at v5 + 64 and v5 + 72
	v_and_b32       v26, v4, v15
	v_and_b32       v19, v4, v13
	v_or_b32        v15, v26, v34
	v_or_b32        v18, v12, v3
	v_mov_b32       v26, 0
	v_or_b32        v19, v19, v24
	v_mov_b32       v25, v26
	ds_write2_b64   v5, v[18:19], v[14:15] offset0:8 offset1:9

	# load from dataset
	# (the result in v[18:19] is consumed only after the JIT call)
	global_load_dwordx2 v[18:19], v[29:30], off

	# load group F,E registers
	# Read low 8 bytes into lane 0 and high 8 bytes into lane 1
	s_mov_b64       exec, 3
	s_waitcnt       lgkmcnt(0)
	ds_read2_b64    v[60:63], v41 offset0:8 offset1:10
	ds_read2_b64    v[64:67], v41 offset0:12 offset1:14
	ds_read2_b64    v[68:71], v41 offset0:16 offset1:18
	ds_read2_b64    v[72:75], v41 offset0:20 offset1:22

	# load VM integer registers
	v_readlane_b32  s16, v8, 0
	v_readlane_b32  s17, v9, 0
	v_readlane_b32  s18, v8, 1
	v_readlane_b32  s19, v9, 1
	v_readlane_b32  s20, v8, 2
	v_readlane_b32  s21, v9, 2
	v_readlane_b32  s22, v8, 3
	v_readlane_b32  s23, v9, 3
	v_readlane_b32  s24, v8, 4
	v_readlane_b32  s25, v9, 4
	v_readlane_b32  s26, v8, 5
	v_readlane_b32  s27, v9, 5
	v_readlane_b32  s28, v8, 6
	v_readlane_b32  s29, v9, 6
	v_readlane_b32  s30, v8, 7
	v_readlane_b32  s31, v9, 7

	s_waitcnt       lgkmcnt(0)

	# Use only first 2 lanes for the program
	s_mov_b64       exec, 3

	# call JIT code
	s_swappc_b64    s[12:13], s[4:5]

	# Write out group F,E registers
	# Write low 8 bytes from lane 0 and high 8 bytes from lane 1
	ds_write2_b64   v41, v[60:61], v[62:63] offset0:8 offset1:10
	ds_write2_b64   v41, v[64:65], v[66:67] offset0:12 offset1:14
	ds_write2_b64   v41, v[68:69], v[70:71] offset0:16 offset1:18
	ds_write2_b64   v41, v[72:73], v[74:75] offset0:20 offset1:22

	# store VM integer registers
	v_writelane_b32 v8, s16, 0
	v_writelane_b32 v9, s17, 0
	v_writelane_b32 v8, s18, 1
	v_writelane_b32 v9, s19, 1
	v_writelane_b32 v8, s20, 2
	v_writelane_b32 v9, s21, 2
	v_writelane_b32 v8, s22, 3
	v_writelane_b32 v9, s23, 3
	v_writelane_b32 v8, s24, 4
	v_writelane_b32 v9, s25, 4
	v_writelane_b32 v8, s26, 5
	v_writelane_b32 v9, s27, 5
	v_writelane_b32 v8, s28, 6
	v_writelane_b32 v9, s29, 6
	v_writelane_b32 v8, s30, 7
	v_writelane_b32 v9, s31, 7

	# Turn back on 8 execution lanes
	s_mov_b64       exec, 255

	# Write out VM integer registers
	ds_write_b64    v6, v[8:9]
	s_waitcnt       lgkmcnt(0)

	# R[readReg2], R[readReg3]
	ds_read_b32     v11, v21
	ds_read_b32     v27, v20
	s_waitcnt       lgkmcnt(0)

	# mx ^= R[readReg2] ^ R[readReg3];
	v_xor_b32       v11, v11, v27
	v_xor_b32       v10, v10, v11

	# v[27:28] = R[sub]
	# v[29:30] = F[sub]
	ds_read2_b64    v[27:30], v6 offset1:8

	# mx &= CacheLineAlignMask;
	v_and_b32       v11, 0x7fffffc0, v10
	v_mov_b32       v10, v33
	s_waitcnt       lgkmcnt(0)

	# const ulong next_r = R[sub] ^ data;
	# "data" is the dataset load into v[18:19] issued before the JIT call.
	# Wait on vmcnt here as well, so that reading v[18:19] does not depend on
	# the JIT code (not part of this file) having already waited for it.
	s_waitcnt       vmcnt(0) & lgkmcnt(0)
	v_xor_b32       v8, v27, v18
	v_xor_b32       v9, v28, v19

	# *p1 = next_r;
	global_store_dwordx2 v[16:17], v[8:9], off

	# v[27:28] = E[sub]
	ds_read_b64     v[27:28], v6 offset:128

	# R[sub] = next_r;
	ds_write_b64    v6, v[8:9]

	# wait for the E[sub] read only; the R[sub] write may still be in flight
	s_waitcnt       lgkmcnt(1)

	# *p0 = as_ulong(F[sub]) ^ as_ulong(E[sub]);
	v_xor_b32       v29, v27, v29
	v_xor_b32       v30, v28, v30
	global_store_dwordx2 v[31:32], v[29:30], off

	# s_sub_u32 sets SCC on borrow, so the loop exits when the counter wraps below zero
	s_sub_u32       s2, s2, 1
	s_cbranch_scc0  main_loop
main_loop_end:

	# write back to the registers buffer: R[sub] at +0, F[sub] ^ E[sub] at +64, E[sub] at +128
	global_store_dwordx2 v[1:2], v[8:9], off
	global_store_dwordx2 v[1:2], v[29:30], off inst_offset:64
	global_store_dwordx2 v[1:2], v[27:28], off inst_offset:128

	# store rounding mode
	v_mov_b32       v0, 0
	v_mov_b32       v1, s66
	global_store_dword v0, v1, s[64:65]

program_end:
	s_endpgm

/*
FSQRT_R subroutines: fsqrt_r_subN replaces v[68+2N:69+2N] with its square root.

	In/out:   v[68:69] (sub0), v[70:71] (sub1), v[72:73] (sub2), v[74:75] (sub3)
	Uses:     s66 (current rounding mode), s67 (zero rounding mode),
	          s[68:69] (class mask), v82 (sign mask), v84 (exponent decrement for *0.5)
	Clobbers: v[28:29], v[42:43], v[46:49], MODE rounding bits (left equal to s66)
	Returns:  to s[60:61] with exec = 3

The v_rsq_f64 estimate is refined with the rounding mode taken from s67; only
the final v_fma_f64 runs with the rounding mode from s66. The result is written
only on lanes where the input passes the v_cmpx_class_f64 test against s[68:69]
("positive normal value").
NOTE(review): s[60:61] is not set anywhere in this file; presumably the JIT code
calls these subroutines with s_swappc_b64 s[60:61], ... - confirm.
*/
fsqrt_r_sub0:
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rsq_f64       v[28:29], v[68:69]

	# Improve initial approximation (can be skipped)
	#v_mul_f64       v[42:43], v[28:29], v[68:69]
	#v_mul_f64       v[48:49], v[28:29], -0.5
	#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

	v_mul_f64       v[42:43], v[28:29], v[68:69]
	v_mov_b32       v48, v28
	v_sub_nc_u32    v49, v29, v84
	v_mov_b32       v46, v28
	v_xor_b32       v47, v49, v82
	v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
	v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
	v_fma_f64       v[46:47], -v[42:43], v[42:43], v[68:69]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]

	v_cmpx_class_f64 v[68:69], s[68:69]
	v_mov_b32       v68, v42
	v_mov_b32       v69, v43
	s_mov_b64       exec, 3
	s_setpc_b64     s[60:61]

fsqrt_r_sub1:
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rsq_f64       v[28:29], v[70:71]

	# Improve initial approximation (can be skipped)
	#v_mul_f64       v[42:43], v[28:29], v[70:71]
	#v_mul_f64       v[48:49], v[28:29], -0.5
	#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

	v_mul_f64       v[42:43], v[28:29], v[70:71]
	v_mov_b32       v48, v28
	v_sub_nc_u32    v49, v29, v84
	v_mov_b32       v46, v28
	v_xor_b32       v47, v49, v82
	v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
	v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
	v_fma_f64       v[46:47], -v[42:43], v[42:43], v[70:71]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]

	v_cmpx_class_f64 v[70:71], s[68:69]
	v_mov_b32       v70, v42
	v_mov_b32       v71, v43
	s_mov_b64       exec, 3
	s_setpc_b64     s[60:61]

fsqrt_r_sub2:
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rsq_f64       v[28:29], v[72:73]

	# Improve initial approximation (can be skipped)
	#v_mul_f64       v[42:43], v[28:29], v[72:73]
	#v_mul_f64       v[48:49], v[28:29], -0.5
	#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

	v_mul_f64       v[42:43], v[28:29], v[72:73]
	v_mov_b32       v48, v28
	v_sub_nc_u32    v49, v29, v84
	v_mov_b32       v46, v28
	v_xor_b32       v47, v49, v82
	v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
	v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
	v_fma_f64       v[46:47], -v[42:43], v[42:43], v[72:73]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]

	v_cmpx_class_f64 v[72:73], s[68:69]
	v_mov_b32       v72, v42
	v_mov_b32       v73, v43
	s_mov_b64       exec, 3
	s_setpc_b64     s[60:61]

fsqrt_r_sub3:
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rsq_f64       v[28:29], v[74:75]

	# Improve initial approximation (can be skipped)
	#v_mul_f64       v[42:43], v[28:29], v[74:75]
	#v_mul_f64       v[48:49], v[28:29], -0.5
	#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
	#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

	v_mul_f64       v[42:43], v[28:29], v[74:75]
	v_mov_b32       v48, v28
	v_sub_nc_u32    v49, v29, v84
	v_mov_b32       v46, v28
	v_xor_b32       v47, v49, v82
	v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
	v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
	v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
	v_fma_f64       v[46:47], -v[42:43], v[42:43], v[74:75]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]

	v_cmpx_class_f64 v[74:75], s[68:69]
	v_mov_b32       v74, v42
	v_mov_b32       v75, v43
	s_mov_b64       exec, 3
	s_setpc_b64     s[60:61]

/*
FDIV_M subroutines: fdiv_m_subN divides v[68+2N:69+2N] by the value in v[28:29].

	In:       v[28:29] - raw divisor; its high dword is ANDed with v77 (xmantissaMask)
	          and the whole value is ORed with v[78:79] (xexponentMask) first
	In/out:   v[68:69] (sub0), v[70:71] (sub1), v[72:73] (sub2), v[74:75] (sub3)
	Uses:     s66 (current rounding mode), s67 (zero rounding mode),
	          v83 (high 32 bits of 1.0)
	Clobbers: v[28:29], v[42:43], v[48:49], v[80:81], MODE rounding bits (left equal to s66)
	Returns:  to s[60:61] with exec = 3

The v_rcp_f64 estimate is refined with the rounding mode taken from s67; only
the final v_fma_f64 runs with the rounding mode from s66. On lanes where the
dividend equals the divisor the result is forced to exactly 1.0.
NOTE(review): s[60:61] is not set anywhere in this file; presumably the JIT code
calls these subroutines with s_swappc_b64 s[60:61], ... - confirm.
*/
fdiv_m_sub0:
	v_or_b32        v28, v28, v78
	v_and_or_b32    v29, v29, v77, v79
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rcp_f64       v[48:49], v[28:29]
	v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
	v_mul_f64       v[80:81], v[68:69], v[48:49]
	v_fma_f64       v[42:43], -v[28:29], v[80:81], v[68:69]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]
	v_cmpx_eq_f64   v[68:69], v[28:29]
	v_mov_b32       v80, 0
	v_mov_b32       v81, v83
	s_mov_b64       exec, 3
	v_mov_b32       v68, v80
	v_mov_b32       v69, v81
	s_setpc_b64     s[60:61]

fdiv_m_sub1:
	v_or_b32        v28, v28, v78
	v_and_or_b32    v29, v29, v77, v79
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rcp_f64       v[48:49], v[28:29]
	v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
	v_mul_f64       v[80:81], v[70:71], v[48:49]
	v_fma_f64       v[42:43], -v[28:29], v[80:81], v[70:71]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]
	v_cmpx_eq_f64   v[70:71], v[28:29]
	v_mov_b32       v80, 0
	v_mov_b32       v81, v83
	s_mov_b64       exec, 3
	v_mov_b32       v70, v80
	v_mov_b32       v71, v81
	s_setpc_b64     s[60:61]

fdiv_m_sub2:
	v_or_b32        v28, v28, v78
	v_and_or_b32    v29, v29, v77, v79
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rcp_f64       v[48:49], v[28:29]
	v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
	v_mul_f64       v[80:81], v[72:73], v[48:49]
	v_fma_f64       v[42:43], -v[28:29], v[80:81], v[72:73]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]
	v_cmpx_eq_f64   v[72:73], v[28:29]
	v_mov_b32       v80, 0
	v_mov_b32       v81, v83
	s_mov_b64       exec, 3
	v_mov_b32       v72, v80
	v_mov_b32       v73, v81
	s_setpc_b64     s[60:61]

fdiv_m_sub3:
	v_or_b32        v28, v28, v78
	v_and_or_b32    v29, v29, v77, v79
	s_setreg_b32    hwreg(mode, 2, 2), s67
	v_rcp_f64       v[48:49], v[28:29]
	v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
	v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
	v_mul_f64       v[80:81], v[74:75], v[48:49]
	v_fma_f64       v[42:43], -v[28:29], v[80:81], v[74:75]
	s_setreg_b32    hwreg(mode, 2, 2), s66
	v_fma_f64       v[42:43], v[42:43], v[48:49], v[80:81]
	v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]
	v_cmpx_eq_f64   v[74:75], v[28:29]
	v_mov_b32       v80, 0
	v_mov_b32       v81, v83
	s_mov_b64       exec, 3
	v_mov_b32       v74, v80
	v_mov_b32       v75, v81
	s_setpc_b64     s[60:61]

/*
ISMULH_R subroutine: s[14:15] = high 64 bits of the signed 128-bit product
s[14:15] * s[38:39].

The unsigned high half is built from 32x32 partial products on lane 0
(exec = 1; v41 and v44 are zero on lane 0 and serve as zero high dwords).
It is then corrected for signed operands: s[38:39] is subtracted when
s[14:15] is negative and s[14:15] is subtracted when s[38:39] is negative.

	Clobbers: s[32:35], v40, v[42:43], v[45:47], vcc_lo, SCC
	Returns:  to s[60:61] with exec = 3
NOTE(review): s[38:39] and s[60:61] are not set in this file; presumably the
JIT code provides them - confirm.
*/
ismulh_r_sub:
	s_mov_b64       exec, 1
	v_mov_b32       v45, s14
	v_mul_hi_u32    v40, s38, v45
	v_mov_b32       v47, s15
	v_mad_u64_u32   v[42:43], s32, s38, v47, v[40:41]
	v_mov_b32       v40, v42
	v_mad_u64_u32   v[45:46], s32, s39, v45, v[40:41]
	v_mad_u64_u32   v[42:43], s32, s39, v47, v[43:44]
	v_add_co_u32    v42, vcc_lo, v42, v46
	v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo
	v_readlane_b32  s32, v42, 0
	v_readlane_b32  s33, v43, 0

	# signed correction: subtract s[38:39] if s[14:15] < 0
	s_cmp_lt_i32    s15, 0
	s_cselect_b64   s[34:35], s[38:39], 0
	s_sub_u32       s32, s32, s34
	s_subb_u32      s33, s33, s35

	# signed correction: subtract s[14:15] if s[38:39] < 0
	s_cmp_lt_i32    s39, 0
	s_cselect_b64   s[34:35], s[14:15], 0
	s_sub_u32       s14, s32, s34
	s_subb_u32      s15, s33, s35
	s_mov_b64       exec, 3
	s_setpc_b64     s[60:61]

/*
IMULH_R subroutine: s[14:15] = high 64 bits of the unsigned 128-bit product
s[14:15] * s[38:39].

Built from 32x32 partial products on lane 0 (exec = 1; v41 and v44 are zero
on lane 0 and serve as zero high dwords).

	Clobbers: s32, v40, v[42:43], v[45:47], vcc_lo
	Returns:  to s[60:61] with exec = 3
NOTE(review): s[38:39] and s[60:61] are not set in this file; presumably the
JIT code provides them - confirm.
*/
imulh_r_sub:
	s_mov_b64       exec, 1
	v_mov_b32       v45, s38
	v_mul_hi_u32    v40, s14, v45
	v_mov_b32       v47, s39
	v_mad_u64_u32   v[42:43], s32, s14, v47, v[40:41]
	v_mov_b32       v40, v42
	v_mad_u64_u32   v[45:46], s32, s15, v45, v[40:41]
	v_mad_u64_u32   v[42:43], s32, s15, v47, v[43:44]
	v_add_co_u32    v42, vcc_lo, v42, v46
	v_add_co_ci_u32 v43, vcc_lo, 0, v43, vcc_lo
	v_readlane_b32  s14, v42, 0
	v_readlane_b32  s15, v43, 0
	s_mov_b64       exec, 3
	s_setpc_b64     s[60:61]