#include "defs.h"

// One DAG entry: PROGPOW_DAG_LOADS 32-bit words, aligned to 16 bytes.
typedef struct __attribute__ ((aligned(16))) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;

// Inner ProgPoW loop, invoked once per DAG access by progpow_search below.
// NOTE(review): the text of this function appears to be truncated in this
// copy of the file: it breaks off inside the first for-loop header
// ("for(int i=0; i") and resumes in the middle of what looks like the body of
// the KISS99 generator ("z = 36969 * ..."). The remainder of progPowLoop and
// the header of the generator (presumably "uint32_t kiss99(kiss99_t *st)",
// judging by the call in fill_mix) are not visible here -- restore them from
// the upstream source before building.
inline void progPowLoop(const uint32_t loop,
        volatile uint32_t mix_arg[PROGPOW_REGS],
        __global const dag_t *g_dag,
        __local const uint32_t c_dag[PROGPOW_CACHE_WORDS],
        __local uint64_t share[GROUP_SHARE],
        const bool hack_false)
{
    dag_t data_dag;
    uint32_t offset, data;
    uint32_t mix[PROGPOW_REGS];
    for(int i=0; iz = 36969 * (st->z & 65535) + (st->z >> 16);
    // Second 16-bit multiply-with-carry stream.
    st->w = 18000 * (st->w & 65535) + (st->w >> 16);
    // Combine both multiply-with-carry streams into one 32-bit value.
    uint32_t MWC = ((st->z << 16) + st->w);
    // Three-step xorshift on jsr (shifts: left 17, right 13, left 5).
    st->jsr ^= (st->jsr << 17);
    st->jsr ^= (st->jsr >> 13);
    st->jsr ^= (st->jsr << 5);
    // Linear congruential step on jcong.
    st->jcong = 69069 * st->jcong + 1234567;
    // Output mixes all three sub-generators.
    return ((MWC ^ st->jcong) + st->jsr);
}

// Fill mix[0 .. PROGPOW_REGS-1] with consecutive kiss99() outputs.
//
// seed    - two 32-bit seed words in local memory; only seed[0] and seed[1]
//           are read
// lane_id - index of the calling lane; it is hashed into the generator state
//           so that different lane_id values start from different states
// mix     - output array of PROGPOW_REGS words, fully overwritten
//
// NOTE(review): fnv1a and FNV_OFFSET_BASIS are defined outside this view.
// jsr and jcong are both seeded from (fnv_hash, lane_id); they only differ if
// fnv1a updates fnv_hash in place (i.e. it is a macro) -- confirm in defs.h.
void fill_mix(local uint32_t* seed, uint32_t lane_id, uint32_t* mix)
{
    // Use FNV to expand the per-warp seed to per-lane
    // Use KISS to expand the per-lane seed to fill mix
    uint32_t fnv_hash = FNV_OFFSET_BASIS;
    kiss99_t st;
    st.z = fnv1a(fnv_hash, seed[0]);
    st.w = fnv1a(fnv_hash, seed[1]);
    st.jsr = fnv1a(fnv_hash, lane_id);
    st.jcong = fnv1a(fnv_hash, lane_id);
#pragma unroll
    for (int i = 0; i < PROGPOW_REGS; i++)
        mix[i] = kiss99(&st);
}

// Per-hash scratch area used to exchange values between lanes: one 32-bit
// slot per lane plus PROGPOW_LANES / 2 64-bit slots.
typedef struct
{
    uint32_t uint32s[PROGPOW_LANES];
    uint64_t uint64s[PROGPOW_LANES / 2];
} shuffle_t;

// 32-byte (256-bit) digest stored as eight 32-bit words.
typedef struct
{
    uint32_t uint32s[32 / sizeof(uint32_t)];
} hash32_t;

// Search kernel: each work-item hashes one nonce (its global id) and records
// the nonce in `results` when the final hash value is <= target.
//
// g_dag      - DAG in global memory, addressed in dag_t units
// job_blob   - job data; words 0..9 are read (word 8 is then replaced by the
//              nonce)
// target     - a work-item reports its nonce when result <= target
// hack_false - forwarded unchanged to progPowLoop
// results    - results[0] is an atomically incremented hit counter;
//              results[1..15] receive the global ids of the first 15 hits
//
// NOTE(review): keccak_f800 and ravencoin_rndc are not defined in the visible
// part of this file; they presumably come from defs.h or from the truncated
// region above.
#if PLATFORM != OPENCL_PLATFORM_NVIDIA  // use maxrregs on nv
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
#endif
__kernel void progpow_search(__global dag_t const* g_dag, __global uint* job_blob, ulong target,
    uint hack_false, volatile __global uint* results)
{
    __local shuffle_t share[HASHES_PER_GROUP];
    __local uint32_t c_dag[PROGPOW_CACHE_WORDS];

    uint32_t const lid = get_local_id(0);
    uint32_t const gid = get_global_id(0);

    // The mask only equals lid % PROGPOW_LANES when PROGPOW_LANES is a power
    // of two.
    const uint32_t lane_id = lid & (PROGPOW_LANES - 1);
    const uint32_t group_id = lid / PROGPOW_LANES;

    // Load the first portion of the DAG into the cache.
    // Each work-item copies one dag_t (PROGPOW_DAG_LOADS words) per pass and
    // then strides by the whole work-group.
    for (uint32_t word = lid * PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS;
         word += GROUP_SIZE * PROGPOW_DAG_LOADS)
    {
        dag_t load = g_dag[word / PROGPOW_DAG_LOADS];
        for (int i = 0; i < PROGPOW_DAG_LOADS; i++)
            c_dag[word + i] = load.s[i];
    }

    // Sync threads so shared mem is in sync
    barrier(CLK_LOCAL_MEM_FENCE);

    // NOTE(review): hash_seed is not referenced anywhere in the visible
    // kernel body.
    uint32_t hash_seed[2];  // KISS99 initiator
    hash32_t digest;        // Carry-over from mix output

    // First 8 words of the initial keccak output; reused as the per-hash seed
    // (words 0 and 1) and as the first 8 state words of the final keccak.
    uint32_t state2[8];

    {
        // Absorb phase for initial round of keccak

        uint32_t state[25];  // Keccak's state

        // 1st fill with job data
        for (int i = 0; i < 10; i++)
            state[i] = job_blob[i];

        // Apply nonce (overwrites word 8 loaded from job_blob just above)
        state[8] = gid;

        // 3rd apply ravencoin input constraints
        for (int i = 10; i < 25; i++)
            state[i] = ravencoin_rndc[i-10];

        // Run initial keccak round
        keccak_f800(state);

        for (int i = 0; i < 8; i++)
            state2[i] = state[i];
    }

    // One iteration per lane: in iteration h all lanes of a hash group work
    // on the hash whose seed belongs to lane h, and lane h keeps the digest.
#pragma unroll 1
    for (uint32_t h = 0; h < PROGPOW_LANES; h++)
    {
        uint32_t mix[PROGPOW_REGS];

        // share the hash's seed across all lanes
        if (lane_id == h)
        {
            share[group_id].uint32s[0] = state2[0];
            share[group_id].uint32s[1] = state2[1];
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        // initialize mix for all lanes
        fill_mix(share[group_id].uint32s, lane_id, mix);

        // NOTE(review): the shared buffer is passed as share[0].uint64s
        // rather than share[group_id].uint64s; the indexing is presumably
        // done inside progPowLoop, whose body is not visible -- confirm.
#pragma unroll 1
        for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
            progPowLoop(l, mix, g_dag, c_dag, share[0].uint64s, hack_false);

        // Reduce mix data to a per-lane 32-bit digest
        // NOTE(review): the value of fnv1a(...) is discarded here, so this
        // only has an effect if fnv1a updates its first argument in place
        // (e.g. it is a macro) -- confirm in defs.h.
        uint32_t mix_hash = FNV_OFFSET_BASIS;
#pragma unroll
        for (int i = 0; i < PROGPOW_REGS; i++)
            fnv1a(mix_hash, mix[i]);

        // Reduce all lanes to a single 256-bit digest
        hash32_t digest_temp;
        for (int i = 0; i < 8; i++)
            digest_temp.uint32s[i] = FNV_OFFSET_BASIS;
        share[group_id].uint32s[lane_id] = mix_hash;
        barrier(CLK_LOCAL_MEM_FENCE);
        // Lane i's hash is folded into digest word i % 8.
#pragma unroll
        for (int i = 0; i < PROGPOW_LANES; i++)
            fnv1a(digest_temp.uint32s[i % 8], share[group_id].uint32s[i]);

        // Only the lane that owns this iteration's hash keeps the digest.
        if (h == lane_id)
            digest = digest_temp;
    }

    // Absorb phase for last round of keccak (256 bits)
    uint64_t result;

    {
        uint32_t state[25] = {0x0};  // Keccak's state

        // 1st initial 8 words of state are kept as carry-over from initial keccak
        for (int i = 0; i < 8; i++)
            state[i] = state2[i];

        // 2nd subsequent 8 words are carried from digest/mix
        for (int i = 8; i < 16; i++)
            state[i] = digest.uint32s[i - 8];

        // 3rd apply ravencoin input constraints
        for (int i = 16; i < 25; i++)
            state[i] = ravencoin_rndc[i - 16];

        // Run keccak loop
        keccak_f800(state);

        // Build a 64-bit value with state[1] in the high half and state[0] in
        // the low half, then reverse its byte order for the target comparison.
        uint64_t res = (uint64_t)state[1] << 32 | state[0];
        result = as_ulong(as_uchar8(res).s76543210);
    }

    if (result <= target)
    {
        // results[0] counts hits; k is the 1-based slot for this hit. Hits
        // beyond the 15th are counted but their nonce is not stored.
        const uint k = atomic_inc(results) + 1;
        if (k <= 15)
            results[k] = gid;
    }
}