diff --git a/src/forward.jl b/src/forward.jl index 20fb70b..d5f9498 100644 --- a/src/forward.jl +++ b/src/forward.jl @@ -59,7 +59,9 @@ function (kfn::kfn_1)(input::AbstractArray) kfn.lif_refractoryDuration, kfn.lif_gammaPd, kfn.lif_firingCounter, - kfn.lif_recSignal,) + kfn.lif_recSignal, + kfn.lif_subscription, + ) end @async begin # project 3D kfn zit into 4D alif zit @@ -80,6 +82,7 @@ function (kfn::kfn_1)(input::AbstractArray) kfn.alif_gammaPd, kfn.alif_firingCounter, kfn.alif_recSignal, + kfn.alif_subscription, kfn.alif_epsilonRecA, kfn.alif_a, kfn.alif_avth, @@ -117,7 +120,9 @@ function (kfn::kfn_1)(input::AbstractArray) kfn.on_refractoryDuration, kfn.on_gammaPd, kfn.on_firingCounter, - kfn.on_recSignal,) + kfn.on_recSignal, + kfn.on_subscription, + ) logit = reshape(kfn.on_zt, (size(input, 1), :)) @@ -126,6 +131,434 @@ function (kfn::kfn_1)(input::AbstractArray) kfn.zit end +# gpu launcher +function lifForward( zit::CuArray, + wRec::CuArray, + vt::CuArray, + vth::CuArray, + vRest::CuArray, + zt::CuArray, + alpha::CuArray, + phi::CuArray, + epsilonRec::CuArray, + refractoryCounter::CuArray, + refractoryDuration::CuArray, + gammaPd::CuArray, + firingCounter::CuArray, + recSignal::CuArray, + subscription::CuArray, + ) + + kernel = @cuda launch=false lifForward( zit, + wRec, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + GeneralUtils.linear_to_cartesian, + ) + config = launch_configuration(kernel.fun) + + + # threads to be launched. Since one can't launch exact thread number the kernel needs, + # one just launch threads more than this kernel needs then use a guard inside the kernel + # to prevent unused threads to access memory. + threads = min(1024, config.threads) # depend on gpu. Most NVIDIA gpu has 1024 threads per block + + # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element + totalThreads = length(wRec) + + blocks = cld(totalThreads, threads) + # println("launching gpu kernel") + CUDA.@sync begin + kernel( zit, + wRec, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + GeneralUtils.linear_to_cartesian; threads, blocks) + end +end + +# gpu kernel +function lifForward( zit, + wRec, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + linear_to_cartesian, + ) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # gpu threads index + + if i <= length(wRec) + # cartesian index + i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec)) + # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4") + + if refractoryCounter[i1,i2,i3,i4] > 0 # refractory period is active + refractoryCounter[i1,i2,i3,i4] -= 1 + recSignal[i1,i2,i3,i4] = 0 + zt[i1,i2,i3,i4] = 0 + vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4] + phi[i1,i2,i3,i4] = 0 + + # compute epsilonRec + epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + + (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4]) + + else # refractory period is inactive + recSignal[i1,i2,i3,i4] = wRec[i1,i2,i3,i4] * zit[i1,i2,i3,i4] + vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + + sum(@view(recSignal[:,:,i3,i4])) + + # fires if membrane potential exceed threshold + if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4] + zt[i1,i2,i3,i4] = 1 + refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4] + firingCounter[i1,i2,i3,i4] += 1 + vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4] + else + zt[i1,i2,i3,i4] = 0 + end + + # compute phi, there is a difference from lif formula + phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * + max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4])) + + # compute epsilonRec + epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + + (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4]) + end + end + return nothing +end + +# gpu launcher +function alifForward( zit::CuArray, + wRec::CuArray, + vt::CuArray, + vth::CuArray, + vRest::CuArray, + zt::CuArray, + alpha::CuArray, + phi::CuArray, + epsilonRec::CuArray, + refractoryCounter::CuArray, + refractoryDuration::CuArray, + gammaPd::CuArray, + firingCounter::CuArray, + recSignal::CuArray, + subscription::CuArray, + epsilonRecA::CuArray, + a::CuArray, + avth::CuArray, + beta::CuArray, + rho::CuArray, + ) + + kernel = @cuda launch=false alifForward( zit, + wRec, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + epsilonRecA, + a, + avth, + beta, + rho, + GeneralUtils.linear_to_cartesian, + ) + config = launch_configuration(kernel.fun) + + # threads to be launched. Since one can't launch exact thread number the kernel needs, + # one just launch threads more than this kernel needs then use a guard inside the kernel + # to prevent unused threads to access memory. + threads = min(1024, config.threads) # depend on gpu. Most NVIDIA gpu has 1024 threads per block + + # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element + totalThreads = length(wRec) + + blocks = cld(totalThreads, threads) + # println("launching gpu kernel") + CUDA.@sync begin + kernel( zit, + wRec, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + epsilonRecA, + a, + avth, + beta, + rho, + GeneralUtils.linear_to_cartesian; threads, blocks) + end +end + +# gpu kernel +function alifForward( zit, + wRec, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + epsilonRecA, + a, + avth, + beta, + rho, + linear_to_cartesian, + ) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # gpu threads index + + if i <= length(wRec) + # cartesian index + i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec)) + # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4") + + if refractoryCounter[i1,i2,i3,i4] > 0 # refractory period is active + refractoryCounter[i1,i2,i3,i4] -= 1 + recSignal[i1,i2,i3,i4] = 0 + zt[i1,i2,i3,i4] = 0 + vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4] + phi[i1,i2,i3,i4] = 0 + a[i1,i2,i3,i4] = rho[i1,i2,i3,i4] * a[i1,i2,i3,i4] + + # compute epsilonRec + epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + + (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4]) + + # compute epsilonRecA + epsilonRecA[i1,i2,i3,i4] = (phi[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + + ((rho[i1,i2,i3,i4] - (phi[i1,i2,i3,i4] * beta[i1,i2,i3,i4])) * + epsilonRecA[i1,i2,i3,i4]) + + # compute avth + avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4]) + + else # refractory period is inactive + recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wRec[i1,i2,i3,i4] + vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + + sum(@view(recSignal[:,:,i3,i4])) + + # compute avth + avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4]) + + # fires if membrane potential exceed threshold + if vt[i1,i2,i3,i4] > avth[i1,i2,i3,i4] + zt[i1,i2,i3,i4] = 1 + refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4] + firingCounter[i1,i2,i3,i4] += 1 + vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4] + a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]) + 1 + else + zt[i1,i2,i3,i4] = 0 + a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]) + end + + # compute phi, there is a difference from alif formula + phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * + max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4])) + + # compute epsilonRec + epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + + (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4]) + + # compute epsilonRecA use eq.26 + epsilonRecA[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * + (phi[i1,i2,i3,i4] * epsilonRecA[i1,i2,i3,i4])) + + (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4]) + end + end + return nothing +end + +# gpu launcher +function onForward( zit::CuArray, + wOut::CuArray, + vt::CuArray, + vth::CuArray, + vRest::CuArray, + zt::CuArray, + alpha::CuArray, + phi::CuArray, + epsilonRec::CuArray, + refractoryCounter::CuArray, + refractoryDuration::CuArray, + gammaPd::CuArray, + firingCounter::CuArray, + recSignal::CuArray, + subscription::CuArray, + ) + + kernel = @cuda launch=false onForward( zit, + wOut, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + GeneralUtils.linear_to_cartesian, + ) + config = launch_configuration(kernel.fun) + + # threads to be launched. Since one can't launch exact thread number the kernel needs, + # one just launch threads more than this kernel needs then use a guard inside the kernel + # to prevent unused threads to access memory. + threads = min(1024, config.threads) # depend on gpu. Most NVIDIA gpu has 1024 threads per block + + # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element + totalThreads = length(wOut) + + blocks = cld(totalThreads, threads) + # println("launching gpu kernel") + CUDA.@sync begin + kernel( zit, + wOut, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + GeneralUtils.linear_to_cartesian; threads, blocks) + end +end + +# gpu kernel +function onForward( zit, + wOut, + vt, + vth, + vRest, + zt, + alpha, + phi, + epsilonRec, + refractoryCounter, + refractoryDuration, + gammaPd, + firingCounter, + recSignal, + subscription, + linear_to_cartesian, + ) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # gpu threads index + + if i <= length(wOut) + # cartesian index + i1, i2, i3, i4 = linear_to_cartesian(i, size(wOut)) + # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4") + + if refractoryCounter[i1,i2,i3,i4] > 0 # refractory period is active + refractoryCounter[i1,i2,i3,i4] -= 1 + recSignal[i1,i2,i3,i4] = 0 + zt[i1,i2,i3,i4] = 0 + vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4] + phi[i1,i2,i3,i4] = 0 + + # compute epsilonRec + epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + + (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4]) + + else # refractory period is inactive + recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wOut[i1,i2,i3,i4] + vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4])) + + # fires if membrane potential exceed threshold + if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4] + zt[i1,i2,i3,i4] = 1 + refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4] + firingCounter[i1,i2,i3,i4] += 1 + vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4] + else + zt[i1,i2,i3,i4] = 0 + end + + # compute phi, there is a difference from on formula + phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4])) + + # compute epsilonRec + epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + + (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4]) + end + end + return nothing +end + function lifForward(kfn_zit::Array{T}, zit::Array{T}, wRec::Array{T}, @@ -193,127 +626,6 @@ function lifForward(kfn_zit::Array{T}, end end -# gpu launcher -function lifForward( lif_zit::CuArray, - lif_wRec::CuArray, - lif_vt::CuArray, - lif_vth::CuArray, - lif_vRest::CuArray, - lif_zt::CuArray, - lif_alpha::CuArray, - lif_phi::CuArray, - lif_epsilonRec::CuArray, - lif_refractoryCounter::CuArray, - lif_refractoryDuration::CuArray, - lif_gammaPd::CuArray, - lif_firingCounter::CuArray, - lif_recSignal::CuArray,) - - kernel = @cuda launch=false lifForward( lif_zit, - lif_wRec, - lif_vt, - lif_vth, - lif_vRest, - lif_zt, - lif_alpha, - lif_phi, - lif_epsilonRec, - lif_refractoryCounter, - lif_refractoryDuration, - lif_gammaPd, - lif_firingCounter, - lif_recSignal, - GeneralUtils.linear_to_cartesian) - config = launch_configuration(kernel.fun) - - - # threads to be launched. Since one can't launch exact thread number the kernel needs, - # one just launch threads more than this kernel needs then use a guard inside the kernel - # to prevent unused threads to access memory. - threads = min(1024, config.threads) # depend on gpu. Most NVIDIA gpu has 1024 threads per block - - # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element - totalThreads = length(lif_wRec) - - blocks = cld(totalThreads, threads) - # println("launching gpu kernel") - CUDA.@sync begin - kernel( lif_zit, - lif_wRec, - lif_vt, - lif_vth, - lif_vRest, - lif_zt, - lif_alpha, - lif_phi, - lif_epsilonRec, - lif_refractoryCounter, - lif_refractoryDuration, - lif_gammaPd, - lif_firingCounter, - lif_recSignal, - GeneralUtils.linear_to_cartesian; threads, blocks) - end -end - -# gpu kernel -function lifForward( zit, - wRec, - vt, - vth, - vRest, - zt, - alpha, - phi, - epsilonRec, - refractoryCounter, - refractoryDuration, - gammaPd, - firingCounter, - recSignal, - linear_to_cartesian) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # gpu threads index - - if i <= length(wRec) - # cartesian index - i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec)) - # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4") - - refractoryCounter[i1,i2,i3,i4] -= 1 - - if refractoryCounter[i1,i2,i3,i4] > 0 # refractory period is active - refractoryCounter[i1,i2,i3,i4] -= 1 - zt[i1,i2,i3,i4] = 0 - vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4] - phi[i1,i2,i3,i4] = 0 - - # compute epsilonRec - epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4] - - else # refractory period is inactive - recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wRec[i1,i2,i3,i4] - vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4])) - - # fires if membrane potential exceed threshold - if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4] - zt[i1,i2,i3,i4] = 1 - refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4] - firingCounter[i1,i2,i3,i4] += 1 - vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4] - else - zt[i1,i2,i3,i4] = 0 - end - - # compute phi, there is a difference from lif formula - phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4])) - - # compute epsilonRec - epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4] - end - end - return nothing -end - function alifForward(zit::Array{T}, wRec::Array{T}, vt0::Array{T}, @@ -413,164 +725,6 @@ function alifForward(zit::Array{T}, end end -# gpu launcher -function alifForward( alif_zit::CuArray, - alif_wRec::CuArray, - alif_vt::CuArray, - alif_vth::CuArray, - alif_vRest::CuArray, - alif_zt::CuArray, - alif_alpha::CuArray, - alif_phi::CuArray, - alif_epsilonRec::CuArray, - alif_refractoryCounter::CuArray, - alif_refractoryDuration::CuArray, - alif_gammaPd::CuArray, - alif_firingCounter::CuArray, - alif_recSignal::CuArray, - alif_epsilonRecA::CuArray, - alif_a::CuArray, - alif_avth::CuArray, - alif_beta::CuArray, - alif_rho::CuArray, - ) - - kernel = @cuda launch=false alifForward( alif_zit, - alif_wRec, - alif_vt, - alif_vth, - alif_vRest, - alif_zt, - alif_alpha, - alif_phi, - alif_epsilonRec, - alif_refractoryCounter, - alif_refractoryDuration, - alif_gammaPd, - alif_firingCounter, - alif_recSignal, - alif_epsilonRecA, - alif_a, - alif_avth, - alif_beta, - alif_rho, - GeneralUtils.linear_to_cartesian) - config = launch_configuration(kernel.fun) - - # threads to be launched. Since one can't launch exact thread number the kernel needs, - # one just launch threads more than this kernel needs then use a guard inside the kernel - # to prevent unused threads to access memory. - threads = min(1024, config.threads) # depend on gpu. Most NVIDIA gpu has 1024 threads per block - - # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element - totalThreads = length(alif_wRec) - - blocks = cld(totalThreads, threads) - # println("launching gpu kernel") - CUDA.@sync begin - kernel( alif_zit, - alif_wRec, - alif_vt, - alif_vth, - alif_vRest, - alif_zt, - alif_alpha, - alif_phi, - alif_epsilonRec, - alif_refractoryCounter, - alif_refractoryDuration, - alif_gammaPd, - alif_firingCounter, - alif_recSignal, - alif_epsilonRecA, - alif_a, - alif_avth, - alif_beta, - alif_rho, - GeneralUtils.linear_to_cartesian; threads, blocks) - end -end - -# gpu kernel -function alifForward( zit, - wRec, - vt, - vth, - vRest, - zt, - alpha, - phi, - epsilonRec, - refractoryCounter, - refractoryDuration, - gammaPd, - firingCounter, - recSignal, - epsilonRecA, - a, - avth, - beta, - rho, - linear_to_cartesian) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # gpu threads index - - if i <= length(wRec) - # cartesian index - i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec)) - # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4") - - refractoryCounter[i1,i2,i3,i4] -= 1 - - if refractoryCounter[i1,i2,i3,i4] > 0 # refractory period is active - refractoryCounter[i1,i2,i3,i4] -= 1 - zt[i1,i2,i3,i4] = 0 - vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4] - phi[i1,i2,i3,i4] = 0 - a[i1,i2,i3,i4] = rho[i1,i2,i3,i4] * a[i1,i2,i3,i4] - - # compute epsilonRec - epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4] - - # compute epsilonRecA - epsilonRecA[i1,i2,i3,i4] = (phi[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + - ((rho[i1,i2,i3,i4] - (phi[i1,i2,i3,i4] * beta[i1,i2,i3,i4])) * epsilonRecA[i1,i2,i3,i4]) - - # compute avth - avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4]) - - else # refractory period is inactive - recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wRec[i1,i2,i3,i4] - vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4])) - - # compute avth - avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4]) - - # fires if membrane potential exceed threshold - if vt[i1,i2,i3,i4] > avth[i1,i2,i3,i4] - zt[i1,i2,i3,i4] = 1 - refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4] - firingCounter[i1,i2,i3,i4] += 1 - vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4] - a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]) + 1 - else - zt[i1,i2,i3,i4] = 0 - a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]) - end - - # compute phi, there is a difference from alif formula - phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4])) - - # compute epsilonRec - epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4] - - # compute epsilonRecA - epsilonRecA[i1,i2,i3,i4] = (phi[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + - ((rho[i1,i2,i3,i4] - (phi[i1,i2,i3,i4] * beta[i1,i2,i3,i4])) * epsilonRecA[i1,i2,i3,i4]) - end - end - return nothing -end - function onForward(kfn_zit::Array{T}, zit::Array{T}, wOut::Array{T}, @@ -638,133 +792,6 @@ function onForward(kfn_zit::Array{T}, end end -# gpu launcher -function onForward( on_zit::CuArray, - on_wOut::CuArray, - on_vt::CuArray, - on_vth::CuArray, - on_vRest::CuArray, - on_zt::CuArray, - on_alpha::CuArray, - on_phi::CuArray, - on_epsilonRec::CuArray, - on_refractoryCounter::CuArray, - on_refractoryDuration::CuArray, - on_gammaPd::CuArray, - on_firingCounter::CuArray, - on_recSignal::CuArray) - - kernel = @cuda launch=false onForward( on_zit, - on_wOut, - on_vt, - on_vth, - on_vRest, - on_zt, - on_alpha, - on_phi, - on_epsilonRec, - on_refractoryCounter, - on_refractoryDuration, - on_gammaPd, - on_firingCounter, - on_recSignal, - GeneralUtils.linear_to_cartesian) - config = launch_configuration(kernel.fun) - - # threads to be launched. Since one can't launch exact thread number the kernel needs, - # one just launch threads more than this kernel needs then use a guard inside the kernel - # to prevent unused threads to access memory. - threads = min(1024, config.threads) # depend on gpu. Most NVIDIA gpu has 1024 threads per block - - # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element - totalThreads = length(on_wOut) - - blocks = cld(totalThreads, threads) - # println("launching gpu kernel") - CUDA.@sync begin - kernel( on_zit, - on_wOut, - on_vt, - on_vth, - on_vRest, - on_zt, - on_alpha, - on_phi, - on_epsilonRec, - on_refractoryCounter, - on_refractoryDuration, - on_gammaPd, - on_firingCounter, - on_recSignal, - GeneralUtils.linear_to_cartesian; threads, blocks) - end -end - -# gpu kernel -function onForward( zit, - wOut, - vt, - vth, - vRest, - zt, - alpha, - phi, - epsilonRec, - refractoryCounter, - refractoryDuration, - gammaPd, - firingCounter, - recSignal, - linear_to_cartesian) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # gpu threads index - - if i <= length(wOut) - # cartesian index - i1, i2, i3, i4 = linear_to_cartesian(i, size(wOut)) - # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4") - - refractoryCounter[i1,i2,i3,i4] -= 1 - - if refractoryCounter[i1,i2,i3,i4] > 0 # refractory period is active - refractoryCounter[i1,i2,i3,i4] -= 1 - zt[i1,i2,i3,i4] = 0 - vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4] - phi[i1,i2,i3,i4] = 0 - - # compute epsilonRec - epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4] - - else # refractory period is inactive - recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wOut[i1,i2,i3,i4] - vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4])) - - # fires if membrane potential exceed threshold - if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4] - zt[i1,i2,i3,i4] = 1 - refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4] - firingCounter[i1,i2,i3,i4] += 1 - vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4] - else - zt[i1,i2,i3,i4] = 0 - end - - # compute phi, there is a difference from on formula - phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4])) - - # compute epsilonRec - epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4] - end - end - return nothing -end - - - - - - - - diff --git a/src/learn.jl b/src/learn.jl index 2bf68a1..e65ab72 100644 --- a/src/learn.jl +++ b/src/learn.jl @@ -9,6 +9,7 @@ using ..type, ..snnUtil #------------------------------------------------------------------------------------------------100 function compute_paramsChange!(kfn::kfn_1, modelError, outputError) + modelError = reshape(modelError, (1,1,1,:)) # (1,1,1,batch) lifComputeParamsChange!(kfn.lif_phi, kfn.lif_epsilonRec, kfn.lif_eta, @@ -18,7 +19,10 @@ function compute_paramsChange!(kfn::kfn_1, modelError, outputError) kfn.on_wOut, kfn.lif_arrayProjection4d, kfn.lif_error, - modelError) + modelError, + + kfn.inputSize, + ) alifComputeParamsChange!(kfn.alif_phi, kfn.alif_epsilonRec, @@ -30,7 +34,10 @@ function compute_paramsChange!(kfn::kfn_1, modelError, outputError) kfn.alif_arrayProjection4d, kfn.alif_error, modelError, - kfn.alif_beta) + + kfn.alif_epsilonRecA, + kfn.alif_beta, + ) onComputeParamsChange!(kfn.on_phi, kfn.on_epsilonRec, @@ -38,7 +45,10 @@ function compute_paramsChange!(kfn::kfn_1, modelError, outputError) kfn.on_eRec, kfn.on_wOut, kfn.on_wOutChange, - outputError) + kfn.on_arrayProjection4d, + kfn.on_error, + outputError, + ) # error("DEBUG -> kfn compute_paramsChange! $(Dates.now())") end @@ -51,18 +61,28 @@ function lifComputeParamsChange!( phi::CuArray, wOut::CuArray, arrayProjection4d::CuArray, nError::CuArray, - modelError::CuArray) - - # Bₖⱼ in paper, sum() to get each neuron's total wOut weight - wOutSum = sum(wOut, dims=3) .* arrayProjection4d - + modelError::CuArray, + + inputSize::CuArray, + ) + # Bₖⱼ in paper, sum() to get each neuron's total wOut weight, + # use absolute because only magnitude is needed + wOutSum_all = reshape( abs.(sum(wOut, dims=3)), (1,1,:, size(wOut, 4)) ) # (1,1,allNeuron,batch) + + # get only each lif neuron's wOut, leaving out other neuron's wOut + startIndex = prod(inputSize) +1 + stopIndex = startIndex + size(wRec, 3) -1 + wOutSum = @view(wOutSum_all[1,1, startIndex:stopIndex, :]) + wOutSum = reshape(wOutSum, (1, 1, size(wOutSum, 1), size(wOutSum, 2))) # (1,1,n,batch) + # nError a.k.a. learning signal use dopamine concept, # this neuron receive summed error signal (modelError) - nError .= (modelError .* arrayProjection4d) .* wOutSum + nError .= (modelError .* wOutSum) .* arrayProjection4d eRec .= phi .* epsilonRec - # GeneralUtils.isNotEqual(wRec, 0) is a subscribe filter use to filter out non-subscribed wRecChange - wRecChange .+= ((-1 .* eta) .* nError .* eRec .* sign.(wRec)) .* GeneralUtils.isNotEqual.(wRec, 0) - # error("DEBUG -> lifComputeParamsChange! $(Dates.now())") + wRecChange .+= ((-1 .* eta) .* nError .* eRec) + + # reset epsilonRec + epsilonRec .= 0 end function alifComputeParamsChange!( phi::CuArray, @@ -75,18 +95,29 @@ function alifComputeParamsChange!( phi::CuArray, arrayProjection4d::CuArray, nError::CuArray, modelError::CuArray, - beta::CuArray) - # Bₖⱼ in paper, sum() to get each neuron's total wOut weight - wOutSum = sum(wOut, dims=3) .* arrayProjection4d + epsilonRecA::CuArray, + beta::CuArray + ) + # Bₖⱼ in paper, sum() to get each neuron's total wOut weight, + # use absolute because only magnitude is needed + wOutSum_all = reshape( abs.(sum(wOut, dims=3)), (1,1,:, size(wOut, 4)) ) # (1,1,allNeuron,batch) + + # get only each lif neuron's wOut, leaving out other neuron's wOut + wOutSum = @view(wOutSum_all[1,1, end-size(wRec, 3)+1:end, :]) + wOutSum = reshape(wOutSum, (1, 1, size(wOutSum, 1), size(wOutSum, 2))) # (1,1,n,batch) + # nError a.k.a. learning signal use dopamine concept, # this neuron receive summed error signal (modelError) - nError .= (modelError .* arrayProjection4d) .* wOutSum - eRec .= (phi .* epsilonRec) .+ (phi .* epsilonRec .* beta) + nError .= (modelError .* wOutSum) .* arrayProjection4d + eRec .= phi .* (epsilonRec .- (beta .* epsilonRecA)) # use eq. 25 + wRecChange .+= ((-1 .* eta) .* nError .* eRec) + + # reset epsilonRec + epsilonRec .= 0 + epsilonRecA .= 0 - # GeneralUtils.isNotEqual(wRec, 0) is a subscribe filter use to filter out non-subscribed wRecChange - wRecChange .+= ((-1 .* eta) .* nError .* eRec .* sign.(wRec)) .* GeneralUtils.isNotEqual.(wRec, 0) # error("DEBUG -> alifComputeParamsChange! $(Dates.now())") end @@ -96,15 +127,17 @@ function onComputeParamsChange!(phi::CuArray, eRec::CuArray, wOut::CuArray, wOutChange::CuArray, + arrayProjection4d::CuArray, + nError::CuArray, outputError::CuArray # outputError is output neuron's error ) - # nError a.k.a. learning signal use dopamine concept, - # this neuron receive summed error signal (modelError) - eRec .= (phi .* epsilonRec) .* reshape(outputError, (1, 1, :, size(epsilonRec, 4))) - - # GeneralUtils.isNotEqual(wRec, 0) is a subscribe filter use to filter out non-subscribed wRecChange - wOutChange .+= ((-1 .* eta) .* eRec .* sign.(wOut)) .* GeneralUtils.isNotEqual.(wOut, 0) + eRec .= phi .* epsilonRec + nError .= reshape(outputError, (1, 1, :, size(outputError, 2))) .* arrayProjection4d + wOutChange .+= ((-1 .* eta) .* nError .* eRec) + + # reset epsilonRec + epsilonRec .= 0 # error("DEBUG -> onComputeParamsChange! $(Dates.now())") end @@ -224,20 +257,20 @@ end function lifLearn!(wRec, wRecChange, arrayProjection4d) - # merge learning weight with average learning weight - wRec .+= (sum(wRecChange) ./ (size(wRec, 4))) .* arrayProjection4d + wRec .+= (sum(wRecChange, dims=4) ./ (size(wRec, 4))) .* arrayProjection4d #TODO synaptic strength #TODO neuroplasticity + # error("DEBUG -> lifLearn! $(Dates.now())") end function alifLearn!(wRec, wRecChange, arrayProjection4d) - # merge learning weight + # merge learning weight with average learning weight wRec .+= (sum(wRecChange) ./ (size(wRec, 4))) .* arrayProjection4d #TODO synaptic strength @@ -249,7 +282,7 @@ end function onLearn!(wOut, wOutChange, arrayProjection4d) - # merge learning weight + # merge learning weight with average learning weight wOut .+= (sum(wOutChange) ./ (size(wOut, 4))) .* arrayProjection4d # adaptive wOut to help convergence using c_decay diff --git a/src/type.jl b/src/type.jl index 1f38371..ec1a905 100644 --- a/src/type.jl +++ b/src/type.jl @@ -21,6 +21,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn timeStep::Union{AbstractArray, Nothing} = nothing learningStage::Union{AbstractArray, Nothing} = nothing # 0 inference, 1 start, 2 during, 3 end learning + inputSize::Union{AbstractArray, Nothing} = nothing zit::Union{AbstractArray, Nothing} = nothing # 3D activation matrix modelError::Union{AbstractArray, Nothing} = nothing # store RSNN error outputError::Union{AbstractArray, Nothing} = nothing # store output neurons error @@ -50,6 +51,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn lif_gammaPd::Union{AbstractArray, Nothing} = nothing lif_wRecChange::Union{AbstractArray, Nothing} = nothing lif_error::Union{AbstractArray, Nothing} = nothing + lif_subscription::Union{AbstractArray, Nothing} = nothing lif_firingCounter::Union{AbstractArray, Nothing} = nothing @@ -85,6 +87,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn alif_gammaPd::Union{AbstractArray, Nothing} = nothing alif_wRecChange::Union{AbstractArray, Nothing} = nothing alif_error::Union{AbstractArray, Nothing} = nothing + alif_subscription::Union{AbstractArray, Nothing} = nothing alif_firingCounter::Union{AbstractArray, Nothing} = nothing @@ -137,6 +140,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn on_gammaPd::Union{AbstractArray, Nothing} = nothing on_wOutChange::Union{AbstractArray, Nothing} = nothing on_error::Union{AbstractArray, Nothing} = nothing + on_subscription::Union{AbstractArray, Nothing} = nothing on_firingCounter::Union{AbstractArray, Nothing} = nothing @@ -162,8 +166,8 @@ function kfn_1(params::Dict; device=cpu) # ---------------------------------------------------------------------------- # # row*col is a 2D matrix represent all RSNN activation row, col, batch = kfn.params[:inputPort][:signal][:numbers] # z-axis represent signal batch number - # row += kfn.params[:inputPort][:noise][:numbers][1] col += kfn.params[:inputPort][:noise][:numbers][2] + kfn.inputSize = [row, col] |> device col += kfn.params[:computeNeuron][:lif][:numbers][2] col += kfn.params[:computeNeuron][:alif][:numbers][2] @@ -208,6 +212,7 @@ function kfn_1(params::Dict; device=cpu) kfn.lif_gammaPd = (similar(kfn.lif_wRec) .= 0.3) |> device kfn.lif_wRecChange = (similar(kfn.lif_wRec) .= 0) |> device kfn.lif_error = (similar(kfn.lif_wRec) .= 0) |> device + kfn.lif_subscription = (GeneralUtils.isNotEqual.(kfn.lif_wRec, 0)) |> device kfn.lif_firingCounter = (similar(kfn.lif_wRec) .= 0) |> device @@ -254,6 +259,7 @@ function kfn_1(params::Dict; device=cpu) kfn.alif_gammaPd = (similar(kfn.alif_wRec) .= 0.3) |> device kfn.alif_wRecChange = (similar(kfn.alif_wRec) .= 0) |> device kfn.alif_error = (similar(kfn.alif_wRec) .= 0) |> device + kfn.alif_subscription = (GeneralUtils.isNotEqual.(kfn.alif_wRec, 0)) |> device kfn.alif_firingCounter = (similar(kfn.alif_wRec) .= 0) |> device @@ -286,9 +292,13 @@ function kfn_1(params::Dict; device=cpu) # subscription w = zeros(row, col, n) synapticConnectionPercent = kfn.params[:outputPort][:params][:synapticConnectionPercent] - synapticConnection = Int(floor(row*col * synapticConnectionPercent/100)) - for slice in eachslice(w, dims=3) - pool = shuffle!([1:row*col...])[1:synapticConnection] + subable = size(kfn.lif_wRec, 3) + size(kfn.alif_wRec, 3) # sub to lif, alif only + synapticConnection = Int(floor(subable * synapticConnectionPercent/100)) + for slice in eachslice(w, dims=3) # each slice is a neuron + startInd = row*col - subable + 1 # e.g. 100(row*col) - 50(subable) = 50 -> startInd = 51 + + # pool must contain only lif, alif neurons + pool = shuffle!([startInd:row*col...])[1:synapticConnection] for i in pool slice[i] = randn()/10 # assign weight to synaptic connection end @@ -313,6 +323,7 @@ function kfn_1(params::Dict; device=cpu) kfn.on_gammaPd = (similar(kfn.on_wOut) .= 0.3) |> device kfn.on_wOutChange = (similar(kfn.on_wOut) .= 0) |> device kfn.on_error = (similar(kfn.on_wOut) .= 0) |> device + kfn.on_subscription = (GeneralUtils.isNotEqual.(kfn.on_wOut, 0)) |> device kfn.on_firingCounter = (similar(kfn.on_wOut) .= 0) |> device