ComputeParamsChange()

This commit is contained in:
ton
2023-08-05 14:54:52 +07:00
parent 28f9fb4bdc
commit 9ff7efc7dc
3 changed files with 685 additions and 300 deletions

View File

@@ -22,7 +22,6 @@ function (kfn::kfn_1)(input::AbstractArray)
end
# println(">>> input ", size(input))
# println(">>> zit ", size(kfn.zit))
# println(">>> lif_zit ", size(kfn.lif_zit))
# println(">>> lif_recSignal ", size(kfn.lif_recSignal))
# println(">>> lif_wRec ", size(kfn.lif_wRec))
@@ -31,17 +30,29 @@ function (kfn::kfn_1)(input::AbstractArray)
# println(">>> lif_vt0 ", size(kfn.lif_vt0))
# println(">>> lif_vt0 sum ", sum(kfn.lif_vt0))
# pass input_data into input neuron.
GeneralUtils.cartesianAssign!(kfn.zit, input)
# update activation matrix with "lif_zt1" and "alif_zt1" by concatenating
# (input, lif_zt1, alif_zt1) to form activation matrix
_zit = cat(reshape(input, (size(input, 1), size(input, 2), 1, size(input, 3))),
reshape(kfn.lif_zt, (size(input, 1), :, 1, size(input, 3))),
reshape(kfn.alif_zt, (size(input, 1), :, 1, size(input, 3))), dims=2)
kfn.zit .= reshape(_zit, (size(input, 1), :, size(input, 3)))
lifForward( kfn.zit,
kfn.lif_zit,
# pass input_data into input neuron.
# GeneralUtils.cartesianAssign!(kfn.zit, input)
# kfn.zit = kfn.zit |> device
# input = input |> device
# project 3D kfn zit into 4D lif zit
i1, i2, i3, i4 = size(kfn.lif_zit)
kfn.lif_zit .= reshape(kfn.zit, (i1, i2, 1, i4)) .* kfn.lif_arrayProjection4d
lifForward( kfn.lif_zit,
kfn.lif_wRec,
kfn.lif_vt0,
kfn.lif_vt1,
kfn.lif_vt,
kfn.lif_vth,
kfn.lif_vRest,
kfn.lif_zt1,
kfn.lif_zt4d,
kfn.lif_alpha,
kfn.lif_phi,
kfn.lif_epsilonRec,
@@ -49,23 +60,18 @@ function (kfn::kfn_1)(input::AbstractArray)
kfn.lif_refractoryDuration,
kfn.lif_gammaPd,
kfn.lif_firingCounter,
kfn.lif_arrayProjection3DTo4D,
kfn.lif_recSignal,
kfn.lif_decayed_vt0,
kfn.lif_decayed_epsilonRec,
kfn.lif_vt1_diff_vth,
kfn.lif_vt1_diff_vth_div_vth,
kfn.lif_gammaPd_div_vth,
kfn.lif_phiActivation)
kfn.lif_recSignal,)
# project 3D kfn zit into 4D alif zit
i1, i2, i3, i4 = size(kfn.alif_zit)
kfn.alif_zit .= reshape(kfn.zit, (i1, i2, 1, i4)) .* kfn.alif_arrayProjection4d
alifForward( kfn.zit,
kfn.alif_zit,
alifForward(kfn.alif_zit,
kfn.alif_wRec,
kfn.alif_vt0,
kfn.alif_vt1,
kfn.alif_vt,
kfn.alif_vth,
kfn.alif_vRest,
kfn.alif_zt1,
kfn.alif_zt4d,
kfn.alif_alpha,
kfn.alif_phi,
kfn.alif_epsilonRec,
@@ -73,44 +79,35 @@ function (kfn::kfn_1)(input::AbstractArray)
kfn.alif_refractoryDuration,
kfn.alif_gammaPd,
kfn.alif_firingCounter,
kfn.alif_arrayProjection3DTo4D,
kfn.alif_recSignal,
kfn.alif_decayed_vt0,
kfn.alif_decayed_epsilonRec,
kfn.alif_vt1_diff_vth,
kfn.alif_vt1_diff_vth_div_vth,
kfn.alif_gammaPd_div_vth,
kfn.alif_phiActivation,
kfn.alif_epsilonRecA,
kfn.alif_avth,
kfn.alif_a,
kfn.alif_avth,
kfn.alif_beta,
kfn.alif_rho,
kfn.alif_phi_x_epsilonRec,
kfn.alif_phi_x_beta,
kfn.alif_rho_diff_phi_x_beta,
kfn.alif_rho_div_phi_x_beta_x_epsilonRecA,
kfn.alif_beta_x_a)
# error("DEBUG -> kfn forward")
kfn.alif_rho,)
# reduce lif_zt4d and alif_zt4d into lif_zt, alif_zt (4d -> 1d)
kfn.lif_zt .= reduce(max, kfn.lif_zt4d, dims=(1,2))
kfn.alif_zt .= reduce(max, kfn.alif_zt4d, dims=(1,2))
# update activation matrix by concatenate (input, lif_zt1, alif_zt1) to form activation matrix
# update activation matrix with "lif_zt1" and "alif_zt1" by concatenating
# (input, lif_zt1, alif_zt1) to form activation matrix
_zit = cat(reshape(input, (size(input, 1), size(input, 2), 1, size(input, 3))),
reshape(kfn.lif_zt1, (size(input, 1), :, 1, size(input, 3))),
reshape(kfn.alif_zt1, (size(input, 1), :, 1, size(input, 3))), dims=2)
reshape(kfn.lif_zt, (size(input, 1), :, 1, size(input, 3))),
reshape(kfn.alif_zt, (size(input, 1), :, 1, size(input, 3))), dims=2)
kfn.zit .= reshape(_zit, (size(input, 1), :, size(input, 3)))
# project 3D kfn zit into 4D on zit
i1, i2, i3, i4 = size(kfn.on_zit)
kfn.on_zit .= reshape(kfn.zit, (i1, i2, 1, i4)) .* kfn.on_arrayProjection4d
# read out
onForward( kfn.zit,
kfn.on_zit,
kfn.on_wOut,
kfn.on_vt0,
kfn.on_vt1,
onForward( kfn.on_zit,
kfn.on_wOut,
kfn.on_vt,
kfn.on_vth,
kfn.on_vRest,
kfn.on_zt1,
kfn.on_zt4d,
kfn.on_alpha,
kfn.on_phi,
kfn.on_epsilonRec,
@@ -118,16 +115,11 @@ function (kfn::kfn_1)(input::AbstractArray)
kfn.on_refractoryDuration,
kfn.on_gammaPd,
kfn.on_firingCounter,
kfn.on_arrayProjection3DTo4D,
kfn.on_recSignal,
kfn.on_decayed_vt0,
kfn.on_decayed_epsilonRec,
kfn.on_vt1_diff_vth,
kfn.on_vt1_diff_vth_div_vth,
kfn.on_gammaPd_div_vth,
kfn.on_phiActivation)
kfn.on_recSignal,)
# error("DEBUG -> kfn forward")
logit = reshape(kfn.on_zt, (size(input, 1), :))
return reshape(kfn.on_zt1, (size(input, 1), :)),
return logit,
kfn.zit
end
@@ -147,7 +139,7 @@ function lifForward(kfn_zit::Array{T},
refractoryDuration::Array{T},
gammaPd::Array{T},
firingCounter::Array{T},
arrayProjection3DTo4D::Array{T},
arrayProjection4d::Array{T},
recSignal::Array{T},
decayed_vt0::Array{T},
decayed_epsilonRec::Array{T},
@@ -158,8 +150,8 @@ function lifForward(kfn_zit::Array{T},
) where T<:Number
# project 3D kfn zit into 4D lif zit
zit .= reshape(kfn_zit,
(size(wRec, 1), size(wRec, 2), 1, size(wRec, 4))) .* arrayProjection3DTo4D
i1, i2, i3, i4 = size(alif_wRec)
lif_zit .= reshape(kfn_zit, (i1, i2, 1, i4)) .* lif_arrayProjection4d
for j in 1:size(wRec, 4), i in 1:size(wRec, 3) # compute along neurons axis of every batch
if sum(@view(refractoryCounter[:,:,i,j])) > 0 # refractory period is active
@@ -199,8 +191,128 @@ function lifForward(kfn_zit::Array{T},
end
end
function alifForward(kfn_zit::Array{T},
zit::Array{T},
# gpu launcher
#
# Configure and launch the LIF forward kernel on the GPU. The kernel is first
# compiled without launching (`launch=false`) so the occupancy API can report
# its limits; we then launch one thread per element of `lif_wRec` (surplus
# threads are idled by the bounds guard inside the kernel) and block until
# the kernel completes.
function lifForward(lif_zit::CuArray,
                    lif_wRec::CuArray,
                    lif_vt::CuArray,
                    lif_vth::CuArray,
                    lif_vRest::CuArray,
                    lif_zt::CuArray,
                    lif_alpha::CuArray,
                    lif_phi::CuArray,
                    lif_epsilonRec::CuArray,
                    lif_refractoryCounter::CuArray,
                    lif_refractoryDuration::CuArray,
                    lif_gammaPd::CuArray,
                    lif_firingCounter::CuArray,
                    lif_recSignal::CuArray)
    # Compile only — lets us query launch limits before the real launch.
    cuda_kernel = @cuda launch=false lifForward(lif_zit, lif_wRec, lif_vt,
                                                lif_vth, lif_vRest, lif_zt,
                                                lif_alpha, lif_phi,
                                                lif_epsilonRec,
                                                lif_refractoryCounter,
                                                lif_refractoryDuration,
                                                lif_gammaPd, lif_firingCounter,
                                                lif_recSignal,
                                                GeneralUtils.linear_to_cartesian)
    cfg = launch_configuration(cuda_kernel.fun)
    # One can't launch the exact thread count the kernel needs, so launch at
    # least that many and let the in-kernel guard keep spares off memory.
    nthreads = min(1024, cfg.threads) # most NVIDIA GPUs: 1024 threads per block
    nblocks = cld(length(lif_wRec), nthreads) # 1 thread per matrix element
    CUDA.@sync begin
        cuda_kernel(lif_zit, lif_wRec, lif_vt, lif_vth, lif_vRest, lif_zt,
                    lif_alpha, lif_phi, lif_epsilonRec, lif_refractoryCounter,
                    lif_refractoryDuration, lif_gammaPd, lif_firingCounter,
                    lif_recSignal, GeneralUtils.linear_to_cartesian;
                    threads = nthreads, blocks = nblocks)
    end
end
# gpu kernel
#
# One GPU thread per element of `wRec` advances one LIF timestep in place:
# decay the membrane potential, integrate recurrent input, spike on threshold
# crossing, and update the pseudo-derivative `phi` and eligibility trace
# `epsilonRec`.
#
# FIX(review): `refractoryCounter[i]` was decremented unconditionally before
# the refractory test AND again inside the refractory branch, so the
# refractory period elapsed at double speed and the counter drifted negative
# forever once expired. It now ticks down exactly once per step while the
# neuron is refractory, matching the CPU path, which tests `> 0` first.
function lifForward(zit,
                    wRec,
                    vt,
                    vth,
                    vRest,
                    zt,
                    alpha,
                    phi,
                    epsilonRec,
                    refractoryCounter,
                    refractoryDuration,
                    gammaPd,
                    firingCounter,
                    recSignal,
                    linear_to_cartesian)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # global thread index
    if i <= length(wRec) # guard: surplus threads past the array do nothing
        # cartesian index of this element inside the 4D array
        i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec))
        # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
        if refractoryCounter[i] > 0 # refractory period is active
            refractoryCounter[i] -= 1
            zt[i] = 0
            vt[i] = alpha[i] * vt[i] # leak only; no input integrated
            phi[i] = 0
            # compute epsilonRec
            epsilonRec[i] = (alpha[i] * epsilonRec[i]) + zit[i]
        else # refractory period is inactive
            recSignal[i] = zit[i] * wRec[i]
            # NOTE(review): every thread of slab (i3, i4) re-reads the whole
            # slab's recSignal, which is written by other threads of this same
            # launch — TODO confirm this ordering is safe on-device.
            vt[i] = (alpha[i] * vt[i]) + sum(@view(recSignal[:, :, i3, i4]))
            # fires if membrane potential exceeds threshold
            if vt[i] > vth[i]
                zt[i] = 1
                refractoryCounter[i] = refractoryDuration[i]
                firingCounter[i] += 1
                vt[i] = vRest[i]
            else
                zt[i] = 0
            end
            # pseudo-derivative phi (surrogate gradient)
            phi[i] = (gammaPd[i] / vth[i]) * max(0, 1 - ((vt[i] - vth[i]) / vth[i]))
            # compute epsilonRec
            epsilonRec[i] = (alpha[i] * epsilonRec[i]) + zit[i]
        end
    end
    return nothing
end
function alifForward(zit::Array{T},
wRec::Array{T},
vt0::Array{T},
vt1::Array{T},
@@ -214,7 +326,6 @@ function alifForward(kfn_zit::Array{T},
refractoryDuration::Array{T},
gammaPd::Array{T},
firingCounter::Array{T},
arrayProjection3DTo4D::Array{T},
recSignal::Array{T},
decayed_vt0::Array{T},
decayed_epsilonRec::Array{T},
@@ -234,11 +345,6 @@ function alifForward(kfn_zit::Array{T},
rho_div_phi_x_beta_x_epsilonRecA::Array{T},
beta_x_a::Array{T},
) where T<:Number
# project 3D kfn zit into 4D lif zit
zit .= reshape(kfn_zit,
(size(wRec, 1), size(wRec, 2), 1, size(wRec, 4))) .* arrayProjection3DTo4D
for j in 1:size(wRec, 4), i in 1:size(wRec, 3) # compute along neurons axis of every batch
if sum(@view(refractoryCounter[:,:,i,j])) > 0 # refractory period is active
@@ -305,6 +411,164 @@ function alifForward(kfn_zit::Array{T},
end
end
# gpu launcher
#
# Configure and launch the ALIF (adaptive-threshold LIF) forward kernel on
# the GPU. The kernel is compiled without launching (`launch=false`) so the
# occupancy API can size the launch; we then run it synchronously with one
# thread per element of `alif_wRec` (surplus threads are idled by the bounds
# guard inside the kernel).
function alifForward(alif_zit::CuArray,
                     alif_wRec::CuArray,
                     alif_vt::CuArray,
                     alif_vth::CuArray,
                     alif_vRest::CuArray,
                     alif_zt::CuArray,
                     alif_alpha::CuArray,
                     alif_phi::CuArray,
                     alif_epsilonRec::CuArray,
                     alif_refractoryCounter::CuArray,
                     alif_refractoryDuration::CuArray,
                     alif_gammaPd::CuArray,
                     alif_firingCounter::CuArray,
                     alif_recSignal::CuArray,
                     alif_epsilonRecA::CuArray,
                     alif_a::CuArray,
                     alif_avth::CuArray,
                     alif_beta::CuArray,
                     alif_rho::CuArray)
    # Compile only — lets us query launch limits before the real launch.
    cuda_kernel = @cuda launch=false alifForward(alif_zit, alif_wRec, alif_vt,
                                                 alif_vth, alif_vRest, alif_zt,
                                                 alif_alpha, alif_phi,
                                                 alif_epsilonRec,
                                                 alif_refractoryCounter,
                                                 alif_refractoryDuration,
                                                 alif_gammaPd,
                                                 alif_firingCounter,
                                                 alif_recSignal,
                                                 alif_epsilonRecA, alif_a,
                                                 alif_avth, alif_beta, alif_rho,
                                                 GeneralUtils.linear_to_cartesian)
    cfg = launch_configuration(cuda_kernel.fun)
    # One can't launch the exact thread count the kernel needs, so launch at
    # least that many and let the in-kernel guard keep spares off memory.
    nthreads = min(1024, cfg.threads) # most NVIDIA GPUs: 1024 threads per block
    nblocks = cld(length(alif_wRec), nthreads) # 1 thread per matrix element
    CUDA.@sync begin
        cuda_kernel(alif_zit, alif_wRec, alif_vt, alif_vth, alif_vRest,
                    alif_zt, alif_alpha, alif_phi, alif_epsilonRec,
                    alif_refractoryCounter, alif_refractoryDuration,
                    alif_gammaPd, alif_firingCounter, alif_recSignal,
                    alif_epsilonRecA, alif_a, alif_avth, alif_beta, alif_rho,
                    GeneralUtils.linear_to_cartesian;
                    threads = nthreads, blocks = nblocks)
    end
end
# gpu kernel
#
# One GPU thread per element of `wRec` advances one ALIF timestep in place.
# Same as the LIF kernel, plus an adaptive threshold: the adaptation variable
# `a` decays with `rho` and jumps on a spike, the effective threshold is
# `avth = vth + beta * a`, and a second eligibility trace `epsilonRecA` is
# maintained for the adaptation dynamics.
#
# FIX(review): `refractoryCounter[i]` was decremented unconditionally before
# the refractory test AND again inside the refractory branch, so the
# refractory period elapsed at double speed and the counter drifted negative
# forever once expired. It now ticks down exactly once per step while the
# neuron is refractory, matching the CPU path, which tests `> 0` first.
function alifForward(zit,
                     wRec,
                     vt,
                     vth,
                     vRest,
                     zt,
                     alpha,
                     phi,
                     epsilonRec,
                     refractoryCounter,
                     refractoryDuration,
                     gammaPd,
                     firingCounter,
                     recSignal,
                     epsilonRecA,
                     a,
                     avth,
                     beta,
                     rho,
                     linear_to_cartesian)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # global thread index
    if i <= length(wRec) # guard: surplus threads past the array do nothing
        # cartesian index of this element inside the 4D array
        i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec))
        # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
        if refractoryCounter[i] > 0 # refractory period is active
            refractoryCounter[i] -= 1
            zt[i] = 0
            vt[i] = alpha[i] * vt[i] # leak only; no input integrated
            phi[i] = 0
            a[i] = rho[i] * a[i] # adaptation decays during refractory
            # compute epsilonRec
            epsilonRec[i] = (alpha[i] * epsilonRec[i]) + zit[i]
            # compute epsilonRecA (phi is 0 here, so this reduces to decay)
            epsilonRecA[i] = (phi[i] * epsilonRec[i]) +
                             ((rho[i] - (phi[i] * beta[i])) * epsilonRecA[i])
            # compute adaptive threshold avth
            avth[i] = vth[i] + (beta[i] * a[i])
        else # refractory period is inactive
            recSignal[i] = zit[i] * wRec[i]
            # NOTE(review): every thread of slab (i3, i4) re-reads the whole
            # slab's recSignal, which is written by other threads of this same
            # launch — TODO confirm this ordering is safe on-device.
            vt[i] = (alpha[i] * vt[i]) + sum(@view(recSignal[:, :, i3, i4]))
            # compute adaptive threshold avth
            avth[i] = vth[i] + (beta[i] * a[i])
            # fires if membrane potential exceeds the adaptive threshold
            if vt[i] > avth[i]
                zt[i] = 1
                refractoryCounter[i] = refractoryDuration[i]
                firingCounter[i] += 1
                vt[i] = vRest[i]
                a[i] = (rho[i] * a[i]) + 1 # spike bumps adaptation
            else
                zt[i] = 0
                a[i] = (rho[i] * a[i])
            end
            # pseudo-derivative phi — NOTE(review): uses the static `vth`, not
            # the adaptive `avth`; presumably intentional — TODO confirm.
            phi[i] = (gammaPd[i] / vth[i]) * max(0, 1 - ((vt[i] - vth[i]) / vth[i]))
            # compute epsilonRec
            epsilonRec[i] = (alpha[i] * epsilonRec[i]) + zit[i]
            # compute epsilonRecA
            epsilonRecA[i] = (phi[i] * epsilonRec[i]) +
                             ((rho[i] - (phi[i] * beta[i])) * epsilonRecA[i])
        end
    end
    return nothing
end
function onForward(kfn_zit::Array{T},
zit::Array{T},
wOut::Array{T},
@@ -320,7 +584,7 @@ function onForward(kfn_zit::Array{T},
refractoryDuration::Array{T},
gammaPd::Array{T},
firingCounter::Array{T},
arrayProjection3DTo4D::Array{T},
arrayProjection4d::Array{T},
recSignal::Array{T},
decayed_vt0::Array{T},
decayed_epsilonRec::Array{T},
@@ -332,7 +596,7 @@ function onForward(kfn_zit::Array{T},
# project 3D kfn zit into 4D lif zit
zit .= reshape(kfn_zit,
(size(wOut, 1), size(wOut, 2), 1, size(wOut, 4))) .* arrayProjection3DTo4D
(size(wOut, 1), size(wOut, 2), 1, size(wOut, 4))) .* arrayProjection4d
for j in 1:size(wOut, 4), i in 1:size(wOut, 3) # compute along neurons axis of every batch
if sum(@view(refractoryCounter[:,:,i,j])) > 0 # refractory period is active
@@ -372,57 +636,125 @@ function onForward(kfn_zit::Array{T},
end
end
# function onForward(kfn_zit,
# zit,
# wOut,
# vt0,
# vt1,
# vth,
# vRest,
# zt1,
# alpha,
# phi,
# epsilonRec,
# refractoryCounter,
# refractoryDuration,
# gammaPd,
# firingCounter)
# d1, d2, d3, d4 = size(wOut)
# zit .= reshape(kfn_zit, (d1, d2, 1, d4)) .* ones(size(wOut)...) # project zit into zit
# gpu launcher
#
# Configure and launch the output-neuron forward kernel on the GPU. The
# kernel is compiled without launching (`launch=false`) so the occupancy API
# can size the launch; we then run it synchronously with one thread per
# element of `on_wOut` (surplus threads are idled by the bounds guard inside
# the kernel).
#
# FIX(review): removed the 30+ lines of commented-out legacy CPU code that
# were interleaved in the middle of this launcher (between the occupancy
# query and the thread-count computation); it was dead and obscured the
# launch logic.
function onForward(on_zit::CuArray,
                   on_wOut::CuArray,
                   on_vt::CuArray,
                   on_vth::CuArray,
                   on_vRest::CuArray,
                   on_zt::CuArray,
                   on_alpha::CuArray,
                   on_phi::CuArray,
                   on_epsilonRec::CuArray,
                   on_refractoryCounter::CuArray,
                   on_refractoryDuration::CuArray,
                   on_gammaPd::CuArray,
                   on_firingCounter::CuArray,
                   on_recSignal::CuArray)
    # Compile only — lets us query launch limits before the real launch.
    kernel = @cuda launch=false onForward(on_zit,
                                          on_wOut,
                                          on_vt,
                                          on_vth,
                                          on_vRest,
                                          on_zt,
                                          on_alpha,
                                          on_phi,
                                          on_epsilonRec,
                                          on_refractoryCounter,
                                          on_refractoryDuration,
                                          on_gammaPd,
                                          on_firingCounter,
                                          on_recSignal,
                                          GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)
    # One can't launch the exact thread count the kernel needs, so launch at
    # least that many and let the in-kernel guard keep spares off memory.
    threads = min(1024, config.threads) # most NVIDIA GPUs: 1024 threads per block
    # total desired threads: 1 thread per matrix element
    totalThreads = length(on_wOut)
    blocks = cld(totalThreads, threads)
    CUDA.@sync begin
        kernel(on_zit,
               on_wOut,
               on_vt,
               on_vth,
               on_vRest,
               on_zt,
               on_alpha,
               on_phi,
               on_epsilonRec,
               on_refractoryCounter,
               on_refractoryDuration,
               on_gammaPd,
               on_firingCounter,
               on_recSignal,
               GeneralUtils.linear_to_cartesian; threads, blocks)
    end
end
# gpu kernel
#
# One GPU thread per element of `wOut` advances one output-neuron timestep in
# place: decay the membrane potential, integrate the weighted input, spike on
# threshold crossing, and update the pseudo-derivative `phi` and eligibility
# trace `epsilonRec`. Same dynamics as the LIF kernel but driven by the
# output weights `wOut`.
#
# FIX(review): `refractoryCounter[i]` was decremented unconditionally before
# the refractory test AND again inside the refractory branch, so the
# refractory period elapsed at double speed and the counter drifted negative
# forever once expired. It now ticks down exactly once per step while the
# neuron is refractory, matching the CPU path, which tests `> 0` first.
function onForward(zit,
                   wOut,
                   vt,
                   vth,
                   vRest,
                   zt,
                   alpha,
                   phi,
                   epsilonRec,
                   refractoryCounter,
                   refractoryDuration,
                   gammaPd,
                   firingCounter,
                   recSignal,
                   linear_to_cartesian)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # global thread index
    if i <= length(wOut) # guard: surplus threads past the array do nothing
        # cartesian index of this element inside the 4D array
        i1, i2, i3, i4 = linear_to_cartesian(i, size(wOut))
        # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
        if refractoryCounter[i] > 0 # refractory period is active
            refractoryCounter[i] -= 1
            zt[i] = 0
            vt[i] = alpha[i] * vt[i] # leak only; no input integrated
            phi[i] = 0
            # compute epsilonRec
            epsilonRec[i] = (alpha[i] * epsilonRec[i]) + zit[i]
        else # refractory period is inactive
            recSignal[i] = zit[i] * wOut[i]
            # NOTE(review): every thread of slab (i3, i4) re-reads the whole
            # slab's recSignal, which is written by other threads of this same
            # launch — TODO confirm this ordering is safe on-device.
            vt[i] = (alpha[i] * vt[i]) + sum(@view(recSignal[:, :, i3, i4]))
            # fires if membrane potential exceeds threshold
            if vt[i] > vth[i]
                zt[i] = 1
                refractoryCounter[i] = refractoryDuration[i]
                firingCounter[i] += 1
                vt[i] = vRest[i]
            else
                zt[i] = 0
            end
            # pseudo-derivative phi (surrogate gradient)
            phi[i] = (gammaPd[i] / vth[i]) * max(0, 1 - ((vt[i] - vth[i]) / vth[i]))
            # compute epsilonRec
            epsilonRec[i] = (alpha[i] * epsilonRec[i]) + zit[i]
        end
    end
    return nothing
end