fix algo

2023-08-10 10:06:21 +07:00
parent 65bb97baf3
commit a80e9f2621
3 changed files with 512 additions and 441 deletions
--- a/src/forward.jl
+++ b/src/forward.jl
@@ -59,7 +59,9 @@ function (kfn::kfn_1)(input::AbstractArray)
                                kfn.lif_refractoryDuration,
                                kfn.lif_gammaPd,
                                kfn.lif_firingCounter,
-                                kfn.lif_recSignal,)
+                                kfn.lif_recSignal,
                                kfn.lif_subscription,
                                )
        end
        @async begin
                # project 3D kfn zit into 4D alif zit
@@ -80,6 +82,7 @@ function (kfn::kfn_1)(input::AbstractArray)
                                kfn.alif_gammaPd,
                                kfn.alif_firingCounter,
                                kfn.alif_recSignal,
                                kfn.alif_subscription,
                                kfn.alif_epsilonRecA,
                                kfn.alif_a,
                                kfn.alif_avth,
@@ -117,7 +120,9 @@ function (kfn::kfn_1)(input::AbstractArray)
                kfn.on_refractoryDuration,
                kfn.on_gammaPd,
                kfn.on_firingCounter,
-                kfn.on_recSignal,)
+                kfn.on_recSignal,
                kfn.on_subscription,
                )
    logit = reshape(kfn.on_zt, (size(input, 1), :))
@@ -126,6 +131,434 @@ function (kfn::kfn_1)(input::AbstractArray)
            kfn.zit
 end
 # gpu launcher
 function lifForward(    zit::CuArray,
                        wRec::CuArray,
                        vt::CuArray,
                        vth::CuArray,
                        vRest::CuArray,
                        zt::CuArray,
                        alpha::CuArray,
                        phi::CuArray,
                        epsilonRec::CuArray,
                        refractoryCounter::CuArray,
                        refractoryDuration::CuArray,
                        gammaPd::CuArray,
                        firingCounter::CuArray,
                        recSignal::CuArray,
                        subscription::CuArray,
                        )
    kernel = @cuda launch=false lifForward(     zit,
                                                wRec,
                                                vt,
                                                vth,
                                                vRest,
                                                zt,
                                                alpha,
                                                phi,
                                                epsilonRec,
                                                refractoryCounter,
                                                refractoryDuration,
                                                gammaPd,
                                                firingCounter,
                                                recSignal,
                                                subscription,
                                                GeneralUtils.linear_to_cartesian,
                                                )
    config = launch_configuration(kernel.fun)
    # threads to be launched. Since one can't launch exact thread number the kernel needs, 
    # one just launch threads more than this kernel needs then use a guard inside the kernel
    # to prevent unused threads to access memory. 
    threads = min(1024, config.threads)    # depend on gpu. Most NVIDIA gpu has 1024 threads per block
    # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element 
    totalThreads = length(wRec)   
    blocks = cld(totalThreads, threads)
        # println("launching gpu kernel")
    CUDA.@sync begin
        kernel( zit,
                wRec,
                vt,
                vth,
                vRest,
                zt,
                alpha,
                phi,
                epsilonRec,
                refractoryCounter,
                refractoryDuration,
                gammaPd,
                firingCounter,
                recSignal,
                subscription,
                GeneralUtils.linear_to_cartesian;  threads, blocks)
    end
 end
 # gpu kernel
 function lifForward(    zit,
                        wRec,
                        vt,
                        vth,
                        vRest,
                        zt,
                        alpha,
                        phi,
                        epsilonRec,
                        refractoryCounter,
                        refractoryDuration,
                        gammaPd,
                        firingCounter,
                        recSignal,
                        subscription,
                        linear_to_cartesian,
                        )
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x   # gpu threads index
        if i <= length(wRec)
                # cartesian index
                i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec))
                # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
                if refractoryCounter[i1,i2,i3,i4] > 0        # refractory period is active
                        refractoryCounter[i1,i2,i3,i4] -= 1
                        recSignal[i1,i2,i3,i4] = 0
                        zt[i1,i2,i3,i4] = 0
                        vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]
                        phi[i1,i2,i3,i4] = 0
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) +
                                                        (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4])
                else    # refractory period is inactive
                        recSignal[i1,i2,i3,i4] = wRec[i1,i2,i3,i4] * zit[i1,i2,i3,i4]
                        vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + 
                                                sum(@view(recSignal[:,:,i3,i4]))
                        # fires if membrane potential exceed threshold
                        if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4]
                                zt[i1,i2,i3,i4] = 1
                                refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4]
                                firingCounter[i1,i2,i3,i4] += 1
                                vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4]
                        else
                                zt[i1,i2,i3,i4] = 0
                        end
                        # compute phi, there is a difference from lif formula
                        phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * 
                                max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4]))
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) +
                                                        (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4])
                end
        end
        return nothing
 end
 # gpu launcher
 function alifForward(   zit::CuArray,
                        wRec::CuArray,
                        vt::CuArray,
                        vth::CuArray,
                        vRest::CuArray,
                        zt::CuArray,
                        alpha::CuArray,
                        phi::CuArray,
                        epsilonRec::CuArray,
                        refractoryCounter::CuArray,
                        refractoryDuration::CuArray,
                        gammaPd::CuArray,
                        firingCounter::CuArray,
                        recSignal::CuArray,
                        subscription::CuArray,
                        epsilonRecA::CuArray,
                        a::CuArray,
                        avth::CuArray,
                        beta::CuArray,
                        rho::CuArray,
                        )
    kernel = @cuda launch=false alifForward(    zit,
                                                wRec, 
                                                vt,
                                                vth,
                                                vRest,
                                                zt,
                                                alpha,
                                                phi,
                                                epsilonRec,
                                                refractoryCounter,
                                                refractoryDuration,
                                                gammaPd,
                                                firingCounter,
                                                recSignal,
                                                subscription,
                                                epsilonRecA,
                                                a,
                                                avth,
                                                beta,
                                                rho,
                                                GeneralUtils.linear_to_cartesian,
                                                )
    config = launch_configuration(kernel.fun)
    # threads to be launched. Since one can't launch exact thread number the kernel needs, 
    # one just launch threads more than this kernel needs then use a guard inside the kernel
    # to prevent unused threads to access memory. 
    threads = min(1024, config.threads)    # depend on gpu. Most NVIDIA gpu has 1024 threads per block
    # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element 
    totalThreads = length(wRec)   
    blocks = cld(totalThreads, threads)
        # println("launching gpu kernel")
    CUDA.@sync begin
        kernel( zit,
                wRec, 
                vt,
                vth,
                vRest,
                zt,
                alpha,
                phi,
                epsilonRec,
                refractoryCounter,
                refractoryDuration,
                gammaPd,
                firingCounter,
                recSignal,
                subscription,
                epsilonRecA,
                a,
                avth,
                beta,
                rho,
                GeneralUtils.linear_to_cartesian;  threads, blocks)
    end
 end
 # gpu kernel
 function alifForward(   zit,
                        wRec, 
                        vt,
                        vth,
                        vRest,
                        zt,
                        alpha,
                        phi,
                        epsilonRec,
                        refractoryCounter,
                        refractoryDuration,
                        gammaPd,
                        firingCounter,
                        recSignal,
                        subscription,
                        epsilonRecA,
                        a,
                        avth,
                        beta,
                        rho,
                        linear_to_cartesian,
                        )
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x   # gpu threads index
        if i <= length(wRec)
                # cartesian index
                i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec))
                # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
                if refractoryCounter[i1,i2,i3,i4] > 0        # refractory period is active
                        refractoryCounter[i1,i2,i3,i4] -= 1
                        recSignal[i1,i2,i3,i4] = 0
                        zt[i1,i2,i3,i4] = 0
                        vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]
                        phi[i1,i2,i3,i4] = 0
                        a[i1,i2,i3,i4] = rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + 
                                                        (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4])
                        # compute epsilonRecA
                        epsilonRecA[i1,i2,i3,i4] = (phi[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + 
                                        ((rho[i1,i2,i3,i4] - (phi[i1,i2,i3,i4] * beta[i1,i2,i3,i4])) * 
                                        epsilonRecA[i1,i2,i3,i4])
                        # compute avth
                        avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4])
                else    # refractory period is inactive
                        recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wRec[i1,i2,i3,i4]
                        vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + 
                                                sum(@view(recSignal[:,:,i3,i4]))
                        # compute avth
                        avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4])
                        # fires if membrane potential exceed threshold
                        if vt[i1,i2,i3,i4] > avth[i1,i2,i3,i4]
                                zt[i1,i2,i3,i4] = 1
                                refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4]
                                firingCounter[i1,i2,i3,i4] += 1
                                vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4]
                                a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]) + 1
                        else
                                zt[i1,i2,i3,i4] = 0
                                a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4])
                        end
                        # compute phi, there is a difference from alif formula
                        phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * 
                                max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4]))
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + 
                                                        (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4])
                        # compute epsilonRecA   use eq.26
                        epsilonRecA[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * 
                                                        (phi[i1,i2,i3,i4] * epsilonRecA[i1,i2,i3,i4])) + 
                                                        (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4])
                end
        end
        return nothing
 end
 # gpu launcher
 function onForward(     zit::CuArray,
                        wOut::CuArray,
                        vt::CuArray,
                        vth::CuArray,
                        vRest::CuArray,
                        zt::CuArray,
                        alpha::CuArray,
                        phi::CuArray,
                        epsilonRec::CuArray,
                        refractoryCounter::CuArray,
                        refractoryDuration::CuArray,
                        gammaPd::CuArray,
                        firingCounter::CuArray,
                        recSignal::CuArray,
                        subscription::CuArray,
                        )
    kernel = @cuda launch=false onForward(      zit,
                                                wOut,
                                                vt,
                                                vth,
                                                vRest,
                                                zt,
                                                alpha,
                                                phi,
                                                epsilonRec,
                                                refractoryCounter,
                                                refractoryDuration,
                                                gammaPd,
                                                firingCounter,
                                                recSignal,
                                                subscription,
                                                GeneralUtils.linear_to_cartesian,
                                                )
    config = launch_configuration(kernel.fun)
    # threads to be launched. Since one can't launch exact thread number the kernel needs, 
    # one just launch threads more than this kernel needs then use a guard inside the kernel
    # to prevent unused threads to access memory. 
    threads = min(1024, config.threads)    # depend on gpu. Most NVIDIA gpu has 1024 threads per block
    # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element 
    totalThreads = length(wOut)   
    blocks = cld(totalThreads, threads)
        # println("launching gpu kernel")
    CUDA.@sync begin
        kernel( zit,
                wOut,
                vt,
                vth,
                vRest,
                zt,
                alpha,
                phi,
                epsilonRec,
                refractoryCounter,
                refractoryDuration,
                gammaPd,
                firingCounter,
                recSignal,
                subscription,
                GeneralUtils.linear_to_cartesian;  threads, blocks)
    end
 end
 # gpu kernel
 function onForward(     zit,
                        wOut,
                        vt,
                        vth,
                        vRest,
                        zt,
                        alpha,
                        phi,
                        epsilonRec,
                        refractoryCounter,
                        refractoryDuration,
                        gammaPd,
                        firingCounter,
                        recSignal,
                        subscription,
                        linear_to_cartesian,
                        )
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x   # gpu threads index
        if i <= length(wOut)
                # cartesian index
                i1, i2, i3, i4 = linear_to_cartesian(i, size(wOut))
                # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
                if refractoryCounter[i1,i2,i3,i4] > 0        # refractory period is active
                        refractoryCounter[i1,i2,i3,i4] -= 1
                        recSignal[i1,i2,i3,i4] = 0
                        zt[i1,i2,i3,i4] = 0
                        vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]
                        phi[i1,i2,i3,i4] = 0
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + 
                                                        (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4])
                else    # refractory period is inactive
                        recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wOut[i1,i2,i3,i4]
                        vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4]))
                        # fires if membrane potential exceed threshold
                        if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4]
                                zt[i1,i2,i3,i4] = 1
                                refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4]
                                firingCounter[i1,i2,i3,i4] += 1
                                vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4]
                        else
                                zt[i1,i2,i3,i4] = 0
                        end
                        # compute phi, there is a difference from on formula
                        phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4]))
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + 
                                                        (zit[i1,i2,i3,i4] * subscription[i1,i2,i3,i4])
                end
        end
        return nothing
 end
 function lifForward(kfn_zit::Array{T}, 
                    zit::Array{T},
                    wRec::Array{T}, 
@@ -193,127 +626,6 @@ function lifForward(kfn_zit::Array{T},
    end
 end
 # gpu launcher
 function lifForward(    lif_zit::CuArray,
                        lif_wRec::CuArray,
                        lif_vt::CuArray,
                        lif_vth::CuArray,
                        lif_vRest::CuArray,
                        lif_zt::CuArray,
                        lif_alpha::CuArray,
                        lif_phi::CuArray,
                        lif_epsilonRec::CuArray,
                        lif_refractoryCounter::CuArray,
                        lif_refractoryDuration::CuArray,
                        lif_gammaPd::CuArray,
                        lif_firingCounter::CuArray,
                        lif_recSignal::CuArray,)
    kernel = @cuda launch=false lifForward(     lif_zit,
                                                lif_wRec,
                                                lif_vt,
                                                lif_vth,
                                                lif_vRest,
                                                lif_zt,
                                                lif_alpha,
                                                lif_phi,
                                                lif_epsilonRec,
                                                lif_refractoryCounter,
                                                lif_refractoryDuration,
                                                lif_gammaPd,
                                                lif_firingCounter,
                                                lif_recSignal,
                                                GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)
    # threads to be launched. Since one can't launch exact thread number the kernel needs, 
    # one just launch threads more than this kernel needs then use a guard inside the kernel
    # to prevent unused threads to access memory. 
    threads = min(1024, config.threads)    # depend on gpu. Most NVIDIA gpu has 1024 threads per block
    # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element 
    totalThreads = length(lif_wRec)   
    blocks = cld(totalThreads, threads)
        # println("launching gpu kernel")
    CUDA.@sync begin
        kernel( lif_zit,
                lif_wRec,
                lif_vt,
                lif_vth,
                lif_vRest,
                lif_zt,
                lif_alpha,
                lif_phi,
                lif_epsilonRec,
                lif_refractoryCounter,
                lif_refractoryDuration,
                lif_gammaPd,
                lif_firingCounter,
                lif_recSignal,
                GeneralUtils.linear_to_cartesian;  threads, blocks)
    end
 end
 # gpu kernel
 function lifForward(    zit,
                        wRec,
                        vt,
                        vth,
                        vRest,
                        zt,
                        alpha,
                        phi,
                        epsilonRec,
                        refractoryCounter,
                        refractoryDuration,
                        gammaPd,
                        firingCounter,
                        recSignal,
                        linear_to_cartesian)
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x   # gpu threads index
        if i <= length(wRec)
                # cartesian index
                i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec))
                # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
                refractoryCounter[i1,i2,i3,i4] -= 1
                if refractoryCounter[i1,i2,i3,i4] > 0        # refractory period is active
                        refractoryCounter[i1,i2,i3,i4] -= 1
                        zt[i1,i2,i3,i4] = 0
                        vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]
                        phi[i1,i2,i3,i4] = 0
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4]
                else    # refractory period is inactive
                        recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wRec[i1,i2,i3,i4]
                        vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4]))
                        # fires if membrane potential exceed threshold
                        if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4]
                                zt[i1,i2,i3,i4] = 1
                                refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4]
                                firingCounter[i1,i2,i3,i4] += 1
                                vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4]
                        else
                                zt[i1,i2,i3,i4] = 0
                        end
                        # compute phi, there is a difference from lif formula
                        phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4]))
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4]
                end
        end
        return nothing
 end
 function alifForward(zit::Array{T},
                    wRec::Array{T},
                    vt0::Array{T},
@@ -413,164 +725,6 @@ function alifForward(zit::Array{T},
        end
 end
 # gpu launcher
 function alifForward(   alif_zit::CuArray,
                        alif_wRec::CuArray,
                        alif_vt::CuArray,
                        alif_vth::CuArray,
                        alif_vRest::CuArray,
                        alif_zt::CuArray,
                        alif_alpha::CuArray,
                        alif_phi::CuArray,
                        alif_epsilonRec::CuArray,
                        alif_refractoryCounter::CuArray,
                        alif_refractoryDuration::CuArray,
                        alif_gammaPd::CuArray,
                        alif_firingCounter::CuArray,
                        alif_recSignal::CuArray,
                        alif_epsilonRecA::CuArray,
                        alif_a::CuArray,
                        alif_avth::CuArray,
                        alif_beta::CuArray,
                        alif_rho::CuArray,
                        )
    kernel = @cuda launch=false alifForward(    alif_zit,
                                                alif_wRec, 
                                                alif_vt,
                                                alif_vth,
                                                alif_vRest,
                                                alif_zt,
                                                alif_alpha,
                                                alif_phi,
                                                alif_epsilonRec,
                                                alif_refractoryCounter,
                                                alif_refractoryDuration,
                                                alif_gammaPd,
                                                alif_firingCounter,
                                                alif_recSignal,
                                                alif_epsilonRecA,
                                                alif_a,
                                                alif_avth,
                                                alif_beta,
                                                alif_rho,
                                                GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)
    # threads to be launched. Since one can't launch exact thread number the kernel needs, 
    # one just launch threads more than this kernel needs then use a guard inside the kernel
    # to prevent unused threads to access memory. 
    threads = min(1024, config.threads)    # depend on gpu. Most NVIDIA gpu has 1024 threads per block
    # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element 
    totalThreads = length(alif_wRec)   
    blocks = cld(totalThreads, threads)
        # println("launching gpu kernel")
    CUDA.@sync begin
        kernel( alif_zit,
                alif_wRec, 
                alif_vt,
                alif_vth,
                alif_vRest,
                alif_zt,
                alif_alpha,
                alif_phi,
                alif_epsilonRec,
                alif_refractoryCounter,
                alif_refractoryDuration,
                alif_gammaPd,
                alif_firingCounter,
                alif_recSignal,
                alif_epsilonRecA,
                alif_a,
                alif_avth,
                alif_beta,
                alif_rho,
                GeneralUtils.linear_to_cartesian;  threads, blocks)
    end
 end
 # gpu kernel
 function alifForward(   zit,
                        wRec, 
                        vt,
                        vth,
                        vRest,
                        zt,
                        alpha,
                        phi,
                        epsilonRec,
                        refractoryCounter,
                        refractoryDuration,
                        gammaPd,
                        firingCounter,
                        recSignal,
                        epsilonRecA,
                        a,
                        avth,
                        beta,
                        rho,
                        linear_to_cartesian)
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x   # gpu threads index
        if i <= length(wRec)
                # cartesian index
                i1, i2, i3, i4 = linear_to_cartesian(i, size(wRec))
                # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
                refractoryCounter[i1,i2,i3,i4] -= 1
                if refractoryCounter[i1,i2,i3,i4] > 0        # refractory period is active
                        refractoryCounter[i1,i2,i3,i4] -= 1
                        zt[i1,i2,i3,i4] = 0
                        vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]
                        phi[i1,i2,i3,i4] = 0
                        a[i1,i2,i3,i4] = rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4]
                        # compute epsilonRecA
                        epsilonRecA[i1,i2,i3,i4] = (phi[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + 
                                                ((rho[i1,i2,i3,i4] - (phi[i1,i2,i3,i4] * beta[i1,i2,i3,i4])) * epsilonRecA[i1,i2,i3,i4])
                        # compute avth
                        avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4])
                else    # refractory period is inactive
                        recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wRec[i1,i2,i3,i4]
                        vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4]))
                        # compute avth
                        avth[i1,i2,i3,i4] = vth[i1,i2,i3,i4] + (beta[i1,i2,i3,i4] * a[i1,i2,i3,i4])
                        # fires if membrane potential exceed threshold
                        if vt[i1,i2,i3,i4] > avth[i1,i2,i3,i4]
                                zt[i1,i2,i3,i4] = 1
                                refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4]
                                firingCounter[i1,i2,i3,i4] += 1
                                vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4]
                                a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4]) + 1
                        else
                                zt[i1,i2,i3,i4] = 0
                                a[i1,i2,i3,i4] = (rho[i1,i2,i3,i4] * a[i1,i2,i3,i4])
                        end
                        # compute phi, there is a difference from alif formula
                        phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4]))
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4]
                        # compute epsilonRecA
                        epsilonRecA[i1,i2,i3,i4] = (phi[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + 
                                                ((rho[i1,i2,i3,i4] - (phi[i1,i2,i3,i4] * beta[i1,i2,i3,i4])) * epsilonRecA[i1,i2,i3,i4])
                end
        end
        return nothing
 end
 function onForward(kfn_zit::Array{T},
                    zit::Array{T},
                    wOut::Array{T},
@@ -638,133 +792,6 @@ function onForward(kfn_zit::Array{T},
    end
 end
 # gpu launcher
 function onForward(     on_zit::CuArray,
                        on_wOut::CuArray,
                        on_vt::CuArray,
                        on_vth::CuArray,
                        on_vRest::CuArray,
                        on_zt::CuArray,
                        on_alpha::CuArray,
                        on_phi::CuArray,
                        on_epsilonRec::CuArray,
                        on_refractoryCounter::CuArray,
                        on_refractoryDuration::CuArray,
                        on_gammaPd::CuArray,
                        on_firingCounter::CuArray,
                        on_recSignal::CuArray)
    kernel = @cuda launch=false onForward(      on_zit,
                                                on_wOut,
                                                on_vt,
                                                on_vth,
                                                on_vRest,
                                                on_zt,
                                                on_alpha,
                                                on_phi,
                                                on_epsilonRec,
                                                on_refractoryCounter,
                                                on_refractoryDuration,
                                                on_gammaPd,
                                                on_firingCounter,
                                                on_recSignal,
                                                GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)
    # threads to be launched. Since one can't launch exact thread number the kernel needs, 
    # one just launch threads more than this kernel needs then use a guard inside the kernel
    # to prevent unused threads to access memory. 
    threads = min(1024, config.threads)    # depend on gpu. Most NVIDIA gpu has 1024 threads per block
    # total desired threads to launch to gpu. Usually 1 thread per 1 matrix element 
    totalThreads = length(on_wOut)   
    blocks = cld(totalThreads, threads)
        # println("launching gpu kernel")
    CUDA.@sync begin
        kernel( on_zit,
                on_wOut,
                on_vt,
                on_vth,
                on_vRest,
                on_zt,
                on_alpha,
                on_phi,
                on_epsilonRec,
                on_refractoryCounter,
                on_refractoryDuration,
                on_gammaPd,
                on_firingCounter,
                on_recSignal,
                GeneralUtils.linear_to_cartesian;  threads, blocks)
    end
 end
 # gpu kernel
 function onForward(     zit,
                        wOut,
                        vt,
                        vth,
                        vRest,
                        zt,
                        alpha,
                        phi,
                        epsilonRec,
                        refractoryCounter,
                        refractoryDuration,
                        gammaPd,
                        firingCounter,
                        recSignal,
                        linear_to_cartesian)
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x   # gpu threads index
        if i <= length(wOut)
                # cartesian index
                i1, i2, i3, i4 = linear_to_cartesian(i, size(wOut))
                # @cuprintln("gpu thread $i $i1 $i2 $i3 $i4")
                refractoryCounter[i1,i2,i3,i4] -= 1
                if refractoryCounter[i1,i2,i3,i4] > 0        # refractory period is active
                        refractoryCounter[i1,i2,i3,i4] -= 1
                        zt[i1,i2,i3,i4] = 0
                        vt[i1,i2,i3,i4] = alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]
                        phi[i1,i2,i3,i4] = 0
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4]
                else    # refractory period is inactive
                        recSignal[i1,i2,i3,i4] = zit[i1,i2,i3,i4] * wOut[i1,i2,i3,i4]
                        vt[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * vt[i1,i2,i3,i4]) + sum(@view(recSignal[:,:,i3,i4]))
                        # fires if membrane potential exceed threshold
                        if vt[i1,i2,i3,i4] > vth[i1,i2,i3,i4]
                                zt[i1,i2,i3,i4] = 1
                                refractoryCounter[i1,i2,i3,i4] = refractoryDuration[i1,i2,i3,i4]
                                firingCounter[i1,i2,i3,i4] += 1
                                vt[i1,i2,i3,i4] = vRest[i1,i2,i3,i4]
                        else
                                zt[i1,i2,i3,i4] = 0
                        end
                        # compute phi, there is a difference from on formula
                        phi[i1,i2,i3,i4] = (gammaPd[i1,i2,i3,i4] / vth[i1,i2,i3,i4]) * max(0, 1 - ((vt[i1,i2,i3,i4] - vth[i1,i2,i3,i4]) / vth[i1,i2,i3,i4]))
                        # compute epsilonRec
                        epsilonRec[i1,i2,i3,i4] = (alpha[i1,i2,i3,i4] * epsilonRec[i1,i2,i3,i4]) + zit[i1,i2,i3,i4]
                end
        end
        return nothing
 end
--- a/src/learn.jl
+++ b/src/learn.jl
@@ -9,6 +9,7 @@ using ..type, ..snnUtil
 #------------------------------------------------------------------------------------------------100
 function compute_paramsChange!(kfn::kfn_1, modelError, outputError)
    modelError = reshape(modelError, (1,1,1,:)) # (1,1,1,batch)
    lifComputeParamsChange!(kfn.lif_phi, 
                            kfn.lif_epsilonRec, 
                            kfn.lif_eta,
@@ -18,7 +19,10 @@ function compute_paramsChange!(kfn::kfn_1, modelError, outputError)
                            kfn.on_wOut, 
                            kfn.lif_arrayProjection4d,
                            kfn.lif_error,
-                            modelError)
+                            modelError,
                            kfn.inputSize,
                            )
    alifComputeParamsChange!(kfn.alif_phi, 
                             kfn.alif_epsilonRec, 
@@ -30,7 +34,10 @@ function compute_paramsChange!(kfn::kfn_1, modelError, outputError)
                             kfn.alif_arrayProjection4d,
                             kfn.alif_error,
                             modelError,
-                             kfn.alif_beta)
+
                             kfn.alif_epsilonRecA,
                             kfn.alif_beta,
                             )
    onComputeParamsChange!(kfn.on_phi,
                           kfn.on_epsilonRec,
@@ -38,7 +45,10 @@ function compute_paramsChange!(kfn::kfn_1, modelError, outputError)
                           kfn.on_eRec,
                           kfn.on_wOut,
                           kfn.on_wOutChange,
-                           outputError)
+                           kfn.on_arrayProjection4d,
                           kfn.on_error,
                           outputError,
                           )
    # error("DEBUG ->  kfn compute_paramsChange!    $(Dates.now())")
 end
@@ -51,18 +61,28 @@ function lifComputeParamsChange!(   phi::CuArray,
                                    wOut::CuArray,
                                    arrayProjection4d::CuArray,
                                    nError::CuArray,
-                                    modelError::CuArray)
+                                    modelError::CuArray,
-
+                                    
-    # Bₖⱼ in paper, sum() to get each neuron's total wOut weight
+                                    inputSize::CuArray,
-    wOutSum = sum(wOut, dims=3) .* arrayProjection4d
+                                    )
-
+    # Bₖⱼ in paper, sum() to get each neuron's total wOut weight,
    # use absolute because only magnitude is needed
    wOutSum_all = reshape( abs.(sum(wOut, dims=3)), (1,1,:, size(wOut, 4)) )    # (1,1,allNeuron,batch)
    # get only each lif neuron's wOut, leaving out other neuron's wOut
    startIndex = prod(inputSize) +1
    stopIndex = startIndex + size(wRec, 3)  -1
    wOutSum = @view(wOutSum_all[1,1, startIndex:stopIndex, :])  
    wOutSum = reshape(wOutSum, (1, 1, size(wOutSum, 1), size(wOutSum, 2)))  # (1,1,n,batch)
    # nError a.k.a. learning signal use dopamine concept, 
    # this neuron receive summed error signal (modelError)
-    nError .= (modelError .* arrayProjection4d) .* wOutSum
+    nError .= (modelError .* wOutSum) .* arrayProjection4d
    eRec .= phi .* epsilonRec
-    # GeneralUtils.isNotEqual(wRec, 0) is a subscribe filter use to filter out non-subscribed wRecChange
+    wRecChange .+= ((-1 .* eta) .* nError .* eRec)
-    wRecChange .+= ((-1 .* eta) .* nError .* eRec .* sign.(wRec)) .* GeneralUtils.isNotEqual.(wRec, 0)
+
-    # error("DEBUG ->  lifComputeParamsChange!    $(Dates.now())")
+    # reset epsilonRec
    epsilonRec .= 0
 end
 function alifComputeParamsChange!(  phi::CuArray,
@@ -75,18 +95,29 @@ function alifComputeParamsChange!(  phi::CuArray,
                                    arrayProjection4d::CuArray,
                                    nError::CuArray,
                                    modelError::CuArray,
                                    beta::CuArray)
-    # Bₖⱼ in paper, sum() to get each neuron's total wOut weight
+                                    epsilonRecA::CuArray,
-    wOutSum = sum(wOut, dims=3) .* arrayProjection4d
+                                    beta::CuArray
                                    )
    # Bₖⱼ in paper, sum() to get each neuron's total wOut weight,
    # use absolute because only magnitude is needed
    wOutSum_all = reshape( abs.(sum(wOut, dims=3)), (1,1,:, size(wOut, 4)) )    # (1,1,allNeuron,batch)
    # get only each lif neuron's wOut, leaving out other neuron's wOut
    wOutSum = @view(wOutSum_all[1,1, end-size(wRec, 3)+1:end, :])  
    wOutSum = reshape(wOutSum, (1, 1, size(wOutSum, 1), size(wOutSum, 2)))  # (1,1,n,batch)
    # nError a.k.a. learning signal use dopamine concept, 
    # this neuron receive summed error signal (modelError)
-    nError .= (modelError .* arrayProjection4d) .* wOutSum
+    nError .= (modelError .* wOutSum) .* arrayProjection4d
-    eRec .= (phi .* epsilonRec) .+ (phi .* epsilonRec .* beta)
+    eRec .= phi .* (epsilonRec .- (beta .* epsilonRecA))    # use eq. 25
    wRecChange .+= ((-1 .* eta) .* nError .* eRec)
    # reset epsilonRec
    epsilonRec .= 0
    epsilonRecA .= 0
    # GeneralUtils.isNotEqual(wRec, 0) is a subscribe filter use to filter out non-subscribed wRecChange
    wRecChange .+= ((-1 .* eta) .* nError .* eRec .* sign.(wRec))  .* GeneralUtils.isNotEqual.(wRec, 0)
    # error("DEBUG ->  alifComputeParamsChange!    $(Dates.now())")
 end
@@ -96,15 +127,17 @@ function onComputeParamsChange!(phi::CuArray,
                                eRec::CuArray,
                                wOut::CuArray,
                                wOutChange::CuArray,
                                arrayProjection4d::CuArray,
                                nError::CuArray,
                                outputError::CuArray    # outputError is output neuron's error
                                )   
-    # nError a.k.a. learning signal use dopamine concept, 
+    eRec .= phi .* epsilonRec
-    # this neuron receive summed error signal (modelError)
+    nError .= reshape(outputError, (1, 1, :, size(outputError, 2))) .* arrayProjection4d
-    eRec .= (phi .* epsilonRec) .* reshape(outputError, (1, 1, :, size(epsilonRec, 4)))
+    wOutChange .+= ((-1 .* eta) .* nError .* eRec)
-    
+
-    # GeneralUtils.isNotEqual(wRec, 0) is a subscribe filter use to filter out non-subscribed wRecChange
+    # reset epsilonRec
-    wOutChange .+= ((-1 .* eta) .* eRec .* sign.(wOut)) .* GeneralUtils.isNotEqual.(wOut, 0)
+    epsilonRec .= 0
    # error("DEBUG ->  onComputeParamsChange!    $(Dates.now())")
 end
@@ -224,20 +257,20 @@ end
 function lifLearn!(wRec,
                    wRecChange,
                    arrayProjection4d)
    # merge learning weight with average learning weight
-    wRec .+= (sum(wRecChange) ./ (size(wRec, 4))) .* arrayProjection4d
+    wRec .+= (sum(wRecChange, dims=4) ./ (size(wRec, 4))) .* arrayProjection4d
    #TODO synaptic strength 
    #TODO neuroplasticity
    # error("DEBUG ->  lifLearn!    $(Dates.now())")
 end
 function alifLearn!(wRec,
                    wRecChange,
                    arrayProjection4d)
-    # merge learning weight
+    # merge learning weight with average learning weight
    wRec .+= (sum(wRecChange) ./ (size(wRec, 4))) .* arrayProjection4d
    #TODO synaptic strength 
@@ -249,7 +282,7 @@ end
 function onLearn!(wOut,
                    wOutChange,
                    arrayProjection4d)
-    # merge learning weight
+    # merge learning weight with average learning weight
    wOut .+= (sum(wOutChange) ./ (size(wOut, 4))) .* arrayProjection4d
    # adaptive wOut to help convergence using c_decay
--- a/src/type.jl
+++ b/src/type.jl
@@ -21,6 +21,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn
    timeStep::Union{AbstractArray, Nothing} = nothing
    learningStage::Union{AbstractArray, Nothing} = nothing  # 0 inference, 1 start, 2 during, 3 end learning
    inputSize::Union{AbstractArray, Nothing} = nothing
    zit::Union{AbstractArray, Nothing} = nothing     # 3D activation matrix
    modelError::Union{AbstractArray, Nothing} = nothing     # store RSNN error
    outputError::Union{AbstractArray, Nothing} = nothing    # store output neurons error
@@ -50,6 +51,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn
    lif_gammaPd::Union{AbstractArray, Nothing} = nothing
    lif_wRecChange::Union{AbstractArray, Nothing} = nothing
    lif_error::Union{AbstractArray, Nothing} = nothing
    lif_subscription::Union{AbstractArray, Nothing} = nothing
    lif_firingCounter::Union{AbstractArray, Nothing} = nothing
@@ -85,6 +87,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn
    alif_gammaPd::Union{AbstractArray, Nothing} = nothing
    alif_wRecChange::Union{AbstractArray, Nothing} = nothing
    alif_error::Union{AbstractArray, Nothing} = nothing
    alif_subscription::Union{AbstractArray, Nothing} = nothing
    alif_firingCounter::Union{AbstractArray, Nothing} = nothing
@@ -137,6 +140,7 @@ Base.@kwdef mutable struct kfn_1 <: knowledgeFn
    on_gammaPd::Union{AbstractArray, Nothing} = nothing
    on_wOutChange::Union{AbstractArray, Nothing} = nothing
    on_error::Union{AbstractArray, Nothing} = nothing
    on_subscription::Union{AbstractArray, Nothing} = nothing
    on_firingCounter::Union{AbstractArray, Nothing} = nothing
@@ -162,8 +166,8 @@ function kfn_1(params::Dict; device=cpu)
    # ---------------------------------------------------------------------------- #
    # row*col is a 2D matrix represent all RSNN activation
    row, col, batch = kfn.params[:inputPort][:signal][:numbers] # z-axis represent signal batch number
    # row += kfn.params[:inputPort][:noise][:numbers][1]
    col += kfn.params[:inputPort][:noise][:numbers][2]
    kfn.inputSize = [row, col]                       |> device 
    col += kfn.params[:computeNeuron][:lif][:numbers][2]
    col += kfn.params[:computeNeuron][:alif][:numbers][2]
@@ -208,6 +212,7 @@ function kfn_1(params::Dict; device=cpu)
    kfn.lif_gammaPd = (similar(kfn.lif_wRec) .= 0.3)                   |> device
    kfn.lif_wRecChange = (similar(kfn.lif_wRec) .= 0)                 |> device
    kfn.lif_error = (similar(kfn.lif_wRec) .= 0)               |> device
    kfn.lif_subscription = (GeneralUtils.isNotEqual.(kfn.lif_wRec, 0)) |> device
    kfn.lif_firingCounter = (similar(kfn.lif_wRec) .= 0)               |> device
@@ -254,6 +259,7 @@ function kfn_1(params::Dict; device=cpu)
    kfn.alif_gammaPd = (similar(kfn.alif_wRec) .= 0.3)                   |> device
    kfn.alif_wRecChange = (similar(kfn.alif_wRec) .= 0)                 |> device
    kfn.alif_error = (similar(kfn.alif_wRec) .= 0)                 |> device
    kfn.alif_subscription = (GeneralUtils.isNotEqual.(kfn.alif_wRec, 0))     |> device
    kfn.alif_firingCounter = (similar(kfn.alif_wRec) .= 0)               |> device
@@ -286,9 +292,13 @@ function kfn_1(params::Dict; device=cpu)
    # subscription
    w = zeros(row, col, n)
    synapticConnectionPercent = kfn.params[:outputPort][:params][:synapticConnectionPercent]
-    synapticConnection = Int(floor(row*col * synapticConnectionPercent/100))
+    subable = size(kfn.lif_wRec, 3) + size(kfn.alif_wRec, 3) # sub to lif, alif only
-    for slice in eachslice(w, dims=3)
+    synapticConnection = Int(floor(subable * synapticConnectionPercent/100))
-        pool = shuffle!([1:row*col...])[1:synapticConnection]
+    for slice in eachslice(w, dims=3)   # each slice is a neuron
        startInd = row*col - subable + 1    # e.g. 100(row*col) - 50(subable) = 50 -> startInd = 51
        # pool must contain only lif, alif neurons
        pool = shuffle!([startInd:row*col...])[1:synapticConnection]
        for i in pool
            slice[i] = randn()/10   # assign weight to synaptic connection
        end
@@ -313,6 +323,7 @@ function kfn_1(params::Dict; device=cpu)
    kfn.on_gammaPd = (similar(kfn.on_wOut) .= 0.3)                   |> device
    kfn.on_wOutChange = (similar(kfn.on_wOut) .= 0)                 |> device
    kfn.on_error = (similar(kfn.on_wOut) .= 0)                     |> device
    kfn.on_subscription = (GeneralUtils.isNotEqual.(kfn.on_wOut, 0)) |> device
    kfn.on_firingCounter = (similar(kfn.on_wOut) .= 0)               |> device