separate GeneralUtils cpu and gpu code

This commit is contained in:
narawat lamaiin
2024-03-14 20:19:32 +07:00
parent 091aeccf8e
commit 251ec12a93
4 changed files with 635 additions and 0 deletions

68
src/interface.jl Normal file
View File

@@ -0,0 +1,68 @@
module interface
# export
using CUDA
# ------------------------------ 100 characters ------------------------------ #

"""
    cartesianAssign!(a::CuArray, b::CuArray)

Launch a one-thread GPU kernel that performs a cartesian assignment from `b` into `a`.

NOTE(review): inside an `@cuda` launch the arguments are converted to `CuDeviceArray`s,
so the kernel body dispatches to a *device-side* method of `cartesianAssign!` defined
elsewhere (presumably in `GeneralUtils`) — this `CuArray` method is only the host-side
launcher. Confirm that device method is in scope wherever this module is loaded.
"""
cartesianAssign!(a::CuArray, b::CuArray) = @cuda cartesianAssign!(a, b)

"""
    matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray;
                               resultStorage::Union{CuArray, Nothing}=nothing)

GPU version of `batchMatEleMul`: multiply every batch slice of `a` against the single
(implicit) batch of `b`, producing a 4-D result of size
`(size(b, 1), size(b, 2), size(b, 3), size(a, 3))`.

# Arguments
- `a`: 3-D input, one batch per slice along dimension 3.
- `b`: 3-D weight, treated as a single batch (i.e. essentially 4-D with trailing dim 1).

# Keywords
- `resultStorage`: optional pre-allocated output `CuArray`; when `nothing`, a fresh
  device array with `eltype(b)` is allocated.

# Returns
The (possibly newly allocated) 4-D `resultStorage` `CuArray`.

# Examples
```julia
julia> using Flux, CUDA
julia> device = Flux.CUDA.functional() ? gpu : cpu
julia> if device == gpu CUDA.device!(0) end
julia> input = rand(32, 32, 128) |> gpu;   # 128-batches
julia> weight = rand(32, 32, 1024) |> gpu; # 1-batch. essentially (32, 32, 1024, 1)
julia> r = matMul_3Dto3D_manyTo1batch(input, weight);
julia> size(r)
(32, 32, 1024, 128)
```
"""
function matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray;
                                    resultStorage::Union{CuArray, Nothing}=nothing)
    if resultStorage === nothing
        # BUG FIX: the original piped through `|> gpu`, but `gpu` (a Flux export) is
        # never imported here, which raises `UndefVarError` on this path. `similar`
        # on a `CuArray` already allocates on the device, so no pipe is needed.
        resultStorage = similar(a, eltype(b), size(b, 1), size(b, 2), size(b, 3), size(a, 3))
    end
    # Compile without launching so we can query an occupancy-based launch configuration.
    kernel = @cuda launch=false matMul_3Dto3D_manyTo1batch_gpu!(a, b, resultStorage, GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)
    # Threads to be launched. Since one can't launch the exact thread count the kernel
    # needs, we launch at least that many and rely on a guard inside the kernel to stop
    # unused threads from touching memory.
    threads = min(256, config.threads) # threads per block; most NVIDIA GPUs allow 1024
    # Total desired threads: this kernel uses 1 thread per batch slice of `a`.
    totalThreads = size(a, 3)
    blocks = cld(totalThreads, threads)
    CUDA.@sync begin
        kernel(a, b, resultStorage, GeneralUtils.linear_to_cartesian; threads, blocks)
    end
    return resultStorage
end
end # module