separate GeneralUtils cpu and gpu code

This commit is contained in:
narawat lamaiin
2024-03-14 20:19:32 +07:00
parent 091aeccf8e
commit 251ec12a93
4 changed files with 635 additions and 0 deletions

68
src/interface.jl Normal file
View File

@@ -0,0 +1,68 @@
module interface
# export
using CUDA
# ------------------------------ 100 characters ------------------------------ #

"""
    cartesianAssign!(a::CuArray, b::CuArray)

Launch a one-thread GPU kernel that performs a cartesian assignment from `b` into `a`.

NOTE(review): inside an `@cuda` launch the arguments are converted to `CuDeviceArray`s,
so the kernel body dispatches to a *device-side* method of `cartesianAssign!` defined
elsewhere (presumably in `GeneralUtils`) — this `CuArray` method is only the host-side
launcher. Confirm that device method is in scope wherever this module is loaded.
"""
cartesianAssign!(a::CuArray, b::CuArray) = @cuda cartesianAssign!(a, b)

"""
    matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray;
                               resultStorage::Union{CuArray, Nothing}=nothing)

GPU version of `batchMatEleMul`: multiply every batch slice of `a` against the single
(implicit) batch of `b`, producing a 4-D result of size
`(size(b, 1), size(b, 2), size(b, 3), size(a, 3))`.

# Arguments
- `a`: 3-D input, one batch per slice along dimension 3.
- `b`: 3-D weight, treated as a single batch (i.e. essentially 4-D with trailing dim 1).

# Keywords
- `resultStorage`: optional pre-allocated output `CuArray`; when `nothing`, a fresh
  device array with `eltype(b)` is allocated.

# Returns
The (possibly newly allocated) 4-D `resultStorage` `CuArray`.

# Examples
```julia
julia> using Flux, CUDA
julia> device = Flux.CUDA.functional() ? gpu : cpu
julia> if device == gpu CUDA.device!(0) end
julia> input = rand(32, 32, 128) |> gpu;   # 128-batches
julia> weight = rand(32, 32, 1024) |> gpu; # 1-batch. essentially (32, 32, 1024, 1)
julia> r = matMul_3Dto3D_manyTo1batch(input, weight);
julia> size(r)
(32, 32, 1024, 128)
```
"""
function matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray;
                                    resultStorage::Union{CuArray, Nothing}=nothing)
    if resultStorage === nothing
        # BUG FIX: the original piped through `|> gpu`, but `gpu` (a Flux export) is
        # never imported here, which raises `UndefVarError` on this path. `similar`
        # on a `CuArray` already allocates on the device, so no pipe is needed.
        resultStorage = similar(a, eltype(b), size(b, 1), size(b, 2), size(b, 3), size(a, 3))
    end
    # Compile without launching so we can query an occupancy-based launch configuration.
    kernel = @cuda launch=false matMul_3Dto3D_manyTo1batch_gpu!(a, b, resultStorage, GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)
    # Threads to be launched. Since one can't launch the exact thread count the kernel
    # needs, we launch at least that many and rely on a guard inside the kernel to stop
    # unused threads from touching memory.
    threads = min(256, config.threads) # threads per block; most NVIDIA GPUs allow 1024
    # Total desired threads: this kernel uses 1 thread per batch slice of `a`.
    totalThreads = size(a, 3)
    blocks = cld(totalThreads, threads)
    CUDA.@sync begin
        kernel(a, b, resultStorage, GeneralUtils.linear_to_cartesian; threads, blocks)
    end
    return resultStorage
end
end # module