# NOTE(review): scraped file metadata ("85 lines, 2.4 KiB, Julia") removed — it was
# extraction residue, not part of the source.
module interface
# export
using CUDA
# ------------------------------ 100 characters ------------------------------ #
""" Search wine in stock.
|
|
|
|
Arguments\n
|
|
a : one of ChatAgent's agent.
|
|
|
|
Return\n
|
|
A JSON string of available wine
|
|
|
|
Example\n
|
|
```jldoctest
|
|
julia> using ChatAgent, CommUtils
|
|
julia> agent = ChatAgent.agentReflex("Jene")
|
|
julia> input = "{\"food\": \"pizza\", \"occasion\": \"anniversary\"}"
|
|
julia> result = winestock(agent, input)
|
|
"{"wine 1": {\"Winery\": \"Pichon Baron\", \"wine name\": \"Pauillac (Grand Cru Classé)\", \"grape variety\": \"Cabernet Sauvignon\", \"year\": 2010, \"price\": \"125 USD\", \"stock ID\": \"ar-17\"}, }"
|
|
```
|
|
"""
|
|
cartesianAssign!(a::CuArray, b::CuArray) = @cuda cartesianAssign!(a, b)
""" GPU version of batchMatEleMul
|
|
|
|
Example
|
|
julia> using Flux, CUDA
|
|
julia> device = Flux.CUDA.functional() ? gpu : cpu
|
|
julia> if device == gpu CUDA.device!(0) end
|
|
julia> input = rand(32, 32, 128) |> gpu; # 128-batches
|
|
julia> weight = rand(32, 32, 1024) |> gpu; # 1-batch. this matrix is essentially (32, 32, 1024, 1)
|
|
julia> r = matMul_3Dto3D_manyTo1batch(input, weight);
|
|
julia> size(r)
|
|
(32, 32, 1024, 128)
|
|
"""
|
|
function matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray;
|
|
resultStorage::Union{CuArray, Nothing}=nothing)
|
|
if resultStorage === nothing
|
|
resultStorage = similar(a, eltype(b), size(b, 1), size(b, 2), size(b, 3), size(a, 3)) |> gpu
|
|
end
|
|
|
|
kernel = @cuda launch=false matMul_3Dto3D_manyTo1batch_gpu!(a, b, resultStorage, GeneralUtils.linear_to_cartesian)
|
|
config = launch_configuration(kernel.fun)
|
|
|
|
# threads to be launched. Since one can't launch exact thread number the kernel needs,
|
|
# one just launch threads more than this kernel needs then use a guard inside the kernel
|
|
# to prevent unused threads to access memory.
|
|
threads = min(256, config.threads) # threads per block. depend on gpu. Most NVIDIA gpu has 1024 threads per block
|
|
|
|
# total desired threads to launch to gpu. Usually 1 thread per 1 matrix element
|
|
totalThreads = size(a, 3) # This kernel use 1 thread per batch
|
|
|
|
blocks = cld(totalThreads, threads)
|
|
|
|
CUDA.@sync begin
|
|
kernel(a, b, resultStorage, GeneralUtils.linear_to_cartesian; threads, blocks)
|
|
end
|
|
return resultStorage
|
|
end
end # module |