module interface

# export

using CUDA

# ------------------------------ 100 characters ------------------------------ #

"""
    cartesianAssign!(a::CuArray, b::CuArray)

Launch `cartesianAssign!` as a CUDA kernel on the device arrays `a` and `b` via `@cuda`
and return whatever the `@cuda` launch returns.

# Arguments
- `a::CuArray`: first array handed to the kernel.
- `b::CuArray`: second array handed to the kernel.

NOTE(review): this host method names itself as the kernel, and no device-side method of
`cartesianAssign!` is visible in this module. Presumably one is defined elsewhere;
confirm before relying on this entry point.
"""
cartesianAssign!(a::CuArray, b::CuArray) = @cuda cartesianAssign!(a, b)

"""
    matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray; resultStorage=nothing)

GPU version of `batchMatEleMul`: run the `matMul_3Dto3D_manyTo1batch_gpu!` kernel over
every batch (third dimension) of `a` against the single-batch array `b`.

# Arguments
- `a::CuArray`: batched input; `size(a, 3)` is the number of batches.
- `b::CuArray`: single-batch operand, e.g. a `(32, 32, 1024)` array that is conceptually
  `(32, 32, 1024, 1)`.

# Keywords
- `resultStorage::Union{CuArray, Nothing}=nothing`: preallocated output. When `nothing`,
  an array of size `(size(b, 1), size(b, 2), size(b, 3), size(a, 3))` with element type
  `eltype(b)` is allocated on the device. A caller-supplied array is passed to the kernel
  as-is; its shape is not validated here.

# Returns
The array the kernel wrote into (`resultStorage` itself when one was supplied).

# Examples
```julia
julia> using CUDA

julia> input = CUDA.rand(Float32, 32, 32, 128);    # 128 batches

julia> weight = CUDA.rand(Float32, 32, 32, 1024);  # 1 batch, essentially (32, 32, 1024, 1)

julia> r = matMul_3Dto3D_manyTo1batch(input, weight);

julia> size(r)
(32, 32, 1024, 128)
```
"""
function matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray;
        resultStorage::Union{CuArray, Nothing}=nothing)
    # `similar` on a CuArray already allocates on the device, so no extra device transfer
    # is needed. A separate local keeps the keyword argument from being reassigned.
    out = if resultStorage === nothing
        similar(a, eltype(b), size(b, 1), size(b, 2), size(b, 3), size(a, 3))
    else
        resultStorage
    end

    # This kernel uses 1 thread per batch.
    totalThreads = size(a, 3)

    # Nothing to compute for an empty batch, and a launch with zero blocks is not a
    # valid kernel configuration.
    totalThreads == 0 && return out

    # NOTE(review): `matMul_3Dto3D_manyTo1batch_gpu!` and `GeneralUtils` are not defined in
    # the visible part of this module; presumably they are brought into scope elsewhere.
    kernel = @cuda launch=false matMul_3Dto3D_manyTo1batch_gpu!(
        a, b, out, GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)

    # One cannot launch exactly the number of threads the kernel needs, so launch at least
    # that many and rely on a guard inside the kernel to keep surplus threads from
    # touching memory.
    threads = min(256, config.threads)  # threads per block; the upper limit depends on the GPU
    blocks = cld(totalThreads, threads)

    CUDA.@sync begin
        kernel(a, b, out, GeneralUtils.linear_to_cartesian; threads, blocks)
    end

    return out
end

end # module