module interface
  
# export

using CUDA

# ------------------------------ 100 characters ------------------------------ #

"""
    cartesianAssign!(a::CuArray, b::CuArray)

Host-side entry point: launch `cartesianAssign!` as a GPU kernel via `@cuda`, passing `a`
and `b` as the kernel arguments.

The device-side method that `@cuda` compiles for this call is not defined in this part of
the file, so what the kernel actually does with `a` and `b` cannot be stated from here.

NOTE(review): the name and the trailing `!` suggest that `a` is overwritten from `b` using
cartesian indices -- confirm against the kernel method.

NOTE(review): no `threads`/`blocks` are given to `@cuda`, so CUDA.jl's default launch
configuration applies (a single thread in a single block, per the CUDA.jl docs) -- verify
that this is intended.

# Arguments
- `a::CuArray`: first kernel argument (presumably the destination -- TODO confirm).
- `b::CuArray`: second kernel argument (presumably the source -- TODO confirm).

# Returns
Whatever the `@cuda` launch expression evaluates to.
"""
cartesianAssign!(a::CuArray, b::CuArray) = @cuda cartesianAssign!(a, b)


"""
    matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray; resultStorage=nothing)

Launch the `matMul_3Dto3D_manyTo1batch_gpu!` kernel on `a` and `b`, wait for it to finish,
and return the array the kernel wrote into.

`a` is treated as a stack of batches along its 3rd dimension and `b` as a single batch; one
GPU thread is launched per batch of `a`.

NOTE(review): the arithmetic itself lives in the kernel, which is not visible here. The
previous description of this function ("GPU version of batchMatEleMul") suggests a batched
element-wise matrix multiplication -- confirm against the kernel.

# Arguments
- `a::CuArray`: batched input; `size(a, 3)` is the number of batches.
- `b::CuArray`: single-batch operand; its first three dimensions become the first three
  dimensions of the result.

# Keywords
- `resultStorage::Union{CuArray, Nothing}=nothing`: preallocated output array. When
  `nothing`, a new device array with element type `eltype(b)` and size
  `(size(b, 1), size(b, 2), size(b, 3), size(a, 3))` is allocated.

# Returns
The output array (`resultStorage` itself when one was supplied). If `a` has no batches the
output is returned without launching the kernel.

# Example
    julia> using CUDA
    julia> input = CUDA.rand(32, 32, 128);      # 128 batches
    julia> weight = CUDA.rand(32, 32, 1024);    # 1 batch, conceptually (32, 32, 1024, 1)
    julia> r = matMul_3Dto3D_manyTo1batch(input, weight);
    julia> size(r)
    (32, 32, 1024, 128)
"""
function matMul_3Dto3D_manyTo1batch(a::CuArray, b::CuArray; 
                        resultStorage::Union{CuArray, Nothing}=nothing)
    if resultStorage === nothing
        # `similar` on a CuArray already allocates on the device, so no extra host-to-device
        # transfer step is needed (the module only brings CUDA into scope).
        resultStorage = similar(a, eltype(b), size(b, 1), size(b, 2), size(b, 3), size(a, 3))
    end

    # Total threads wanted: this kernel uses 1 thread per batch of `a`.
    totalThreads = size(a, 3)

    # Nothing to compute for an empty batch; launching zero blocks is an invalid configuration.
    if totalThreads == 0
        return resultStorage
    end

    # NOTE(review): `matMul_3Dto3D_manyTo1batch_gpu!` and `GeneralUtils` are not defined or
    # imported in the visible part of this module -- confirm they are in scope at load time.
    # Compile without launching so the occupancy API can suggest a launch configuration.
    kernel = @cuda launch=false matMul_3Dto3D_manyTo1batch_gpu!(
        a, b, resultStorage, GeneralUtils.linear_to_cartesian)
    config = launch_configuration(kernel.fun)

    # Threads per block: capped at 256 and at what the device suggests for this kernel.
    threads = min(256, config.threads)

    # One cannot launch exactly the number of threads needed, so round the block count up.
    # The kernel is expected to guard against the surplus threads touching memory.
    blocks = cld(totalThreads, threads)

    CUDA.@sync begin
        kernel(a, b, resultStorage, GeneralUtils.linear_to_cartesian; threads, blocks)
    end
    return resultStorage
end


end # module