adding jsontable

This commit is contained in:
2026-03-08 13:11:53 +07:00
parent 0ef8dd61a8
commit 89a72cf8a9
5 changed files with 604 additions and 3685 deletions

View File

@@ -31,7 +31,15 @@
# [(dataname1, data1, type1), (dataname2, data2, type2), ...]
# ```
#
# Supported types: "text", "dictionary", "table", "image", "audio", "video", "binary"
# Supported types: "text", "dictionary", "arrowtable", "jsontable", "image", "audio", "video", "binary"
#
# Table Datatypes:
# - `arrowtable`: Apache Arrow IPC format for efficient binary serialization
# - Input: DataFrame, Arrow.Table
# - Encoding: arrow-ipc
# - `jsontable`: JSON format for human-readable tabular data
# - Input: Vector{NamedTuple}, Vector{Dict} (column-oriented compatible), or DataFrame (converted row by row to Vector{Dict} before encoding)
# - Encoding: json
module NATSBridge
@@ -51,7 +59,7 @@ It supports both direct transport (base64-encoded data) and link transport (URL-
# Arguments:
- `id::String` - Unique identifier for this payload (e.g., "uuid4")
- `dataname::String` - Name of the payload (e.g., "login_image")
- `payload_type::String` - Payload type: "text", "dictionary", "table", "image", "audio", "video", "binary"
- `payload_type::String` - Payload type: "text", "dictionary", "arrowtable", "jsontable", "image", "audio", "video", "binary"
- `transport::String` - Transport method: "direct" or "link"
- `encoding::String` - Encoding method: "none", "json", "base64", "arrow-ipc"
- `size::Integer` - Size of the payload in bytes (e.g., 15433)
@@ -100,7 +108,7 @@ payload = msg_payload_v1(
struct msg_payload_v1
id::String # id of this payload e.g. "uuid4"
dataname::String # name of this payload e.g. "login_image"
payload_type::String # this payload type. Can be "text", "dictionary", "table", "image", "audio", "video", "binary"
payload_type::String # this payload type. Can be "text", "dictionary", "arrowtable", "jsontable", "image", "audio", "video", "binary"
transport::String # transport method: "direct" or "link"
encoding::String # encoding method: "none", "json", "base64", "arrow-ipc"
size::Integer # data size in bytes e.g. 15433
@@ -363,7 +371,7 @@ Each payload can have a different type, enabling mixed-content messages (e.g., c
- `data::AbstractArray{Tuple{String, Any, String}}` - List of (dataname, data, type) tuples to send
- `dataname::String` - Name of the payload
- `data::Any` - The actual data to send
- `payload_type::String` - Payload type: "text", "dictionary", "table", "image", "audio", "video", "binary"
- `payload_type::String` - Payload type: "text", "dictionary", "arrowtable", "jsontable", "image", "audio", "video", "binary"
- No standalone `type` parameter - type is specified per payload
# Keyword Arguments:
@@ -399,11 +407,15 @@ env, msg_json = smartsend("my.subject", [("dataname1", data, "dictionary")])
# Send multiple payloads in one message with different types
data1 = Dict("key1" => "value1")
data2 = rand(10_000) # Small array
env, msg_json = smartsend("my.subject", [("dataname1", data1, "dictionary"), ("dataname2", data2, "table")])
env, msg_json = smartsend("my.subject", [("dataname1", data1, "dictionary"), ("dataname2", data2, "arrowtable")])
# Send a large array using fileserver upload
data = rand(10_000_000) # ~80 MB
env, msg_json = smartsend("large.data", [("large_table", data, "table")])
env, msg_json = smartsend("large.data", [("large_arrow_table", data, "arrowtable")])
# Send jsontable (JSON format)
rows = [Dict("id" => 1, "name" => "Alice"), Dict("id" => 2, "name" => "Bob")]
env, msg_json = smartsend("json.data", [("users", rows, "jsontable")])
# Mixed content (e.g., chat with text and image)
env, msg_json = smartsend("chat.subject", [
@@ -424,13 +436,12 @@ function smartsend(
fileserver_upload_handler::Function = plik_oneshot_upload, # a function to handle uploading data to specific HTTP fileserver
size_threshold::Int = DEFAULT_SIZE_THRESHOLD,
#=
Generate a globally unique identifier (UUID) at the start of the request.
This ID must remain constant and immutable as it propagates through every
stage of the execution pipeline. It serves as the end-to-end ID for
distributed tracing, enabling the correlation of all logs, metrics, and
errors across the system back to this specific request instance.
=#
# Generate a globally unique identifier (UUID) at the start of the request.
# This ID must remain constant and immutable as it propagates through every
# stage of the execution pipeline. It serves as the end-to-end ID for
# distributed tracing, enabling the correlation of all logs, metrics, and
# errors across the system back to this specific request instance.
correlation_id::String = string(uuid4()),
msg_purpose::String = "chat",
@@ -463,6 +474,14 @@ function smartsend(
payload_b64 = Base64.base64encode(payload_bytes) # Encode bytes as base64 string
log_trace(correlation_id, "Using direct transport for $payload_size bytes") # Log transport choice
# Determine encoding based on payload_type
encoding = "base64"
if payload_type == "jsontable"
encoding = "json"
elseif payload_type == "arrowtable"
encoding = "arrow-ipc"
end
# Create msg_payload_v1 for direct transport
payload = msg_payload_v1(
payload_b64,
@@ -470,7 +489,7 @@ function smartsend(
id = string(uuid4()),
dataname = dataname,
transport = "direct",
encoding = "base64",
encoding = encoding,
size = payload_size,
metadata = Dict{String, Any}("payload_bytes" => payload_size)
)
@@ -481,7 +500,7 @@ function smartsend(
# Upload to HTTP server
response = fileserver_upload_handler(fileserver_url, dataname, payload_bytes)
if response["status"] != 200 # Check if upload was successful
error("Failed to upload data to fileserver: $(response["status"])") # Throw error if upload failed
end
@@ -489,6 +508,14 @@ function smartsend(
url = response["url"] # URL for the uploaded data
log_trace(correlation_id, "Uploaded to URL: $url") # Log successful upload
# Determine encoding based on payload_type
encoding = "none"
if payload_type == "jsontable"
encoding = "json"
elseif payload_type == "arrowtable"
encoding = "arrow-ipc"
end
# Create msg_payload_v1 for link transport
payload = msg_payload_v1(
url,
@@ -496,7 +523,7 @@ function smartsend(
id = string(uuid4()),
dataname = dataname,
transport = "link",
encoding = "none",
encoding = encoding,
size = payload_size,
metadata = Dict{String, Any}()
)
@@ -543,12 +570,13 @@ It supports multiple serialization formats for different data types.
2. Converts data to binary representation according to format rules
3. For text: converts string to UTF-8 bytes
4. For dictionary: serializes as JSON then converts to bytes
5. For table: uses Arrow.jl to write as IPC stream
6. For image/audio/video/binary: returns binary data directly
5. For arrowtable: uses Arrow.jl to write as IPC stream
6. For jsontable: converts to JSON then to bytes
7. For image/audio/video/binary: returns binary data directly
# Arguments:
- `data::Any` - Data to serialize (string for `"text"`, JSON-serializable for `"dictionary"`, table-like for `"table"`, binary for `"image"`, `"audio"`, `"video"`, `"binary"`)
- `payload_type::String` - Target format: "text", "dictionary", "table", "image", "audio", "video", "binary"
- `data::Any` - Data to serialize (string for `"text"`, JSON-serializable for `"dictionary"`, table-like for `"arrowtable"`, Vector{NamedTuple}/Vector{Dict} for `"jsontable"`, binary for `"image"`, `"audio"`, `"video"`, `"binary"`)
- `payload_type::String` - Target format: "text", "dictionary", "arrowtable", "jsontable", "image", "audio", "video", "binary"
# Return:
- `Vector{UInt8}` - Binary representation of the serialized data
@@ -569,9 +597,13 @@ text_bytes = _serialize_data(text_data, "text")
json_data = Dict("name" => "Alice", "age" => 30)
json_bytes = _serialize_data(json_data, "dictionary")
# Table serialization with a DataFrame (recommended for tabular data)
# Arrow table serialization with a DataFrame (recommended for tabular data)
df = DataFrame(id = 1:3, name = ["Alice", "Bob", "Charlie"], score = [95, 88, 92])
table_bytes = _serialize_data(df, "table")
arrow_bytes = _serialize_data(df, "arrowtable")
# JSON table serialization - Vector{NamedTuple} or Vector{Dict}
rows = [Dict("id" => 1, "name" => "Alice"), Dict("id" => 2, "name" => "Bob")]
json_bytes = _serialize_data(rows, "jsontable")
# Image data (Vector{UInt8})
image_bytes = UInt8[1, 2, 3] # Image bytes
@@ -622,10 +654,30 @@ function _serialize_data(data::Any, payload_type::String)
json_str = JSON.json(data) # Convert Julia data to JSON string
json_str_bytes = Vector{UInt8}(json_str) # Convert JSON string to bytes
return json_str_bytes
elseif payload_type == "table" # Table data - convert to Arrow IPC stream
elseif payload_type == "arrowtable" # Arrow table data - convert to Arrow IPC stream
io = IOBuffer() # Create in-memory buffer
Arrow.write(io, data) # Write data as Arrow IPC stream to buffer
return take!(io) # Return the buffer contents as bytes
elseif payload_type == "jsontable" # JSON table data - convert to JSON
# data can be Vector{NamedTuple}, Vector{Dict}, or DataFrame
# If DataFrame, convert to Vector{Dict} first
if isa(data, DataFrame)
# Convert DataFrame to Vector{Dict} (row-oriented)
rows = []
for i in 1:nrow(data)
row_dict = Dict()
for col in names(data)
row_dict[String(col)] = data[i, col]
end
push!(rows, row_dict)
end
json_str = JSON.json(rows)
return Vector{UInt8}(json_str)
else
# Already Vector{NamedTuple} or Vector{Dict}
json_str = JSON.json(data)
return Vector{UInt8}(json_str)
end
elseif payload_type == "image" # Image data - treat as binary
if isa(data, Vector{UInt8})
return data # Return binary data directly
@@ -881,24 +933,25 @@ end
""" _deserialize_data - Deserialize bytes to data based on type
This internal function converts serialized bytes back to Julia data based on type.
It handles "text" (string), "dictionary" (JSON deserialization), "table" (Arrow IPC deserialization),
"image" (binary data), "audio" (binary data), "video" (binary data), and "binary" (binary data).
It handles "text" (string), "dictionary" (JSON deserialization), "arrowtable" (Arrow IPC deserialization),
"jsontable" (JSON deserialization), "image" (binary data), "audio" (binary data), "video" (binary data), and "binary" (binary data).
# Function Workflow:
1. Validates the data type against supported formats
2. Converts bytes to appropriate Julia data type based on format
3. For text: converts bytes to string
4. For dictionary: converts bytes to JSON string then parses to Julia object
5. For table: reads Arrow IPC format and returns DataFrame
6. For image/audio/video/binary: returns bytes directly
5. For arrowtable: reads Arrow IPC format and returns Arrow.Table
6. For jsontable: converts bytes to JSON string then parses to Vector{Dict}
7. For image/audio/video/binary: returns bytes directly
# Arguments:
- `data::Vector{UInt8}` - Serialized data as bytes
- `payload_type::String` - Data type ("text", "dictionary", "table", "image", "audio", "video", "binary")
- `payload_type::String` - Data type ("text", "dictionary", "arrowtable", "jsontable", "image", "audio", "video", "binary")
- `correlation_id::String` - Correlation ID for logging
# Return:
- Deserialized data (String for "text", DataFrame for "table", JSON data for "dictionary", bytes for "image", "audio", "video", "binary")
- Deserialized data (String for "text", Arrow.Table for "arrowtable", Vector{Dict} for "jsontable", JSON data for "dictionary", bytes for "image", "audio", "video", "binary")
# Throws:
- `Error` if `payload_type` is not one of the supported types
@@ -913,9 +966,13 @@ text_data = _deserialize_data(text_bytes, "text", "correlation123")
json_bytes = UInt8[123, 34, 110, 97, 109, 101, 34, 58, 34, 65, 108, 105, 99, 101, 125] # {"name":"Alice"}
json_data = _deserialize_data(json_bytes, "dictionary", "correlation123")
# Arrow IPC data (table)
# Arrow IPC data (arrowtable)
arrow_bytes = Vector{UInt8}([1, 2, 3]) # Arrow IPC bytes
table_data = _deserialize_data(arrow_bytes, "table", "correlation123")
arrow_table = _deserialize_data(arrow_bytes, "arrowtable", "correlation123")
# JSON table data (jsontable)
json_table_bytes = UInt8[91, 123, 34, 105, 100, 34, 58, 49, 44, 34, 110, 97, 109, 101, 34, 58, 34, 65, 108, 105, 99, 101, 34, 125] # [{"id":1,"name":"Alice"}]
json_table = _deserialize_data(json_table_bytes, "jsontable", "correlation123")
```
"""
function _deserialize_data(
@@ -928,10 +985,13 @@ function _deserialize_data(
elseif payload_type == "dictionary" # JSON data - deserialize
json_str = String(data) # Convert bytes to string
return JSON.parse(json_str) # Parse JSON string to JSON object
elseif payload_type == "table" # Table data - deserialize Arrow IPC stream
elseif payload_type == "arrowtable" # Arrow table data - deserialize Arrow IPC stream
io = IOBuffer(data) # Create buffer from bytes
df = Arrow.Table(io) # Read Arrow IPC format from buffer
return df # Return DataFrame
table = Arrow.Table(io) # Read Arrow IPC format from buffer
return table # Return Arrow.Table
elseif payload_type == "jsontable" # JSON table data - deserialize JSON
json_str = String(data) # Convert bytes to string
return JSON.parse(json_str) # Parse JSON string to Vector{Dict}
elseif payload_type == "image" # Image data - return binary
return data # Return bytes directly
elseif payload_type == "audio" # Audio data - return binary
@@ -945,6 +1005,16 @@ function _deserialize_data(
end
end
"""
    rows_to_columns_dict(rows)

Convert row-oriented records (a vector of dictionaries) into a column-oriented
dictionary mapping each key to the vector of that key's values, one entry per row.

Every key that appears in *any* row becomes a column. A row that lacks a key
contributes `missing` at its position, so every column has `length(rows)` entries.

# Arguments:
- `rows::AbstractVector{<:AbstractDict{K}}` - Row records sharing the key type `K`
  (e.g. `Vector{Dict{Symbol,Any}}`)

# Return:
- A `Dict` keyed by `K` whose values are the column vectors. Each column's element
  type is narrowed to the values actually present in it.
- An empty `Dict{K,Vector{Any}}` when `rows` is empty.

# Example
```julia
rows = [Dict{Symbol,Any}(:id => 1), Dict{Symbol,Any}(:id => 2, :name => "Bob")]
cols = rows_to_columns_dict(rows)
# cols[:id]   == [1, 2]
# cols[:name] is [missing, "Bob"]
```
"""
function rows_to_columns_dict(rows::AbstractVector{<:AbstractDict{K}}) where {K}
    # Nothing to transpose: keep the documented empty result type.
    isempty(rows) && return Dict{K,Vector{Any}}()

    # Gather keys from every row, not only the first one, so a key that first
    # shows up in a later row is not silently dropped.
    column_keys = Set{K}()
    for row in rows
        union!(column_keys, keys(row))
    end

    # Build the column-oriented dictionary; absent cells become `missing`.
    return Dict(
        key => [get(row, key, missing) for row in rows]
        for key in column_keys
    )
end
""" plik_oneshot_upload - Upload a single file to a plik server using one-shot mode
This function uploads a raw byte array to a plik server in one-shot mode (no upload session).
@@ -970,19 +1040,19 @@ retrieves an upload ID and token, then uploads the file data as multipart form d
- `"url"` - Full URL to download the uploaded file
# Example
```jldoctest
using HTTP, JSON
```jldoctest
using HTTP, JSON
fileserver_url = "http://localhost:8080"
dataname = "test.txt"
data = Vector{UInt8}("hello world")
fileserver_url = "http://localhost:8080"
dataname = "test.txt"
data = Vector{UInt8}("hello world")
# Upload to local plik server
result = plik_oneshot_upload(fileserver_url, dataname, data)
# Upload to local plik server
result = plik_oneshot_upload(fileserver_url, dataname, data)
# Access the result as a Dict
# result["status"], result["uploadid"], result["fileid"], result["url"]
```
# Access the result as a Dict
# result["status"], result["uploadid"], result["fileid"], result["url"]
```
"""
function plik_oneshot_upload(file_server_url::String, dataname::String, data::Vector{UInt8})
@@ -1106,18 +1176,4 @@ end
end # module