"""
    formatLLMtext_qwen3(name, text; assistantStarter=false)

Wrap a single chat message in `<|im_start|>` / `<|im_end|>` turn markers.

# Arguments
    - `name::AbstractString`
        role written right after `<|im_start|>`, e.g. "system", "user"
    - `text::AbstractString`
        body of the message

# Keyword Arguments
    - `assistantStarter::Bool`
        when true, an opening `<|im_start|>assistant` line is appended so the
        model continues writing as the assistant

# Return
    - `formattedtext::String`
        the message wrapped in turn markers
"""
function formatLLMtext_qwen3(name::AbstractString, text::AbstractString;
    assistantStarter::Bool=false)

    # The "system" role used to be special-cased, but both branches produced the
    # very same template, so a single template covers every role.
    # NOTE(review): the line breaks inside the template were reconstructed from a
    # whitespace-collapsed copy of the patch -- confirm against the repository.
    formattedtext = "<|im_start|>$name\n$text\n<|im_end|>\n"

    if assistantStarter
        formattedtext *= "<|im_start|>assistant\n"
    end

    return formattedtext
end
# ---------------------------------------------------------------------------- #
#                               src/llmUtil.jl                                  #
# ---------------------------------------------------------------------------- #

"""
    deFormatLLMtext_qwen3(text; includethink=false)

Strip the reasoning section from a qwen3 response and return the plain answer.

# Arguments
    - `text::String`
        raw response text

# Keyword Arguments
    - `includethink::Bool`
        when true, the reasoning is kept and the result has the form
        "ModelThought: <reasoning> <answer>"

# Return
    - `String`
        `text` unchanged when it holds no complete reasoning section, otherwise
        the text that follows the closing tag (optionally prefixed as above)
"""
function deFormatLLMtext_qwen3(text::String; includethink::Bool=false)::Union{Nothing, String}
    # NOTE(review): in the reviewed copy of this patch the delimiter literals were
    # empty strings (markup apparently stripped), which makes every `occursin`
    # true and `split(text, "")[2]` return the second character. They are restored
    # here as the qwen3 reasoning tags -- confirm against the repository.
    opentag = "<think>"
    closetag = "</think>"

    # No reasoning section: the text is already the plain answer.
    occursin(opentag, text) || return text

    # An opening tag without its closing tag (e.g. a truncated generation) used
    # to index past the end of the split result; hand the text back instead.
    pieces = split(text, closetag)
    length(pieces) >= 2 || return text
    str = string(pieces[2])

    includethink || return str

    r = GeneralUtils.extractTextBetweenString(text, opentag, closetag)
    think = r[:success] ? r[:text] : nothing
    return "ModelThought: $think $str"
end


# ---------------------------------------------------------------------------- #
#                                 src/util.jl                                   #
# ---------------------------------------------------------------------------- #

"""
    extractTextBetweenString(text, startstr, endstr)

Return the text enclosed by `startstr` and `endstr`. Each delimiter must occur
exactly once in `text`.

# Arguments
    - `text::AbstractString`
        text to search
    - `startstr::AbstractString`
        opening delimiter
    - `endstr::AbstractString`
        closing delimiter

# Return
    - `NamedTuple` with the fields `success`, `error`, `errorcode`, `result`, `text`
        On success `errorcode` is 0, `error` is `nothing`, and both `text` and
        `result` hold the extracted string. On failure `text` and `result` are
        `nothing`, `error` holds a message and `errorcode` is one of:
        1 = start string missing, 2 = start string occurs more than once,
        3 = end string occurs more than once, 4 = end string missing.
"""
function extractTextBetweenString(text::AbstractString, startstr::AbstractString,
    endstr::AbstractString)

    # Every outcome carries the same fields, so callers may read `.text` or
    # `.result` without checking `success` first.
    fail(code, msg) =
        (success=false, error=msg, errorcode=code, result=nothing, text=nothing)

    # n occurrences of a delimiter split the text into n + 1 pieces
    startpieces = split(text, startstr)
    if length(startpieces) > 2
        return fail(2, "There are more than one occurrences of the start string " *
            "'$startstr' in the text. Text must has only one start string")
    elseif length(startpieces) == 1
        return fail(1, "There are no start string '$startstr' in the text. " *
            "Text must has only one start string")
    end

    endpieces = split(text, endstr)
    if length(endpieces) > 2
        return fail(3, "There are more than one occurrences of the end string " *
            "'$endstr' in the text. Text must has only one end string")
    elseif length(endpieces) == 1
        # This branch used to test the start-string pieces, so it could never
        # fire and a missing end string was reported as a success.
        return fail(4, "There are no end string '$endstr' in the text. " *
            "Text must has only one end string")
    end

    s = string(split(startpieces[2], endstr)[1])

    return (success=true, error=nothing, errorcode=0, result=s, text=s)
end