add extractTextBetweenString

2025-04-30 12:59:14 +07:00
parent 5108ad1f6b
commit 150ddac2c0
2 changed files with 81 additions and 1 deletions
--- a/src/llmUtil.jl
+++ b/src/llmUtil.jl
@@ -98,6 +98,34 @@ function formatLLMtext_qwen(name::T, text::T;
 end


+""" Format a single chat message using the Qwen3 (ChatML style) prompt template.
+
+# Arguments
+  - `name::T`: role of the speaker, e.g. "system", "user" or "assistant"
+  - `text::T`: message content
+
+# Keyword Arguments
+  - `assistantStarter::Bool=false`: when true, the opening tag of an assistant turn
+    is appended after the message so the model continues as the assistant
+
+# Return
+  - the formatted message as a `String`
+"""
+function formatLLMtext_qwen3(name::T, text::T;
+  assistantStarter::Bool=false) where {T<:AbstractString}
+  # every role, "system" included, is rendered with the same layout
+  message = string("<|im_start|>", name, "\n", text, "\n", "<|im_end|>", "\n")
+
+  # without a starter the message is returned as is
+  assistantStarter || return message
+
+  return string(message, "<|im_start|>assistant", "\n")
+end
+
+
 function formatLLMtext_phi4(name::T, text::T;
  assistantStarter::Bool=false) where {T<:AbstractString}
  formattedtext = 
@@ -186,6 +214,8 @@ function formatLLMtext(messages::Vector{Dict{Symbol, T}}, formatname::String
      # not define yet
    elseif formatname == "qwen"
      formatLLMtext_qwen
+    elseif formatname == "qwen3"
+      formatLLMtext_qwen3
    elseif formatname == "phi4"
      formatLLMtext_phi4
    elseif formatname == "granite3"
@@ -230,6 +260,8 @@ function deFormatLLMtext(text::String, formatname::String
  f = 
    if formatname == "granite3"
      deFormatLLMtext_granite3
+    elseif formatname == "qwen3"
+      deFormatLLMtext_qwen3
    else
      error("$formatname template not define yet")
    end
@@ -261,7 +293,7 @@ julia> normalText = YiemAgent.deFormatLLMtext(response, "granite3")
 """
 function deFormatLLMtext_granite3(text::String)::Union{Nothing, String}
  # check if '{' and '}' are in the text because it's a special format for the LLM response
-  if contains(text, '{') && contains(text, '}')
+  if contains(text, "<|im_start|>assistant")
    # get the text between '{' and '}'
    text_between_braces = GeneralUtils.extractTextBetweenCharacter(text, '{', '}')[1]
    return text_between_braces
@@ -274,6 +306,30 @@ function deFormatLLMtext_granite3(text::String)::Union{Nothing, String}
 end


+""" Strip the `<think>...</think>` reasoning section from a Qwen3 response.
+
+# Arguments
+  - `text::String`: raw text produced by the model
+
+# Keyword Arguments
+  - `includethink::Bool=false`: when true and the reasoning section could be
+    extracted, the result is "ModelThought: <reasoning> <answer>"
+
+# Return
+  - `text` unchanged when it has no `<think>` tag, or when the `<think>` tag is never
+    closed by `</think>`
+  - otherwise the text that follows `</think>`, prefixed with the reasoning when
+    `includethink` is true and the reasoning could be extracted
+"""
+function deFormatLLMtext_qwen3(text::String; includethink::Bool=false)::Union{Nothing, String}
+  # nothing to strip when there is no reasoning section
+  occursin("<think>", text) || return text
+
+  pieces = split(text, "</think>")
+  # an unclosed <think> (e.g. truncated generation) has no answer part to pick;
+  # hand the text back untouched instead of failing with a BoundsError
+  length(pieces) < 2 && return text
+
+  answer = string(pieces[2])
+  includethink || return answer
+
+  r = GeneralUtils.extractTextBetweenString(text, "<think>", "</think>")
+  # if the reasoning cannot be extracted, return the answer alone rather than
+  # interpolating `nothing` into the output
+  r[:success] || return answer
+
+  return "ModelThought: $(r[:text]) $answer"
+end
+
+
 """ Attemp to correct LLM response's incorrect JSON response.

 # Arguments
--- a/src/util.jl
+++ b/src/util.jl
@@ -6,6 +6,7 @@ export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, rep
        dfToString, dataframe_to_json_list, dictToString, dictToString_noKey, 
        dictToString_numbering, extract_triple_backtick_text, 
        countGivenWords, remove_french_accents, detect_keyword, extractTextBetweenCharacter,
+        extractTextBetweenString, 
        convertCamelSnakeKebabCase, fitrange, recentElementsIndex, nonRecentElementsIndex

 using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames
@@ -1070,6 +1071,29 @@ function extractTextBetweenCharacter(text::String, startchar::Char, endchar::Cha
 end


+""" Extract the text located between `startstr` and `endstr` in `text`.
+
+`text` must contain exactly one `startstr` and exactly one `endstr`, and `endstr`
+must be located after `startstr`.
+
+# Arguments
+  - `text::String`: text to search
+  - `startstr::String`: opening delimiter
+  - `endstr::String`: closing delimiter
+
+# Return
+  - a NamedTuple with the fields `success`, `error`, `errorcode`, `result` and `text`.
+    On success `errorcode` is 0 and both `text` and `result` hold the extracted
+    string. On failure both are `nothing`, `error` holds a message and `errorcode` is
+    1 (no start string), 2 (several start strings), 3 (several end strings),
+    4 (no end string) or 5 (end string is not located after the start string).
+"""
+function extractTextBetweenString(text::String, startstr::String, endstr::String)
+  # all failures are built here so every result exposes the same fields
+  failure(msg, code) = (success=false, error=msg, errorcode=code, result=nothing, text=nothing)
+
+  # split returns n + 1 pieces for n occurrences of the delimiter
+  startparts = split(text, startstr)
+  if length(startparts) > 2
+    return failure("There are more than one occurrences of the start string '$startstr' in the text. Text must has only one start string", 2)
+  elseif length(startparts) == 1
+    return failure("There are no start string '$startstr' in the text. Text must has only one start string", 1)
+  end
+
+  endparts = split(text, endstr)
+  if length(endparts) > 2
+    return failure("There are more than one occurrences of the end string '$endstr' in the text. Text must has only one end string", 3)
+  elseif length(endparts) == 1
+    return failure("There are no end string '$endstr' in the text. Text must has only one end string", 4)
+  end
+
+  # the single end string has to sit in the part that follows the start string
+  inner = split(startparts[2], endstr)
+  if length(inner) != 2
+    return failure("The end string '$endstr' must be located after the start string '$startstr' in the text", 5)
+  end
+
+  s = string(inner[1])
+
+  return (success=true, error=nothing, errorcode=0, result=s, text=s)
+end
+
+
 """
 Determines if the given string follows camel case naming convention.