update

2025-07-17 11:48:16 +07:00
parent c5f3fda2ba
commit b3e8df7287
2 changed files with 189 additions and 247 deletions
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -6,7 +6,7 @@ export noNegative!, randomWithProb, randomChoiceWithProb, findIndex, limitvalue,
        matMul_3Dto4D_batchwise, isNotEqual, linearToCartesian, vectorMax, findMax,
        multiply_last, multiplyRandomElements, replaceElements, replaceElements!, isBetween,
        isLess, allTrue, getStringBetweenCharacters, JSON3read_stringKey, mkDictPath!, 
-        getDictPath
+        getDictPath, detectKeywordVariation, textToDict

 using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames, CSV
 using ..util, ..communication
@@ -1150,9 +1150,193 @@ end



+"""
+    detectKeywordVariation(keywords::AbstractVector, text::String) -> Dict{String, Union{Array, Nothing}}
+
+Collect, for every keyword in `keywords`, the case variations of that keyword found in `text`.
+Each keyword is looked up on its own through the single-keyword method
+`detectKeywordVariation(keyword::String, text::String)`.
+
+# Arguments
+- `keywords::AbstractVector` Keywords to search for (each element is expected to be a `String`)
+- `text::String` The text to search in
+
+# Returns
+- `Dict{String, Union{Array, Nothing}}` Maps each keyword to the array of variations found in
+  `text`, or to `nothing` when no variation of that keyword was found
+
+# Examples
+```julia
+julia> detectKeywordVariation(["test", "example", "cat"], "This is a Test EXAMPLE")
+Dict{String, Union{Nothing, Array}}("test" => ["Test"], "example" => ["EXAMPLE"], "cat" => nothing)
+```
+"""
+function detectKeywordVariation(keywords::T, text::String)::Dict{String, Union{Array, Nothing}} where {T<:AbstractVector}
+  kw = Dict{String, Union{Array, Nothing}}()
+
+  for keyword in keywords
+    # Plain call: broadcasting over two scalar strings (`f.(keyword, text)`) added nothing.
+    found = detectKeywordVariation(keyword, text)
+    # "Not found" may arrive as `nothing` or as an empty array; normalise both to `nothing` so
+    # the dictionary matches the documented contract.
+    kw[keyword] = (found === nothing || isempty(found)) ? nothing : found
+  end
+  return kw
+end


+"""
+    detectKeywordVariation(keyword::String, text::String) -> Union{Nothing, Array{String}}

+Detect whole-word occurrences of `keyword` in `text`, in the spelling given plus three case
+variations: first letter uppercased, all uppercase, and all lowercase.
+
+`text` is split on single spaces; each piece is stripped of surrounding whitespace and of one
+trailing `.` and then looked up with `findIndex`, so a keyword that is only part of a longer
+word is not reported (NOTE(review): assumes `findIndex` matches whole elements - confirm).
+
+# Arguments:
+- `keyword::String` The keyword to search for
+- `text::String` The text to search in
+
+# Returns:
+- `Union{Nothing, Array{String}}` The matched variations, one entry per occurrence (so repeated
+  words yield repeated entries), or `nothing` when no variation occurs in `text`
+
+# Examples:
+```julia
+julia> detectKeywordVariation("test", "This is a Test case. Test")
+["Test", "Test"]
+
+julia> detectKeywordVariation("error", "NO ERRORS FOUND")   # only part of a longer word
+nothing
+
+julia> detectKeywordVariation("missing", "complete data")
+nothing
+```
+"""
+function detectKeywordVariation(keyword::String, text::String)::Union{Nothing, Array{String}}
+  # `unique` keeps the caller's spelling first and drops repeats (e.g. for "a", `uppercasefirst`
+  # and `uppercase` both give "A"), which would otherwise count every occurrence twice.
+  keyword_variations = unique([
+    keyword, uppercasefirst(keyword), uppercase(keyword), lowercase(keyword)
+  ])
+
+  # Tokenise on single spaces and drop one trailing '.' from each token.
+  splittext = String[]
+  for piece in split(text, " ")
+    word = string(strip(piece))
+    if endswith(word, ".")
+      # `chop` steps back one character; `word[1:end-1]` throws after a multi-byte character.
+      push!(splittext, string(chop(word)))
+    else
+      push!(splittext, word)
+    end
+  end
+
+  result = String[]
+  for variation in keyword_variations
+    r = findIndex(splittext, variation)
+    # NOTE(review): r[2] is taken to hold the matching positions; one result entry per match.
+    append!(result, fill(variation, length(r[2])))
+  end
+
+  # Honour the declared contract: "not found" is `nothing`, not an empty array.
+  return isempty(result) ? nothing : result
+end
+
+
+"""
+    textToDict(text, detectKeywords; dictKey=nothing, symbolkey=false, lowercasekey=false) -> OrderedDict
+
+Convert text into a dictionary by slicing it at the given keywords, i.e. the text is read as
+KW1|kw1_text|KW2|kw2_text|KW3|kw3_text. Any text to the left of the first keyword is discarded.
+WARNING: ordering is important - list `detectKeywords` in the order they occur in `text`.
+
+# Arguments
+  - `text::String`
+    A text to be converted.
+  - `detectKeywords::Vector{String}`
+    Keywords used to slice the text; case variations (agent, Agent, AGENT) are detected too.
+# Keyword Arguments
+  - `dictKey::Union{Vector{String}, Nothing}`
+    Keys of the resulting dict, one per keyword, same order. `nothing` reuses the keywords.
+  - `symbolkey::Bool`
+    If true, resulting dict's key will be Symbols, otherwise string.
+  - `lowercasekey::Bool`
+    set resulting dict's key to be lowercase
+
+# Return
+  - `d::OrderedDict`
+    `OrderedDict{Symbol, Any}` if `symbolkey`, else `OrderedDict{String, Any}`, in text order.
+
+# Throws
+  - `ArgumentError` if `dictKey` is given but its length differs from `detectKeywords`.
+  - `ErrorException` if a keyword cannot be located in `text`.
+
+# Example
+```julia
+julia> text = "TODAY thought: what to do plan: wake up and going out action: 1. wake up 2. eat 3. sleep"
+julia> resultdict = textToDict(text, ["thought:", "plan:", "action:"];
+                               dictKey=["thought", "plan", "action"], symbolkey=true)
+julia> println(resultdict)
+OrderedDict{Symbol, Any}(:thought => "what to do", :plan => "wake up and going out", :action => "1. wake up 2. eat 3. sleep")
+```
+"""
+function textToDict(text::String, detectKeywords::Vector{String};
+    dictKey::Union{Vector{String}, Nothing}=nothing,
+    symbolkey::Bool=false, lowercasekey::Bool=false
+    )::OrderedDict
+
+  # `dictKey` defaults to `nothing`; calling `reverse(nothing)` used to throw a MethodError.
+  keynames = dictKey === nothing ? detectKeywords : dictKey
+  length(keynames) == length(detectKeywords) || throw(ArgumentError(
+    "dictKey has $(length(keynames)) entries but detectKeywords has $(length(detectKeywords))"))
+
+  # For every keyword collect the distinct spellings that actually occur in the text.
+  kw = Vector{Vector{String}}()
+  for keyword in detectKeywords
+    detected = detectKeywordVariation(keyword, text)
+    # "not found" may be reported as `nothing` or as an empty array
+    if detected === nothing || isempty(detected)
+      error("Keyword $keyword not found in text: $text")
+    end
+    push!(kw, unique(detected))
+  end
+
+  K = symbolkey ? Symbol : String
+  od1 = OrderedDict{K, Any}()
+  remainingtext = text
+
+  # process text from back to front
+  for (variations, dKey) in zip(reverse(kw), reverse(keynames))
+    # Among the spellings of this keyword, take the occurrence closest to the end of the text.
+    keywordidx = nothing
+    for variation in variations
+      idx = findlast(variation, remainingtext)
+      if idx !== nothing && (keywordidx === nothing || first(idx) > first(keywordidx))
+        keywordidx = idx
+      end
+    end
+    if keywordidx === nothing
+      error("""keyword "$(first(variations))" not found in the provided text: $text </end of error note>""")
+    end
+
+    # nextind/prevind keep the slice bounds on character boundaries; byte arithmetic such as
+    # `idx[end]+1` or `idx[1]-1` throws StringIndexError next to multi-byte characters.
+    substr = remainingtext[nextind(remainingtext, last(keywordidx)):end]
+    _key = lowercasekey ? lowercase(dKey) : dKey
+    od1[symbolkey ? Symbol(_key) : _key] = string(strip(substr))
+    remainingtext = remainingtext[1:prevind(remainingtext, first(keywordidx))]
+  end
+
+  # correct the order: od1 was filled back to front
+  od2 = OrderedDict{K, Any}()
+  for k in reverse(collect(keys(od1)))
+    od2[k] = od1[k]
+  end
+  return od2
+end



--- a/src/util.jl
+++ b/src/util.jl
@@ -1,12 +1,12 @@
 module util
  
 export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys,
-        findMatchingDictKey, textToDict, randstring, randstrings, timeout,
+        findMatchingDictKey, randstring, randstrings, timeout,
        dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString, 
-        dfToString, dataframe_to_json_list, dictToString, dictToString_noKey, 
+        dfToString, dataframe_to_json_list, dictToString, dictToString_noKey, issomething,
        dictToString_numbering, extract_triple_backtick_text, 
-        countGivenWords, remove_french_accents, detect_keyword, extractTextBetweenCharacter,
-        extractTextBetweenString, 
+        countGivenWords, remove_french_accents, 
+        extractTextBetweenCharacter, extractTextBetweenString, 
        convertCamelSnakeKebabCase, fitrange, recentElementsIndex, nonRecentElementsIndex

 using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames
@@ -244,102 +244,6 @@ function replaceDictKeys(d::Dict, replacementMap::Dict)::Dict
 end


-""" Convert text into a dictionary with a given keywords. This function use keywords to slice
-  a given text into the following format: KW1|kw1_text|KW2|kw2_text|KW3|kw3_text.
-  The left most string which has no keyword will be discarded. WARNING, ordering is important
-
-# Arguments
-  - `text::String`
-    A text to be converted.
-  - `keywords::Vector{String}`
-    A list of keywords to be used to slice the text. 
-    These keywords also be the resulting dict keys.
-# Keyword Arguments
-  - `rightmarker::String`
-    A maker used to make a word to be unique. Ex, A keyword "plan" with rightmarker ":", 
-    the function will search for "plan:" otherwise the function will search for "plan".
-    The marker will not be in the resulting dict keys.
-  - `symbolkey::Bool`
-    If true, resulting dict's key will be Symbols, otherwise string.
-  - `lowercasekey::Bool`
-    set resulting dict's key to be lowercase
-
-# Return
-  - `d::OrderedDict`
-
-# Example
-```jldoctest
-julia> text = "TODAY thought: what to do plan: wake up and going out action: 1. wake up 2. eat 3. sleep"
-julia> sample_keywords = ["thought", "plan", "action"]
-julia> resultdict = GeneralUtils.textToDict(text, sample_keywords; rightmarker=":", symbolkey=true)
-julia> println(resultdict)
-OrderedCollections.OrderedDict{Any, Any}(:thought => "what to do", 
-                                        :plan => "wake up and going out", 
-                                        :action => "1. wake up 2. eat 3. sleep")
-```
-
-# Signature
-"""
-function textToDict(text::String, detectKeywords::Vector{String}; 
-    dictKey::Union{Vector{String}, Nothing}=nothing, 
-    symbolkey::Bool=false, lowercasekey::Bool=false
-    )::OrderedDict
-
-  # make sure this function detect variation of a work e.g. agent, Agent, AGENT
-  kw = []
-  # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
-  for keyword in detectKeywords
-    detected = detect_keyword(keyword, text)
-    if detected !== nothing
-      push!(kw, detected)
-    else
-      error("Keyword $keyword not found in text.")
-    end
-  end
-
-  od1, od2 =
-  if symbolkey
-    OrderedDict{Symbol, Any}(), OrderedDict{Symbol, Any}()
-  else
-    OrderedDict{String, Any}(), OrderedDict{String, Any}()
-  end
-
-  remainingtext = text
-  dictKey_ = reverse(dictKey)
-  
-  # process text from back to front
-  rkw = reverse(kw)
-  for (i,keyword) in enumerate(rkw)
-    # Find the position of the keyword in the text
-    keywordidx = findlast(keyword, remainingtext)
-    dKey = dictKey_[i]
-    
-    if keywordidx !== nothing
-      substr = remainingtext[keywordidx[end]+1:end]
-      str = string(strip(substr))  # Removes both leading and trailing whitespace.
-      _key = lowercasekey == true ? lowercase(dKey) : dKey
-      key = symbolkey == true ? Symbol(_key) : _key
-      od1[key] = str
-      remainingtext = remainingtext[1:keywordidx[1]-1]
-    else
-      error("""keyword "$keyword" not found in the provided text: $text </end of error note>""")
-    end
-  end
-
-  # correct the order
-  ks = reverse([i for i in keys(od1)])
-  for k in ks
-    k = symbolkey == true ? Symbol(k) : k
-    od2[k] = od1[k]
-  end
-  
-  return od2
-end
-
-
-
-
-
 """ Generate a random string

 # Arguments
@@ -784,152 +688,6 @@ function cuttext(range, text)
  end
 end

-"""
-    detect_keyword(keywords::AbstractVector{String}, text::String; mode::Union{String, Nothing}=nothing, delimiter::AbstractVector=[' ', '\n', '.']) -> Dict{String, Integer}
-
-Detects and counts occurrences of multiple keywords in the text in different case variations (lowercase, uppercase first letter, or all uppercase).
-
-# Arguments
- `keywords::AbstractVector{String}` Vector of keywords to search for
- `text::String` The text to search in
-
-# Keyword Arguments
- `mode::Union{String, Nothing}` When set to "individual", only counts matches that are individual words (default: nothing)
- `delimiter::AbstractVector` Characters used to determine word boundaries when mode="individual" (default: [' ', '\n', '.'])
-
-# Returns
- `Dict{String, Integer}` Returns a dictionary mapping each keyword to its count in the text (0 if not found)
-
-# Examples
-  ```jldoctest
-  julia> detect_keyword(["test", "example"], "This is a Test EXAMPLE")
-  Dict{String, Integer}("test" => 1, "example" => 1)
-
-  julia> detect_keyword(["cat"], "cats and category", mode="individual")
-  Dict{String, Integer}("cat" => 0)
-
-  julia> detect_keyword(["error"], "No ERRORS found!")
-  Dict{String, Integer}("error" => 1)
-  ```
-  
-# Signature
-"""
-# function detect_keyword(keywords::T1, text::String; 
-#   mode::Union{String, Nothing}=nothing, delimiter::T2=[' ', '\n', '.']
-#   )::Dict{String, Integer} where {T1<:AbstractVector, T2<:AbstractVector}
-#   # Initialize dictionary to store keyword counts
-#   kwdict = Dict{String, Integer}()
-#   for i in keywords
-#     kwdict[i] = 0
-#   end
-  
-#   startindex = 1
-#   # Iterate through each keyword and search for matches in text
-#   for kw in keywords
-#     # Check each possible starting position in the text
-#     for startindex in 1:1:length(text)
-#       # Get the window range for current keyword at current position
-#       wordwindows = wordwindow(kw, startindex)
-#       # Extract the text slice for comparison
-#       cuttexts = cuttext(wordwindows, text)
-#       if cuttexts !== nothing
-#         # Try to detect keyword in current text slice
-#         detected_kw = detect_keyword(kw, cuttexts)
-#         if detected_kw !== nothing && mode === nothing
-#           # Increment count if keyword found and no mode restrictions
-#           kwdict[kw] +=1
-#         elseif detected_kw !== nothing && mode === "individual"
-#           # For individual word mode, check word boundaries
-#           # Check if character before keyword is a delimiter or start of text
-#           checkbefore = 
-#             if wordwindows.start > 1 && 
-#               text[wordwindows.start-1] ∈ delimiter
-#               true
-#             elseif wordwindows.start == 1
-#               true
-#             else
-#               false
-#             end
-
-#           # Check if character after keyword is a delimiter or end of text
-#           checkafter =
-#             if wordwindows.stop < length(text) && 
-#               text[wordwindows.stop+1] ∈ delimiter
-#               true
-#             elseif wordwindows.stop == length(text)
-#               true
-#             else
-#               false
-#             end
-#           # Only count keyword if it's a complete word
-#           if checkbefore && checkafter
-#             kwdict[kw] +=1
-#           end
-#         end
-#       end
-#     end
-#   end
-#   return kwdict
-# end
-
-
-function detect_keyword(keywords::T, text::String)::Dict{String, Integer} where {T<:AbstractVector}
-  kw = Dict{String, Integer}()
-  splittext = string.(split(text, " "))
-  # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
-  for keyword in keywords
-    ws = detect_keyword.(keyword, splittext)
-    total = sum(issomething.(ws))
-    if total != 0
-      kw[keyword] = total
-    else
-      kw[keyword] = 0
-    end
-  end
-  return kw
-end
-
-
-"""
-    detect_keyword(keyword::String, text::String) -> Union{Nothing, String}
-
-Detects if a keyword exists in the text in different case variations (lowercase, uppercase first letter, or all uppercase).
-
-# Arguments:
- `keyword::String` The keyword to search for
- `text::String` The text to search in
-
-# Returns:
- `Union{Nothing, String}` Returns the matched keyword variation if found, otherwise returns nothing
-
-# Examples:
-  ```jldoctest
-  julia> detect_keyword("test", "This is a Test case")
-  "Test"
-
-  julia> detect_keyword("error", "NO ERRORS FOUND")
-  "ERRORS"
-
-  julia> detect_keyword("missing", "complete data")
-  nothing
-  ```
-
-# Signature
-"""
-function detect_keyword(keyword::String, text::String)::Union{Nothing, String}
-  # Define the keyword variations to search for
-  keyword_variations = [keyword, uppercasefirst(keyword), uppercase(keyword), lowercase(keyword)]
-  
-  # Check if any of the keyword variations are in the text
-  for variation in keyword_variations
-      if occursin(variation, text)
-          return variation
-      end
-  end
-  
-  # Return nothing if no variation is found
-  return nothing
-end


 """