From b3e8df728714f4d0eb4ab9829634a9885b5cc7b6 Mon Sep 17 00:00:00 2001 From: narawat lamaiin Date: Thu, 17 Jul 2025 11:48:16 +0700 Subject: [PATCH] update --- src/interface.jl | 186 ++++++++++++++++++++++++++++++++++- src/util.jl | 250 +---------------------------------------------- 2 files changed, 189 insertions(+), 247 deletions(-) diff --git a/src/interface.jl b/src/interface.jl index 80df83e..37ff735 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -6,7 +6,7 @@ export noNegative!, randomWithProb, randomChoiceWithProb, findIndex, limitvalue, matMul_3Dto4D_batchwise, isNotEqual, linearToCartesian, vectorMax, findMax, multiply_last, multiplyRandomElements, replaceElements, replaceElements!, isBetween, isLess, allTrue, getStringBetweenCharacters, JSON3read_stringKey, mkDictPath!, - getDictPath + getDictPath, detectKeywordVariation, textToDict using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames, CSV using ..util, ..communication @@ -1150,9 +1150,193 @@ end +""" + detectKeywordVariation(keywords::AbstractVector{String}, text::String) -> Dict{String, Union{Array, Nothing}} + +Detects and collects all case-variant occurrences of multiple keywords in the text. +This function processes each keyword individually and returns an array of matched variations for each keyword. + +# Arguments +- `keywords::AbstractVector{String}` Vector of keywords to search for +- `text::String` The text to search in + +# Returns +- `Dict{String, Array}` Returns a dictionary mapping each keyword to an array of matched variations found in the text + +# Examples + ```jldoctest + julia> detectKeywordVariation(["test", "example", "cat"], "This is a Test EXAMPLE") + Dict{String, Array}("test" => ["Test"], "example" => ["EXAMPLE"], "cat" => nothing) +""" +function detectKeywordVariation(keywords::T, text::String)::Dict{String, Union{Array, Nothing}} where {T<:AbstractVector} + kw = Dict{String, Union{Array, Nothing}}() + + # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list + for keyword in keywords + ws = detectKeywordVariation.(keyword, text) + total = sum(issomething.(ws)) + if total != 0 + kw[keyword] = ws + else + kw[keyword] = nothing + end + end + return kw +end +""" + detectKeywordVariation(keyword::String, text::String) -> Union{Nothing, Array{String}} +Detects if a keyword exists in the text in different case variations (lowercase, uppercase first letter, or all uppercase). + +# Arguments: +- `keyword::String` The keyword to search for +- `text::String` The text to search in + +# Returns: +- `Union{Nothing, Array{String}}` Returns an array of matched keyword variations if found, otherwise returns nothing + +# Examples: + ```jldoctest + julia> detectKeywordVariation("test", "This is a Test case") + ["Test"] + + julia> detectKeywordVariation("error", "NO ERRORS FOUND") + ["ERRORS"] + + julia> detectKeywordVariation("missing", "complete data") + nothing +""" +function detectKeywordVariation(keyword::String, text::String)::Union{Nothing, Array{String}} + # Define the keyword variations to search for + wordVariations = [uppercasefirst(keyword), uppercase(keyword), lowercase(keyword)] + # wordVariations may duplicate keyword + keyword_variations = [keyword] + for i in wordVariations + i != keyword ? push!(keyword_variations, i) : nothing + end + + _splittext = string.(strip.(split(text, " "))) + splittext = String[] + # remove . after a word + for i in _splittext + if length(i) != 0 && i[end] ∈ ['.'] + word = string(i[1:end-1]) + push!(splittext, word) + else + push!(splittext, i) + end + end + + result = String[] + for variation in keyword_variations + # if length of both word is equals then it is a whole word otherwise it is part of part of other word + r = findIndex(splittext, variation) + + if isempty(r[2]) + # skip + else + # if variation > 1 add them all so this function detect duplicate keyword + variations = [variation for i in eachindex(r[2])] + result = vcat(result, variations) + end + end + return result +end + + +""" Convert text into a dictionary with a given keywords. This function use keywords to slice + a given text into the following format: KW1|kw1_text|KW2|kw2_text|KW3|kw3_text. + The left most string which has no keyword will be discarded. WARNING, ordering is important + +# Arguments + - `text::String` + A text to be converted. + - `keywords::Vector{String}` + A list of keywords to be used to slice the text. + These keywords also be the resulting dict keys. +# Keyword Arguments + - `rightmarker::String` + A maker used to make a word to be unique. Ex, A keyword "plan" with rightmarker ":", + the function will search for "plan:" otherwise the function will search for "plan". + The marker will not be in the resulting dict keys. + - `symbolkey::Bool` + If true, resulting dict's key will be Symbols, otherwise string. + - `lowercasekey::Bool` + set resulting dict's key to be lowercase + +# Return + - `d::OrderedDict` + +# Example +```jldoctest +julia> text = "TODAY thought: what to do plan: wake up and going out action: 1. wake up 2. eat 3. sleep" +julia> sample_keywords = ["thought", "plan", "action"] +julia> resultdict = GeneralUtils.textToDict(text, sample_keywords; rightmarker=":", symbolkey=true) +julia> println(resultdict) +OrderedCollections.OrderedDict{Any, Any}(:thought => "what to do", + :plan => "wake up and going out", + :action => "1. wake up 2. eat 3. sleep") +``` + +# Signature +""" +function textToDict(text::String, detectKeywords::Vector{String}; + dictKey::Union{Vector{String}, Nothing}=nothing, + symbolkey::Bool=false, lowercasekey::Bool=false + )::OrderedDict + + # make sure this function detect variation of a work e.g. agent, Agent, AGENT + kw = [] + # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list + for keyword in detectKeywords + detected = detectKeywordVariation(keyword, text) + if detected !== nothing + push!(kw, detected) + else + error("Keyword $keyword not found in text: $text") + end + end + + od1, od2 = + if symbolkey + OrderedDict{Symbol, Any}(), OrderedDict{Symbol, Any}() + else + OrderedDict{String, Any}(), OrderedDict{String, Any}() + end + + remainingtext = text + dictKey_ = reverse(dictKey) + + # process text from back to front + rkw = reverse(kw) + for (i,keyword) in enumerate(rkw) + # Find the position of the keyword in the text + keywordidx = findlast(keyword, remainingtext) + dKey = dictKey_[i] + + if keywordidx !== nothing + substr = remainingtext[keywordidx[end]+1:end] + str = string(strip(substr)) # Removes both leading and trailing whitespace. + _key = lowercasekey == true ? lowercase(dKey) : dKey + key = symbolkey == true ? Symbol(_key) : _key + od1[key] = str + remainingtext = remainingtext[1:keywordidx[1]-1] + else + error("""keyword "$keyword" not found in the provided text: $text """) + end + end + + # correct the order + ks = reverse([i for i in keys(od1)]) + for k in ks + k = symbolkey == true ? Symbol(k) : k + od2[k] = od1[k] + end + + return od2 +end diff --git a/src/util.jl b/src/util.jl index 3dabd31..16ca962 100644 --- a/src/util.jl +++ b/src/util.jl @@ -1,12 +1,12 @@ module util export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys, - findMatchingDictKey, textToDict, randstring, randstrings, timeout, + findMatchingDictKey, randstring, randstrings, timeout, dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString, - dfToString, dataframe_to_json_list, dictToString, dictToString_noKey, + dfToString, dataframe_to_json_list, dictToString, dictToString_noKey, issomething, dictToString_numbering, extract_triple_backtick_text, - countGivenWords, remove_french_accents, detect_keyword, extractTextBetweenCharacter, - extractTextBetweenString, + countGivenWords, remove_french_accents, + extractTextBetweenCharacter, extractTextBetweenString, convertCamelSnakeKebabCase, fitrange, recentElementsIndex, nonRecentElementsIndex using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames @@ -244,102 +244,6 @@ function replaceDictKeys(d::Dict, replacementMap::Dict)::Dict end -""" Convert text into a dictionary with a given keywords. This function use keywords to slice - a given text into the following format: KW1|kw1_text|KW2|kw2_text|KW3|kw3_text. - The left most string which has no keyword will be discarded. WARNING, ordering is important - -# Arguments - - `text::String` - A text to be converted. - - `keywords::Vector{String}` - A list of keywords to be used to slice the text. - These keywords also be the resulting dict keys. -# Keyword Arguments - - `rightmarker::String` - A maker used to make a word to be unique. Ex, A keyword "plan" with rightmarker ":", - the function will search for "plan:" otherwise the function will search for "plan". - The marker will not be in the resulting dict keys. - - `symbolkey::Bool` - If true, resulting dict's key will be Symbols, otherwise string. - - `lowercasekey::Bool` - set resulting dict's key to be lowercase - -# Return - - `d::OrderedDict` - -# Example -```jldoctest -julia> text = "TODAY thought: what to do plan: wake up and going out action: 1. wake up 2. eat 3. sleep" -julia> sample_keywords = ["thought", "plan", "action"] -julia> resultdict = GeneralUtils.textToDict(text, sample_keywords; rightmarker=":", symbolkey=true) -julia> println(resultdict) -OrderedCollections.OrderedDict{Any, Any}(:thought => "what to do", - :plan => "wake up and going out", - :action => "1. wake up 2. eat 3. sleep") -``` - -# Signature -""" -function textToDict(text::String, detectKeywords::Vector{String}; - dictKey::Union{Vector{String}, Nothing}=nothing, - symbolkey::Bool=false, lowercasekey::Bool=false - )::OrderedDict - - # make sure this function detect variation of a work e.g. agent, Agent, AGENT - kw = [] - # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list - for keyword in detectKeywords - detected = detect_keyword(keyword, text) - if detected !== nothing - push!(kw, detected) - else - error("Keyword $keyword not found in text.") - end - end - - od1, od2 = - if symbolkey - OrderedDict{Symbol, Any}(), OrderedDict{Symbol, Any}() - else - OrderedDict{String, Any}(), OrderedDict{String, Any}() - end - - remainingtext = text - dictKey_ = reverse(dictKey) - - # process text from back to front - rkw = reverse(kw) - for (i,keyword) in enumerate(rkw) - # Find the position of the keyword in the text - keywordidx = findlast(keyword, remainingtext) - dKey = dictKey_[i] - - if keywordidx !== nothing - substr = remainingtext[keywordidx[end]+1:end] - str = string(strip(substr)) # Removes both leading and trailing whitespace. - _key = lowercasekey == true ? lowercase(dKey) : dKey - key = symbolkey == true ? Symbol(_key) : _key - od1[key] = str - remainingtext = remainingtext[1:keywordidx[1]-1] - else - error("""keyword "$keyword" not found in the provided text: $text """) - end - end - - # correct the order - ks = reverse([i for i in keys(od1)]) - for k in ks - k = symbolkey == true ? Symbol(k) : k - od2[k] = od1[k] - end - - return od2 -end - - - - - """ Generate a random string # Arguments @@ -784,152 +688,6 @@ function cuttext(range, text) end end -""" - detect_keyword(keywords::AbstractVector{String}, text::String; mode::Union{String, Nothing}=nothing, delimiter::AbstractVector=[' ', '\n', '.']) -> Dict{String, Integer} - -Detects and counts occurrences of multiple keywords in the text in different case variations (lowercase, uppercase first letter, or all uppercase). - -# Arguments -- `keywords::AbstractVector{String}` Vector of keywords to search for -- `text::String` The text to search in - -# Keyword Arguments -- `mode::Union{String, Nothing}` When set to "individual", only counts matches that are individual words (default: nothing) -- `delimiter::AbstractVector` Characters used to determine word boundaries when mode="individual" (default: [' ', '\n', '.']) - -# Returns -- `Dict{String, Integer}` Returns a dictionary mapping each keyword to its count in the text (0 if not found) - -# Examples - ```jldoctest - julia> detect_keyword(["test", "example"], "This is a Test EXAMPLE") - Dict{String, Integer}("test" => 1, "example" => 1) - - julia> detect_keyword(["cat"], "cats and category", mode="individual") - Dict{String, Integer}("cat" => 0) - - julia> detect_keyword(["error"], "No ERRORS found!") - Dict{String, Integer}("error" => 1) - ``` - -# Signature -""" -# function detect_keyword(keywords::T1, text::String; -# mode::Union{String, Nothing}=nothing, delimiter::T2=[' ', '\n', '.'] -# )::Dict{String, Integer} where {T1<:AbstractVector, T2<:AbstractVector} -# # Initialize dictionary to store keyword counts -# kwdict = Dict{String, Integer}() -# for i in keywords -# kwdict[i] = 0 -# end - -# startindex = 1 -# # Iterate through each keyword and search for matches in text -# for kw in keywords -# # Check each possible starting position in the text -# for startindex in 1:1:length(text) -# # Get the window range for current keyword at current position -# wordwindows = wordwindow(kw, startindex) -# # Extract the text slice for comparison -# cuttexts = cuttext(wordwindows, text) -# if cuttexts !== nothing -# # Try to detect keyword in current text slice -# detected_kw = detect_keyword(kw, cuttexts) -# if detected_kw !== nothing && mode === nothing -# # Increment count if keyword found and no mode restrictions -# kwdict[kw] +=1 -# elseif detected_kw !== nothing && mode === "individual" -# # For individual word mode, check word boundaries -# # Check if character before keyword is a delimiter or start of text -# checkbefore = -# if wordwindows.start > 1 && -# text[wordwindows.start-1] ∈ delimiter -# true -# elseif wordwindows.start == 1 -# true -# else -# false -# end - -# # Check if character after keyword is a delimiter or end of text -# checkafter = -# if wordwindows.stop < length(text) && -# text[wordwindows.stop+1] ∈ delimiter -# true -# elseif wordwindows.stop == length(text) -# true -# else -# false -# end -# # Only count keyword if it's a complete word -# if checkbefore && checkafter -# kwdict[kw] +=1 -# end -# end -# end -# end -# end -# return kwdict -# end - - -function detect_keyword(keywords::T, text::String)::Dict{String, Integer} where {T<:AbstractVector} - kw = Dict{String, Integer}() - splittext = string.(split(text, " ")) - # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list - for keyword in keywords - ws = detect_keyword.(keyword, splittext) - total = sum(issomething.(ws)) - if total != 0 - kw[keyword] = total - else - kw[keyword] = 0 - end - end - return kw -end - - -""" - detect_keyword(keyword::String, text::String) -> Union{Nothing, String} - -Detects if a keyword exists in the text in different case variations (lowercase, uppercase first letter, or all uppercase). - -# Arguments: -- `keyword::String` The keyword to search for -- `text::String` The text to search in - -# Returns: -- `Union{Nothing, String}` Returns the matched keyword variation if found, otherwise returns nothing - -# Examples: - ```jldoctest - julia> detect_keyword("test", "This is a Test case") - "Test" - - julia> detect_keyword("error", "NO ERRORS FOUND") - "ERRORS" - - julia> detect_keyword("missing", "complete data") - nothing - ``` - -# Signature -""" -function detect_keyword(keyword::String, text::String)::Union{Nothing, String} - # Define the keyword variations to search for - keyword_variations = [keyword, uppercasefirst(keyword), uppercase(keyword), lowercase(keyword)] - - # Check if any of the keyword variations are in the text - for variation in keyword_variations - if occursin(variation, text) - return variation - end - end - - # Return nothing if no variation is found - return nothing -end """