This commit is contained in:
narawat lamaiin
2025-04-04 15:04:19 +07:00
parent 1da05f5cae
commit a894ad85ba
2 changed files with 116 additions and 19 deletions

View File

@@ -770,29 +770,106 @@ function extract_triple_backtick_text(input::String)::Vector{String}
end
wordwindow(word::String, startindex::Integer)::UnitRange = startindex:startindex + length(word) -1
function cuttext(range, text)
# check whether range is outside text boundary
if range.start > length(text) || range.stop > length(text)
return nothing
else
return text[range]
end
end
"""
detect_keyword(keywords::AbstractVector{String}, text::String) -> Dict{String, Integer}
detect_keyword(keywords::AbstractVector{String}, text::String; mode::Union{String, Nothing}=nothing, delimiter::AbstractVector=[' ', '\n', '.']) -> Dict{String, Integer}
Detects and counts occurrences of multiple keywords in the text in different case variations (lowercase, uppercase first letter, or all uppercase).
# Arguments:
# Arguments
- `keywords::AbstractVector{String}` Vector of keywords to search for
- `text::String` The text to search in
# Returns:
# Keyword Arguments
- `mode::Union{String, Nothing}` When set to "individual", only counts matches that are individual words (default: nothing)
- `delimiter::AbstractVector` Characters used to determine word boundaries when mode="individual" (default: [' ', '\n', '.'])
# Returns
- `Dict{String, Integer}` Returns a dictionary mapping each keyword to its count in the text (0 if not found)
# Examples:
# Examples
```jldoctest
julia> detect_keyword(["test", "error", "case"], "This is a Test case with ERRORS case")
Dict{String, Integer}("test" => 1, "error" => 1, "case" => 2)
julia> detect_keyword(["test", "example"], "This is a Test EXAMPLE")
Dict{String, Integer}("test" => 1, "example" => 1)
julia> detect_keyword(["warning", "missing"], "Warning: data is complete")
Dict{String, Integer}("warning" => 1, "missing" => 0)
julia> detect_keyword(["cat"], "cats and category", mode="individual")
Dict{String, Integer}("cat" => 0)
julia> detect_keyword(["error"], "No ERRORS found!")
Dict{String, Integer}("error" => 1)
```
# Signature
"""
# function detect_keyword(keywords::T1, text::String;
# mode::Union{String, Nothing}=nothing, delimiter::T2=[' ', '\n', '.']
# )::Dict{String, Integer} where {T1<:AbstractVector, T2<:AbstractVector}
# # Initialize dictionary to store keyword counts
# kwdict = Dict{String, Integer}()
# for i in keywords
# kwdict[i] = 0
# end
# startindex = 1
# # Iterate through each keyword and search for matches in text
# for kw in keywords
# # Check each possible starting position in the text
# for startindex in 1:1:length(text)
# # Get the window range for current keyword at current position
# wordwindows = wordwindow(kw, startindex)
# # Extract the text slice for comparison
# cuttexts = cuttext(wordwindows, text)
# if cuttexts !== nothing
# # Try to detect keyword in current text slice
# detected_kw = detect_keyword(kw, cuttexts)
# if detected_kw !== nothing && mode === nothing
# # Increment count if keyword found and no mode restrictions
# kwdict[kw] +=1
# elseif detected_kw !== nothing && mode === "individual"
# # For individual word mode, check word boundaries
# # Check if character before keyword is a delimiter or start of text
# checkbefore =
# if wordwindows.start > 1 &&
# text[wordwindows.start-1] ∈ delimiter
# true
# elseif wordwindows.start == 1
# true
# else
# false
# end
# # Check if character after keyword is a delimiter or end of text
# checkafter =
# if wordwindows.stop < length(text) &&
# text[wordwindows.stop+1] ∈ delimiter
# true
# elseif wordwindows.stop == length(text)
# true
# else
# false
# end
# # Only count keyword if it's a complete word
# if checkbefore && checkafter
# kwdict[kw] +=1
# end
# end
# end
# end
# end
# return kwdict
# end
function detect_keyword(keywords::T, text::String)::Dict{String, Integer} where {T<:AbstractVector}
kw = Dict{String, Integer}()
splittext = string.(split(text, " "))