This commit is contained in:
narawat lamaiin
2025-04-04 15:04:19 +07:00
parent 1da05f5cae
commit a894ad85ba
2 changed files with 116 additions and 19 deletions

View File

@@ -770,29 +770,106 @@ function extract_triple_backtick_text(input::String)::Vector{String}
end end
"""
    wordwindow(word::String, startindex::Integer) -> UnitRange

Build the index range that `word` would occupy if it began at `startindex`.

# Arguments
- `word::String` Word whose `length` (number of characters) sets the size of the window
- `startindex::Integer` First index of the window

# Returns
- `UnitRange` The range `startindex:startindex + length(word) - 1` (empty when `word` is empty)

# Examples
```jldoctest
julia> wordwindow("test", 3)
3:6
```
"""
function wordwindow(word::String, startindex::Integer)::UnitRange
    stopindex = startindex + length(word) - 1
    return startindex:stopindex
end
"""
    cuttext(range, text)

Return the part of `text` located at the character positions `first(range):last(range)`,
or `nothing` when that window does not lie inside `text`.

Positions are 1-based character positions, i.e. they are measured the same way as
`length(text)`. They are converted to byte indices before slicing, so text containing
multi-byte (non-ASCII) characters is cut on character boundaries instead of raising
`StringIndexError`.

# Arguments
- `range` Contiguous range of character positions (for example the result of `wordwindow`)
- `text` The string to cut from

# Returns
- The substring covering `range`, or `nothing` if the window starts before position 1
  or reaches past the last character of `text`

# Examples
```jldoctest
julia> cuttext(1:4, "test case")
"test"

julia> cuttext(8:12, "test case") === nothing
true
```
"""
function cuttext(range, text)
    nchars = length(text)
    firstpos, lastpos = first(range), last(range)

    # The window must lie completely inside the text.
    if firstpos < 1 || firstpos > nchars || lastpos > nchars
        return nothing
    end

    # `length` counts characters while string indexing is by byte offset, so map
    # the character positions onto valid byte indices before slicing.
    startbyte = nextind(text, 0, firstpos)
    stopbyte = nextind(text, 0, lastpos)
    return text[startbyte:stopbyte]
end
"""
    detect_keyword(keywords::AbstractVector{String}, text::String; mode::Union{String, Nothing}=nothing, delimiter::AbstractVector=[' ', '\\n', '.']) -> Dict{String, Integer}

Detects and counts occurrences of multiple keywords in the text in different case variations (lowercase, uppercase first letter, or all uppercase).

# Arguments
- `keywords::AbstractVector{String}` Vector of keywords to search for
- `text::String` The text to search in

# Keyword Arguments
- `mode::Union{String, Nothing}` When set to "individual", only counts matches that are individual words (default: nothing)
- `delimiter::AbstractVector` Characters used to determine word boundaries when mode="individual" (default: [' ', '\\n', '.'])

# Returns
- `Dict{String, Integer}` Returns a dictionary mapping each keyword to its count in the text (0 if not found)

# Examples
```jldoctest
julia> detect_keyword(["test", "error", "case"], "This is a Test case with ERRORS case")
Dict{String, Integer}("test" => 1, "error" => 1, "case" => 2)

julia> detect_keyword(["warning", "missing"], "Warning: data is complete")
Dict{String, Integer}("warning" => 1, "missing" => 0)

julia> detect_keyword(["test", "example"], "This is a Test EXAMPLE")
Dict{String, Integer}("test" => 1, "example" => 1)

julia> detect_keyword(["cat"], "cats and category", mode="individual")
Dict{String, Integer}("cat" => 0)

julia> detect_keyword(["error"], "No ERRORS found!")
Dict{String, Integer}("error" => 1)
```

# Signature
"""
# function detect_keyword(keywords::T1, text::String;
# mode::Union{String, Nothing}=nothing, delimiter::T2=[' ', '\n', '.']
# )::Dict{String, Integer} where {T1<:AbstractVector, T2<:AbstractVector}
# # Initialize dictionary to store keyword counts
# kwdict = Dict{String, Integer}()
# for i in keywords
# kwdict[i] = 0
# end
# startindex = 1
# # Iterate through each keyword and search for matches in text
# for kw in keywords
# # Check each possible starting position in the text
# for startindex in 1:1:length(text)
# # Get the window range for current keyword at current position
# wordwindows = wordwindow(kw, startindex)
# # Extract the text slice for comparison
# cuttexts = cuttext(wordwindows, text)
# if cuttexts !== nothing
# # Try to detect keyword in current text slice
# detected_kw = detect_keyword(kw, cuttexts)
# if detected_kw !== nothing && mode === nothing
# # Increment count if keyword found and no mode restrictions
# kwdict[kw] +=1
# elseif detected_kw !== nothing && mode === "individual"
# # For individual word mode, check word boundaries
# # Check if character before keyword is a delimiter or start of text
# checkbefore =
# if wordwindows.start > 1 &&
# text[wordwindows.start-1] ∈ delimiter
# true
# elseif wordwindows.start == 1
# true
# else
# false
# end
# # Check if character after keyword is a delimiter or end of text
# checkafter =
# if wordwindows.stop < length(text) &&
# text[wordwindows.stop+1] ∈ delimiter
# true
# elseif wordwindows.stop == length(text)
# true
# else
# false
# end
# # Only count keyword if it's a complete word
# if checkbefore && checkafter
# kwdict[kw] +=1
# end
# end
# end
# end
# end
# return kwdict
# end
function detect_keyword(keywords::T, text::String)::Dict{String, Integer} where {T<:AbstractVector} function detect_keyword(keywords::T, text::String)::Dict{String, Integer} where {T<:AbstractVector}
kw = Dict{String, Integer}() kw = Dict{String, Integer}()
splittext = string.(split(text, " ")) splittext = string.(split(text, " "))

View File

@@ -1,15 +1,35 @@
using Test
using GeneralUtils
using GeneralUtils: detect_keyword

# Checks for `GeneralUtils.ealierElementsIndex`, including the error raised for a
# negative second argument.
@testset "ealierElementsIndex" begin
    @test GeneralUtils.ealierElementsIndex([1,2,3,4,5], 2) == 1:3
    @test GeneralUtils.ealierElementsIndex([1,2,3], 0) == 1:3
    @test GeneralUtils.ealierElementsIndex([1], 1) == 1:0
    @test GeneralUtils.ealierElementsIndex([], 0) == 1:0
    @test GeneralUtils.ealierElementsIndex([1,2,3,4], 4) == 1:0
    @test GeneralUtils.ealierElementsIndex([1,2,3,4], 5) == 1:0
    @test GeneralUtils.ealierElementsIndex(collect(1:10), 3) == 1:7
    @test_throws ErrorException GeneralUtils.ealierElementsIndex([1,2,3], -1)
end

# Checks for `detect_keyword`: plain counting, the "individual" word mode with
# custom delimiters, and empty inputs.
# NOTE(review): several cases pass the `mode` / `delimiter` keywords — confirm the
# active `detect_keyword` method accepts them.
@testset "detect_keyword tests" begin
    # Default mode: every occurrence counts, including inside longer words.
    @test detect_keyword(["test"], "this is a test") == Dict("test" => 1)
    @test detect_keyword(["hello", "world"], "hello world hello") == Dict("hello" => 2, "world" => 1)
    @test detect_keyword(["cat"], "category") == Dict("cat" => 1)

    # "individual" mode: only occurrences bounded by delimiters / text edges count.
    @test detect_keyword(["cat"], "category"; mode="individual") == Dict("cat" => 0)
    @test detect_keyword(["dog"], "dogs and cats"; mode="individual", delimiter=[' ']) == Dict("dog" => 0)
    @test detect_keyword(["test"], "test.case"; mode="individual", delimiter=['.']) == Dict("test" => 1)

    # Empty text and empty keyword list.
    @test detect_keyword(["word"], "") == Dict("word" => 0)
    @test detect_keyword(String[], "some text") == Dict{String, Integer}()

    @test detect_keyword(["a", "b"], "a.b\nc"; delimiter=['.', '\n']) == Dict("a" => 1, "b" => 1)

    multiline_text = """
    first line
    second line
    first word
    """
    @test detect_keyword(["first"], multiline_text) == Dict("first" => 2)

    # A keyword that spans the whole text is bounded by both text edges.
    @test detect_keyword(["word"], "word"; mode="individual") == Dict("word" => 1)
    @test detect_keyword(["test"], "testing.test.tester"; mode="individual", delimiter=['.']) == Dict("test" => 1)
end