update
This commit is contained in:
93
src/util.jl
93
src/util.jl
@@ -770,29 +770,106 @@ function extract_triple_backtick_text(input::String)::Vector{String}
|
||||
end
|
||||
|
||||
|
||||
"""
    wordwindow(word::AbstractString, startindex::Integer) -> UnitRange

Return the range of positions that `word` would occupy if it were placed at
`startindex`, i.e. `startindex:(startindex + length(word) - 1)`.

The width of the window is `length(word)` (number of characters). An empty
`word` yields an empty range. Any `AbstractString` is accepted, so pieces
returned by `split` (`SubString`) can be passed without converting them first.
"""
function wordwindow(word::AbstractString, startindex::Integer)::UnitRange
    return startindex:(startindex + length(word) - 1)
end
|
||||
|
||||
"""
    cuttext(range, text)

Return the part of `text` selected by `range`, or `nothing` when the window
lies (partly) outside `text`.

The window is rejected when it starts or ends beyond `length(text)`, or when a
non-empty window starts before position 1.

When `text` is a string and `range` is a unit range, the range is interpreted
as character positions (the same unit `length(text)` counts in), so text that
contains multi-byte characters is sliced without raising `StringIndexError`.
For any other combination the range is used to index `text` directly.
"""
function cuttext(range, text)
    total = length(text)
    lo, hi = first(range), last(range)

    # Window starts or ends past the last element: outside the text boundary.
    if lo > total || hi > total
        return nothing
    end
    # A non-empty window that starts before the first element is outside too;
    # indexing it would throw a BoundsError instead of reporting "no text".
    if !isempty(range) && lo < 1
        return nothing
    end
    return _cutslice(text, range)
end

# Slice a string by character positions: translate the first/last character
# position into the byte index String indexing expects.
function _cutslice(text::AbstractString, range::AbstractUnitRange)
    isempty(range) && return text[range]
    return text[nextind(text, 0, first(range)):nextind(text, 0, last(range))]
end

# Fallback for every other container/range combination: index directly.
_cutslice(text, range) = text[range]
|
||||
|
||||
"""
|
||||
detect_keyword(keywords::AbstractVector{String}, text::String) -> Dict{String, Integer}
|
||||
detect_keyword(keywords::AbstractVector{String}, text::String; mode::Union{String, Nothing}=nothing, delimiter::AbstractVector=[' ', '\n', '.']) -> Dict{String, Integer}
|
||||
|
||||
Detects and counts occurrences of multiple keywords in the text in different case variations (lowercase, uppercase first letter, or all uppercase).
|
||||
|
||||
# Arguments
|
||||
- `keywords::AbstractVector{String}` Vector of keywords to search for
|
||||
- `text::String` The text to search in
|
||||
|
||||
# Keyword Arguments
|
||||
- `mode::Union{String, Nothing}` When set to "individual", only counts matches that are individual words (default: nothing)
|
||||
- `delimiter::AbstractVector` Characters used to determine word boundaries when mode="individual" (default: [' ', '\n', '.'])
|
||||
|
||||
# Returns
|
||||
- `Dict{String, Integer}` Returns a dictionary mapping each keyword to its count in the text (0 if not found)
|
||||
|
||||
# Examples
|
||||
```jldoctest
|
||||
julia> detect_keyword(["test", "error", "case"], "This is a Test case with ERRORS case")
|
||||
Dict{String, Integer}("test" => 1, "error" => 1, "case" => 2)
|
||||
julia> detect_keyword(["test", "example"], "This is a Test EXAMPLE")
|
||||
Dict{String, Integer}("test" => 1, "example" => 1)
|
||||
|
||||
julia> detect_keyword(["warning", "missing"], "Warning: data is complete")
|
||||
Dict{String, Integer}("warning" => 1, "missing" => 0)
|
||||
julia> detect_keyword(["cat"], "cats and category", mode="individual")
|
||||
Dict{String, Integer}("cat" => 0)
|
||||
|
||||
julia> detect_keyword(["error"], "No ERRORS found!")
|
||||
Dict{String, Integer}("error" => 1)
|
||||
```
|
||||
|
||||
# Signature
|
||||
"""
|
||||
# function detect_keyword(keywords::T1, text::String;
|
||||
# mode::Union{String, Nothing}=nothing, delimiter::T2=[' ', '\n', '.']
|
||||
# )::Dict{String, Integer} where {T1<:AbstractVector, T2<:AbstractVector}
|
||||
# # Initialize dictionary to store keyword counts
|
||||
# kwdict = Dict{String, Integer}()
|
||||
# for i in keywords
|
||||
# kwdict[i] = 0
|
||||
# end
|
||||
|
||||
# startindex = 1
|
||||
# # Iterate through each keyword and search for matches in text
|
||||
# for kw in keywords
|
||||
# # Check each possible starting position in the text
|
||||
# for startindex in 1:1:length(text)
|
||||
# # Get the window range for current keyword at current position
|
||||
# wordwindows = wordwindow(kw, startindex)
|
||||
# # Extract the text slice for comparison
|
||||
# cuttexts = cuttext(wordwindows, text)
|
||||
# if cuttexts !== nothing
|
||||
# # Try to detect keyword in current text slice
|
||||
# detected_kw = detect_keyword(kw, cuttexts)
|
||||
# if detected_kw !== nothing && mode === nothing
|
||||
# # Increment count if keyword found and no mode restrictions
|
||||
# kwdict[kw] +=1
|
||||
# elseif detected_kw !== nothing && mode === "individual"
|
||||
# # For individual word mode, check word boundaries
|
||||
# # Check if character before keyword is a delimiter or start of text
|
||||
# checkbefore =
|
||||
# if wordwindows.start > 1 &&
|
||||
# text[wordwindows.start-1] ∈ delimiter
|
||||
# true
|
||||
# elseif wordwindows.start == 1
|
||||
# true
|
||||
# else
|
||||
# false
|
||||
# end
|
||||
|
||||
# # Check if character after keyword is a delimiter or end of text
|
||||
# checkafter =
|
||||
# if wordwindows.stop < length(text) &&
|
||||
# text[wordwindows.stop+1] ∈ delimiter
|
||||
# true
|
||||
# elseif wordwindows.stop == length(text)
|
||||
# true
|
||||
# else
|
||||
# false
|
||||
# end
|
||||
# # Only count keyword if it's a complete word
|
||||
# if checkbefore && checkafter
|
||||
# kwdict[kw] +=1
|
||||
# end
|
||||
# end
|
||||
# end
|
||||
# end
|
||||
# end
|
||||
# return kwdict
|
||||
# end
|
||||
|
||||
|
||||
function detect_keyword(keywords::T, text::String)::Dict{String, Integer} where {T<:AbstractVector}
|
||||
kw = Dict{String, Integer}()
|
||||
splittext = string.(split(text, " "))
|
||||
|
||||
@@ -1,15 +1,35 @@
|
||||
using Test
|
||||
using GeneralUtils
|
||||
using GeneralUtils: detect_keyword
|
||||
|
||||
@testset "ealierElementsIndex" begin
|
||||
@test GeneralUtils.ealierElementsIndex([1,2,3,4,5], 2) == 1:3
|
||||
@test GeneralUtils.ealierElementsIndex([1,2,3], 0) == 1:3
|
||||
@test GeneralUtils.ealierElementsIndex([1], 1) == 1:0
|
||||
@test GeneralUtils.ealierElementsIndex([], 0) == 1:0
|
||||
@test GeneralUtils.ealierElementsIndex([1,2,3,4], 4) == 1:0
|
||||
@test GeneralUtils.ealierElementsIndex([1,2,3,4], 5) == 1:0
|
||||
@test GeneralUtils.ealierElementsIndex(collect(1:10), 3) == 1:7
|
||||
@test_throws ErrorException GeneralUtils.ealierElementsIndex([1,2,3], -1)
|
||||
@testset "detect_keyword tests" begin
    # Default mode: these cases expect every occurrence to be counted,
    # including one embedded in a longer word ("cat" in "category").
    @test detect_keyword(["test"], "this is a test") == Dict("test" => 1)

    hello_world_counts = Dict("hello" => 2, "world" => 1)
    @test detect_keyword(["hello", "world"], "hello world hello") == hello_world_counts

    @test detect_keyword(["cat"], "category") == Dict("cat" => 1)

    # mode="individual": a keyword embedded in a longer word is expected to
    # count as zero; `delimiter` lists the characters treated as boundaries.
    @test detect_keyword(["cat"], "category"; mode="individual") == Dict("cat" => 0)

    @test detect_keyword(["dog"], "dogs and cats"; mode="individual", delimiter=[' ']) ==
          Dict("dog" => 0)

    @test detect_keyword(["test"], "test.case"; mode="individual", delimiter=['.']) ==
          Dict("test" => 1)

    # Degenerate inputs: empty text and empty keyword list.
    @test detect_keyword(["word"], "") == Dict("word" => 0)

    @test detect_keyword(String[], "some text") == Dict{String, Integer}()

    # Custom delimiters without a mode.
    ab_counts = Dict("a" => 1, "b" => 1)
    @test detect_keyword(["a", "b"], "a.b\nc"; delimiter=['.', '\n']) == ab_counts

    # Keyword appearing on more than one line of a multi-line text.
    multiline_text = """
    first line
    second line
    first word
    """
    @test detect_keyword(["first"], multiline_text) == Dict("first" => 2)

    # Text consisting of exactly the keyword.
    @test detect_keyword(["word"], "word"; mode="individual") == Dict("word" => 1)

    # Only the middle segment is expected to count as a standalone "test".
    @test detect_keyword(["test"], "testing.test.tester"; mode="individual", delimiter=['.']) ==
          Dict("test" => 1)
end
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user