From a894ad85ba794e123a2625432cc4a407d88782f3 Mon Sep 17 00:00:00 2001 From: narawat lamaiin Date: Fri, 4 Apr 2025 15:04:19 +0700 Subject: [PATCH] update --- src/util.jl | 95 +++++++++++++++++++++++++++++++++++++++++++----- test/runtests.jl | 40 +++++++++++++++----- 2 files changed, 116 insertions(+), 19 deletions(-) diff --git a/src/util.jl b/src/util.jl index f34e2f3..682f6c3 100644 --- a/src/util.jl +++ b/src/util.jl @@ -770,29 +770,106 @@ function extract_triple_backtick_text(input::String)::Vector{String} end +wordwindow(word::String, startindex::Integer)::UnitRange = startindex:startindex + length(word) -1 + +function cuttext(range, text) + # check whether range is outside text boundary + if range.start > length(text) || range.stop > length(text) + return nothing + else + return text[range] + end +end + """ - detect_keyword(keywords::AbstractVector{String}, text::String) -> Dict{String, Integer} + detect_keyword(keywords::AbstractVector{String}, text::String; mode::Union{String, Nothing}=nothing, delimiter::AbstractVector=[' ', '\n', '.']) -> Dict{String, Integer} Detects and counts occurrences of multiple keywords in the text in different case variations (lowercase, uppercase first letter, or all uppercase). -# Arguments: +# Arguments - `keywords::AbstractVector{String}` Vector of keywords to search for - `text::String` The text to search in -# Returns: +# Keyword Arguments +- `mode::Union{String, Nothing}` When set to "individual", only counts matches that are individual words (default: nothing) +- `delimiter::AbstractVector` Characters used to determine word boundaries when mode="individual" (default: [' ', '\n', '.']) + +# Returns - `Dict{String, Integer}` Returns a dictionary mapping each keyword to its count in the text (0 if not found) -# Examples: +# Examples ```jldoctest - julia> detect_keyword(["test", "error", "case"], "This is a Test case with ERRORS case") - Dict{String, Integer}("test" => 1, "error" => 1, "case" => 2) + julia> detect_keyword(["test", "example"], "This is a Test EXAMPLE") + Dict{String, Integer}("test" => 1, "example" => 1) - julia> detect_keyword(["warning", "missing"], "Warning: data is complete") - Dict{String, Integer}("warning" => 1, "missing" => 0) + julia> detect_keyword(["cat"], "cats and category", mode="individual") + Dict{String, Integer}("cat" => 0) + + julia> detect_keyword(["error"], "No ERRORS found!") + Dict{String, Integer}("error" => 1) + ``` - # Signature """ +# function detect_keyword(keywords::T1, text::String; +# mode::Union{String, Nothing}=nothing, delimiter::T2=[' ', '\n', '.'] +# )::Dict{String, Integer} where {T1<:AbstractVector, T2<:AbstractVector} +# # Initialize dictionary to store keyword counts +# kwdict = Dict{String, Integer}() +# for i in keywords +# kwdict[i] = 0 +# end + +# startindex = 1 +# # Iterate through each keyword and search for matches in text +# for kw in keywords +# # Check each possible starting position in the text +# for startindex in 1:1:length(text) +# # Get the window range for current keyword at current position +# wordwindows = wordwindow(kw, startindex) +# # Extract the text slice for comparison +# cuttexts = cuttext(wordwindows, text) +# if cuttexts !== nothing +# # Try to detect keyword in current text slice +# detected_kw = detect_keyword(kw, cuttexts) +# if detected_kw !== nothing && mode === nothing +# # Increment count if keyword found and no mode restrictions +# kwdict[kw] +=1 +# elseif detected_kw !== nothing && mode === "individual" +# # For individual word mode, check word boundaries +# # Check if character before keyword is a delimiter or start of text +# checkbefore = +# if wordwindows.start > 1 && +# text[wordwindows.start-1] ∈ delimiter +# true +# elseif wordwindows.start == 1 +# true +# else +# false +# end + +# # Check if character after keyword is a delimiter or end of text +# checkafter = +# if wordwindows.stop < length(text) && +# text[wordwindows.stop+1] ∈ delimiter +# true +# elseif wordwindows.stop == length(text) +# true +# else +# false +# end +# # Only count keyword if it's a complete word +# if checkbefore && checkafter +# kwdict[kw] +=1 +# end +# end +# end +# end +# end +# return kwdict +# end + + function detect_keyword(keywords::T, text::String)::Dict{String, Integer} where {T<:AbstractVector} kw = Dict{String, Integer}() splittext = string.(split(text, " ")) diff --git a/test/runtests.jl b/test/runtests.jl index 444903e..f415525 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,15 +1,35 @@ using Test -using GeneralUtils +using GeneralUtils: detect_keyword -@testset "ealierElementsIndex" begin - @test GeneralUtils.ealierElementsIndex([1,2,3,4,5], 2) == 1:3 - @test GeneralUtils.ealierElementsIndex([1,2,3], 0) == 1:3 - @test GeneralUtils.ealierElementsIndex([1], 1) == 1:0 - @test GeneralUtils.ealierElementsIndex([], 0) == 1:0 - @test GeneralUtils.ealierElementsIndex([1,2,3,4], 4) == 1:0 - @test GeneralUtils.ealierElementsIndex([1,2,3,4], 5) == 1:0 - @test GeneralUtils.ealierElementsIndex(collect(1:10), 3) == 1:7 - @test_throws ErrorException GeneralUtils.ealierElementsIndex([1,2,3], -1) +@testset "detect_keyword tests" begin + @test detect_keyword(["test"], "this is a test") == Dict("test" => 1) + + @test detect_keyword(["hello", "world"], "hello world hello") == Dict("hello" => 2, "world" => 1) + + @test detect_keyword(["cat"], "category") == Dict("cat" => 1) + + @test detect_keyword(["cat"], "category"; mode="individual") == Dict("cat" => 0) + + @test detect_keyword(["dog"], "dogs and cats"; mode="individual", delimiter=[' ']) == Dict("dog" => 0) + + @test detect_keyword(["test"], "test.case"; mode="individual", delimiter=['.']) == Dict("test" => 1) + + @test detect_keyword(["word"], "") == Dict("word" => 0) + + @test detect_keyword(String[], "some text") == Dict{String, Integer}() + + @test detect_keyword(["a", "b"], "a.b\nc"; delimiter=['.', '\n']) == Dict("a" => 1, "b" => 1) + + multiline_text = """ + first line + second line + first word + """ + @test detect_keyword(["first"], multiline_text) == Dict("first" => 2) + + @test detect_keyword(["word"], "word"; mode="individual") == Dict("word" => 1) + + @test detect_keyword(["test"], "testing.test.tester"; mode="individual", delimiter=['.']) == Dict("test" => 1) end