From fb2de59528ba81b103e91da91eab3a58b7f830e4 Mon Sep 17 00:00:00 2001
From: tonaerospace
Date: Mon, 23 Dec 2024 07:17:55 +0700
Subject: [PATCH] add remove_french_accents function

---
Review notes:
- countGivenWords is no longer added a second time; the existing definition
  simply receives the new docstring, so the method is defined exactly once.
- The countGivenWords docstring now documents the actual Vector{Int} return
  value instead of a Dict.
- remove_french_accents also maps uppercase accented letters, keeps its lookup
  table in a module-level constant and documents the AbstractString argument.

 src/util.jl | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 79 insertions(+), 10 deletions(-)

diff --git a/src/util.jl b/src/util.jl
index 550f9f9..8dd0017 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -3,7 +3,8 @@ module util
 export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys,
     findMatchingDictKey, textToDict, randstring, randstrings, timeout,
     dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString,
-    dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text, countGivenWords
+    dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text,
+    countGivenWords, remove_french_accents
 
 using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames
 
@@ -279,7 +280,7 @@ function textToDict(text::String, keywords::Vector{String};
     rightmarker::Union{String, Nothing}=nothing, symbolkey::Bool=false, lowercasekey::Bool=false
     )::OrderedDict
 
-    #[WORKING] make sure this function detect variation of a work e.g. agent, Agent, AGENT
+    # make sure this function detects variations of a word e.g. agent, Agent, AGENT
     kw = []
     # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
     for keyword in keywords
@@ -787,7 +788,82 @@ function detect_keyword(keyword::String, text::String)::Union{Nothing, String}
 end
 
 
-""" count a given word in a text """
+# Maps accented characters to their unaccented ASCII replacements.
+const _ACCENTED_TO_REGULAR = Dict{Char, Char}(
+    'à' => 'a', 'â' => 'a', 'ä' => 'a', 'á' => 'a',
+    'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
+    'î' => 'i', 'ï' => 'i', 'í' => 'i',
+    'ñ' => 'n',
+    'ô' => 'o', 'ö' => 'o', 'ò' => 'o', 'ó' => 'o',
+    'ù' => 'u', 'û' => 'u', 'ü' => 'u',
+    'ÿ' => 'y',
+    'ç' => 'c',
+    'À' => 'A', 'Â' => 'A', 'Ä' => 'A', 'Á' => 'A',
+    'É' => 'E', 'È' => 'E', 'Ê' => 'E', 'Ë' => 'E',
+    'Î' => 'I', 'Ï' => 'I', 'Í' => 'I',
+    'Ñ' => 'N',
+    'Ô' => 'O', 'Ö' => 'O', 'Ò' => 'O', 'Ó' => 'O',
+    'Ù' => 'U', 'Û' => 'U', 'Ü' => 'U',
+    'Ÿ' => 'Y',
+    'Ç' => 'C',
+    '’' => '\'',  # typographic apostrophe -> ASCII apostrophe
+)
+
+"""
+    remove_french_accents(text::AbstractString) -> String
+
+Replace accented letters in `text` with their unaccented ASCII counterparts.
+
+Lowercase and uppercase accented letters are handled, and the typographic apostrophe
+`’` is replaced by `'`. All other characters, including ligatures such as `œ`, are
+returned unchanged.
+
+# Arguments
+- `text::AbstractString`: The input text.
+
+# Returns
+- `String`: A copy of `text` with the mapped characters replaced.
+
+# Examples
+```julia
+julia> remove_french_accents("Café")
+"Cafe"
+
+julia> remove_french_accents("L’été à l'École")
+"L'ete a l'Ecole"
+```
+"""
+function remove_french_accents(text::AbstractString)::AbstractString
+    # `map` over a string applies the function to each character and returns a String.
+    return map(c -> get(_ACCENTED_TO_REGULAR, c, c), text)
+end
+
+
+"""
+    countGivenWords(text::String, words::Vector{String}) -> Vector{Int}
+
+Count how many times each entry of `words` occurs in `text`.
+
+# Arguments
+- `text::String`: The text to search.
+- `words::Vector{String}`: The words whose occurrences are counted.
+
+# Returns
+- `Vector{Int}`: One count per entry of `words`, in the same order as `words`.
+
+# Examples
+```julia
+julia> countGivenWords("hello world hello", ["hello", "world"])
+2-element Vector{Int64}:
+ 2
+ 1
+
+julia> countGivenWords("foo bar baz foo", ["foo", "qux"])
+2-element Vector{Int64}:
+ 2
+ 0
+```
+"""
 function countGivenWords(text::String, words::Vector{String})::Vector{Int}
     count = []
 
@@ -814,13 +890,6 @@ end
 
 
 
-
-
-
-
-
-
-
 
 
 