add remove_french_accents function

This commit is contained in:
2024-12-23 07:17:55 +07:00
parent 3c2242cf94
commit fb2de59528

View File

@@ -3,7 +3,8 @@ module util
export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys,
findMatchingDictKey, textToDict, randstring, randstrings, timeout,
dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString,
dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text, countGivenWords
dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text,
countGivenWords, remove_french_accents
using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames
@@ -279,7 +280,7 @@ function textToDict(text::String, keywords::Vector{String};
rightmarker::Union{String, Nothing}=nothing, symbolkey::Bool=false, lowercasekey::Bool=false
)::OrderedDict
#[WORKING] make sure this function detect variation of a work e.g. agent, Agent, AGENT
# make sure this function detects variations of a word, e.g. agent, Agent, AGENT
kw = []
# use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
for keyword in keywords
@@ -787,7 +788,105 @@ function detect_keyword(keyword::String, text::String)::Union{Nothing, String}
end
""" count a given word in a text """
"""
    countGivenWords(text::String, words::Vector{String}) -> Vector{Int}

Count how many times each entry of `words` occurs in `text`.

Occurrences are counted as non-overlapping, case-sensitive substring matches: the text
is split on the word and the number of separators found is reported. A word is therefore
also counted when it appears inside a longer word.

# Arguments
- `text::String`: The text to search through.
- `words::Vector{String}`: The words whose occurrences are counted.

# Returns
- `Vector{Int}`: One count per entry of `words`, in the same order as `words`.

# Examples
```julia
julia> countGivenWords("hello world hello", ["hello", "world"])
2-element Vector{Int64}:
 2
 1

julia> countGivenWords("foo bar baz foo", ["foo", "qux"])
2-element Vector{Int64}:
 2
 0
```
"""
function countGivenWords(text::String, words::Vector{String})::Vector{Int}
    # Typed accumulator: avoids a Vector{Any} and does not shadow Base.count.
    counts = Int[]
    sizehint!(counts, length(words))
    for word in words
        # Splitting on `word` yields exactly one more piece than there are occurrences.
        push!(counts, length(split(text, word)) - 1)
    end
    return counts
end
# Character-for-character replacement table used by `remove_french_accents`.
# Built once when the module loads instead of on every call.
const _FRENCH_ACCENT_MAP = Dict{Char,Char}(
    'à' => 'a', 'â' => 'a', 'ä' => 'a', 'á' => 'a',
    'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
    'î' => 'i', 'ï' => 'i', 'í' => 'i',
    'ñ' => 'n',
    'ô' => 'o', 'ö' => 'o', 'ò' => 'o', 'ó' => 'o',
    'ù' => 'u', 'û' => 'u', 'ü' => 'u',
    'ÿ' => 'y',
    'ç' => 'c',
    # Uppercase counterparts of every lowercase entry above.
    'À' => 'A', 'Â' => 'A', 'Ä' => 'A', 'Á' => 'A',
    'É' => 'E', 'È' => 'E', 'Ê' => 'E', 'Ë' => 'E',
    'Î' => 'I', 'Ï' => 'I', 'Í' => 'I',
    'Ñ' => 'N',
    'Ô' => 'O', 'Ö' => 'O', 'Ò' => 'O', 'Ó' => 'O',
    'Ù' => 'U', 'Û' => 'U', 'Ü' => 'U',
    'Ÿ' => 'Y',
    'Ç' => 'C',
    # NOTE(review): the key of this entry was unreadable in the reviewed source; it is
    # presumed to be the typographic apostrophe (U+2019) mapped to ASCII ' -- confirm.
    '\u2019' => '\'',
)

"""
    remove_french_accents(text::AbstractString) -> String

Replace accented letters in `text` with their unaccented ASCII counterparts.

Each character is looked up in a fixed replacement table; characters that are not in the
table are copied through unchanged. The replacement is strictly one character for one
character, so ligatures such as `œ` and `æ` are left as they are.

# Arguments
- `text::AbstractString`: The input text, possibly containing accented characters.

# Returns
- `String`: A copy of `text` with every character found in the table replaced.

# Examples
```julia
julia> remove_french_accents("Café")
"Cafe"

julia> remove_french_accents("L'été est beau.")
"L'ete est beau."
```
"""
function remove_french_accents(text::AbstractString)::AbstractString
    # `get` with the character itself as default leaves unmapped characters untouched.
    return map(c -> get(_FRENCH_ACCENT_MAP, c, c), text)
end
function countGivenWords(text::String, words::Vector{String})::Vector{Int}
count = []
@@ -814,13 +913,6 @@ end