add remove_french_accents function
This commit is contained in:
112
src/util.jl
112
src/util.jl
@@ -3,7 +3,8 @@ module util
|
||||
export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys,
|
||||
findMatchingDictKey, textToDict, randstring, randstrings, timeout,
|
||||
dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString,
|
||||
dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text, countGivenWords
|
||||
dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text,
|
||||
countGivenWords, remove_french_accents
|
||||
|
||||
using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames
|
||||
|
||||
@@ -279,7 +280,7 @@ function textToDict(text::String, keywords::Vector{String};
|
||||
rightmarker::Union{String, Nothing}=nothing, symbolkey::Bool=false, lowercasekey::Bool=false
|
||||
)::OrderedDict
|
||||
|
||||
#[WORKING] make sure this function detect variation of a work e.g. agent, Agent, AGENT
|
||||
# make sure this function detects variations of a word, e.g. agent, Agent, AGENT
|
||||
kw = []
|
||||
# use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
|
||||
for keyword in keywords
|
||||
@@ -787,7 +788,105 @@ function detect_keyword(keyword::String, text::String)::Union{Nothing, String}
|
||||
end
|
||||
|
||||
|
||||
""" count a given word in a text """
|
||||
"""
    countGivenWords(text::String, words::Vector{String}) -> Vector{Int}

Count the non-overlapping occurrences of each entry of `words` within `text`.

Matching is case-sensitive and substring-based: an entry is counted wherever it
appears, including inside a longer word (e.g. `"he"` is found in `"hello"`).

# Arguments
- `text::String`: The input text to search through.
- `words::Vector{String}`: The words whose occurrences need to be counted.

# Returns
- `Vector{Int}`: One count per entry of `words`, in the same order as `words`.
  An empty search word always yields `0`.

# Examples
```julia
julia> countGivenWords("hello world hello", ["hello", "world"])
2-element Vector{Int64}:
 2
 1

julia> countGivenWords("foo bar baz foo", ["foo", "qux"])
2-element Vector{Int64}:
 2
 0
```

# Signature
"""
function countGivenWords(text::String, words::Vector{String})::Vector{Int}
    # A typed comprehension keeps the result a concrete Vector{Int} from the start
    # instead of growing a Vector{Any} and converting it on return.
    # An empty word has no meaningful occurrence count, so it is reported as 0.
    return Int[isempty(word) ? 0 : count(word, text) for word in words]
end
|
||||
|
||||
|
||||
|
||||
# Accented character => plain replacement used by `remove_french_accents`.
# Built once at load time instead of on every call. Each accented letter is
# listed in both lowercase and uppercase so capitalised text is handled too.
const _ACCENT_REPLACEMENTS = Dict{Char, Char}(
    'à' => 'a', 'â' => 'a', 'ä' => 'a', 'á' => 'a',
    'À' => 'A', 'Â' => 'A', 'Ä' => 'A', 'Á' => 'A',
    'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
    'É' => 'E', 'È' => 'E', 'Ê' => 'E', 'Ë' => 'E',
    'î' => 'i', 'ï' => 'i', 'í' => 'i',
    'Î' => 'I', 'Ï' => 'I', 'Í' => 'I',
    'ñ' => 'n', 'Ñ' => 'N',
    'ô' => 'o', 'ö' => 'o', 'ò' => 'o', 'ó' => 'o',
    'Ô' => 'O', 'Ö' => 'O', 'Ò' => 'O', 'Ó' => 'O',
    'ù' => 'u', 'û' => 'u', 'ü' => 'u',
    'Ù' => 'U', 'Û' => 'U', 'Ü' => 'U',
    'ÿ' => 'y', 'Ÿ' => 'Y',
    'ç' => 'c', 'Ç' => 'C',
    # typographic apostrophe => plain ASCII apostrophe
    '’' => '\'',
)

"""
    remove_french_accents(text::AbstractString) -> String

Remove French accents from the given text.

Each accented letter (lowercase or uppercase) is replaced by its unaccented
counterpart, and the typographic apostrophe `’` is replaced by the ASCII
apostrophe `'`. Characters that are not in the replacement table (including the
ligatures `œ` and `æ`) are returned unchanged.

# Arguments
- `text::AbstractString`: The input string containing French accents.

# Returns
- `String`: The input string with all French accents removed.

# Examples
```julia
julia> remove_french_accents("Café")
"Cafe"

julia> remove_french_accents("L'été est beau.")
"L'ete est beau."

julia> remove_french_accents("À l'École")
"A l'Ecole"
```

# Signature
"""
function remove_french_accents(text::AbstractString)::AbstractString
    # `map` over a string applies the function per character and returns a String;
    # characters without a table entry fall through unchanged.
    return map(char -> get(_ACCENT_REPLACEMENTS, char, char), text)
end
|
||||
|
||||
|
||||
function countGivenWords(text::String, words::Vector{String})::Vector{Int}
|
||||
count = []
|
||||
|
||||
@@ -814,13 +913,6 @@ end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user