add remove_french_accents function
This commit is contained in:
112
src/util.jl
112
src/util.jl
@@ -3,7 +3,8 @@ module util
|
|||||||
export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys,
|
export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys,
|
||||||
findMatchingDictKey, textToDict, randstring, randstrings, timeout,
|
findMatchingDictKey, textToDict, randstring, randstrings, timeout,
|
||||||
dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString,
|
dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString,
|
||||||
dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text, countGivenWords
|
dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text,
|
||||||
|
countGivenWords, remove_french_accents
|
||||||
|
|
||||||
using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames
|
using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames
|
||||||
|
|
||||||
@@ -279,7 +280,7 @@ function textToDict(text::String, keywords::Vector{String};
|
|||||||
rightmarker::Union{String, Nothing}=nothing, symbolkey::Bool=false, lowercasekey::Bool=false
|
rightmarker::Union{String, Nothing}=nothing, symbolkey::Bool=false, lowercasekey::Bool=false
|
||||||
)::OrderedDict
|
)::OrderedDict
|
||||||
|
|
||||||
#[WORKING] make sure this function detect variation of a work e.g. agent, Agent, AGENT
|
# make sure this function detects variations of a word, e.g. agent, Agent, AGENT
|
||||||
kw = []
|
kw = []
|
||||||
# use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
|
# use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
|
||||||
for keyword in keywords
|
for keyword in keywords
|
||||||
@@ -787,7 +788,105 @@ function detect_keyword(keyword::String, text::String)::Union{Nothing, String}
|
|||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
""" count a given word in a text """
|
"""
    countGivenWords(text::String, words::Vector{String}) -> Vector{Int}

Count how many times each entry of `words` occurs in `text`.

Matching is case-sensitive and substring-based (no word-boundary check), and
occurrences are counted without overlap, e.g. `"aa"` occurs once in `"aaa"`.

# Arguments
- `text::String`: The input text to search through.
- `words::Vector{String}`: The words whose occurrences need to be counted.

# Returns
- `Vector{Int}`: One count per entry of `words`, in the same order as `words`.
  An empty search word is reported as `0` occurrences.

# Examples
```julia
julia> countGivenWords("hello world hello", ["hello", "world"])
2-element Vector{Int64}:
 2
 1

julia> countGivenWords("foo bar baz foo", ["foo", "qux"])
2-element Vector{Int64}:
 2
 0
```
"""
function countGivenWords(text::String, words::Vector{String})::Vector{Int}
    # `count(pattern, string)` counts non-overlapping matches, which is what the
    # split-based `length(split(text, word)) - 1` computes for non-empty words.
    # An empty word has no meaningful occurrence count, so it is reported as 0
    # instead of splitting the text into individual characters.
    return Int[isempty(word) ? 0 : count(word, text) for word in words]
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Lookup table from accented characters (and the typographic apostrophe) to
# their plain ASCII replacements. Built once instead of on every call.
const _ACCENTED_TO_REGULAR = Dict{Char,Char}(
    'à' => 'a', 'â' => 'a', 'ä' => 'a', 'á' => 'a',
    'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
    'î' => 'i', 'ï' => 'i', 'í' => 'i',
    'ñ' => 'n',
    'ô' => 'o', 'ö' => 'o', 'ò' => 'o', 'ó' => 'o',
    'ù' => 'u', 'û' => 'u', 'ü' => 'u',
    'ÿ' => 'y',
    'ç' => 'c',
    # Uppercase counterparts of every lowercase entry above
    'À' => 'A', 'Â' => 'A', 'Ä' => 'A', 'Á' => 'A',
    'É' => 'E', 'È' => 'E', 'Ê' => 'E', 'Ë' => 'E',
    'Î' => 'I', 'Ï' => 'I', 'Í' => 'I',
    'Ñ' => 'N',
    'Ô' => 'O', 'Ö' => 'O', 'Ò' => 'O', 'Ó' => 'O',
    'Ù' => 'U', 'Û' => 'U', 'Ü' => 'U',
    'Ÿ' => 'Y',
    'Ç' => 'C',
    # Typographic (curly) apostrophe becomes a plain ASCII apostrophe
    '’' => '\'',
)

"""
    remove_french_accents(text::AbstractString) -> String

Replace accented characters in `text` with their unaccented equivalents.

Every character listed in the replacement table (lowercase and uppercase
accented vowels, `ç`/`Ç`, `ñ`/`Ñ`, `ÿ`/`Ÿ`) is mapped to its base letter, and the
typographic apostrophe `’` is mapped to `'`. All other characters are left
unchanged.

# Arguments
- `text::AbstractString`: The input string, possibly containing accents.

# Returns
- `String`: A copy of `text` with the listed accented characters replaced.

# Examples
```julia
julia> remove_french_accents("Café")
"Cafe"

julia> remove_french_accents("L'été est beau.")
"L'ete est beau."

julia> remove_french_accents("À l'École")
"A l'Ecole"
```
"""
function remove_french_accents(text::AbstractString)::String
    # Character-wise substitution; characters without an entry map to themselves.
    return map(c -> get(_ACCENTED_TO_REGULAR, c, c), text)
end
|
||||||
|
|
||||||
|
|
||||||
function countGivenWords(text::String, words::Vector{String})::Vector{Int}
|
function countGivenWords(text::String, words::Vector{String})::Vector{Int}
|
||||||
count = []
|
count = []
|
||||||
|
|
||||||
@@ -814,13 +913,6 @@ end
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user