add remove_french_accents function

2024-12-23 07:17:55 +07:00
parent 3c2242cf94
commit fb2de59528
1 changed files with 102 additions and 10 deletions
--- a/src/util.jl
+++ b/src/util.jl
@@ -3,7 +3,8 @@ module util
 export timedifference, showstracktrace, findHighestIndexKey, uuid4snakecase, replaceDictKeys,
        findMatchingDictKey, textToDict, randstring, randstrings, timeout,
        dataframeToCSV, dfToVectorDict, disintegrate_vectorDict, getDataFrameValue, dfRowtoString, 
-        dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text, countGivenWords
+        dfToString, dataframe_to_json_list, dict_to_string, extract_triple_backtick_text, 
+        countGivenWords, remove_french_accents

 using JSON3, DataStructures, Distributions, Random, Dates, UUIDs, MQTTClient, DataFrames

@@ -279,7 +280,7 @@ function textToDict(text::String, keywords::Vector{String};
    rightmarker::Union{String, Nothing}=nothing, symbolkey::Bool=false, lowercasekey::Bool=false
    )::OrderedDict
    
-  #[WORKING] make sure this function detect variation of a work e.g. agent, Agent, AGENT
+  # make sure this function detects case variations of a word, e.g. agent, Agent, AGENT
  kw = []
  # use for loop and detect_keyword function to get the exact variation of each keyword in the text then push to kw list
  for keyword in keywords
@@ -787,7 +788,105 @@ function detect_keyword(keyword::String, text::String)::Union{Nothing, String}
 end


-""" count a given word in a text """
+"""
+    countGivenWords(text::String, words::Vector{String}) -> Vector{Int}
+
+Count the non-overlapping occurrences of each entry of `words` within `text`.
+
+Matching is a case-sensitive substring search, not a whole-word search, so `"cat"` is
+also counted inside `"concatenate"`.
+
+# Arguments
+- `text::String`: The input text to search through.
+- `words::Vector{String}`: The substrings whose occurrences need to be counted.
+
+# Returns
+- `Vector{Int}`: One count per entry of `words`, in the same order as `words`.
+  An empty entry (`""`) is always reported as `0`.
+
+# Examples
+  ```julia
+  julia> countGivenWords("hello world hello", ["hello", "world"])
+  2-element Vector{Int64}:
+   2
+   1
+
+  julia> countGivenWords("foo bar baz foo", ["foo", "qux"])
+  2-element Vector{Int64}:
+   2
+   0
+  ```
+"""
+function countGivenWords(text::String, words::Vector{String})::Vector{Int}
+  # An empty pattern matches at every position of the text, which is not a meaningful
+  # word count, so it is reported as 0 instead of being searched for.
+  return Int[isempty(word) ? 0 : count(word, text) for word in words]
+end
+
+
+
+"""
+    remove_french_accents(text::AbstractString) -> String
+
+Replace accented letters in `text` with their unaccented ASCII counterparts.
+
+Both lowercase and uppercase accented letters are handled. The typographic apostrophe
+`’` is additionally replaced by the ASCII apostrophe `'`. Every other character is
+copied through unchanged.
+
+Only precomposed characters (a single code point such as `é`) are recognised; a base
+letter followed by a separate combining accent is left as is.
+
+# Arguments
+- `text::AbstractString`: The input string containing French accents.
+
+# Returns
+- `String`: A new string with the accented characters replaced.
+
+# Examples
+  ```julia
+  julia> remove_french_accents("Café")
+  "Cafe"
+
+  julia> remove_french_accents("L'été est beau.")
+  "L'ete est beau."
+
+  julia> remove_french_accents("À l'École")
+  "A l'Ecole"
+  ```
+"""
+function remove_french_accents(text::AbstractString)::String
+  # Map each accented character to its plain replacement (lowercase and uppercase)
+  accented_to_regular = Dict{Char, Char}(
+    'à' => 'a', 'â' => 'a', 'ä' => 'a', 'á' => 'a',
+    'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
+    'î' => 'i', 'ï' => 'i', 'í' => 'i',
+    'ñ' => 'n',
+    'ô' => 'o', 'ö' => 'o', 'ò' => 'o', 'ó' => 'o',
+    'ù' => 'u', 'û' => 'u', 'ü' => 'u',
+    'ÿ' => 'y',
+    'ç' => 'c',
+    'À' => 'A', 'Â' => 'A', 'Ä' => 'A', 'Á' => 'A',
+    'É' => 'E', 'È' => 'E', 'Ê' => 'E', 'Ë' => 'E',
+    'Î' => 'I', 'Ï' => 'I', 'Í' => 'I',
+    'Ñ' => 'N',
+    'Ô' => 'O', 'Ö' => 'O', 'Ò' => 'O', 'Ó' => 'O',
+    'Ù' => 'U', 'Û' => 'U', 'Ü' => 'U',
+    'Ÿ' => 'Y',
+    'Ç' => 'C',
+    '’' => '\'',
+  )
+
+  # Characters without an entry in the table are kept as they are
+  return map(char -> get(accented_to_regular, char, char), text)
+end
+
+
 function countGivenWords(text::String, words::Vector{String})::Vector{Int}
  count = []

@@ -814,13 +913,6 @@ end



-
-
-
-
-
-
-