" version 0.4 Word and Positional embedding module "
module WPembeddings

using Embeddings
using JSON3
using Redis

include("Utils.jl")

export get_word_embedding, get_positional_embedding, wp_embedding

#----------------------------------------------------------------------------------------------
# User settings for the GloVe word embedding.
# Embedding file download: https://nlp.stanford.edu/projects/glove/
GloVe_embedding_filepath = "C:\\myWork\\my_projects\\AI\\NLP\\my_NLP\\glove.840B.300d.txt"
max_GloVe_vocab_size = 0   # 0 = don't load, an Integer (e.g. 10000) = partial vocab, "all" = full vocab
#----------------------------------------------------------------------------------------------

# Load the GloVe embedding table once at module load time.
# NOTE(review): when max_GloVe_vocab_size == 0 nothing is loaded, so `embtable` and
# `get_word_index` stay undefined and get_word_embedding() will throw UndefVarError —
# this mirrors the original behavior and is presumably used to skip the slow load
# while developing; confirm before relying on it.
if max_GloVe_vocab_size == 0
    # don't load vocab
elseif max_GloVe_vocab_size != "all"
    @time const embtable = Embeddings.load_embeddings(GloVe{:en}, GloVe_embedding_filepath,
                                                      max_vocab_size=max_GloVe_vocab_size)
    const get_word_index = Dict(word => ii for (ii, word) in enumerate(embtable.vocab))
else
    @time const embtable = Embeddings.load_embeddings(GloVe{:en}, GloVe_embedding_filepath)
    const get_word_index = Dict(word => ii for (ii, word) in enumerate(embtable.vocab))
end

"""
    get_word_embedding(word::String)

Return the embedding vector of `word` (a column of the GloVe table).
Its dimension depends on the GloVe file used (300 for glove.840B.300d).

Throws `KeyError` when `word` is not in the loaded vocabulary, and
`UndefVarError` when no vocabulary was loaded (max_GloVe_vocab_size == 0).

# Example
    we_vector = get_word_embedding("blue")
"""
function get_word_embedding(word::String)
    index = get_word_index[word]
    return embtable.embeddings[:, index]
end

"""
    get_positional_embedding(total_word_position::Integer,
                             word_embedding_dimension::Integer=300;
                             base::Real=10)

Return a positional-embedding matrix of size
`word_embedding_dimension × total_word_position`.

Even dimensions use `cos`, odd dimensions use `sin`, with wavelength
`base^(2i/d)`. NOTE(review): the original Transformer paper
("Attention Is All You Need") uses `base = 10000`; this module has always
used `base = 10`, which is kept as the default for backward compatibility —
pass `base=10000` for the textbook formulation.

# Example
    pe_matrix = get_positional_embedding(length(content), 300)
"""
function get_positional_embedding(total_word_position::Integer,
                                  word_embedding_dimension::Integer=300;
                                  base::Real=10)
    d = word_embedding_dimension
    p = total_word_position
    return [iseven(i) ? cos(j / base^(2i / d)) : sin(j / base^(2i / d)) for i in 1:d, j in 1:p]
end

"""
    wp_embedding(tokenized_word::Array{String}, positional_embedding::Bool=false)

Word embedding, optionally with positional embedding added on top.

`tokenized_word` is a sentence's tokenized words (sentence in the BERT sense,
not the English one: one BERT sentence can span 20+ English sentences).

Throws `ArgumentError` when `tokenized_word` is empty.
"""
function wp_embedding(tokenized_word::Array{String}, positional_embedding::Bool=false)
    isempty(tokenized_word) && throw(ArgumentError("tokenized_word must not be empty"))
    # Concatenate the per-word column vectors into a d × n matrix in one pass
    # (avoids the O(n^2) incremental-hcat loop of the previous version).
    we_matrix = reduce(hcat, (get_word_embedding(w) for w in tokenized_word))
    if positional_embedding
        # Use the actual embedding dimension instead of hard-coding 300, so the
        # positional matrix always matches the word-embedding matrix.
        pe_matrix = get_positional_embedding(length(tokenized_word), size(we_matrix, 1))
        return we_matrix + pe_matrix
    end
    return we_matrix
end

"""
    wp_query_send(tokenized_word::Array{String}, positional_embedding::Bool=false)

Serialize `tokenized_word` and `positional_embedding` into a JSON3 String
to be sent to the GloVe docker server.
"""
function wp_query_send(tokenized_word::Array{String}, positional_embedding::Bool=false)
    d = Dict("tokenized_word" => tokenized_word, "positional_embedding" => positional_embedding)
    return JSON3.write(d)
end

"""
    wp_query_receive(json3_str::String)

Used inside word_embedding_server to receive a word-embedding job:
convert a JSON3 String back into `(tokenized_word, positional_embedding)`.
"""
function wp_query_receive(json3_str::String)
    d = JSON3.read(json3_str)
    tokenized_word = Array(d.tokenized_word)
    positional_embedding = d.positional_embedding
    return tokenized_word, positional_embedding
end

"""
    query_wp_server(query; host="0.0.0.0", port=6379,
                    publish_channel="word_embedding_server/input",
                    positional_encoding=true)

Send tokenized words to word_embedding_server over Redis pub/sub and return
the embedded words. Blocks until the server replies.

# Example
    WPembeddings.query_wp_server(tokenized_word)
"""
function query_wp_server(query; host="0.0.0.0", port=6379,
                         publish_channel="word_embedding_server/input",
                         positional_encoding=true)
    # Channel used to hand the decoded reply from the subscription callback
    # back to this task.
    reply_channel = Channel(10)
    wp_receive(x) = put!(reply_channel, Utils.JSON3_str_to_Array(x))

    conn = Redis.RedisConnection(host=host, port=port)
    try
        sub = Redis.open_subscription(conn)
        Redis.subscribe(sub, "word_embedding_server/output", wp_receive)

        # Ask word_embedding_server for the word embedding.
        json_query = wp_query_send(query, positional_encoding)
        Redis.publish(conn, publish_channel, json_query)

        # take! blocks until the server responds (the separate wait() call of
        # the previous version was redundant).
        return take!(reply_channel)
    finally
        # Always release the connection, even when the subscription, publish,
        # or decode throws — the previous version leaked it on error.
        Redis.disconnect(conn)
    end
end

end