File size: 2,322 Bytes
51778ca cdb0a70 51778ca cdb0a70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import pickle
import gzip
from word2vec import *
def get_unique_words(corpus_filename):
    """
    Collect the distinct whitespace-separated tokens found in a corpus file.

    Args:
        corpus_filename: Path of a UTF-8 text file to scan.

    Returns:
        A list holding each distinct token once. The list is built from a
        set, so its order is arbitrary.
    """
    with open(corpus_filename, 'r', encoding='utf-8') as corpus:
        # split() without arguments already discards leading/trailing
        # whitespace, so blank lines contribute no tokens.
        vocabulary = {token for row in corpus for token in row.split()}
    return list(vocabulary)
def save_compressed_word_list(words, filename):
    """
    Pickle ``words`` and write the result to a gzip-compressed file.

    Args:
        words: The object (normally a list of words) to serialise.
        filename: Destination path; it is opened in binary write mode.
    """
    with gzip.open(filename, 'wb') as archive:
        pickle.dump(words, archive)
def load_compressed_word_list(filename):
    """
    Read back an object that was pickled into a gzip-compressed file.

    Args:
        filename: Path of the gzip file to read.

    Returns:
        Whatever object the pickle stream in the file contains.

    Note:
        Unpickling can run arbitrary code, so only open files you trust.
    """
    with gzip.open(filename, 'rb') as archive:
        stored_words = pickle.load(archive)
    return stored_words
def get_autocomplete(input_word=" ", all_words=" "):
    """
    Return the entries of ``all_words`` that begin with ``input_word``.

    Args:
        input_word: Prefix to look for. Defaults to a single space.
        all_words: Iterable of candidate strings. Defaults to a single
            space, which iterates as one one-character string.

    Returns:
        A list of the matching entries, in the order they were iterated.
    """
    matches = []
    for candidate in all_words:
        if candidate.startswith(input_word):
            matches.append(candidate)
    return matches
def custom_sort(item):
    """
    Build a sort key that orders words case-insensitively with numbers last.

    Intended for use as the ``key=`` argument of ``sorted``/``list.sort``.

    Args:
        item: The string to compute a key for.

    Returns:
        ``(2, item)`` when ``item`` consists only of digit characters,
        otherwise ``(0, item.lower())``. Digit strings are compared as text,
        not by numeric value, so ``"10"`` sorts before ``"9"``.
    """
    if item.isdigit():
        # Rank 2 compares greater than rank 0, pushing numbers to the end.
        # A key function must not print: it is called once per element.
        return (2, item)
    return (0, item.lower())
def order_compressed_list(filename):
    """
    Load a compressed word list and return it sorted with numbers last.

    The file itself is not modified; only a new sorted list is returned.

    Args:
        filename: Path of a gzip file containing a pickled collection of
            words.

    Returns:
        A new list of the words ordered by ``custom_sort``.

    Note:
        Unpickling can run arbitrary code, so only open files you trust.
    """
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)
    # Ordering is delegated entirely to the custom_sort key function.
    return sorted(words, key=custom_sort)
def read_compressed_list(filename):
    """
    Print the object stored in a gzip-compressed pickle file.

    Args:
        filename: Path of the gzip file to read.

    Returns:
        None. The unpickled object is written to standard output.
    """
    with gzip.open(filename, 'rb') as archive:
        contents = pickle.load(archive)
        print(contents)
def word_in_models_dict(words_file):
    """
    Map every word in a compressed word list to the models that contain it.

    Args:
        words_file: Path of a gzip file containing a pickled collection of
            words.

    Returns:
        A dict keyed by word. Each value is a list of the names (as produced
        by ``convert_model_to_time_name``) of the models whose
        ``wv.key_to_index`` contains that word; words found in no model map
        to an empty list.
    """
    with gzip.open(words_file, 'rb') as archive:
        vocabulary = pickle.load(archive)

    all_models = load_all_models()

    # Give every word its own list so appends never share state.
    occurrences = {}
    for token in vocabulary:
        occurrences[token] = []

    # Each entry is indexed as entry[0] (converted to a display name) and
    # entry[1] (the object whose wv.key_to_index is checked for membership).
    for entry in all_models:
        label = convert_model_to_time_name(entry[0])
        for token in vocabulary:
            if token not in entry[1].wv.key_to_index:
                continue
            occurrences[token].append(label)
    return occurrences
|