# Spaces: GroNLP / agalma (Sleeping) -- File size: 2,322 Bytes

import pickle
import gzip
from word2vec import *


def get_unique_words(corpus_filename):
    """
    Collect every distinct whitespace-separated token found in a corpus file.

    The file is opened as UTF-8 text and read line by line. The result is a
    list built from a set, so each token appears once and the order of the
    returned list is not defined.
    """
    with open(corpus_filename, 'r', encoding='utf-8') as corpus:
        # str.split() with no arguments already discards leading/trailing
        # whitespace, so no separate strip() is required.
        vocabulary = {token for line in corpus for token in line.split()}
    return list(vocabulary)


def save_compressed_word_list(words, filename):
    """
    Pickle ``words`` and write the result to ``filename`` through gzip.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten. Nothing is returned.
    """
    with gzip.open(filename, 'wb') as sink:
        pickle.dump(words, sink)
     
        
def load_compressed_word_list(filename):
    """
    Read a gzip-compressed pickle from ``filename`` and return its contents.

    The object is returned exactly as unpickled; no check is made that it is
    actually a list of words.
    """
    with gzip.open(filename, 'rb') as source:
        words = pickle.load(source)
    return words
    

def get_autocomplete(input_word=" ", all_words=" "):
    """
    Return the entries of ``all_words`` that begin with ``input_word``.

    Matching is case-sensitive and the order of ``all_words`` is preserved.
    An empty ``input_word`` matches every entry.
    """
    matches = []
    for candidate in all_words:
        if candidate.startswith(input_word):
            matches.append(candidate)
    return matches


def custom_sort(item):
    """
    Build a sort key that places all-digit strings after every other string.

    Returns a ``(group, text)`` tuple intended for use as ``key=`` in
    ``sorted``/``list.sort``:

    * strings for which ``str.isdigit()`` is true get ``(2, item)``;
    * all other strings get ``(0, item.lower())``, so they are ordered
      case-insensitively.

    Digit strings are compared as text, not as numbers, so ``"10"`` sorts
    before ``"2"``.

    This function has no side effects: a key function is called once per
    element, so it must not write to stdout.
    """
    if item.isdigit():
        return (2, item)  # Place numbers last
    return (0, item.lower())


def order_compressed_list(filename):
    """
    Load a gzip-compressed pickled word list and return it sorted.

    The ordering is delegated to ``custom_sort``, which is used as the sort
    key. A new sorted list is returned; the file on disk is not modified.

    ``filename`` is passed straight to ``gzip.open``.

    NOTE(review): the file is unpickled, so it should only come from a
    trusted source.
    """
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    return sorted(words, key=custom_sort)
        
    
def read_compressed_list(filename):
    """
    Print the object stored in a gzip-compressed pickle file.

    The unpickled contents are written to stdout with ``print``; nothing is
    returned.
    """
    with gzip.open(filename, 'rb') as source:
        contents = pickle.load(source)
        print(contents)


def word_in_models_dict(words_file):
    """
    Map each word in a compressed word list to the models that contain it.

    The word list is unpickled from the gzip file ``words_file``. Every model
    returned by ``load_all_models()`` is then checked: when a word is present
    in the model's ``wv.key_to_index``, the name produced by
    ``convert_model_to_time_name`` for that model is appended to the word's
    list.

    Returns a dict with one key per word; words found in no model map to an
    empty list.
    """
    with gzip.open(words_file, 'rb') as source:
        vocabulary = pickle.load(source)

    available_models = load_all_models()

    # Each word needs its own list object, so build the mapping explicitly.
    occurrences = {}
    for token in vocabulary:
        occurrences[token] = []

    for entry in available_models:
        # Each entry is indexed as [0] -> identifier passed to
        # convert_model_to_time_name, [1] -> object exposing ``.wv``.
        label = convert_model_to_time_name(entry[0])
        for token in vocabulary:
            if token not in entry[1].wv.key_to_index:
                continue
            occurrences[token].append(label)

    return occurrences