Mark7549 committed on
Commit
51778ca
·
1 Parent(s): 317c2f1

updated autocomplete for nearest neighbours

Browse files
Files changed (3) hide show
  1. app.py +6 -5
  2. autocomplete.py +37 -0
  3. corpora/compass_filtered.pkl.gz +3 -0
app.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  from streamlit_option_menu import option_menu
3
  from word2vec import *
4
  import pandas as pd
 
5
 
6
  st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
7
 
@@ -13,14 +14,14 @@ active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D g
13
  if active_tab == "Nearest neighbours":
14
  st.write("### TO DO: add description of function")
15
  col1, col2 = st.columns(2)
16
- with open('corpora/compass_filtered.txt', 'r') as file:
17
- all_words = file.read().split()
18
- all_words = sorted(all_words)
 
19
 
20
  with st.container():
21
  with col1:
22
- # word = st.text_input("Enter a word", placeholder="πατήρ")
23
- word = st.multiselect("Enter a word", all_words, default=["πατήρ"])
24
 
25
  with col2:
26
  time_slice = st.selectbox("Time slice", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
 
2
  from streamlit_option_menu import option_menu
3
  from word2vec import *
4
  import pandas as pd
5
+ from autocomplete import *
6
 
7
  st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
8
 
 
14
  if active_tab == "Nearest neighbours":
15
  st.write("### TO DO: add description of function")
16
  col1, col2 = st.columns(2)
17
+
18
+ # Load the compressed word list
19
+ compressed_word_list_filename = 'corpora/compass_filtered.pkl.gz'
20
+ all_words = load_compressed_word_list(compressed_word_list_filename)
21
 
22
  with st.container():
23
  with col1:
24
+ word = st.multiselect("Enter a word", all_words)
 
25
 
26
  with col2:
27
  time_slice = st.selectbox("Time slice", ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"])
autocomplete.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import gzip
3
+
4
+
5
def get_unique_words(corpus_filename):
    """
    Get a sorted list of the unique words in a corpus file.

    The file is read as UTF-8 text, one line at a time, and every line is
    split on whitespace.

    Args:
        corpus_filename: Path of the corpus text file to read.

    Returns:
        A list containing each distinct whitespace-separated token exactly
        once, in ascending (code-point) order. An empty file gives [].
    """
    unique_words = set()
    with open(corpus_filename, 'r', encoding='utf-8') as file:
        for line in file:
            # split() with no separator already ignores leading/trailing
            # whitespace, so no separate strip() is needed.
            unique_words.update(line.split())
    # A set of strings iterates in a hash-dependent order that can change
    # from one interpreter run to the next; sorting makes the result
    # reproducible.
    return sorted(unique_words)
15
+
16
+
17
def save_compressed_word_list(words, filename):
    """
    Pickle a collection of words into a gzip-compressed file.

    Args:
        words: The object (normally a list of words) to serialise.
        filename: Destination path; it is opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    with gzip.open(filename, mode='wb') as stream:
        pickler = pickle.Pickler(stream)
        pickler.dump(words)
23
+
24
+
25
def load_compressed_word_list(filename):
    """
    Read back a pickled word list from a gzip-compressed file.

    Args:
        filename: Path of the gzip file holding the pickled object.

    Returns:
        Whatever object was pickled into the file (normally a list of words).
    """
    with gzip.open(filename, mode='rb') as stream:
        word_list = pickle.load(stream)
    return word_list
31
+
32
+
33
def get_autocomplete(input_word=" ", all_words=" "):
    """
    Get a list of words that start with the input word.

    A word matches when it starts with the prefix either as given or after
    both have been converted to Unicode NFC. The second comparison lets a
    prefix typed with combining accents find a word stored with precomposed
    accents (and the other way round), which a raw code-point comparison
    would miss.

    Args:
        input_word: Prefix to look for.
        all_words: Iterable of candidate words.

    Returns:
        The matching words, unchanged and in their original order.
    """
    # Imported locally so the function carries its own dependency.
    import unicodedata

    normalized_prefix = unicodedata.normalize("NFC", input_word)
    return [
        word for word in all_words
        # The raw check keeps every match of the plain startswith test; the
        # normalised check only ever adds matches.
        if word.startswith(input_word)
        or unicodedata.normalize("NFC", word).startswith(normalized_prefix)
    ]
corpora/compass_filtered.pkl.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a02f7e61330045219098cda092f3edb23650decc70d60db6be2de267e8c7a925
3
+ size 163769