Spaces: Runtime error
Roland Szabo committed · Commit e710e5f
1 Parent(s): 2d8e6b8
Improve caching

app.py CHANGED

@@ -60,11 +60,12 @@ def get_word_idx(sent: str, word: str):
     return l.index(word)
 
 
-def get_embedding(tokenizer, model, sent, word, layers=None)-> torch.Tensor:
+@st.cache
+def get_embedding(sent, word, layers=None):
     """Get a word vector by first tokenizing the input sentence, getting all token idxs
     that make up the word of interest, and then `get_hidden_states`."""
     layers = [-4, -3, -2, -1] if layers is None else layers
-
+    tokenizer, model = get_models()
     encoded = tokenizer.encode_plus(sent, return_tensors="pt")
 
     idx = get_word_idx(sent, word)
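
The core of this commit: get_embedding is now cached with @st.cache, so its signature drops the tokenizer and model arguments (Streamlit would otherwise try to hash them on every call) and fetches them from the cached get_models() instead. A minimal sketch of that pattern, assuming a plain BERT encoder; names other than get_models are illustrative, not from the app:

    import streamlit as st
    import torch
    from transformers import AutoModel, AutoTokenizer

    @st.cache(allow_output_mutation=True)
    def get_models():
        # Loaded once per session; allow_output_mutation skips re-hashing
        # the heavyweight model object on every rerun.
        tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        model = AutoModel.from_pretrained('bert-base-cased')
        return tokenizer, model

    @st.cache
    def embed_sentence(sent):
        # Only the cheap string argument is hashed for the cache key;
        # the models come from the cached factory above.
        tokenizer, model = get_models()
        encoded = tokenizer.encode_plus(sent, return_tensors="pt")
        with torch.no_grad():
            output = model(**encoded)
        return output.last_hidden_state.mean(dim=1).squeeze().numpy()
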
@@ -81,7 +82,7 @@ def get_embedding(tokenizer, model, sent, word, layers=None)-> torch.Tensor:
     # Only select the tokens that constitute the requested word
     word_tokens_output = output[token_ids_word]
 
-    return word_tokens_output.mean(dim=0)
+    return word_tokens_output.mean(dim=0).numpy()
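
The cached function now returns a NumPy array rather than a torch.Tensor. Plausibly this is because legacy st.cache re-hashes a function's output to detect mutation, and Streamlit's hasher handles NumPy arrays out of the box while a torch tensor would need a custom hash_funcs entry. The conversion itself is cheap:

    import torch

    tokens = torch.randn(4, 768)  # stand-in for the word's token vectors
    vec = tokens.mean(dim=0)      # average the sub-word tokens: shape (768,)
    arr = vec.numpy()             # shares memory with the tensor, no copy
    # Note: a tensor that still tracks gradients needs .detach().numpy()
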
@@ -100,9 +101,15 @@ for k in verses:
 print(books)
 
 all_defs = {k: f"{k} - {strongs_defs[k]}" for k in strongs_defs}
-option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(), ['0025', '0026'])
+
+def format_strong(number):
+    return f"{number} - {strongs_defs[number]}"
+
+option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(), ['0025', '0026'], format_func=format_strong)
 option2 = st.multiselect('Select Strongs numbers for second concept', all_defs.keys(),
-                         ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"])
+                         ["5368", "5360", "5363", "5362", "5361", "5366", "5377"], format_func=format_strong)
+
+
 @st.cache(allow_output_mutation=True)
 def get_models():
     tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
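
format_func only changes what the multiselect displays; the values it returns are still the raw keys, so everything downstream keeps receiving plain Strong's numbers. A small sketch with a made-up two-entry strongs_defs:

    import streamlit as st

    strongs_defs = {"0025": "agapao - to love", "5368": "phileo - to befriend"}

    def format_strong(number):
        return f"{number} - {strongs_defs[number]}"

    # The dropdown displays "0025 - agapao - to love",
    # but `chosen` contains plain keys such as "0025".
    chosen = st.multiselect('Select Strongs numbers', list(strongs_defs.keys()),
                            ['0025'], format_func=format_strong)
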
@@ -113,7 +120,6 @@ def get_models():
 @st.cache
 def get_all_embeddings(greek_words):
     embeddings = []
-    tokenizer, model = get_models()
 
     for word in greek_words:
         for number in greek_words[word]:
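
With model loading moved inside get_embedding, get_all_embeddings no longer touches the models directly, so its own @st.cache only has to hash the greek_words dict. A toy stand-in (the function and its body are illustrative) showing that reruns with an unchanged selection hit the cache:

    import streamlit as st

    @st.cache
    def expensive_lookup(selection):
        # st.cache keys the result on a hash of the dict argument,
        # so this body runs once per distinct selection.
        return [f"{k}:{v}" for k, values in selection.items() for v in values]

    result = expensive_lookup({"concept1": ["0025"], "concept2": ["5368"]})
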
@@ -122,16 +128,15 @@ def get_all_embeddings(greek_words):
         for verse, idx in strongs_tags[number]:
             if verse in verses:
                 text = verses[verse]
-                print(text, idx)
                 words = [x for x in re.split('([ \'])', text) if x != " " and x != "" and x != "'"]
                 if len(words) <= idx - 1:
                     continue
                 ew = words[idx-1].strip(",.!?;:()\"'-")
-
-                emb = get_embedding(tokenizer, model, text, ew)
-                embeddings.append((emb, f"{verse} {text}", ew, book))
+                emb = get_embedding(text, ew)
+                embeddings.append((emb, f"{verse} {text}", gw, book))
     return embeddings
 
+
 def get_book_type(idx):
     if idx < 4:
         return 'Gospels'
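
The inner loop recovers the tagged English word by re-splitting the verse on spaces and apostrophes, guarding against out-of-range indices, and stripping punctuation before embedding the word in its verse context. One detail worth noting: the appended tuple now references gw, which is not defined in the hunks shown here (the loop variable is word); that may relate to the Runtime error status shown at the top of this Space. The word-recovery step in isolation, with an illustrative verse and index:

    import re

    text = "By this all men will know that you are my disciples, if you love one another."
    idx = 14  # 1-based position tagged for the Strong's number (illustrative)

    words = [x for x in re.split("([ '])", text) if x != " " and x != "" and x != "'"]
    if len(words) > idx - 1:
        ew = words[idx - 1].strip(",.!?;:()\"'-")
        print(ew)  # -> love
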
@@ -143,11 +148,7 @@ def get_book_type(idx):
         return 'Short letters'
     return 'Revelation'
 
-greek_words = {
-    "agape": ["0025", "0026"],
-    "phileo": ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"],
-}
-word_list = ["lovers", "loved", "loves", "love", "Love"]
+
 
 embeddings = get_all_embeddings({"concept1": option1, "concept2": option2})
 