Roland Szabo committed on
Commit e710e5f · 1 Parent(s): 2d8e6b8

Improve caching

Files changed (1):
  1. app.py +16 -15
app.py CHANGED
@@ -60,11 +60,12 @@ def get_word_idx(sent: str, word: str):
     return l.index(word)
 
 
-def get_embedding(tokenizer, model, sent, word, layers=None)-> torch.Tensor:
+@st.cache
+def get_embedding(sent, word, layers=None):
     """Get a word vector by first tokenizing the input sentence, getting all token idxs
     that make up the word of interest, and then `get_hidden_states`."""
     layers = [-4, -3, -2, -1] if layers is None else layers
-
+    tokenizer, model = get_models()
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
 
     idx = get_word_idx(sent, word)
@@ -81,7 +82,7 @@ def get_embedding(tokenizer, model, sent, word, layers=None)-> torch.Tensor:
     # Only select the tokens that constitute the requested word
     word_tokens_output = output[token_ids_word]
 
-    return word_tokens_output.mean(dim=0)
+    return word_tokens_output.mean(dim=0).numpy()
 
 
 
@@ -100,9 +101,15 @@ for k in verses:
 print(books)
 
 all_defs = {k: f"{k} - {strongs_defs[k]}" for k in strongs_defs}
-option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(), ['0025', '0026'], format_func=lambda x: strongs_defs[x])
+
+def format_strong(number):
+    return f"{number} - {strongs_defs[number]}"
+
+option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(), ['0025', '0026'], format_func=format_strong)
 option2 = st.multiselect('Select Strongs numbers for second concept', all_defs.keys(),
-                         ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"], format_func=lambda x: strongs_defs[x])
+                         ["5368", "5360", "5363", "5362", "5361", "5366", "5377"], format_func=format_strong)
+
+
 @st.cache(allow_output_mutation=True)
 def get_models():
     tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
@@ -113,7 +120,6 @@ def get_models():
 @st.cache
 def get_all_embeddings(greek_words):
     embeddings = []
-    tokenizer, model = get_models()
 
     for word in greek_words:
         for number in greek_words[word]:
@@ -122,16 +128,15 @@ def get_all_embeddings(greek_words):
             for verse, idx in strongs_tags[number]:
                 if verse in verses:
                     text = verses[verse]
-                    print(text, idx)
                     words = [x for x in re.split('([ \'])', text) if x != " " and x != "" and x != "'"]
                     if len(words) <= idx - 1:
                         continue
                     ew = words[idx-1].strip(",.!?;:()\"'-")
-                    print(ew)
-                    emb = get_embedding(tokenizer, model, text, ew).numpy()
-                    embeddings.append((emb, f"{verse} {text}", ew, book))
+                    emb = get_embedding(text, ew)
+                    embeddings.append((emb, f"{verse} {text}", gw, book))
     return embeddings
 
+
 def get_book_type(idx):
     if idx < 4:
         return 'Gospels'
@@ -143,11 +148,7 @@ def get_book_type(idx):
         return 'Short lettters'
    return 'Revelation'
 
-strongs_numbers = {
-    "agape": ["0025", "0026"],
-    "phileo": ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"],
-}
-word_list = ["lovers", "loved", "loves", "love", "Love"]
+
 
 embeddings = get_all_embeddings({"concept1": option1, "concept2": option2})
 
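A note on the pattern this commit moves to: the legacy @st.cache decorator memoizes a function by hashing its arguments, so get_embedding drops the unhashable tokenizer/model parameters (fetching them from the already-cached get_models() instead) and returns a NumPy array rather than a torch.Tensor so the cached value hashes cleanly. A minimal sketch of that idea, assuming bert-base-cased as in the app; get_word_embedding and word_index are hypothetical names, and summing the last four layers is a common recipe suggested by the layers=[-4, -3, -2, -1] default rather than a detail visible in this diff:

import numpy as np
import streamlit as st
import torch
from transformers import AutoModel, AutoTokenizer


@st.cache(allow_output_mutation=True)
def get_models():
    # Loaded once; allow_output_mutation=True stops st.cache from
    # trying to hash the returned (mutable, unhashable) model objects.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    model = AutoModel.from_pretrained("bert-base-cased", output_hidden_states=True)
    return tokenizer, model


@st.cache
def get_word_embedding(sent: str, word_index: int) -> np.ndarray:
    # Hypothetical stand-in for the app's get_embedding: the cache key is
    # built only from the hashable str/int arguments; the models come from
    # the cached loader rather than from the signature.
    tokenizer, model = get_models()
    encoded = tokenizer(sent, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).hidden_states
    # Combine the last four layers (cf. the app's layers=[-4, -3, -2, -1]).
    output = torch.stack(hidden_states[-4:]).sum(dim=0).squeeze(0)
    # Average the sub-word tokens belonging to the requested word
    # (word_ids() requires a fast tokenizer, which AutoTokenizer gives here).
    token_ids = [i for i, w in enumerate(encoded.word_ids()) if w == word_index]
    # Return NumPy, not torch.Tensor, so st.cache can hash and store it.
    return output[token_ids].mean(dim=0).numpy()

With this split, a verse/word pair that recurs under several Strong's numbers hits the embedding cache instead of re-running BERT, which is the "Improve caching" the commit message refers to.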