import json
import re
import textwrap
from collections import defaultdict
from typing import Dict

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
import torch
import umap
from transformers import AutoTokenizer, AutoModel

st.title('Bible analysis with BERT')
def load_verses() -> Dict[str, str]:
    """Load the verse text for each citation from a tab-separated file."""
    verses = {}
    skipped = 0
    with open('esv.txt', 'r', encoding='utf8') as f:
        for line in f:
            try:
                citation, raw_sentence = line.strip().split('\t')
                verses[citation] = raw_sentence
            except ValueError:
                # Lines that don't split into exactly two fields are skipped
                skipped += 1
    print(skipped, 'malformed lines skipped')
    return verses
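# Hypothetical example of the expected esv.txt layout (the file itself is not
# part of this listing): one verse per line, citation and text separated by a tab, e.g.
#   Matthew 1:1\tThe book of the genealogy of Jesus Christ, the son of David, the son of Abraham.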
def load_tags():
    """Map each Strong's number to the (citation, word position) pairs where it occurs."""
    index = defaultdict(list)
    with open("esv_tags.txt", encoding='utf8') as f:
        for line in f:
            verse, rest = line.split("\t", maxsplit=1)
            tokens = rest.strip().split("\t")
            for t in tokens:
                if "=" in t:
                    words, strongs = t.split("=")
                    # Word positions are "+"-joined 1-based integers; Strong's
                    # numbers are "+"-joined, each wrapped in one delimiter
                    # character on each side (stripped by x[1:-1]).
                    words = [(verse, int(x)) for x in words.split("+")]
                    strongs = [x[1:-1] for x in strongs.split("+")]
                    for s in strongs:
                        index[s].extend(words)
    return index
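# Hypothetical example of the expected esv_tags.txt layout, inferred from the
# parsing above (word positions and delimiter characters are illustrative):
#   John 3:16\t3+4=<0025>\t2=<2316>
# meaning words 3 and 4 of John 3:16 are tagged with Strong's 0025, and word 2 with 2316.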
def get_strong_defs():
    """Load the Strong's number -> definition mapping from JSON."""
    with open("strongs_defs.json", encoding='utf8') as f:
        return json.load(f)
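# Hypothetical shape of strongs_defs.json (the zero-padded keys match the
# multiselect defaults below; the definition text is illustrative):
#   {"0025": "agapao - to love", "5368": "phileo - to be a friend to"}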
def get_word_idx(sent: str, word: str):
    """Return the position of `word` in `sent`, splitting on whitespace and
    punctuation so the index lines up with the tokenizer's word_ids()."""
    tokens = re.split('([ .,!?:;"()\'-])', sent)
    tokens = [x for x in tokens if x != " " and x != ""]
    return tokens.index(word)
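# Example (illustrative): punctuation counts as its own word, so
#   get_word_idx("God is love.", "love") == 2
#   get_word_idx("God is love.", ".") == 3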
def get_embedding(sent, word, layers=None):
    """Get a contextual vector for `word` by tokenizing `sent`, finding all
    subword tokens that make up the word, and averaging their hidden states
    (summed over the requested layers)."""
    layers = [-4, -3, -2, -1] if layers is None else layers
    tokenizer, model = get_models()
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
    idx = get_word_idx(sent, word)
    # Get all token positions that belong to the word of interest
    token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
    with torch.no_grad():
        output = model(**encoded)
    # All hidden states, one tensor per layer
    states = output.hidden_states
    # Stack and sum the requested layers
    output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
    # Only keep the tokens that constitute the requested word
    word_tokens_output = output[token_ids_word]
    return word_tokens_output.mean(dim=0).numpy()
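# Example (illustrative): with bert-base-cased the result is a 768-dimensional vector.
#   vec = get_embedding("God is love.", "love")  # vec.shape == (768,)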
verses = load_verses()
strongs_tags = load_tags()
strongs_defs = get_strong_defs()
st.text('Loaded {} verses'.format(len(verses)))
st.text('Loaded {} tags'.format(len(strongs_tags)))
# Collect the book names, in file order, from the verse citations
books = []
for k in verses:
    book = k[:k.index(" ", 2)]
    if book not in books:
        books.append(book)
print(books)
all_defs = {k: f"{k} - {strongs_defs[k]}" for k in strongs_defs}

def format_strong(number):
    return f"{number} - {strongs_defs[number]}"
# Cache the model across Streamlit reruns so it isn't reloaded for every
# embedding; @st.cache_resource assumes Streamlit >= 1.18 (older versions used st.cache).
@st.cache_resource
def get_models():
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()
    return tokenizer, model
def get_all_embeddings(greek_words):
    """Compute an embedding for every English word tagged with one of the given
    Strong's numbers; `greek_words` maps a concept label to a list of numbers."""
    embeddings = []
    for concept in greek_words:
        for number in greek_words[concept]:
            if number in strongs_tags:
                for verse, idx in strongs_tags[number]:
                    if verse in verses:
                        text = verses[verse]
                        words = [x for x in re.split('([ \'])', text) if x != " " and x != "" and x != "'"]
                        if len(words) < idx:
                            # Tagged position is beyond the verse's word count
                            continue
                        ew = words[idx - 1].strip(",.!?;:()\"'-")
                        # Derive the book name from the citation, as in the `books` loop above
                        book = verse[:verse.index(" ", 2)]
                        emb = get_embedding(text, ew)
                        embeddings.append((emb, f"{verse} {text}", concept, book))
    return embeddings
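# Illustrative shape of one entry in the returned list (values are mock-ups,
# not actual program output):
#   (np.ndarray of shape (768,), "John 3:16 For God so loved the world, ...",
#    "concept1", "John")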
def get_book_type(idx):
    """Group a New Testament book index (its position in `books`) into a genre."""
    if idx < 4:
        return 'Gospels'
    if idx == 4:
        return 'Acts'
    if idx < 19:
        return 'Pauline letters'
    if idx < 26:
        return 'Short letters'
    return 'Revelation'
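# Example (assuming `books` holds the 27 New Testament books in canonical order):
#   get_book_type(books.index('John')) == 'Gospels'
#   get_book_type(books.index('Acts')) == 'Acts'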
st.markdown("""
This app is a demo of using BERT to analyze the Greek New Testament. It lets you pick two
clusters of Greek words (identified by their Strong's numbers) and compare their embeddings.
To use it, select the words for the first cluster (e.g. G0025 and G0026, which are
forms of agape), then the words for the second cluster (e.g. G5368, G5360 and G5363,
which are forms of phileo), and hit Submit.

For an explanation of what's going on here, you can read my [post](https://rolisz.com/analyzing-the-bible-with-bert-models/),
where I compare the words soul and spirit and the words agape and phileo.
""")
with st.form("my_form"):
    option1 = st.multiselect("Select Strong's numbers for the first concept", all_defs.keys(),
                             ['0025', '0026'], format_func=format_strong)
    option2 = st.multiselect("Select Strong's numbers for the second concept", all_defs.keys(),
                             ["5368", "5360", "5363", "5362", "5361", "5366", "5377"],
                             format_func=format_strong)
    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
if submitted:
    with st.spinner('Calculating embeddings...'):
        embeddings = get_all_embeddings({"concept1": option1, "concept2": option2})
    with st.spinner('Reducing dimensionality...'):
        # Project the high-dimensional BERT vectors down to 2D with UMAP
        mapper = umap.UMAP().fit([e[0] for e in embeddings])
        ts = mapper.embedding_
    df = pd.DataFrame(
        {"x": ts[:, 0], "y": ts[:, 1],
         "verse": ["<br>".join(textwrap.wrap(e[1], 80)) for e in embeddings],
         "greek word": [e[2] for e in embeddings]})
    fig = px.scatter(df, x="x", y="y", hover_data=['verse'], color="greek word")
    # fig.write_html("book_love.html")
    st.plotly_chart(fig)
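# To run this locally (assuming the file is saved as app.py, with esv.txt,
# esv_tags.txt and strongs_defs.json next to it): streamlit run app.py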