import json
import re
import xml.etree.ElementTree as ET
from collections import defaultdict


def read_xml(file):
    """
    Read an XML file of the Greek LSJ dictionary
    and return a dictionary mapping each lemma to its entry information.
    """
    tree = ET.parse(file)
    root = tree.getroot()

    xml_info = defaultdict(dict)
    for entry in root.findall('.//entryFree'):
        entry_info = extract_entry_info(entry)
        xml_info[entry_info['lemma']] = entry_info

    return xml_info
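
# Illustrative sketch (not from the original source): given one of the
# Perseus LSJ volumes, read_xml returns a mapping shaped like
#
#   entries = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng1.xml")
#   entries[lemma]  ->  {'lemma': ..., 'orthographies': [...],
#                        'definitions': {'tr': ..., 'text': ...}}
#
# where 'text' carries the [SENSE_SEPARATOR][level=...] markers that
# format_text consumes below.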


def extract_entry_info(entry):
    """
    Extract the lemma, orthographies and definitions
    from an entry in the LSJ dictionary.
    """
    # Lemma: the entry's 'key' attribute with digits stripped
    # (homograph keys carry trailing numbers)
    lemma = ''.join([i for i in entry.get('key') if not i.isdigit()])

    # All orthographic variants listed for the entry
    orthographies = [orth.text for orth in entry.findall('orth')]

    # Plain running text of the whole entry, stored under 'tr'
    definition = ' '.join(entry.itertext()).strip()

    # Marked-up text with sense separators, cleaned up for display
    text = get_all_text(entry)
    cleaned_text = prettify_text(text)

    definitions = {'tr': definition, 'text': cleaned_text}

    return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions}
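
# Doctest-style illustration on a hypothetical minimal entry:
#
#   >>> e = ET.fromstring('<entryFree key="abc1"><orth>abc</orth></entryFree>')
#   >>> info = extract_entry_info(e)
#   >>> info['lemma'], info['orthographies']
#   ('abc', ['abc'])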


def get_all_text(element):
    """
    Recursively collect text from an element and all of its descendants,
    inserting a [SENSE_SEPARATOR][level=...] marker before every 'sense'
    element and wrapping translations inside senses in <tr> tags.
    """
    text = (element.text or "")
    for child in element:
        if child.tag == 'sense':
            # Mark the start of a new sense, keeping its level ('n' attribute)
            level = child.get('n')
            text += f"[SENSE_SEPARATOR][level={level}]\n\n"
        elif child.tag == 'tr' and element.tag == 'sense':
            if child.text is not None:
                text += f"<tr>{child.text.strip()}</tr>\n"
            # Skip further recursion for this child since its text
            # has already been handled
            text += (child.tail or "") + " "
            continue
        text += get_all_text(child) + " "
        text += (child.tail or "") + " "
    return text
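
# Sketch of the marker format this produces (illustrative values): an entry
# with a level-II sense containing a translation yields text roughly like
#
#   'λέγω ... [SENSE_SEPARATOR][level=II]\n\n <tr>speak</tr>\n ...'
#
# prettify_text and format_text below consume exactly these markers.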


def get_descendants_text(element):
    """
    Get all the text of the descendants of a given element, separating
    every 'sense' element. (No longer used; get_all_text replaced it.)
    """
    text = ""
    for child in element:
        if child.tag == 'sense':
            # Add a separator before each 'sense' element
            text += "[SENSE_SEPARATOR]\n\n"
        if child.tag == 'tr' and element.tag == 'sense':
            # Add <tr> tags around text inside 'tr' tags within 'sense' tags
            if child.text is not None:
                text += f"<tr>{child.text.strip()}</tr>\n"
            text += child.tail or ""  # tail may be None
        else:
            if child.get('n') and len(child.get('n')) <= 2:
                text += f"{child.get('n')}. "
            text += child.text or ""
            text += get_descendants_text(child)
    return text


def prettify_text(text):
    """
    Prettify the text of the definitions into a readable format,
    collapsing the whitespace inside each [SENSE_SEPARATOR]-delimited part.
    """
    # Split text using the sense separator
    parts = text.split("[SENSE_SEPARATOR]")

    # Prettify each part separately
    prettified_parts = []
    for part in parts:
        # Remove leading and trailing whitespace and join lines with a space
        cleaned_part = ' '.join(line.strip() for line in part.split('\n') if line.strip())
        prettified_parts.append(cleaned_part)

    # Join prettified parts using sense separator and newline
    prettified_text = "\n\n[SENSE_SEPARATOR] ".join(prettified_parts)

    return prettified_text
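
# Doctest-style illustration on hypothetical input:
#
#   >>> prettify_text("foo\n bar [SENSE_SEPARATOR]\nbaz\n qux")
#   'foo bar\n\n[SENSE_SEPARATOR] baz qux'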


def full_dictionary():
    """
    Return the full LSJ dictionary, merged from all 27 Perseus XML files.
    """
    merged_info = {}
    for i in range(1, 28):  # eng1 to eng27
        file = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml"
        xml_info = read_xml(file)
        for lemma, info in xml_info.items():
            # Merge entries, assuming each lemma is unique across all files
            merged_info.setdefault(lemma, {}).update(info)
    return merged_info
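
# Usage sketch (assumes the LSJ_GreekUnicode/ files from Perseus are present
# relative to the working directory):
#
#   lsj = full_dictionary()
#   lsj[lemma]['definitions']['text']   # marked-up definition text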


def format_text(data):
    """
    Modify the definition text into the desired HTML template.
    """
    text = data['definitions']['text']

    # Change <tr> tags to styled spans
    text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")

    # Normalise spacing around punctuation
    pattern = r"\s+([,;:.()\"\'\[\]])\s+"
    text = re.sub(pattern, r"\1 ", text)
    pattern_2 = r"(\S)([,;:.()\"\'\[\]])(\S)"
    text = re.sub(pattern_2, r"\1\2 \3", text)

    formatted_text = []

    # Sense-level indicators; the check order below matters, since "I"
    # appears in both the primary and the secondary list
    primary_indicators = [
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J"
    ]
    secondary_indicators = [
        "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"
    ]
    tertiary_indicators = [
        "2", "3", "4", "5", "6", "7", "8", "9", "10"
    ]
    quaternary_indicators = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
        "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
        "u", "v", "w", "x", "y", "z"
    ]

    # Everything before the first sense separator is the entry header
    header = text.split("\n\n[SENSE_SEPARATOR]")[0]
    formatted_text.append(header)

    for text_part in text.split("[SENSE_SEPARATOR]")[1:]:
        level = text_part.split("level=")[1].split("]")[0]
        text_part = text_part.replace(f"[level={level}]", "")

        if level:
            if level == "A":
                formatted_text.append(f"<div class='list-class primary-class'> {text_part} </div>")
            elif level in secondary_indicators:
                formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}. </span> {text_part} </div>")
            elif level in tertiary_indicators:
                formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}. </span> {text_part} </div>")
            elif level in quaternary_indicators:
                formatted_text.append(f"<div class='list-class quaternary-class'> <span class='quaternary-indicator'>{level}. </span> {text_part} </div>")
            elif level in primary_indicators:
                formatted_text.append(f"<div class='list-class primary-class'> <span class='primary-indicator'>{level}. </span> {text_part} </div>")

    return '\n'.join(formatted_text)
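
# Illustrative sketch of the output on hypothetical data: a definition whose
# text is "head\n\n[SENSE_SEPARATOR] [level=II] sense body" comes back
# roughly as
#
#   head
#   <div class='list-class secondary-class'><span class='secondary-indicator'>II. </span>  sense body </div>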


def main():
    # Convert the .xml files into a single .json file
    download = True

    if download:
        merged_info = full_dictionary()

        # Store the merged dictionary as a .json file with pretty printing
        with open("lsj_dict.json", "w", encoding="utf-8") as outfile:
            json.dump(merged_info, outfile, ensure_ascii=False, indent=4)

    # Load the stored dictionary back in
    with open("lsj_dict.json", "r", encoding="utf-8") as infile:
        lemma_dict = json.load(infile)


if __name__ == "__main__":
    main()