|
import xml.etree.ElementTree as ET |
|
from collections import defaultdict |
|
from autocomplete import load_compressed_word_list |
|
import json |
|
import streamlit as st |
|
|
|
def read_xml(file):
    """
    Parse one XML file of the Greek LSJ dictionary.

    Parameters
    ----------
    file : str or file-like object
        Path to (or open handle of) an LSJ XML file containing
        ``entryFree`` elements.

    Returns
    -------
    dict
        Maps each lemma (the entry key with digits stripped) to the info
        dict produced by :func:`extract_entry_info`.

    Notes
    -----
    Homograph entries whose keys differ only by a digit suffix collapse
    to the same lemma; the last entry in file order wins.
    """
    tree = ET.parse(file)
    root = tree.getroot()

    # A plain dict suffices: every key is assigned explicitly below, so the
    # defaultdict(dict) used previously never invoked its factory and only
    # leaked a defaultdict to callers.
    xml_info = {}

    for entry in root.findall('.//entryFree'):
        entry_info = extract_entry_info(entry)
        xml_info[entry_info['lemma']] = entry_info

    return xml_info
|
|
|
|
|
def extract_entry_info(entry):
    """
    Extract lemma, orthographies and definitions from one LSJ
    ``entryFree`` element.

    Parameters
    ----------
    entry : xml.etree.ElementTree.Element
        An ``entryFree`` element; its ``key`` attribute is assumed to be
        present (a missing key would raise TypeError here — TODO confirm
        against the LSJ schema).

    Returns
    -------
    dict
        ``{'lemma': str, 'orthographies': list, 'definitions': {'tr': str, 'text': str}}``
    """
    # Homograph keys carry a digit suffix; strip digits so variants of the
    # same headword share one lemma.
    lemma = ''.join(ch for ch in entry.get('key') if not ch.isdigit())

    # NOTE(review): <orth> elements without text yield None entries;
    # preserved as-is to keep the original output shape.
    orthographies = [orth.text for orth in entry.findall('orth')]

    # 'tr': the raw concatenation of every text fragment in the entry.
    definition = ' '.join(entry.itertext()).strip()

    # 'text': structured, sense-separated text for display.
    cleaned_text = prettify_text(get_descendants_text(entry))

    # The original routed these values through a throwaway
    # defaultdict(dict) keyed by lemma and unpacked it immediately;
    # building the result directly is equivalent and clearer.
    return {
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': {'tr': definition, 'text': cleaned_text},
    }
|
|
|
|
|
|
|
def get_descendants_text(element):
    """
    Recursively collect the text of *element*'s descendants.

    A ``[SENSE_SEPARATOR]`` marker is emitted before every ``sense``
    child, short level indicators (the ``n`` attribute, e.g. ``I``,
    ``2``, ``b``) are rendered as ``"I. "`` prefixes, and the text of a
    ``tr`` child of a ``sense`` element is wrapped in literal
    ``<tr>...</tr>`` markers for later highlighting.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element

    Returns
    -------
    str
    """
    # (The former 60-entry `level_indicators` list was never referenced
    # and has been removed.)
    text = ""
    for child in element:
        if child.tag == 'sense':
            # Mark the sense boundary so prettify_text/format_text can
            # split on it later.
            text += "[SENSE_SEPARATOR]\n\n"
        if child.tag == 'tr' and element.tag == 'sense':
            if child.text is not None:
                text += f"<tr>{child.text.strip()}</tr>\n"
                # Bug fix: Element.tail is None when nothing follows the
                # child, which previously raised TypeError on "+=".
                text += child.tail or ""
        else:
            if child.get('n') and len(child.get('n')) <= 2:
                text += f"{child.get('n')}. "
            text += child.text or ""
            text += get_descendants_text(child)
    return text
|
|
|
|
|
|
|
|
|
def prettify_text(text):
    """
    Collapse each sense block of *text* onto a single line.

    The input is split on the ``[SENSE_SEPARATOR]`` marker; within each
    block, blank lines are dropped and the remaining lines are stripped
    and joined with single spaces. The blocks are then rejoined with the
    marker on its own paragraph.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    sense_blocks = text.split("[SENSE_SEPARATOR]")

    flattened_blocks = [
        ' '.join(
            fragment.strip()
            for fragment in block.split('\n')
            if fragment.strip()
        )
        for block in sense_blocks
    ]

    return "\n\n[SENSE_SEPARATOR] ".join(flattened_blocks)
|
|
|
|
|
def full_dictionary():
    """
    Build the complete LSJ dictionary by merging all 27 per-file XML
    dictionaries into a single lemma -> info mapping.

    Returns
    -------
    dict
        Merged dictionary; when a lemma appears in several files, later
        files update (overwrite) the fields of earlier ones.
    """
    combined = {}
    for index in range(1, 28):
        path = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{index}.xml"
        for lemma, info in read_xml(path).items():
            combined.setdefault(lemma, {}).update(info)
    return combined
|
|
|
|
|
def format_text(data):
    """
    Render the cleaned definition text of an entry as HTML.

    Each ``[SENSE_SEPARATOR]``-delimited sense is wrapped in a ``<div>``
    whose CSS class reflects its level indicator: Roman numerals are
    secondary, digits 2-10 tertiary, lower-case letters quaternary and
    capital letters A-J primary. ``<tr>...</tr>`` markers become
    highlighted ``<span class='tr'>`` elements.

    Parameters
    ----------
    data : dict
        Entry info containing a ``['definitions']['text']`` string.

    Returns
    -------
    str
        Newline-joined HTML divs. Senses without a recognised level
        indicator are omitted (unchanged from the original behaviour).
    """
    text = data['definitions']['text']

    # One ordered pass of markup/punctuation fixes. Order matters: the
    # <tr> rewrite must run before the punctuation spacing, and the
    # cleanup pairs (" ," etc.) must run after the insertion pairs.
    for old, new in (
        ("<tr>", "<span class='tr'> "),
        ("</tr>", "</span>"),
        (",", ", "),
        (";", "; "),
        (":", ": "),
        ("(", " ("),
        (")", ") "),
        ("[", " ["),
        ("]", "] "),
        (" ,", ", "),
        (" ; ", "; "),
        (" : ", ": "),
        (" .", ". "),
    ):
        text = text.replace(old, new)

    primary_indicators = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"}
    secondary_indicators = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"}
    tertiary_indicators = {"2", "3", "4", "5", "6", "7", "8", "9", "10"}
    quaternary_indicators = set("abcdefghijklmnopqrstuvwxyz")

    formatted_text = []
    for text_part in text.split("[SENSE_SEPARATOR]"):
        level = text_part.split(".")[0].strip()
        if not level:
            continue
        # Bug fix: strip only the leading occurrence of the indicator.
        # The old code removed every "I." / "2." / "b." anywhere in the
        # sense text, mangling the definition body.
        text_part = text_part.replace(level + ".", "", 1)
        # Roman numerals are checked before capital letters so that "I"
        # is classified as secondary, matching the original precedence.
        if level in secondary_indicators:
            formatted_text.append(
                f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part} </div>"
            )
        elif level in tertiary_indicators:
            formatted_text.append(
                f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part} </div>"
            )
        elif level in quaternary_indicators:
            formatted_text.append(
                f"<div class='list-class quaternary-class'> <span class='quaternary-indicator'>{level}.</span> {text_part} </div> "
            )
        elif level in primary_indicators:
            formatted_text.append(
                f"<div class='list-class primary-class'> <span class='primary-indicator'>{level}.</span> {text_part} </div>"
            )

    return '\n'.join(formatted_text)
|
|
|
|
|
|
|
def main():
    """
    Optionally rebuild the merged LSJ dictionary JSON file, then run a
    quick smoke test against it.
    """
    # Set to False to skip regenerating lsj_dict.json from the XML files.
    download = True

    if download:
        # Reuse full_dictionary() instead of duplicating its merge loop
        # inline (the previous copy could silently drift out of sync).
        merged_info = full_dictionary()

        with open("lsj_dict.json", "w", encoding="utf-8") as json_file:
            json.dump(merged_info, json_file, ensure_ascii=False, indent=4)

    # 'with' guarantees the handle is closed — the old json.load(open(...))
    # leaked it — and the explicit encoding matches the write above.
    with open("lsj_dict.json", "r", encoding="utf-8") as json_file:
        lemma_dict = json.load(json_file)

    print_test(lemma_dict)
|
|
|
|
|
def print_test(lemma_dict):
    """Smoke test: print the entry stored for a known lemma."""
    sample_entry = lemma_dict["βομβάζω"]
    print(sample_entry)
|
|
|
|
|
|
|
# Run the build/smoke-test pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|