import xml.etree.ElementTree as ET from collections import defaultdict from autocomplete import load_compressed_word_list import json import streamlit as st import re def read_xml(file): """ Read an XML file of the Greek LSJ dictionary and return a dictionary with the words and their definitions. """ tree = ET.parse(file) root = tree.getroot() xml_info = defaultdict(dict) for entry in root.findall('.//entryFree'): entry_info = extract_entry_info(entry) xml_info[entry_info['lemma']] = entry_info return xml_info def extract_entry_info(entry): """ Extract information from an entry in the LSJ dictionary. """ definitions = defaultdict(dict) # Save the lemma in the dictionary and remove digits lemma = ''.join([i for i in entry.get('key') if not i.isdigit()]) # Save the orthographies in the dictionary orthographies = [orth.text for orth in entry.findall('orth')] definitions[lemma]['orthographies'] = orthographies # Check if there is a tr element with a definition definition = ' '.join(entry.itertext()).strip() definitions[lemma]['definitions'] = {'tr': definition} # text = get_descendants_text(entry) text = get_all_text(entry) cleaned_text = prettify_text(text) definitions[lemma]['definitions']['text'] = cleaned_text return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']} def get_all_text(element): """Recursively collect text from an element and all its descendants.""" text = (element.text or "") for child in element: if child.tag == 'sense': level = child.get('n') text += f"[SENSE_SEPARATOR][level={level}]\n\n" elif child.tag == 'tr' and element.tag == 'sense': if child.text is not None: text += f"{child.text.strip()}\n" # Skip further recursion for this child since we are already handling its text text += (child.tail or "") + " " continue text += get_all_text(child) + " " text += (child.tail or "") + " " return text def get_descendants_text(element): """ Get all the text of the descendants of a given element, separating every 'sense' element. """ text = "" for child in element: if child.tag == 'sense': # Add a separator before each 'sense' element text += f"[SENSE_SEPARATOR]\n\n" if child.tag == 'tr' and element.tag == 'sense': # Add [tr] tags around text inside 'tr' tags within 'sense' tags if child.text is not None: text += f"{child.text.strip()}\n" text += child.tail else: if child.get('n') and len(child.get('n')) <= 2: text += f"{child.get('n')}. " text += child.text or "" text += get_descendants_text(child) return text def prettify_text(text): """ Prettify the text of the definitions into a readable format, adding [tr] tags to text inside 'tr' tags within 'sense' tags. """ # Split text using the sense separator parts = text.split("[SENSE_SEPARATOR]") # Prettify each part separately prettified_parts = [] for part in parts: # Remove leading and trailing whitespace and join lines with a space cleaned_part = ' '.join(line.strip() for line in part.split('\n') if line.strip()) prettified_parts.append(cleaned_part) # Join prettified parts using sense separator and newline prettified_text = "\n\n[SENSE_SEPARATOR] ".join(prettified_parts) return prettified_text def full_dictionary(): """ Return the full dictionary of the LSJ dictionary. """ merged_info = {} for i in range(1, 28): # eng1 to eng27 file = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml" xml_info = read_xml(file) for lemma, info in xml_info.items(): # Merge dictionaries, assuming word is unique across all files merged_info.setdefault(lemma, {}).update(info) return merged_info def format_text(data): """ Modify text to desired template """ text = data['definitions']['text'] # Change tags to bold text = text.replace("", " ").replace("", "") pattern = r"\s+([,;:.()\"\'\[\]])\s+" text = re.sub(pattern, r"\1 ", text) pattern_2 = r"(\S)([,;:.()\"\'\[\]])(\S)" text = re.sub(pattern_2, r"\1\2 \3", text) # .replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ") formatted_text = [] primary_indicators = [ "A", "B", "C", "D", "E", "F", "G", "H", "I", "J" ] secondary_indicators = [ "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X" ] tertiary_indicators = [ "2", "3", "4", "5", "6", "7", "8", "9", "10" ] quaternary_indicators = [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" ] header = text.split("\n\n[SENSE_SEPARATOR]")[0] formatted_text.append(header) for text_part in text.split("[SENSE_SEPARATOR]")[1:]: level = text_part.split("level=")[1].split("]")[0] text_part = text_part.replace(f"[level={level}]", "") if level: if level == "A": formatted_text.append(f"
{text_part.replace('[SENSE_SEPARATOR]', '')}
") elif level in secondary_indicators: formatted_text.append(f"
{level}. {text_part.replace('[SENSE_SEPARATOR]', '')}
") elif level in tertiary_indicators: formatted_text.append(f"
{level}. {text_part.replace('[SENSE_SEPARATOR]', '')}
") elif level in quaternary_indicators: formatted_text.append(f"
{level}. {text_part.replace('[SENSE_SEPARATOR]', '')}
") elif level in primary_indicators: formatted_text.append(f"
{level}. {text_part.replace('[SENSE_SEPARATOR]', '')}
") return '\n'.join(formatted_text) def main(): # This code is used to convert the .xml files into a .json file # TO DO: Make seperate function download = True if download is True: merged_info = {} for i in range(1, 28): # eng1 to eng27 file = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml" xml_info = read_xml(file) for word, info in xml_info.items(): # Merge dictionaries, assuming word is unique across all files merged_info.setdefault(word, {}).update(info) # Store merged dictionaries as .json file with pretty print with open("lsj_dict.json", "w", encoding="utf-8") as file: json.dump(merged_info, file, ensure_ascii=False, indent=4) lemma_dict = json.load(open('lsj_dict.json', 'r')) if __name__ == "__main__": main()