"""Convert the Perseus LSJ Greek lexicon XML files into a single JSON file.

Each ``entryFree`` element of the XML becomes one record holding its lemma,
its orthographies and two renderings of its text (raw and sense-separated).
"""

import json
import xml.etree.ElementTree as ET
from collections import defaultdict

# Not referenced in this module; the import is kept so the module's
# dependencies and exported names stay exactly as they were.
from autocomplete import load_compressed_word_list  # noqa: F401

# Marker inserted in front of every <sense> element so that the flattened
# text can later be split back into individual senses.
SENSE_SEPARATOR = "[SENSE_SEPARATOR]"

# Location and numbering of the lexicon files (eng1 .. eng27).
LSJ_DIRECTORY = "LSJ_GreekUnicode"
LSJ_FIRST_INDEX = 1
LSJ_LAST_INDEX = 27

# File written by main().
OUTPUT_PATH = "lsj_dict.json"


def read_xml(file):
    """
    Read one XML file of the Greek LSJ dictionary.

    Args:
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        A ``defaultdict(dict)`` mapping each entry's lemma (its ``key``
        attribute) to the record built by ``extract_entry_info``. If two
        entries share a lemma, the later one replaces the earlier one.
    """
    root = ET.parse(file).getroot()
    xml_info = defaultdict(dict)
    for entry in root.findall('.//entryFree'):
        entry_info = extract_entry_info(entry)
        xml_info[entry_info['lemma']] = entry_info
    return xml_info


def extract_entry_info(entry):
    """
    Extract information from an entry in the LSJ dictionary.

    Args:
        entry: An ``entryFree`` element.

    Returns:
        A dict with the keys:
          * ``'lemma'``: the entry's ``key`` attribute (``None`` if absent).
          * ``'orthographies'``: the text of each direct ``orth`` child.
          * ``'definitions'``: a dict with ``'tr'`` (all text of the entry
            joined with single spaces and stripped) and ``'text'`` (the
            descendants' text, cleaned up and split per sense).
    """
    lemma = entry.get('key')
    orthographies = [orth.text for orth in entry.findall('orth')]

    # Despite its name, 'tr' holds the full text of the entry, not only the
    # content of its <tr> elements.
    definitions = {
        'tr': ' '.join(entry.itertext()).strip(),
        'text': prettify_text(get_descendants_text(entry)),
    }

    return {
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': definitions,
    }


def get_descendants_text(element):
    """
    Get all the text of the descendants of a given element, separating
    every 'sense' element.

    The text and tail of every descendant are concatenated in document
    order; the text and tail of ``element`` itself are not included.

    Args:
        element: The element whose descendants are flattened.

    Returns:
        The concatenated text, with ``SENSE_SEPARATOR`` followed by a blank
        line inserted in front of the content of each ``sense`` element.
    """
    parts = []
    for child in element:
        if child.tag == 'sense':
            # Add a separator before each 'sense' element
            parts.append(f"{SENSE_SEPARATOR}\n\n")

        if child.tag == 'tr' and element.tag == 'sense':
            # A translation directly inside a sense is stripped and put on
            # its own line; an empty <tr> contributes no text of its own.
            if child.text is not None:
                parts.append(f"{child.text.strip()}\n")
        else:
            parts.append(child.text or "")

        parts.append(get_descendants_text(child))
        parts.append(child.tail or "")

    return "".join(parts)


def prettify_text(text):
    """
    Prettify the text of the definitions into a readable format.

    The text is split on ``SENSE_SEPARATOR``; within each part every line is
    stripped, empty lines are dropped and the remaining lines are joined
    with a single space.

    Args:
        text: Text as produced by ``get_descendants_text``.

    Returns:
        The cleaned parts joined by a blank line, the separator and a space.
    """
    prettified_parts = [
        ' '.join(line.strip() for line in part.split('\n') if line.strip())
        for part in text.split(SENSE_SEPARATOR)
    ]
    return f"\n\n{SENSE_SEPARATOR} ".join(prettified_parts)


def full_dictionary(directory=LSJ_DIRECTORY, first=LSJ_FIRST_INDEX,
                    last=LSJ_LAST_INDEX):
    """
    Return the full dictionary of the LSJ dictionary.

    Args:
        directory: Directory containing the ``grc.lsj.perseus-eng<N>.xml``
            files.
        first: Number of the first file to read.
        last: Number of the last file to read (inclusive).

    Returns:
        A dict mapping each lemma to its record, merged over all files.
    """
    merged_info = {}
    for index in range(first, last + 1):
        path = f"{directory}/grc.lsj.perseus-eng{index}.xml"
        for lemma, info in read_xml(path).items():
            # Merge dictionaries, assuming word is unique across all files;
            # if it is not, fields from the later file overwrite the
            # earlier ones.
            merged_info.setdefault(lemma, {}).update(info)
    return merged_info


def main():
    """Build the full dictionary and store it as pretty-printed JSON."""
    merged_info = full_dictionary()

    # ensure_ascii=False keeps the Greek text readable in the output file.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as output_file:
        json.dump(merged_info, output_file, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()