File size: 4,667 Bytes
4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a c7d9cf9 4ea2d3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import xml.etree.ElementTree as ET
from collections import defaultdict
from autocomplete import load_compressed_word_list
import json
def read_xml(file):
    """
    Parse one XML file of the Greek LSJ dictionary.

    Every ``entryFree`` element found below the document root is converted
    with ``extract_entry_info`` and stored under its lemma. If two entries
    share a lemma, the later one replaces the earlier one.

    Returns a ``defaultdict(dict)`` mapping lemma -> entry information.
    """
    entries_by_lemma = defaultdict(dict)
    document_root = ET.parse(file).getroot()
    for entry_node in document_root.findall('.//entryFree'):
        parsed_entry = extract_entry_info(entry_node)
        entries_by_lemma[parsed_entry['lemma']] = parsed_entry
    return entries_by_lemma
def extract_entry_info(entry):
    """
    Collect the information of a single LSJ dictionary entry.

    Returns a dict with three keys:
      - 'lemma': the value of the entry's ``key`` attribute
      - 'orthographies': the text of each direct ``orth`` child
      - 'definitions': a dict holding
          'tr'   -> all text of the entry joined with spaces and stripped
          'text' -> the sense-separated text from ``get_descendants_text``
                    after clean-up by ``prettify_text``
    """
    lemma = entry.get('key')

    # Only direct 'orth' children are considered.
    spellings = [orth_node.text for orth_node in entry.findall('orth')]

    # Every text fragment of the entry, in document order.
    joined_text = ' '.join(entry.itertext()).strip()

    # Sense-aware rendering of the same entry.
    sense_text = prettify_text(get_descendants_text(entry))

    return {
        'lemma': lemma,
        'orthographies': spellings,
        'definitions': {'tr': joined_text, 'text': sense_text},
    }
def get_descendants_text(element):
    """
    Concatenate the text of all descendants of ``element``.

    A "[SENSE_SEPARATOR]" marker followed by a blank line is emitted before
    every 'sense' child. The text of a 'tr' child that sits directly inside
    a 'sense' element is stripped and wrapped as ``<tr>...</tr>`` followed by
    a newline (nothing is emitted for it when it has no text). The text of
    any other child is copied as is. Each child's own descendants and its
    tail text follow. The text of ``element`` itself is not included.
    """
    fragments = []
    parent_is_sense = element.tag == 'sense'
    for node in element:
        if node.tag == 'sense':
            # Mark the start of a new sense.
            fragments.append("[SENSE_SEPARATOR]\n\n")
        if parent_is_sense and node.tag == 'tr':
            # Translation inside a sense: wrap it so it stays recognisable.
            if node.text is not None:
                fragments.append(f"<tr>{node.text.strip()}</tr>\n")
        else:
            fragments.append(node.text or "")
        fragments.append(get_descendants_text(node))
        fragments.append(node.tail or "")
    return "".join(fragments)
def prettify_text(text):
    """
    Turn raw definition text into a readable form.

    The text is split on "[SENSE_SEPARATOR]". Within each piece every line is
    stripped, empty lines are dropped and the remaining lines are joined
    with a single space. The pieces are then joined again with a blank line
    followed by "[SENSE_SEPARATOR] ".
    """
    def _collapse_lines(chunk):
        # Strip each line, skip the empty ones, and put the rest on one line.
        stripped_lines = (line.strip() for line in chunk.split('\n'))
        return ' '.join(line for line in stripped_lines if line)

    raw_chunks = text.split("[SENSE_SEPARATOR]")
    return "\n\n[SENSE_SEPARATOR] ".join(map(_collapse_lines, raw_chunks))
def full_dictionary():
    """
    Build the complete LSJ dictionary from the 27 XML source files.

    Files ``grc.lsj.perseus-eng1.xml`` to ``grc.lsj.perseus-eng27.xml`` in
    the ``LSJ_GreekUnicode`` directory are read in order with ``read_xml``.
    The entries are merged into a single plain dict keyed by lemma; when a
    lemma occurs more than once, later values overwrite earlier ones key by
    key.
    """
    combined = {}
    source_paths = (
        f"LSJ_GreekUnicode/grc.lsj.perseus-eng{number}.xml"
        for number in range(1, 28)  # eng1 to eng27
    )
    for path in source_paths:
        for lemma, entry in read_xml(path).items():
            # Lemmas are expected to be unique across files; update anyway.
            if lemma not in combined:
                combined[lemma] = {}
            combined[lemma].update(entry)
    return combined
def main():
    """
    Build the merged LSJ dictionary and write it to ``lsj_dict.json``.

    The merged dictionary is obtained from ``full_dictionary()``, which reads
    the XML source files and merges their entries by lemma. The result is
    written as UTF-8 JSON, pretty-printed with an indent of 4 and with
    non-ASCII characters kept as they are (``ensure_ascii=False``).
    """
    # Reuse the shared merge routine instead of repeating its loop here.
    merged_info = full_dictionary()

    # Store the merged dictionary as a .json file with pretty print.
    with open("lsj_dict.json", "w", encoding="utf-8") as output_file:
        json.dump(merged_info, output_file, ensure_ascii=False, indent=4)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|