import json
import re
import xml.etree.ElementTree as ET
from collections import defaultdict
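
# Note: the paths below assume the Perseus LSJ XML files
# (grc.lsj.perseus-eng1.xml ... grc.lsj.perseus-eng27.xml)
# live in a local LSJ_GreekUnicode/ directory.
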
def read_xml(file):
"""
Read an XML file of the Greek LSJ dictionary
and return a dictionary with the words and their definitions.
"""
tree = ET.parse(file)
root = tree.getroot()
xml_info = defaultdict(dict)
for entry in root.findall('.//entryFree'):
entry_info = extract_entry_info(entry)
xml_info[entry_info['lemma']] = entry_info
return xml_info
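
# Illustrative shape of the mapping read_xml returns ("λόγος" is a
# hypothetical lemma; real keys depend on the XML):
#   {"λόγος": {"lemma": "λόγος",
#              "orthographies": [...],
#              "definitions": {"tr": "...", "text": "..."}}}
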
def extract_entry_info(entry):
    """
    Extract the lemma, orthographies, and definitions
    from an entry in the LSJ dictionary.
    """
    # Save the lemma and strip homograph numbers (e.g. "key2" -> "key")
    lemma = ''.join([i for i in entry.get('key') if not i.isdigit()])
    # Save the orthographic variants listed for the entry
    orthographies = [orth.text for orth in entry.findall('orth')]
    # Flatten all text in the entry as a raw fallback definition
    definition = ' '.join(entry.itertext()).strip()
    # Recursively collect the text, marking senses and translations
    # (get_descendants_text below is an older alternative)
    text = get_all_text(entry)
    cleaned_text = prettify_text(text)
    return {
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': {'tr': definition, 'text': cleaned_text},
    }
def get_all_text(element):
"""Recursively collect text from an element and all its descendants."""
text = (element.text or "")
for child in element:
if child.tag == 'sense':
level = child.get('n')
text += f"[SENSE_SEPARATOR][level={level}]\n\n"
elif child.tag == 'tr' and element.tag == 'sense':
if child.text is not None:
text += f"<tr>{child.text.strip()}</tr>\n"
# Skip further recursion for this child since we are already handling its text
text += (child.tail or "") + " "
continue
text += get_all_text(child) + " "
text += (child.tail or "") + " "
return text
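
# get_all_text flattens an entry while interleaving markers, producing
# strings of the form (illustrative, not actual LSJ content):
#   "... [SENSE_SEPARATOR][level=II]\n\n <tr>some gloss</tr> ..."
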
def get_descendants_text(element):
    """
    Get all the text of the descendants of a given element, separating
    every 'sense' element. Older alternative to get_all_text, kept for
    reference.
    """
text = ""
for child in element:
if child.tag == 'sense':
# Add a separator before each 'sense' element
text += f"[SENSE_SEPARATOR]\n\n"
if child.tag == 'tr' and element.tag == 'sense':
# Add [tr] tags around text inside 'tr' tags within 'sense' tags
if child.text is not None:
text += f"<tr>{child.text.strip()}</tr>\n"
            # Guard against a missing tail (child.tail may be None)
            text += child.tail or ""
else:
if child.get('n') and len(child.get('n')) <= 2:
text += f"{child.get('n')}. "
text += child.text or ""
text += get_descendants_text(child)
return text
def prettify_text(text):
    """
    Prettify the definition text into a readable format: collapse
    whitespace within each sense and keep the [SENSE_SEPARATOR]
    markers between senses. (<tr> tags are added in get_all_text.)
    """
# Split text using the sense separator
parts = text.split("[SENSE_SEPARATOR]")
# Prettify each part separately
prettified_parts = []
for part in parts:
# Remove leading and trailing whitespace and join lines with a space
cleaned_part = ' '.join(line.strip() for line in part.split('\n') if line.strip())
prettified_parts.append(cleaned_part)
# Join prettified parts using sense separator and newline
prettified_text = "\n\n[SENSE_SEPARATOR] ".join(prettified_parts)
return prettified_text
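
# Worked example (hypothetical input):
#   prettify_text("head\n[SENSE_SEPARATOR][level=I]\n\n sense one \n")
#   returns "head\n\n[SENSE_SEPARATOR] [level=I] sense one"
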
def full_dictionary():
"""
Return the full dictionary of the LSJ dictionary.
"""
merged_info = {}
for i in range(1, 28): # eng1 to eng27
file = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml"
xml_info = read_xml(file)
        for lemma, info in xml_info.items():
            # Merge entries; if a lemma recurs across files,
            # later fields overwrite earlier ones
            merged_info.setdefault(lemma, {}).update(info)
return merged_info
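
# Example use ("λόγος" is a hypothetical lemma):
#   lsj = full_dictionary()
#   print(lsj["λόγος"]["definitions"]["text"])
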
def format_text(data):
    """
    Format the definition text as HTML, turning the sense-level
    markers into styled <div> blocks.
    """
text = data['definitions']['text']
    # Replace the <tr> markers with styled spans for display
    text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
    # Drop stray whitespace before punctuation, keeping one space after
    pattern = r"\s+([,;:.()\"\'\[\]])\s+"
    text = re.sub(pattern, r"\1 ", text)
    # Ensure a space follows punctuation squeezed between two characters
    pattern_2 = r"(\S)([,;:.()\"\'\[\]])(\S)"
    text = re.sub(pattern_2, r"\1\2 \3", text)
formatted_text = []
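    # Indicators for the four sense levels. Note that "I" appears in
    # both the primary (A-J) and secondary (Roman numeral) lists; the
    # dispatch below checks secondary before primary, so "I" is always
    # treated as a Roman numeral.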
primary_indicators = [
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"
]
secondary_indicators = [
"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"
]
tertiary_indicators = [
"2", "3", "4", "5", "6", "7", "8", "9", "10"
]
quaternary_indicators = [
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
"k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
"u", "v", "w", "x", "y", "z"
]
header = text.split("\n\n[SENSE_SEPARATOR]")[0]
formatted_text.append(header)
    for text_part in text.split("[SENSE_SEPARATOR]")[1:]:
        # Each sense part begins with a [level=...] marker emitted by get_all_text
        level = text_part.split("level=")[1].split("]")[0]
        text_part = text_part.replace(f"[level={level}]", "")
        if level:
if level == "A":
formatted_text.append(f"<div class='list-class primary-class'> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
elif level in secondary_indicators:
formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}. </span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
elif level in tertiary_indicators:
formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}. </span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
elif level in quaternary_indicators:
formatted_text.append(f"<div class='list-class quaternary-class'> <span class='quaternary-indicator'>{level}. </span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div> ")
elif level in primary_indicators:
formatted_text.append(f"<div class='list-class primary-class'> <span class='primary-indicator'>{level}. </span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
return '\n'.join(formatted_text)
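
# Worked example (hypothetical): a sense part reading
#   "[SENSE_SEPARATOR][level=II]without weight"
# is wrapped as
#   "<div class='list-class secondary-class'><span class='secondary-indicator'>II. </span> without weight </div>"
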
def main():
    # Convert the .xml files into a single .json file
    # TODO: move this into a separate function
    download = True
    if download:
        merged_info = full_dictionary()
        # Store the merged dictionary as a .json file with pretty print
        with open("lsj_dict.json", "w", encoding="utf-8") as file:
            json.dump(merged_info, file, ensure_ascii=False, indent=4)
    # Reload the stored file as a quick sanity check
    with open("lsj_dict.json", "r", encoding="utf-8") as file:
        lemma_dict = json.load(file)
if __name__ == "__main__":
main()