|
import xml.etree.ElementTree as ET |
|
from collections import defaultdict |
|
from autocomplete import load_compressed_word_list |
|
import json |
|
import streamlit as st |
|
|
|
def read_xml(file):
    """
    Parse one XML file of the Greek LSJ dictionary.

    Parameters
    ----------
    file : str or file-like object
        Path to (or open handle of) an LSJ XML file containing
        ``entryFree`` elements.

    Returns
    -------
    dict
        Maps each lemma (the entry key with digits stripped) to the info
        dict produced by :func:`extract_entry_info`.

    Notes
    -----
    Homograph entries whose keys differ only by a digit suffix collapse
    to the same lemma; the last entry in file order wins.
    """
    tree = ET.parse(file)
    root = tree.getroot()

    # A plain dict suffices: every key is assigned explicitly below, so the
    # defaultdict(dict) used previously never invoked its factory and only
    # leaked a defaultdict to callers.
    xml_info = {}

    for entry in root.findall('.//entryFree'):
        entry_info = extract_entry_info(entry)
        xml_info[entry_info['lemma']] = entry_info

    return xml_info
|
|
|
|
|
def extract_entry_info(entry):
    """
    Extract lemma, orthographies and definitions from one LSJ
    ``entryFree`` element.

    Parameters
    ----------
    entry : xml.etree.ElementTree.Element
        An ``entryFree`` element; its ``key`` attribute is assumed to be
        present (a missing key would raise TypeError here — TODO confirm
        against the LSJ schema).

    Returns
    -------
    dict
        ``{'lemma': str, 'orthographies': list, 'definitions': {'tr': str, 'text': str}}``
    """
    # Homograph keys carry a digit suffix; strip digits so variants of the
    # same headword share one lemma.
    lemma = ''.join(ch for ch in entry.get('key') if not ch.isdigit())

    # NOTE(review): <orth> elements without text yield None entries;
    # preserved as-is to keep the original output shape.
    orthographies = [orth.text for orth in entry.findall('orth')]

    # 'tr': the raw concatenation of every text fragment in the entry.
    definition = ' '.join(entry.itertext()).strip()

    # 'text': structured, sense-separated text for display.
    cleaned_text = prettify_text(get_descendants_text(entry))

    # The original routed these values through a throwaway
    # defaultdict(dict) keyed by lemma and unpacked it immediately;
    # building the result directly is equivalent and clearer.
    return {
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': {'tr': definition, 'text': cleaned_text},
    }
|
|
|
|
|
|
|
def get_descendants_text(element):
    """
    Recursively collect the text of *element*'s descendants.

    A ``[SENSE_SEPARATOR]`` marker is emitted before every ``sense``
    child, short level indicators (the ``n`` attribute, e.g. ``I``,
    ``2``, ``b``) are rendered as ``"I. "`` prefixes, and the text of a
    ``tr`` child of a ``sense`` element is wrapped in literal
    ``<tr>...</tr>`` markers for later highlighting.

    Parameters
    ----------
    element : xml.etree.ElementTree.Element

    Returns
    -------
    str
    """
    # (The former 60-entry `level_indicators` list was never referenced
    # and has been removed.)
    text = ""
    for child in element:
        if child.tag == 'sense':
            # Mark the sense boundary so prettify_text/format_text can
            # split on it later.
            text += "[SENSE_SEPARATOR]\n\n"
        if child.tag == 'tr' and element.tag == 'sense':
            if child.text is not None:
                text += f"<tr>{child.text.strip()}</tr>\n"
                # Bug fix: Element.tail is None when nothing follows the
                # child, which previously raised TypeError on "+=".
                text += child.tail or ""
        else:
            if child.get('n') and len(child.get('n')) <= 2:
                text += f"{child.get('n')}. "
            text += child.text or ""
            text += get_descendants_text(child)
    return text
|
|
|
|
|
|
|
|
|
def prettify_text(text):
    """
    Collapse each sense block of *text* onto a single line.

    The input is split on the ``[SENSE_SEPARATOR]`` marker; within each
    block, blank lines are dropped and the remaining lines are stripped
    and joined with single spaces. The blocks are then rejoined with the
    marker on its own paragraph.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    sense_blocks = text.split("[SENSE_SEPARATOR]")

    flattened_blocks = [
        ' '.join(
            fragment.strip()
            for fragment in block.split('\n')
            if fragment.strip()
        )
        for block in sense_blocks
    ]

    return "\n\n[SENSE_SEPARATOR] ".join(flattened_blocks)
|
|
|
|
|
def full_dictionary():
    """
    Build the complete LSJ dictionary by merging all 27 per-file XML
    dictionaries into a single lemma -> info mapping.

    Returns
    -------
    dict
        Merged dictionary; when a lemma appears in several files, later
        files update (overwrite) the fields of earlier ones.
    """
    combined = {}
    for index in range(1, 28):
        path = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{index}.xml"
        for lemma, info in read_xml(path).items():
            combined.setdefault(lemma, {}).update(info)
    return combined
|
|
|
|
|
def format_text(data):
    """
    Render the cleaned definition text of an entry as HTML.

    Each ``[SENSE_SEPARATOR]``-delimited sense is wrapped in a ``<div>``
    whose CSS class reflects its level indicator: Roman numerals are
    secondary, digits 2-10 tertiary, lower-case letters quaternary and
    capital letters A-J primary. ``<tr>...</tr>`` markers become
    highlighted ``<span class='tr'>`` elements.

    Parameters
    ----------
    data : dict
        Entry info containing a ``['definitions']['text']`` string.

    Returns
    -------
    str
        Newline-joined HTML divs. Senses without a recognised level
        indicator are omitted (unchanged from the original behaviour).
    """
    text = data['definitions']['text']

    # One ordered pass of markup/punctuation fixes. Order matters: the
    # <tr> rewrite must run before the punctuation spacing, and the
    # cleanup pairs (" ," etc.) must run after the insertion pairs.
    for old, new in (
        ("<tr>", "<span class='tr'> "),
        ("</tr>", "</span>"),
        (",", ", "),
        (";", "; "),
        (":", ": "),
        ("(", " ("),
        (")", ") "),
        ("[", " ["),
        ("]", "] "),
        (" ,", ", "),
        (" ; ", "; "),
        (" : ", ": "),
        (" .", ". "),
    ):
        text = text.replace(old, new)

    primary_indicators = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"}
    secondary_indicators = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"}
    tertiary_indicators = {"2", "3", "4", "5", "6", "7", "8", "9", "10"}
    quaternary_indicators = set("abcdefghijklmnopqrstuvwxyz")

    formatted_text = []
    for text_part in text.split("[SENSE_SEPARATOR]"):
        level = text_part.split(".")[0].strip()
        if not level:
            continue
        # Bug fix: strip only the leading occurrence of the indicator.
        # The old code removed every "I." / "2." / "b." anywhere in the
        # sense text, mangling the definition body.
        text_part = text_part.replace(level + ".", "", 1)
        # Roman numerals are checked before capital letters so that "I"
        # is classified as secondary, matching the original precedence.
        if level in secondary_indicators:
            formatted_text.append(
                f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part} </div>"
            )
        elif level in tertiary_indicators:
            formatted_text.append(
                f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part} </div>"
            )
        elif level in quaternary_indicators:
            formatted_text.append(
                f"<div class='list-class quaternary-class'> <span class='quaternary-indicator'>{level}.</span> {text_part} </div> "
            )
        elif level in primary_indicators:
            formatted_text.append(
                f"<div class='list-class primary-class'> <span class='primary-indicator'>{level}.</span> {text_part} </div>"
            )

    return '\n'.join(formatted_text)
|
|
|
|
|
|
|
def main():
    """
    Optionally rebuild the merged LSJ dictionary JSON file, then run a
    quick smoke test against it.
    """
    # Set to False to skip regenerating lsj_dict.json from the XML files.
    download = True

    if download:
        # Reuse full_dictionary() instead of duplicating its merge loop
        # inline (the previous copy could silently drift out of sync).
        merged_info = full_dictionary()

        with open("lsj_dict.json", "w", encoding="utf-8") as json_file:
            json.dump(merged_info, json_file, ensure_ascii=False, indent=4)

    # 'with' guarantees the handle is closed — the old json.load(open(...))
    # leaked it — and the explicit encoding matches the write above.
    with open("lsj_dict.json", "r", encoding="utf-8") as json_file:
        lemma_dict = json.load(json_file)

    print_test(lemma_dict)
|
|
|
|
|
def print_test(lemma_dict):
    """Smoke test: print the entry stored for a known lemma."""
    sample_entry = lemma_dict["βομβάζω"]
    print(sample_entry)
|
|
|
|
|
|
|
# Run the build/smoke-test pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|