Spaces:

GroNLP
/

agalma

Sleeping

App Files Files Community

Mark7549 committed on May 4, 2024

Commit

c7d9cf9

1 Parent(s): 9ce66d0

Add dictionary functions

Browse files

Files changed (2) hide show

app.py +13 -1
lsj_dict.py +90 -19

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ from autocomplete import *
 from vector_graph import *
 from plots import *
 from lsj_dict import *
 st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
@@ -13,6 +15,9 @@ st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
 active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
     menu_icon="cast", default_index=0, orientation="horizontal")
 # Nearest neighbours tab
 if active_tab == "Nearest neighbours":
     st.write("### TO DO: add description of function")
@@ -150,14 +155,21 @@ elif active_tab == "3D graph":
             st.plotly_chart(fig)
 # Dictionary tab
 elif active_tab == "Dictionary":
     with st.container():
         all_lemmas = load_compressed_word_list('all_lemmas.pkl.gz')
         query_word = st.multiselect("Search a word in the LSJ dictionary", all_lemmas, max_selections=1)

 from vector_graph import *
 from plots import *
 from lsj_dict import *
+import json
 st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
 active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
     menu_icon="cast", default_index=0, orientation="horizontal")
# Prepare dictionary
# The JSON file is written as UTF-8 with non-ASCII characters kept verbatim,
# so read it back with an explicit encoding, and close the handle when done.
with open('lsj_dict.json', 'r', encoding='utf-8') as lsj_file:
    lemma_dict = json.load(lsj_file)
 # Nearest neighbours tab
 if active_tab == "Nearest neighbours":
     st.write("### TO DO: add description of function")
             st.plotly_chart(fig)
 # Dictionary tab
 elif active_tab == "Dictionary":
     with st.container():
         all_lemmas = load_compressed_word_list('all_lemmas.pkl.gz')
         query_word = st.multiselect("Search a word in the LSJ dictionary", all_lemmas, max_selections=1)
+        # If a word has been selected by user
+        if query_word:
+            st.write(f"### {query_word[0]}")
+            st.write(lemma_dict[query_word[0]])

lsj_dict.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from autocomplete import load_compressed_word_list
 def read_xml(file):
@@ -16,7 +17,7 @@ def read_xml(file):
     for entry in root.findall('.//entryFree'):
         entry_info = extract_entry_info(entry)
-        xml_info[entry_info['word']] = entry_info
     return xml_info
@@ -25,46 +26,116 @@ def extract_entry_info(entry):
     """
     Extract information from an entry in the LSJ dictionary.
     """
-    word = entry.find('orth').text
     definitions = defaultdict(dict)
     # Save the lemma in the dictionary
     lemma = entry.get('key')
-    definitions[word]['lemma'] = lemma
     # Save the orthographies in the dictionary
     orthographies = [orth.text for orth in entry.findall('orth')]
-    definitions[word]['orthographies'] = orthographies
     # Check if there is a tr element with a definition
     definition = ' '.join(entry.itertext()).strip()
-    definitions[word]['definitions'] = {'tr': definition}
-    for child in entry:
-        for grandchild in child.iter():
-            tag = grandchild.tag
-            text = grandchild.text or ""
-            tail = grandchild.tail or ""
-            if tag != "tr":  # Avoiding 'tr' tag since it's handled separately
-                definitions[word]['definitions'][tag] = text + tail
-    return {'word': word, 'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[word]['definitions']}
-def main():
     merged_info = {}
     for i in range(1, 28):  # eng1 to eng27
         file = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml"
         xml_info = read_xml(file)
-        for word, info in xml_info.items():
             # Merge dictionaries, assuming word is unique across all files
-            merged_info.setdefault(word, {}).update(info)
-    # Print lemmas from the merged dictionary
-    for word, info in merged_info.items():
-        print(info['lemma'])

 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from autocomplete import load_compressed_word_list
+import json
 def read_xml(file):
     for entry in root.findall('.//entryFree'):
         entry_info = extract_entry_info(entry)
+        xml_info[entry_info['lemma']] = entry_info
     return xml_info
     """
     Extract information from an entry in the LSJ dictionary.
     """
     definitions = defaultdict(dict)
     # Save the lemma in the dictionary
     lemma = entry.get('key')
     # Save the orthographies in the dictionary
     orthographies = [orth.text for orth in entry.findall('orth')]
+    definitions[lemma]['orthographies'] = orthographies
     # Check if there is a tr element with a definition
     definition = ' '.join(entry.itertext()).strip()
+    definitions[lemma]['definitions'] = {'tr': definition}
+    text = get_descendants_text(entry)
+    cleaned_text = prettify_text(text)
+    definitions[lemma]['definitions']['text'] = cleaned_text
+    return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
def get_descendants_text(element):
    """
    Collect the text of all descendants of ``element`` into one string.

    The children of ``element`` are walked recursively, in document order:

    * Before every 'sense' child the marker "[SENSE_SEPARATOR]" followed by
      a blank line is emitted.
    * A 'tr' child that sits directly inside a 'sense' element is emitted as
      ``<tr>stripped text</tr>`` followed by a newline. Only the direct text
      of that 'tr' element is used; elements nested inside it are not
      visited.
    * Any other child contributes its own text followed by the text of its
      descendants.
    * The tail of every child (the text between its end tag and the next
      sibling) is appended after it.

    The ``text`` attribute of ``element`` itself is not included; only its
    children are visited.

    :param element: an ElementTree element (or any object that iterates over
        children exposing ``tag``, ``text`` and ``tail``).
    :return: the concatenated text as a string ("" when there are no children).
    """
    pieces = []
    for child in element:
        if child.tag == 'sense':
            # Add a separator before each 'sense' element
            pieces.append("[SENSE_SEPARATOR]\n\n")
        if child.tag == 'tr' and element.tag == 'sense':
            # Wrap the text of 'tr' elements found directly inside a 'sense'
            if child.text is not None:
                pieces.append(f"<tr>{child.text.strip()}</tr>\n")
        else:
            pieces.append(child.text or "")
            pieces.append(get_descendants_text(child))
        # The tail belongs to the parent's running text, so it must be kept
        # for every child, including marked 'tr' elements; otherwise the words
        # following a translation are silently lost.
        pieces.append(child.tail or "")
    return "".join(pieces)
def prettify_text(text):
    """
    Turn the raw text of an entry into a compact, readable string.

    The text is cut at every "[SENSE_SEPARATOR]" marker. Within each piece,
    every line is stripped of surrounding whitespace, empty lines are dropped
    and the remaining lines are joined with a single space. The pieces are
    then glued back together with a blank line followed by the marker and a
    space.

    :param text: raw text containing zero or more "[SENSE_SEPARATOR]" markers.
    :return: the cleaned-up text.
    """
    def squash(chunk):
        # Collapse a multi-line chunk into one line, skipping blank lines
        stripped_rows = (row.strip() for row in chunk.split('\n'))
        return ' '.join(row for row in stripped_rows if row)

    chunks = text.split("[SENSE_SEPARATOR]")
    return "\n\n[SENSE_SEPARATOR] ".join(map(squash, chunks))
def full_dictionary():
    """
    Build one dictionary out of all 27 LSJ XML files.

    Every file "LSJ_GreekUnicode/grc.lsj.perseus-eng<i>.xml" (i = 1..27) is
    parsed with ``read_xml`` and its entries are merged, keyed by lemma.
    Lemmas are expected to be unique across files; if one does occur more
    than once, the fields of the later entry overwrite those of the earlier.

    :return: dict mapping each lemma to its entry information.
    """
    combined = {}
    paths = (f"LSJ_GreekUnicode/grc.lsj.perseus-eng{n}.xml" for n in range(1, 28))  # eng1 to eng27
    for path in paths:
        for key, entry in read_xml(path).items():
            if key not in combined:
                combined[key] = {}
            combined[key].update(entry)
    return combined
def main():
    """
    Build the merged LSJ dictionary and store it as "lsj_dict.json".

    The merging of the individual XML files is delegated to
    ``full_dictionary`` so that the logic lives in one place only.
    """
    merged_info = full_dictionary()
    # Store merged dictionaries as .json file with pretty print; Greek
    # characters are written verbatim (ensure_ascii=False), hence UTF-8.
    with open("lsj_dict.json", "w", encoding="utf-8") as file:
        json.dump(merged_info, file, ensure_ascii=False, indent=4)