Mark7549 committed on
Commit
c7d9cf9
·
1 Parent(s): 9ce66d0

Add dictionary functions

Browse files
Files changed (2) hide show
  1. app.py +13 -1
  2. lsj_dict.py +90 -19
app.py CHANGED
@@ -6,6 +6,8 @@ from autocomplete import *
6
  from vector_graph import *
7
  from plots import *
8
  from lsj_dict import *
 
 
9
 
10
  st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
11
 
@@ -13,6 +15,9 @@ st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
13
  active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
14
  menu_icon="cast", default_index=0, orientation="horizontal")
15
 
 
 
 
16
  # Nearest neighbours tab
17
  if active_tab == "Nearest neighbours":
18
  st.write("### TO DO: add description of function")
@@ -150,14 +155,21 @@ elif active_tab == "3D graph":
150
  st.plotly_chart(fig)
151
 
152
 
153
-
154
 
155
  # Dictionary tab
156
  elif active_tab == "Dictionary":
 
157
  with st.container():
158
  all_lemmas = load_compressed_word_list('all_lemmas.pkl.gz')
159
 
160
  query_word = st.multiselect("Search a word in the LSJ dictionary", all_lemmas, max_selections=1)
161
 
 
 
 
 
 
 
162
 
163
 
 
6
  from vector_graph import *
7
  from plots import *
8
  from lsj_dict import *
9
+ import json
10
+
11
 
12
  st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")
13
 
 
15
  active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
16
  menu_icon="cast", default_index=0, orientation="horizontal")
17
 
18
# Prepare dictionary: the JSON file is written as UTF-8 with non-ASCII
# characters kept verbatim, so it is read back with an explicit encoding,
# and the file handle is closed as soon as parsing is done.
with open('lsj_dict.json', 'r', encoding='utf-8') as lemma_dict_file:
    lemma_dict = json.load(lemma_dict_file)
20
+
21
  # Nearest neighbours tab
22
  if active_tab == "Nearest neighbours":
23
  st.write("### TO DO: add description of function")
 
155
  st.plotly_chart(fig)
156
 
157
 
158
+
159
 
160
  # Dictionary tab
161
  elif active_tab == "Dictionary":
162
+
163
  with st.container():
164
  all_lemmas = load_compressed_word_list('all_lemmas.pkl.gz')
165
 
166
  query_word = st.multiselect("Search a word in the LSJ dictionary", all_lemmas, max_selections=1)
167
 
168
+ # If a word has been selected by user
169
+ if query_word:
170
+ st.write(f"### {query_word[0]}")
171
+ st.write(lemma_dict[query_word[0]])
172
+
173
+
174
 
175
 
lsj_dict.py CHANGED
@@ -1,6 +1,7 @@
1
  import xml.etree.ElementTree as ET
2
  from collections import defaultdict
3
  from autocomplete import load_compressed_word_list
 
4
 
5
 
6
  def read_xml(file):
@@ -16,7 +17,7 @@ def read_xml(file):
16
  for entry in root.findall('.//entryFree'):
17
  entry_info = extract_entry_info(entry)
18
 
19
- xml_info[entry_info['word']] = entry_info
20
 
21
  return xml_info
22
 
@@ -25,46 +26,116 @@ def extract_entry_info(entry):
25
  """
26
  Extract information from an entry in the LSJ dictionary.
27
  """
28
- word = entry.find('orth').text
29
  definitions = defaultdict(dict)
30
 
31
  # Save the lemma in the dictionary
32
  lemma = entry.get('key')
33
- definitions[word]['lemma'] = lemma
34
 
35
  # Save the orthographies in the dictionary
36
  orthographies = [orth.text for orth in entry.findall('orth')]
37
- definitions[word]['orthographies'] = orthographies
38
 
39
  # Check if there is a tr element with a definition
40
  definition = ' '.join(entry.itertext()).strip()
41
- definitions[word]['definitions'] = {'tr': definition}
42
 
43
- for child in entry:
44
- for grandchild in child.iter():
45
- tag = grandchild.tag
46
- text = grandchild.text or ""
47
- tail = grandchild.tail or ""
48
- if tag != "tr": # Avoiding 'tr' tag since it's handled separately
49
- definitions[word]['definitions'][tag] = text + tail
50
 
51
- return {'word': word, 'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[word]['definitions']}
 
 
 
52
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
 
56
- def main():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  merged_info = {}
58
  for i in range(1, 28): # eng1 to eng27
59
  file = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml"
60
  xml_info = read_xml(file)
61
- for word, info in xml_info.items():
62
  # Merge dictionaries, assuming word is unique across all files
63
- merged_info.setdefault(word, {}).update(info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # Print lemmas from the merged dictionary
66
- for word, info in merged_info.items():
67
- print(info['lemma'])
68
 
69
 
70
 
 
1
  import xml.etree.ElementTree as ET
2
  from collections import defaultdict
3
  from autocomplete import load_compressed_word_list
4
+ import json
5
 
6
 
7
  def read_xml(file):
 
17
  for entry in root.findall('.//entryFree'):
18
  entry_info = extract_entry_info(entry)
19
 
20
+ xml_info[entry_info['lemma']] = entry_info
21
 
22
  return xml_info
23
 
 
26
  """
27
  Extract information from an entry in the LSJ dictionary.
28
  """
 
29
  definitions = defaultdict(dict)
30
 
31
  # Save the lemma in the dictionary
32
  lemma = entry.get('key')
 
33
 
34
  # Save the orthographies in the dictionary
35
  orthographies = [orth.text for orth in entry.findall('orth')]
36
+ definitions[lemma]['orthographies'] = orthographies
37
 
38
  # Check if there is a tr element with a definition
39
  definition = ' '.join(entry.itertext()).strip()
40
+ definitions[lemma]['definitions'] = {'tr': definition}
41
 
42
+ text = get_descendants_text(entry)
43
+ cleaned_text = prettify_text(text)
 
 
 
 
 
44
 
45
+ definitions[lemma]['definitions']['text'] = cleaned_text
46
+
47
+
48
+ return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
49
 
50
 
51
def get_descendants_text(element):
    """
    Collect the text of all descendants of ``element`` in document order.

    A ``[SENSE_SEPARATOR]`` marker followed by a blank line is emitted before
    every ``sense`` child. The text of a ``tr`` element that is a direct child
    of a ``sense`` element is stripped and wrapped in literal ``<tr>...</tr>``
    markup followed by a newline; a ``tr`` without text contributes nothing.
    Every other child contributes its text unchanged.

    The text and tail of ``element`` itself are not included: only the text of
    its children, the text of their descendants, and the children's tails.

    :param element: an ElementTree element (or any object with ``tag`` that
        iterates over children exposing ``tag``, ``text`` and ``tail``).
    :return: the concatenated text as a single string.
    """
    # Fragments are gathered in a list and joined once, instead of growing a
    # string with repeated ``+=``.
    pieces = []
    inside_sense = element.tag == 'sense'
    for child in element:
        if child.tag == 'sense':
            # Mark the start of each 'sense' element
            pieces.append("[SENSE_SEPARATOR]\n\n")
        if child.tag == 'tr' and inside_sense:
            # Wrap the text of 'tr' elements within 'sense' elements in <tr> tags
            if child.text is not None:
                pieces.append(f"<tr>{child.text.strip()}</tr>\n")
        else:
            pieces.append(child.text or "")
        pieces.append(get_descendants_text(child))
        pieces.append(child.tail or "")
    return "".join(pieces)
69
 
70
 
71
def prettify_text(text):
    """
    Normalise the line structure of a definition text, sense by sense.

    The text is split on the ``[SENSE_SEPARATOR]`` marker. Within each part
    every line is stripped of leading and trailing whitespace, empty lines are
    dropped, and the remaining lines are joined with a single space. The parts
    are then re-joined with a blank line followed by ``[SENSE_SEPARATOR] ``.

    Markup already present in the text (such as ``<tr>...</tr>``) is left
    untouched; this function does not add any tags itself.

    :param text: raw text containing ``[SENSE_SEPARATOR]`` markers.
    :return: the cleaned text.
    """
    separator = "[SENSE_SEPARATOR]"
    prettified_parts = [
        ' '.join(line for line in map(str.strip, part.split('\n')) if line)
        for part in text.split(separator)
    ]
    return f"\n\n{separator} ".join(prettified_parts)
90
+
91
+
92
def full_dictionary():
    """
    Build a single lemma-keyed dictionary from the LSJ XML files eng1 to eng27.
    """
    combined = {}
    for number in range(1, 28):  # eng1 to eng27
        path = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{number}.xml"
        for lemma, info in read_xml(path).items():
            # A lemma seen in an earlier file keeps its record; the fields of
            # the later occurrence are written over it.
            if lemma not in combined:
                combined[lemma] = {}
            combined[lemma].update(info)

    return combined
105
+
106
+
107
def main(output_path="lsj_dict.json"):
    """
    Build the merged LSJ dictionary and store it as a pretty-printed JSON file.

    The merge itself is delegated to ``full_dictionary()`` so the file list and
    merge rule live in one place.

    :param output_path: path of the JSON file to write; defaults to
        ``lsj_dict.json``.
    """
    merged_info = full_dictionary()

    # Keep the Greek text readable in the file: write UTF-8 and do not escape
    # non-ASCII characters.
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(merged_info, file, ensure_ascii=False, indent=4)
137
+
138
 
 
 
 
139
 
140
 
141