import xml.etree.ElementTree as ET
from collections import defaultdict
from autocomplete import load_compressed_word_list
import json


def read_xml(file):
    """
    Read an XML file of the Greek LSJ dictionary 
    and return a dictionary with the words and their definitions.
    """
    tree = ET.parse(file)
    root = tree.getroot()
    
    xml_info = defaultdict(dict)
    
    for entry in root.findall('.//entryFree'):
        entry_info = extract_entry_info(entry)
        
        xml_info[entry_info['lemma']] = entry_info
        
    return xml_info


def extract_entry_info(entry):
    """
    Build the information record for a single LSJ entry element.

    The returned dictionary has three keys:
      - 'lemma': the value of the entry's 'key' attribute (None if absent)
      - 'orthographies': the `.text` of every direct 'orth' child
      - 'definitions': a dict with
          'tr'   -> all text inside the entry, joined with spaces and stripped
          'text' -> the sense-separated text produced by
                    `get_descendants_text` and cleaned by `prettify_text`
    """
    lemma = entry.get('key')

    # Spellings listed directly under the entry
    orthographies = []
    for orth in entry.findall('orth'):
        orthographies.append(orth.text)

    # Flat dump of every text node in the entry
    flat_text = ' '.join(entry.itertext()).strip()

    # Sense-aware rendering of the same entry
    sense_text = prettify_text(get_descendants_text(entry))

    return {
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': {'tr': flat_text, 'text': sense_text},
    }


def get_descendants_text(element):
    """
    Get all the text of the descendants of a given element, separating every 'sense' element.

    The children of `element` are walked recursively in document order:
      - before each 'sense' child, "[SENSE_SEPARATOR]" and a blank line are emitted;
      - a 'tr' child located directly inside a 'sense' element is emitted as
        "<tr>...</tr>" (stripped text) followed by a newline; only the direct
        text of that 'tr' is used, its own children are not traversed;
      - any other child contributes its text followed by the text of its descendants.
    In every case the child's tail (the text that follows its closing tag) is
    kept, so text placed after a 'tr' inside a 'sense' is not lost.

    The text of `element` itself (before its first child) is not included.

    Returns the collected text as a single string ("" if there are no children).
    """
    pieces = []
    inside_sense = element.tag == 'sense'

    for child in element:
        if child.tag == 'sense':
            # Add a separator before each 'sense' element
            pieces.append("[SENSE_SEPARATOR]\n\n")

        if inside_sense and child.tag == 'tr':
            # Wrap the text of 'tr' tags within 'sense' tags in <tr> markers
            if child.text is not None:
                pieces.append(f"<tr>{child.text.strip()}</tr>\n")
        else:
            pieces.append(child.text or "")
            pieces.append(get_descendants_text(child))

        # The tail belongs to the parent's running text, so it is appended
        # for every child, including 'tr' elements handled above
        pieces.append(child.tail or "")

    return "".join(pieces)


def prettify_text(text):
    """
    Normalise the whitespace of sense-separated definition text.

    The input is cut at every "[SENSE_SEPARATOR]" marker. Within each piece,
    every line is stripped, empty lines are dropped, and the remaining lines
    are joined with a single space. The pieces are then glued back together
    with a blank line and the "[SENSE_SEPARATOR] " marker between them.
    """
    def _collapse_lines(chunk):
        # Strip each line and keep only the ones that still have content
        stripped_lines = (raw_line.strip() for raw_line in chunk.split('\n'))
        return ' '.join(kept for kept in stripped_lines if kept)

    sense_chunks = text.split("[SENSE_SEPARATOR]")
    return "\n\n[SENSE_SEPARATOR] ".join(map(_collapse_lines, sense_chunks))


def full_dictionary():
    """
    Read all 27 LSJ XML files and merge them into one dictionary.

    The files "LSJ_GreekUnicode/grc.lsj.perseus-eng1.xml" through
    "...eng27.xml" are parsed in order with `read_xml`. Each lemma maps to a
    plain dict holding that entry's information; when a lemma shows up in
    more than one file, the later file's values overwrite the earlier ones.
    """
    # eng1 to eng27
    volume_paths = [
        f"LSJ_GreekUnicode/grc.lsj.perseus-eng{number}.xml"
        for number in range(1, 28)
    ]

    merged_info = {}
    for path in volume_paths:
        for lemma, info in read_xml(path).items():
            # Merge dictionaries, assuming word is unique across all files
            merged_info.setdefault(lemma, {}).update(info)

    return merged_info
    

def main():
    """
    Build the merged LSJ dictionary and store it as "lsj_dict.json".

    The merge of the 27 XML files is delegated to `full_dictionary`; the
    result is written as pretty-printed JSON (4-space indent) in UTF-8,
    keeping non-ASCII characters unescaped.
    """
    merged_info = full_dictionary()

    # Store merged dictionaries as .json file with pretty print
    with open("lsj_dict.json", "w", encoding="utf-8") as out_file:
        json.dump(merged_info, out_file, ensure_ascii=False, indent=4)
        
    
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()