import os
import json
from pathlib import Path
from typing import Any, Dict, List

from tqdm.auto import tqdm

MAX_WORDS = 250


def folder_to_json(folder_in: Path, json_path: Path) -> List[Any]:
    """
    Process JSON lines from files in a given folder and write the processed data to a new JSON file.

    Parameters:
        folder_in (Path): Path to the input folder containing the JSON files to process.
        json_path (Path): Path to the output JSON file where the processed data will be written.

    Returns:
        List[Any]: List containing processed JSON data from all files in the input folder.

    Example:
        folder_to_json(Path("/path/to/input/folder"), Path("/path/to/output.json"))
    """
    folder_in = Path(folder_in)
    json_out = []  # Holds processed JSON data from all files

    # Count the files under the input folder so the progress bar has a total
    total_files = sum(len(files) for _, _, files in os.walk(folder_in))

    # Initialize the progress bar with the total file count, a description, and a unit of progress
    with tqdm(total=total_files, desc='Processing', unit='file') as pbar:
        # Walk every file in the input folder
        for subdir, _, files in os.walk(folder_in):
            # Show the current directory in the progress bar postfix
            pbar.set_postfix_str(f"Directory: {subdir}", refresh=False)
            for file in files:
                # Show the current file and directory in the progress bar postfix
                pbar.set_postfix_str(f"Dir: {subdir} | File: {file}", refresh=True)

                # Build the full path of the current file
                file_path = Path(subdir) / file

                # Read the current file line by line (JSON Lines format)
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        # Parse the JSON record on this line and preprocess it
                        article = json.loads(line)
                        # Ensure the preprocess function is defined and accessible
                        processed_article = preprocess(article)
                        # Add the processed chunks to the output list
                        json_out.extend(processed_article)

                # Advance the progress bar once per processed file
                pbar.update(1)

        # Notify that the writing step is starting
        pbar.write("Writing file!")

        # Write the processed data to the output file as JSON
        with open(json_path, "w", encoding='utf-8') as outfile:
            json.dump(json_out, outfile)

        # Notify that the writing step is complete
        pbar.write("File written!")

    # Return the list of processed data
    return json_out


def preprocess(article: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Preprocess an article dictionary, extracting and chunking its 'text' field.
    Because of the `break` below, only the first chunk is kept.

    Parameters:
        article (Dict[str, Any]): Input dictionary containing an article.
            Expected to have a 'text' field.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, each representing a
        preprocessed chunk of the original article's text. Each dictionary also
        carries the original article's fields (excluding 'text'), plus a
        'chunk_number' field indicating the order of the chunk.

    Example:
        article = {"text": "Example text", "title": "Example Title", "author": "John Doe"}
        processed = preprocess(article)
        print(processed)
    """
    # Copy the article's fields, excluding 'text'
    article_out = {k: v for k, v in article.items() if k != 'text'}

    # Prefix each chunk with the article's title ('عنوان' means 'title').
    # Adjust this line as needed based on the actual structure of 'article'.
    prefix = f'عنوان: {article["title"]}. \n'

    out = []  # Holds the preprocessed chunks

    # Split the article's text into word-bounded chunks using group_arabic_paragraphs
    # (defined below) and build one record per chunk
    for i, chunk in enumerate(group_arabic_paragraphs(article['text'], MAX_WORDS)):
        # Prepend the title prefix to the current chunk
        chunk = prefix + chunk
        # Combine the chunk, the article's other fields (excluding 'text'), and the chunk number
        out.append({'chunk': chunk, **article_out, 'chunk_number': i})
        # Only take the first chunk
        break

    # Return the list of preprocessed chunks
    return out


def group_arabic_paragraphs(arabic_text: str, max_words: int) -> List[str]:
    """
    Group contiguous paragraphs of Arabic text without exceeding the max_words limit per group.

    Parameters:
        arabic_text (str): The input Arabic text, with paragraphs separated by newlines.
        max_words (int): The maximum number of words allowed per group of paragraphs.

    Returns:
        List[str]: A list of strings, each a group of contiguous paragraphs.

    Example:
        arabic_text = "Paragraph1.\nParagraph2.\nParagraph3."
        max_words = 5
        result = group_arabic_paragraphs(arabic_text, max_words)
        print(result)  # Output depends on the word count of each paragraph and max_words.
    """
    # Split the input text into paragraphs on newlines
    paragraphs = arabic_text.split('\n')

    # State for the groups built so far, the group in progress, and its word count
    grouped_paragraphs = []
    current_group = []
    current_word_count = 0

    # Walk through each paragraph in the input text
    for paragraph in paragraphs:
        # Count the words in this paragraph
        word_count = len(paragraph.split())

        if current_word_count + word_count <= max_words:
            # The paragraph fits within the word budget: add it to the current group
            current_group.append(paragraph)
            current_word_count += word_count
        else:
            # The paragraph would exceed the budget: close the current group and start a new one
            if current_group:
                grouped_paragraphs.append('\n'.join(current_group))
            current_group = [paragraph]
            current_word_count = word_count

    # Add the last group if it is not empty
    if current_group:
        grouped_paragraphs.append('\n'.join(current_group))

    # Return the grouped paragraphs as a list of strings
    return grouped_paragraphs


if __name__ == '__main__':
    folder = Path('output')
    file_out = Path('arwiki.json')
    folder_to_json(folder, file_out)
    print('Done!')
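

# --- A minimal sketch (not part of the original pipeline) showing how
# group_arabic_paragraphs packs contiguous paragraphs under a word budget.
# The English sample text and the max_words value are illustrative assumptions.
def _demo_group_arabic_paragraphs() -> None:
    sample = "one two three\nfour five six\nseven eight nine"
    chunks = group_arabic_paragraphs(sample, max_words=7)
    # The first two 3-word paragraphs fit the 7-word budget together; adding the
    # third would overflow it, so it starts a new group.
    assert chunks == ["one two three\nfour five six", "seven eight nine"]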