Spaces:

bugroup
/

GazeGenie

Running

File size: 34,806 Bytes

da572bf

"""Mostly adapted from https://github.com/martin-vasilev/EMreading
Moslty deprecated in favour of alternative methods."""

from icecream import ic
from io import StringIO
import re
import numpy as np
import pandas as pd


def assign_chars_to_words(df):
    df.reset_index(inplace=True, names="index_temp")
    df["wordID"] = ""
    df["char_word"] = -1
    word_list = []
    cols = []
    sent_list = df["sent"].unique()

    for i in range(len(sent_list)):  # for each sentence
        word_list = df[df["sent"] == i]["word"].unique()
        for j in range(len(word_list)):
            cols = df[(df["sent"] == i) & (df["word"] == word_list[j])].index
            df.loc[cols, "wordID"] = "".join(df["char"].loc[cols])
            df.loc[(df["sent"] == i) & (df["word"] == word_list[j]), "char_word"] = [k for k in range(len(cols))]
    df.set_index("index_temp", inplace=True)
    return df


def round_and_int(value):
    if not pd.isna(value):
        return int(round(value))
    else:
        return None


def get_coord_map(coords, x=1920, y=1080):
    """
    Original R version:
    ```R
    # Use stimuli information to create a coordinate map_arr for each pixel on the screen
    # This makes it possible to find exactly what participants were fixating
    coord_map_arr<- function(coords, x=resolution_x, y= resolution_y){

    coords$id<- 1:nrow(coords)
    map_arr<- data.frame(matrix(NA, nrow = y, ncol = x))

    for(i in 1:nrow(coords)){
        map_arr[coords$y1[i]:coords$y2[i],coords$x1[i]:coords$x2[i]]<- coords$id[i]

    }

    return(map_arr)
    }```
    """
    coords.reset_index(drop=True, inplace=True)
    y1 = coords["char_ymin"].map(round_and_int)
    y2 = coords["char_ymax"].map(round_and_int)
    x1 = coords["char_xmin"].map(round_and_int)
    x2 = coords["char_xmax"].map(round_and_int)
    coords["id"] = np.arange(len(coords))
    map_arr = np.full((y, x), np.nan)

    for i in range(len(coords)):
        map_arr[y1[i] : y2[i] + 1, x1[i] : x2[i] + 1] = coords["id"].iloc[i]

    np.sum(pd.isna(map_arr), axis=None)
    return map_arr


def get_char_num_for_each_line(df):
    df.reset_index(inplace=True, names="index_temp")
    df["line_char"] = np.nan
    unq_line = df["assigned_line"].unique()
    for i in unq_line:
        assigned_line = df[df["assigned_line"] == i].index
        df.loc[assigned_line, "line_char"] = range(len(assigned_line))
    df.set_index("index_temp", inplace=True)
    return df


def parse_fix(
    file,
    trial_db,
):

    indexrange = list(range(trial_db["trial_start_idx"], trial_db["trial_end_idx"] + 1))

    sfix_stamps = [i for i in indexrange if re.search(r"(?i)(SFIX)", file[i])]

    efix_stamps = [i for i in indexrange if re.search(r"(?i)EFIX", file[i])]

    if len(sfix_stamps) > (len(efix_stamps) + 1):
        ic(f"length mismatch parse_fix of {len(sfix_stamps) - (len(efix_stamps))}")

    if not sfix_stamps or not efix_stamps:
        raw_fix = None
        return raw_fix
    for safe_num in range(25):
        if efix_stamps[0] < sfix_stamps[0]:
            efix_stamps = efix_stamps[1:]
        elif efix_stamps[-1] <= sfix_stamps[-1]:
            sfix_stamps = sfix_stamps[:-1]
        elif efix_stamps[0] >= sfix_stamps[0]:
            sfix_stamps = sfix_stamps[1:]
        if not (len(efix_stamps) != len(sfix_stamps) and len(efix_stamps) > 1 and len(sfix_stamps) > 1):
            break

    def parse_sacc(string):
        a = string.split("	")
        return float(a[2])

    esacc_flag = [file[f - 1] if "ESACC" in file[f - 1] else None for f in sfix_stamps]
    saccDur = []
    for k in esacc_flag:
        if k is None:
            saccDur.append(None)
        else:
            saccDur.append(parse_sacc(k))

    s_time = [int(file[s].strip().split(" ")[-1]) for s in sfix_stamps]

    e_time = [int(file[s - 1].strip().split(" ")[0]) for s in efix_stamps]
    if len(s_time) != len(e_time):
        if s_time[-1] > e_time[-1]:
            s_time = s_time[:-1]

    fixDur = [e_time[index] - s_time[index] for index in range(len(s_time))]
    fixDur = [e - s for e, s in zip(e_time, s_time)]
    assert ~(np.asarray(fixDur) < 0).any()
    x = [float(file[fidx].split("\t")[3]) for fidx in efix_stamps]
    y = [float(file[fidx].split("\t")[4]) for fidx in efix_stamps]
    blink_stamp = [index for index in indexrange if "EBLINK" in file[index]]
    blink_time = [float(file[index].strip().replace("\t", " ").split(" ")[2]) - 1 for index in blink_stamp]
    index = np.searchsorted(s_time, blink_time, side="right") - 1
    blink = np.zeros((len(s_time)))
    blink[index] = -1
    raw_fix = pd.DataFrame(
        {"s_time": s_time, "e_time": e_time, "fixDur": fixDur, "saccDur": saccDur, "x": x, "y": y, "blink": blink}
    )
    return raw_fix


def process_fix_EM(fix, coords_map, coords, SL):
    resolution_y, resolution_x = coords_map.shape
    loc = None
    raw_fix = pd.DataFrame()
    num_fixations = len(fix)
    SFIX = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    EFIX = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    x = np.full(num_fixations, np.nan)
    y = np.full(num_fixations, np.nan)
    fix_num = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    fix_dur = np.full(num_fixations, None)
    sent = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    line = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    word = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    char_trial = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    char_line = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    word_line = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    max_sent = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    max_word = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    regress = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    blink = pd.array([None] * num_fixations, dtype=pd.BooleanDtype())
    outOfBnds = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    outsideText = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    wordID = np.full(num_fixations, None)
    land_pos = pd.array([None] * num_fixations, dtype=pd.Int64Dtype())
    sacc_len = np.full(num_fixations, np.nan)

    max_sentence = coords["in_sentence_number"].max()

    curr_sent = np.zeros((max_sentence + 1, 2))
    curr_sent[: max_sentence + 1, 0] = np.arange(0, max_sentence + 1)

    if isinstance(coords["index"], str):
        coords["index"] = pd.to_numeric(coords["index"], errors="coerce")

    for j in range(len(fix)):
        if (fix["y"][j] > 0) and (fix["x"][j] > 0) and (fix["y"][j] <= resolution_y) and (fix["x"][j] <= resolution_x):
            loc = coords_map[round(fix["y"][j]), round(fix["x"][j])]
            if pd.isnull(loc):
                loc = None
        else:
            loc = None

        fix_num[j] = j
        fix_dur[j] = fix["duration"][j]
        SFIX[j] = fix["start_uncorrected"][j]
        EFIX[j] = fix["stop_uncorrected"][j]
        x[j] = fix["x"][j]
        y[j] = fix["y"][j]
        blink[j] = fix["blink"][j]

        if x[j] < 1 or x[j] > resolution_x or y[j] < 1 or y[j] > resolution_y:
            outOfBnds[j] = 1
        else:
            outOfBnds[j] = 0
            outsideText[j] = 1 if loc is None else 0

        if fix["x"][j] < 0:
            loc = None
            outOfBnds[j] = 1
            outsideText[j] = 1

        if loc is not None:
            sent[j] = coords["in_sentence_number"][loc]
            line[j] = coords["assigned_line"][loc]
            word[j] = coords["in_word_number"][loc]
            word_line[j] = coords["wordline"][loc]
            char_trial[j] = coords["index"][loc] + 1
            char_line[j] = coords["letline"][loc]
            wordID[j] = coords["in_word"][loc]
            land_pos[j] = coords["letword"][loc]

            if j > 0 and not pd.isna(char_trial[j]) and not pd.isna(char_trial[j - 1]):
                sacc_len[j] = abs(char_trial[j] - char_trial[j - 1])
            else:
                sacc_len[j] = np.nan
        else:
            sent[j] = np.nan
            line[j] = np.nan
            word[j] = np.nan
            word_line[j] = np.nan
            char_trial[j] = np.nan
            char_line[j] = np.nan
            wordID[j] = np.nan
            land_pos[j] = np.nan
            sacc_len[j] = np.nan

        if SL:
            if loc is not None:
                if j == 0:
                    max_sent[j] = sent[j]
                else:
                    max_sent[j] = max_sent[j - 1] if pd.isna(sent[j]) or pd.isna(max_sent[j - 1]) else max_sent[j - 1]
                    if not (pd.isna(max_sent[j]) or pd.isna(sent[j])) and sent[j] > max_sent[j]:
                        max_sent[j] = sent[j]

                if j == 0:
                    max_word[j] = abs(word[j])
                    curr_sent[sent[j] - 1, 1] = abs(word[j])
                else:
                    max_word[j] = (
                        curr_sent[sent[j] - 1, 1]
                        if pd.isna(word[j]) or pd.isna(curr_sent[sent[j] - 1, 1])
                        else curr_sent[sent[j] - 1, 1]
                    )
                    if not (pd.isna(word[j]) or pd.isna(max_word[j])) and abs(word[j]) > curr_sent[sent[j] - 1, 1]:
                        max_word[j] = abs(word[j])
                        curr_sent[sent[j] - 1, 1] = abs(word[j])

                if not (pd.isna(word[j]) or pd.isna(max_word[j])) and abs(word[j]) < max_word[j]:
                    regress[j] = 1
                else:
                    regress[j] = 0

                if j > 0 and not pd.isna(word[j]):
                    if pd.isna(regress[j - 1]):
                        regress[j] = np.nan
                    else:
                        if abs(word[j]) == max_word[j] and regress[j - 1] == 1 and word[j] in np.unique(word[:j]):
                            regress[j] = 1

    raw_fix = pd.DataFrame(
        {
            "start_uncorrected": SFIX,
            "stop_uncorrected": EFIX,
            "x": x,
            "y": y,
            "fixation_number": fix_num,
            "on_sentence_number_EM": sent,
            "line_EM": line,
            "word_EM": word,
            "word_line_EM": word_line,
            "char_trial_EM": char_trial,
            "char_line_EM": char_line,
            "regress_EM": regress,
            "wordID_EM": wordID,
            "land_pos_EM": land_pos,
            "sacc_len_EM": sacc_len,
            "blink_EM": blink,
            "outOfBnds_EM": outOfBnds,
            "outsideText_EM": outsideText,
        }
    )

    fix2 = fix.merge(
        raw_fix,
        on=[
            "start_uncorrected",
            "stop_uncorrected",
            "x",
            "y",
            "fixation_number",
        ],
        how="left",
    )
    return fix2


def RS(i, rawfix, coords, reqYthresh, reqXthresh, Ythresh, Xthresh, threshSimilar):

    if i == 0:
        return 0

    lw = coords["char_xmax"][0] - coords["char_xmin"][0]
    lh = coords["char_ymax"][0] - coords["char_ymin"][0]
    meetXthresh = False
    meetYthresh = False

    leftSacc = rawfix["x"][i] < rawfix["x"][i - 1]
    downSacc = rawfix["y"][i] > rawfix["y"][i - 1]

    if downSacc & reqYthresh:
        Ydiff = lh * Ythresh
        trueYdiff = rawfix["y"][i] - rawfix["y"][i - 1]
        meetYthresh = trueYdiff >= Ydiff

    if leftSacc & reqXthresh:
        Xdiff = lw * Xthresh
        trueXdiff = rawfix["x"][i - 1] - rawfix["x"][i]
        meetXthresh = trueXdiff >= Xdiff

    maxPoints = 1 + 2
    if reqYthresh:
        maxPoints += 1
    if reqXthresh:
        maxPoints += 1

    currPoints = 0
    if leftSacc:
        currPoints = currPoints + (1 / maxPoints)
        if meetXthresh:
            currPoints = currPoints + (1 / maxPoints)

    if downSacc:
        currPoints = currPoints + 2 * (1 / maxPoints)
        if meetYthresh:
            currPoints = currPoints + (1 / maxPoints)

    return round(currPoints, 2)


def reMap(rawfix, i, coords_map, coords, newY=None):
    rawfix.set_index("fixation_number", inplace=True)
    assert i in rawfix.index, "Not in index"
    rawfix.loc[i, "reAligned"] = True
    rawfix.loc[i, "previous_line"] = rawfix.loc[i, "line_EM"]
    rawfix.loc[i, "previous_y"] = rawfix.loc[i, "y"]
    if newY != None:
        rawfix.loc[i, "y"] = newY
    loc = coords_map[round(rawfix.loc[i, "y"]), round(rawfix.loc[i, "x"])]
    if pd.isnull(loc):
        return rawfix
    rawfix.loc[i, "on_sentence_number_EM"] = coords["in_sentence_number"][loc]
    rawfix.loc[i, "word_EM"] = coords["in_word_number"][loc]
    rawfix.loc[i, "line_EM"] = coords["assigned_line"][loc]

    return rawfix.reset_index(drop=False, names=["fixation_number"])


def reAlign(rawfix, coords, coords_map, RSpar):

    ystart = coords["char_ymin"].min()
    yend = coords["char_ymax"].max()
    nlines = coords["assigned_line"].max()
    letterHeight = coords["char_ymax"][0] - coords["char_ymin"][0]
    xstart = pd.DataFrame(columns=["1", "2"])
    xstart["1"] = np.arange(nlines + 1)
    ystart = pd.DataFrame(columns=["1", "2"])
    ystart["1"] = np.arange(nlines + 1)
    xend = pd.DataFrame(columns=["1", "2"])
    xend["1"] = np.arange(nlines + 1)
    yend = pd.DataFrame(columns=["1", "2"])
    yend["1"] = np.arange(nlines + 1)
    rawfix["previous_x"] = np.nan

    for i in coords["assigned_line"].unique():
        a = coords[coords["assigned_line"] == i]
        xstart.loc[i, "2"] = a["char_xmin"].min()
        xend.loc[i, "2"] = a["char_xmax"].max()
        ystart.loc[i, "2"] = a["char_ymin"].min()
        yend.loc[i, "2"] = a["char_ymax"].min()

    lineCenter = ystart["2"] + letterHeight / 2

    rawfix["prob_return_sweep"] = np.nan
    rawfix["prob_interline_saccade"] = np.nan
    rawfix["reAligned"] = False
    rawfix["previous_y"] = np.nan
    rawfix["previous_line"] = np.nan

    for i in range(rawfix.shape[0]):
        rawfix.loc[i, "prob_return_sweep"] = RS(
            i,
            rawfix,
            coords,
            reqYthresh=True,
            reqXthresh=True,
            Ythresh=RSpar[0],
            Xthresh=RSpar[1],
            threshSimilar=RSpar[2],
        )

        if i > 0:
            if (rawfix["prob_return_sweep"][i] < 1) & (rawfix["y"][i] > rawfix["y"][i - 1] + letterHeight / 2):
                rawfix.loc[i, "prob_return_sweep"] = 1

        rawfix.loc[i, "previous_x"] = rawfix["x"][i]
        rawfix.loc[i, "previous_y"] = rawfix["y"][i]

        if i > 0:
            if rawfix["y"][i] < rawfix["y"][i - 1] - letterHeight / 2:
                rawfix.loc[i, "prob_interline_saccade"] = 1
            else:
                rawfix.loc[i, "prob_interline_saccade"] = 0

    RsweepFix = np.sort(
        np.concatenate(
            (np.where(rawfix["prob_return_sweep"] == 1)[0], np.where(rawfix["prob_interline_saccade"] == 1)[0])
        )
    )

    for i in range(len(RsweepFix)):
        if i == 0:
            linePass = rawfix.loc[: RsweepFix[0] - 1]

        elif i >= len(RsweepFix):
            linePass = rawfix.loc[RsweepFix[-1] :]

        else:
            linePass = rawfix.loc[RsweepFix[i - 1] : RsweepFix[i] - 1]

        if linePass.shape[0] == 1:
            continue

        avgYpos = linePass["y"].mean(skipna=True)
        whichLine = min(range(len(lineCenter)), key=lambda index: abs(lineCenter[index] - avgYpos))
        linePass.reset_index(inplace=True, drop=True)
        for j in range(linePass.shape[0]):
            onLine = (linePass["y"][j] >= ystart["2"][whichLine]) & (linePass["y"][j] <= yend["2"][whichLine])

            if not onLine:
                if linePass["y"][j] < ystart["2"][whichLine]:
                    rawfix = reMap(
                        rawfix, linePass.loc[j, "fixation_number"], coords_map, coords, newY=ystart["2"][whichLine] + 5
                    )
                else:
                    rawfix = reMap(
                        rawfix, linePass.loc[j, "fixation_number"], coords_map, coords, newY=yend["2"][whichLine] - 5
                    )
                rawfix.loc[linePass.loc[j, "fixation_number"], "reAligned"] = True
            else:
                rawfix.loc[linePass.loc[j, "fixation_number"], "reAligned"] = False

    return rawfix


def cleanData(
    raw_fix,
    algo_choice,
    removeBlinks=True,
    combineNearbySmallFix=True,
    combineMethod="char",
    combineDist=1,
    removeSmallFix=True,
    smallFixCutoff=80,
    remove_duration_outliers=True,
    outlierMethod="ms",
    outlierCutoff=800,
    keepRS=False,
):

    if combineNearbySmallFix:
        nbefore = raw_fix.shape[0]
        which_comb = []

        for i, _ in enumerate(raw_fix):
            prev_line_same = False
            next_line_same = False

            if (i > 0) and (i < nbefore - 1):
                if combineMethod == "char":
                    if (
                        pd.isna(raw_fix[f"letternum_{algo_choice}"][i])
                        or pd.isna(raw_fix[f"letternum_{algo_choice}"][i - 1])
                        or pd.isna(raw_fix[f"letternum_{algo_choice}"][i + 1])
                    ):
                        continue

                if raw_fix["duration"][i] < smallFixCutoff:
                    if (
                        not pd.isna(raw_fix[f"line_num_{algo_choice}"][i])
                        and not pd.isna(raw_fix[f"line_num_{algo_choice}"][i - 1])
                        and not pd.isna(raw_fix[f"line_num_{algo_choice}"][i + 1])
                    ):

                        if raw_fix[f"line_num_{algo_choice}"][i] == raw_fix[f"line_num_{algo_choice}"][i - 1]:
                            prev_line_same = True
                    if raw_fix[f"line_num_{algo_choice}"][i] == raw_fix[f"line_num_{algo_choice}"][i + 1]:
                        next_line_same = True

                    if combineMethod == "char":
                        prev = abs(raw_fix[f"letternum_{algo_choice}"][i] - raw_fix[f"letternum_{algo_choice}"][i - 1])
                        after = abs(raw_fix[f"letternum_{algo_choice}"][i] - raw_fix[f"letternum_{algo_choice}"][i + 1])

                    else:
                        prev = abs(round(raw_fix["x"][i]) - round(raw_fix["x"][i - 1]))
                        after = abs(round(raw_fix["x"][i]) - round(raw_fix["x"][i + 1]))

                    if prev <= combineDist:
                        which_comb.append(i)

                        if prev_line_same:

                            raw_fix["duration"][i - 1] += raw_fix["duration"][i]

                        if keepRS and (raw_fix["Rtn_sweep"][i] == 1):

                            raw_fix["Rtn_sweep"][i - 1] = 1

                    if after <= combineDist:
                        which_comb.append(i)

                        if next_line_same:

                            raw_fix["duration"][i + 1] += raw_fix["duration"][i]

                        if keepRS and (raw_fix["Rtn_sweep"][i] == 1):

                            raw_fix["Rtn_sweep"][i + 1] = 1

        which_comb = list(set(which_comb))

        if len(which_comb) > 0:
            raw_fix = raw_fix.drop(labels=which_comb, axis=0)
    nstart = raw_fix.shape[0]

    if removeBlinks:
        raw_fix = raw_fix[~raw_fix["blink"]].copy()
    nblink = nstart - raw_fix.shape[0]

    if remove_duration_outliers:
        if outlierMethod == "ms":
            outIndices = np.where(raw_fix["duration"] > outlierCutoff)[0]
            if len(outIndices) > 0:
                raw_fix = raw_fix.drop(outIndices).copy()
        elif outlierMethod == "std":
            nSubCutoff, nOutliers = [], 0
            subM = np.mean(raw_fix["duration"])
            subSTD = np.std(raw_fix["duration"])
            cutoff = subM + outlierCutoff * subSTD
            nSubCutoff.append((len(np.where(raw_fix[raw_fix["duration"] > cutoff])[0])))
            nOutliers = sum(nSubCutoff)

    return raw_fix.reset_index(drop=True)


def get_space(s):
    if len(s) == 0 or s == " ":
        return 1
    else:
        return None


def get_num(string):
    strr = "".join([i for i in string if i.isdigit()])
    if len(strr) > 0:
        return int(strr)
    else:
        ic(string)
        return strr


def parse_itemID(trialid):
    I = re.search(r"I", trialid).start()
    condition = get_num(trialid[:I])

    D = re.search(r"D", trialid).start()
    item = get_num(trialid[I + 1 : D])
    depend = get_num(trialid[D:])

    E = trialid[0]

    return {"trialid": trialid, "condition": condition, "item": item, "depend": depend, "trial_is": E}


def get_coord(str_input):
    string = "\n".join(
        [l.split("\t")[1].strip() for l in str_input if (("DELAY" not in l) & ("BUTTON" not in l) & ("REGION" in l))]
    )

    df = pd.read_table(
        StringIO(string),
        sep=" ",
        names=["X" + str(i) for i in range(1, 12)],
    )
    df.loc[:, ["char_xmin", "char_ymin", "char_xmax", "char_ymax", "X11"]] = df[
        ["char_xmin", "char_ymin", "char_xmax", "char_ymax", "X11"]
    ].apply(pd.to_numeric, errors="coerce")
    df.char = df.char.fillna("")

    a = df[df["char"] == ""].index
    for i in a:
        if "space" not in df.columns:
            df.loc[:, "space"] = None
        df.at[i, "space"] = 1

        if "char_xmin" in df.columns and "char_ymin" in df.columns:
            df.at[i, "char_xmin"], df.at[i, "char_ymin"] = df.at[i, "char_ymin"], df.at[i, "char_xmax"]

        if "char_ymin" in df.columns and "char_xmax" in df.columns:
            df.at[i, "char_ymin"], df.at[i, "char_xmax"] = df.at[i, "char_xmax"], df.at[i, "char_ymax"]

        if "char_xmax" in df.columns and "char_ymax" in df.columns:
            df.at[i, "char_xmax"], df.at[i, "char_ymax"] = df.at[i, "char_ymax"], df.at[i, "X11"]
    df = df.drop(columns=["X1", "X2", "X3", "X5"])
    return df


def map_sent(df):

    sent_bnd = df[(df.char == ".") | (df.char == "?") | (df.char == "!")].index.tolist()

    if len(sent_bnd) > 0:
        sent = pd.Series([-1] * len(df))

        for i, eidx in enumerate(sent_bnd):
            sidx = sent_bnd[i - 1] if i > 0 else 0
            if i == len(sent_bnd) - 1:
                sent.loc[sidx:] = len(sent_bnd) - 1
            else:
                sent.loc[sidx:eidx] = i
        df["sent"] = sent
    else:
        df["sent"] = 1
    return df


def map_line(df):
    df = df[~pd.isnull(df["char_ymin"])].reset_index(names="index_temp")

    lines = sorted(set(df["char_ymin"].values))

    assigned_line = np.array([], dtype=int)

    for i in range(len(lines)):
        loc_lines = np.where(df["char_ymin"].values == lines[i])[0]
        assigned_line = np.concatenate((assigned_line, np.full(len(loc_lines), fill_value=i)))
        df.loc[len(assigned_line) - 1, "space"] = 2

    df["assigned_line"] = assigned_line
    df.set_index("index_temp", inplace=True)

    return df


def map_words(df):
    curr_sent, curr_line, curr_word = 0, 0, 0
    df["space"] == 2

    for i in df.index:
        newSent = curr_sent != df.loc[i, "sent"]
        newLine = curr_line != df.loc[i, "assigned_line"]

        df.loc[i, "word"] = curr_word
        if df.loc[i, "char"] == "" and not newSent:
            curr_word += 1
            df.loc[i, "word"] = curr_word

        elif newLine:
            if df.loc[i, "char"] != ".":
                curr_word += 1
            df.loc[i, "word"] = curr_word
            curr_line += 1

        elif newSent:
            curr_sent += 1
            curr_word = 0
            df.loc[i, "word"] = curr_word

    return df


def get_return_sweeps(raw_fix_new, coords, algo_choice):  # TODO Check if covered by popEye
    currentSent = 0
    currentLine = 0
    maxLine = 0
    inReg = False

    curr_sent = np.zeros((max(coords["in_sentence_number"]) + 1, 4))
    curr_sent[:, 0] = np.arange(0, max(coords["in_sentence_number"]) + 1)

    diff_sent = coords["in_sentence_number"].diff().fillna(0)
    last_words = coords.loc[np.where(diff_sent == 1), "in_word_number"]
    curr_sent[:, 2] = np.append(last_words.values, coords["in_word_number"].iloc[-1])
    for m in range(1, len(raw_fix_new)):
        if not (pd.isna(raw_fix_new["char_line_EM"][m - 1]) or pd.isna(raw_fix_new["char_line_EM"][m])):
            raw_fix_new.at[m, "sacc_len_EM"] = abs(raw_fix_new["char_line_EM"][m] - raw_fix_new["char_line_EM"][m - 1])

        if not pd.isna(raw_fix_new["line_EM"][m]):
            currentLine = raw_fix_new["line_EM"][m]

        if currentLine > maxLine:
            maxLine = currentLine
            raw_fix_new.at[m, "Rtn_sweep"] = 1

            if m < len(raw_fix_new) - 1:
                sameLine = (
                    not (pd.isna(raw_fix_new["line_EM"][m + 1]) or pd.isna(raw_fix_new["line_EM"][m]))
                    and raw_fix_new["line_EM"][m + 1] == raw_fix_new["line_EM"][m]
                )

                if raw_fix_new["x"][m + 1] < raw_fix_new["x"][m]:
                    raw_fix_new.at[m, "Rtn_sweep_type"] = "undersweep" if sameLine else None
                else:
                    raw_fix_new.at[m, "Rtn_sweep_type"] = "accurate" if sameLine else None
            else:
                raw_fix_new.at[m, "Rtn_sweep_type"] = np.nan
        else:
            raw_fix_new.at[m, "Rtn_sweep"] = 0

        if not pd.isna(raw_fix_new["on_sentence_number_EM"][m]):
            if m == 1:
                curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2] = raw_fix_new["word_EM"][m]
                raw_fix_new.at[m, "regress_EM"] = 0
            else:
                if raw_fix_new["word_EM"][m] > curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2]:
                    curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2] = raw_fix_new["word_EM"][m]
                    inReg = False

                if currentSent < raw_fix_new["on_sentence_number_EM"][m]:
                    curr_sent[currentSent, 3] = 1
                    currentSent = raw_fix_new["on_sentence_number_EM"][m]

                if (
                    not pd.isna(raw_fix_new["on_sentence_number_EM"][m - 1])
                    and raw_fix_new["on_sentence_number_EM"][m] > raw_fix_new["on_sentence_number_EM"][m - 1]
                ):
                    curr_sent[int(raw_fix_new["on_sentence_number_EM"][m - 1]), 3] = 1

                if (
                    raw_fix_new["word_EM"][m] < curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2]
                    and curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 3] == 0
                ):
                    raw_fix_new.at[m, "regress_EM"] = 1
                    inReg = True
                else:
                    if curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 3] == 0:
                        raw_fix_new.at[m, "regress_EM"] = 0

                        if (
                            raw_fix_new["word_EM"][m] == curr_sent[int(raw_fix_new["on_sentence_number_EM"][m]), 2]
                            and inReg
                        ):
                            raw_fix_new.at[m, "regress_EM"] = 1
                    else:
                        raw_fix_new.at[m, "regress_EM"] = 1
                        raw_fix_new.at[m, "regress2nd_EM"] = 1
                        inReg = True
    return raw_fix_new


def word_m_EM(n2):
    sub_list = []
    item_list = []
    cond_list = []
    seq_list = []
    word_list = []
    wordID_list = []
    sent_list = []
    FFD_list = []
    SFD_list = []
    GD_list = []
    TVT_list = []
    nfix1_list = []
    nfix2_list = []
    nfixAll_list = []
    regress_list = []
    o = n2["sent"].unique()
    for k in range(len(o)):
        q = n2[n2["sent"] == o[k]]
        r = sorted(q["word"].unique())

        for l in range(len(r)):
            word_list.append(r[l])
            sub_list.append(n2["sub"].iloc[0])
            item_list.append(n2["item"].iloc[0])
            seq_list.append(n2["seq"].iloc[0])
            cond_list.append(n2["cond"].iloc[0])
            sent_list.append(o[k])

            p = q[q["word"] == r[l]]

            if p.shape[0] == 0:
                FFD_list.append(None)
                SFD_list.append(None)
                GD_list.append(None)
                TVT_list.append(None)
                nfix1_list.append(0)
                nfix2_list.append(0)
                nfixAll_list.append(0)
            else:
                p1 = p[p["regress"] == 0]
                p2 = p[p["regress"] == 1]

                if p1.shape[0] == 0:
                    FFD_list.append(None)
                    SFD_list.append(None)
                    GD_list.append(None)
                elif p1.shape[0] == 1:
                    FFD_list.append(p1["fix_dur"].iloc[0])
                    SFD_list.append(p1["fix_dur"].iloc[0])
                    GD_list.append(p1["fix_dur"].iloc[0])
                else:
                    FFD_list.append(p1["fix_dur"].iloc[0])
                    SFD_list.append(None)
                    GD_list.append(p1["fix_dur"].sum())

                TVT_list.append(p["fix_dur"].sum())
                nfix1_list.append(p1.shape[0])
                nfix2_list.append(p2.shape[0])
                nfixAll_list.append(p1.shape[0] + p2.shape[0])

                wordID_list.append(p["wordID"].iloc[0])

                if nfix2_list[-1] == 0:
                    regress_list.append(0)
                else:
                    regress_list.append(1)

        dataT = pd.DataFrame(
            {
                "sub": sub_list,
                "item": item_list,
                "cond": cond_list,
                "seq": seq_list,
                "word": word_list,
                "wordID": wordID_list,
                "sent": sent_list,
                "FFD": FFD_list,
                "SFD": SFD_list,
                "GD": GD_list,
                "TVT": TVT_list,
                "nfix1": nfix1_list,
                "nfix2": nfix2_list,
                "nfixAll": nfixAll_list,
                "regress": regress_list,
            }
        )

        sub_list = []
        item_list = []
        cond_list = []
        seq_list = []
        word_list = []
        wordID_list = []
        sent_list = []
        FFD_list = []
        SFD_list = []
        GD_list = []
        TVT_list = []
        nfix1_list = []
        nfix2_list = []
        nfixAll_list = []
        regress_list = []

        if "dataN" in locals():
            dataN = pd.concat([dataN, dataT], ignore_index=True)
        else:
            dataN = dataT


def word_measures_EM(data, algo_choice, include_time_stamps=False):
    add_blanks = False

    if "blink" in data.columns:
        required_columns = ["blink", "prev_blink", "after_blink"]
        if all(col in data.columns for col in required_columns):
            if (data["blink"] + data["prev_blink"] + data["after_blink"]).sum() == 0:
                ic("Blinks appear to be already excluded! \n\n")
            else:
                add_blanks = True
                ic("There appears to be valid blink data! We will map blinks to individual words. \n\n")

                regress_blinks = data[data["blink"] == 1 & ~data["regress_EM"].isna()].index

                if len(regress_blinks) < 1:
                    BlinkFixTypeNotMapped = True
                    ic(
                        "Fixation type is not mapped for observations with blinks. Therefore, blinks can't be mapped in terms of 1st and 2nd pass reading."
                    )
                    ic(
                        "Please note that, by default, blink fixation durations will also not be added to fixation duration measures for that word since it's assumed you will delete this word from analysis.\n"
                    )
                    ic("If you need to change this, see settings in the pre-processing function.\n\n")

    data_n = pd.DataFrame()

    o_k = sorted(np.unique(data[f"on_sentence_num_{algo_choice}"]))

    for k, sent_k in enumerate(o_k):
        q_k = data[data[f"on_sentence_num_{algo_choice}"] == sent_k]

        p1_k = q_k[q_k["regress_EM"] == 0].copy()
        p2_k = q_k[q_k["regress_EM"] == 1].copy()

        RS_word = np.nan
        check_next = False

        if max(data[f"line_num_{algo_choice}"]) > 1:
            for z, q_row in q_k.iterrows():
                if not pd.isna(q_row["Rtn_sweep"]):
                    if q_row["Rtn_sweep"] == 1:
                        check_next = True
                        RS_Word = (
                            q_row[f"line_word_{algo_choice}"]
                            if not pd.isna(q_row[f"line_word_{algo_choice}"])
                            else np.nan
                        )
                    elif check_next and (pd.notna(q_row[f"line_word_{algo_choice}"])) and (q_row["regress_EM"]):
                        break

        word_l = []
        sub_l = [data.loc[0, "subject"]] * len(q_k)
        item_l = [data.loc[0, "item"]] * len(q_k)
        cond_l = [1] * len(q_k)

        for l, q_row in q_k.iterrows():
            word_l.append(q_row[f"on_word_number_{algo_choice}"])

            if add_blanks:
                sum_1st_pass = (
                    q_row["blink"]
                    + p1_k[p1_k.index[q_row.name]]["prev_blink"]
                    + p2_k[p2_k.index[q_row.name]]["after_blink"]
                ).sum()

                blinks_l = [0] * len(word_l)
                if sum_1st_pass > 0:
                    blinks_l[l] = 1

        for l, q_row in q_k.iterrows():
            word_line_l = [q_row[f"line_word_{algo_choice}"]]

            line_l = [q_row[f"line_num_{algo_choice}"]]

            if include_time_stamps:
                EFIX_SFD_l = [np.nan]

            for l, q_row in q_k.iterrows():
                word_line_l.append(q_row[f"line_word_{algo_choice}"])
                line_l.append(q_row[f"line_num_{algo_choice}"])

        if include_time_stamps:

            if len(p1_k) > 0:

                if len(p1_k) == 1:
                    EFIX_SFD_l.append(p1_k["stop_uncorrected"][0])

        data_t = pd.DataFrame(
            list(
                zip(
                    sub_l,
                    item_l,
                    cond_l,
                    word_l,
                    line_l,
                )
            ),
            columns=[
                "subject",
                "item",
                "condition",
                f"on_word_number_{algo_choice}",
                f"line_num_{algo_choice}",
                "FFD",
                "SFD",
                "GD",
                "TVT",
                "nfix1",
                "nfix2",
                "nfixAll",
                "regress",
            ],
        )

        if add_blanks:
            data_t["blinks_1stPass"] = blinks_l

        data_n = pd.concat([data_n, data_t], ignore_index=True)

    return data_n