Spaces:

rti-international
/

rota-app

Sleeping

App Files Files

akgodwin committed on Apr 27, 2023

Commit

3febea9

1 Parent(s): d337f30

reroute to streamlit app

Browse files

Files changed (8) hide show

.vscode/settings.json +0 -3
ABOUT.md +3 -1
README.md +1 -1
app.py +3 -101
cleaning_utils.py +0 -2652
download.py +0 -28
onnx_model_utils.py +0 -195
requirements.txt +1 -11

.vscode/settings.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-    "python.defaultInterpreterPath": "/Users/akgodwin/.pyenv/versions/3.9.11/envs/hf-rota-app/bin/python",
-}

ABOUT.md CHANGED Viewed

@@ -15,4 +15,6 @@ The model was trained on [publicly available data](https://web.archive.org/web/2
 For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
-This model and application were developed by the [RTI International Center for Data Science](https://www.rti.org/centers/rti-center-data-science).

 For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
+This model and application were developed by the [RTI International Center for Data Science](https://www.rti.org/centers/rti-center-data-science).
+### ℹ️ Use

README.md CHANGED Viewed

@@ -27,4 +27,4 @@ The model was trained on [publicly available data](https://web.archive.org/web/2
 For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
-This model and application were developed by the [RTI International Center for Data Science](https://www.rti.org/centers/rti-center-data-science).


27
28	For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
29
30	+ This model and application were developed by the [RTI International Center for Data Science and AI](https://www.rti.org/centers/rti-center-data-science).

app.py CHANGED Viewed

@@ -1,111 +1,13 @@
-from functools import partial
 from pathlib import Path
-from pandas import DataFrame, read_csv, read_excel
 import streamlit as st
-from more_itertools import ichunked
-from stqdm import stqdm
-from onnx_model_utils import predict, predict_bulk, max_pred_bulk, RELEASE_TAG
-from download import download_link
 PRED_BATCH_SIZE = 4
 st.set_page_config(page_title="ROTA", initial_sidebar_state="collapsed")
-st.markdown(Path("ABOUT.md").read_text())
-st.markdown("## ✏️ Single Coder Demo")
-input_text = st.text_input(
-    "Input Offense",
-    value="FRAUDULENT USE OF A CREDIT CARD OR DEBT CARD >= $25,000",
-)
-predictions = predict(input_text)
-st.markdown("Predictions")
-labels = ["Charge Category"]
-st.dataframe(
-    DataFrame(predictions[0])
-    .assign(
-        confidence=lambda d: d["score"].apply(lambda d: round(d * 100, 0)).astype(int)
-    )
-    .drop("score", axis="columns")
-)
-st.markdown("---")
-st.markdown("## 📑 Bulk Coder")
-st.warning(
-    "⚠️ *Note:* Your input data will be deduplicated"
-    " on the selected column to reduce computation requirements."
-    " You will need to re-join the results on your offense text column."
-)
-st.markdown("1️⃣ **Upload File**")
-uploaded_file = st.file_uploader("Bulk Upload", type=["xlsx", "csv"])
-file_readers = {"csv": read_csv, "xlsx": partial(read_excel, engine="openpyxl")}
-if uploaded_file is not None:
-    for filetype, reader in file_readers.items():
-        if uploaded_file.name.endswith(filetype):
-            df = reader(uploaded_file)
-            file_name = uploaded_file.name
-    del uploaded_file
-    st.write("2️⃣ **Select Column of Offense Descriptions**")
-    string_columns = list(df.select_dtypes("object").columns)
-    longest_column = max(
-        [(df[c].str.len().mean(), c) for c in string_columns], key=lambda x: x[0]
-    )[1]
-    selected_column = st.selectbox(
-        "Select Column",
-        options=list(string_columns),
-        index=string_columns.index(longest_column),
-    )
-    original_length = len(df)
-    df_unique = df.drop_duplicates(subset=[selected_column]).copy()
-    del df
-    st.markdown(
-        f"Uploaded Data Sample `(Deduplicated. N Rows = {len(df_unique)}, Original N = {original_length})`"
-    )
-    st.dataframe(df_unique.head(20))
-    st.write(f"3️⃣ **Predict Using Column: `{selected_column}`**")
-    column = df_unique[selected_column].copy()
-    del df_unique
-    if st.button(f"Compute Predictions"):
-        input_texts = (value for _, value in column.items())
-        n_batches = (len(column) // PRED_BATCH_SIZE) + 1
-        bulk_preds = []
-        for batch in stqdm(
-            ichunked(input_texts, PRED_BATCH_SIZE),
-            total=n_batches,
-            desc="Bulk Predict Progress",
-        ):
-            batch_preds = predict_bulk(batch)
-            bulk_preds.extend(batch_preds)
-        pred_df = column.to_frame()
-        max_preds = max_pred_bulk(bulk_preds)
-        pred_df["charge_category_pred"] = [p["label"] for p in max_preds]
-        pred_df["charge_category_pred_confidence"] = [
-            int(round(p["score"] * 100, 0)) for p in max_preds
-        ]
-        del column
-        del bulk_preds
-        del max_preds
-        # TODO: Add all scores
-        st.write("**Sample Output**")
-        st.dataframe(pred_df.head(100))
-        tmp_download_link = download_link(
-            pred_df,
-            f"{file_name}-ncrp-predictions.csv",
-            "⬇️ Download as CSV",
-        )
-        st.markdown(tmp_download_link, unsafe_allow_html=True)

 from pathlib import Path
 import streamlit as st
 PRED_BATCH_SIZE = 4
 st.set_page_config(page_title="ROTA", initial_sidebar_state="collapsed")
+st.markdown(":zap: The ROTA app is available for use at https://rti-rota.streamlit.app/ :zap:")
+st.markdown(Path("ABOUT.md").read_text())
+st.markdown(":zap: To use the ROTA app, go to https://rti-rota.streamlit.app/ :zap:")

cleaning_utils.py DELETED Viewed

@@ -1,2652 +0,0 @@
-import re
-from dataclasses import dataclass
-from string import punctuation
-import pandas as pd
-all_punctuation = punctuation + "‘’·—»"
-# keep dollar signs: drop "$" from the punctuation set
-all_punctuation = all_punctuation.replace("$", "")
-# "regex separator"
-# matches the following: 1+ spaces OR 1+ non-word characters (ex: "/", "-"), OR 1 word boundary
-# apply this variable using an `fr` string in the regex substitution (ex: `fr"\bw{sep}force\b"`)
-sep = "(?: +|\W+|\b)"
-@dataclass
-class RegexRemoval:
-    description: str
-    regex_str: str  # usually raw string: r"your string"
-    def __post_init__(self):
-        self.regex = re.compile(self.regex_str, re.IGNORECASE)
-@dataclass
-class RegexSubstitution:
-    description: str
-    regex_str: str  # usually raw string: r"your string"
-    replacement: str
-    priority: int = 10  # higher values → run later (eg: 1 runs before 20)
-    def __post_init__(self):
-        self.regex = re.compile(self.regex_str, re.IGNORECASE)
-removals = [
-    RegexRemoval("OBSCIS", r"(OBSCIS)"),
-    RegexRemoval(
-        "MO Suffix",
-        r"\b\w\s\w\s\w\w?\s\w\s\d{2}(?: |\W)\d{2}(?: |\W)\d{4}",
-    ),
-    RegexRemoval(
-        "Statute Prefix", r"\S{1,2}\s\d\S{0,3}\.\d\S{0,3}\.\d\S{0,3}(?:\.\d?\S{0,3}?)?"
-    ),
-]
-substitutions = [
-    # LESS THAN / GREATER THAN terms =========
-    RegexSubstitution("Less Than", fr"\b(?:&LT;|lt)\b", " less than "),
-    RegexSubstitution("Less Than 2", fr"\blt(?=\d+)\b", "less than "),
-    RegexSubstitution("Less Than 3", fr"\<", " less than "),
-    RegexSubstitution("Greater Than", fr"\b(?:&GT;|gt|\>)\b", " greater than "),
-    RegexSubstitution("Greater Than 2", fr"\bgt(?=\d+)\b", "greater than "),
-    RegexSubstitution("Greater Than 3", fr"\>", " greater than "),
-    # WITH terms ===========
-    RegexSubstitution("With Out", fr"\bw{sep}(?:o|out)\b", "without"),
-    RegexSubstitution("With Out 2", fr"\bwo\b", "without"),
-    RegexSubstitution("Within", fr"\bw{sep}(?:i|in)\b", "within", priority=5),
-    RegexSubstitution(
-        "With Intent",
-        fr"\bw{sep}\s?in?t?e?n?t?\b",
-        "with intent",
-    ),
-    RegexSubstitution(
-        "with a",
-        fr"\bw{sep}a\b",
-        "with a",
-    ),
-    RegexSubstitution(
-        "with health",
-        fr"\bw{sep}health\b",
-        "with health",
-    ),
-    RegexSubstitution(
-        "with own",
-        fr"\bw{sep}own\b",
-        "with own",
-    ),
-    RegexSubstitution(
-        "with report",
-        fr"\bw{sep}report\b",
-        "with report",
-    ),
-    RegexSubstitution(
-        "with license",
-        fr"\bw{sep}license\b",
-        "with license",
-    ),
-    RegexSubstitution(
-        "with murder",
-        fr"\bw{sep}murder\b",
-        "with murder",
-    ),
-    RegexSubstitution(
-        "with injury",
-        fr"\bw{sep}(?:injury|inj|injry)\b",
-        "with injury",
-    ),
-    RegexSubstitution(
-        "with turned",
-        fr"\bw{sep}turned\b",
-        "with turned",
-    ),
-    RegexSubstitution(
-        "with altered",
-        fr"\bw{sep}alt\b",
-        "with altered",
-    ),
-    RegexSubstitution(
-        "with deadly",
-        fr"\bw{sep}deadly\b",
-        "with deadly",
-    ),
-    RegexSubstitution(
-        "with dangerous weapon",
-        fr"\b(?:with|w){sep}(?:dangerous|d){sep}(?:weapon|wpn|weapn|weap)\b",
-        "with dangerous weapon",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "with child",
-        fr"\b(?:with|w){sep}(?:child|chi|chld)\b",
-        "with child",
-    ),
-    RegexSubstitution(
-        "with minor",
-        fr"\bw{sep}minor\b",
-        "with minor",
-    ),
-    RegexSubstitution(
-        "with kidnapping",
-        fr"\bw{sep}kidnapping\b",
-        "with kidnapping",
-    ),
-    RegexSubstitution(
-        "with agency",
-        fr"\bw{sep}agency\b",
-        "with agency",
-    ),
-    RegexSubstitution(
-        "with firearm",
-        fr"\bw{sep}firearm\b",
-        "with firearm",
-    ),
-    RegexSubstitution(
-        "with weapon",
-        fr"\bw{sep}(?:weapon|wpn|weapn|weap)\b",
-        "with weapon",
-    ),
-    RegexSubstitution(
-        "with knife",
-        fr"\bw{sep}knife\b",
-        "with knife",
-    ),
-    RegexSubstitution(
-        "with force",
-        fr"\bw{sep}force\b",
-        "with force",
-    ),
-    RegexSubstitution(
-        "with extenuating circumstances",
-        fr"\bw{sep}ext{sep}circumstances\b",
-        "with extenuating circumstances",
-    ),
-    RegexSubstitution(
-        "with prior",
-        fr"\bw{sep}prior\b",
-        "with prior",
-    ),
-    RegexSubstitution(
-        "with previous",
-        fr"\bw{sep}previous\b",
-        "with previous",
-    ),
-    RegexSubstitution(
-        "with domestic violence",
-        fr"\bw{sep}dv\b",
-        "with domestic violence",
-    ),
-    RegexSubstitution(
-        "with suspended",
-        fr"\bw{sep}suspended\b",
-        "with suspended",
-    ),
-    RegexSubstitution(  # doublecheck this
-        "vehicle with",
-        fr"\bvehicle{sep}w{sep}",
-        "vehicle with",
-    ),
-    RegexSubstitution(  # TODO: is this "possession with" or "possession weapon"? see concealed weapon as example
-        "possession with",
-        fr"\b(?:possession|possess|poss){sep}w{sep}",
-        "possession with",
-    ),
-    RegexSubstitution(
-        "possession with intent",
-        fr"\bp{sep}with{sep}intent",
-        "possession with intent",
-        priority=30,
-    ),
-    RegexSubstitution(
-        "neglect with",
-        fr"\bneglect{sep}w{sep}",
-        "neglect with",
-    ),
-    RegexSubstitution(
-        "cooperate with",
-        fr"\bcooperate{sep}w{sep}",
-        "cooperate with",
-    ),
-    RegexSubstitution(
-        "interfere with",
-        fr"\b(?:inter|interfere){sep}w{sep}",
-        "interfere with",
-    ),
-    RegexSubstitution(  # TODO consolidate tamper/tampering?
-        "tamper with",
-        fr"\btamper{sep}w{sep}",
-        "tamper with",
-    ),
-    RegexSubstitution(
-        "tampering with",
-        fr"\btampering{sep}w{sep}",
-        "tampering with",
-    ),
-    RegexSubstitution(
-        "assault with",
-        fr"\bassault{sep}w{sep}",
-        "assault with",
-    ),
-    # FIREARM TERMS
-    RegexSubstitution(
-        "firearm with altered identification numbers",
-        fr"\bfirearm{sep}(?:with|w){sep}alter\b",
-        "firearm with altered identification numbers",
-    ),
-    RegexSubstitution(
-        "firearm",
-        fr"\bf{sep}a\b",
-        "firearm",
-    ),
-    RegexSubstitution(
-        "intimidation",
-        fr"\b(?:intim|intimid)\b",
-        "intimidation",
-    ),
-    # DOMESTIC VIOLENCE TERMS / PROTECTION / RESTRAINING ORDERS
-    RegexSubstitution(
-        "protective order",
-        fr"\b(?:protective|protection|prot){sep}(?:order|ord|or)\b",
-        "protective order",
-    ),
-    RegexSubstitution(
-        "domestic violence protective order",
-        r"\bdvpo\b",
-        "domestic violence protective order",
-    ),
-    RegexSubstitution("domestic", r"\bdom\b", "domestic", priority=20),
-    RegexSubstitution(
-        "domestic violence",
-        r"\bdv\b",
-        "domestic violence",
-    ),
-    RegexSubstitution(
-        "domestic violence 2",
-        fr"\bd{sep}v\b",
-        "domestic violence",
-    ),
-    RegexSubstitution(
-        "witness testimony",
-        fr"\bwit{sep}tes\b",
-        "witness testimony",
-    ),
-    # CONVICTION TERMS ==
-    RegexSubstitution(
-        "misdemeanor conviction",
-        fr"\b(?:misdemeanor|misd){sep}(?:convic|conv)\b",
-        "misdemeanor conviction",
-    ),
-    RegexSubstitution(
-        "prior conviction",
-        fr"\b(?:prior|pr|pri){sep}(?:convic|conv)\b",
-        "prior conviction",
-    ),
-    # ==== GENERAL TERMS =====
-    RegexSubstitution(  # NOTE: added a negative lookbehind for 'mentally' so we won't override 'mentally ill' cases
-        "illegal",
-        fr"\b(?<!mentally )(?:ill|illeg|illgl)\b",
-        "illegal",
-    ),
-    RegexSubstitution("commercial fish", fr"\bcomm{sep}fish\b", "commercial fish"),
-    RegexSubstitution("vessel", fr"\bvess\b", "vessel"),
-    RegexSubstitution(
-        "traffic control device",
-        fr"\btraff{sep}control{sep}dev\b",
-        "traffic control device",
-    ),
-    RegexSubstitution("non-culpable", fr"\bnonculp\b", "non-culpable"),
-    RegexSubstitution("prohibited", fr"\bprohib\b", "prohibited"),
-    RegexSubstitution("nuisance", fr"\bnuis\b", "nuisance"),
-    RegexSubstitution("obstruction", fr"\bobstr\b", "obstruction"),
-    RegexSubstitution("pedestrian", fr"\bped\b", "pedestrian"),
-    RegexSubstitution("conduct", fr"\bcond\b", "conduct", priority=20),
-    RegexSubstitution(
-        "subsequent",
-        fr"\bsubsq\b",
-        "subsequent",
-    ),
-    RegexSubstitution(
-        "disturbing the peace",
-        fr"\bdist{sep}peace\b",
-        "disturbing the peace",
-    ),
-    RegexSubstitution(
-        "offender accountability act",
-        fr"\boaa\b",
-        "offender accountability act",
-    ),
-    RegexSubstitution(
-        "against",
-        fr"\b(?:agnst|agin)\b",
-        "against",
-    ),
-    RegexSubstitution(
-        "child",
-        fr"\b(?:chil|chld)\b",
-        "child",
-    ),
-    RegexSubstitution(
-        "school",
-        fr"\bschl\b",
-        "school",
-    ),
-    RegexSubstitution(
-        "multiple",
-        fr"\bmult\b",
-        "multiple",
-    ),
-    RegexSubstitution(
-        "assailant",
-        fr"\bassail\b",
-        "assailant",
-    ),
-    RegexSubstitution(
-        "public disturbance",
-        fr"\b(?:public|pub|publ){sep}(?:disturbance|disturb|dist)\b",
-        "public disturbance",
-    ),
-    RegexSubstitution(
-        "interfere",
-        fr"\b(?:interf|interfer)\b",
-        "interfere",
-    ),
-    RegexSubstitution(  # TODO should we leave obstructing/obstruction separate terms or lump into obstruct?
-        "obstructing",
-        fr"\bob\b",
-        "obstructing",
-    ),
-    RegexSubstitution(
-        "law enforcement officer",
-        fr"\bleo\b",
-        "law enforcement officer",
-    ),
-    RegexSubstitution(
-        "officer",
-        fr"\b(?:offcr|ofcr)\b",
-        "officer",
-    ),
-    RegexSubstitution(
-        "minor",
-        fr"\b(?:min|minr|mnr)\b",
-        "minor",
-    ),
-    RegexSubstitution(
-        "distance within 300 feet of park",
-        fr"\bdist{sep}300{sep}park\b",
-        "distance within 300 feet of park",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distance within 300",
-        fr"{sep}dist{sep}w{sep}i{sep}300\b",
-        "distance within 300",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "major",
-        fr"\bmajr\b",
-        "major",
-    ),
-    RegexSubstitution(
-        "willful",
-        fr"\b(?:wilfl|wlfl)\b",
-        "willful",
-    ),
-    RegexSubstitution(
-        "issue worthless checks",
-        fr"\b(?:issue|iss){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b",
-        "worthless",
-    ),
-    RegexSubstitution(
-        "issue multiple worthless checks",
-        fr"\b(?:issue|iss){sep}(?:multiple|mltpl){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b",
-        "worthless",
-    ),
-    RegexSubstitution(
-        "unauthorized",
-        fr"\b(?:unauth|unau|unauthd)\b",
-        "unauthorized",
-    ),
-    RegexSubstitution(
-        "child support",
-        fr"\b(?:child|chld|chi){sep}(?:support|supp|sup)\b",
-        "child support",
-    ),
-    RegexSubstitution(
-        "unlawful",
-        r"\b(?:unlawfully|unlaw|unlawfl|unlawf|unlwfl|unl)\b",
-        "unlawful",
-    ),
-    RegexSubstitution(
-        "Possession",
-        r"\b(?:possess|poss?)\b",
-        "possession",
-    ),
-    RegexSubstitution(
-        "Abetting",
-        r"\b(?:abett|abetted)\b",
-        "Abetting",
-    ),
-    RegexSubstitution("emergency", r"\b(?:emerg|emer)\b", "emergency", priority=20),
-    RegexSubstitution(
-        "Attempted",
-        r"\b(?:att|atmpt)\b",
-        "attempted",
-    ),
-    RegexSubstitution(  # NOTE: added negative look ahead so we don't remap "at risk" to "attempted risk"
-        "Attempted 2",
-        r"\bat(?! risk)\b",
-        "attempted",
-    ),
-    RegexSubstitution(
-        "Battery",
-        r"\bbatt\b",
-        "battery",
-    ),
-    RegexSubstitution(
-        "Violation of Probation",
-        r"\bvop\b",
-        "violation of probation",
-    ),
-    RegexSubstitution(  # NOTE: removed 'con' because shows up in some DV-related text, may not be a one-size fits all regex / 'consp' to conspiracy or conspire?
-        "Conspiracy",
-        r"\b(?:consp|conspi|conspira|conspirc|consprc|consprcy|cnsprcy|conspr)\b",
-        "conspiracy",
-    ),
-    RegexSubstitution(
-        "Property",
-        r"\bprop\b",
-        "property",
-    ),
-    RegexSubstitution(
-        "public disturbance",
-        fr"\b(?:public|pub|publ){sep}(?:disturbance|dist)\b",
-        "public disturbance",
-    ),
-    RegexSubstitution(
-        "Criminal",
-        r"\bcrim\b",
-        "criminal",
-    ),
-    RegexSubstitution(
-        "License",
-        r"\blic\b",
-        "license",
-    ),
-    RegexSubstitution(
-        "Credit Card",
-        r"\bcc\b",
-        "credit card",
-    ),
-    RegexSubstitution(
-        "Credit Card 2",
-        r"\bcred{sep}crd\b",
-        "credit card",
-    ),
-    RegexSubstitution(
-        "exchange",
-        r"\bexch\b",
-        "exchange",
-    ),
-    RegexSubstitution(
-        "electric power",
-        fr"\belec{sep}pwr\b",
-        "electric power",
-    ),
-    RegexSubstitution(
-        "commit false", fr"\bcom?{sep}false\b", "commit false", priority=5
-    ),
-    # VEHICLE terms ===========
-    RegexSubstitution(
-        "Vehicle",
-        r"\b(?:veh|vehi)\b",
-        "vehicle",
-    ),
-    RegexSubstitution(
-        "Vehicles",
-        r"\bvehs\b",
-        "vehicles",
-    ),
-    RegexSubstitution(
-        "commercial motor vehicle",
-        r"\bcmv\b",
-        "commercial motor vehicle",
-    ),
-    RegexSubstitution(
-        "motor vehicle",
-        fr"\b(?:mtr|mot){sep}(?:vehicle|veh)\b",
-        "motor vehicle",
-    ),
-    RegexSubstitution(
-        "motor vehicle 2",
-        fr"\bm{sep}v\b",
-        "motor vehicle",
-    ),
-    RegexSubstitution(
-        "motor vehicle 3",
-        fr"\b(?:mtv|mv)\b",
-        "motor vehicle",
-    ),
-    RegexSubstitution("odometer", fr"\bodom\b", "odometer"),
-    RegexSubstitution(
-        "red light",
-        fr"\bred{sep}light\b",
-        "red light",
-    ),
-    RegexSubstitution(
-        "vehicle sound system",
-        fr"\bveh{sep}snd{sep}sys\b",
-        "vehicle sound system",
-        priority=20,
-    ),
-    # =====
-    RegexSubstitution(
-        "Assault",
-        r"\bass?lt\b",
-        "assault",
-    ),
-    RegexSubstitution(
-        "Assault 2",
-        r"\bass\b",
-        "assault",
-    ),
-    RegexSubstitution(
-        "Mentally",
-        r"\bment\b",
-        "mentally",
-    ),
-    RegexSubstitution(
-        "mentally ill",
-        r"\bmnt{sep}ill\b",
-        "mentally ill",
-    ),
-    RegexSubstitution(
-        "Unknown",
-        r"\bunk\b",
-        "unknown",
-    ),
-    RegexSubstitution(
-        "cohabitation",
-        r"\b(?:coh|cohbt)\b",
-        "cohabitation",
-    ),
-    RegexSubstitution(
-        "Statement",
-        r"\bstmt\b",
-        "statement",
-    ),
-    RegexSubstitution(
-        "Degree",
-        r"\bdegr?e?\b",
-        "degree",
-    ),
-    RegexSubstitution(
-        "Felony",
-        r"\b(?:fe|fel|felo|felny|fl|flny)\b",
-        "felony",
-    ),
-    RegexSubstitution(
-        "misdemeanor",
-        r"\bmisd\b",
-        "misdemeanor",
-    ),
-    # AGE
-    RegexSubstitution(
-        "years of age",
-        r"\byoa\b",
-        "years of age",
-    ),
-    RegexSubstitution(
-        "year",
-        r"\byr\b",
-        "year",
-    ),
-    RegexSubstitution(
-        "year 2",
-        r"(?!\d+)yr\b",
-        " year",
-    ),
-    RegexSubstitution(
-        "elderly",
-        r"\beldrly\b",
-        "elderly",
-    ),
-    RegexSubstitution(
-        "under",
-        r"\b(?:und|undr)\b",
-        "under",
-    ),
-    # AGE / FEMALE
-    RegexSubstitution(
-        "female",
-        fr"\bfem\b",
-        "female",
-    ),
-    RegexSubstitution(
-        "age female",
-        fr"\bage{sep}f\b",
-        "age female",
-    ),
-    RegexSubstitution(
-        "old female",
-        fr"\bold{sep}f\b",
-        "old female",
-    ),
-    RegexSubstitution(
-        "older female",
-        fr"\bolder{sep}f\b",
-        "older female",
-    ),
-    RegexSubstitution(
-        "13 female",
-        fr"\b13{sep}f\b",
-        "13 female",
-    ),
-    RegexSubstitution(
-        "15 female",
-        fr"\b15{sep}f\b",
-        "15 female",
-    ),
-    RegexSubstitution(
-        "17 female",
-        fr"\b17{sep}f\b",
-        "17 female",
-    ),
-    # AGE / MALE
-    RegexSubstitution(
-        "age male",
-        fr"\bage{sep}m\b",
-        "age male",
-    ),
-    RegexSubstitution(
-        "old male",
-        fr"\bold{sep}m\b",
-        "old male",
-    ),
-    RegexSubstitution(
-        "older male",
-        fr"\bolder{sep}m\b",
-        "older male",
-    ),
-    RegexSubstitution(
-        "13 male",
-        fr"\b13{sep}m\b",
-        "13 male",
-    ),
-    RegexSubstitution(
-        "15 male",
-        fr"\b15{sep}m\b",
-        "15 male",
-    ),
-    RegexSubstitution(
-        "17 male",
-        fr"\b17{sep}m\b",
-        "17 male",
-    ),
-    # ======
-    RegexSubstitution(
-        "Robbery",
-        r"\brobb\b",
-        "robbery",
-    ),
-    RegexSubstitution(
-        "Attempted Robbery",
-        fr"\battempted{sep}(?:rob|robb)\b",
-        "attempted robbery",
-    ),
-    RegexSubstitution(
-        "Detainer Robbery",
-        fr"\bdetainer{sep}(?:rob|robb)\b",
-        "detainer robbery",
-    ),
-    RegexSubstitution(
-        "Aggravated",
-        r"\b(?:agg|aggrav|aggr|aggravted)\b",
-        "aggravated",
-    ),
-    RegexSubstitution(
-        "Forced",
-        r"\bfrc\b",
-        "forced",
-    ),
-    RegexSubstitution(
-        "Danger",
-        r"\bdng\b",
-        "danger",
-    ),
-    RegexSubstitution(
-        "Abetting",
-        r"\babet\b",
-        "abetting",
-    ),
-    RegexSubstitution(
-        "Acquaintance",
-        r"\b(?:acquant|acq|acquaint|acquain)\b",
-        "acquaintance",
-    ),
-    RegexSubstitution(
-        "Breaking and Entering",
-        r"\bB ?& ?E\b",
-        "breaking and entering",
-    ),
-    RegexSubstitution("Building", r"\bbldg\b", "building"),
-    RegexSubstitution(
-        "Adult",
-        r"\badlt\b",
-        "adult",
-    ),
-    RegexSubstitution(
-        "Deliver",
-        r"\bdel\b",
-        "deliver",
-    ),
-    RegexSubstitution(
-        "Family",
-        r"\bfam\b",
-        "family",
-    ),
-    RegexSubstitution(
-        "Burglary",
-        r"\bburg\b",
-        "burglary",
-    ),
-    RegexSubstitution(
-        "Murder",
-        r"\bmur\b",
-        "murder",
-    ),
-    RegexSubstitution(
-        "conspiracy to commit",
-        fr"\bconsp{sep}comm\b",
-        "conspiracy to commit",
-    ),
-    RegexSubstitution(
-        "Representation",
-        r"\brep\b",
-        "representation",
-    ),
-    RegexSubstitution(
-        "Previous",
-        r"\bprev\b",
-        "previous",
-    ),
-    RegexSubstitution(  # TODO revisit this - 'com' can also be 'commit'
-        "Common",
-        r"\bcom\b",
-        "common",
-    ),
-    RegexSubstitution(
-        "of a",
-        r"\bofa\b",
-        "of a",
-    ),
-    RegexSubstitution(  # TODO revisit this - 'viol' relates to 'violation' too
-        "violent",
-        r"\bviol\b",
-        "violent",
-    ),
-    RegexSubstitution(
-        "perform",
-        r"\bperf\b",
-        "perform",
-    ),
-    RegexSubstitution(
-        "household",
-        r"\b(?:hh|hsehld|hhld)\b",
-        "household",
-    ),
-    RegexSubstitution(
-        "Other",
-        r"\both\b",
-        "other",
-    ),
-    # WEAPON TERMS =========
-    RegexSubstitution(
-        "Weapon", r"\b(?:wea|wpn|weapn|weap|weapo)\b", "weapon", priority=20
-    ),
-    RegexSubstitution(
-        "Weapons", r"\b(?:wea|wpn|weapn|weap|weapo)s\b", "weapons", priority=20
-    ),
-    RegexSubstitution("dangerous weapon", r"\b(?:dwpn|dw)\b", "dangerous weapon"),
-    RegexSubstitution(
-        "dangerous weapon 2", fr"\bd{sep}(?:w|wpn)\b", "dangerous weapon"
-    ),
-    RegexSubstitution(
-        "concealed weapon", fr"\bconcealed{sep}(?:w|wpn)\b", "concealed weapon"
-    ),
-    # HARM terms =======
-    RegexSubstitution(
-        "Bodily Harm",
-        fr"\b(?:bod{sep}ha?rm|bh)\b",
-        "bodily harm",
-    ),
-    RegexSubstitution(
-        "physical",
-        fr"\bphy\b",
-        "physical",
-    ),
-    RegexSubstitution(
-        "harmful",
-        fr"\bharmfl\b",
-        "harmful",
-    ),
-    RegexSubstitution(
-        "Great Bodily",
-        fr"\b(?:gr|grt){sep}bodily\b",
-        "great bodily",
-    ),
-    RegexSubstitution(
-        "Great Bodily Injury",
-        fr"\bgbi\b",
-        "great bodily injury",
-    ),
-    RegexSubstitution(
-        "Substantial Bodily Harm",
-        r"\bsbh\b",
-        "substantial bodily harm",
-    ),
-    RegexSubstitution(
-        "injury",
-        r"\b(?:injry|inj)\b",
-        "injury",
-    ),
-    RegexSubstitution(
-        "inflict",
-        r"\binflt\b",
-        "inflict",
-    ),
-    RegexSubstitution(
-        "Great Bodily Harm",
-        fr"\bgr{sep}bod{sep}harm\b",
-        "great bodily harm",
-    ),
-    RegexSubstitution(
-        "Great Bodily Harm 2",
-        fr"\bgbh\b",
-        "great bodily harm",
-    ),
-    # ====
-    RegexSubstitution(  # TODO: revisit PERS can be person too
-        "Personal",
-        r"\bpers\b",
-        "personal",
-    ),
-    RegexSubstitution(
-        "persons",
-        r"\bprsns\b",
-        "persons",
-    ),
-    RegexSubstitution(
-        "person",
-        r"\b(?:prsn|per|perso)\b",
-        "person",
-    ),
-    RegexSubstitution("election day", fr"\belec{sep}day\b", "election day"),
-    RegexSubstitution(
-        "temporary",
-        r"\btemp\b",
-        "temporary",
-    ),
-    RegexSubstitution(
-        "improper",
-        r"\bimprop\b",
-        "improper",
-    ),
-    RegexSubstitution(
-        "false",
-        r"\bfls\b",
-        "false",
-    ),
-    RegexSubstitution(
-        "responsibility",
-        r"\bresp\b",
-        "responsibility",
-    ),
-    RegexSubstitution(
-        "advertise",
-        r"\bad\b",
-        "advertise",
-    ),
-    RegexSubstitution(
-        "imprisonment",
-        r"\b(?:imprison|impris|imprsn)\b",
-        "imprisonment",
-    ),
-    RegexSubstitution(
-        "prohibited",
-        r"\bproh\b",
-        "prohibited",
-    ),
-    RegexSubstitution(
-        "under influence",
-        fr"\bunder{sep}(?:infl|influ)\b",
-        "under influence",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "stolen",
-        r"\bstln\b",
-        "stolen",
-    ),
-    RegexSubstitution(
-        "years",
-        r"\byrs\b",
-        "years",
-    ),
-    RegexSubstitution(
-        "intent",
-        r"\bint\b",
-        "intent",
-    ),
-    RegexSubstitution(
-        "passage",
-        r"\bpassg\b",
-        "passage",
-    ),
-    RegexSubstitution(
-        "withdraw",
-        r"\bwit\b",
-        "withdraw",
-    ),
-    RegexSubstitution(
-        "manufacturing or delivering",
-        r"\bman\Wdel\b",
-        "manufacturing delivering",
-    ),
-    RegexSubstitution(  # Revisit this
-        "minimum mandatory",
-        r"\bmin\Wman\b",
-        "minimum mandatory",
-    ),
-    RegexSubstitution(
-        "stranger",
-        r"\bstr(?:ngr)?\b",
-        "stranger",
-    ),
-    RegexSubstitution(
-        "personal use",
-        r"\bpers use\b",
-        "personal use",
-    ),
-    RegexSubstitution(
-        "force",
-        r"\bfo?rc\b",
-        "force",
-    ),
-    RegexSubstitution(
-        "operate",
-        r"\b(?:oper|op|opr)\b",
-        "operate",
-    ),
-    RegexSubstitution(
-        "occupied",
-        r"\bocc\b",
-        "occupied",
-    ),
-    RegexSubstitution(
-        "health care facility",
-        r"\bhealth{sep}care{sep}fac\b",
-        "health care facility",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "residence",
-        r"\bres\b",
-        "residence",
-    ),
-    RegexSubstitution(
-        "terrorism threats",
-        fr"\bterr{sep}(?:thre|thrts)\b",
-        "terrorism threats",
-    ),
-    RegexSubstitution(
-        "false report",
-        fr"\bfals{sep}rprt\b",
-        "false report",
-    ),
-    RegexSubstitution(
-        "government",
-        r"\bgovt\b",
-        "government",
-    ),
-    RegexSubstitution(
-        "advocating",
-        r"\badvoc\b",
-        "advocating",
-    ),
-    RegexSubstitution(
-        "government property",
-        r"\bgov{sep}property\b",
-        "government property",
-    ),
-    RegexSubstitution(
-        "general assembly",
-        r"\bgen{sep}assembly\b",
-        "general assembly",
-    ),
-    RegexSubstitution(  # NOTE: added negative lookahead because was seeing "by off" when updating statutory rape terms & "by offense" is not correct
-        "offense",
-        fr"\b(?<!by )(?:offense|offen|off|offe)\b",
-        "offense",
-    ),
-    RegexSubstitution(
-        "information",
-        fr"\b(?:info|infor)\b",
-        "information",
-    ),
-    # LEWD charge cat
-    RegexSubstitution(
-        "pornography",
-        fr"\b(?:porn|porno)\b",
-        "pornography",
-    ),
-    RegexSubstitution(
-        "compelling",
-        fr"\bcompel\b",
-        "compelling",
-    ),
-    RegexSubstitution(
-        "prostitution",
-        fr"\bprostit\b",
-        "prostitution",
-    ),
-    RegexSubstitution(
-        "computer",
-        fr"\bcomputr\b",
-        "computer",
-    ),
-    RegexSubstitution(
-        "incapable",
-        fr"\bincap\b",
-        "incapable",
-    ),
-    RegexSubstitution(
-        "juvenile",
-        fr"\b(?:juv|juven)\b",
-        "juvenile",
-    ),
-    RegexSubstitution(
-        "involving",
-        fr"\b(?:involv|invlv)\b",
-        "involving",
-    ),
-    RegexSubstitution(
-        "equipment",
-        fr"\bequip\b",
-        "equipment",
-    ),
-    RegexSubstitution(
-        "hazardous",
-        fr"\bhaz\b",
-        "hazardous",
-    ),
-    RegexSubstitution(  # NOTE: assault and battery unless A,B is followed by C
-        "assault and battery",
-        fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!c)\b",
-        "assault and battery",
-    ),
-    RegexSubstitution(  # NOTE: assault and battery unless A,B is followed by C
-        "assault and battery 2",
-        fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!\Wc)\b",
-        "assault and battery",
-    ),
-    RegexSubstitution(  # NOTE: assault and battery unless A,B is followed by C
-        "assault and battery 2",
-        fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?! c)\b",
-        "assault and battery",
-    ),
-    RegexSubstitution(
-        "promote distribution",
-        fr"\bpromote{sep}distrb\b",
-        "promote distribution",
-    ),
-    RegexSubstitution(
-        "child molestation first degree",
-        fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}1\b",
-        "child molestation first degree",
-    ),
-    RegexSubstitution(
-        "child molestation second degree",
-        fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}2\b",
-        "child molestation second degree",
-    ),
-    RegexSubstitution(
-        "child molestation third degree",
-        fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}3\b",
-        "child molestation third degree",
-    ),
-    RegexSubstitution(
-        "child molestation",
-        fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol)\b",
-        "child molestation",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "molestation",
-        fr"\b(?:molestation|molest|mol)\b",
-        "molestation",
-    ),
-    RegexSubstitution(
-        "indecent conduct exposure",
-        fr"\bind{sep}cond{sep}expos\b",
-        "indecent conduct exposure",
-    ),
-    RegexSubstitution(
-        "indecent",
-        fr"\bindec\b",
-        "indecent",
-    ),
-    RegexSubstitution(
-        "indecent liberties",
-        fr"\bind{sep}lib\b",
-        "indecent liberties",
-    ),
-    RegexSubstitution(
-        "moving",
-        fr"\bmov\b",
-        "moving",
-    ),
-    RegexSubstitution(
-        "depiction",
-        fr"\bdptn\b",
-        "depiction",
-    ),
-    RegexSubstitution(
-        "child luring",
-        fr"\bchil{sep}lrng\b",
-        "child luring",
-    ),
-    RegexSubstitution(
-        "dissemination",
-        fr"\b(?:dissm|dissem)\b",
-        "dissemination",
-    ),
-    RegexSubstitution(
-        "possession of depictions of minor engaged in sexually explicit conduct",
-        fr"\bposs{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
-        "possession of depictions of minor engaged in sexually explicit conduct",
-        priority=3,
-    ),
-    RegexSubstitution(
-        "dealing of depictions of minor engaged in sexually explicit conduct",
-        fr"\bdeal{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
-        "dealing of depictions of minor engaged in sexually explicit conduct",
-        priority=3,
-    ),
-    RegexSubstitution(
-        "viewing of depictions of minor engaged in sexually explicit conduct",
-        fr"\bview{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
-        "viewing of depictions of minor engaged in sexually explicit conduct",
-        priority=3,
-    ),
-    RegexSubstitution(
-        "online sexual corruption of a child",
-        fr"\bonline{sep}sex{sep}corrupt{sep}child\b",
-        "online sexual corruption of a child",
-    ),
-    RegexSubstitution(
-        "lewd or lascivious act",
-        fr"\b(?:L\&L|L{sep}L)\b",
-        "lewd or lascivious act",
-    ),
-    RegexSubstitution(
-        "exposure",
-        r"\bexpos\b",
-        "exposure",
-    ),
-    # SEXUAL OFFENSES  =====
-    RegexSubstitution(
-        "Criminal Sexual Conduct",
-        r"\bcsc\b",
-        "criminal sexual conduct",
-    ),
-    RegexSubstitution(
-        "sexual",
-        r"\bsexl\b",
-        "sexual",
-    ),
-    RegexSubstitution(
-        "explicit",
-        r"\bexplct\b",
-        "explicit",
-    ),
-    RegexSubstitution(
-        "sexual offense",
-        fr"\b(?:sexual|sex){sep}(?:offense|offen|off)\b",
-        "sexual offense",
-    ),
-    RegexSubstitution(
-        "sexual offenses",
-        fr"\b(?:sexual|sex){sep}(?:offense|offen|off)s\b",
-        "sexual offenses",
-    ),
-    RegexSubstitution(
-        "sexual assault",
-        fr"\b(?:sexual|sex){sep}(?:assault|assult|assualt|ass|asst)\b",
-        "sexual assault",
-    ),
-    RegexSubstitution(
-        "sexual contact",
-        fr"\b(?:sexual|sex){sep}(?:contact)\b",
-        "sexual contact",
-    ),
-    RegexSubstitution(
-        "sexual act",
-        fr"\b(?:sexual|sex){sep}(?:act|acts)\b",
-        "sexual act",
-    ),
-    RegexSubstitution(
-        "sexual act 2",
-        fr"\bsxact\b",
-        "sexual act",
-    ),
-    RegexSubstitution(
-        "sexual abuse",
-        fr"\b(?:sexual|sex){sep}(?:abuse|ab)\b",
-        "sexual abuse",
-    ),
-    RegexSubstitution(
-        "commit sex abuse",
-        fr"\bcomm{sep}sex{sep}abuse\b",
-        "commit sex abuse",
-    ),
-    RegexSubstitution(
-        "commit sex act",
-        fr"\bcomm{sep}sex{sep}act\b",
-        "commit sex act",
-    ),
-    RegexSubstitution(
-        "commit sex abuse minor",
-        fr"\bcommsexabuseminor\b",
-        "commit sex abuse minor",
-        priority=20,
-    ),
-    RegexSubstitution(
-        "sexual battery",
-        fr"\b(?:sexual|sex){sep}(?:battery|batt|bat)\b",
-        "sexual battery",
-    ),
-    RegexSubstitution(  # TODO: should these actually map to "sexual misconduct"?
-        "sexual conduct",
-        fr"\b(?:sexual|sex){sep}(?:conduct|cndct|cond|con)\b",
-        "sexual conduct",
-    ),
-    RegexSubstitution(
-        "sexual penetration",
-        fr"\b(?:sexual|sex){sep}(?:penetration|pen)\b",
-        "sexual penetration",
-    ),
-    RegexSubstitution(  # TODO: Revisit - hard to tell if exp/expl maps to "exploitation" or "explicit"
-        "sexual exploitation",
-        fr"\b(?:sexual|sex){sep}(?:exploitation|exploit)\b",
-        "sexual exploitation",
-    ),
-    RegexSubstitution(
-        "sexual performance",
-        fr"\b(?:sexual|sex){sep}(?:performance|perform)\b",
-        "sexual performance",
-    ),
-    RegexSubstitution(
-        "sexual imposition",
-        fr"\b(?:sexual|sex){sep}(?:imposition|imp)\b",
-        "sexual imposition",
-    ),
-    RegexSubstitution(
-        "sex with",
-        fr"\bsex{sep}w\b",
-        "sex with",
-    ),
-    RegexSubstitution(  # TODO: Revisit - hard to tell if offen/off maps to "offender" or "offense"
-        "sex offender",
-        fr"\b(?:sexual|sex){sep}(?:offender|offend|offndr|ofndr)\b",
-        "sex offender",
-    ),
-    RegexSubstitution(
-        "sexual predator",
-        fr"\b(?:sexual|sex){sep}(?:predator|pred)\b",
-        "sexual predator",
-    ),
-    RegexSubstitution(
-        "voluntary sexual relations",
-        fr"\bvol{sep}sex{sep}rel\b",
-        "voluntary sexual relations",
-    ),
-    RegexSubstitution(
-        "sex related",
-        fr"\bsex{sep}(?:reltd|rel)\b",
-        "sex related",
-    ),
-    RegexSubstitution(
-        "sex related 2",
-        fr"\bsexreltd\b",
-        "sex related",
-    ),
-    RegexSubstitution(
-        "statutory rape",
-        fr"\bstat{sep}rape\b",
-        "statutory rape",
-    ),
-    RegexSubstitution(
-        "rape first degree",
-        fr"\brape{sep}(?:1|1st|i)\b",
-        "rape first degree",
-    ),
-    RegexSubstitution(
-        "rape second degree",
-        fr"\brape{sep}(?:2|2nd|ii)\b",
-        "rape second degree",
-    ),
-    RegexSubstitution(
-        "rape third degree",
-        fr"\brape{sep}(?:3|3rd|iii)\b",
-        "rape third degree",
-    ),
-    RegexSubstitution(
-        "sodomy first degree",
-        fr"\bsodomy{sep}(?:1|1st|i)\b",
-        "sodomy first degree",
-    ),
-    RegexSubstitution(
-        "sodomy second degree",
-        fr"\bsodomy{sep}(?:2|2nd|ii)\b",
-        "sodomy second degree",
-    ),
-    RegexSubstitution(
-        "sodomy third degree",
-        fr"\bsodomy{sep}(?:3|3rd|iii)\b",
-        "sodomy third degree",
-    ),
-    RegexSubstitution(
-        "incest first degree",
-        fr"\bincest{sep}(?:1|1st|i)\b",
-        "incest first degree",
-    ),
-    RegexSubstitution(
-        "incest second degree",
-        fr"\bincest{sep}(?:2|2nd|ii)\b",
-        "incest second degree",
-    ),
-    RegexSubstitution(
-        "sex first degree",
-        fr"\bsex{sep}(?:1|1st|i)\b",
-        "sex first degree",
-    ),
-    RegexSubstitution(
-        "sex second degree",
-        fr"\bsex{sep}(?:2|2nd|ii)\b",
-        "sex second degree",
-    ),
-    RegexSubstitution(
-        "criminal sexual conduct first degree",
-        fr"\bcsc{sep}(?:1|1st|i)\b",
-        "criminal sexual conduct first degree",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "criminal sexual conduct second degree",
-        fr"\bcsc{sep}(?:2|2nd|ii)\b",
-        "criminal sexual conduct second degree",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "criminal sexual conduct third degree",
-        fr"\bcsc{sep}(?:3|3rd|ii)\b",
-        "criminal sexual conduct third degree",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "criminal sexual conduct fourth degree",
-        fr"\bcsc{sep}(?:4|4th|iv)\b",
-        "criminal sexual conduct fourth degree",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "sodomy",
-        r"\bsod\b",
-        "sodomy",
-    ),
-    RegexSubstitution(
-        "engage sexual act",
-        fr"\benga{sep}sex{sep}act\b",
-        "engage sexual act",
-    ),
-    RegexSubstitution(
-        "engage sexual act 2",
-        fr"\beng{sep}sex\b",
-        "engage sexual act",
-    ),
-    RegexSubstitution("no force", fr"\bno{sep}frc\b", "no force", priority=5),
-    RegexSubstitution(
-        "force or coercion",
-        fr"\bfrc{sep}or{sep}coercn\b",
-        "force or coercion",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "coercion",
-        fr"\b(?:coer|coercn)\b",
-        "coercion",
-    ),
-    RegexSubstitution(
-        "position of authority",
-        fr"\bpos{sep}auth\b",
-        "position of authority",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "position of authority 2",
-        fr"\bpos{sep}of{sep}auth\b",
-        "position of authority",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "person in authority",
-        fr"\bper{sep}aut\b",
-        "person in authority",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "other family",
-        fr"\b(?:othr|oth|other){sep}(?:family|fam)\b",
-        "other family",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "immoral",
-        fr"\b(?:immoral|imoral|imm|imor)\b",
-        "immoral",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "purpose",
-        fr"\bpurp\b",
-        "purpose",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "communication with minor for immoral purpose",
-        fr"\b(?:communication|comm|com){sep}(?:with|w){sep}(?:minor|min){sep}(?:immoral|imoral|imm|imor)\b",
-        "communication with minor for immoral purpose",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "communication with minor for immoral purpose 2",
-        fr"\bcomm{sep}minor{sep}imm\b",
-        "communication with minor for immoral purpose",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "communication with minor",
-        fr"\bcom{sep}w{sep}minor\b",
-        "communication with minor",
-        priority=4,
-    ),
-    # EMBEZZLEMENT ===
-    RegexSubstitution(
-        "Embezzlement",
-        r"\b(?:embezzle|embezz|embez|embzzlmnt|embz)\b",
-        "embezzlement",
-    ),
-    RegexSubstitution(
-        "real estate",
-        fr"\breal{sep}estat\b",
-        "real estate",
-    ),
-    RegexSubstitution(
-        "chattel",
-        r"\bchatl\b",
-        "chattel",
-    ),
-    RegexSubstitution(
-        "received",
-        r"\b(?:receiv|rcvd)\b",
-        "received",
-    ),
-    RegexSubstitution(
-        "mortgagor",
-        r"\bmortgr\b",
-        "mortgagor",
-    ),
-    RegexSubstitution(
-        "agreement",
-        r"\bagrmnt\b",
-        "agreement",
-    ),
-    RegexSubstitution(
-        "public",
-        fr"\b(?:pub|publ|pblc)\b",
-        "public",
-    ),
-    RegexSubstitution(
-        "behavior",
-        r"\bbehav\b",
-        "behavior",
-    ),
-    RegexSubstitution(
-        "private",
-        r"\bpriv\b",
-        "private",
-    ),
-    RegexSubstitution(
-        "corporation",
-        fr"\bcorp\b",
-        "corporation",
-    ),
-    RegexSubstitution(
-        "purchase",
-        fr"\bpurc\b",
-        "purchase",
-    ),
-    RegexSubstitution(  # NOTE: pol may also be police - saw pol dog for example (police dog)
-        "political",
-        fr"\b(?:pol|polit|politcl)\b",
-        "political",
-    ),
-    RegexSubstitution("police dog", fr"\bpol{sep}dog\b", "police dog", priority=5),
-    RegexSubstitution(
-        "payroll",
-        fr"\bpayrll\b",
-        "payroll",
-    ),
-    RegexSubstitution(
-        "law enforcement",
-        fr"\blaw{sep}enf\b",
-        "law enforcement",
-    ),
-    RegexSubstitution(
-        "incident",
-        fr"\bincdnt\b",
-        "incident",
-    ),
-    RegexSubstitution(
-        "report",
-        fr"\brept\b",
-        "report",
-    ),
-    RegexSubstitution(
-        "transfer",
-        fr"\btrnsf\b",
-        "transfer",
-    ),
-    RegexSubstitution(
-        "capital assets",
-        fr"\bcptl{sep}asts\b",
-        "capital assets",
-    ),
-    RegexSubstitution(
-        "clerk of court",
-        fr"\bclrk{sep}of{sep}crt\b",
-        "clerk of court",
-    ),
-    RegexSubstitution(
-        "insufficient",
-        fr"\binsuf\b",
-        "insufficient",
-    ),
-    RegexSubstitution(
-        "corporate officer", fr"\bcorp{sep}officer\b", "corporate officer", priority=5
-    ),
-    RegexSubstitution(
-        "institution",
-        fr"\b(?:instit|inst)\b",
-        "institution",
-    ),
-    RegexSubstitution(
-        "organization",
-        fr"\borg\b",
-        "organization",
-    ),
-    RegexSubstitution(
-        "animals",
-        fr"\banmls\b",
-        "animals",
-    ),
-    RegexSubstitution(
-        "animal",
-        fr"\banml\b",
-        "animal",
-    ),
-    RegexSubstitution(
-        "software",
-        fr"\bsoftwr\b",
-        "software",
-    ),
-    RegexSubstitution(
-        "transit or service bus",
-        fr"\btrans{sep}serv{sep}bus\b",
-        "transit or service bus",
-    ),
-    RegexSubstitution(
-        "insurance agent",
-        fr"\binsur{sep}agent\b",
-        "insurance agent",
-    ),
-    RegexSubstitution(
-        "official",
-        fr"\b(?:offic|offl|offcl|officl)\b",
-        "official",
-    ),
-    RegexSubstitution(  # TODO: is 'misapp' ... misappropriation or misapplication?
-        "misappropriation",
-        fr"\b(?:misappro|misapp)\b",
-        "misappropriation",
-    ),
-    RegexSubstitution(
-        "misapplication",
-        fr"\bmisapl\b",
-        "misappropriation",
-    ),
-    RegexSubstitution(
-        "fiduciary",
-        fr"\bfiduc\b",
-        "fiduciary",
-    ),
-    RegexSubstitution(
-        "financial",
-        fr"\bfinan\b",
-        "financial",
-    ),
-    RegexSubstitution(
-        "funds",
-        fr"\bfnds\b",
-        "funds",
-    ),
-    # FELONY - UNSPECIFIED terms
-    RegexSubstitution(
-        "rendering assistance",
-        fr"\brend{sep}assist\b",
-        "rendering assistance",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "criminal assistance",
-        fr"\b(?:crim|criminal){sep}assist\b",
-        "criminal assistance",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "consummate",
-        fr"\b(?:consu|consummat)\b",
-        "consummate",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "deliver",
-        fr"\bdelive\b",
-        "deliver",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "to commit",
-        fr"\bto{sep}comm\b",
-        "to commit",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "violation of",
-        fr"\b(?:viol?|vio){sep}of\b",
-        "violation of",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "violation of civil",
-        fr"\bvol?{sep}civil\b",
-        "violation of civil",
-        priority=4,
-    ),
-    RegexSubstitution("rendering", fr"\brend\b", "rendering"),
-    RegexSubstitution(
-        "assistance first degree",
-        fr"\bassistance{sep}1\b",
-        "assistance first degree",
-        priority=30,
-    ),
-    RegexSubstitution(
-        "assistance second degree",
-        fr"\bassistance{sep}2\b",
-        "assistance second degree",
-        priority=30,
-    ),
-    RegexSubstitution(
-        "assistance third degree",
-        fr"\bassistance{sep}3\b",
-        "assistance third degree",
-        priority=30,
-    ),
-    RegexSubstitution(
-        "class",
-        fr"\bclas\b",
-        "class",
-    ),
-    RegexSubstitution(
-        "accessory",
-        fr"\b(?:accessry|accsry)\b",
-        "accessory",
-    ),
-    RegexSubstitution(
-        "dependency",
-        fr"\bdepndncy\b",
-        "dependency",
-    ),
-    RegexSubstitution(
-        "unspecified",
-        fr"\bunspfd\b",
-        "unspecified",
-    ),
-    RegexSubstitution(
-        "responsibility",
-        fr"\brespon?\b",
-        "responsibility",
-    ),
-    RegexSubstitution(
-        "classification",
-        fr"\bclassif\b",
-        "classification",
-    ),
-    RegexSubstitution(
-        "vice president",
-        fr"\bvp\b",
-        "vice president",
-        priority=30,
-    ),
-    # BRIBERY terms
-    RegexSubstitution(
-        "personal",
-        fr"\bpersona\b",
-        "personal",
-    ),
-    RegexSubstitution(
-        "assistance",
-        fr"\basst\b",
-        "assistance",
-    ),
-    RegexSubstitution(
-        "service",
-        fr"\bserv\b",
-        "service",
-    ),
-    RegexSubstitution(
-        "facilitation",
-        fr"\b(?:facil|fac)\b",
-        "facilitation",
-    ),
-    RegexSubstitution(
-        "smuggling",
-        fr"\bsmug\b",
-        "smuggling",
-    ),
-    RegexSubstitution(
-        "health",
-        fr"\bhlth\b",
-        "health",
-    ),
-    RegexSubstitution(  # NOTE: 'off' tends to be 'offense' hence the priority on this one
-        "official position", fr"\boff{sep}position\b", "official position", priority=5
-    ),
-    RegexSubstitution(
-        "participants",
-        fr"\bparticipnts\b",
-        "participants",
-    ),
-    RegexSubstitution(
-        "contestant",
-        fr"\bcntst\b",
-        "contestant",
-    ),
-    RegexSubstitution(
-        "accept",
-        fr"\baccpt\b",
-        "accept",
-    ),
-    RegexSubstitution(
-        "campaign contribution",
-        fr"\bcamp{sep}cont\b",
-        "campaign contribution",
-    ),
-    RegexSubstitution(
-        "influence",
-        fr"\b(?:inflnce|influenc)\b",
-        "influence",
-    ),
-    RegexSubstitution(
-        "compensation",
-        fr"\bcompens\b",
-        "compensation",
-    ),
-    RegexSubstitution(
-        "treatment",
-        fr"\btreatm\b",
-        "treatment",
-    ),
-    RegexSubstitution(
-        "commercial bribe",
-        fr"\b(?:comm|comm\'l){sep}bribe\b",
-        "commercial bribe",
-    ),
-    RegexSubstitution(
-        "false testimony",
-        fr"\bfalse{sep}test\b",
-        "false testimony",
-    ),
-    RegexSubstitution(
-        "miscellaneous",
-        fr"\bmisc\b",
-        "miscellaneous",
-    ),
-    RegexSubstitution(
-        "impersonating",
-        fr"\bimpers\b",
-        "impersonating",
-    ),
-    RegexSubstitution(
-        "receiving",
-        fr"\brecv\b",
-        "receiving",
-    ),
-    RegexSubstitution(
-        "interfere with official process",
-        fr"\binterfere{sep}w{sep}offc{sep}proc\b",
-        "interfere with official process",
-        priority=5,
-    ),
-    RegexSubstitution("public record", fr"\b(?:public|pub){sep}rec\b", "public record"),
-    RegexSubstitution(
-        "public servant",
-        fr"\b(?:public|pub){sep}(?:servant|srv|srvnt)\b",
-        "public servant",
-    ),
-    RegexSubstitution(  # NOTE: 'wit' also maps to 'withdraw', hence priority here
-        "witness juror",
-        fr"\b(?:witness|wit){sep}(?:juror|jur)\b",
-        "witness juror",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "umpire referee", fr"\b(?:umpire|ump){sep}(?:referee|ref)\b", "umpire referee"
-    ),
-    # FAMILY RELATED OFFENSES
-    RegexSubstitution(
-        "custody interference",
-        fr"\bcust{sep}inter\b",
-        "custody interference",
-    ),
-    RegexSubstitution(
-        "custody interference second degree",
-        fr"\bcust{sep}inter{sep}2\b",
-        "custody interference second degree",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "abandonment",
-        fr"\babandonmnt\b",
-        "abandonment",
-    ),
-    RegexSubstitution(
-        "unattended",
-        fr"\bunatt\b",
-        "unattended",
-    ),
-    RegexSubstitution(
-        "endanger",
-        fr"\b(?:endngr|endgr|endang)\b",
-        "endanger",
-    ),
-    RegexSubstitution(
-        "welfare",
-        fr"\b(?:wlfre|wlfr)\b",
-        "welfare",
-    ),
-    RegexSubstitution(
-        "endanger welfare",
-        fr"\b(?:endngr|endgr|endang){sep}(?:wlfre|wlfr|wel)\b",
-        "endanger welfare",
-    ),
-    RegexSubstitution(
-        "neglect",
-        fr"\bneglct\b",
-        "neglect",
-    ),
-    RegexSubstitution(
-        "contribute",
-        fr"\bcontrib\b",
-        "contribute",
-    ),
-    RegexSubstitution(
-        "delinquincy",
-        fr"\b(?:dlnqncy|delinq)\b",
-        "delinquincy",
-    ),
-    RegexSubstitution(
-        "service",
-        fr"\bsrvc\b",
-        "service",
-    ),
-    RegexSubstitution(
-        "misrepresentation",
-        fr"\bmisrep\b",
-        "misrepresentation",
-    ),
-    RegexSubstitution(
-        "disabled",
-        fr"\bdisabld\b",
-        "disabled",
-    ),
-    # ===
-    RegexSubstitution(
-        "system of records exempt",
-        fr"\bsor{sep}exempt\b",
-        "system of records exempt",
-    ),
-    RegexSubstitution(
-        "type",
-        r"\btyp\b",
-        "type",
-    ),
-    RegexSubstitution(
-        "misconduct",
-        r"\b(?:miscond|miscon)\b",
-        "misconduct",
-    ),
-    RegexSubstitution(
-        "mischief",
-        r"\bmisch\b",
-        "mischief",
-    ),
-    RegexSubstitution(
-        "probation revocation",
-        fr"\bprob{sep}(?:rev|revo)\b",
-        "probation revocation",
-    ),
-    RegexSubstitution(
-        "management",
-        r"\bmgmt\b",
-        "management",
-    ),
-    RegexSubstitution(
-        "subsistence",
-        r"\bsubsist\b",
-        "subsistence",
-    ),
-    RegexSubstitution(
-        "penalty group",
-        r"\bpg\b",
-        "penalty group",
-    ),
-    RegexSubstitution(
-        "community custody",
-        r"\bcomm custody\b",
-        "community custody",
-    ),
-    RegexSubstitution(
-        "contempt",
-        r"\bcntmpt\b",
-        "contempt",
-    ),
-    RegexSubstitution(
-        "counterfeit",
-        r"\b(?:cntft|cntrft|cntrfeit|cnterft|contrft|contrfit)\b",
-        "counterfeit",
-    ),
-    RegexSubstitution(
-        "counts",
-        r"\b(?:cts|cnts)\b",
-        "counts",
-    ),
-    RegexSubstitution(
-        "victim",
-        r"\b(?:vict|vctm|vic)\b",
-        "victim",
-    ),
-    # NUMBER TERMS ===========
-    RegexSubstitution("first", r"\b1st\b", "first", priority=20),
-    RegexSubstitution(
-        "first degree", fr"\b(?:first|1|1st){sep}(?:dgr|dg|de|d)\b", "first degree"
-    ),
-    RegexSubstitution("first degree 2", fr"\b1dg\b", "first degree"),
-    RegexSubstitution(
-        "circumstances in the first degree",
-        fr"\bcircumstances{sep}1\b",
-        "circumstances in the first degree",
-    ),
-    RegexSubstitution("second", r"\b2nd\b", "second", priority=20),
-    RegexSubstitution(
-        "second degree", fr"\b(?:second|2|2nd){sep}(?:dgr|dg|de|d)\b", "second degree"
-    ),
-    RegexSubstitution(
-        "circumstances in the second degree",
-        fr"\bcircumstances{sep}2\b",
-        "circumstances in the second degree",
-    ),
-    RegexSubstitution("third", r"\b3rd\b", "third", priority=20),
-    RegexSubstitution(
-        "third degree", fr"\b(?:third|3|3rd){sep}(?:dgr|dg|de|d)\b", "third degree"
-    ),
-    RegexSubstitution(
-        "circumstances in the third degree",
-        fr"\bcircumstances{sep}3\b",
-        "circumstances in the third degree",
-    ),
-    RegexSubstitution("fourth", r"\b4th\b", "fourth", priority=20),
-    RegexSubstitution("fifth", r"\b5th\b", "fifth", priority=20),
-    RegexSubstitution("sixth", r"\b6th\b", "sixth", priority=20),
-    RegexSubstitution("seventh", r"\b7th\b", "seventh", priority=20),
-    RegexSubstitution("eighth", r"\b8th\b", "eighth", priority=20),
-    RegexSubstitution("ninth", r"\b9th\b", "ninth", priority=20),
-    RegexSubstitution("tenth", r"\b10th\b", "tenth", priority=20),
-    # SCHEDULE terms ===========
-    # NOTE: "l" has been observed in place of "i" in schedule numerals, so both forms are matched below
-    RegexSubstitution(
-        "Schedule", r"\b(?:sc?he?d?|sch|sched|schd)\b", "schedule", priority=9
-    ),
-    RegexSubstitution(
-        "schedule one",
-        fr"\bschedule{sep}(?:i|1|l)\b",
-        "schedule one",
-    ),
-    RegexSubstitution(
-        "schedule two",
-        fr"\bschedule{sep}(?:ii|2|ll)\b",
-        "schedule two",
-    ),
-    RegexSubstitution(
-        "schedule three",
-        fr"\bschedule{sep}(?:iii|3|lll)\b",
-        "schedule three",
-    ),
-    RegexSubstitution(
-        "schedule four",
-        fr"\bschedule{sep}(?:iv|4|lv)\b",
-        "schedule four",
-    ),
-    RegexSubstitution(
-        "schedule five",
-        fr"\bschedule{sep}(?:v|5)\b",
-        "schedule five",
-    ),
-    RegexSubstitution(
-        "schedule six",
-        fr"\bschedule{sep}(?:vi|6|vl)\b",
-        "schedule six",
-    ),
-    # DRIVING TERMS ===========
-    RegexSubstitution(
-        "driving",
-        r"\bdrvg\b",
-        "driving",
-    ),
-    RegexSubstitution(
-        "driving 2",
-        fr"\bdriv{sep}g\b",
-        "driving",
-    ),
-    RegexSubstitution(
-        "failure to yield",
-        fr"\bfty\b",
-        "failure to yield",
-    ),
-    RegexSubstitution(
-        "permit",
-        fr"\bperm\b",
-        "permit",
-    ),
-    RegexSubstitution(
-        "registration",
-        fr"\b(?:regis|registra)\b",
-        "registration",
-    ),
-    RegexSubstitution(
-        "driving under the influence",
-        r"\bdui\b",
-        "driving under the influence",
-    ),
-    RegexSubstitution(
-        "driving while impaired",
-        r"\bdwi\b",
-        "driving while impaired",
-    ),
-    RegexSubstitution(
-        "driving while license suspended",
-        r"\bdwls\b",
-        "driving while license suspended",
-    ),
-    RegexSubstitution(
-        "driving while license revoked",
-        r"\bdwlr\b",
-        "driving while license revoked",
-    ),
-    RegexSubstitution(
-        "revoked",
-        r"\brevkd\b",
-        "revoked",
-    ),
-    RegexSubstitution(
-        "reckless endangerment",
-        fr"\breckles{sep}endanger\b",
-        "reckless endangerment",
-    ),
-    RegexSubstitution(
-        "highway",
-        fr"\bhi{sep}way\b",
-        "highway",
-    ),
-    RegexSubstitution(
-        "reckless driving",
-        fr"\brek{sep}dr?\b",
-        "reckless driving",
-    ),
-    # ========
-    RegexSubstitution(
-        "retail theft",
-        fr"\bretail{sep}thft\b",
-        "retail theft",
-    ),
-    RegexSubstitution(
-        "impregnate girl",
-        fr"\b(?:impregnate|impreg){sep}(?:girl|grl)\b",
-        "impregnate girl",
-    ),
-    RegexSubstitution(
-        "worker compensation",
-        fr"\bwrkr{sep}cmp\b",
-        "worker compensation",
-    ),
-    RegexSubstitution(
-        "disregard",
-        fr"\bdisreg\b",
-        "disregard",
-    ),
-    RegexSubstitution(
-        "electrical appliance",
-        fr"\belct{sep}appl\b",
-        "electrical appliance",
-    ),
-    RegexSubstitution(
-        "serial number",
-        fr"\b(?:serial|ser){sep}(?:number|nmbr|num|nu|no)\b",
-        "serial number",
-    ),
-    # DISTRIBUTION / FURNISH / TRAFFICK TERMS =======
-    RegexSubstitution(  # TODO: revisit traff/traf', more likely to be traffick/ing but could be traffic (cars)
-        "traffick",
-        r"\b(?:tfk|traff|traf)\b",
-        "traffick",
-    ),
-    RegexSubstitution(  # TODO: revisit adding 'dist', more likely to be distribution but could be disturbance
-        "distribution",
-        r"\b(?:distr|distrib)\b",
-        "distribution",
-    ),
-    RegexSubstitution(
-        "attempted distribution",
-        fr"\b(?:at|att|attempted){sep}dist\b",
-        "attempted distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "illegal distribution",
-        fr"\billgl{sep}dist\b",
-        "intent distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "buy distribute",
-        fr"\bbuy{sep}dist\b",
-        "buy distribute",
-    ),
-    RegexSubstitution(
-        "intent distribute",
-        fr"\b(?:intent|int){sep}dist\b",
-        "intent distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "intent to distribute",
-        fr"\b(?:intent|int){sep}to{sep}dist\b",
-        "intent to distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distribution possession",
-        fr"\bdist{sep}(?:possession|possess|poss)\b",
-        "distribution possession",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "unauthorized distribution",
-        fr"\b(?:unauthorized|unauth|unau|unauthd){sep}dist\b",
-        "unauthorized distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "possession distribution",
-        fr"\b(?:possession|possess|poss){sep}dist\b",
-        "possession distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "unlaw distribution",
-        fr"\b(?:unlawful|unlaw){sep}dist\b",
-        "unlawful distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distribution controlled",
-        fr"\bdist{sep}(?:controlled|cntrld|cntrl|contrlld)\b",
-        "distribution controlled",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distribute schedule",
-        fr"\bdist{sep}(?:schedule|sch|sched)\b",
-        "distribute schedule",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "furnish",
-        r"\b(?:furnishing|furn)\b",
-        "furnish",
-    ),
-    RegexSubstitution(  # TODO: revisit adding 'man', more likely to be manufacture/ing but could have other meaning
-        "manufacturing",
-        r"\b(?:manuf|manu|mfg|manf|manfac)\b",
-        "manufacturing",
-    ),
-    RegexSubstitution(
-        "manufacturing distribution sell",
-        fr"\b(?:manuf|manu|man|mfg|manf|manfac){sep}dist{sep}sell\b",
-        "manufacturing distribution sell",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "record sell rent distribute",
-        fr"\brecord{sep}sell{sep}rent{sep}dist\b",
-        "record sell rent distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "sell distribute",
-        fr"\bsell{sep}dist\b",
-        "sell distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "sale distribute",
-        fr"\bsale{sep}dist\b",
-        "sale distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "offer agree to distribute",
-        fr"\boffer{sep}agree{sep}to{sep}dist\b",
-        "offer agree distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "arrange to distribute",
-        fr"\barrange{sep}to{sep}dist\b",
-        "arrange to distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "arrange to distribute 2",
-        fr"\barrange{sep}dist\b",
-        "arrange to distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "controlled substance distribution",
-        fr"\bcontr{sep}sub{sep}dist\b",
-        "controlled substance distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "manufacturing deliver distribution",
-        fr"\b(?:manuf|manu|man|mfg|manf){sep}del{sep}dist\b",
-        "manufacturing deliver distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "possession distribution manufacturing",
-        fr"\bposs{sep}dist{sep}manuf\b",
-        "possession distribution manufacturing",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "with intent to distribute",
-        fr"\bwitd\b",
-        "with intent to distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "possession with intent to distribute",
-        fr"\bposs{sep}(?:with|w){sep}(?:intent|int|i){sep}dist\b",
-        "possession with intent to distribute",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "manufacturing distribution possession",
-        fr"\b(?:manuf|manu|man|mfg|manf){sep}dist{sep}(?:p|poss|pos)\b",
-        "manufacturing distribution possession",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "manufacturing distribution",
-        fr"\b(?:manuf|manu|man|mfg|manf){sep}dist\b",
-        "manufacturing distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distribution obscene material",
-        fr"\bdist{sep}(?:obscene|obs|obsc){sep}(?:material|mat|mtrl)\b",
-        "distribution obscene material",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "harmful material",
-        fr"\b(?:harmful|hrmf){sep}(?:material|mat|mtrl)\b",
-        "harmful material",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "obscene material distribution",
-        fr"\b(?:obscene|obs|obsc){sep}(?:material|mat|mtrl){sep}dist\b",
-        "obscene material distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "material",
-        fr"\b(?:matrl|mat|mtrl)\b",
-        "material",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distribution child porn",
-        fr"\bdist{sep}child{sep}porn\b",
-        "distribution child porn",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distribution controlled substances",
-        fr"\bdist{sep}cds\b",
-        "distribution controlled substances",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "controlled substances distribution ",
-        fr"\bcds{sep}dist\b",
-        "controlled substances distribution ",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "distribution narcotics",
-        fr"\bdist{sep}narc\b",
-        "distribution narcotics",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "deliver or distribution",
-        fr"\bdel{sep}or{sep}dist\b",
-        "deliver or distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "criminal distribution",
-        fr"\bcriminal{sep}dist\b",
-        "criminal distribution",
-        priority=5,
-    ),
-    RegexSubstitution(
-        "purchase",
-        r"\bpur\b",
-        "purchase",
-    ),
-    # DRUG TERMS ===========
-    RegexSubstitution(
-        "marijuana",
-        r"\b(?:marij|marihuana|mari|marijuan|marijua|mariju|mj)\b",
-        "marijuana",
-    ),
-    RegexSubstitution(
-        "hydrocodone",
-        r"\bhydroc\b",
-        "hydrocodone",
-    ),
-    RegexSubstitution(
-        "cocaine",
-        r"\b(?:cocain|coca|cocai|cocne)\b",
-        "cocaine",
-    ),
-    RegexSubstitution(
-        "crack or cocaine",
-        r"\bcoc\b",
-        "crack or cocaine",
-    ),
-    RegexSubstitution(
-        "rohypnol",
-        r"\brohypnl\b",
-        "rohypnol",
-    ),
-    RegexSubstitution(
-        "heroine",
-        r"\bher\b",
-        "heroine",
-    ),
-    RegexSubstitution(
-        "heroine",
-        r"\bher\b",
-        "heroine",
-    ),
-    RegexSubstitution(
-        "ecstasy",
-        r"\bmdma\b",
-        "ecstasy",
-    ),
-    RegexSubstitution(
-        "methamphetamine",
-        r"\b(?:meth|metham|methamphet|methamph)\b",
-        "methamphetamine",
-    ),
-    RegexSubstitution(
-        "paraphernalia",
-        r"\b(?:para|paraph|paraphenalia|parap)\b",
-        "paraphernalia",
-    ),
-    RegexSubstitution(
-        "grams",
-        r"\b(?:gr|gms|grms)\b",
-        "grams",
-    ),
-    RegexSubstitution(
-        "gram",
-        r"\bgm\b",
-        "gram",
-    ),
-    RegexSubstitution(
-        "kilograms",
-        r"\bkg\b",
-        "kilograms",
-    ),
-    RegexSubstitution(
-        "pounds",
-        r"\blb\b",
-        "pounds",
-    ),
-    RegexSubstitution(
-        "ounces",
-        r"\boz\b",
-        "ounces",
-    ),
-    # ALCOHOL / LIQUOR terms ===========
-    RegexSubstitution(
-        "alcoholic beverage", r"\balc\Wbev\b", "alcoholic beverage", priority=5
-    ),
-    RegexSubstitution(
-        "beverage",
-        r"\bbev\b",
-        "beverage",
-    ),
-    RegexSubstitution(
-        "blood alcohol concentration",
-        r"\bbac\b",
-        "blood alcohol concentration",
-    ),
-    RegexSubstitution(
-        "alcohol",
-        r"\b(?:alc|alch|alchol|alcohl|alco|alcoh|alcoho)\b",
-        "alcohol",
-    ),
-    RegexSubstitution(
-        "over legal",
-        fr"\b(?:over|ov){sep}(?:legal|leg)\b",
-        "over legal",
-    ),
-    RegexSubstitution(
-        "supply",
-        fr"\bsupp\b",
-        "supply",
-    ),
-    RegexSubstitution(
-        "liquor",
-        fr"\bliq\b",
-        "liquor",
-    ),
-    RegexSubstitution(
-        "distill",
-        r"\bdstl\b",
-        "distill",
-    ),
-    RegexSubstitution(
-        "minor in possession",
-        fr"\bmip\b",
-        "minor in possession",
-    ),
-    RegexSubstitution(
-        "premises",
-        fr"\bprem\b",
-        "premises",
-    ),
-    RegexSubstitution(
-        "consume",
-        fr"\bcnsum\b",
-        "consume",
-    ),
-    RegexSubstitution(
-        "intoxication",
-        fr"\bintox\b",
-        "intoxication",
-    ),
-    RegexSubstitution(
-        "available",
-        fr"\bavail\b",
-        "available",
-    ),
-    RegexSubstitution(
-        "unlicensed",
-        fr"\bunlic\b",
-        "unlicensed",
-    ),
-    RegexSubstitution(
-        "large amount",
-        fr"\blg{sep}amt\b",
-        "large amount",
-    ),
-    RegexSubstitution(
-        "small amount",
-        fr"\bsm{sep}amt\b",
-        "small amount",
-    ),
-    RegexSubstitution(
-        "required",
-        fr"\breq\b",
-        "required",
-    ),
-    RegexSubstitution(
-        "violate prohibition",
-        fr"\bvio{sep}prohibition\b",
-        "violate prohibition",
-    ),
-    RegexSubstitution(
-        "enticement",
-        fr"\bentcmnt\b",
-        "enticement",
-    ),
-    # SUBSTANCE TERMS ========
-    RegexSubstitution(
-        "Substance",
-        r"\b(?:sub|subs|substanc|substan|substnces|subtance|substa|substnc|sunstance|subst)\b",
-        "substance",
-        20,
-    ),
-    RegexSubstitution("controlled", r"\b(?:cntrld|cntrl|contrlld)\b", "controlled", 20),
-    RegexSubstitution(
-        "controlled dangerous substances",
-        r"\bcds\b",
-        "controlled dangerous substances",
-    ),
-    RegexSubstitution(
-        "solicitation of controlled substances",
-        fr"\bsol{sep}cds\b",
-        "solicitation of controlled substances",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "solicitation",
-        fr"\b(?:solct|sol|solicit|solic)\b",
-        "solicitation",
-    ),
-    RegexSubstitution(
-        "solicitation of narcotics",
-        fr"\bsol{sep}narc\b",
-        "solicitation of narcotics",
-        priority=4,
-    ),
-    RegexSubstitution(
-        "Controlled Substance",
-        fr"\bcont?r?{sep}?subs?t?(?:\b|stance\b)",
-        "controlled substance",
-    ),
-    RegexSubstitution(
-        "Controlled Substance 2",
-        r"\bc\W?s\b",
-        "controlled substance",
-    ),
-    RegexSubstitution(
-        "unlawful possession of a controlled substance",
-        r"\bupcs\b",
-        "unlawful possession of a controlled substance",
-    ),
-]
-def prep_text(text):
-    """Normalize raw text ahead of the regex substitution pass.
-    Drops a comma that sits between digits, drops an apostrophe that is
-    followed by ``s``, and turns hyphens and forward slashes into spaces.
-    """
-    comma_between_digits = r"(\d+?),(\d+?)"
-    apostrophe_before_s = r"\b(\S+?)'(s)"
-    without_commas = re.sub(comma_between_digits, r"\1\2", text)
-    # TODO: double check this `'s` regex
-    without_apostrophes = re.sub(apostrophe_before_s, r"\1\2", without_commas)
-    # "-" and "/" are literal characters, so plain replacement matches the regex form
-    return without_apostrophes.replace("-", " ").replace("/", " ")
-def cleaner(text):
-    """Return a cleaned, lower-cased, single-spaced version of ``text``.
-    Null input (per ``pd.isnull``) yields an empty string. Otherwise the text
-    goes through ``prep_text``, every entry of ``substitutions`` (ascending
-    ``priority``), every entry of ``removals``, and punctuation stripping.
-    """
-    if pd.isnull(text):
-        return ""
-    cleaned = prep_text(text)
-    # Lower priority numbers run first; sorted() is stable, so ties keep list order.
-    # NOTE(review): the original comment calls this pass case-insensitive, but no
-    # flags are passed here - presumably the regexes are precompiled; confirm.
-    for rule in sorted(substitutions, key=lambda rule: rule.priority):
-        cleaned = re.sub(rule.regex, rule.replacement, cleaned)
-    # Blank out any terms we don't want
-    for unwanted in removals:
-        cleaned = re.sub(unwanted.regex, " ", cleaned)
-    # Remaining punctuation becomes spaces
-    for mark in all_punctuation:
-        cleaned = cleaned.replace(mark, " ")
-    # Collapse whitespace runs ("  " -> " ") and lower-case the result
-    return " ".join(cleaned.split()).lower()

download.py DELETED Viewed

@@ -1,28 +0,0 @@
-# https://discuss.streamlit.io/t/heres-a-download-function-that-works-for-dataframes-and-txt/4052
-import base64
-import streamlit as st
-import pandas as pd
-def download_link(object_to_download, download_filename, download_link_text):
-    """
-    Generates an HTML anchor tag that downloads the given object_to_download.
-    object_to_download (str, bytes, pd.DataFrame):  The object to be downloaded.
-        A DataFrame is serialized to CSV without its index.
-    download_filename (str): filename and extension of file. e.g. mydata.csv, some_txt_output.txt
-        It is HTML-escaped before being placed in the ``download`` attribute.
-    download_link_text (str): Text to display for download link. Inserted verbatim,
-        so callers may pass markup; escape it first if it comes from user input.
-    Returns:
-    str: an ``<a>`` element whose href is a base64 ``data:`` URI of the content.
-    Examples:
-    download_link(YOUR_DF, 'YOUR_DF.csv', 'Click here to download data!')
-    download_link(YOUR_STRING, 'YOUR_STRING.txt', 'Click here to download your text!')
-    """
-    from html import escape
-    if isinstance(object_to_download, pd.DataFrame):
-        object_to_download = object_to_download.to_csv(index=False)
-    # base64 operates on bytes: encode text, pass raw bytes through untouched
-    if isinstance(object_to_download, str):
-        object_to_download = object_to_download.encode()
-    b64 = base64.b64encode(object_to_download).decode()
-    # A quote or angle bracket in the filename must not break out of the attribute
-    safe_filename = escape(str(download_filename), quote=True)
-    return f'<a href="data:file/txt;base64,{b64}" download="{safe_filename}">{download_link_text}</a>'

onnx_model_utils.py DELETED Viewed

@@ -1,195 +0,0 @@
-import os
-from psutil import cpu_count
-# Constants from the performance optimization available in onnxruntime
-# It needs to be done before importing onnxruntime
-os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
-os.environ["OMP_WAIT_POLICY"] = "ACTIVE"
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, List
-import gzip
-import shutil
-from numpy import ndarray
-import requests
-import streamlit as st
-from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
-from scipy.special import softmax
-from transformers import AutoTokenizer
-from transformers.file_utils import http_get
-from cleaning_utils import cleaner
-# Release tag interpolated into the ONNX_RELEASE download URL below
-RELEASE_TAG = "2021.05.18.15"
-# Local path of the decompressed model: download_model() writes it, load_model() checks it
-OUTPUT_PATH = Path("onnx/rota-quantized.onnx")
-# URL of the gzip-compressed quantized model attached to that GitHub release
-ONNX_RELEASE = (
-    "https://github.com/RTIInternational/"
-    "rota/"
-    "releases/download/"
-    f"{RELEASE_TAG}/"
-    "rota-quantized.onnx.gz"
-)
-@st.cache
-def cleaner_cache(text):
-    """Run ``cleaner`` on ``text`` behind Streamlit's ``st.cache`` decorator."""
-    return cleaner(text)
-def get_label_config(model_name, config_path: Path = Path("config.json")):
-    """Return the model's ``id2label`` mapping with integer keys.
-    The config is read from ``config_path`` when that file exists; otherwise it
-    is fetched from the Hugging Face hub and cached at ``config_path``.
-    Args:
-        model_name: Hub repository id used to build the config URL.
-        config_path: Location of the local config cache.
-    Returns:
-        Dict[int, str]: label id -> label name.
-    Raises:
-        requests.HTTPError: if the hub responds with an error status.
-        KeyError: if the config has no ``id2label`` entry.
-    """
-    cached = config_path.exists()
-    if cached:
-        config_json = json.loads(config_path.read_text())
-    else:
-        config_url = f"https://huggingface.co/{model_name}/raw/main/config.json"
-        response = requests.get(config_url, timeout=30)
-        # Fail loudly on 4xx/5xx instead of treating an error body as a config
-        response.raise_for_status()
-        config_json = response.json()
-    # Parse before caching so a malformed payload is never persisted to disk
-    labels = {int(k): v for k, v in config_json["id2label"].items()}
-    if not cached:
-        config_path.parent.mkdir(parents=True, exist_ok=True)
-        config_path.write_text(json.dumps(config_json))
-    return labels
-class ONNXCPUClassificationPipeline:
-    """Text-classification pipeline that runs a tokenizer plus an ONNX Runtime CPU session.
-    Calling an instance with a list of texts returns, per text, one
-    ``{"label", "score"}`` dict for every label known to the model config.
-    """
-    def __init__(self, tokenizer, model_path):
-        """Store the tokenizer, open the ONNX session and load the id -> label map.
-        Args:
-            tokenizer: Tokenizer object; its ``name_or_path`` selects the label config.
-            model_path: Path of the ONNX model file passed to ``create_cpu_model``.
-        """
-        self.tokenizer = tokenizer
-        self.model = create_cpu_model(model_path)
-        # Label config is cached at onnx/config.json
-        self.labels = get_label_config(
-            tokenizer.name_or_path, config_path=Path("onnx/config.json")
-        )
-    def __call__(self, texts: List[str]) -> List[List[Dict[str, Any]]]:
-        """Tokenize ``texts``, run the model and return label scores per text."""
-        # Inputs are provided through numpy array
-        model_inputs = self.tokenizer(texts, return_tensors="pt", padding=True)
-        inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}
-        # None (rather than a falsy placeholder such as 0) requests all the outputs
-        output = self.model.run(None, inputs_onnx)
-        # First output is treated as the logits; softmax over the label axis
-        probs = softmax(output[0], axis=1)
-        return self._format_predictions(probs, self.labels)
-    def _format_predictions(
-        self, softmax_array: ndarray, labels: Dict[int, str]
-    ) -> List[List[Dict[str, Any]]]:
-        """Format predictions from ONNX similar to the
-        huggingface transformers classification pipeline
-        Args:
-            softmax_array (np.ndarray): array of shape (n_preds, n_labels)
-            labels (Dict[int, str]): label name for each column index
-        Returns:
-            List[List[Dict[str, Any]]]: Output of predictions, where each row is a list of
-            Dict with keys "label" and "score"
-        """
-        return [
-            [
-                {"label": labels[column], "score": float(score)}
-                for column, score in enumerate(row)
-            ]
-            for row in softmax_array
-        ]
-def create_cpu_model(model_path: str) -> InferenceSession:
-    """Build an ONNX Runtime ``InferenceSession`` for ``model_path`` on the CPU provider."""
-    session_options = SessionOptions()
-    # Settings that might affect performance (per the original note, provided by MS)
-    session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
-    session_options.intra_op_num_threads = 1
-    # Load the model graph with only the CPU execution provider
-    cpu_session = InferenceSession(
-        model_path, session_options, providers=["CPUExecutionProvider"]
-    )
-    cpu_session.disable_fallback()
-    return cpu_session
-def download_model():
-    """Download the gzip-compressed model from ``ONNX_RELEASE`` and unpack it to ``OUTPUT_PATH``.
-    The archive is saved as ``<OUTPUT_PATH>.gz`` and kept on disk. Decompression
-    goes to a temporary ``.part`` file that is moved into place only once it is
-    complete, so an interrupted run never leaves a truncated file at
-    ``OUTPUT_PATH`` (``load_model`` treats that path's existence as "model ready").
-    """
-    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
-    archive_path = Path(f"{OUTPUT_PATH}.gz")
-    partial_path = Path(f"{OUTPUT_PATH}.part")
-    with open(archive_path, "wb") as f:
-        http_get(
-            ONNX_RELEASE,
-            f,
-        )
-    try:
-        with gzip.open(archive_path, "rb") as f_in, open(partial_path, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-        # Publish the fully written file in a single rename
-        partial_path.replace(OUTPUT_PATH)
-    finally:
-        # Only present if decompression failed before the rename
-        if partial_path.exists():
-            partial_path.unlink()
-def load_model():
-    """Return a ready ``ONNXCPUClassificationPipeline``, downloading the model first if needed."""
-    model_missing = not OUTPUT_PATH.exists()
-    if model_missing:
-        download_model()
-    return ONNXCPUClassificationPipeline(
-        AutoTokenizer.from_pretrained("rti-international/rota"),
-        str(OUTPUT_PATH),
-    )
-# Built at import time: importing this module loads the model (downloading it when
-# OUTPUT_PATH is missing). predict() and predict_bulk() share this instance.
-pipeline = load_model()
-def predict(text: str, sort=True) -> List[List[Dict[str, Any]]]:
-    """Generate a single prediction on an input text
-    The text is passed through ``cleaner_cache`` before it reaches the model.
-    Args:
-        text (str): The input text to generate a prediction for
-        sort (bool, optional): Whether to sort the predicted labels by score,
-            highest first. Defaults to True.
-    Returns:
-        List[List[Dict[str, Any]]]: A list with a single element containing predicted label scores.
-    """
-    preds = pipeline([cleaner_cache(text)])
-    if not sort:
-        return preds
-    return [
-        sorted(label_scores, key=lambda entry: entry["score"], reverse=True)
-        for label_scores in preds
-    ]
-def predict_bulk(texts: List[str]) -> List[List[Dict[str, Any]]]:
-    """Generate predictions on a list of strings.
-    Every text is passed through ``cleaner_cache`` and the cleaned batch is
-    sent to the model in a single pipeline call.
-    Args:
-        texts (List[str]): Input texts to generate predictions for
-    Returns:
-        List[List[Dict[str, Any]]]: Predicted label scores for each input text
-    """
-    return pipeline([cleaner_cache(raw_text) for raw_text in texts])
-def _max_pred(prediction_scores: List[Dict[str, Any]]) -> Dict[str, Any]:
-    """Pick the highest-scoring entry of a single prediction.
-    Args:
-        prediction_scores (List[Dict[str, Any]]): A list of predictions with keys
-            'label' and 'score'
-    Returns:
-        Dict[str, Any]: The 'label' and 'score' dict with the highest score value
-            (the earliest one when scores tie)
-    """
-    def score_of(entry):
-        return entry["score"]
-    return max(prediction_scores, key=score_of)
-def max_pred_bulk(preds: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
-    """Build a "column" of top predictions, one per input element.
-    Args:
-        preds (List[List[Dict[str, Any]]]): A list of predictions
-    Returns:
-        List[Dict[str, Any]]: For each element, the 'label' and 'score' dict with
-            the highest score value
-    """
-    return list(map(_max_pred, preds))

requirements.txt CHANGED Viewed

@@ -1,11 +1 @@
-openpyxl==3.0.6
-pandas==1.2.0
-transformers[torch]==4.6.0
-# New Requirements
-streamlit==0.82.0
-more-itertools==8.7.0
-stqdm==0.0.3
-onnx==1.9.0
-onnxruntime==1.7.0
-psutil==5.8.0
-scipy==1.6.2


1	+ streamlit==1.21.0