Spaces:

srinidhidevaraj
/

Icd_Extractor

Sleeping

App Files Files Community

srinidhidevaraj committed on Aug 20, 2024

Commit

d7f7f62

verified ·

1 Parent(s): 41c1861

Upload 5 files

Browse files

Files changed (5) hide show

helpers.py +214 -0
prompt_template.py +42 -0
requirements.txt +14 -0
run_tree_search.py +174 -0
tree_search_icd.py +47 -0

helpers.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import re
+import os
+import simple_icd_10_cm as cm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# from openai import OpenAI
+from prompt_template import *
+from langchain_groq import ChatGroq
+from groq import Groq
+from dotenv import load_dotenv
+import csv
+import time
+# Load variables from a local .env file into the process environment.
+load_dotenv()
+# Set the LANGCHAIN_TRACING_V2 environment switch for this process.
+os.environ["LANGCHAIN_TRACING_V2"]="true"
+# NOTE(review): groq_api_key is read here but not referenced anywhere in the visible code.
+groq_api_key=os.environ.get('GROQ_API_KEY')
+os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
+# NOTE(review): LANGCHAIN_API_KEY is likewise read but not referenced in the visible code.
+LANGCHAIN_API_KEY=os.environ.get("LANGCHAIN_API_KEY")
+# Shared Groq client used by get_response(); presumably picks up GROQ_API_KEY from the environment - confirm.
+client = Groq()
+# Chapter list from simple_icd_10_cm; the tree search starts from the names of these chapters.
+CHAPTER_LIST = cm.chapter_list
+def construct_translation_prompt(medical_note):
+    """
+    Construct a prompt template for translating spanish medical notes to english.
+    Args:
+        medical_note (str): The medical case note.
+    Returns:
+        str: A structured template ready to be used as input for a language model.
+    """
+    translation_prompt = """You are an expert Spanish-to-English translator. You are provided with a clinical note written in Spanish.
+You must translate the note into English. You must ensure that you properly translate the medical and technical terms from Spanish to English without any mistakes.
+Spanish Medical Note:
+{medical_note}"""
+    return translation_prompt.format(medical_note = medical_note)
+def build_translation_prompt(input_note, system_prompt=""):
+    """
+    Build a zero-shot prompt for translating spanish medical notes to english.
+    Args:
+        input_note (str): The input note or query.
+        system_prompt (str): Optional initial system prompt or instruction.
+    Returns:
+        list of dict: A structured list of dictionaries defining the role and content of each message.
+    """
+    input_prompt = construct_translation_prompt(input_note)
+    return [{"role": "system", "content": system_prompt}, {"role": "user", "content": input_prompt}]
+def remove_extra_spaces(text):
+    """
+    Remove extra spaces from a given text.
+    Args:
+        text (str): The original text string.
+    Returns:
+        str: The cleaned text with extra spaces removed.
+    """
+    return re.sub(r'\s+', ' ', text).strip()
+def remove_last_parenthesis(text):
+    """
+    Removes the last occurrence of content within parentheses from the provided text.
+    Args:
+    text (str): The input string from which to remove the last parentheses and its content.
+    Returns:
+    str: The modified string with the last parentheses content removed.
+    """
+    pattern = r'\([^()]*\)(?!.*\([^()]*\))'
+    cleaned_text = re.sub(pattern, '', text)
+    return cleaned_text
+def format_code_descriptions(text, model_name):
+    """
+    Format the ICD-10 code descriptions by removing content inside brackets and extra spaces.
+    Args:
+        text (str): The original text containing ICD-10 code descriptions.
+    Returns:
+        str: The cleaned text with content in brackets removed and extra spaces cleaned up.
+    """
+    pattern = r'\([^()]*\)(?!.*\([^()]*\))'
+    cleaned_text = remove_last_parenthesis(text)
+    cleaned_text = remove_extra_spaces(cleaned_text)
+    return cleaned_text
+def construct_prompt_template(case_note, code_descriptions, model_name):
+    """
+    Construct a prompt template for evaluating ICD-10 code descriptions against a given case note.
+    Args:
+        case_note (str): The medical case note.
+        code_descriptions (str): The ICD-10 code descriptions formatted as a single string.
+    Returns:
+        str: A structured template ready to be used as input for a language model.
+    """
+    template = prompt_template_dict[model_name]
+    return template.format(note=case_note, code_descriptions=code_descriptions)
+def build_zero_shot_prompt(input_note, descriptions, model_name, system_prompt=""):
+    """
+    Build a zero-shot classification prompt with system and user roles for a language model.
+    Args:
+        input_note (str): The input note or query.
+        descriptions (list of str): List of ICD-10 code descriptions.
+        system_prompt (str): Optional initial system prompt or instruction.
+    Returns:
+        list of dict: A structured list of dictionaries defining the role and content of each message.
+    """
+    if model_name == "llama3-70b-8192":
+        code_descriptions = "\n".join(["* " + x for x in descriptions])
+    else:
+        code_descriptions = "\n".join(["* " + x for x in descriptions])
+    input_prompt = construct_prompt_template(input_note, code_descriptions, model_name)
+    return [{"role": "system", "content": system_prompt}, {"role": "user", "content": input_prompt}]
+def get_response(messages, model_name, temperature=0.0, max_tokens=500):
+    """
+    Obtain responses from a specified model via the chat-completions API.
+    Args:
+        messages (list of dict): List of messages structured for API input.
+        model_name (str): Identifier for the model to query.
+        temperature (float): Controls randomness of response, where 0 is deterministic.
+        max_tokens (int): Limit on the number of tokens in the response.
+    Returns:
+        str: The content of the response message from the model.
+    """
+    response = client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        temperature=temperature,
+        max_tokens=max_tokens
+    )
+    return response.choices[0].message.content
+def remove_noisy_prefix(text):
+    # Removing numbers or letters followed by a dot and optional space at the beginning of the string
+    cleaned_text = text.replace("* ", "").strip()
+    cleaned_text = re.sub(r"^\s*\w+\.\s*", "", cleaned_text)
+    return cleaned_text.strip()
+def parse_outputs(output, code_description_map, model_name):
+    """
+    Parse model outputs to confirm ICD-10 codes based on a given description map.
+    Args:
+        output (str): The model output containing confirmations.
+        code_description_map (dict): Mapping of descriptions to ICD-10 codes.
+    Returns:
+        list of dict: A list of confirmed codes and their descriptions.
+    """
+    confirmed_codes = []
+    split_outputs = [x for x in output.split("\n") if x]
+    for item in split_outputs:
+        try:
+            code_description, confirmation = item.split(":", 1)
+            # print(confirmation)
+            cnf,fact = confirmation.split(",", 1)
+            if model_name == "llama3-70b-8192":
+                code_description = remove_noisy_prefix(code_description)
+            else:
+                code_description = remove_noisy_prefix(code_description)
+            if confirmation.lower().strip().startswith("yes"):
+                try:
+                    code = code_description_map[code_description]
+                    confirmed_codes.append({"ICD Code": code, "Code Description": code_description,"Evidence From Notes":fact})
+                except Exception as e:
+                    # print(str(e) + " Here")
+                    continue
+        except:
+            continue
+    return confirmed_codes
+def get_name_and_description(code, model_name):
+    """
+    Retrieve the name and description of an ICD-10 code.
+    Args:
+        code (str): The ICD-10 code.
+    Returns:
+        tuple: A tuple containing the formatted description and the name of the code.
+    """
+    full_data = cm.get_full_data(code).split("\n")
+    return format_code_descriptions(full_data[3], model_name), full_data[1]

prompt_template.py ADDED Viewed

	@@ -0,0 +1,42 @@

+prompt_template_dict = {"mixtral-8x7b-32768" : """[Case note]:
+{note}
+[Example]:
+<code descriptions>
+* Gastro-esophageal reflux disease
+* Enteroptosis
+* Acute Nasopharyngitis [Common Cold]
+</code descriptions>
+<response>
+* Gastro-esophageal reflux disease: Yes,Patient was prescribed omeprazole.
+* Enteroptosis: No.
+* Acute Nasopharyngitis [Common Cold]: No.
+</response>
+[Task]:
+Follow the format in the example response exactly, including the entire description   after your (Yes|No) judgement , followed by a newline.
+Consider each of the following ICD-10 code descriptions and evaluate if there are any related mentions in the Case note.
+{code_descriptions}""",
+"llama3-70b-8192": """[Case note]:
+{note}
+[Example]:
+<code descriptions>
+* Gastro-esophageal reflux disease
+* Enteroptosis
+* Acute Nasopharyngitis [Common Cold]
+</code descriptions>
+<response>
+* Gastro-esophageal reflux disease: Yes,Patient was prescribed omeprazole.
+* Enteroptosis: No.
+* Acute Nasopharyngitis [Common Cold]: No.
+</response>
+[Task]:
+Follow the format in the example response exactly, including the entire description   after your (Yes|No) judgement , followed by a newline.
+Consider each of the following ICD-10 code descriptions and evaluate if there are any related mentions in the Case note.
+{code_descriptions}"""
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+streamlit
+python-dotenv
+simple_icd_10_cm
+tqdm
+transformers
+groq
+langchain
+langchain-groq
+langchain-community
+torch
+tensorflow
+flax
+jax
+jaxlib

run_tree_search.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import argparse
+import os
+import pandas as pd
+import json
+from tree_search_icd import get_icd_codes
+from tqdm import tqdm
+import csv
+import streamlit as st
+import tempfile
+from pathlib import Path
+from io import StringIO
+# def process_medical_notes(file_path,model_name):
+# def process_medical_notes(input_dir, output_file, model_name):
+#     code_map = {}
+#     if not os.path.isdir(input_dir):
+#         raise ValueError("The specified input directory does not exist.")
+#     # Process each file in the input directory
+#     for files in tqdm(os.listdir(input_dir)):
+#         file_path = os.path.join(input_dir, files)
+#         print(file_path)
+#         with open(file_path, "r", encoding="utf-8") as file:
+#             medical_note = file.read()
+#     if not os.path.isfile(file_path):
+#         print(f"File does not exist: {file_path}")
+#         return None
+#     # if os.path.isfile(file_path):
+#     #     st.write(f"File exists: {file_path}")
+#     # try:
+#     #     with open(file_path, "r",encoding="utf-8") as txtfile:
+#     #         st.write(file_path)
+#     #         medical_note = txtfile.read()
+#     #         st.write(f"Content of the file: {medical_note[:1000]}")  # Print the first 1000 characters
+#     # except Exception as e:
+#     #     print(f"Error reading file: {e}")
+#     #     return None
+#     # print(f"File read successfully. Content length: {len(medical_note)}")
+#     #print(medical_note)
+#     icd_codes = get_icd_codes(medical_note, model_name)
+#     print(icd_codes)
+#     # return icd_codes
+# #     print(icd_codes)
+# #     code_map[files] = icd_codes
+#     with open(output_file, "w") as f:
+#         json.dump(code_map, f, indent=4)
+# if __name__ == "__main__":
+#     parser = argparse.ArgumentParser(description="Process medical notes to extract ICD codes using a specified model.")
+#     parser.add_argument("--input_dir", help="Directory containing the medical text files")
+#     parser.add_argument("--output_file", help="File to save the extracted ICD codes in JSON format")
+#     parser.add_argument("--model_name", default="llama3-70b-8192", help="Model name to use for ICD code extraction")
+#     args = parser.parse_args()
+#     process_medical_notes(args.input_dir, args.output_file, args.model_name)
+def process_medical_notes(filepath, model_name):
+    try:
+        for txtfile in filepath:
+           with open(filepath, "r",encoding="utf-8") as txtfile:
+               medical_note = txtfile.read()
+    except Exception as e:
+        # print(f"Error reading file: {e}")
+        return None
+    icd_codes = get_icd_codes(medical_note, model_name)
+    return icd_codes
+def add_custom_css():
+    st.markdown(
+        """
+        <style>
+        /* Remove padding around the main block */
+        .block-container {
+            padding: 1rem;
+        }
+        /* Remove padding around the top */
+        header, footer, .reportview-container .main .block-container {
+            padding: 5;
+        }
+        /* Fullscreen layout adjustments */
+        .css-1d391kg {
+            padding: 5;
+        }
+        h1 {
+            text-align: center;
+        }
+         .table-wrapper {
+            text-align: center;
+        }
+        </style>
+        """,
+        unsafe_allow_html=True,
+    )
+def main():
+    st.set_page_config(layout="wide",page_icon='🔎',page_title='ICD Identifier')
+    add_custom_css()
+    st.title("ICD Code Extractor From Medical Notes")
+    col1, col2 = st.columns([1, 5])
+    with col2:
+        file_uploads=st.file_uploader('Choose Medical Note File',type='txt', accept_multiple_files=True)
+        submit = st.button("Submit")
+    with col1:
+         model_name = st.selectbox(
+            "Select Model",
+            ["llama3-70b-8192", "mixtral-8x7b-32768"],
+            index=0  # Default model selected
+        )
+    if submit :
+        for file_input in file_uploads:
+            file_name = Path(file_input.name).name
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file:
+                temp_file.write(file_input.getbuffer())
+                temp_file.flush()
+                file_paths = temp_file.name
+                response=process_medical_notes(file_paths, model_name)
+                res_data=pd.DataFrame(response,columns=['ICD Code','Code Description','Evidence From Notes'])
+                with col2:
+                #     st.markdown(f"""
+                #     <div class="custom-table-container" >
+                #         <h4>Case Id: {file_name}</h4>
+                #         <div class="table-wrapper"  >
+                #             {res_data.to_html(classes='table-wrapper', index=False)}
+                #         </div>
+                #     </div>
+                # """, unsafe_allow_html=True)
+                    st.markdown(f"""
+                    <h5>Case Id: {file_name}</h5>
+                    """, unsafe_allow_html=True)
+                    st.markdown(res_data.style.hide(axis="index").to_html(), unsafe_allow_html=True)
+                # st.write(response)
+if __name__=="__main__":
+    main()

tree_search_icd.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from helpers import *
+def get_icd_codes(medical_note, model_name, temperature=0.0):
+    """
+    Identifies relevant ICD-10 codes for a given medical note by querying a language model.
+    This function implements the tree-search algorithm for ICD coding described in https://openreview.net/forum?id=mqnR8rGWkn.
+    Args:
+        medical_note (str): The medical note for which ICD-10 codes are to be identified.
+        model_name (str): The identifier for the language model used in the API (default is 'gpt-3.5-turbo-0613').
+    Returns:
+        list of str: A list of confirmed ICD-10 codes that are relevant to the medical note.
+    """
+    assigned_codes = []
+    candidate_codes = [x.name for x in CHAPTER_LIST]
+    parent_codes = []
+    prompt_count = 0
+    while prompt_count < 50:
+        code_descriptions = {}
+        for x in candidate_codes:
+            description, code = get_name_and_description(x, model_name)
+            code_descriptions[description] = code
+        prompt = build_zero_shot_prompt(medical_note, list(code_descriptions.keys()), model_name=model_name)
+        lm_response = get_response(prompt, model_name, temperature=temperature, max_tokens=500)
+        predicted_codes = parse_outputs(lm_response, code_descriptions, model_name=model_name)
+        for code in predicted_codes:
+            if cm.is_leaf(code["ICD Code"]):
+                # assigned_codes.append(code["code"])
+                assigned_codes.append({"ICD Code": code["ICD Code"], "Code Description": code["Code Description"],"Evidence From Notes":code["Evidence From Notes"]})
+            else:
+                parent_codes.append(code)
+        if len(parent_codes) > 0:
+            parent_code = parent_codes.pop(0)
+            candidate_codes = cm.get_children(parent_code["ICD Code"])
+        else:
+            break
+        prompt_count += 1
+    return assigned_codes