File size: 5,096 Bytes
152a01f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import re
import gradio as gr
import os
import accelerate
import spaces
from tqdm import tqdm
import subprocess
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from huggingface_hub import login
from docling.document_converter import DocumentConverter

# Authenticate with the Hugging Face Hub only when a token is configured.
# login() with token=None falls back to an interactive prompt, which cannot be
# answered in a headless deployment; an unset/empty HF_TOKEN now simply means
# anonymous access.
_hf_token = os.getenv('HF_TOKEN')
if _hf_token:
    login(token=_hf_token)

# Hub repository and the GGUF file inside it that the app serves.
repo_id = "QuantFactory/Meta-Llama-3-70B-Instruct-GGUF"
model_id = "Meta-Llama-3-70B-Instruct.Q2_K.gguf"

# Directory the model file is downloaded into.
local_dir = "models"

# Download the model file at import time so it is on disk before the first
# request is handled.
hf_hub_download(
    repo_id=repo_id,
    filename=model_id,
    local_dir=local_dir,
)

# Instruction prompt used when the caller does not supply one of its own.
_DEFAULT_PROMPT = """
            Please reformat the provided medical report into the following standardized structure:
            
            1. Hospital Information:
               - Name of Hospital: [Name of hospital]
               - Department: [Relevant department or 'N/A']
            
            2. Patient Information:
               - Name: [Full Name]
               - Gender: [Gender]
               - Date of Birth: [Date of Birth]
               - Address: [Full Address or 'N/A']
               - ID Numbers:
                 - [Relevant identifiers such as NHS Number, Case Number, etc.]
            
            3. Procedure Details:
               - Date of Procedure: [Date]
               - Referring Doctor: [Name or 'N/A']
               - Performed By:
                 - Consultant: [Name or 'N/A']
                 - Additional Clinicians: [Name(s) or 'N/A']
                 - Nurses: [Name(s) or 'N/A']
               - Details:
                 - Indications: [Symptoms, reasons for procedure]
                 - Instrument: [Instrument details or 'N/A']
                 - Co-morbidities: [Relevant conditions or 'N/A']
                 - ASA Status: [ASA classification or 'N/A']
                 - Procedure: [Details of patient preparation and exact description of procedures performed as in the original report or 'N/A']
                 - Findings: [Exact findings from the report, including any locations, measurements, or observations]
                 - Specimens Taken: [Details on specimens, if any, or 'N/A']
                 - Comments: [Additional notes, advice, or remarks from the report]
            
            4. Diagnosis and Outcomes:
               - Diagnosis: [Exact diagnosis or 'N/A']
               - Therapeutic Actions: [Treatments performed or 'N/A']
               - Complications: [Details on complications or 'No complications']
               - Follow-Up: [Exact follow-up recommendations from the report]
            
            Instructions for Output:
            1. Use the exact wording and details from the original report wherever possible. Do not summarize or interpret information.
            2. If any information is missing in the original report, use 'N/A' for the corresponding field.
            3. Ensure the output matches the given structure exactly, without omitting any fields.
            4. Retain all medical terms, values, and phrases as stated in the report.
            """


def harmonize_doc(llm, pdftext, prompt, maxtokens, temperature, top_probability, model_name):
    """Ask the chat model to rewrite one report into the standard structure.

    Args:
        llm: Object exposing ``create_chat_completion(messages=..., ...)`` that
            returns a dict whose ``['choices'][0]['message']['content']`` is
            the reply text (the app passes a ``llama_cpp.Llama`` instance).
        pdftext: Text of the report to reformat; sent as the user message.
        prompt: Instruction text for the model. When it is ``None``, empty or
            only whitespace, the built-in standard-structure prompt is used.
        maxtokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_probability: Forwarded as ``top_p``; when ``None`` the argument is
            omitted so the backend's own default applies.
        model_name: Accepted for interface compatibility; not used here.

    Returns:
        The model's reply. If the first three whitespace-separated words of
        ``pdftext`` occur in the reply, everything before that occurrence is
        dropped and the remainder is stripped; otherwise the reply is returned
        unchanged.
    """
    # Honour a caller-supplied prompt; fall back to the default template only
    # when none was given.
    instructions = prompt if prompt and prompt.strip() else _DEFAULT_PROMPT

    sampling = {"max_tokens": maxtokens, "temperature": temperature}
    if top_probability is not None:
        sampling["top_p"] = top_probability

    response = llm.create_chat_completion(
        messages=[
            # Instructions go in the system turn, not as a prior assistant
            # reply, so the chat template treats them as directives.
            {"role": "system", "content": instructions},
            {"role": "user", "content": pdftext},
        ],
        **sampling,
    )
    output = response['choices'][0]['message']['content']

    # Drop any preamble that precedes an echo of the report's opening words.
    # An empty anchor is skipped: str.find('') always reports index 0.
    anchor = ' '.join(pdftext.split()[:3])
    start = output.find(anchor) if anchor else -1
    if start != -1:
        output = output[start:].strip()
    return output

    
@spaces.GPU(duration=120)
def pdf_to_text(files, input_text='', prompt='', model_name='default', temperature=0, maxtokens=2048, top_probability=0.95):
    """Convert each uploaded document to markdown and harmonize it with the LLM.

    Args:
        files: Iterable of uploaded documents, each passed as-is to
            ``DocumentConverter.convert``. ``None`` or an empty collection
            yields an empty result.
        input_text: Accepted for interface compatibility; not used, the text
            sent to the model always comes from the converted files.
        prompt: Forwarded to ``harmonize_doc``.
        model_name: Forwarded to ``harmonize_doc``.
        temperature: Forwarded to ``harmonize_doc``.
        maxtokens: Forwarded to ``harmonize_doc``.
        top_probability: Forwarded to ``harmonize_doc``.

    Returns:
        The harmonized reports concatenated in input order, each followed by a
        dashed separator line; ``''`` when there are no files.
    """
    # Nothing uploaded: return before paying for the model load, and avoid
    # iterating over None.
    if not files:
        return ''

    llm = Llama(
        # Same directory the module-level download writes to.
        model_path=os.path.join(local_dir, model_id),
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    # One converter serves every file; building it per file repeats its setup.
    converter = DocumentConverter()
    separator = '\n\n-----------------------------------------------------------------\n\n'

    sections = []
    for file in files:
        pdftext = converter.convert(file).document.export_to_markdown()
        sections.append(
            harmonize_doc(llm, pdftext, prompt, maxtokens, temperature, top_probability, model_name)
        )
    # Every section, including the last, is followed by the separator.
    return ''.join(section + separator for section in sections)


# Widgets constructed here but not passed to the Interface below: only the file
# upload is an input, so pdf_to_text runs with its default keyword arguments.
temp_slider = gr.Slider(
    label="Temperature Value",
    minimum=0,
    maximum=2,
    value=0.9,
)
model_name = gr.Dropdown(choices=["default", "fine-tuned"], label="LLama Model")
max_tokens = gr.Number(label="Max Tokens", value=600)
input_text = gr.Text(label="Input Text")
input_prompt = gr.Text(label="Prompt")

# The single input actually exposed in the UI: one or more uploaded files.
input_files = gr.File(file_count="multiple")

# Also constructed but not referenced by the Interface.
output_path_component = gr.File(label="Select Output Path")

# Upload form in, harmonized text out.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=[input_files],
    outputs=["text"],
    title="COBIx Endoscopy Report Harmonization",
    description="This application helps standardize medical reports into a consistent format",
    theme=gr.themes.Soft(),
)

# Start the web server as soon as the module is executed.
iface.launch()