filius-Dei committed on
Commit b534b3f (verified)
1 Parent(s): 2527ef1

Upload 2 files


With Streamlit features

Files changed (2)
  1. CiPE_Streamlit-2.ipynb +0 -0
  2. cipe_streamlit-2.py +290 -0
CiPE_Streamlit-2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
cipe_streamlit-2.py ADDED
@@ -0,0 +1,290 @@
+ # -*- coding: utf-8 -*-
+ """CiPE_Streamlit
+ 
+ Automatically generated by Colaboratory.
+ 
+ Original file is located at
+ https://colab.research.google.com/drive/1jACLFXfsdWM59lrfTQGcZVsTIHBO92R8
+ """
+ 
+ # Om Maa
+ 
+ # Install dependencies before running. Shell magics like `!pip` are not valid
+ # Python in a plain .py script, so run these in your environment instead:
+ #   pip install langchain predictionguard lancedb html2text sentence-transformers PyPDF2
+ #   pip install huggingface_hub transformers sentencepiece streamlit
+ 
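+ # How to launch (editor's note; assumes this file's name as the entry point):
+ #     streamlit run cipe_streamlit-2.py
+ 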
+ import os
+ 
+ import predictionguard as pg
+ from sentence_transformers import SentenceTransformer
+ import lancedb
+ from lancedb.embeddings import with_embeddings
+ import pandas as pd
+ 
+ 
+ # Set your Prediction Guard API token here; do not commit a real secret to
+ # source control (the hardcoded token is replaced with a placeholder).
+ os.environ['PREDICTIONGUARD_TOKEN'] = "<PREDICTIONGUARD_TOKEN>"
+ 
+ # Streamlit app initiation
+ 
+ import streamlit as st
+ 
+ # Streamlit input widgets replace the notebook's input() calls.
+ # Sidebar widgets for the patient's name, age, gender, and ethnicity
+ name = st.sidebar.text_input('Name')
+ age = st.sidebar.number_input('Age', min_value=0, max_value=120, step=1)
+ gender = st.sidebar.selectbox('Gender', ['Male', 'Female', 'Other'])
+ ethnicity = st.sidebar.text_input('Ethnicity')
+ 
+ # Main container
+ with st.form(key='patient_form'):
+     # Text input for the disease
+     disease = st.text_area('DISEASE', height=100)
+ 
+     # Text input for prescriptions (the source of drug_names)
+     prescriptions = st.text_area('PRESCRIPTIONS', height=100)
+ 
+     # Text input for additional information
+     additional_info = st.text_area('ADDITIONAL INFO', height=100)
+ 
+     # Submit button for the form
+     submit_button = st.form_submit_button(label='Predict Drug Effects')
+ 
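+ # Editor's sketch (not in the original): everything below this form, namely
+ # PDF parsing, the SentenceTransformer load, and LanceDB indexing, reruns on
+ # every Streamlit interaction. st.cache_resource runs such setup once per
+ # process; a minimal sketch, assuming a helper named load_embedding_model:
+ #
+ #     @st.cache_resource
+ #     def load_embedding_model(model_name="all-MiniLM-L12-v2"):
+ #         from sentence_transformers import SentenceTransformer
+ #         return SentenceTransformer(model_name)
+ #
+ #     model = load_embedding_model()
+ 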
+ from PyPDF2 import PdfReader
+ 
+ # Replace this with the path to your PDF file (this is a Colab path)
+ pdf_path = '/content/drug_side_effects_summary_cleaned.pdf'
+ reader = PdfReader(pdf_path)
+ 
+ # Initialize an empty string to accumulate text
+ text = ''
+ 
+ # Iterate over each page in the PDF
+ for page in reader.pages:
+     # Extract text from the page and append it to the text string
+     text += page.extract_text() + "\n"
+ 
+ # Now `text` contains the text content of the PDF.
+ print(text[:500])  # Example: print the first 500 characters to inspect the structure
+ 
+ import re
+ 
+ # Function to clean the extracted text
+ def clean_text(text):
+     # Correct unwanted line breaks and spaces
+     text = re.sub(r'-\n', '', text)   # Remove hyphenation
+     text = re.sub(r'\n', ' ', text)   # Replace newlines with spaces
+     text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces into one
+     return text.strip()               # Remove leading and trailing spaces
+ 
+ # Clean the extracted text
+ cleaned_text = clean_text(text)
+ 
+ # Inspect a portion of the cleaned text to verify the cleaning
+ # (a bare expression does nothing in a script, so print it)
+ print(cleaned_text[:500])
+ 
+ # Define a function to chunk text with a given size and overlap using standard Python
+ def chunk_text(text, chunk_size=700, overlap=50):
+     chunks = []
+     start = 0
+     while start < len(text):
+         # After the first chunk, step back 'overlap' characters for context
+         if start > 0:
+             start -= overlap
+         end = start + chunk_size
+         chunks.append(text[start:end])
+         start += chunk_size
+     return chunks
+ 
+ # Chunk the cleaned text into smaller pieces for LLM input
+ docs_alternative = chunk_text(cleaned_text, chunk_size=700, overlap=50)
+ 
+ # Display the first few chunks to verify the result
+ chunks_to_display_alt = 3
+ chunks_preview_alt = docs_alternative[:chunks_to_display_alt]
+ print(chunks_preview_alt)
+ 
+ # Replace '#' in every chunk (not just the preview) so the chunks cannot
+ # collide with the '###' markers in the prompt template
+ docs_alternative = [x.replace('#', '-') for x in docs_alternative]
+ 
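+ # Quick sanity check of the chunker on a toy string (illustrative only):
+ # chunk_text("abcdefghij", chunk_size=4, overlap=2) returns
+ # ['abcd', 'cdef', 'efgh', 'ghij']; each chunk re-reads the previous 2
+ # characters, so the effective stride is chunk_size - overlap.
+ 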
+ # Embeddings setup
+ # Use a distinct variable so we don't shadow the Streamlit 'name' input above
+ model_name = "all-MiniLM-L12-v2"
+ model = SentenceTransformer(model_name)
+ 
+ # Embedding functions
+ def embed_batch(batch):
+     return [model.encode(sentence) for sentence in batch]
+ 
+ def embed(sentence):
+     return model.encode(sentence)
+ 
+ # Create the LanceDB directory if it does not already exist
+ lancedb_dir = ".lancedb"
+ if not os.path.exists(lancedb_dir):
+     os.mkdir(lancedb_dir)
+ uri = lancedb_dir
+ db = lancedb.connect(uri)
+ 
+ # Prepare metadata for embedding (index every chunk, not just the preview)
+ metadata = [[i, chunk] for i, chunk in enumerate(docs_alternative)]
+ doc_df = pd.DataFrame(metadata, columns=["chunk", "text"])
+ 
+ # Embed the documents
+ data = with_embeddings(embed_batch, doc_df)
+ 
+ # LanceDB operations: create the table once, then reuse it on later runs
+ # (creating it and then re-adding the same data would duplicate every row)
+ if "pdf_data" not in db.table_names():
+     db.create_table("pdf_data", data=data)
+ table = db.open_table("pdf_data")
+ 
+ # Note: Adjust the table name above to match your dataset
+ 
+ message = "What are the side effects of doxycycline for treating Acne?"
+ results = table.search(embed(message)).limit(5).to_pandas()
+ # print(results.head())
+ 
+ # Assuming the setup for embeddings and LanceDB above is in place
+ 
+ # Parse the prescriptions field to extract the drug names; the exact parsing
+ # depends on how prescriptions are entered (comma-separated here)
+ drug_names = [d.strip() for d in prescriptions.split(',') if d.strip()]
+ # (additional_info is collected in the form but not yet used below)
+ 
+ 
+ def rag_answer_drug_side_effects(name, drug_names, disease):
+     # Formulate a question related to drug side effects
+     message = f"What are the potential side effects of using {', '.join(drug_names)} for treating {disease}? Please provide a list of side effects specific to the use of these drugs in the context of the mentioned disease for {name}."
+ 
+     # Search the database for relevant context
+     results = table.search(embed(message)).limit(10).to_pandas()
+     results.sort_values(by=['_distance'], inplace=True, ascending=True)
+     context = results['text'].iloc[0]  # Use the most relevant document
+ 
+     # Define the prompt template
+     template = """### Instruction:
+ Start with "Hi, {name}." Then give a compassionate answer as a bulleted list.
+ Read the input context below and respond with a mid-length answer to the given question. If you cannot find an exact answer, then look up something close to the medication and disease.
+ 
+ ### Input:
+ Context: {context}
+ 
+ Question: {question}
+ 
+ ### Response:
+ """
+ 
+     # Augment the prompt with the retrieved context
+     # (the template also references {name}, so pass it to format())
+     prompt = template.format(name=name, context=context, question=message)
+ 
+     # Get a response
+     result = pg.Completion.create(
+         model="Neural-Chat-7B",
+         prompt=prompt
+     )
+ 
+     return result['choices'][0]['text']
+ 
+ 
+ def rag_answer_drug_benefit_effects(name, drug_names, disease):
+     # Formulate a question related to drug benefits
+     message = f"What are the potential benefits of using {', '.join(drug_names)} for treating {disease}? Please provide a list of benefits specific to the use of these drugs in the context of the mentioned disease for {name}."
+ 
+     # Search the database for relevant context
+     results = table.search(embed(message)).limit(10).to_pandas()
+     results.sort_values(by=['_distance'], inplace=True, ascending=True)
+     context = results['text'].iloc[0]  # Use the most relevant document
+ 
+     # Define the prompt template
+     template = """### Instruction:
+ Start with "Hi, {name}." Then give a compassionate answer as a bulleted list.
+ Read the input context below and respond with a mid-length answer to the given question. If you cannot find an exact answer, then look up something close to the medication and disease.
+ 
+ ### Input:
+ Context: {context}
+ 
+ Question: {question}
+ 
+ ### Response:
+ """
+ 
+     # Augment the prompt with the retrieved context
+     prompt = template.format(name=name, context=context, question=message)
+ 
+     # Get a response
+     result = pg.Completion.create(
+         model="Neural-Chat-7B",
+         prompt=prompt
+     )
+ 
+     return result['choices'][0]['text']
+ 
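+ # Editor's sketch (not in the original): the two functions above differ only
+ # in the words "side effects" vs. "benefits"; one parameterized helper could
+ # serve both, along these lines:
+ #
+ #     def rag_answer(aspect, name, drug_names, disease):
+ #         # aspect is "side effects" or "benefits"
+ #         message = (f"What are the potential {aspect} of using "
+ #                    f"{', '.join(drug_names)} for treating {disease}?")
+ #         ...  # retrieval, prompt formatting, and pg.Completion.create as above
+ 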
+ # The form's submit button returns True on the run in which it was clicked;
+ # a separate st.button here would duplicate the handler, so the form's
+ # submit_button alone triggers the prediction.
+ if submit_button:
+     # Call the processing functions; validate input and handle errors as necessary
+     try:
+         side_effects_response = rag_answer_drug_side_effects(name, drug_names, disease)
+         benefits_response = rag_answer_drug_benefit_effects(name, drug_names, disease)
+         st.write("Side Effects:", side_effects_response)
+         st.write("Benefits:", benefits_response)
+     except Exception as e:
+         st.error(f"An error occurred: {e}")
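+ 
+ # Optional polish (editor's sketch): wrap the calls in st.spinner for feedback:
+ #
+ #     with st.spinner('Querying the model...'):
+ #         side_effects_response = rag_answer_drug_side_effects(name, drug_names, disease)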
+ 
+ # Hugging Face Hub login. Note: notebook_login() is designed for notebooks;
+ # when running this as a plain script, `huggingface-cli login` (or
+ # huggingface_hub.login()) is the usual alternative.
+ from huggingface_hub import notebook_login
+ 
+ notebook_login()
+ 
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ 
+ # Define the checkpoint as a repo id (from_pretrained expects "filius-Dei/CiPE",
+ # not the full URL https://huggingface.co/filius-Dei/CiPE)
+ checkpoint_path = "filius-Dei/CiPE"
+ 
+ # Load the model
+ # model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
+ 
+ # Load the tokenizer
+ # tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")