seanpedrickcase committed · Commit cc6683a
Parent(s): adc03a0

Added presentation of summary table outputs

Files changed:
- app.py (+3 / -2)
- tools/helper_functions.py (+83 / -26)
- tools/llm_api_call.py (+10 / -4)
app.py CHANGED

@@ -136,6 +136,7 @@ with app:
     summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
     summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
     summary_output_files = gr.File(label="Summarised output files", interactive=False)
+    summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")

     with gr.Tab(label="Continue previous topic extraction"):
         gr.Markdown(
@@ -226,9 +227,9 @@ with app:
     summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
         then(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox]).\
         then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
-        then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox])
+        then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])

-    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox])
+    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])

     # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
     continue_previous_data_files_btn.click(
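The app.py change is small: a gr.Markdown component is declared under the summarisation tab, and both event chains that call summarise_output_topics gain it as one more output slot. As a minimal sketch of the same wiring (component and function names here are illustrative, not the app's own; DataFrame.to_markdown also needs the tabulate package installed):

import gradio as gr
import pandas as pd

def make_table():
    # Stand-in for the app's summarisation step: produce a table to display
    return pd.DataFrame({"Topic": ["Transport"], "Summary": ["Concerns about bus frequency"]})

def to_markdown_view(df):
    # Final step: serialise the table for the gr.Markdown component
    return df.to_markdown(index=False)

with gr.Blocks() as demo:
    run_btn = gr.Button("Summarise")
    table_state = gr.State()
    summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")

    # As in app.py: chain steps with .then(), and pass the Markdown
    # component as one more entry in the final step's outputs list.
    run_btn.click(make_table, inputs=None, outputs=[table_state]).\
        then(to_markdown_view, inputs=[table_state], outputs=[summarised_output_markdown])

if __name__ == "__main__":
    demo.launch()

Returning the string as the corresponding element of the step's return tuple is all that is needed; Gradio matches return values to the outputs list positionally.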
tools/helper_functions.py CHANGED

@@ -116,35 +116,92 @@ def read_file(filename):
         return pd.read_excel(filename)
     elif file_type == 'parquet':
         return pd.read_parquet(filename)
-… (18 removed lines not rendered in this view)
+
+# Wrap text in each column to the specified max width, including whole words
+def wrap_text(text, max_width=60):
+    if not isinstance(text, str):
+        return text
+
+    words = text.split()
+    if not words:
+        return text
+
+    # First pass: initial word wrapping
+    wrapped_lines = []
+    current_line = []
+    current_length = 0
+
+    def add_line():
+        if current_line:
+            wrapped_lines.append(' '.join(current_line))
+            current_line.clear()
+
+    for i, word in enumerate(words):
+        word_length = len(word)
+
+        # Handle words longer than max_width
+        if word_length > max_width:
+            add_line()
+            wrapped_lines.append(word)
+            current_length = 0
+            continue
+
+        # Calculate space needed for this word
+        space_needed = word_length if not current_line else word_length + 1
+
+        # Check if adding this word would exceed max_width
+        if current_length + space_needed > max_width:
+            add_line()
+            current_line.append(word)
+            current_length = word_length
+        else:
+            current_line.append(word)
+            current_length += space_needed
+
+    add_line()  # Add any remaining text
+
+    # Second pass: redistribute words from lines following single-word lines
+    def can_fit_in_previous_line(prev_line, word):
+        return len(prev_line) + 1 + len(word) <= max_width
+
+    i = 0
+    while i < len(wrapped_lines) - 1:
+        words_in_line = wrapped_lines[i].split()
+        next_line_words = wrapped_lines[i + 1].split()
+
+        # If current line has only one word and isn't too long
+        if len(words_in_line) == 1 and len(words_in_line[0]) < max_width * 0.8:
+            # Try to bring words back from the next line
+            words_to_bring_back = []
+            remaining_words = []
+            current_length = len(words_in_line[0])
+
+            for word in next_line_words:
+                if current_length + len(word) + 1 <= max_width:
+                    words_to_bring_back.append(word)
+                    current_length += len(word) + 1
+                else:
+                    remaining_words.append(word)
+
+            if words_to_bring_back:
+                # Update current line with additional words
+                wrapped_lines[i] = ' '.join(words_in_line + words_to_bring_back)
+
+                # Update next line with remaining words
+                if remaining_words:
+                    wrapped_lines[i + 1] = ' '.join(remaining_words)
                 else:
-… (3 removed lines not rendered in this view)
+                    wrapped_lines.pop(i + 1)
+                continue  # Don't increment i if we removed a line
+        i += 1
+
+    return '<br>'.join(wrapped_lines)

-    # Add any remaining text in current_line to wrapped_lines
-    if current_line:
-        wrapped_lines.append(current_line)

-… (2 removed lines not rendered in this view)
+def view_table(file_path: str): # Added max_width parameter
+    df = pd.read_csv(file_path)
+
+    df_cleaned = df.replace('\n', ' ', regex=True)

     # Use apply with axis=1 to apply wrap_text to each element
     df_cleaned = df_cleaned.apply(lambda col: col.map(wrap_text))
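The new wrap_text wraps each cell on whole words to max_width characters and joins the lines with <br>, which renders as a line break inside markdown/HTML table cells; a second pass then pulls words back from the following line onto short single-word lines that greedy wrapping leaves behind. For intuition, here is a rough stdlib approximation of the first pass only (an assumption-labelled sketch, not the repo's function; it omits the redistribution pass, but textwrap applies the same whole-word and keep-long-words-intact rules):

import textwrap

def wrap_text_sketch(text, max_width=60):
    # Approximation of tools/helper_functions.wrap_text using the standard
    # library: wrap on whole words, never split long words, join with <br>.
    if not isinstance(text, str) or not text.split():
        return text
    lines = textwrap.wrap(text, width=max_width,
                          break_long_words=False, break_on_hyphens=False)
    return '<br>'.join(lines)

print(wrap_text_sketch("Respondents raised concerns about bus frequency and reliability", 30))
# Respondents raised concerns<br>about bus frequency and<br>reliability

The second pass in the committed version exists because greedy wrapping can strand one word on its own line; merging following words back up keeps the rendered table cells compact.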
tools/llm_api_call.py CHANGED

@@ -20,7 +20,7 @@ from io import StringIO
 GradioFileData = gr.FileData

 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
-from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
+from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
 from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL

 # ResponseObject class for AWS Bedrock calls
@@ -1636,6 +1636,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
     '''
     out_metadata = []
     local_model = []
+    summarised_output_markdown = ""

     print("In summarise_output_topics function.")

@@ -1646,6 +1647,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
     #print("latest_summary_completed:", latest_summary_completed)
     #print("length_all_summaries:", length_all_summaries)

+    # If all summaries completed, make final outputs
     if latest_summary_completed >= length_all_summaries:
         print("All summaries completed. Creating outputs.")

@@ -1691,7 +1693,11 @@ def summarise_output_topics(summarised_references:pd.DataFrame,

         output_files.extend([reference_table_df_revised_path, unique_table_df_revised_path])

-… (1 removed line not rendered in this view)
+        unique_table_df_revised_display = unique_table_df_revised.apply(lambda col: col.map(wrap_text))
+
+        summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
+
+        return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown

     tic = time.perf_counter()

@@ -1742,6 +1748,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,

     # If all summaries completeed
     if latest_summary_completed >= length_all_summaries:
         print("At last summary.")

-    return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str
+    return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown
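The llm_api_call.py changes thread the new markdown string through summarise_output_topics: it is initialised to an empty string, filled once all summaries are complete, and appended to both return tuples so the app.py event chains can display it. A small sketch of that rendering step under assumed data (the column names are invented for illustration; the wrap_text import mirrors the commit, and to_markdown needs the tabulate package):

import pandas as pd
from tools.helper_functions import wrap_text  # as imported by this commit

# Hypothetical stand-in for unique_table_df_revised
unique_table_df_revised = pd.DataFrame({
    "General Topic": ["Transport"],
    "Revised summary": ["Respondents raised concerns about bus frequency, "
                        "late-night coverage and the reliability of rural routes."],
})

# Wrap every cell so long summaries break across lines inside the table,
# then serialise to a markdown table string for gr.Markdown to display.
display_df = unique_table_df_revised.apply(lambda col: col.map(wrap_text))
summarised_output_markdown = display_df.to_markdown(index=False)
print(summarised_output_markdown)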