seanpedrickcase committed on
Commit
cc6683a
·
1 Parent(s): adc03a0

Added presentation of summary table outputs

Browse files
Files changed (3) hide show
  1. app.py +3 -2
  2. tools/helper_functions.py +83 -26
  3. tools/llm_api_call.py +10 -4
app.py CHANGED
@@ -136,6 +136,7 @@ with app:
136
  summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
137
  summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
138
  summary_output_files = gr.File(label="Summarised output files", interactive=False)
 
139
 
140
  with gr.Tab(label="Continue previous topic extraction"):
141
  gr.Markdown(
@@ -226,9 +227,9 @@ with app:
226
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
227
  then(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox]).\
228
  then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
229
- then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox])
230
 
231
- latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox])
232
 
233
  # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
234
  continue_previous_data_files_btn.click(
 
136
  summarisation_in_previous_data_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
137
  summarise_previous_data_btn = gr.Button("Summarise existing topics", variant="primary")
138
  summary_output_files = gr.File(label="Summarised output files", interactive=False)
139
+ summarised_output_markdown = gr.Markdown(value="### Summarised table will appear here")
140
 
141
  with gr.Tab(label="Continue previous topic extraction"):
142
  gr.Markdown(
 
227
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox]).\
228
  then(load_in_previous_data_files, inputs=[summarisation_in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, summarisation_in_previous_data_files_status, data_file_names_textbox]).\
229
  then(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state]).\
230
+ then(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
231
 
232
+ latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, summarised_references_markdown, temperature_slide, data_file_names_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown])
233
 
234
  # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
235
  continue_previous_data_files_btn.click(
tools/helper_functions.py CHANGED
@@ -116,35 +116,92 @@ def read_file(filename):
116
  return pd.read_excel(filename)
117
  elif file_type == 'parquet':
118
  return pd.read_parquet(filename)
119
-
120
- def view_table(file_path: str, max_width: int = 60): # Added max_width parameter
121
- df = pd.read_csv(file_path)
122
-
123
- df_cleaned = df.replace('\n', ' ', regex=True)
124
-
125
- # Wrap text in each column to the specified max width, including whole words
126
- def wrap_text(text):
127
- if isinstance(text, str):
128
- words = text.split(' ')
129
- wrapped_lines = []
130
- current_line = ""
131
-
132
- for word in words:
133
- # Check if adding the next word exceeds the max width
134
- if len(current_line) + len(word) + 1 > max_width: # +1 for the space
135
- wrapped_lines.append(current_line)
136
- current_line = word # Start a new line with the current word
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  else:
138
- if current_line: # If current_line is not empty, add a space
139
- current_line += ' '
140
- current_line += word
 
 
141
 
142
- # Add any remaining text in current_line to wrapped_lines
143
- if current_line:
144
- wrapped_lines.append(current_line)
145
 
146
- return '<br>'.join(wrapped_lines) # Join lines with <br>
147
- return text
 
 
148
 
149
  # Use apply with axis=1 to apply wrap_text to each element
150
  df_cleaned = df_cleaned.apply(lambda col: col.map(wrap_text))
 
116
  return pd.read_excel(filename)
117
  elif file_type == 'parquet':
118
  return pd.read_parquet(filename)
119
+
120
+ # Wrap text in each column to the specified max width, including whole words
121
+ def wrap_text(text, max_width=60):
122
+ if not isinstance(text, str):
123
+ return text
124
+
125
+ words = text.split()
126
+ if not words:
127
+ return text
128
+
129
+ # First pass: initial word wrapping
130
+ wrapped_lines = []
131
+ current_line = []
132
+ current_length = 0
133
+
134
+ def add_line():
135
+ if current_line:
136
+ wrapped_lines.append(' '.join(current_line))
137
+ current_line.clear()
138
+
139
+ for i, word in enumerate(words):
140
+ word_length = len(word)
141
+
142
+ # Handle words longer than max_width
143
+ if word_length > max_width:
144
+ add_line()
145
+ wrapped_lines.append(word)
146
+ current_length = 0
147
+ continue
148
+
149
+ # Calculate space needed for this word
150
+ space_needed = word_length if not current_line else word_length + 1
151
+
152
+ # Check if adding this word would exceed max_width
153
+ if current_length + space_needed > max_width:
154
+ add_line()
155
+ current_line.append(word)
156
+ current_length = word_length
157
+ else:
158
+ current_line.append(word)
159
+ current_length += space_needed
160
+
161
+ add_line() # Add any remaining text
162
+
163
+ # Second pass: redistribute words from lines following single-word lines
164
+ def can_fit_in_previous_line(prev_line, word):
165
+ return len(prev_line) + 1 + len(word) <= max_width
166
+
167
+ i = 0
168
+ while i < len(wrapped_lines) - 1:
169
+ words_in_line = wrapped_lines[i].split()
170
+ next_line_words = wrapped_lines[i + 1].split()
171
+
172
+ # If current line has only one word and isn't too long
173
+ if len(words_in_line) == 1 and len(words_in_line[0]) < max_width * 0.8:
174
+ # Try to bring words back from the next line
175
+ words_to_bring_back = []
176
+ remaining_words = []
177
+ current_length = len(words_in_line[0])
178
+
179
+ for word in next_line_words:
180
+ if current_length + len(word) + 1 <= max_width:
181
+ words_to_bring_back.append(word)
182
+ current_length += len(word) + 1
183
+ else:
184
+ remaining_words.append(word)
185
+
186
+ if words_to_bring_back:
187
+ # Update current line with additional words
188
+ wrapped_lines[i] = ' '.join(words_in_line + words_to_bring_back)
189
+
190
+ # Update next line with remaining words
191
+ if remaining_words:
192
+ wrapped_lines[i + 1] = ' '.join(remaining_words)
193
  else:
194
+ wrapped_lines.pop(i + 1)
195
+ continue # Don't increment i if we removed a line
196
+ i += 1
197
+
198
+ return '<br>'.join(wrapped_lines)
199
 
 
 
 
200
 
201
+ def view_table(file_path: str): # max_width parameter removed; wrapping now uses wrap_text's default
202
+ df = pd.read_csv(file_path)
203
+
204
+ df_cleaned = df.replace('\n', ' ', regex=True)
205
 
206
  # Use apply with axis=1 to apply wrap_text to each element
207
  df_cleaned = df_cleaned.apply(lambda col: col.map(wrap_text))
tools/llm_api_call.py CHANGED
@@ -20,7 +20,7 @@ from io import StringIO
20
  GradioFileData = gr.FileData
21
 
22
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
23
- from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
24
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
25
 
26
  # ResponseObject class for AWS Bedrock calls
@@ -1636,6 +1636,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1636
  '''
1637
  out_metadata = []
1638
  local_model = []
 
1639
 
1640
  print("In summarise_output_topics function.")
1641
 
@@ -1646,6 +1647,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1646
  #print("latest_summary_completed:", latest_summary_completed)
1647
  #print("length_all_summaries:", length_all_summaries)
1648
 
 
1649
  if latest_summary_completed >= length_all_summaries:
1650
  print("All summaries completed. Creating outputs.")
1651
 
@@ -1691,7 +1693,11 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1691
 
1692
  output_files.extend([reference_table_df_revised_path, unique_table_df_revised_path])
1693
 
1694
- return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str
 
 
 
 
1695
 
1696
  tic = time.perf_counter()
1697
 
@@ -1742,6 +1748,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
1742
 
1743
  # If all summaries completed
1744
  if latest_summary_completed >= length_all_summaries:
1745
- print("At last summary.")
1746
 
1747
- return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str
 
20
  GradioFileData = gr.FileData
21
 
22
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt
23
+ from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
24
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
25
 
26
  # ResponseObject class for AWS Bedrock calls
 
1636
  '''
1637
  out_metadata = []
1638
  local_model = []
1639
+ summarised_output_markdown = ""
1640
 
1641
  print("In summarise_output_topics function.")
1642
 
 
1647
  #print("latest_summary_completed:", latest_summary_completed)
1648
  #print("length_all_summaries:", length_all_summaries)
1649
 
1650
+ # If all summaries completed, make final outputs
1651
  if latest_summary_completed >= length_all_summaries:
1652
  print("All summaries completed. Creating outputs.")
1653
 
 
1693
 
1694
  output_files.extend([reference_table_df_revised_path, unique_table_df_revised_path])
1695
 
1696
+ unique_table_df_revised_display = unique_table_df_revised.apply(lambda col: col.map(wrap_text))
1697
+
1698
+ summarised_output_markdown = unique_table_df_revised_display.to_markdown(index=False)
1699
+
1700
+ return summarised_references, unique_table_df_revised, reference_table_df_revised, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown
1701
 
1702
  tic = time.perf_counter()
1703
 
 
1748
 
1749
  # If all summaries completed
1750
  if latest_summary_completed >= length_all_summaries:
1751
+ print("At last summary.")
1752
 
1753
+ return summarised_references, unique_table_df, reference_table_df, output_files, summarised_outputs, latest_summary_completed, out_metadata_str, summarised_output_markdown