seanpedrickcase committed
Commit f5a842c · 1 Parent(s): a10d388

Enhanced local model support by adding model loading functionality in chatfuncs.py and updating llm_api_call.py to utilize local models for topic extraction and summarization. Improved model path handling and ensured compatibility with GPU configurations.
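In outline, the two GPU-decorated entry points in tools/llm_api_call.py now load the GGUF model on demand and hand it down the request chain rather than relying on a model loaded at import time. A minimal sketch of the shared gate, using identifiers from the diffs below (model_choice is set elsewhere in the app and is illustrative here):

from tools.chatfuncs import load_model, RUN_LOCAL_MODEL

model_choice = "gemma_2b_it_local"   # illustrative; chosen through the app UI in practice
local_model = []                     # downstream helpers default local_model to an empty list

if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
    # Downloads the GGUF file if needed and builds the llama-cpp model and tokenizer
    local_model, tokenizer = load_model()
    print("Local model loaded:", local_model)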

Files changed (3)
  1. app.py +1 -0
  2. tools/chatfuncs.py +22 -25
  3. tools/llm_api_call.py +29 -14
app.py CHANGED
@@ -6,6 +6,7 @@ from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
  from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default
  from tools.auth import authenticate_user
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
+ from tools.chatfuncs import load_model
  #from tools.aws_functions import load_data_from_aws
  import gradio as gr
  import pandas as pd
tools/chatfuncs.py CHANGED
@@ -2,7 +2,6 @@ from typing import TypeVar
  import torch.cuda
  import os
  import time
- import spaces
  from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  from tools.helper_functions import RUN_LOCAL_MODEL
@@ -111,9 +110,23 @@ class LlamaCPPGenerationConfig:
  ###
  # Load local model
  ###
+ def get_model_path():
+     repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF") # "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF") # "QuantFactory/Phi-3-mini-128k-instruct-GGUF")
+     filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # "Llama-3.2-3B-Instruct-Q5_K_M.gguf") # "gemma-2-2b-it-Q8_0.gguf") # "Phi-3-mini-128k-instruct.Q4_K_M.gguf")
+     model_dir = "model/gemma" # "model/phi" # Assuming this is your intended directory

+     # Construct the expected local path
+     local_path = os.path.join(model_dir, filename)
+
+     if os.path.exists(local_path):
+         print(f"Model already exists at: {local_path}")
+         return local_path
+     else:
+         print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
+         return hf_hub_download(repo_id=repo_id, filename=filename)
+
+
- @spaces.GPU
- def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
+ def load_model(local_model_type:str=local_model_type, gpu_layers:int=gpu_layers, max_context_length:int=context_length, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
      '''
      Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
      '''
@@ -136,26 +149,11 @@ def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu

      #print(vars(gpu_config))
      #print(vars(cpu_config))
-
-     def get_model_path():
-         repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF") # "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF") # "QuantFactory/Phi-3-mini-128k-instruct-GGUF")
-         filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # "Llama-3.2-3B-Instruct-Q5_K_M.gguf") # "gemma-2-2b-it-Q8_0.gguf") # "Phi-3-mini-128k-instruct.Q4_K_M.gguf")
-         model_dir = "model/gemma" # "model/phi" # Assuming this is your intended directory
-
-         # Construct the expected local path
-         local_path = os.path.join(model_dir, filename)
-
-         if os.path.exists(local_path):
-             print(f"Model already exists at: {local_path}")
-             return local_path
-         else:
-             print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
-             return hf_hub_download(repo_id=repo_id, filename=filename)

      model_path = get_model_path()

      try:
-         print(vars(gpu_config))
+         print("GPU load variables:", vars(gpu_config))
          llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v = 8, flash_attn=True,

      except Exception as e:
@@ -172,15 +170,15 @@ def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu
      load_confirmation = "Finished loading model: " + local_model_type

      print(load_confirmation)
-     return local_model_type, load_confirmation, local_model_type, model, tokenizer
+     return model, tokenizer

  ###
  # Load local model
  ###
- if RUN_LOCAL_MODEL == "1":
-     print("Loading model")
-     local_model_type, load_confirmation, local_model_type, model, tokenizer = load_model(local_model_type, gpu_layers, context_length, gpu_config, cpu_config, torch_device)
-     print("model loaded:", model)
+ # if RUN_LOCAL_MODEL == "1":
+ #     print("Loading model")
+ #     local_model_type, load_confirmation, local_model_type, model, tokenizer = load_model(local_model_type, gpu_layers, context_length, gpu_config, cpu_config, torch_device)
+ #     print("model loaded:", model)


  def llama_cpp_streaming(history, full_prompt, temperature=temperature):
@@ -216,7 +214,6 @@ def llama_cpp_streaming(history, full_prompt, temperature=temperature):
      print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
      print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

- @spaces.GPU
  def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
      """
      Calls your generation model with parameters from the LlamaCPPGenerationConfig object.
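For reference, a minimal sketch of driving the refactored loader directly, based on the call sites added in this commit. The alternative REPO_ID/MODEL_FILE values come from the comments in get_model_path, and the prompt string and temperature are illustrative:

import os

# Optional: point get_model_path() at a different GGUF before loading
# (defaults: lmstudio-community/gemma-2-2b-it-GGUF / gemma-2-2b-it-Q8_0.gguf).
os.environ["REPO_ID"] = "bartowski/Llama-3.2-3B-Instruct-GGUF"
os.environ["MODEL_FILE"] = "Llama-3.2-3B-Instruct-Q5_K_M.gguf"

from tools.chatfuncs import load_model, call_llama_cpp_model, LlamaCPPGenerationConfig

local_model, tokenizer = load_model()   # now returns just (model, tokenizer)
gen_config = LlamaCPPGenerationConfig()
gen_config.update_temp(0.1)
response = call_llama_cpp_model("Suggest one topic for this text: ...", gen_config, model=local_model)

Because get_model_path() first checks model/gemma/ for the file, a pre-downloaded model can also be dropped into that directory to skip the Hub download.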
tools/llm_api_call.py CHANGED
@@ -9,6 +9,7 @@ import boto3
  import json
  import string
  import re
+ import spaces
  from rapidfuzz import process, fuzz
  from tqdm import tqdm
  from gradio import Progress
@@ -19,7 +20,7 @@ GradioFileData = gr.FileData

  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
  from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
- from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model
+ from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL

  # ResponseObject class for AWS Bedrock calls
  class ResponseObject:
@@ -331,7 +332,7 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
      return response

  # Function to send a request and update history
- def send_request(prompt: str, conversation_history: List[dict], model: object, config: dict, model_choice: str, system_prompt: str, temperature: float, progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
+ def send_request(prompt: str, conversation_history: List[dict], model: object, config: dict, model_choice: str, system_prompt: str, temperature: float, local_model=[], progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
      """
      This function sends a request to a language model with the given prompt, conversation history, model configuration, model choice, system prompt, and temperature.
      It constructs the full prompt by appending the new user prompt to the conversation history, generates a response from the model, and updates the conversation history with the new prompt and response.
@@ -412,7 +413,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
          gen_config = LlamaCPPGenerationConfig()
          gen_config.update_temp(temperature)

-         response = call_llama_cpp_model(prompt, gen_config)
+         response = call_llama_cpp_model(prompt, gen_config, model=local_model)

          #progress_bar.close()
          #tqdm._instances.clear()
@@ -449,7 +450,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c

      return response, conversation_history

- def process_requests(prompts: List[str], system_prompt: str, conversation_history: List[dict], whole_conversation: List[str], whole_conversation_metadata: List[str], model: object, config: dict, model_choice: str, temperature: float, batch_no:int = 1, master:bool = False) -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
+ def process_requests(prompts: List[str], system_prompt: str, conversation_history: List[dict], whole_conversation: List[str], whole_conversation_metadata: List[str], model: object, config: dict, model_choice: str, temperature: float, batch_no:int = 1, local_model = [], master:bool = False) -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
      """
      Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.

@@ -464,6 +465,7 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
          model_choice (str): The choice of model to use.
          temperature (float): The temperature parameter for the model.
          batch_no (int): Batch number of the large language model request.
+         local_model: Local gguf model (if loaded)
          master (bool): Is this request for the master table.

      Returns:
@@ -478,7 +480,7 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor

          #print("prompt to LLM:", prompt)

-         response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature)
+         response, conversation_history = send_request(prompt, conversation_history, model=model, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model)

          if isinstance(response, ResponseObject):
              responses.append(response)
@@ -872,7 +874,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
      return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error


-
+ @spaces.GPU
  def extract_topics(in_data_file,
                     file_data:pd.DataFrame,
                     existing_topics_table:pd.DataFrame,
@@ -991,6 +993,11 @@ def extract_topics(in_data_file,
      out_file_paths = []
      print("model_choice_clean:", model_choice_clean)

+     if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
+         progress(0.1, "Loading in Gemma 2b model")
+         local_model, tokenizer = load_model()
+         print("Local model loaded:", local_model)
+
      #print("latest_batch_completed:", str(latest_batch_completed))

      # If we have already redacted the last file, return the input out_message and file list to the relevant components
@@ -1096,6 +1103,8 @@ def extract_topics(in_data_file,
      topics_loop_description = "Extracting topics from response batches (each batch of " + str(batch_size) + " responses)."
      topics_loop = tqdm(range(latest_batch_completed, num_batches), desc = topics_loop_description, unit="batches remaining")

+
+
      for i in topics_loop:

          #for latest_batch_completed in range(num_batches):
@@ -1207,7 +1216,7 @@ def extract_topics(in_data_file,
              summary_whole_conversation = []

              # Process requests to large language model
-             master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, master = True)
+             master_summary_response, summary_conversation_history, whole_summary_conversation, whole_conversation_metadata = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)

              # print("master_summary_response:", master_summary_response[-1].text)
              # print("Whole conversation metadata:", whole_conversation_metadata)
@@ -1299,7 +1308,7 @@ def extract_topics(in_data_file,
              whole_conversation = [system_prompt]

              # Process requests to large language model
-             responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no)
+             responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model)

              # print("Whole conversation metadata before:", whole_conversation_metadata)

@@ -1533,7 +1542,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,

      return summarised_references, summarised_references_markdown, reference_df, unique_topics_df

- def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str):
+ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, local_model=[]):
      conversation_history = []
      whole_conversation_metadata = []

@@ -1549,7 +1558,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
      whole_conversation = [summarise_topic_descriptions_system_prompt]

      # Process requests to large language model
-     responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature)
+     responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, local_model=local_model)

      print("Finished summary query")

@@ -1569,6 +1578,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:

      return latest_response_text, conversation_history, whole_conversation_metadata

+ @spaces.GPU
  def summarise_output_topics(summarised_references:pd.DataFrame,
                              unique_table_df:pd.DataFrame,
                              reference_table_df:pd.DataFrame,
@@ -1646,11 +1656,16 @@ def summarise_output_topics(summarised_references:pd.DataFrame,

      tic = time.perf_counter()

-     print("Starting with:", latest_summary_completed)
-     print("Last summary number:", length_all_summaries)
+     #print("Starting with:", latest_summary_completed)
+     #print("Last summary number:", length_all_summaries)
+
+     if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
+         progress(0.1, "Loading in Gemma 2b model")
+         local_model, tokenizer = load_model()
+         print("Local model loaded:", local_model)

      summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
-     summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
+     summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")

      for summary_no in summary_loop:

@@ -1661,7 +1676,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
          formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text)]

          try:
-             response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt)
+             response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
              summarised_output = response
              summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
              summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
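Taken together, the llm_api_call.py changes thread the loaded model through the existing helpers (process_requests, then send_request, then call_llama_cpp_model) via the new local_model argument, which defaults to an empty list so cloud-model calls are unaffected. A rough usage sketch; the prompt, system prompt, temperature and the None placeholders for the cloud client/config are assumptions made for illustration:

from tools.chatfuncs import load_model
from tools.llm_api_call import process_requests

local_model, tokenizer = load_model()

responses, conversation_history, whole_conversation, whole_conversation_metadata = process_requests(
    prompts=["Summarise these responses: ..."],
    system_prompt="You are a survey analyst.",
    conversation_history=[],
    whole_conversation=[],
    whole_conversation_metadata=[],
    model=None,                # cloud client placeholder; assumed unused on the local path
    config=None,               # cloud config placeholder; assumed unused on the local path
    model_choice="gemma_2b_it_local",
    temperature=0.1,
    batch_no=1,
    local_model=local_model,   # forwarded to send_request() and on to call_llama_cpp_model()
)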