Spaces:

phyloforfun
/

VoucherVision

Running

App Files Community

phyloforfun committed on Mar 12

Commit

567930d

•

1 Parent(s): c6a70af

updates

Browse files

Files changed (34) hide show

.gitignore +15 -0
.streamlit/config.toml +8 -2
api_status.yaml +4 -2
app.py +46 -25
custom_prompts/SLTPvA_long.yaml +31 -36
custom_prompts/SLTPvA_medium.yaml +2 -6
custom_prompts/SLTPvA_short.yaml +2 -6
custom_prompts/SLTPvB_long.yaml +107 -0
custom_prompts/SLTPvB_medium.yaml +83 -0
custom_prompts/SLTPvB_short.yaml +78 -0
pages/prompt_builder.py +3 -1
requirements.txt +0 -0
requirements_conda.txt +0 -0
requirements_with_versions.txt +0 -0
run_VoucherVision.py +2 -1
vouchervision/LLM_GoogleGemini.py +39 -19
vouchervision/LLM_GooglePalm2.py +65 -20
vouchervision/LLM_MistralAI.py +45 -19
vouchervision/LLM_OpenAI.py +117 -37
vouchervision/LLM_local_MistralAI.py +16 -9
vouchervision/LLM_local_cpu_MistralAI.py +15 -8
vouchervision/LM2_logger.py +22 -5
vouchervision/OCR_google_cloud_vision.py +26 -6
vouchervision/VoucherVision_Config_Builder.py +1 -1
vouchervision/model_maps.py +42 -23
vouchervision/prompt_catalog.py +28 -16
vouchervision/tool_taxonomy_WFO.py +14 -5
vouchervision/utils_LLM.py +1 -4
vouchervision/utils_LLM_JSON_validation.py +2 -1
vouchervision/utils_VoucherVision.py +23 -17
vouchervision/utils_VoucherVision_parallel.py +1022 -0
vouchervision/vouchervision_main.py +1 -0
vouchervision/vouchervision_test_all_options_analysis.py +103 -0
vouchervision/vouchervision_test_all_options_recipes.py +170 -0

.gitignore CHANGED Viewed

@@ -7,15 +7,27 @@ yolov8x-pose.pt
 yolov8n.pt
 *PRIVATE_DATA*
 # Prompts
 /custom_prompts/*
 !/custom_prompts/SLTPvA_long.yaml
 !/custom_prompts/SLTPvA_medium.yaml
 !/custom_prompts/SLTPvA_short.yaml
 # Dirs
 custom_prompts_deprecated/
 demo/demo_output/*
 demo/demo_configs/*
 uploads/*
 uploads_small/*
@@ -59,6 +71,9 @@ vouchervision/component_detector/runs/
 vouchervision/component_detector/architecture/
 vouchervision/component_detector/yolov5x6.pt
 vouchervision/instructor-xl/
 vouchervision/instructor-embedding/

 yolov8n.pt
 *PRIVATE_DATA*
+vouchervision/LLM_MistralAI_Azure_endpoints.py
 # Prompts
 /custom_prompts/*
 !/custom_prompts/SLTPvA_long.yaml
 !/custom_prompts/SLTPvA_medium.yaml
 !/custom_prompts/SLTPvA_short.yaml
+!/custom_prompts/SLTPvB_long.yaml
+!/custom_prompts/SLTPvB_medium.yaml
+!/custom_prompts/SLTPvB_short.yaml
 # Dirs
 custom_prompts_deprecated/
 demo/demo_output/*
+demo/validation_images_repeat/
+demo/validation_json/
+demo/validation_figs/
+demo/validation_output/
+demo/validation_xlsx/
 demo/demo_configs/*
 uploads/*
 uploads_small/*
 vouchervision/component_detector/architecture/
 vouchervision/component_detector/yolov5x6.pt
+vouchervision/vouchervision_test_all_options.py
+vouchervision/prompt_arena.py
 vouchervision/instructor-xl/
 vouchervision/instructor-embedding/

.streamlit/config.toml CHANGED Viewed

@@ -1,5 +1,11 @@
 [theme]
-    primaryColor = "#00ff00"
     backgroundColor="#1a1a1a"
     secondaryBackgroundColor="#303030"
-    textColor = "cccccc"

 [theme]
+    primaryColor = "#16a616"
     backgroundColor="#1a1a1a"
     secondaryBackgroundColor="#303030"
+    textColor = "cccccc"
+    [server]
+    enableStaticServing = false
+    runOnSave = true
+    port = 8524
+    maxUploadSize = 5000

api_status.yaml CHANGED Viewed

@@ -1,10 +1,12 @@
-date: January 26, 2024
 missing_keys: []
 present_keys:
-- Google OCR (Valid)
 - OpenAI (Valid)
 - Azure OpenAI (Valid)
 - Palm2 (Valid)
 - Gemini (Valid)
 - Mistral (Valid)
 - HERE Geocode (Valid)

+date: February 29, 2024
 missing_keys: []
 present_keys:
+- Google OCR Print (Valid)
+- Google OCR Handwriting (Valid)
 - OpenAI (Valid)
 - Azure OpenAI (Valid)
 - Palm2 (Valid)
+- Palm2 LangChain (Valid)
 - Gemini (Valid)
 - Mistral (Valid)
 - HERE Geocode (Valid)

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ import pandas as pd
 from io import BytesIO
 from streamlit_extras.let_it_rain import rain
 from annotated_text import annotated_text
-from transformers import AutoConfig
 from vouchervision.LeafMachine2_Config_Builder import write_config_file
 from vouchervision.VoucherVision_Config_Builder import build_VV_config, TestOptionsGPT, TestOptionsPalm, check_if_usable
@@ -18,6 +17,7 @@ from vouchervision.API_validation import APIvalidation
 from vouchervision.utils_hf import setup_streamlit_config, save_uploaded_file, save_uploaded_local, save_uploaded_file_local
 from vouchervision.data_project import convert_pdf_to_jpg
 from vouchervision.utils_LLM import check_system_gpus
 import cProfile
 import pstats
@@ -250,14 +250,25 @@ def load_gallery(converted_files, uploaded_file):
             file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], uploaded_file, img)
             st.session_state['input_list_small'].append(file_path_small)
 @st.cache_data
 def handle_image_upload_and_gallery_hf(uploaded_files):
     if uploaded_files:
         # Clear input image gallery and input list
         clear_image_uploads()
         ind_small = 0
         for uploaded_file in uploaded_files:
             # Determine the file type
             if uploaded_file.name.lower().endswith('.pdf'):
                 # Handle PDF files
@@ -305,6 +316,8 @@ def handle_image_upload_and_gallery_hf(uploaded_files):
             # If there are less than 100 images, take them all
             images_to_display = st.session_state['input_list_small']
         show_gallery_small_hf(images_to_display)
 @st.cache_data
@@ -378,7 +391,7 @@ def content_input_images(col_left, col_right):
     with col_right:
         if st.session_state.is_hf:
-            handle_image_upload_and_gallery_hf(uploaded_files)
         else:
             st.session_state['view_local_gallery'] = st.toggle("View Image Gallery",)
@@ -427,7 +440,8 @@ def count_jpg_images(directory_path):
 def create_download_button(zip_filepath, col, key):
     with col:
-        labal_n_images = f"Download Results for {st.session_state['processing_add_on']} Images"
         with open(zip_filepath, 'rb') as f:
             bytes_io = BytesIO(f.read())
         st.download_button(
@@ -1067,6 +1081,11 @@ def create_private_file():
                 "client_x509_cert_url": "A LONG URL",
                 "universe_domain": "googleapis.com"
                 })
         google_application_credentials = st.text_input(label = 'Full path to Google Cloud JSON API key file', value = cfg_private['google'].get('GOOGLE_APPLICATION_CREDENTIALS', ''),
                                                 placeholder = 'e.g. C:/Documents/Secret_Files/google_API/application_default_credentials.json',
                                                 help ="This API Key is in the form of a JSON file. Please save the JSON file in a safe directory. DO NOT store the JSON key inside of the VoucherVision directory.",
@@ -1127,7 +1146,7 @@ def create_private_file():
         st.write("---")
         st.subheader("MistralAI")
-        st.markdown('Follow these [instructions](https://platform.here.com/sign-up?step=verify-identity) to generate an API key for HERE.')
         mistral_API_KEY = st.text_input("MistralAI API Key", cfg_private['mistral'].get('MISTRAL_API_KEY', ''),
                                                  help='e.g. a 32-character string',
                                                  placeholder='e.g. SATgthsykuE64FgrrrrEervr3S4455t_geyDeGq',
@@ -1360,7 +1379,7 @@ def get_all_cost_tables():
             cost_openai[key] = cost_data.get(value,'')
         elif 'PALM2' in parts or 'GEMINI' in parts:
             cost_google[key] = cost_data.get(value,'')
-        elif 'MISTRAL' in parts:
             cost_mistral[key] = cost_data.get(value,'')
     styled_cost_openai = convert_cost_dict_to_table(cost_openai, "OpenAI")
@@ -1403,9 +1422,9 @@ def content_header():
         N_STEPS = 6
         if check_if_usable(is_hf=st.session_state['is_hf']):
-            b_text = f"Start Processing {st.session_state['processing_add_on']} Images" if st.session_state['processing_add_on'] > 1 else f"Start Processing {st.session_state['processing_add_on']} Image"
-            if st.session_state['processing_add_on'] == 0:
-                b_text = f"Start Processing"
             if st.button(b_text, type='primary',use_container_width=True):
                 st.session_state['formatted_json'] = {}
                 st.session_state['formatted_json_WFO'] = {}
@@ -1466,7 +1485,7 @@ def content_header():
             if st.session_state['zip_filepath']:
                 create_download_button(st.session_state['zip_filepath'], col_run_1,key=97863332)
         else:
-            st.button("Start Processing", type='primary', disabled=True)
             with col_run_4:
                 st.error(":heavy_exclamation_mark: Required API keys not set. Please visit the 'API Keys' tab and set the Google Vision OCR API key and at least one LLM key.")
@@ -1482,11 +1501,11 @@ def content_header():
         ct_left, ct_right = st.columns([1,1])
     with ct_left:
         st.button("Refresh", on_click=refresh, use_container_width=True)
-    # with ct_right:
-    #     try:
-    #         st.page_link(os.path.join("pages","faqs.py"), label="FAQs", icon="❔")
-    #     except:
-    #         st.page_link(os.path.join(os.path.dirname(__file__),"pages","faqs.py"), label="FAQs", icon="❔")
@@ -1687,12 +1706,12 @@ def content_prompt_and_llm_version():
                 selected_version = default_version
             st.session_state.config['leafmachine']['project']['prompt_version'] = st.selectbox("Prompt Version", available_prompts, index=available_prompts.index(selected_version),label_visibility='collapsed')
-    # with col_prompt_2:
-    #     # if st.button("Build Custom LLM Prompt"):
-    #     try:
-    #         st.page_link(os.path.join("pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")
-    #     except:
-    #         st.page_link(os.path.join(os.path.dirname(__file__),"pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")
     st.header('LLM Version')
@@ -1703,18 +1722,18 @@ def content_prompt_and_llm_version():
         st.session_state.config['leafmachine']['LLM_version'] = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(st.session_state.config['leafmachine'].get('LLM_version', ModelMaps.MODELS_GUI_DEFAULT)))
         st.markdown("""
 Based on preliminary results, the following models perform the best. We are currently running tests of all possible OCR + LLM + Prompt combinations to create recipes for different workflows.
-- `Mistral Medium`
-- `Mistral Small`
-- `Mistral Tiny`
 - `PaLM 2 text-bison@001`
 - `GPT 4 Turbo 1106-preview`
-- `GPT 3.5 Instruct`
 - `LOCAL Mixtral 7Bx8 Instruct`
 - `LOCAL Mixtral 7B Instruct`
 Larger models (e.g., `GPT 4`, `GPT 4 32k`, `Gemini Pro`) do not necessarily perform better for these tasks. MistralAI models exceeded our expectations and perform extremely well. PaLM 2 text-bison@001 also seems to consistently out-perform Gemini Pro.
-The `SLTPvA_short.yaml` prompt also seems to work better with smaller LLMs (e.g., Mistral Tiny). Alternatively, enable double OCR to help the LLM focus on the OCR text given a longer prompt.""")
 def content_api_check():
@@ -1927,6 +1946,8 @@ def content_ocr_method():
     #     st.text_area(label='Handwritten/Printed + trOCR',placeholder=demo_text_trh,disabled=True, label_visibility='visible', height=150)
 def is_valid_huggingface_model_path(model_path):
     try:
         # Attempt to load the model configuration from Hugging Face Model Hub
         config = AutoConfig.from_pretrained(model_path)

 from io import BytesIO
 from streamlit_extras.let_it_rain import rain
 from annotated_text import annotated_text
 from vouchervision.LeafMachine2_Config_Builder import write_config_file
 from vouchervision.VoucherVision_Config_Builder import build_VV_config, TestOptionsGPT, TestOptionsPalm, check_if_usable
 from vouchervision.utils_hf import setup_streamlit_config, save_uploaded_file, save_uploaded_local, save_uploaded_file_local
 from vouchervision.data_project import convert_pdf_to_jpg
 from vouchervision.utils_LLM import check_system_gpus
+from vouchervision.OCR_google_cloud_vision import check_for_inappropriate_content
 import cProfile
 import pstats
             file_path_small = save_uploaded_file(st.session_state['dir_uploaded_images_small'], uploaded_file, img)
             st.session_state['input_list_small'].append(file_path_small)
 @st.cache_data
 def handle_image_upload_and_gallery_hf(uploaded_files):
     if uploaded_files:
         # Clear input image gallery and input list
         clear_image_uploads()
         ind_small = 0
         for uploaded_file in uploaded_files:
+            if check_for_inappropriate_content(uploaded_file):
+                clear_image_uploads()
+                st.error("Warning: You have uploaded an inappropriate image")
+                return True
             # Determine the file type
             if uploaded_file.name.lower().endswith('.pdf'):
                 # Handle PDF files
             # If there are less than 100 images, take them all
             images_to_display = st.session_state['input_list_small']
         show_gallery_small_hf(images_to_display)
+    return False
 @st.cache_data
     with col_right:
         if st.session_state.is_hf:
+            result = handle_image_upload_and_gallery_hf(uploaded_files)
         else:
             st.session_state['view_local_gallery'] = st.toggle("View Image Gallery",)
 def create_download_button(zip_filepath, col, key):
     with col:
+        # labal_n_images = f"Download Results for {st.session_state['processing_add_on']} Images"
+        labal_n_images = f"Download Results"
         with open(zip_filepath, 'rb') as f:
             bytes_io = BytesIO(f.read())
         st.download_button(
                 "client_x509_cert_url": "A LONG URL",
                 "universe_domain": "googleapis.com"
                 })
+            blog_text('Google project ID', ': The project ID is the "project_id"  value from the JSON file.')
+            blog_text('Google project location', ': The project location specifies the location of the Google server that your project resources will utilize. It should not really make a difference which location you choose. We use `us-central1`, but you might want to choose a location closer to where you live. [please see this page for a list of available regions](https://cloud.google.com/vertex-ai/docs/general/locations)')
         google_application_credentials = st.text_input(label = 'Full path to Google Cloud JSON API key file', value = cfg_private['google'].get('GOOGLE_APPLICATION_CREDENTIALS', ''),
                                                 placeholder = 'e.g. C:/Documents/Secret_Files/google_API/application_default_credentials.json',
                                                 help ="This API Key is in the form of a JSON file. Please save the JSON file in a safe directory. DO NOT store the JSON key inside of the VoucherVision directory.",
         st.write("---")
         st.subheader("MistralAI")
+        st.markdown('Follow these [instructions](https://console.mistral.ai/) to generate an API key for MistralAI.')
         mistral_API_KEY = st.text_input("MistralAI API Key", cfg_private['mistral'].get('MISTRAL_API_KEY', ''),
                                                  help='e.g. a 32-character string',
                                                  placeholder='e.g. SATgthsykuE64FgrrrrEervr3S4455t_geyDeGq',
             cost_openai[key] = cost_data.get(value,'')
         elif 'PALM2' in parts or 'GEMINI' in parts:
             cost_google[key] = cost_data.get(value,'')
+        elif ('MISTRAL' in parts) or ('MIXTRAL' in parts):
             cost_mistral[key] = cost_data.get(value,'')
     styled_cost_openai = convert_cost_dict_to_table(cost_openai, "OpenAI")
         N_STEPS = 6
         if check_if_usable(is_hf=st.session_state['is_hf']):
+            # b_text = f"Start Processing {st.session_state['processing_add_on']} Images" if st.session_state['processing_add_on'] > 1 else f"Start Processing {st.session_state['processing_add_on']} Image"
+            # if st.session_state['processing_add_on'] == 0:
+            b_text = f"Start Transcription"
             if st.button(b_text, type='primary',use_container_width=True):
                 st.session_state['formatted_json'] = {}
                 st.session_state['formatted_json_WFO'] = {}
             if st.session_state['zip_filepath']:
                 create_download_button(st.session_state['zip_filepath'], col_run_1,key=97863332)
         else:
+            st.button("Start Transcription", type='primary', disabled=True)
             with col_run_4:
                 st.error(":heavy_exclamation_mark: Required API keys not set. Please visit the 'API Keys' tab and set the Google Vision OCR API key and at least one LLM key.")
         ct_left, ct_right = st.columns([1,1])
     with ct_left:
         st.button("Refresh", on_click=refresh, use_container_width=True)
+    with ct_right:
+        try:
+            st.page_link(os.path.join("pages","faqs.py"), label="FAQs", icon="❔")
+        except:
+            st.page_link(os.path.join(os.path.dirname(__file__),"pages","faqs.py"), label="FAQs", icon="❔")
                 selected_version = default_version
             st.session_state.config['leafmachine']['project']['prompt_version'] = st.selectbox("Prompt Version", available_prompts, index=available_prompts.index(selected_version),label_visibility='collapsed')
+    with col_prompt_2:
+        # if st.button("Build Custom LLM Prompt"):
+        try:
+            st.page_link(os.path.join("pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")
+        except:
+            st.page_link(os.path.join(os.path.dirname(__file__),"pages","prompt_builder.py"), label="Prompt Builder", icon="🚧")
     st.header('LLM Version')
         st.session_state.config['leafmachine']['LLM_version'] = st.selectbox("LLM version", GUI_MODEL_LIST, index=GUI_MODEL_LIST.index(st.session_state.config['leafmachine'].get('LLM_version', ModelMaps.MODELS_GUI_DEFAULT)))
         st.markdown("""
 Based on preliminary results, the following models perform the best. We are currently running tests of all possible OCR + LLM + Prompt combinations to create recipes for different workflows.
+- Any Mistral model e.g., `Mistral Large`
 - `PaLM 2 text-bison@001`
 - `GPT 4 Turbo 1106-preview`
+- `GPT 3.5 Turbo`
 - `LOCAL Mixtral 7Bx8 Instruct`
 - `LOCAL Mixtral 7B Instruct`
 Larger models (e.g., `GPT 4`, `GPT 4 32k`, `Gemini Pro`) do not necessarily perform better for these tasks. MistralAI models exceeded our expectations and perform extremely well. PaLM 2 text-bison@001 also seems to consistently out-perform Gemini Pro.
+The `SLTPvA_short.yaml` prompt also seems to work better with smaller LLMs (e.g., Mistral Tiny). Alternatively, enable double OCR to help the LLM focus on the OCR text given a longer prompt.
+Models `GPT 3.5 Turbo` and `GPT 4 Turbo 0125-preview` enable OpenAI's [JSON mode](https://platform.openai.com/docs/guides/text-generation/json-mode), which helps prevent JSON errors. All models implement Langchain JSON parsing too, so JSON errors are rare for most models.""")
 def content_api_check():
     #     st.text_area(label='Handwritten/Printed + trOCR',placeholder=demo_text_trh,disabled=True, label_visibility='visible', height=150)
 def is_valid_huggingface_model_path(model_path):
+    from transformers import AutoConfig
     try:
         # Attempt to load the model configuration from Hugging Face Model Hub
         config = AutoConfig.from_pretrained(model_path)

custom_prompts/SLTPvA_long.yaml CHANGED Viewed

@@ -28,10 +28,7 @@ rules:
   scientificNameAuthorship: The authorship information for the scientificName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
   genus: Taxonomic determination to genus. Genus must be capitalized. If
       genus is not present use the taxonomic family name followed by the word 'indet'.
-  subgenus: The full scientific name of the subgenus in which the taxon is classified.
-      Values should include the genus to avoid homonym confusion.
   specificEpithet: The name of the first or species epithet of the scientificName. Only include the species epithet.
-  infraspecificEpithet: The name of the lowest or terminal infraspecific epithet of the scientificName, excluding any rank designation.
   identifiedBy: A comma separated list of names of people, groups, or organizations who assigned the taxon to the subject organism. This is not the specimen collector.
   recordedBy: A comma separated list of names of people, groups, or organizations responsible for observing, recording, collecting, or presenting the original specimen.
       The primary collector or observer should be listed first.
@@ -63,7 +60,7 @@ rules:
       the exact origin or location of the specimen.
   degreeOfEstablishment: Cultivated plants are intentionally grown by humans. In text descriptions,
       look for planting dates, garden locations, ornamental, cultivar names, garden,
-      or farm to indicate cultivated plant. Use either - unknown or cultivated.
   decimalLatitude: Latitude decimal coordinate. Correct and convert the verbatim location coordinates to conform
       with the decimal degrees GPS coordinate format.
   decimalLongitude: Longitude decimal coordinate. Correct and convert the verbatim location coordinates to conform
@@ -78,35 +75,33 @@ rules:
       are explicit then convert from feet ("ft" or "ft." or "feet") to meters ("m"
       or "m." or "meters"). Round to integer.
 mapping:
-  TAXONOMY:
-  - catalogNumber
-  - order
-  - family
-  - scientificName
-  - scientificNameAuthorship
-  - genus
-  - subgenus
-  - specificEpithet
-  - infraspecificEpithet
-  GEOGRAPHY:
-  - country
-  - stateProvince
-  - county
-  - municipality
-  - decimalLatitude
-  - decimalLongitude
-  - verbatimCoordinates
-  LOCALITY:
-  - locality
-  - habitat
-  - minimumElevationInMeters
-  - maximumElevationInMeters
-  COLLECTING:
-  - identifiedBy
-  - recordedBy
-  - recordNumber
-  - verbatimEventDate
-  - eventDate
-  - degreeOfEstablishment
-  - occurrenceRemarks
-  MISC:

   scientificNameAuthorship: The authorship information for the scientificName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
   genus: Taxonomic determination to genus. Genus must be capitalized. If
       genus is not present use the taxonomic family name followed by the word 'indet'.
   specificEpithet: The name of the first or species epithet of the scientificName. Only include the species epithet.
   identifiedBy: A comma separated list of names of people, groups, or organizations who assigned the taxon to the subject organism. This is not the specimen collector.
   recordedBy: A comma separated list of names of people, groups, or organizations responsible for observing, recording, collecting, or presenting the original specimen.
       The primary collector or observer should be listed first.
       the exact origin or location of the specimen.
   degreeOfEstablishment: Cultivated plants are intentionally grown by humans. In text descriptions,
       look for planting dates, garden locations, ornamental, cultivar names, garden,
+      or farm to indicate cultivated plant. Set to 'cultivated' if cultivated, otherwise use an empty string.
   decimalLatitude: Latitude decimal coordinate. Correct and convert the verbatim location coordinates to conform
       with the decimal degrees GPS coordinate format.
   decimalLongitude: Longitude decimal coordinate. Correct and convert the verbatim location coordinates to conform
       are explicit then convert from feet ("ft" or "ft." or "feet") to meters ("m"
       or "m." or "meters"). Round to integer.
 mapping:
+    TAXONOMY:
+    - catalogNumber
+    - order
+    - family
+    - scientificName
+    - scientificNameAuthorship
+    - genus
+    - specificEpithet
+    GEOGRAPHY:
+    - country
+    - stateProvince
+    - county
+    - municipality
+    - decimalLatitude
+    - decimalLongitude
+    - verbatimCoordinates
+    LOCALITY:
+    - locality
+    - habitat
+    - minimumElevationInMeters
+    - maximumElevationInMeters
+    COLLECTING:
+    - identifiedBy
+    - recordedBy
+    - recordNumber
+    - verbatimEventDate
+    - eventDate
+    - degreeOfEstablishment
+    - occurrenceRemarks
+    MISC: []

custom_prompts/SLTPvA_medium.yaml CHANGED Viewed

@@ -27,9 +27,7 @@ rules:
       and any lower classifications.
   scientificNameAuthorship: The authorship information for the scientificName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
   genus: Taxonomic determination to genus. Genus must be capitalized.
-  subgenus: The full scientific name of the subgenus in which the taxon is classified.
   specificEpithet: The name of the first or species epithet of the scientificName. Only include the species epithet.
-  infraspecificEpithet: The name of the lowest or terminal infraspecific epithet of the scientificName, excluding any rank designation.
   identifiedBy: A comma separated list of names of people, groups, or organizations who assigned the taxon to the subject organism. This is not the specimen collector.
   recordedBy: A comma separated list of names of people, groups, or organizations
   recordNumber: An identifier given to the specimen at the time it was recorded.
@@ -46,7 +44,7 @@ rules:
       the exact origin or location of the specimen.
   degreeOfEstablishment: Cultivated plants are intentionally grown by humans. In text descriptions,
       look for planting dates, garden locations, ornamental, cultivar names, garden,
-      or farm to indicate cultivated plant. Use either - unknown or cultivated.
   decimalLatitude: Latitude decimal coordinate. Correct and convert the verbatim location coordinates to conform with the decimal degrees GPS coordinate format.
   decimalLongitude: Longitude decimal coordinate. Correct and convert the verbatim location coordinates to conform with the decimal degrees GPS coordinate format.
   verbatimCoordinates: Verbatim location coordinates as they appear on the label.
@@ -60,9 +58,7 @@ mapping:
   - scientificName
   - scientificNameAuthorship
   - genus
-  - subgenus
   - specificEpithet
-  - infraspecificEpithet
   GEOGRAPHY:
   - country
   - stateProvince
@@ -84,4 +80,4 @@ mapping:
   - eventDate
   - degreeOfEstablishment
   - occurrenceRemarks
-  MISC:

       and any lower classifications.
   scientificNameAuthorship: The authorship information for the scientificName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
   genus: Taxonomic determination to genus. Genus must be capitalized.
   specificEpithet: The name of the first or species epithet of the scientificName. Only include the species epithet.
   identifiedBy: A comma separated list of names of people, groups, or organizations who assigned the taxon to the subject organism. This is not the specimen collector.
   recordedBy: A comma separated list of names of people, groups, or organizations
   recordNumber: An identifier given to the specimen at the time it was recorded.
       the exact origin or location of the specimen.
   degreeOfEstablishment: Cultivated plants are intentionally grown by humans. In text descriptions,
       look for planting dates, garden locations, ornamental, cultivar names, garden,
+      or farm to indicate cultivated plant. Set to 'cultivated' if cultivated, otherwise use an empty string.
   decimalLatitude: Latitude decimal coordinate. Correct and convert the verbatim location coordinates to conform with the decimal degrees GPS coordinate format.
   decimalLongitude: Longitude decimal coordinate. Correct and convert the verbatim location coordinates to conform with the decimal degrees GPS coordinate format.
   verbatimCoordinates: Verbatim location coordinates as they appear on the label.
   - scientificName
   - scientificNameAuthorship
   - genus
   - specificEpithet
   GEOGRAPHY:
   - country
   - stateProvince
   - eventDate
   - degreeOfEstablishment
   - occurrenceRemarks
+  MISC: []

custom_prompts/SLTPvA_short.yaml CHANGED Viewed

@@ -26,9 +26,7 @@ rules:
   scientificName: scientific name of the taxon including Genus, specific epithet, and any lower classifications.
   scientificNameAuthorship: authorship information for the scientificName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
   genus: taxonomic determination to Genus, Genus must be capitalized.
-  subgenus: name of the subgenus.
   specificEpithet: The name of the first or species epithet of the scientificName. Only include the species epithet.
-  infraspecificEpithet: lowest or terminal infraspecific epithet of the scientificName.
   identifiedBy: list of names of people, doctors, professors, groups, or organizations who identified, determined the taxon name to the subject organism. This is not the specimen collector.
   recordedBy: list of names of people, doctors, professors, groups, or organizations.
   recordNumber: identifier given to the specimen at the time it was recorded.
@@ -41,7 +39,7 @@ rules:
   county: county, shire, department, parish etc.
   municipality: city, municipality, etc.
   locality: description of geographic information aiding in pinpointing the exact origin or location of the specimen.
-  degreeOfEstablishment: cultivated plants are intentionally grown by humans. Use either - unknown or cultivated.
   decimalLatitude: latitude decimal coordinate.
   decimalLongitude: longitude decimal coordinate.
   verbatimCoordinates: verbatim location coordinates.
@@ -55,9 +53,7 @@ mapping:
   - scientificName
   - scientificNameAuthorship
   - genus
-  - subgenus
   - specificEpithet
-  - infraspecificEpithet
   GEOGRAPHY:
   - country
   - stateProvince
@@ -79,4 +75,4 @@ mapping:
   - eventDate
   - degreeOfEstablishment
   - occurrenceRemarks
-  MISC:

   scientificName: scientific name of the taxon including Genus, specific epithet, and any lower classifications.
   scientificNameAuthorship: authorship information for the scientificName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
   genus: taxonomic determination to Genus, Genus must be capitalized.
   specificEpithet: The name of the first or species epithet of the scientificName. Only include the species epithet.
   identifiedBy: list of names of people, doctors, professors, groups, or organizations who identified, determined the taxon name to the subject organism. This is not the specimen collector.
   recordedBy: list of names of people, doctors, professors, groups, or organizations.
   recordNumber: identifier given to the specimen at the time it was recorded.
   county: county, shire, department, parish etc.
   municipality: city, municipality, etc.
   locality: description of geographic information aiding in pinpointing the exact origin or location of the specimen.
+  degreeOfEstablishment: cultivated plants are intentionally grown by humans. Set to 'cultivated' if cultivated, otherwise use an empty string.
   decimalLatitude: latitude decimal coordinate.
   decimalLongitude: longitude decimal coordinate.
   verbatimCoordinates: verbatim location coordinates.
   - scientificName
   - scientificNameAuthorship
   - genus
   - specificEpithet
   GEOGRAPHY:
   - country
   - stateProvince
   - eventDate
   - degreeOfEstablishment
   - occurrenceRemarks
+  MISC: []

custom_prompts/SLTPvB_long.yaml ADDED Viewed

	@@ -0,0 +1,107 @@

+prompt_author: Will Weaver
+prompt_author_institution: University of Michigan
+prompt_name: SLTPvB_long
+prompt_version: v-1-0
+prompt_description: Prompt developed by the University of Michigan.
+    SLTPvB prompts all have standardized column headers (fields) that were chosen due to their reliability and prevalence in herbarium records.
+    All field descriptions are based on the official Darwin Core guidelines.
+    SLTPvB_long - The most verbose prompt option. Descriptions closely follow DwC guides. Detailed rules for the LLM to follow. Works best with double or triple OCR to increase attention back to the OCR (select 'use both OCR models' or 'handwritten + printed' along with trOCR).
+    SLTPvB_medium - Shorter version of _long.
+    SLTPvB_short - The least verbose possible prompt while still providing rules and DwC descriptions.
+LLM: General Purpose
+instructions: 1. Refactor the unstructured OCR text into a dictionary based on the JSON structure outlined below.
+  2. Map the unstructured OCR text to the appropriate JSON key and populate the field given the user-defined rules.
+  3. JSON key values are permitted to remain empty strings if the corresponding information is not found in the unstructured OCR text.
+  4. Duplicate dictionary fields are not allowed.
+  5. Ensure all JSON keys are in camel case.
+  6. Ensure new JSON field values follow sentence case capitalization.
+  7. Ensure all key-value pairs in the JSON dictionary strictly adhere to the format and data types specified in the template.
+  8. Ensure output JSON string is valid JSON format. It should not have trailing commas or unquoted keys.
+  9. Only return a JSON dictionary represented as a string. You should not explain your answer.
+json_formatting_instructions: This section provides rules for formatting each JSON value organized by the JSON key.
+rules:
+  catalogNumber: Barcode identifier, typically a number with at least 6 digits, but fewer than 30 digits.
+  order: The full scientific name of the order in which the taxon is classified. Order must be capitalized.
+  family: The full scientific name of the family in which the taxon is classified. Family must be capitalized.
+  speciesBinomialName: The scientific name of the taxon including genus, specific epithet,
+      and any lower classifications.
+  genus: Taxonomic determination to genus. Genus must be capitalized. If
+      genus is not present use the taxonomic family name followed by the word 'indet'.
+  specificEpithet: The name of the first or species epithet of the speciesBinomialName. Only include the species epithet.
+  speciesBinomialNameAuthorship: The authorship information for the speciesBinomialName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
+  collector: A comma separated list of names of people, groups, or organizations responsible for observing, recording, collecting, or presenting the original specimen.
+      The primary collector or observer should be listed first.
+  recordNumber: An identifier given to the occurrence at the time it was recorded.
+      Often serves as a link between field notes and an occurrence record, such as a specimen collector's number.
+  identifiedBy: A comma separated list of names of people, groups, or organizations who assigned the taxon to the subject organism. This is not the specimen collector.
+  verbatimCollectionDate: The verbatim original representation of the date and time information for when the specimen was collected.
+      Date of collection exactly as it appears on the label. Do not change
+      the format or correct typos.
+  collectionDate: Date the specimen was collected formatted as year-month-day, YYYY-MM-DD. If
+      specific components of the date are unknown, they should be replaced with
+      zeros. Examples "0000-00-00" if the entire date is unknown, "YYYY-00-00"
+      if only the year is known, and "YYYY-MM-00" if year and month are known
+      but day is not.
+  occurrenceRemarks: Text describing the specimen's geographic location. Text describing the appearance of the specimen.
+      A statement about the presence or absence of a taxon at the collection location.
+      Text describing the significance of the specimen, such as a specific expedition or notable collection.
+      Description of plant features such as leaf shape, size, color,
+      stem texture, height, flower structure, scent, fruit or seed characteristics,
+      root system type, overall growth habit and form, any notable aroma or secretions,
+      presence of hairs or bristles, and any other distinguishing morphological
+      or physiological characteristics.
+  habitat: A category or description of the habitat in which the specimen collection event occurred.
+  locality: Description of geographic location, landscape, landmarks, regional
+      features, nearby places, or any contextual information aiding in pinpointing
+      the exact origin or location of the specimen.
+  isCultivated: Cultivated plants are intentionally grown by humans. In text descriptions,
+      look for planting dates, garden locations, ornamental, cultivar names, garden,
+      or farm to indicate cultivated plant. Set to 'cultivated' if cultivated, otherwise use an empty string.
+  country: The name of the country or major administrative unit in which the specimen was originally collected.
+  stateProvince: The name of the next smaller administrative region than country (state, province, canton, department, region, etc.) in which the specimen was originally collected.
+  county: The full, unabbreviated name of the next smaller administrative region than stateProvince (county, shire, department, parish etc.) in which the specimen was originally collected.
+  municipality: The full, unabbreviated name of the next smaller administrative region than county (city, municipality, etc.) in which the specimen was originally collected.
+  verbatimCoordinates: Verbatim location coordinates as they appear on the label. Do not
+      convert formats. Possible coordinate types include [Lat, Long, UTM, TRS].
+  decimalLatitude: Latitude decimal coordinate. Correct and convert the verbatim location coordinates to conform
+      with the decimal degrees GPS coordinate format.
+  decimalLongitude: Longitude decimal coordinate. Correct and convert the verbatim location coordinates to conform
+      with the decimal degrees GPS coordinate format.
+  minimumElevationInMeters: Minimum elevation or altitude in meters. Only if units are explicit
+      then convert from feet ("ft" or "ft." or "feet") to meters ("m" or "m." or
+      "meters"). Round to integer.
+  maximumElevationInMeters: Maximum elevation or altitude in meters. If only one elevation
+      is present, then maximumElevationInMeters should be set to the null_value. Only if units
+      are explicit then convert from feet ("ft" or "ft." or "feet") to meters ("m"
+      or "m." or "meters"). Round to integer.
+mapping:
+  TAXONOMY:
+  - catalogNumber
+  - order
+  - family
+  - speciesBinomialName
+  - genus
+  - specificEpithet
+  - speciesBinomialNameAuthorship
+  GEOGRAPHY:
+  - country
+  - stateProvince
+  - county
+  - municipality
+  - verbatimCoordinates
+  - decimalLatitude
+  - decimalLongitude
+  - minimumElevationInMeters
+  - maximumElevationInMeters
+  LOCALITY:
+  - occurrenceRemarks
+  - habitat
+  - locality
+  - isCultivated
+  COLLECTING:
+  - collector
+  - recordNumber
+  - identifiedBy
+  - verbatimCollectionDate
+  - collectionDate
+  MISC: []

custom_prompts/SLTPvB_medium.yaml ADDED Viewed

	@@ -0,0 +1,83 @@

+prompt_author: Will Weaver
+prompt_author_institution: University of Michigan
+prompt_name: SLTPvB_medium
+prompt_version: v-1-0
+prompt_description: Prompt developed by the University of Michigan.
+  SLTPvB prompts all have standardized column headers (fields) that were chosen due to their reliability and prevalence in herbarium records.
+  All field descriptions are based on the official Darwin Core guidelines.
+  SLTPvB_long - The most verbose prompt option. Descriptions closely follow DwC guides. Detailed rules for the LLM to follow. Works best with double or triple OCR to increase attention back to the OCR (select 'use both OCR models' or 'handwritten + printed' along with trOCR).
+  SLTPvB_medium - Shorter version of _long.
+  SLTPvB_short - The least verbose possible prompt while still providing rules and DwC descriptions.
+LLM: General Purpose
+instructions: 1. Refactor the unstructured OCR text into a dictionary based on the JSON structure outlined below.
+  2. Map the unstructured OCR text to the appropriate JSON key and populate the field given the user-defined rules.
+  3. JSON key values are permitted to remain empty strings if the corresponding information is not found in the unstructured OCR text.
+  4. Duplicate dictionary fields are not allowed.
+  5. Ensure all JSON keys are in camel case.
+  6. Ensure new JSON field values follow sentence case capitalization.
+  7. Ensure all key-value pairs in the JSON dictionary strictly adhere to the format and data types specified in the template.
+  8. Ensure output JSON string is valid JSON format. It should not have trailing commas or unquoted keys.
+  9. Only return a JSON dictionary represented as a string. You should not explain your answer.
+json_formatting_instructions: This section provides rules for formatting each JSON value organized by the JSON key.
+rules:
+  catalogNumber: Barcode identifier, typically a number with at least 6 digits, but fewer than 30 digits.
+  order: The full scientific name of the order in which the taxon is classified. Order must be capitalized.
+  family: The full scientific name of the family in which the taxon is classified. Family must be capitalized.
+  speciesBinomialName: The scientific name of the taxon including genus, specific epithet,
+      and any lower classifications.
+  genus: Taxonomic determination to genus. Genus must be capitalized.
+  specificEpithet: The name of the first or species epithet of the speciesBinomialName. Only include the species epithet.
+  speciesBinomialNameAuthorship: The authorship information for the speciesBinomialName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
+  collector: A comma separated list of names of people, groups, or organizations responsible for collecting the original specimen.
+  recordNumber: An identifier given to the specimen at the time it was recorded.
+  identifiedBy: A comma separated list of names of people, groups, or organizations who assigned the taxon to the subject organism. This is not the specimen collector.
+  verbatimCollectionDate: The verbatim original representation of the date and time information for when the specimen was collected.
+  collectionDate: Date the specimen was collected formatted as year-month-day YYYY-MM-DD.
+  occurrenceRemarks: Text describing the specimen's geographic location, appearance of the specimen, presence or absence of a taxon at the collection location, the significance of the specimen, such as a specific expedition or notable collection, plant features and descriptions.
+  habitat: A category or description of the habitat in which the specimen collection event occurred.
+  locality: Description of geographic location, landscape, landmarks, regional
+      features, nearby places, or any contextual information aiding in pinpointing
+      the exact origin or location of the specimen.
+  isCultivated: Cultivated plants are intentionally grown by humans. In text descriptions,
+      look for planting dates, garden locations, ornamental, cultivar names, garden,
+      or farm to indicate cultivated plant. Set to 'cultivated' if cultivated, otherwise use an empty string.
+  country: The name of the country or major administrative unit in which the specimen was originally collected.
+  stateProvince: The name of the next smaller administrative region than country (state, province, canton, department, region, etc.) in which the specimen was originally collected.
+  county: The full, unabbreviated name of the next smaller administrative region than stateProvince (county, shire, department, parish etc.) in which the specimen was originally collected.
+  municipality: The full, unabbreviated name of the next smaller administrative region than county (city, municipality, etc.) in which the specimen was originally collected.
+  verbatimCoordinates: Verbatim location coordinates as they appear on the label.
+  decimalLatitude: Latitude decimal coordinate. Correct and convert the verbatim location coordinates to conform with the decimal degrees GPS coordinate format.
+  decimalLongitude: Longitude decimal coordinate. Correct and convert the verbatim location coordinates to conform with the decimal degrees GPS coordinate format.
+  minimumElevationInMeters: Minimum elevation or altitude in meters. Only if units are explicit then convert from feet ("ft" or "ft." or "feet") to meters ("m" or "m." or "meters"). Round to integer.
+  maximumElevationInMeters: Maximum elevation or altitude in meters. If only one elevation is present, then maximumElevationInMeters should be set to the null_value. Only if units are explicit then convert from feet ("ft" or "ft." or "feet") to meters ("m" or "m." or "meters"). Round to integer.
+mapping:
+  TAXONOMY:
+  - catalogNumber
+  - order
+  - family
+  - speciesBinomialName
+  - genus
+  - specificEpithet
+  - speciesBinomialNameAuthorship
+  GEOGRAPHY:
+  - country
+  - stateProvince
+  - county
+  - municipality
+  - verbatimCoordinates
+  - decimalLatitude
+  - decimalLongitude
+  - minimumElevationInMeters
+  - maximumElevationInMeters
+  LOCALITY:
+  - occurrenceRemarks
+  - habitat
+  - locality
+  - isCultivated
+  COLLECTING:
+  - collector
+  - recordNumber
+  - identifiedBy
+  - verbatimCollectionDate
+  - collectionDate
+  MISC: []

custom_prompts/SLTPvB_short.yaml ADDED Viewed

	@@ -0,0 +1,78 @@

+prompt_author: Will Weaver
+prompt_author_institution: University of Michigan
+prompt_name: SLTPvB_short
+prompt_version: v-1-0
+prompt_description: Prompt developed by the University of Michigan.
+  SLTPvB prompts all have standardized column headers (fields) that were chosen due to their reliability and prevalence in herbarium records.
+  All field descriptions are based on the official Darwin Core guidelines.
+  SLTPvB_long - The most verbose prompt option. Descriptions closely follow DwC guides. Detailed rules for the LLM to follow. Works best with double or triple OCR to increase attention back to the OCR (select 'use both OCR models' or 'handwritten + printed' along with trOCR).
+  SLTPvB_medium - Shorter version of _long.
+  SLTPvB_short - The least verbose possible prompt while still providing rules and DwC descriptions.
+LLM: General Purpose
+instructions: 1. Refactor the unstructured OCR text into a dictionary based on the JSON structure outlined below.
+  2. Map the unstructured OCR text to the appropriate JSON key and populate the field given the user-defined rules.
+  3. JSON key values are permitted to remain empty strings if the corresponding information is not found in the unstructured OCR text.
+  4. Duplicate dictionary fields are not allowed.
+  5. Ensure all JSON keys are in camel case.
+  6. Ensure new JSON field values follow sentence case capitalization.
+  7. Ensure all key-value pairs in the JSON dictionary strictly adhere to the format and data types specified in the template.
+  8. Ensure output JSON string is valid JSON format. It should not have trailing commas or unquoted keys.
+  9. Only return a JSON dictionary represented as a string. You should not explain your answer.
+json_formatting_instructions: This section provides rules for formatting each JSON value organized by the JSON key.
+rules:
+  catalogNumber: barcode identifier, at least 6 digits, fewer than 30 digits.
+  order: full scientific name of the Order in which the taxon is classified. Order must be capitalized.
+  family: full scientific name of the Family in which the taxon is classified. Family must be capitalized.
+  speciesBinomialName: scientific name of the taxon including Genus, specific epithet, and any lower classifications.
+  genus: taxonomic determination to Genus, Genus must be capitalized.
+  specificEpithet: The name of the first or species epithet of the speciesBinomialName. Only include the species epithet.
+  speciesBinomialNameAuthorship: authorship information for the speciesBinomialName formatted according to the conventions of the applicable Darwin Core nomenclaturalCode.
+  collector: list of names of people, doctors, professors, groups, or organizations.
+  recordNumber: identifier given to the specimen at the time it was recorded.
+  identifiedBy: list of names of people, doctors, professors, groups, or organizations who identified, determined the taxon name to the subject organism. This is not the specimen collector.
+  verbatimCollectionDate: The verbatim original representation of the date and time information for when the specimen was collected.
+  collectionDate: collection date formatted as year-month-day YYYY-MM-DD.
+  occurrenceRemarks: all descriptive text in the OCR rearranged into sensible sentences or sentence fragments.
+  habitat: habitat description.
+  locality: description of geographic information aiding in pinpointing the exact origin or location of the specimen.
+  isCultivated: cultivated plants are intentionally grown by humans. Set to 'cultivated' if cultivated, otherwise use an empty string.
+  country: country or major administrative unit.
+  stateProvince: state, province, canton, department, region, etc.
+  county: county, shire, department, parish etc.
+  municipality: city, municipality, etc.
+  verbatimCoordinates: verbatim location coordinates.
+  decimalLatitude: latitude decimal coordinate.
+  decimalLongitude: longitude decimal coordinate.
+  minimumElevationInMeters: minimum elevation or altitude in meters.
+  maximumElevationInMeters: maximum elevation or altitude in meters.
+mapping:
+  TAXONOMY:
+  - catalogNumber
+  - order
+  - family
+  - speciesBinomialName
+  - genus
+  - specificEpithet
+  - speciesBinomialNameAuthorship
+  GEOGRAPHY:
+  - country
+  - stateProvince
+  - county
+  - municipality
+  - verbatimCoordinates
+  - decimalLatitude
+  - decimalLongitude
+  - minimumElevationInMeters
+  - maximumElevationInMeters
+  LOCALITY:
+  - occurrenceRemarks
+  - habitat
+  - locality
+  - isCultivated
+  COLLECTING:
+  - collector
+  - recordNumber
+  - identifiedBy
+  - verbatimCollectionDate
+  - collectionDate
+  MISC: []

pages/prompt_builder.py CHANGED Viewed

@@ -76,7 +76,9 @@ def load_prompt_yaml(filename):
         st.session_state['mapping'] = st.session_state['prompt_info'].get('mapping', {})
         st.session_state['LLM'] = st.session_state['prompt_info'].get('LLM', 'General Purpose')
-        # Placeholder:
         st.session_state['assigned_columns'] = list(chain.from_iterable(st.session_state['mapping'].values()))

         st.session_state['mapping'] = st.session_state['prompt_info'].get('mapping', {})
         st.session_state['LLM'] = st.session_state['prompt_info'].get('LLM', 'General Purpose')
+        # print(st.session_state['mapping'].values())
+        # print(chain.from_iterable(st.session_state['mapping'].values()))
+        # print(list(chain.from_iterable(st.session_state['mapping'].values())))
         st.session_state['assigned_columns'] = list(chain.from_iterable(st.session_state['mapping'].values()))

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

requirements_conda.txt ADDED Viewed

Binary file (1.97 kB). View file

requirements_with_versions.txt ADDED Viewed

Binary file (11.1 kB). View file

run_VoucherVision.py CHANGED Viewed

@@ -31,7 +31,7 @@ def resolve_path(path):
 if __name__ == "__main__":
     dir_home = os.path.dirname(__file__)
-    start_port = 8528
     try:
         free_port = find_available_port(start_port)
         sys.argv = [
@@ -42,6 +42,7 @@ if __name__ == "__main__":
             "--global.developmentMode=false",
             # "--server.port=8545",
             f"--server.port={free_port}",
             # Toggle below for HF vs Local
             # "--is_hf=1",
             # "--is_hf=0",

 if __name__ == "__main__":
     dir_home = os.path.dirname(__file__)
+    start_port = 8530
     try:
         free_port = find_available_port(start_port)
         sys.argv = [
             "--global.developmentMode=false",
             # "--server.port=8545",
             f"--server.port={free_port}",
+            f"--server.maxUploadSize=51200",
             # Toggle below for HF vs Local
             # "--is_hf=1",
             # "--is_hf=0",

vouchervision/LLM_GoogleGemini.py CHANGED Viewed

@@ -20,7 +20,7 @@ class GoogleGeminiHandler:
     VENDOR = 'google'
     STARTING_TEMP = 0.5
-    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
@@ -30,10 +30,8 @@ class GoogleGeminiHandler:
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
-        self.starting_temp = float(self.STARTING_TEMP)
-        self.temp_increment = float(0.2)
-        self.adjust_temp = self.starting_temp
         self.monitor = SystemLoadMonitor(logger)
         self.parser = JsonOutputParser()
@@ -50,11 +48,24 @@ class GoogleGeminiHandler:
     def _set_config(self):
         # os.environ['GOOGLE_API_KEY'] # Must be set too for the retry call, set in VoucherVision class along with other API Keys
         # vertexai.init(project=os.environ['PALM_PROJECT_ID'], location=os.environ['PALM_LOCATION'])
-        self.config = {
                 "max_output_tokens": 1024,
                 "temperature": self.starting_temp,
-                "top_p": 1
             }
         self.safety_settings = {
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
             HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
@@ -65,22 +76,26 @@ class GoogleGeminiHandler:
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
-        self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp
     def _reset_config(self):
-        self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
     def _build_model_chain_parser(self):
         # Instantiate the LLM class for Google Gemini
-        self.llm_model = ChatGoogleGenerativeAI(model=self.model_name)#,
-                                    # max_output_tokens=self.config.get('max_output_tokens'),
-                                    # top_p=self.config.get('top_p'))
         # self.llm_model = VertexAI(model='gemini-1.0-pro',
         #                           max_output_tokens=self.config.get('max_output_tokens'),
         #                           top_p=self.config.get('top_p'))
@@ -101,7 +116,8 @@ class GoogleGeminiHandler:
     def call_llm_api_GoogleGemini(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
-        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
@@ -110,9 +126,9 @@ class GoogleGeminiHandler:
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
-                model_kwargs = {"temperature": self.adjust_temp}
                 # Invoke the chain to generate prompt text
-                response = self.chain.invoke({"query": prompt_template, "model_kwargs": model_kwargs})
                 # Use retry_parser to parse the response with retry logic
                 output = self.retry_parser.parse_with_prompt(response, prompt_value=prompt_template)
@@ -131,7 +147,8 @@ class GoogleGeminiHandler:
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
-                        json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
@@ -143,7 +160,8 @@ class GoogleGeminiHandler:
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
-                        json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
@@ -153,14 +171,16 @@ class GoogleGeminiHandler:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
-        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
-        json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

     VENDOR = 'google'
     STARTING_TEMP = 0.5
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
+        self.config_vals_for_permutation = config_vals_for_permutation
         self.monitor = SystemLoadMonitor(logger)
         self.parser = JsonOutputParser()
     def _set_config(self):
         # os.environ['GOOGLE_API_KEY'] # Must be set too for the retry call, set in VoucherVision class along with other API Keys
         # vertexai.init(project=os.environ['PALM_PROJECT_ID'], location=os.environ['PALM_LOCATION'])
+        if self.config_vals_for_permutation:
+            self.starting_temp = float(self.config_vals_for_permutation.get('google').get('temperature'))
+            self.config = {
+                    'max_output_tokens': self.config_vals_for_permutation.get('google').get('max_output_tokens'),
+                    'temperature': self.starting_temp,
+                    'top_p': self.config_vals_for_permutation.get('google').get('top_p'),
+                    }
+        else:
+            self.starting_temp = float(self.STARTING_TEMP)
+            self.config = {
                 "max_output_tokens": 1024,
                 "temperature": self.starting_temp,
+                "top_p": 1.0,
             }
+        self.temp_increment = float(0.2)
+        self.adjust_temp = self.starting_temp
         self.safety_settings = {
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
             HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp
     def _reset_config(self):
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
     def _build_model_chain_parser(self):
         # Instantiate the LLM class for Google Gemini
+        self.llm_model = ChatGoogleGenerativeAI(model=self.model_name,
+                                    max_output_tokens=self.config.get('max_output_tokens'),
+                                    top_p=self.config.get('top_p'),
+                                    temperature=self.config.get('temperature')
+                                    )
         # self.llm_model = VertexAI(model='gemini-1.0-pro',
         #                           max_output_tokens=self.config.get('max_output_tokens'),
         #                           top_p=self.config.get('top_p'))
     def call_llm_api_GoogleGemini(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
+                # model_kwargs = {"temperature": self.adjust_temp}
                 # Invoke the chain to generate prompt text
+                response = self.chain.invoke({"query": prompt_template})#, "model_kwargs": model_kwargs})
                 # Use retry_parser to parse the response with retry logic
                 output = self.retry_parser.parse_with_prompt(response, prompt_value=prompt_template)
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
+        if self.json_report:
+            self.json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

vouchervision/LLM_GooglePalm2.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import os, time, json
 # import vertexai
 from vertexai.language_models import TextGenerationModel
 from vertexai.generative_models._generative_models import HarmCategory, HarmBlockThreshold
@@ -10,6 +10,8 @@ from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 # from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_google_vertexai import VertexAI
 from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
@@ -31,7 +33,7 @@ class GooglePalm2Handler:
     VENDOR = 'google'
     STARTING_TEMP = 0.5
-    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
@@ -41,9 +43,9 @@ class GooglePalm2Handler:
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
-        self.starting_temp = float(self.STARTING_TEMP)
-        self.temp_increment = float(0.2)
-        self.adjust_temp = self.starting_temp
         self.monitor = SystemLoadMonitor(logger)
@@ -59,12 +61,26 @@ class GooglePalm2Handler:
     def _set_config(self):
         # vertexai.init(project=os.environ['PALM_PROJECT_ID'], location=os.environ['PALM_LOCATION'])
-        self.config = {
                 "max_output_tokens": 1024,
                 "temperature": self.starting_temp,
                 "top_p": 1.0,
-                "top_k": 40,
             }
         self.safety_settings = {
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
             HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
@@ -75,13 +91,15 @@ class GooglePalm2Handler:
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
-        self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp
     def _reset_config(self):
-        self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
@@ -89,7 +107,11 @@ class GooglePalm2Handler:
     def _build_model_chain_parser(self):
         # Instantiate the parser and the retry parser
         # self.llm_model = ChatGoogleGenerativeAI(model=self.model_name)
-        self.llm_model = VertexAI(model=self.model_name)
         self.retry_parser = RetryWithErrorOutputParser.from_llm(
                                                 parser=self.parser,
@@ -105,6 +127,7 @@ class GooglePalm2Handler:
         response = model.predict(prompt_text.text,
                                 max_output_tokens=self.config.get('max_output_tokens'),
                                 temperature=self.config.get('temperature'),
                                 top_p=self.config.get('top_p'))
         # model = GenerativeModel(self.model_name)
@@ -115,7 +138,8 @@ class GooglePalm2Handler:
     def call_llm_api_GooglePalm2(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
-        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
@@ -124,12 +148,23 @@ class GooglePalm2Handler:
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
-                model_kwargs = {"temperature": self.adjust_temp}
                 # Invoke the chain to generate prompt text
-                response = self.chain.invoke({"query": prompt_template, "model_kwargs": model_kwargs})
                 # Use retry_parser to parse the response with retry logic
-                output = self.retry_parser.parse_with_prompt(response, prompt_value=prompt_template)
                 if output is None:
                     self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response}')
@@ -144,8 +179,9 @@ class GooglePalm2Handler:
                         self._adjust_config()
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
-                        json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
@@ -157,7 +193,8 @@ class GooglePalm2Handler:
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
-                        json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
@@ -167,11 +204,19 @@ class GooglePalm2Handler:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
-        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
-        json_report.set_text(text_main=f'LLM call failed')
-        return None, nt_in, nt_out, None, None, usage_report

+import os, time, json, typing
 # import vertexai
 from vertexai.language_models import TextGenerationModel
 from vertexai.generative_models._generative_models import HarmCategory, HarmBlockThreshold
 from langchain_core.output_parsers import JsonOutputParser
 # from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_google_vertexai import VertexAI
+from langchain_core.messages import BaseMessage, HumanMessage
+from langchain_core.prompt_values import PromptValue as BasePromptValue
 from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
     VENDOR = 'google'
     STARTING_TEMP = 0.5
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
+        self.config_vals_for_permutation = config_vals_for_permutation
         self.monitor = SystemLoadMonitor(logger)
     def _set_config(self):
         # vertexai.init(project=os.environ['PALM_PROJECT_ID'], location=os.environ['PALM_LOCATION'])
+        if self.config_vals_for_permutation:
+            self.starting_temp = float(self.config_vals_for_permutation.get('google').get('temperature'))
+            self.config = {
+                    'max_output_tokens': self.config_vals_for_permutation.get('google').get('max_output_tokens'),
+                    'temperature': self.starting_temp,
+                    'top_k': self.config_vals_for_permutation.get('google').get('top_k'),
+                    'top_p': self.config_vals_for_permutation.get('google').get('top_p'),
+                    }
+        else:
+            self.starting_temp = float(self.STARTING_TEMP)
+            self.config = {
                 "max_output_tokens": 1024,
                 "temperature": self.starting_temp,
+                "top_k": 1,
                 "top_p": 1.0,
             }
+        self.temp_increment = float(0.2)
+        self.adjust_temp = self.starting_temp
         self.safety_settings = {
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
             HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
     def _adjust_config(self):  # Raise the working temperature by one temp_increment step.
         new_temp = self.adjust_temp + self.temp_increment  # Computed up front; used only in the status and log messages below.
+        if self.json_report:  # Status reporting is skipped when json_report is falsy.
+            self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp  # Mirror the new value into the config dict. NOTE(review): no model/chain rebuild happens here - confirm the new temperature actually reaches the LLM.
     def _reset_config(self):
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
     def _build_model_chain_parser(self):
         # Instantiate the parser and the retry parser
         # self.llm_model = ChatGoogleGenerativeAI(model=self.model_name)
+        self.llm_model = VertexAI(model=self.model_name,
+                                  max_output_tokens=self.config.get('max_output_tokens'),
+                                  temperature=self.config.get('temperature'),
+                                  top_k=self.config.get('top_k'),
+                                  top_p=self.config.get('top_p'))
         self.retry_parser = RetryWithErrorOutputParser.from_llm(
                                                 parser=self.parser,
         response = model.predict(prompt_text.text,
                                 max_output_tokens=self.config.get('max_output_tokens'),
                                 temperature=self.config.get('temperature'),
+                                top_k=self.config.get('top_k'),
                                 top_p=self.config.get('top_p'))
         # model = GenerativeModel(self.model_name)
     def call_llm_api_GooglePalm2(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
+        if json_report:
+            self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
+                # model_kwargs = {"temperature": self.adjust_temp}
                 # Invoke the chain to generate prompt text
+                response = self.chain.invoke({"query": prompt_template})#, "model_kwargs": model_kwargs})
                 # Use retry_parser to parse the response with retry logic
+                try:
+                    output = self.retry_parser.parse_with_prompt(response, prompt_value=PromptValue(prompt_template))
+                except:
+                    try:
+                        output = self.retry_parser.parse_with_prompt(response, prompt_value=prompt_template)
+                    except:
+                        try:
+                            output = json.loads(response)
+                        except Exception as e:
+                            print(e)
+                            output = None
                 if output is None:
                     self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response}')
                         self._adjust_config()
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
+        if self.json_report:
+            self.json_report.set_text(text_main=f'LLM call failed')
+        return None, nt_in, nt_out, None, None, usage_report
+class PromptValue(BasePromptValue):  # String-backed prompt value; passed as prompt_value to retry_parser.parse_with_prompt.
+    prompt_str: str  # The prompt text this value wraps.
+    def to_string(self) -> str:  # Return the wrapped prompt text unchanged.
+        return self.prompt_str  # NOTE(review): the base class may also require to_messages(), and the call site passes the prompt positionally - confirm instances can actually be constructed.

vouchervision/LLM_MistralAI.py CHANGED Viewed

@@ -11,12 +11,12 @@ from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys
 class MistralHandler:
     RETRY_DELAY = 2  # Wait 2 seconds before retrying
     MAX_RETRIES = 5  # Maximum number of retries
-    STARTING_TEMP = 0.1
     TOKENIZER_NAME = None
     VENDOR = 'mistral'
     RANDOM_SEED = 2023
-    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
@@ -27,10 +27,9 @@ class MistralHandler:
         self.has_GPU = torch.cuda.is_available()
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
-        self.starting_temp = float(self.STARTING_TEMP)
-        self.temp_increment = float(0.2)
-        self.adjust_temp = self.starting_temp
         # Set up a parser
         self.parser = JsonOutputParser()
@@ -44,25 +43,45 @@ class MistralHandler:
         self._set_config()
     def _set_config(self):
-        self.config = {'max_tokens': 1024,
                 'temperature': self.starting_temp,
                 'random_seed': self.RANDOM_SEED,
                 'safe_mode': False,
-                'top_p': 1,
-                }
         self._build_model_chain_parser()
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
         self.config['random_seed'] = random.randint(1, 1000)
-        self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp} and random_seed to {self.config.get("random_seed")}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp} and random_seed to {self.config.get("random_seed")}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp
     def _reset_config(self):
-        self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp} and random_seed to {self.RANDOM_SEED}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {self.starting_temp} and random_seed to {self.RANDOM_SEED}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
@@ -74,7 +93,9 @@ class MistralHandler:
                             model=self.model_name,
                             max_tokens=self.config.get('max_tokens'),
                             safe_mode=self.config.get('safe_mode'),
-                            top_p=self.config.get('top_p'))
         # Set up the retry parser with the runnable
         self.retry_parser = RetryWithErrorOutputParser.from_llm(parser=self.parser, llm=self.llm_model, max_retries=self.MAX_RETRIES)
@@ -85,7 +106,8 @@ class MistralHandler:
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
-        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
@@ -94,10 +116,10 @@ class MistralHandler:
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
-                model_kwargs = {"temperature": self.adjust_temp, "random_seed": self.config.get("random_seed")}
                 # Invoke the chain to generate prompt text
-                response = self.chain.invoke({"query": prompt_template, "model_kwargs": model_kwargs})
                 # Use retry_parser to parse the response with retry logic
                 output = self.retry_parser.parse_with_prompt(response.content, prompt_value=prompt_template)
@@ -115,8 +137,9 @@ class MistralHandler:
                         self._adjust_config()
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
-                        json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
@@ -128,7 +151,8 @@ class MistralHandler:
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
-                        json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
@@ -138,11 +162,13 @@ class MistralHandler:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
-        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
-        json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

 class MistralHandler:
     RETRY_DELAY = 2  # Wait 2 seconds before retrying
     MAX_RETRIES = 5  # Maximum number of retries
+    STARTING_TEMP = 0.5 #0.01
     TOKENIZER_NAME = None
     VENDOR = 'mistral'
     RANDOM_SEED = 2023
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
         self.has_GPU = torch.cuda.is_available()
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
+        self.config_vals_for_permutation = config_vals_for_permutation
         # Set up a parser
         self.parser = JsonOutputParser()
         self._set_config()
     def _set_config(self):  # Build self.config and the temperature bookkeeping, then (re)build the model chain.
+        if self.config_vals_for_permutation:  # Caller-supplied overrides: every value comes from its 'mistral' entry.
+            self.starting_temp = float(self.config_vals_for_permutation.get('mistral').get('temperature'))  # NOTE(review): raises if the 'mistral' entry or its 'temperature' is missing - confirm callers always provide both.
+            self.config = {
+                    'max_tokens': self.config_vals_for_permutation.get('mistral').get('max_tokens'),
+                    'temperature': self.starting_temp,
+                    'top_p': self.config_vals_for_permutation.get('mistral').get('top_p'),
+                    'top_k': self.config_vals_for_permutation.get('mistral').get('top_k'),
+                    'safe_mode': self.config_vals_for_permutation.get('mistral').get('safe_mode'),
+                    'random_seed': self.config_vals_for_permutation.get('mistral').get('random_seed'),
+                    }
+        else:  # No overrides: fall back to the class-level defaults.
+            self.starting_temp = float(self.STARTING_TEMP)
+            self.config = {
+                'max_tokens': 1024,
                 'temperature': self.starting_temp,
                 'random_seed': self.RANDOM_SEED,
                 'safe_mode': False,
+                'top_p': 0.5,
+                'top_k': 0.5,  # NOTE(review): top_k is usually an integer count - confirm 0.5 is intended and accepted by the client.
+            }
+        self.temp_increment = float(0.2)  # Step that _adjust_config adds to the temperature.
+        self.adjust_temp = self.starting_temp  # Working temperature starts at the configured baseline.
         self._build_model_chain_parser()  # Runs after self.config is populated, in both branches.
     def _adjust_config(self):  # Raise the working temperature by one step and pick a fresh random seed.
         new_temp = self.adjust_temp + self.temp_increment  # Computed up front; used only in the status and log messages below.
         self.config['random_seed'] = random.randint(1, 1000)  # New seed drawn from 1..1000 inclusive.
+        if self.json_report:  # Status reporting is skipped when json_report is falsy.
+            self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp} and random_seed to {self.config.get("random_seed")}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp} and random_seed to {self.config.get("random_seed")}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp  # Mirror the new value into the config dict. NOTE(review): no model/chain rebuild happens here - confirm the new settings actually reach the LLM.
     def _reset_config(self):
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp} and random_seed to {self.RANDOM_SEED}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {self.starting_temp} and random_seed to {self.RANDOM_SEED}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
                             model=self.model_name,
                             max_tokens=self.config.get('max_tokens'),
                             safe_mode=self.config.get('safe_mode'),
+                            top_p=self.config.get('top_p'),
+                            top_k=self.config.get('top_k'),
+                            )
         # Set up the retry parser with the runnable
         self.retry_parser = RetryWithErrorOutputParser.from_llm(parser=self.parser, llm=self.llm_model, max_retries=self.MAX_RETRIES)
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
+                # model_kwargs = {"temperature": self.adjust_temp, "random_seed": self.config.get("random_seed")}
                 # Invoke the chain to generate prompt text
+                response = self.chain.invoke({"query": prompt_template})#, "model_kwargs": model_kwargs})
                 # Use retry_parser to parse the response with retry logic
                 output = self.retry_parser.parse_with_prompt(response.content, prompt_value=prompt_template)
                         self._adjust_config()
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
+        if self.json_report:
+            self.json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

vouchervision/LLM_OpenAI.py CHANGED Viewed

@@ -11,11 +11,11 @@ from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys
 class OpenAIHandler:
     RETRY_DELAY = 10  # Wait 10 seconds before retrying
     MAX_RETRIES = 3  # Maximum number of retries
-    STARTING_TEMP = 0.5
     TOKENIZER_NAME = 'gpt-4'
     VENDOR = 'openai'
-    def __init__(self, cfg, logger, model_name, JSON_dict_structure, is_azure, llm_object):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
@@ -26,14 +26,13 @@ class OpenAIHandler:
         self.JSON_dict_structure = JSON_dict_structure
         self.is_azure = is_azure
         self.llm_object = llm_object
-        self.name_parts = self.model_name.split('-')
         self.monitor = SystemLoadMonitor(logger)
         self.has_GPU = torch.cuda.is_available()
-        self.starting_temp = float(self.STARTING_TEMP)
-        self.temp_increment = float(0.2)
-        self.adjust_temp = self.starting_temp
         # Set up a parser
         self.parser = JsonOutputParser()
@@ -45,12 +44,44 @@ class OpenAIHandler:
         )
         self._set_config()
     def _set_config(self):
-        self.config = {'max_new_tokens': 1024,
-                'temperature': self.starting_temp,
-                'random_seed': 2023,
-                'top_p': 1,
-                }
         # Adjusting the LLM settings based on whether Azure is used
         if self.is_azure:
             self.llm_object.deployment_name = self.model_name
@@ -68,43 +99,84 @@ class OpenAIHandler:
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
-        self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
-        self.config['temperature'] = self.adjust_temp
     def _reset_config(self):
-        self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
-        self.config['temperature'] = self.starting_temp
     def _build_model_chain_parser(self):
         if not self.is_azure and ('instruct' in self.name_parts):
             # Set up the retry parser with 3 retries
             self.retry_parser = RetryWithErrorOutputParser.from_llm(
-                # parser=self.parser, llm=self.llm_object if self.is_azure else OpenAI(temperature=self.config.get('temperature'), model=self.model_name), max_retries=self.MAX_RETRIES
-                parser=self.parser, llm=self.llm_object if self.is_azure else OpenAI(model=self.model_name), max_retries=self.MAX_RETRIES
             )
         else:
-            # Set up the retry parser with 3 retries
             self.retry_parser = RetryWithErrorOutputParser.from_llm(
-                # parser=self.parser, llm=self.llm_object if self.is_azure else ChatOpenAI(temperature=self.config.get('temperature'), model=self.model_name), max_retries=self.MAX_RETRIES
-                parser=self.parser, llm=self.llm_object if self.is_azure else ChatOpenAI(model=self.model_name), max_retries=self.MAX_RETRIES
             )
         # Prepare the chain
-        if not self.is_azure and ('instruct' in self.name_parts):
-            # self.chain = self.prompt | (self.format_input_for_azure if self.is_azure else OpenAI(temperature=self.config.get('temperature'), model=self.model_name))
-            self.chain = self.prompt | (self.format_input_for_azure if self.is_azure else OpenAI(model=self.model_name))
         else:
-            # self.chain = self.prompt | (self.format_input_for_azure if self.is_azure else ChatOpenAI(temperature=self.config.get('temperature'), model=self.model_name))
-            self.chain = self.prompt | (self.format_input_for_azure if self.is_azure else ChatOpenAI(model=self.model_name))
     def call_llm_api_OpenAI(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
-        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
@@ -113,14 +185,20 @@ class OpenAIHandler:
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
-                model_kwargs = {"temperature": self.adjust_temp}
                 # Invoke the chain to generate prompt text
-                response = self.chain.invoke({"query": prompt_template, "model_kwargs": model_kwargs})
                 response_text = response.content if not isinstance(response, str) else response
                 # Use retry_parser to parse the response with retry logic
-                output = self.retry_parser.parse_with_prompt(response_text, prompt_value=prompt_template)
                 if output is None:
                     self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response_text}')
@@ -136,14 +214,11 @@ class OpenAIHandler:
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
-                        json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
-                        # output1, WFO_record = validate_taxonomy_WFO(self.tool_WFO, output, replace_if_success_wfo=False)
-                        # output2, GEO_record = validate_coordinates_here(self.tool_GEO, output, replace_if_success_geo=False)
-                        # validate_wikipedia(self.tool_wikipedia, json_file_path_wiki, output)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
                         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
@@ -153,7 +228,8 @@ class OpenAIHandler:
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
-                        json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
@@ -163,11 +239,15 @@ class OpenAIHandler:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
-        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
-        json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

 class OpenAIHandler:
     RETRY_DELAY = 10  # Wait 10 seconds before retrying
     MAX_RETRIES = 3  # Maximum number of retries
+    STARTING_TEMP = 0.5 # 0.5, config_vals_for_permutation
     TOKENIZER_NAME = 'gpt-4'
     VENDOR = 'openai'
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure, is_azure, llm_object, config_vals_for_permutation):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
         self.JSON_dict_structure = JSON_dict_structure
         self.is_azure = is_azure
         self.llm_object = llm_object
+        self.name_parts = self.model_name.lower().split('-')
         self.monitor = SystemLoadMonitor(logger)
         self.has_GPU = torch.cuda.is_available()
+        ### Config
+        self.config_vals_for_permutation = config_vals_for_permutation
         # Set up a parser
         self.parser = JsonOutputParser()
         )
         self._set_config()
+    def _can_use_json_mode(self):  # Return True only for model names this handler treats as JSON-mode capable
+        if self.is_azure:  # Azure deployments are never given JSON mode by this check
+            return False
+        # gpt-4-turbo-preview (gpt-4-0125-preview)
+        if ('0125' in self.name_parts) and ('4' in self.name_parts):  # name_parts: lowercased model name split on '-'
+            return True
+        # gpt-3.5-turbo-0125
+        elif ('0125' in self.name_parts) and ('3.5' in self.name_parts) and ('turbo' in self.name_parts):
+            return True
+        else:  # every other model name is treated as not supporting JSON mode
+            return False
     def _set_config(self):  # Build self.model_kwargs and the temperature bookkeeping used by the retry logic
+        if self.config_vals_for_permutation:  # caller-supplied overrides take precedence over the class defaults
+            self.starting_temp = float(self.config_vals_for_permutation.get('openai').get('temperature'))  # NOTE(review): raises if the 'openai' section or 'temperature' is missing
+            self.model_kwargs = {
+                    'max_tokens': self.config_vals_for_permutation.get('openai').get('max_tokens'),
+                    'temperature': self.starting_temp,
+                    # 'seed': self.config_vals_for_permutation.get('openai').get('seed'),
+                    'top_p': self.config_vals_for_permutation.get('openai').get('top_p'),
+                    }
+        else:  # no overrides: fall back to STARTING_TEMP and fixed defaults
+            self.starting_temp = float(self.STARTING_TEMP)
+            self.model_kwargs = {
+                    'max_tokens': 1024,
+                    'temperature': self.starting_temp,
+                    # 'seed': 2023,
+                    'top_p': 1, # Set to 1, change temp only
+                    }
+        ### Not all openai models support json mode
+        if self._can_use_json_mode():
+            self.model_kwargs.update({"response_format": {"type": "json_object"}})  # only stored in model_kwargs here
+        self.temp_increment = float(0.2)  # step added to the temperature by _adjust_config
+        self.adjust_temp = self.starting_temp  # current working temperature; restored by _reset_config
         # Adjusting the LLM settings based on whether Azure is used
         if self.is_azure:
             self.llm_object.deployment_name = self.model_name  # Azure: point the supplied client at this deployment name
     def _adjust_config(self):  # Raise the working temperature by one increment and mirror it into model_kwargs
         new_temp = self.adjust_temp + self.temp_increment  # computed up front only for the status/log messages
+        if self.json_report:  # json_report may be falsy (e.g. None); skip the status update in that case
+            self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
+        self.model_kwargs['temperature'] = self.adjust_temp  # NOTE(review): updates the dict only; no LLM client is rebuilt in this method
     def _reset_config(self):  # Restore the working temperature to starting_temp and mirror it into model_kwargs
+        if self.json_report:  # json_report may be falsy (e.g. None); skip the status update in that case
+            self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
+        self.model_kwargs['temperature'] = self.starting_temp  # keep the kwargs dict in sync with adjust_temp
     def _build_model_chain_parser(self):  # Create self.retry_parser and self.chain for the selected model
         if not self.is_azure and ('instruct' in self.name_parts):  # non-Azure instruct models use the completion-style OpenAI client
+            # Determine the LLM to use based on whether this is an Azure instance
+            if self.is_azure:  # NOTE(review): unreachable, the enclosing condition already requires not self.is_azure
+                llm_to_use = self.llm_object
+            else:
+                llm_to_use = OpenAI(
+                    model=self.model_name,
+                    temperature=self.model_kwargs.get('temperature'),
+                    top_p=self.model_kwargs.get('top_p'),
+                    max_tokens=self.model_kwargs.get('max_tokens')
+                )
             # Set up the retry parser with 3 retries
             self.retry_parser = RetryWithErrorOutputParser.from_llm(
+                parser=self.parser,
+                llm=llm_to_use,
+                max_retries=self.MAX_RETRIES
             )
         else:
+            # Determine the LLM to use: Azure reuses the supplied llm_object, everything else gets a new ChatOpenAI client
+            if self.is_azure:
+                llm_to_use = self.llm_object
+                self.llm_object.temperature = self.model_kwargs.get('temperature')
+                self.llm_object.max_tokens = self.model_kwargs.get('max_tokens')
+                self.llm_object.model_kwargs = self.model_kwargs  # the whole kwargs dict is attached to the Azure client
+            else:
+                llm_to_use = ChatOpenAI(
+                    model=self.model_name,
+                    temperature=self.model_kwargs.get('temperature'),
+                    top_p=self.model_kwargs.get('top_p'),
+                    max_tokens=self.model_kwargs.get('max_tokens'),
+                )  # NOTE(review): any 'response_format' entry in model_kwargs is not passed to this constructor
+            # Set up the retry parser with 3 retries for other cases
             self.retry_parser = RetryWithErrorOutputParser.from_llm(
+                parser=self.parser,
+                llm=llm_to_use,
+                max_retries=self.MAX_RETRIES
             )
         # Prepare the chain
+        if self.is_azure:
+            chain_llm_to_use = self.format_input_for_azure  # defined outside this view; presumably wraps the Azure call, TODO confirm
         else:
+            if 'instruct' in self.name_parts:  # a second client is built for the chain, separate from llm_to_use above
+                chain_llm_to_use = OpenAI(
+                    model=self.model_name,
+                    temperature=self.model_kwargs.get('temperature'),
+                    top_p=self.model_kwargs.get('top_p'),
+                    max_tokens=self.model_kwargs.get('max_tokens')
+                )
+            else:
+                chain_llm_to_use = ChatOpenAI(
+                    model=self.model_name,
+                    temperature=self.model_kwargs.get('temperature'),
+                    top_p=self.model_kwargs.get('top_p'),
+                    max_tokens=self.model_kwargs.get('max_tokens')
+                )
+        self.chain = self.prompt | chain_llm_to_use  # self.prompt is assigned outside the lines shown here
     def call_llm_api_OpenAI(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
         nt_out = 0
         while ind < self.MAX_RETRIES:
             ind += 1
             try:
+                self.logger.info(str(self.model_kwargs))
                 # Invoke the chain to generate prompt text
+                response = self.chain.invoke(input={"query": prompt_template})#, **self.model_kwargs)# "model_kwargs": self.model_kwargs})
                 response_text = response.content if not isinstance(response, str) else response
                 # Use retry_parser to parse the response with retry logic
+                try:
+                    output = self.retry_parser.parse_with_prompt(response_text, prompt_value=prompt_template)
+                except:
+                    try:
+                        output = json.loads(response_text)
+                    except:
+                        output = None
                 if output is None:
                     self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response_text}')
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
                         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
                 time.sleep(self.RETRY_DELAY)
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
+        if self.json_report:
+            self.json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

vouchervision/LLM_local_MistralAI.py CHANGED Viewed

@@ -22,7 +22,7 @@ class LocalMistralHandler:
     VENDOR = 'mistral'
     MAX_GPU_MONITORING_INTERVAL = 2  # seconds
-    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
@@ -122,13 +122,15 @@ class LocalMistralHandler:
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
-        self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
     def _reset_config(self):
-        self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
@@ -153,7 +155,8 @@ class LocalMistralHandler:
     def call_llm_local_MistralAI(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
-        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
@@ -188,8 +191,9 @@ class LocalMistralHandler:
                         self._adjust_config()
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
-                        json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
@@ -201,7 +205,8 @@ class LocalMistralHandler:
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
-                        json_report.set_text(text_main=f'LLM call successful')
                         del results
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
@@ -210,11 +215,13 @@ class LocalMistralHandler:
                 self._adjust_config()
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
-        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
-        json_report.set_text(text_main=f'LLM call failed')
         self._reset_config()
         return None, nt_in, nt_out, None, None, usage_report

     VENDOR = 'mistral'
     MAX_GPU_MONITORING_INTERVAL = 2  # seconds
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
     def _reset_config(self):
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
     def call_llm_local_MistralAI(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
                         self._adjust_config()
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'LLM call successful')
                         del results
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
                 self._adjust_config()
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
+        if self.json_report:
+            self.json_report.set_text(text_main=f'LLM call failed')
         self._reset_config()
         return None, nt_in, nt_out, None, None, usage_report

vouchervision/LLM_local_cpu_MistralAI.py CHANGED Viewed

@@ -30,7 +30,7 @@ class LocalCPUMistralHandler:
     SEED = 2023
-    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
@@ -106,13 +106,15 @@ class LocalCPUMistralHandler:
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
-        self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp
     def _reset_config(self):
-        self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
@@ -140,7 +142,8 @@ class LocalCPUMistralHandler:
     def call_llm_local_cpu_MistralAI(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
-        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
@@ -180,7 +183,8 @@ class LocalCPUMistralHandler:
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
-                        json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
@@ -192,7 +196,8 @@ class LocalCPUMistralHandler:
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
-                        json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
@@ -200,13 +205,15 @@ class LocalCPUMistralHandler:
                 self._adjust_config()
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
-        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
-        json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

     SEED = 2023
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation):
         self.cfg = cfg
         self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
         self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
     def _adjust_config(self):
         new_temp = self.adjust_temp + self.temp_increment
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.logger.info(f'Incrementing temperature from {self.adjust_temp} to {new_temp}')
         self.adjust_temp += self.temp_increment
         self.config['temperature'] = self.adjust_temp
     def _reset_config(self):
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.logger.info(f'Resetting temperature from {self.adjust_temp} to {self.starting_temp}')
         self.adjust_temp = self.starting_temp
         self.config['temperature'] = self.starting_temp
     def call_llm_local_cpu_MistralAI(self, prompt_template, json_report, paths):
         _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
         self.json_report = json_report
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
         self.monitor.start_monitoring_usage()
         nt_in = 0
                     else:
                         self.monitor.stop_inference_timer() # Starts tool timer too
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                         output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
                         save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
                         if self.adjust_temp != self.starting_temp:
                             self._reset_config()
+                        if self.json_report:
+                            self.json_report.set_text(text_main=f'LLM call successful')
                         return output, nt_in, nt_out, WFO_record, GEO_record, usage_report
             except Exception as e:
                 self._adjust_config()
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
+        if self.json_report:
+            self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
         self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
+        if self.json_report:
+            self.json_report.set_text(text_main=f'LLM call failed')
         return None, nt_in, nt_out, None, None, usage_report

vouchervision/LM2_logger.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging, os, psutil, torch, platform, cpuinfo, yaml #py-cpuinfo
 from vouchervision.general_utils import get_datetime, print_main_warn, print_main_info
 class SanitizingFileHandler(logging.FileHandler):
@@ -17,7 +18,7 @@ def start_logging(Dirs, cfg):
     path_log = os.path.join(Dirs.path_log, '__'.join(['LM2-log', str(get_datetime()), run_name]) + '.log')
     # Disable default StreamHandler
-    logging.getLogger().handlers = []
     # create logger
     logger = logging.getLogger('Hardware Components')
@@ -27,20 +28,25 @@ def start_logging(Dirs, cfg):
     sanitizing_fh = SanitizingFileHandler(path_log, encoding='utf-8')
     sanitizing_fh.setLevel(logging.DEBUG)
     # create console handler and set level to debug
-    ch = logging.StreamHandler()
-    ch.setLevel(logging.DEBUG)
     # create formatter
     formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
     # add formatter to handlers
     sanitizing_fh.setFormatter(formatter)
-    ch.setFormatter(formatter)
     # add handlers to logger
     logger.addHandler(sanitizing_fh)
-    logger.addHandler(ch)
     # Create a logger for the file handler
     file_logger = logging.getLogger('file_logger')
@@ -110,6 +116,17 @@ def find_cpu_info():
         except:
             return "CPU: UNKNOWN"
 def LM2_banner():
         logo = """

 import logging, os, psutil, torch, platform, cpuinfo, yaml #py-cpuinfo
+from tqdm import tqdm
 from vouchervision.general_utils import get_datetime, print_main_warn, print_main_info
 class SanitizingFileHandler(logging.FileHandler):
     path_log = os.path.join(Dirs.path_log, '__'.join(['LM2-log', str(get_datetime()), run_name]) + '.log')
     # Disable default StreamHandler
+    logging.getLogger().handlers = []
     # create logger
     logger = logging.getLogger('Hardware Components')
     sanitizing_fh = SanitizingFileHandler(path_log, encoding='utf-8')
     sanitizing_fh.setLevel(logging.DEBUG)
+    tqdm_handler = TqdmLoggingHandler()
+    tqdm_handler.setLevel(logging.DEBUG)
     # create console handler and set level to debug
+    # ch = logging.StreamHandler()
+    # ch.setLevel(logging.DEBUG)
     # create formatter
     formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
     # add formatter to handlers
     sanitizing_fh.setFormatter(formatter)
+    tqdm_handler.setFormatter(formatter)
+    # ch.setFormatter(formatter)
     # add handlers to logger
     logger.addHandler(sanitizing_fh)
+    logger.addHandler(tqdm_handler)
+    # logger.addHandler(ch)
     # Create a logger for the file handler
     file_logger = logging.getLogger('file_logger')
         except:
             return "CPU: UNKNOWN"
+class TqdmLoggingHandler(logging.Handler):  # logging handler that emits formatted records through tqdm.write
+    def __init__(self, level=logging.NOTSET):  # level: minimum record level, forwarded to logging.Handler
+        super().__init__(level)
+    def emit(self, record):  # format the record and print it via tqdm
+        try:
+            msg = self.format(record)
+            tqdm.write(msg)  # Use tqdm's write function to ensure correct output
+            self.flush()
+        except Exception:  # delegate failures to logging's standard handleError hook
+            self.handleError(record)
 def LM2_banner():
         logo = """

vouchervision/OCR_google_cloud_vision.py CHANGED Viewed

@@ -123,8 +123,9 @@ class OCREngine:
             self.model_path = "liuhaotian/" + self.cfg['leafmachine']['project']['OCR_option_llava']
             self.model_quant = self.cfg['leafmachine']['project']['OCR_option_llava_bit']
-            self.json_report.set_text(text_main=f'Loading LLaVA model: {self.model_path} Quantization: {self.model_quant}')
             if self.model_quant == '4bit':
                 use_4bit = True
@@ -191,7 +192,8 @@ class OCREngine:
         # Process each detected text region
         for box in self.prediction_result["boxes"]:
             i+=1
-            self.json_report.set_text(text_main=f'Locating text using CRAFT --- {i}/{total_b}')
             vertices = [{"x": int(vertex[0]), "y": int(vertex[1])} for vertex in box]
@@ -283,7 +285,8 @@ class OCREngine:
             i=0
             for bound in tqdm(available_bounds, desc="Processing words using Google Vision bboxes"):
                 i+=1
-                self.json_report.set_text(text_main=f'Working on trOCR :construction: {i}/{total_b}')
                 vertices = bound["vertices"]
@@ -688,7 +691,8 @@ class OCREngine:
             # logger.info(f"CRAFT trOCR:\n{self.OCR}")
         if 'LLaVA' in self.OCR_option: # This option does not produce an OCR helper image
-            self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path} transcription :construction:')
             image, json_output, direct_output, str_output, usage_report = self.Llava.transcribe_image(self.path, self.multimodal_prompt)
             self.logger.info(f"LLaVA Usage Report for Model {self.Llava.model_path}:\n{usage_report}")
@@ -786,4 +790,20 @@ class OCREngine:
             from craft_text_detector import empty_cuda_cache
             empty_cuda_cache()
         except:
-            pass

             self.model_path = "liuhaotian/" + self.cfg['leafmachine']['project']['OCR_option_llava']
             self.model_quant = self.cfg['leafmachine']['project']['OCR_option_llava_bit']
+            if self.json_report:
+                self.json_report.set_text(text_main=f'Loading LLaVA model: {self.model_path} Quantization: {self.model_quant}')
             if self.model_quant == '4bit':
                 use_4bit = True
         # Process each detected text region
         for box in self.prediction_result["boxes"]:
             i+=1
+            if self.json_report:
+                self.json_report.set_text(text_main=f'Locating text using CRAFT --- {i}/{total_b}')
             vertices = [{"x": int(vertex[0]), "y": int(vertex[1])} for vertex in box]
             i=0
             for bound in tqdm(available_bounds, desc="Processing words using Google Vision bboxes"):
                 i+=1
+                if self.json_report:
+                    self.json_report.set_text(text_main=f'Working on trOCR :construction: {i}/{total_b}')
                 vertices = bound["vertices"]
             # logger.info(f"CRAFT trOCR:\n{self.OCR}")
         if 'LLaVA' in self.OCR_option: # This option does not produce an OCR helper image
+            if self.json_report:
+                self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path} transcription :construction:')
             image, json_output, direct_output, str_output, usage_report = self.Llava.transcribe_image(self.path, self.multimodal_prompt)
             self.logger.info(f"LLaVA Usage Report for Model {self.Llava.model_path}:\n{usage_report}")
             from craft_text_detector import empty_cuda_cache
             empty_cuda_cache()
         except:
+            pass
+def check_for_inappropriate_content(file_stream):  # Return True when safe-search flags the image read from file_stream
+    client = vision.ImageAnnotatorClient()  # a new client per call; `vision` is imported outside this view (presumably google.cloud.vision, TODO confirm)
+    content = file_stream.read()  # consumes the stream; callers needing the data again must rewind it themselves
+    image = vision.Image(content=content)
+    response = client.safe_search_detection(image=image)
+    safe = response.safe_search_annotation
+    # Check the levels of adult, violence, racy, etc. content.
+    if (safe.adult > vision.Likelihood.POSSIBLE or  # only ratings strictly above POSSIBLE count as a violation
+        safe.violence > vision.Likelihood.POSSIBLE or
+        safe.racy > vision.Likelihood.POSSIBLE):
+        return True  # The image violates safe search guidelines.
+    return False  # The image is considered safe.

vouchervision/VoucherVision_Config_Builder.py CHANGED Viewed

@@ -49,7 +49,7 @@ def build_VV_config(loaded_cfg=None):
         check_for_illegal_filenames = False
-        LLM_version_user = 'Azure GPT 3.5 Instruct' #'Azure GPT 4 Turbo 1106-preview'
         prompt_version = 'SLTPvA_long.yaml' # from ["Version 1", "Version 1 No Domain Knowledge", "Version 2"]
         use_LeafMachine2_collage_images = True # Use LeafMachine2 collage images
         do_create_OCR_helper_image = True

         check_for_illegal_filenames = False
+        LLM_version_user = 'Azure GPT 3.5 Turbo' #'Azure GPT 4 Turbo 1106-preview'
         prompt_version = 'SLTPvA_long.yaml' # from ["Version 1", "Version 1 No Domain Knowledge", "Version 2"]
         use_LeafMachine2_collage_images = True # Use LeafMachine2 collage images
         do_create_OCR_helper_image = True

vouchervision/model_maps.py CHANGED Viewed

@@ -20,9 +20,11 @@ class ModelMaps:
         'AZURE_GPT_3_5_INSTRUCT': '#9400D3',  # Dark Violet
         'AZURE_GPT_3_5': '#9932CC',  # Dark Orchid
-        'MISTRAL_TINY': '#FFA07A',  # Light Salmon
-        'MISTRAL_SMALL': '#FF8C00',  # Dark Orange
         'MISTRAL_MEDIUM': '#FF4500',  # Orange Red
         'LOCAL_MIXTRAL_8X7B_INSTRUCT_V01': '#000000',  # Black
         'LOCAL_MISTRAL_7B_INSTRUCT_V02': '#4a4a4a',  # Gray
@@ -34,14 +36,14 @@ class ModelMaps:
                      "GPT 4 32k",
                      "GPT 4 Turbo 0125-preview",
                      "GPT 4 Turbo 1106-preview",
-                     "GPT 3.5",
                      "GPT 3.5 Instruct",
                      "Azure GPT 4",
                      "Azure GPT 4 32k",
                      "Azure GPT 4 Turbo 0125-preview",
                      "Azure GPT 4 Turbo 1106-preview",
-                     "Azure GPT 3.5",
                      "Azure GPT 3.5 Instruct",]
     MODELS_GOOGLE = ["PaLM 2 text-bison@001",
@@ -49,15 +51,18 @@ class ModelMaps:
                      "PaLM 2 text-unicorn@001",
                      "Gemini Pro"]
-    MODELS_MISTRAL = ["Mistral Tiny",
-                      "Mistral Small",
-                      "Mistral Medium",]
     MODELS_LOCAL = ["LOCAL Mixtral 8x7B Instruct v0.1",
                     "LOCAL Mistral 7B Instruct v0.2",
                     "LOCAL CPU Mistral 7B Instruct v0.2 GGUF",]
-    MODELS_GUI_DEFAULT = "Azure GPT 3.5 Instruct" # "GPT 4 Turbo 1106-preview"
     version_mapping_cost = {
         'GPT 4 32k': 'GPT_4_32K',
@@ -65,23 +70,25 @@ class ModelMaps:
         'GPT 4 Turbo 0125-preview': 'GPT_4_TURBO_0125',
         'GPT 4 Turbo 1106-preview': 'GPT_4_TURBO_1106',
         'GPT 3.5 Instruct': 'GPT_3_5_INSTRUCT',
-        'GPT 3.5': 'GPT_3_5',
         'Azure GPT 4 32k': 'AZURE_GPT_4_32K',
         'Azure GPT 4': 'AZURE_GPT_4',
         'Azure GPT 4 Turbo 0125-preview': 'AZURE_GPT_4_TURBO_0125',
         'Azure GPT 4 Turbo 1106-preview': 'AZURE_GPT_4_TURBO_1106',
         'Azure GPT 3.5 Instruct': 'AZURE_GPT_3_5_INSTRUCT',
-        'Azure GPT 3.5': 'AZURE_GPT_3_5',
         'Gemini Pro': 'GEMINI_PRO',
         'PaLM 2 text-unicorn@001': 'PALM2_TU_1',
         'PaLM 2 text-bison@001': 'PALM2_TB_1',
         'PaLM 2 text-bison@002': 'PALM2_TB_2',
         'Mistral Medium': 'MISTRAL_MEDIUM',
         'Mistral Small': 'MISTRAL_SMALL',
-        'Mistral Tiny': 'MISTRAL_TINY',
         'LOCAL Mixtral 8x7B Instruct v0.1': 'LOCAL_MIXTRAL_8X7B_INSTRUCT_V01',
         'LOCAL Mistral 7B Instruct v0.2': 'LOCAL_MISTRAL_7B_INSTRUCT_V02',
@@ -97,10 +104,10 @@ class ModelMaps:
             'GPT 4 Turbo 0125-preview': has_key_openai,
             'GPT 4':  has_key_openai,
             'GPT 4 32k':  has_key_openai,
-            'GPT 3.5':  has_key_openai,
             'GPT 3.5 Instruct':  has_key_openai,
-            'Azure GPT 3.5': has_key_azure_openai,
             'Azure GPT 3.5 Instruct': has_key_azure_openai,
             'Azure GPT 4': has_key_azure_openai,
             'Azure GPT 4 Turbo 1106-preview': has_key_azure_openai,
@@ -112,9 +119,11 @@ class ModelMaps:
             'PaLM 2 text-unicorn@001':  has_key_google_application_credentials,
             'Gemini Pro':  has_key_google_application_credentials,
-            'Mistral Tiny':  has_key_mistral,
             'Mistral Small':  has_key_mistral,
             'Mistral Medium':  has_key_mistral,
             'LOCAL Mixtral 8x7B Instruct v0.1':  True,
             'LOCAL Mistral 7B Instruct v0.2':  True,
@@ -127,15 +136,17 @@ class ModelMaps:
     def get_version_mapping_is_azure(cls, key):
         version_mapping_is_azure = {
             "GPT 4 Turbo 1106-preview": False,
             'GPT 4': False,
             'GPT 4 32k':  False,
-            'GPT 3.5':  False,
             'GPT 3.5 Instruct':  False,
-            'Azure GPT 3.5': True,
             'Azure GPT 3.5 Instruct': True,
             'Azure GPT 4': True,
             'Azure GPT 4 Turbo 1106-preview': True,
             'Azure GPT 4 32k': True,
             'PaLM 2 text-bison@001':  False,
@@ -143,9 +154,11 @@ class ModelMaps:
             'PaLM 2 text-unicorn@001':  False,
             'Gemini Pro':  False,
-            'Mistral Tiny':  False,
             'Mistral Small':  False,
             'Mistral Medium':  False,
             'LOCAL Mixtral 8x7B Instruct v0.1':  False,
             'LOCAL Mistral 7B Instruct v0.2':  False,
@@ -159,7 +172,7 @@ class ModelMaps:
         ### OpenAI
         if key == 'GPT_3_5':
-            return 'gpt-3.5-turbo-1106'
         elif key == 'GPT_3_5_INSTRUCT':
             return 'gpt-3.5-turbo-instruct'
@@ -178,7 +191,7 @@ class ModelMaps:
         ### Azure
         elif key == 'AZURE_GPT_3_5':
-            return 'gpt-35-turbo-1106'
         elif key == 'AZURE_GPT_3_5_INSTRUCT':
             return 'gpt-35-turbo-instruct'
@@ -209,14 +222,20 @@ class ModelMaps:
             return "gemini-1.0-pro"
         ### Mistral
-        elif key == 'MISTRAL_TINY':
-            return "mistral-tiny"
         elif key == 'MISTRAL_SMALL':
-            return 'mistral-small'
         elif key == 'MISTRAL_MEDIUM':
-            return 'mistral-medium'
         ### Mistral LOCAL

         'AZURE_GPT_3_5_INSTRUCT': '#9400D3',  # Dark Violet
         'AZURE_GPT_3_5': '#9932CC',  # Dark Orchid
+        'OPEN_MISTRAL_7B': '#FFA07A',  # Light Salmon
+        'OPEN_MIXTRAL_8X7B': '#FF8C00',  # Dark Orange
+        'MISTRAL_SMALL': '#FF6347',  # Tomato
         'MISTRAL_MEDIUM': '#FF4500',  # Orange Red
+        'MISTRAL_LARGE': '#800000',  # Maroon
         'LOCAL_MIXTRAL_8X7B_INSTRUCT_V01': '#000000',  # Black
         'LOCAL_MISTRAL_7B_INSTRUCT_V02': '#4a4a4a',  # Gray
                      "GPT 4 32k",
                      "GPT 4 Turbo 0125-preview",
                      "GPT 4 Turbo 1106-preview",
+                     "GPT 3.5 Turbo",
                      "GPT 3.5 Instruct",
                      "Azure GPT 4",
                      "Azure GPT 4 32k",
                      "Azure GPT 4 Turbo 0125-preview",
                      "Azure GPT 4 Turbo 1106-preview",
+                     "Azure GPT 3.5 Turbo",
                      "Azure GPT 3.5 Instruct",]
     MODELS_GOOGLE = ["PaLM 2 text-bison@001",
                      "PaLM 2 text-unicorn@001",
                      "Gemini Pro"]
+    MODELS_MISTRAL = ["Mistral Small",
+                      "Mistral Medium",
+                      "Mistral Large",
+                      "Open Mixtral 8x7B",
+                      "Open Mistral 7B",
+                      ]
     MODELS_LOCAL = ["LOCAL Mixtral 8x7B Instruct v0.1",
                     "LOCAL Mistral 7B Instruct v0.2",
                     "LOCAL CPU Mistral 7B Instruct v0.2 GGUF",]
+    MODELS_GUI_DEFAULT = "Azure GPT 3.5 Turbo" # "GPT 4 Turbo 1106-preview"
     version_mapping_cost = {
         'GPT 4 32k': 'GPT_4_32K',
         'GPT 4 Turbo 0125-preview': 'GPT_4_TURBO_0125',
         'GPT 4 Turbo 1106-preview': 'GPT_4_TURBO_1106',
         'GPT 3.5 Instruct': 'GPT_3_5_INSTRUCT',
+        'GPT 3.5 Turbo': 'GPT_3_5',
         'Azure GPT 4 32k': 'AZURE_GPT_4_32K',
         'Azure GPT 4': 'AZURE_GPT_4',
         'Azure GPT 4 Turbo 0125-preview': 'AZURE_GPT_4_TURBO_0125',
         'Azure GPT 4 Turbo 1106-preview': 'AZURE_GPT_4_TURBO_1106',
         'Azure GPT 3.5 Instruct': 'AZURE_GPT_3_5_INSTRUCT',
+        'Azure GPT 3.5 Turbo': 'AZURE_GPT_3_5',
         'Gemini Pro': 'GEMINI_PRO',
         'PaLM 2 text-unicorn@001': 'PALM2_TU_1',
         'PaLM 2 text-bison@001': 'PALM2_TB_1',
         'PaLM 2 text-bison@002': 'PALM2_TB_2',
+        'Mistral Large': 'MISTRAL_LARGE',
         'Mistral Medium': 'MISTRAL_MEDIUM',
         'Mistral Small': 'MISTRAL_SMALL',
+        'Open Mixtral 8x7B': 'OPEN_MIXTRAL_8X7B',
+        'Open Mistral 7B': 'OPEN_MISTRAL_7B',
         'LOCAL Mixtral 8x7B Instruct v0.1': 'LOCAL_MIXTRAL_8X7B_INSTRUCT_V01',
         'LOCAL Mistral 7B Instruct v0.2': 'LOCAL_MISTRAL_7B_INSTRUCT_V02',
             'GPT 4 Turbo 0125-preview': has_key_openai,
             'GPT 4':  has_key_openai,
             'GPT 4 32k':  has_key_openai,
+            'GPT 3.5 Turbo':  has_key_openai,
             'GPT 3.5 Instruct':  has_key_openai,
+            'Azure GPT 3.5 Turbo': has_key_azure_openai,
             'Azure GPT 3.5 Instruct': has_key_azure_openai,
             'Azure GPT 4': has_key_azure_openai,
             'Azure GPT 4 Turbo 1106-preview': has_key_azure_openai,
             'PaLM 2 text-unicorn@001':  has_key_google_application_credentials,
             'Gemini Pro':  has_key_google_application_credentials,
             'Mistral Small':  has_key_mistral,
             'Mistral Medium':  has_key_mistral,
+            'Mistral Large':  has_key_mistral,
+            'Open Mixtral 8x7B':  has_key_mistral,
+            'Open Mistral 7B':  has_key_mistral,
             'LOCAL Mixtral 8x7B Instruct v0.1':  True,
             'LOCAL Mistral 7B Instruct v0.2':  True,
     def get_version_mapping_is_azure(cls, key):
         version_mapping_is_azure = {
             "GPT 4 Turbo 1106-preview": False,
+            "GPT 4 Turbo 0125-preview": False,
             'GPT 4': False,
             'GPT 4 32k':  False,
+            'GPT 3.5 Turbo':  False,
             'GPT 3.5 Instruct':  False,
+            'Azure GPT 3.5 Turbo': True,
             'Azure GPT 3.5 Instruct': True,
             'Azure GPT 4': True,
             'Azure GPT 4 Turbo 1106-preview': True,
+            'Azure GPT 4 Turbo 0125-preview': True,
             'Azure GPT 4 32k': True,
             'PaLM 2 text-bison@001':  False,
             'PaLM 2 text-unicorn@001':  False,
             'Gemini Pro':  False,
             'Mistral Small':  False,
             'Mistral Medium':  False,
+            'Mistral Large':  False,
+            'Open Mixtral 8x7B':  False,
+            'Open Mistral 7B':  False,
             'LOCAL Mixtral 8x7B Instruct v0.1':  False,
             'LOCAL Mistral 7B Instruct v0.2':  False,
         ### OpenAI
         if key == 'GPT_3_5':
+            return 'gpt-3.5-turbo-0125' #'gpt-3.5-turbo-1106'
         elif key == 'GPT_3_5_INSTRUCT':
             return 'gpt-3.5-turbo-instruct'
         ### Azure
         elif key == 'AZURE_GPT_3_5':
+            return 'gpt-35-turbo-0125'
         elif key == 'AZURE_GPT_3_5_INSTRUCT':
             return 'gpt-35-turbo-instruct'
             return "gemini-1.0-pro"
         ### Mistral
+        elif key == 'OPEN_MISTRAL_7B':
+            return "open-mistral-7b"
+        elif key == 'OPEN_MIXTRAL_8X7B':
+            return 'open-mixtral-8x7b'
         elif key == 'MISTRAL_SMALL':
+            return 'mistral-small-latest'
         elif key == 'MISTRAL_MEDIUM':
+            return 'mistral-medium-latest'
+        elif key == 'MISTRAL_LARGE':
+            return 'mistral-large-latest'
         ### Mistral LOCAL

vouchervision/prompt_catalog.py CHANGED Viewed

@@ -18,7 +18,7 @@ class PromptCatalog:
     def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
-        self.OCR = OCR
         self.rules_config_path = rules_config_path
         self.rules_config = self.load_rules_config()
@@ -48,9 +48,9 @@ class PromptCatalog:
                 The unstructured OCR text is:
                 {self.OCR}
                 Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
-                {self.structure}
-                {self.structure}
-                {self.structure}
                 """
         else:
             prompt = f"""Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.
@@ -62,13 +62,16 @@ class PromptCatalog:
                 The unstructured OCR text is:
                 {self.OCR}
                 Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
-                {self.structure}
                 """
         # xlsx_headers = self.generate_xlsx_headers(is_palm)
         # return prompt, self.PromptJSONModel, self.n_fields, xlsx_headers
         return prompt, self.dictionary_structure
     def copy_prompt_template_to_new_dir(self, new_directory_path, rules_config_path):
         # Ensure the target directory exists, create it if it doesn't
@@ -102,22 +105,31 @@ class PromptCatalog:
         return structure_json_str
     def create_structure(self, is_palm=False):
-        # Create fields for the Pydantic model dynamically
-        fields = {key: (str, Field(default=value, description=value)) for key, value in self.rules_list.items()}
-        # Dynamically create the Pydantic model
-        DynamicJSONParsingModel = create_model('SLTPvA', **fields)
-        DynamicJSONParsingModel_use = DynamicJSONParsingModel()
-        # Define the structure for the "Dictionary" section
-        dictionary_fields = {key: (str, Field(default='', description="")) for key in self.rules_list.keys()}
-        # Dynamically create the "Dictionary" Pydantic model
-        PromptJSONModel = create_model('PromptJSONModel', **dictionary_fields)
-        # Convert the model to JSON string (for demonstration)
-        dictionary_structure = PromptJSONModel().dict()
         structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
         return structure_json_str, dictionary_structure

     def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
+        self.OCR = self.remove_colons_and_double_apostrophes(OCR)
         self.rules_config_path = rules_config_path
         self.rules_config = self.load_rules_config()
                 The unstructured OCR text is:
                 {self.OCR}
                 Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
+                {self.dictionary_structure}
+                {self.dictionary_structure}
+                {self.dictionary_structure}
                 """
         else:
             prompt = f"""Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.
                 The unstructured OCR text is:
                 {self.OCR}
                 Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
+                {self.dictionary_structure}
                 """
         # xlsx_headers = self.generate_xlsx_headers(is_palm)
         # return prompt, self.PromptJSONModel, self.n_fields, xlsx_headers
+        # print(prompt)
         return prompt, self.dictionary_structure
+    def remove_colons_and_double_apostrophes(self, text):
+        return text.replace(":", "").replace("\"", "")
     def copy_prompt_template_to_new_dir(self, new_directory_path, rules_config_path):
         # Ensure the target directory exists, create it if it doesn't
         return structure_json_str
     def create_structure(self, is_palm=False):
         """Build the blank output template that the prompt asks the LLM to fill in.

         Every key of ``self.rules_list`` becomes a field whose value is an empty
         string.

         Args:
             is_palm: Not used by this method.

         Returns:
             A ``(json_text, template)`` tuple: the template serialized by
             ``json.dumps`` (4-space indent, keys kept in their original order)
             and the template dict itself.
         """
         # A plain dict is enough here; no Pydantic model is built for the template.
         blank_template = dict.fromkeys(self.rules_list.keys(), '')
         return json.dumps(blank_template, sort_keys=False, indent=4), blank_template

vouchervision/tool_taxonomy_WFO.py CHANGED Viewed

@@ -19,12 +19,19 @@ class WFONameMatcher:
         self.is_enabled = tool_WFO
     def extract_input_string(self, record):
-        primary_input = f"{record.get('scientificName', '').strip()} {record.get('scientificNameAuthorship', '').strip()}".strip()
-        secondary_input = ' '.join(filter(None, [record.get('genus', '').strip(),
-                                                 record.get('subgenus', '').strip(),
-                                                 record.get('specificEpithet', '').strip(),
-                                                 record.get('infraspecificEpithet', '').strip()])).strip()
         return primary_input, secondary_input
     def query_wfo_name_matching(self, input_string, check_homonyms=True, check_rank=True, accept_single_candidate=True):
@@ -46,6 +53,8 @@ class WFONameMatcher:
     def query_and_process(self, record):
         primary_input, secondary_input = self.extract_input_string(record)
         # Query with primary input
         primary_result = self.query_wfo_name_matching(primary_input)

         self.is_enabled = tool_WFO
     def extract_input_string(self, record):
         """Build the two query strings used for WFO name matching.

         The primary string is the full name plus its authorship, taken from
         ``scientificName``/``scientificNameAuthorship`` or, when that pair is not
         present, from ``speciesBinomialName``/``speciesBinomialNameAuthorship``.
         The secondary string is ``genus`` and ``specificEpithet`` joined by a
         space, with empty parts left out.

         Args:
             record: Mapping of field names to transcribed values.

         Returns:
             ``(primary_input, secondary_input)``, or ``(None, None)`` when the
             record has neither name/authorship key pair, or lacks the
             ``genus``/``specificEpithet`` keys.
         """
         def _clean(field):
             # A key may be present with a None (or non-string) value; calling
             # .strip() on it directly would raise, so normalise first.
             value = record.get(field)
             return '' if value is None else str(value).strip()

         if 'scientificName' in record and 'scientificNameAuthorship' in record:
             name_fields = ('scientificName', 'scientificNameAuthorship')
         elif 'speciesBinomialName' in record and 'speciesBinomialNameAuthorship' in record:
             name_fields = ('speciesBinomialName', 'speciesBinomialNameAuthorship')
         else:
             return None, None

         # Both query strings are required; without the genus/epithet keys the
         # caller receives the same (None, None) "nothing to query" signal.
         if not ('genus' in record and 'specificEpithet' in record):
             return None, None

         # The outer strip() removes the separator left behind by an empty authorship.
         primary_input = ' '.join(_clean(field) for field in name_fields).strip()
         secondary_input = ' '.join(filter(None, (_clean('genus'), _clean('specificEpithet'))))
         return primary_input, secondary_input
     def query_wfo_name_matching(self, input_string, check_homonyms=True, check_rank=True, accept_single_candidate=True):
     def query_and_process(self, record):
         primary_input, secondary_input = self.extract_input_string(record)
+        if primary_input is None and secondary_input is None:
+            return self.NULL_DICT
         # Query with primary input
         primary_result = self.query_wfo_name_matching(primary_input)

vouchervision/utils_LLM.py CHANGED Viewed

@@ -63,16 +63,13 @@ def run_tools(output, tool_WFO, tool_GEO, tool_wikipedia, json_file_path_wiki):
     return output_WFO, WFO_record, output_GEO, GEO_record
 def save_individual_prompt(prompt_template, txt_file_path_ind_prompt):
     with open(txt_file_path_ind_prompt, 'w',encoding='utf-8') as file:
         file.write(prompt_template)
-def remove_colons_and_double_apostrophes(text):
-    return text.replace(":", "").replace("\"", "")
 def sanitize_prompt(data):
     if isinstance(data, dict):
         return {sanitize_prompt(key): sanitize_prompt(value) for key, value in data.items()}

     return output_WFO, WFO_record, output_GEO, GEO_record
 def save_individual_prompt(prompt_template, txt_file_path_ind_prompt):
     """Write ``prompt_template`` to ``txt_file_path_ind_prompt`` as UTF-8 text.

     The file is opened in ``'w'`` mode, so any existing content is replaced.
     """
     with open(txt_file_path_ind_prompt, mode='w', encoding='utf-8') as prompt_file:
         prompt_file.write(prompt_template)
 def sanitize_prompt(data):
     if isinstance(data, dict):
         return {sanitize_prompt(key): sanitize_prompt(value) for key, value in data.items()}

vouchervision/utils_LLM_JSON_validation.py CHANGED Viewed

@@ -11,7 +11,8 @@ def validate_and_align_JSON_keys_with_template(data, JSON_dict_structure):
             if value is None:
                 data[key] = ''
             elif isinstance(value, str):
-                if value.lower() in ['unknown', 'not provided', 'missing', 'na', 'none', 'n/a', 'null', 'unspecified',
                                     'not provided in the text', 'not found in the text',
                                     'not in the text', 'not provided', 'not found',
                                     'not provided in the ocr', 'not found in the ocr',

             if value is None:
                 data[key] = ''
             elif isinstance(value, str):
+                if value.lower() in ['unknown','not provided', 'missing', 'na', 'none', 'n/a', 'null', 'unspecified',
+                                     'TBD',
                                     'not provided in the text', 'not found in the text',
                                     'not in the text', 'not provided', 'not found',
                                     'not provided in the ocr', 'not found in the ocr',

vouchervision/utils_VoucherVision.py CHANGED Viewed

@@ -14,7 +14,6 @@ from vouchervision.LLM_GoogleGemini import GoogleGeminiHandler
 from vouchervision.LLM_MistralAI import MistralHandler
 from vouchervision.LLM_local_cpu_MistralAI import LocalCPUMistralHandler
 from vouchervision.LLM_local_MistralAI import LocalMistralHandler
-from vouchervision.utils_LLM import remove_colons_and_double_apostrophes
 from vouchervision.prompt_catalog import PromptCatalog
 from vouchervision.model_maps import ModelMaps
 from vouchervision.general_utils import get_cfg_from_full_path
@@ -32,7 +31,7 @@ from vouchervision.OCR_google_cloud_vision import OCREngine
 class VoucherVision():
-    def __init__(self, cfg, logger, dir_home, path_custom_prompts, Project, Dirs, is_hf):
         self.cfg = cfg
         self.logger = logger
         self.dir_home = dir_home
@@ -43,6 +42,9 @@ class VoucherVision():
         self.prompt_version = None
         self.is_hf = is_hf
         # self.trOCR_model_version = "microsoft/trocr-large-handwritten"
         # self.trOCR_model_version = "microsoft/trocr-base-handwritten"
         # self.trOCR_model_version = "dh-unibe/trocr-medieval-escriptmask" # NOPE
@@ -686,9 +688,10 @@ class VoucherVision():
         Copy_Prompt = PromptCatalog()
         Copy_Prompt.copy_prompt_template_to_new_dir(self.Dirs.transcription_prompt, self.path_custom_prompts)
-        json_report.set_text(text_main=f'Loading {MODEL_NAME_FORMATTED}')
-        json_report.set_JSON({}, {}, {})
-        llm_model = self.initialize_llm_model(self.cfg, self.logger, MODEL_NAME_FORMATTED, self.JSON_dict_structure, name_parts, is_azure, self.llm)
         for i, path_to_crop in enumerate(self.img_paths):
             self.update_progress_report_batch(progress_report, i)
@@ -701,9 +704,11 @@ class VoucherVision():
             self.path_to_crop = path_to_crop
             filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt = paths
-            json_report.set_text(text_main='Starting OCR')
             self.perform_OCR_and_save_results(i, json_report, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds)
-            json_report.set_text(text_main='Finished OCR')
             if not self.OCR:
                 self.n_failed_OCR += 1
@@ -713,7 +718,7 @@ class VoucherVision():
             else:
                 ### Format prompt
                 prompt = self.setup_prompt()
-                prompt = remove_colons_and_double_apostrophes(prompt)
                 ### Send prompt to chosen LLM
                 self.logger.info(f'Waiting for {model_name} API call --- Using {MODEL_NAME_FORMATTED}')
@@ -747,8 +752,9 @@ class VoucherVision():
             final_JSON_response, final_WFO_record, final_GEO_record = self.update_final_response(response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, paths, path_to_crop, nt_in, nt_out)
             self.logger.info(f'Finished LLM call')
-            json_report.set_JSON(final_JSON_response, final_WFO_record, final_GEO_record)
         self.update_progress_report_final(progress_report)
         final_JSON_response = self.parse_final_json_response(final_JSON_response)
@@ -758,22 +764,22 @@ class VoucherVision():
     ##################################################################################################################################
     ################################################## LLM Helper Funcs ##############################################################
     ##################################################################################################################################
-    def initialize_llm_model(self, cfg, logger, model_name, JSON_dict_structure, name_parts, is_azure=None, llm_object=None):
         if 'LOCAL'in name_parts:
             if ('MIXTRAL' in name_parts) or ('MISTRAL' in name_parts):
                 if 'CPU' in name_parts:
-                    return LocalCPUMistralHandler(cfg, logger, model_name, JSON_dict_structure)
                 else:
-                    return LocalMistralHandler(cfg, logger, model_name, JSON_dict_structure)
         else:
             if 'PALM2' in name_parts:
-                return GooglePalm2Handler(cfg, logger, model_name, JSON_dict_structure)
             elif 'GEMINI' in name_parts:
-                return GoogleGeminiHandler(cfg, logger, model_name, JSON_dict_structure)
             elif 'MISTRAL' in name_parts and ('LOCAL' not in name_parts):
-                return MistralHandler(cfg, logger, model_name, JSON_dict_structure)
             else:
-                return OpenAIHandler(cfg, logger, model_name, JSON_dict_structure, is_azure, llm_object)
     def setup_prompt(self):
         Catalog = PromptCatalog()

 from vouchervision.LLM_MistralAI import MistralHandler
 from vouchervision.LLM_local_cpu_MistralAI import LocalCPUMistralHandler
 from vouchervision.LLM_local_MistralAI import LocalMistralHandler
 from vouchervision.prompt_catalog import PromptCatalog
 from vouchervision.model_maps import ModelMaps
 from vouchervision.general_utils import get_cfg_from_full_path
 class VoucherVision():
+    def __init__(self, cfg, logger, dir_home, path_custom_prompts, Project, Dirs, is_hf, config_vals_for_permutation=None):
         self.cfg = cfg
         self.logger = logger
         self.dir_home = dir_home
         self.prompt_version = None
         self.is_hf = is_hf
+        ### config_vals_for_permutation allows you to set the starting temp, top_k, top_p, seed....
+        self.config_vals_for_permutation = config_vals_for_permutation
         # self.trOCR_model_version = "microsoft/trocr-large-handwritten"
         # self.trOCR_model_version = "microsoft/trocr-base-handwritten"
         # self.trOCR_model_version = "dh-unibe/trocr-medieval-escriptmask" # NOPE
         Copy_Prompt = PromptCatalog()
         Copy_Prompt.copy_prompt_template_to_new_dir(self.Dirs.transcription_prompt, self.path_custom_prompts)
+        if json_report:
+            json_report.set_text(text_main=f'Loading {MODEL_NAME_FORMATTED}')
+            json_report.set_JSON({}, {}, {})
+        llm_model = self.initialize_llm_model(self.cfg, self.logger, MODEL_NAME_FORMATTED, self.JSON_dict_structure, name_parts, is_azure, self.llm, self.config_vals_for_permutation)
         for i, path_to_crop in enumerate(self.img_paths):
             self.update_progress_report_batch(progress_report, i)
             self.path_to_crop = path_to_crop
             filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt = paths
+            if json_report:
+                json_report.set_text(text_main='Starting OCR')
             self.perform_OCR_and_save_results(i, json_report, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds)
+            if json_report:
+                json_report.set_text(text_main='Finished OCR')
             if not self.OCR:
                 self.n_failed_OCR += 1
             else:
                 ### Format prompt
                 prompt = self.setup_prompt()
+                # prompt = remove_colons_and_double_apostrophes(prompt) # Moved into PromptCatalog.prompt_SLTP, where it is applied to the OCR text only; applying it to the whole prompt broke the JSON structure.
                 ### Send prompt to chosen LLM
                 self.logger.info(f'Waiting for {model_name} API call --- Using {MODEL_NAME_FORMATTED}')
             final_JSON_response, final_WFO_record, final_GEO_record = self.update_final_response(response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, paths, path_to_crop, nt_in, nt_out)
             self.logger.info(f'Finished LLM call')
+            if json_report:
+                json_report.set_JSON(final_JSON_response, final_WFO_record, final_GEO_record)
         self.update_progress_report_final(progress_report)
         final_JSON_response = self.parse_final_json_response(final_JSON_response)
     ##################################################################################################################################
     ################################################## LLM Helper Funcs ##############################################################
     ##################################################################################################################################
+    def initialize_llm_model(self, cfg, logger, model_name, JSON_dict_structure, name_parts, is_azure=None, llm_object=None, config_vals_for_permutation=None):
         if 'LOCAL'in name_parts:
             if ('MIXTRAL' in name_parts) or ('MISTRAL' in name_parts):
                 if 'CPU' in name_parts:
+                    return LocalCPUMistralHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
                 else:
+                    return LocalMistralHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
         else:
             if 'PALM2' in name_parts:
+                return GooglePalm2Handler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
             elif 'GEMINI' in name_parts:
+                return GoogleGeminiHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
             elif 'MISTRAL' in name_parts and ('LOCAL' not in name_parts):
+                return MistralHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
             else:
+                return OpenAIHandler(cfg, logger, model_name, JSON_dict_structure, is_azure, llm_object, config_vals_for_permutation)
     def setup_prompt(self):
         Catalog = PromptCatalog()

vouchervision/utils_VoucherVision_parallel.py ADDED Viewed

	@@ -0,0 +1,1022 @@

+import openai
+import os, json, glob, shutil, yaml, torch, logging
+import openpyxl
+from openpyxl import Workbook, load_workbook
+from tqdm import tqdm
+import vertexai
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from langchain_openai import AzureChatOpenAI
+from google.oauth2 import service_account
+from transformers import AutoTokenizer, AutoModel
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from queue import Queue
+import threading
+from vouchervision.LLM_OpenAI import OpenAIHandler
+from vouchervision.LLM_GooglePalm2 import GooglePalm2Handler
+from vouchervision.LLM_GoogleGemini import GoogleGeminiHandler
+from vouchervision.LLM_MistralAI import MistralHandler
+from vouchervision.LLM_local_cpu_MistralAI import LocalCPUMistralHandler
+from vouchervision.LLM_local_MistralAI import LocalMistralHandler
+from vouchervision.prompt_catalog import PromptCatalog
+from vouchervision.model_maps import ModelMaps
+from vouchervision.general_utils import get_cfg_from_full_path
+from vouchervision.OCR_google_cloud_vision import OCREngine
+'''
+* For the prefix_removal, the image names have 'MICH-V-' prior to the barcode, so that is used for matching
+  but removed for output.
+* There is also code active to replace the LLM-predicted "Catalog Number" with the correct number since it is known.
+  The LLMs do usually assign the barcode to the correct field, but it's not needed since it is already known.
+        - Look for ####################### Catalog Number pre-defined
+'''
+class VoucherVision():
+    def __init__(self, cfg, logger, dir_home, path_custom_prompts, Project, Dirs, is_hf, config_vals_for_permutation=None):
+        self.cfg = cfg
+        self.logger = logger
+        self.dir_home = dir_home
+        self.path_custom_prompts = path_custom_prompts
+        self.Project = Project
+        self.Dirs = Dirs
+        self.headers = None
+        self.prompt_version = None
+        self.is_hf = is_hf
+        ### config_vals_for_permutation allows you to set the starting temp, top_k, top_p, seed....
+        self.config_vals_for_permutation = config_vals_for_permutation
+        # self.trOCR_model_version = "microsoft/trocr-large-handwritten"
+        # self.trOCR_model_version = "microsoft/trocr-base-handwritten"
+        # self.trOCR_model_version = "dh-unibe/trocr-medieval-escriptmask" # NOPE
+        # self.trOCR_model_version = "dh-unibe/trocr-kurrent" # NOPE
+        # self.trOCR_model_version = "DunnBC22/trocr-base-handwritten-OCR-handwriting_recognition_v2" # NOPE
+        self.trOCR_processor = None
+        self.trOCR_model = None
+        self.set_API_keys()
+        self.setup()
+    def setup(self):
+        self.logger.name = f'[Transcription]'
+        self.logger.info(f'Setting up OCR and LLM')
+        self.trOCR_model_version = self.cfg['leafmachine']['project']['trOCR_model_path']
+        self.db_name = self.cfg['leafmachine']['project']['embeddings_database_name']
+        self.path_domain_knowledge = self.cfg['leafmachine']['project']['path_to_domain_knowledge_xlsx']
+        self.build_new_db = self.cfg['leafmachine']['project']['build_new_embeddings_database']
+        self.continue_run_from_partial_xlsx = self.cfg['leafmachine']['project']['continue_run_from_partial_xlsx']
+        self.prefix_removal = self.cfg['leafmachine']['project']['prefix_removal']
+        self.suffix_removal = self.cfg['leafmachine']['project']['suffix_removal']
+        self.catalog_numerical_only = self.cfg['leafmachine']['project']['catalog_numerical_only']
+        self.prompt_version0 = self.cfg['leafmachine']['project']['prompt_version']
+        self.use_domain_knowledge = self.cfg['leafmachine']['project']['use_domain_knowledge']
+        self.catalog_name_options = ["Catalog Number", "catalog_number", "catalogNumber"]
+        self.geo_headers = ["GEO_override_OCR", "GEO_method", "GEO_formatted_full_string", "GEO_decimal_lat",
+                       "GEO_decimal_long","GEO_city", "GEO_county", "GEO_state",
+                       "GEO_state_code", "GEO_country", "GEO_country_code", "GEO_continent",]
+        self.usage_headers = ["current_time", "inference_time_s", "tool_time_s","max_cpu", "max_ram_gb", "n_gpus", "max_gpu_load", "max_gpu_vram_gb","total_gpu_vram_gb","capability_score",]
+        self.wfo_headers = ["WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement"]
+        self.wfo_headers_no_lists = ["WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_placement"]
+        self.utility_headers = ["filename"] + self.wfo_headers + self.geo_headers + self.usage_headers + ["run_name", "prompt", "LLM", "tokens_in", "tokens_out", "LM2_collage", "OCR_method", "OCR_double", "OCR_trOCR", "path_to_crop","path_to_original","path_to_content","path_to_helper",]
+                                # "WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement",
+                                # "GEO_override_OCR", "GEO_method", "GEO_formatted_full_string", "GEO_decimal_lat",
+                                # "GEO_decimal_long","GEO_city", "GEO_county", "GEO_state",
+                                # "GEO_state_code", "GEO_country", "GEO_country_code", "GEO_continent",
+                                # "tokens_in", "tokens_out", "path_to_crop","path_to_original","path_to_content","path_to_helper",]
+        # WFO_candidate_names is separate, bc it may be type --> list
+        self.do_create_OCR_helper_image = self.cfg['leafmachine']['do_create_OCR_helper_image']
+        self.map_prompt_versions()
+        self.map_dir_labels()
+        self.map_API_options()
+        # self.init_embeddings()
+        self.init_transcription_xlsx()
+        self.init_trOCR_model()
+        '''Logging'''
+        self.logger.info(f'Transcribing dataset --- {self.dir_labels}')
+        self.logger.info(f'Saving transcription batch to --- {self.path_transcription}')
+        self.logger.info(f'Saving individual transcription files to --- {self.Dirs.transcription_ind}')
+        self.logger.info(f'Starting transcription...')
+        self.logger.info(f'     LLM MODEL --> {self.version_name}')
+        self.logger.info(f'     Using Azure API --> {self.is_azure}')
+        self.logger.info(f'     Model name passed to API --> {self.model_name}')
+        self.logger.info(f'     API access token is found in PRIVATE_DATA.yaml --> {self.has_key}')
+    def init_trOCR_model(self):
+        lgr = logging.getLogger('transformers')
+        lgr.setLevel(logging.ERROR)
+        self.trOCR_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") # usually just the "microsoft/trocr-base-handwritten"
+        self.trOCR_model = VisionEncoderDecoderModel.from_pretrained(self.trOCR_model_version) # This matches the model
+        # Check for GPU availability
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.trOCR_model.to(self.device)
+    def map_API_options(self):
+        self.chat_version = self.cfg['leafmachine']['LLM_version']
+        # Get the required values from ModelMaps
+        self.model_name = ModelMaps.get_version_mapping_cost(self.chat_version)
+        self.is_azure = ModelMaps.get_version_mapping_is_azure(self.chat_version)
+        self.has_key = ModelMaps.get_version_has_key(self.chat_version, self.has_key_openai, self.has_key_azure_openai, self.has_key_google_application_credentials, self.has_key_mistral)
+        # Check if the version is supported
+        if self.model_name is None:
+            supported_LLMs = ", ".join(ModelMaps.get_models_gui_list())
+            raise Exception(f"Unsupported LLM: {self.chat_version}. Requires one of: {supported_LLMs}")
+        self.version_name = self.chat_version
+    def map_prompt_versions(self):
+        self.prompt_version_map = {
+            "Version 1": "prompt_v1_verbose",
+        }
+        self.prompt_version = self.prompt_version_map.get(self.prompt_version0, self.path_custom_prompts)
+        self.is_predefined_prompt = self.is_in_prompt_version_map(self.prompt_version)
+    def is_in_prompt_version_map(self, value):
+        return value in self.prompt_version_map.values()
+    def map_dir_labels(self):
+        if self.cfg['leafmachine']['use_RGB_label_images']:
+            self.dir_labels = os.path.join(self.Dirs.save_per_annotation_class,'label')
+        else:
+            self.dir_labels = self.Dirs.save_original
+        # Use glob to get all image paths in the directory
+        self.img_paths = glob.glob(os.path.join(self.dir_labels, "*"))
+    def load_rules_config(self):
+        with open(self.path_custom_prompts, 'r') as stream:
+            try:
+                return yaml.safe_load(stream)
+            except yaml.YAMLError as exc:
+                print(exc)
+                return None
+    def generate_xlsx_headers(self):
+        # Extract headers from the 'Dictionary' keys in the JSON template rules
+        # xlsx_headers = list(self.rules_config_json['rules']["Dictionary"].keys())
+        xlsx_headers = list(self.rules_config_json['rules'].keys())
+        xlsx_headers = xlsx_headers + self.utility_headers
+        return xlsx_headers
+    def init_transcription_xlsx(self):
+        # Initialize output file
+        self.path_transcription = os.path.join(self.Dirs.transcription,"transcribed.xlsx")
+        # else:
+        if not self.is_predefined_prompt:
+            # Load the rules configuration
+            self.rules_config_json = self.load_rules_config()
+            # Generate the headers from the configuration
+            self.headers = self.generate_xlsx_headers()
+            # Set the headers used to the dynamically generated headers
+            self.headers_used = 'CUSTOM'
+        else:
+            # If it's a predefined prompt, raise an exception as we don't have further instructions
+            raise ValueError("Predefined prompt is not handled in this context.")
+        self.create_or_load_excel_with_headers(os.path.join(self.Dirs.transcription,"transcribed.xlsx"), self.headers)
+    def create_or_load_excel_with_headers(self, file_path, headers, show_head=False):
+        """Create the transcription workbook at ``file_path`` or resume from a partial one.
+
+        When ``self.continue_run_from_partial_xlsx`` names an existing file, that
+        workbook is loaded, the basenames of the images listed in its
+        ``path_to_crop`` column are stored in ``self.completed_specimens``, the
+        files located next to the first data row's content/helper paths are
+        copied into the new run, and the four path columns are rewritten to
+        point at the new run. Otherwise a new workbook holding only the header
+        row is created. In both cases the workbook is saved to ``file_path``.
+
+        Args:
+            file_path: Destination of the workbook. It is split on the substring
+                'Transcription' to derive the root of the new run.
+            headers: Header names; column positions are taken from this list
+                (``headers.index``), not from the loaded sheet.
+            show_head: Print the first rows after saving. Forced to True when
+                resuming from a partial workbook.
+        """
+        # Folder names used to locate the run-relative part of a stored path
+        output_dir_names = ['Archival_Components', 'Config_File', 'Cropped_Images', 'Logs', 'Original_Images', 'Transcription']
+        self.completed_specimens = []
+        # Check if the file exists and it's not None
+        if self.continue_run_from_partial_xlsx is not None and os.path.isfile(self.continue_run_from_partial_xlsx):
+            workbook = load_workbook(filename=self.continue_run_from_partial_xlsx)
+            sheet = workbook.active
+            show_head=True
+            # Identify the path columns (1-based, as openpyxl expects)
+            try:
+                path_to_crop_col = headers.index('path_to_crop') + 1
+                path_to_original_col = headers.index('path_to_original') + 1
+                path_to_content_col = headers.index('path_to_content') + 1
+                path_to_helper_col = headers.index('path_to_helper') + 1
+                # self.completed_specimens = list(sheet.iter_cols(min_col=path_to_crop_col, max_col=path_to_crop_col, values_only=True, min_row=2))
+            except ValueError:
+                # NOTE(review): this only prints (and always names 'path_to_crop', whichever
+                # header was missing); the column variables stay unassigned, so the
+                # iter_cols calls below would then fail.
+                print("'path_to_crop' not found in the header row.")
+            # Each list holds a single tuple with the values of that column from row 2 on
+            path_to_crop = list(sheet.iter_cols(min_col=path_to_crop_col, max_col=path_to_crop_col, values_only=True, min_row=2))
+            # NOTE(review): the next three lists are not used below
+            path_to_original = list(sheet.iter_cols(min_col=path_to_original_col, max_col=path_to_original_col, values_only=True, min_row=2))
+            path_to_content = list(sheet.iter_cols(min_col=path_to_content_col, max_col=path_to_content_col, values_only=True, min_row=2))
+            path_to_helper = list(sheet.iter_cols(min_col=path_to_helper_col, max_col=path_to_helper_col, values_only=True, min_row=2))
+            others = [path_to_crop_col, path_to_original_col, path_to_content_col, path_to_helper_col]
+            jsons = [path_to_content_col, path_to_helper_col]
+            # Record the basename of every already-processed crop
+            # (assumes every cell holds a string path — TODO confirm; None would fail on `in`)
+            for cell in path_to_crop[0]:
+                old_path = cell
+                new_path = file_path
+                for dir_name in output_dir_names:
+                    if dir_name in old_path:
+                        old_path_parts = old_path.split(dir_name)
+                        new_path_parts = new_path.split('Transcription')
+                        updated_path = new_path_parts[0] + dir_name + old_path_parts[1]
+                        self.completed_specimens.append(os.path.basename(updated_path))
+            print(f"{len(self.completed_specimens)} images are already completed")
+            ### Copy the JSON files over
+            for colu in jsons:
+                # Only the first data row is inspected; its directory stands for the whole column
+                cell = next(sheet.iter_rows(min_row=2, min_col=colu, max_col=colu))[0]
+                old_path = cell.value
+                new_path = file_path
+                old_path_parts = old_path.split('Transcription')
+                new_path_parts = new_path.split('Transcription')
+                updated_path = new_path_parts[0] + 'Transcription' + old_path_parts[1]
+                # Copy files
+                old_dir = os.path.dirname(old_path)
+                new_dir = os.path.dirname(updated_path)
+                # Check if old_dir exists and it's a directory
+                if os.path.exists(old_dir) and os.path.isdir(old_dir):
+                    # Check if new_dir exists. If not, create it.
+                    if not os.path.exists(new_dir):
+                        os.makedirs(new_dir)
+                    # Iterate through all files in old_dir and copy each to new_dir
+                    for filename in os.listdir(old_dir):
+                        shutil.copy2(os.path.join(old_dir, filename), new_dir) # copy2 preserves metadata
+            ### Update the file names
+            for colu in others:
+                for row in sheet.iter_rows(min_row=2, min_col=colu, max_col=colu):
+                    for cell in row:
+                        old_path = cell.value
+                        new_path = file_path
+                        for dir_name in output_dir_names:
+                            if dir_name in old_path:
+                                # Keep the part after the known folder name, re-root it at the new run
+                                old_path_parts = old_path.split(dir_name)
+                                new_path_parts = new_path.split('Transcription')
+                                updated_path = new_path_parts[0] + dir_name + old_path_parts[1]
+                                cell.value = updated_path
+            show_head=True
+        else:
+            # Create a new workbook and select the active worksheet
+            workbook = Workbook()
+            sheet = workbook.active
+            # Write headers in the first row
+            for i, header in enumerate(headers, start=1):
+                sheet.cell(row=1, column=i, value=header)
+            self.completed_specimens = []
+        # Save the workbook
+        workbook.save(file_path)
+        if show_head:
+            print("continue_run_from_partial_xlsx:")
+            for i, row in enumerate(sheet.iter_rows(values_only=True)):
+                print(row)
+                if i == 3:  # stop after the first 4 rows (i is 0-indexed)
+                    print("\n")
+                    break
+    def add_data_to_excel_from_response(self, Dirs, path_transcription, response, WFO_record, GEO_record, usage_report,
+                                        MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, path_to_content, path_to_helper, nt_in, nt_out):
+        """Append one row for a transcribed specimen to the workbook at ``path_transcription``.
+
+        The workbook is loaded, one cell is written per recognised header of the
+        first row, and the workbook is saved again.
+
+        Args:
+            Dirs: Object whose ``run_name`` fills the "run_name" column.
+            path_transcription: Path of the workbook to load and save.
+            response: LLM output, either a mapping or a JSON string of one.
+            WFO_record: Mapping read with ``.get`` for the WFO_* columns.
+            GEO_record: Mapping read with ``.get`` for the GEO_* columns.
+            usage_report: Mapping read with ``.get`` for the usage columns.
+            MODEL_NAME_FORMATTED: Value written to the "LLM" column.
+            filename_without_extension: Written to "filename" and to any catalog-number column.
+            path_to_crop: Written to "path_to_crop"; also used to derive "path_to_original".
+            path_to_content: Written to "path_to_content".
+            path_to_helper: Written to "path_to_helper".
+            nt_in: Written to "tokens_in".
+            nt_out: Written to "tokens_out".
+
+        Returns:
+            None. Returns early, without writing anything, when ``response`` is a
+            string that is not valid JSON.
+        """
+        wb = openpyxl.load_workbook(path_transcription)
+        sheet = wb.active
+        # find the next empty row
+        next_row = sheet.max_row + 1
+        if isinstance(response, str):
+            try:
+                response = json.loads(response)
+            except json.JSONDecodeError:
+                print(f"Failed to parse response: {response}")
+                return
+        # iterate over headers in the first row
+        for i, header in enumerate(sheet[1], start=1):
+            # check if header value is in response keys
+            if (header.value in response) and (header.value not in self.catalog_name_options): ####################### Catalog Number pre-defined
+                # check if the response value is a dictionary
+                if isinstance(response[header.value], dict):
+                    # if it is a dictionary, extract the 'value' field
+                    cell_value = response[header.value].get('value', '')
+                else:
+                    # if it's not a dictionary, use it directly
+                    cell_value = response[header.value]
+                try:
+                    # write the value to the cell
+                    sheet.cell(row=next_row, column=i, value=cell_value)
+                # NOTE(review): bare except; falls back to the first element, presumably
+                # for list values that cannot be stored in a cell — confirm
+                except:
+                    sheet.cell(row=next_row, column=i, value=cell_value[0])
+            elif header.value in self.catalog_name_options:
+                # The catalog number is already known, so the filename is written as-is
+                # instead of the LLM prediction (prefix/suffix/numeric clean-up is not applied here)
+                sheet.cell(row=next_row, column=i, value=filename_without_extension)
+            elif header.value == "path_to_crop":
+                sheet.cell(row=next_row, column=i, value=path_to_crop)
+            elif header.value == "path_to_original":
+                if self.cfg['leafmachine']['use_RGB_label_images']:
+                    # Label crops: climb four directory levels from the crop before joining Original_Images
+                    fname = os.path.basename(path_to_crop)
+                    base = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(path_to_crop))))
+                    path_to_original = os.path.join(base, 'Original_Images', fname)
+                    sheet.cell(row=next_row, column=i, value=path_to_original)
+                else:
+                    # Original images: climb two directory levels from the image
+                    fname = os.path.basename(path_to_crop)
+                    base = os.path.dirname(os.path.dirname(path_to_crop))
+                    path_to_original = os.path.join(base, 'Original_Images', fname)
+                    sheet.cell(row=next_row, column=i, value=path_to_original)
+            elif header.value == "path_to_content":
+                sheet.cell(row=next_row, column=i, value=path_to_content)
+            elif header.value == "path_to_helper":
+                sheet.cell(row=next_row, column=i, value=path_to_helper)
+            elif header.value == "tokens_in":
+                sheet.cell(row=next_row, column=i, value=nt_in)
+            elif header.value == "tokens_out":
+                sheet.cell(row=next_row, column=i, value=nt_out)
+            elif header.value == "filename":
+                sheet.cell(row=next_row, column=i, value=filename_without_extension)
+            elif header.value == "prompt":
+                sheet.cell(row=next_row, column=i, value=os.path.basename(self.path_custom_prompts))
+            elif header.value == "run_name":
+                sheet.cell(row=next_row, column=i, value=Dirs.run_name)
+            elif header.value == "LM2_collage":
+                sheet.cell(row=next_row, column=i, value=self.cfg['leafmachine']['use_RGB_label_images'])
+            elif header.value == "OCR_method":
+                value_to_insert = self.cfg['leafmachine']['project']['OCR_option']
+                # A list of OCR options is stored as one '|'-separated string
+                if isinstance(value_to_insert, list):
+                    value_to_insert = '|'.join(map(str, value_to_insert))
+                sheet.cell(row=next_row, column=i, value=value_to_insert)
+            elif header.value == "OCR_double":
+                sheet.cell(row=next_row, column=i, value=self.cfg['leafmachine']['project']['double_OCR'])
+            elif header.value == "OCR_trOCR":
+                sheet.cell(row=next_row, column=i, value=self.cfg['leafmachine']['project']['do_use_trOCR'])
+            # WFO columns other than WFO_candidate_names are copied straight from WFO_record
+            elif header.value in self.wfo_headers_no_lists:
+                sheet.cell(row=next_row, column=i, value=WFO_record.get(header.value, ''))
+            elif header.value == "WFO_candidate_names":
+                candidate_names = WFO_record.get("WFO_candidate_names", '')
+                # Check if candidate_names is a list and convert to a string if it is
+                if isinstance(candidate_names, list):
+                    candidate_names_str = '|'.join(candidate_names)
+                else:
+                    candidate_names_str = candidate_names
+                sheet.cell(row=next_row, column=i, value=candidate_names_str)
+            # GEO columns are copied straight from GEO_record
+            elif header.value in self.geo_headers:
+                sheet.cell(row=next_row, column=i, value=GEO_record.get(header.value, ''))
+            # Usage columns are copied straight from usage_report
+            elif header.value in self.usage_headers:
+                sheet.cell(row=next_row, column=i, value=usage_report.get(header.value, ''))
+            elif header.value == "LLM":
+                sheet.cell(row=next_row, column=i, value=MODEL_NAME_FORMATTED)
+        # save the workbook
+        wb.save(path_transcription)
+    def has_API_key(self, val):
+        return isinstance(val, str) and bool(val.strip())
+        # if val != '':
+        #     return True
+        # else:
+        #     return False
+    def get_google_credentials(self): # Also used for google drive
+        if self.is_hf:
+            creds_json_str = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+            credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
+            return credentials
+        else:
+            with open(self.cfg_private['google']['GOOGLE_APPLICATION_CREDENTIALS'], 'r') as file:
+                data = json.load(file)
+                creds_json_str = json.dumps(data)
+                credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
+                os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = creds_json_str
+                return credentials
+    def set_API_keys(self):
+        """Load API credentials and initialise the clients that have a key.
+
+        With ``is_hf`` set, the values come from environment variables; otherwise
+        from PRIVATE_DATA.yaml in the project home directory. A ``has_key_*``
+        flag is stored for each service, and for services with a key the
+        matching client or environment variable is set up. ``self.llm`` holds the
+        Azure chat client when an Azure key is present, otherwise None.
+        """
+        if self.is_hf:
+            # The home directory is two levels above this file, replacing the constructor value
+            self.dir_home = os.path.dirname(os.path.dirname(__file__))
+            self.path_cfg_private = None
+            self.cfg_private = None
+            k_openai = os.getenv('OPENAI_API_KEY')
+            # NOTE(review): the Azure flag is derived from AZURE_API_VERSION here, while the
+            # client below reads AZURE_API_KEY and the local branch uses OPENAI_API_KEY_AZURE — confirm
+            k_openai_azure = os.getenv('AZURE_API_VERSION')
+            k_google_project_id = os.getenv('GOOGLE_PROJECT_ID')
+            k_google_location = os.getenv('GOOGLE_LOCATION')
+            k_google_application_credentials = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+            k_mistral = os.getenv('MISTRAL_API_KEY')
+            k_here = os.getenv('HERE_API_KEY')
+            # NOTE(review): this variable name differs from OPENCAGE_API_KEY, which the
+            # local branch exports further down — confirm
+            k_opencage = os.getenv('open_cage_geocode')
+        else:
+            self.dir_home = os.path.dirname(os.path.dirname(__file__))
+            self.path_cfg_private = os.path.join(self.dir_home, 'PRIVATE_DATA.yaml')
+            self.cfg_private = get_cfg_from_full_path(self.path_cfg_private)
+            k_openai = self.cfg_private['openai']['OPENAI_API_KEY']
+            k_openai_azure = self.cfg_private['openai_azure']['OPENAI_API_KEY_AZURE']
+            k_google_project_id = self.cfg_private['google']['GOOGLE_PROJECT_ID']
+            k_google_location = self.cfg_private['google']['GOOGLE_LOCATION']
+            k_google_application_credentials = self.cfg_private['google']['GOOGLE_APPLICATION_CREDENTIALS']
+            k_mistral = self.cfg_private['mistral']['MISTRAL_API_KEY']
+            k_here = self.cfg_private['here']['API_KEY']
+            k_opencage = self.cfg_private['open_cage_geocode']['API_KEY']
+        # A key counts as present when it is a non-blank string
+        self.has_key_openai = self.has_API_key(k_openai)
+        self.has_key_azure_openai = self.has_API_key(k_openai_azure)
+        self.llm = None
+        self.has_key_google_project_id = self.has_API_key(k_google_project_id)
+        self.has_key_google_location = self.has_API_key(k_google_location)
+        self.has_key_google_application_credentials = self.has_API_key(k_google_application_credentials)
+        self.has_key_mistral = self.has_API_key(k_mistral)
+        self.has_key_here = self.has_API_key(k_here)
+        self.has_key_open_cage_geocode = self.has_API_key(k_opencage)
+        ### Google - OCR, Palm2, Gemini
+        # Vertex AI is initialised only when credentials, project id and location are all present
+        if self.has_key_google_application_credentials and self.has_key_google_project_id and self.has_key_google_location:
+            if self.is_hf:
+                vertexai.init(project=os.getenv('GOOGLE_PROJECT_ID'), location=os.getenv('GOOGLE_LOCATION'), credentials=self.get_google_credentials())
+            else:
+                vertexai.init(project=k_google_project_id, location=k_google_location, credentials=self.get_google_credentials())
+                # GOOGLE_API_KEY is exported only in the local branch
+                os.environ['GOOGLE_API_KEY'] = self.cfg_private['google']['GOOGLE_PALM_API']
+        ### OpenAI
+        if self.has_key_openai:
+            if self.is_hf:
+                openai.api_key = os.getenv('OPENAI_API_KEY')
+            else:
+                openai.api_key = self.cfg_private['openai']['OPENAI_API_KEY']
+                os.environ["OPENAI_API_KEY"] = self.cfg_private['openai']['OPENAI_API_KEY']
+        ### OpenAI - Azure
+        if self.has_key_azure_openai:
+            if self.is_hf:
+                # Initialize the Azure OpenAI client
+                self.llm = AzureChatOpenAI(
+                    deployment_name = 'gpt-35-turbo',#'gpt-35-turbo',
+                    openai_api_version = os.getenv('AZURE_API_VERSION'),
+                    openai_api_key = os.getenv('AZURE_API_KEY'),
+                    azure_endpoint = os.getenv('AZURE_API_BASE'),
+                    openai_organization = os.getenv('AZURE_ORGANIZATION'),
+                )
+            else:
+                # Initialize the Azure OpenAI client
+                self.llm = AzureChatOpenAI(
+                    deployment_name = 'gpt-35-turbo',#'gpt-35-turbo',
+                    openai_api_version = self.cfg_private['openai_azure']['OPENAI_API_VERSION'],
+                    openai_api_key = self.cfg_private['openai_azure']['OPENAI_API_KEY_AZURE'],
+                    azure_endpoint = self.cfg_private['openai_azure']['OPENAI_API_BASE'],
+                    openai_organization = self.cfg_private['openai_azure']['OPENAI_ORGANIZATION'],
+                )
+        ### Mistral
+        if self.has_key_mistral:
+            if self.is_hf:
+                pass # Already set
+            else:
+                os.environ['MISTRAL_API_KEY'] = self.cfg_private['mistral']['MISTRAL_API_KEY']
+        ### HERE
+        if self.has_key_here:
+            if self.is_hf:
+                pass # Already set
+            else:
+                os.environ['HERE_APP_ID'] = self.cfg_private['here']['APP_ID']
+                os.environ['HERE_API_KEY'] = self.cfg_private['here']['API_KEY']
+        ### OpenCage
+        if self.has_key_open_cage_geocode:
+            if self.is_hf:
+                pass # Already set
+            else:
+                os.environ['OPENCAGE_API_KEY'] = self.cfg_private['open_cage_geocode']['API_KEY']
+    def clean_catalog_number(self, data, filename_without_extension):
+        #Cleans up the catalog number in data if it's a dict
+        def modify_catalog_key(catalog_key, filename_without_extension, data):
+            # Helper function to apply modifications on catalog number
+            if catalog_key not in data:
+                new_data = {catalog_key: None}
+                data = {**new_data, **data}
+            if self.prefix_removal:
+                filename_without_extension = filename_without_extension.replace(self.prefix_removal, "")
+            if self.suffix_removal:
+                filename_without_extension = filename_without_extension.replace(self.suffix_removal, "")
+            if self.catalog_numerical_only:
+                filename_without_extension = self.remove_non_numbers(data[catalog_key])
+            data[catalog_key] = filename_without_extension
+            return data
+        if isinstance(data, dict):
+            if self.headers_used == 'HEADERS_v1_n22':
+                return modify_catalog_key("Catalog Number", filename_without_extension, data)
+            elif self.headers_used in ['HEADERS_v2_n26', 'CUSTOM']:
+                return modify_catalog_key("filename", filename_without_extension, data)
+            else:
+                raise ValueError("Invalid headers used.")
+        else:
+            raise TypeError("Data is not of type dict.")
+    def write_json_to_file(self, filepath, data):
+        '''Writes dictionary data to a JSON file.'''
+        with open(filepath, 'w') as txt_file:
+            if isinstance(data, dict):
+                data = json.dumps(data, indent=4, sort_keys=False)
+            txt_file.write(data)
+    # def create_null_json(self):
+    #     return {}
+    def remove_non_numbers(self, s):
+        return ''.join([char for char in s if char.isdigit()])
+    def create_null_row(self, filename_without_extension, path_to_crop, path_to_content, path_to_helper):
+        json_dict = {header: '' for header in self.headers}
+        for header, value in json_dict.items():
+            if header == "path_to_crop":
+                json_dict[header] = path_to_crop
+            elif header == "path_to_original":
+                fname = os.path.basename(path_to_crop)
+                base = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(path_to_crop))))
+                path_to_original = os.path.join(base, 'Original_Images', fname)
+                json_dict[header] = path_to_original
+            elif header == "path_to_content":
+                json_dict[header] = path_to_content
+            elif header == "path_to_helper":
+                json_dict[header] = path_to_helper
+            elif header == "filename":
+                json_dict[header] = filename_without_extension
+            # "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement"
+            elif header == "WFO_exact_match":
+                json_dict[header] =''
+            elif header == "WFO_exact_match_name":
+                json_dict[header] = ''
+            elif header == "WFO_best_match":
+                json_dict[header] = ''
+            elif header == "WFO_candidate_names":
+                json_dict[header] = ''
+            elif header == "WFO_placement":
+                json_dict[header] = ''
+        return json_dict
+    ##################################################################################################################################
+    ##################################################     OCR      ##################################################################
+    ##################################################################################################################################
+    def perform_OCR_and_save_results(self, image_index, json_report, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds):
+        self.logger.info(f'Working on {image_index + 1}/{len(self.img_paths)} --- Starting OCR')
+        # self.OCR - None
+        ### Process_image() runs the OCR for text, handwriting, trOCR AND creates the overlay image
+        ocr_google = OCREngine(self.logger, json_report, self.dir_home, self.is_hf, self.path_to_crop, self.cfg, self.trOCR_model_version, self.trOCR_model, self.trOCR_processor, self.device)
+        ocr_google.process_image(self.do_create_OCR_helper_image, self.logger)
+        self.OCR = ocr_google.OCR
+        self.logger.info(f"Complete OCR text for LLM prompt:\n\n{self.OCR}\n\n")
+        self.write_json_to_file(txt_file_path_OCR, ocr_google.OCR_JSON_to_file)
+        self.logger.info(f'Working on {image_index + 1}/{len(self.img_paths)} --- Finished OCR')
+        if len(self.OCR) > 0:
+            ocr_google.overlay_image.save(jpg_file_path_OCR_helper)
+            OCR_bounds = {}
+            if ocr_google.hand_text_to_box_mapping is not None:
+                OCR_bounds['OCR_bounds_handwritten'] = ocr_google.hand_text_to_box_mapping
+            if ocr_google.normal_text_to_box_mapping is not None:
+                OCR_bounds['OCR_bounds_printed'] = ocr_google.normal_text_to_box_mapping
+            if ocr_google.trOCR_text_to_box_mapping is not None:
+                OCR_bounds['OCR_bounds_trOCR'] = ocr_google.trOCR_text_to_box_mapping
+            self.write_json_to_file(txt_file_path_OCR_bounds, OCR_bounds)
+            self.logger.info(f'Working on {image_index + 1}/{len(self.img_paths)} --- Saved OCR Overlay Image')
+        else:
+            pass ########################################################################################################################### fix logic for no OCR
+    ##################################################################################################################################
+    #######################################################  LLM Switchboard  ########################################################
+    ##################################################################################################################################
+    def send_to_LLM(self, is_azure, progress_report, json_report, model_name):
+        self.n_failed_LLM_calls = 0
+        self.n_failed_OCR = 0
+        final_JSON_response = None
+        final_WFO_record = None
+        final_GEO_record = None
+        self.initialize_token_counters()
+        self.update_progress_report_initial(progress_report)
+        MODEL_NAME_FORMATTED = ModelMaps.get_API_name(model_name)
+        name_parts = model_name.split("_")
+        self.setup_JSON_dict_structure()
+        Copy_Prompt = PromptCatalog()
+        Copy_Prompt.copy_prompt_template_to_new_dir(self.Dirs.transcription_prompt, self.path_custom_prompts)
+        if json_report:
+            json_report.set_text(text_main=f'Loading {MODEL_NAME_FORMATTED}')
+            json_report.set_JSON({}, {}, {})
+        # llm_model = self.initialize_llm_model(self.cfg, self.logger, MODEL_NAME_FORMATTED, self.JSON_dict_structure, name_parts, is_azure, self.llm, self.config_vals_for_permutation)
+        results_queue = Queue()
+        if json_report:
+            json_report.set_text(text_main='Sending batch to OCR and LLM')
+        num_files = len(self.img_paths)
+        # num_threads = min(num_files, 128)
+        num_threads = 128
+        counter = AtomicCounter()
+        # Setup for parallel execution
+        with ThreadPoolExecutor(max_workers=num_threads) as executor:
+            futures = [executor.submit(self.send_to_LLM_worker,
+                                    path_to_crop,
+                                    results_queue,
+                                    model_name,
+                                    MODEL_NAME_FORMATTED,
+                                    name_parts,
+                                    is_azure,
+                                    i
+                                ) for i, path_to_crop in enumerate(self.img_paths)]
+            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="task"):
+                try:
+                    # Here, you could also directly process results if they were not being put in a queue
+                    future.result()  # Forces a wait on the future and re-raises any exceptions
+                    new_value = counter.inc()
+                    try:
+                        if json_report:
+                            current_value = counter.value
+                            json_report.set_text(text_main=f'Completed {current_value} of {num_files}')
+                    except:
+                        pass
+                except Exception as e:
+                    # Log the error, possibly mark the task for retry, or handle it as necessary
+                    print(f"A task failed with exception: {e}")
+        # Process results from the queue
+        while not results_queue.empty():
+            response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report, path_to_crop, paths = results_queue.get()
+            self.n_failed_LLM_calls += 1 if response_candidate is None else 0
+            ### Estimate n tokens returned
+            self.logger.info(f'Prompt tokens IN --- {nt_in}')
+            self.logger.info(f'Prompt tokens OUT --- {nt_out}')
+            self.update_token_counters(nt_in, nt_out)
+            final_JSON_response, final_WFO_record, final_GEO_record = self.update_final_response(response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, paths, path_to_crop, nt_in, nt_out)
+            self.logger.info(f'Finished LLM call')
+            if json_report:
+                json_report.set_JSON(final_JSON_response, final_WFO_record, final_GEO_record)
+        if json_report:
+            json_report.set_text(text_main='Finished!')
+        self.update_progress_report_final(progress_report)
+        final_JSON_response = self.parse_final_json_response(final_JSON_response)
+        return final_JSON_response, final_WFO_record, final_GEO_record, self.total_tokens_in, self.total_tokens_out
+    def send_to_LLM_worker(self, path_to_crop, queue, model_name, MODEL_NAME_FORMATTED, name_parts, is_azure, i):
+        llm_model = self.initialize_llm_model(self.cfg, self.logger, MODEL_NAME_FORMATTED, self.JSON_dict_structure, name_parts, is_azure, self.llm, self.config_vals_for_permutation)
+        # self.update_progress_report_batch(progress_report, i)
+        if self.should_skip_specimen(path_to_crop):
+            self.log_skipping_specimen(path_to_crop)
+            return
+        paths = self.generate_paths(path_to_crop, i)
+        self.path_to_crop = path_to_crop
+        filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt = paths
+        # if json_report:
+            # json_report.set_text(text_main='Starting OCR')
+        self.perform_OCR_and_save_results(i, None, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds)
+        # if json_report:
+            # json_report.set_text(text_main='Finished OCR')
+        if not self.OCR:
+            self.n_failed_OCR += 1
+            response_candidate = None
+            nt_in = 0
+            nt_out = 0
+        else:
+            ### Format prompt
+            prompt = self.setup_prompt()
+            # prompt = remove_colons_and_double_apostrophes(prompt) # This is moved to utils_VV since it broke the json structure.
+            ### Send prompt to chosen LLM
+            self.logger.info(f'Waiting for {model_name} API call --- Using {MODEL_NAME_FORMATTED}')
+            if 'PALM2' in name_parts:
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_GooglePalm2(prompt, None, paths)
+            elif 'GEMINI' in name_parts:
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_GoogleGemini(prompt, None, paths)
+            elif 'MISTRAL' in name_parts and ('LOCAL' not in name_parts):
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_MistralAI(prompt, None, paths)
+            elif 'LOCAL' in name_parts:
+                if 'MISTRAL' in name_parts or 'MIXTRAL' in name_parts:
+                    if 'CPU' in name_parts:
+                        response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_local_cpu_MistralAI(prompt, None, paths)
+                    else:
+                        response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_local_MistralAI(prompt, None, paths)
+            else:
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_OpenAI(prompt, None, paths)
+        # Instead of directly updating shared resources, put the structured result in the queue
+        queue.put((response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report, path_to_crop, paths))
+    ##################################################################################################################################
+    ################################################## LLM Helper Funcs ##############################################################
+    ##################################################################################################################################
+    def initialize_llm_model(self, cfg, logger, model_name, JSON_dict_structure, name_parts, is_azure=None, llm_object=None, config_vals_for_permutation=None):
+        if 'LOCAL'in name_parts:
+            if ('MIXTRAL' in name_parts) or ('MISTRAL' in name_parts):
+                if 'CPU' in name_parts:
+                    return LocalCPUMistralHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
+                else:
+                    return LocalMistralHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
+        else:
+            if 'PALM2' in name_parts:
+                return GooglePalm2Handler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
+            elif 'GEMINI' in name_parts:
+                return GoogleGeminiHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
+            elif 'MISTRAL' in name_parts and ('LOCAL' not in name_parts):
+                return MistralHandler(cfg, logger, model_name, JSON_dict_structure, config_vals_for_permutation)
+            else:
+                return OpenAIHandler(cfg, logger, model_name, JSON_dict_structure, is_azure, llm_object, config_vals_for_permutation)
+    def setup_prompt(self):
+        Catalog = PromptCatalog()
+        prompt, _ = Catalog.prompt_SLTP(self.path_custom_prompts, OCR=self.OCR)
+        return prompt
+    def setup_JSON_dict_structure(self):
+        Catalog = PromptCatalog()
+        _, self.JSON_dict_structure = Catalog.prompt_SLTP(self.path_custom_prompts, OCR='Text')
+    def initialize_token_counters(self):
+        self.total_tokens_in = 0
+        self.total_tokens_out = 0
+    def update_progress_report_initial(self, progress_report):
+        if progress_report is not None:
+            progress_report.set_n_batches(len(self.img_paths))
+    def update_progress_report_batch(self, progress_report, batch_index):
+        if progress_report is not None:
+            progress_report.update_batch(f"Working on image {batch_index + 1} of {len(self.img_paths)}")
+    def should_skip_specimen(self, path_to_crop):
+        return os.path.basename(path_to_crop) in self.completed_specimens
+    def log_skipping_specimen(self, path_to_crop):
+        self.logger.info(f'[Skipping] specimen {os.path.basename(path_to_crop)} already processed')
+    def update_token_counters(self, nt_in, nt_out):
+        self.total_tokens_in += nt_in
+        self.total_tokens_out += nt_out
+    def update_final_response(self, response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, paths, path_to_crop, nt_in, nt_out):
+        filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt = paths
+        # Saving the JSON and XLSX files with the response and updating the final JSON response
+        if response_candidate is not None:
+            final_JSON_response_updated = self.save_json_and_xlsx(self.Dirs, response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
+            return final_JSON_response_updated, WFO_record, GEO_record
+        else:
+            final_JSON_response_updated = self.save_json_and_xlsx(self.Dirs, response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
+            return final_JSON_response_updated, WFO_record, GEO_record
+    def update_progress_report_final(self, progress_report):
+        if progress_report is not None:
+            progress_report.reset_batch("Batch Complete")
+    def parse_final_json_response(self, final_JSON_response):
+        try:
+            return json.loads(final_JSON_response.strip('```').replace('json\n', '', 1).replace('json', '', 1))
+        except:
+            return final_JSON_response
+    def generate_paths(self, path_to_crop, i):
+        filename_without_extension = os.path.splitext(os.path.basename(path_to_crop))[0]
+        txt_file_path = os.path.join(self.Dirs.transcription_ind, filename_without_extension + '.json')
+        txt_file_path_OCR = os.path.join(self.Dirs.transcription_ind_OCR, filename_without_extension + '.json')
+        txt_file_path_OCR_bounds = os.path.join(self.Dirs.transcription_ind_OCR_bounds, filename_without_extension + '.json')
+        jpg_file_path_OCR_helper = os.path.join(self.Dirs.transcription_ind_OCR_helper, filename_without_extension + '.jpg')
+        json_file_path_wiki = os.path.join(self.Dirs.transcription_ind_wiki, filename_without_extension + '.json')
+        txt_file_path_ind_prompt = os.path.join(self.Dirs.transcription_ind_prompt, filename_without_extension + '.txt')
+        self.logger.info(f'Working on {i+1}/{len(self.img_paths)} --- {filename_without_extension}')
+        return filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt
+    def save_json_and_xlsx(self, Dirs, response, WFO_record, GEO_record, usage_report,
+                           MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out):
+        if response is None:
+            response = self.JSON_dict_structure
+            # Insert 'filename' as the first key
+            response = {'filename': filename_without_extension, **{k: v for k, v in response.items() if k != 'filename'}}
+            self.write_json_to_file(txt_file_path, response)
+            # Then add the null info to the spreadsheet
+            response_null = self.create_null_row(filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper)
+            self.add_data_to_excel_from_response(Dirs, self.path_transcription, response_null, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in=0, nt_out=0)
+        ### Set completed JSON
+        else:
+            response = self.clean_catalog_number(response, filename_without_extension)
+            self.write_json_to_file(txt_file_path, response)
+            # add to the xlsx file
+            self.add_data_to_excel_from_response(Dirs, self.path_transcription, response, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
+        return response
+    def process_specimen_batch(self, progress_report, json_report, is_real_run=False):
+        if not self.has_key:
+            self.logger.error(f'No API key found for {self.version_name}')
+            raise Exception(f"No API key found for {self.version_name}")
+        try:
+            if is_real_run:
+                progress_report.update_overall(f"Transcribing Labels")
+            final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out = self.send_to_LLM(self.is_azure, progress_report, json_report, self.model_name)
+            return final_json_response, final_WFO_record, final_GEO_record, total_tokens_in, total_tokens_out
+        except Exception as e:
+            self.logger.error(f"LLM call failed in process_specimen_batch: {e}")
+            if progress_report is not None:
+                progress_report.reset_batch(f"Batch Failed")
+            self.close_logger_handlers()
+            raise
+    def close_logger_handlers(self):
+        for handler in self.logger.handlers[:]:
+            handler.close()
+            self.logger.removeHandler(handler)
+    # def process_specimen_batch_OCR_test(self, path_to_crop):
+    #     for img_filename in os.listdir(path_to_crop):
+    #         img_path = os.path.join(path_to_crop, img_filename)
+    #     self.OCR, self.bounds, self.text_to_box_mapping = detect_text(img_path)
+# https://gist.github.com/benhoyt/8c8a8d62debe8e5aa5340373f9c509c7
+class AtomicCounter(object):
+        """An atomic, thread-safe counter"""
+        def __init__(self, initial=0):
+            """Initialize a new atomic counter to given initial value"""
+            self._value = initial
+            self._lock = threading.Lock()
+        def inc(self, num=1):
+            """Atomically increment the counter by num and return the new value"""
+            with self._lock:
+                self._value += num
+                return self._value
+        def dec(self, num=1):
+            """Atomically decrement the counter by num and return the new value"""
+            with self._lock:
+                self._value -= num
+                return self._value
+        @property
+        def value(self):
+            return self._value
+def space_saver(cfg, Dirs, logger):
+    dir_out = cfg['leafmachine']['project']['dir_output']
+    run_name = Dirs.run_name
+    path_project = os.path.join(dir_out, run_name)
+    if cfg['leafmachine']['project']['delete_temps_keep_VVE']:
+        logger.name = '[DELETE TEMP FILES]'
+        logger.info("Deleting temporary files. Keeping files required for VoucherVisionEditor.")
+        delete_dirs = ['Archival_Components', 'Config_File']
+        for d in delete_dirs:
+            path_delete = os.path.join(path_project, d)
+            if os.path.exists(path_delete):
+                shutil.rmtree(path_delete)
+    elif cfg['leafmachine']['project']['delete_all_temps']:
+        logger.name = '[DELETE TEMP FILES]'
+        logger.info("Deleting ALL temporary files!")
+        delete_dirs = ['Archival_Components', 'Config_File', 'Original_Images', 'Cropped_Images']
+        for d in delete_dirs:
+            path_delete = os.path.join(path_project, d)
+            if os.path.exists(path_delete):
+                shutil.rmtree(path_delete)
+        # Delete the transctiption folder, but keep the xlsx
+        transcription_path = os.path.join(path_project, 'Transcription')
+        if os.path.exists(transcription_path):
+            for item in os.listdir(transcription_path):
+                item_path = os.path.join(transcription_path, item)
+                if os.path.isdir(item_path):  # if the item is a directory
+                    if os.path.exists(item_path):
+                        shutil.rmtree(item_path)  # delete the directory

vouchervision/vouchervision_main.py CHANGED Viewed

@@ -14,6 +14,7 @@ from vouchervision.data_project import Project_Info
 from vouchervision.LM2_logger import start_logging
 from vouchervision.fetch_data import fetch_data
 from vouchervision.utils_VoucherVision import VoucherVision, space_saver
 from vouchervision.utils_hf import upload_to_drive
 def voucher_vision(cfg_file_path, dir_home, path_custom_prompts, cfg_test, progress_report, json_report, path_api_cost=None, test_ind = None, is_hf = True, is_real_run=False):

 from vouchervision.LM2_logger import start_logging
 from vouchervision.fetch_data import fetch_data
 from vouchervision.utils_VoucherVision import VoucherVision, space_saver
+# from vouchervision.utils_VoucherVision_parallel import VoucherVision, space_saver
 from vouchervision.utils_hf import upload_to_drive
 def voucher_vision(cfg_file_path, dir_home, path_custom_prompts, cfg_test, progress_report, json_report, path_api_cost=None, test_ind = None, is_hf = True, is_real_run=False):

vouchervision/vouchervision_test_all_options_analysis.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+def SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_GPT4_SHORT():
+    #####################
+    # Load the Excel file
+    file_path = 'D:/Dropbox/VoucherVision/demo/validation_output/summary/SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_GPT4_SHORT.xlsx'
+    save_path = 'D:/Dropbox/VoucherVision/demo/validation_output/figures/avg_L_score_analysis_SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_GPT4_SHORT.png'
+    df = pd.read_excel(file_path)
+    # Display the first few rows of the dataframe to understand its structure
+    df.head()
+    # Grouping by the parameters and calculating the mean of avg_L_score for each group
+    grouped = df.groupby(['v_prompt_version', 'v_double_ocr', 'temperature', 'top_p'])['avg_L_score'].mean().reset_index()
+    # Finding the group with the highest average L score
+    max_avg_L_score = grouped['avg_L_score'].max()
+    best_group = grouped[grouped['avg_L_score'] == max_avg_L_score]
+    print(best_group)
+    ### Viz
+    # Filtering the dataset for the conditions mentioned
+    filtered_df = df[df['v_prompt_version'] == 'SLTPvB_long.yaml'][df['v_double_ocr'] == True]
+    # Setting up the plotting
+    plt.figure(figsize=(14, 6))
+    # Plot 1: avg_L_score as a function of temperature for each top_p value
+    plt.subplot(1, 2, 1)
+    sns.lineplot(data=filtered_df, x='temperature', y='avg_L_score', hue='top_p', marker='o')
+    plt.title('Average L Score by Temperature for each Top P')
+    plt.xlabel('Temperature')
+    plt.ylabel('Average L Score')
+    plt.legend(title='Top P', bbox_to_anchor=(1.05, 1), loc='upper left')
+    # Plot 2: avg_L_score as a function of top_p for each temperature value
+    plt.subplot(1, 2, 2)
+    sns.lineplot(data=filtered_df, x='top_p', y='avg_L_score', hue='temperature', marker='o')
+    plt.title('Average L Score by Top P for each Temperature')
+    plt.xlabel('Top P')
+    plt.ylabel('Average L Score')
+    plt.legend(title='Temperature', bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=600)
+def SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_google_SHORT():
+    #####################
+    # Load the Excel file
+    file_path = 'D:/Dropbox/VoucherVision/demo/validation_output/summary/SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_google_SHORT.xlsx'
+    save_path = 'D:/Dropbox/VoucherVision/demo/validation_output/figures/avg_L_score_analysis_SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_google_SHORT.png'
+    df = pd.read_excel(file_path)
+    # Display the first few rows of the dataframe to understand its structure
+    df.head()
+    # Grouping by the parameters and calculating the mean of avg_L_score for each group
+    grouped = df.groupby(['v_prompt_version', 'v_double_ocr', 'temperature', 'top_p'])['avg_L_score'].mean().reset_index()
+    # Finding the group with the highest average L score
+    max_avg_L_score = grouped['avg_L_score'].max()
+    best_group = grouped[grouped['avg_L_score'] == max_avg_L_score]
+    print(best_group)
+    ### Viz
+    # Filtering the dataset for the conditions mentioned
+    filtered_df = df[df['v_prompt_version'] == 'SLTPvB_long.yaml'][df['v_double_ocr'] == True]
+    # Setting up the plotting
+    plt.figure(figsize=(14, 6))
+    # Plot 1: avg_L_score as a function of temperature for each top_p value
+    plt.subplot(1, 2, 1)
+    sns.lineplot(data=filtered_df, x='temperature', y='avg_L_score', hue='top_p', marker='o')
+    plt.title('Average L Score by Temperature for each Top P')
+    plt.xlabel('Temperature')
+    plt.ylabel('Average L Score')
+    plt.legend(title='Top P', bbox_to_anchor=(1.05, 1), loc='upper left')
+    # Plot 2: avg_L_score as a function of top_p for each temperature value
+    plt.subplot(1, 2, 2)
+    sns.lineplot(data=filtered_df, x='top_p', y='avg_L_score', hue='temperature', marker='o')
+    plt.title('Average L Score by Top P for each Temperature')
+    plt.xlabel('Top P')
+    plt.ylabel('Average L Score')
+    plt.legend(title='Temperature', bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=600)
+if __name__ == '__main__':
+    # Script entry point: only the Google sweep summary is generated. The GPT-4 sweep
+    # call is kept commented out so it can be re-enabled by hand.
+    # SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_GPT4_SHORT()
+    SUMMARY_permute_llms_to_sweep_temperature_and_topP_for_google_SHORT()

vouchervision/vouchervision_test_all_options_recipes.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import os, inspect, sys, shutil
+class AllOptions():
+    a_llm = [
+        "GPT 4 Turbo 1106-preview",
+        "GPT 4 Turbo 0125-preview",
+        'GPT 4',
+        'GPT 4 32k',
+        'GPT 3.5',
+        'GPT 3.5 Instruct',
+        'Azure GPT 3.5',
+        'Azure GPT 3.5 Instruct',
+        'Azure GPT 4',
+        'Azure GPT 4 Turbo 1106-preview',
+        'Azure GPT 4 Turbo 0125-preview',
+        'Azure GPT 4 32k',
+        'PaLM 2 text-bison@001',
+        'PaLM 2 text-bison@002',
+        'PaLM 2 text-unicorn@001',
+        'Gemini Pro',
+        'Mistral Small',
+        'Mistral Medium',
+        'Mistral Large',
+        'Open Mixtral 8x7B',
+        'Open Mistral 7B',
+        'LOCAL Mixtral 8x7B Instruct v0.1',
+        'LOCAL Mistral 7B Instruct v0.2',
+        'LOCAL CPU Mistral 7B Instruct v0.2 GGUF',
+        ]
+    a_prompt_version = [
+        'SLTPvA_long.yaml',
+        'SLTPvA_medium.yaml',
+        'SLTPvA_short.yaml',
+        'SLTPvB_long.yaml',
+        'SLTPvB_medium.yaml',
+        'SLTPvB_short.yaml',
+        'SLTPvB_minimal.yaml',
+    ]
+    a_LM2 = [False,] # [True, False]
+    a_do_use_trOCR = [False,] # [True, False]
+    a_trocr_path = ["microsoft/trocr-large-handwritten",]
+    a_ocr_option = [
+        'hand',
+        'normal',
+        'CRAFT',
+        'LLaVA',
+        ['hand','CRAFT'],
+        ['hand','LLaVA'],
+        ]
+    a_llava_option = ["llava-v1.6-mistral-7b",
+                      "llava-v1.6-34b",
+                      "llava-v1.6-vicuna-13b",
+                      "llava-v1.6-vicuna-7b",]
+    a_llava_bit = ["full", "4bit",]
+    a_double_ocr = [True, False]
+class Options_permute_llms_to_investigate_determinism_at_restrictive_settings():
+    a_llm = [
+        # "GPT 4 Turbo 1106-preview",
+        # "GPT 4 Turbo 0125-preview",
+        # 'GPT 4',
+        # # 'GPT 4 32k',
+        # 'GPT 3.5 Turbo',
+        # 'GPT 3.5 Instruct',
+        'Azure GPT 3.5 Turbo',
+        'Azure GPT 3.5 Instruct',
+        'Azure GPT 4',
+        'Azure GPT 4 Turbo 1106-preview',
+        'Azure GPT 4 Turbo 0125-preview',
+        # 'Azure GPT 4 32k',
+        'PaLM 2 text-bison@001',
+        'PaLM 2 text-bison@002',
+        'PaLM 2 text-unicorn@001',
+        'Gemini Pro',
+        'Mistral Small',
+        'Mistral Medium',
+        'Mistral Large',
+        # 'Open Mixtral 8x7B',
+        'Open Mistral 7B',
+        # 'LOCAL Mixtral 8x7B Instruct v0.1',
+        # 'LOCAL Mistral 7B Instruct v0.2',
+        # 'LOCAL CPU Mistral 7B Instruct v0.2 GGUF',
+        ]
+    a_prompt_version = [
+        # 'SLTPvA_long.yaml',
+        # 'SLTPvA_short.yaml',
+        'SLTPvB_long.yaml',
+        'SLTPvB_short.yaml',
+        'SLTPvB_minimal.yaml',
+    ]
+    a_double_ocr = [True, False]
+    ### BELOW ARE STATIC
+    a_LM2 = [False,]
+    # a_do_use_trOCR = [True, False]
+    a_do_use_trOCR = [False,]
+    # a_trocr_path = ["microsoft/trocr-large-handwritten","microsoft/trocr-base-handwritten",]
+    a_trocr_path = ["microsoft/trocr-large-handwritten",]
+    a_ocr_option = ['hand',]
+    a_llava_option = ["llava-v1.6-mistral-7b",]
+    a_llava_bit = ["full",]
+class Options_permute_llms_to_sweep_temperature_and_topP_for_GPT4_0125():
+    a_llm = [
+        # 'Azure GPT 4 Turbo 0125-preview', #test 1
+        'Azure GPT 4',
+        ]
+    a_prompt_version = [
+        # 'SLTPvA_long.yaml',
+        # 'SLTPvA_short.yaml',
+        'SLTPvB_long.yaml',
+        'SLTPvB_short.yaml',
+        # 'SLTPvB_minimal.yaml',
+    ]
+    a_double_ocr = [True, False]
+    ### BELOW ARE STATIC
+    a_LM2 = [False,]
+    # a_do_use_trOCR = [True, False]
+    a_do_use_trOCR = [False,]
+    # a_trocr_path = ["microsoft/trocr-large-handwritten","microsoft/trocr-base-handwritten",]
+    a_trocr_path = ["microsoft/trocr-large-handwritten",]
+    a_ocr_option = ['hand',]
+    a_llava_option = ["llava-v1.6-mistral-7b",]
+    a_llava_bit = ["full",]
+class Options_permute_llms_to_sweep_temperature_and_topP_for_google():
+    a_llm = [
+        'PaLM 2 text-bison@001',
+        'PaLM 2 text-bison@002',
+        'Gemini Pro',
+        ]
+    a_prompt_version = [
+        'SLTPvB_long.yaml',
+        'SLTPvB_short.yaml',
+    ]
+    a_double_ocr = [True, False]
+    ### BELOW ARE STATIC
+    a_LM2 = [False,]
+    a_do_use_trOCR = [False,] # [True, False]
+    a_trocr_path = ["microsoft/trocr-large-handwritten",]
+    a_ocr_option = ['hand',]
+    a_llava_option = ["llava-v1.6-mistral-7b",]
+    a_llava_bit = ["full",]
+if __name__ == '__main__':
+    # Nothing to execute: the part of this module shown here only defines option classes.
+    pass