Spaces:

sabazo
/

innofinderai

Sleeping

App Files Files

isayahc commited on Apr 28, 2024

Commit

152e491

1 Parent(s): 95b3333

now users can embed all of the iamges from a pdf

Browse files

Files changed (1) hide show

innovation_pathfinder_ai/vector_store/chroma_vector_store.py +53 -1

innovation_pathfinder_ai/vector_store/chroma_vector_store.py CHANGED Viewed

@@ -28,7 +28,7 @@ from innovation_pathfinder_ai.utils.utils import (
 )
 from innovation_pathfinder_ai.utils.image_processing.image_processing import (
-    caption_image
 )
 from typing import List, Optional, NoReturn
@@ -280,6 +280,58 @@ def add_image_to_vector_store(
         )
 if __name__ == "__main__":

 )
 from innovation_pathfinder_ai.utils.image_processing.image_processing import (
+    caption_image, extract_images_from_pdf
 )
 from typing import List, Optional, NoReturn
         )
+def add_images_to_vector_store(
+    collection_name:str,
+    pdf_location:str,
+    pdf_images_location:str,
+    vector_store_client:chromadb.PersistentClient)-> NoReturn:
+    meta_data = extract_images_from_pdf(
+        pdf_path=pdf_location,
+        output_folder=pdf_images_location,
+        )
+    embedding_function = SentenceTransformerEmbeddings(
+        model_name=os.getenv("EMBEDDING_MODEL"),
+        )
+    client = chromadb.PersistentClient(
+     path=persist_directory,
+    )
+    collection = client.get_or_create_collection(
+    name=collection_name,
+    )
+    captioned_images = []
+    for i in meta_data:
+        temp = caption_image(i["image_location"])
+        captioned_images.append(temp)
+    dict_with_added_captions = []
+    for d, caption in zip(meta_data, captioned_images):
+        d["image_caption"] = caption[0]['generated_text']
+        dict_with_added_captions.append(d)
+    doc_list = []
+    for i in dict_with_added_captions:
+        temp = Document(
+            page_content=i['image_caption'],
+            metadata=i
+        )
+        doc_list.append(temp)
+    collection.add(
+            ids=[generate_uuid() for i in doc_list], # give each document a uuid
+            documents=[i.page_content for i in doc_list], # contents of document
+            embeddings=[embedding_function.embed_query(i.page_content) for i in doc_list],
+            metadatas=[i.metadata for i in doc_list],  # type: ignore
+        )
 if __name__ == "__main__":