Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,692 Bytes
f3d315e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
from colpali_manager import ColpaliManager
from milvus_manager import MilvusManager
from pdf_manager import PdfManager
import hashlib
# Module-level helper instances, constructed once when this module is imported
# and shared by every Middleware instance: Middleware.index uses both of them,
# Middleware.search uses only colpali_manager.
pdf_manager = PdfManager()
colpali_manager = ColpaliManager()
class Middleware:
    """Tie together PDF page export, ColPali embedding and Milvus storage.

    Each instance owns one ``MilvusManager`` whose database file name is
    derived from the ``id`` given to the constructor.  PDF handling and
    embedding are delegated to the module-level ``pdf_manager`` and
    ``colpali_manager`` objects.
    """

    def __init__(self, id: str, create_collection=True):
        """Create the Milvus manager backing this instance.

        Args:
            id: Arbitrary identifier; hashed to build the database file name.
                (The name shadows the ``id`` builtin but is kept for
                backward compatibility with keyword callers.)
            create_collection: Forwarded unchanged to ``MilvusManager``
                (presumably controls whether the collection is created --
                confirm against ``MilvusManager``).
        """
        milvus_db_name = self._milvus_db_name(id)
        # "colpali" is passed as the second positional argument; presumably
        # the collection name -- confirm against MilvusManager's signature.
        self.milvus_manager = MilvusManager(
            milvus_db_name, "colpali", create_collection
        )

    @staticmethod
    def _milvus_db_name(id: str) -> str:
        """Return ``milvus_<first 8 hex chars of md5(id)>.db``."""
        # MD5 is used only to turn the id into a short hexadecimal token for
        # a file name, not for security.  Declaring that keeps the call
        # usable on FIPS-restricted builds and does not change the digest.
        digest = hashlib.md5(id.encode(), usedforsecurity=False).hexdigest()
        return f"milvus_{digest[:8]}.db"

    @staticmethod
    def _pair_vectors_with_paths(colbert_vecs, image_paths):
        """Build one ``{"colbert_vecs", "filepath"}`` record per image.

        Raises:
            ValueError: If the number of vectors differs from the number of
                image paths; pairing them would mislabel or drop data.
        """
        if len(colbert_vecs) != len(image_paths):
            raise ValueError(
                f"Got {len(colbert_vecs)} embeddings for "
                f"{len(image_paths)} images"
            )
        return [
            {"colbert_vecs": vecs, "filepath": path}
            for vecs, path in zip(colbert_vecs, image_paths)
        ]

    def index(self, pdf_path: str, id: str, max_pages: int, pages: list[int] = None):
        """Export a PDF to images, embed them and insert them into Milvus.

        Args:
            pdf_path: Path of the PDF, forwarded to ``pdf_manager.save_images``.
            id: Identifier forwarded to ``pdf_manager.save_images``.
            max_pages: Forwarded to ``pdf_manager.save_images`` (presumably an
                upper bound on exported pages -- confirm against PdfManager).
            pages: Currently unused; accepted only for interface
                compatibility.

        Returns:
            The value returned by ``pdf_manager.save_images`` (treated here
            as a sequence of image file paths).

        Raises:
            ValueError: If the embedder returns a different number of
                embeddings than there are images.
        """
        print(f"Indexing {pdf_path}, id: {id}, max_pages: {max_pages}")

        image_paths = pdf_manager.save_images(id, pdf_path, max_pages)
        print(f"Saved {len(image_paths)} images")

        colbert_vecs = colpali_manager.process_images(image_paths)
        images_data = self._pair_vectors_with_paths(colbert_vecs, image_paths)

        print(f"Inserting {len(images_data)} images data to Milvus")
        self.milvus_manager.insert_images_data(images_data)
        print("Indexing completed")

        return image_paths

    def search(self, search_queries: list[str]):
        """Run each query against Milvus and collect the results.

        Args:
            search_queries: Query strings; each is embedded and searched
                separately, in order.

        Returns:
            A list with one entry per query, each being whatever
            ``self.milvus_manager.search(query_vec, topk=1)`` returned.
        """
        print(f"Searching for {len(search_queries)} queries")

        final_res = []
        for query in search_queries:
            print(f"Searching for query: {query}")
            # process_text receives a one-element batch; take its only result.
            query_vec = colpali_manager.process_text([query])[0]
            search_res = self.milvus_manager.search(query_vec, topk=1)
            print(f"Search result: {search_res} for query: {query}")
            final_res.append(search_res)

        return final_res
|