import glob import pandas as pd import json import os from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter from transformers import AutoTokenizer from torch import cuda from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings from langchain_community.vectorstores import Qdrant from qdrant_client import QdrantClient from auditqa.reports import files, report_list from langchain.docstore.document import Document import configparser # read all the necessary variables device = 'cuda' if cuda.is_available() else 'cpu' path_to_data = "./reports/" ##---------------------functions -------------------------------------------## def getconfig(configfile_path:str): """ Read the config file Params ---------------- configfile_path: file path of .cfg file """ config = configparser.ConfigParser() try: config.read_file(open(configfile_path)) return config except: logging.warning("config file not found") def open_file(filepath): with open(filepath) as file: simple_json = json.load(file) return simple_json def load_chunks(): """ this method reads through the files and report_list to create the vector database """ # we iterate through the files which contain information about its # 'source'=='category', 'subtype', these are used in UI for document selection # which will be used later for filtering database config = getconfig("./model_params.cfg") doc_processed = open_file(path_to_data + "docling_chunks.json" ) chunks_list = [] for doc in doc_processed: chunks_list.append(Document(page_content= doc['content'], metadata=doc['metadata'] )) # define embedding model embeddings = HuggingFaceEmbeddings( model_kwargs = {'device': device}, show_progress= True, encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE'))), 'batch_size':100}, model_name=config.get('retriever','MODEL') ) # placeholder for collection qdrant_collections = {} print("embeddings started") #batch_size = 1000 # Adjust this value based on your system's memory capacity #for i in range(0, len(chunks_list), batch_size): # print("embedding",(i+batch_size)/1000) # batch_docs = chunks_list[i:i+batch_size] # qdrant = Qdrant.from_documents( # batch_docs, embeddings, # path="/data/local_qdrant", # recreate_collection=False, # collection_name='reportsFeb2025', # ) qdrant_collections['docling'] = Qdrant.from_documents( chunks_list, embeddings, path="/data/local_qdrant", collection_name='docling', ) print(qdrant_collections) print("vector embeddings done") return qdrant_collections def load_new_chunks(): """ this method reads through the files and report_list to create the vector database """ # we iterate through the files which contain information about its # 'source'=='category', 'subtype', these are used in UI for document selection # which will be used later for filtering database config = getconfig("./model_params.cfg") files = pd.read_json("./axa_processed_chunks_update.json") all_documents= [] # iterate through 'source' for i in range(len(files)): # load the chunks try: doc_processed = open_file(path_to_data + "/chunks/"+ os.path.basename(files.loc[i,'chunks_filepath'])) doc_processed = doc_processed['paragraphs'] except Exception as e: print("Exception: ", e) print("chunks in subtype:", files.loc[i,'filename'], "are:",len(doc_processed)) # add metadata information for doc in doc_processed: all_documents.append(Document(page_content= str(doc['content']), metadata={"source": files.loc[i,'category'], "subtype":os.path.splitext(files.loc[i,'filename'])[0], "year":str(files.loc[i,'year']), "filename":files.loc[0,'filename'], "page":doc['metadata']['page'], "headings":doc['metadata']['headings']})) # convert list of list to flat list print("length of chunks:",len(all_documents)) # define embedding model embeddings = HuggingFaceEmbeddings( model_kwargs = {'device': device}, encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))}, model_name=config.get('retriever','MODEL') ) # placeholder for collection qdrant_collections = {} qdrant_collections['allreports'] = Qdrant.from_documents( all_documents, embeddings, path="/data/local_qdrant", collection_name='allreports', ) print(qdrant_collections) print("vector embeddings done") return qdrant_collections def get_local_qdrant(): """once the local qdrant server is created this is used to make the connection to exisitng server""" config = getconfig("./model_params.cfg") qdrant_collections = {} embeddings = HuggingFaceEmbeddings( model_kwargs = {'device': device}, encode_kwargs = {'normalize_embeddings': True}, model_name=config.get('retriever','MODEL')) client = QdrantClient(path="/data/local_qdrant") print("Collections in local Qdrant:",client.get_collections()) qdrant_collections['allreports'] = Qdrant(client=client, collection_name='allreports', embeddings=embeddings, ) return qdrant_collections