"""Expose a self-query retriever over the persisted question vector store.

Importing this module opens the Chroma store persisted in ``./chroma_db``.
When that store cannot be opened, or holds no documents yet, it is built from
the text file named by the ``DATA_PATH`` environment variable. The resulting
store is wrapped in a ``SelfQueryRetriever`` exposed as ``retriever``.
"""

import os

from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

from llm.gemini import gemini_embeddings, llm
from utils.questions_parser import parse_question

_PERSIST_DIRECTORY = "./chroma_db"
_QUESTION_MARKER = "##Questão"


def _load_question_documents(data_path):
    """Parse the file at *data_path* into one document per question.

    The file content is split on the question marker and every non-blank
    chunk is handed to ``parse_question`` with the marker restored. Parsing is
    best-effort: a chunk that fails to parse is reported on stdout and
    skipped so the remaining questions are still indexed.

    Args:
        data_path: Path of the UTF-8 text file holding the questions.

    Returns:
        The list of values returned by ``parse_question``, in file order.
    """
    pages = TextLoader(data_path, encoding="UTF-8").load()
    documents = []
    for chunk in pages[0].page_content.split(_QUESTION_MARKER):
        # ``split`` also yields the text before the first marker (usually an
        # empty string); a blank chunk is not a question, so do not parse it.
        if not chunk.strip():
            continue
        question = _QUESTION_MARKER + chunk
        try:
            documents.append(parse_question(question))
        except Exception as parse_error:
            print(parse_error, question)
    return documents


def _open_persisted_vectorstore():
    """Return the persisted store, or ``None`` when it is unusable or empty.

    Any error raised while opening or probing the store is printed and
    treated as "no store available" so the caller can rebuild it.
    """
    try:
        store = Chroma(
            persist_directory=_PERSIST_DIRECTORY,
            embedding_function=gemini_embeddings,
        )
        # Opening a directory without persisted data does not raise; it
        # yields an empty collection. Probe for at least one stored id so an
        # empty store is rebuilt instead of being served as-is.
        if store.get(limit=1)["ids"]:
            return store
    except Exception as open_error:
        print(open_error)
    return None


def _build_vectorstore():
    """Embed the questions from ``DATA_PATH`` and persist them to disk.

    Returns:
        The newly created Chroma store backed by ``./chroma_db``.

    Raises:
        ValueError: If the ``DATA_PATH`` environment variable is not set.
    """
    if "DATA_PATH" not in os.environ:
        raise ValueError("DATA_PATH environment variable is not set")
    documents = _load_question_documents(os.environ["DATA_PATH"])
    return Chroma.from_documents(
        documents=documents,
        embedding=gemini_embeddings,
        persist_directory=_PERSIST_DIRECTORY,
    )


vectorstore = _open_persisted_vectorstore()
if vectorstore is None:
    vectorstore = _build_vectorstore()

# Aliases kept so existing importers of these names keep working. They all
# point at the single persisted store, so the corpus is embedded only once.
db = vectorstore
vectorstore_disk = vectorstore

# Metadata fields the self-query LLM may filter on.
metadata_field_info = [
    AttributeInfo(
        name="topico",
        description="A materia escolar da qual a questão pertence.",
        type="string",
    ),
    AttributeInfo(
        name="assunto",
        description="O assunto da materia fornecida anteriormente.",
        type="string",
    ),
    AttributeInfo(
        name="dificuldade",
        description="O nivel de dificuldade para resolver a questao.",
        type="string",
    ),
    AttributeInfo(
        name="tipo",
        description="O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
        type="string",
    ),
]

document_content_description = "Questões de biologia"

retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
)