Spaces:
Sleeping
Sleeping
import os | |
from langchain_community.document_loaders import TextLoader | |
from langchain.vectorstores import Chroma | |
from langchain.chains.query_constructor.base import AttributeInfo | |
from langchain.retrievers.self_query.base import SelfQueryRetriever | |
from llm.gemini import gemini_embeddings, llm | |
from utils.questions_parser import parse_question | |
# Open the Chroma store persisted under ./chroma_db. If opening it raises,
# rebuild the store from the question file named by the DATA_PATH variable.
# NOTE(review): indentation was reconstructed; the rebuild is assumed to live
# inside the `except` branch -- confirm against the original file.
# NOTE(review): Chroma(...) may succeed with an empty collection when
# ./chroma_db does not exist yet, in which case this fallback never runs --
# verify against the Chroma documentation.
try:
    vectorstore = Chroma(
        persist_directory="./chroma_db", embedding_function=gemini_embeddings
    )
except Exception as load_error:
    print(load_error)

    DATA_PATH = os.environ.get("DATA_PATH")
    if DATA_PATH is None:
        raise ValueError("DATA_PATH environment variable is not set") from load_error

    data_loader = TextLoader(DATA_PATH, encoding="UTF-8").load()

    # Each question starts with the "##Questão" marker. str.split() removes the
    # marker, so it is put back on every piece. Blank pieces (such as the empty
    # string in front of the first marker) are skipped instead of being handed
    # to the parser.
    questions = [
        "##Questão" + fragment
        for fragment in data_loader[0].page_content.split("##Questão")
        if fragment.strip()
    ]

    docs = []
    for question in questions:
        try:
            docs.append(parse_question(question))
        except Exception as parse_error:
            # Best effort: report the question that failed to parse and go on.
            # A distinct name keeps the outer exception variable intact.
            print(parse_error, question)

    # Embed the documents exactly once, directly into the persisted store.
    # A second from_documents() call would run every document through the
    # embedding function again for no benefit.
    vectorstore = Chroma.from_documents(
        documents=docs, embedding=gemini_embeddings, persist_directory="./chroma_db"
    )
# Metadata attributes passed to SelfQueryRetriever.from_llm. Every attribute
# is declared with type "string", so only the name and its description are
# tabulated here; the insertion order of the table is the order of the list.
_FIELD_DESCRIPTIONS = {
    "topico": "A materia escolar da qual a questão pertence.",
    "assunto": "O assunto da materia fornecida anteriormente.",
    "dificuldade": "O nivel de dificuldade para resolver a questao.",
    "tipo": "O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
}
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in _FIELD_DESCRIPTIONS.items()
]
# Short description of the stored documents, passed to the retriever below.
document_content_description = "Questões de biologia"

# Build the self-query retriever from the LLM, the vector store, the content
# description and the metadata attribute list; verbose output is enabled.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)