lucas-wa
Adding build options
d1cad4b
raw
history blame
2.11 kB
import os
from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from llm.gemini import gemini_embeddings, llm
from utils.questions_parser import parse_question
# Load the vector store persisted in ./chroma_db; if opening it raises, rebuild
# the store from the question file named by the DATA_PATH environment variable.
#
# NOTE(review): depending on the Chroma version, opening a persist directory
# that does not exist yet may create an empty store instead of raising, in
# which case the rebuild below is never reached -- confirm against the
# installed langchain/chromadb versions.
try:
    vectorstore = Chroma(
        persist_directory="./chroma_db", embedding_function=gemini_embeddings
    )
except Exception as load_error:
    # Best-effort load: report why the persisted store was unusable, then
    # fall through to a full rebuild.
    print(load_error)

    DATA_PATH = os.environ.get("DATA_PATH")
    if DATA_PATH is None:
        raise ValueError("DATA_PATH environment variable is not set")

    data_loader = TextLoader(DATA_PATH, encoding="UTF-8").load()

    # str.split always returns the text that precedes the first marker as its
    # first element (an empty string when the file starts with the marker).
    # That fragment is not a question, so it is skipped instead of being
    # turned into a bogus "##Questão..." entry and handed to the parser.
    question_bodies = data_loader[0].page_content.split("##Questão")[1:]
    questions = ["##Questão" + body for body in question_bodies]

    # Parse each question independently so that one malformed entry is
    # reported and skipped rather than aborting the whole build.
    docs = []
    for question in questions:
        try:
            docs.append(parse_question(question))
        except Exception as parse_error:
            print(parse_error, question)

    # Embed the documents once and persist them to ./chroma_db.
    vectorstore = Chroma.from_documents(
        documents=docs, embedding=gemini_embeddings, persist_directory="./chroma_db"
    )

    # `db` used to be a second, in-memory store built from the same documents,
    # which embedded every question twice. The name is kept for backward
    # compatibility but now refers to the persisted store.
    db = vectorstore

    # Handle on the store re-opened from the persist directory.
    vectorstore_disk = Chroma(
        persist_directory="./chroma_db", embedding_function=gemini_embeddings
    )
# Metadata attributes described to the retriever. Each (name, description)
# pair becomes one AttributeInfo; every attribute is declared as a "string".
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        ("topico", "A materia escolar da qual a questão pertence."),
        ("assunto", "O assunto da materia fornecida anteriormente."),
        ("dificuldade", "O nivel de dificuldade para resolver a questao."),
        (
            "tipo",
            "O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
        ),
    )
]
# Short description of the stored documents' content, passed to the retriever.
document_content_description = "Questões de biologia"

# Self-query retriever built from the `llm` imported from llm.gemini, the
# vector store prepared above, and the metadata attribute descriptions.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)