Spaces:
Sleeping
Sleeping
File size: 2,111 Bytes
6558cd8 d1cad4b 6558cd8 d1cad4b 6558cd8 d1cad4b 6558cd8 d1cad4b 6558cd8 d1cad4b 6558cd8 d1cad4b 6558cd8 d1cad4b 6558cd8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from llm.gemini import gemini_embeddings, llm
from utils.questions_parser import parse_question
# Open the Chroma store persisted under ./chroma_db. If opening it raises,
# rebuild the index from the question file named by the DATA_PATH
# environment variable and persist it to the same directory.
# NOTE(review): indentation was lost in the copy under review; the rebuild is
# assumed to be the fallback branch of this `except` -- confirm.
try:
    vectorstore = Chroma(
        persist_directory="./chroma_db", embedding_function=gemini_embeddings
    )
except Exception as e:
    # Best-effort fallback: report why the persisted store could not be opened.
    print(e)

    if "DATA_PATH" not in os.environ:
        raise ValueError("DATA_PATH environment variable is not set")
    DATA_PATH = os.environ["DATA_PATH"]

    # Only the first loaded document's text is used.
    data_loader = TextLoader(DATA_PATH, encoding="UTF-8").load()

    # str.split drops the marker, so it is glued back onto every piece.
    # Blank pieces (e.g. the empty string produced when the text starts with
    # the marker) are skipped so no bare-marker "question" is generated.
    questions = [
        "##Questão" + chunk
        for chunk in data_loader[0].page_content.split("##Questão")
        if chunk.strip()
    ]

    docs = []
    for question in questions:
        try:
            docs.append(parse_question(question))
        except Exception as parse_error:
            # Keep going: one unparsable question must not abort the rebuild.
            print(parse_error, question)

    # Embed the documents once and persist them.
    vectorstore = Chroma.from_documents(
        documents=docs, embedding=gemini_embeddings, persist_directory="./chroma_db"
    )
    # `db` used to be a second, in-memory index built from the same documents,
    # which embedded everything twice. The name is kept for compatibility and
    # now refers to the persisted store.
    db = vectorstore

    # Re-open the freshly persisted collection from disk.
    vectorstore_disk = Chroma(
        persist_directory="./chroma_db", embedding_function=gemini_embeddings
    )
# Descriptions of the document metadata fields, passed to
# SelfQueryRetriever.from_llm further down. Every field is declared with
# type "string"; only the name and description differ.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        ("topico", "A materia escolar da qual a questão pertence."),
        ("assunto", "O assunto da materia fornecida anteriormente."),
        ("dificuldade", "O nivel de dificuldade para resolver a questao."),
        (
            "tipo",
            "O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
        ),
    )
]
# Short description of the indexed documents, given to the retriever.
document_content_description = "Questões de biologia"

# Build the self-query retriever from the LLM, the vector store, the content
# description and the metadata field descriptions.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)
|