Commit 7dc6d22
luanpoppe committed · Parent(s): 834da99

feat: trying to improve system_prompt
Files changed:
- .gitignore +1 -0
- langchain_backend/main.py +18 -2
- langchain_backend/utils.py +48 -26
- resumos/views.py +32 -1
- setup/environment.py +2 -1
.gitignore CHANGED
@@ -139,6 +139,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+.venv*

 # Spyder project settings
 .spyderproject
langchain_backend/main.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary
+from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary, process_embedding_summary
 from langchain_backend import utils
 from langchain.chains import create_retrieval_chain
 from langchain_huggingface import HuggingFaceEmbeddings
@@ -54,4 +54,20 @@ def get_llm_answer_summary(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
     print('result: ', result)
     return result
     # Obs --> Para passar informações personalizadas --> chain = load_summarize_chain(llm, "refine", True, question_prompt=initial_prompt, refine_prompt=PromptTemplate.from_template(refine_prompt))
-    # Para ver mais opções --> Acessa a origem da função load_summarize_chain , e nela acessa a origem da função _load_refine_chain --> As opções são os parâmetros que esta última função recebe
+    # Para ver mais opções --> Acessa a origem da função load_summarize_chain , e nela acessa a origem da função _load_refine_chain --> As opções são os parâmetros que esta última função recebe
+
+def get_llm_answer_summary_with_embedding(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
+    print('model: ', model)
+    print('isIterativeRefinement: ', isIterativeRefinement)
+    print('\n\n\n')
+    pages = getPDF(pdf_url)
+    full_texto = ""
+    for p in pages:
+        full_texto += p.page_content
+    print('full_texto: ', full_texto)
+
+    rag_chain = process_embedding_summary(system_prompt, model)
+
+    results = rag_chain.invoke({"input": user_prompt, "context": pages})
+
+    return results
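Note on the new function: get_llm_answer_summary_with_embedding concatenates the PDF pages into full_texto but then calls process_embedding_summary(system_prompt, model) with two arguments, while utils.py below defines that function with three parameters (system_prompt, model_param, full_text), so the call as committed would raise a TypeError. A minimal sketch of the presumably intended wiring, under the assumption that full_texto is the missing argument:

    # Sketch (assumption): pass the accumulated text through to the
    # three-parameter process_embedding_summary defined in utils.py.
    rag_chain = process_embedding_summary(system_prompt, model, full_texto)
    results = rag_chain.invoke({"input": user_prompt, "context": pages})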
langchain_backend/utils.py CHANGED
@@ -11,6 +11,9 @@ from uuid import uuid4
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.pydantic_v1 import BaseModel, Field
 from typing import List
+import numpy as np
+import openai
+import pandas as pd

 os.environ["LANGCHAIN_TRACING_V2"]="true"
 os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
@@ -31,12 +34,7 @@ def getPDF(file_paths):
         pagesDoc = loader.load_and_split(text_splitter)
         pages = pages + pagesDoc

-
-    # loader = PyPDFLoader(file_paths, extract_images=False)
-    # pages = loader.load_and_split(text_splitter)
     for page in pages:
-        # print('\n')
-        # print('allIds: ', allIds)
         documentId = str(uuid4())
         allIds.append(documentId)
         page.id = documentId
@@ -70,7 +68,7 @@ def create_prompt_llm_chain(system_prompt, modelParam):

 def create_llm(modelParam):
     if modelParam == default_model:
-        return ChatOpenAI(model=modelParam)
+        return ChatOpenAI(model=modelParam, max_tokens=16384)
     else:
         return HuggingFaceEndpoint(
             repo_id=modelParam,
@@ -88,11 +86,38 @@ class Resumo(BaseModel):
     doutrina: str = Field()
     palavras_chave: List[str] = Field()

-def create_prompt_llm_chain_summary(system_prompt, modelParam):
-    model = create_llm(modelParam)
-
+def create_prompt_llm_chain_summary(system_prompt, model_param):
+    prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
+
+    question_answer_chain = create_stuff_documents_chain(prompt_and_llm["model"], prompt_and_llm["prompt"])
+    final_chain = question_answer_chain | JsonOutputParser(pydantic_object=Resumo)
+    return final_chain
+
+def process_embedding_summary(system_prompt, model_param, full_text):
+    prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
+
+    text_splitter=RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
+    docs = text_splitter.create_documents([full_text])
+    embeddings=get_embeddings([doc.page_content for doc in docs])
+
+    content_list = [doc.page_content for doc in docs]
+    df = pd.DataFrame(content_list, columns=['page_content'])
+    vectors = [embedding.embedding for embedding in embeddings]
+    array = np.array(vectors)
+    embeddings_series = pd.Series(list(array))
+    df['embeddings'] = embeddings_series
+
+
+def get_embeddings(text):
+    response = openai.embeddings.create(
+        model="text-embedding-3-small",
+        input=text
+    )
+    return response.data

+def create_prompt_and_llm(system_prompt, model_param):
+    model = create_llm(model_param)
+
     system_prompt = system_prompt + "\n\n" + "{context}"
     prompt = ChatPromptTemplate.from_messages(
         [
@@ -100,10 +125,7 @@ def create_prompt_llm_chain_summary(system_prompt, modelParam):
         ("human", "{input}"),
     ]
     )
-
-    final_chain = question_answer_chain | JsonOutputParser(pydantic_object=Resumo)
-    return final_chain
-
+    return {"model": model, "prompt": prompt}

 DEFAULT_SYSTEM_PROMPT = """

@@ -123,34 +145,34 @@ Before providing your summary, follow these steps:
 - How does this information relate to the specific summary request?
 - What additional context might be necessary to fully understand these points?

-3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information
+3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information. Prioritize information that is both relevant to the summary request and adds new insights not already covered.

 After completing these steps, generate the response with around 5000 characteres in BBcode format, as shown below:

 Example: :

 {{
-"nome_do_memorial": "[
+"nome_do_memorial": "[Insira aqui o número do memorial e da equipe] ",

 "argumentos": "
 [b]Argumento 1:[/b]
-Fundamento 1.1: [Descreva o fundamento]
-Fundamento 1.2: [Descreva o fundamento]
+Fundamento 1.1: [Descreva o fundamento de forma detalhada]
+Fundamento 1.2: [Descreva o fundamento de forma detalhada]
 [b]Argumento 2:[/b]
-Fundamento 2.1: [Descreva o fundamento]
-Fundamento 2.2: [Descreva o fundamento]",
+Fundamento 2.1: [Descreva o fundamento de forma detalhada]
+Fundamento 2.2: [Descreva o fundamento de forma detalhada]",

 "jurisprudencia": "
-[b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição
-[b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição
+[b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição de como a jurisprudência se aplica]
+[b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição de como a jurisprudência se aplica]",

 "doutrina": "
 [b]Autor 1:[/b] [Nome do autor]
 "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]
 [b]Autor 2:[/b] [Nome do autor]
 "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]",

 "palavras_chave": "
 [Palavra-chave 1]
 [Palavra-chave 2]
 [Palavra-chave 3]
@@ -160,13 +182,13 @@ Example: :

 Remember:
 - Always prioritize relevance to the summary request.
-- Be concise and avoid unnecessary verbosity.
 - Ensure your summary is well-structured and easy to understand.
 - Do not include any personal opinions or information not present in the original document.
 - If the summary request asks for a specific focus or perspective, make sure to address it directly.

-Your goal is to provide a comprehensive
+Your goal is to provide a comprehensive summary that accurately represents the document's content while meeting the specific needs outlined in the summary request.

 Do not pass in the response part of the instructions that you received
+Remember to generate the response with at least 5000 characteres
 The content to be summarized is as follows:
 """
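Note on process_embedding_summary: it splits the text, embeds every chunk, and stores the vectors in a pandas DataFrame, but it has no return statement, so it yields None and the rag_chain.invoke call in main.py would fail; the embeddings are also never used to select chunks yet. A minimal sketch of the retrieval step the function appears to be building toward, assuming the chunk embeddings are meant to rank chunks against the user question — the helper name, the top_k parameter, and the cosine-similarity scoring are illustrative assumptions, not committed code:

    import numpy as np
    import openai

    def rank_chunks_by_similarity(question, chunk_texts, chunk_vectors, top_k=5):
        # Embed the question with the same model used for the chunks.
        response = openai.embeddings.create(
            model="text-embedding-3-small",
            input=[question],
        )
        query_vec = np.array(response.data[0].embedding)

        # Cosine similarity between the question and every chunk vector.
        matrix = np.array(chunk_vectors)
        scores = matrix @ query_vec / (
            np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
        )

        # Indices of the top_k most similar chunks, best first.
        top_indices = np.argsort(scores)[::-1][:top_k]
        return [chunk_texts[i] for i in top_indices]

The selected chunks could then be wrapped in Documents and passed as "context" to the stuff-documents chain that create_prompt_and_llm prepares.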
resumos/views.py CHANGED
@@ -4,7 +4,7 @@ from rest_framework.response import Response

 from langchain_backend.utils import DEFAULT_SYSTEM_PROMPT
 from .serializer import ResumoPDFSerializer
-from langchain_backend.main import get_llm_answer_summary
+from langchain_backend.main import get_llm_answer_summary, get_llm_answer_summary_with_embedding
 from setup.environment import default_model
 from rest_framework.parsers import MultiPartParser
 from drf_spectacular.utils import extend_schema
@@ -37,6 +37,37 @@ class ResumoView(APIView):
             system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
             resposta_llm = get_llm_answer_summary(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])

+            for file in listaPDFs:
+                os.remove(file)
+
+            return Response({"resposta": resposta_llm})
+
+class ResumoEmbeddingView(APIView):
+    parser_classes = [MultiPartParser]
+
+    @extend_schema(
+        request=ResumoPDFSerializer,
+    )
+    def post(self, request):
+        serializer = ResumoPDFSerializer(data=request.data)
+        if serializer.is_valid(raise_exception=True):
+            listaPDFs = []
+            data = serializer.validated_data
+            model = serializer.validated_data.get("model", default_model)
+            print('serializer.validated_data: ', serializer.validated_data)
+
+            for file in serializer.validated_data['files']:
+                file.seek(0)
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:  # Create a temporary file to save the uploaded PDF
+                    for chunk in file.chunks():  # Write the uploaded file content to the temporary file
+                        temp_file.write(chunk)
+                    temp_file_path = temp_file.name  # Get the path of the temporary file
+                listaPDFs.append(temp_file_path)
+            print('listaPDFs: ', listaPDFs)
+
+            system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
+            resposta_llm = get_llm_answer_summary_with_embedding(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])
+
             for file in listaPDFs:
                 os.remove(file)

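The new ResumoEmbeddingView only becomes reachable once it is registered in the URL configuration, which this commit does not touch. A hypothetical registration in the usual Django layout — the module path and route strings are assumptions, not part of the commit:

    # resumos/urls.py (hypothetical): expose both summary endpoints.
    from django.urls import path
    from .views import ResumoView, ResumoEmbeddingView

    urlpatterns = [
        path("resumo/", ResumoView.as_view()),
        path("resumo-embedding/", ResumoEmbeddingView.as_view()),
    ]

The view body also relies on os and tempfile, which the surrounding file presumably already imports for the existing upload handling.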
setup/environment.py CHANGED
@@ -1 +1,2 @@
-default_model = "gpt-4o-mini"
+default_model = "gpt-4o-mini"
+# default_model = "gpt-4o"