luanpoppe committed · Commit 7dc6d22 · Parent(s): 834da99

feat: trying to improve system_prompt
.gitignore CHANGED
@@ -139,6 +139,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+.venv*
 
 # Spyder project settings
 .spyderproject
langchain_backend/main.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary
+from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary, process_embedding_summary
 from langchain_backend import utils
 from langchain.chains import create_retrieval_chain
 from langchain_huggingface import HuggingFaceEmbeddings
@@ -54,4 +54,20 @@ def get_llm_answer_summary(system_prompt, user_prompt, pdf_url, model, isIterati
     print('result: ', result)
     return result
 # Note --> To pass custom information --> chain = load_summarize_chain(llm, "refine", True, question_prompt=initial_prompt, refine_prompt=PromptTemplate.from_template(refine_prompt))
-# For more options --> open the source of load_summarize_chain and, from there, the source of _load_refine_chain --> the options are the parameters that this last function accepts
+# For more options --> open the source of load_summarize_chain and, from there, the source of _load_refine_chain --> the options are the parameters that this last function accepts
+
+def get_llm_answer_summary_with_embedding(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
+    print('model: ', model)
+    print('isIterativeRefinement: ', isIterativeRefinement)
+    print('\n\n\n')
+    pages = getPDF(pdf_url)
+    full_texto = ""
+    for p in pages:
+        full_texto += p.page_content
+    print('full_texto: ', full_texto)
+
+    # Pass the full text that process_embedding_summary expects as its third argument.
+    rag_chain = process_embedding_summary(system_prompt, model, full_texto)
+
+    results = rag_chain.invoke({"input": user_prompt, "context": pages})
+
+    return results
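The comment above points at load_summarize_chain's custom-prompt parameters. A minimal, self-contained sketch of that pattern (the prompt texts and the "example.pdf" path are hypothetical, not part of this commit):

from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_backend.utils import create_llm, getPDF
from setup.environment import default_model

llm = create_llm(default_model)

# The refine chain fills {text} with each chunk and {existing_answer}
# with the summary built so far.
initial_prompt = PromptTemplate.from_template(
    "Write a concise summary of the following:\n\n{text}"
)
refine_prompt = (
    "Refine the existing summary with the new context.\n"
    "Existing summary: {existing_answer}\n"
    "New context: {text}"
)

chain = load_summarize_chain(
    llm,
    chain_type="refine",
    verbose=True,
    question_prompt=initial_prompt,
    refine_prompt=PromptTemplate.from_template(refine_prompt),
)
pages = getPDF(["example.pdf"])  # hypothetical input path
result = chain.invoke({"input_documents": pages})
print(result["output_text"])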
langchain_backend/utils.py CHANGED
@@ -11,6 +11,9 @@ from uuid import uuid4
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.pydantic_v1 import BaseModel, Field
 from typing import List
+import numpy as np
+import openai
+import pandas as pd
 
 os.environ["LANGCHAIN_TRACING_V2"]="true"
 os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
@@ -31,12 +34,7 @@ def getPDF(file_paths):
         pagesDoc = loader.load_and_split(text_splitter)
         pages = pages + pagesDoc
 
-
-    # loader = PyPDFLoader(file_paths, extract_images=False)
-    # pages = loader.load_and_split(text_splitter)
     for page in pages:
-        # print('\n')
-        # print('allIds: ', allIds)
         documentId = str(uuid4())
         allIds.append(documentId)
         page.id = documentId
@@ -70,7 +68,7 @@ def create_prompt_llm_chain(system_prompt, modelParam):
 
 def create_llm(modelParam):
     if modelParam == default_model:
-        return ChatOpenAI(model=modelParam)
+        return ChatOpenAI(model=modelParam, max_tokens=16384)
     else:
         return HuggingFaceEndpoint(
            repo_id=modelParam,
@@ -88,11 +86,38 @@ class Resumo(BaseModel):
     doutrina: str = Field()
     palavras_chave: List[str] = Field()
 
-def create_prompt_llm_chain_summary(system_prompt, modelParam):
-    model = create_llm(modelParam)
+def create_prompt_llm_chain_summary(system_prompt, model_param):
+    prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
+
+    question_answer_chain = create_stuff_documents_chain(prompt_and_llm["model"], prompt_and_llm["prompt"])
+    final_chain = question_answer_chain | JsonOutputParser(pydantic_object=Resumo)
+    return final_chain
+
+def process_embedding_summary(system_prompt, model_param, full_text):
+    prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
 
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
+    docs = text_splitter.create_documents([full_text])
+    embeddings = get_embeddings([doc.page_content for doc in docs])
 
+    content_list = [doc.page_content for doc in docs]
+    df = pd.DataFrame(content_list, columns=['page_content'])
+    vectors = [embedding.embedding for embedding in embeddings]
+    array = np.array(vectors)
+    embeddings_series = pd.Series(list(array))
+    df['embeddings'] = embeddings_series
+    # The embeddings DataFrame is not consumed yet; return a chain so the caller's invoke() works.
+    return create_stuff_documents_chain(prompt_and_llm["model"], prompt_and_llm["prompt"])
+
+def get_embeddings(text):
+    response = openai.embeddings.create(
+        model="text-embedding-3-small",
+        input=text
+    )
+    return response.data
 
+def create_prompt_and_llm(system_prompt, model_param):
+    model = create_llm(model_param)
+
     system_prompt = system_prompt + "\n\n" + "{context}"
     prompt = ChatPromptTemplate.from_messages(
         [
@@ -100,10 +125,7 @@ def create_prompt_llm_chain_summary(system_prompt, modelParam):
             ("human", "{input}"),
         ]
     )
-    question_answer_chain = create_stuff_documents_chain(model, prompt)
-    final_chain = question_answer_chain | JsonOutputParser(pydantic_object=Resumo)
-    return final_chain
-
+    return {"model": model, "prompt": prompt}
 
 DEFAULT_SYSTEM_PROMPT = """
 
@@ -123,34 +145,34 @@ Before providing your summary, follow these steps:
 - How does this information relate to the specific summary request?
 - What additional context might be necessary to fully understand these points?
 
-3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information while avoiding redundancy. Prioritize information that is both relevant to the summary request and adds new insights not already covered.
+3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information. Prioritize information that is both relevant to the summary request and adds new insights not already covered.
 
 After completing these steps, generate the response with around 5000 characters in BBCode format, as shown below:
 
 Example:
 
 {{
-"nome_do_memorial": "[b]Nome do Memorial:[/b] [Insira aqui o nome do memorial e número da equipe] ",
+"nome_do_memorial": "[Insira aqui o número do memorial e da equipe] ",
 
-"argumentos": "[size=4][b]1. Argumentos[/b][/size]
+"argumentos": "
 [b]Argumento 1:[/b]
-Fundamento 1.1: [Descreva o fundamento]
-Fundamento 1.2: [Descreva o fundamento]
+Fundamento 1.1: [Descreva o fundamento de forma detalhada]
+Fundamento 1.2: [Descreva o fundamento de forma detalhada]
 [b]Argumento 2:[/b]
-Fundamento 2.1: [Descreva o fundamento]
-Fundamento 2.2: [Descreva o fundamento]",
+Fundamento 2.1: [Descreva o fundamento de forma detalhada]
+Fundamento 2.2: [Descreva o fundamento de forma detalhada]",
 
-"jurisprudencia": "[size=4][b]2. Jurisprudência Aplicada[/b][/size]
-[b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição resumida de como a jurisprudência se aplica]
-[b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição resumida de como a jurisprudência se aplica][size=4]",
+"jurisprudencia": "
+[b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição de como a jurisprudência se aplica]
+[b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição de como a jurisprudência se aplica]",
 
-"doutrina": "[b]3. Doutrina Relevante[/b][/size]
+"doutrina": "
 [b]Autor 1:[/b] [Nome do autor]
 "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]
 [b]Autor 2:[/b] [Nome do autor]
 "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]",
 
-"palavras_chave": "[size=4][b]4. Palavras-chave Principais[/b][/size]
+"palavras_chave": "
 [Palavra-chave 1]
 [Palavra-chave 2]
 [Palavra-chave 3]
@@ -160,13 +182,13 @@ Example:
 
 Remember:
 - Always prioritize relevance to the summary request.
-- Be concise and avoid unnecessary verbosity.
 - Ensure your summary is well-structured and easy to understand.
 - Do not include any personal opinions or information not present in the original document.
 - If the summary request asks for a specific focus or perspective, make sure to address it directly.
 
-Your goal is to provide a comprehensive yet concise summary that accurately represents the document's content while meeting the specific needs outlined in the summary request.
+Your goal is to provide a comprehensive summary that accurately represents the document's content while meeting the specific needs outlined in the summary request.
 
 Do not include in your response any part of the instructions you received.
+Remember to generate the response with at least 5000 characters.
 The content to be summarized is as follows:
 """
resumos/views.py CHANGED
@@ -4,7 +4,7 @@ from rest_framework.response import Response
 
 from langchain_backend.utils import DEFAULT_SYSTEM_PROMPT
 from .serializer import ResumoPDFSerializer
-from langchain_backend.main import get_llm_answer_summary
+from langchain_backend.main import get_llm_answer_summary, get_llm_answer_summary_with_embedding
 from setup.environment import default_model
 from rest_framework.parsers import MultiPartParser
 from drf_spectacular.utils import extend_schema
@@ -37,6 +37,37 @@ class ResumoView(APIView):
             system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
             resposta_llm = get_llm_answer_summary(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])
 
+            for file in listaPDFs:
+                os.remove(file)
+
+            return Response({"resposta": resposta_llm})
+
+class ResumoEmbeddingView(APIView):
+    parser_classes = [MultiPartParser]
+
+    @extend_schema(
+        request=ResumoPDFSerializer,
+    )
+    def post(self, request):
+        serializer = ResumoPDFSerializer(data=request.data)
+        if serializer.is_valid(raise_exception=True):
+            listaPDFs = []
+            data = serializer.validated_data
+            model = serializer.validated_data.get("model", default_model)
+            print('serializer.validated_data: ', serializer.validated_data)
+
+            for file in serializer.validated_data['files']:
+                file.seek(0)
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:  # Create a temporary file to save the uploaded PDF
+                    for chunk in file.chunks():  # Write the uploaded file content to the temporary file
+                        temp_file.write(chunk)
+                    temp_file_path = temp_file.name  # Get the path of the temporary file
+                listaPDFs.append(temp_file_path)
+            print('listaPDFs: ', listaPDFs)
+
+            system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
+            resposta_llm = get_llm_answer_summary_with_embedding(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])
+
             for file in listaPDFs:
                 os.remove(file)
 
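Once the new view is wired into urls.py (the route is not part of this commit), it could be exercised roughly as follows; the /resumo-embedding/ path and host are assumptions, while the field names follow the serializer usage above:

import requests

URL = "http://localhost:8000/resumo-embedding/"  # assumed route

with open("example.pdf", "rb") as pdf:  # hypothetical input file
    response = requests.post(
        URL,
        data={
            "user_message": "Summarize the main arguments of this document.",
            "iterative_refinement": "false",
        },
        files={"files": pdf},
    )
print(response.json()["resposta"])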
setup/environment.py CHANGED
@@ -1 +1,2 @@
-default_model = "gpt-4o-mini"
+default_model = "gpt-4o-mini"
+# default_model = "gpt-4o"