luanpoppe commited on
Commit
1fd7b67
·
1 Parent(s): f22dc64

feat: adicionando resumo do cursor

Browse files
_utils/resumo_completo_cursor.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Dict, Tuple
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.document_loaders import PyPDFLoader
5
+ from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.chains import create_extraction_chain
9
+ from langchain.prompts import PromptTemplate
10
+ from dataclasses import dataclass
11
+ import uuid
12
+ import json
13
+ from langchain_huggingface import HuggingFaceEndpoint
14
+ from setup.environment import default_model
15
+
16
# LangSmith tracing configuration. These assignments run as a side effect of
# importing this module and overwrite any values already in the environment.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# NOTE(review): LANGCHAIN_API_KEY is not set here; it is presumably expected to
# be supplied by the deployment environment. The original bare
# `os.environ.get("LANGCHAIN_API_KEY")` discarded its result and did nothing,
# so it was removed.
os.environ["LANGCHAIN_PROJECT"] = "VELLA"
20
+
21
@dataclass
class DocumentChunk:
    """A piece of text cut from one PDF page, together with its origin."""

    # Text of the chunk as returned by the text splitter.
    content: str
    # 1-based number of the page the chunk was taken from.
    page_number: int
    # Unique identifier (a UUID4 string) used to look the chunk up later.
    chunk_id: str
    # Character offset where the chunk starts, counted across the page texts
    # of the document in loading order.
    start_char: int
    # Character offset just past the end of the chunk, on the same scale.
    end_char: int
28
+
29
class DocumentSummarizer:
    """Summarize PDF documents with an LLM and attach source citations.

    The workflow is: split each PDF into chunks
    (``load_and_split_document``), index them in a Chroma vector store
    (``create_vector_store``), then retrieve the chunks closest to a query
    and ask the configured LLM for a summary
    (``generate_summary_with_sources``).
    """

    def __init__(self, openai_api_key: str, model, embedding, chunk_config, system_prompt):
        """Store the configuration and build the embedding model and splitter.

        Args:
            openai_api_key: Key passed to ``ChatOpenAI`` when the default
                model is used.
            model: Model identifier. When it equals ``default_model`` an
                OpenAI chat model is used; otherwise it is used as a
                Hugging Face ``repo_id``.
            embedding: Model name handed to ``HuggingFaceEmbeddings``.
            chunk_config: Mapping with the keys ``"size"`` and ``"overlap"``
                for the text splitter.
            system_prompt: Prompt template text; it is formatted with a
                single ``context`` variable.
        """
        self.model = model
        self.system_prompt = system_prompt
        self.openai_api_key = openai_api_key
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_config["size"],
            chunk_overlap=chunk_config["overlap"],
        )
        # chunk_id -> {'page', 'start_char', 'end_char'}, kept for tracing.
        self.chunk_metadata: Dict[str, Dict[str, int]] = {}

    @staticmethod
    def _find_chunk_span(text: str, chunk: str, search_from: int = 0) -> Tuple[int, int]:
        """Return ``(start, end)`` offsets of ``chunk`` inside ``text``.

        The search starts at ``search_from`` so that a chunk whose text also
        occurs earlier on the page is not mapped to that earlier occurrence.
        If the chunk cannot be found going forward, the whole text is
        searched; if it is still not found, ``search_from`` is used as a
        best-effort start so a negative offset is never produced.
        """
        start = text.find(chunk, search_from)
        if start == -1:
            start = text.find(chunk)
        if start == -1:
            start = max(search_from, 0)
        return start, start + len(chunk)

    def load_and_split_document(self, pdf_path: str) -> List["DocumentChunk"]:
        """Load a PDF and split every page into chunks with metadata.

        Each chunk's metadata is also recorded in ``self.chunk_metadata``
        under its ``chunk_id``.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            The chunks of all pages, in page order.
        """
        pages = PyPDFLoader(pdf_path).load()
        chunks = []
        char_count = 0  # characters contained in the pages processed so far

        for page in pages:
            text = page.page_content
            # 1-based page numbering; a missing 'page' entry counts as page 0
            # instead of raising TypeError on `None + 1`.
            page_number = (page.metadata.get('page') or 0) + 1
            search_from = 0

            for chunk in self.text_splitter.split_text(text):
                start_char, end_char = self._find_chunk_span(text, chunk, search_from)
                # Chunks may overlap, so the next one can begin anywhere
                # after the start of the current one.
                search_from = start_char + 1

                doc_chunk = DocumentChunk(
                    content=chunk,
                    page_number=page_number,
                    chunk_id=str(uuid.uuid4()),
                    start_char=char_count + start_char,
                    end_char=char_count + end_char,
                )
                chunks.append(doc_chunk)

                # Store metadata for later retrieval
                self.chunk_metadata[doc_chunk.chunk_id] = {
                    'page': doc_chunk.page_number,
                    'start_char': doc_chunk.start_char,
                    'end_char': doc_chunk.end_char,
                }

            char_count += len(text)

        return chunks

    def create_vector_store(self, chunks: List["DocumentChunk"]) -> "Chroma":
        """Create a Chroma vector store from the chunks, keeping their metadata.

        Args:
            chunks: Chunks produced by ``load_and_split_document``.

        Returns:
            A vector store holding one entry per chunk.
        """
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                'chunk_id': chunk.chunk_id,
                'page': chunk.page_number,
                'start_char': chunk.start_char,
                'end_char': chunk.end_char,
            }
            for chunk in chunks
        ]
        return Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=self.embeddings,
        )

    def _build_llm(self):
        """Instantiate the LLM client selected by ``self.model``."""
        if self.model == default_model:
            return ChatOpenAI(
                temperature=0,
                model_name="gpt-4o-mini",
                api_key=self.openai_api_key,
            )
        return HuggingFaceEndpoint(
            repo_id=self.model,
            task="text-generation",
            max_new_tokens=1100,
            do_sample=False,
            huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
        )

    @staticmethod
    def _attach_sources(summaries: List[str], sources: List[Dict]) -> List[Dict]:
        """Pair each summary paragraph with a retrieved source.

        Paragraph ``i`` is paired with source ``i``; paragraphs beyond the
        number of sources all reuse the last source. With no sources there
        is nothing to cite, so an empty list is returned.
        """
        if not sources:
            return []

        last_index = len(sources) - 1
        structured_output = []
        for idx, summary in enumerate(summaries):
            source = sources[min(idx, last_index)]
            structured_output.append({
                "content": summary,
                "source": {
                    "page": source['page'],
                    "text": source['content'][:200] + "...",
                    "relevance_score": source['relevance_score'],
                },
            })
        return structured_output

    def generate_summary_with_sources(
        self,
        vector_store: "Chroma",
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate a summary with source citations as JSON-like data.

        Args:
            vector_store: Store to retrieve the context chunks from.
            query: Text used for the similarity search (top 5 results).

        Returns:
            One dict per summary paragraph with the keys ``"content"`` and
            ``"source"`` (``"page"``, ``"text"``, ``"relevance_score"``).
            An empty list when the store returns no documents.
        """
        # NOTE(review): the score is whatever the vector store returns; for
        # Chroma this is presumably a distance (lower means closer) - confirm
        # before presenting it to users as a relevance value.
        relevant_docs = vector_store.similarity_search_with_score(query, k=5)
        if not relevant_docs:
            # Without any retrieved chunk there is no context to summarize
            # and no source to cite; skip the LLM call entirely.
            return []

        sources = [
            {
                'content': doc.page_content,
                'page': doc.metadata['page'],
                'chunk_id': doc.metadata['chunk_id'],
                'relevance_score': score,
            }
            for doc, score in relevant_docs
        ]

        prompt = PromptTemplate(
            template=self.system_prompt,
            input_variables=["context"],
        )
        context = "\n\n".join(source['content'] for source in sources)
        response = self._build_llm().predict(prompt.format(context=context))

        # Each non-empty paragraph of the response becomes one summary item.
        summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
        return self._attach_sources(summaries, sources)

    def get_source_context(self, chunk_id: str, window: int = 100) -> Dict:
        """Return the stored location metadata of a chunk.

        Args:
            chunk_id: Identifier assigned in ``load_and_split_document``.
            window: Currently unused; kept so existing callers keep working.

        Returns:
            A dict with ``'page'``, ``'start_char'`` and ``'end_char'``, or
            ``None`` when the chunk id is unknown.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None

        return {
            'page': metadata['page'],
            'start_char': metadata['start_char'],
            'end_char': metadata['end_char'],
        }
175
+
176
def get_llm_summary_answer_by_cursor(serializer, listaPDFs):
    """Summarize the given PDFs and return the summary with its sources.

    Args:
        serializer: Mapping providing the keys ``"hf_embedding"``,
            ``"chunk_size"``, ``"chunk_overlap"``, ``"system_prompt"`` and
            ``"model"``.
        listaPDFs: Paths of the PDF files to summarize.

    Returns:
        The structured summaries, which can be sent to the frontend as they
        are. They have the format::

            [
                {
                    "content": "Summary point 1...",
                    "source": {
                        "page": 1,
                        "text": "Source text...",
                        "relevance_score": 0.95
                    }
                },
                ...
            ]
    """
    # By Luan
    # Initialize summarizer
    summarizer = DocumentSummarizer(
        openai_api_key=os.environ.get("OPENAI_API_KEY"),
        embedding=serializer["hf_embedding"],
        chunk_config={
            "size": serializer["chunk_size"],
            "overlap": serializer["chunk_overlap"],
        },
        system_prompt=serializer["system_prompt"],
        model=serializer["model"],
    )

    # Load every document and gather the chunks of all of them in one list
    all_chunks = []
    for pdf_path in listaPDFs:
        all_chunks.extend(summarizer.load_and_split_document(pdf_path))

    store = summarizer.create_vector_store(all_chunks)

    # Generate structured summary
    structured_summaries = summarizer.generate_summary_with_sources(store)

    # Echo the result as JSON on stdout before returning it
    print("\n\n")
    print(json.dumps(structured_summaries))
    return structured_summaries
219
+
220
if __name__ == "__main__":
    # Command-line entry point: summarize the PDF files given as arguments.
    # The original called get_llm_summary_answer_by_cursor() with no
    # arguments, which raised TypeError because both parameters are required.
    import sys

    pdf_paths = sys.argv[1:]
    if not pdf_paths:
        raise SystemExit(
            "usage: python resumo_completo_cursor.py FILE.pdf [FILE.pdf ...]"
        )

    # Same defaults as the ones declared on ResumoCursorSerializer.
    default_settings = {
        "hf_embedding": "all-MiniLM-L6-v2",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "system_prompt": (
            "\nBased on the following context, provide multiple key points "
            "from the document.\n"
            "For each point, create a new paragraph.\n"
            "Each paragraph should be a complete, self-contained insight.\n"
            "\nContext: {context}\n"
            "\nKey points:\n"
        ),
        "model": default_model,
    }
    get_llm_summary_answer_by_cursor(default_settings, pdf_paths)
resumos/serializer.py CHANGED
@@ -1,25 +1,29 @@
1
  from rest_framework import serializers
 
 
2
 
3
- # Exemplo de retorno que devo enviar ao frontend:
4
- # {
5
- # "nome_do_memorial": "[b]Memorial de Defesa em Ação de Indenização por Danos Morais[/b]",
6
- # "argumentos": "[b]Argumentos:[/b]\n[i]• Responsabilidade Civil do Réu:[/i] [i]O réu agiu de forma negligente ao causar o dano. Há dever de indenizar baseado no artigo 186 do Código Civil Brasileiro.[/i]\n[i]• Dano Moral Comprovado:[/i] [i]O dano sofrido pela parte autora é evidente, gerando sofrimento e abalo psicológico. O nexo causal entre a ação do réu e o dano sofrido é claro.[/i]",
7
- # "jurisprudencia": "[b]Jurisprudência:[/b]\n[i]• STJ, REsp 123456/DF -[/i] [i]O Superior Tribunal de Justiça entendeu que a indenização por danos morais deve ser fixada de acordo com a gravidade do ato ilícito.[/i]\n[i]• STJ, REsp 654321/SP -[/i] [i]A jurisprudência confirma que o réu tem o dever de reparar integralmente o dano causado.[/i]",
8
- # "doutrina": "[b]Doutrina:[/b]\n[i]• Carlos Roberto Gonçalves, Responsabilidade Civil -[/i] [i]A responsabilidade civil é objetiva quando há risco para os direitos da personalidade da vítima.[/i]\n[i]• Maria Helena Diniz, Curso de Direito Civil -[/i] [i]O dano moral é configurado pela violação dos direitos da personalidade.[/i]",
9
- # "argumentos_faltantes": "[b]Argumentos Faltantes:[/b]\n[i]• Prova pericial de impacto psicológico.[/i]\n[i]• Estudo comparativo com casos análogos para quantificação do valor da indenização.[/i]",
10
- # "palavras_chave": [
11
- # "[i]Responsabilidade civil[/i]",
12
- # "[i]Dano moral[/i]",
13
- # "[i]Nexo causal[/i]",
14
- # "[i]Indenização[/i]"
15
- #   ]
16
- # }
17
-
18
- # pecam para a AI formatar em BBcode
19
 
20
  class ResumoPDFSerializer(serializers.Serializer):
21
  files = serializers.ListField(child=serializers.FileField(), required=True)
22
  system_prompt = serializers.CharField(required=False)
23
  user_message = serializers.CharField(required=False, default="")
24
  model = serializers.CharField(required=False)
25
- iterative_refinement = serializers.BooleanField(required=False, default=False)
 
 
 
 
 
 
 
 
 
 
1
  from rest_framework import serializers
2
+ from setup.environment import default_model
3
+ # from _utils.utils import DEFAULT_SYSTEM_PROMPT
4
 
5
+ prompt_template = """
6
+ Based on the following context, provide multiple key points from the document.
7
+ For each point, create a new paragraph.
8
+ Each paragraph should be a complete, self-contained insight.
9
+
10
+ Context: {context}
11
+
12
+ Key points:
13
+ """
 
 
 
 
 
 
 
14
 
15
class ResumoPDFSerializer(serializers.Serializer):
    """Request payload for the PDF summary endpoints.

    Only ``files`` is required; every other field is optional.
    """

    # Uploaded files to summarize.
    files = serializers.ListField(
        child=serializers.FileField(),
        required=True,
    )
    # Optional prompt; no default is declared here.
    system_prompt = serializers.CharField(required=False)
    user_message = serializers.CharField(required=False, default="")
    # Optional model identifier; no default is declared here.
    model = serializers.CharField(required=False)
    iterative_refinement = serializers.BooleanField(
        required=False,
        default=False,
    )
21
+
22
class ResumoCursorSerializer(serializers.Serializer):
    """Request payload for the cursor-style summary endpoint.

    Only ``files`` is required. The remaining fields fall back to defaults,
    so the validated data always contains every key.
    """

    files = serializers.ListField(child=serializers.FileField(), required=True)
    # Prompt template; it is later formatted with a single `context` variable.
    system_prompt = serializers.CharField(required=False, default=prompt_template)
    user_message = serializers.CharField(required=False, default="")
    model = serializers.CharField(required=False, default=default_model)
    hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
    # The chunking values come straight from the client, so they are bounded
    # here instead of failing later inside the text splitter.
    chunk_size = serializers.IntegerField(required=False, default=1000, min_value=1)
    chunk_overlap = serializers.IntegerField(required=False, default=200, min_value=0)

    def validate(self, attrs):
        """Reject a chunk overlap that is larger than the chunk size.

        Raises:
            serializers.ValidationError: If ``chunk_overlap`` exceeds
                ``chunk_size``.
        """
        chunk_size = attrs.get("chunk_size")
        chunk_overlap = attrs.get("chunk_overlap")
        if (
            chunk_size is not None
            and chunk_overlap is not None
            and chunk_overlap > chunk_size
        ):
            raise serializers.ValidationError(
                {"chunk_overlap": "chunk_overlap must not be larger than chunk_size."}
            )
        return attrs
resumos/views.py CHANGED
@@ -2,8 +2,9 @@ from rest_framework.views import APIView
2
  import tempfile, os
3
  from rest_framework.response import Response
4
 
 
5
  from _utils.utils import DEFAULT_SYSTEM_PROMPT
6
- from .serializer import ResumoPDFSerializer
7
  from _utils.main import get_llm_answer_summary, get_llm_answer_summary_with_embedding
8
  from setup.environment import default_model
9
  from rest_framework.parsers import MultiPartParser
@@ -68,6 +69,35 @@ class ResumoEmbeddingView(APIView):
68
  system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
69
  resposta_llm = get_llm_answer_summary_with_embedding(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  for file in listaPDFs:
72
  os.remove(file)
73
 
 
2
  import tempfile, os
3
  from rest_framework.response import Response
4
 
5
+ from _utils.resumo_completo_cursor import get_llm_summary_answer_by_cursor
6
  from _utils.utils import DEFAULT_SYSTEM_PROMPT
7
+ from .serializer import ResumoPDFSerializer, ResumoCursorSerializer
8
  from _utils.main import get_llm_answer_summary, get_llm_answer_summary_with_embedding
9
  from setup.environment import default_model
10
  from rest_framework.parsers import MultiPartParser
 
69
  system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
70
  resposta_llm = get_llm_answer_summary_with_embedding(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])
71
 
72
+ for file in listaPDFs:
73
+ os.remove(file)
74
+
75
+ return Response({"resposta": resposta_llm})
76
+
77
+ class ResumoCompletoCursorView(APIView):
78
+ parser_classes = [MultiPartParser]
79
+
80
+ @extend_schema(
81
+ request=ResumoCursorSerializer,
82
+ )
83
+ def post(self, request):
84
+ serializer = ResumoCursorSerializer(data=request.data)
85
+ if serializer.is_valid(raise_exception=True):
86
+ listaPDFs = []
87
+ data = serializer.validated_data
88
+ print('\nserializer.validated_data: ', serializer.validated_data)
89
+
90
+ for file in serializer.validated_data['files']:
91
+ file.seek(0)
92
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file: # Create a temporary file to save the uploaded PDF
93
+ for chunk in file.chunks(): # Write the uploaded file content to the temporary file
94
+ temp_file.write(chunk)
95
+ temp_file_path = temp_file.name # Get the path of the temporary file
96
+ listaPDFs.append(temp_file_path)
97
+ print('listaPDFs: ', listaPDFs)
98
+
99
+ resposta_llm = get_llm_summary_answer_by_cursor(data, listaPDFs)
100
+
101
  for file in listaPDFs:
102
  os.remove(file)
103
 
setup/urls.py CHANGED
@@ -5,7 +5,7 @@ from drf_spectacular.views import SpectacularSwaggerView, SpectacularAPIView
5
 
6
 
7
  from pdfs.views import getPDF
8
- from resumos.views import ResumoView
9
  from modelos_usuarios.views import ListCreateModeloUsuarioView, CreateUpdateDeleteModeloUsuarioView, ListModelosPorUsuarioView
10
 
11
  router = routers.DefaultRouter()
@@ -16,9 +16,9 @@ urlpatterns = [
16
  path('swagger/', SpectacularSwaggerView.as_view(url_name='schema'), name='swagger-ui'),
17
  path("admin/", admin.site.urls),
18
  path('', include(router.urls)),
19
-
20
  path('pdf', getPDF, name='upload-pdf'),
21
  path('resumo', ResumoView.as_view(), name='summary-pdf'),
 
22
  path("modelo", ListCreateModeloUsuarioView.as_view()),
23
  path("modelo/<int:pk>", CreateUpdateDeleteModeloUsuarioView.as_view()),
24
  path("usuario/<int:user_id>/modelos", ListModelosPorUsuarioView.as_view())
 
5
 
6
 
7
  from pdfs.views import getPDF
8
+ from resumos.views import ResumoView, ResumoCompletoCursorView
9
  from modelos_usuarios.views import ListCreateModeloUsuarioView, CreateUpdateDeleteModeloUsuarioView, ListModelosPorUsuarioView
10
 
11
  router = routers.DefaultRouter()
 
16
  path('swagger/', SpectacularSwaggerView.as_view(url_name='schema'), name='swagger-ui'),
17
  path("admin/", admin.site.urls),
18
  path('', include(router.urls)),
 
19
  path('pdf', getPDF, name='upload-pdf'),
20
  path('resumo', ResumoView.as_view(), name='summary-pdf'),
21
+ path('resumo/cursor', ResumoCompletoCursorView.as_view(), name='summary-cursor-pdf'),
22
  path("modelo", ListCreateModeloUsuarioView.as_view()),
23
  path("modelo/<int:pk>", CreateUpdateDeleteModeloUsuarioView.as_view()),
24
  path("usuario/<int:user_id>/modelos", ListModelosPorUsuarioView.as_view())