luanpoppe committed · Commit 7dc6d22 · Parent(s): 834da99

feat: trying to improve system_prompt
.gitignore CHANGED
@@ -139,6 +139,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+.venv*
 
 # Spyder project settings
 .spyderproject
langchain_backend/main.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary
+from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary, process_embedding_summary
 from langchain_backend import utils
 from langchain.chains import create_retrieval_chain
 from langchain_huggingface import HuggingFaceEmbeddings
@@ -54,4 +54,20 @@ def get_llm_answer_summary(system_prompt, user_prompt, pdf_url, model, isIterati
     print('result: ', result)
     return result
 # Note --> To pass custom information --> chain = load_summarize_chain(llm, "refine", True, question_prompt=initial_prompt, refine_prompt=PromptTemplate.from_template(refine_prompt))
-# For more options --> open the source of load_summarize_chain and, from there, the source of _load_refine_chain --> the options are the parameters that this last function accepts
+# For more options --> open the source of load_summarize_chain and, from there, the source of _load_refine_chain --> the options are the parameters that this last function accepts
+
+def get_llm_answer_summary_with_embedding(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
+    print('model: ', model)
+    print('isIterativeRefinement: ', isIterativeRefinement)
+    print('\n\n\n')
+    pages = getPDF(pdf_url)
+    full_texto = ""
+    for p in pages:
+        full_texto += p.page_content
+    print('full_texto: ', full_texto)
+
+    # Pass the full text that process_embedding_summary expects as its third argument.
+    rag_chain = process_embedding_summary(system_prompt, model, full_texto)
+
+    results = rag_chain.invoke({"input": user_prompt, "context": pages})
+
+    return results
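The comment above points at load_summarize_chain's custom-prompt parameters. A minimal, self-contained sketch of that pattern (the prompt texts and the "example.pdf" path are hypothetical, not part of this commit):

from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_backend.utils import create_llm, getPDF
from setup.environment import default_model

llm = create_llm(default_model)

# The refine chain fills {text} with each chunk and {existing_answer}
# with the summary built so far.
initial_prompt = PromptTemplate.from_template(
    "Write a concise summary of the following:\n\n{text}"
)
refine_prompt = (
    "Refine the existing summary with the new context.\n"
    "Existing summary: {existing_answer}\n"
    "New context: {text}"
)

chain = load_summarize_chain(
    llm,
    chain_type="refine",
    verbose=True,
    question_prompt=initial_prompt,
    refine_prompt=PromptTemplate.from_template(refine_prompt),
)
pages = getPDF(["example.pdf"])  # hypothetical input path
result = chain.invoke({"input_documents": pages})
print(result["output_text"])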
langchain_backend/utils.py CHANGED
@@ -11,6 +11,9 @@ from uuid import uuid4
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.pydantic_v1 import BaseModel, Field
 from typing import List
+import numpy as np
+import openai
+import pandas as pd
 
 os.environ["LANGCHAIN_TRACING_V2"]="true"
 os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
@@ -31,12 +34,7 @@ def getPDF(file_paths):
         pagesDoc = loader.load_and_split(text_splitter)
         pages = pages + pagesDoc
 
-
-    # loader = PyPDFLoader(file_paths, extract_images=False)
-    # pages = loader.load_and_split(text_splitter)
     for page in pages:
-        # print('\n')
-        # print('allIds: ', allIds)
         documentId = str(uuid4())
         allIds.append(documentId)
         page.id = documentId
@@ -70,7 +68,7 @@ def create_prompt_llm_chain(system_prompt, modelParam):
 
 def create_llm(modelParam):
     if modelParam == default_model:
-        return ChatOpenAI(model=modelParam)
+        return ChatOpenAI(model=modelParam, max_tokens=16384)
     else:
         return HuggingFaceEndpoint(
            repo_id=modelParam,
@@ -88,11 +86,38 @@ class Resumo(BaseModel):
     doutrina: str = Field()
     palavras_chave: List[str] = Field()
 
-def create_prompt_llm_chain_summary(system_prompt, modelParam):
-    model = create_llm(modelParam)
+def create_prompt_llm_chain_summary(system_prompt, model_param):
+    prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
+
+    question_answer_chain = create_stuff_documents_chain(prompt_and_llm["model"], prompt_and_llm["prompt"])
+    final_chain = question_answer_chain | JsonOutputParser(pydantic_object=Resumo)
+    return final_chain
+
+def process_embedding_summary(system_prompt, model_param, full_text):
+    prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
 
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
+    docs = text_splitter.create_documents([full_text])
+    embeddings = get_embeddings([doc.page_content for doc in docs])
 
+    content_list = [doc.page_content for doc in docs]
+    df = pd.DataFrame(content_list, columns=['page_content'])
+    vectors = [embedding.embedding for embedding in embeddings]
+    array = np.array(vectors)
+    embeddings_series = pd.Series(list(array))
+    df['embeddings'] = embeddings_series
+    # The embeddings DataFrame is not consumed yet; return a chain so the caller's invoke() works.
+    return create_stuff_documents_chain(prompt_and_llm["model"], prompt_and_llm["prompt"])
+
+def get_embeddings(text):
+    response = openai.embeddings.create(
+        model="text-embedding-3-small",
+        input=text
+    )
+    return response.data
 
+def create_prompt_and_llm(system_prompt, model_param):
+    model = create_llm(model_param)
+
     system_prompt = system_prompt + "\n\n" + "{context}"
     prompt = ChatPromptTemplate.from_messages(
         [
@@ -100,10 +125,7 @@ def create_prompt_llm_chain_summary(system_prompt, modelParam):
             ("human", "{input}"),
         ]
     )
-    question_answer_chain = create_stuff_documents_chain(model, prompt)
-    final_chain = question_answer_chain | JsonOutputParser(pydantic_object=Resumo)
-    return final_chain
-
+    return {"model": model, "prompt": prompt}
 
 DEFAULT_SYSTEM_PROMPT = """
 
@@ -123,34 +145,34 @@ Before providing your summary, follow these steps:
 - How does this information relate to the specific summary request?
 - What additional context might be necessary to fully understand these points?
 
-3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information while avoiding redundancy. Prioritize information that is both relevant to the summary request and adds new insights not already covered.
+3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information. Prioritize information that is both relevant to the summary request and adds new insights not already covered.
 
 After completing these steps, generate the response with around 5000 characters in BBCode format, as shown below:
 
 Example:
 
 {{
-"nome_do_memorial": "[b]Nome do Memorial:[/b] [Insira aqui o nome do memorial e número da equipe] ",
+"nome_do_memorial": "[Insira aqui o número do memorial e da equipe] ",
 
-"argumentos": "[size=4][b]1. Argumentos[/b][/size]
+"argumentos": "
 [b]Argumento 1:[/b]
-Fundamento 1.1: [Descreva o fundamento]
-Fundamento 1.2: [Descreva o fundamento]
+Fundamento 1.1: [Descreva o fundamento de forma detalhada]
+Fundamento 1.2: [Descreva o fundamento de forma detalhada]
 [b]Argumento 2:[/b]
-Fundamento 2.1: [Descreva o fundamento]
-Fundamento 2.2: [Descreva o fundamento]",
+Fundamento 2.1: [Descreva o fundamento de forma detalhada]
+Fundamento 2.2: [Descreva o fundamento de forma detalhada]",
 
-"jurisprudencia": "[size=4][b]2. Jurisprudência Aplicada[/b][/size]
-[b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição resumida de como a jurisprudência se aplica]
-[b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição resumida de como a jurisprudência se aplica][size=4]",
+"jurisprudencia": "
+[b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição de como a jurisprudência se aplica]
+[b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição de como a jurisprudência se aplica]",
 
-"doutrina": "[b]3. Doutrina Relevante[/b][/size]
+"doutrina": "
 [b]Autor 1:[/b] [Nome do autor]
 "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]
 [b]Autor 2:[/b] [Nome do autor]
 "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]",
 
-"palavras_chave": "[size=4][b]4. Palavras-chave Principais[/b][/size]
+"palavras_chave": "
 [Palavra-chave 1]
 [Palavra-chave 2]
 [Palavra-chave 3]
@@ -160,13 +182,13 @@ Example:
 
 Remember:
 - Always prioritize relevance to the summary request.
-- Be concise and avoid unnecessary verbosity.
 - Ensure your summary is well-structured and easy to understand.
 - Do not include any personal opinions or information not present in the original document.
 - If the summary request asks for a specific focus or perspective, make sure to address it directly.
 
-Your goal is to provide a comprehensive yet concise summary that accurately represents the document's content while meeting the specific needs outlined in the summary request.
+Your goal is to provide a comprehensive summary that accurately represents the document's content while meeting the specific needs outlined in the summary request.
 
 Do not include in your response any part of the instructions you received.
+Remember to generate the response with at least 5000 characters.
 The content to be summarized is as follows:
 """
resumos/views.py CHANGED
@@ -4,7 +4,7 @@ from rest_framework.response import Response
 
 from langchain_backend.utils import DEFAULT_SYSTEM_PROMPT
 from .serializer import ResumoPDFSerializer
-from langchain_backend.main import get_llm_answer_summary
+from langchain_backend.main import get_llm_answer_summary, get_llm_answer_summary_with_embedding
 from setup.environment import default_model
 from rest_framework.parsers import MultiPartParser
 from drf_spectacular.utils import extend_schema
@@ -37,6 +37,37 @@ class ResumoView(APIView):
             system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
             resposta_llm = get_llm_answer_summary(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])
 
+            for file in listaPDFs:
+                os.remove(file)
+
+            return Response({"resposta": resposta_llm})
+
+class ResumoEmbeddingView(APIView):
+    parser_classes = [MultiPartParser]
+
+    @extend_schema(
+        request=ResumoPDFSerializer,
+    )
+    def post(self, request):
+        serializer = ResumoPDFSerializer(data=request.data)
+        if serializer.is_valid(raise_exception=True):
+            listaPDFs = []
+            data = serializer.validated_data
+            model = serializer.validated_data.get("model", default_model)
+            print('serializer.validated_data: ', serializer.validated_data)
+
+            for file in serializer.validated_data['files']:
+                file.seek(0)
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:  # Create a temporary file to save the uploaded PDF
+                    for chunk in file.chunks():  # Write the uploaded file content to the temporary file
+                        temp_file.write(chunk)
+                    temp_file_path = temp_file.name  # Get the path of the temporary file
+                listaPDFs.append(temp_file_path)
+            print('listaPDFs: ', listaPDFs)
+
+            system_prompt = data.get("system_prompt", DEFAULT_SYSTEM_PROMPT)
+            resposta_llm = get_llm_answer_summary_with_embedding(system_prompt, data["user_message"], listaPDFs, model=model, isIterativeRefinement=data["iterative_refinement"])
+
             for file in listaPDFs:
                 os.remove(file)
 
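Once the new view is wired into urls.py (the route is not part of this commit), it could be exercised roughly as follows; the /resumo-embedding/ path and host are assumptions, while the field names follow the serializer usage above:

import requests

URL = "http://localhost:8000/resumo-embedding/"  # assumed route

with open("example.pdf", "rb") as pdf:  # hypothetical input file
    response = requests.post(
        URL,
        data={
            "user_message": "Summarize the main arguments of this document.",
            "iterative_refinement": "false",
        },
        files={"files": pdf},
    )
print(response.json()["resposta"])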
setup/environment.py CHANGED
@@ -1 +1,2 @@
-default_model = "gpt-4o-mini"
+default_model = "gpt-4o-mini"
+# default_model = "gpt-4o"