Spaces:
Running
Running
File size: 7,486 Bytes
e63103b 3f199c2 68d3cc8 0870c96 3251505 7dc6d22 e63103b 4d3bceb e63103b a37a365 3f199c2 e63103b 0870c96 b700f35 0870c96 e63103b b700f35 0870c96 e63103b 3f199c2 0870c96 3f199c2 0870c96 e63103b e79797a e63103b 1a93363 4d3bceb e63103b 4d3bceb 7dc6d22 4d3bceb 834da99 4d3bceb 4dcf767 3251505 7dc6d22 3251505 7dc6d22 3251505 7dc6d22 3251505 7dc6d22 3251505 7dc6d22 3251505 4dcf767 3251505 4dcf767 20e8064 4dcf767 20e8064 bf45279 20e8064 bf45279 7dc6d22 bf45279 20e8064 bf45279 20e8064 bf45279 7dc6d22 20e8064 bf45279 7dc6d22 bf45279 d6bac9a 834da99 bf45279 3251505 4dcf767 20e8064 834da99 20e8064 834da99 4dcf767 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
from langchain_community.document_loaders import PyPDFLoader
import os
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from setup.environment import default_model
from uuid import uuid4
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List
import numpy as np
import openai
import pandas as pd
# LangSmith tracing configuration: tracing switched on, pointed at the hosted
# endpoint, and reported under the "VELLA" project.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "VELLA",
    }
)

# NOTE(review): the results of these three lookups are discarded, so they
# neither set nor validate anything; they only name the credentials this
# module expects to find in the environment.
os.environ.get("LANGCHAIN_API_KEY")
os.environ.get("OPENAI_API_KEY")
os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Sentence-transformers embedding model instantiated once at import time.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Module-level registry of every chunk ID generated by getPDF().
allIds = []
def getPDF(file_paths):
    """Load PDF files, split them into chunks and tag each chunk with a UUID.

    Args:
        file_paths: Iterable of PDF paths, each handed to ``PyPDFLoader``.

    Returns:
        A list with the chunks of all files, in input order. Every chunk has
        a fresh ``uuid4`` string stored in its ``id`` attribute; the same
        string is appended to the module-level ``allIds`` list.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    pages = []
    for file_path in file_paths:
        loader = PyPDFLoader(file_path, extract_images=False)
        # extend() grows the list in place instead of copying the whole
        # accumulated list once per file.
        pages.extend(loader.load_and_split(text_splitter))

    # IDs are assigned in a single pass after all files are loaded, so every
    # chunk receives exactly one ID and allIds never holds a stale entry.
    for page in pages:
        chunk_id = str(uuid4())
        allIds.append(chunk_id)
        page.id = chunk_id

    return pages
def create_retriever(documents, vectorstore):
    """Index ``documents`` in ``vectorstore`` and return a retriever over it.

    Args:
        documents: Sequence of documents to add; the first two are echoed to
            stdout as a debugging aid.
        vectorstore: Object exposing ``add_documents`` and ``as_retriever``.

    Returns:
        Whatever ``vectorstore.as_retriever()`` returns when called with no
        arguments (no search type or search kwargs are passed).
    """
    preview = documents[:2]
    print('\n\n')
    print('documents: ', preview)

    vectorstore.add_documents(documents=documents)
    return vectorstore.as_retriever()
def create_prompt_llm_chain(system_prompt, modelParam):
    """Build a stuff-documents question-answering chain.

    Args:
        system_prompt: System message text; a ``{context}`` placeholder is
            appended to it by ``create_prompt_and_llm``.
        modelParam: Model identifier forwarded to ``create_llm``.

    Returns:
        The chain produced by ``create_stuff_documents_chain`` for the
        selected model and the system/human prompt pair.
    """
    # Prompt and model construction is shared with
    # create_prompt_llm_chain_summary through create_prompt_and_llm, so both
    # chains are guaranteed to use the same template layout.
    prompt_and_llm = create_prompt_and_llm(system_prompt, modelParam)
    return create_stuff_documents_chain(
        prompt_and_llm["model"], prompt_and_llm["prompt"]
    )
def create_llm(modelParam):
    """Instantiate the LLM client matching ``modelParam``.

    Args:
        modelParam: Model identifier. When it equals ``default_model`` an
            OpenAI chat model is created; any other value is treated as a
            Hugging Face repository id.

    Returns:
        A ``ChatOpenAI`` instance for the default model, otherwise a
        ``HuggingFaceEndpoint`` configured for text generation.
    """
    uses_default_model = modelParam == default_model
    if uses_default_model:
        return ChatOpenAI(model=modelParam, max_tokens=16384)

    # Any non-default model is served through the Hugging Face endpoint,
    # authenticated with the token read from the environment.
    hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    return HuggingFaceEndpoint(
        repo_id=modelParam,
        task="text-generation",
        max_new_tokens=1100,
        do_sample=False,
        huggingfacehub_api_token=hf_token,
    )
# Schema of the structured summary ("resumo") passed as ``pydantic_object`` to
# the JsonOutputParser in create_prompt_llm_chain_summary. The field names
# mirror the keys of the JSON example embedded in DEFAULT_SYSTEM_PROMPT.
class Resumo(BaseModel):
    # Name of the memorial (the prompt example also asks for the team number).
    nome_do_memorial: str = Field()
    # Arguments and their supporting grounds.
    argumentos: str = Field()
    # Case law cited in the document.
    jurisprudencia: str = Field()
    # Legal scholarship (authors and works) cited in the document.
    doutrina: str = Field()
    # Keywords.
    # NOTE(review): the prompt example spells this key "palavras-chave" and
    # shows a single string, while this field is ``palavras_chave`` typed as a
    # list of strings; confirm which form downstream consumers expect.
    palavras_chave: List[str] = Field()
def create_prompt_llm_chain_summary(system_prompt, model_param):
    """Build the summarization chain whose output is parsed as JSON.

    Args:
        system_prompt: System message text forwarded to
            ``create_prompt_and_llm``.
        model_param: Model identifier forwarded to ``create_prompt_and_llm``.

    Returns:
        A stuff-documents chain piped into a ``JsonOutputParser`` that is
        configured with the ``Resumo`` schema.
    """
    components = create_prompt_and_llm(system_prompt, model_param)
    llm = components["model"]
    prompt = components["prompt"]

    stuff_chain = create_stuff_documents_chain(llm, prompt)
    return stuff_chain | JsonOutputParser(pydantic_object=Resumo)
def process_embedding_summary(system_prompt, model_param, full_text):
    """Split ``full_text`` into chunks and embed every chunk.

    Args:
        system_prompt: Currently unused; kept so existing callers keep
            working.
        model_param: Currently unused; kept for the same reason.
        full_text: Text to split into chunks of 2000 characters with an
            overlap of 200.

    Returns:
        A ``pandas.DataFrame`` with one row per chunk and two columns:
        ``page_content`` (the chunk text) and ``embeddings`` (one NumPy row
        per chunk, built from the vectors returned by ``get_embeddings``).
        The frame is empty when the splitter yields no chunks.
    """
    # No prompt or LLM is built here: nothing in this function consumes them,
    # and constructing a model client per call was pure overhead.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    docs = text_splitter.create_documents([full_text])

    # Extract the chunk texts once; they feed both the DataFrame and the
    # embedding request.
    content_list = [doc.page_content for doc in docs]
    df = pd.DataFrame(content_list, columns=['page_content'])

    if not content_list:
        # Avoid sending an empty batch to the embeddings endpoint.
        df['embeddings'] = pd.Series(dtype=object)
        return df

    embeddings = get_embeddings(content_list)
    vectors = [item.embedding for item in embeddings]
    df['embeddings'] = pd.Series(list(np.array(vectors)))

    # Return the frame so callers can use it instead of it being discarded.
    return df
def get_embeddings(text):
    """Request OpenAI embeddings for ``text``.

    Args:
        text: Value forwarded unchanged as the ``input`` of the embeddings
            request (``process_embedding_summary`` passes a list of strings).

    Returns:
        The ``data`` attribute of the response from the
        ``text-embedding-3-small`` model.
    """
    request = {"model": "text-embedding-3-small", "input": text}
    return openai.embeddings.create(**request).data
def create_prompt_and_llm(system_prompt, model_param):
    """Create the LLM and the chat prompt used by the document chains.

    Args:
        system_prompt: System message text; ``"\\n\\n{context}"`` is appended
            so the template exposes a ``context`` variable.
        model_param: Model identifier forwarded to ``create_llm``.

    Returns:
        A dict with the keys ``"model"`` (the LLM) and ``"prompt"`` (a
        ``ChatPromptTemplate`` holding the system message followed by a human
        message containing ``{input}``).
    """
    llm = create_llm(model_param)

    system_message = ("system", system_prompt + "\n\n" + "{context}")
    human_message = ("human", "{input}")
    chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

    return {"model": llm, "prompt": chat_prompt}
# Default system prompt for summarizing legal memorials into a JSON object
# whose string values are formatted with BBCode.
#
# The braces of the JSON example are doubled ("{{" / "}}"). Presumably this
# constant is passed as ``system_prompt`` to create_prompt_and_llm, where it
# becomes part of a ChatPromptTemplate and single braces would be read as
# template variables -- confirm against the caller.
#
# NOTE(review): the example uses the key "palavras-chave" with a string value,
# whereas the Resumo schema declares ``palavras_chave`` as a list of strings.
# NOTE(review): the prompt asks for "around 10000" characters and later for
# "at least 10000" characters; the two instructions are not identical.
DEFAULT_SYSTEM_PROMPT = """
You are a highly knowledgeable legal assistant specializing in case summarization. Your task is to provide comprehensive and accurate summaries of legal cases while maintaining a professional and objective demeanor. Always approach each case with careful consideration and analytical rigor.
First, you will be given a document to analyze:
Next, you will summarize a content provided.
Before providing your summary, follow these steps:
1. Argumentation Mining: Conduct a cross-Document Argument Analysis to identify the main arguments, claims, and supporting evidence within the document. Focus on extracting the most relevant information related to the summary request.
2. Socratic Questioning: Reflect on your initial findings using the Socratic method. Ask yourself probing questions to challenge your assumptions and deepen your understanding of the document's content. For example:
- What are the key points I've identified?
- Are there any counterarguments or alternative perspectives I've overlooked?
- How does this information relate to the specific summary request?
- What additional context might be necessary to fully understand these points?
3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information while avoiding redundancy. Prioritize information that is both relevant to the summary request and adds new insights not already covered.
After completing these steps, generate the response with around 10000 characteres in BBcode format, as shown below:
Example: :
{{
"nome_do_memorial": "[Insira aqui o nome do memorial e número da equipe] ",
"argumentos": "
[b]Argumento 1:[/b]
Fundamento 1.1: [Descreva o fundamento de forma extensa e completa]
Fundamento 1.2: [Descreva o fundamento de forma extensa e completa]
[b]Argumento 2:[/b]
Fundamento 2.1: [Descreva o fundamento de forma extensa e completa]
Fundamento 2.2: [Descreva o fundamento de forma extensa e completa]",
"jurisprudencia": "
[b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição extensa de como a jurisprudência se aplica]
[b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição extensa de como a jurisprudência se aplica]",
"doutrina": "
[b]Autor 1:[/b] [Nome do autor]
"[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]
[b]Autor 2:[/b] [Nome do autor]
"[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]",
"palavras-chave": "
[Palavra-chave 1]
[Palavra-chave 2]
[Palavra-chave 3]
[Adicione outras palavras relevantes]"
}}
Remember:
- Always prioritize relevance to the summary request.
- Ensure your summary is well-structured and easy to understand.
- Do not include any personal opinions or information not present in the original document.
- If the summary request asks for a specific focus or perspective, make sure to address it directly.
Your goal is to provide a comprehensive yet concise summary that accurately represents the document's content while meeting the specific needs outlined in the summary request.
Do not pass in the response part of the instructions that you received
Generate the response with at least 10000 characteres
The content to be summarized is as follows:
"""