luanpoppe committed on
Commit
3f199c2
·
1 Parent(s): 0870c96

feat: adicionando embeddings personalizados

Browse files
compose.yaml CHANGED
@@ -7,6 +7,7 @@ services:
7
  - SECRET_KEY=${SECRET_KEY}
8
  - DATABASE_PASSWORD=${DATABASE_PASSWORD}
9
  - OPENAI_API_KEY=${OPENAI_API_KEY}
 
10
  env_file:
11
  - .env
12
  develop:
 
7
  - SECRET_KEY=${SECRET_KEY}
8
  - DATABASE_PASSWORD=${DATABASE_PASSWORD}
9
  - OPENAI_API_KEY=${OPENAI_API_KEY}
10
+ - HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
11
  env_file:
12
  - .env
13
  develop:
endpoint_teste/serializer.py CHANGED
@@ -17,4 +17,5 @@ class PDFUploadSerializer(serializers.Serializer):
17
  file = serializers.FileField()
18
  system_prompt = serializers.CharField(required=True)
19
  user_message = serializers.CharField(required=True)
20
- model = serializers.CharField(required=False)
 
 
17
  file = serializers.FileField()
18
  system_prompt = serializers.CharField(required=True)
19
  user_message = serializers.CharField(required=True)
20
+ model = serializers.CharField(required=False)
21
+ embedding = serializers.CharField(required=False)
endpoint_teste/views.py CHANGED
@@ -68,6 +68,9 @@ def getPDF(request):
68
  print('data: ', data)
69
  pdf_file = serializer.validated_data['file']
70
  pdf_file.seek(0)
 
 
 
71
  # print(dir(pdf_file))
72
  # print('pdf_file: ', pdf_file.read())
73
  # pdf_content = pdf_file.read()
@@ -87,10 +90,7 @@ def getPDF(request):
87
  print('temp_file_path: ', temp_file_path)
88
 
89
  resposta_llm = None
90
- try:
91
- resposta_llm = get_llm_answer(data["system_prompt"], data["user_message"], temp_file_path, model=serializer.validated_data['model'])
92
- except:
93
- resposta_llm = get_llm_answer(data["system_prompt"], data["user_message"], temp_file_path, model=default_model)
94
 
95
  os.remove(temp_file_path)
96
 
 
68
  print('data: ', data)
69
  pdf_file = serializer.validated_data['file']
70
  pdf_file.seek(0)
71
+
72
+ embedding = serializer.validated_data.get("embedding", "gpt")
73
+ model = serializer.validated_data.get("model", default_model)
74
  # print(dir(pdf_file))
75
  # print('pdf_file: ', pdf_file.read())
76
  # pdf_content = pdf_file.read()
 
90
  print('temp_file_path: ', temp_file_path)
91
 
92
  resposta_llm = None
93
+ resposta_llm = get_llm_answer(data["system_prompt"], data["user_message"], temp_file_path, model=model, embedding=embedding)
 
 
 
94
 
95
  os.remove(temp_file_path)
96
 
langchain_backend/main.py CHANGED
@@ -1,27 +1,38 @@
1
  import os
2
- from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF, vectorstore
3
  from langchain_backend import utils
4
  from langchain.chains import create_retrieval_chain
 
 
 
5
 
6
  os.environ.get("OPENAI_API_KEY")
7
 
8
- def get_llm_answer(system_prompt, user_prompt, pdf_url, model):
 
 
 
 
 
 
 
 
 
 
 
9
  print('model: ', model)
 
10
  pages = []
11
  if pdf_url:
12
  pages = getPDF(pdf_url)
13
  else:
14
  pages = getPDF()
15
- retriever = create_retriever(pages)
16
- # rag_chain = None
17
  rag_chain = create_retrieval_chain(retriever, create_prompt_llm_chain(system_prompt, model))
18
- # if model:
19
- # rag_chain = create_retrieval_chain(retriever, create_prompt_llm_chain(system_prompt, model))
20
- # else:
21
- # rag_chain = create_retrieval_chain(retriever, create_prompt_llm_chain(system_prompt))
22
  results = rag_chain.invoke({"input": user_prompt})
23
  print('allIds ARQUIVO MAIN: ', utils.allIds)
24
  vectorstore.delete( utils.allIds)
 
25
  utils.allIds = []
26
  print('utils.allIds: ', utils.allIds)
27
  return results
 
1
  import os
2
+ from langchain_backend.utils import create_prompt_llm_chain, create_retriever, getPDF
3
  from langchain_backend import utils
4
  from langchain.chains import create_retrieval_chain
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from langchain_chroma import Chroma
7
+ from langchain_openai import OpenAIEmbeddings
8
 
9
  os.environ.get("OPENAI_API_KEY")
10
 
11
+ def get_llm_answer(system_prompt, user_prompt, pdf_url, model, embedding):
12
+ if embedding == "gpt":
13
+ embedding_object = OpenAIEmbeddings()
14
+ else:
15
+ embedding_object = HuggingFaceEmbeddings(model_name=embedding)
16
+
17
+ vectorstore = Chroma(
18
+ collection_name="documents",
19
+ embedding_function=embedding_object
20
+ )
21
+
22
+
23
  print('model: ', model)
24
+ print('embedding: ', embedding)
25
  pages = []
26
  if pdf_url:
27
  pages = getPDF(pdf_url)
28
  else:
29
  pages = getPDF()
30
+ retriever = create_retriever(pages, vectorstore)
 
31
  rag_chain = create_retrieval_chain(retriever, create_prompt_llm_chain(system_prompt, model))
 
 
 
 
32
  results = rag_chain.invoke({"input": user_prompt})
33
  print('allIds ARQUIVO MAIN: ', utils.allIds)
34
  vectorstore.delete( utils.allIds)
35
+ vectorstore.delete_collection()
36
  utils.allIds = []
37
  print('utils.allIds: ', utils.allIds)
38
  return results
langchain_backend/utils.py CHANGED
@@ -2,21 +2,18 @@ from langchain_community.document_loaders import PyPDFLoader
2
  import os
3
  from langchain_openai import ChatOpenAI
4
  from langchain_chroma import Chroma
5
- from langchain_openai import OpenAIEmbeddings
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain.chains.combine_documents import create_stuff_documents_chain
8
  from langchain_core.prompts import ChatPromptTemplate
9
- from langchain_huggingface import HuggingFaceEndpoint
10
  from setup.environment import default_model
11
  from uuid import uuid4
12
 
 
13
  os.environ.get("OPENAI_API_KEY")
14
  os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 
15
 
16
- vectorstore = Chroma(
17
- collection_name="documents",
18
- embedding_function=OpenAIEmbeddings()
19
- )
20
  allIds = []
21
 
22
  def getPDF(file_path):
@@ -25,22 +22,16 @@ def getPDF(file_path):
25
  loader = PyPDFLoader(file_path, extract_images=False)
26
  pages = loader.load_and_split(text_splitter)
27
  for page in pages:
28
- print('\n\n\n')
29
  print('allIds: ', allIds)
30
  documentId = str(uuid4())
31
  allIds.append(documentId)
32
  page.id = documentId
33
  return pages
34
 
35
- def create_retriever(documents):
36
  print('\n\n')
37
- print('documents: ', documents)
38
-
39
- # vectorstore = Chroma.from_documents(
40
- # documents,
41
- # embedding=OpenAIEmbeddings(),
42
- # )
43
- # vectorstore.delete_collection()
44
 
45
  vectorstore.add_documents(documents=documents)
46
 
@@ -58,12 +49,10 @@ def create_prompt_llm_chain(system_prompt, modelParam):
58
  model = HuggingFaceEndpoint(
59
  repo_id=modelParam,
60
  task="text-generation",
61
- max_new_tokens=100,
62
  do_sample=False,
63
  huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
64
  )
65
- # result = model.invoke("Hugging Face is")
66
- # print('result: ', result)
67
 
68
  system_prompt = system_prompt + "\n\n" + "{context}"
69
  prompt = ChatPromptTemplate.from_messages(
 
2
  import os
3
  from langchain_openai import ChatOpenAI
4
  from langchain_chroma import Chroma
 
5
  from langchain_text_splitters import RecursiveCharacterTextSplitter
6
  from langchain.chains.combine_documents import create_stuff_documents_chain
7
  from langchain_core.prompts import ChatPromptTemplate
8
+ from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
9
  from setup.environment import default_model
10
  from uuid import uuid4
11
 
12
+
13
  os.environ.get("OPENAI_API_KEY")
14
  os.environ.get("HUGGINGFACEHUB_API_TOKEN")
15
+ embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
16
 
 
 
 
 
17
  allIds = []
18
 
19
  def getPDF(file_path):
 
22
  loader = PyPDFLoader(file_path, extract_images=False)
23
  pages = loader.load_and_split(text_splitter)
24
  for page in pages:
25
+ print('\n')
26
  print('allIds: ', allIds)
27
  documentId = str(uuid4())
28
  allIds.append(documentId)
29
  page.id = documentId
30
  return pages
31
 
32
+ def create_retriever(documents, vectorstore):
33
  print('\n\n')
34
+ print('documents: ', documents[:2])
 
 
 
 
 
 
35
 
36
  vectorstore.add_documents(documents=documents)
37
 
 
49
  model = HuggingFaceEndpoint(
50
  repo_id=modelParam,
51
  task="text-generation",
52
+ # max_new_tokens=100,
53
  do_sample=False,
54
  huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
55
  )
 
 
56
 
57
  system_prompt = system_prompt + "\n\n" + "{context}"
58
  prompt = ChatPromptTemplate.from_messages(