Spaces:
Runtime error
Runtime error
MikeCraBash
committed on
Commit
•
ecbe7ae
1
Parent(s):
7184bb7
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
#
|
2 |
# HACK AI MAKERSPACE PREPR
|
3 |
# Date: 2024-5-16
|
4 |
|
@@ -25,11 +24,12 @@ direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
|
|
25 |
# Now load the document using the direct URL
|
26 |
docs = PyMuPDFLoader(direct_url).load()
|
27 |
|
28 |
-
import
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
)
|
|
|
33 |
return len(tokens)
|
34 |
|
35 |
# Split the document into chunks
|
@@ -38,7 +38,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
38 |
text_splitter = RecursiveCharacterTextSplitter(
|
39 |
chunk_size = 500, # 500 tokens per chunk, experiment with this value
|
40 |
chunk_overlap = 50, # 50 tokens overlap between chunks, experiment with this value
|
41 |
-
length_function =
|
42 |
)
|
43 |
|
44 |
split_chunks = text_splitter.split_documents(docs)
|
@@ -146,5 +146,3 @@ async def main(message: cl.Message):
|
|
146 |
|
147 |
msg = cl.Message(content=chainlit_answer)
|
148 |
await msg.send()
|
149 |
-
|
150 |
-
|
|
|
|
|
1 |
# HACK AI MAKERSPACE PREPR
|
2 |
# Date: 2024-5-16
|
3 |
|
|
|
24 |
# Now load the document using the direct URL
|
25 |
docs = PyMuPDFLoader(direct_url).load()
|
26 |
|
27 |
+
from transformers import AutoTokenizer
|
28 |
+
|
29 |
+
# Function to calculate token length using Hugging Face tokenizer
|
30 |
+
def hf_token_len(text):
|
31 |
+
tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-v1.0")
|
32 |
+
tokens = tokenizer.encode(text)
|
33 |
return len(tokens)
|
34 |
|
35 |
# Split the document into chunks
|
|
|
38 |
text_splitter = RecursiveCharacterTextSplitter(
|
39 |
chunk_size = 500, # 500 tokens per chunk, experiment with this value
|
40 |
chunk_overlap = 50, # 50 tokens overlap between chunks, experiment with this value
|
41 |
+
length_function = hf_token_len,
|
42 |
)
|
43 |
|
44 |
split_chunks = text_splitter.split_documents(docs)
|
|
|
146 |
|
147 |
msg = cl.Message(content=chainlit_answer)
|
148 |
await msg.send()
|
|
|
|