MikeCraBash committed on
Commit
ecbe7ae
1 Parent(s): 7184bb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -9
app.py CHANGED
@@ -1,4 +1,3 @@
1
- #
2
  # HACK AI MAKERSPACE PREPR
3
  # Date: 2024-5-16
4
 
@@ -25,11 +24,12 @@ direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
25
  # Now load the document using the direct URL
26
  docs = PyMuPDFLoader(direct_url).load()
27
 
28
- import tiktoken
29
- def tiktoken_len(text):
30
- tokens = tiktoken.encoding_for_model("solar-10.7b").encode(
31
- text,
32
- )
 
33
  return len(tokens)
34
 
35
  # Split the document into chunks
@@ -38,7 +38,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
38
  text_splitter = RecursiveCharacterTextSplitter(
39
  chunk_size = 500, # 500 tokens per chunk, experiment with this value
40
  chunk_overlap = 50, # 50 tokens overlap between chunks, experiment with this value
41
- length_function = tiktoken_len,
42
  )
43
 
44
  split_chunks = text_splitter.split_documents(docs)
@@ -146,5 +146,3 @@ async def main(message: cl.Message):
146
 
147
  msg = cl.Message(content=chainlit_answer)
148
  await msg.send()
149
-
150
-
 
 
1
  # HACK AI MAKERSPACE PREPR
2
  # Date: 2024-5-16
3
 
 
24
  # Now load the document using the direct URL
25
  docs = PyMuPDFLoader(direct_url).load()
26
 
27
+ from transformers import AutoTokenizer
28
+
29
+ # Function to calculate token length using Hugging Face tokenizer
30
+ def hf_token_len(text):
31
+ tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-v1.0")
32
+ tokens = tokenizer.encode(text)
33
  return len(tokens)
34
 
35
  # Split the document into chunks
 
38
  text_splitter = RecursiveCharacterTextSplitter(
39
  chunk_size = 500, # 500 tokens per chunk, experiment with this value
40
  chunk_overlap = 50, # 50 tokens overlap between chunks, experiment with this value
41
+ length_function = hf_token_len,
42
  )
43
 
44
  split_chunks = text_splitter.split_documents(docs)
 
146
 
147
  msg = cl.Message(content=chainlit_answer)
148
  await msg.send()