nurindahpratiwi committed on
Commit
d32e9c7
1 Parent(s): 208fbeb

first commit

Browse files
Files changed (1) hide show
  1. app.py +57 -0
app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
5
+ from transformers import pipeline
6
+ import torch
7
+ import base64
8
+ import time
9
+ from PIL import Image
10
+
11
st.image("https://huggingface.co/spaces/wiwaaw/summary/resolve/main/banner.png")

# MODEL AND TOKENIZER
model_checkpoint = "MBZUAI/LaMini-Flan-T5-783M"


@st.cache_resource
def _load_model_and_tokenizer(checkpoint):
    """Load the T5 tokenizer and model for ``checkpoint``.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the loaded objects with ``st.cache_resource`` means
    the weights are read once and then reused, instead of being re-loaded on
    each rerun.

    Args:
        checkpoint: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    loaded_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    return tokenizer, loaded_model


# Keep the original module-level names; the rest of the script reads them.
model_tokenizer, model = _load_model_and_tokenizer(model_checkpoint)
17
+
18
+ #FILE LOADER AND PREPROCESSING
19
def preprocess_pdf(file):
    """Extract the text of a PDF as one string.

    Args:
        file: Path of the PDF; passed straight to ``PyPDFLoader``.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines
        in the order the loader returned them.
    """
    # Do not chunk here: the text is handed to the summarizer as a single
    # string, and re-concatenating overlapping chunks would repeat every
    # overlapped span and glue chunk boundaries together without a separator.
    documents = PyPDFLoader(file).load()
    return "\n".join(document.page_content for document in documents)
28
+
29
# LLM PIPELINE
@st.cache_data
def _summarize_text(input_text):
    """Summarize ``input_text`` with the module-level model and tokenizer.

    The cache key is the text itself, so a cached summary is only ever
    returned for identical input text.
    """
    summarization_pipeline = pipeline(
        'summarization',
        model=model,
        tokenizer=model_tokenizer,
        max_length=500,
        min_length=32,
    )
    summary_result = summarization_pipeline(input_text)
    return summary_result[0]['summary_text']


def language_model_pipeline(filepath):
    """Return a summary of the PDF stored at ``filepath``.

    The text is extracted on every call and only the summarization step is
    cached. Caching on ``filepath`` instead would hand back the previous
    document's summary whenever a different PDF is saved under the same name.

    Args:
        filepath: Path of the PDF to summarize.

    Returns:
        The ``summary_text`` of the first result produced by the pipeline.
    """
    input_text = preprocess_pdf(filepath)
    return _summarize_text(input_text)
43
+
44
import os
import tempfile

title = st.title("PDF Summarization using LaMini")
uploaded_file = st.file_uploader('Upload your PDF file', type=['pdf'])
if uploaded_file is not None:
    st.success("File Uploaded")
    if st.button("Summarize"):
        # Write the upload to a unique temporary file rather than to
        # ``uploaded_file.name`` in the working directory: the name is
        # supplied by the client, and two sessions uploading files with the
        # same name would overwrite each other. A fresh path per run also
        # means a path-keyed cache can never return another document's
        # summary.
        # ``getvalue()`` returns the full content regardless of the stream
        # position, unlike ``read()``.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
            temp_file.write(uploaded_file.getvalue())
            filepath = temp_file.name

        try:
            summarized_result = language_model_pipeline(filepath)
        finally:
            # The file is only needed while the text is being extracted.
            os.remove(filepath)

        st.info("Summarization Complete")
        st.success(summarized_result)