Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 1,858 Bytes
4ec2e0f 2a5f9fb 4ec2e0f 8c49cb6 6b87e28 0a3530a 4ec2e0f 2a73469 4ec2e0f d084b26 0c7ef71 4ec2e0f 6b87e28 4ec2e0f 6b87e28 4ec2e0f 0c7ef71 4ec2e0f b7d036c 4ec2e0f b7d036c 4ec2e0f 26286b2 4ec2e0f 551debe 4ec2e0f 0a3530a 4ec2e0f 6b87e28 4ec2e0f 6b87e28 4ec2e0f 614ee1f 4ec2e0f 1f60a20 4ec2e0f a2790cb 4ec2e0f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import streamlit as st # data app development
import subprocess # process in the os
from subprocess import STDOUT, check_call #os process manipuation
import os #os process manipuation
import base64 # byte object into a pdf file
import camelot as cam # extracting tables from PDFs
import cv2
# Cached by Streamlit so the install is attempted once rather than on every
# script rerun.
@st.cache
def gh():
    """Install Ghostscript on the Linux host with ``apt-get``.

    The install is best-effort: all output from ``apt-get`` is discarded and
    its exit status is ignored, so a failed install does not stop the app.
    (Ghostscript is presumably required by Camelot's PDF handling -- confirm
    against the Camelot installation docs.)

    Returns:
        None.
    """
    # The command is a fixed string (no user input), so running it through
    # the shell carries no injection risk. subprocess.DEVNULL replaces the
    # previous open(os.devnull, "wb"), whose file handle was never closed.
    subprocess.run(
        'apt-get install -y ghostscript',
        shell=True,
        stdin=None,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        executable="/bin/bash",
        check=False,
    )
# Make sure Ghostscript is available before any PDF is parsed.
gh()

# --- Page header ------------------------------------------------------------
st.title("PDF Table Extractor")
st.subheader("with `Camelot` Python library")
st.image("https://raw.githubusercontent.com/camelot-dev/camelot/master/docs/_static/camelot.png", width=200)

# --- Inputs -----------------------------------------------------------------
# file uploader on streamlit
input_pdf = st.file_uploader(label="upload your pdf here", type='pdf')

st.markdown("### Page Number")
# text_input works with strings, so the default is the string "1".
page_number = st.text_input("Enter the page # from where you want to extract the PDF eg: 3", value="1")

# --- Extraction -------------------------------------------------------------
# run this only when a PDF is uploaded
if input_pdf is not None:
    # Persist the uploaded bytes to disk so Camelot can be given a file path.
    # The bytes are written as-is; no base64 round trip is needed.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.read())

    # Fall back to the first page if the user cleared the field.
    pages = page_number.strip() or "1"

    # read the pdf and parse it using stream
    table = cam.read_pdf("input.pdf", pages=pages, flavor='stream')

    st.markdown("### Number of Tables")
    # display the output after parsing
    st.write(table)

    # display the table
    if len(table) > 0:
        # Offer 1-based table numbers (1..n). The previous range started at 0,
        # and because the lookup below subtracts 1, choosing 0 silently showed
        # the last table (index -1).
        option = st.selectbox(
            label="Select the Table to be displayed",
            options=range(1, len(table) + 1),
        )
        st.markdown('### Output Table')
        # Convert the 1-based selection to a 0-based index and show its dataframe.
        st.dataframe(table[int(option) - 1].df)
|