File size: 1,858 Bytes
4ec2e0f
 
 
 
 
 
 
 
 
 
 
 
 
2a5f9fb
4ec2e0f
8c49cb6
6b87e28
0a3530a
4ec2e0f
 
2a73469
4ec2e0f
d084b26
0c7ef71
4ec2e0f
6b87e28
4ec2e0f
6b87e28
4ec2e0f
0c7ef71
4ec2e0f
b7d036c
4ec2e0f
b7d036c
4ec2e0f
 
 
 
 
 
26286b2
4ec2e0f
 
551debe
4ec2e0f
0a3530a
4ec2e0f
 
6b87e28
4ec2e0f
6b87e28
4ec2e0f
614ee1f
4ec2e0f
 
 
1f60a20
4ec2e0f
a2790cb
4ec2e0f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import streamlit as st # data app development
import subprocess # process in the os
from subprocess import STDOUT, check_call #os process manipuation
import os #os process manipuation
import base64 # byte object into a pdf file 
import camelot as cam # extracting tables from PDFs 
import cv2
# Cached by Streamlit so the install is attempted once rather than on every
# script rerun.
# NOTE(review): `st.cache` is deprecated in recent Streamlit releases in favour
# of `st.cache_resource` -- confirm against the pinned Streamlit version.
@st.cache
def gh():
    """Install Ghostscript on the Linux machine via ``apt-get`` (best effort).

    The command's output is discarded and its exit status is not checked, so
    a failed install does not raise here.  Presumably Ghostscript is needed
    by Camelot for PDF processing -- verify against Camelot's install notes.

    Returns:
        None.
    """
    # subprocess.DEVNULL replaces a manually opened os.devnull handle that
    # was never closed; run() also waits for the command to finish.
    # The command is a fixed string (no user input), so shell=True is safe.
    subprocess.run(
        'apt-get install -y ghostscript',
        shell=True,
        executable="/bin/bash",
        stdin=None,
        stdout=subprocess.DEVNULL,
        stderr=STDOUT,
        check=False,
    )

# Attempt the Ghostscript install before building the page.
gh()

# Logo shown under the page heading.
CAMELOT_LOGO_URL = "https://raw.githubusercontent.com/camelot-dev/camelot/master/docs/_static/camelot.png"

# Page heading.
st.title("PDF Table Extractor")
st.subheader("with `Camelot` Python library")
st.image(CAMELOT_LOGO_URL, width=200)

# Upload widget, restricted to PDF files.
input_pdf = st.file_uploader(label="upload your pdf here", type="pdf")

# Page selection; the entered text is later passed to Camelot as `pages`.
st.markdown("### Page Number")
page_number = st.text_input(
    "Enter the page # from where you want to extract the PDF eg: 3",
    value=1,
)

# Run the extraction only once a PDF has been uploaded.

if input_pdf is not None:
    # Write the uploaded bytes to disk, because Camelot is given a file path
    # below.  The bytes are written as-is (a base64 encode/decode round trip
    # would be a no-op), and the `with` block closes the file on exit.
    # NOTE(review): the fixed name "input.pdf" is shared by every session of
    # the app -- consider a per-session temporary file.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.read())

    # Read the PDF and parse the requested page(s) using the "stream" flavor.
    table = cam.read_pdf("input.pdf", pages=page_number, flavor='stream')

    st.markdown("### Number of Tables")

    # Display the parsing result.
    st.write(table)

    # Show the table picker only when at least one table was found.
    if len(table) > 0:

        # Offer 1-based table numbers: exactly one choice per table.
        # (Starting the range at 0 would add an extra entry that maps to
        # index -1, i.e. a duplicate of the last table.)
        option = st.selectbox(
            label="Select the Table to be displayed",
            options=range(1, len(table) + 1),
        )

        st.markdown('### Output Table')

        # Convert the 1-based choice to a 0-based index and show its dataframe.
        st.dataframe(table[int(option) - 1].df)