# Spaces:
# Build error
# Build error
import base64
import itertools
import math
import os
import re

import fitz
import pandas as pd
import pdfplumber
import streamlit as st
from docx2pdf import convert
from PyPDF2 import PdfFileReader, PdfFileWriter
# Page title rendered at the top of the Streamlit app.
st.header('PDF文件处理工具测试')
def fx(x):
    """Flatten one level of nesting into a single new list.

    Args:
        x: An iterable whose items are themselves iterables (for example a
            list of lists).

    Returns:
        A new list containing the items of every inner iterable, in order.
        Only the outermost level is flattened.
    """
    # chain.from_iterable walks each inner iterable once; sum(x, []) copies
    # the growing accumulator on every addition.
    return list(itertools.chain.from_iterable(x))
# Shown when a page spec cannot be parsed or names a page outside the document.
_PAGE_SPEC_ERROR = '页码无效:请按“1-5”或“1,3,5”的格式输入,且页码不能超出文档总页数。'

# File-name suffixes accepted by the "jpg/png->pdf" mode.
_IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')


def _parse_page_spec(spec, page_count):
    """Convert a 1-based page spec such as "1-5" or "1,3,5" to 0-based indices.

    Items are separated by commas and/or whitespace; each item is either a
    single page number or an inclusive "first-last" range.

    Args:
        spec: Text entered by the user.
        page_count: Number of pages in the document, used for bounds checks.

    Returns:
        A list of 0-based page indices, in the order they were written.

    Raises:
        ValueError: If the spec is empty, contains a non-numeric item, or
            names a page outside ``1..page_count``.
    """
    # Glue "1 - 5" into "1-5" so whitespace can serve as an item separator.
    compact = re.sub(r'\s*-\s*', '-', spec.strip())
    indices = []
    for item in re.split(r'[,,\s]+', compact):
        if not item:
            continue
        first, dash, last = item.partition('-')
        start = int(first)
        stop = int(last) if dash else start
        if not 1 <= start <= stop <= page_count:
            raise ValueError('page spec out of range: {!r}'.format(item))
        # User pages are 1-based; PDF page indices are 0-based.
        indices.extend(range(start - 1, stop))
    if not indices:
        raise ValueError('empty page spec')
    return indices


def _pages_from_input(spec, page_count):
    """Parse ``spec``; on invalid input show an error in the UI and return None."""
    try:
        return _parse_page_spec(spec, page_count)
    except ValueError:
        st.error(_PAGE_SPEC_ERROR)
        return None


def _write_pages(pdf_reader, page_indices, target):
    """Copy the given 0-based pages of ``pdf_reader`` into a new PDF at ``target``."""
    writer = PdfFileWriter()
    for index in page_indices:
        writer.addPage(pdf_reader.getPage(index))
    with open(target, 'wb') as out:
        writer.write(out)


def _render_split():
    """UI for splitting a PDF: fixed-size chunks, page extraction, or page deletion."""
    uploaded_file = st.text_input("请输入要处理的pdf文件地址:")
    if uploaded_file == '':
        return
    pdf_reader = PdfFileReader(uploaded_file)
    n = pdf_reader.getNumPages()
    che = st.radio('选择拆分类型', ['按固定页数拆分', '截取某几页', '删除指定页面'])

    if che == '按固定页数拆分':
        fn = st.number_input('请输入每组拆分的文档页数:', 1, n, 1)
        stre = st.text_input("请输入拆分后文件存放根目录:")
        if st.button('开始拆分>>'):
            # One writer per chunk. Creating a writer per page (as before)
            # left only the last page of each chunk in its output file.
            for group, start in enumerate(range(0, n, fn), start=1):
                target = os.path.join(stre, 'test-{}.pdf'.format(group))
                _write_pages(pdf_reader, range(start, min(start + fn, n)), target)

    elif che == '截取某几页':
        st_en = st.text_input("请输入截取的起止页码,格式为“1-5”或“1,3,5”:")
        stre2 = st.text_input("请输入截取后pdf文件存放根目录:")
        # The button is only rendered once a page spec has been typed.
        if st_en != '' and st.button('开始截取>>'):
            keep = _pages_from_input(st_en, n)
            if keep is not None:
                _write_pages(pdf_reader, keep, os.path.join(stre2, '666.pdf'))

    else:
        st_en2 = st.text_input("请输入需要删除的页码,格式为“1-5”或“1,3,5”:")
        stre3 = st.text_input("请输入删除指定页面后的pdf文件存放根目录:")
        if st_en2 != '' and st.button('开始删除>>'):
            drop = _pages_from_input(st_en2, n)
            if drop is not None:
                dropped = set(drop)
                remaining = [r for r in range(n) if r not in dropped]
                _write_pages(pdf_reader, remaining, os.path.join(stre3, '666.pdf'))


def _render_merge():
    """UI for merging every PDF found in a directory into one output file."""
    path = st.text_input("请输入要处理的pdf文件根目录:")
    scn = st.text_input("请填写输出文件地址及文件名")
    if path == '' or scn == '':
        return
    if not st.button('开始合并>>'):
        return
    # os.listdir gives no ordering guarantee, so sort for a predictable merge
    # order, and ignore entries that are not PDFs.
    file_list = sorted(f for f in os.listdir(path) if f.lower().endswith('.pdf'))
    output_path = os.path.abspath(scn)
    file_out = PdfFileWriter()
    for file in file_list:
        docdir = os.path.join(path, file)
        # Never feed a previous output back into the merge; it is about to be
        # overwritten below.
        if os.path.abspath(docdir) == output_path:
            continue
        file_read = PdfFileReader(docdir)
        for pageNum in range(file_read.getNumPages()):
            file_out.addPage(file_read.getPage(pageNum))
    if file_out.getNumPages() == 0:
        st.warning('未找到可合并的PDF文件。')
        return
    with open(scn, 'wb') as output:
        file_out.write(output)


def _render_read():
    """UI for showing text or tables from one page, or from the whole PDF."""
    path3 = st.text_input("请输入要读取的pdf文件地址:")
    if path3 == '':
        return
    ms = st.radio('请选择读取模式:', ['指定页码', '全部'])
    with pdfplumber.open(path3) as p:
        pages = p.pages
        if not pages:
            st.warning('该PDF没有可读取的页面。')
            return

        if ms == '指定页码':
            # Bound the selector by the real page count rather than a fixed limit.
            ymq = st.number_input("请选择要读取的pdf页码:", 1, len(pages), 1)
            dqlx = st.radio('请选择读取类型', ['文本内容', '表格内容'])
            page = pages[ymq - 1]
            if dqlx == '文本内容':
                # `or ''` covers a page for which no text is returned (None).
                st.write(page.extract_text() or '')
            else:
                tables = page.extract_tables()
                if not tables:
                    st.info('该页未检测到表格。')
                else:
                    n_table = st.number_input('请选择读取页面中第几个表格:', 1, len(tables), 1)
                    datan = tables[n_table - 1]
                    # The first row of the table is used as the column header.
                    st.dataframe(pd.DataFrame(datan[1:], columns=datan[0]))
        else:
            dqlx2 = st.radio('请选择读取类型', ['文本内容', '表格内容'])
            if dqlx2 == '文本内容':
                sz = '\n'.join(page.extract_text() or '' for page in pages)
                st.write(sz)
            else:
                all_tables = fx([page.extract_tables() for page in pages])
                frames = [pd.DataFrame(data=y[1:], columns=y[0]) for y in all_tables if y]
                if frames:
                    st.dataframe(pd.concat(frames))
                else:
                    # pd.concat raises on an empty list, so report it instead.
                    st.info('未检测到表格。')


def _render_preview():
    """UI for uploading a PDF and embedding it in the page as a base64 data URI."""
    file = st.file_uploader("请上传PDF")
    if file is not None:
        base64_pdf = base64.b64encode(file.read()).decode('utf-8')
        pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="100%" height="1000" type="application/pdf">'
        st.markdown(pdf_display, unsafe_allow_html=True)


def _render_convert():
    """UI for the conversion modes: word->pdf, ppt->pdf, pdf->image, image->pdf."""
    ms1 = st.radio('请选择转换模式:', ['word->pdf', 'ppt->pdf', 'pdf->jpg/png', 'jpg/png->pdf'])

    if ms1 == 'word->pdf':
        path4 = st.text_input("请输入要批量转换的word文件根目录:")
        if path4 != '':
            for name in sorted(os.listdir(path4)):
                # Skip anything that is not a Word document, including the
                # "~$..." lock files Word leaves next to open documents.
                if name.startswith('~$') or not name.lower().endswith(('.doc', '.docx')):
                    continue
                source = os.path.join(path4, name)
                # splitext strips only the final extension, so dots elsewhere
                # in the path no longer truncate the output name.
                convert(source, os.path.splitext(source)[0] + '.pdf')
            st.success('转换成功!')

    elif ms1 == 'pdf->jpg/png':
        path5 = st.text_input("请输入要转换的pdf文件地址:")
        dir_1 = st.text_input("请输入要输出的图片保存根目录:")
        if path5 != '' and dir_1 != '':
            with fitz.open(path5) as doc:
                for page in doc:
                    pix = page.get_pixmap()
                    pix.save(os.path.join(dir_1, "page-%i.png" % page.number))

    elif ms1 == 'jpg/png->pdf':
        dir_2 = st.text_input("请输入要转换为pdf的图片根目录:")
        path6 = st.text_input("请输入合成的pdf文件存放地址:")
        if path6 != '' and dir_2 != '':
            # Sorted for a predictable page order; non-image entries are skipped.
            imglist = sorted(f for f in os.listdir(dir_2) if f.lower().endswith(_IMAGE_SUFFIXES))
            if not imglist:
                st.warning('未找到可转换的图片文件。')
                return
            with fitz.open() as doc:
                for f in imglist:
                    with fitz.open(os.path.join(dir_2, f)) as img:
                        rect = img[0].rect
                        pdfbytes = img.convert_to_pdf()
                    with fitz.open("pdf", pdfbytes) as imgPDF:
                        # One output page per image, sized to the image itself.
                        page = doc.new_page(width=rect.width, height=rect.height)
                        page.show_pdf_page(rect, imgPDF, 0)
                doc.save(path6)

    elif ms1 == 'ppt->pdf':
        dir_3 = st.text_input("请输入要转换为pdf的PPT文件地址:")
        path7 = st.text_input("请输入生成的pdf文件存放地址:")
        if path7 != '' and dir_3 != '':
            # NOTE(review): PowerPoint does not appear among PyMuPDF's
            # documented input formats - confirm this branch works on real
            # .ppt/.pptx files.
            with fitz.open(dir_3) as ppt:
                pdfbytes = ppt.convert_to_pdf()
            with fitz.open("pdf", pdfbytes) as pdf:
                pdf.save(path7)


# Top-level mode selector; each mode renders its own widgets below it.
fns = st.radio('请选择PDF处理类型:', ['拆分', '合并', '读取', '在线预览', '转换'])
if fns == '拆分':
    _render_split()
elif fns == '合并':
    _render_merge()
elif fns == '读取':
    _render_read()
elif fns == '在线预览':
    _render_preview()
else:
    _render_convert()