FinDoc / build_index /parser /pdf_parser.py
xl2533's picture
change parser
03b9994
raw
history blame
731 Bytes
# -*-coding:utf-8 -*-
import PyPDF2
from build_index.parser.base import BaseParser
class PDFParser(BaseParser):
    """Parser that extracts the plain text of a PDF file via PyPDF2."""

    def header_remove(self):
        """Remove the page headers of a research report (not implemented)."""
        pass

    def footnote_remove(self):
        """Remove the page footers of a research report (not implemented)."""
        pass

    def parse_file(self, file):
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file to read.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the text of all pages
            joined with newlines; ``metadata`` is a dict holding the source
            path under ``'source'`` and the page count under ``'pages'``.
        """
        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            num_pages = len(pdf.pages)
            # Iterate over the pages themselves so the last page is included
            # (the previous ``range(num_pages - 1)`` loop skipped it, which
            # also produced empty text for single-page documents).
            # ``or ''`` keeps the join below safe if a page yields no text.
            text_list = [page.extract_text() or '' for page in pdf.pages]
        text = '\n'.join(text_list)
        metadata = {'source': file, 'pages': num_pages}
        return text, metadata