# -*-coding:utf-8 -*- | |
import PyPDF2 | |
from build_index.parser.base import BaseParser | |
class PDFParser(BaseParser):
    """Parser that extracts plain text from a PDF file using PyPDF2."""

    def header_remove(self):
        """Remove the page header of a research report (not implemented yet)."""
        pass

    def footnote_remove(self):
        """Remove the page footer of a research report (not implemented yet)."""
        pass

    def parse_file(self, file):
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file to read; it is opened in binary mode.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the extracted text of
            all pages joined with newlines. ``metadata`` is a dict with the
            keys ``'source'`` (the ``file`` argument as given) and ``'pages'``
            (the number of pages in the document).
        """
        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            num_pages = len(pdf.pages)
            # Walk over every page. The earlier ``range(num_pages - 1)`` loop
            # silently dropped the last page, so a single-page PDF produced
            # empty text even though the metadata reported one page.
            # Extraction must happen while the file handle is still open.
            text_list = [page.extract_text() for page in pdf.pages]

        text = '\n'.join(text_list)
        metadata = {'source': file, 'pages': num_pages}
        return text, metadata