# -*- coding: utf-8 -*-
import PyPDF2

from build_index.parser.base import BaseParser


class PDFParser(BaseParser):
    """Parser that extracts the text of a PDF file via ``PyPDF2.PdfReader``."""

    def header_remove(self):
        """Remove the page header of the research report (placeholder, no-op)."""
        pass

    def footnote_remove(self):
        """Remove the page footer of the research report (placeholder, no-op)."""
        pass

    def parse_file(self, file):
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file; it is opened in binary mode.

        Returns:
            A ``(text, metadata)`` tuple. ``text`` is the per-page text joined
            with newlines. ``metadata`` is a dict with the keys ``'source'``
            (the ``file`` argument as given) and ``'pages'`` (the number of
            pages in the document).
        """
        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            num_pages = len(pdf.pages)
            # Walk every page. The earlier loop used range(num_pages - 1) and
            # therefore dropped the final page even though 'pages' reported
            # the full count.
            # NOTE(review): if the last page was skipped on purpose (e.g. a
            # trailing disclaimer page), reinstate that explicitly.
            # Extraction must happen while the file is still open, since the
            # reader pulls page content from the underlying stream.
            # `or ""` keeps the join below safe should a page yield no text.
            text_list = [page.extract_text() or "" for page in pdf.pages]

        text = "\n".join(text_list)
        metadata = {"source": file, "pages": num_pages}
        return text, metadata