ArXivAudio / get_pages.py
Archan's picture
Create new file
d085c50
raw
history blame contribute delete
658 Bytes
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer
from preprocess import pre_process
def get_pages(filename, start_page=0, end_page=0):
    """Extract and pre-process the text of a page range from a PDF.

    Every integer from ``start_page`` to ``end_page`` (inclusive) is
    decremented by one and the resulting list is handed to pdfminer's
    ``extract_pages`` as ``page_numbers``. Presumably callers pass 1-based
    page numbers and pdfminer expects 0-based indices -- confirm.

    NOTE(review): with the defaults (0, 0) the list is ``[-1]``; verify
    that this is what callers intend.

    Args:
        filename: PDF source passed straight to ``extract_pages``.
        start_page: First page of the range.
        end_page: Last page of the range (inclusive).

    Returns:
        The result of ``pre_process`` applied to the concatenated text of
        every ``LTTextContainer`` element on the selected pages.
    """
    page_indices = [page - 1 for page in range(start_page, end_page + 1)]
    # Echo the indices that are about to be requested.
    print(page_indices)

    layouts = extract_pages(filename, page_numbers=page_indices)
    # Only text containers carry text; other layout elements are skipped.
    text_chunks = (
        element.get_text()
        for layout in layouts
        for element in layout
        if isinstance(element, LTTextContainer)
    )
    return pre_process("".join(text_chunks))