|
|
|
|
|
|
|
|
|
import os |
|
import json |
|
from PyPDF2 import PdfReader |
|
from docx import Document |
|
|
|
def extract_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, concatenated in page order with
        no separator inserted between pages.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Extract inside the ``with`` block: the reader was built on this
        # open file handle. Pages are iterated directly and joined once
        # instead of growing a string with ``+=`` page by page.
        return "".join(page.extract_text() for page in reader.pages)
|
|
|
def extract_from_json(json_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (for example a dict or a list).

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII content does not depend on
    # the platform's default locale encoding.
    with open(json_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)
|
|
|
def extract_from_word(word_path):
    """Extract the paragraph text of a Word (.docx) file.

    Only the entries of ``doc.paragraphs`` are read.

    Args:
        word_path: Path of the .docx file to read.

    Returns:
        The text of every paragraph, each followed by a newline; an empty
        string when the document has no paragraphs.
    """
    doc = Document(word_path)
    # Join once rather than growing a string with ``+=`` per paragraph.
    return "".join(para.text + "\n" for para in doc.paragraphs)
|
|
|
def extract_data(file_path):
    """Extract data from a file, choosing the extractor by file extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is
    handled the same way as ``report.pdf``.

    Args:
        file_path: Path of the file to read.

    Returns:
        Whatever the matching extractor returns: text for ``.pdf`` and
        ``.docx`` files, the parsed JSON value for ``.json`` files.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.json`` or ``.docx``.
    """
    _, file_extension = os.path.splitext(file_path)
    # Normalise only for the comparison; the error message below keeps the
    # extension exactly as it appeared in the path.
    normalized = file_extension.lower()

    if normalized == ".pdf":
        return extract_from_pdf(file_path)
    if normalized == ".json":
        return extract_from_json(file_path)
    if normalized == ".docx":
        return extract_from_word(file_path)
    raise ValueError("Unsupported file extension: " + file_extension)
|
|
|
def create_data_dictionary(files):
    """Extract every given file and map its path to the extracted data.

    Args:
        files: An iterable of file paths. A single path passed as ``str``,
            ``bytes`` or ``os.PathLike`` is treated as a one-element list
            instead of being iterated character by character.

    Returns:
        A dict mapping each successfully processed path to the value
        ``extract_data`` returned for it. Files for which ``extract_data``
        raises ``ValueError`` (unsupported extension, or malformed JSON via
        ``json.JSONDecodeError``) are reported with ``print`` and left out.
    """
    # Iterating a lone path would yield its individual characters, each of
    # which would then be rejected as an unsupported file.
    if isinstance(files, (str, bytes, os.PathLike)):
        files = [files]

    data_dict = {}
    for file_path in files:
        try:
            file_data = extract_data(file_path)
        except ValueError as e:
            print(e)
        else:
            data_dict[file_path] = file_data
    return data_dict
|
|
|
|
|
# NOTE(review): `path` is not read anywhere in the visible code; it is kept
# in case code outside this view relies on it.
path = ''

# Files to load, as a list of paths; the first entry is the exam that gets
# printed by main().
# NOTE(review): 'data' has no extension, so extract_data() rejects it.
# Replace it with the real exam file (presumably a .json file -- confirm).
exam_files = ['data']

# Keys read from the exam's 'header' section.
school_data = [
    'university',
    'department',
    'course_code',
    'course_title',
    'date',
    'duration',
    'instructor',
]
# Keys printed for each multiple-choice question.
qcm_data = ['question', 'options', 'answer']
# Keys printed for each short-answer and long-answer question.
short_data = ['question', 'answer']


def _print_questions(questions, keys):
    """Print the given keys of every question, prefixed with its index."""
    for idx, question in enumerate(questions):
        for key in keys:
            print(f" Index is: {idx} and '{key.capitalize()}': {question[key]}")


def main():
    """Load the configured exam files and print the first exam's contents.

    Exits with an error message when no file is configured or when the
    first file did not yield a dict (it failed to load, or it is a text
    format such as PDF/DOCX rather than JSON).
    """
    print(exam_files)
    data_dict = create_data_dictionary(exam_files)

    if not exam_files:
        raise SystemExit("No exam files configured.")

    first_file = exam_files[0]
    exam = data_dict.get(first_file)
    if not isinstance(exam, dict):
        raise SystemExit(f"No structured exam data was loaded from {first_file!r}.")

    multiple_choice_questions = exam['multiple_choice_questions']
    short_answer_questions = exam['short_answer_questions']
    long_answer_questions = exam['long_answer_questions']

    header = exam['header']
    for s_data in school_data:
        print(f" {s_data}: {header[s_data]}")
    print("***************'school data'************************")

    _print_questions(multiple_choice_questions, qcm_data)
    print("***************'multiple_choice_questions'************************")

    _print_questions(short_answer_questions, short_data)
    print("***************' END short_answer_questions'************************")

    print("***************' START long_answer_questions'************************")
    _print_questions(long_answer_questions, short_data)
    print("***************' END long_answer_questions'************************")


if __name__ == "__main__":
    main()