Merge pull request #1 from jeff52415/release/v1.0.0
- .dockerignore +20 -0
- .github/workflows/ci.yml +75 -0
- .github/workflows/docker.yml +43 -0
- .github/workflows/huggingface-sync.yml +30 -0
- .gitignore +12 -2
- .pre-commit-config.yaml +36 -26
- CHANGELOG.md +16 -0
- Dockerfile +69 -0
- Makefile +107 -0
- README.md +67 -4
- app.py +456 -0
- docker-compose.yml +34 -0
- docs/POETRY_JUPYTER_SETUP.md +0 -51
- docs/POETRY_USAGE.md +0 -158
- docs/adding_new_parser.md +178 -0
- llmdataparser/__init__.py +34 -5
- llmdataparser/base_parser.py +282 -28
- llmdataparser/bbh_parser.py +177 -0
- llmdataparser/gsm8k_parser.py +151 -0
- llmdataparser/humaneval_parser.py +273 -0
- llmdataparser/ifeval_parser.py +164 -0
- llmdataparser/math_parser.py +189 -0
- llmdataparser/mbpp_parser.py +174 -0
- llmdataparser/mgsm_parser.py +192 -0
- llmdataparser/mmlu_parser.py +694 -54
- llmdataparser/prompts.py +61 -4
- llmdataparser/tmlu_parser.py +195 -0
- llmdataparser/tw_legal_parser.py +125 -0
- mkdocs.yml +9 -0
- nginx.conf +58 -0
- notebooks/demo.ipynb +0 -77
- poetry.lock +0 -0
- pyproject.toml +49 -24
- tests/test_bbh_parser.py +190 -0
- tests/test_gsm8k_parser.py +207 -0
- tests/test_humaneval_parser.py +198 -0
- tests/test_ifeval_parser.py +120 -0
- tests/test_math_parser.py +253 -0
- tests/test_mbpp_parser.py +178 -0
- tests/test_mgsm_parser.py +228 -0
- tests/test_mmlu_parser.py +314 -0
- tests/test_tmlu_parser.py +176 -0
- tests/test_tw_legal_parser.py +146 -0
.dockerignore
ADDED
@@ -0,0 +1,20 @@
.git
.gitignore
.env
.venv
__pycache__
*.pyc
*.pyo
*.pyd
.Python
*.py[cod]
*$py.class
.pytest_cache
.coverage
htmlcov
.mypy_cache
.ruff_cache
.DS_Store
notebooks/
tests/
docs/
.github/workflows/ci.yml
ADDED
@@ -0,0 +1,75 @@
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12"]
      fail-fast: false

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"

      - name: Install Poetry
        run: |
          pipx install poetry

      - name: Configure Poetry
        run: |
          poetry config virtualenvs.create true
          poetry config virtualenvs.in-project true

      - name: Cache Poetry virtualenv
        uses: actions/cache@v3
        with:
          path: ./.venv
          key: venv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}

      - name: Install dependencies
        run: |
          poetry install

      - name: Run pre-commit hooks
        uses: pre-commit/[email protected]
        with:
          extra_args: --all-files
        env:
          PRE_COMMIT_CACHE_KEY: ${{ hashFiles('.pre-commit-config.yaml', 'pyproject.toml') }}

      - name: Run tests with coverage
        run: |
          poetry run pytest --cov=llmdataparser --cov-report=xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
          fail_ci_if_error: true

      - name: Build documentation
        run: |
          poetry add mkdocs mkdocs-material --group dev
          cp README.md docs/index.md
          poetry run mkdocs build
        if: matrix.python-version == '3.12'

      - name: Upload documentation artifact
        uses: actions/upload-artifact@v3
        with:
          name: documentation
          path: site/
        if: matrix.python-version == '3.12'
.github/workflows/docker.yml
ADDED
@@ -0,0 +1,43 @@
name: Docker CD

on:
  push:
    branches: [main]
    paths:
      - "Dockerfile"
      - ".dockerignore"
      - "docker-compose.yml"
      - "pyproject.toml"
      - "poetry.lock"

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get version from pyproject.toml
        run: |
          echo "VERSION=$(grep '^version = ' pyproject.toml | cut -d'"' -f2)" >> $GITHUB_ENV

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: |
            jeff52415/llmdataparser:latest
            jeff52415/llmdataparser:v${{ env.VERSION }}
          cache-from: type=registry,ref=jeff52415/llmdataparser:latest
          cache-to: type=inline
          platforms: linux/amd64,linux/arm64
.github/workflows/huggingface-sync.yml
ADDED
@@ -0,0 +1,30 @@
name: Deploy to Hugging Face Space

on:
  push:
    branches: [main]

jobs:
  sync:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Configure Git
        run: |
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git config --global user.name "github-actions[bot]"

      - name: Login to Hugging Face
        env:
          HF_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
        run: |
          huggingface-cli login --token $HF_TOKEN --add-to-git-credential

      - name: Push to Hugging Face Space
        run: |
          git remote add space https://huggingface.co/spaces/JeffYang52415/LLMEval-Dataset-Parser
          git push space main:main
.gitignore
CHANGED
@@ -8,8 +8,6 @@ build/
 dist/
 *.egg-info/
 
-# Poetry
-poetry.lock
 
 # Virtual environment
 .env/
@@ -32,3 +30,15 @@ poetry.lock
 
 # Mac files
 .DS_Store
+
+#gradio cache
+.cache/
+.gradio/
+
+#notebook cache
+.ipynb_checkpoints/
+notebooks/
+
+#coverage
+.coverage
+.coverage.*
.pre-commit-config.yaml
CHANGED
@@ -1,18 +1,12 @@
 # .pre-commit-config.yaml
 
 repos:
-  - repo: https://github.com/
-    rev:
-    hooks:
-      - id: black
-        args: ["--target-version=py311"]
-        additional_dependencies: ["typing-extensions>=4.8.0"]
-  - repo: https://github.com/PyCQA/flake8
-    rev: 6.1.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.4.4
     hooks:
-      - id:
-
-
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
   - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
@@ -24,10 +18,10 @@ repos:
       - id: mypy
         args:
           [
-            "--
+            "--config-file=pyproject.toml",
             "--install-types",
             "--non-interactive",
-            "--
+            "--exclude=tests/*",
           ]
         additional_dependencies:
           - "typing-extensions>=4.8.0"
@@ -41,7 +35,6 @@ repos:
       - id: detect-aws-credentials
         args: ["--allow-missing-credentials"]
       - id: detect-private-key
-      - id: end-of-file-fixer
       - id: check-added-large-files
       - id: check-ast
       - id: check-byte-order-marker
@@ -49,9 +42,6 @@ repos:
       - id: check-docstring-first
      - id: check-json
       - id: debug-statements
-      - id: detect-private-key
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
       - id: mixed-line-ending
   - repo: https://github.com/myint/autoflake
     rev: v2.2.1
@@ -70,16 +60,36 @@ repos:
     hooks:
       - id: prettier
         types_or: [markdown, yaml]
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.4.4
-    hooks:
-      # Run the linter.
-      - id: ruff
-        args: [--fix]
-      # Run the formatter.
-      - id: ruff-format
   - repo: https://github.com/kynan/nbstripout
     rev: 0.5.0 # use the latest version
     hooks:
       - id: nbstripout
+  - repo: https://github.com/nbQA-dev/nbQA
+    rev: 1.7.1
+    hooks:
+      - id: nbqa-ruff
+        additional_dependencies: [ruff]
+      - id: nbqa-isort
+      - id: nbqa-flake8
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.15.0
+    hooks:
+      - id: pyupgrade
+  - repo: https://github.com/executablebooks/mdformat
+    rev: 0.7.17
+    hooks:
+      - id: mdformat
+        additional_dependencies:
+          - mdformat-gfm # GitHub-flavored Markdown
+          - mdformat-frontmatter # YAML frontmatter
+          - mdformat-footnote
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.9.0.6
+    hooks:
+      - id: shellcheck
+  - repo: https://github.com/pycqa/bandit
+    rev: 1.7.7
+    hooks:
+      - id: bandit
+        args: ["-c", "pyproject.toml"]
+        additional_dependencies: ["bandit[toml]", ".[toml]"]
CHANGELOG.md
ADDED
@@ -0,0 +1,16 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## \[1.0.0\] - 2024-12-30

### Added

- Initial release
- Support for multiple benchmark datasets (MMLU, GSM8k, etc.)
- Gradio interface for dataset exploration
- Comprehensive test suite
- Documentation and examples
Dockerfile
ADDED
@@ -0,0 +1,69 @@
# Use Python 3.12 slim image as base
FROM python:3.12-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    POETRY_VERSION=1.7.1 \
    POETRY_HOME="/opt/poetry" \
    POETRY_NO_INTERACTION=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860

# Set working directory
WORKDIR /app

# Create cache directories for Hugging Face and set permissions correctly
ENV HF_HOME=/app/.cache/huggingface
RUN mkdir -p /app/.cache/huggingface && \
    mkdir -p /app/.cache/torch && \
    mkdir -p /app/.cache/transformers

# Install system dependencies
RUN apt-get update && apt-get install -y \
    portaudio19-dev \
    python3-pip \
    gcc \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Install poetry
RUN pip install poetry==${POETRY_VERSION} && \
    poetry config virtualenvs.create false

# Copy dependency files first
COPY pyproject.toml poetry.lock ./

# Install dependencies using the lock file
RUN poetry install --no-dev --no-interaction --no-ansi

# Create app user and group with specific UID/GID
RUN groupadd -r app --gid 1000 && \
    useradd -r -g app --uid 1000 --create-home app

# Set ownership of all cache directories
RUN chown -R app:app /app/.cache && \
    chmod -R 755 /app/.cache

# Before switching to non-root user, create and set permissions
RUN mkdir -p /home/app/.cache && \
    mkdir -p /home/app/.config/matplotlib && \
    chown -R app:app /home/app/.cache && \
    chown -R app:app /home/app/.config

# Set matplotlib config dir
ENV MPLCONFIGDIR=/home/app/.config/matplotlib

# Switch to non-root user
USER app

# Copy the rest of the application
COPY --chown=app:app . .

# Expose the port the app runs on
EXPOSE 7860

# Run the application
ENTRYPOINT ["python"]
CMD ["app.py"]
Makefile
ADDED
@@ -0,0 +1,107 @@
# -----------------------------
# Variables
# -----------------------------
IMAGE_NAME = llmdataparser
CONTAINER_NAME = llmdataparser
VERSION = latest

# -----------------------------
# Docker Basic Commands
# -----------------------------
# Build the Docker image
build:
	docker build -t $(IMAGE_NAME):$(VERSION) .

# Run the container
run:
	docker run -d -p 7860:7860 --name $(CONTAINER_NAME) $(IMAGE_NAME):$(VERSION)

# Stop the container
stop:
	docker stop $(CONTAINER_NAME)

# Remove the container
rm:
	docker rm $(CONTAINER_NAME)

# Remove the image
rmi:
	docker rmi $(IMAGE_NAME):$(VERSION)

# -----------------------------
# Docker Compose Commands
# -----------------------------
# Start with docker-compose (development)
compose-up:
	docker compose up -d

# Stop and remove containers
compose-down:
	docker compose down

# View logs
compose-logs:
	docker compose logs -f

# Rebuild containers
compose-build:
	docker compose build

# Restart containers
compose-restart:
	docker compose restart

# -----------------------------
# Convenience Commands
# -----------------------------
# Build and run with docker
up: build run

# Stop and remove container
down: stop rm

# Clean everything
clean: stop rm rmi

# -----------------------------
# Monitoring Commands
# -----------------------------
# Show container logs
logs:
	docker logs $(CONTAINER_NAME)

# Follow container logs
logs-follow:
	docker logs -f $(CONTAINER_NAME)

# Show container status
status:
	docker ps -a | grep $(CONTAINER_NAME)

# Enter container shell
shell:
	docker exec -it $(CONTAINER_NAME) /bin/bash

# -----------------------------
# Production Commands
# -----------------------------
# Test nginx configuration (for production use)
nginx-test:
	docker compose run --rm nginx nginx -t

# Start with nginx test (for production use)
compose-up-prod: nginx-test compose-up

# -----------------------------
# Security Commands
# -----------------------------
security-check:
	@echo "Checking nginx configuration..."
	docker compose run --rm nginx nginx -t
	@echo "Checking exposed ports..."
	docker compose config | grep -E "ports:|127.0.0.1"

# Ensure all targets are treated as commands, not files
.PHONY: build run stop rm rmi clean up down logs shell \
	compose-up compose-down compose-logs compose-build compose-restart \
	nginx-test status logs-follow compose-up-prod
README.md
CHANGED
@@ -1,6 +1,17 @@
+---
+title: LLMEval Dataset Parser
+emoji: ⚡
+colorFrom: green
+colorTo: gray
+sdk: docker
+pinned: false
+license: mit
+short_description: A collection of parsers for LLM benchmark datasets
+---
+
 # LLMDataParser
 
-**LLMDataParser** is a Python library that provides parsers for benchmark datasets used in evaluating Large Language Models (LLMs). It offers a unified interface for loading and parsing datasets like **MMLU
+**LLMDataParser** is a Python library that provides parsers for benchmark datasets used in evaluating Large Language Models (LLMs). It offers a unified interface for loading and parsing datasets like **MMLU**, **GSM8k**, and others, streamlining dataset preparation for LLM evaluation. The library aims to simplify the process of working with common LLM benchmark datasets through a consistent API.
 
 ## Features
 
@@ -8,6 +19,7 @@
 - **LLM-Agnostic**: Independent of any specific language model.
 - **Easy to Use**: Simple methods and built-in Python types.
 - **Extensible**: Easily add support for new datasets.
+- **Gradio**: Built-in Gradio interface for interactive dataset exploration and testing.
 
 ## Installation
 
@@ -22,7 +34,7 @@ You can install the package directly using `pip`. Even with only a `pyproject.to
    cd LLMDataParser
    ```
 
-
+1. **Install Dependencies with pip**:
 
 ```bash
 pip install .
@@ -38,7 +50,7 @@ Poetry manages the virtual environment and dependencies automatically, so you do
    poetry install
   ```
 
-
+1. **Activate the Virtual Environment**:
 
 ```bash
 poetry shell
@@ -46,7 +58,58 @@ Poetry manages the virtual environment and dependencies automatically, so you do
 
 ## Available Parsers
 
-- **MMLUDatasetParser
+- **MMLUDatasetParser**
+- **MMLUProDatasetParser**
+- **MMLUReduxDatasetParser**
+- **TMMLUPlusDatasetParser**
+- **GSM8KDatasetParser**
+- **MATHDatasetParser**
+- **MGSMDatasetParser**
+- **HumanEvalDatasetParser**
+- **HumanEvalDatasetPlusParser**
+- **BBHDatasetParser**
+- **MBPPDatasetParser**
+- **IFEvalDatasetParser**
+- **TWLegalDatasetParser**
+- **TMLUDatasetParser**
+
+## Quick Start Guide
+
+Here's a simple example demonstrating how to use the library:
+
+```python
+from llmdataparser import ParserRegistry
+# list all available parsers
+ParserRegistry.list_parsers()
+# get a parser
+parser = ParserRegistry.get_parser("mmlu")
+# load the parser
+parser.load() # optional: task_name, split
+# parse the parser
+parser.parse() # optional: split_names
+
+print(parser.task_names)
+print(parser.split_names)
+print(parser.get_dataset_description)
+print(parser.get_huggingface_link)
+print(parser.total_tasks)
+data = parser.get_parsed_data
+```
+
+We also provide a Gradio demo for interactive testing:
+
+```bash
+python app.py
+```
+
+## Adding New Dataset Parsers
+
+To add support for a new dataset, please refer to our detailed guide in [docs/adding_new_parser.md](docs/adding_new_parser.md). The guide includes:
+
+- Step-by-step instructions for creating a new parser
+- Code examples and templates
+- Best practices and common patterns
+- Testing guidelines
 
 ## License
 
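For orientation, the quick start added to the README above boils down to the following sketch (not part of the diff itself). It assumes the `mmlu` parser registered in `llmdataparser/__init__.py` and the `ParseEntry` fields (`question`, `answer`, `task_name`) that the parsers in this PR use.

```python
# Sketch only: assumes the "mmlu" parser is registered and that parsed entries
# expose question, answer and task_name, as elsewhere in this PR.
from llmdataparser import ParserRegistry

parser = ParserRegistry.get_parser("mmlu")
parser.load()   # optional: task_name, split
parser.parse()  # optional: split_names

# Inspect the first few parsed entries.
for entry in parser.get_parsed_data[:3]:
    print(entry.task_name)
    print(entry.question)
    print(entry.answer)
    print("-" * 40)
```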
app.py
ADDED
@@ -0,0 +1,456 @@
import secrets
from functools import lru_cache
from typing import Any

import gradio as gr

from llmdataparser import ParserRegistry
from llmdataparser.base_parser import (
    VALID_CATEGORIES,
    DatasetDescription,
    DatasetParser,
    EvaluationMetric,
    ParseEntry,
)


@lru_cache(maxsize=32)
def get_parser_instance(parser_name: str) -> DatasetParser[Any]:
    """Get a cached parser instance by name."""
    return ParserRegistry.get_parser(parser_name)


def get_available_splits(parser: DatasetParser[Any]) -> list[str] | None:
    """Get available splits for the selected parser after loading."""
    if not hasattr(parser, "split_names") or not parser.split_names:
        return None
    return list(parser.split_names)


def get_available_tasks(parser: DatasetParser[Any]) -> list[str]:
    """Get available tasks for the selected parser."""
    if not hasattr(parser, "task_names"):
        return ["default"]
    return list(parser.task_names)


def format_entry_attributes(entry: ParseEntry) -> str:
    """Format all attributes of a ParseEntry except question and answer."""
    from dataclasses import fields

    # Get all field names from the dataclass
    field_names = [field.name for field in fields(entry)]
    # Filter out question and answer
    filtered_fields = [
        name for name in field_names if name not in ["question", "answer"]
    ]
    # Build the formatted string
    return "\n".join(f"{name}: {getattr(entry, name)}" for name in filtered_fields)


def load_and_parse(
    parser_name: str, task_name: str | None, split_name: str | None
) -> tuple[int, str, str, str, gr.Dropdown, str]:
    """Load and parse the dataset, return the first entry and available splits."""
    try:
        parser = get_parser_instance(parser_name)

        # Load the dataset
        parser.load(
            task_name=task_name if task_name != "default" else None,
            split=split_name,
            trust_remote_code=True,
        )

        # Get available splits after loading
        available_splits = get_available_splits(parser)

        # Parse the dataset
        parser.parse(split_names=split_name, force=True)

        # Get parsed data
        parsed_data = parser.get_parsed_data

        split_dropdown = gr.Dropdown(
            choices=available_splits,
            label="Select Split",
            interactive=True,
            value=None,
            allow_custom_value=True,
        )

        info = parser.__repr__()
        if not parsed_data:
            return 0, "", "", "", split_dropdown, info

        # Get the first entry
        first_entry = parsed_data[0]

        return (
            0,  # Return first index instead of list of indices
            first_entry.question,
            first_entry.answer,
            format_entry_attributes(first_entry),
            split_dropdown,
            info,
        )
    except Exception as e:
        # Make the error message more user-friendly and detailed
        error_msg = f"Failed to load dataset: {str(e)}\nParser: {parser_name}\nTask: {task_name}\nSplit: {split_name}"
        return 0, error_msg, "", "", [], ""


def update_entry(
    parsed_data_index: int | None, parser_name: str
) -> tuple[str, str, str]:
    """Update the displayed entry based on the selected index."""
    try:
        if not parser_name:
            return "Please select a parser first", "", ""

        parser = get_parser_instance(parser_name)
        parsed_data = parser.get_parsed_data

        if not parsed_data:
            return "No data available", "", ""

        if parsed_data_index is None:
            # Random selection using secrets instead of random
            random_index = secrets.randbelow(len(parsed_data))
            entry = parsed_data[random_index]
        else:
            # Ensure index is within bounds
            index = max(0, min(parsed_data_index, len(parsed_data) - 1))
            entry = parsed_data[index]

        return (
            entry.question,
            entry.answer,
            format_entry_attributes(entry),
        )
    except Exception as e:
        return f"Error: {str(e)}", "", ""


def update_parser_options(parser_name: str) -> tuple[gr.Dropdown, gr.Dropdown, str]:
    """Update available tasks and splits for the selected parser."""
    try:
        parser = get_parser_instance(parser_name)
        tasks = get_available_tasks(parser)
        default_task = getattr(parser, "_default_task", "default")

        # Update task dropdown
        task_dropdown = gr.Dropdown(
            choices=tasks,
            value=default_task,
            label="Select Task",
            interactive=True,
            allow_custom_value=True,
        )

        # Update split dropdown - Note the value is now explicitly None
        splits = get_available_splits(parser)
        split_dropdown = gr.Dropdown(
            choices=splits,
            label="Select Split",
            interactive=True,
            value=None,
            allow_custom_value=True,
        )

        info = parser.__repr__()
        return task_dropdown, split_dropdown, info
    except Exception as e:
        return (
            gr.Dropdown(choices=["default"], value="default"),
            gr.Dropdown(choices=[]),
            f"Error: {str(e)}",
        )


def clear_parser_cache() -> None:
    """Clear the parser cache."""
    get_parser_instance.cache_clear()


def format_dataset_description(description: DatasetDescription) -> str:
    """Format DatasetDescription into a readable string."""
    formatted = [
        f"# {description.name}",
        f"\n**Purpose**: {description.purpose}",
        f"\n**Language**: {description.language}",
        f"\n**Format**: {description.format}",
        f"\n**Source**: {description.source}",
        f"\n**Characteristics**: {description.characteristics}",
    ]

    if description.citation:
        formatted.append(f"\n**Citation**:\n```\n{description.citation}\n```")

    if description.additional_info:
        formatted.append("\n**Additional Information**:")
        for key, value in description.additional_info.items():
            formatted.append(f"- {key}: {value}")

    return "\n".join(formatted)


def get_primary_metrics(metrics: list[EvaluationMetric]) -> list[str]:
    """Get list of primary metric names."""
    return [metric.name for metric in metrics if metric.primary]


def format_metric_details(metric: EvaluationMetric) -> str:
    """Format a single EvaluationMetric into a readable string."""
    return f"""# {metric.name}<br>
**Type**: {metric.type}<br>
**Description**: {metric.description}"""


def update_dataset_info(parser_name: str) -> tuple:
    """Update dataset description and evaluation metrics information."""
    try:
        parser = get_parser_instance(parser_name)
        description = parser.get_dataset_description()
        metrics = parser.get_evaluation_metrics()

        # Format description
        desc_text = format_dataset_description(description)

        # Get primary metrics for dropdown
        primary_metrics = get_primary_metrics(metrics)

        # Format details for first metric (or empty if no metrics)
        first_metric = metrics[0] if metrics else None
        metric_details = format_metric_details(first_metric) if first_metric else ""

        return (
            gr.Markdown(value=desc_text),
            gr.Dropdown(
                choices=primary_metrics,
                value=primary_metrics[0] if primary_metrics else None,
            ),
            gr.Markdown(value=metric_details),
        )
    except Exception as e:
        return (
            gr.Markdown(value=f"Error loading dataset description: {str(e)}"),
            gr.Dropdown(choices=[]),
            gr.Markdown(value=""),
        )


def update_metric_details(metric_name: str, parser_name: str) -> str:
    """Update the displayed metric details when selection changes."""
    try:
        parser = get_parser_instance(parser_name)
        metrics = parser.get_evaluation_metrics()
        selected_metric = next((m for m in metrics if m.name == metric_name), None)
        return format_metric_details(selected_metric) if selected_metric else ""
    except Exception as e:
        return f"Error loading metric details: {str(e)}"


def get_parser_categories(parser_name: str) -> list[str]:
    """Get categories for a specific parser."""
    try:
        parser = get_parser_instance(parser_name)
        description = parser.get_dataset_description()
        return description.category
    except Exception:
        return []


def filter_parsers_by_category(category: str | None) -> list[str]:
    """Filter available parsers by category."""
    if not category:
        return ParserRegistry.list_parsers()

    filtered_parsers = []
    for parser_name in ParserRegistry.list_parsers():
        categories = get_parser_categories(parser_name)
        if category in categories:
            filtered_parsers.append(parser_name)
    return filtered_parsers


def create_interface() -> gr.Blocks:
    """Create and return the Gradio interface."""
    with gr.Blocks(css="footer {display: none !important}") as demo:
        # Add header section with purpose and GitHub info
        gr.Markdown("""
        # LLM Evaluation Dataset Parser

        ### 🎯 Purpose
        A unified interface for parsing and exploring various LLM benchmark datasets (MMLU, MMLU-Pro, GSM8k, and more).
        This tool helps researchers and developers to:
        - Easily explore different benchmark datasets
        - Access standardized parsing for multiple dataset formats
        - View dataset descriptions and evaluation metrics

        ### 🔗 Links
        - [GitHub Repository](https://github.com/jeff52415/LLMDataParser)
        - [Documentation](https://github.com/jeff52415/LLMDataParser#readme)

        ---
        """)

        # State management
        parser_state = gr.State("")
        dataset_status = gr.Textbox(label="Dataset Status", interactive=False)

        with gr.Tabs():
            with gr.Tab("Dataset Explorer"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Add category dropdown before parser selection
                        category_dropdown = gr.Dropdown(
                            choices=["All"] + list(VALID_CATEGORIES),
                            label="Filter by Category",
                            value="All",
                            interactive=True,
                        )

                        # Parser selection and controls
                        available_parsers = ParserRegistry.list_parsers()
                        parser_dropdown = gr.Dropdown(
                            choices=available_parsers,
                            label="Select Parser",
                            value=available_parsers[0] if available_parsers else None,
                            interactive=True,
                            allow_custom_value=True,
                        )
                        task_dropdown = gr.Dropdown(
                            choices=["default"],
                            label="Select Task",
                            value="default",
                            interactive=True,
                            allow_custom_value=True,
                        )
                        split_dropdown = gr.Dropdown(
                            choices=[],
                            label="Select Split",
                            interactive=True,
                            value=None,
                            allow_custom_value=True,
                        )
                        load_button = gr.Button(
                            "Load and Parse Dataset", variant="primary"
                        )

                        # Entry selection
                        entry_index = gr.Number(
                            label="Select Entry Index (empty for random)",
                            precision=0,
                            interactive=True,
                        )
                        update_button = gr.Button(
                            "Update/Random Entry", variant="secondary"
                        )

                    with gr.Column(scale=2):
                        # Output displays
                        question_output = gr.Textbox(
                            label="Question", lines=5, show_copy_button=True
                        )
                        answer_output = gr.Textbox(
                            label="Answer", lines=5, show_copy_button=True
                        )
                        attributes_output = gr.Textbox(
                            label="Other Attributes", lines=5, show_copy_button=True
                        )

            with gr.Tab("Dataset Information"):
                with gr.Row():
                    with gr.Column(scale=2):
                        # Dataset description
                        dataset_description = gr.Markdown()

                    with gr.Column(scale=1):
                        # Evaluation metrics
                        gr.Markdown("## Evaluation Metrics")
                        metric_dropdown = gr.Dropdown(
                            label="Select Primary Metric", interactive=True
                        )
                        metric_details = gr.Markdown()

        # Add new event handler for category filtering
        def update_parser_list(category: str) -> gr.Dropdown:
            filtered_parsers = filter_parsers_by_category(
                None if category == "All" else category
            )
            return gr.Dropdown(
                choices=filtered_parsers,
                value=filtered_parsers[0] if filtered_parsers else None,
            )

        category_dropdown.change(
            fn=update_parser_list, inputs=[category_dropdown], outputs=[parser_dropdown]
        )

        # Event handlers
        parser_dropdown.change(
            fn=update_parser_options,
            inputs=parser_dropdown,
            outputs=[
                task_dropdown,
                split_dropdown,
                dataset_status,
            ],
        ).then(lambda x: x, inputs=parser_dropdown, outputs=parser_state).then(
            fn=update_dataset_info,
            inputs=[parser_dropdown],
            outputs=[dataset_description, metric_dropdown, metric_details],
        )

        load_button.click(
            fn=load_and_parse,
            inputs=[parser_dropdown, task_dropdown, split_dropdown],
            outputs=[
                entry_index,
                question_output,
                answer_output,
                attributes_output,
                split_dropdown,
                dataset_status,
            ],
            api_name="load_and_parse",
            show_progress="full",
        ).then(
            fn=update_dataset_info,
            inputs=[parser_dropdown],
            outputs=[dataset_description, metric_dropdown, metric_details],
        )

        update_button.click(
            fn=update_entry,
            inputs=[entry_index, parser_state],
            outputs=[
                question_output,
                answer_output,
                attributes_output,
            ],
            api_name="update_entry",
        )

        metric_dropdown.change(
            fn=update_metric_details,
            inputs=[metric_dropdown, parser_dropdown],
            outputs=metric_details,
        )

    return demo


if __name__ == "__main__":
    print("Starting Gradio interface...")  # Add debug logging
    demo = create_interface()
    try:
        demo.launch(
            show_error=True,  # Changed to True for debugging
        )
    except Exception as e:
        print(f"Error launching Gradio: {e}")  # Add error logging
        import traceback

        traceback.print_exc()
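The `format_entry_attributes` helper in `app.py` above builds its display string with `dataclasses.fields`. A standalone sketch of that technique, using a hypothetical `Entry` dataclass rather than the library's own `ParseEntry`:

```python
from dataclasses import dataclass, fields


@dataclass(frozen=True)
class Entry:  # hypothetical stand-in, not the library's ParseEntry
    question: str
    answer: str
    task_name: str


entry = Entry(question="2 + 2 = ?", answer="4", task_name="arithmetic")

# Same pattern as format_entry_attributes: list every field except question/answer.
extra = [f.name for f in fields(entry) if f.name not in ("question", "answer")]
print("\n".join(f"{name}: {getattr(entry, name)}" for name in extra))
# -> task_name: arithmetic
```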
docker-compose.yml
ADDED
@@ -0,0 +1,34 @@
|
|
version: "3.8"

services:
  llmdataparser:
    build: jeff52415/llmdataparser
    environment:
      - GRADIO_SERVER_PORT=7860
    volumes:
      - .:/app
      - huggingface_cache:/app/.cache/huggingface
    healthcheck:
      test: ["CMD", "curl", "-f", "http://127.0.0.1:7860"]
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      - internal

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - llmdataparser
    networks:
      - internal

networks:
  internal:

volumes:
  huggingface_cache:
|
docs/POETRY_JUPYTER_SETUP.md
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
# Connecting Poetry Environment with Jupyter Notebook
|
2 |
-
|
3 |
-
This guide provides simple steps to connect a Poetry-managed environment to Jupyter Notebook.
|
4 |
-
|
5 |
-
## Steps
|
6 |
-
|
7 |
-
1. **Activate the Poetry Environment**
|
8 |
-
|
9 |
-
First, navigate to your project directory and activate the Poetry shell:
|
10 |
-
|
11 |
-
```bash
|
12 |
-
poetry shell
|
13 |
-
```
|
14 |
-
|
15 |
-
2. **Install Jupyter as a Development Dependency**
|
16 |
-
|
17 |
-
If Jupyter is not already installed, add it as a development dependency:
|
18 |
-
|
19 |
-
```bash
|
20 |
-
poetry add --group dev jupyter
|
21 |
-
```
|
22 |
-
|
23 |
-
3. **Register the Poetry Environment as a Jupyter Kernel**
|
24 |
-
|
25 |
-
Run this command to make the Poetry environment available as a Jupyter kernel:
|
26 |
-
|
27 |
-
```bash
|
28 |
-
python -m ipykernel install --user --name=llmdataparser-env --display-name "Python (LLMDataParser)"
|
29 |
-
```
|
30 |
-
|
31 |
-
- `--name=llmdataparser-env`: Assigns a name to the kernel.
|
32 |
-
- `--display-name "Python (LLMDataParser)"`: Sets the display name seen in Jupyter.
|
33 |
-
|
34 |
-
4. **Start Jupyter Notebook**
|
35 |
-
|
36 |
-
Launch Jupyter Notebook from the Poetry shell:
|
37 |
-
|
38 |
-
```bash
|
39 |
-
jupyter notebook
|
40 |
-
```
|
41 |
-
|
42 |
-
5. **Select the Poetry Kernel in Jupyter**
|
43 |
-
|
44 |
-
- Open a notebook in Jupyter.
|
45 |
-
- Go to "Kernel" > "Change kernel" and select **Python (LLMDataParser)** from the list.
|
46 |
-
|
47 |
-
This connects the notebook to your Poetry environment.
|
48 |
-
|
49 |
-
---
|
50 |
-
|
51 |
-
You’re now set up to use your Poetry environment within Jupyter Notebook!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/POETRY_USAGE.md
DELETED
@@ -1,158 +0,0 @@
|
|
1 |
-
# Poetry Usage Guide
|
2 |
-
|
3 |
-
This guide provides instructions on how to use [Poetry](https://python-poetry.org/) to manage dependencies, install packages, and prepare your project for both development and production environments.
|
4 |
-
|
5 |
-
## Table of Contents
|
6 |
-
|
7 |
-
- [Overview](#overview)
|
8 |
-
- [Installing Poetry](#installing-poetry)
|
9 |
-
- [Using Poetry in Development](#using-poetry-in-development)
|
10 |
-
- [Installing Dependencies](#installing-dependencies)
|
11 |
-
- [Updating Dependencies](#updating-dependencies)
|
12 |
-
- [Adding and Removing Dependencies](#adding-and-removing-dependencies)
|
13 |
-
- [Synchronizing Dependencies](#synchronizing-dependencies)
|
14 |
-
- [Using Poetry in Production](#using-poetry-in-production)
|
15 |
-
- [Locking Dependencies](#locking-dependencies)
|
16 |
-
- [Installing from `poetry.lock`](#installing-from-poetrylock)
|
17 |
-
- [Poetry Commands Summary](#poetry-commands-summary)
|
18 |
-
|
19 |
-
---
|
20 |
-
|
21 |
-
## Overview
|
22 |
-
|
23 |
-
Poetry is a dependency manager and build tool for Python projects. It simplifies managing dependencies, creating virtual environments, and ensuring version consistency between development and production environments. Poetry relies on two files:
|
24 |
-
|
25 |
-
- **`pyproject.toml`**: Defines the dependencies and configuration.
|
26 |
-
- **`poetry.lock`**: Locks dependencies to specific versions to ensure consistency.
|
27 |
-
|
28 |
-
---
|
29 |
-
|
30 |
-
## Installing Poetry(macOS only)
|
31 |
-
|
32 |
-
To install Poetry, use the following command:
|
33 |
-
|
34 |
-
```bash
|
35 |
-
brew install poetry
|
36 |
-
```
|
37 |
-
|
38 |
-
Refer to the [Poetry documentation](https://python-poetry.org/docs/#installation) for more options and OS-specific installation instructions.
|
39 |
-
|
40 |
-
---
|
41 |
-
|
42 |
-
## Using Poetry in Development
|
43 |
-
|
44 |
-
### Installing Dependencies
|
45 |
-
|
46 |
-
In development, install dependencies specified in `pyproject.toml`:
|
47 |
-
|
48 |
-
1. Navigate to the project directory:
|
49 |
-
|
50 |
-
```bash
|
51 |
-
cd path/to/project
|
52 |
-
```
|
53 |
-
|
54 |
-
2. Run:
|
55 |
-
```bash
|
56 |
-
poetry install
|
57 |
-
```
|
58 |
-
|
59 |
-
This command creates a virtual environment, installs all dependencies, and ensures they are compatible with the Python version specified.
|
60 |
-
|
61 |
-
### Updating Dependencies
|
62 |
-
|
63 |
-
During development, you can update dependencies by editing `pyproject.toml` directly and then running:
|
64 |
-
|
65 |
-
```bash
|
66 |
-
poetry install
|
67 |
-
```
|
68 |
-
|
69 |
-
This will apply any changes and update the environment without manually adding each dependency.
|
70 |
-
|
71 |
-
### Adding and Removing Dependencies
|
72 |
-
|
73 |
-
- **Add a New Dependency**:
|
74 |
-
|
75 |
-
```bash
|
76 |
-
poetry add <package-name>
|
77 |
-
```
|
78 |
-
|
79 |
-
Example:
|
80 |
-
|
81 |
-
```bash
|
82 |
-
poetry add requests
|
83 |
-
```
|
84 |
-
|
85 |
-
- **Add a Development Dependency** (only used for development/testing):
|
86 |
-
|
87 |
-
```bash
|
88 |
-
poetry add --group dev <package-name>
|
89 |
-
```
|
90 |
-
|
91 |
-
Example:
|
92 |
-
|
93 |
-
```bash
|
94 |
-
poetry add --group dev pytest
|
95 |
-
```
|
96 |
-
|
97 |
-
- **Remove a Dependency**:
|
98 |
-
```bash
|
99 |
-
poetry remove <package-name>
|
100 |
-
```
|
101 |
-
|
102 |
-
### Synchronizing Dependencies
|
103 |
-
|
104 |
-
If the `pyproject.toml` or `poetry.lock` files are updated (e.g., after pulling changes), run:
|
105 |
-
|
106 |
-
```bash
|
107 |
-
poetry install
|
108 |
-
```
|
109 |
-
|
110 |
-
This keeps your environment synchronized with any updates made to the dependency files.
|
111 |
-
|
112 |
-
---
|
113 |
-
|
114 |
-
## Using Poetry in Production
|
115 |
-
|
116 |
-
### Locking Dependencies
|
117 |
-
|
118 |
-
To lock dependencies for production use, run:
|
119 |
-
|
120 |
-
```bash
|
121 |
-
poetry lock
|
122 |
-
```
|
123 |
-
|
124 |
-
This creates or updates `poetry.lock`, which pins each dependency to a specific version. This lock file should be used to maintain consistency in production.
|
125 |
-
|
126 |
-
### Installing from `poetry.lock`
|
127 |
-
|
128 |
-
In production, use `poetry.lock` to ensure exact dependency versions:
|
129 |
-
|
130 |
-
1. Install only the required (non-development) dependencies:
|
131 |
-
```bash
|
132 |
-
poetry install --no-dev
|
133 |
-
```
|
134 |
-
|
135 |
-
This ensures that dependencies are installed exactly as defined in `poetry.lock`.
|
136 |
-
|
137 |
-
---
|
138 |
-
|
139 |
-
## Poetry Commands Summary
|
140 |
-
|
141 |
-
| Command | Description |
|
142 |
-
| ------------------------------ | ------------------------------------------------------------- |
|
143 |
-
| `poetry install` | Installs dependencies from `pyproject.toml` or `poetry.lock`. |
|
144 |
-
| `poetry add <package-name>` | Adds a new dependency and updates `pyproject.toml`. |
|
145 |
-
| `poetry add --group dev <pkg>` | Adds a development-only dependency. |
|
146 |
-
| `poetry remove <package-name>` | Removes a dependency and updates `pyproject.toml`. |
|
147 |
-
| `poetry update` | Updates all dependencies to their latest compatible versions. |
|
148 |
-
| `poetry lock` | Locks dependencies to specific versions for production. |
|
149 |
-
| `poetry shell` | Activates the Poetry-managed virtual environment. |
|
150 |
-
|
151 |
-
---
|
152 |
-
|
153 |
-
## Additional Resources
|
154 |
-
|
155 |
-
- **Poetry Documentation**: [https://python-poetry.org/docs/](https://python-poetry.org/docs/)
|
156 |
-
- **GitHub Repository**: [https://github.com/python-poetry/poetry](https://github.com/python-poetry/poetry)
|
157 |
-
|
158 |
-
For further help, please refer to the [Poetry documentation](https://python-poetry.org/docs/).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/adding_new_parser.md
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adding a New Dataset Parser
|
2 |
+
|
3 |
+
This guide explains how to add a new dataset parser to the llmdataparser library. The library is designed to make it easy to add support for new datasets while maintaining consistent interfaces and functionality.
|
4 |
+
|
5 |
+
## Step-by-Step Guide
|
6 |
+
|
7 |
+
### 1. Create a New Parser Class
|
8 |
+
|
9 |
+
Create a new file `your_dataset_parser.py` in the `llmdataparser` folder. Your parser should inherit from `HuggingFaceDatasetParser[T]` where T is your custom entry type.
|
10 |
+
|
11 |
+
```python
|
12 |
+
from llmdataparser.base_parser import (
|
13 |
+
DatasetDescription,
|
14 |
+
EvaluationMetric,
|
15 |
+
HuggingFaceDatasetParser,
|
16 |
+
HuggingFaceParseEntry,
|
17 |
+
)
|
18 |
+
|
19 |
+
@dataclass(frozen=True, kw_only=True, slots=True)
|
20 |
+
class YourDatasetParseEntry(HuggingFaceParseEntry):
|
21 |
+
"""Custom entry class for your dataset."""
|
22 |
+
# Add any additional fields specific to your dataset
|
23 |
+
custom_field: str
|
24 |
+
|
25 |
+
@classmethod
|
26 |
+
def create(cls, question: str, answer: str, raw_question: str,
|
27 |
+
raw_answer: str, task_name: str, custom_field: str) -> "YourDatasetParseEntry":
|
28 |
+
return cls(
|
29 |
+
question=question,
|
30 |
+
answer=answer,
|
31 |
+
raw_question=raw_question,
|
32 |
+
raw_answer=raw_answer,
|
33 |
+
task_name=task_name,
|
34 |
+
custom_field=custom_field
|
35 |
+
)
|
36 |
+
|
37 |
+
class YourDatasetParser(HuggingFaceDatasetParser[YourDatasetParseEntry]):
|
38 |
+
"""Parser for your dataset."""
|
39 |
+
|
40 |
+
# Required class variables
|
41 |
+
_data_source = "huggingface/your-dataset"
|
42 |
+
_default_task = "default"
|
43 |
+
_task_names = ["task1", "task2", "task3"]
|
44 |
+
```
|
45 |
+
|
46 |
+
### 2. Implement Required Methods
|
47 |
+
|
48 |
+
Your parser needs to implement these key methods:
|
49 |
+
|
50 |
+
```python
|
51 |
+
def process_entry(
|
52 |
+
self,
|
53 |
+
row: dict[str, Any],
|
54 |
+
task_name: str | None = None,
|
55 |
+
**kwargs: Any
|
56 |
+
) -> YourDatasetParseEntry:
|
57 |
+
"""Process a single dataset entry."""
|
58 |
+
# Extract data from the row
|
59 |
+
raw_question = row["question"]
|
60 |
+
raw_answer = row["answer"]
|
61 |
+
task = task_name or self._get_current_task(row)
|
62 |
+
|
63 |
+
question = f"Question: {raw_question}\nAnswer:"
|
64 |
+
|
65 |
+
return YourDatasetParseEntry.create(
|
66 |
+
question=question,
|
67 |
+
answer=raw_answer,
|
68 |
+
raw_question=raw_question,
|
69 |
+
raw_answer=raw_answer,
|
70 |
+
task_name=task,
|
71 |
+
custom_field=row["custom_field"]
|
72 |
+
)
|
73 |
+
|
74 |
+
def get_dataset_description(self) -> DatasetDescription:
|
75 |
+
"""Returns description of your dataset."""
|
76 |
+
return DatasetDescription.create(
|
77 |
+
name="Your Dataset Name",
|
78 |
+
purpose="Purpose of the dataset",
|
79 |
+
source="Dataset source/URL",
|
80 |
+
language="Dataset language",
|
81 |
+
format="Data format (e.g., multiple choice, free text)",
|
82 |
+
characteristics="Key characteristics of the dataset",
|
83 |
+
citation="Dataset citation if available"
|
84 |
+
)
|
85 |
+
|
86 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
87 |
+
"""Returns recommended evaluation metrics."""
|
88 |
+
return [
|
89 |
+
EvaluationMetric.create(
|
90 |
+
name="metric_name",
|
91 |
+
type="metric_type",
|
92 |
+
description="Metric description",
|
93 |
+
implementation="implementation_details",
|
94 |
+
primary=True
|
95 |
+
)
|
96 |
+
]
|
97 |
+
```
|
98 |
+
|
99 |
+
### 3. Add Example Usage
|
100 |
+
|
101 |
+
Add example usage at the bottom of your parser file:
|
102 |
+
|
103 |
+
```python
|
104 |
+
if __name__ == "__main__":
|
105 |
+
# Example usage
|
106 |
+
parser = YourDatasetParser()
|
107 |
+
parser.load()
|
108 |
+
parser.parse()
|
109 |
+
|
110 |
+
# Get parsed data
|
111 |
+
parsed_data = parser.get_parsed_data
|
112 |
+
|
113 |
+
# Print example entry
|
114 |
+
if parsed_data:
|
115 |
+
example = parsed_data[0]
|
116 |
+
print("\nExample parsed entry:")
|
117 |
+
print(f"Question: {example.raw_question}")
|
118 |
+
print(f"Answer: {example.answer}")
|
119 |
+
```
|
120 |
+
|
121 |
+
### 4. Create Tests

Create a test file `tests/test_your_dataset_parser.py`:

```python
import pytest
from llmdataparser.your_dataset_parser import YourDatasetParser, YourDatasetParseEntry


def test_parser_initialization():
    parser = YourDatasetParser()
    assert parser._data_source == "huggingface/your-dataset"
    assert parser._default_task == "default"
    assert "task1" in parser._task_names


def test_process_entry():
    parser = YourDatasetParser()
    sample_row = {
        "question": "Sample question",
        "answer": "Sample answer",
        "custom_field": "Custom value"
    }

    entry = parser.process_entry(sample_row)
    assert isinstance(entry, YourDatasetParseEntry)
    assert entry.raw_question == "Sample question"
    assert entry.custom_field == "Custom value"
```

## Best Practices

1. **Type Safety**: Use type hints consistently and ensure your parser is properly typed.
1. **Documentation**: Add clear docstrings and comments explaining your parser's functionality.
1. **Error Handling**: Include appropriate error checking and validation (a minimal sketch follows this list).
1. **Testing**: Write comprehensive tests covering different scenarios.

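The error-handling point is easiest to see at the start of `process_entry`. Below is a minimal sketch; `validate_row` is a hypothetical helper, not part of the library, and the required keys are the ones used in the example row above:

```python
from typing import Any


def validate_row(
    row: dict[str, Any], required: tuple[str, ...] = ("question", "answer")
) -> None:
    """Raise a clear error before process_entry touches a malformed row."""
    missing = [key for key in required if key not in row]
    if missing:
        raise KeyError(f"Row is missing required fields: {missing}")


validate_row({"question": "Sample question", "answer": "Sample answer"})  # passes silently
```
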
## Examples

Look at existing parsers for reference:

- `mmlu_parser.py` for multiple-choice questions
- `gsm8k_parser.py` for math word problems
- `humaneval_parser.py` for code generation tasks

## Common Patterns

1. **Parse Entry Class**: Create a custom parse entry class if you need additional fields (see the sketch after this list).
1. **Task Names**: Define all available tasks in `_task_names`.
1. **Process Entry**: Handle data extraction and formatting in `process_entry`.
1. **Dataset Description**: Provide comprehensive dataset information.
1. **Evaluation Metrics**: Define appropriate metrics for your dataset.

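For the first pattern, the existing parsers all follow the same recipe: subclass `HuggingFaceParseEntry` with a frozen dataclass and declare the extra fields. A minimal sketch (the `custom_field` name is only an illustration):

```python
from dataclasses import dataclass

from llmdataparser.base_parser import HuggingFaceParseEntry


@dataclass(frozen=True, kw_only=True, slots=True)
class YourDatasetParseEntry(HuggingFaceParseEntry):
    """Parse entry with one extra dataset-specific field."""

    custom_field: str
```
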
## Testing Your Parser

1. Run the example usage code to verify basic functionality
1. Run pytest to execute your test cases
1. Try different dataset splits and tasks
1. Verify the parsed output format
1. Check error handling with invalid inputs

llmdataparser/__init__.py
CHANGED
@@ -1,8 +1,22 @@
# llmdataparser/__init__.py
from typing import Any, Type

from .base_parser import DatasetParser
from .bbh_parser import BBHDatasetParser
from .gsm8k_parser import GSM8KDatasetParser
from .humaneval_parser import HumanEvalDatasetParser, HumanEvalDatasetPlusParser
from .ifeval_parser import IFEvalDatasetParser
from .math_parser import MATHDatasetParser
from .mbpp_parser import MBPPDatasetParser
from .mgsm_parser import MGSMDatasetParser
from .mmlu_parser import (
    BaseMMLUDatasetParser,
    MMLUProDatasetParser,
    MMLUReduxDatasetParser,
    TMMLUPlusDatasetParser,
)
from .tmlu_parser import TMLUDatasetParser
from .tw_legal_parser import TWLegalDatasetParser


class ParserRegistry:

@@ -17,11 +31,13 @@ class ParserRegistry:
        cls._registry[name.lower()] = parser_class

    @classmethod
    def get_parser(cls, name: str, **kwargs: Any) -> DatasetParser[Any]:
        """Get a parser instance by name."""
        parser_class = cls._registry.get(name.lower())
        if parser_class is None:
            raise ValueError(f"Parser '{name}' is not registered.")
        parser: DatasetParser[Any] = parser_class(**kwargs)
        return parser

    @classmethod
    def list_parsers(cls) -> list[str]:

@@ -30,4 +46,17 @@ class ParserRegistry:

# Register parsers
ParserRegistry.register_parser("mmlu", BaseMMLUDatasetParser)
ParserRegistry.register_parser("mmlupro", MMLUProDatasetParser)
ParserRegistry.register_parser("mmluredux", MMLUReduxDatasetParser)
ParserRegistry.register_parser("tmmluplus", TMMLUPlusDatasetParser)
ParserRegistry.register_parser("gsm8k", GSM8KDatasetParser)
ParserRegistry.register_parser("math", MATHDatasetParser)
ParserRegistry.register_parser("mgsm", MGSMDatasetParser)
ParserRegistry.register_parser("humaneval", HumanEvalDatasetParser)
ParserRegistry.register_parser("humanevalplus", HumanEvalDatasetPlusParser)
ParserRegistry.register_parser("bbh", BBHDatasetParser)
ParserRegistry.register_parser("mbpp", MBPPDatasetParser)
ParserRegistry.register_parser("ifeval", IFEvalDatasetParser)
ParserRegistry.register_parser("twlegal", TWLegalDatasetParser)
ParserRegistry.register_parser("tmlu", TMLUDatasetParser)
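A quick way to exercise the registry wiring added above (a sketch; it assumes the package is installed and the Hugging Face datasets are reachable):

```python
from llmdataparser import ParserRegistry

# Names registered at import time, e.g. "mmlu", "gsm8k", "bbh", ...
print(ParserRegistry.list_parsers())

# Instantiate by registered name and run the usual load/parse flow
parser = ParserRegistry.get_parser("gsm8k")
parser.load()
parser.parse()
print(len(parser.get_parsed_data))
```
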
llmdataparser/base_parser.py
CHANGED
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import lru_cache
from typing import Any, ClassVar, Generic, TypeVar

import datasets

@@ -9,17 +9,102 @@
T = TypeVar("T", bound="ParseEntry")


# Add this after the DatasetCategory definition
VALID_CATEGORIES = {
    "Math",
    "General Knowledge and Reasoning",
    "Programming",
    "MultiLingual",
    "Taiwan",
    "Advanced Reasoning",
    "Legal",
}


@dataclass(frozen=True, kw_only=True, slots=True)
class ParseEntry:
    """A simple base class for entries, customizable by each dataset parser."""

    question: str
    answer: str
    raw_question: str
    raw_answer: str


@dataclass(frozen=True, kw_only=True, slots=True)
class DatasetDescription:
    """Standardized description of a dataset."""

    name: str
    purpose: str
    source: str
    language: str
    format: str
    category: list[str]
    characteristics: str
    citation: str | None = None
    additional_info: dict[str, Any] | None = None

    @classmethod
    def create(
        cls,
        name: str,
        purpose: str,
        source: str,
        language: str,
        format: str,
        category: list[str],
        characteristics: str,
        citation: str | None = None,
        additional_info: dict[str, Any] | None = None,
    ) -> "DatasetDescription":
        # Validate that all categories are valid DatasetCategory values
        for item in category:
            assert (
                item in VALID_CATEGORIES
            ), f"Category '{item}' is not a valid category. Valid categories are: {VALID_CATEGORIES}"
        return cls(
            name=name,
            purpose=purpose,
            source=source,
            language=language,
            format=format,
            category=category,
            characteristics=characteristics,
            citation=citation,
            additional_info=additional_info,
        )


@dataclass(frozen=True, kw_only=True, slots=True)
class EvaluationMetric:
    """Description of an evaluation metric for a dataset."""

    name: str
    type: str
    description: str
    implementation: str
    primary: bool

    @classmethod
    def create(
        cls, name: str, type: str, description: str, implementation: str, primary: bool
    ) -> "EvaluationMetric":
        return cls(
            name=name,
            type=type,
            description=description,
            implementation=implementation,
            primary=primary,
        )


class DatasetParser(Generic[T], ABC):
    """
    Abstract base class defining the interface for all dataset parsers.
    """

    def __init__(self) -> None:
        self._parsed_data: list[T] = []

    @abstractmethod

@@ -39,40 +124,189 @@ class DatasetParser(ABC, Generic[T]):
        return self._parsed_data

    @abstractmethod
    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> T:
        """
        Process a single entry from the dataset.

        Args:
            row: A dictionary representing a single entry from the dataset.
            task_name: Optional task name for the entry.
            **kwargs: Additional keyword arguments.

        Returns:
            T: The processed entry, typically an instance of a subclass of ParseEntry.
        """

    @abstractmethod
    def get_dataset_description(self) -> DatasetDescription:
        """Returns a standardized description of the dataset."""

    @abstractmethod
    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns the recommended evaluation metrics for the dataset."""


@dataclass(frozen=True, kw_only=True, slots=True)
class HuggingFaceParseEntry(ParseEntry):
    """ParseEntry with an additional task_name field."""

    task_name: str


class HuggingFaceDatasetParser(DatasetParser[T]):
    """
    Base class for parsers that use datasets from Hugging Face.
    """

    # _data_source is the name of the dataset, e.g. "lighteval/MATH"
    _data_source: ClassVar[str]
    # _task_names is the list of tasks in the dataset, e.g. ["algebra", "geometry", "statistics"]
    _task_names: ClassVar[list[str]]
    # _default_task is the default task to use if no task is specified, e.g. "algebra"
    _default_task: ClassVar[str]
    # _hidden_task_names is the list of task names that are hidden in the dataset, e.g. ["math", "physics", "chemistry"]
    _hidden_task_names: ClassVar[list[str]] = []

    def __init__(self, **kwargs: Any) -> None:
        """
        Initialize a HuggingFaceDatasetParser.

        Args:
            **kwargs: Additional keyword arguments passed to the parent class.
        """
        super().__init__()
        # raw_data is the dataset loaded from HuggingFace
        self.raw_data: dict[str, Any] | None = None
        # split_names is the list of splits in the dataset, e.g. ["train", "test", "validation"]
        self.split_names: list[str] = []
        # _current_task is the task currently being processed, e.g. "algebra"
        self._current_task: str = ""

    def _get_current_task(self, data_entry: dict[str, Any] | None = None) -> str:
        """
        Get the currently loaded task name.

        Args:
            data_entry: Optional dictionary containing entry data that might include task information

        Returns:
            str: The task name from either the data entry (if available) or the currently set task
        """
        # If data_entry is provided and contains task information, use it
        if data_entry is not None and hasattr(self, "_get_task_from_entry"):
            try:
                task = self._get_task_from_entry(data_entry)
                if isinstance(task, str):  # Add type checking
                    return task
            except (KeyError, AttributeError):
                pass

        # Otherwise return the task set during load()
        return self._current_task or self._default_task

    @property
    def task_names(self) -> list[str]:
        """Get all available task names."""
        return self._task_names

    @property
    def total_tasks(self) -> int:
        """Get total number of available tasks."""
        return len(self._task_names)

    @property
    def get_huggingface_link(self) -> str:
        return "https://huggingface.co/datasets/" + self._data_source

    @staticmethod
    @lru_cache(maxsize=3)
    def load_dataset_cached(
        data_source: str,
        task_name: str = "default",
        trust_remote_code: bool = True,
        **kwargs: Any,
    ) -> datasets.Dataset:
        """
        Cached static method to load a dataset from Hugging Face.
        """
        return datasets.load_dataset(
            data_source, task_name, trust_remote_code=trust_remote_code, **kwargs
        )

    def parse(
        self,
        split_names: str | list[str] | None = None,
        force: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Parse the loaded dataset splits into structured entries.

        Args:
            split_names: Dataset splits to parse. Can be:
                - None: Parse all available splits
                - str: Parse a single split (e.g., "train")
                - list[str]: Parse multiple splits (e.g., ["train", "test"])
            force: If True, overwrites existing parsed data without confirmation.
                If False and parsed data exists, prompts for confirmation.
            **kwargs: Additional keyword arguments passed to process_entry

        Raises:
            ValueError: If no data is loaded or if a specified split name doesn't exist
        """
        if self.raw_data is None:
            raise ValueError("No data loaded. Please load the dataset first.")

        if self._parsed_data and not force:
            response = input(
                f"Found {len(self._parsed_data)} existing parsed entries. "
                "Do you want to overwrite them? [y/N]: "
            ).lower()
            if response not in ("y", "yes"):
                print("Parsing cancelled. Existing data preserved.")
                return

        self._parsed_data.clear()

        # Dataset with splits
        if split_names is None:
            split_names = self.split_names
        elif isinstance(split_names, str):
            split_names = [split_names]

        for split_name in split_names:
            if split_name not in self.split_names:
                raise ValueError(f"Split '{split_name}' not found in the dataset.")

            dataset_split = self.raw_data[split_name]
            total_entries = len(dataset_split)
            print(f"Processing {split_name} split with {total_entries} entries...")

            for index, entry in enumerate(dataset_split, start=1):
                try:
                    task_name = self._get_current_task(data_entry=entry)
                    parsed_entry = self.process_entry(entry, task_name, **kwargs)
                    self._parsed_data.append(parsed_entry)

                    # Print progress every 100 entries
                    if index % 100 == 0:
                        print(
                            f"Processed {index}/{total_entries} entries from '{split_name}'"
                        )

                except Exception as e:
                    print(f"Error processing entry {index} in {split_name}: {str(e)}")
                    continue

            print(f"Completed parsing {index} entries from '{split_name}'")

        print(f"Total parsed entries: {len(self._parsed_data)}")

    def load(
        self,
        task_name: str | None = None,
        trust_remote_code: bool = True,
        split: str | None = None,
        **kwargs: Any,

@@ -80,21 +314,41 @@ class HuggingFaceDatasetParser(DatasetParser[T]):
        """
        Load the dataset using the Hugging Face datasets library.
        """
        # Set the task name
        self._current_task = task_name or self._default_task

        # Call the cached static method
        raw_data = self.load_dataset_cached(
            self._data_source,
            task_name=self._current_task,
            trust_remote_code=trust_remote_code,
            split=split,
            **kwargs,
        )

        # Handle split-specific loading
        if split:
            self.raw_data = {split: raw_data}
            self.split_names = [split]
        else:
            self.raw_data = raw_data
            self.split_names = list(raw_data.keys())

        print(
            f"Loaded dataset with {len(self.split_names)} groups: {', '.join(self.split_names)}."
        )

    def __repr__(self) -> str:
        status = "loaded" if self.raw_data is not None else "not loaded"
        parsed_count = len(self._parsed_data) if self._parsed_data else 0
        return (
            f"{self.__class__.__name__}("
            f"data_source='{self._data_source}', "
            f"task='{self._current_task}', "
            f"status='{status}', "
            f"parsed_entries={parsed_count}"
            ")"
        )

    def __str__(self) -> str:
        return self.__repr__()
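To make the new `load`/`parse` contract above concrete, a short sketch using the BBH parser added in the next file (the task and split names are assumptions about that dataset):

```python
from llmdataparser.bbh_parser import BBHDatasetParser

parser = BBHDatasetParser()
parser.load(task_name="date_understanding")   # sets _current_task and caches the download
parser.parse(split_names="test", force=True)  # force=True skips the overwrite prompt
print(parser)                                 # __repr__ reports task, status and entry count
for entry in parser.get_parsed_data[:3]:
    print(entry.task_name, entry.answer)
```
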
llmdataparser/bbh_parser.py
ADDED
@@ -0,0 +1,177 @@
from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class BBHParseEntry(HuggingFaceParseEntry):
    """Custom entry class for BBH (Big Bench Hard), with fields specific to this dataset."""

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        task_name: str,
    ) -> "BBHParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task_name,
        )


class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
    """Parser for the Big Bench Hard dataset."""

    _data_source: ClassVar[str] = "lukaemon/bbh"
    _task_names: ClassVar[list[str]] = [
        "boolean_expressions",
        "causal_judgement",
        "date_understanding",
        "disambiguation_qa",
        "dyck_languages",
        "formal_fallacies",
        "geometric_shapes",
        "hyperbaton",
        "logical_deduction_five_objects",
        "logical_deduction_seven_objects",
        "logical_deduction_three_objects",
        "movie_recommendation",
        "multistep_arithmetic_two",
        "navigate",
        "object_counting",
        "penguins_in_a_table",
        "reasoning_about_colored_objects",
        "ruin_names",
        "salient_translation_error_detection",
        "snarks",
        "sports_understanding",
        "temporal_sequences",
        "tracking_shuffled_objects_five_objects",
        "tracking_shuffled_objects_seven_objects",
        "tracking_shuffled_objects_three_objects",
        "web_of_lies",
        "word_sorting",
    ]
    _default_task: ClassVar[str] = "reasoning_about_colored_objects"

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> BBHParseEntry:
        """Process a single BBH entry."""
        raw_question = row["input"]
        raw_answer = row["target"]

        # Remove parentheses from the answer
        clean_answer = raw_answer.strip("()")

        question = str(raw_question)

        # Use task_name if provided, otherwise use default
        task = task_name or self._get_current_task(row)

        return BBHParseEntry.create(
            question=question,
            answer=clean_answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the Big Bench Hard dataset."""
        return DatasetDescription.create(
            name="Big Bench Hard (BBH)",
            purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
            source="https://github.com/suzgunmirac/BIG-Bench-Hard",
            language="English",
            format="Multiple choice questions with single correct answers",
            characteristics=(
                "Tasks require complex multi-step reasoning and were selected based on "
                "initial model performance below human baseline. Performance can be "
                "significantly improved through chain-of-thought prompting. The dataset "
                "includes 23 core tasks plus additional related tasks."
            ),
            category=["Advanced Reasoning"],
            citation=(
                "@article{suzgun2022challenging,\n"
                "  title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                '  author={Suzgun, Mirac and Scales, Nathan and Sch{"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
                "  journal={arXiv preprint arXiv:2210.09261},\n"
                "  year={2022}\n"
                "}"
            ),
            additional_info={
                "model_performance": (
                    "With chain-of-thought prompting, PaLM surpassed human performance on "
                    "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
                ),
                "size": "6.5k examples across 27 tasks (23 core + 4 related)",
            },
        )

    def get_evaluation_metrics(self) -> List[EvaluationMetric]:
        """Returns the recommended evaluation metrics for BBH dataset."""
        return [
            EvaluationMetric.create(
                name="accuracy",
                type="classification",
                description="Proportion of exactly correct answers (after stripping parentheses)",
                implementation="evaluate.load('accuracy')",
                primary=True,
            ),
            EvaluationMetric.create(
                name="human_eval_delta",
                type="comparison",
                description="Difference between model accuracy and average human-rater performance baseline",
                implementation="custom_human_baseline_comparison",
                primary=True,
            ),
            EvaluationMetric.create(
                name="per_task_accuracy",
                type="classification",
                description="Accuracy broken down by individual reasoning tasks",
                implementation="custom_task_accuracy",
                primary=False,
            ),
            EvaluationMetric.create(
                name="exact_match",
                type="string_match",
                description="Strict exact match between predicted and target answers",
                implementation="evaluate.load('exact_match')",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    # Example usage
    parser = BBHDatasetParser()

    # Load the dataset with a specific task
    parser.load(task_name="reasoning_about_colored_objects")

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Question: {example.question}")
        print(f"Answer: {example.answer}")
llmdataparser/gsm8k_parser.py
ADDED
@@ -0,0 +1,151 @@
from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class GSM8KParseEntry(HuggingFaceParseEntry):
    """Custom entry class for GSM8K, with fields specific to this dataset parser."""

    solution: str
    numerical_answer: int | float
    task_name: str

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        solution: str,
        numerical_answer: int | float,
        task_name: str,
    ) -> "GSM8KParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            solution=solution,
            numerical_answer=numerical_answer,
            task_name=task_name,
        )


class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
    """Parser for the GSM8K dataset."""

    _data_source: ClassVar[str] = "openai/gsm8k"
    _task_names: ClassVar[list[str]] = ["main", "socratic"]
    _default_task: ClassVar[str] = "main"

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> GSM8KParseEntry:
        """Process a single GSM8K entry."""
        task = task_name or self._get_current_task(row)
        raw_question = row["question"]
        raw_answer = row["answer"]

        # Extract numerical answer (always after '####' in GSM8K)
        numerical_str = raw_answer.split("####")[-1].strip().replace(",", "")
        # Convert string to number
        try:
            numerical_answer = float(numerical_str)
            if numerical_answer.is_integer():
                numerical_answer = int(numerical_answer)
        except ValueError:
            raise ValueError(f"Could not convert '{numerical_str}' to number")

        # Extract solution (everything before '####')
        solution = raw_answer.split("####")[0].strip()

        question = str(raw_question)

        return GSM8KParseEntry.create(
            question=question,
            answer=str(numerical_answer),
            raw_question=raw_question,
            raw_answer=raw_answer,
            solution=solution,
            numerical_answer=numerical_answer,  # Now guaranteed to be int or float
            task_name=task,  # Guarantee non-None
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the GSM8K dataset."""
        return DatasetDescription.create(
            name="Grade School Math 8K (GSM8K)",
            purpose="Evaluate mathematical reasoning capabilities through word problems",
            source="OpenAI",
            language="English",
            format="Word problems with step-by-step solutions and numerical answers",
            category=["Math"],
            characteristics=(
                "Collection of 8.5K grade school math word problems that require "
                "multi-step reasoning. Problems gradually increase in difficulty "
                "and cover basic arithmetic, word problems, and elementary algebra"
            ),
            citation="""@article{cobbe2021gsm8k,
            title={Training Verifiers to Solve Math Word Problems},
            author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
            journal={arXiv preprint arXiv:2110.14168},
            year={2021}
}""",
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for GSM8K."""
        return [
            EvaluationMetric.create(
                name="exact_match",
                type="string",
                description="Exact match comparison between predicted and correct numerical answers",
                implementation="custom_exact_match",
                primary=True,
            ),
            EvaluationMetric.create(
                name="solution_validity",
                type="text",
                description="Assessment of whether the solution steps are mathematically valid and complete",
                implementation="custom_solution_validator",
                primary=True,
            ),
            EvaluationMetric.create(
                name="step_accuracy",
                type="numerical",
                description="Accuracy of intermediate calculation steps (e.g., <<48/2=24>>)",
                implementation="custom_step_accuracy",
                primary=True,
            ),
            EvaluationMetric.create(
                name="step_count",
                type="numerical",
                description="Analysis of the number of reasoning steps in solutions",
                implementation="custom_step_counter",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    from pprint import pprint

    parser = GSM8KDatasetParser()
    parser.load()
    parser.parse()

    parsed_data = parser.get_parsed_data
    pprint(parsed_data[0].question)
    pprint(parsed_data[0].answer)
    pprint(parsed_data[0].raw_question)
    pprint(parsed_data[0].raw_answer)
    pprint(parsed_data[0].solution)
    pprint(parsed_data[0].numerical_answer)
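The `####` convention that `process_entry` relies on is easy to check in isolation; the answer string below is shaped like a GSM8K row but is only an illustration:

```python
raw_answer = "Natalia sold 48/2 = <<48/2=24>>24 clips in May.\n#### 72"

numerical_str = raw_answer.split("####")[-1].strip().replace(",", "")
solution = raw_answer.split("####")[0].strip()

value = float(numerical_str)
print(solution)
print(int(value) if value.is_integer() else value)  # -> 72
```
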
llmdataparser/humaneval_parser.py
ADDED
@@ -0,0 +1,273 @@
from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class HumanEvalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for HumanEval, with fields specific to this dataset parser."""

    task_id: str
    task_name: str
    entry_point: str
    test: str

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        task_id: str,
        entry_point: str,
        test: str,
        task_name: str,
    ) -> "HumanEvalParseEntry":
        if not task_id:
            raise ValueError("Task ID cannot be empty")
        if not entry_point:
            raise ValueError("Entry point cannot be empty")
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=answer,  # In HumanEval, the canonical solution is the raw answer
            task_id=task_id,
            entry_point=entry_point,
            test=test,
            task_name=task_name,
        )


class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
    """Parser for the HumanEval dataset."""

    _data_source: ClassVar[str] = "openai/openai_humaneval"
    _default_task: ClassVar[str] = "openai_humaneval"
    _task_names: ClassVar[list[str]] = ["openai_humaneval"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> HumanEvalParseEntry:
        """Process a single HumanEval entry."""
        raw_question = row["prompt"]
        answer = row["canonical_solution"]
        task_id = row["task_id"]
        entry_point = row["entry_point"]
        test = row["test"]

        question = str(raw_question)

        # Use task_name if provided, otherwise use default
        task = task_name or self._get_current_task(row)

        return HumanEvalParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            task_id=task_id,
            entry_point=entry_point,
            test=test,
            task_name=task,  # Guarantee non-None
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the HumanEval dataset."""
        return DatasetDescription.create(
            name="HumanEval",
            purpose="Evaluate code generation capabilities through Python programming tasks",
            source="OpenAI",
            language="Python",
            format="Function signatures with docstrings and unit tests",
            category=["Programming"],
            characteristics=(
                "Collection of 164 hand-written Python programming problems. Each problem "
                "includes a function signature, docstring, example test cases, and hidden unit "
                "tests. Problems test basic programming, algorithms, and data structure skills"
            ),
            citation="""@article{chen2021codex,
            title={Evaluating Large Language Models Trained on Code},
            author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
            year={2021},
            eprint={2107.03374},
            archivePrefix={arXiv},
            primaryClass={cs.LG}
}""",
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for HumanEval."""
        return [
            EvaluationMetric.create(
                name="pass@k",
                type="code",
                description="Probability that correct solution appears at least once in k samples",
                implementation="custom_pass_at_k",
                primary=True,
            ),
            EvaluationMetric.create(
                name="test_success_rate",
                type="code",
                description="Percentage of test cases passed by the generated solution",
                implementation="custom_test_executor",
                primary=False,
            ),
            EvaluationMetric.create(
                name="type_correctness",
                type="code",
                description="Verification of type hints and type safety in generated code",
                implementation="custom_type_checker",
                primary=False,
            ),
            EvaluationMetric.create(
                name="code_style",
                type="code",
                description="Compliance with Python best practices and PEP 8 guidelines",
                implementation="custom_style_checker",
                primary=False,
            ),
            EvaluationMetric.create(
                name="runtime_efficiency",
                type="code",
                description="Analysis of time and space complexity of the solution",
                implementation="custom_complexity_analyzer",
                primary=False,
            ),
        ]


class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
    """Parser for the enhanced HumanEval Plus dataset with 80x more comprehensive test coverage."""

    _data_source: ClassVar[str] = "evalplus/humanevalplus"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> HumanEvalParseEntry:
        """Process a single HumanEval entry."""
        raw_question = row["prompt"]
        answer = row["canonical_solution"]
        task_id = row["task_id"]
        entry_point = row["entry_point"]
        test = row["test"]

        question = str(raw_question)
        # Use task_name if provided, otherwise use default
        task = task_name or self._get_current_task(row)

        return HumanEvalParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            task_id=task_id,
            entry_point=entry_point,
            test=test,
            task_name=task,  # task is guaranteed to be str from _get_current_task
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the HumanEval Plus dataset."""
        return DatasetDescription.create(
            name="HumanEval Plus",
            purpose="Enhanced evaluation of code generation with 80x more test coverage",
            source="EvalPlus",
            language="Python",
            format="Function signatures with docstrings and comprehensive test suites",
            category=["Programming"],
            characteristics=(
                "Significantly enhanced version of HumanEval with 80x more test cases. "
                "Includes extensive edge cases, boundary conditions, stress tests, and "
                "error handling scenarios to rigorously evaluate code correctness and robustness. "
                "Each problem has been augmented with comprehensive testing to catch subtle bugs "
                "and ensure production-quality code generation."
            ),
            citation="""@inproceedings{evalplus,
            title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
            author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
            booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
            year = {2023},
            url = {https://openreview.net/forum?id=1qvx610Cu7},
}""",
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for HumanEval Plus."""
        return [
            EvaluationMetric.create(
                name="pass@k",
                type="code",
                description="Probability that correct solution appears at least once in k samples",
                implementation="custom_pass_at_k",
                primary=True,
            ),
            EvaluationMetric.create(
                name="test_coverage",
                type="code",
                description="Percentage of edge cases and stress tests passed by the solution",
                implementation="custom_coverage_checker",
                primary=False,
            ),
            EvaluationMetric.create(
                name="error_handling",
                type="code",
                description="Assessment of solution's robustness in handling invalid inputs and edge cases",
                implementation="custom_error_handler",
                primary=False,
            ),
            EvaluationMetric.create(
                name="performance_stress",
                type="code",
                description="Evaluation of solution performance under high load and stress conditions",
                implementation="custom_stress_tester",
                primary=False,
            ),
            EvaluationMetric.create(
                name="code_quality",
                type="code",
                description="Analysis of code readability, maintainability and adherence to Python best practices",
                implementation="custom_quality_checker",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    # Example usage
    parser = HumanEvalDatasetParser()

    # Load the dataset
    parser.load()

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task ID: {example.task_id}")
        print(f"Entry Point: {example.entry_point}")
        print(f"Question:\n{example.question}")
        print(f"Solution:\n{example.answer}")

    parser = HumanEvalDatasetPlusParser()
    parser.load()
    parser.parse()
    parsed_data = parser.get_parsed_data
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Question: {example.raw_question}")
        print(f"Correct Answer: {example.answer}")
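`pass@k` above is listed as `custom_pass_at_k`; the usual way to estimate it is the unbiased estimator from the HumanEval paper, sketched here as an assumption about what that custom implementation would do (not code from this PR):

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: n samples per task, c of them passing the unit tests."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


print(pass_at_k(n=20, c=3, k=5))  # ≈ 0.60 for 3 correct out of 20 samples
```
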
llmdataparser/ifeval_parser.py
ADDED
@@ -0,0 +1,164 @@
from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class IFEvalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for IFEval, with fields specific to this dataset parser."""

    key: int
    instruction_id_list: List[str]
    kwargs: dict[str, Any]

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        key: int,
        instruction_id_list: List[str],
        kwargs: dict[str, Any],
        task_name: str,
    ) -> "IFEvalParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            key=key,
            instruction_id_list=instruction_id_list,
            kwargs=kwargs,
            task_name=task_name,
        )


class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
    """Parser for the IFEval dataset."""

    _data_source: ClassVar[str] = "google/IFEval"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> IFEvalParseEntry:
        """Process a single IFEval entry."""
        # Extract fields from the row
        key = row["key"]
        raw_question = row["prompt"]  # The prompt is the raw question in this case
        instruction_id_list = row["instruction_id_list"]
        kwargs_data = row["kwargs"]

        # For IFEval, we don't have explicit answers in the dataset
        # We'll use empty strings as placeholders
        answer = ""
        raw_answer = ""

        question = str(raw_question)

        # Use task_name if provided, otherwise use default
        task = task_name or self._get_current_task(row)

        return IFEvalParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            key=key,
            instruction_id_list=instruction_id_list,
            kwargs=kwargs_data,
            task_name=task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the IFEval dataset."""
        return DatasetDescription.create(
            name="IFEval",
            purpose="Evaluate instruction following capabilities through verifiable instructions",
            source="Google Research",
            language="English (BCP-47 en)",
            format="Verifiable instruction prompts with automated evaluation criteria",
            category=["Programming"],
            characteristics=(
                "Collection of approximately 500 verifiable instructions designed to evaluate "
                "language models' instruction-following capabilities. Instructions include "
                "specific, measurable criteria like 'write in more than 400 words' or "
                "'mention the keyword AI at least 3 times' that can be verified through "
                "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
                "for evaluating chat or instruction fine-tuned language models."
            ),
            citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
            title={Instruction-Following Evaluation for Large Language Models},
            author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
            year={2023},
            eprint={2311.07911},
            archivePrefix={arXiv},
            primaryClass={cs.CL},
            url={https://arxiv.org/abs/2311.07911}
}""",
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for IFEval."""
        return [
            EvaluationMetric.create(
                name="format_compliance",
                type="text",
                description="Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
                implementation="custom_format_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="length_constraints",
                type="text",
                description="Checks if the response meets word, sentence, or paragraph count requirements",
                implementation="custom_length_validator",
                primary=True,
            ),
            EvaluationMetric.create(
                name="punctuation_rules",
                type="text",
                description="Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
                implementation="custom_punctuation_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="keyword_usage",
                type="text",
                description="Verifies correct usage of required keywords or avoidance of forbidden words",
                implementation="custom_keyword_validator",
                primary=False,
            ),
            EvaluationMetric.create(
                name="structural_requirements",
                type="text",
                description="Checks for specific structural elements like sections, paragraphs, or formatting patterns",
                implementation="custom_structure_validator",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    # Example usage
    parser = IFEvalDatasetParser()
    parser.load()
    parser.parse()

    parsed_data = parser.get_parsed_data
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Key: {example.key}")
        print(f"Question: {example.question}")
        print(f"Instruction IDs: {example.instruction_id_list}")
        print(f"kwargs: {example.kwargs}")
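The verifiable-instruction idea above ("more than 400 words", "mention the keyword AI at least 3 times") reduces to small deterministic checks. A toy sketch, with function names of our own invention rather than the dataset's official evaluator:

```python
def meets_min_words(response: str, minimum: int = 400) -> bool:
    """Check the 'write in more than N words' style of instruction."""
    return len(response.split()) > minimum


def mentions_keyword(response: str, keyword: str = "AI", times: int = 3) -> bool:
    """Check the 'mention the keyword X at least N times' style of instruction."""
    return response.count(keyword) >= times


response = "AI systems ... AI models ... AI agents"  # hypothetical model output
print(meets_min_words(response), mentions_keyword(response))
```
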
llmdataparser/math_parser.py
ADDED
@@ -0,0 +1,189 @@
from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class MATHParseEntry(HuggingFaceParseEntry):
    """Custom entry class for MATH dataset, with fields specific to this dataset parser."""

    level: str
    task_name: str
    solution: str

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        level: str,
        task_name: str,
        solution: str,
    ) -> "MATHParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            level=level,
            task_name=task_name,
            solution=solution,
        )


class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
    """Parser for the MATH dataset."""

    _data_source: ClassVar[str] = "lighteval/MATH"
    _task_names: ClassVar[list[str]] = [
        "algebra",
        "geometry",
        "calculus",
        "prealgebra",
        "intermediate_algebra",
        "number_theory",
        "precalculus",
        "all",
    ]
    _default_task: ClassVar[str] = "all"

    _valid_levels: ClassVar[set[str]] = {
        f"Level {i}" for i in range(1, 6)
    }  # Levels 1-5 are valid

    def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
        """Get the task name from the data entry or fall back to current task."""
        entry_type: str = data_entry.get("type", "")
        if entry_type and (entry_type in self._task_names):
            return entry_type
        return self._current_task or self._default_task

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> MATHParseEntry:
        """Process a single MATH dataset entry."""
        task = task_name or self._get_current_task(row)

        # Validate and normalize level
        level = row.get("level")
        if level not in self._valid_levels:
            level = "Unknown"

        return MATHParseEntry.create(
            question=str(row["problem"]),
            answer=row["solution"],
            raw_question=row["problem"],
            raw_answer=row["solution"],
            level=level,
            task_name=task,
            solution=row["solution"],
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the MATH dataset."""
        return DatasetDescription.create(
            name="MATH",
            purpose="Measure mathematical problem-solving capabilities in machine learning models",
            source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
            language="English",
            format="Competition mathematics problems with step-by-step solutions",
            category=["Math"],
            characteristics=(
                "Collection of 12,500 challenging competition mathematics problems designed to "
                "evaluate mathematical reasoning. Problems include step-by-step solutions that "
                "can be used to teach models to generate answer derivations and explanations. "
                "Problems are categorized by subject area and difficulty level (1-5)."
            ),
            citation="""@article{hendrycksmath2021,
            title={Measuring Mathematical Problem Solving With the MATH Dataset},
            author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
            journal={NeurIPS},
            year={2021}
}""",
            additional_info={
                "difficulty_levels": "1-5",
                "topics": [
                    "algebra",
                    "geometry",
                    "calculus",
                    "prealgebra",
                    "intermediate_algebra",
                    "number_theory",
                    "precalculus",
                ],
                "size": "12,500 problems",
                "evaluation_note": "Exact match equivalence calculated using sympy library",
                "homepage": "https://github.com/hendrycks/math",
            },
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for MATH dataset."""
        return [
            EvaluationMetric.create(
                name="symbolic_equivalence",
                type="exact_match",
                description="Verifies answer correctness using symbolic mathematics (e.g., sympy) to check mathematical equivalence.",
                implementation="sympy_equivalence_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="solution_presence",
                type="text",
                description="Ensures that a complete step-by-step solution is provided, demonstrating how the answer is derived.",
                implementation="solution_completeness_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="reasoning_validity",
                type="text",
                description="Evaluates the logical correctness and mathematical reasoning in the solution's derivation steps.",
                implementation="reasoning_validator",
                primary=True,
            ),
            EvaluationMetric.create(
                name="mathematical_notation",
                type="text",
                description="Checks for the correct use of mathematical notation and symbolic representation to ensure clarity.",
                implementation="notation_validator",
                primary=False,
            ),
            EvaluationMetric.create(
                name="solution_clarity",
                type="text",
                description="Assesses the clarity, readability, and coherence of the solution steps to enhance interpretability.",
                implementation="clarity_scorer",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    # Example usage of MATH parser
    parser = MATHDatasetParser()

    # Load the dataset
+
parser.load()
|
175 |
+
|
176 |
+
# Parse all splits
|
177 |
+
parser.parse()
|
178 |
+
|
179 |
+
# Get parsed data
|
180 |
+
parsed_data = parser.get_parsed_data
|
181 |
+
|
182 |
+
# Print example entry
|
183 |
+
if parsed_data:
|
184 |
+
example = parsed_data[0]
|
185 |
+
print("\nExample parsed entry:")
|
186 |
+
print(f"Task: {example.task_name}")
|
187 |
+
print(f"Level: {example.level}")
|
188 |
+
print(f"Question: {example.question}")
|
189 |
+
print(f"Solution: {example.solution}")
|
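The symbolic_equivalence metric above names a "sympy_equivalence_checker" implementation that is not part of this diff. A minimal sketch of what such a check could look like, assuming the predicted and reference answers are plain expressions that sympy.sympify can parse (no LaTeX handling); the function name is illustrative only:

from sympy import simplify, sympify


def check_symbolic_equivalence(predicted: str, reference: str) -> bool:
    """Return True when the two expressions simplify to the same value."""
    try:
        # sympify raises SympifyError (a ValueError subclass) on unparseable input
        difference = simplify(sympify(predicted) - sympify(reference))
    except (TypeError, ValueError, SyntaxError):
        return False
    return difference == 0


print(check_symbolic_equivalence("1/2", "0.5"))                   # True
print(check_symbolic_equivalence("(x+1)**2", "x**2 + 2*x + 1"))   # True

A production checker would also need to strip LaTeX markup and extract the final boxed answer from the solution text, which this sketch does not attempt.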
llmdataparser/mbpp_parser.py
ADDED
@@ -0,0 +1,174 @@
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import Any, ClassVar
|
3 |
+
|
4 |
+
from llmdataparser.base_parser import (
|
5 |
+
DatasetDescription,
|
6 |
+
EvaluationMetric,
|
7 |
+
HuggingFaceDatasetParser,
|
8 |
+
HuggingFaceParseEntry,
|
9 |
+
)
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass(frozen=True, kw_only=True, slots=True)
|
13 |
+
class MBPPParseEntry(HuggingFaceParseEntry):
|
14 |
+
"""Custom entry class for MBPP, with fields specific to this dataset parser."""
|
15 |
+
|
16 |
+
task_id: int
|
17 |
+
test_list: list[str]
|
18 |
+
test_setup_code: str
|
19 |
+
challenge_test_list: list[str]
|
20 |
+
source_file: str
|
21 |
+
|
22 |
+
@classmethod
|
23 |
+
def create(
|
24 |
+
cls,
|
25 |
+
question: str,
|
26 |
+
answer: str,
|
27 |
+
raw_question: str,
|
28 |
+
task_id: int,
|
29 |
+
test_list: list[str],
|
30 |
+
test_setup_code: str,
|
31 |
+
challenge_test_list: list[str],
|
32 |
+
task_name: str,
|
33 |
+
source_file: str,
|
34 |
+
) -> "MBPPParseEntry":
|
35 |
+
if not isinstance(task_id, int):
|
36 |
+
raise ValueError("Task ID must be an integer")
|
37 |
+
|
38 |
+
return cls(
|
39 |
+
question=question,
|
40 |
+
answer=answer,
|
41 |
+
raw_question=raw_question,
|
42 |
+
raw_answer=answer, # In MBPP, the code solution is the raw answer
|
43 |
+
task_id=task_id,
|
44 |
+
test_list=test_list,
|
45 |
+
test_setup_code=test_setup_code,
|
46 |
+
challenge_test_list=challenge_test_list,
|
47 |
+
task_name=task_name,
|
48 |
+
source_file=source_file,
|
49 |
+
)
|
50 |
+
|
51 |
+
|
52 |
+
class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
|
53 |
+
"""Parser for the MBPP (Mostly Basic Python Programming) dataset."""
|
54 |
+
|
55 |
+
_data_source: ClassVar[str] = "google-research-datasets/mbpp"
|
56 |
+
_default_task: ClassVar[str] = "full" # Can be 'full' or 'sanitized'
|
57 |
+
_task_names: ClassVar[list[str]] = ["full", "sanitized"]
|
58 |
+
|
59 |
+
def process_entry(
|
60 |
+
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
|
61 |
+
) -> MBPPParseEntry:
|
62 |
+
"""Process a single MBPP entry."""
|
63 |
+
raw_question = row.get("text", row.get("prompt"))
|
64 |
+
answer = row["code"]
|
65 |
+
task_id = row["task_id"]
|
66 |
+
test_list = row["test_list"]
|
67 |
+
test_setup_code = row.get("test_setup_code", "")
|
68 |
+
challenge_test_list = row.get("challenge_test_list", [])
|
69 |
+
|
70 |
+
question = str(raw_question)
|
71 |
+
|
72 |
+
# Use task_name if provided, otherwise use default
|
73 |
+
task = task_name or self._get_current_task(row)
|
74 |
+
source_file = row.get("source_file", "")
|
75 |
+
|
76 |
+
return MBPPParseEntry.create(
|
77 |
+
question=question,
|
78 |
+
answer=answer,
|
79 |
+
raw_question=raw_question,
|
80 |
+
task_id=task_id,
|
81 |
+
test_list=test_list,
|
82 |
+
test_setup_code=test_setup_code,
|
83 |
+
challenge_test_list=challenge_test_list,
|
84 |
+
task_name=task,
|
85 |
+
source_file=source_file,
|
86 |
+
)
|
87 |
+
|
88 |
+
def get_dataset_description(self) -> DatasetDescription:
|
89 |
+
"""Returns a description of the MBPP dataset."""
|
90 |
+
return DatasetDescription.create(
|
91 |
+
name="Mostly Basic Python Problems (MBPP)",
|
92 |
+
purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
|
93 |
+
source="https://github.com/google-research/google-research/tree/master/mbpp",
|
94 |
+
language="English and Python",
|
95 |
+
category=["Programming"],
|
96 |
+
format="Task descriptions in English with corresponding Python solutions and automated test cases",
|
97 |
+
characteristics=(
|
98 |
+
"Contains approximately 1,000 crowd-sourced Python programming problems "
|
99 |
+
"designed for entry-level programmers. Problems cover programming fundamentals "
|
100 |
+
"and standard library functionality. Each problem includes a task description, "
|
101 |
+
"code solution, and 3 automated test cases. A subset of the data has been "
|
102 |
+
"hand-verified by the authors."
|
103 |
+
),
|
104 |
+
citation=(
|
105 |
+
"@article{austin2021program,\n"
|
106 |
+
" title={Program Synthesis with Large Language Models},\n"
|
107 |
+
" author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
|
108 |
+
" journal={arXiv preprint arXiv:2108.07732},\n"
|
109 |
+
" year={2021}\n"
|
110 |
+
"}"
|
111 |
+
),
|
112 |
+
additional_info={
|
113 |
+
"size": "~1,000 programming problems",
|
114 |
+
"splits": "Available in full or sanitized versions",
|
115 |
+
"test_coverage": "Each problem includes 3 automated test cases",
|
116 |
+
"verification": "Subset of data has been hand-verified by authors",
|
117 |
+
},
|
118 |
+
)
|
119 |
+
|
120 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
121 |
+
"""Returns the recommended evaluation metrics for MBPP dataset."""
|
122 |
+
return [
|
123 |
+
EvaluationMetric.create(
|
124 |
+
name="pass@k",
|
125 |
+
type="code_evaluation",
|
126 |
+
description="Percentage of problems where at least one solution in k generations passes all test cases",
|
127 |
+
implementation="custom_pass_at_k",
|
128 |
+
primary=True,
|
129 |
+
),
|
130 |
+
EvaluationMetric.create(
|
131 |
+
name="test_case_success_rate",
|
132 |
+
type="code_evaluation",
|
133 |
+
description="Percentage of test cases passed across all problems",
|
134 |
+
implementation="custom_test_success_rate",
|
135 |
+
primary=False,
|
136 |
+
),
|
137 |
+
EvaluationMetric.create(
|
138 |
+
name="syntax_validity",
|
139 |
+
type="code_evaluation",
|
140 |
+
description="Verifies that generated code is syntactically valid Python",
|
141 |
+
implementation="custom_syntax_check",
|
142 |
+
primary=False,
|
143 |
+
),
|
144 |
+
EvaluationMetric.create(
|
145 |
+
name="code_similarity",
|
146 |
+
type="similarity",
|
147 |
+
description="Similarity between generated code and reference solution",
|
148 |
+
implementation="evaluate.load('code_eval')",
|
149 |
+
primary=False,
|
150 |
+
),
|
151 |
+
]
|
152 |
+
|
153 |
+
|
154 |
+
if __name__ == "__main__":
|
155 |
+
# Example usage
|
156 |
+
parser = MBPPDatasetParser()
|
157 |
+
|
158 |
+
# Load the dataset
|
159 |
+
parser.load()
|
160 |
+
|
161 |
+
# Parse all splits
|
162 |
+
parser.parse()
|
163 |
+
|
164 |
+
# Get parsed data
|
165 |
+
parsed_data = parser.get_parsed_data
|
166 |
+
|
167 |
+
# Print example entry
|
168 |
+
if parsed_data:
|
169 |
+
example = parsed_data[0]
|
170 |
+
print("\nExample parsed entry:")
|
171 |
+
print(f"Task ID: {example.task_id}")
|
172 |
+
print(f"Task: {example.raw_question}")
|
173 |
+
print(f"Solution:\n{example.answer}")
|
174 |
+
print(f"Test Cases:\n{example.test_list}")
|
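The primary MBPP metric above references a "custom_pass_at_k" implementation that is not included in this diff. For reference, a sketch of the standard unbiased pass@k estimator (Chen et al., 2021), computed per problem from n generated samples of which c pass all tests; the helper name is hypothetical:

import math


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate for one problem: n samples drawn, c passed all tests."""
    if n - c < k:
        return 1.0
    return 1.0 - math.prod(1.0 - k / i for i in range(n - c + 1, n + 1))


print(pass_at_k(n=10, c=3, k=1))  # ~0.3

The per-problem estimates are then averaged over the benchmark to report a single pass@k score.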
llmdataparser/mgsm_parser.py
ADDED
@@ -0,0 +1,192 @@
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import Any, ClassVar
|
3 |
+
|
4 |
+
from llmdataparser.base_parser import (
|
5 |
+
DatasetDescription,
|
6 |
+
EvaluationMetric,
|
7 |
+
HuggingFaceDatasetParser,
|
8 |
+
HuggingFaceParseEntry,
|
9 |
+
)
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass(frozen=True, kw_only=True, slots=True)
|
13 |
+
class MGSMParseEntry(HuggingFaceParseEntry):
|
14 |
+
"""Custom entry class for MGSM, with fields specific to this dataset parser."""
|
15 |
+
|
16 |
+
numerical_answer: int | float
|
17 |
+
equation_solution: str | None
|
18 |
+
language: str
|
19 |
+
|
20 |
+
@classmethod
|
21 |
+
def create(
|
22 |
+
cls,
|
23 |
+
question: str,
|
24 |
+
answer: str,
|
25 |
+
raw_question: str,
|
26 |
+
raw_answer: str,
|
27 |
+
numerical_answer: int | float,
|
28 |
+
equation_solution: str | None,
|
29 |
+
task_name: str,
|
30 |
+
language: str,
|
31 |
+
) -> "MGSMParseEntry":
|
32 |
+
return cls(
|
33 |
+
question=question,
|
34 |
+
answer=answer,
|
35 |
+
raw_question=raw_question,
|
36 |
+
raw_answer=raw_answer,
|
37 |
+
numerical_answer=numerical_answer,
|
38 |
+
equation_solution=equation_solution,
|
39 |
+
task_name=task_name,
|
40 |
+
language=language,
|
41 |
+
)
|
42 |
+
|
43 |
+
|
44 |
+
class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
|
45 |
+
"""Parser for the MGSM (Multilingual Grade School Math) dataset."""
|
46 |
+
|
47 |
+
_data_source: ClassVar[str] = "juletxara/mgsm"
|
48 |
+
_default_task: ClassVar[str] = "en"
|
49 |
+
_task_names: ClassVar[list[str]] = [
|
50 |
+
"bn",
|
51 |
+
"de",
|
52 |
+
"en",
|
53 |
+
"es",
|
54 |
+
"fr",
|
55 |
+
"ja",
|
56 |
+
"ru",
|
57 |
+
"sw",
|
58 |
+
"te",
|
59 |
+
"th",
|
60 |
+
"zh",
|
61 |
+
]
|
62 |
+
|
63 |
+
def process_entry(
|
64 |
+
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
|
65 |
+
) -> MGSMParseEntry:
|
66 |
+
"""
|
67 |
+
Process a single MGSM entry.
|
68 |
+
|
69 |
+
Args:
|
70 |
+
row: Dictionary containing the MGSM entry fields
|
71 |
+
task_name: Language code for the current task
|
72 |
+
|
73 |
+
Returns:
|
74 |
+
MGSMParseEntry: Processed entry with question, answer, and metadata
|
75 |
+
"""
|
76 |
+
task = task_name or self._get_current_task(row)
|
77 |
+
raw_question = row["question"]
|
78 |
+
raw_answer = row["answer"] if row["answer"] else ""
|
79 |
+
numerical_answer = row["answer_number"]
|
80 |
+
equation_solution = row["equation_solution"]
|
81 |
+
|
82 |
+
question = str(raw_question)
|
83 |
+
|
84 |
+
# Use numerical answer as string for the answer field if no detailed answer is provided
|
85 |
+
answer = raw_answer if raw_answer else str(numerical_answer)
|
86 |
+
|
87 |
+
return MGSMParseEntry.create(
|
88 |
+
question=question,
|
89 |
+
answer=answer,
|
90 |
+
raw_question=raw_question,
|
91 |
+
raw_answer=raw_answer,
|
92 |
+
numerical_answer=numerical_answer,
|
93 |
+
equation_solution=equation_solution,
|
94 |
+
task_name=task,
|
95 |
+
language=task,
|
96 |
+
)
|
97 |
+
|
98 |
+
def get_dataset_description(self) -> DatasetDescription:
|
99 |
+
"""Returns a description of the Multilingual Grade School Math dataset."""
|
100 |
+
return DatasetDescription.create(
|
101 |
+
name="Multilingual Grade School Math (MGSM)",
|
102 |
+
purpose="Evaluate multilingual chain-of-thought reasoning capabilities in mathematical problem solving",
|
103 |
+
source="https://huggingface.co/datasets/juletxara/mgsm",
|
104 |
+
language="Multilingual (11 languages)",
|
105 |
+
format="Word problems with numerical answers and solution steps",
|
106 |
+
category=["Math", "MultiLingual"],
|
107 |
+
characteristics=(
|
108 |
+
"Human-translated version of 250 GSM8K problems into 10 additional languages. "
|
109 |
+
"Each problem includes the original question from GSM8K, its translations, "
|
110 |
+
"numerical answer, and solution steps. The benchmark is designed to evaluate "
|
111 |
+
"language models' ability to perform mathematical reasoning across different languages."
|
112 |
+
),
|
113 |
+
citation="""@misc{shi2022language,
|
114 |
+
title={Language Models are Multilingual Chain-of-Thought Reasoners},
|
115 |
+
author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
|
116 |
+
year={2022},
|
117 |
+
eprint={2210.03057},
|
118 |
+
archivePrefix={arXiv},
|
119 |
+
primaryClass={cs.CL}
|
120 |
+
}
|
121 |
+
@article{cobbe2021gsm8k,
|
122 |
+
title={Training Verifiers to Solve Math Word Problems},
|
123 |
+
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
|
124 |
+
journal={arXiv preprint arXiv:2110.14168},
|
125 |
+
year={2021}
|
126 |
+
}""",
|
127 |
+
additional_info={
|
128 |
+
"languages": [
|
129 |
+
"Bengali",
|
130 |
+
"German",
|
131 |
+
"English",
|
132 |
+
"Spanish",
|
133 |
+
"French",
|
134 |
+
"Japanese",
|
135 |
+
"Russian",
|
136 |
+
"Swahili",
|
137 |
+
"Telugu",
|
138 |
+
"Thai",
|
139 |
+
"Chinese",
|
140 |
+
],
|
141 |
+
"size": "250 problems translated into each language",
|
142 |
+
"base_dataset": "GSM8K (Grade School Math 8K)",
|
143 |
+
},
|
144 |
+
)
|
145 |
+
|
146 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
147 |
+
"""Returns the recommended evaluation metrics for MGSM dataset."""
|
148 |
+
return [
|
149 |
+
EvaluationMetric.create(
|
150 |
+
name="exact_match",
|
151 |
+
type="string",
|
152 |
+
description="Exact match comparison between predicted and correct numerical answers",
|
153 |
+
implementation="custom_exact_match",
|
154 |
+
primary=True,
|
155 |
+
),
|
156 |
+
EvaluationMetric.create(
|
157 |
+
name="solution_validity",
|
158 |
+
type="text",
|
159 |
+
description="Assessment of whether the solution steps are mathematically valid and complete",
|
160 |
+
implementation="custom_solution_validator",
|
161 |
+
primary=True,
|
162 |
+
),
|
163 |
+
EvaluationMetric.create(
|
164 |
+
name="step_accuracy",
|
165 |
+
type="numerical",
|
166 |
+
description="Accuracy of intermediate calculation steps (e.g., <<48/2=24>>)",
|
167 |
+
implementation="custom_step_accuracy",
|
168 |
+
primary=True,
|
169 |
+
),
|
170 |
+
EvaluationMetric.create(
|
171 |
+
name="cross_lingual_consistency",
|
172 |
+
type="comparison",
|
173 |
+
description="Consistency of model performance across different language versions of the same problem",
|
174 |
+
implementation="custom_language_comparator",
|
175 |
+
primary=False,
|
176 |
+
),
|
177 |
+
]
|
178 |
+
|
179 |
+
|
180 |
+
if __name__ == "__main__":
|
181 |
+
from pprint import pprint
|
182 |
+
|
183 |
+
parser = MGSMDatasetParser()
|
184 |
+
parser.load(task_name="en") # Load French dataset
|
185 |
+
parser.parse()
|
186 |
+
|
187 |
+
parsed_data = parser.get_parsed_data
|
188 |
+
pprint(parsed_data[0].question)
|
189 |
+
pprint(parsed_data[0].answer)
|
190 |
+
pprint(parsed_data[0].raw_question)
|
191 |
+
pprint(parsed_data[0].numerical_answer)
|
192 |
+
pprint(parsed_data[0].language)
|
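The "custom_exact_match" implementation named above is likewise only a label. A minimal sketch of a numerical exact-match check for MGSM-style answers, assuming the model ends its response with the final number and that commas are thousands separators:

import re


def numerical_exact_match(response: str, expected: float, tol: float = 1e-6) -> bool:
    """Compare the last number appearing in a model response against the expected value."""
    numbers = re.findall(r"-?\d+(?:\.\d+)?", response.replace(",", ""))
    return bool(numbers) and abs(float(numbers[-1]) - float(expected)) <= tol


print(numerical_exact_match("She bakes 3 per day, so the answer is 24.", 24))  # True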
llmdataparser/mmlu_parser.py
CHANGED
@@ -1,81 +1,721 @@
|
[Old-file side of this diff is garbled in this view; only fragments of the original 81-line mmlu_parser.py survive: "from typing import Any", a truncated "from llmdataparser.base_parser import" line, "@dataclass(frozen=True)", "class MMLUParseEntry(ParseEntry):" with its docstring, a truncated "create" classmethod raising ValueError for an invalid answer_letter, "class MMLUDatasetParser(HuggingFaceDatasetParser[MMLUParseEntry]):", "_data_source = \"cais/mmlu\"", choice formatting via f"{chr(65 + i)}. {choice}", and answer_letter = chr(65 + row["answer"]). The full replacement file follows.]
1 |
from dataclasses import dataclass
|
2 |
+
from typing import Any, Final
|
3 |
|
4 |
+
from llmdataparser.base_parser import (
|
5 |
+
DatasetDescription,
|
6 |
+
EvaluationMetric,
|
7 |
+
HuggingFaceDatasetParser,
|
8 |
+
HuggingFaceParseEntry,
|
9 |
+
)
|
10 |
|
11 |
+
MMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
|
12 |
+
MMLU_PRO_VALID_ANSWERS: Final[set[str]] = {
|
13 |
+
"A",
|
14 |
+
"B",
|
15 |
+
"C",
|
16 |
+
"D",
|
17 |
+
"E",
|
18 |
+
"F",
|
19 |
+
"G",
|
20 |
+
"H",
|
21 |
+
"I",
|
22 |
+
"J",
|
23 |
+
}
|
24 |
+
MMLU_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(MMLU_VALID_ANSWERS))
|
25 |
+
MMLU_PRO_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(MMLU_PRO_VALID_ANSWERS))
|
26 |
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
+
@dataclass(frozen=True, kw_only=True, slots=True)
|
29 |
+
class MMLUParseEntry(HuggingFaceParseEntry):
|
30 |
+
"""Custom entry class for MMLU, with fields specific to this dataset parser."""
|
31 |
+
|
32 |
+
raw_choices: list[str]
|
33 |
+
task_name: str
|
34 |
+
|
35 |
+
@classmethod
|
36 |
+
def create(
|
37 |
+
cls,
|
38 |
+
question: str,
|
39 |
+
answer: str,
|
40 |
+
raw_question: str,
|
41 |
+
raw_choices: list[str],
|
42 |
+
raw_answer: str,
|
43 |
+
task_name: str,
|
44 |
+
) -> "MMLUParseEntry":
|
45 |
+
if answer not in MMLU_VALID_ANSWERS:
|
46 |
+
raise ValueError(
|
47 |
+
f"Invalid answer_letter '{answer}'; must be one of {MMLU_VALID_ANSWER_STR}"
|
48 |
+
)
|
49 |
+
if not task_name:
|
50 |
+
raise ValueError("Task name cannot be empty")
|
51 |
+
return cls(
|
52 |
+
question=question,
|
53 |
+
answer=answer,
|
54 |
+
raw_question=raw_question,
|
55 |
+
raw_answer=raw_answer,
|
56 |
+
raw_choices=raw_choices,
|
57 |
+
task_name=task_name,
|
58 |
+
)
|
59 |
+
|
60 |
+
|
61 |
+
@dataclass(frozen=True, kw_only=True, slots=True)
|
62 |
+
class MMLUProParseEntry(HuggingFaceParseEntry):
|
63 |
+
"""Custom entry class for MMLU, with fields specific to this dataset parser."""
|
64 |
+
|
65 |
+
raw_choices: list[str]
|
66 |
+
task_name: str
|
67 |
|
68 |
@classmethod
|
69 |
+
def create(
|
70 |
+
cls,
|
71 |
+
question: str,
|
72 |
+
answer: str,
|
73 |
+
raw_question: str,
|
74 |
+
raw_choices: list[str],
|
75 |
+
raw_answer: str,
|
76 |
+
task_name: str,
|
77 |
+
) -> "MMLUProParseEntry":
|
78 |
+
if answer not in MMLU_PRO_VALID_ANSWERS:
|
79 |
raise ValueError(
|
80 |
+
f"Invalid answer_letter '{answer}'; must be one of {MMLU_PRO_VALID_ANSWER_STR}"
|
81 |
)
|
82 |
+
if not task_name:
|
83 |
+
raise ValueError("Task name cannot be empty")
|
84 |
+
return cls(
|
85 |
+
question=question,
|
86 |
+
answer=answer,
|
87 |
+
raw_question=raw_question,
|
88 |
+
raw_choices=raw_choices,
|
89 |
+
raw_answer=raw_answer,
|
90 |
+
task_name=task_name,
|
91 |
+
)
|
92 |
|
93 |
|
94 |
class MMLUDatasetParser(HuggingFaceDatasetParser[MMLUParseEntry]):
|
95 |
+
"""Base class for MMLU dataset parsers with common functionality."""
|
96 |
+
|
97 |
+
def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
|
98 |
+
"""Get the task name from the data entry or default task name."""
|
99 |
+
task_name: str = data_entry.get("subject", "")
|
100 |
+
return task_name if task_name else (self._current_task or self._default_task)
|
101 |
+
|
102 |
+
def process_entry(
|
103 |
+
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
|
104 |
+
) -> MMLUParseEntry:
|
105 |
+
"""
|
106 |
+
Generate a question and expected answer from the given row.
|
107 |
+
|
108 |
+
Args:
|
109 |
+
row: A data point to be formatted.
|
110 |
+
task_name: Optional task name for the entry.
|
111 |
+
**kwargs: Additional keyword arguments.
|
112 |
+
|
113 |
+
Returns:
|
114 |
+
MMLUParseEntry: The formatted entry object.
|
115 |
+
"""
|
116 |
+
task = task_name or self._get_current_task(row)
|
117 |
+
# Ensure task is not None
|
118 |
+
final_task = task or self._default_task
|
119 |
+
|
120 |
+
choices = "\n".join(
|
121 |
+
f"{chr(65 + i)}. {choice}" for i, choice in enumerate(row["choices"])
|
122 |
+
)
|
123 |
+
raw_question = row["question"]
|
124 |
+
raw_choices = row["choices"]
|
125 |
+
raw_answer = str(row["answer"]) # Ensure raw_answer is a string
|
126 |
+
|
127 |
+
question = f"Question: {raw_question}\n{choices}\nAnswer:"
|
128 |
+
answer_letter = chr(65 + int(raw_answer)) # Convert index to 'A', 'B', 'C', 'D'
|
129 |
+
|
130 |
+
return MMLUParseEntry.create(
|
131 |
+
question=question,
|
132 |
+
answer=answer_letter,
|
133 |
+
raw_question=raw_question,
|
134 |
+
raw_choices=raw_choices,
|
135 |
+
raw_answer=raw_answer,
|
136 |
+
task_name=final_task,
|
137 |
+
)
|
138 |
+
|
139 |
+
|
140 |
+
class BaseMMLUDatasetParser(MMLUDatasetParser):
|
141 |
+
"""Parser for the original MMLU dataset."""
|
142 |
+
|
143 |
_data_source = "cais/mmlu"
|
144 |
+
_default_task = "all"
|
145 |
+
_task_names = [
|
146 |
+
"abstract_algebra",
|
147 |
+
"anatomy",
|
148 |
+
"astronomy",
|
149 |
+
"business_ethics",
|
150 |
+
"clinical_knowledge",
|
151 |
+
"college_biology",
|
152 |
+
"college_chemistry",
|
153 |
+
"college_computer_science",
|
154 |
+
"college_mathematics",
|
155 |
+
"college_medicine",
|
156 |
+
"college_physics",
|
157 |
+
"computer_security",
|
158 |
+
"conceptual_physics",
|
159 |
+
"econometrics",
|
160 |
+
"electrical_engineering",
|
161 |
+
"elementary_mathematics",
|
162 |
+
"formal_logic",
|
163 |
+
"global_facts",
|
164 |
+
"high_school_biology",
|
165 |
+
"high_school_chemistry",
|
166 |
+
"high_school_computer_science",
|
167 |
+
"high_school_european_history",
|
168 |
+
"high_school_geography",
|
169 |
+
"high_school_government_and_politics",
|
170 |
+
"high_school_macroeconomics",
|
171 |
+
"high_school_mathematics",
|
172 |
+
"high_school_microeconomics",
|
173 |
+
"high_school_physics",
|
174 |
+
"high_school_psychology",
|
175 |
+
"high_school_statistics",
|
176 |
+
"high_school_us_history",
|
177 |
+
"high_school_world_history",
|
178 |
+
"human_aging",
|
179 |
+
"human_sexuality",
|
180 |
+
"international_law",
|
181 |
+
"jurisprudence",
|
182 |
+
"logical_fallacies",
|
183 |
+
"machine_learning",
|
184 |
+
"management",
|
185 |
+
"marketing",
|
186 |
+
"medical_genetics",
|
187 |
+
"miscellaneous",
|
188 |
+
"moral_disputes",
|
189 |
+
"moral_scenarios",
|
190 |
+
"nutrition",
|
191 |
+
"philosophy",
|
192 |
+
"prehistory",
|
193 |
+
"professional_accounting",
|
194 |
+
"professional_law",
|
195 |
+
"professional_medicine",
|
196 |
+
"professional_psychology",
|
197 |
+
"public_relations",
|
198 |
+
"security_studies",
|
199 |
+
"sociology",
|
200 |
+
"us_foreign_policy",
|
201 |
+
"virology",
|
202 |
+
"world_religions",
|
203 |
+
]
|
204 |
+
|
205 |
+
def get_dataset_description(self) -> DatasetDescription:
|
206 |
+
"""Returns a description of the MMLU dataset."""
|
207 |
+
return DatasetDescription.create(
|
208 |
+
name="Massive Multitask Language Understanding (MMLU)",
|
209 |
+
purpose="Evaluate models' extensive world knowledge and problem-solving abilities across diverse branches of knowledge",
|
210 |
+
source="https://huggingface.co/datasets/cais/mmlu",
|
211 |
+
language="English",
|
212 |
+
category=["General Knowledge and Reasoning"],
|
213 |
+
format="Multiple choice questions with four options (A, B, C, D)",
|
214 |
+
characteristics=(
|
215 |
+
"Comprehensive evaluation benchmark spanning humanities, social sciences, hard sciences, "
|
216 |
+
"and other essential areas of knowledge. The test includes 57 subjects such as "
|
217 |
+
"elementary mathematics, US history, computer science, and law. Success on this test "
|
218 |
+
"requires both extensive world knowledge and strong problem-solving capabilities."
|
219 |
+
),
|
220 |
+
citation="""@article{hendryckstest2021,
|
221 |
+
title={Measuring Massive Multitask Language Understanding},
|
222 |
+
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
223 |
+
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
224 |
+
year={2021}
|
225 |
+
}
|
226 |
+
@article{hendrycks2021ethics,
|
227 |
+
title={Aligning AI With Shared Human Values},
|
228 |
+
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
|
229 |
+
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
230 |
+
year={2021}
|
231 |
+
}""",
|
232 |
+
additional_info={
|
233 |
+
"subjects": "57 tasks/subjects",
|
234 |
+
"categories": [
|
235 |
+
"Humanities",
|
236 |
+
"Social Sciences",
|
237 |
+
"Hard Sciences",
|
238 |
+
"Other",
|
239 |
+
],
|
240 |
+
"example_subjects": [
|
241 |
+
"Elementary Mathematics",
|
242 |
+
"US History",
|
243 |
+
"Computer Science",
|
244 |
+
"Law",
|
245 |
+
],
|
246 |
+
"requirements": [
|
247 |
+
"Extensive world knowledge",
|
248 |
+
"Problem solving ability",
|
249 |
+
],
|
250 |
+
},
|
251 |
+
)
|
252 |
|
253 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
254 |
+
"""Returns the recommended evaluation metrics for MMLU dataset."""
|
255 |
+
return [
|
256 |
+
EvaluationMetric.create(
|
257 |
+
name="accuracy",
|
258 |
+
type="classification",
|
259 |
+
description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
|
260 |
+
implementation="evaluate.load('accuracy')",
|
261 |
+
primary=True,
|
262 |
+
),
|
263 |
+
EvaluationMetric.create(
|
264 |
+
name="subject_accuracy",
|
265 |
+
type="classification",
|
266 |
+
description="Per-subject accuracy scores across all 57 tasks",
|
267 |
+
implementation="custom_subject_accuracy",
|
268 |
+
primary=True,
|
269 |
+
),
|
270 |
+
EvaluationMetric.create(
|
271 |
+
name="category_accuracy",
|
272 |
+
type="classification",
|
273 |
+
description="Accuracy grouped by major categories (Humanities, Social Sciences, Hard Sciences, Other)",
|
274 |
+
implementation="custom_category_accuracy",
|
275 |
+
primary=True,
|
276 |
+
),
|
277 |
+
EvaluationMetric.create(
|
278 |
+
name="task_correlation",
|
279 |
+
type="analysis",
|
280 |
+
description="Analysis of performance correlations between different subjects/tasks",
|
281 |
+
implementation="custom_task_correlation",
|
282 |
+
primary=False,
|
283 |
+
),
|
284 |
+
]
|
285 |
+
|
286 |
+
|
287 |
+
class MMLUReduxDatasetParser(MMLUDatasetParser):
|
288 |
+
"""Parser for the MMLU Redux dataset."""
|
289 |
+
|
290 |
+
_data_source = "edinburgh-dawg/mmlu-redux"
|
291 |
+
_default_task = "anatomy"
|
292 |
+
_task_names = [
|
293 |
+
"anatomy",
|
294 |
+
"astronomy",
|
295 |
+
"business_ethics",
|
296 |
+
"clinical_knowledge",
|
297 |
+
"college_chemistry",
|
298 |
+
"college_computer_science",
|
299 |
+
"college_mathematics",
|
300 |
+
"college_medicine",
|
301 |
+
"college_physics",
|
302 |
+
"conceptual_physics",
|
303 |
+
"econometrics",
|
304 |
+
"electrical_engineering",
|
305 |
+
"formal_logic",
|
306 |
+
"global_facts",
|
307 |
+
"high_school_chemistry",
|
308 |
+
"high_school_geography",
|
309 |
+
"high_school_macroeconomics",
|
310 |
+
"high_school_mathematics",
|
311 |
+
"high_school_physics",
|
312 |
+
"high_school_statistics",
|
313 |
+
"high_school_us_history",
|
314 |
+
"human_aging",
|
315 |
+
"logical_fallacies",
|
316 |
+
"machine_learning",
|
317 |
+
"miscellaneous",
|
318 |
+
"philosophy",
|
319 |
+
"professional_accounting",
|
320 |
+
"professional_law",
|
321 |
+
"public_relations",
|
322 |
+
"virology",
|
323 |
+
]
|
324 |
+
|
325 |
+
def get_dataset_description(self) -> DatasetDescription:
|
326 |
+
"""Returns description of the MMLU Redux dataset."""
|
327 |
+
return DatasetDescription.create(
|
328 |
+
name="MMLU Redux",
|
329 |
+
purpose="Provide a manually re-annotated subset of MMLU with error analysis and corrections",
|
330 |
+
source="https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux",
|
331 |
+
language="English",
|
332 |
+
format="Multiple choice questions with four options (A, B, C, D)",
|
333 |
+
category=["General Knowledge and Reasoning"],
|
334 |
+
characteristics=(
|
335 |
+
"A carefully curated subset of 3,000 questions across 30 MMLU subjects, "
|
336 |
+
"manually re-annotated to identify and classify various types of errors. "
|
337 |
+
"The dataset maintains the original questions but provides additional "
|
338 |
+
"error annotations and corrections based on expert review and verification "
|
339 |
+
"against credible sources."
|
340 |
+
),
|
341 |
+
citation="""@misc{gema2024mmlu,
|
342 |
+
title={Are We Done with MMLU?},
|
343 |
+
author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
|
344 |
+
year={2024},
|
345 |
+
eprint={2406.04127},
|
346 |
+
archivePrefix={arXiv},
|
347 |
+
primaryClass={cs.CL}
|
348 |
+
}""",
|
349 |
+
additional_info={
|
350 |
+
"size": "3,000 questions (100 per subject)",
|
351 |
+
"subjects": "30 MMLU subjects",
|
352 |
+
"license": "CC-BY-4.0",
|
353 |
+
"error_types": {
|
354 |
+
"Question Assessment": [
|
355 |
+
"Bad Question Clarity",
|
356 |
+
"Bad Options Clarity",
|
357 |
+
],
|
358 |
+
"Ground Truth Verification": [
|
359 |
+
"No Correct Answer",
|
360 |
+
"Multiple Correct Answers",
|
361 |
+
"Wrong Ground Truth",
|
362 |
+
],
|
363 |
+
},
|
364 |
+
"verification_process": "Expert review with source verification",
|
365 |
+
"base_dataset": "cais/mmlu",
|
366 |
+
},
|
367 |
+
)
|
368 |
+
|
369 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
370 |
+
"""Returns the recommended evaluation metrics for MMLU Redux dataset."""
|
371 |
+
return [
|
372 |
+
EvaluationMetric.create(
|
373 |
+
name="accuracy",
|
374 |
+
type="classification",
|
375 |
+
description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
|
376 |
+
implementation="evaluate.load('accuracy')",
|
377 |
+
primary=True,
|
378 |
+
),
|
379 |
+
EvaluationMetric.create(
|
380 |
+
name="subject_accuracy",
|
381 |
+
type="classification",
|
382 |
+
description="Per-subject accuracy scores across 30 subjects (100 questions each)",
|
383 |
+
implementation="custom_subject_accuracy",
|
384 |
+
primary=True,
|
385 |
+
),
|
386 |
+
EvaluationMetric.create(
|
387 |
+
name="question_clarity",
|
388 |
+
type="analysis",
|
389 |
+
description="Analysis of performance on questions with different clarity issues",
|
390 |
+
implementation="custom_clarity_analysis",
|
391 |
+
primary=False,
|
392 |
+
),
|
393 |
+
]
|
394 |
+
|
395 |
+
|
396 |
+
class TMMLUPlusDatasetParser(MMLUDatasetParser):
|
397 |
+
"""Parser for the TMMLU+ dataset."""
|
398 |
+
|
399 |
+
_data_source = "ikala/tmmluplus"
|
400 |
+
_default_task = "taiwanese_hokkien"
|
401 |
+
_task_names = [
|
402 |
+
"engineering_math",
|
403 |
+
"dentistry",
|
404 |
+
"traditional_chinese_medicine_clinical_medicine",
|
405 |
+
"clinical_psychology",
|
406 |
+
"technical",
|
407 |
+
"culinary_skills",
|
408 |
+
"mechanical",
|
409 |
+
"logic_reasoning",
|
410 |
+
"real_estate",
|
411 |
+
"general_principles_of_law",
|
412 |
+
"finance_banking",
|
413 |
+
"anti_money_laundering",
|
414 |
+
"ttqav2",
|
415 |
+
"marketing_management",
|
416 |
+
"business_management",
|
417 |
+
"organic_chemistry",
|
418 |
+
"advance_chemistry",
|
419 |
+
"physics",
|
420 |
+
"secondary_physics",
|
421 |
+
"human_behavior",
|
422 |
+
"national_protection",
|
423 |
+
"jce_humanities",
|
424 |
+
"politic_science",
|
425 |
+
"agriculture",
|
426 |
+
"official_document_management",
|
427 |
+
"financial_analysis",
|
428 |
+
"pharmacy",
|
429 |
+
"educational_psychology",
|
430 |
+
"statistics_and_machine_learning",
|
431 |
+
"management_accounting",
|
432 |
+
"introduction_to_law",
|
433 |
+
"computer_science",
|
434 |
+
"veterinary_pathology",
|
435 |
+
"accounting",
|
436 |
+
"fire_science",
|
437 |
+
"optometry",
|
438 |
+
"insurance_studies",
|
439 |
+
"pharmacology",
|
440 |
+
"taxation",
|
441 |
+
"trust_practice",
|
442 |
+
"geography_of_taiwan",
|
443 |
+
"physical_education",
|
444 |
+
"auditing",
|
445 |
+
"administrative_law",
|
446 |
+
"education_(profession_level)",
|
447 |
+
"economics",
|
448 |
+
"veterinary_pharmacology",
|
449 |
+
"nautical_science",
|
450 |
+
"occupational_therapy_for_psychological_disorders",
|
451 |
+
"basic_medical_science",
|
452 |
+
"macroeconomics",
|
453 |
+
"trade",
|
454 |
+
"chinese_language_and_literature",
|
455 |
+
"tve_design",
|
456 |
+
"junior_science_exam",
|
457 |
+
"junior_math_exam",
|
458 |
+
"junior_chinese_exam",
|
459 |
+
"junior_social_studies",
|
460 |
+
"tve_mathematics",
|
461 |
+
"tve_chinese_language",
|
462 |
+
"tve_natural_sciences",
|
463 |
+
"junior_chemistry",
|
464 |
+
"music",
|
465 |
+
"education",
|
466 |
+
"three_principles_of_people",
|
467 |
+
"taiwanese_hokkien",
|
468 |
+
]
|
469 |
+
|
470 |
+
def process_entry(
|
471 |
+
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
|
472 |
+
) -> MMLUParseEntry:
|
473 |
+
"""Process a single TMMLU+ entry."""
|
474 |
+
# Extract choices in order
|
475 |
+
raw_choices = [row["A"], row["B"], row["C"], row["D"]]
|
476 |
+
choices = "\n".join(
|
477 |
+
f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
|
478 |
+
)
|
479 |
+
raw_question = row["question"]
|
480 |
+
raw_answer = row["answer"]
|
481 |
+
|
482 |
+
question = f"Question: {raw_question}\n{choices}\nAnswer:"
|
483 |
+
task = task_name or self._get_current_task(row)
|
484 |
+
|
485 |
+
return MMLUParseEntry.create(
|
486 |
+
question, raw_answer, raw_question, raw_choices, raw_answer, task
|
487 |
+
)
|
488 |
+
|
489 |
+
def get_dataset_description(self) -> DatasetDescription:
|
490 |
+
"""Returns description of the TMMLU+ dataset."""
|
491 |
+
return DatasetDescription.create(
|
492 |
+
name="Traditional Chinese Massive Multitask Language Understanding Plus (TMMLU+)",
|
493 |
+
purpose="Evaluate language models' understanding and reasoning capabilities in Traditional Chinese across diverse subjects",
|
494 |
+
source="https://huggingface.co/datasets/ikala/tmmluplus",
|
495 |
+
language="Traditional Chinese",
|
496 |
+
category=["General Knowledge and Reasoning", "Taiwan"],
|
497 |
+
format="Multiple choice questions with four options (A, B, C, D)",
|
498 |
+
characteristics=(
|
499 |
+
"A comprehensive evaluation benchmark featuring 66 subjects from elementary "
|
500 |
+
"to professional level. The dataset is six times larger than the original TMMLU "
|
501 |
+
"and provides more balanced subject coverage. Includes benchmark results from "
|
502 |
+
"both closed-source models and 20 open-weight Chinese language models with "
|
503 |
+
"parameters ranging from 1.8B to 72B."
|
504 |
+
),
|
505 |
+
citation="""@article{ikala2024improved,
|
506 |
+
title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
|
507 |
+
author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
|
508 |
+
journal={arXiv preprint arXiv:2403.01858},
|
509 |
+
year={2024}
|
510 |
+
}""",
|
511 |
+
additional_info={
|
512 |
+
"subjects": "66 diverse subjects",
|
513 |
+
"difficulty_levels": ["Elementary", "Secondary", "Professional"],
|
514 |
+
"model_benchmarks": {
|
515 |
+
"model_types": ["Closed-source models", "Open-weight Chinese LLMs"],
|
516 |
+
"parameter_range": "1.8B - 72B",
|
517 |
+
},
|
518 |
+
"comparison": "6x larger than original TMMLU",
|
519 |
+
"script": "Traditional Chinese",
|
520 |
+
},
|
521 |
)
|
522 |
|
523 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
524 |
+
"""Returns the recommended evaluation metrics for TMMLU+ dataset."""
|
525 |
+
return [
|
526 |
+
EvaluationMetric.create(
|
527 |
+
name="accuracy",
|
528 |
+
type="classification",
|
529 |
+
description="Overall percentage of correctly answered multiple-choice questions",
|
530 |
+
implementation="evaluate.load('accuracy')",
|
531 |
+
primary=True,
|
532 |
+
),
|
533 |
+
EvaluationMetric.create(
|
534 |
+
name="subject_accuracy",
|
535 |
+
type="classification",
|
536 |
+
description="Per-subject accuracy scores across all 66 subjects",
|
537 |
+
implementation="custom_subject_accuracy",
|
538 |
+
primary=True,
|
539 |
+
),
|
540 |
+
EvaluationMetric.create(
|
541 |
+
name="difficulty_analysis",
|
542 |
+
type="classification",
|
543 |
+
description="Performance analysis across different difficulty levels (elementary to professional)",
|
544 |
+
implementation="custom_difficulty_analysis",
|
545 |
+
primary=False,
|
546 |
+
),
|
547 |
+
]
|
548 |
+
|
549 |
+
|
550 |
+
class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
|
551 |
+
"""Parser for the MMLU Pro dataset."""
|
552 |
+
|
553 |
+
_data_source = "TIGER-Lab/MMLU-Pro"
|
554 |
+
_default_task = "default"
|
555 |
+
_task_names = ["default"]
|
556 |
+
_hidden_task_names = [
|
557 |
+
"math",
|
558 |
+
"physics",
|
559 |
+
"chemistry",
|
560 |
+
"law",
|
561 |
+
"engineering",
|
562 |
+
"other",
|
563 |
+
"economics",
|
564 |
+
"health",
|
565 |
+
"psychology",
|
566 |
+
"business",
|
567 |
+
"biology",
|
568 |
+
"philosophy",
|
569 |
+
"computer_science",
|
570 |
+
"history",
|
571 |
+
]
|
572 |
+
|
573 |
+
def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
|
574 |
+
"""Get the task name from the data entry or default task name."""
|
575 |
+
if data_entry is not None:
|
576 |
+
task_name: str = data_entry.get("category", "")
|
577 |
+
if task_name:
|
578 |
+
return task_name
|
579 |
+
return self._current_task or self._default_task
|
580 |
+
|
581 |
+
def process_entry(
|
582 |
+
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
|
583 |
+
) -> MMLUProParseEntry:
|
584 |
"""
|
585 |
+
Generate a question and expected answer from the given row.
|
586 |
|
587 |
Args:
|
588 |
+
row (dict[str, Any]): A data point to be formatted with MMLU Pro specific structure
|
589 |
+
containing 'question', 'options', 'answer', and 'answer_index' keys.
|
590 |
|
591 |
Returns:
|
592 |
MMLUParseEntry: The formatted entry object.
|
593 |
"""
|
594 |
+
task = task_name or self._get_current_task(row)
|
595 |
+
# Ensure task is not None
|
596 |
+
final_task = task or self._default_task
|
597 |
+
|
598 |
+
# Extract choices in order
|
599 |
+
raw_choices = row["options"]
|
600 |
choices = "\n".join(
|
601 |
+
f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
|
602 |
)
|
603 |
+
raw_question = row["question"]
|
604 |
+
raw_answer = row["answer"]
|
605 |
+
answer_index = row["answer_index"]
|
606 |
+
|
607 |
+
question = f"Question: {raw_question}\n{choices}\nAnswer:"
|
608 |
+
answer_letter = chr(
|
609 |
+
65 + answer_index
|
610 |
+
) # Convert index to 'A', 'B', 'C', 'D', etc.
|
611 |
+
|
612 |
+
return MMLUProParseEntry.create(
|
613 |
+
question, answer_letter, raw_question, raw_choices, raw_answer, final_task
|
614 |
+
)
|
615 |
+
|
616 |
+
def get_dataset_description(self) -> DatasetDescription:
|
617 |
+
"""Returns description of the MMLU Pro dataset."""
|
618 |
+
return DatasetDescription.create(
|
619 |
+
name="MMLU Pro",
|
620 |
+
purpose="Provide a more robust and challenging multi-task language understanding benchmark with enhanced reasoning requirements",
|
621 |
+
source="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
|
622 |
+
language="English",
|
623 |
+
category=["General Knowledge and Reasoning", "Advanced Reasoning"],
|
624 |
+
format="Multiple choice questions with up to 10 options (expanded from original 4)",
|
625 |
+
characteristics=(
|
626 |
+
"A more challenging version of MMLU containing 12K complex questions across various "
|
627 |
+
"disciplines. Features increased number of options (up to 10), stronger focus on "
|
628 |
+
"reasoning over pure knowledge, and reduced sensitivity to prompt variations. "
|
629 |
+
"Questions are sourced from original MMLU, STEM websites, TheoremQA, and SciBench, "
|
630 |
+
"with expert review and GPT-4 assisted distractor generation."
|
631 |
+
),
|
632 |
+
citation="""@article{wang2024mmlu,
|
633 |
+
title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
|
634 |
+
author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
|
635 |
+
journal={arXiv preprint arXiv:2406.01574},
|
636 |
+
year={2024}
|
637 |
+
}""",
|
638 |
+
additional_info={
|
639 |
+
"size": "12,000 complex questions",
|
640 |
+
"options": "Up to 10 choices per question",
|
641 |
+
"sources": [
|
642 |
+
"Original MMLU (filtered)",
|
643 |
+
"STEM Website",
|
644 |
+
"TheoremQA",
|
645 |
+
"SciBench",
|
646 |
+
],
|
647 |
+
"enhanced_subjects": [
|
648 |
+
"Biology",
|
649 |
+
"Business",
|
650 |
+
"Chemistry",
|
651 |
+
"Computer Science",
|
652 |
+
"Economics",
|
653 |
+
"Engineering",
|
654 |
+
"Math",
|
655 |
+
"Physics",
|
656 |
+
"Psychology",
|
657 |
+
],
|
658 |
+
"construction_process": [
|
659 |
+
"Initial MMLU filtering",
|
660 |
+
"Question collection from multiple sources",
|
661 |
+
"GPT-4 assisted option augmentation",
|
662 |
+
"Expert review by 10+ experts",
|
663 |
+
],
|
664 |
+
"prompt_sensitivity": "2% (reduced from 4-5% in MMLU)",
|
665 |
+
"reasoning_improvement": "20% higher CoT performance compared to PPL",
|
666 |
+
},
|
667 |
)
|
|
|
668 |
|
669 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
670 |
+
"""Returns the recommended evaluation metrics for MMLU Pro dataset."""
|
671 |
+
return [
|
672 |
+
EvaluationMetric.create(
|
673 |
+
name="accuracy",
|
674 |
+
type="classification",
|
675 |
+
description="Proportion of correctly answered multiple-choice questions (exact match)",
|
676 |
+
implementation="evaluate.load('accuracy')",
|
677 |
+
primary=True,
|
678 |
+
),
|
679 |
+
EvaluationMetric.create(
|
680 |
+
name="subject_accuracy",
|
681 |
+
type="classification",
|
682 |
+
description="Per-subject accuracy scores with focus on enhanced subjects",
|
683 |
+
implementation="custom_subject_accuracy",
|
684 |
+
primary=True,
|
685 |
+
),
|
686 |
+
EvaluationMetric.create(
|
687 |
+
name="reasoning_analysis",
|
688 |
+
type="analysis",
|
689 |
+
description="Comparison of Chain-of-Thought vs standard PPL performance",
|
690 |
+
implementation="custom_reasoning_analysis",
|
691 |
+
primary=True,
|
692 |
+
),
|
693 |
+
EvaluationMetric.create(
|
694 |
+
name="prompt_robustness",
|
695 |
+
type="analysis",
|
696 |
+
description="Analysis of performance stability across different prompt variations",
|
697 |
+
implementation="custom_prompt_sensitivity",
|
698 |
+
primary=False,
|
699 |
+
),
|
700 |
+
]
|
701 |
+
|
702 |
+
|
703 |
+
if __name__ == "__main__":
|
704 |
+
# Example usage of MMLU Pro parser
|
705 |
+
parser = MMLUProDatasetParser()
|
706 |
+
parser.load()
|
707 |
+
parser.parse()
|
708 |
+
|
709 |
+
# Get parsed data with correct type
|
710 |
+
parsed_data = parser.get_parsed_data
|
711 |
+
|
712 |
+
# Print example entry
|
713 |
+
if parsed_data:
|
714 |
+
example = parsed_data[0]
|
715 |
+
print("\nExample parsed entry:")
|
716 |
+
print(f"Task: {example.task_name}")
|
717 |
+
print(f"Question: {example.raw_question}")
|
718 |
+
print("Choices:")
|
719 |
+
for i, choice in enumerate(example.raw_choices):
|
720 |
+
print(f"{chr(65 + i)}. {choice}")
|
721 |
+
print(f"Correct Answer: {example.answer}")
|
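None of the "custom_*" metric implementations referenced above ship in this diff. As a usage sketch, accuracy over parsed MMLU entries could be computed like this, where generate stands in for whatever model client returns a letter answer for a formatted question (it is a placeholder, not part of this package):

from collections.abc import Callable, Sequence


def multiple_choice_accuracy(entries: Sequence, generate: Callable[[str], str]) -> float:
    """Fraction of entries whose first predicted letter matches entry.answer."""
    if not entries:
        return 0.0
    correct = sum(
        generate(entry.question).strip().upper()[:1] == entry.answer for entry in entries
    )
    return correct / len(entries)

The same loop works for the Redux, TMMLU+, and MMLU-Pro parsers, since all of them expose a formatted question string and a single answer letter.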
llmdataparser/prompts.py
CHANGED
@@ -1,12 +1,69 @@
|
|
[Old-file side truncated in this view: the original 12-line prompts.py kept "import textwrap" and "from typing import Final" and defined only MMLU_SYSTEM_PROMPT, whose removed body began "You are an expert" (the remainder is not recoverable here). The full replacement file follows.]
1 |
import textwrap
|
2 |
from typing import Final
|
3 |
|
4 |
+
# Only for reference
|
5 |
+
|
6 |
MMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
7 |
"""\
|
8 |
+
You are an expert answering multiple-choice questions. Select the single most accurate answer (A, B, C, or D) based on factual knowledge. Respond with the letter only.
|
9 |
+
"""
|
10 |
+
)
|
11 |
+
|
12 |
+
MMLU_PRO_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
13 |
+
"""\
|
14 |
+
You are an expert answering multiple-choice questions. Select the single most accurate answer (A through J) based on factual knowledge. Respond with the letter only.
|
15 |
+
"""
|
16 |
+
)
|
17 |
+
|
18 |
+
GSM8K_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
19 |
+
"""\
|
20 |
+
Solve this math problem step by step:
|
21 |
+
1) Show your reasoning
|
22 |
+
2) End with "Therefore, the answer is [number]"
|
23 |
+
"""
|
24 |
+
)
|
25 |
+
|
26 |
+
HUMANEVAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
27 |
+
"""\
|
28 |
+
Implement the Python function following best practices. Include error handling, type hints, and comments for complex logic. Return only the implementation code.
|
29 |
+
"""
|
30 |
+
)
|
31 |
+
|
32 |
+
MGSM_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
33 |
+
"""\
|
34 |
+
Solve this math problem step by step in the specified language:
|
35 |
+
1) Show your reasoning
|
36 |
+
2) Use appropriate number formatting
|
37 |
+
3) End with "Therefore, the answer is [number]"
|
38 |
+
"""
|
39 |
+
)
|
40 |
+
|
41 |
+
IFEVAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
42 |
+
"""\
|
43 |
+
Follow the given requirements exactly. Provide only the requested output.
|
44 |
+
"""
|
45 |
+
)
|
46 |
+
|
47 |
+
BBH_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
48 |
+
"""\
|
49 |
+
Solve this reasoning problem step by step.
|
50 |
+
"""
|
51 |
+
)
|
52 |
+
|
53 |
+
MBPP_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
54 |
+
"""\
|
55 |
+
Write clean, efficient Python code that solves the given task. Include docstrings and handle edge cases. Return only the implementation code.
|
56 |
+
"""
|
57 |
+
)
|
58 |
|
59 |
+
TW_LEGAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
60 |
+
"""\
|
61 |
+
As a Taiwan legal expert, select the most accurate answer (A, B, C, or D) based on Taiwan's laws. Respond with the letter only.
|
62 |
+
"""
|
63 |
+
)
|
64 |
+
|
65 |
+
TMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
|
66 |
+
"""\
|
67 |
+
Select the most accurate answer (A, B, C, or D) based on Taiwan's educational and professional knowledge. Respond with the letter only.
|
68 |
"""
|
69 |
)
|
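These prompts are reference constants only; nothing in the package applies them automatically. A minimal sketch of pairing one of them with a parsed entry's question field, assuming an OpenAI-style chat message format (the message schema is an assumption, not something this module defines):

from llmdataparser.prompts import MMLU_SYSTEM_PROMPT


def build_messages(question: str) -> list[dict[str, str]]:
    """Pair the reference system prompt with a parsed entry's formatted question."""
    return [
        {"role": "system", "content": MMLU_SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]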
llmdataparser/tmlu_parser.py
ADDED
@@ -0,0 +1,195 @@
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import Any, Final
|
3 |
+
|
4 |
+
from llmdataparser.base_parser import (
|
5 |
+
DatasetDescription,
|
6 |
+
EvaluationMetric,
|
7 |
+
HuggingFaceDatasetParser,
|
8 |
+
HuggingFaceParseEntry,
|
9 |
+
)
|
10 |
+
|
11 |
+
TMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
|
12 |
+
TMLU_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(TMLU_VALID_ANSWERS))
|
13 |
+
|
14 |
+
|
15 |
+
@dataclass(frozen=True, kw_only=True, slots=True)
|
16 |
+
class TMLUParseEntry(HuggingFaceParseEntry):
|
17 |
+
"""Custom entry class for TMLU, with fields specific to this dataset parser."""
|
18 |
+
|
19 |
+
raw_choices: list[str]
|
20 |
+
explanation: str
|
21 |
+
metadata: dict[str, Any]
|
22 |
+
|
23 |
+
@classmethod
|
24 |
+
def create(
|
25 |
+
cls,
|
26 |
+
question: str,
|
27 |
+
answer: str,
|
28 |
+
raw_question: str,
|
29 |
+
raw_choices: list[str],
|
30 |
+
raw_answer: str,
|
31 |
+
task_name: str,
|
32 |
+
explanation: str = "",
|
33 |
+
metadata: dict[str, Any] | None = None,
|
34 |
+
) -> "TMLUParseEntry":
|
35 |
+
if answer not in TMLU_VALID_ANSWERS:
|
36 |
+
raise ValueError(
|
37 |
+
f"Invalid answer_letter '{answer}'; must be one of {TMLU_VALID_ANSWER_STR}"
|
38 |
+
)
|
39 |
+
return cls(
|
40 |
+
question=question,
|
41 |
+
answer=answer,
|
42 |
+
raw_question=raw_question,
|
43 |
+
raw_answer=raw_answer,
|
44 |
+
raw_choices=raw_choices,
|
45 |
+
task_name=task_name,
|
46 |
+
explanation=explanation,
|
47 |
+
metadata=metadata or {},
|
48 |
+
)
|
49 |
+
|
50 |
+
|
51 |
+
class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
|
52 |
+
"""Parser for the TMLU dataset."""
|
53 |
+
|
54 |
+
_data_source = "miulab/tmlu"
|
55 |
+
_default_task = "AST_chinese"
|
56 |
+
_task_names = [
|
57 |
+
"AST_chinese",
|
58 |
+
"AST_mathematics",
|
59 |
+
"AST_biology",
|
60 |
+
"AST_chemistry",
|
61 |
+
"AST_physics",
|
62 |
+
"AST_civics",
|
63 |
+
"AST_geography",
|
64 |
+
"AST_history",
|
65 |
+
"GSAT_chinese",
|
66 |
+
"GSAT_chemistry",
|
67 |
+
"GSAT_biology",
|
68 |
+
"GSAT_physics",
|
69 |
+
"GSAT_earth_science",
|
70 |
+
"GSAT_mathematics",
|
71 |
+
"GSAT_geography",
|
72 |
+
"GSAT_history",
|
73 |
+
"GSAT_civics",
|
74 |
+
"CAP_mathematics",
|
75 |
+
"CAP_biology",
|
76 |
+
"CAP_physics",
|
77 |
+
"CAP_chemistry",
|
78 |
+
"CAP_earth_science",
|
79 |
+
"CAP_civics",
|
80 |
+
"CAP_history",
|
81 |
+
"CAP_geography",
|
82 |
+
"CAP_chinese",
|
83 |
+
"driving_rule",
|
84 |
+
"basic_traditional_chinese_medicine",
|
85 |
+
"clinical_traditional_chinese_medicine",
|
86 |
+
"lawyer_qualification",
|
87 |
+
"nutritionist",
|
88 |
+
"tour_leader",
|
89 |
+
"tour_guide",
|
90 |
+
"taiwan_tourist_resources",
|
91 |
+
"clinical_psychologist",
|
92 |
+
"teacher_qualification",
|
93 |
+
"accountant",
|
94 |
+
]
|
95 |
+
|
96 |
+
def process_entry(
|
97 |
+
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
|
98 |
+
) -> TMLUParseEntry:
|
99 |
+
"""Process a single TMLU entry."""
|
100 |
+
task = task_name or self._get_current_task(row)
|
101 |
+
# Extract choices in order
|
102 |
+
raw_choices = [row["A"], row["B"], row["C"], row["D"]]
|
103 |
+
choices = "\n".join(
|
104 |
+
f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
|
105 |
+
)
|
106 |
+
raw_question = row["question"]
|
107 |
+
raw_answer = row["answer"]
|
108 |
+
explanation = row.get("explanation", "")
|
109 |
+
metadata = row.get("metadata", {})
|
110 |
+
|
111 |
+
question = f"Question: {raw_question}\n{choices}\nAnswer:"
|
112 |
+
|
113 |
+
return TMLUParseEntry.create(
|
114 |
+
question=question,
|
115 |
+
answer=raw_answer,
|
116 |
+
raw_question=raw_question,
|
117 |
+
raw_choices=raw_choices,
|
118 |
+
raw_answer=raw_answer,
|
119 |
+
task_name=task,
|
120 |
+
explanation=explanation,
|
121 |
+
metadata=metadata,
|
122 |
+
)
|
123 |
+
|
124 |
+
def get_dataset_description(self) -> DatasetDescription:
|
125 |
+
"""Returns description of the TMLU dataset."""
|
126 |
+
return DatasetDescription.create(
|
127 |
+
name="Taiwan Multiple-choice Language Understanding (TMLU)",
|
128 |
+
language="Traditional Chinese",
|
129 |
+
purpose="Evaluate models on Taiwan-specific educational and professional knowledge",
|
130 |
+
source="Various Taiwan standardized tests and professional certifications",
|
131 |
+
category=["Taiwan", "General Knowledge and Reasoning"],
|
132 |
+
format="Multiple choice questions (A/B/C/D)",
|
133 |
+
characteristics=(
|
134 |
+
"Covers various subjects including Advanced Subjects Test (AST), "
|
135 |
+
"General Scholastic Ability Test (GSAT), College Admission Practice (CAP), "
|
136 |
+
"and professional certifications"
|
137 |
+
),
|
138 |
+
citation="""@article{DBLP:journals/corr/abs-2403-20180,
|
139 |
+
author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
|
140 |
+
title = {Measuring Taiwanese Mandarin Language Understanding},
|
141 |
+
journal = {CoRR},
|
142 |
+
volume = {abs/2403.20180},
|
143 |
+
year = {2024},
|
144 |
+
url = {https://doi.org/10.48550/arXiv.2403.20180},
|
145 |
+
doi = {10.48550/ARXIV.2403.20180},
|
146 |
+
eprinttype = {arXiv},
|
147 |
+
eprint = {2403.20180},
|
148 |
+
timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
|
149 |
+
biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
|
150 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
151 |
+
}""",
|
152 |
+
)
|
153 |
+
|
154 |
+
def get_evaluation_metrics(self) -> list[EvaluationMetric]:
|
155 |
+
"""Returns recommended evaluation metrics for TMLU."""
|
156 |
+
return [
|
157 |
+
EvaluationMetric.create(
|
158 |
+
name="accuracy",
|
159 |
+
type="classification",
|
160 |
+
description="Overall percentage of correctly answered questions",
|
161 |
+
implementation="datasets.load_metric('accuracy')",
|
162 |
+
primary=True,
|
163 |
+
),
|
164 |
+
EvaluationMetric.create(
|
165 |
+
name="per_subject_accuracy",
|
166 |
+
type="classification",
|
167 |
+
description="Accuracy broken down by subject areas (AST, GSAT, CAP, etc.)",
|
168 |
+
implementation="custom_subject_accuracy",
|
169 |
+
primary=True,
|
170 |
+
),
|
171 |
+
]
|
172 |
+
|
173 |
+
|
174 |
+
if __name__ == "__main__":
|
175 |
+
# Example usage
|
176 |
+
parser = TMLUDatasetParser()
|
177 |
+
parser.load()
|
178 |
+
parser.parse()
|
179 |
+
|
180 |
+
# Get parsed data with correct type
|
181 |
+
parsed_data = parser.get_parsed_data
|
182 |
+
|
183 |
+
# Print example entry
|
184 |
+
if parsed_data:
|
185 |
+
example = parsed_data[0]
|
186 |
+
print("\nExample parsed entry:")
|
187 |
+
print(f"Task: {example.task_name}")
|
188 |
+
print(f"Question: {example.question}")
|
189 |
+
print("Choices:")
|
190 |
+
for i, choice in enumerate(example.raw_choices):
|
191 |
+
print(f"{chr(65 + i)}. {choice}")
|
192 |
+
print(f"Correct Answer: {example.answer}")
|
193 |
+
if example.explanation:
|
194 |
+
print(f"Explanation: {example.explanation}")
|
195 |
+
print(f"Metadata: {example.metadata}")
|
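The per_subject_accuracy metric above is only named ("custom_subject_accuracy"); its implementation is not part of this diff. The following is a minimal sketch of what such a computation could look like, grouping by the entry's task_name field; the predictions list and the function itself are hypothetical.

# Sketch only (not part of the PR): per-subject accuracy over TMLU parse entries.
from collections import defaultdict


def per_subject_accuracy(entries: list, predictions: list[str]) -> dict[str, float]:
    """Accuracy per task_name, given parsed entries and predicted answer letters."""
    correct: dict[str, int] = defaultdict(int)
    total: dict[str, int] = defaultdict(int)
    for entry, predicted in zip(entries, predictions):
        total[entry.task_name] += 1
        if predicted.strip().upper() == entry.answer:
            correct[entry.task_name] += 1
    return {task: correct[task] / total[task] for task in total}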
llmdataparser/tw_legal_parser.py
ADDED
@@ -0,0 +1,125 @@
from dataclasses import dataclass
from typing import Any, Final

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)

TW_LEGAL_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
TW_LEGAL_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(TW_LEGAL_VALID_ANSWERS))


@dataclass(frozen=True, kw_only=True, slots=True)
class TWLegalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for Taiwan Legal Benchmark, with fields specific to this dataset parser."""

    raw_choices: list[str]

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_choices: list[str],
        raw_answer: str,
        task_name: str,
    ) -> "TWLegalParseEntry":
        if answer not in TW_LEGAL_VALID_ANSWERS:
            raise ValueError(
                f"Invalid answer_letter '{answer}'; must be one of {TW_LEGAL_VALID_ANSWER_STR}"
            )
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            raw_choices=raw_choices,
            task_name=task_name,
        )


class TWLegalDatasetParser(HuggingFaceDatasetParser[TWLegalParseEntry]):
    """Parser for the Taiwan Legal Benchmark dataset."""

    _data_source = "lianghsun/tw-legal-benchmark-v1"
    _default_task = "default"
    _task_names = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> TWLegalParseEntry:
        """Process a single Taiwan Legal Benchmark entry."""
        # Extract choices in order
        task = task_name or self._get_current_task(row)
        raw_choices = [row["A"], row["B"], row["C"], row["D"]]
        choices = "\n".join(
            f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
        )
        raw_question = row["question"]
        raw_answer = row["answer"]

        question = f"Question: {raw_question}\n{choices}\nAnswer:"

        return TWLegalParseEntry.create(
            question=question,
            answer=raw_answer,
            raw_question=raw_question,
            raw_choices=raw_choices,
            raw_answer=raw_answer,
            task_name=task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the Taiwan Legal Benchmark dataset."""
        return DatasetDescription.create(
            name="Taiwan Legal Benchmark",
            language="Traditional Chinese",
            purpose="Evaluate models on Taiwan-specific legal knowledge and understanding",
            source="Taiwan Bar Examination questions",
            category=["Taiwan", "General Knowledge and Reasoning", "Legal"],
            format="Multiple choice questions (A/B/C/D)",
            characteristics=(
                "Contains questions from Taiwan's bar examination, testing understanding "
                "of Taiwan's legal system, terminology, and concepts"
            ),
            citation="""
            url={https://huggingface.co/datasets/lianghsun/tw-legal-benchmark-v1}
            """,
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for Taiwan Legal Benchmark."""
        return [
            EvaluationMetric.create(
                name="accuracy",
                type="classification",
                description="Overall percentage of correctly answered legal questions",
                implementation="datasets.load_metric('accuracy')",
                primary=True,
            ),
        ]


if __name__ == "__main__":
    # Example usage
    parser = TWLegalDatasetParser()
    parser.load()
    parser.parse()

    # Get parsed data with correct type
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Question: {example.question}")
        print("Choices:")
        for i, choice in enumerate(example.raw_choices):
            print(f"{chr(65 + i)}. {choice}")
        print(f"Correct Answer: {example.answer}")
        print(f"Task Name: {example.task_name}")
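The accuracy metric above points at datasets.load_metric('accuracy'), which has been deprecated in favor of the separate evaluate package (added to the dev dependencies later in this PR). A minimal sketch of scoring predicted answer letters with evaluate instead; the letter-to-index mapping and the sample lists are illustrative.

# Sketch, assuming the `evaluate` package: accuracy over A/B/C/D predictions.
import evaluate

letter_to_index = {"A": 0, "B": 1, "C": 2, "D": 3}
references = ["A", "C", "B"]    # gold letters, e.g. entry.answer
predictions = ["A", "C", "D"]   # hypothetical model outputs

accuracy = evaluate.load("accuracy")
result = accuracy.compute(
    references=[letter_to_index[r] for r in references],
    predictions=[letter_to_index[p] for p in predictions],
)
print(result)  # e.g. {'accuracy': 0.666...}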
mkdocs.yml
ADDED
@@ -0,0 +1,9 @@
site_name: LLMDataParser
theme:
  name: material

nav:
  - Home: index.md

plugins:
  - search
nginx.conf
ADDED
@@ -0,0 +1,58 @@
events {
    worker_connections 1024;
}

http {
    # Basic security settings
    server_tokens off;  # Don't show nginx version
    client_max_body_size 10M;  # Limit request size
    client_body_timeout 12;
    client_header_timeout 12;

    upstream gradio_app {
        server llmdataparser:7860;
        keepalive 32;
    }

    server {
        listen 80;
        server_name localhost;

        # Enhanced security headers
        add_header X-Frame-Options "SAMEORIGIN" always;
        add_header X-Content-Type-Options "nosniff" always;
        add_header X-XSS-Protection "1; mode=block" always;
        add_header Referrer-Policy "strict-origin-when-cross-origin" always;
        add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';" always;
        add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;

        location / {
            proxy_pass http://gradio_app;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;

            # WebSocket support
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection "upgrade";

            # Timeouts
            proxy_connect_timeout 60s;
            proxy_send_timeout 60s;
            proxy_read_timeout 60s;

            # Security
            proxy_buffering on;
            proxy_buffer_size 8k;
            proxy_buffers 8 8k;
        }

        # Deny access to hidden files
        location ~ /\. {
            deny all;
            return 404;
        }
    }
}
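The upstream block expects a Gradio server reachable as llmdataparser:7860. The actual app.py added by this PR is not reproduced here, so the following is only an illustrative stand-in showing the listening address and port this proxy configuration assumes.

# Illustrative stand-in only; the real application is llmdataparser's app.py.
import gradio as gr


def echo(text: str) -> str:
    # Placeholder handler; the real app exposes the dataset parsers.
    return text


demo = gr.Interface(fn=echo, inputs="text", outputs="text")
# Bind to 0.0.0.0:7860 so the nginx upstream `server llmdataparser:7860;` can reach it.
demo.launch(server_name="0.0.0.0", server_port=7860)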
notebooks/demo.ipynb
DELETED
@@ -1,77 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pprint\n",
-    "import random"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from llmdataparser import ParserRegistry\n",
-    "ParserRegistry.list_parsers()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mmlu_parser = ParserRegistry.get_parser('mmlu')\n",
-    "mmlu_parser.load()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mmlu_parser.parse(split_names=['dev', 'test'])\n",
-    "parsed_data = mmlu_parser.get_parsed_data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "index = random.randint(0, len(parsed_data))\n",
-    "print(f\"Question: \\n-------------------\\n {parsed_data[index].prompt}\")\n",
-    "print(\"-------------------\")\n",
-    "print(f\"Answer: \\n-------------------\\n{parsed_data[index].answer_letter}\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "llmdata",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
pyproject.toml
CHANGED
@@ -1,6 +1,10 @@
+[build-system]
+requires = ["poetry-core>=1.5.0"]
+build-backend = "poetry.core.masonry.api"
+
 [tool.poetry]
 name = "llmdataparser"
-version = "
+version = "1.0.0"
 description = "A collection of parsers for LLM benchmark datasets like MMLU, MMLU-Pro, GSM8k, and more."
 authors = ["Jeff Yang <[email protected]>"]
 license = "MIT"
@@ -16,45 +20,66 @@ classifiers = [
 "Intended Audience :: Developers"
 ]
 
+packages = [
+    { include = "llmdataparser" }
+]
+
+[tool.poetry.scripts]
+start = "llmdataparser.app:main"
+
 [tool.poetry.dependencies]
-python = ">=3.
+python = ">=3.10"
 pandas = "^2.0.3"
-datasets = "^2.
+datasets = "^3.2.0"
+fsspec = "^2024.9.0"
 typing-extensions = "^4.8.0"
+ipywidgets = "^8.1.1"
+gradio = "^4.19.2"
+pyyaml = "^6.0.1" # Add this for configuration handling
+tqdm = "^4.66.1" # Add this for progress bars
+numpy = "^1.24.0" # Add this for numerical operations
+mkdocs = "^1.5.0"
+mkdocs-material = "^9.5.0" # Optional but recommended for better documentation
 
 [tool.poetry.group.dev.dependencies]
-pytest = "^7.
+pytest = "^7.0.0"
-black = { version = "^23.9.1", allow-prereleases = true }
-flake8 = "^6.1.0"
 isort = "^5.12.0"
 mypy = "^1.5.1"
 pre-commit = "^3.4.0"
 types-python-dateutil = "^2.8.19.14"
 ipykernel = "^6.7.0"
+coverage = "^7.4.1"
+pytest-cov = "^4.1.0"
+evaluate = "^0.4.0"
 
-[tool.
+[tool.ruff]
 line-length = 88
-
-
-
-
-
-    | build
-    | dist
-)/
-"""
+
+[tool.ruff.lint]
+select = ["E", "F", "I"]
+ignore = ["E501"]
+
 
 [tool.isort]
 profile = "black"
+multi_line_output = 3
 line_length = 88
-known_first_party = ["llmdataparser"]
 
-[tool.ruff]
-line-length = 88
-select = ["E", "F"] # or specify checks explicitly without E501
-ignore = ["E501"]
 
+[tool.mypy]
+python_version = "3.12"
+warn_return_any = true
+warn_unused_configs = true
+exclude = ["tests/.*"]
+ignore_missing_imports = true
+follow_imports = "silent"
 
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+addopts = "-ra -q --cov=llmdataparser --cov-report=term-missing"
+
+[tool.bandit]
+exclude_dirs = ["tests"]
+skips = ["B101"]
tests/test_bbh_parser.py
ADDED
@@ -0,0 +1,190 @@
import pytest

from llmdataparser.bbh_parser import BBHDatasetParser, BBHParseEntry


@pytest.fixture
def bbh_parser():
    """Create a BBH parser instance for testing."""
    return BBHDatasetParser()


@pytest.fixture
def loaded_bbh_parser(bbh_parser):
    """Create and load a BBH parser instance for testing."""
    bbh_parser.load(task_name="reasoning_about_colored_objects", split="test")
    return bbh_parser


@pytest.fixture
def sample_row():
    """Create a sample BBH data row for testing."""
    return {
        "input": "What color is the sky on a clear day?\nA) Blue\nB) Green\nC) Red\nD) Yellow",
        "target": "(A)",
    }


def test_bbh_parse_entry_creation_valid():
    """Test valid creation of BBHParseEntry."""
    entry = BBHParseEntry.create(
        question="Test question",
        answer="A",
        raw_question="Test question",
        raw_answer="(A)",
        task_name="reasoning_about_colored_objects",
    )
    assert isinstance(entry, BBHParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "A"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "(A)"
    assert entry.task_name == "reasoning_about_colored_objects"


def test_bbh_parser_initialization(bbh_parser):
    """Test BBH parser initialization."""
    assert bbh_parser._data_source == "lukaemon/bbh"
    assert bbh_parser._default_task == "reasoning_about_colored_objects"
    assert "boolean_expressions" in bbh_parser._task_names
    assert "word_sorting" in bbh_parser._task_names
    assert (
        bbh_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lukaemon/bbh"
    )


def test_load_dataset(loaded_bbh_parser):
    """Test loading the dataset."""
    assert loaded_bbh_parser.raw_data is not None
    assert loaded_bbh_parser.split_names == ["test"]
    assert loaded_bbh_parser._current_task == "reasoning_about_colored_objects"


@pytest.mark.integration
def test_full_parse_workflow(loaded_bbh_parser):
    """Test the complete workflow of loading and parsing data."""
    # Parse the test split
    loaded_bbh_parser.parse(split_names="test", force=True)
    parsed_data = loaded_bbh_parser.get_parsed_data

    # Basic checks
    assert len(parsed_data) > 0

    # Check first entry structure
    first_entry = parsed_data[0]
    assert isinstance(first_entry, BBHParseEntry)
    assert first_entry.task_name == "reasoning_about_colored_objects"
    assert first_entry.answer.strip("()").isalpha()  # Should be a single letter


def test_process_entry(bbh_parser, sample_row):
    """Test processing of a single BBH entry."""
    entry = bbh_parser.process_entry(
        sample_row, task_name="reasoning_about_colored_objects"
    )

    assert isinstance(entry, BBHParseEntry)
    assert entry.answer == "A"  # Stripped from "(A)"
    assert "What color is the sky" in entry.question
    assert entry.raw_answer == "(A)"
    assert entry.task_name == "reasoning_about_colored_objects"


@pytest.mark.parametrize("split_name", ["invalid_split", "wrong_split"])
def test_parse_with_invalid_split(bbh_parser, split_name):
    """Test parsing with invalid split names."""
    bbh_parser.raw_data = {"train": [], "test": []}  # Mock data

    with pytest.raises(
        ValueError, match=f"Split '{split_name}' not found in the dataset"
    ):
        bbh_parser.parse(split_name)


def test_parse_without_loaded_data(bbh_parser):
    """Test parsing without loading data first."""
    with pytest.raises(
        ValueError, match="No data loaded. Please load the dataset first"
    ):
        bbh_parser.parse()


@pytest.mark.parametrize(
    "test_case",
    [
        {"input": "Test question", "target": "(A)"},
        {"input": "Test question", "target": "(B)"},
        {"input": "Test question", "target": "(C)"},
    ],
)
def test_answer_stripping(bbh_parser, test_case):
    """Test stripping of parentheses from answers."""
    entry = bbh_parser.process_entry(
        test_case, task_name="reasoning_about_colored_objects"
    )
    assert entry.answer == test_case["target"].strip("()")
    assert entry.raw_answer == test_case["target"]


def test_parser_properties(bbh_parser):
    """Test parser property getters."""
    assert len(bbh_parser.task_names) > 0
    assert bbh_parser.total_tasks == len(bbh_parser._task_names)
    assert all(isinstance(task, str) for task in bbh_parser.task_names)


def test_parser_string_representation(loaded_bbh_parser):
    """Test string representation of parser."""
    repr_str = str(loaded_bbh_parser)
    assert "BBHDatasetParser" in repr_str
    assert "lukaemon/bbh" in repr_str
    assert "reasoning_about_colored_objects" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
@pytest.mark.parametrize(
    "task_name", ["boolean_expressions", "causal_judgement", "date_understanding"]
)
def test_different_tasks_parsing(bbh_parser, task_name):
    """Test parsing different tasks of the dataset."""
    bbh_parser.load(task_name=task_name, split="test")
    bbh_parser.parse(split_names="test", force=True)
    parsed_data = bbh_parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(entry.task_name == task_name for entry in parsed_data)
    assert all(isinstance(entry.answer, str) for entry in parsed_data)


def test_get_dataset_description(bbh_parser):
    """Test dataset description generation."""
    description = bbh_parser.get_dataset_description()

    assert description.name == "Big Bench Hard (BBH)"
    assert description.language == "English"
    assert description.format == "Multiple choice questions with single correct answers"
    assert "suzgun2022challenging" in description.citation


def test_get_evaluation_metrics(bbh_parser):
    """Test evaluation metrics generation."""
    metrics = bbh_parser.get_evaluation_metrics()

    assert len(metrics) == 4  # Check total number of metrics

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 2
    assert any(m.name == "accuracy" for m in primary_metrics)
    assert any(m.name == "human_eval_delta" for m in primary_metrics)

    # Check specific metric properties
    accuracy_metric = next(m for m in metrics if m.name == "accuracy")
    assert accuracy_metric.type == "classification"
    assert "evaluate.load('accuracy')" in accuracy_metric.implementation

    # Check non-primary metrics
    assert any(m.name == "per_task_accuracy" and not m.primary for m in metrics)
    assert any(m.name == "exact_match" and not m.primary for m in metrics)
tests/test_gsm8k_parser.py
ADDED
@@ -0,0 +1,207 @@
import pytest

from llmdataparser.gsm8k_parser import GSM8KDatasetParser, GSM8KParseEntry


@pytest.fixture
def gsm8k_parser():
    """Create a GSM8K parser instance for testing."""
    return GSM8KDatasetParser()


@pytest.fixture
def loaded_gsm8k_parser(gsm8k_parser):
    """Create and load a GSM8K parser instance for testing."""
    gsm8k_parser.load(
        task_name="main", split="test"
    )  # Using test split as it's smaller
    return gsm8k_parser


@pytest.fixture
def sample_row():
    """Create a sample GSM8K data row for testing."""
    return {
        "question": "Janet has 3 apples. She buys 2 more. How many apples does she have now?",
        "answer": "Let's solve this step by step:\n1) Initially, Janet has 3 apples\n2) She buys 2 more apples\n3) Total apples = 3 + 2\n#### 5",
    }


def test_gsm8k_parse_entry_creation_valid():
    """Test valid creation of GSM8KParseEntry."""
    entry = GSM8KParseEntry.create(
        question="Test question",
        answer="5",
        raw_question="Test question",
        raw_answer="Solution steps #### 5",
        solution="Solution steps",
        task_name="main",
        numerical_answer=5,
    )
    assert isinstance(entry, GSM8KParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "5"
    assert entry.solution == "Solution steps"
    assert entry.numerical_answer == 5
    assert entry.task_name == "main"


def test_gsm8k_parser_initialization(gsm8k_parser):
    """Test GSM8K parser initialization."""
    assert gsm8k_parser._data_source == "openai/gsm8k"
    assert gsm8k_parser._default_task == "main"
    assert gsm8k_parser._task_names == ["main", "socratic"]
    assert (
        gsm8k_parser.get_huggingface_link
        == "https://huggingface.co/datasets/openai/gsm8k"
    )


def test_load_dataset(loaded_gsm8k_parser):
    """Test loading the dataset."""
    assert loaded_gsm8k_parser.raw_data is not None
    assert loaded_gsm8k_parser.split_names == [
        "test"
    ]  # Since we specifically loaded the test split
    assert loaded_gsm8k_parser._current_task == "main"


@pytest.mark.integration
def test_full_parse_workflow(loaded_gsm8k_parser):
    """Test the complete workflow of loading and parsing data."""
    # Parse the test split
    loaded_gsm8k_parser.parse(split_names="test", force=True)
    parsed_data = loaded_gsm8k_parser.get_parsed_data

    # Basic checks
    assert len(parsed_data) > 0

    # Check first entry structure
    first_entry = parsed_data[0]
    assert isinstance(first_entry, GSM8KParseEntry)
    assert first_entry.task_name == "main"
    assert isinstance(first_entry.numerical_answer, (str, int, float))
    assert "####" in first_entry.raw_answer
    assert first_entry.solution


def test_process_entry(gsm8k_parser, sample_row):
    """Test processing of a single GSM8K entry."""
    entry = gsm8k_parser.process_entry(sample_row, task_name="main")

    assert isinstance(entry, GSM8KParseEntry)
    assert entry.numerical_answer == 5
    assert "Janet has 3 apples" in entry.raw_question
    assert "#### 5" in entry.raw_answer
    assert "Let's solve this step by step:" in entry.solution
    assert entry.task_name == "main"


@pytest.mark.parametrize("split_name", ["invalid_split", "wrong_split"])
def test_parse_with_invalid_split(gsm8k_parser, split_name):
    """Test parsing with invalid split names."""
    gsm8k_parser.raw_data = {"train": [], "test": []}  # Mock data

    with pytest.raises(
        ValueError, match=f"Split '{split_name}' not found in the dataset"
    ):
        gsm8k_parser.parse(split_name)


def test_parse_without_loaded_data(gsm8k_parser):
    """Test parsing without loading data first."""
    with pytest.raises(
        ValueError, match="No data loaded. Please load the dataset first"
    ):
        gsm8k_parser.parse()


@pytest.mark.parametrize(
    "test_case",
    [
        {"question": "Test question", "answer": "Some solution steps #### 42"},
        {
            "question": "Test question",
            "answer": "Complex solution\nWith multiple lines\n#### 123.45",
        },
        {"question": "Test question", "answer": "No steps #### 0"},
    ],
)
def test_numerical_answer_extraction(gsm8k_parser, test_case):
    """Test extraction of numerical answers from different formats."""
    entry = gsm8k_parser.process_entry(test_case, task_name="main")
    assert str(entry.numerical_answer) == test_case["answer"].split("####")[
        -1
    ].strip().replace(",", "")


def test_solution_extraction(gsm8k_parser):
    """Test extraction of solution steps."""
    row = {
        "question": "Test question",
        "answer": "Step 1: Do this\nStep 2: Do that\n#### 42",
    }

    entry = gsm8k_parser.process_entry(row, task_name="main")
    assert entry.solution == "Step 1: Do this\nStep 2: Do that"
    assert entry.task_name == "main"
    assert "####" not in entry.solution


def test_parser_properties(gsm8k_parser):
    """Test parser property getters."""
    assert gsm8k_parser.task_names == ["main", "socratic"]
    assert gsm8k_parser.total_tasks == 2


def test_parser_string_representation(loaded_gsm8k_parser):
    """Test string representation of parser."""
    repr_str = str(loaded_gsm8k_parser)
    assert "GSM8KDatasetParser" in repr_str
    assert "openai/gsm8k" in repr_str
    assert "main" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
def test_different_splits_parsing(gsm8k_parser):
    """Test parsing different splits of the dataset."""
    # Load and parse test split
    gsm8k_parser.load(task_name="main", split="test")
    gsm8k_parser.parse(split_names="test", force=True)
    test_count = len(gsm8k_parser.get_parsed_data)

    # Load and parse train split
    gsm8k_parser.load(task_name="main", split="train")
    gsm8k_parser.parse(split_names="train", force=True)
    train_count = len(gsm8k_parser.get_parsed_data)

    assert test_count > 0
    assert train_count > 0
    assert train_count != test_count


def test_get_dataset_description(gsm8k_parser):
    """Test dataset description generation."""
    description = gsm8k_parser.get_dataset_description()

    assert description.name == "Grade School Math 8K (GSM8K)"
    assert description.source == "OpenAI"
    assert description.language == "English"
    assert "Cobbe" in description.citation


def test_get_evaluation_metrics(gsm8k_parser):
    """Test evaluation metrics specification."""
    metrics = gsm8k_parser.get_evaluation_metrics()

    # Check we have all expected metrics
    metric_names = {metric.name for metric in metrics}
    expected_names = {"exact_match", "solution_validity", "step_accuracy", "step_count"}
    assert metric_names == expected_names

    # Check exact_match metric details
    exact_match = next(m for m in metrics if m.name == "exact_match")
    assert exact_match.type == "string"
    assert exact_match.primary is True
    assert "exact match" in exact_match.description.lower()
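These tests rely on the GSM8K convention that the gold answer field ends with "#### <number>" after the worked solution. A small self-contained sketch of that split, for reference; the actual parsing lives in llmdataparser/gsm8k_parser.py and may differ in detail.

# Sketch of the "#### <answer>" convention exercised by the tests above.
def split_gsm8k_answer(raw_answer: str) -> tuple[str, str]:
    """Return (solution_text, numerical_answer_string) from a GSM8K answer field."""
    solution, _, answer = raw_answer.rpartition("####")
    return solution.strip(), answer.strip().replace(",", "")


solution, number = split_gsm8k_answer("Step 1: 3 + 2 = 5\n#### 5")
assert solution == "Step 1: 3 + 2 = 5"
assert number == "5"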
tests/test_humaneval_parser.py
ADDED
@@ -0,0 +1,198 @@
import pytest

from llmdataparser.humaneval_parser import (
    HumanEvalDatasetParser,
    HumanEvalDatasetPlusParser,
    HumanEvalParseEntry,
)


@pytest.fixture
def sample_entry():
    return {
        "prompt": 'def add(a, b):\n    """Add two numbers."""\n',
        "canonical_solution": "def add(a, b):\n    return a + b\n",
        "task_id": "HumanEval/0",
        "entry_point": "add",
        "test": "def test_add(): assert add(2, 3) == 5",
    }


@pytest.fixture
def parser():
    return HumanEvalDatasetParser()


@pytest.fixture
def plus_parser():
    return HumanEvalDatasetPlusParser()


@pytest.fixture
def plus_sample_entry():
    return {
        "prompt": 'def add(a, b):\n    """Add two numbers."""\n',
        "canonical_solution": "def add(a, b):\n    return a + b\n",
        "task_id": "HumanEval/0",
        "entry_point": "add",
        "test": "def test_add(): assert add(2, 3) == 5",
    }


def test_humaneval_parse_entry_creation():
    """Test creation of HumanEvalParseEntry"""
    entry = HumanEvalParseEntry.create(
        question="test question",
        answer="test answer",
        raw_question="raw question",
        task_id="HumanEval/1",
        entry_point="test_func",
        test="test case",
        task_name="openai_humaneval",
    )

    assert entry.question == "test question"
    assert entry.answer == "test answer"
    assert entry.raw_question == "raw question"
    assert entry.raw_answer == "test answer"  # Should match answer
    assert entry.task_id == "HumanEval/1"
    assert entry.entry_point == "test_func"
    assert entry.test == "test case"
    assert entry.task_name == "openai_humaneval"


def test_humaneval_parse_entry_validation():
    """Test validation of required fields"""
    with pytest.raises(ValueError, match="Task ID cannot be empty"):
        HumanEvalParseEntry.create(
            question="test",
            answer="test",
            raw_question="test",
            task_id="",  # Empty task_id should raise error
            entry_point="test",
            test="test",
            task_name="test",
        )

    with pytest.raises(ValueError, match="Entry point cannot be empty"):
        HumanEvalParseEntry.create(
            question="test",
            answer="test",
            raw_question="test",
            task_id="test",
            entry_point="",  # Empty entry_point should raise error
            test="test",
            task_name="test",
        )


def test_process_entry(parser, sample_entry):
    """Test processing of a single entry"""
    result = parser.process_entry(sample_entry, task_name="openai_humaneval")

    assert isinstance(result, HumanEvalParseEntry)
    assert result.task_id == "HumanEval/0"
    assert result.entry_point == "add"

    assert result.answer == sample_entry["canonical_solution"]
    assert result.test == sample_entry["test"]
    assert result.task_name == "openai_humaneval"


def test_parser_initialization(parser):
    """Test parser initialization and properties"""
    assert parser._data_source == "openai/openai_humaneval"
    assert parser._default_task == "openai_humaneval"
    assert parser._task_names == ["openai_humaneval"]
    assert (
        parser.get_huggingface_link
        == "https://huggingface.co/datasets/openai/openai_humaneval"
    )


@pytest.mark.integration
def test_parser_load_and_parse(parser):
    """Integration test for loading and parsing data"""
    parser.load()
    parser.parse()
    parsed_data = parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)


def test_get_current_task(parser, sample_entry):
    """Test _get_current_task method"""
    task = parser._get_current_task(sample_entry)
    assert task == parser._default_task


def test_plus_parser_initialization(plus_parser):
    """Test HumanEvalDatasetPlusParser initialization and properties"""
    assert plus_parser._data_source == "evalplus/humanevalplus"
    assert plus_parser._default_task == "default"
    assert plus_parser._task_names == ["default"]
    assert (
        plus_parser.get_huggingface_link
        == "https://huggingface.co/datasets/evalplus/humanevalplus"
    )


def test_plus_process_entry(plus_parser, plus_sample_entry):
    """Test processing of a single entry in HumanEvalDatasetPlusParser"""
    result = plus_parser.process_entry(plus_sample_entry, task_name="default")

    assert isinstance(result, HumanEvalParseEntry)
    assert result.task_id == "HumanEval/0"
    assert result.entry_point == "add"

    assert result.answer == plus_sample_entry["canonical_solution"]
    assert result.test == plus_sample_entry["test"]
    assert result.task_name == "default"


@pytest.mark.integration
def test_plus_parser_load_and_parse(plus_parser):
    """Integration test for loading and parsing data with HumanEvalDatasetPlusParser"""
    plus_parser.load()
    plus_parser.parse()
    parsed_data = plus_parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)


def test_plus_get_current_task(plus_parser, plus_sample_entry):
    """Test _get_current_task method for HumanEvalDatasetPlusParser"""
    task = plus_parser._get_current_task(plus_sample_entry)
    assert task == plus_parser._default_task


def test_get_dataset_description(parser, plus_parser):
    """Test dataset description generation for both parsers."""
    # Test original HumanEval description
    description = parser.get_dataset_description()
    assert description.name == "HumanEval"
    assert "code generation" in description.purpose
    assert description.language == "Python"
    assert "chen2021codex" in description.citation

    # Test HumanEval Plus description
    plus_description = plus_parser.get_dataset_description()
    assert plus_description.name == "HumanEval Plus"
    assert "80x more test coverage" in plus_description.purpose
    assert "comprehensive test suites" in plus_description.format
    assert "edge cases" in plus_description.characteristics
    assert "evalplus" in plus_description.citation


def test_get_evaluation_metrics(parser):
    """Test evaluation metrics generation for both parsers."""
    # Test original HumanEval metrics
    metrics = parser.get_evaluation_metrics()
    assert len(metrics) == 5  # Base metrics + 2 specific metrics

    # Check primary metrics - update to match actual implementation
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 1  # pass@k
    assert any(m.name == "pass@k" for m in primary_metrics)
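The tests above only check that pass@k is reported as the primary HumanEval metric; the estimator itself is not part of this diff. For reference, a sketch of the standard unbiased pass@k estimator from the Codex paper (Chen et al., 2021); this helper is illustrative and not code from the PR.

# Sketch of the unbiased pass@k estimator: 1 - C(n-c, k) / C(n, k).
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """n = samples generated per problem, c = samples that passed, k = evaluation budget."""
    if n - c < k:
        return 1.0  # every size-k subset contains at least one passing sample
    return 1.0 - comb(n - c, k) / comb(n, k)


print(pass_at_k(n=20, c=3, k=1))  # expected pass rate when drawing 1 of 20 samples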
tests/test_ifeval_parser.py
ADDED
@@ -0,0 +1,120 @@
import pytest

from llmdataparser.ifeval_parser import IFEvalDatasetParser, IFEvalParseEntry


@pytest.fixture
def sample_ifeval_entries():
    """Create sample IFEval dataset entries for testing."""
    return [
        {
            "key": 1,
            "prompt": "Write a function to calculate factorial.",
            "instruction_id_list": ["math_001", "programming_001"],
            "kwargs": {"difficulty": "medium", "category": "mathematics"},
        },
        {
            "key": 2,
            "prompt": "Explain quantum computing.",
            "instruction_id_list": ["physics_001"],
            "kwargs": {"difficulty": "hard", "category": "physics"},
        },
    ]


@pytest.fixture
def ifeval_parser():
    """Create an IFEval parser instance."""
    return IFEvalDatasetParser()


def test_ifeval_parse_entry_creation_valid():
    """Test valid creation of IFEvalParseEntry."""
    entry = IFEvalParseEntry.create(
        question="Test instruction",
        answer="",  # IFEval doesn't have answers
        raw_question="Test instruction",
        raw_answer="",
        key=1,
        instruction_id_list=["test_001", "test_002"],
        kwargs={"difficulty": "easy"},
        task_name="default",
    )

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.question == "Test instruction"
    assert entry.answer == ""
    assert entry.key == 1
    assert entry.instruction_id_list == ["test_001", "test_002"]
    assert entry.kwargs == {"difficulty": "easy"}
    assert entry.task_name == "default"


def test_process_entry_ifeval(ifeval_parser, sample_ifeval_entries):
    """Test processing entries in IFEval parser."""
    entry = ifeval_parser.process_entry(sample_ifeval_entries[0])

    assert isinstance(entry, IFEvalParseEntry)
    assert entry.key == 1
    assert entry.instruction_id_list == ["math_001", "programming_001"]
    assert entry.kwargs == {"difficulty": "medium", "category": "mathematics"}
    assert entry.raw_question == "Write a function to calculate factorial."
    assert entry.answer == ""  # IFEval doesn't have answers
    assert entry.task_name == "default"


def test_parser_initialization(ifeval_parser):
    """Test initialization of IFEval parser."""
    assert ifeval_parser._data_source == "google/IFEval"
    assert ifeval_parser._default_task == "default"
    assert ifeval_parser.task_names == ["default"]
    assert (
        ifeval_parser.get_huggingface_link
        == "https://huggingface.co/datasets/google/IFEval"
    )


@pytest.mark.integration
def test_load_dataset(ifeval_parser):
    """Test loading the IFEval dataset."""
    ifeval_parser.load(split="train")
    assert ifeval_parser.raw_data is not None
    assert ifeval_parser.split_names == ["train"]
    assert ifeval_parser._current_task == "default"


def test_parser_string_representation(ifeval_parser):
    """Test string representation of IFEval parser."""
    repr_str = str(ifeval_parser)
    assert "IFEvalDatasetParser" in repr_str
    assert "google/IFEval" in repr_str
    assert "not loaded" in repr_str


def test_get_dataset_description(ifeval_parser):
    """Test dataset description generation for IFEval."""
    description = ifeval_parser.get_dataset_description()

    assert description.name == "IFEval"
    assert description.source == "Google Research"
    assert description.language == "English (BCP-47 en)"


def test_get_evaluation_metrics(ifeval_parser):
    """Test evaluation metrics generation for IFEval."""
    metrics = ifeval_parser.get_evaluation_metrics()

    # Should have 5 metrics total
    assert len(metrics) == 5

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # Verify specific metrics exist and have correct properties
    metric_names = {m.name for m in metrics}
    assert "format_compliance" in metric_names
    assert "length_constraints" in metric_names
    assert "punctuation_rules" in metric_names
    assert "keyword_usage" in metric_names
    assert "structural_requirements" in metric_names
tests/test_math_parser.py
ADDED
@@ -0,0 +1,253 @@
import pytest

from llmdataparser.math_parser import MATHDatasetParser, MATHParseEntry


@pytest.fixture
def math_parser():
    """Create a MATH parser instance for testing."""
    return MATHDatasetParser()


@pytest.fixture
def loaded_math_parser(math_parser):
    """Create and load a MATH parser instance with test split."""
    math_parser.load(task_name="algebra", split="test")
    return math_parser


@pytest.fixture
def sample_math_entries():
    """Create sample MATH dataset entries for testing."""
    return [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "Let's solve step by step:\n1) Subtract 4 from both sides: 2x = 6\n2) Divide both sides by 2\n\nTherefore, x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the area of a circle with radius 5 units.",
            "level": "Level 2",
            "solution": "Area = πr²\nArea = π(5)²\nArea = 25π square units",
            "type": "geometry",
        },
        {
            "problem": "What is the limit of (x²-1)/(x-1) as x approaches 1?",
            "level": "Level 4",
            "solution": "Using L'Hôpital's rule:\nlim(x→1) (x²-1)/(x-1) = lim(x→1) (2x)/(1) = 2",
            "type": "calculus",
        },
    ]


def test_math_parse_entry_creation_valid():
    """Test valid creation of MATHParseEntry with all fields."""
    entry = MATHParseEntry.create(
        question="Test question",
        answer="Test answer",
        raw_question="Test question",
        raw_answer="Test solution",
        level="Level 5",
        task_name="algebra",
        solution="Test solution",
    )

    assert isinstance(entry, MATHParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "Test answer"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "Test solution"
    assert entry.level == "Level 5"
    assert entry.task_name == "algebra"
    assert entry.solution == "Test solution"


@pytest.mark.parametrize(
    "test_case",
    [
        {
            "problem": "Solve for x: 2x + 4 = 10",
            "level": "Level 3",
            "solution": "x = 3",
            "type": "algebra",
        },
        {
            "problem": "Find the derivative of f(x) = x²",
            "level": "Level 4",
            "solution": "f'(x) = 2x",
            "type": "calculus",
        },
    ],
)
def test_process_entry(math_parser, test_case):
    """Test processing different types of MATH entries."""
    entry = math_parser.process_entry(test_case, task_name=test_case["type"])

    assert isinstance(entry, MATHParseEntry)

    assert entry.answer == test_case["solution"]
    assert entry.raw_question == test_case["problem"]
    assert entry.raw_answer == test_case["solution"]
    assert entry.level == test_case["level"]
    assert entry.task_name == test_case["type"]
    assert entry.solution == test_case["solution"]


def test_math_parser_initialization(math_parser):
    """Test MATH parser initialization and properties."""
    assert isinstance(math_parser.task_names, list)
    assert len(math_parser.task_names) == 8
    assert math_parser._data_source == "lighteval/MATH"
    assert math_parser._default_task == "all"
    assert "algebra" in math_parser.task_names
    assert "geometry" in math_parser.task_names
    assert (
        math_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lighteval/MATH"
    )


def test_get_current_task(math_parser):
    """Test task name resolution in different scenarios."""
    # Test with valid type in data entry
    test_row_with_type = {"type": "algebra"}
    assert math_parser._get_current_task(test_row_with_type) == "algebra"

    # Test without type in data entry
    test_row_without_type = {}
    math_parser._current_task = "geometry"
    assert math_parser._get_current_task(test_row_without_type) == "geometry"

    # Test with invalid type - should return current task
    test_row_invalid_type = {"type": "invalid_type"}
    math_parser._current_task = "algebra"
    assert math_parser._get_current_task(test_row_invalid_type) == "algebra"


def test_valid_levels(math_parser):
    """Test handling of valid level values."""
    for i in range(1, 6):
        test_row = {
            "problem": "Test problem",
            "level": f"Level {i}",
            "solution": "Test solution",
            "type": "algebra",
        }
        entry = math_parser.process_entry(test_row, task_name="algebra")
        assert entry.level == f"Level {i}"


@pytest.mark.parametrize(
    "invalid_level",
    [
        "Level 0",  # Too low
        "Level 6",  # Too high
        "Invalid",  # Wrong format
        None,  # Missing
        "",  # Empty
        "level 1",  # Wrong capitalization
    ],
)
def test_invalid_level_handling(math_parser, invalid_level):
    """Test handling of invalid level values."""
    test_row = {
        "problem": "Test problem",
        "level": invalid_level,
        "solution": "Test solution",
        "type": "algebra",
    }

    entry = math_parser.process_entry(test_row, task_name="algebra")
    assert entry.level == "Unknown"


@pytest.mark.integration
def test_load_dataset(loaded_math_parser):
    """Test loading the MATH dataset."""
    assert loaded_math_parser.raw_data is not None
    assert loaded_math_parser.split_names == ["test"]
    assert loaded_math_parser._current_task == "algebra"


def test_parser_string_representation(loaded_math_parser):
    """Test string representation of MATH parser."""
    repr_str = str(loaded_math_parser)
    assert "MATHDatasetParser" in repr_str
    assert "lighteval/MATH" in repr_str
    assert "algebra" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
def test_different_splits_parsing(math_parser):
    """Test parsing different splits of the dataset."""
    # Load and parse test split
    math_parser.load(task_name="algebra", split="test")
    math_parser.parse(split_names="test", force=True)
    test_count = len(math_parser.get_parsed_data)

    # Load and parse train split
    math_parser.load(task_name="algebra", split="train")
    math_parser.parse(split_names="train", force=True)
    train_count = len(math_parser.get_parsed_data)

    assert test_count > 0
    assert train_count > 0
    assert train_count != test_count


def test_get_dataset_description(math_parser):
    """Test dataset description generation."""
    description = math_parser.get_dataset_description()

    assert description.name == "MATH"
    assert "Hendrycks" in description.source
    assert description.language == "English"
    assert "12,500" in description.characteristics
    assert "hendrycksmath2021" in description.citation
    assert "NeurIPS" in description.citation

    # Check additional info
    assert description.additional_info is not None
    assert description.additional_info["difficulty_levels"] == "1-5"
    assert "algebra" in description.additional_info["topics"]
    assert "geometry" in description.additional_info["topics"]
    assert description.additional_info["size"] == "12,500 problems"


def test_get_evaluation_metrics(math_parser):
    """Test evaluation metrics generation."""
    metrics = math_parser.get_evaluation_metrics()

    # Check total number of metrics
    assert len(metrics) == 5

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # Verify specific metrics exist with correct properties
    metric_names = {m.name for m in metrics}
    assert "symbolic_equivalence" in metric_names
    assert "solution_presence" in metric_names
    assert "reasoning_validity" in metric_names
|
235 |
+
assert "mathematical_notation" in metric_names
|
236 |
+
assert "solution_clarity" in metric_names
|
237 |
+
|
238 |
+
# Check specific metric properties
|
239 |
+
symbolic_metric = next(m for m in metrics if m.name == "symbolic_equivalence")
|
240 |
+
assert symbolic_metric.type == "exact_match"
|
241 |
+
assert symbolic_metric.primary is True
|
242 |
+
assert "sympy" in symbolic_metric.description.lower()
|
243 |
+
assert "equivalence" in symbolic_metric.description.lower()
|
244 |
+
|
245 |
+
solution_metric = next(m for m in metrics if m.name == "solution_presence")
|
246 |
+
assert solution_metric.type == "text"
|
247 |
+
assert solution_metric.primary is True
|
248 |
+
assert "step-by-step" in solution_metric.description.lower()
|
249 |
+
|
250 |
+
reasoning_metric = next(m for m in metrics if m.name == "reasoning_validity")
|
251 |
+
assert reasoning_metric.type == "text"
|
252 |
+
assert reasoning_metric.primary is True
|
253 |
+
assert "mathematical reasoning" in reasoning_metric.description.lower()
|
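For orientation, the workflow these tests drive can be sketched end to end. The names below (MATHDatasetParser, load, parse, get_parsed_data) are taken from the assertions above; the exact signatures and the fields printed are otherwise assumptions, not a verbatim copy of the library's API.

# Hypothetical usage sketch assembled from the calls exercised in the tests above.
from llmdataparser.math_parser import MATHDatasetParser

parser = MATHDatasetParser()
parser.load(task_name="algebra", split="test")  # pulls lighteval/MATH from HuggingFace
parser.parse(split_names="test", force=True)    # fills parser.get_parsed_data
for entry in parser.get_parsed_data[:3]:
    # Each parsed entry carries question, answer, level, and solution fields,
    # as asserted in test_process_entry above.
    print(entry.level, entry.question[:60])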
tests/test_mbpp_parser.py
ADDED
@@ -0,0 +1,178 @@
import pytest

from llmdataparser.mbpp_parser import MBPPDatasetParser, MBPPParseEntry


@pytest.fixture
def sample_entry():
    return {
        "text": "Write a function to find the sum of numbers in a list.",
        "code": "def sum_list(lst):\n return sum(lst)",
        "task_id": 42,
        "test_list": ["assert sum_list([1, 2, 3]) == 6"],
        "test_setup_code": "",
        "challenge_test_list": ["assert sum_list([4, 5, 6]) == 15"],
    }


@pytest.fixture
def parser():
    return MBPPDatasetParser()


def test_mbpp_parse_entry_creation():
    """Test creation of MBPPParseEntry"""
    entry = MBPPParseEntry.create(
        question="test question",
        answer="test answer",
        raw_question="raw question",
        task_id=42,
        test_list=["test1", "test2"],
        test_setup_code="setup code",
        challenge_test_list=["challenge1"],
        task_name="full",
        source_file="test.pdf",
    )

    assert entry.question == "test question"
    assert entry.answer == "test answer"
    assert entry.raw_question == "raw question"
    assert entry.raw_answer == "test answer"
    assert entry.task_id == 42
    assert entry.test_list == ["test1", "test2"]
    assert entry.test_setup_code == "setup code"
    assert entry.challenge_test_list == ["challenge1"]
    assert entry.task_name == "full"


def test_mbpp_parse_entry_validation():
    """Test validation of required fields"""
    with pytest.raises(ValueError, match="Task ID must be an integer"):
        MBPPParseEntry.create(
            question="test",
            answer="test",
            raw_question="test",
            task_id="not_an_int",  # Invalid task_id type
            test_list=[],
            test_setup_code="",
            challenge_test_list=[],
            task_name="full",
            source_file="test.pdf",
        )


def test_process_entry(parser, sample_entry):
    """Test processing of a single entry"""
    result = parser.process_entry(sample_entry, task_name="full")

    assert isinstance(result, MBPPParseEntry)
    assert result.task_id == 42
    assert result.raw_question == sample_entry["text"]
    assert result.answer == sample_entry["code"]
    assert result.test_list == sample_entry["test_list"]
    assert result.challenge_test_list == sample_entry["challenge_test_list"]
    assert result.task_name == "full"


def test_parser_initialization(parser):
    """Test parser initialization and properties"""
    assert parser._data_source == "google-research-datasets/mbpp"
    assert parser._default_task == "full"
    assert parser._task_names == ["full", "sanitized"]
    assert (
        parser.get_huggingface_link
        == "https://huggingface.co/datasets/google-research-datasets/mbpp"
    )


@pytest.mark.integration
@pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
def test_parser_load_and_parse(parser):
    """Integration test for loading and parsing data"""
    parser.load(split="train")
    parser.parse(force=True)
    parsed_data = parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(isinstance(entry, MBPPParseEntry) for entry in parsed_data)


def test_get_current_task(parser, sample_entry):
    """Test _get_current_task method"""
    task = parser._get_current_task(sample_entry)
    assert task == parser._default_task


@pytest.mark.parametrize("task_name", ["full", "sanitized"])
@pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
def test_different_tasks_loading(parser, task_name):
    """Test loading different tasks of the dataset"""
    parser.load(task_name=task_name, split="train")
    assert parser._current_task == task_name


def test_parser_string_representation(parser):
    """Test string representation of parser"""
    repr_str = str(parser)
    assert "MBPPDatasetParser" in repr_str
    assert "google-research-datasets/mbpp" in repr_str
    assert "not loaded" in repr_str


def test_parse_without_loaded_data(parser):
    """Test parsing without loading data first"""
    with pytest.raises(
        ValueError, match="No data loaded. Please load the dataset first"
    ):
        parser.parse()


@pytest.mark.integration
@pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
def test_full_workflow_with_different_splits(parser):
    """Test the complete workflow with different splits"""
    parser.load(split="train")
    parser.parse(force=True)
    train_data = parser.get_parsed_data

    assert len(train_data) > 0
    assert all(isinstance(entry, MBPPParseEntry) for entry in train_data)
    assert all(entry.task_name == "full" for entry in train_data)


def test_get_dataset_description(parser):
    """Test dataset description generation."""
    description = parser.get_dataset_description()

    assert description.name == "Mostly Basic Python Problems (MBPP)"
    assert "code generation" in description.purpose.lower()
    assert "google-research" in description.source
    assert description.language == "English and Python"
    assert "1,000" in description.characteristics
    assert "austin2021program" in description.citation
    assert "Program Synthesis" in description.citation


def test_get_evaluation_metrics(parser):
    """Test evaluation metrics generation."""
    metrics = parser.get_evaluation_metrics()

    # Check total number of metrics
    assert len(metrics) == 4

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 1

    # Verify specific metrics exist with correct properties
    metric_names = {m.name for m in metrics}
    assert "pass@k" in metric_names
    assert "test_case_success_rate" in metric_names
    assert "syntax_validity" in metric_names

    # Check specific metric properties
    pass_k_metric = next(m for m in metrics if m.name == "pass@k")
    assert pass_k_metric.type == "code_evaluation"
    assert pass_k_metric.primary is True
    assert "k generations" in pass_k_metric.description.lower()
    assert "custom_pass_at_k" in pass_k_metric.implementation
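Most suites in this PR tag network-dependent tests with @pytest.mark.integration (the MBPP ones are additionally skipped outright). To run only the offline unit tests, that marker can be excluded; a small sketch using pytest's own entry point, where the test path and marker expression are the only assumptions:

import sys

import pytest

# Exclude tests marked "integration" so nothing is downloaded from HuggingFace.
sys.exit(pytest.main(["tests", "-m", "not integration"]))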
tests/test_mgsm_parser.py
ADDED
@@ -0,0 +1,228 @@
import pytest

from llmdataparser.mgsm_parser import MGSMDatasetParser, MGSMParseEntry


@pytest.fixture
def mgsm_parser():
    """Create a MGSM parser instance for testing."""
    return MGSMDatasetParser()


@pytest.fixture
def loaded_mgsm_parser(mgsm_parser):
    """Create and load a MGSM parser instance with test split."""
    mgsm_parser.load(task_name="en", split="test")
    return mgsm_parser


@pytest.fixture
def sample_mgsm_entries():
    """Create sample MGSM dataset entries for testing."""
    return [
        {
            "question": "John has 5 apples and buys 3 more. How many apples does he have now?",
            "answer": "Let's solve step by step:\n1) Initial apples = 5\n2) Bought apples = 3\n3) Total = 5 + 3 = 8\nJohn has 8 apples now.",
            "answer_number": 8,
            "equation_solution": "5 + 3 = 8",
            "language": "en",
        },
        {
            "question": "Juan tiene 5 manzanas y compra 3 más. ¿Cuántas manzanas tiene ahora?",
            "answer": "Resolvamos paso a paso:\n1) Manzanas iniciales = 5\n2) Manzanas compradas = 3\n3) Total = 5 + 3 = 8\nJuan tiene 8 manzanas ahora.",
            "answer_number": 8,
            "equation_solution": "5 + 3 = 8",
            "language": "es",
        },
        {
            "question": "ジョンはリンゴを5個持っていて、さらに3個買います。今何個持っていますか?",
            "answer": None,  # Testing case with missing detailed answer
            "answer_number": 8,
            "equation_solution": "5 + 3 = 8",
            "language": "ja",
        },
    ]


def test_mgsm_parse_entry_creation_valid():
    """Test valid creation of MGSMParseEntry with all fields."""
    entry = MGSMParseEntry.create(
        question="Test question",
        answer="Test answer",
        raw_question="Test question",
        raw_answer="Test answer",
        numerical_answer=42,
        equation_solution="21 * 2 = 42",
        task_name="en",
        language="en",
    )

    assert isinstance(entry, MGSMParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "Test answer"
    assert entry.raw_question == "Test question"
    assert entry.raw_answer == "Test answer"
    assert entry.numerical_answer == 42
    assert entry.equation_solution == "21 * 2 = 42"
    assert entry.task_name == "en"
    assert entry.language == "en"


def test_process_entry_with_detailed_answer(mgsm_parser, sample_mgsm_entries):
    """Test processing entry with detailed answer in English."""
    entry = mgsm_parser.process_entry(sample_mgsm_entries[0], task_name="en")

    assert isinstance(entry, MGSMParseEntry)
    assert entry.numerical_answer == 8
    assert entry.equation_solution == "5 + 3 = 8"
    assert "step by step" in entry.answer
    assert entry.language == "en"
    assert entry.task_name == "en"


def test_process_entry_without_detailed_answer(mgsm_parser, sample_mgsm_entries):
    """Test processing entry without detailed answer (Japanese)."""
    entry = mgsm_parser.process_entry(sample_mgsm_entries[2], task_name="ja")

    assert isinstance(entry, MGSMParseEntry)
    assert entry.numerical_answer == 8
    assert entry.equation_solution == "5 + 3 = 8"
    assert entry.answer == "8"  # Should use numerical_answer as string
    assert entry.language == "ja"
    assert entry.task_name == "ja"


def test_process_entry_spanish(mgsm_parser, sample_mgsm_entries):
    """Test processing Spanish entry."""
    entry = mgsm_parser.process_entry(sample_mgsm_entries[1], task_name="es")

    assert isinstance(entry, MGSMParseEntry)
    assert entry.numerical_answer == 8
    assert entry.equation_solution == "5 + 3 = 8"
    assert "paso a paso" in entry.answer  # Spanish for "step by step"
    assert entry.language == "es"
    assert entry.task_name == "es"


def test_mgsm_parser_initialization(mgsm_parser):
    """Test MGSM parser initialization and properties."""
    assert isinstance(mgsm_parser.task_names, list)
    assert len(mgsm_parser.task_names) == 11  # 11 supported languages
    assert mgsm_parser._data_source == "juletxara/mgsm"
    assert mgsm_parser._default_task == "en"
    assert all(lang in mgsm_parser.task_names for lang in ["en", "es", "ja", "zh"])
    assert (
        mgsm_parser.get_huggingface_link
        == "https://huggingface.co/datasets/juletxara/mgsm"
    )


@pytest.mark.integration
def test_load_dataset(loaded_mgsm_parser):
    """Test loading the MGSM dataset."""
    assert loaded_mgsm_parser.raw_data is not None
    assert loaded_mgsm_parser.split_names == ["test"]
    assert loaded_mgsm_parser._current_task == "en"


def test_parser_string_representation(loaded_mgsm_parser):
    """Test string representation of MGSM parser."""
    repr_str = str(loaded_mgsm_parser)
    assert "MGSMDatasetParser" in repr_str
    assert "juletxara/mgsm" in repr_str
    assert "en" in repr_str
    assert "loaded" in repr_str


@pytest.mark.integration
def test_different_languages_parsing(mgsm_parser):
    """Test parsing different language versions."""
    # Load and parse English
    mgsm_parser.load(task_name="en", split="test")
    mgsm_parser.parse(split_names="test", force=True)
    en_count = len(mgsm_parser.get_parsed_data)

    # Load and parse Spanish
    mgsm_parser.load(task_name="es", split="test")
    mgsm_parser.parse(split_names="test", force=True)
    es_count = len(mgsm_parser.get_parsed_data)

    assert en_count > 0
    assert es_count > 0
    assert en_count == es_count  # Should have same number of problems in each language


@pytest.mark.parametrize("language", ["en", "es", "ja", "zh", "ru"])
def test_supported_languages(mgsm_parser, language):
    """Test that each supported language can be processed."""
    test_entry = {
        "question": f"Test question in {language}",
        "answer": f"Test answer in {language}",
        "answer_number": 42,
        "equation_solution": "21 * 2 = 42",
    }

    entry = mgsm_parser.process_entry(test_entry, task_name=language)
    assert entry.language == language
    assert entry.task_name == language
    assert entry.numerical_answer == 42


def test_get_dataset_description(mgsm_parser):
    """Test dataset description generation."""
    description = mgsm_parser.get_dataset_description()

    assert description.name == "Multilingual Grade School Math (MGSM)"
    assert "multilingual chain-of-thought reasoning" in description.purpose.lower()
    assert "juletxara/mgsm" in description.source
    assert description.language == "Multilingual (11 languages)"

    assert "mathematical reasoning" in description.characteristics.lower()

    # Check citations
    assert "shi2022language" in description.citation
    assert "cobbe2021gsm8k" in description.citation

    # Check additional info
    assert description.additional_info is not None
    assert len(description.additional_info["languages"]) == 11
    assert "English" in description.additional_info["languages"]
    assert "Chinese" in description.additional_info["languages"]


def test_get_evaluation_metrics(mgsm_parser):
    """Test evaluation metrics generation."""
    metrics = mgsm_parser.get_evaluation_metrics()

    # Check total number of metrics
    assert len(metrics) == 4

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 3

    # Verify specific metrics exist with correct properties
    metric_names = {m.name for m in metrics}
    assert "exact_match" in metric_names
    assert "solution_validity" in metric_names
    assert "step_accuracy" in metric_names
    assert "cross_lingual_consistency" in metric_names

    # Check specific metric properties
    exact_match_metric = next(m for m in metrics if m.name == "exact_match")
    assert exact_match_metric.type == "string"
    assert exact_match_metric.primary is True
    assert "numerical answers" in exact_match_metric.description.lower()
    assert "custom_exact_match" in exact_match_metric.implementation

    solution_metric = next(m for m in metrics if m.name == "solution_validity")
    assert solution_metric.type == "text"
    assert solution_metric.primary is True
    assert "mathematically valid" in solution_metric.description.lower()
    assert "custom_solution_validator" in solution_metric.implementation

    step_metric = next(m for m in metrics if m.name == "step_accuracy")
    assert step_metric.type == "numerical"
    assert step_metric.primary is True
    assert "calculation steps" in step_metric.description.lower()
    assert "custom_step_accuracy" in step_metric.implementation
tests/test_mmlu_parser.py
ADDED
@@ -0,0 +1,314 @@
import pytest

from llmdataparser.mmlu_parser import (
    BaseMMLUDatasetParser,
    MMLUParseEntry,
    MMLUProDatasetParser,
    MMLUProParseEntry,
    MMLUReduxDatasetParser,
    TMMLUPlusDatasetParser,
)


@pytest.fixture
def base_parser():
    """Create a base MMLU parser instance."""
    return BaseMMLUDatasetParser()


@pytest.fixture
def redux_parser():
    """Create a MMLU Redux parser instance."""
    return MMLUReduxDatasetParser()


@pytest.fixture
def tmmlu_parser():
    """Create a TMMLU+ parser instance."""
    return TMMLUPlusDatasetParser()


@pytest.fixture
def mmlu_pro_parser():
    """Create a MMLU Pro parser instance."""
    return MMLUProDatasetParser()


@pytest.fixture
def sample_mmlu_entries():
    """Create sample MMLU dataset entries for testing."""
    return [
        {
            "question": "What is the capital of France?",
            "choices": ["London", "Paris", "Berlin", "Madrid"],
            "answer": 1,  # Paris
            "subject": "geography",
        },
        {
            "question": "Which of these is a primary color?",
            "choices": ["Green", "Purple", "Blue", "Orange"],
            "answer": 2,  # Blue
            "subject": "art",
        },
    ]


@pytest.fixture
def sample_mmlu_pro_entries():
    """Create sample MMLU Pro dataset entries for testing."""
    return [
        {
            "question": "What is the time complexity of quicksort?",
            "options": ["O(n)", "O(n log n)", "O(n²)", "O(2ⁿ)", "O(n!)", "O(1)"],
            "answer": "The average time complexity of quicksort is O(n log n)",
            "answer_index": 1,
            "category": "computer_science",
        }
    ]


def test_mmlu_parse_entry_creation_valid():
    """Test valid creation of MMLUParseEntry."""
    entry = MMLUParseEntry.create(
        question="Test question",
        answer="A",
        raw_question="Test question",
        raw_choices=["choice1", "choice2", "choice3", "choice4"],
        raw_answer="0",
        task_name="test_task",
    )
    assert isinstance(entry, MMLUParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "A"
    assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]
    assert entry.task_name == "test_task"


@pytest.mark.parametrize("invalid_answer", ["E", "F", "1", "", None])
def test_mmlu_parse_entry_creation_invalid(invalid_answer):
    """Test invalid answer handling in MMLUParseEntry creation."""
    with pytest.raises(
        ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
    ):
        MMLUParseEntry.create(
            question="Test question",
            answer=invalid_answer,
            raw_question="Test question",
            raw_choices=["choice1", "choice2", "choice3", "choice4"],
            raw_answer="4",
            task_name="test_task",
        )


def test_process_entry_base(base_parser, sample_mmlu_entries):
    """Test processing entries in base MMLU parser."""
    entry = base_parser.process_entry(sample_mmlu_entries[0], task_name="geography")

    assert isinstance(entry, MMLUParseEntry)
    assert entry.answer == "B"  # Index 1 maps to B
    assert "A. London" in entry.question
    assert "B. Paris" in entry.question
    assert "C. Berlin" in entry.question
    assert "D. Madrid" in entry.question
    assert entry.raw_question == "What is the capital of France?"
    assert entry.raw_choices == ["London", "Paris", "Berlin", "Madrid"]
    assert entry.raw_answer == "1"
    assert entry.task_name == "geography"


def test_mmlu_pro_parse_entry_creation_valid():
    """Test valid creation of MMLUProParseEntry."""
    entry = MMLUProParseEntry.create(
        question="Test question",
        answer="E",  # MMLU Pro supports up to J
        raw_question="Test question",
        raw_choices=["choice1", "choice2", "choice3", "choice4", "choice5"],
        raw_answer="4",
        task_name="test_task",
    )
    assert isinstance(entry, MMLUProParseEntry)
    assert entry.answer == "E"
    assert len(entry.raw_choices) == 5


def test_process_entry_mmlu_pro(mmlu_pro_parser, sample_mmlu_pro_entries):
    """Test processing entries in MMLU Pro parser."""
    entry = mmlu_pro_parser.process_entry(
        sample_mmlu_pro_entries[0], task_name="computer_science"
    )

    assert isinstance(entry, MMLUProParseEntry)
    assert entry.answer == "B"  # Index 1 maps to B
    assert "O(n log n)" in entry.question
    assert entry.task_name == "computer_science"
    assert len(entry.raw_choices) == 6


def test_tmmlu_process_entry(tmmlu_parser):
    """Test processing entries in TMMLU+ parser."""
    test_row = {
        "question": "什麼是台灣最高的山峰?",
        "A": "玉山",
        "B": "阿里山",
        "C": "合歡山",
        "D": "雪山",
        "answer": "A",
        "subject": "geography_of_taiwan",
    }

    entry = tmmlu_parser.process_entry(test_row, task_name="geography_of_taiwan")
    assert isinstance(entry, MMLUParseEntry)
    assert entry.answer == "A"
    assert entry.raw_choices == ["玉山", "阿里山", "合歡山", "雪山"]
    assert entry.task_name == "geography_of_taiwan"


@pytest.mark.parametrize(
    "parser_fixture,expected_tasks,expected_source",
    [
        ("base_parser", 57, "cais/mmlu"),
        ("redux_parser", 30, "edinburgh-dawg/mmlu-redux"),
        ("tmmlu_parser", 66, "ikala/tmmluplus"),
        ("mmlu_pro_parser", 1, "TIGER-Lab/MMLU-Pro"),
    ],
)
def test_parser_initialization(
    request, parser_fixture, expected_tasks, expected_source
):
    """Test initialization of different MMLU parser variants."""
    parser = request.getfixturevalue(parser_fixture)
    assert len(parser.task_names) == expected_tasks
    assert parser._data_source == expected_source
    assert (
        parser.get_huggingface_link
        == f"https://huggingface.co/datasets/{expected_source}"
    )


@pytest.mark.integration
def test_load_dataset(base_parser):
    """Test loading the MMLU dataset."""
    base_parser.load(task_name="anatomy", split="test")
    assert base_parser.raw_data is not None
    assert base_parser.split_names == ["test"]
    assert base_parser._current_task == "anatomy"


def test_parser_string_representation(base_parser):
    """Test string representation of MMLU parser."""
    repr_str = str(base_parser)
    assert "MMLUDatasetParser" in repr_str
    assert "cais/mmlu" in repr_str
    assert "not loaded" in repr_str


@pytest.mark.integration
def test_different_splits_parsing(base_parser):
    """Test parsing different splits of the dataset."""
    # Load and parse test split
    base_parser.load(task_name="anatomy", split="test")
    base_parser.parse(split_names="test", force=True)
    test_count = len(base_parser.get_parsed_data)

    # Load and parse validation split
    base_parser.load(task_name="anatomy", split="validation")
    base_parser.parse(split_names="validation", force=True)
    val_count = len(base_parser.get_parsed_data)

    assert test_count > 0
    assert val_count > 0
    assert test_count != val_count


def test_base_mmlu_dataset_description(base_parser):
    """Test dataset description for base MMLU."""
    description = base_parser.get_dataset_description()

    assert description.name == "Massive Multitask Language Understanding (MMLU)"
    assert "cais/mmlu" in description.source
    assert description.language == "English"

    # Check characteristics
    assert "57 subjects" in description.characteristics.lower()

    # Check citation
    assert "hendryckstest2021" in description.citation


def test_mmlu_redux_dataset_description(redux_parser):
    """Test dataset description for MMLU Redux."""
    description = redux_parser.get_dataset_description()

    assert description.name == "MMLU Redux"
    assert "manually re-annotated" in description.purpose.lower()
    assert "edinburgh-dawg/mmlu-redux" in description.source
    assert description.language == "English"

    # Check characteristics
    assert "3,000" in description.characteristics


def test_tmmlu_plus_dataset_description(tmmlu_parser):
    """Test dataset description for TMMLU+."""
    description = tmmlu_parser.get_dataset_description()

    assert "ikala/tmmluplus" in description.source
    assert description.language == "Traditional Chinese"

    # Check characteristics
    assert "66 subjects" in description.characteristics.lower()

    # Check citation
    assert "ikala2024improved" in description.citation


def test_mmlu_pro_dataset_description(mmlu_pro_parser):
    """Test dataset description for MMLU Pro."""
    description = mmlu_pro_parser.get_dataset_description()

    assert description.name == "MMLU Pro"
    assert "challenging" in description.purpose.lower()
    assert "TIGER-Lab/MMLU-Pro" in description.source
    assert description.language == "English"


def test_base_mmlu_evaluation_metrics(base_parser):
    """Test evaluation metrics for base MMLU."""
    metrics = base_parser.get_evaluation_metrics()

    assert len(metrics) >= 3
    metric_names = {m.name for m in metrics}

    assert "accuracy" in metric_names
    assert "subject_accuracy" in metric_names
    assert "category_accuracy" in metric_names

    accuracy_metric = next(m for m in metrics if m.name == "accuracy")
    assert accuracy_metric.type == "classification"
    assert accuracy_metric.primary is True
    assert "multiple-choice" in accuracy_metric.description.lower()


def test_mmlu_redux_evaluation_metrics(redux_parser):
    """Test evaluation metrics for MMLU Redux."""
    metrics = redux_parser.get_evaluation_metrics()

    metric_names = {m.name for m in metrics}
    assert "question_clarity" in metric_names


def test_tmmlu_plus_evaluation_metrics(tmmlu_parser):
    """Test evaluation metrics for TMMLU+."""
    metrics = tmmlu_parser.get_evaluation_metrics()

    metric_names = {m.name for m in metrics}
    assert "difficulty_analysis" in metric_names


def test_mmlu_pro_evaluation_metrics(mmlu_pro_parser):
    """Test evaluation metrics for MMLU Pro."""
    metrics = mmlu_pro_parser.get_evaluation_metrics()

    metric_names = {m.name for m in metrics}
    assert "reasoning_analysis" in metric_names
    assert "prompt_robustness" in metric_names
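The MMLU tests above repeatedly assert that a 0-based answer index becomes a choice letter (index 1 becomes "B", and MMLU-Pro extends the range past "D"). A standalone illustration of that mapping, sketched here for clarity rather than as the parser's actual implementation:

def index_to_letter(answer_index: int) -> str:
    # 0 -> "A", 1 -> "B", ...; MMLU-Pro simply continues past "D" (e.g. 4 -> "E").
    return chr(ord("A") + answer_index)

assert index_to_letter(1) == "B"
assert index_to_letter(4) == "E"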
tests/test_tmlu_parser.py
ADDED
@@ -0,0 +1,176 @@
import pytest

from llmdataparser.tmlu_parser import TMLUDatasetParser, TMLUParseEntry


@pytest.fixture
def tmlu_parser():
    """Create a TMLU parser instance for testing."""
    return TMLUDatasetParser()


@pytest.fixture
def sample_tmlu_entries():
    """Create sample TMLU dataset entries for testing."""
    return [
        {
            "question": "閱讀下文,選出依序最適合填入□內的選項:",
            "A": "張揚/綢繆未雨/奏疏",
            "B": "抽搐/煮繭抽絲/奏疏",
            "C": "張揚/煮繭抽絲/進貢",
            "D": "抽搐/綢繆未雨/進貢",
            "answer": "B",
            "explanation": "根據文意,選項B最為恰當。",
            "metadata": {
                "timestamp": "2023-10-09T18:27:20.304623",
                "source": "AST chinese - 108",
                "explanation_source": "",
            },
        },
        {
            "question": "下列何者是質數?",
            "A": "21",
            "B": "27",
            "C": "31",
            "D": "33",
            "answer": "C",
            "explanation": "31是質數,其他選項都是合數。",
            "metadata": {
                "timestamp": "2023-10-09T18:27:20.304623",
                "source": "AST mathematics - 108",
                "explanation_source": "",
            },
        },
    ]


def test_tmlu_parse_entry_creation_valid():
    """Test valid creation of TMLUParseEntry."""
    entry = TMLUParseEntry.create(
        question="Test question",
        answer="A",
        raw_question="Test question",
        raw_choices=["choice1", "choice2", "choice3", "choice4"],
        raw_answer="A",
        task_name="AST_chinese",
        explanation="Test explanation",
        metadata={"source": "test"},
    )
    assert isinstance(entry, TMLUParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "A"
    assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]
    assert entry.explanation == "Test explanation"
    assert entry.metadata == {"source": "test"}


@pytest.mark.parametrize("invalid_answer", ["E", "F", "1", "", None])
def test_tmlu_parse_entry_creation_invalid(invalid_answer):
    """Test invalid answer handling in TMLUParseEntry creation."""
    with pytest.raises(
        ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
    ):
        TMLUParseEntry.create(
            question="Test question",
            answer=invalid_answer,
            raw_question="Test question",
            raw_choices=["choice1", "choice2", "choice3", "choice4"],
            raw_answer=invalid_answer,
            task_name="AST_chinese",
        )


def test_process_entry(tmlu_parser, sample_tmlu_entries):
    """Test processing entries in TMLU parser."""
    entry = tmlu_parser.process_entry(sample_tmlu_entries[0], task_name="AST_chinese")

    assert isinstance(entry, TMLUParseEntry)
    assert entry.answer == "B"
    assert entry.task_name == "AST_chinese"
    assert len(entry.raw_choices) == 4
    assert entry.explanation == "根據文意,選項B最為恰當。"
    assert "AST chinese - 108" in entry.metadata["source"]


def test_tmlu_parser_initialization(tmlu_parser):
    """Test TMLU parser initialization and properties."""
    assert isinstance(tmlu_parser.task_names, list)
    assert len(tmlu_parser.task_names) == 37  # Total number of tasks
    assert tmlu_parser._data_source == "miulab/tmlu"
    assert tmlu_parser._default_task == "AST_chinese"
    assert "AST_chinese" in tmlu_parser.task_names
    assert "GSAT_mathematics" in tmlu_parser.task_names
    assert (
        tmlu_parser.get_huggingface_link
        == "https://huggingface.co/datasets/miulab/tmlu"
    )


@pytest.mark.integration
def test_load_dataset(tmlu_parser):
    """Test loading the TMLU dataset."""
    tmlu_parser.load(task_name="AST_chinese", split="test")
    assert tmlu_parser.raw_data is not None
    assert tmlu_parser.split_names == ["test"]
    assert tmlu_parser._current_task == "AST_chinese"


def test_parser_string_representation(tmlu_parser):
    """Test string representation of TMLU parser."""
    repr_str = str(tmlu_parser)
    assert "TMLUDatasetParser" in repr_str
    assert "miulab/tmlu" in repr_str
    assert "not loaded" in repr_str


@pytest.mark.integration
def test_different_tasks_parsing(tmlu_parser):
    """Test parsing different tasks of the dataset."""
    # Load and parse AST_chinese
    tmlu_parser.load(task_name="AST_chinese", split="test")
    tmlu_parser.parse(split_names="test", force=True)
    chinese_count = len(tmlu_parser.get_parsed_data)

    # Load and parse AST_mathematics
    tmlu_parser.load(task_name="AST_mathematics", split="test")
    tmlu_parser.parse(split_names="test", force=True)
    math_count = len(tmlu_parser.get_parsed_data)

    assert chinese_count > 0
    assert math_count > 0


def test_metadata_handling(tmlu_parser, sample_tmlu_entries):
    """Test proper handling of metadata in entries."""
    entry = tmlu_parser.process_entry(sample_tmlu_entries[0])

    assert "timestamp" in entry.metadata
    assert "source" in entry.metadata
    assert "explanation_source" in entry.metadata
    assert entry.metadata["source"] == "AST chinese - 108"


def test_get_dataset_description(tmlu_parser):
    """Test dataset description generation."""
    description = tmlu_parser.get_dataset_description()

    assert description.name == "Taiwan Multiple-choice Language Understanding (TMLU)"
    assert description.language == "Traditional Chinese"
    assert "Taiwan-specific educational" in description.purpose
    assert "Various Taiwan standardized tests" in description.source
    assert description.format == "Multiple choice questions (A/B/C/D)"
    assert "Advanced Subjects Test (AST)" in description.characteristics
    assert "DBLP:journals/corr/abs-2403-20180" in description.citation


def test_get_evaluation_metrics(tmlu_parser):
    """Test evaluation metrics generation."""
    metrics = tmlu_parser.get_evaluation_metrics()

    assert len(metrics) == 2  # Check total number of metrics

    # Check primary metrics
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 2
    assert any(m.name == "accuracy" for m in primary_metrics)
    assert any(m.name == "per_subject_accuracy" for m in primary_metrics)
tests/test_tw_legal_parser.py
ADDED
@@ -0,0 +1,146 @@
import pytest

from llmdataparser.tw_legal_parser import TWLegalDatasetParser, TWLegalParseEntry


@pytest.fixture
def tw_legal_parser():
    """Create a Taiwan Legal parser instance for testing."""
    return TWLegalDatasetParser()


@pytest.fixture
def sample_tw_legal_entries():
    """Create sample Taiwan Legal dataset entries for testing."""
    return [
        {
            "question": "依民法規定,下列關於法人之敘述,何者錯誤?",
            "A": "法人於法令限制內,有享受權利負擔義務之能力",
            "B": "法人因目的之達到而消滅",
            "C": "法人非依法律之規定,不得成立",
            "D": "法人於登記前,即取得權利能力",
            "answer": "D",
        },
        {
            "question": "關於刑法第321條第1項第4款之結夥三人以上而犯竊盜罪,下列敘述何者正確?",
            "A": "須行為人主觀上有結夥犯竊盜之認識",
            "B": "三人以上當場在場實施竊盜行為始足當之",
            "C": "三人以上已達成犯意聯絡即可成立",
            "D": "三人以上須全部在現場實施竊盜行為",
            "answer": "A",
        },
    ]


def test_tw_legal_parse_entry_creation_valid():
    """Test valid creation of TWLegalParseEntry."""
    entry = TWLegalParseEntry.create(
        question="Test question",
        answer="A",
        raw_question="Test question",
        raw_choices=["choice1", "choice2", "choice3", "choice4"],
        raw_answer="A",
        task_name="default",
    )
    assert isinstance(entry, TWLegalParseEntry)
    assert entry.question == "Test question"
    assert entry.answer == "A"
    assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]


@pytest.mark.parametrize("invalid_answer", ["E", "F", "1", "", None])
def test_tw_legal_parse_entry_creation_invalid(invalid_answer):
    """Test invalid answer handling in TWLegalParseEntry creation."""
    with pytest.raises(
        ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
    ):
        TWLegalParseEntry.create(
            question="Test question",
            answer=invalid_answer,
            raw_question="Test question",
            raw_choices=["choice1", "choice2", "choice3", "choice4"],
            raw_answer=invalid_answer,
            task_name="default",
        )


def test_process_entry(tw_legal_parser, sample_tw_legal_entries):
    """Test processing entries in Taiwan Legal parser."""
    entry = tw_legal_parser.process_entry(sample_tw_legal_entries[0])

    assert isinstance(entry, TWLegalParseEntry)
    assert entry.answer == "D"
    assert "A. 法人於法令限制內,有享受權利負擔義務之能力" in entry.question
    assert "B. 法人因目的之達到而消滅" in entry.question
    assert "C. 法人非依法律之規定,不得成立" in entry.question
    assert "D. 法人於登記前,即取得權利能力" in entry.question
    assert entry.raw_question == "依民法規定,下列關於法人之敘述,何者錯誤?"
    assert len(entry.raw_choices) == 4


def test_tw_legal_parser_initialization(tw_legal_parser):
    """Test Taiwan Legal parser initialization and properties."""
    assert isinstance(tw_legal_parser.task_names, list)
    assert len(tw_legal_parser.task_names) == 1  # Only default task
    assert tw_legal_parser._data_source == "lianghsun/tw-legal-benchmark-v1"
    assert tw_legal_parser._default_task == "default"
    assert (
        tw_legal_parser.get_huggingface_link
        == "https://huggingface.co/datasets/lianghsun/tw-legal-benchmark-v1"
    )


@pytest.mark.integration
def test_load_dataset(tw_legal_parser):
    """Test loading the Taiwan Legal dataset."""
    tw_legal_parser.load(split="train")
    assert tw_legal_parser.raw_data is not None
    assert tw_legal_parser.split_names == ["train"]
    assert tw_legal_parser._current_task == "default"


def test_parser_string_representation(tw_legal_parser):
    """Test string representation of Taiwan Legal parser."""
    repr_str = str(tw_legal_parser)
    assert "TWLegalDatasetParser" in repr_str
    assert "lianghsun/tw-legal-benchmark-v1" in repr_str
    assert "not loaded" in repr_str


@pytest.mark.integration
def test_data_parsing(tw_legal_parser):
    """Test parsing the dataset."""
    # Load and parse train split
    tw_legal_parser.load(split="train")
    tw_legal_parser.parse(split_names="train", force=True)
    train_count = len(tw_legal_parser.get_parsed_data)

    assert train_count > 0
    # Additional assertions about the parsed data
    parsed_data = tw_legal_parser.get_parsed_data
    assert all(isinstance(entry, TWLegalParseEntry) for entry in parsed_data)
    assert all(entry.answer in {"A", "B", "C", "D"} for entry in parsed_data)


def test_get_dataset_description(tw_legal_parser):
    """Test getting dataset description for Taiwan Legal parser."""
    description = tw_legal_parser.get_dataset_description()

    assert description.name == "Taiwan Legal Benchmark"
    assert description.language == "Traditional Chinese"
    assert "Taiwan's legal system" in description.characteristics
    assert (
        "huggingface.co/datasets/lianghsun/tw-legal-benchmark-v1"
        in description.citation
    )


def test_get_evaluation_metrics(tw_legal_parser):
    """Test getting evaluation metrics for Taiwan Legal parser."""
    metrics = tw_legal_parser.get_evaluation_metrics()

    assert len(metrics) == 1
    metric = metrics[0]
    assert metric.name == "accuracy"
    assert metric.type == "classification"
    assert metric.primary is True