JeffYang52415 committed
Commit fecdc3d · unverified · 2 Parent(s): f06dec4 8b1be45

Merge pull request #1 from jeff52415/release/v1.0.0

.dockerignore ADDED
@@ -0,0 +1,20 @@
+ .git
+ .gitignore
+ .env
+ .venv
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ *.py[cod]
+ *$py.class
+ .pytest_cache
+ .coverage
+ htmlcov
+ .mypy_cache
+ .ruff_cache
+ .DS_Store
+ notebooks/
+ tests/
+ docs/
.github/workflows/ci.yml ADDED
@@ -0,0 +1,75 @@
+ name: CI
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+     strategy:
+       matrix:
+         python-version: ["3.10", "3.11", "3.12"]
+       fail-fast: false
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Set up Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+           cache: "pip"
+
+       - name: Install Poetry
+         run: |
+           pipx install poetry
+
+       - name: Configure Poetry
+         run: |
+           poetry config virtualenvs.create true
+           poetry config virtualenvs.in-project true
+
+       - name: Cache Poetry virtualenv
+         uses: actions/cache@v3
+         with:
+           path: ./.venv
+           key: venv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}
+
+       - name: Install dependencies
+         run: |
+           poetry install
+
+       - name: Run pre-commit hooks
+         uses: pre-commit/[email protected]
+         with:
+           extra_args: --all-files
+         env:
+           PRE_COMMIT_CACHE_KEY: ${{ hashFiles('.pre-commit-config.yaml', 'pyproject.toml') }}
+
+       - name: Run tests with coverage
+         run: |
+           poetry run pytest --cov=llmdataparser --cov-report=xml
+
+       - name: Upload coverage to Codecov
+         uses: codecov/codecov-action@v5
+         with:
+           token: ${{ secrets.CODECOV_TOKEN }}
+           file: ./coverage.xml
+           fail_ci_if_error: true
+
+       - name: Build documentation
+         run: |
+           poetry add mkdocs mkdocs-material --group dev
+           cp README.md docs/index.md
+           poetry run mkdocs build
+         if: matrix.python-version == '3.12'
+
+       - name: Upload documentation artifact
+         uses: actions/upload-artifact@v3
+         with:
+           name: documentation
+           path: site/
+         if: matrix.python-version == '3.12'
.github/workflows/docker.yml ADDED
@@ -0,0 +1,43 @@
+ name: Docker CD
+
+ on:
+   push:
+     branches: [main]
+     paths:
+       - "Dockerfile"
+       - ".dockerignore"
+       - "docker-compose.yml"
+       - "pyproject.toml"
+       - "poetry.lock"
+
+ jobs:
+   docker:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Get version from pyproject.toml
+         run: |
+           echo "VERSION=$(grep '^version = ' pyproject.toml | cut -d'"' -f2)" >> $GITHUB_ENV
+
+       - name: Set up Docker Buildx
+         uses: docker/setup-buildx-action@v3
+
+       - name: Login to Docker Hub
+         uses: docker/login-action@v3
+         with:
+           username: ${{ secrets.DOCKERHUB_USERNAME }}
+           password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+       - name: Build and push
+         uses: docker/build-push-action@v5
+         with:
+           context: .
+           push: true
+           tags: |
+             jeff52415/llmdataparser:latest
+             jeff52415/llmdataparser:v${{ env.VERSION }}
+           cache-from: type=registry,ref=jeff52415/llmdataparser:latest
+           cache-to: type=inline
+           platforms: linux/amd64,linux/arm64
.github/workflows/huggingface-sync.yml ADDED
@@ -0,0 +1,30 @@
+ name: Deploy to Hugging Face Space
+
+ on:
+   push:
+     branches: [main]
+
+ jobs:
+   sync:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       - name: Configure Git
+         run: |
+           git config --global user.email "github-actions[bot]@users.noreply.github.com"
+           git config --global user.name "github-actions[bot]"
+
+       - name: Login to Hugging Face
+         env:
+           HF_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
+         run: |
+           huggingface-cli login --token $HF_TOKEN --add-to-git-credential
+
+       - name: Push to Hugging Face Space
+         run: |
+           git remote add space https://huggingface.co/spaces/JeffYang52415/LLMEval-Dataset-Parser
+           git push space main:main
.gitignore CHANGED
@@ -8,8 +8,6 @@ build/
  dist/
  *.egg-info/

- # Poetry
- poetry.lock

  # Virtual environment
  .env/
@@ -32,3 +30,15 @@

  # Mac files
  .DS_Store
+
+ #gradio cache
+ .cache/
+ .gradio/
+
+ #notebook cache
+ .ipynb_checkpoints/
+ notebooks/
+
+ #coverage
+ .coverage
+ .coverage.*
.pre-commit-config.yaml CHANGED
@@ -1,18 +1,12 @@
  # .pre-commit-config.yaml

  repos:
-   - repo: https://github.com/psf/black
-     rev: 23.9.1
-     hooks:
-       - id: black
-         args: ["--target-version=py311"]
-         additional_dependencies: ["typing-extensions>=4.8.0"]
-   - repo: https://github.com/PyCQA/flake8
-     rev: 6.1.0
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.4.4
      hooks:
-       - id: flake8
-         additional_dependencies: ["typing-extensions>=4.8.0"]
-         args: ["--ignore=E203, E501, W503, E501"]
+       - id: ruff
+         args: [--fix]
+       - id: ruff-format
    - repo: https://github.com/PyCQA/isort
      rev: 5.12.0
      hooks:
@@ -24,10 +18,10 @@ repos:
        - id: mypy
          args:
            [
-             "--python-version=3.11",
+             "--config-file=pyproject.toml",
              "--install-types",
              "--non-interactive",
-             "--ignore-missing-imports",
+             "--exclude=tests/*",
            ]
          additional_dependencies:
            - "typing-extensions>=4.8.0"
@@ -41,7 +35,6 @@ repos:
        - id: detect-aws-credentials
          args: ["--allow-missing-credentials"]
        - id: detect-private-key
-       - id: end-of-file-fixer
        - id: check-added-large-files
        - id: check-ast
        - id: check-byte-order-marker
@@ -49,9 +42,6 @@ repos:
        - id: check-docstring-first
        - id: check-json
        - id: debug-statements
-       - id: detect-private-key
-       - id: end-of-file-fixer
-       - id: trailing-whitespace
        - id: mixed-line-ending
    - repo: https://github.com/myint/autoflake
      rev: v2.2.1
@@ -70,16 +60,36 @@ repos:
      hooks:
        - id: prettier
          types_or: [markdown, yaml]
-   - repo: https://github.com/astral-sh/ruff-pre-commit
-     # Ruff version.
-     rev: v0.4.4
-     hooks:
-       # Run the linter.
-       - id: ruff
-         args: [--fix]
-       # Run the formatter.
-       - id: ruff-format
    - repo: https://github.com/kynan/nbstripout
      rev: 0.5.0 # use the latest version
      hooks:
        - id: nbstripout
+   - repo: https://github.com/nbQA-dev/nbQA
+     rev: 1.7.1
+     hooks:
+       - id: nbqa-ruff
+         additional_dependencies: [ruff]
+       - id: nbqa-isort
+       - id: nbqa-flake8
+   - repo: https://github.com/asottile/pyupgrade
+     rev: v3.15.0
+     hooks:
+       - id: pyupgrade
+   - repo: https://github.com/executablebooks/mdformat
+     rev: 0.7.17
+     hooks:
+       - id: mdformat
+         additional_dependencies:
+           - mdformat-gfm # GitHub-flavored Markdown
+           - mdformat-frontmatter # YAML frontmatter
+           - mdformat-footnote
+   - repo: https://github.com/shellcheck-py/shellcheck-py
+     rev: v0.9.0.6
+     hooks:
+       - id: shellcheck
+   - repo: https://github.com/pycqa/bandit
+     rev: 1.7.7
+     hooks:
+       - id: bandit
+         args: ["-c", "pyproject.toml"]
+         additional_dependencies: ["bandit[toml]", ".[toml]"]
CHANGELOG.md ADDED
@@ -0,0 +1,16 @@
+ # Changelog
+
+ All notable changes to this project will be documented in this file.
+
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+ ## \[1.0.0\] - 2024-12-30
+
+ ### Added
+
+ - Initial release
+ - Support for multiple benchmark datasets (MMLU, GSM8k, etc.)
+ - Gradio interface for dataset exploration
+ - Comprehensive test suite
+ - Documentation and examples
Dockerfile ADDED
@@ -0,0 +1,69 @@
+ # Use Python 3.12 slim image as base
+ FROM python:3.12-slim
+
+ # Set environment variables
+ ENV PYTHONUNBUFFERED=1 \
+     POETRY_VERSION=1.7.1 \
+     POETRY_HOME="/opt/poetry" \
+     POETRY_NO_INTERACTION=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_SERVER_PORT=7860
+
+ # Set working directory
+ WORKDIR /app
+
+ # Create cache directories for Hugging Face and set permissions correctly
+ ENV HF_HOME=/app/.cache/huggingface
+ RUN mkdir -p /app/.cache/huggingface && \
+     mkdir -p /app/.cache/torch && \
+     mkdir -p /app/.cache/transformers
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     portaudio19-dev \
+     python3-pip \
+     gcc \
+     git \
+     curl \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean
+
+ # Install poetry
+ RUN pip install poetry==${POETRY_VERSION} && \
+     poetry config virtualenvs.create false
+
+ # Copy dependency files first
+ COPY pyproject.toml poetry.lock ./
+
+ # Install dependencies using the lock file
+ RUN poetry install --no-dev --no-interaction --no-ansi
+
+ # Create app user and group with specific UID/GID
+ RUN groupadd -r app --gid 1000 && \
+     useradd -r -g app --uid 1000 --create-home app
+
+ # Set ownership of all cache directories
+ RUN chown -R app:app /app/.cache && \
+     chmod -R 755 /app/.cache
+
+ # Before switching to non-root user, create and set permissions
+ RUN mkdir -p /home/app/.cache && \
+     mkdir -p /home/app/.config/matplotlib && \
+     chown -R app:app /home/app/.cache && \
+     chown -R app:app /home/app/.config
+
+ # Set matplotlib config dir
+ ENV MPLCONFIGDIR=/home/app/.config/matplotlib
+
+ # Switch to non-root user
+ USER app
+
+ # Copy the rest of the application
+ COPY --chown=app:app . .
+
+ # Expose the port the app runs on
+ EXPOSE 7860
+
+ # Run the application
+ ENTRYPOINT ["python"]
+ CMD ["app.py"]
Makefile ADDED
@@ -0,0 +1,107 @@
+ # -----------------------------
+ # Variables
+ # -----------------------------
+ IMAGE_NAME = llmdataparser
+ CONTAINER_NAME = llmdataparser
+ VERSION = latest
+
+ # -----------------------------
+ # Docker Basic Commands
+ # -----------------------------
+ # Build the Docker image
+ build:
+ 	docker build -t $(IMAGE_NAME):$(VERSION) .
+
+ # Run the container
+ run:
+ 	docker run -d -p 7860:7860 --name $(CONTAINER_NAME) $(IMAGE_NAME):$(VERSION)
+
+ # Stop the container
+ stop:
+ 	docker stop $(CONTAINER_NAME)
+
+ # Remove the container
+ rm:
+ 	docker rm $(CONTAINER_NAME)
+
+ # Remove the image
+ rmi:
+ 	docker rmi $(IMAGE_NAME):$(VERSION)
+
+ # -----------------------------
+ # Docker Compose Commands
+ # -----------------------------
+ # Start with docker-compose (development)
+ compose-up:
+ 	docker compose up -d
+
+ # Stop and remove containers
+ compose-down:
+ 	docker compose down
+
+ # View logs
+ compose-logs:
+ 	docker compose logs -f
+
+ # Rebuild containers
+ compose-build:
+ 	docker compose build
+
+ # Restart containers
+ compose-restart:
+ 	docker compose restart
+
+ # -----------------------------
+ # Convenience Commands
+ # -----------------------------
+ # Build and run with docker
+ up: build run
+
+ # Stop and remove container
+ down: stop rm
+
+ # Clean everything
+ clean: stop rm rmi
+
+ # -----------------------------
+ # Monitoring Commands
+ # -----------------------------
+ # Show container logs
+ logs:
+ 	docker logs $(CONTAINER_NAME)
+
+ # Follow container logs
+ logs-follow:
+ 	docker logs -f $(CONTAINER_NAME)
+
+ # Show container status
+ status:
+ 	docker ps -a | grep $(CONTAINER_NAME)
+
+ # Enter container shell
+ shell:
+ 	docker exec -it $(CONTAINER_NAME) /bin/bash
+
+ # -----------------------------
+ # Production Commands
+ # -----------------------------
+ # Test nginx configuration (for production use)
+ nginx-test:
+ 	docker compose run --rm nginx nginx -t
+
+ # Start with nginx test (for production use)
+ compose-up-prod: nginx-test compose-up
+
+ # -----------------------------
+ # Security Commands
+ # -----------------------------
+ security-check:
+ 	@echo "Checking nginx configuration..."
+ 	docker compose run --rm nginx nginx -t
+ 	@echo "Checking exposed ports..."
+ 	docker compose config | grep -E "ports:|127.0.0.1"
+
+ # Ensure all targets are treated as commands, not files
+ .PHONY: build run stop rm rmi clean up down logs shell \
+ 	compose-up compose-down compose-logs compose-build compose-restart \
+ 	nginx-test status logs-follow compose-up-prod
README.md CHANGED
@@ -1,6 +1,17 @@
+ ---
+ title: LLMEval Dataset Parser
+ emoji: ⚡
+ colorFrom: green
+ colorTo: gray
+ sdk: docker
+ pinned: false
+ license: mit
+ short_description: A collection of parsers for LLM benchmark datasets
+ ---
+
  # LLMDataParser

- **LLMDataParser** is a Python library that provides parsers for benchmark datasets used in evaluating Large Language Models (LLMs). It offers a unified interface for loading and parsing datasets like **MMLU** and **GSM8k**, simplifying dataset preparation for LLM evaluation.
+ **LLMDataParser** is a Python library that provides parsers for benchmark datasets used in evaluating Large Language Models (LLMs). It offers a unified interface for loading and parsing datasets like **MMLU**, **GSM8k**, and others, streamlining dataset preparation for LLM evaluation. The library aims to simplify the process of working with common LLM benchmark datasets through a consistent API.

  ## Features

@@ -8,6 +19,7 @@
  - **LLM-Agnostic**: Independent of any specific language model.
  - **Easy to Use**: Simple methods and built-in Python types.
  - **Extensible**: Easily add support for new datasets.
+ - **Gradio**: Built-in Gradio interface for interactive dataset exploration and testing.

  ## Installation

@@ -22,7 +34,7 @@ You can install the package directly using `pip`. Even with only a `pyproject.to
     cd LLMDataParser
     ```

- 2. **Install Dependencies with pip**:
+ 1. **Install Dependencies with pip**:

     ```bash
     pip install .
@@ -38,7 +50,7 @@ Poetry manages the virtual environment and dependencies automatically, so you do
     poetry install
     ```

- 2. **Activate the Virtual Environment**:
+ 1. **Activate the Virtual Environment**:

     ```bash
     poetry shell
@@ -46,7 +58,58 @@ Poetry manages the virtual environment and dependencies automatically, so you do

  ## Available Parsers

- - **MMLUDatasetParser**: Parses the MMLU dataset.
+ - **MMLUDatasetParser**
+ - **MMLUProDatasetParser**
+ - **MMLUReduxDatasetParser**
+ - **TMMLUPlusDatasetParser**
+ - **GSM8KDatasetParser**
+ - **MATHDatasetParser**
+ - **MGSMDatasetParser**
+ - **HumanEvalDatasetParser**
+ - **HumanEvalDatasetPlusParser**
+ - **BBHDatasetParser**
+ - **MBPPDatasetParser**
+ - **IFEvalDatasetParser**
+ - **TWLegalDatasetParser**
+ - **TMLUDatasetParser**
+
+ ## Quick Start Guide
+
+ Here's a simple example demonstrating how to use the library:
+
+ ```python
+ from llmdataparser import ParserRegistry
+ # list all available parsers
+ ParserRegistry.list_parsers()
+ # get a parser
+ parser = ParserRegistry.get_parser("mmlu")
+ # load the parser
+ parser.load()  # optional: task_name, split
+ # parse the parser
+ parser.parse()  # optional: split_names
+
+ print(parser.task_names)
+ print(parser.split_names)
+ print(parser.get_dataset_description)
+ print(parser.get_huggingface_link)
+ print(parser.total_tasks)
+ data = parser.get_parsed_data
+ ```
+
+ We also provide a Gradio demo for interactive testing:
+
+ ```bash
+ python app.py
+ ```
+
+ ## Adding New Dataset Parsers
+
+ To add support for a new dataset, please refer to our detailed guide in [docs/adding_new_parser.md](docs/adding_new_parser.md). The guide includes:
+
+ - Step-by-step instructions for creating a new parser
+ - Code examples and templates
+ - Best practices and common patterns
+ - Testing guidelines

  ## License

app.py ADDED
@@ -0,0 +1,456 @@
+ import secrets
+ from functools import lru_cache
+ from typing import Any
+
+ import gradio as gr
+
+ from llmdataparser import ParserRegistry
+ from llmdataparser.base_parser import (
+     VALID_CATEGORIES,
+     DatasetDescription,
+     DatasetParser,
+     EvaluationMetric,
+     ParseEntry,
+ )
+
+
+ @lru_cache(maxsize=32)
+ def get_parser_instance(parser_name: str) -> DatasetParser[Any]:
+     """Get a cached parser instance by name."""
+     return ParserRegistry.get_parser(parser_name)
+
+
+ def get_available_splits(parser: DatasetParser[Any]) -> list[str] | None:
+     """Get available splits for the selected parser after loading."""
+     if not hasattr(parser, "split_names") or not parser.split_names:
+         return None
+     return list(parser.split_names)
+
+
+ def get_available_tasks(parser: DatasetParser[Any]) -> list[str]:
+     """Get available tasks for the selected parser."""
+     if not hasattr(parser, "task_names"):
+         return ["default"]
+     return list(parser.task_names)
+
+
+ def format_entry_attributes(entry: ParseEntry) -> str:
+     """Format all attributes of a ParseEntry except question and answer."""
+     from dataclasses import fields
+
+     # Get all field names from the dataclass
+     field_names = [field.name for field in fields(entry)]
+     # Filter out question and answer
+     filtered_fields = [
+         name for name in field_names if name not in ["question", "answer"]
+     ]
+     # Build the formatted string
+     return "\n".join(f"{name}: {getattr(entry, name)}" for name in filtered_fields)
+
+
+ def load_and_parse(
+     parser_name: str, task_name: str | None, split_name: str | None
+ ) -> tuple[int, str, str, str, gr.Dropdown, str]:
+     """Load and parse the dataset, return the first entry and available splits."""
+     try:
+         parser = get_parser_instance(parser_name)
+
+         # Load the dataset
+         parser.load(
+             task_name=task_name if task_name != "default" else None,
+             split=split_name,
+             trust_remote_code=True,
+         )
+
+         # Get available splits after loading
+         available_splits = get_available_splits(parser)
+
+         # Parse the dataset
+         parser.parse(split_names=split_name, force=True)
+
+         # Get parsed data
+         parsed_data = parser.get_parsed_data
+
+         split_dropdown = gr.Dropdown(
+             choices=available_splits,
+             label="Select Split",
+             interactive=True,
+             value=None,
+             allow_custom_value=True,
+         )
+
+         info = parser.__repr__()
+         if not parsed_data:
+             return 0, "", "", "", split_dropdown, info
+
+         # Get the first entry
+         first_entry = parsed_data[0]
+
+         return (
+             0,  # Return first index instead of list of indices
+             first_entry.question,
+             first_entry.answer,
+             format_entry_attributes(first_entry),
+             split_dropdown,
+             info,
+         )
+     except Exception as e:
+         # Make the error message more user-friendly and detailed
+         error_msg = f"Failed to load dataset: {str(e)}\nParser: {parser_name}\nTask: {task_name}\nSplit: {split_name}"
+         return 0, error_msg, "", "", [], ""
+
+
+ def update_entry(
+     parsed_data_index: int | None, parser_name: str
+ ) -> tuple[str, str, str]:
+     """Update the displayed entry based on the selected index."""
+     try:
+         if not parser_name:
+             return "Please select a parser first", "", ""
+
+         parser = get_parser_instance(parser_name)
+         parsed_data = parser.get_parsed_data
+
+         if not parsed_data:
+             return "No data available", "", ""
+
+         if parsed_data_index is None:
+             # Random selection using secrets instead of random
+             random_index = secrets.randbelow(len(parsed_data))
+             entry = parsed_data[random_index]
+         else:
+             # Ensure index is within bounds
+             index = max(0, min(parsed_data_index, len(parsed_data) - 1))
+             entry = parsed_data[index]
+
+         return (
+             entry.question,
+             entry.answer,
+             format_entry_attributes(entry),
+         )
+     except Exception as e:
+         return f"Error: {str(e)}", "", ""
+
+
+ def update_parser_options(parser_name: str) -> tuple[gr.Dropdown, gr.Dropdown, str]:
+     """Update available tasks and splits for the selected parser."""
+     try:
+         parser = get_parser_instance(parser_name)
+         tasks = get_available_tasks(parser)
+         default_task = getattr(parser, "_default_task", "default")
+
+         # Update task dropdown
+         task_dropdown = gr.Dropdown(
+             choices=tasks,
+             value=default_task,
+             label="Select Task",
+             interactive=True,
+             allow_custom_value=True,
+         )
+
+         # Update split dropdown - Note the value is now explicitly None
+         splits = get_available_splits(parser)
+         split_dropdown = gr.Dropdown(
+             choices=splits,
+             label="Select Split",
+             interactive=True,
+             value=None,
+             allow_custom_value=True,
+         )
+
+         info = parser.__repr__()
+         return task_dropdown, split_dropdown, info
+     except Exception as e:
+         return (
+             gr.Dropdown(choices=["default"], value="default"),
+             gr.Dropdown(choices=[]),
+             f"Error: {str(e)}",
+         )
+
+
+ def clear_parser_cache() -> None:
+     """Clear the parser cache."""
+     get_parser_instance.cache_clear()
+
+
+ def format_dataset_description(description: DatasetDescription) -> str:
+     """Format DatasetDescription into a readable string."""
+     formatted = [
+         f"# {description.name}",
+         f"\n**Purpose**: {description.purpose}",
+         f"\n**Language**: {description.language}",
+         f"\n**Format**: {description.format}",
+         f"\n**Source**: {description.source}",
+         f"\n**Characteristics**: {description.characteristics}",
+     ]
+
+     if description.citation:
+         formatted.append(f"\n**Citation**:\n```\n{description.citation}\n```")
+
+     if description.additional_info:
+         formatted.append("\n**Additional Information**:")
+         for key, value in description.additional_info.items():
+             formatted.append(f"- {key}: {value}")
+
+     return "\n".join(formatted)
+
+
+ def get_primary_metrics(metrics: list[EvaluationMetric]) -> list[str]:
+     """Get list of primary metric names."""
+     return [metric.name for metric in metrics if metric.primary]
+
+
+ def format_metric_details(metric: EvaluationMetric) -> str:
+     """Format a single EvaluationMetric into a readable string."""
+     return f"""# {metric.name}<br>
+ **Type**: {metric.type}<br>
+ **Description**: {metric.description}"""
+
+
+ def update_dataset_info(parser_name: str) -> tuple:
+     """Update dataset description and evaluation metrics information."""
+     try:
+         parser = get_parser_instance(parser_name)
+         description = parser.get_dataset_description()
+         metrics = parser.get_evaluation_metrics()
+
+         # Format description
+         desc_text = format_dataset_description(description)
+
+         # Get primary metrics for dropdown
+         primary_metrics = get_primary_metrics(metrics)
+
+         # Format details for first metric (or empty if no metrics)
+         first_metric = metrics[0] if metrics else None
+         metric_details = format_metric_details(first_metric) if first_metric else ""
+
+         return (
+             gr.Markdown(value=desc_text),
+             gr.Dropdown(
+                 choices=primary_metrics,
+                 value=primary_metrics[0] if primary_metrics else None,
+             ),
+             gr.Markdown(value=metric_details),
+         )
+     except Exception as e:
+         return (
+             gr.Markdown(value=f"Error loading dataset description: {str(e)}"),
+             gr.Dropdown(choices=[]),
+             gr.Markdown(value=""),
+         )
+
+
+ def update_metric_details(metric_name: str, parser_name: str) -> str:
+     """Update the displayed metric details when selection changes."""
+     try:
+         parser = get_parser_instance(parser_name)
+         metrics = parser.get_evaluation_metrics()
+         selected_metric = next((m for m in metrics if m.name == metric_name), None)
+         return format_metric_details(selected_metric) if selected_metric else ""
+     except Exception as e:
+         return f"Error loading metric details: {str(e)}"
+
+
+ def get_parser_categories(parser_name: str) -> list[str]:
+     """Get categories for a specific parser."""
+     try:
+         parser = get_parser_instance(parser_name)
+         description = parser.get_dataset_description()
+         return description.category
+     except Exception:
+         return []
+
+
+ def filter_parsers_by_category(category: str | None) -> list[str]:
+     """Filter available parsers by category."""
+     if not category:
+         return ParserRegistry.list_parsers()
+
+     filtered_parsers = []
+     for parser_name in ParserRegistry.list_parsers():
+         categories = get_parser_categories(parser_name)
+         if category in categories:
+             filtered_parsers.append(parser_name)
+     return filtered_parsers
+
+
+ def create_interface() -> gr.Blocks:
+     """Create and return the Gradio interface."""
+     with gr.Blocks(css="footer {display: none !important}") as demo:
+         # Add header section with purpose and GitHub info
+         gr.Markdown("""
+         # LLM Evaluation Dataset Parser
+
+         ### 🎯 Purpose
+         A unified interface for parsing and exploring various LLM benchmark datasets (MMLU, MMLU-Pro, GSM8k, and more).
+         This tool helps researchers and developers to:
+         - Easily explore different benchmark datasets
+         - Access standardized parsing for multiple dataset formats
+         - View dataset descriptions and evaluation metrics
+
+         ### 🔗 Links
+         - [GitHub Repository](https://github.com/jeff52415/LLMDataParser)
+         - [Documentation](https://github.com/jeff52415/LLMDataParser#readme)
+
+         ---
+         """)
+
+         # State management
+         parser_state = gr.State("")
+         dataset_status = gr.Textbox(label="Dataset Status", interactive=False)
+
+         with gr.Tabs():
+             with gr.Tab("Dataset Explorer"):
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         # Add category dropdown before parser selection
+                         category_dropdown = gr.Dropdown(
+                             choices=["All"] + list(VALID_CATEGORIES),
+                             label="Filter by Category",
+                             value="All",
+                             interactive=True,
+                         )
+
+                         # Parser selection and controls
+                         available_parsers = ParserRegistry.list_parsers()
+                         parser_dropdown = gr.Dropdown(
+                             choices=available_parsers,
+                             label="Select Parser",
+                             value=available_parsers[0] if available_parsers else None,
+                             interactive=True,
+                             allow_custom_value=True,
+                         )
+                         task_dropdown = gr.Dropdown(
+                             choices=["default"],
+                             label="Select Task",
+                             value="default",
+                             interactive=True,
+                             allow_custom_value=True,
+                         )
+                         split_dropdown = gr.Dropdown(
+                             choices=[],
+                             label="Select Split",
+                             interactive=True,
+                             value=None,
+                             allow_custom_value=True,
+                         )
+                         load_button = gr.Button(
+                             "Load and Parse Dataset", variant="primary"
+                         )
+
+                         # Entry selection
+                         entry_index = gr.Number(
+                             label="Select Entry Index (empty for random)",
+                             precision=0,
+                             interactive=True,
+                         )
+                         update_button = gr.Button(
+                             "Update/Random Entry", variant="secondary"
+                         )
+
+                     with gr.Column(scale=2):
+                         # Output displays
+                         question_output = gr.Textbox(
+                             label="Question", lines=5, show_copy_button=True
+                         )
+                         answer_output = gr.Textbox(
+                             label="Answer", lines=5, show_copy_button=True
+                         )
+                         attributes_output = gr.Textbox(
+                             label="Other Attributes", lines=5, show_copy_button=True
+                         )
+
+             with gr.Tab("Dataset Information"):
+                 with gr.Row():
+                     with gr.Column(scale=2):
+                         # Dataset description
+                         dataset_description = gr.Markdown()
+
+                     with gr.Column(scale=1):
+                         # Evaluation metrics
+                         gr.Markdown("## Evaluation Metrics")
+                         metric_dropdown = gr.Dropdown(
+                             label="Select Primary Metric", interactive=True
+                         )
+                         metric_details = gr.Markdown()
+
+         # Add new event handler for category filtering
+         def update_parser_list(category: str) -> gr.Dropdown:
+             filtered_parsers = filter_parsers_by_category(
+                 None if category == "All" else category
+             )
+             return gr.Dropdown(
+                 choices=filtered_parsers,
+                 value=filtered_parsers[0] if filtered_parsers else None,
+             )
+
+         category_dropdown.change(
+             fn=update_parser_list, inputs=[category_dropdown], outputs=[parser_dropdown]
+         )
+
+         # Event handlers
+         parser_dropdown.change(
+             fn=update_parser_options,
+             inputs=parser_dropdown,
+             outputs=[
+                 task_dropdown,
+                 split_dropdown,
+                 dataset_status,
+             ],
+         ).then(lambda x: x, inputs=parser_dropdown, outputs=parser_state).then(
+             fn=update_dataset_info,
+             inputs=[parser_dropdown],
+             outputs=[dataset_description, metric_dropdown, metric_details],
+         )
+
+         load_button.click(
+             fn=load_and_parse,
+             inputs=[parser_dropdown, task_dropdown, split_dropdown],
+             outputs=[
+                 entry_index,
+                 question_output,
+                 answer_output,
+                 attributes_output,
+                 split_dropdown,
+                 dataset_status,
+             ],
+             api_name="load_and_parse",
+             show_progress="full",
+         ).then(
+             fn=update_dataset_info,
+             inputs=[parser_dropdown],
+             outputs=[dataset_description, metric_dropdown, metric_details],
+         )
+
+         update_button.click(
+             fn=update_entry,
+             inputs=[entry_index, parser_state],
+             outputs=[
+                 question_output,
+                 answer_output,
+                 attributes_output,
+             ],
+             api_name="update_entry",
+         )
+
+         metric_dropdown.change(
+             fn=update_metric_details,
+             inputs=[metric_dropdown, parser_dropdown],
+             outputs=metric_details,
+         )
+
+     return demo
+
+
+ if __name__ == "__main__":
+     print("Starting Gradio interface...")  # Add debug logging
+     demo = create_interface()
+     try:
+         demo.launch(
+             show_error=True,  # Changed to True for debugging
+         )
+     except Exception as e:
+         print(f"Error launching Gradio: {e}")  # Add error logging
+         import traceback
+
+         traceback.print_exc()
docker-compose.yml ADDED
@@ -0,0 +1,34 @@
+ version: "3.8"
+
+ services:
+   llmdataparser:
+     build: jeff52415/llmdataparser
+     environment:
+       - GRADIO_SERVER_PORT=7860
+     volumes:
+       - .:/app
+       - huggingface_cache:/app/.cache/huggingface
+     healthcheck:
+       test: ["CMD", "curl", "-f", "http://127.0.0.1:7860"]
+       interval: 30s
+       timeout: 10s
+       retries: 3
+     networks:
+       - internal
+
+   nginx:
+     image: nginx:alpine
+     ports:
+       - "80:80"
+     volumes:
+       - ./nginx.conf:/etc/nginx/nginx.conf:ro
+     depends_on:
+       - llmdataparser
+     networks:
+       - internal
+
+ networks:
+   internal:
+
+ volumes:
+   huggingface_cache:
docs/POETRY_JUPYTER_SETUP.md DELETED
@@ -1,51 +0,0 @@
- # Connecting Poetry Environment with Jupyter Notebook
-
- This guide provides simple steps to connect a Poetry-managed environment to Jupyter Notebook.
-
- ## Steps
-
- 1. **Activate the Poetry Environment**
-
-    First, navigate to your project directory and activate the Poetry shell:
-
-    ```bash
-    poetry shell
-    ```
-
- 2. **Install Jupyter as a Development Dependency**
-
-    If Jupyter is not already installed, add it as a development dependency:
-
-    ```bash
-    poetry add --group dev jupyter
-    ```
-
- 3. **Register the Poetry Environment as a Jupyter Kernel**
-
-    Run this command to make the Poetry environment available as a Jupyter kernel:
-
-    ```bash
-    python -m ipykernel install --user --name=llmdataparser-env --display-name "Python (LLMDataParser)"
-    ```
-
-    - `--name=llmdataparser-env`: Assigns a name to the kernel.
-    - `--display-name "Python (LLMDataParser)"`: Sets the display name seen in Jupyter.
-
- 4. **Start Jupyter Notebook**
-
-    Launch Jupyter Notebook from the Poetry shell:
-
-    ```bash
-    jupyter notebook
-    ```
-
- 5. **Select the Poetry Kernel in Jupyter**
-
-    - Open a notebook in Jupyter.
-    - Go to "Kernel" > "Change kernel" and select **Python (LLMDataParser)** from the list.
-
-    This connects the notebook to your Poetry environment.
-
- ---
-
- You’re now set up to use your Poetry environment within Jupyter Notebook!
docs/POETRY_USAGE.md DELETED
@@ -1,158 +0,0 @@
- # Poetry Usage Guide
-
- This guide provides instructions on how to use [Poetry](https://python-poetry.org/) to manage dependencies, install packages, and prepare your project for both development and production environments.
-
- ## Table of Contents
-
- - [Overview](#overview)
- - [Installing Poetry](#installing-poetry)
- - [Using Poetry in Development](#using-poetry-in-development)
-   - [Installing Dependencies](#installing-dependencies)
-   - [Updating Dependencies](#updating-dependencies)
-   - [Adding and Removing Dependencies](#adding-and-removing-dependencies)
-   - [Synchronizing Dependencies](#synchronizing-dependencies)
- - [Using Poetry in Production](#using-poetry-in-production)
-   - [Locking Dependencies](#locking-dependencies)
-   - [Installing from `poetry.lock`](#installing-from-poetrylock)
- - [Poetry Commands Summary](#poetry-commands-summary)
-
- ---
-
- ## Overview
-
- Poetry is a dependency manager and build tool for Python projects. It simplifies managing dependencies, creating virtual environments, and ensuring version consistency between development and production environments. Poetry relies on two files:
-
- - **`pyproject.toml`**: Defines the dependencies and configuration.
- - **`poetry.lock`**: Locks dependencies to specific versions to ensure consistency.
-
- ---
-
- ## Installing Poetry(macOS only)
-
- To install Poetry, use the following command:
-
- ```bash
- brew install poetry
- ```
-
- Refer to the [Poetry documentation](https://python-poetry.org/docs/#installation) for more options and OS-specific installation instructions.
-
- ---
-
- ## Using Poetry in Development
-
- ### Installing Dependencies
-
- In development, install dependencies specified in `pyproject.toml`:
-
- 1. Navigate to the project directory:
-
-    ```bash
-    cd path/to/project
-    ```
-
- 2. Run:
-    ```bash
-    poetry install
-    ```
-
- This command creates a virtual environment, installs all dependencies, and ensures they are compatible with the Python version specified.
-
- ### Updating Dependencies
-
- During development, you can update dependencies by editing `pyproject.toml` directly and then running:
-
- ```bash
- poetry install
- ```
-
- This will apply any changes and update the environment without manually adding each dependency.
-
- ### Adding and Removing Dependencies
-
- - **Add a New Dependency**:
-
-   ```bash
-   poetry add <package-name>
-   ```
-
-   Example:
-
-   ```bash
-   poetry add requests
-   ```
-
- - **Add a Development Dependency** (only used for development/testing):
-
-   ```bash
-   poetry add --group dev <package-name>
-   ```
-
-   Example:
-
-   ```bash
-   poetry add --group dev pytest
-   ```
-
- - **Remove a Dependency**:
-   ```bash
-   poetry remove <package-name>
-   ```
-
- ### Synchronizing Dependencies
-
- If the `pyproject.toml` or `poetry.lock` files are updated (e.g., after pulling changes), run:
-
- ```bash
- poetry install
- ```
-
- This keeps your environment synchronized with any updates made to the dependency files.
-
- ---
-
- ## Using Poetry in Production
-
- ### Locking Dependencies
-
- To lock dependencies for production use, run:
-
- ```bash
- poetry lock
- ```
-
- This creates or updates `poetry.lock`, which pins each dependency to a specific version. This lock file should be used to maintain consistency in production.
-
- ### Installing from `poetry.lock`
-
- In production, use `poetry.lock` to ensure exact dependency versions:
-
- 1. Install only the required (non-development) dependencies:
-    ```bash
-    poetry install --no-dev
-    ```
-
- This ensures that dependencies are installed exactly as defined in `poetry.lock`.
-
- ---
-
- ## Poetry Commands Summary
-
- | Command                        | Description                                                   |
- | ------------------------------ | ------------------------------------------------------------- |
- | `poetry install`               | Installs dependencies from `pyproject.toml` or `poetry.lock`.  |
- | `poetry add <package-name>`    | Adds a new dependency and updates `pyproject.toml`.            |
- | `poetry add --group dev <pkg>` | Adds a development-only dependency.                            |
- | `poetry remove <package-name>` | Removes a dependency and updates `pyproject.toml`.             |
- | `poetry update`                | Updates all dependencies to their latest compatible versions.  |
- | `poetry lock`                  | Locks dependencies to specific versions for production.        |
- | `poetry shell`                 | Activates the Poetry-managed virtual environment.              |
-
- ---
-
- ## Additional Resources
-
- - **Poetry Documentation**: [https://python-poetry.org/docs/](https://python-poetry.org/docs/)
- - **GitHub Repository**: [https://github.com/python-poetry/poetry](https://github.com/python-poetry/poetry)
-
- For further help, please refer to the [Poetry documentation](https://python-poetry.org/docs/).
docs/adding_new_parser.md ADDED
@@ -0,0 +1,178 @@
+ # Adding a New Dataset Parser
+
+ This guide explains how to add a new dataset parser to the llmdataparser library. The library is designed to make it easy to add support for new datasets while maintaining consistent interfaces and functionality.
+
+ ## Step-by-Step Guide
+
+ ### 1. Create a New Parser Class
+
+ Create a new file `your_dataset_parser.py` in the `llmdataparser` folder. Your parser should inherit from `HuggingFaceDatasetParser[T]` where T is your custom entry type.
+
+ ```python
+ from llmdataparser.base_parser import (
+     DatasetDescription,
+     EvaluationMetric,
+     HuggingFaceDatasetParser,
+     HuggingFaceParseEntry,
+ )
+
+ @dataclass(frozen=True, kw_only=True, slots=True)
+ class YourDatasetParseEntry(HuggingFaceParseEntry):
+     """Custom entry class for your dataset."""
+     # Add any additional fields specific to your dataset
+     custom_field: str
+
+     @classmethod
+     def create(cls, question: str, answer: str, raw_question: str,
+                raw_answer: str, task_name: str, custom_field: str) -> "YourDatasetParseEntry":
+         return cls(
+             question=question,
+             answer=answer,
+             raw_question=raw_question,
+             raw_answer=raw_answer,
+             task_name=task_name,
+             custom_field=custom_field
+         )
+
+ class YourDatasetParser(HuggingFaceDatasetParser[YourDatasetParseEntry]):
+     """Parser for your dataset."""
+
+     # Required class variables
+     _data_source = "huggingface/your-dataset"
+     _default_task = "default"
+     _task_names = ["task1", "task2", "task3"]
+ ```
+
+ ### 2. Implement Required Methods
+
+ Your parser needs to implement these key methods:
+
+ ```python
+ def process_entry(
+     self,
+     row: dict[str, Any],
+     task_name: str | None = None,
+     **kwargs: Any
+ ) -> YourDatasetParseEntry:
+     """Process a single dataset entry."""
+     # Extract data from the row
+     raw_question = row["question"]
+     raw_answer = row["answer"]
+     task = task_name or self._get_current_task(row)
+
+     question = f"Question: {raw_question}\nAnswer:"
+
+     return YourDatasetParseEntry.create(
+         question=question,
+         answer=raw_answer,
+         raw_question=raw_question,
+         raw_answer=raw_answer,
+         task_name=task,
+         custom_field=row["custom_field"]
+     )
+
+ def get_dataset_description(self) -> DatasetDescription:
+     """Returns description of your dataset."""
+     return DatasetDescription.create(
+         name="Your Dataset Name",
+         purpose="Purpose of the dataset",
+         source="Dataset source/URL",
+         language="Dataset language",
+         format="Data format (e.g., multiple choice, free text)",
+         characteristics="Key characteristics of the dataset",
+         citation="Dataset citation if available"
+     )
+
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+     """Returns recommended evaluation metrics."""
+     return [
+         EvaluationMetric.create(
+             name="metric_name",
+             type="metric_type",
+             description="Metric description",
+             implementation="implementation_details",
+             primary=True
+         )
+     ]
+ ```
+
+ ### 3. Add Example Usage
+
+ Add example usage at the bottom of your parser file:
+
+ ```python
+ if __name__ == "__main__":
+     # Example usage
+     parser = YourDatasetParser()
+     parser.load()
+     parser.parse()
+
+     # Get parsed data
+     parsed_data = parser.get_parsed_data
+
+     # Print example entry
+     if parsed_data:
+         example = parsed_data[0]
+         print("\nExample parsed entry:")
+         print(f"Question: {example.raw_question}")
+         print(f"Answer: {example.answer}")
+ ```
+
+ ### 4. Create Tests
+
+ Create a test file `tests/test_your_dataset_parser.py`:
+
+ ```python
+ import pytest
+ from llmdataparser.your_dataset_parser import YourDatasetParser, YourDatasetParseEntry
+
+ def test_parser_initialization():
+     parser = YourDatasetParser()
+     assert parser._data_source == "huggingface/your-dataset"
+     assert parser._default_task == "default"
+     assert "task1" in parser._task_names
+
+ def test_process_entry():
+     parser = YourDatasetParser()
+     sample_row = {
+         "question": "Sample question",
+         "answer": "Sample answer",
+         "custom_field": "Custom value"
+     }
+
+     entry = parser.process_entry(sample_row)
+     assert isinstance(entry, YourDatasetParseEntry)
+     assert entry.raw_question == "Sample question"
+     assert entry.custom_field == "Custom value"
+ ```
+
+ ## Best Practices
+
+ 1. **Type Safety**: Use type hints consistently and ensure your parser is properly typed.
+ 1. **Documentation**: Add clear docstrings and comments explaining your parser's functionality.
+ 1. **Error Handling**: Include appropriate error checking and validation.
+ 1. **Testing**: Write comprehensive tests covering different scenarios.
+
+ ## Examples
+
+ Look at existing parsers for reference:
+
+ - `mmlu_parser.py` for multiple-choice questions
+ - `gsm8k_parser.py` for math word problems
+ - `humaneval_parser.py` for code generation tasks
+
+ ## Common Patterns
+
+ 1. **Parse Entry Class**: Create a custom parse entry class if you need additional fields.
+ 1. **Task Names**: Define all available tasks in `_task_names`.
+ 1. **Process Entry**: Handle data extraction and formatting in `process_entry`.
+ 1. **Dataset Description**: Provide comprehensive dataset information.
+ 1. **Evaluation Metrics**: Define appropriate metrics for your dataset.
+
+ ## Testing Your Parser
+
+ 1. Run the example usage code to verify basic functionality
+ 1. Run pytest to execute your test cases
+ 1. Try different dataset splits and tasks
+ 1. Verify the parsed output format
+ 1. Check error handling with invalid inputs
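A new parser also has to be registered with `ParserRegistry` before `ParserRegistry.get_parser()` can find it; the registrations live in `llmdataparser/__init__.py`, shown in the next file. A minimal sketch, reusing the hypothetical `your_dataset_parser.py` / `YourDatasetParser` names from this guide (they are not parsers that ship with the library):

```python
# In llmdataparser/__init__.py -- "yourdataset" and YourDatasetParser are the
# hypothetical names from this guide, used here only for illustration.
from .your_dataset_parser import YourDatasetParser

ParserRegistry.register_parser("yourdataset", YourDatasetParser)

# Once registered, the parser behaves like the built-in ones:
# parser = ParserRegistry.get_parser("yourdataset")
# parser.load()
# parser.parse()
```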
llmdataparser/__init__.py CHANGED
@@ -1,8 +1,22 @@
  # llmdataparser/__init__.py
- from typing import Type
+ from typing import Any, Type

  from .base_parser import DatasetParser
- from .mmlu_parser import MMLUDatasetParser
+ from .bbh_parser import BBHDatasetParser
+ from .gsm8k_parser import GSM8KDatasetParser
+ from .humaneval_parser import HumanEvalDatasetParser, HumanEvalDatasetPlusParser
+ from .ifeval_parser import IFEvalDatasetParser
+ from .math_parser import MATHDatasetParser
+ from .mbpp_parser import MBPPDatasetParser
+ from .mgsm_parser import MGSMDatasetParser
+ from .mmlu_parser import (
+     BaseMMLUDatasetParser,
+     MMLUProDatasetParser,
+     MMLUReduxDatasetParser,
+     TMMLUPlusDatasetParser,
+ )
+ from .tmlu_parser import TMLUDatasetParser
+ from .tw_legal_parser import TWLegalDatasetParser


  class ParserRegistry:
@@ -17,11 +31,13 @@ class ParserRegistry:
          cls._registry[name.lower()] = parser_class

      @classmethod
-     def get_parser(cls, name: str, **kwargs) -> Type[DatasetParser]:
+     def get_parser(cls, name: str, **kwargs: Any) -> DatasetParser[Any]:
+         """Get a parser instance by name."""
          parser_class = cls._registry.get(name.lower())
          if parser_class is None:
              raise ValueError(f"Parser '{name}' is not registered.")
-         return parser_class(**kwargs)
+         parser: DatasetParser[Any] = parser_class(**kwargs)
+         return parser

      @classmethod
      def list_parsers(cls) -> list[str]:
@@ -30,4 +46,17 @@ class ParserRegistry:


  # Register parsers
- ParserRegistry.register_parser("mmlu", MMLUDatasetParser)
+ ParserRegistry.register_parser("mmlu", BaseMMLUDatasetParser)
+ ParserRegistry.register_parser("mmlupro", MMLUProDatasetParser)
+ ParserRegistry.register_parser("mmluredux", MMLUReduxDatasetParser)
+ ParserRegistry.register_parser("tmmluplus", TMMLUPlusDatasetParser)
+ ParserRegistry.register_parser("gsm8k", GSM8KDatasetParser)
+ ParserRegistry.register_parser("math", MATHDatasetParser)
+ ParserRegistry.register_parser("mgsm", MGSMDatasetParser)
+ ParserRegistry.register_parser("humaneval", HumanEvalDatasetParser)
+ ParserRegistry.register_parser("humanevalplus", HumanEvalDatasetPlusParser)
+ ParserRegistry.register_parser("bbh", BBHDatasetParser)
+ ParserRegistry.register_parser("mbpp", MBPPDatasetParser)
+ ParserRegistry.register_parser("ifeval", IFEvalDatasetParser)
+ ParserRegistry.register_parser("twlegal", TWLegalDatasetParser)
+ ParserRegistry.register_parser("tmlu", TMLUDatasetParser)
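For reference, a short sketch of how the expanded registry can be combined with the new `DatasetDescription.category` metadata (defined in `base_parser.py` below) to list parsers for a single category, mirroring what `app.py` does in `filter_parsers_by_category`; valid category names come from `VALID_CATEGORIES`:

```python
from llmdataparser import ParserRegistry


def parsers_in_category(category: str) -> list[str]:
    """Return names of registered parsers whose dataset description lists `category`."""
    matching = []
    for name in ParserRegistry.list_parsers():
        parser = ParserRegistry.get_parser(name)
        if category in parser.get_dataset_description().category:
            matching.append(name)
    return matching


# "Math" is one of the VALID_CATEGORIES in base_parser.py; the actual output
# depends on each parser's own metadata.
print(parsers_in_category("Math"))
```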
llmdataparser/base_parser.py CHANGED
@@ -1,7 +1,7 @@
  from abc import ABC, abstractmethod
  from dataclasses import dataclass
  from functools import lru_cache
- from typing import Any, Generic, TypeVar

  import datasets

@@ -9,17 +9,102 @@ import datasets
  T = TypeVar("T", bound="ParseEntry")


- @dataclass(frozen=True)
  class ParseEntry:
      """A simple base class for entries, customizable by each dataset parser."""


- class DatasetParser(ABC, Generic[T]):
      """
      Abstract base class defining the interface for all dataset parsers.
      """

-     def __init__(self):
          self._parsed_data: list[T] = []

      @abstractmethod
@@ -39,40 +124,189 @@ class DatasetParser(ABC, Generic[T]):
          return self._parsed_data

      @abstractmethod
-     def process_entry(self, row: dict[str, Any]) -> T:
-         pass


- # Base class for Hugging Face datasets
  class HuggingFaceDatasetParser(DatasetParser[T]):
      """
      Base class for parsers that use datasets from Hugging Face.
      """

-     _data_source: str  # Class variable for the dataset name

-     def __init__(self):
-         self.raw_data = None
-         self.task_names = []
          super().__init__()

-     def get_task_names(self) -> list[str]:
-         return self.task_names

      @staticmethod
      @lru_cache(maxsize=3)
      def load_dataset_cached(
-         data_source: str, config_name: str = "default", **kwargs: Any
-     ):
          """
          Cached static method to load a dataset from Hugging Face.
          """
-         return datasets.load_dataset(data_source, config_name, **kwargs)

      def load(
          self,
-         data_source: str | None = None,
-         config_name: str = "all",
          trust_remote_code: bool = True,
          split: str | None = None,
          **kwargs: Any,
@@ -80,21 +314,41 @@ class HuggingFaceDatasetParser(DatasetParser[T]):
          """
          Load the dataset using the Hugging Face datasets library.
          """
-         # Use class-level data_source if not provided
-         data_source = data_source or self._data_source
-         if not data_source:
-             raise ValueError("The 'data_source' class variable must be defined.")

          # Call the cached static method
-         self.raw_data = self.load_dataset_cached(
-             data_source,
-             config_name=config_name,
              trust_remote_code=trust_remote_code,
              split=split,
              **kwargs,
          )
-         self.task_names = list(self.raw_data.keys())
          print(
-             f"Loaded dataset with {len(self.task_names)} tasks: {', '.join(self.task_names)}."
          )
-         # Additional common initialization can be added here

  from abc import ABC, abstractmethod
  from dataclasses import dataclass
  from functools import lru_cache
+ from typing import Any, ClassVar, Generic, TypeVar

  import datasets

  T = TypeVar("T", bound="ParseEntry")


+ # Add this after the DatasetCategory definition
+ VALID_CATEGORIES = {
+     "Math",
+     "General Knowledge and Reasoning",
+     "Programming",
+     "MultiLingual",
+     "Taiwan",
+     "Advanced Reasoning",
+     "Legal",
+ }
+
+
+ @dataclass(frozen=True, kw_only=True, slots=True)
  class ParseEntry:
      """A simple base class for entries, customizable by each dataset parser."""

+     question: str
+     answer: str
+     raw_question: str
+     raw_answer: str
+
+
+ @dataclass(frozen=True, kw_only=True, slots=True)
+ class DatasetDescription:
+     """Standardized description of a dataset."""
+
+     name: str
+     purpose: str
+     source: str
+     language: str
+     format: str
+     category: list[str]
+     characteristics: str
+     citation: str | None = None
+     additional_info: dict[str, Any] | None = None
+
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         purpose: str,
+         source: str,
+         language: str,
+         format: str,
+         category: list[str],
+         characteristics: str,
+         citation: str | None = None,
+         additional_info: dict[str, Any] | None = None,
+     ) -> "DatasetDescription":
+         # Validate that all categories are valid DatasetCategory values
+         for item in category:
+             assert (
+                 item in VALID_CATEGORIES
+             ), f"Category '{item}' is not a valid category. Valid categories are: {VALID_CATEGORIES}"
+         return cls(
+             name=name,
+             purpose=purpose,
+             source=source,
+             language=language,
+             format=format,
+             category=category,
+             characteristics=characteristics,
+             citation=citation,
+             additional_info=additional_info,
+         )
+
+
+ @dataclass(frozen=True, kw_only=True, slots=True)
+ class EvaluationMetric:
+     """Description of an evaluation metric for a dataset."""

+     name: str
+     type: str
+     description: str
+     implementation: str
+     primary: bool
+
+     @classmethod
+     def create(
+         cls, name: str, type: str, description: str, implementation: str, primary: bool
+     ) -> "EvaluationMetric":
+         return cls(
+             name=name,
+             type=type,
+             description=description,
+             implementation=implementation,
+             primary=primary,
+         )
+
+
+ class DatasetParser(Generic[T], ABC):
      """
      Abstract base class defining the interface for all dataset parsers.
      """

+     def __init__(self) -> None:
          self._parsed_data: list[T] = []

      @abstractmethod

          return self._parsed_data

      @abstractmethod
+     def process_entry(
+         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
+     ) -> T:
+         """
+         Process a single entry from the dataset.
+
+         Args:
+             row: A dictionary representing a single entry from the dataset.
+             task_name: Optional task name for the entry.
+             **kwargs: Additional keyword arguments.
+
+         Returns:
+             T: The processed entry, typically an instance of a subclass of ParseEntry.
+         """
+
+     @abstractmethod
+     def get_dataset_description(self) -> DatasetDescription:
+         """Returns a standardized description of the dataset."""
+
+     @abstractmethod
+     def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+         """Returns the recommended evaluation metrics for the dataset."""
+
+
+ @dataclass(frozen=True, kw_only=True, slots=True)
+ class HuggingFaceParseEntry(ParseEntry):
+     """ParseEntry with an additional task_name field."""
+
+     task_name: str


  class HuggingFaceDatasetParser(DatasetParser[T]):
      """
      Base class for parsers that use datasets from Hugging Face.
      """

+     # _data_source is the name of the dataset, e.g. "lighteval/MATH"
+     _data_source: ClassVar[str]
+     # _task_names is the list of tasks in the dataset, e.g. ["algebra", "geometry", "statistics"]
+     _task_names: ClassVar[list[str]]
+     # _default_task is the default task to use if no task is specified, e.g. "algebra"
+     _default_task: ClassVar[str]
+     # _hidden_task_names is the list of task names that are hidden in the dataset, e.g. ["math", "physics", "chemistry"]
+     _hidden_task_names: ClassVar[list[str]] = []

+     def __init__(self, **kwargs: Any) -> None:
+         """
+         Initialize a HuggingFaceDatasetParser.
+
+         Args:
+             **kwargs: Additional keyword arguments passed to the parent class.
+         """
          super().__init__()
+         # raw_data is the dataset loaded from HuggingFace
+         self.raw_data: dict[str, Any] | None = None
+         # split_names is the list of splits in the dataset, e.g. ["train", "test", "validation"]
+         self.split_names: list[str] = []
+         # _current_task is the task currently being processed, e.g. "algebra"
+         self._current_task: str = ""
+
+     def _get_current_task(self, data_entry: dict[str, Any] | None = None) -> str:
+         """
+         Get the currently loaded task name.
+
+         Args:
+             data_entry: Optional dictionary containing entry data that might include task information
+
+         Returns:
+             str: The task name from either the data entry (if available) or the currently set task
+         """
+         # If data_entry is provided and contains task information, use it
+         if data_entry is not None and hasattr(self, "_get_task_from_entry"):
199
+ try:
200
+ task = self._get_task_from_entry(data_entry)
201
+ if isinstance(task, str): # Add type checking
202
+ return task
203
+ except (KeyError, AttributeError):
204
+ pass
205
 
206
+ # Otherwise return the task set during load()
207
+ return self._current_task or self._default_task
208
+
209
+ @property
210
+ def task_names(self) -> list[str]:
211
+ """Get all available task names."""
212
+ return self._task_names
213
+
214
+ @property
215
+ def total_tasks(self) -> int:
216
+ """Get total number of available tasks."""
217
+ return len(self._task_names)
218
+
219
+ @property
220
+ def get_huggingface_link(self) -> str:
221
+ return "https://huggingface.co/datasets/" + self._data_source
222
 
223
  @staticmethod
224
  @lru_cache(maxsize=3)
225
  def load_dataset_cached(
226
+ data_source: str,
227
+ task_name: str = "default",
228
+ trust_remote_code: bool = True,
229
+ **kwargs: Any,
230
+ ) -> datasets.Dataset:
231
  """
232
  Cached static method to load a dataset from Hugging Face.
233
  """
234
+ return datasets.load_dataset(
235
+ data_source, task_name, trust_remote_code=trust_remote_code, **kwargs
236
+ )
237
+
238
+ def parse(
239
+ self,
240
+ split_names: str | list[str] | None = None,
241
+ force: bool = False,
242
+ **kwargs: Any,
243
+ ) -> None:
244
+ """
245
+ Parse the loaded dataset splits into structured entries.
246
+
247
+ Args:
248
+ split_names: Dataset splits to parse. Can be:
249
+ - None: Parse all available splits
250
+ - str: Parse a single split (e.g., "train")
251
+ - list[str]: Parse multiple splits (e.g., ["train", "test"])
252
+ force: If True, overwrites existing parsed data without confirmation.
253
+ If False and parsed data exists, prompts for confirmation.
254
+ **kwargs: Additional keyword arguments passed to process_entry
255
+
256
+ Raises:
257
+ ValueError: If no data is loaded or if a specified split name doesn't exist
258
+ """
259
+ if self.raw_data is None:
260
+ raise ValueError("No data loaded. Please load the dataset first.")
261
+
262
+ if self._parsed_data and not force:
263
+ response = input(
264
+ f"Found {len(self._parsed_data)} existing parsed entries. "
265
+ "Do you want to overwrite them? [y/N]: "
266
+ ).lower()
267
+ if response not in ("y", "yes"):
268
+ print("Parsing cancelled. Existing data preserved.")
269
+ return
270
+
271
+ self._parsed_data.clear()
272
+
273
+ # Dataset with splits
274
+ if split_names is None:
275
+ split_names = self.split_names
276
+ elif isinstance(split_names, str):
277
+ split_names = [split_names]
278
+
279
+ for split_name in split_names:
280
+ if split_name not in self.split_names:
281
+ raise ValueError(f"Split '{split_name}' not found in the dataset.")
282
+
283
+ dataset_split = self.raw_data[split_name]
284
+ total_entries = len(dataset_split)
285
+ print(f"Processing {split_name} split with {total_entries} entries...")
286
+
287
+ for index, entry in enumerate(dataset_split, start=1):
288
+ try:
289
+ task_name = self._get_current_task(data_entry=entry)
290
+ parsed_entry = self.process_entry(entry, task_name, **kwargs)
291
+ self._parsed_data.append(parsed_entry)
292
+
293
+ # Print progress every 100 entries
294
+ if index % 100 == 0:
295
+ print(
296
+ f"Processed {index}/{total_entries} entries from '{split_name}'"
297
+ )
298
+
299
+ except Exception as e:
300
+ print(f"Error processing entry {index} in {split_name}: {str(e)}")
301
+ continue
302
+
303
+ print(f"Completed parsing {index} entries from '{split_name}'")
304
+
305
+ print(f"Total parsed entries: {len(self._parsed_data)}")
306
 
307
  def load(
308
  self,
309
+ task_name: str | None = None,
 
310
  trust_remote_code: bool = True,
311
  split: str | None = None,
312
  **kwargs: Any,
 
314
  """
315
  Load the dataset using the Hugging Face datasets library.
316
  """
317
+ # Set the task name
318
+ self._current_task = task_name or self._default_task
 
 
319
 
320
  # Call the cached static method
321
+ raw_data = self.load_dataset_cached(
322
+ self._data_source,
323
+ task_name=self._current_task,
324
  trust_remote_code=trust_remote_code,
325
  split=split,
326
  **kwargs,
327
  )
328
+
329
+ # Handle split-specific loading
330
+ if split:
331
+ self.raw_data = {split: raw_data}
332
+ self.split_names = [split]
333
+ else:
334
+ self.raw_data = raw_data
335
+ self.split_names = list(raw_data.keys())
336
+
337
  print(
338
+ f"Loaded dataset with {len(self.split_names)} groups: {', '.join(self.split_names)}."
339
+ )
340
+
341
+ def __repr__(self) -> str:
342
+ status = "loaded" if self.raw_data is not None else "not loaded"
343
+ parsed_count = len(self._parsed_data) if self._parsed_data else 0
344
+ return (
345
+ f"{self.__class__.__name__}("
346
+ f"data_source='{self._data_source}', "
347
+ f"task='{self._current_task}', "
348
+ f"status='{status}', "
349
+ f"parsed_entries={parsed_count}"
350
+ ")"
351
  )
352
+
353
+ def __str__(self) -> str:
354
+ return self.__repr__()
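Because `load_dataset_cached` is a `@staticmethod` wrapped in `functools.lru_cache(maxsize=3)`, repeated `load()` calls for the same data source and task reuse the already-loaded dataset instead of calling `datasets.load_dataset` again. A minimal sketch of that caching behaviour with a stand-in loader (the `demo_load` function is illustrative only, not part of the library):

from functools import lru_cache

@lru_cache(maxsize=3)
def demo_load(data_source: str, task_name: str = "default") -> tuple[str, str]:
    # Stand-in for datasets.load_dataset; the body only runs on a cache miss.
    print(f"loading {data_source} / {task_name} ...")
    return (data_source, task_name)

demo_load("lukaemon/bbh", "boolean_expressions")  # miss: prints and loads
demo_load("lukaemon/bbh", "boolean_expressions")  # hit: returns cached object
print(demo_load.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=3, currsize=1)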
llmdataparser/bbh_parser.py ADDED
@@ -0,0 +1,177 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar, List
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+
12
+ @dataclass(frozen=True, kw_only=True, slots=True)
13
+ class BBHParseEntry(HuggingFaceParseEntry):
14
+ """Custom entry class for BBH (Big Bench Hard), with fields specific to this dataset."""
15
+
16
+ @classmethod
17
+ def create(
18
+ cls,
19
+ question: str,
20
+ answer: str,
21
+ raw_question: str,
22
+ raw_answer: str,
23
+ task_name: str,
24
+ ) -> "BBHParseEntry":
25
+ return cls(
26
+ question=question,
27
+ answer=answer,
28
+ raw_question=raw_question,
29
+ raw_answer=raw_answer,
30
+ task_name=task_name,
31
+ )
32
+
33
+
34
+ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
35
+ """Parser for the Big Bench Hard dataset."""
36
+
37
+ _data_source: ClassVar[str] = "lukaemon/bbh"
38
+ _task_names: ClassVar[list[str]] = [
39
+ "boolean_expressions",
40
+ "causal_judgement",
41
+ "date_understanding",
42
+ "disambiguation_qa",
43
+ "dyck_languages",
44
+ "formal_fallacies",
45
+ "geometric_shapes",
46
+ "hyperbaton",
47
+ "logical_deduction_five_objects",
48
+ "logical_deduction_seven_objects",
49
+ "logical_deduction_three_objects",
50
+ "movie_recommendation",
51
+ "multistep_arithmetic_two",
52
+ "navigate",
53
+ "object_counting",
54
+ "penguins_in_a_table",
55
+ "reasoning_about_colored_objects",
56
+ "ruin_names",
57
+ "salient_translation_error_detection",
58
+ "snarks",
59
+ "sports_understanding",
60
+ "temporal_sequences",
61
+ "tracking_shuffled_objects_five_objects",
62
+ "tracking_shuffled_objects_seven_objects",
63
+ "tracking_shuffled_objects_three_objects",
64
+ "web_of_lies",
65
+ "word_sorting",
66
+ ]
67
+ _default_task: ClassVar[str] = "reasoning_about_colored_objects"
68
+
69
+ def process_entry(
70
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
71
+ ) -> BBHParseEntry:
72
+ """Process a single BBH entry."""
73
+ raw_question = row["input"]
74
+ raw_answer = row["target"]
75
+
76
+ # Remove parentheses from the answer
77
+ clean_answer = raw_answer.strip("()")
78
+
79
+ question = str(raw_question)
80
+
81
+ # Use task_name if provided, otherwise use default
82
+ task = task_name or self._get_current_task(row)
83
+
84
+ return BBHParseEntry.create(
85
+ question=question,
86
+ answer=clean_answer,
87
+ raw_question=raw_question,
88
+ raw_answer=raw_answer,
89
+ task_name=task,
90
+ )
91
+
92
+ def get_dataset_description(self) -> DatasetDescription:
93
+ """Returns a description of the Big Bench Hard dataset."""
94
+ return DatasetDescription.create(
95
+ name="Big Bench Hard (BBH)",
96
+ purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
97
+ source="https://github.com/suzgunmirac/BIG-Bench-Hard",
98
+ language="English",
99
+ format="Multiple choice questions with single correct answers",
100
+ characteristics=(
101
+ "Tasks require complex multi-step reasoning and were selected based on "
102
+ "initial model performance below human baseline. Performance can be "
103
+ "significantly improved through chain-of-thought prompting. The dataset "
104
+ "includes 23 core tasks plus additional related tasks."
105
+ ),
106
+ category=["Advanced Reasoning"],
107
+ citation=(
108
+ "@article{suzgun2022challenging,\n"
109
+ " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
110
+ ' author={Suzgun, Mirac and Scales, Nathan and Sch{"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
111
+ " journal={arXiv preprint arXiv:2210.09261},\n"
112
+ " year={2022}\n"
113
+ "}"
114
+ ),
115
+ additional_info={
116
+ "model_performance": (
117
+ "With chain-of-thought prompting, PaLM surpassed human performance on "
118
+ "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
119
+ ),
120
+ "size": "6.5k examples across 27 tasks (23 core + 4 related)",
121
+ },
122
+ )
123
+
124
+ def get_evaluation_metrics(self) -> List[EvaluationMetric]:
125
+ """Returns the recommended evaluation metrics for BBH dataset."""
126
+ return [
127
+ EvaluationMetric.create(
128
+ name="accuracy",
129
+ type="classification",
130
+ description="Proportion of exactly correct answers (after stripping parentheses)",
131
+ implementation="evaluate.load('accuracy')",
132
+ primary=True,
133
+ ),
134
+ EvaluationMetric.create(
135
+ name="human_eval_delta",
136
+ type="comparison",
137
+ description="Difference between model accuracy and average human-rater performance baseline",
138
+ implementation="custom_human_baseline_comparison",
139
+ primary=True,
140
+ ),
141
+ EvaluationMetric.create(
142
+ name="per_task_accuracy",
143
+ type="classification",
144
+ description="Accuracy broken down by individual reasoning tasks",
145
+ implementation="custom_task_accuracy",
146
+ primary=False,
147
+ ),
148
+ EvaluationMetric.create(
149
+ name="exact_match",
150
+ type="string_match",
151
+ description="Strict exact match between predicted and target answers",
152
+ implementation="evaluate.load('exact_match')",
153
+ primary=False,
154
+ ),
155
+ ]
156
+
157
+
158
+ if __name__ == "__main__":
159
+ # Example usage
160
+ parser = BBHDatasetParser()
161
+
162
+ # Load the dataset with a specific task
163
+ parser.load(task_name="reasoning_about_colored_objects")
164
+
165
+ # Parse all splits
166
+ parser.parse()
167
+
168
+ # Get parsed data
169
+ parsed_data = parser.get_parsed_data
170
+
171
+ # Print example entry
172
+ if parsed_data:
173
+ example = parsed_data[0]
174
+ print("\nExample parsed entry:")
175
+ print(f"Task: {example.task_name}")
176
+ print(f"Question: {example.question}")
177
+ print(f"Answer: {example.answer}")
llmdataparser/gsm8k_parser.py ADDED
@@ -0,0 +1,151 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+
12
+ @dataclass(frozen=True, kw_only=True, slots=True)
13
+ class GSM8KParseEntry(HuggingFaceParseEntry):
14
+ """Custom entry class for GSM8K, with fields specific to this dataset parser."""
15
+
16
+ solution: str
17
+ numerical_answer: int | float
18
+ task_name: str
19
+
20
+ @classmethod
21
+ def create(
22
+ cls,
23
+ question: str,
24
+ answer: str,
25
+ raw_question: str,
26
+ raw_answer: str,
27
+ solution: str,
28
+ numerical_answer: int | float,
29
+ task_name: str,
30
+ ) -> "GSM8KParseEntry":
31
+ return cls(
32
+ question=question,
33
+ answer=answer,
34
+ raw_question=raw_question,
35
+ raw_answer=raw_answer,
36
+ solution=solution,
37
+ numerical_answer=numerical_answer,
38
+ task_name=task_name,
39
+ )
40
+
41
+
42
+ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
43
+ """Parser for the GSM8K dataset."""
44
+
45
+ _data_source: ClassVar[str] = "openai/gsm8k"
46
+ _task_names: ClassVar[list[str]] = ["main", "socratic"]
47
+ _default_task: ClassVar[str] = "main"
48
+
49
+ def process_entry(
50
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
51
+ ) -> GSM8KParseEntry:
52
+ """Process a single GSM8K entry."""
53
+ task = task_name or self._get_current_task(row)
54
+ raw_question = row["question"]
55
+ raw_answer = row["answer"]
56
+
57
+ # Extract numerical answer (always after '####' in GSM8K)
58
+ numerical_str = raw_answer.split("####")[-1].strip().replace(",", "")
59
+ # Convert string to number
60
+ try:
61
+ numerical_answer = float(numerical_str)
62
+ if numerical_answer.is_integer():
63
+ numerical_answer = int(numerical_answer)
64
+ except ValueError:
65
+ raise ValueError(f"Could not convert '{numerical_str}' to number")
66
+
67
+ # Extract solution (everything before '####')
68
+ solution = raw_answer.split("####")[0].strip()
69
+
70
+ question = str(raw_question)
71
+
72
+ return GSM8KParseEntry.create(
73
+ question=question,
74
+ answer=str(numerical_answer),
75
+ raw_question=raw_question,
76
+ raw_answer=raw_answer,
77
+ solution=solution,
78
+ numerical_answer=numerical_answer, # Now guaranteed to be int or float
79
+ task_name=task, # Guarantee non-None
80
+ )
81
+
82
+ def get_dataset_description(self) -> DatasetDescription:
83
+ """Returns description of the GSM8K dataset."""
84
+ return DatasetDescription.create(
85
+ name="Grade School Math 8K (GSM8K)",
86
+ purpose="Evaluate mathematical reasoning capabilities through word problems",
87
+ source="OpenAI",
88
+ language="English",
89
+ format="Word problems with step-by-step solutions and numerical answers",
90
+ category=["Math"],
91
+ characteristics=(
92
+ "Collection of 8.5K grade school math word problems that require "
93
+ "multi-step reasoning. Problems gradually increase in difficulty "
94
+ "and cover basic arithmetic, word problems, and elementary algebra"
95
+ ),
96
+ citation="""@article{cobbe2021gsm8k,
97
+ title={Training Verifiers to Solve Math Word Problems},
98
+ author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
99
+ journal={arXiv preprint arXiv:2110.14168},
100
+ year={2021}
101
+ }""",
102
+ )
103
+
104
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
105
+ """Returns recommended evaluation metrics for GSM8K."""
106
+ return [
107
+ EvaluationMetric.create(
108
+ name="exact_match",
109
+ type="string",
110
+ description="Exact match comparison between predicted and correct numerical answers",
111
+ implementation="custom_exact_match",
112
+ primary=True,
113
+ ),
114
+ EvaluationMetric.create(
115
+ name="solution_validity",
116
+ type="text",
117
+ description="Assessment of whether the solution steps are mathematically valid and complete",
118
+ implementation="custom_solution_validator",
119
+ primary=True,
120
+ ),
121
+ EvaluationMetric.create(
122
+ name="step_accuracy",
123
+ type="numerical",
124
+ description="Accuracy of intermediate calculation steps (e.g., <<48/2=24>>)",
125
+ implementation="custom_step_accuracy",
126
+ primary=True,
127
+ ),
128
+ EvaluationMetric.create(
129
+ name="step_count",
130
+ type="numerical",
131
+ description="Analysis of the number of reasoning steps in solutions",
132
+ implementation="custom_step_counter",
133
+ primary=False,
134
+ ),
135
+ ]
136
+
137
+
138
+ if __name__ == "__main__":
139
+ from pprint import pprint
140
+
141
+ parser = GSM8KDatasetParser()
142
+ parser.load()
143
+ parser.parse()
144
+
145
+ parsed_data = parser.get_parsed_data
146
+ pprint(parsed_data[0].question)
147
+ pprint(parsed_data[0].answer)
148
+ pprint(parsed_data[0].raw_question)
149
+ pprint(parsed_data[0].raw_answer)
150
+ pprint(parsed_data[0].solution)
151
+ pprint(parsed_data[0].numerical_answer)
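The `exact_match` metric for GSM8K points at a `custom_exact_match` implementation that is not included in this commit. One plausible sketch reuses the same `####`-based extraction that `process_entry` applies to reference answers (the helper name is an assumption):

def gsm8k_exact_match(prediction: str, reference_numerical_answer: int | float) -> bool:
    # Take the text after the final '####', drop commas, then compare numerically.
    candidate = prediction.split("####")[-1].strip().replace(",", "").rstrip(".")
    try:
        return float(candidate) == float(reference_numerical_answer)
    except ValueError:
        return False

print(gsm8k_exact_match("... so the total is #### 1,080", 1080))  # True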
llmdataparser/humaneval_parser.py ADDED
@@ -0,0 +1,273 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+
12
+ @dataclass(frozen=True, kw_only=True, slots=True)
13
+ class HumanEvalParseEntry(HuggingFaceParseEntry):
14
+ """Custom entry class for HumanEval, with fields specific to this dataset parser."""
15
+
16
+ task_id: str
17
+ task_name: str
18
+ entry_point: str
19
+ test: str
20
+
21
+ @classmethod
22
+ def create(
23
+ cls,
24
+ question: str,
25
+ answer: str,
26
+ raw_question: str,
27
+ task_id: str,
28
+ entry_point: str,
29
+ test: str,
30
+ task_name: str,
31
+ ) -> "HumanEvalParseEntry":
32
+ if not task_id:
33
+ raise ValueError("Task ID cannot be empty")
34
+ if not entry_point:
35
+ raise ValueError("Entry point cannot be empty")
36
+ return cls(
37
+ question=question,
38
+ answer=answer,
39
+ raw_question=raw_question,
40
+ raw_answer=answer, # In HumanEval, the canonical solution is the raw answer
41
+ task_id=task_id,
42
+ entry_point=entry_point,
43
+ test=test,
44
+ task_name=task_name,
45
+ )
46
+
47
+
48
+ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
49
+ """Parser for the HumanEval dataset."""
50
+
51
+ _data_source: ClassVar[str] = "openai/openai_humaneval"
52
+ _default_task: ClassVar[str] = "openai_humaneval"
53
+ _task_names: ClassVar[list[str]] = ["openai_humaneval"]
54
+
55
+ def process_entry(
56
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
57
+ ) -> HumanEvalParseEntry:
58
+ """Process a single HumanEval entry."""
59
+ raw_question = row["prompt"]
60
+ answer = row["canonical_solution"]
61
+ task_id = row["task_id"]
62
+ entry_point = row["entry_point"]
63
+ test = row["test"]
64
+
65
+ question = str(raw_question)
66
+
67
+ # Use task_name if provided, otherwise use default
68
+ task = task_name or self._get_current_task(row)
69
+
70
+ return HumanEvalParseEntry.create(
71
+ question=question,
72
+ answer=answer,
73
+ raw_question=raw_question,
74
+ task_id=task_id,
75
+ entry_point=entry_point,
76
+ test=test,
77
+ task_name=task, # Guarantee non-None
78
+ )
79
+
80
+ def get_dataset_description(self) -> DatasetDescription:
81
+ """Returns description of the HumanEval dataset."""
82
+ return DatasetDescription.create(
83
+ name="HumanEval",
84
+ purpose="Evaluate code generation capabilities through Python programming tasks",
85
+ source="OpenAI",
86
+ language="Python",
87
+ format="Function signatures with docstrings and unit tests",
88
+ category=["Programming"],
89
+ characteristics=(
90
+ "Collection of 164 hand-written Python programming problems. Each problem "
91
+ "includes a function signature, docstring, example test cases, and hidden unit "
92
+ "tests. Problems test basic programming, algorithms, and data structure skills"
93
+ ),
94
+ citation="""@article{chen2021codex,
95
+ title={Evaluating Large Language Models Trained on Code},
96
+ author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
97
+ year={2021},
98
+ eprint={2107.03374},
99
+ archivePrefix={arXiv},
100
+ primaryClass={cs.LG}
101
+ }""",
102
+ )
103
+
104
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
105
+ """Returns recommended evaluation metrics for HumanEval."""
106
+ return [
107
+ EvaluationMetric.create(
108
+ name="pass@k",
109
+ type="code",
110
+ description="Probability that correct solution appears at least once in k samples",
111
+ implementation="custom_pass_at_k",
112
+ primary=True,
113
+ ),
114
+ EvaluationMetric.create(
115
+ name="test_success_rate",
116
+ type="code",
117
+ description="Percentage of test cases passed by the generated solution",
118
+ implementation="custom_test_executor",
119
+ primary=False,
120
+ ),
121
+ EvaluationMetric.create(
122
+ name="type_correctness",
123
+ type="code",
124
+ description="Verification of type hints and type safety in generated code",
125
+ implementation="custom_type_checker",
126
+ primary=False,
127
+ ),
128
+ EvaluationMetric.create(
129
+ name="code_style",
130
+ type="code",
131
+ description="Compliance with Python best practices and PEP 8 guidelines",
132
+ implementation="custom_style_checker",
133
+ primary=False,
134
+ ),
135
+ EvaluationMetric.create(
136
+ name="runtime_efficiency",
137
+ type="code",
138
+ description="Analysis of time and space complexity of the solution",
139
+ implementation="custom_complexity_analyzer",
140
+ primary=False,
141
+ ),
142
+ ]
143
+
144
+
145
+ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
146
+ """Parser for the enhanced HumanEval Plus dataset with 80x more comprehensive test coverage."""
147
+
148
+ _data_source: ClassVar[str] = "evalplus/humanevalplus"
149
+ _default_task: ClassVar[str] = "default"
150
+ _task_names: ClassVar[list[str]] = ["default"]
151
+
152
+ def process_entry(
153
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
154
+ ) -> HumanEvalParseEntry:
155
+ """Process a single HumanEval entry."""
156
+ raw_question = row["prompt"]
157
+ answer = row["canonical_solution"]
158
+ task_id = row["task_id"]
159
+ entry_point = row["entry_point"]
160
+ test = row["test"]
161
+
162
+ question = str(raw_question)
163
+ # Use task_name if provided, otherwise use default
164
+ task = task_name or self._get_current_task(row)
165
+
166
+ return HumanEvalParseEntry.create(
167
+ question=question,
168
+ answer=answer,
169
+ raw_question=raw_question,
170
+ task_id=task_id,
171
+ entry_point=entry_point,
172
+ test=test,
173
+ task_name=task, # task is guaranteed to be str from _get_current_task
174
+ )
175
+
176
+ def get_dataset_description(self) -> DatasetDescription:
177
+ """Returns description of the HumanEval Plus dataset."""
178
+ return DatasetDescription.create(
179
+ name="HumanEval Plus",
180
+ purpose="Enhanced evaluation of code generation with 80x more test coverage",
181
+ source="EvalPlus",
182
+ language="Python",
183
+ format="Function signatures with docstrings and comprehensive test suites",
184
+ category=["Programming"],
185
+ characteristics=(
186
+ "Significantly enhanced version of HumanEval with 80x more test cases. "
187
+ "Includes extensive edge cases, boundary conditions, stress tests, and "
188
+ "error handling scenarios to rigorously evaluate code correctness and robustness. "
189
+ "Each problem has been augmented with comprehensive testing to catch subtle bugs "
190
+ "and ensure production-quality code generation."
191
+ ),
192
+ citation="""@inproceedings{evalplus,
193
+ title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
194
+ author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
195
+ booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
196
+ year = {2023},
197
+ url = {https://openreview.net/forum?id=1qvx610Cu7},
198
+ }""",
199
+ )
200
+
201
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
202
+ """Returns recommended evaluation metrics for HumanEval Plus."""
203
+ return [
204
+ EvaluationMetric.create(
205
+ name="pass@k",
206
+ type="code",
207
+ description="Probability that correct solution appears at least once in k samples",
208
+ implementation="custom_pass_at_k",
209
+ primary=True,
210
+ ),
211
+ EvaluationMetric.create(
212
+ name="test_coverage",
213
+ type="code",
214
+ description="Percentage of edge cases and stress tests passed by the solution",
215
+ implementation="custom_coverage_checker",
216
+ primary=False,
217
+ ),
218
+ EvaluationMetric.create(
219
+ name="error_handling",
220
+ type="code",
221
+ description="Assessment of solution's robustness in handling invalid inputs and edge cases",
222
+ implementation="custom_error_handler",
223
+ primary=False,
224
+ ),
225
+ EvaluationMetric.create(
226
+ name="performance_stress",
227
+ type="code",
228
+ description="Evaluation of solution performance under high load and stress conditions",
229
+ implementation="custom_stress_tester",
230
+ primary=False,
231
+ ),
232
+ EvaluationMetric.create(
233
+ name="code_quality",
234
+ type="code",
235
+ description="Analysis of code readability, maintainability and adherence to Python best practices",
236
+ implementation="custom_quality_checker",
237
+ primary=False,
238
+ ),
239
+ ]
240
+
241
+
242
+ if __name__ == "__main__":
243
+ # Example usage
244
+ parser = HumanEvalDatasetParser()
245
+
246
+ # Load the dataset
247
+ parser.load()
248
+
249
+ # Parse all splits
250
+ parser.parse()
251
+
252
+ # Get parsed data
253
+ parsed_data = parser.get_parsed_data
254
+
255
+ # Print example entry
256
+ if parsed_data:
257
+ example = parsed_data[0]
258
+ print("\nExample parsed entry:")
259
+ print(f"Task ID: {example.task_id}")
260
+ print(f"Entry Point: {example.entry_point}")
261
+ print(f"Question:\n{example.question}")
262
+ print(f"Solution:\n{example.answer}")
263
+
264
+ parser = HumanEvalDatasetPlusParser()
265
+ parser.load()
266
+ parser.parse()
267
+ parsed_data = parser.get_parsed_data
268
+ if parsed_data:
269
+ example = parsed_data[0]
270
+ print("\nExample parsed entry:")
271
+ print(f"Task: {example.task_name}")
272
+ print(f"Question: {example.raw_question}")
273
+ print(f"Correct Answer: {example.answer}")
llmdataparser/ifeval_parser.py ADDED
@@ -0,0 +1,164 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar, List
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+
12
+ @dataclass(frozen=True, kw_only=True, slots=True)
13
+ class IFEvalParseEntry(HuggingFaceParseEntry):
14
+ """Custom entry class for IFEval, with fields specific to this dataset parser."""
15
+
16
+ key: int
17
+ instruction_id_list: List[str]
18
+ kwargs: dict[str, Any]
19
+
20
+ @classmethod
21
+ def create(
22
+ cls,
23
+ question: str,
24
+ answer: str,
25
+ raw_question: str,
26
+ raw_answer: str,
27
+ key: int,
28
+ instruction_id_list: List[str],
29
+ kwargs: dict[str, Any],
30
+ task_name: str,
31
+ ) -> "IFEvalParseEntry":
32
+ return cls(
33
+ question=question,
34
+ answer=answer,
35
+ raw_question=raw_question,
36
+ raw_answer=raw_answer,
37
+ key=key,
38
+ instruction_id_list=instruction_id_list,
39
+ kwargs=kwargs,
40
+ task_name=task_name,
41
+ )
42
+
43
+
44
+ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
45
+ """Parser for the IFEval dataset."""
46
+
47
+ _data_source: ClassVar[str] = "google/IFEval"
48
+ _default_task: ClassVar[str] = "default"
49
+ _task_names: ClassVar[list[str]] = ["default"]
50
+
51
+ def process_entry(
52
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
53
+ ) -> IFEvalParseEntry:
54
+ """Process a single IFEval entry."""
55
+ # Extract fields from the row
56
+ key = row["key"]
57
+ raw_question = row["prompt"] # The prompt is the raw question in this case
58
+ instruction_id_list = row["instruction_id_list"]
59
+ kwargs_data = row["kwargs"]
60
+
61
+ # For IFEval, we don't have explicit answers in the dataset
62
+ # We'll use empty strings as placeholders
63
+ answer = ""
64
+ raw_answer = ""
65
+
66
+ question = str(raw_question)
67
+
68
+ # Use task_name if provided, otherwise use default
69
+ task = task_name or self._get_current_task(row)
70
+
71
+ return IFEvalParseEntry.create(
72
+ question=question,
73
+ answer=answer,
74
+ raw_question=raw_question,
75
+ raw_answer=raw_answer,
76
+ key=key,
77
+ instruction_id_list=instruction_id_list,
78
+ kwargs=kwargs_data,
79
+ task_name=task,
80
+ )
81
+
82
+ def get_dataset_description(self) -> DatasetDescription:
83
+ """Returns description of the IFEval dataset."""
84
+ return DatasetDescription.create(
85
+ name="IFEval",
86
+ purpose="Evaluate instruction following capabilities through verifiable instructions",
87
+ source="Google Research",
88
+ language="English (BCP-47 en)",
89
+ format="Verifiable instruction prompts with automated evaluation criteria",
90
+ category=["Programming"],
91
+ characteristics=(
92
+ "Collection of approximately 500 verifiable instructions designed to evaluate "
93
+ "language models' instruction-following capabilities. Instructions include "
94
+ "specific, measurable criteria like 'write in more than 400 words' or "
95
+ "'mention the keyword AI at least 3 times' that can be verified through "
96
+ "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
97
+ "for evaluating chat or instruction fine-tuned language models."
98
+ ),
99
+ citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
100
+ title={Instruction-Following Evaluation for Large Language Models},
101
+ author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
102
+ year={2023},
103
+ eprint={2311.07911},
104
+ archivePrefix={arXiv},
105
+ primaryClass={cs.CL},
106
+ url={https://arxiv.org/abs/2311.07911}
107
+ }""",
108
+ )
109
+
110
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
111
+ """Returns recommended evaluation metrics for IFEval."""
112
+ return [
113
+ EvaluationMetric.create(
114
+ name="format_compliance",
115
+ type="text",
116
+ description="Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
117
+ implementation="custom_format_checker",
118
+ primary=True,
119
+ ),
120
+ EvaluationMetric.create(
121
+ name="length_constraints",
122
+ type="text",
123
+ description="Checks if the response meets word, sentence, or paragraph count requirements",
124
+ implementation="custom_length_validator",
125
+ primary=True,
126
+ ),
127
+ EvaluationMetric.create(
128
+ name="punctuation_rules",
129
+ type="text",
130
+ description="Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
131
+ implementation="custom_punctuation_checker",
132
+ primary=True,
133
+ ),
134
+ EvaluationMetric.create(
135
+ name="keyword_usage",
136
+ type="text",
137
+ description="Verifies correct usage of required keywords or avoidance of forbidden words",
138
+ implementation="custom_keyword_validator",
139
+ primary=False,
140
+ ),
141
+ EvaluationMetric.create(
142
+ name="structural_requirements",
143
+ type="text",
144
+ description="Checks for specific structural elements like sections, paragraphs, or formatting patterns",
145
+ implementation="custom_structure_validator",
146
+ primary=False,
147
+ ),
148
+ ]
149
+
150
+
151
+ if __name__ == "__main__":
152
+ # Example usage
153
+ parser = IFEvalDatasetParser()
154
+ parser.load()
155
+ parser.parse()
156
+
157
+ parsed_data = parser.get_parsed_data
158
+ if parsed_data:
159
+ example = parsed_data[0]
160
+ print("\nExample parsed entry:")
161
+ print(f"Key: {example.key}")
162
+ print(f"Question: {example.question}")
163
+ print(f"Instruction IDs: {example.instruction_id_list}")
164
+ print(f"kwargs: {example.kwargs}")
llmdataparser/math_parser.py ADDED
@@ -0,0 +1,189 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+
12
+ @dataclass(frozen=True, kw_only=True, slots=True)
13
+ class MATHParseEntry(HuggingFaceParseEntry):
14
+ """Custom entry class for MATH dataset, with fields specific to this dataset parser."""
15
+
16
+ level: str
17
+ task_name: str
18
+ solution: str
19
+
20
+ @classmethod
21
+ def create(
22
+ cls,
23
+ question: str,
24
+ answer: str,
25
+ raw_question: str,
26
+ raw_answer: str,
27
+ level: str,
28
+ task_name: str,
29
+ solution: str,
30
+ ) -> "MATHParseEntry":
31
+ return cls(
32
+ question=question,
33
+ answer=answer,
34
+ raw_question=raw_question,
35
+ raw_answer=raw_answer,
36
+ level=level,
37
+ task_name=task_name,
38
+ solution=solution,
39
+ )
40
+
41
+
42
+ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
43
+ """Parser for the MATH dataset."""
44
+
45
+ _data_source: ClassVar[str] = "lighteval/MATH"
46
+ _task_names: ClassVar[list[str]] = [
47
+ "algebra",
48
+ "geometry",
49
+ "calculus",
50
+ "prealgebra",
51
+ "intermediate_algebra",
52
+ "number_theory",
53
+ "precalculus",
54
+ "all",
55
+ ]
56
+ _default_task: ClassVar[str] = "all"
57
+
58
+ _valid_levels: ClassVar[set[str]] = {
59
+ f"Level {i}" for i in range(1, 6)
60
+ } # Levels 1-5 are valid
61
+
62
+ def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
63
+ """Get the task name from the data entry or fall back to current task."""
64
+ entry_type: str = data_entry.get("type", "")
65
+ if entry_type and (entry_type in self._task_names):
66
+ return entry_type
67
+ return self._current_task or self._default_task
68
+
69
+ def process_entry(
70
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
71
+ ) -> MATHParseEntry:
72
+ """Process a single MATH dataset entry."""
73
+ task = task_name or self._get_current_task(row)
74
+
75
+ # Validate and normalize level
76
+ level = row.get("level")
77
+ if level not in self._valid_levels:
78
+ level = "Unknown"
79
+
80
+ return MATHParseEntry.create(
81
+ question=str(row["problem"]),
82
+ answer=row["solution"],
83
+ raw_question=row["problem"],
84
+ raw_answer=row["solution"],
85
+ level=level,
86
+ task_name=task,
87
+ solution=row["solution"],
88
+ )
89
+
90
+ def get_dataset_description(self) -> DatasetDescription:
91
+ """Returns description of the MATH dataset."""
92
+ return DatasetDescription.create(
93
+ name="MATH",
94
+ purpose="Measure mathematical problem-solving capabilities in machine learning models",
95
+ source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
96
+ language="English",
97
+ format="Competition mathematics problems with step-by-step solutions",
98
+ category=["Math"],
99
+ characteristics=(
100
+ "Collection of 12,500 challenging competition mathematics problems designed to "
101
+ "evaluate mathematical reasoning. Problems include step-by-step solutions that "
102
+ "can be used to teach models to generate answer derivations and explanations. "
103
+ "Problems are categorized by subject area and difficulty level (1-5)."
104
+ ),
105
+ citation="""@article{hendrycksmath2021,
106
+ title={Measuring Mathematical Problem Solving With the MATH Dataset},
107
+ author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
108
+ journal={NeurIPS},
109
+ year={2021}
110
+ }""",
111
+ additional_info={
112
+ "difficulty_levels": "1-5",
113
+ "topics": [
114
+ "algebra",
115
+ "geometry",
116
+ "calculus",
117
+ "prealgebra",
118
+ "intermediate_algebra",
119
+ "number_theory",
120
+ "precalculus",
121
+ ],
122
+ "size": "12,500 problems",
123
+ "evaluation_note": "Exact match equivalence calculated using sympy library",
124
+ "homepage": "https://github.com/hendrycks/math",
125
+ },
126
+ )
127
+
128
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
129
+ """Returns recommended evaluation metrics for MATH dataset."""
130
+ return [
131
+ EvaluationMetric.create(
132
+ name="symbolic_equivalence",
133
+ type="exact_match",
134
+ description="Verifies answer correctness using symbolic mathematics (e.g., sympy) to check mathematical equivalence.",
135
+ implementation="sympy_equivalence_checker",
136
+ primary=True,
137
+ ),
138
+ EvaluationMetric.create(
139
+ name="solution_presence",
140
+ type="text",
141
+ description="Ensures that a complete step-by-step solution is provided, demonstrating how the answer is derived.",
142
+ implementation="solution_completeness_checker",
143
+ primary=True,
144
+ ),
145
+ EvaluationMetric.create(
146
+ name="reasoning_validity",
147
+ type="text",
148
+ description="Evaluates the logical correctness and mathematical reasoning in the solution's derivation steps.",
149
+ implementation="reasoning_validator",
150
+ primary=True,
151
+ ),
152
+ EvaluationMetric.create(
153
+ name="mathematical_notation",
154
+ type="text",
155
+ description="Checks for the correct use of mathematical notation and symbolic representation to ensure clarity.",
156
+ implementation="notation_validator",
157
+ primary=False,
158
+ ),
159
+ EvaluationMetric.create(
160
+ name="solution_clarity",
161
+ type="text",
162
+ description="Assesses the clarity, readability, and coherence of the solution steps to enhance interpretability.",
163
+ implementation="clarity_scorer",
164
+ primary=False,
165
+ ),
166
+ ]
167
+
168
+
169
+ if __name__ == "__main__":
170
+ # Example usage of MATH parser
171
+ parser = MATHDatasetParser()
172
+
173
+ # Load the dataset
174
+ parser.load()
175
+
176
+ # Parse all splits
177
+ parser.parse()
178
+
179
+ # Get parsed data
180
+ parsed_data = parser.get_parsed_data
181
+
182
+ # Print example entry
183
+ if parsed_data:
184
+ example = parsed_data[0]
185
+ print("\nExample parsed entry:")
186
+ print(f"Task: {example.task_name}")
187
+ print(f"Level: {example.level}")
188
+ print(f"Question: {example.question}")
189
+ print(f"Solution: {example.solution}")
llmdataparser/mbpp_parser.py ADDED
@@ -0,0 +1,174 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+
12
+ @dataclass(frozen=True, kw_only=True, slots=True)
13
+ class MBPPParseEntry(HuggingFaceParseEntry):
14
+ """Custom entry class for MBPP, with fields specific to this dataset parser."""
15
+
16
+ task_id: int
17
+ test_list: list[str]
18
+ test_setup_code: str
19
+ challenge_test_list: list[str]
20
+ source_file: str
21
+
22
+ @classmethod
23
+ def create(
24
+ cls,
25
+ question: str,
26
+ answer: str,
27
+ raw_question: str,
28
+ task_id: int,
29
+ test_list: list[str],
30
+ test_setup_code: str,
31
+ challenge_test_list: list[str],
32
+ task_name: str,
33
+ source_file: str,
34
+ ) -> "MBPPParseEntry":
35
+ if not isinstance(task_id, int):
36
+ raise ValueError("Task ID must be an integer")
37
+
38
+ return cls(
39
+ question=question,
40
+ answer=answer,
41
+ raw_question=raw_question,
42
+ raw_answer=answer, # In MBPP, the code solution is the raw answer
43
+ task_id=task_id,
44
+ test_list=test_list,
45
+ test_setup_code=test_setup_code,
46
+ challenge_test_list=challenge_test_list,
47
+ task_name=task_name,
48
+ source_file=source_file,
49
+ )
50
+
51
+
52
+ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
53
+ """Parser for the MBPP (Mostly Basic Python Programming) dataset."""
54
+
55
+ _data_source: ClassVar[str] = "google-research-datasets/mbpp"
56
+ _default_task: ClassVar[str] = "full" # Can be 'full' or 'sanitized'
57
+ _task_names: ClassVar[list[str]] = ["full", "sanitized"]
58
+
59
+ def process_entry(
60
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
61
+ ) -> MBPPParseEntry:
62
+ """Process a single MBPP entry."""
63
+ raw_question = row.get("text", row.get("prompt"))
64
+ answer = row["code"]
65
+ task_id = row["task_id"]
66
+ test_list = row["test_list"]
67
+ test_setup_code = row.get("test_setup_code", "")
68
+ challenge_test_list = row.get("challenge_test_list", [])
69
+
70
+ question = str(raw_question)
71
+
72
+ # Use task_name if provided, otherwise use default
73
+ task = task_name or self._get_current_task(row)
74
+ source_file = row.get("source_file", "")
75
+
76
+ return MBPPParseEntry.create(
77
+ question=question,
78
+ answer=answer,
79
+ raw_question=raw_question,
80
+ task_id=task_id,
81
+ test_list=test_list,
82
+ test_setup_code=test_setup_code,
83
+ challenge_test_list=challenge_test_list,
84
+ task_name=task,
85
+ source_file=source_file,
86
+ )
87
+
88
+ def get_dataset_description(self) -> DatasetDescription:
89
+ """Returns a description of the MBPP dataset."""
90
+ return DatasetDescription.create(
91
+ name="Mostly Basic Python Problems (MBPP)",
92
+ purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
93
+ source="https://github.com/google-research/google-research/tree/master/mbpp",
94
+ language="English and Python",
95
+ category=["Programming"],
96
+ format="Task descriptions in English with corresponding Python solutions and automated test cases",
97
+ characteristics=(
98
+ "Contains approximately 1,000 crowd-sourced Python programming problems "
99
+ "designed for entry-level programmers. Problems cover programming fundamentals "
100
+ "and standard library functionality. Each problem includes a task description, "
101
+ "code solution, and 3 automated test cases. A subset of the data has been "
102
+ "hand-verified by the authors."
103
+ ),
104
+ citation=(
105
+ "@article{austin2021program,\n"
106
+ " title={Program Synthesis with Large Language Models},\n"
107
+ " author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
108
+ " journal={arXiv preprint arXiv:2108.07732},\n"
109
+ " year={2021}\n"
110
+ "}"
111
+ ),
112
+ additional_info={
113
+ "size": "~1,000 programming problems",
114
+ "splits": "Available in full or sanitized versions",
115
+ "test_coverage": "Each problem includes 3 automated test cases",
116
+ "verification": "Subset of data has been hand-verified by authors",
117
+ },
118
+ )
119
+
120
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
121
+ """Returns the recommended evaluation metrics for MBPP dataset."""
122
+ return [
123
+ EvaluationMetric.create(
124
+ name="pass@k",
125
+ type="code_evaluation",
126
+ description="Percentage of problems where at least one solution in k generations passes all test cases",
127
+ implementation="custom_pass_at_k",
128
+ primary=True,
129
+ ),
130
+ EvaluationMetric.create(
131
+ name="test_case_success_rate",
132
+ type="code_evaluation",
133
+ description="Percentage of test cases passed across all problems",
134
+ implementation="custom_test_success_rate",
135
+ primary=False,
136
+ ),
137
+ EvaluationMetric.create(
138
+ name="syntax_validity",
139
+ type="code_evaluation",
140
+ description="Verifies that generated code is syntactically valid Python",
141
+ implementation="custom_syntax_check",
142
+ primary=False,
143
+ ),
144
+ EvaluationMetric.create(
145
+ name="code_similarity",
146
+ type="similarity",
147
+ description="Similarity between generated code and reference solution",
148
+ implementation="evaluate.load('code_eval')",
149
+ primary=False,
150
+ ),
151
+ ]
152
+
153
+
154
+ if __name__ == "__main__":
155
+ # Example usage
156
+ parser = MBPPDatasetParser()
157
+
158
+ # Load the dataset
159
+ parser.load()
160
+
161
+ # Parse all splits
162
+ parser.parse()
163
+
164
+ # Get parsed data
165
+ parsed_data = parser.get_parsed_data
166
+
167
+ # Print example entry
168
+ if parsed_data:
169
+ example = parsed_data[0]
170
+ print("\nExample parsed entry:")
171
+ print(f"Task ID: {example.task_id}")
172
+ print(f"Task: {example.raw_question}")
173
+ print(f"Solution:\n{example.answer}")
174
+ print(f"Test Cases:\n{example.test_list}")
llmdataparser/mgsm_parser.py ADDED
@@ -0,0 +1,192 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, ClassVar
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+
12
+ @dataclass(frozen=True, kw_only=True, slots=True)
13
+ class MGSMParseEntry(HuggingFaceParseEntry):
14
+ """Custom entry class for MGSM, with fields specific to this dataset parser."""
15
+
16
+ numerical_answer: int | float
17
+ equation_solution: str | None
18
+ language: str
19
+
20
+ @classmethod
21
+ def create(
22
+ cls,
23
+ question: str,
24
+ answer: str,
25
+ raw_question: str,
26
+ raw_answer: str,
27
+ numerical_answer: int | float,
28
+ equation_solution: str | None,
29
+ task_name: str,
30
+ language: str,
31
+ ) -> "MGSMParseEntry":
32
+ return cls(
33
+ question=question,
34
+ answer=answer,
35
+ raw_question=raw_question,
36
+ raw_answer=raw_answer,
37
+ numerical_answer=numerical_answer,
38
+ equation_solution=equation_solution,
39
+ task_name=task_name,
40
+ language=language,
41
+ )
42
+
43
+
44
+ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
45
+ """Parser for the MGSM (Multilingual Grade School Math) dataset."""
46
+
47
+ _data_source: ClassVar[str] = "juletxara/mgsm"
48
+ _default_task: ClassVar[str] = "en"
49
+ _task_names: ClassVar[list[str]] = [
50
+ "bn",
51
+ "de",
52
+ "en",
53
+ "es",
54
+ "fr",
55
+ "ja",
56
+ "ru",
57
+ "sw",
58
+ "te",
59
+ "th",
60
+ "zh",
61
+ ]
62
+
63
+ def process_entry(
64
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
65
+ ) -> MGSMParseEntry:
66
+ """
67
+ Process a single MGSM entry.
68
+
69
+ Args:
70
+ row: Dictionary containing the MGSM entry fields
71
+ task_name: Language code for the current task
72
+
73
+ Returns:
74
+ MGSMParseEntry: Processed entry with question, answer, and metadata
75
+ """
76
+ task = task_name or self._get_current_task(row)
77
+ raw_question = row["question"]
78
+ raw_answer = row["answer"] if row["answer"] else ""
79
+ numerical_answer = row["answer_number"]
80
+ equation_solution = row["equation_solution"]
81
+
82
+ question = str(raw_question)
83
+
84
+ # Use numerical answer as string for the answer field if no detailed answer is provided
85
+ answer = raw_answer if raw_answer else str(numerical_answer)
86
+
87
+ return MGSMParseEntry.create(
88
+ question=question,
89
+ answer=answer,
90
+ raw_question=raw_question,
91
+ raw_answer=raw_answer,
92
+ numerical_answer=numerical_answer,
93
+ equation_solution=equation_solution,
94
+ task_name=task,
95
+ language=task,
96
+ )
97
+
98
+ def get_dataset_description(self) -> DatasetDescription:
99
+ """Returns a description of the Multilingual Grade School Math dataset."""
100
+ return DatasetDescription.create(
101
+ name="Multilingual Grade School Math (MGSM)",
102
+ purpose="Evaluate multilingual chain-of-thought reasoning capabilities in mathematical problem solving",
103
+ source="https://huggingface.co/datasets/juletxara/mgsm",
104
+ language="Multilingual (11 languages)",
105
+ format="Word problems with numerical answers and solution steps",
106
+ category=["Math", "MultiLingual"],
107
+ characteristics=(
108
+ "Human-translated version of 250 GSM8K problems into 10 additional languages. "
109
+ "Each problem includes the original question from GSM8K, its translations, "
110
+ "numerical answer, and solution steps. The benchmark is designed to evaluate "
111
+ "language models' ability to perform mathematical reasoning across different languages."
112
+ ),
113
+ citation="""@misc{shi2022language,
114
+ title={Language Models are Multilingual Chain-of-Thought Reasoners},
115
+ author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
116
+ year={2022},
117
+ eprint={2210.03057},
118
+ archivePrefix={arXiv},
119
+ primaryClass={cs.CL}
120
+ }
121
+ @article{cobbe2021gsm8k,
122
+ title={Training Verifiers to Solve Math Word Problems},
123
+ author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
124
+ journal={arXiv preprint arXiv:2110.14168},
125
+ year={2021}
126
+ }""",
127
+ additional_info={
128
+ "languages": [
129
+ "Bengali",
130
+ "German",
131
+ "English",
132
+ "Spanish",
133
+ "French",
134
+ "Japanese",
135
+ "Russian",
136
+ "Swahili",
137
+ "Telugu",
138
+ "Thai",
139
+ "Chinese",
140
+ ],
141
+ "size": "250 problems translated into each language",
142
+ "base_dataset": "GSM8K (Grade School Math 8K)",
143
+ },
144
+ )
145
+
146
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
147
+ """Returns the recommended evaluation metrics for MGSM dataset."""
148
+ return [
149
+ EvaluationMetric.create(
150
+ name="exact_match",
151
+ type="string",
152
+ description="Exact match comparison between predicted and correct numerical answers",
153
+ implementation="custom_exact_match",
154
+ primary=True,
155
+ ),
156
+ EvaluationMetric.create(
157
+ name="solution_validity",
158
+ type="text",
159
+ description="Assessment of whether the solution steps are mathematically valid and complete",
160
+ implementation="custom_solution_validator",
161
+ primary=True,
162
+ ),
163
+ EvaluationMetric.create(
164
+ name="step_accuracy",
165
+ type="numerical",
166
+ description="Accuracy of intermediate calculation steps (e.g., <<48/2=24>>)",
167
+ implementation="custom_step_accuracy",
168
+ primary=True,
169
+ ),
170
+ EvaluationMetric.create(
171
+ name="cross_lingual_consistency",
172
+ type="comparison",
173
+ description="Consistency of model performance across different language versions of the same problem",
174
+ implementation="custom_language_comparator",
175
+ primary=False,
176
+ ),
177
+ ]
178
+
179
+
180
+ if __name__ == "__main__":
181
+ from pprint import pprint
182
+
183
+ parser = MGSMDatasetParser()
184
+ parser.load(task_name="en") # Load French dataset
185
+ parser.parse()
186
+
187
+ parsed_data = parser.get_parsed_data
188
+ pprint(parsed_data[0].question)
189
+ pprint(parsed_data[0].answer)
190
+ pprint(parsed_data[0].raw_question)
191
+ pprint(parsed_data[0].numerical_answer)
192
+ pprint(parsed_data[0].language)
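
The evaluation metrics above name custom implementations (`custom_exact_match`, `custom_step_accuracy`, and so on) that are not included in this diff. A minimal sketch of what the numerical exact-match check might look like, assuming answers are compared as parsed numbers; the function name and behaviour are illustrative, not the project's actual implementation:

import re


def numerical_exact_match(prediction: str, reference: float) -> bool:
    """Compare the last number in a model response against the reference answer."""
    cleaned = prediction.replace(",", "")  # drop thousands separators
    numbers = re.findall(r"-?\d+(?:\.\d+)?", cleaned)
    if not numbers:
        return False
    return float(numbers[-1]) == float(reference)


# e.g. numerical_exact_match("Therefore, the answer is 72", 72) -> True
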
llmdataparser/mmlu_parser.py CHANGED
@@ -1,81 +1,721 @@
1
  from dataclasses import dataclass
2
- from typing import Any
3
 
4
- from llmdataparser.base_parser import HuggingFaceDatasetParser, ParseEntry
5
- from llmdataparser.prompts import MMLU_SYSTEM_PROMPT
 
 
 
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- @dataclass(frozen=True)
9
- class MMLUParseEntry(ParseEntry):
10
- """
11
- Custom entry class for MMLU, with fields specific to this dataset parser.
12
- """
13
 
14
- prompt: str
15
- answer_letter: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  @classmethod
18
- def create(cls, prompt: str, answer_letter: str) -> "MMLUParseEntry":
19
- if answer_letter not in {"A", "B", "C", "D"}:
 
 
 
 
 
 
 
 
20
  raise ValueError(
21
- f"Invalid answer_letter '{answer_letter}'; must be one of 'A', 'B', 'C', 'D'."
22
  )
23
- return cls(prompt=prompt, answer_letter=answer_letter)
 
 
 
 
 
 
 
 
 
24
 
25
 
26
  class MMLUDatasetParser(HuggingFaceDatasetParser[MMLUParseEntry]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  _data_source = "cais/mmlu"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- def __init__(self, system_prompt: str = MMLU_SYSTEM_PROMPT):
30
- super().__init__() # Properly initialize the base class
31
- self.parsed_data: list[MMLUParseEntry] = []
32
- self.task_names: list[str] = []
33
- self.subject_list: set[str] = set()
34
- self.system_prompt: str = system_prompt
35
- super().__init__()
36
-
37
- def parse(self, split_names: str | list[str] | None = None, **kwargs: Any) -> None:
38
- self.parsed_data.clear()
39
- if self.raw_data is None:
40
- raise ValueError("No data loaded. Please load the dataset first.")
41
-
42
- if split_names is None:
43
- split_names = self.task_names
44
- elif isinstance(split_names, str):
45
- split_names = [split_names]
46
-
47
- for split_name in split_names:
48
- if split_name not in self.task_names:
49
- raise ValueError(f"Task '{split_name}' not found in the dataset.")
50
-
51
- dataset_split = self.raw_data[split_name]
52
- for index, entry in enumerate(dataset_split, start=1):
53
- data_entry = self.process_entry(entry, **kwargs)
54
- self._parsed_data.append(data_entry)
55
- self.subject_list.add(entry.get("subject", "Unknown"))
56
- print(f"Parsed {index} data points from task '{split_name}'.")
57
-
58
- print(
59
- f"Number of subjects: {len(self.subject_list)}. "
60
- "For more details, please check the `self.subject_list` attribute."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  )
62
 
63
- def process_entry(self, row: dict[str, Any], **kwargs) -> MMLUParseEntry:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  """
65
- Generate a prompt and expected answer from the given row.
66
 
67
  Args:
68
- row (dict[str, Any]): A data point to be formatted.
 
69
 
70
  Returns:
71
  MMLUParseEntry: The formatted entry object.
72
  """
 
 
 
 
 
 
73
  choices = "\n".join(
74
- f"{chr(65 + i)}. {choice}" for i, choice in enumerate(row["choices"])
75
  )
76
- prompt = (
77
- f"{self.system_prompt}\nQuestion: {row['question']}\n{choices}\nAnswer:"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  )
79
- answer_letter = chr(65 + row["answer"]) # Convert index to 'A', 'B', 'C', 'D'
80
 
81
- return MMLUParseEntry.create(prompt, answer_letter)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from dataclasses import dataclass
2
+ from typing import Any, Final
3
 
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
 
11
+ MMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
12
+ MMLU_PRO_VALID_ANSWERS: Final[set[str]] = {
13
+ "A",
14
+ "B",
15
+ "C",
16
+ "D",
17
+ "E",
18
+ "F",
19
+ "G",
20
+ "H",
21
+ "I",
22
+ "J",
23
+ }
24
+ MMLU_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(MMLU_VALID_ANSWERS))
25
+ MMLU_PRO_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(MMLU_PRO_VALID_ANSWERS))
26
 
 
 
 
 
 
27
 
28
+ @dataclass(frozen=True, kw_only=True, slots=True)
29
+ class MMLUParseEntry(HuggingFaceParseEntry):
30
+ """Custom entry class for MMLU, with fields specific to this dataset parser."""
31
+
32
+ raw_choices: list[str]
33
+ task_name: str
34
+
35
+ @classmethod
36
+ def create(
37
+ cls,
38
+ question: str,
39
+ answer: str,
40
+ raw_question: str,
41
+ raw_choices: list[str],
42
+ raw_answer: str,
43
+ task_name: str,
44
+ ) -> "MMLUParseEntry":
45
+ if answer not in MMLU_VALID_ANSWERS:
46
+ raise ValueError(
47
+ f"Invalid answer_letter '{answer}'; must be one of {MMLU_VALID_ANSWER_STR}"
48
+ )
49
+ if not task_name:
50
+ raise ValueError("Task name cannot be empty")
51
+ return cls(
52
+ question=question,
53
+ answer=answer,
54
+ raw_question=raw_question,
55
+ raw_answer=raw_answer,
56
+ raw_choices=raw_choices,
57
+ task_name=task_name,
58
+ )
59
+
60
+
61
+ @dataclass(frozen=True, kw_only=True, slots=True)
62
+ class MMLUProParseEntry(HuggingFaceParseEntry):
63
+ """Custom entry class for MMLU, with fields specific to this dataset parser."""
64
+
65
+ raw_choices: list[str]
66
+ task_name: str
67
 
68
  @classmethod
69
+ def create(
70
+ cls,
71
+ question: str,
72
+ answer: str,
73
+ raw_question: str,
74
+ raw_choices: list[str],
75
+ raw_answer: str,
76
+ task_name: str,
77
+ ) -> "MMLUProParseEntry":
78
+ if answer not in MMLU_PRO_VALID_ANSWERS:
79
  raise ValueError(
80
+ f"Invalid answer_letter '{answer}'; must be one of {MMLU_PRO_VALID_ANSWER_STR}"
81
  )
82
+ if not task_name:
83
+ raise ValueError("Task name cannot be empty")
84
+ return cls(
85
+ question=question,
86
+ answer=answer,
87
+ raw_question=raw_question,
88
+ raw_choices=raw_choices,
89
+ raw_answer=raw_answer,
90
+ task_name=task_name,
91
+ )
92
 
93
 
94
  class MMLUDatasetParser(HuggingFaceDatasetParser[MMLUParseEntry]):
95
+ """Base class for MMLU dataset parsers with common functionality."""
96
+
97
+ def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
98
+ """Get the task name from the data entry or default task name."""
99
+ task_name: str = data_entry.get("subject", "")
100
+ return task_name if task_name else (self._current_task or self._default_task)
101
+
102
+ def process_entry(
103
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
104
+ ) -> MMLUParseEntry:
105
+ """
106
+ Generate a question and expected answer from the given row.
107
+
108
+ Args:
109
+ row: A data point to be formatted.
110
+ task_name: Optional task name for the entry.
111
+ **kwargs: Additional keyword arguments.
112
+
113
+ Returns:
114
+ MMLUParseEntry: The formatted entry object.
115
+ """
116
+ task = task_name or self._get_current_task(row)
117
+ # Ensure task is not None
118
+ final_task = task or self._default_task
119
+
120
+ choices = "\n".join(
121
+ f"{chr(65 + i)}. {choice}" for i, choice in enumerate(row["choices"])
122
+ )
123
+ raw_question = row["question"]
124
+ raw_choices = row["choices"]
125
+ raw_answer = str(row["answer"]) # Ensure raw_answer is a string
126
+
127
+ question = f"Question: {raw_question}\n{choices}\nAnswer:"
128
+ answer_letter = chr(65 + int(raw_answer)) # Convert index to 'A', 'B', 'C', 'D'
129
+
130
+ return MMLUParseEntry.create(
131
+ question=question,
132
+ answer=answer_letter,
133
+ raw_question=raw_question,
134
+ raw_choices=raw_choices,
135
+ raw_answer=raw_answer,
136
+ task_name=final_task,
137
+ )
138
+
139
+
140
+ class BaseMMLUDatasetParser(MMLUDatasetParser):
141
+ """Parser for the original MMLU dataset."""
142
+
143
  _data_source = "cais/mmlu"
144
+ _default_task = "all"
145
+ _task_names = [
146
+ "abstract_algebra",
147
+ "anatomy",
148
+ "astronomy",
149
+ "business_ethics",
150
+ "clinical_knowledge",
151
+ "college_biology",
152
+ "college_chemistry",
153
+ "college_computer_science",
154
+ "college_mathematics",
155
+ "college_medicine",
156
+ "college_physics",
157
+ "computer_security",
158
+ "conceptual_physics",
159
+ "econometrics",
160
+ "electrical_engineering",
161
+ "elementary_mathematics",
162
+ "formal_logic",
163
+ "global_facts",
164
+ "high_school_biology",
165
+ "high_school_chemistry",
166
+ "high_school_computer_science",
167
+ "high_school_european_history",
168
+ "high_school_geography",
169
+ "high_school_government_and_politics",
170
+ "high_school_macroeconomics",
171
+ "high_school_mathematics",
172
+ "high_school_microeconomics",
173
+ "high_school_physics",
174
+ "high_school_psychology",
175
+ "high_school_statistics",
176
+ "high_school_us_history",
177
+ "high_school_world_history",
178
+ "human_aging",
179
+ "human_sexuality",
180
+ "international_law",
181
+ "jurisprudence",
182
+ "logical_fallacies",
183
+ "machine_learning",
184
+ "management",
185
+ "marketing",
186
+ "medical_genetics",
187
+ "miscellaneous",
188
+ "moral_disputes",
189
+ "moral_scenarios",
190
+ "nutrition",
191
+ "philosophy",
192
+ "prehistory",
193
+ "professional_accounting",
194
+ "professional_law",
195
+ "professional_medicine",
196
+ "professional_psychology",
197
+ "public_relations",
198
+ "security_studies",
199
+ "sociology",
200
+ "us_foreign_policy",
201
+ "virology",
202
+ "world_religions",
203
+ ]
204
+
205
+ def get_dataset_description(self) -> DatasetDescription:
206
+ """Returns a description of the MMLU dataset."""
207
+ return DatasetDescription.create(
208
+ name="Massive Multitask Language Understanding (MMLU)",
209
+ purpose="Evaluate models' extensive world knowledge and problem-solving abilities across diverse branches of knowledge",
210
+ source="https://huggingface.co/datasets/cais/mmlu",
211
+ language="English",
212
+ category=["General Knowledge and Reasoning"],
213
+ format="Multiple choice questions with four options (A, B, C, D)",
214
+ characteristics=(
215
+ "Comprehensive evaluation benchmark spanning humanities, social sciences, hard sciences, "
216
+ "and other essential areas of knowledge. The test includes 57 subjects such as "
217
+ "elementary mathematics, US history, computer science, and law. Success on this test "
218
+ "requires both extensive world knowledge and strong problem-solving capabilities."
219
+ ),
220
+ citation="""@article{hendryckstest2021,
221
+ title={Measuring Massive Multitask Language Understanding},
222
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
223
+ journal={Proceedings of the International Conference on Learning Representations (ICLR)},
224
+ year={2021}
225
+ }
226
+ @article{hendrycks2021ethics,
227
+ title={Aligning AI With Shared Human Values},
228
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
229
+ journal={Proceedings of the International Conference on Learning Representations (ICLR)},
230
+ year={2021}
231
+ }""",
232
+ additional_info={
233
+ "subjects": "57 tasks/subjects",
234
+ "categories": [
235
+ "Humanities",
236
+ "Social Sciences",
237
+ "Hard Sciences",
238
+ "Other",
239
+ ],
240
+ "example_subjects": [
241
+ "Elementary Mathematics",
242
+ "US History",
243
+ "Computer Science",
244
+ "Law",
245
+ ],
246
+ "requirements": [
247
+ "Extensive world knowledge",
248
+ "Problem solving ability",
249
+ ],
250
+ },
251
+ )
252
 
253
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
254
+ """Returns the recommended evaluation metrics for MMLU dataset."""
255
+ return [
256
+ EvaluationMetric.create(
257
+ name="accuracy",
258
+ type="classification",
259
+ description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
260
+ implementation="evaluate.load('accuracy')",
261
+ primary=True,
262
+ ),
263
+ EvaluationMetric.create(
264
+ name="subject_accuracy",
265
+ type="classification",
266
+ description="Per-subject accuracy scores across all 57 tasks",
267
+ implementation="custom_subject_accuracy",
268
+ primary=True,
269
+ ),
270
+ EvaluationMetric.create(
271
+ name="category_accuracy",
272
+ type="classification",
273
+ description="Accuracy grouped by major categories (Humanities, Social Sciences, Hard Sciences, Other)",
274
+ implementation="custom_category_accuracy",
275
+ primary=True,
276
+ ),
277
+ EvaluationMetric.create(
278
+ name="task_correlation",
279
+ type="analysis",
280
+ description="Analysis of performance correlations between different subjects/tasks",
281
+ implementation="custom_task_correlation",
282
+ primary=False,
283
+ ),
284
+ ]
285
+
286
+
287
+ class MMLUReduxDatasetParser(MMLUDatasetParser):
288
+ """Parser for the MMLU Redux dataset."""
289
+
290
+ _data_source = "edinburgh-dawg/mmlu-redux"
291
+ _default_task = "anatomy"
292
+ _task_names = [
293
+ "anatomy",
294
+ "astronomy",
295
+ "business_ethics",
296
+ "clinical_knowledge",
297
+ "college_chemistry",
298
+ "college_computer_science",
299
+ "college_mathematics",
300
+ "college_medicine",
301
+ "college_physics",
302
+ "conceptual_physics",
303
+ "econometrics",
304
+ "electrical_engineering",
305
+ "formal_logic",
306
+ "global_facts",
307
+ "high_school_chemistry",
308
+ "high_school_geography",
309
+ "high_school_macroeconomics",
310
+ "high_school_mathematics",
311
+ "high_school_physics",
312
+ "high_school_statistics",
313
+ "high_school_us_history",
314
+ "human_aging",
315
+ "logical_fallacies",
316
+ "machine_learning",
317
+ "miscellaneous",
318
+ "philosophy",
319
+ "professional_accounting",
320
+ "professional_law",
321
+ "public_relations",
322
+ "virology",
323
+ ]
324
+
325
+ def get_dataset_description(self) -> DatasetDescription:
326
+ """Returns description of the MMLU Redux dataset."""
327
+ return DatasetDescription.create(
328
+ name="MMLU Redux",
329
+ purpose="Provide a manually re-annotated subset of MMLU with error analysis and corrections",
330
+ source="https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux",
331
+ language="English",
332
+ format="Multiple choice questions with four options (A, B, C, D)",
333
+ category=["General Knowledge and Reasoning"],
334
+ characteristics=(
335
+ "A carefully curated subset of 3,000 questions across 30 MMLU subjects, "
336
+ "manually re-annotated to identify and classify various types of errors. "
337
+ "The dataset maintains the original questions but provides additional "
338
+ "error annotations and corrections based on expert review and verification "
339
+ "against credible sources."
340
+ ),
341
+ citation="""@misc{gema2024mmlu,
342
+ title={Are We Done with MMLU?},
343
+ author={Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
344
+ year={2024},
345
+ eprint={2406.04127},
346
+ archivePrefix={arXiv},
347
+ primaryClass={cs.CL}
348
+ }""",
349
+ additional_info={
350
+ "size": "3,000 questions (100 per subject)",
351
+ "subjects": "30 MMLU subjects",
352
+ "license": "CC-BY-4.0",
353
+ "error_types": {
354
+ "Question Assessment": [
355
+ "Bad Question Clarity",
356
+ "Bad Options Clarity",
357
+ ],
358
+ "Ground Truth Verification": [
359
+ "No Correct Answer",
360
+ "Multiple Correct Answers",
361
+ "Wrong Ground Truth",
362
+ ],
363
+ },
364
+ "verification_process": "Expert review with source verification",
365
+ "base_dataset": "cais/mmlu",
366
+ },
367
+ )
368
+
369
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
370
+ """Returns the recommended evaluation metrics for MMLU Redux dataset."""
371
+ return [
372
+ EvaluationMetric.create(
373
+ name="accuracy",
374
+ type="classification",
375
+ description="Proportion of correctly answered multiple-choice questions (exact match with A, B, C, D)",
376
+ implementation="evaluate.load('accuracy')",
377
+ primary=True,
378
+ ),
379
+ EvaluationMetric.create(
380
+ name="subject_accuracy",
381
+ type="classification",
382
+ description="Per-subject accuracy scores across 30 subjects (100 questions each)",
383
+ implementation="custom_subject_accuracy",
384
+ primary=True,
385
+ ),
386
+ EvaluationMetric.create(
387
+ name="question_clarity",
388
+ type="analysis",
389
+ description="Analysis of performance on questions with different clarity issues",
390
+ implementation="custom_clarity_analysis",
391
+ primary=False,
392
+ ),
393
+ ]
394
+
395
+
396
+ class TMMLUPlusDatasetParser(MMLUDatasetParser):
397
+ """Parser for the TMMLU+ dataset."""
398
+
399
+ _data_source = "ikala/tmmluplus"
400
+ _default_task = "taiwanese_hokkien"
401
+ _task_names = [
402
+ "engineering_math",
403
+ "dentistry",
404
+ "traditional_chinese_medicine_clinical_medicine",
405
+ "clinical_psychology",
406
+ "technical",
407
+ "culinary_skills",
408
+ "mechanical",
409
+ "logic_reasoning",
410
+ "real_estate",
411
+ "general_principles_of_law",
412
+ "finance_banking",
413
+ "anti_money_laundering",
414
+ "ttqav2",
415
+ "marketing_management",
416
+ "business_management",
417
+ "organic_chemistry",
418
+ "advance_chemistry",
419
+ "physics",
420
+ "secondary_physics",
421
+ "human_behavior",
422
+ "national_protection",
423
+ "jce_humanities",
424
+ "politic_science",
425
+ "agriculture",
426
+ "official_document_management",
427
+ "financial_analysis",
428
+ "pharmacy",
429
+ "educational_psychology",
430
+ "statistics_and_machine_learning",
431
+ "management_accounting",
432
+ "introduction_to_law",
433
+ "computer_science",
434
+ "veterinary_pathology",
435
+ "accounting",
436
+ "fire_science",
437
+ "optometry",
438
+ "insurance_studies",
439
+ "pharmacology",
440
+ "taxation",
441
+ "trust_practice",
442
+ "geography_of_taiwan",
443
+ "physical_education",
444
+ "auditing",
445
+ "administrative_law",
446
+ "education_(profession_level)",
447
+ "economics",
448
+ "veterinary_pharmacology",
449
+ "nautical_science",
450
+ "occupational_therapy_for_psychological_disorders",
451
+ "basic_medical_science",
452
+ "macroeconomics",
453
+ "trade",
454
+ "chinese_language_and_literature",
455
+ "tve_design",
456
+ "junior_science_exam",
457
+ "junior_math_exam",
458
+ "junior_chinese_exam",
459
+ "junior_social_studies",
460
+ "tve_mathematics",
461
+ "tve_chinese_language",
462
+ "tve_natural_sciences",
463
+ "junior_chemistry",
464
+ "music",
465
+ "education",
466
+ "three_principles_of_people",
467
+ "taiwanese_hokkien",
468
+ ]
469
+
470
+ def process_entry(
471
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
472
+ ) -> MMLUParseEntry:
473
+ """Process a single TMMLU+ entry."""
474
+ # Extract choices in order
475
+ raw_choices = [row["A"], row["B"], row["C"], row["D"]]
476
+ choices = "\n".join(
477
+ f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
478
+ )
479
+ raw_question = row["question"]
480
+ raw_answer = row["answer"]
481
+
482
+ question = f"Question: {raw_question}\n{choices}\nAnswer:"
483
+ task = task_name or self._get_current_task(row)
484
+
485
+ return MMLUParseEntry.create(
486
+ question, raw_answer, raw_question, raw_choices, raw_answer, task
487
+ )
488
+
489
+ def get_dataset_description(self) -> DatasetDescription:
490
+ """Returns description of the TMMLU+ dataset."""
491
+ return DatasetDescription.create(
492
+ name="Traditional Chinese Massive Multitask Language Understanding Plus (TMMLU+)",
493
+ purpose="Evaluate language models' understanding and reasoning capabilities in Traditional Chinese across diverse subjects",
494
+ source="https://huggingface.co/datasets/ikala/tmmluplus",
495
+ language="Traditional Chinese",
496
+ category=["General Knowledge and Reasoning", "Taiwan"],
497
+ format="Multiple choice questions with four options (A, B, C, D)",
498
+ characteristics=(
499
+ "A comprehensive evaluation benchmark featuring 66 subjects from elementary "
500
+ "to professional level. The dataset is six times larger than the original TMMLU "
501
+ "and provides more balanced subject coverage. Includes benchmark results from "
502
+ "both closed-source models and 20 open-weight Chinese language models with "
503
+ "parameters ranging from 1.8B to 72B."
504
+ ),
505
+ citation="""@article{ikala2024improved,
506
+ title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
507
+ author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
508
+ journal={arXiv preprint arXiv:2403.01858},
509
+ year={2024}
510
+ }""",
511
+ additional_info={
512
+ "subjects": "66 diverse subjects",
513
+ "difficulty_levels": ["Elementary", "Secondary", "Professional"],
514
+ "model_benchmarks": {
515
+ "model_types": ["Closed-source models", "Open-weight Chinese LLMs"],
516
+ "parameter_range": "1.8B - 72B",
517
+ },
518
+ "comparison": "6x larger than original TMMLU",
519
+ "script": "Traditional Chinese",
520
+ },
521
  )
522
 
523
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
524
+ """Returns the recommended evaluation metrics for TMMLU+ dataset."""
525
+ return [
526
+ EvaluationMetric.create(
527
+ name="accuracy",
528
+ type="classification",
529
+ description="Overall percentage of correctly answered multiple-choice questions",
530
+ implementation="evaluate.load('accuracy')",
531
+ primary=True,
532
+ ),
533
+ EvaluationMetric.create(
534
+ name="subject_accuracy",
535
+ type="classification",
536
+ description="Per-subject accuracy scores across all 66 subjects",
537
+ implementation="custom_subject_accuracy",
538
+ primary=True,
539
+ ),
540
+ EvaluationMetric.create(
541
+ name="difficulty_analysis",
542
+ type="classification",
543
+ description="Performance analysis across different difficulty levels (elementary to professional)",
544
+ implementation="custom_difficulty_analysis",
545
+ primary=False,
546
+ ),
547
+ ]
548
+
549
+
550
+ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
551
+ """Parser for the MMLU Pro dataset."""
552
+
553
+ _data_source = "TIGER-Lab/MMLU-Pro"
554
+ _default_task = "default"
555
+ _task_names = ["default"]
556
+ _hidden_task_names = [
557
+ "math",
558
+ "physics",
559
+ "chemistry",
560
+ "law",
561
+ "engineering",
562
+ "other",
563
+ "economics",
564
+ "health",
565
+ "psychology",
566
+ "business",
567
+ "biology",
568
+ "philosophy",
569
+ "computer_science",
570
+ "history",
571
+ ]
572
+
573
+ def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
574
+ """Get the task name from the data entry or default task name."""
575
+ if data_entry is not None:
576
+ task_name: str = data_entry.get("category", "")
577
+ if task_name:
578
+ return task_name
579
+ return self._current_task or self._default_task
580
+
581
+ def process_entry(
582
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
583
+ ) -> MMLUProParseEntry:
584
  """
585
+ Generate a question and expected answer from the given row.
586
 
587
  Args:
588
+ row (dict[str, Any]): A data point to be formatted with MMLU Pro specific structure
589
+ containing 'question', 'options', 'answer', and 'answer_index' keys.
590
 
591
  Returns:
592
  MMLUProParseEntry: The formatted entry object.
593
  """
594
+ task = task_name or self._get_current_task(row)
595
+ # Ensure task is not None
596
+ final_task = task or self._default_task
597
+
598
+ # Extract choices in order
599
+ raw_choices = row["options"]
600
  choices = "\n".join(
601
+ f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
602
  )
603
+ raw_question = row["question"]
604
+ raw_answer = row["answer"]
605
+ answer_index = row["answer_index"]
606
+
607
+ question = f"Question: {raw_question}\n{choices}\nAnswer:"
608
+ answer_letter = chr(
609
+ 65 + answer_index
610
+ ) # Convert index to 'A', 'B', 'C', 'D', etc.
611
+
612
+ return MMLUProParseEntry.create(
613
+ question, answer_letter, raw_question, raw_choices, raw_answer, final_task
614
+ )
615
+
616
+ def get_dataset_description(self) -> DatasetDescription:
617
+ """Returns description of the MMLU Pro dataset."""
618
+ return DatasetDescription.create(
619
+ name="MMLU Pro",
620
+ purpose="Provide a more robust and challenging multi-task language understanding benchmark with enhanced reasoning requirements",
621
+ source="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
622
+ language="English",
623
+ category=["General Knowledge and Reasoning", "Advanced Reasoning"],
624
+ format="Multiple choice questions with up to 10 options (expanded from original 4)",
625
+ characteristics=(
626
+ "A more challenging version of MMLU containing 12K complex questions across various "
627
+ "disciplines. Features increased number of options (up to 10), stronger focus on "
628
+ "reasoning over pure knowledge, and reduced sensitivity to prompt variations. "
629
+ "Questions are sourced from original MMLU, STEM websites, TheoremQA, and SciBench, "
630
+ "with expert review and GPT-4 assisted distractor generation."
631
+ ),
632
+ citation="""@article{wang2024mmlu,
633
+ title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
634
+ author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
635
+ journal={arXiv preprint arXiv:2406.01574},
636
+ year={2024}
637
+ }""",
638
+ additional_info={
639
+ "size": "12,000 complex questions",
640
+ "options": "Up to 10 choices per question",
641
+ "sources": [
642
+ "Original MMLU (filtered)",
643
+ "STEM Website",
644
+ "TheoremQA",
645
+ "SciBench",
646
+ ],
647
+ "enhanced_subjects": [
648
+ "Biology",
649
+ "Business",
650
+ "Chemistry",
651
+ "Computer Science",
652
+ "Economics",
653
+ "Engineering",
654
+ "Math",
655
+ "Physics",
656
+ "Psychology",
657
+ ],
658
+ "construction_process": [
659
+ "Initial MMLU filtering",
660
+ "Question collection from multiple sources",
661
+ "GPT-4 assisted option augmentation",
662
+ "Expert review by 10+ experts",
663
+ ],
664
+ "prompt_sensitivity": "2% (reduced from 4-5% in MMLU)",
665
+ "reasoning_improvement": "20% higher CoT performance compared to PPL",
666
+ },
667
  )
 
668
 
669
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
670
+ """Returns the recommended evaluation metrics for MMLU Pro dataset."""
671
+ return [
672
+ EvaluationMetric.create(
673
+ name="accuracy",
674
+ type="classification",
675
+ description="Proportion of correctly answered multiple-choice questions (exact match)",
676
+ implementation="evaluate.load('accuracy')",
677
+ primary=True,
678
+ ),
679
+ EvaluationMetric.create(
680
+ name="subject_accuracy",
681
+ type="classification",
682
+ description="Per-subject accuracy scores with focus on enhanced subjects",
683
+ implementation="custom_subject_accuracy",
684
+ primary=True,
685
+ ),
686
+ EvaluationMetric.create(
687
+ name="reasoning_analysis",
688
+ type="analysis",
689
+ description="Comparison of Chain-of-Thought vs standard PPL performance",
690
+ implementation="custom_reasoning_analysis",
691
+ primary=True,
692
+ ),
693
+ EvaluationMetric.create(
694
+ name="prompt_robustness",
695
+ type="analysis",
696
+ description="Analysis of performance stability across different prompt variations",
697
+ implementation="custom_prompt_sensitivity",
698
+ primary=False,
699
+ ),
700
+ ]
701
+
702
+
703
+ if __name__ == "__main__":
704
+ # Example usage of MMLU Pro parser
705
+ parser = MMLUProDatasetParser()
706
+ parser.load()
707
+ parser.parse()
708
+
709
+ # Get parsed data with correct type
710
+ parsed_data = parser.get_parsed_data
711
+
712
+ # Print example entry
713
+ if parsed_data:
714
+ example = parsed_data[0]
715
+ print("\nExample parsed entry:")
716
+ print(f"Task: {example.task_name}")
717
+ print(f"Question: {example.raw_question}")
718
+ print("Choices:")
719
+ for i, choice in enumerate(example.raw_choices):
720
+ print(f"{chr(65 + i)}. {choice}")
721
+ print(f"Correct Answer: {example.answer}")
llmdataparser/prompts.py CHANGED
@@ -1,12 +1,69 @@
1
  import textwrap
2
  from typing import Final
3
 
 
 
4
  MMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
5
  """\
6
- You are an expert assistant for answering questions in a multiple-choice format. Each question has four answer options (A, B, C, D). Your task is to analyze each question and select the most accurate answer.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- Instructions:
9
- 1. Answer Selection: Review the question and choose the best option.
10
- 2. Response Format: Reply with only the letter (A, B, C, or D) of your chosen answer, without additional explanation.
 
 
 
 
 
 
11
  """
12
  )
 
1
  import textwrap
2
  from typing import Final
3
 
4
+ # These system prompts are provided for reference only; parsers do not attach them automatically.
5
+
6
  MMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
7
  """\
8
+ You are an expert answering multiple-choice questions. Select the single most accurate answer (A, B, C, or D) based on factual knowledge. Respond with the letter only.
9
+ """
10
+ )
11
+
12
+ MMLU_PRO_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
13
+ """\
14
+ You are an expert answering multiple-choice questions. Select the single most accurate answer (A through J) based on factual knowledge. Respond with the letter only.
15
+ """
16
+ )
17
+
18
+ GSM8K_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
19
+ """\
20
+ Solve this math problem step by step:
21
+ 1) Show your reasoning
22
+ 2) End with "Therefore, the answer is [number]"
23
+ """
24
+ )
25
+
26
+ HUMANEVAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
27
+ """\
28
+ Implement the Python function following best practices. Include error handling, type hints, and comments for complex logic. Return only the implementation code.
29
+ """
30
+ )
31
+
32
+ MGSM_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
33
+ """\
34
+ Solve this math problem step by step in the specified language:
35
+ 1) Show your reasoning
36
+ 2) Use appropriate number formatting
37
+ 3) End with "Therefore, the answer is [number]"
38
+ """
39
+ )
40
+
41
+ IFEVAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
42
+ """\
43
+ Follow the given requirements exactly. Provide only the requested output.
44
+ """
45
+ )
46
+
47
+ BBH_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
48
+ """\
49
+ Solve this reasoning problem step by step.
50
+ """
51
+ )
52
+
53
+ MBPP_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
54
+ """\
55
+ Write clean, efficient Python code that solves the given task. Include docstrings and handle edge cases. Return only the implementation code.
56
+ """
57
+ )
58
 
59
+ TW_LEGAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
60
+ """\
61
+ As a Taiwan legal expert, select the most accurate answer (A, B, C, or D) based on Taiwan's laws. Respond with the letter only.
62
+ """
63
+ )
64
+
65
+ TMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
66
+ """\
67
+ Select the most accurate answer (A, B, C, or D) based on Taiwan's educational and professional knowledge. Respond with the letter only.
68
  """
69
  )
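
Because these system prompts are reference material rather than something the parsers apply automatically, callers are expected to combine them with a parsed entry themselves. A minimal sketch, assuming the parser API shown earlier in this diff and an OpenAI-style chat message format (the message structure is an assumption, not defined by this repository):

from llmdataparser.mmlu_parser import BaseMMLUDatasetParser
from llmdataparser.prompts import MMLU_SYSTEM_PROMPT

parser = BaseMMLUDatasetParser()
parser.load(task_name="anatomy")
parser.parse()

entry = parser.get_parsed_data[0]
messages = [
    {"role": "system", "content": MMLU_SYSTEM_PROMPT},
    {"role": "user", "content": entry.question},
]
# `messages` can now be sent to any chat-completion style API;
# the expected answer letter for scoring is `entry.answer`.
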
llmdataparser/tmlu_parser.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Final
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+ TMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
12
+ TMLU_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(TMLU_VALID_ANSWERS))
13
+
14
+
15
+ @dataclass(frozen=True, kw_only=True, slots=True)
16
+ class TMLUParseEntry(HuggingFaceParseEntry):
17
+ """Custom entry class for TMLU, with fields specific to this dataset parser."""
18
+
19
+ raw_choices: list[str]
20
+ explanation: str
21
+ metadata: dict[str, Any]
22
+
23
+ @classmethod
24
+ def create(
25
+ cls,
26
+ question: str,
27
+ answer: str,
28
+ raw_question: str,
29
+ raw_choices: list[str],
30
+ raw_answer: str,
31
+ task_name: str,
32
+ explanation: str = "",
33
+ metadata: dict[str, Any] | None = None,
34
+ ) -> "TMLUParseEntry":
35
+ if answer not in TMLU_VALID_ANSWERS:
36
+ raise ValueError(
37
+ f"Invalid answer_letter '{answer}'; must be one of {TMLU_VALID_ANSWER_STR}"
38
+ )
39
+ return cls(
40
+ question=question,
41
+ answer=answer,
42
+ raw_question=raw_question,
43
+ raw_answer=raw_answer,
44
+ raw_choices=raw_choices,
45
+ task_name=task_name,
46
+ explanation=explanation,
47
+ metadata=metadata or {},
48
+ )
49
+
50
+
51
+ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
52
+ """Parser for the TMLU dataset."""
53
+
54
+ _data_source = "miulab/tmlu"
55
+ _default_task = "AST_chinese"
56
+ _task_names = [
57
+ "AST_chinese",
58
+ "AST_mathematics",
59
+ "AST_biology",
60
+ "AST_chemistry",
61
+ "AST_physics",
62
+ "AST_civics",
63
+ "AST_geography",
64
+ "AST_history",
65
+ "GSAT_chinese",
66
+ "GSAT_chemistry",
67
+ "GSAT_biology",
68
+ "GSAT_physics",
69
+ "GSAT_earth_science",
70
+ "GSAT_mathematics",
71
+ "GSAT_geography",
72
+ "GSAT_history",
73
+ "GSAT_civics",
74
+ "CAP_mathematics",
75
+ "CAP_biology",
76
+ "CAP_physics",
77
+ "CAP_chemistry",
78
+ "CAP_earth_science",
79
+ "CAP_civics",
80
+ "CAP_history",
81
+ "CAP_geography",
82
+ "CAP_chinese",
83
+ "driving_rule",
84
+ "basic_traditional_chinese_medicine",
85
+ "clinical_traditional_chinese_medicine",
86
+ "lawyer_qualification",
87
+ "nutritionist",
88
+ "tour_leader",
89
+ "tour_guide",
90
+ "taiwan_tourist_resources",
91
+ "clinical_psychologist",
92
+ "teacher_qualification",
93
+ "accountant",
94
+ ]
95
+
96
+ def process_entry(
97
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
98
+ ) -> TMLUParseEntry:
99
+ """Process a single TMLU entry."""
100
+ task = task_name or self._get_current_task(row)
101
+ # Extract choices in order
102
+ raw_choices = [row["A"], row["B"], row["C"], row["D"]]
103
+ choices = "\n".join(
104
+ f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
105
+ )
106
+ raw_question = row["question"]
107
+ raw_answer = row["answer"]
108
+ explanation = row.get("explanation", "")
109
+ metadata = row.get("metadata", {})
110
+
111
+ question = f"Question: {raw_question}\n{choices}\nAnswer:"
112
+
113
+ return TMLUParseEntry.create(
114
+ question=question,
115
+ answer=raw_answer,
116
+ raw_question=raw_question,
117
+ raw_choices=raw_choices,
118
+ raw_answer=raw_answer,
119
+ task_name=task,
120
+ explanation=explanation,
121
+ metadata=metadata,
122
+ )
123
+
124
+ def get_dataset_description(self) -> DatasetDescription:
125
+ """Returns description of the TMLU dataset."""
126
+ return DatasetDescription.create(
127
+ name="Taiwan Multiple-choice Language Understanding (TMLU)",
128
+ language="Traditional Chinese",
129
+ purpose="Evaluate models on Taiwan-specific educational and professional knowledge",
130
+ source="Various Taiwan standardized tests and professional certifications",
131
+ category=["Taiwan", "General Knowledge and Reasoning"],
132
+ format="Multiple choice questions (A/B/C/D)",
133
+ characteristics=(
134
+ "Covers various subjects including Advanced Subjects Test (AST), "
135
+ "General Scholastic Ability Test (GSAT), College Admission Practice (CAP), "
136
+ "and professional certifications"
137
+ ),
138
+ citation="""@article{DBLP:journals/corr/abs-2403-20180,
139
+ author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
140
+ title = {Measuring Taiwanese Mandarin Language Understanding},
141
+ journal = {CoRR},
142
+ volume = {abs/2403.20180},
143
+ year = {2024},
144
+ url = {https://doi.org/10.48550/arXiv.2403.20180},
145
+ doi = {10.48550/ARXIV.2403.20180},
146
+ eprinttype = {arXiv},
147
+ eprint = {2403.20180},
148
+ timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
149
+ biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
150
+ bibsource = {dblp computer science bibliography, https://dblp.org}
151
+ }""",
152
+ )
153
+
154
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
155
+ """Returns recommended evaluation metrics for TMLU."""
156
+ return [
157
+ EvaluationMetric.create(
158
+ name="accuracy",
159
+ type="classification",
160
+ description="Overall percentage of correctly answered questions",
161
+ implementation="datasets.load_metric('accuracy')",
162
+ primary=True,
163
+ ),
164
+ EvaluationMetric.create(
165
+ name="per_subject_accuracy",
166
+ type="classification",
167
+ description="Accuracy broken down by subject areas (AST, GSAT, CAP, etc.)",
168
+ implementation="custom_subject_accuracy",
169
+ primary=True,
170
+ ),
171
+ ]
172
+
173
+
174
+ if __name__ == "__main__":
175
+ # Example usage
176
+ parser = TMLUDatasetParser()
177
+ parser.load()
178
+ parser.parse()
179
+
180
+ # Get parsed data with correct type
181
+ parsed_data = parser.get_parsed_data
182
+
183
+ # Print example entry
184
+ if parsed_data:
185
+ example = parsed_data[0]
186
+ print("\nExample parsed entry:")
187
+ print(f"Task: {example.task_name}")
188
+ print(f"Question: {example.question}")
189
+ print("Choices:")
190
+ for i, choice in enumerate(example.raw_choices):
191
+ print(f"{chr(65 + i)}. {choice}")
192
+ print(f"Correct Answer: {example.answer}")
193
+ if example.explanation:
194
+ print(f"Explanation: {example.explanation}")
195
+ print(f"Metadata: {example.metadata}")
llmdataparser/tw_legal_parser.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Final
3
+
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
+
11
+ TW_LEGAL_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
12
+ TW_LEGAL_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(TW_LEGAL_VALID_ANSWERS))
13
+
14
+
15
+ @dataclass(frozen=True, kw_only=True, slots=True)
16
+ class TWLegalParseEntry(HuggingFaceParseEntry):
17
+ """Custom entry class for Taiwan Legal Benchmark, with fields specific to this dataset parser."""
18
+
19
+ raw_choices: list[str]
20
+
21
+ @classmethod
22
+ def create(
23
+ cls,
24
+ question: str,
25
+ answer: str,
26
+ raw_question: str,
27
+ raw_choices: list[str],
28
+ raw_answer: str,
29
+ task_name: str,
30
+ ) -> "TWLegalParseEntry":
31
+ if answer not in TW_LEGAL_VALID_ANSWERS:
32
+ raise ValueError(
33
+ f"Invalid answer_letter '{answer}'; must be one of {TW_LEGAL_VALID_ANSWER_STR}"
34
+ )
35
+ return cls(
36
+ question=question,
37
+ answer=answer,
38
+ raw_question=raw_question,
39
+ raw_answer=raw_answer,
40
+ raw_choices=raw_choices,
41
+ task_name=task_name,
42
+ )
43
+
44
+
45
+ class TWLegalDatasetParser(HuggingFaceDatasetParser[TWLegalParseEntry]):
46
+ """Parser for the Taiwan Legal Benchmark dataset."""
47
+
48
+ _data_source = "lianghsun/tw-legal-benchmark-v1"
49
+ _default_task = "default"
50
+ _task_names = ["default"]
51
+
52
+ def process_entry(
53
+ self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
54
+ ) -> TWLegalParseEntry:
55
+ """Process a single Taiwan Legal Benchmark entry."""
56
+ # Extract choices in order
57
+ task = task_name or self._get_current_task(row)
58
+ raw_choices = [row["A"], row["B"], row["C"], row["D"]]
59
+ choices = "\n".join(
60
+ f"{chr(65 + i)}. {choice}" for i, choice in enumerate(raw_choices)
61
+ )
62
+ raw_question = row["question"]
63
+ raw_answer = row["answer"]
64
+
65
+ question = f"Question: {raw_question}\n{choices}\nAnswer:"
66
+
67
+ return TWLegalParseEntry.create(
68
+ question=question,
69
+ answer=raw_answer,
70
+ raw_question=raw_question,
71
+ raw_choices=raw_choices,
72
+ raw_answer=raw_answer,
73
+ task_name=task,
74
+ )
75
+
76
+ def get_dataset_description(self) -> DatasetDescription:
77
+ """Returns description of the Taiwan Legal Benchmark dataset."""
78
+ return DatasetDescription.create(
79
+ name="Taiwan Legal Benchmark",
80
+ language="Traditional Chinese",
81
+ purpose="Evaluate models on Taiwan-specific legal knowledge and understanding",
82
+ source="Taiwan Bar Examination questions",
83
+ category=["Taiwan", "General Knowledge and Reasoning", "Legal"],
84
+ format="Multiple choice questions (A/B/C/D)",
85
+ characteristics=(
86
+ "Contains questions from Taiwan's bar examination, testing understanding "
87
+ "of Taiwan's legal system, terminology, and concepts"
88
+ ),
89
+ citation="""
90
+ url={https://huggingface.co/datasets/lianghsun/tw-legal-benchmark-v1}
91
+ """,
92
+ )
93
+
94
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
95
+ """Returns recommended evaluation metrics for Taiwan Legal Benchmark."""
96
+ return [
97
+ EvaluationMetric.create(
98
+ name="accuracy",
99
+ type="classification",
100
+ description="Overall percentage of correctly answered legal questions",
101
+ implementation="datasets.load_metric('accuracy')",
102
+ primary=True,
103
+ ),
104
+ ]
105
+
106
+
107
+ if __name__ == "__main__":
108
+ # Example usage
109
+ parser = TWLegalDatasetParser()
110
+ parser.load()
111
+ parser.parse()
112
+
113
+ # Get parsed data with correct type
114
+ parsed_data = parser.get_parsed_data
115
+
116
+ # Print example entry
117
+ if parsed_data:
118
+ example = parsed_data[0]
119
+ print("\nExample parsed entry:")
120
+ print(f"Question: {example.question}")
121
+ print("Choices:")
122
+ for i, choice in enumerate(example.raw_choices):
123
+ print(f"{chr(65 + i)}. {choice}")
124
+ print(f"Correct Answer: {example.answer}")
125
+ print(f"Task Name: {example.task_name}")
mkdocs.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ site_name: LLMDataParser
2
+ theme:
3
+ name: material
4
+
5
+ nav:
6
+ - Home: index.md
7
+
8
+ plugins:
9
+ - search
nginx.conf ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ events {
2
+ worker_connections 1024;
3
+ }
4
+
5
+ http {
6
+ # Basic security settings
7
+ server_tokens off; # Don't show nginx version
8
+ client_max_body_size 10M; # Limit request size
9
+ client_body_timeout 12;
10
+ client_header_timeout 12;
11
+
12
+ upstream gradio_app {
13
+ server llmdataparser:7860;
14
+ keepalive 32;
15
+ }
16
+
17
+ server {
18
+ listen 80;
19
+ server_name localhost;
20
+
21
+ # Enhanced security headers
22
+ add_header X-Frame-Options "SAMEORIGIN" always;
23
+ add_header X-Content-Type-Options "nosniff" always;
24
+ add_header X-XSS-Protection "1; mode=block" always;
25
+ add_header Referrer-Policy "strict-origin-when-cross-origin" always;
26
+ add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';" always;
27
+ add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
28
+
29
+ location / {
30
+ proxy_pass http://gradio_app;
31
+ proxy_set_header Host $host;
32
+ proxy_set_header X-Real-IP $remote_addr;
33
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
34
+ proxy_set_header X-Forwarded-Proto $scheme;
35
+
36
+ # WebSocket support
37
+ proxy_http_version 1.1;
38
+ proxy_set_header Upgrade $http_upgrade;
39
+ proxy_set_header Connection "upgrade";
40
+
41
+ # Timeouts
42
+ proxy_connect_timeout 60s;
43
+ proxy_send_timeout 60s;
44
+ proxy_read_timeout 60s;
45
+
46
+ # Security
47
+ proxy_buffering on;
48
+ proxy_buffer_size 8k;
49
+ proxy_buffers 8 8k;
50
+ }
51
+
52
+ # Deny access to hidden files
53
+ location ~ /\. {
54
+ deny all;
55
+ return 404;
56
+ }
57
+ }
58
+ }
notebooks/demo.ipynb DELETED
@@ -1,77 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import pprint\n",
10
- "import random"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": null,
16
- "metadata": {},
17
- "outputs": [],
18
- "source": [
19
- "from llmdataparser import ParserRegistry\n",
20
- "ParserRegistry.list_parsers()"
21
- ]
22
- },
23
- {
24
- "cell_type": "code",
25
- "execution_count": null,
26
- "metadata": {},
27
- "outputs": [],
28
- "source": [
29
- "mmlu_parser = ParserRegistry.get_parser('mmlu')\n",
30
- "mmlu_parser.load()"
31
- ]
32
- },
33
- {
34
- "cell_type": "code",
35
- "execution_count": null,
36
- "metadata": {},
37
- "outputs": [],
38
- "source": [
39
- "mmlu_parser.parse(split_names=['dev', 'test'])\n",
40
- "parsed_data = mmlu_parser.get_parsed_data"
41
- ]
42
- },
43
- {
44
- "cell_type": "code",
45
- "execution_count": null,
46
- "metadata": {},
47
- "outputs": [],
48
- "source": [
49
- "index = random.randint(0, len(parsed_data))\n",
50
- "print(f\"Question: \\n-------------------\\n {parsed_data[index].prompt}\")\n",
51
- "print(\"-------------------\")\n",
52
- "print(f\"Answer: \\n-------------------\\n{parsed_data[index].answer_letter}\")"
53
- ]
54
- }
55
- ],
56
- "metadata": {
57
- "kernelspec": {
58
- "display_name": "llmdata",
59
- "language": "python",
60
- "name": "python3"
61
- },
62
- "language_info": {
63
- "codemirror_mode": {
64
- "name": "ipython",
65
- "version": 3
66
- },
67
- "file_extension": ".py",
68
- "mimetype": "text/x-python",
69
- "name": "python",
70
- "nbconvert_exporter": "python",
71
- "pygments_lexer": "ipython3",
72
- "version": "3.12.7"
73
- }
74
- },
75
- "nbformat": 4,
76
- "nbformat_minor": 2
77
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -1,6 +1,10 @@
 
 
 
 
1
  [tool.poetry]
2
  name = "llmdataparser"
3
- version = "0.1.0"
4
  description = "A collection of parsers for LLM benchmark datasets like MMLU, MMLU-Pro, GSM8k, and more."
5
  authors = ["Jeff Yang <[email protected]>"]
6
  license = "MIT"
@@ -16,45 +20,66 @@ classifiers = [
16
  "Intended Audience :: Developers"
17
  ]
18
 
 
 
 
 
 
 
 
19
  [tool.poetry.dependencies]
20
- python = ">=3.11"
21
  pandas = "^2.0.3"
22
- datasets = "^2.14.4"
 
23
  typing-extensions = "^4.8.0"
 
 
 
 
 
 
 
24
 
25
  [tool.poetry.group.dev.dependencies]
26
- pytest = "^7.4.0"
27
- black = { version = "^23.9.1", allow-prereleases = true }
28
- flake8 = "^6.1.0"
29
  isort = "^5.12.0"
30
  mypy = "^1.5.1"
31
  pre-commit = "^3.4.0"
32
  types-python-dateutil = "^2.8.19.14"
33
  ipykernel = "^6.7.0"
 
 
 
34
 
35
- [tool.black]
36
  line-length = 88
37
- target-version = ["py311"]
38
- exclude = """
39
- /(
40
- \\.git
41
- | \\.venv
42
- | build
43
- | dist
44
- )/
45
- """
46
 
47
  [tool.isort]
48
  profile = "black"
 
49
  line_length = 88
50
- known_first_party = ["llmdataparser"]
51
 
 
 
 
 
 
 
 
52
 
53
- [tool.ruff]
54
- line-length = 88
55
- select = ["E", "F"] # or specify checks explicitly without E501
56
- ignore = ["E501"]
57
 
58
- [build-system]
59
- requires = ["poetry-core>=1.5.0"]
60
- build-backend = "poetry.core.masonry.api"
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["poetry-core>=1.5.0"]
3
+ build-backend = "poetry.core.masonry.api"
4
+
5
  [tool.poetry]
6
  name = "llmdataparser"
7
+ version = "1.0.0"
8
  description = "A collection of parsers for LLM benchmark datasets like MMLU, MMLU-Pro, GSM8k, and more."
9
  authors = ["Jeff Yang <[email protected]>"]
10
  license = "MIT"
 
20
  "Intended Audience :: Developers"
21
  ]
22
 
23
+ packages = [
24
+ { include = "llmdataparser" }
25
+ ]
26
+
27
+ [tool.poetry.scripts]
28
+ start = "llmdataparser.app:main"
29
+
30
  [tool.poetry.dependencies]
31
+ python = ">=3.10"
32
  pandas = "^2.0.3"
33
+ datasets = "^3.2.0"
34
+ fsspec = "^2024.9.0"
35
  typing-extensions = "^4.8.0"
36
+ ipywidgets = "^8.1.1"
37
+ gradio = "^4.19.2"
38
+ pyyaml = "^6.0.1" # Add this for configuration handling
39
+ tqdm = "^4.66.1" # Add this for progress bars
40
+ numpy = "^1.24.0" # Add this for numerical operations
41
+ mkdocs = "^1.5.0"
42
+ mkdocs-material = "^9.5.0" # Optional but recommended for better documentation
43
 
44
  [tool.poetry.group.dev.dependencies]
45
+ pytest = "^7.0.0"
 
 
46
  isort = "^5.12.0"
47
  mypy = "^1.5.1"
48
  pre-commit = "^3.4.0"
49
  types-python-dateutil = "^2.8.19.14"
50
  ipykernel = "^6.7.0"
51
+ coverage = "^7.4.1"
52
+ pytest-cov = "^4.1.0"
53
+ evaluate = "^0.4.0"
54
 
55
+ [tool.ruff]
56
  line-length = 88
57
+
58
+ [tool.ruff.lint]
59
+ select = ["E", "F", "I"]
60
+ ignore = ["E501"]
61
+
 
 
 
 
62
 
63
  [tool.isort]
64
  profile = "black"
65
+ multi_line_output = 3
66
  line_length = 88
 
67
 
68
+ [tool.mypy]
69
+ python_version = "3.12"
70
+ warn_return_any = true
71
+ warn_unused_configs = true
72
+ exclude = ["tests/.*"]
73
+ ignore_missing_imports = true
74
+ follow_imports = "silent"
75
 
 
 
 
 
76
 
77
+
78
+ [tool.pytest.ini_options]
79
+ testpaths = ["tests"]
80
+ python_files = ["test_*.py"]
81
+ addopts = "-ra -q --cov=llmdataparser --cov-report=term-missing"
82
+
83
+ [tool.bandit]
84
+ exclude_dirs = ["tests"]
85
+ skips = ["B101"]
tests/test_bbh_parser.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from llmdataparser.bbh_parser import BBHDatasetParser, BBHParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def bbh_parser():
8
+ """Create a BBH parser instance for testing."""
9
+ return BBHDatasetParser()
10
+
11
+
12
+ @pytest.fixture
13
+ def loaded_bbh_parser(bbh_parser):
14
+ """Create and load a BBH parser instance for testing."""
15
+ bbh_parser.load(task_name="reasoning_about_colored_objects", split="test")
16
+ return bbh_parser
17
+
18
+
19
+ @pytest.fixture
20
+ def sample_row():
21
+ """Create a sample BBH data row for testing."""
22
+ return {
23
+ "input": "What color is the sky on a clear day?\nA) Blue\nB) Green\nC) Red\nD) Yellow",
24
+ "target": "(A)",
25
+ }
26
+
27
+
28
+ def test_bbh_parse_entry_creation_valid():
29
+ """Test valid creation of BBHParseEntry."""
30
+ entry = BBHParseEntry.create(
31
+ question="Test question",
32
+ answer="A",
33
+ raw_question="Test question",
34
+ raw_answer="(A)",
35
+ task_name="reasoning_about_colored_objects",
36
+ )
37
+ assert isinstance(entry, BBHParseEntry)
38
+ assert entry.question == "Test question"
39
+ assert entry.answer == "A"
40
+ assert entry.raw_question == "Test question"
41
+ assert entry.raw_answer == "(A)"
42
+ assert entry.task_name == "reasoning_about_colored_objects"
43
+
44
+
45
+ def test_bbh_parser_initialization(bbh_parser):
46
+ """Test BBH parser initialization."""
47
+ assert bbh_parser._data_source == "lukaemon/bbh"
48
+ assert bbh_parser._default_task == "reasoning_about_colored_objects"
49
+ assert "boolean_expressions" in bbh_parser._task_names
50
+ assert "word_sorting" in bbh_parser._task_names
51
+ assert (
52
+ bbh_parser.get_huggingface_link
53
+ == "https://huggingface.co/datasets/lukaemon/bbh"
54
+ )
55
+
56
+
57
+ def test_load_dataset(loaded_bbh_parser):
58
+ """Test loading the dataset."""
59
+ assert loaded_bbh_parser.raw_data is not None
60
+ assert loaded_bbh_parser.split_names == ["test"]
61
+ assert loaded_bbh_parser._current_task == "reasoning_about_colored_objects"
62
+
63
+
64
+ @pytest.mark.integration
65
+ def test_full_parse_workflow(loaded_bbh_parser):
66
+ """Test the complete workflow of loading and parsing data."""
67
+ # Parse the test split
68
+ loaded_bbh_parser.parse(split_names="test", force=True)
69
+ parsed_data = loaded_bbh_parser.get_parsed_data
70
+
71
+ # Basic checks
72
+ assert len(parsed_data) > 0
73
+
74
+ # Check first entry structure
75
+ first_entry = parsed_data[0]
76
+ assert isinstance(first_entry, BBHParseEntry)
77
+ assert first_entry.task_name == "reasoning_about_colored_objects"
78
+ assert first_entry.answer.strip("()").isalpha() # Should be a single letter
79
+
80
+
81
+ def test_process_entry(bbh_parser, sample_row):
82
+ """Test processing of a single BBH entry."""
83
+ entry = bbh_parser.process_entry(
84
+ sample_row, task_name="reasoning_about_colored_objects"
85
+ )
86
+
87
+ assert isinstance(entry, BBHParseEntry)
88
+ assert entry.answer == "A" # Stripped from "(A)"
89
+ assert "What color is the sky" in entry.question
90
+ assert entry.raw_answer == "(A)"
91
+ assert entry.task_name == "reasoning_about_colored_objects"
92
+
93
+
94
+ @pytest.mark.parametrize("split_name", ["invalid_split", "wrong_split"])
95
+ def test_parse_with_invalid_split(bbh_parser, split_name):
96
+ """Test parsing with invalid split names."""
97
+ bbh_parser.raw_data = {"train": [], "test": []} # Mock data
98
+
99
+ with pytest.raises(
100
+ ValueError, match=f"Split '{split_name}' not found in the dataset"
101
+ ):
102
+ bbh_parser.parse(split_name)
103
+
104
+
105
+ def test_parse_without_loaded_data(bbh_parser):
106
+ """Test parsing without loading data first."""
107
+ with pytest.raises(
108
+ ValueError, match="No data loaded. Please load the dataset first"
109
+ ):
110
+ bbh_parser.parse()
111
+
112
+
113
+ @pytest.mark.parametrize(
114
+ "test_case",
115
+ [
116
+ {"input": "Test question", "target": "(A)"},
117
+ {"input": "Test question", "target": "(B)"},
118
+ {"input": "Test question", "target": "(C)"},
119
+ ],
120
+ )
121
+ def test_answer_stripping(bbh_parser, test_case):
122
+ """Test stripping of parentheses from answers."""
123
+ entry = bbh_parser.process_entry(
124
+ test_case, task_name="reasoning_about_colored_objects"
125
+ )
126
+ assert entry.answer == test_case["target"].strip("()")
127
+ assert entry.raw_answer == test_case["target"]
128
+
129
+
130
+ def test_parser_properties(bbh_parser):
131
+ """Test parser property getters."""
132
+ assert len(bbh_parser.task_names) > 0
133
+ assert bbh_parser.total_tasks == len(bbh_parser._task_names)
134
+ assert all(isinstance(task, str) for task in bbh_parser.task_names)
135
+
136
+
137
+ def test_parser_string_representation(loaded_bbh_parser):
138
+ """Test string representation of parser."""
139
+ repr_str = str(loaded_bbh_parser)
140
+ assert "BBHDatasetParser" in repr_str
141
+ assert "lukaemon/bbh" in repr_str
142
+ assert "reasoning_about_colored_objects" in repr_str
143
+ assert "loaded" in repr_str
144
+
145
+
146
+ @pytest.mark.integration
147
+ @pytest.mark.parametrize(
148
+ "task_name", ["boolean_expressions", "causal_judgement", "date_understanding"]
149
+ )
150
+ def test_different_tasks_parsing(bbh_parser, task_name):
151
+ """Test parsing different tasks of the dataset."""
152
+ bbh_parser.load(task_name=task_name, split="test")
153
+ bbh_parser.parse(split_names="test", force=True)
154
+ parsed_data = bbh_parser.get_parsed_data
155
+
156
+ assert len(parsed_data) > 0
157
+ assert all(entry.task_name == task_name for entry in parsed_data)
158
+ assert all(isinstance(entry.answer, str) for entry in parsed_data)
159
+
160
+
161
+ def test_get_dataset_description(bbh_parser):
162
+ """Test dataset description generation."""
163
+ description = bbh_parser.get_dataset_description()
164
+
165
+ assert description.name == "Big Bench Hard (BBH)"
166
+ assert description.language == "English"
167
+ assert description.format == "Multiple choice questions with single correct answers"
168
+ assert "suzgun2022challenging" in description.citation
169
+
170
+
171
+ def test_get_evaluation_metrics(bbh_parser):
172
+ """Test evaluation metrics generation."""
173
+ metrics = bbh_parser.get_evaluation_metrics()
174
+
175
+ assert len(metrics) == 4 # Check total number of metrics
176
+
177
+ # Check primary metrics
178
+ primary_metrics = [m for m in metrics if m.primary]
179
+ assert len(primary_metrics) == 2
180
+ assert any(m.name == "accuracy" for m in primary_metrics)
181
+ assert any(m.name == "human_eval_delta" for m in primary_metrics)
182
+
183
+ # Check specific metric properties
184
+ accuracy_metric = next(m for m in metrics if m.name == "accuracy")
185
+ assert accuracy_metric.type == "classification"
186
+ assert "evaluate.load('accuracy')" in accuracy_metric.implementation
187
+
188
+ # Check non-primary metrics
189
+ assert any(m.name == "per_task_accuracy" and not m.primary for m in metrics)
190
+ assert any(m.name == "exact_match" and not m.primary for m in metrics)
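The integration tests above walk the full load, parse, and inspect cycle for BBH. The sketch below shows that same workflow outside of pytest; it assumes the parser is importable as `llmdataparser.bbh_parser.BBHDatasetParser`, mirroring the module layout of the other parsers in this PR, and that the HuggingFace dataset is reachable.

```python
# Minimal sketch of the workflow the BBH integration tests exercise.
# Assumption: the class lives in llmdataparser.bbh_parser, following the
# same naming pattern as the other parser modules added in this PR.
from llmdataparser.bbh_parser import BBHDatasetParser

parser = BBHDatasetParser()
parser.load(task_name="reasoning_about_colored_objects", split="test")
parser.parse(split_names="test", force=True)

for entry in parser.get_parsed_data[:3]:
    # Raw answers look like "(A)"; the parsed answer is the bare letter "A"
    # (see test_answer_stripping above).
    print(entry.task_name, entry.answer, entry.raw_answer)
```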
tests/test_gsm8k_parser.py ADDED
@@ -0,0 +1,207 @@
1
+ import pytest
2
+
3
+ from llmdataparser.gsm8k_parser import GSM8KDatasetParser, GSM8KParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def gsm8k_parser():
8
+ """Create a GSM8K parser instance for testing."""
9
+ return GSM8KDatasetParser()
10
+
11
+
12
+ @pytest.fixture
13
+ def loaded_gsm8k_parser(gsm8k_parser):
14
+ """Create and load a GSM8K parser instance for testing."""
15
+ gsm8k_parser.load(
16
+ task_name="main", split="test"
17
+ ) # Using test split as it's smaller
18
+ return gsm8k_parser
19
+
20
+
21
+ @pytest.fixture
22
+ def sample_row():
23
+ """Create a sample GSM8K data row for testing."""
24
+ return {
25
+ "question": "Janet has 3 apples. She buys 2 more. How many apples does she have now?",
26
+ "answer": "Let's solve this step by step:\n1) Initially, Janet has 3 apples\n2) She buys 2 more apples\n3) Total apples = 3 + 2\n#### 5",
27
+ }
28
+
29
+
30
+ def test_gsm8k_parse_entry_creation_valid():
31
+ """Test valid creation of GSM8KParseEntry."""
32
+ entry = GSM8KParseEntry.create(
33
+ question="Test question",
34
+ answer="5",
35
+ raw_question="Test question",
36
+ raw_answer="Solution steps #### 5",
37
+ solution="Solution steps",
38
+ task_name="main",
39
+ numerical_answer=5,
40
+ )
41
+ assert isinstance(entry, GSM8KParseEntry)
42
+ assert entry.question == "Test question"
43
+ assert entry.answer == "5"
44
+ assert entry.solution == "Solution steps"
45
+ assert entry.numerical_answer == 5
46
+ assert entry.task_name == "main"
47
+
48
+
49
+ def test_gsm8k_parser_initialization(gsm8k_parser):
50
+ """Test GSM8K parser initialization."""
51
+ assert gsm8k_parser._data_source == "openai/gsm8k"
52
+ assert gsm8k_parser._default_task == "main"
53
+ assert gsm8k_parser._task_names == ["main", "socratic"]
54
+ assert (
55
+ gsm8k_parser.get_huggingface_link
56
+ == "https://huggingface.co/datasets/openai/gsm8k"
57
+ )
58
+
59
+
60
+ def test_load_dataset(loaded_gsm8k_parser):
61
+ """Test loading the dataset."""
62
+ assert loaded_gsm8k_parser.raw_data is not None
63
+ assert loaded_gsm8k_parser.split_names == [
64
+ "test"
65
+ ] # Since we specifically loaded the test split
66
+ assert loaded_gsm8k_parser._current_task == "main"
67
+
68
+
69
+ @pytest.mark.integration
70
+ def test_full_parse_workflow(loaded_gsm8k_parser):
71
+ """Test the complete workflow of loading and parsing data."""
72
+ # Parse the test split
73
+ loaded_gsm8k_parser.parse(split_names="test", force=True)
74
+ parsed_data = loaded_gsm8k_parser.get_parsed_data
75
+
76
+ # Basic checks
77
+ assert len(parsed_data) > 0
78
+
79
+ # Check first entry structure
80
+ first_entry = parsed_data[0]
81
+ assert isinstance(first_entry, GSM8KParseEntry)
82
+ assert first_entry.task_name == "main"
83
+ assert isinstance(first_entry.numerical_answer, (str, int, float))
84
+ assert "####" in first_entry.raw_answer
85
+ assert first_entry.solution
86
+
87
+
88
+ def test_process_entry(gsm8k_parser, sample_row):
89
+ """Test processing of a single GSM8K entry."""
90
+ entry = gsm8k_parser.process_entry(sample_row, task_name="main")
91
+
92
+ assert isinstance(entry, GSM8KParseEntry)
93
+ assert entry.numerical_answer == 5
94
+ assert "Janet has 3 apples" in entry.raw_question
95
+ assert "#### 5" in entry.raw_answer
96
+ assert "Let's solve this step by step:" in entry.solution
97
+ assert entry.task_name == "main"
98
+
99
+
100
+ @pytest.mark.parametrize("split_name", ["invalid_split", "wrong_split"])
101
+ def test_parse_with_invalid_split(gsm8k_parser, split_name):
102
+ """Test parsing with invalid split names."""
103
+ gsm8k_parser.raw_data = {"train": [], "test": []} # Mock data
104
+
105
+ with pytest.raises(
106
+ ValueError, match=f"Split '{split_name}' not found in the dataset"
107
+ ):
108
+ gsm8k_parser.parse(split_name)
109
+
110
+
111
+ def test_parse_without_loaded_data(gsm8k_parser):
112
+ """Test parsing without loading data first."""
113
+ with pytest.raises(
114
+ ValueError, match="No data loaded. Please load the dataset first"
115
+ ):
116
+ gsm8k_parser.parse()
117
+
118
+
119
+ @pytest.mark.parametrize(
120
+ "test_case",
121
+ [
122
+ {"question": "Test question", "answer": "Some solution steps #### 42"},
123
+ {
124
+ "question": "Test question",
125
+ "answer": "Complex solution\nWith multiple lines\n#### 123.45",
126
+ },
127
+ {"question": "Test question", "answer": "No steps #### 0"},
128
+ ],
129
+ )
130
+ def test_numerical_answer_extraction(gsm8k_parser, test_case):
131
+ """Test extraction of numerical answers from different formats."""
132
+ entry = gsm8k_parser.process_entry(test_case, task_name="main")
133
+ assert str(entry.numerical_answer) == test_case["answer"].split("####")[
134
+ -1
135
+ ].strip().replace(",", "")
136
+
137
+
138
+ def test_solution_extraction(gsm8k_parser):
139
+ """Test extraction of solution steps."""
140
+ row = {
141
+ "question": "Test question",
142
+ "answer": "Step 1: Do this\nStep 2: Do that\n#### 42",
143
+ }
144
+
145
+ entry = gsm8k_parser.process_entry(row, task_name="main")
146
+ assert entry.solution == "Step 1: Do this\nStep 2: Do that"
147
+ assert entry.task_name == "main"
148
+ assert "####" not in entry.solution
149
+
150
+
151
+ def test_parser_properties(gsm8k_parser):
152
+ """Test parser property getters."""
153
+ assert gsm8k_parser.task_names == ["main", "socratic"]
154
+ assert gsm8k_parser.total_tasks == 2
155
+
156
+
157
+ def test_parser_string_representation(loaded_gsm8k_parser):
158
+ """Test string representation of parser."""
159
+ repr_str = str(loaded_gsm8k_parser)
160
+ assert "GSM8KDatasetParser" in repr_str
161
+ assert "openai/gsm8k" in repr_str
162
+ assert "main" in repr_str
163
+ assert "loaded" in repr_str
164
+
165
+
166
+ @pytest.mark.integration
167
+ def test_different_splits_parsing(gsm8k_parser):
168
+ """Test parsing different splits of the dataset."""
169
+ # Load and parse test split
170
+ gsm8k_parser.load(task_name="main", split="test")
171
+ gsm8k_parser.parse(split_names="test", force=True)
172
+ test_count = len(gsm8k_parser.get_parsed_data)
173
+
174
+ # Load and parse train split
175
+ gsm8k_parser.load(task_name="main", split="train")
176
+ gsm8k_parser.parse(split_names="train", force=True)
177
+ train_count = len(gsm8k_parser.get_parsed_data)
178
+
179
+ assert test_count > 0
180
+ assert train_count > 0
181
+ assert train_count != test_count
182
+
183
+
184
+ def test_get_dataset_description(gsm8k_parser):
185
+ """Test dataset description generation."""
186
+ description = gsm8k_parser.get_dataset_description()
187
+
188
+ assert description.name == "Grade School Math 8K (GSM8K)"
189
+ assert description.source == "OpenAI"
190
+ assert description.language == "English"
191
+ assert "Cobbe" in description.citation
192
+
193
+
194
+ def test_get_evaluation_metrics(gsm8k_parser):
195
+ """Test evaluation metrics specification."""
196
+ metrics = gsm8k_parser.get_evaluation_metrics()
197
+
198
+ # Check we have all expected metrics
199
+ metric_names = {metric.name for metric in metrics}
200
+ expected_names = {"exact_match", "solution_validity", "step_accuracy", "step_count"}
201
+ assert metric_names == expected_names
202
+
203
+ # Check exact_match metric details
204
+ exact_match = next(m for m in metrics if m.name == "exact_match")
205
+ assert exact_match.type == "string"
206
+ assert exact_match.primary is True
207
+ assert "exact match" in exact_match.description.lower()
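`test_numerical_answer_extraction` above leans on the GSM8K answer convention: the reference answer ends with `#### <number>` and any thousands separators are dropped. The helper below is a standalone illustration of that convention only, not the parser's implementation.

```python
# Illustration of the "#### <number>" convention the GSM8K tests assert.
def split_gsm8k_answer(raw_answer: str) -> tuple[str, str]:
    """Return (solution_text, numerical_answer_string) for a GSM8K answer."""
    solution, _, tail = raw_answer.rpartition("####")
    return solution.strip(), tail.strip().replace(",", "")


solution, number = split_gsm8k_answer("1) 3 + 2 = 5\n#### 5")
assert solution == "1) 3 + 2 = 5"
assert number == "5"
```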
tests/test_humaneval_parser.py ADDED
@@ -0,0 +1,198 @@
1
+ import pytest
2
+
3
+ from llmdataparser.humaneval_parser import (
4
+ HumanEvalDatasetParser,
5
+ HumanEvalDatasetPlusParser,
6
+ HumanEvalParseEntry,
7
+ )
8
+
9
+
10
+ @pytest.fixture
11
+ def sample_entry():
12
+ return {
13
+ "prompt": 'def add(a, b):\n """Add two numbers."""\n',
14
+ "canonical_solution": "def add(a, b):\n return a + b\n",
15
+ "task_id": "HumanEval/0",
16
+ "entry_point": "add",
17
+ "test": "def test_add(): assert add(2, 3) == 5",
18
+ }
19
+
20
+
21
+ @pytest.fixture
22
+ def parser():
23
+ return HumanEvalDatasetParser()
24
+
25
+
26
+ @pytest.fixture
27
+ def plus_parser():
28
+ return HumanEvalDatasetPlusParser()
29
+
30
+
31
+ @pytest.fixture
32
+ def plus_sample_entry():
33
+ return {
34
+ "prompt": 'def add(a, b):\n """Add two numbers."""\n',
35
+ "canonical_solution": "def add(a, b):\n return a + b\n",
36
+ "task_id": "HumanEval/0",
37
+ "entry_point": "add",
38
+ "test": "def test_add(): assert add(2, 3) == 5",
39
+ }
40
+
41
+
42
+ def test_humaneval_parse_entry_creation():
43
+ """Test creation of HumanEvalParseEntry"""
44
+ entry = HumanEvalParseEntry.create(
45
+ question="test question",
46
+ answer="test answer",
47
+ raw_question="raw question",
48
+ task_id="HumanEval/1",
49
+ entry_point="test_func",
50
+ test="test case",
51
+ task_name="openai_humaneval",
52
+ )
53
+
54
+ assert entry.question == "test question"
55
+ assert entry.answer == "test answer"
56
+ assert entry.raw_question == "raw question"
57
+ assert entry.raw_answer == "test answer" # Should match answer
58
+ assert entry.task_id == "HumanEval/1"
59
+ assert entry.entry_point == "test_func"
60
+ assert entry.test == "test case"
61
+ assert entry.task_name == "openai_humaneval"
62
+
63
+
64
+ def test_humaneval_parse_entry_validation():
65
+ """Test validation of required fields"""
66
+ with pytest.raises(ValueError, match="Task ID cannot be empty"):
67
+ HumanEvalParseEntry.create(
68
+ question="test",
69
+ answer="test",
70
+ raw_question="test",
71
+ task_id="", # Empty task_id should raise error
72
+ entry_point="test",
73
+ test="test",
74
+ task_name="test",
75
+ )
76
+
77
+ with pytest.raises(ValueError, match="Entry point cannot be empty"):
78
+ HumanEvalParseEntry.create(
79
+ question="test",
80
+ answer="test",
81
+ raw_question="test",
82
+ task_id="test",
83
+ entry_point="", # Empty entry_point should raise error
84
+ test="test",
85
+ task_name="test",
86
+ )
87
+
88
+
89
+ def test_process_entry(parser, sample_entry):
90
+ """Test processing of a single entry"""
91
+ result = parser.process_entry(sample_entry, task_name="openai_humaneval")
92
+
93
+ assert isinstance(result, HumanEvalParseEntry)
94
+ assert result.task_id == "HumanEval/0"
95
+ assert result.entry_point == "add"
96
+
97
+ assert result.answer == sample_entry["canonical_solution"]
98
+ assert result.test == sample_entry["test"]
99
+ assert result.task_name == "openai_humaneval"
100
+
101
+
102
+ def test_parser_initialization(parser):
103
+ """Test parser initialization and properties"""
104
+ assert parser._data_source == "openai/openai_humaneval"
105
+ assert parser._default_task == "openai_humaneval"
106
+ assert parser._task_names == ["openai_humaneval"]
107
+ assert (
108
+ parser.get_huggingface_link
109
+ == "https://huggingface.co/datasets/openai/openai_humaneval"
110
+ )
111
+
112
+
113
+ @pytest.mark.integration
114
+ def test_parser_load_and_parse(parser):
115
+ """Integration test for loading and parsing data"""
116
+ parser.load()
117
+ parser.parse()
118
+ parsed_data = parser.get_parsed_data
119
+
120
+ assert len(parsed_data) > 0
121
+ assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)
122
+
123
+
124
+ def test_get_current_task(parser, sample_entry):
125
+ """Test _get_current_task method"""
126
+ task = parser._get_current_task(sample_entry)
127
+ assert task == parser._default_task
128
+
129
+
130
+ def test_plus_parser_initialization(plus_parser):
131
+ """Test HumanEvalDatasetPlusParser initialization and properties"""
132
+ assert plus_parser._data_source == "evalplus/humanevalplus"
133
+ assert plus_parser._default_task == "default"
134
+ assert plus_parser._task_names == ["default"]
135
+ assert (
136
+ plus_parser.get_huggingface_link
137
+ == "https://huggingface.co/datasets/evalplus/humanevalplus"
138
+ )
139
+
140
+
141
+ def test_plus_process_entry(plus_parser, plus_sample_entry):
142
+ """Test processing of a single entry in HumanEvalDatasetPlusParser"""
143
+ result = plus_parser.process_entry(plus_sample_entry, task_name="default")
144
+
145
+ assert isinstance(result, HumanEvalParseEntry)
146
+ assert result.task_id == "HumanEval/0"
147
+ assert result.entry_point == "add"
148
+
149
+ assert result.answer == plus_sample_entry["canonical_solution"]
150
+ assert result.test == plus_sample_entry["test"]
151
+ assert result.task_name == "default"
152
+
153
+
154
+ @pytest.mark.integration
155
+ def test_plus_parser_load_and_parse(plus_parser):
156
+ """Integration test for loading and parsing data with HumanEvalDatasetPlusParser"""
157
+ plus_parser.load()
158
+ plus_parser.parse()
159
+ parsed_data = plus_parser.get_parsed_data
160
+
161
+ assert len(parsed_data) > 0
162
+ assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)
163
+
164
+
165
+ def test_plus_get_current_task(plus_parser, plus_sample_entry):
166
+ """Test _get_current_task method for HumanEvalDatasetPlusParser"""
167
+ task = plus_parser._get_current_task(plus_sample_entry)
168
+ assert task == plus_parser._default_task
169
+
170
+
171
+ def test_get_dataset_description(parser, plus_parser):
172
+ """Test dataset description generation for both parsers."""
173
+ # Test original HumanEval description
174
+ description = parser.get_dataset_description()
175
+ assert description.name == "HumanEval"
176
+ assert "code generation" in description.purpose
177
+ assert description.language == "Python"
178
+ assert "chen2021codex" in description.citation
179
+
180
+ # Test HumanEval Plus description
181
+ plus_description = plus_parser.get_dataset_description()
182
+ assert plus_description.name == "HumanEval Plus"
183
+ assert "80x more test coverage" in plus_description.purpose
184
+ assert "comprehensive test suites" in plus_description.format
185
+ assert "edge cases" in plus_description.characteristics
186
+ assert "evalplus" in plus_description.citation
187
+
188
+
189
+ def test_get_evaluation_metrics(parser):
190
+ """Test evaluation metrics generation for both parsers."""
191
+ # Test original HumanEval metrics
192
+ metrics = parser.get_evaluation_metrics()
193
+ assert len(metrics) == 5 # Base metrics + 2 specific metrics
194
+
195
+ # Check primary metrics - update to match actual implementation
196
+ primary_metrics = [m for m in metrics if m.primary]
197
+ assert len(primary_metrics) == 1 # pass@k
198
+ assert any(m.name == "pass@k" for m in primary_metrics)
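The assertions above pin `pass@k` as the single primary HumanEval metric. For reference, the standard unbiased estimator from the HumanEval paper (Chen et al., 2021) is sketched below; the package's own implementation may differ in detail.

```python
# Unbiased pass@k estimator (Chen et al., 2021): probability that at least
# one of k sampled completions passes, given n samples with c correct.
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# Example: 10 samples, 3 of them correct, k = 1 -> 0.3
assert abs(pass_at_k(10, 3, 1) - 0.3) < 1e-9
```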
tests/test_ifeval_parser.py ADDED
@@ -0,0 +1,120 @@
1
+ import pytest
2
+
3
+ from llmdataparser.ifeval_parser import IFEvalDatasetParser, IFEvalParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def sample_ifeval_entries():
8
+ """Create sample IFEval dataset entries for testing."""
9
+ return [
10
+ {
11
+ "key": 1,
12
+ "prompt": "Write a function to calculate factorial.",
13
+ "instruction_id_list": ["math_001", "programming_001"],
14
+ "kwargs": {"difficulty": "medium", "category": "mathematics"},
15
+ },
16
+ {
17
+ "key": 2,
18
+ "prompt": "Explain quantum computing.",
19
+ "instruction_id_list": ["physics_001"],
20
+ "kwargs": {"difficulty": "hard", "category": "physics"},
21
+ },
22
+ ]
23
+
24
+
25
+ @pytest.fixture
26
+ def ifeval_parser():
27
+ """Create an IFEval parser instance."""
28
+ return IFEvalDatasetParser()
29
+
30
+
31
+ def test_ifeval_parse_entry_creation_valid():
32
+ """Test valid creation of IFEvalParseEntry."""
33
+ entry = IFEvalParseEntry.create(
34
+ question="Test instruction",
35
+ answer="", # IFEval doesn't have answers
36
+ raw_question="Test instruction",
37
+ raw_answer="",
38
+ key=1,
39
+ instruction_id_list=["test_001", "test_002"],
40
+ kwargs={"difficulty": "easy"},
41
+ task_name="default",
42
+ )
43
+
44
+ assert isinstance(entry, IFEvalParseEntry)
45
+ assert entry.question == "Test instruction"
46
+ assert entry.answer == ""
47
+ assert entry.key == 1
48
+ assert entry.instruction_id_list == ["test_001", "test_002"]
49
+ assert entry.kwargs == {"difficulty": "easy"}
50
+ assert entry.task_name == "default"
51
+
52
+
53
+ def test_process_entry_ifeval(ifeval_parser, sample_ifeval_entries):
54
+ """Test processing entries in IFEval parser."""
55
+ entry = ifeval_parser.process_entry(sample_ifeval_entries[0])
56
+
57
+ assert isinstance(entry, IFEvalParseEntry)
58
+ assert entry.key == 1
59
+ assert entry.instruction_id_list == ["math_001", "programming_001"]
60
+ assert entry.kwargs == {"difficulty": "medium", "category": "mathematics"}
61
+ assert entry.raw_question == "Write a function to calculate factorial."
62
+ assert entry.answer == "" # IFEval doesn't have answers
63
+ assert entry.task_name == "default"
64
+
65
+
66
+ def test_parser_initialization(ifeval_parser):
67
+ """Test initialization of IFEval parser."""
68
+ assert ifeval_parser._data_source == "google/IFEval"
69
+ assert ifeval_parser._default_task == "default"
70
+ assert ifeval_parser.task_names == ["default"]
71
+ assert (
72
+ ifeval_parser.get_huggingface_link
73
+ == "https://huggingface.co/datasets/google/IFEval"
74
+ )
75
+
76
+
77
+ @pytest.mark.integration
78
+ def test_load_dataset(ifeval_parser):
79
+ """Test loading the IFEval dataset."""
80
+ ifeval_parser.load(split="train")
81
+ assert ifeval_parser.raw_data is not None
82
+ assert ifeval_parser.split_names == ["train"]
83
+ assert ifeval_parser._current_task == "default"
84
+
85
+
86
+ def test_parser_string_representation(ifeval_parser):
87
+ """Test string representation of IFEval parser."""
88
+ repr_str = str(ifeval_parser)
89
+ assert "IFEvalDatasetParser" in repr_str
90
+ assert "google/IFEval" in repr_str
91
+ assert "not loaded" in repr_str
92
+
93
+
94
+ def test_get_dataset_description(ifeval_parser):
95
+ """Test dataset description generation for IFEval."""
96
+ description = ifeval_parser.get_dataset_description()
97
+
98
+ assert description.name == "IFEval"
99
+ assert description.source == "Google Research"
100
+ assert description.language == "English (BCP-47 en)"
101
+
102
+
103
+ def test_get_evaluation_metrics(ifeval_parser):
104
+ """Test evaluation metrics generation for IFEval."""
105
+ metrics = ifeval_parser.get_evaluation_metrics()
106
+
107
+ # Should have 5 metrics total
108
+ assert len(metrics) == 5
109
+
110
+ # Check primary metrics
111
+ primary_metrics = [m for m in metrics if m.primary]
112
+ assert len(primary_metrics) == 3
113
+
114
+ # Verify specific metrics exist and have correct properties
115
+ metric_names = {m.name for m in metrics}
116
+ assert "format_compliance" in metric_names
117
+ assert "length_constraints" in metric_names
118
+ assert "punctuation_rules" in metric_names
119
+ assert "keyword_usage" in metric_names
120
+ assert "structural_requirements" in metric_names
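Because IFEval prompts ship constraint metadata instead of reference answers, processing an entry mostly preserves `instruction_id_list` and `kwargs` while leaving `answer` empty. A minimal sketch, using a hypothetical record shaped like the fixtures above:

```python
# Sketch of processing one IFEval record. The record below is hypothetical
# and only mirrors the schema used by the fixtures in this test file.
from llmdataparser.ifeval_parser import IFEvalDatasetParser

parser = IFEvalDatasetParser()
record = {
    "key": 1,
    "prompt": "Write a short poem without using the letter e.",
    "instruction_id_list": ["keywords:forbidden_words"],  # hypothetical id
    "kwargs": {"forbidden_words": ["e"]},  # hypothetical constraint payload
}

entry = parser.process_entry(record)
# IFEval has no gold answers; compliance is judged against the constraints.
assert entry.answer == ""
assert entry.instruction_id_list == ["keywords:forbidden_words"]
```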
tests/test_math_parser.py ADDED
@@ -0,0 +1,253 @@
1
+ import pytest
2
+
3
+ from llmdataparser.math_parser import MATHDatasetParser, MATHParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def math_parser():
8
+ """Create a MATH parser instance for testing."""
9
+ return MATHDatasetParser()
10
+
11
+
12
+ @pytest.fixture
13
+ def loaded_math_parser(math_parser):
14
+ """Create and load a MATH parser instance with test split."""
15
+ math_parser.load(task_name="algebra", split="test")
16
+ return math_parser
17
+
18
+
19
+ @pytest.fixture
20
+ def sample_math_entries():
21
+ """Create sample MATH dataset entries for testing."""
22
+ return [
23
+ {
24
+ "problem": "Solve for x: 2x + 4 = 10",
25
+ "level": "Level 3",
26
+ "solution": "Let's solve step by step:\n1) Subtract 4 from both sides: 2x = 6\n2) Divide both sides by 2\n\nTherefore, x = 3",
27
+ "type": "algebra",
28
+ },
29
+ {
30
+ "problem": "Find the area of a circle with radius 5 units.",
31
+ "level": "Level 2",
32
+ "solution": "Area = πr²\nArea = π(5)²\nArea = 25π square units",
33
+ "type": "geometry",
34
+ },
35
+ {
36
+ "problem": "What is the limit of (x²-1)/(x-1) as x approaches 1?",
37
+ "level": "Level 4",
38
+ "solution": "Using L'Hôpital's rule:\nlim(x→1) (x²-1)/(x-1) = lim(x→1) (2x)/(1) = 2",
39
+ "type": "calculus",
40
+ },
41
+ ]
42
+
43
+
44
+ def test_math_parse_entry_creation_valid():
45
+ """Test valid creation of MATHParseEntry with all fields."""
46
+ entry = MATHParseEntry.create(
47
+ question="Test question",
48
+ answer="Test answer",
49
+ raw_question="Test question",
50
+ raw_answer="Test solution",
51
+ level="Level 5",
52
+ task_name="algebra",
53
+ solution="Test solution",
54
+ )
55
+
56
+ assert isinstance(entry, MATHParseEntry)
57
+ assert entry.question == "Test question"
58
+ assert entry.answer == "Test answer"
59
+ assert entry.raw_question == "Test question"
60
+ assert entry.raw_answer == "Test solution"
61
+ assert entry.level == "Level 5"
62
+ assert entry.task_name == "algebra"
63
+ assert entry.solution == "Test solution"
64
+
65
+
66
+ @pytest.mark.parametrize(
67
+ "test_case",
68
+ [
69
+ {
70
+ "problem": "Solve for x: 2x + 4 = 10",
71
+ "level": "Level 3",
72
+ "solution": "x = 3",
73
+ "type": "algebra",
74
+ },
75
+ {
76
+ "problem": "Find the derivative of f(x) = x²",
77
+ "level": "Level 4",
78
+ "solution": "f'(x) = 2x",
79
+ "type": "calculus",
80
+ },
81
+ ],
82
+ )
83
+ def test_process_entry(math_parser, test_case):
84
+ """Test processing different types of MATH entries."""
85
+ entry = math_parser.process_entry(test_case, task_name=test_case["type"])
86
+
87
+ assert isinstance(entry, MATHParseEntry)
88
+
89
+ assert entry.answer == test_case["solution"]
90
+ assert entry.raw_question == test_case["problem"]
91
+ assert entry.raw_answer == test_case["solution"]
92
+ assert entry.level == test_case["level"]
93
+ assert entry.task_name == test_case["type"]
94
+ assert entry.solution == test_case["solution"]
95
+
96
+
97
+ def test_math_parser_initialization(math_parser):
98
+ """Test MATH parser initialization and properties."""
99
+ assert isinstance(math_parser.task_names, list)
100
+ assert len(math_parser.task_names) == 8
101
+ assert math_parser._data_source == "lighteval/MATH"
102
+ assert math_parser._default_task == "all"
103
+ assert "algebra" in math_parser.task_names
104
+ assert "geometry" in math_parser.task_names
105
+ assert (
106
+ math_parser.get_huggingface_link
107
+ == "https://huggingface.co/datasets/lighteval/MATH"
108
+ )
109
+
110
+
111
+ def test_get_current_task(math_parser):
112
+ """Test task name resolution in different scenarios."""
113
+ # Test with valid type in data entry
114
+ test_row_with_type = {"type": "algebra"}
115
+ assert math_parser._get_current_task(test_row_with_type) == "algebra"
116
+
117
+ # Test without type in data entry
118
+ test_row_without_type = {}
119
+ math_parser._current_task = "geometry"
120
+ assert math_parser._get_current_task(test_row_without_type) == "geometry"
121
+
122
+ # Test with invalid type - should return current task
123
+ test_row_invalid_type = {"type": "invalid_type"}
124
+ math_parser._current_task = "algebra"
125
+ assert math_parser._get_current_task(test_row_invalid_type) == "algebra"
126
+
127
+
128
+ def test_valid_levels(math_parser):
129
+ """Test handling of valid level values."""
130
+ for i in range(1, 6):
131
+ test_row = {
132
+ "problem": "Test problem",
133
+ "level": f"Level {i}",
134
+ "solution": "Test solution",
135
+ "type": "algebra",
136
+ }
137
+ entry = math_parser.process_entry(test_row, task_name="algebra")
138
+ assert entry.level == f"Level {i}"
139
+
140
+
141
+ @pytest.mark.parametrize(
142
+ "invalid_level",
143
+ [
144
+ "Level 0", # Too low
145
+ "Level 6", # Too high
146
+ "Invalid", # Wrong format
147
+ None, # Missing
148
+ "", # Empty
149
+ "level 1", # Wrong capitalization
150
+ ],
151
+ )
152
+ def test_invalid_level_handling(math_parser, invalid_level):
153
+ """Test handling of invalid level values."""
154
+ test_row = {
155
+ "problem": "Test problem",
156
+ "level": invalid_level,
157
+ "solution": "Test solution",
158
+ "type": "algebra",
159
+ }
160
+
161
+ entry = math_parser.process_entry(test_row, task_name="algebra")
162
+ assert entry.level == "Unknown"
163
+
164
+
165
+ @pytest.mark.integration
166
+ def test_load_dataset(loaded_math_parser):
167
+ """Test loading the MATH dataset."""
168
+ assert loaded_math_parser.raw_data is not None
169
+ assert loaded_math_parser.split_names == ["test"]
170
+ assert loaded_math_parser._current_task == "algebra"
171
+
172
+
173
+ def test_parser_string_representation(loaded_math_parser):
174
+ """Test string representation of MATH parser."""
175
+ repr_str = str(loaded_math_parser)
176
+ assert "MATHDatasetParser" in repr_str
177
+ assert "lighteval/MATH" in repr_str
178
+ assert "algebra" in repr_str
179
+ assert "loaded" in repr_str
180
+
181
+
182
+ @pytest.mark.integration
183
+ def test_different_splits_parsing(math_parser):
184
+ """Test parsing different splits of the dataset."""
185
+ # Load and parse test split
186
+ math_parser.load(task_name="algebra", split="test")
187
+ math_parser.parse(split_names="test", force=True)
188
+ test_count = len(math_parser.get_parsed_data)
189
+
190
+ # Load and parse train split
191
+ math_parser.load(task_name="algebra", split="train")
192
+ math_parser.parse(split_names="train", force=True)
193
+ train_count = len(math_parser.get_parsed_data)
194
+
195
+ assert test_count > 0
196
+ assert train_count > 0
197
+ assert train_count != test_count
198
+
199
+
200
+ def test_get_dataset_description(math_parser):
201
+ """Test dataset description generation."""
202
+ description = math_parser.get_dataset_description()
203
+
204
+ assert description.name == "MATH"
205
+ assert "Hendrycks" in description.source
206
+ assert description.language == "English"
207
+ assert "12,500" in description.characteristics
208
+ assert "hendrycksmath2021" in description.citation
209
+ assert "NeurIPS" in description.citation
210
+
211
+ # Check additional info
212
+ assert description.additional_info is not None
213
+ assert description.additional_info["difficulty_levels"] == "1-5"
214
+ assert "algebra" in description.additional_info["topics"]
215
+ assert "geometry" in description.additional_info["topics"]
216
+ assert description.additional_info["size"] == "12,500 problems"
217
+
218
+
219
+ def test_get_evaluation_metrics(math_parser):
220
+ """Test evaluation metrics generation."""
221
+ metrics = math_parser.get_evaluation_metrics()
222
+
223
+ # Check total number of metrics
224
+ assert len(metrics) == 5
225
+
226
+ # Check primary metrics
227
+ primary_metrics = [m for m in metrics if m.primary]
228
+ assert len(primary_metrics) == 3
229
+
230
+ # Verify specific metrics exist with correct properties
231
+ metric_names = {m.name for m in metrics}
232
+ assert "symbolic_equivalence" in metric_names
233
+ assert "solution_presence" in metric_names
234
+ assert "reasoning_validity" in metric_names
235
+ assert "mathematical_notation" in metric_names
236
+ assert "solution_clarity" in metric_names
237
+
238
+ # Check specific metric properties
239
+ symbolic_metric = next(m for m in metrics if m.name == "symbolic_equivalence")
240
+ assert symbolic_metric.type == "exact_match"
241
+ assert symbolic_metric.primary is True
242
+ assert "sympy" in symbolic_metric.description.lower()
243
+ assert "equivalence" in symbolic_metric.description.lower()
244
+
245
+ solution_metric = next(m for m in metrics if m.name == "solution_presence")
246
+ assert solution_metric.type == "text"
247
+ assert solution_metric.primary is True
248
+ assert "step-by-step" in solution_metric.description.lower()
249
+
250
+ reasoning_metric = next(m for m in metrics if m.name == "reasoning_validity")
251
+ assert reasoning_metric.type == "text"
252
+ assert reasoning_metric.primary is True
253
+ assert "mathematical reasoning" in reasoning_metric.description.lower()
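The level-handling tests above pin down a simple rule: the exact strings "Level 1" through "Level 5" are kept, and anything else collapses to "Unknown". A standalone illustration of that rule (not the parser's actual code):

```python
# Illustration of the level normalisation behaviour asserted above.
VALID_LEVELS = {f"Level {i}" for i in range(1, 6)}


def normalise_level(level: object) -> str:
    return level if isinstance(level, str) and level in VALID_LEVELS else "Unknown"


assert normalise_level("Level 3") == "Level 3"
assert normalise_level("level 1") == "Unknown"  # wrong capitalisation
assert normalise_level(None) == "Unknown"  # missing value
```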
tests/test_mbpp_parser.py ADDED
@@ -0,0 +1,178 @@
1
+ import pytest
2
+
3
+ from llmdataparser.mbpp_parser import MBPPDatasetParser, MBPPParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def sample_entry():
8
+ return {
9
+ "text": "Write a function to find the sum of numbers in a list.",
10
+ "code": "def sum_list(lst):\n return sum(lst)",
11
+ "task_id": 42,
12
+ "test_list": ["assert sum_list([1, 2, 3]) == 6"],
13
+ "test_setup_code": "",
14
+ "challenge_test_list": ["assert sum_list([4, 5, 6]) == 15"],
15
+ }
16
+
17
+
18
+ @pytest.fixture
19
+ def parser():
20
+ return MBPPDatasetParser()
21
+
22
+
23
+ def test_mbpp_parse_entry_creation():
24
+ """Test creation of MBPPParseEntry"""
25
+ entry = MBPPParseEntry.create(
26
+ question="test question",
27
+ answer="test answer",
28
+ raw_question="raw question",
29
+ task_id=42,
30
+ test_list=["test1", "test2"],
31
+ test_setup_code="setup code",
32
+ challenge_test_list=["challenge1"],
33
+ task_name="full",
34
+ source_file="test.pdf",
35
+ )
36
+
37
+ assert entry.question == "test question"
38
+ assert entry.answer == "test answer"
39
+ assert entry.raw_question == "raw question"
40
+ assert entry.raw_answer == "test answer"
41
+ assert entry.task_id == 42
42
+ assert entry.test_list == ["test1", "test2"]
43
+ assert entry.test_setup_code == "setup code"
44
+ assert entry.challenge_test_list == ["challenge1"]
45
+ assert entry.task_name == "full"
46
+
47
+
48
+ def test_mbpp_parse_entry_validation():
49
+ """Test validation of required fields"""
50
+ with pytest.raises(ValueError, match="Task ID must be an integer"):
51
+ MBPPParseEntry.create(
52
+ question="test",
53
+ answer="test",
54
+ raw_question="test",
55
+ task_id="not_an_int", # Invalid task_id type
56
+ test_list=[],
57
+ test_setup_code="",
58
+ challenge_test_list=[],
59
+ task_name="full",
60
+ source_file="test.pdf",
61
+ )
62
+
63
+
64
+ def test_process_entry(parser, sample_entry):
65
+ """Test processing of a single entry"""
66
+ result = parser.process_entry(sample_entry, task_name="full")
67
+
68
+ assert isinstance(result, MBPPParseEntry)
69
+ assert result.task_id == 42
70
+ assert result.raw_question == sample_entry["text"]
71
+ assert result.answer == sample_entry["code"]
72
+ assert result.test_list == sample_entry["test_list"]
73
+ assert result.challenge_test_list == sample_entry["challenge_test_list"]
74
+ assert result.task_name == "full"
75
+
76
+
77
+ def test_parser_initialization(parser):
78
+ """Test parser initialization and properties"""
79
+ assert parser._data_source == "google-research-datasets/mbpp"
80
+ assert parser._default_task == "full"
81
+ assert parser._task_names == ["full", "sanitized"]
82
+ assert (
83
+ parser.get_huggingface_link
84
+ == "https://huggingface.co/datasets/google-research-datasets/mbpp"
85
+ )
86
+
87
+
88
+ @pytest.mark.integration
89
+ @pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
90
+ def test_parser_load_and_parse(parser):
91
+ """Integration test for loading and parsing data"""
92
+ parser.load(split="train")
93
+ parser.parse(force=True)
94
+ parsed_data = parser.get_parsed_data
95
+
96
+ assert len(parsed_data) > 0
97
+ assert all(isinstance(entry, MBPPParseEntry) for entry in parsed_data)
98
+
99
+
100
+ def test_get_current_task(parser, sample_entry):
101
+ """Test _get_current_task method"""
102
+ task = parser._get_current_task(sample_entry)
103
+ assert task == parser._default_task
104
+
105
+
106
+ @pytest.mark.parametrize("task_name", ["full", "sanitized"])
107
+ @pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
108
+ def test_different_tasks_loading(parser, task_name):
109
+ """Test loading different tasks of the dataset"""
110
+ parser.load(task_name=task_name, split="train")
111
+ assert parser._current_task == task_name
112
+
113
+
114
+ def test_parser_string_representation(parser):
115
+ """Test string representation of parser"""
116
+ repr_str = str(parser)
117
+ assert "MBPPDatasetParser" in repr_str
118
+ assert "google-research-datasets/mbpp" in repr_str
119
+ assert "not loaded" in repr_str
120
+
121
+
122
+ def test_parse_without_loaded_data(parser):
123
+ """Test parsing without loading data first"""
124
+ with pytest.raises(
125
+ ValueError, match="No data loaded. Please load the dataset first"
126
+ ):
127
+ parser.parse()
128
+
129
+
130
+ @pytest.mark.integration
131
+ @pytest.mark.skip(reason="Requires access to HuggingFace MBPP dataset")
132
+ def test_full_workflow_with_different_splits(parser):
133
+ """Test the complete workflow with different splits"""
134
+ parser.load(split="train")
135
+ parser.parse(force=True)
136
+ train_data = parser.get_parsed_data
137
+
138
+ assert len(train_data) > 0
139
+ assert all(isinstance(entry, MBPPParseEntry) for entry in train_data)
140
+ assert all(entry.task_name == "full" for entry in train_data)
141
+
142
+
143
+ def test_get_dataset_description(parser):
144
+ """Test dataset description generation."""
145
+ description = parser.get_dataset_description()
146
+
147
+ assert description.name == "Mostly Basic Python Problems (MBPP)"
148
+ assert "code generation" in description.purpose.lower()
149
+ assert "google-research" in description.source
150
+ assert description.language == "English and Python"
151
+ assert "1,000" in description.characteristics
152
+ assert "austin2021program" in description.citation
153
+ assert "Program Synthesis" in description.citation
154
+
155
+
156
+ def test_get_evaluation_metrics(parser):
157
+ """Test evaluation metrics generation."""
158
+ metrics = parser.get_evaluation_metrics()
159
+
160
+ # Check total number of metrics
161
+ assert len(metrics) == 4
162
+
163
+ # Check primary metrics
164
+ primary_metrics = [m for m in metrics if m.primary]
165
+ assert len(primary_metrics) == 1
166
+
167
+ # Verify specific metrics exist with correct properties
168
+ metric_names = {m.name for m in metrics}
169
+ assert "pass@k" in metric_names
170
+ assert "test_case_success_rate" in metric_names
171
+ assert "syntax_validity" in metric_names
172
+
173
+ # Check specific metric properties
174
+ pass_k_metric = next(m for m in metrics if m.name == "pass@k")
175
+ assert pass_k_metric.type == "code_evaluation"
176
+ assert pass_k_metric.primary is True
177
+ assert "k generations" in pass_k_metric.description.lower()
178
+ assert "custom_pass_at_k" in pass_k_metric.implementation
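The MBPP tests are mostly about field mapping: the prompt comes from `text`, the reference solution from `code`, and each entry keeps its `test_list` for execution-based scoring later on. A short sketch of that mapping, reusing the fixture shape from this file:

```python
# Sketch of mapping one MBPP record into a parse entry, mirroring the
# fixture and the assertions in test_process_entry above.
from llmdataparser.mbpp_parser import MBPPDatasetParser

parser = MBPPDatasetParser()
record = {
    "text": "Write a function to find the sum of numbers in a list.",
    "code": "def sum_list(lst):\n    return sum(lst)",
    "task_id": 42,
    "test_list": ["assert sum_list([1, 2, 3]) == 6"],
    "test_setup_code": "",
    "challenge_test_list": [],
}

entry = parser.process_entry(record, task_name="full")
assert entry.answer == record["code"]
assert entry.test_list == record["test_list"]
```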
tests/test_mgsm_parser.py ADDED
@@ -0,0 +1,228 @@
1
+ import pytest
2
+
3
+ from llmdataparser.mgsm_parser import MGSMDatasetParser, MGSMParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def mgsm_parser():
8
+ """Create a MGSM parser instance for testing."""
9
+ return MGSMDatasetParser()
10
+
11
+
12
+ @pytest.fixture
13
+ def loaded_mgsm_parser(mgsm_parser):
14
+ """Create and load a MGSM parser instance with test split."""
15
+ mgsm_parser.load(task_name="en", split="test")
16
+ return mgsm_parser
17
+
18
+
19
+ @pytest.fixture
20
+ def sample_mgsm_entries():
21
+ """Create sample MGSM dataset entries for testing."""
22
+ return [
23
+ {
24
+ "question": "John has 5 apples and buys 3 more. How many apples does he have now?",
25
+ "answer": "Let's solve step by step:\n1) Initial apples = 5\n2) Bought apples = 3\n3) Total = 5 + 3 = 8\nJohn has 8 apples now.",
26
+ "answer_number": 8,
27
+ "equation_solution": "5 + 3 = 8",
28
+ "language": "en",
29
+ },
30
+ {
31
+ "question": "Juan tiene 5 manzanas y compra 3 más. ¿Cuántas manzanas tiene ahora?",
32
+ "answer": "Resolvamos paso a paso:\n1) Manzanas iniciales = 5\n2) Manzanas compradas = 3\n3) Total = 5 + 3 = 8\nJuan tiene 8 manzanas ahora.",
33
+ "answer_number": 8,
34
+ "equation_solution": "5 + 3 = 8",
35
+ "language": "es",
36
+ },
37
+ {
38
+ "question": "ジョンはリンゴを5個持っていて、さらに3個買います。今何個持っていますか?",
39
+ "answer": None, # Testing case with missing detailed answer
40
+ "answer_number": 8,
41
+ "equation_solution": "5 + 3 = 8",
42
+ "language": "ja",
43
+ },
44
+ ]
45
+
46
+
47
+ def test_mgsm_parse_entry_creation_valid():
48
+ """Test valid creation of MGSMParseEntry with all fields."""
49
+ entry = MGSMParseEntry.create(
50
+ question="Test question",
51
+ answer="Test answer",
52
+ raw_question="Test question",
53
+ raw_answer="Test answer",
54
+ numerical_answer=42,
55
+ equation_solution="21 * 2 = 42",
56
+ task_name="en",
57
+ language="en",
58
+ )
59
+
60
+ assert isinstance(entry, MGSMParseEntry)
61
+ assert entry.question == "Test question"
62
+ assert entry.answer == "Test answer"
63
+ assert entry.raw_question == "Test question"
64
+ assert entry.raw_answer == "Test answer"
65
+ assert entry.numerical_answer == 42
66
+ assert entry.equation_solution == "21 * 2 = 42"
67
+ assert entry.task_name == "en"
68
+ assert entry.language == "en"
69
+
70
+
71
+ def test_process_entry_with_detailed_answer(mgsm_parser, sample_mgsm_entries):
72
+ """Test processing entry with detailed answer in English."""
73
+ entry = mgsm_parser.process_entry(sample_mgsm_entries[0], task_name="en")
74
+
75
+ assert isinstance(entry, MGSMParseEntry)
76
+ assert entry.numerical_answer == 8
77
+ assert entry.equation_solution == "5 + 3 = 8"
78
+ assert "step by step" in entry.answer
79
+ assert entry.language == "en"
80
+ assert entry.task_name == "en"
81
+
82
+
83
+ def test_process_entry_without_detailed_answer(mgsm_parser, sample_mgsm_entries):
84
+ """Test processing entry without detailed answer (Japanese)."""
85
+ entry = mgsm_parser.process_entry(sample_mgsm_entries[2], task_name="ja")
86
+
87
+ assert isinstance(entry, MGSMParseEntry)
88
+ assert entry.numerical_answer == 8
89
+ assert entry.equation_solution == "5 + 3 = 8"
90
+ assert entry.answer == "8" # Should use numerical_answer as string
91
+ assert entry.language == "ja"
92
+ assert entry.task_name == "ja"
93
+
94
+
95
+ def test_process_entry_spanish(mgsm_parser, sample_mgsm_entries):
96
+ """Test processing Spanish entry."""
97
+ entry = mgsm_parser.process_entry(sample_mgsm_entries[1], task_name="es")
98
+
99
+ assert isinstance(entry, MGSMParseEntry)
100
+ assert entry.numerical_answer == 8
101
+ assert entry.equation_solution == "5 + 3 = 8"
102
+ assert "paso a paso" in entry.answer # Spanish for "step by step"
103
+ assert entry.language == "es"
104
+ assert entry.task_name == "es"
105
+
106
+
107
+ def test_mgsm_parser_initialization(mgsm_parser):
108
+ """Test MGSM parser initialization and properties."""
109
+ assert isinstance(mgsm_parser.task_names, list)
110
+ assert len(mgsm_parser.task_names) == 11 # 11 supported languages
111
+ assert mgsm_parser._data_source == "juletxara/mgsm"
112
+ assert mgsm_parser._default_task == "en"
113
+ assert all(lang in mgsm_parser.task_names for lang in ["en", "es", "ja", "zh"])
114
+ assert (
115
+ mgsm_parser.get_huggingface_link
116
+ == "https://huggingface.co/datasets/juletxara/mgsm"
117
+ )
118
+
119
+
120
+ @pytest.mark.integration
121
+ def test_load_dataset(loaded_mgsm_parser):
122
+ """Test loading the MGSM dataset."""
123
+ assert loaded_mgsm_parser.raw_data is not None
124
+ assert loaded_mgsm_parser.split_names == ["test"]
125
+ assert loaded_mgsm_parser._current_task == "en"
126
+
127
+
128
+ def test_parser_string_representation(loaded_mgsm_parser):
129
+ """Test string representation of MGSM parser."""
130
+ repr_str = str(loaded_mgsm_parser)
131
+ assert "MGSMDatasetParser" in repr_str
132
+ assert "juletxara/mgsm" in repr_str
133
+ assert "en" in repr_str
134
+ assert "loaded" in repr_str
135
+
136
+
137
+ @pytest.mark.integration
138
+ def test_different_languages_parsing(mgsm_parser):
139
+ """Test parsing different language versions."""
140
+ # Load and parse English
141
+ mgsm_parser.load(task_name="en", split="test")
142
+ mgsm_parser.parse(split_names="test", force=True)
143
+ en_count = len(mgsm_parser.get_parsed_data)
144
+
145
+ # Load and parse Spanish
146
+ mgsm_parser.load(task_name="es", split="test")
147
+ mgsm_parser.parse(split_names="test", force=True)
148
+ es_count = len(mgsm_parser.get_parsed_data)
149
+
150
+ assert en_count > 0
151
+ assert es_count > 0
152
+ assert en_count == es_count # Should have same number of problems in each language
153
+
154
+
155
+ @pytest.mark.parametrize("language", ["en", "es", "ja", "zh", "ru"])
156
+ def test_supported_languages(mgsm_parser, language):
157
+ """Test that each supported language can be processed."""
158
+ test_entry = {
159
+ "question": f"Test question in {language}",
160
+ "answer": f"Test answer in {language}",
161
+ "answer_number": 42,
162
+ "equation_solution": "21 * 2 = 42",
163
+ }
164
+
165
+ entry = mgsm_parser.process_entry(test_entry, task_name=language)
166
+ assert entry.language == language
167
+ assert entry.task_name == language
168
+ assert entry.numerical_answer == 42
169
+
170
+
171
+ def test_get_dataset_description(mgsm_parser):
172
+ """Test dataset description generation."""
173
+ description = mgsm_parser.get_dataset_description()
174
+
175
+ assert description.name == "Multilingual Grade School Math (MGSM)"
176
+ assert "multilingual chain-of-thought reasoning" in description.purpose.lower()
177
+ assert "juletxara/mgsm" in description.source
178
+ assert description.language == "Multilingual (11 languages)"
179
+
180
+ assert "mathematical reasoning" in description.characteristics.lower()
181
+
182
+ # Check citations
183
+ assert "shi2022language" in description.citation
184
+ assert "cobbe2021gsm8k" in description.citation
185
+
186
+ # Check additional info
187
+ assert description.additional_info is not None
188
+ assert len(description.additional_info["languages"]) == 11
189
+ assert "English" in description.additional_info["languages"]
190
+ assert "Chinese" in description.additional_info["languages"]
191
+
192
+
193
+ def test_get_evaluation_metrics(mgsm_parser):
194
+ """Test evaluation metrics generation."""
195
+ metrics = mgsm_parser.get_evaluation_metrics()
196
+
197
+ # Check total number of metrics
198
+ assert len(metrics) == 4
199
+
200
+ # Check primary metrics
201
+ primary_metrics = [m for m in metrics if m.primary]
202
+ assert len(primary_metrics) == 3
203
+
204
+ # Verify specific metrics exist with correct properties
205
+ metric_names = {m.name for m in metrics}
206
+ assert "exact_match" in metric_names
207
+ assert "solution_validity" in metric_names
208
+ assert "step_accuracy" in metric_names
209
+ assert "cross_lingual_consistency" in metric_names
210
+
211
+ # Check specific metric properties
212
+ exact_match_metric = next(m for m in metrics if m.name == "exact_match")
213
+ assert exact_match_metric.type == "string"
214
+ assert exact_match_metric.primary is True
215
+ assert "numerical answers" in exact_match_metric.description.lower()
216
+ assert "custom_exact_match" in exact_match_metric.implementation
217
+
218
+ solution_metric = next(m for m in metrics if m.name == "solution_validity")
219
+ assert solution_metric.type == "text"
220
+ assert solution_metric.primary is True
221
+ assert "mathematically valid" in solution_metric.description.lower()
222
+ assert "custom_solution_validator" in solution_metric.implementation
223
+
224
+ step_metric = next(m for m in metrics if m.name == "step_accuracy")
225
+ assert step_metric.type == "numerical"
226
+ assert step_metric.primary is True
227
+ assert "calculation steps" in step_metric.description.lower()
228
+ assert "custom_step_accuracy" in step_metric.implementation
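`test_different_languages_parsing` treats MGSM as a parallel corpus: every language split holds the same problems, so the parsed counts match. The sketch below extends that idea to an element-wise answer comparison; it assumes the language splits are order-aligned, which the tests themselves do not assert.

```python
# Sketch of a cross-lingual consistency check suggested by the MGSM tests.
# Assumption: the "en" and "es" test splits list the same problems in the
# same order, so their numerical answers should line up index by index.
from llmdataparser.mgsm_parser import MGSMDatasetParser


def numerical_answers(language: str) -> list:
    parser = MGSMDatasetParser()
    parser.load(task_name=language, split="test")
    parser.parse(split_names="test", force=True)
    return [entry.numerical_answer for entry in parser.get_parsed_data]


english, spanish = numerical_answers("en"), numerical_answers("es")
assert len(english) == len(spanish)  # same as the count check in the tests
mismatches = sum(a != b for a, b in zip(english, spanish))
print(f"{mismatches} answer mismatches across {len(english)} problems")
```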
tests/test_mmlu_parser.py ADDED
@@ -0,0 +1,314 @@
1
+ import pytest
2
+
3
+ from llmdataparser.mmlu_parser import (
4
+ BaseMMLUDatasetParser,
5
+ MMLUParseEntry,
6
+ MMLUProDatasetParser,
7
+ MMLUProParseEntry,
8
+ MMLUReduxDatasetParser,
9
+ TMMLUPlusDatasetParser,
10
+ )
11
+
12
+
13
+ @pytest.fixture
14
+ def base_parser():
15
+ """Create a base MMLU parser instance."""
16
+ return BaseMMLUDatasetParser()
17
+
18
+
19
+ @pytest.fixture
20
+ def redux_parser():
21
+ """Create a MMLU Redux parser instance."""
22
+ return MMLUReduxDatasetParser()
23
+
24
+
25
+ @pytest.fixture
26
+ def tmmlu_parser():
27
+ """Create a TMMLU+ parser instance."""
28
+ return TMMLUPlusDatasetParser()
29
+
30
+
31
+ @pytest.fixture
32
+ def mmlu_pro_parser():
33
+ """Create a MMLU Pro parser instance."""
34
+ return MMLUProDatasetParser()
35
+
36
+
37
+ @pytest.fixture
38
+ def sample_mmlu_entries():
39
+ """Create sample MMLU dataset entries for testing."""
40
+ return [
41
+ {
42
+ "question": "What is the capital of France?",
43
+ "choices": ["London", "Paris", "Berlin", "Madrid"],
44
+ "answer": 1, # Paris
45
+ "subject": "geography",
46
+ },
47
+ {
48
+ "question": "Which of these is a primary color?",
49
+ "choices": ["Green", "Purple", "Blue", "Orange"],
50
+ "answer": 2, # Blue
51
+ "subject": "art",
52
+ },
53
+ ]
54
+
55
+
56
+ @pytest.fixture
57
+ def sample_mmlu_pro_entries():
58
+ """Create sample MMLU Pro dataset entries for testing."""
59
+ return [
60
+ {
61
+ "question": "What is the time complexity of quicksort?",
62
+ "options": ["O(n)", "O(n log n)", "O(n²)", "O(2ⁿ)", "O(n!)", "O(1)"],
63
+ "answer": "The average time complexity of quicksort is O(n log n)",
64
+ "answer_index": 1,
65
+ "category": "computer_science",
66
+ }
67
+ ]
68
+
69
+
70
+ def test_mmlu_parse_entry_creation_valid():
71
+ """Test valid creation of MMLUParseEntry."""
72
+ entry = MMLUParseEntry.create(
73
+ question="Test question",
74
+ answer="A",
75
+ raw_question="Test question",
76
+ raw_choices=["choice1", "choice2", "choice3", "choice4"],
77
+ raw_answer="0",
78
+ task_name="test_task",
79
+ )
80
+ assert isinstance(entry, MMLUParseEntry)
81
+ assert entry.question == "Test question"
82
+ assert entry.answer == "A"
83
+ assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]
84
+ assert entry.task_name == "test_task"
85
+
86
+
87
+ @pytest.mark.parametrize("invalid_answer", ["E", "F", "1", "", None])
88
+ def test_mmlu_parse_entry_creation_invalid(invalid_answer):
89
+ """Test invalid answer handling in MMLUParseEntry creation."""
90
+ with pytest.raises(
91
+ ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
92
+ ):
93
+ MMLUParseEntry.create(
94
+ question="Test question",
95
+ answer=invalid_answer,
96
+ raw_question="Test question",
97
+ raw_choices=["choice1", "choice2", "choice3", "choice4"],
98
+ raw_answer="4",
99
+ task_name="test_task",
100
+ )
101
+
102
+
103
+ def test_process_entry_base(base_parser, sample_mmlu_entries):
104
+ """Test processing entries in base MMLU parser."""
105
+ entry = base_parser.process_entry(sample_mmlu_entries[0], task_name="geography")
106
+
107
+ assert isinstance(entry, MMLUParseEntry)
108
+ assert entry.answer == "B" # Index 1 maps to B
109
+ assert "A. London" in entry.question
110
+ assert "B. Paris" in entry.question
111
+ assert "C. Berlin" in entry.question
112
+ assert "D. Madrid" in entry.question
113
+ assert entry.raw_question == "What is the capital of France?"
114
+ assert entry.raw_choices == ["London", "Paris", "Berlin", "Madrid"]
115
+ assert entry.raw_answer == "1"
116
+ assert entry.task_name == "geography"
117
+
118
+
119
+ def test_mmlu_pro_parse_entry_creation_valid():
120
+ """Test valid creation of MMLUProParseEntry."""
121
+ entry = MMLUProParseEntry.create(
122
+ question="Test question",
123
+ answer="E", # MMLU Pro supports up to J
124
+ raw_question="Test question",
125
+ raw_choices=["choice1", "choice2", "choice3", "choice4", "choice5"],
126
+ raw_answer="4",
127
+ task_name="test_task",
128
+ )
129
+ assert isinstance(entry, MMLUProParseEntry)
130
+ assert entry.answer == "E"
131
+ assert len(entry.raw_choices) == 5
132
+
133
+
134
+ def test_process_entry_mmlu_pro(mmlu_pro_parser, sample_mmlu_pro_entries):
135
+ """Test processing entries in MMLU Pro parser."""
136
+ entry = mmlu_pro_parser.process_entry(
137
+ sample_mmlu_pro_entries[0], task_name="computer_science"
138
+ )
139
+
140
+ assert isinstance(entry, MMLUProParseEntry)
141
+ assert entry.answer == "B" # Index 1 maps to B
142
+ assert "O(n log n)" in entry.question
143
+ assert entry.task_name == "computer_science"
144
+ assert len(entry.raw_choices) == 6
145
+
146
+
147
+ def test_tmmlu_process_entry(tmmlu_parser):
148
+ """Test processing entries in TMMLU+ parser."""
149
+ test_row = {
150
+ "question": "什麼是台灣最高的山峰?",
151
+ "A": "玉山",
152
+ "B": "阿里山",
153
+ "C": "合歡山",
154
+ "D": "雪山",
155
+ "answer": "A",
156
+ "subject": "geography_of_taiwan",
157
+ }
158
+
159
+ entry = tmmlu_parser.process_entry(test_row, task_name="geography_of_taiwan")
160
+ assert isinstance(entry, MMLUParseEntry)
161
+ assert entry.answer == "A"
162
+ assert entry.raw_choices == ["玉山", "阿里山", "合歡山", "雪山"]
163
+ assert entry.task_name == "geography_of_taiwan"
164
+
165
+
166
+ @pytest.mark.parametrize(
167
+ "parser_fixture,expected_tasks,expected_source",
168
+ [
169
+ ("base_parser", 57, "cais/mmlu"),
170
+ ("redux_parser", 30, "edinburgh-dawg/mmlu-redux"),
171
+ ("tmmlu_parser", 66, "ikala/tmmluplus"),
172
+ ("mmlu_pro_parser", 1, "TIGER-Lab/MMLU-Pro"),
173
+ ],
174
+ )
175
+ def test_parser_initialization(
176
+ request, parser_fixture, expected_tasks, expected_source
177
+ ):
178
+ """Test initialization of different MMLU parser variants."""
179
+ parser = request.getfixturevalue(parser_fixture)
180
+ assert len(parser.task_names) == expected_tasks
181
+ assert parser._data_source == expected_source
182
+ assert (
183
+ parser.get_huggingface_link
184
+ == f"https://huggingface.co/datasets/{expected_source}"
185
+ )
186
+
187
+
188
+ @pytest.mark.integration
189
+ def test_load_dataset(base_parser):
190
+ """Test loading the MMLU dataset."""
191
+ base_parser.load(task_name="anatomy", split="test")
192
+ assert base_parser.raw_data is not None
193
+ assert base_parser.split_names == ["test"]
194
+ assert base_parser._current_task == "anatomy"
195
+
196
+
197
+ def test_parser_string_representation(base_parser):
198
+ """Test string representation of MMLU parser."""
199
+ repr_str = str(base_parser)
200
+ assert "MMLUDatasetParser" in repr_str
201
+ assert "cais/mmlu" in repr_str
202
+ assert "not loaded" in repr_str
203
+
204
+
205
+ @pytest.mark.integration
206
+ def test_different_splits_parsing(base_parser):
207
+ """Test parsing different splits of the dataset."""
208
+ # Load and parse test split
209
+ base_parser.load(task_name="anatomy", split="test")
210
+ base_parser.parse(split_names="test", force=True)
211
+ test_count = len(base_parser.get_parsed_data)
212
+
213
+ # Load and parse validation split
214
+ base_parser.load(task_name="anatomy", split="validation")
215
+ base_parser.parse(split_names="validation", force=True)
216
+ val_count = len(base_parser.get_parsed_data)
217
+
218
+ assert test_count > 0
219
+ assert val_count > 0
220
+ assert test_count != val_count
221
+
222
+
223
+ def test_base_mmlu_dataset_description(base_parser):
224
+ """Test dataset description for base MMLU."""
225
+ description = base_parser.get_dataset_description()
226
+
227
+ assert description.name == "Massive Multitask Language Understanding (MMLU)"
228
+ assert "cais/mmlu" in description.source
229
+ assert description.language == "English"
230
+
231
+ # Check characteristics
232
+ assert "57 subjects" in description.characteristics.lower()
233
+
234
+ # Check citation
235
+ assert "hendryckstest2021" in description.citation
236
+
237
+
238
+ def test_mmlu_redux_dataset_description(redux_parser):
239
+ """Test dataset description for MMLU Redux."""
240
+ description = redux_parser.get_dataset_description()
241
+
242
+ assert description.name == "MMLU Redux"
243
+ assert "manually re-annotated" in description.purpose.lower()
244
+ assert "edinburgh-dawg/mmlu-redux" in description.source
245
+ assert description.language == "English"
246
+
247
+ # Check characteristics
248
+ assert "3,000" in description.characteristics
249
+
250
+
251
+ def test_tmmlu_plus_dataset_description(tmmlu_parser):
252
+ """Test dataset description for TMMLU+."""
253
+ description = tmmlu_parser.get_dataset_description()
254
+
255
+ assert "ikala/tmmluplus" in description.source
256
+ assert description.language == "Traditional Chinese"
257
+
258
+ # Check characteristics
259
+ assert "66 subjects" in description.characteristics.lower()
260
+
261
+ # Check citation
262
+ assert "ikala2024improved" in description.citation
263
+
264
+
265
+ def test_mmlu_pro_dataset_description(mmlu_pro_parser):
266
+ """Test dataset description for MMLU Pro."""
267
+ description = mmlu_pro_parser.get_dataset_description()
268
+
269
+ assert description.name == "MMLU Pro"
270
+ assert "challenging" in description.purpose.lower()
271
+ assert "TIGER-Lab/MMLU-Pro" in description.source
272
+ assert description.language == "English"
273
+
274
+
275
+ def test_base_mmlu_evaluation_metrics(base_parser):
276
+ """Test evaluation metrics for base MMLU."""
277
+ metrics = base_parser.get_evaluation_metrics()
278
+
279
+ assert len(metrics) >= 3
280
+ metric_names = {m.name for m in metrics}
281
+
282
+ assert "accuracy" in metric_names
283
+ assert "subject_accuracy" in metric_names
284
+ assert "category_accuracy" in metric_names
285
+
286
+ accuracy_metric = next(m for m in metrics if m.name == "accuracy")
287
+ assert accuracy_metric.type == "classification"
288
+ assert accuracy_metric.primary is True
289
+ assert "multiple-choice" in accuracy_metric.description.lower()
290
+
291
+
292
+ def test_mmlu_redux_evaluation_metrics(redux_parser):
293
+ """Test evaluation metrics for MMLU Redux."""
294
+ metrics = redux_parser.get_evaluation_metrics()
295
+
296
+ metric_names = {m.name for m in metrics}
297
+ assert "question_clarity" in metric_names
298
+
299
+
300
+ def test_tmmlu_plus_evaluation_metrics(tmmlu_parser):
301
+ """Test evaluation metrics for TMMLU+."""
302
+ metrics = tmmlu_parser.get_evaluation_metrics()
303
+
304
+ metric_names = {m.name for m in metrics}
305
+ assert "difficulty_analysis" in metric_names
306
+
307
+
308
+ def test_mmlu_pro_evaluation_metrics(mmlu_pro_parser):
309
+ """Test evaluation metrics for MMLU Pro."""
310
+ metrics = mmlu_pro_parser.get_evaluation_metrics()
311
+
312
+ metric_names = {m.name for m in metrics}
313
+ assert "reasoning_analysis" in metric_names
314
+ assert "prompt_robustness" in metric_names
tests/test_tmlu_parser.py ADDED
@@ -0,0 +1,176 @@
1
+ import pytest
2
+
3
+ from llmdataparser.tmlu_parser import TMLUDatasetParser, TMLUParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def tmlu_parser():
8
+ """Create a TMLU parser instance for testing."""
9
+ return TMLUDatasetParser()
10
+
11
+
12
+ @pytest.fixture
13
+ def sample_tmlu_entries():
14
+ """Create sample TMLU dataset entries for testing."""
15
+ return [
16
+ {
17
+ "question": "閱讀下文,選出依序最適合填入□內的選項:",
18
+ "A": "張揚/綢繆未雨/奏疏",
19
+ "B": "抽搐/煮繭抽絲/奏疏",
20
+ "C": "張揚/煮繭抽絲/進貢",
21
+ "D": "抽搐/綢繆未雨/進貢",
22
+ "answer": "B",
23
+ "explanation": "根據文意,選項B最為恰當。",
24
+ "metadata": {
25
+ "timestamp": "2023-10-09T18:27:20.304623",
26
+ "source": "AST chinese - 108",
27
+ "explanation_source": "",
28
+ },
29
+ },
30
+ {
31
+ "question": "下列何者是質數?",
32
+ "A": "21",
33
+ "B": "27",
34
+ "C": "31",
35
+ "D": "33",
36
+ "answer": "C",
37
+ "explanation": "31是質數,其他選項都是合數。",
38
+ "metadata": {
39
+ "timestamp": "2023-10-09T18:27:20.304623",
40
+ "source": "AST mathematics - 108",
41
+ "explanation_source": "",
42
+ },
43
+ },
44
+ ]
45
+
46
+
47
+ def test_tmlu_parse_entry_creation_valid():
48
+ """Test valid creation of TMLUParseEntry."""
49
+ entry = TMLUParseEntry.create(
50
+ question="Test question",
51
+ answer="A",
52
+ raw_question="Test question",
53
+ raw_choices=["choice1", "choice2", "choice3", "choice4"],
54
+ raw_answer="A",
55
+ task_name="AST_chinese",
56
+ explanation="Test explanation",
57
+ metadata={"source": "test"},
58
+ )
59
+ assert isinstance(entry, TMLUParseEntry)
60
+ assert entry.question == "Test question"
61
+ assert entry.answer == "A"
62
+ assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]
63
+ assert entry.explanation == "Test explanation"
64
+ assert entry.metadata == {"source": "test"}
65
+
66
+
67
+ @pytest.mark.parametrize("invalid_answer", ["E", "F", "1", "", None])
68
+ def test_tmlu_parse_entry_creation_invalid(invalid_answer):
69
+ """Test invalid answer handling in TMLUParseEntry creation."""
70
+ with pytest.raises(
71
+ ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
72
+ ):
73
+ TMLUParseEntry.create(
74
+ question="Test question",
75
+ answer=invalid_answer,
76
+ raw_question="Test question",
77
+ raw_choices=["choice1", "choice2", "choice3", "choice4"],
78
+ raw_answer=invalid_answer,
79
+ task_name="AST_chinese",
80
+ )
81
+
82
+
83
+ def test_process_entry(tmlu_parser, sample_tmlu_entries):
84
+ """Test processing entries in TMLU parser."""
85
+ entry = tmlu_parser.process_entry(sample_tmlu_entries[0], task_name="AST_chinese")
86
+
87
+ assert isinstance(entry, TMLUParseEntry)
88
+ assert entry.answer == "B"
89
+ assert entry.task_name == "AST_chinese"
90
+ assert len(entry.raw_choices) == 4
91
+ assert entry.explanation == "根據文意,選項B最為恰當。"
92
+ assert "AST chinese - 108" in entry.metadata["source"]
93
+
94
+
95
+ def test_tmlu_parser_initialization(tmlu_parser):
96
+ """Test TMLU parser initialization and properties."""
97
+ assert isinstance(tmlu_parser.task_names, list)
98
+ assert len(tmlu_parser.task_names) == 37 # Total number of tasks
99
+ assert tmlu_parser._data_source == "miulab/tmlu"
100
+ assert tmlu_parser._default_task == "AST_chinese"
101
+ assert "AST_chinese" in tmlu_parser.task_names
102
+ assert "GSAT_mathematics" in tmlu_parser.task_names
103
+ assert (
104
+ tmlu_parser.get_huggingface_link
105
+ == "https://huggingface.co/datasets/miulab/tmlu"
106
+ )
107
+
108
+
109
+ @pytest.mark.integration
110
+ def test_load_dataset(tmlu_parser):
111
+ """Test loading the TMLU dataset."""
112
+ tmlu_parser.load(task_name="AST_chinese", split="test")
113
+ assert tmlu_parser.raw_data is not None
114
+ assert tmlu_parser.split_names == ["test"]
115
+ assert tmlu_parser._current_task == "AST_chinese"
116
+
117
+
118
+ def test_parser_string_representation(tmlu_parser):
119
+ """Test string representation of TMLU parser."""
120
+ repr_str = str(tmlu_parser)
121
+ assert "TMLUDatasetParser" in repr_str
122
+ assert "miulab/tmlu" in repr_str
123
+ assert "not loaded" in repr_str
124
+
125
+
126
+ @pytest.mark.integration
127
+ def test_different_tasks_parsing(tmlu_parser):
128
+ """Test parsing different tasks of the dataset."""
129
+ # Load and parse AST_chinese
130
+ tmlu_parser.load(task_name="AST_chinese", split="test")
131
+ tmlu_parser.parse(split_names="test", force=True)
132
+ chinese_count = len(tmlu_parser.get_parsed_data)
133
+
134
+ # Load and parse AST_mathematics
135
+ tmlu_parser.load(task_name="AST_mathematics", split="test")
136
+ tmlu_parser.parse(split_names="test", force=True)
137
+ math_count = len(tmlu_parser.get_parsed_data)
138
+
139
+ assert chinese_count > 0
140
+ assert math_count > 0
141
+
142
+
143
+ def test_metadata_handling(tmlu_parser, sample_tmlu_entries):
144
+ """Test proper handling of metadata in entries."""
145
+ entry = tmlu_parser.process_entry(sample_tmlu_entries[0])
146
+
147
+ assert "timestamp" in entry.metadata
148
+ assert "source" in entry.metadata
149
+ assert "explanation_source" in entry.metadata
150
+ assert entry.metadata["source"] == "AST chinese - 108"
151
+
152
+
153
+ def test_get_dataset_description(tmlu_parser):
154
+ """Test dataset description generation."""
155
+ description = tmlu_parser.get_dataset_description()
156
+
157
+ assert description.name == "Taiwan Multiple-choice Language Understanding (TMLU)"
158
+ assert description.language == "Traditional Chinese"
159
+ assert "Taiwan-specific educational" in description.purpose
160
+ assert "Various Taiwan standardized tests" in description.source
161
+ assert description.format == "Multiple choice questions (A/B/C/D)"
162
+ assert "Advanced Subjects Test (AST)" in description.characteristics
163
+ assert "DBLP:journals/corr/abs-2403-20180" in description.citation
164
+
165
+
166
+ def test_get_evaluation_metrics(tmlu_parser):
167
+ """Test evaluation metrics generation."""
168
+ metrics = tmlu_parser.get_evaluation_metrics()
169
+
170
+ assert len(metrics) == 2 # Check total number of metrics
171
+
172
+ # Check primary metrics
173
+ primary_metrics = [m for m in metrics if m.primary]
174
+ assert len(primary_metrics) == 2
175
+ assert any(m.name == "accuracy" for m in primary_metrics)
176
+ assert any(m.name == "per_subject_accuracy" for m in primary_metrics)
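
For reference, a hedged sketch of the load/parse flow these TMLU tests cover, using only the calls that appear in the tests above; it assumes network access to the miulab/tmlu dataset on Hugging Face.

from llmdataparser.tmlu_parser import TMLUDatasetParser

parser = TMLUDatasetParser()
parser.load(task_name="AST_chinese", split="test")  # miulab/tmlu, test split
parser.parse(split_names="test", force=True)

# get_parsed_data yields TMLUParseEntry objects with question/answer/raw_choices,
# where answer is validated to be one of A, B, C, D.
for entry in parser.get_parsed_data:
    assert entry.answer in {"A", "B", "C", "D"}

description = parser.get_dataset_description()
print(description.name, description.language)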
tests/test_tw_legal_parser.py ADDED
@@ -0,0 +1,146 @@
1
+ import pytest
2
+
3
+ from llmdataparser.tw_legal_parser import TWLegalDatasetParser, TWLegalParseEntry
4
+
5
+
6
+ @pytest.fixture
7
+ def tw_legal_parser():
8
+ """Create a Taiwan Legal parser instance for testing."""
9
+ return TWLegalDatasetParser()
10
+
11
+
12
+ @pytest.fixture
13
+ def sample_tw_legal_entries():
14
+ """Create sample Taiwan Legal dataset entries for testing."""
15
+ return [
16
+ {
17
+ "question": "依民法規定,下列關於法人之敘述,何者錯誤?",
18
+ "A": "法人於法令限制內,有享受權利負擔義務之能力",
19
+ "B": "法人因目的之達到而消滅",
20
+ "C": "法人非依法律之規定,不得成立",
21
+ "D": "法人於登記前,即取得權利能力",
22
+ "answer": "D",
23
+ },
24
+ {
25
+ "question": "關於刑法第321條第1項第4款之結夥三人以上而犯竊盜罪,下列敘述何者正確?",
26
+ "A": "須行為人主觀上有結夥犯竊盜之認識",
27
+ "B": "三人以上當場在場實施竊盜行為始足當之",
28
+ "C": "三人以上已達成犯意聯絡即可成立",
29
+ "D": "三人以上須全部在現場實施竊盜行為",
30
+ "answer": "A",
31
+ },
32
+ ]
33
+
34
+
35
+ def test_tw_legal_parse_entry_creation_valid():
36
+ """Test valid creation of TWLegalParseEntry."""
37
+ entry = TWLegalParseEntry.create(
38
+ question="Test question",
39
+ answer="A",
40
+ raw_question="Test question",
41
+ raw_choices=["choice1", "choice2", "choice3", "choice4"],
42
+ raw_answer="A",
43
+ task_name="default",
44
+ )
45
+ assert isinstance(entry, TWLegalParseEntry)
46
+ assert entry.question == "Test question"
47
+ assert entry.answer == "A"
48
+ assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]
49
+
50
+
51
+ @pytest.mark.parametrize("invalid_answer", ["E", "F", "1", "", None])
52
+ def test_tw_legal_parse_entry_creation_invalid(invalid_answer):
53
+ """Test invalid answer handling in TWLegalParseEntry creation."""
54
+ with pytest.raises(
55
+ ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
56
+ ):
57
+ TWLegalParseEntry.create(
58
+ question="Test question",
59
+ answer=invalid_answer,
60
+ raw_question="Test question",
61
+ raw_choices=["choice1", "choice2", "choice3", "choice4"],
62
+ raw_answer=invalid_answer,
63
+ task_name="default",
64
+ )
65
+
66
+
67
+ def test_process_entry(tw_legal_parser, sample_tw_legal_entries):
68
+ """Test processing entries in Taiwan Legal parser."""
69
+ entry = tw_legal_parser.process_entry(sample_tw_legal_entries[0])
70
+
71
+ assert isinstance(entry, TWLegalParseEntry)
72
+ assert entry.answer == "D"
73
+ assert "A. 法人於法令限制內,有享受權利負擔義務之能力" in entry.question
74
+ assert "B. 法人因目的之達到而消滅" in entry.question
75
+ assert "C. 法人非依法律之規定,不得成立" in entry.question
76
+ assert "D. 法人於登記前,即取得權利能力" in entry.question
77
+ assert entry.raw_question == "依民法規定,下列關於法人之敘述,何者錯誤?"
78
+ assert len(entry.raw_choices) == 4
79
+
80
+
81
+ def test_tw_legal_parser_initialization(tw_legal_parser):
82
+ """Test Taiwan Legal parser initialization and properties."""
83
+ assert isinstance(tw_legal_parser.task_names, list)
84
+ assert len(tw_legal_parser.task_names) == 1 # Only default task
85
+ assert tw_legal_parser._data_source == "lianghsun/tw-legal-benchmark-v1"
86
+ assert tw_legal_parser._default_task == "default"
87
+ assert (
88
+ tw_legal_parser.get_huggingface_link
89
+ == "https://huggingface.co/datasets/lianghsun/tw-legal-benchmark-v1"
90
+ )
91
+
92
+
93
+ @pytest.mark.integration
94
+ def test_load_dataset(tw_legal_parser):
95
+ """Test loading the Taiwan Legal dataset."""
96
+ tw_legal_parser.load(split="train")
97
+ assert tw_legal_parser.raw_data is not None
98
+ assert tw_legal_parser.split_names == ["train"]
99
+ assert tw_legal_parser._current_task == "default"
100
+
101
+
102
+ def test_parser_string_representation(tw_legal_parser):
103
+ """Test string representation of Taiwan Legal parser."""
104
+ repr_str = str(tw_legal_parser)
105
+ assert "TWLegalDatasetParser" in repr_str
106
+ assert "lianghsun/tw-legal-benchmark-v1" in repr_str
107
+ assert "not loaded" in repr_str
108
+
109
+
110
+ @pytest.mark.integration
111
+ def test_data_parsing(tw_legal_parser):
112
+ """Test parsing the dataset."""
113
+ # Load and parse train split
114
+ tw_legal_parser.load(split="train")
115
+ tw_legal_parser.parse(split_names="train", force=True)
116
+ train_count = len(tw_legal_parser.get_parsed_data)
117
+
118
+ assert train_count > 0
119
+ # Additional assertions about the parsed data
120
+ parsed_data = tw_legal_parser.get_parsed_data
121
+ assert all(isinstance(entry, TWLegalParseEntry) for entry in parsed_data)
122
+ assert all(entry.answer in {"A", "B", "C", "D"} for entry in parsed_data)
123
+
124
+
125
+ def test_get_dataset_description(tw_legal_parser):
126
+ """Test getting dataset description for Taiwan Legal parser."""
127
+ description = tw_legal_parser.get_dataset_description()
128
+
129
+ assert description.name == "Taiwan Legal Benchmark"
130
+ assert description.language == "Traditional Chinese"
131
+ assert "Taiwan's legal system" in description.characteristics
132
+ assert (
133
+ "huggingface.co/datasets/lianghsun/tw-legal-benchmark-v1"
134
+ in description.citation
135
+ )
136
+
137
+
138
+ def test_get_evaluation_metrics(tw_legal_parser):
139
+ """Test getting evaluation metrics for Taiwan Legal parser."""
140
+ metrics = tw_legal_parser.get_evaluation_metrics()
141
+
142
+ assert len(metrics) == 1
143
+ metric = metrics[0]
144
+ assert metric.name == "accuracy"
145
+ assert metric.type == "classification"
146
+ assert metric.primary is True
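
To close, a minimal sketch mirroring test_process_entry above: it processes a single hand-built record offline, without downloading the benchmark, and uses only the fields and calls shown in these tests.

from llmdataparser.tw_legal_parser import TWLegalDatasetParser

parser = TWLegalDatasetParser()
raw_entry = {
    "question": "依民法規定,下列關於法人之敘述,何者錯誤?",
    "A": "法人於法令限制內,有享受權利負擔義務之能力",
    "B": "法人因目的之達到而消滅",
    "C": "法人非依法律之規定,不得成立",
    "D": "法人於登記前,即取得權利能力",
    "answer": "D",
}
entry = parser.process_entry(raw_entry)
print(entry.answer)        # "D"
print(entry.raw_question)  # the original question text, without the appended A-D options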