kisejin committed on
Commit 795c49e · 1 Parent(s): 02e64cb

initial: create version skipbert for mates

template_FL/.gitignore ADDED
@@ -0,0 +1,175 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Remove unnecessary output files
+ results/
+ wandb/
template_FL/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 KiseJin
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
template_FL/README.md ADDED
@@ -0,0 +1 @@
+ # template_FL
template_FL/requirements.txt ADDED
@@ -0,0 +1,36 @@
+ bitsandbytes==0.45.0
+ cryptography==42.0.8
+ cupy-cuda12x==13.3.0
+ docker-pycreds==0.4.0
+ faiss-cpu==1.9.0.post1
+ fastrlock==0.8.3
+ # flash-attn==2.6.3
+ flwr==1.14.0
+ flwr-datasets==0.5.0
+ fsspec<=2024.10.0,>=2023.1.0
+ gitdb==4.0.12
+ gitpython==3.1.44
+ grpcio==1.64.3
+ iterators==0.0.2
+ multiprocess==0.70.16
+ pathspec==0.12.1
+ protobuf==4.25.5
+ prv-accountant==0.2.0
+ pycryptodome==3.21.0
+ ray==2.10.0
+ rouge-score==0.1.2
+ sentry-sdk==2.19.2
+ setproctitle==1.3.4
+ shellingham==1.5.4
+ smmap==5.0.2
+ sympy==1.13.1
+ thop==0.1.1-2209072238
+ tomli-w==1.1.0
+ typer==0.12.5
+ wandb==0.19.3
+ python-dotenv
+ omegaconf
+ trl
+ evaluate
+ google
+ deepspeed
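
Since this manifest pins most versions exactly, a quick way to catch drift between the pins and an already-provisioned environment is to compare them at runtime. The sketch below is illustrative only (it is not part of this commit, and the three packages checked are just examples from the list above):

```python
# Minimal sketch: compare a few pins from requirements.txt against the
# active environment. Not part of this commit; packages chosen as examples.
from importlib.metadata import PackageNotFoundError, version

PINS = {"flwr": "1.14.0", "wandb": "0.19.3", "ray": "2.10.0"}

for name, expected in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}: {installed} {status}")
```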
template_FL/src/environment.yml ADDED
@@ -0,0 +1,565 @@
+ name: fedllm
+ channels:
+   - xformers
+   - pytorch
+   - nvidia
+   - defaults
+   - conda-forge
+   - https://repo.anaconda.com/pkgs/main
+   - https://repo.anaconda.com/pkgs/r
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - about-time=4.2.1=pyhd8ed1ab_1
+   - absl-py=2.1.0=pyhd8ed1ab_1
+   - accelerate=1.2.1=pyhd8ed1ab_1
+   - aiohappyeyeballs=2.4.4=pyhd8ed1ab_1
+   - aiohttp=3.11.11=py311h2dc5d0c_0
+   - aiosignal=1.3.2=pyhd8ed1ab_0
+   - alive-progress=3.2.0=pyhd8ed1ab_0
+   - alsa-lib=1.2.8=h166bdaf_0
+   - annotated-types=0.7.0=pyhd8ed1ab_1
+   - antlr-python-runtime=4.9.3=pyhd8ed1ab_1
+   - anyio=4.8.0=pyhd8ed1ab_0
+   - aom=3.5.0=h27087fc_0
+   - argon2-cffi=23.1.0=pyhd8ed1ab_1
+   - argon2-cffi-bindings=21.2.0=py311h9ecbd09_5
+   - arrow=1.3.0=pyhd8ed1ab_1
+   - asttokens=3.0.0=pyhd8ed1ab_1
+   - async-lru=2.0.4=pyhd8ed1ab_1
+   - async-timeout=4.0.3=pyhd8ed1ab_0
+   - attr=2.5.1=h166bdaf_1
+   - attrs=24.3.0=pyh71513ae_0
+   - autograd=1.7.0=pyhd8ed1ab_1
+   - aws-c-auth=0.7.3=he2921ad_3
+   - aws-c-cal=0.6.2=hc309b26_1
+   - aws-c-common=0.9.0=hd590300_0
+   - aws-c-compression=0.2.17=h4d4d85c_2
+   - aws-c-event-stream=0.3.2=h2e3709c_0
+   - aws-c-http=0.7.12=hc865f51_1
+   - aws-c-io=0.13.32=h1a03231_3
+   - aws-c-mqtt=0.9.5=h3a0376c_1
+   - aws-c-s3=0.3.14=h1678ad6_3
+   - aws-c-sdkutils=0.1.12=h4d4d85c_1
+   - aws-checksums=0.1.17=h4d4d85c_1
+   - aws-crt-cpp=0.23.0=h40cdbb9_5
+   - aws-sdk-cpp=1.10.57=h6f6b8fa_21
+   - babel=2.16.0=pyhd8ed1ab_1
+   - beautifulsoup4=4.12.3=pyha770c72_1
+   - binaryornot=0.4.4=pyhd8ed1ab_2
+   - blas=1.0=mkl
+   - bleach=6.2.0=pyhd8ed1ab_3
+   - bleach-with-css=6.2.0=hd8ed1ab_3
+   - blosc=1.21.5=h0f2a231_0
+   - boltons=24.0.0=pyhd8ed1ab_1
+   - brotli=1.0.9=h166bdaf_9
+   - brotli-bin=1.0.9=h166bdaf_9
+   - brotli-python=1.0.9=py311ha362b79_9
+   - brunsli=0.1=h9c3ff4c_0
+   - bzip2=1.0.8=h5eee18b_6
+   - c-ares=1.34.4=hb9d3cd8_0
+   - c-blosc2=2.12.0=hb4ffafa_0
+   - ca-certificates=2024.12.31=h06a4308_0
+   - cached-property=1.5.2=hd8ed1ab_1
+   - cached_property=1.5.2=pyha770c72_1
+   - cachetools=5.5.0=pyhd8ed1ab_1
+   - cairo=1.16.0=ha61ee94_1014
+   - certifi=2024.12.14=pyhd8ed1ab_0
+   - cffi=1.17.1=py311hf29c0ef_0
+   - cfitsio=4.2.0=hd9d235c_0
+   - chardet=5.2.0=py311h38be061_2
+   - charls=2.4.2=h59595ed_0
+   - charset-normalizer=3.4.1=pyhd8ed1ab_0
+   - click=8.1.8=pyh707e725_0
+   - cma=3.2.2=pyh050c7b8_0
+   - colorama=0.4.6=pyhd8ed1ab_1
+   - comm=0.2.2=pyhd8ed1ab_1
+   - conda=23.7.4=py311h38be061_0
+   - conda-package-handling=2.4.0=pyh7900ff3_2
+   - conda-package-streaming=0.11.0=pyhd8ed1ab_0
+   - contourpy=1.3.1=py311hd18a35c_0
+   - cookiecutter=2.6.0=pyhd8ed1ab_1
+   - cpp-expected=1.1.0=hf52228f_0
+   - cuda-cudart=12.4.127=0
+   - cuda-cupti=12.4.127=0
+   - cuda-libraries=12.4.1=0
+   - cuda-nvrtc=12.4.127=0
+   - cuda-nvtx=12.4.127=0
+   - cuda-opencl=12.6.77=0
+   - cuda-runtime=12.4.1=0
+   - cuda-version=12.6=3
+   - cycler=0.12.1=pyhd8ed1ab_1
+   - dataclasses=0.8=pyhc8e2a94_3
+   - dataclasses-json=0.6.7=pyhd8ed1ab_1
+   - datasets=2.19.2=pyhd8ed1ab_0
+   - dav1d=1.2.1=hd590300_0
+   - dbus=1.13.6=h5008d03_3
+   - debugpy=1.8.11=py311hfdbb021_0
+   - decorator=5.1.1=pyhd8ed1ab_1
+   - deepspeed=0.16.2=cpu_py311hd0a74d0_0
+   - defusedxml=0.7.1=pyhd8ed1ab_0
+   - deprecated=1.2.15=pyhd8ed1ab_1
+   - dill=0.3.8=pyhd8ed1ab_0
+   - distro=1.9.0=pyhd8ed1ab_1
+   - docstring_parser=0.16=pyhd8ed1ab_0
+   - einops=0.8.0=pyhd8ed1ab_1
+   - entrypoints=0.4=pyhd8ed1ab_1
+   - eval_type_backport=0.2.2=pyha770c72_0
+   - evaluate=0.4.1=pyhd8ed1ab_0
+   - exceptiongroup=1.2.2=pyhd8ed1ab_1
+   - executing=2.1.0=pyhd8ed1ab_1
+   - expat=2.6.4=h5888daf_0
+   - ffmpeg=4.3=hf484d3e_0
+   - fftw=3.3.10=nompi_hf1063bd_110
+   - filelock=3.16.1=pyhd8ed1ab_1
+   - fmt=10.2.1=h00ab1b0_0
+   - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
+   - font-ttf-inconsolata=3.000=h77eed37_0
+   - font-ttf-source-code-pro=2.038=h77eed37_0
+   - font-ttf-ubuntu=0.83=h77eed37_3
+   - fontconfig=2.14.2=h14ed4e7_0
+   - fonts-conda-ecosystem=1=0
+   - fonts-conda-forge=1=0
+   - fonttools=4.55.3=py311h2dc5d0c_1
+   - fqdn=1.5.1=pyhd8ed1ab_1
+   - freetype=2.12.1=h267a509_2
+   - frozendict=2.4.6=py311h9ecbd09_0
+   - frozenlist=1.5.0=py311h9ecbd09_0
+   - functorch=2.0.0=pyhd8ed1ab_0
+   - fvcore=0.1.5.post20221221=pyhd8ed1ab_0
+   - gdown=5.2.0=pyhd8ed1ab_1
+   - gettext=0.22.5=he02047a_3
+   - gettext-tools=0.22.5=he02047a_3
+   - gflags=2.2.2=h5888daf_1005
+   - giflib=5.2.2=hd590300_0
+   - glib=2.78.4=hfc55251_0
+   - glib-tools=2.78.4=hfc55251_0
+   - glog=0.6.0=h6f12383_0
+   - gmp=6.3.0=hac33072_2
+   - gmpy2=2.1.5=py311h0f6cedb_3
+   - gnutls=3.6.13=h85f3911_1
+   - grapheme=0.6.0=pyhd8ed1ab_1
+   - graphite2=1.3.13=h59595ed_1003
+   - greenlet=3.1.1=py311hfdbb021_1
+   - gst-plugins-base=1.22.0=h4243ec0_2
+   - gstreamer=1.22.0=h25f0c4b_2
+   - gstreamer-orc=0.4.40=hb9d3cd8_0
+   - h11=0.14.0=pyhd8ed1ab_1
+   - h2=4.1.0=pyhd8ed1ab_1
+   - harfbuzz=6.0.0=h8e241bc_0
+   - hjson-py=3.1.0=pyhd8ed1ab_1
+   - hpack=4.0.0=pyhd8ed1ab_1
+   - httpcore=1.0.7=pyh29332c3_1
+   - httpx=0.28.1=pyhd8ed1ab_0
+   - huggingface_hub=0.27.1=pyhd8ed1ab_0
+   - hyperframe=6.0.1=pyhd8ed1ab_1
+   - icu=70.1=h27087fc_0
+   - idna=3.10=pyhd8ed1ab_1
+   - imagecodecs=2023.1.23=py311ha5a3c35_0
+   - imageio=2.36.1=pyh12aca89_1
+   - importlib-metadata=8.5.0=pyha770c72_1
+   - importlib_metadata=8.5.0=hd8ed1ab_1
+   - importlib_resources=6.5.2=pyhd8ed1ab_0
+   - intel-openmp=2023.1.0=hdb19cb5_46306
+   - ipykernel=6.29.5=pyh3099207_0
+   - ipython=8.31.0=pyh707e725_0
+   - ipywidgets=8.1.5=pyhd8ed1ab_1
+   - isoduration=20.11.0=pyhd8ed1ab_1
+   - jack=1.9.22=h11f4161_0
+   - jedi=0.19.2=pyhd8ed1ab_1
+   - jinja2=3.1.5=pyhd8ed1ab_0
+   - jiter=0.8.2=py311h9e33e62_0
+   - joblib=1.4.2=pyhd8ed1ab_1
+   - jpeg=9e=h166bdaf_2
+   - json5=0.10.0=pyhd8ed1ab_1
+   - jsonpatch=1.33=pyhd8ed1ab_1
+   - jsonpointer=3.0.0=py311h38be061_1
+   - jsonschema=4.23.0=pyhd8ed1ab_1
+   - jsonschema-specifications=2024.10.1=pyhd8ed1ab_1
+   - jsonschema-with-format-nongpl=4.23.0=hd8ed1ab_1
+   - jupyter=1.1.1=pyhd8ed1ab_1
+   - jupyter-lsp=2.2.5=pyhd8ed1ab_1
+   - jupyter_client=8.6.3=pyhd8ed1ab_1
+   - jupyter_console=6.6.3=pyhd8ed1ab_1
+   - jupyter_core=5.7.2=pyh31011fe_1
+   - jupyter_events=0.11.0=pyhd8ed1ab_0
+   - jupyter_server=2.15.0=pyhd8ed1ab_0
+   - jupyter_server_terminals=0.5.3=pyhd8ed1ab_1
+   - jupyterlab=4.3.4=pyhd8ed1ab_0
+   - jupyterlab_pygments=0.3.0=pyhd8ed1ab_2
+   - jupyterlab_server=2.27.3=pyhd8ed1ab_1
+   - jupyterlab_widgets=3.0.13=pyhd8ed1ab_1
+   - jxrlib=1.1=hd590300_3
+   - keyutils=1.6.1=h166bdaf_0
+   - kiwisolver=1.4.7=py311hd18a35c_0
+   - krb5=1.20.1=h81ceb04_0
+   - lame=3.100=h166bdaf_1003
+   - langchain=0.2.5=pyhd8ed1ab_0
+   - langchain-core=0.2.43=pyhd8ed1ab_0
+   - langchain-text-splitters=0.2.4=pyhd8ed1ab_0
+   - langsmith=0.1.147=pyhd8ed1ab_0
+   - lazy-loader=0.4=pyhd8ed1ab_2
+   - lazy_loader=0.4=pyhd8ed1ab_2
+   - lcms2=2.15=hfd0df8a_0
+   - ld_impl_linux-64=2.40=h12ee557_0
+   - lerc=4.0.0=h27087fc_0
+   - libabseil=20230125.3=cxx17_h59595ed_0
+   - libaec=1.1.3=h59595ed_0
+   - libaio=0.3.113=h166bdaf_0
+   - libarchive=3.6.2=h3d51595_0
+   - libarrow=13.0.0=hb9dc469_0_cpu
+   - libasprintf=0.22.5=he8f35ee_3
+   - libasprintf-devel=0.22.5=he8f35ee_3
+   - libavif=0.11.1=h8182462_2
+   - libblas=3.9.0=1_h86c2bf4_netlib
+   - libbrotlicommon=1.0.9=h166bdaf_9
+   - libbrotlidec=1.0.9=h166bdaf_9
+   - libbrotlienc=1.0.9=h166bdaf_9
+   - libcap=2.67=he9d0100_0
+   - libcblas=3.9.0=8_h3b12eaf_netlib
+   - libclang=15.0.7=default_h127d8a8_5
+   - libclang13=15.0.7=default_h5d6823c_5
+   - libcrc32c=1.1.2=h9c3ff4c_0
+   - libcublas=12.4.5.8=0
+   - libcufft=11.2.1.3=0
+   - libcufile=1.11.1.6=0
+   - libcups=2.3.3=h36d4200_3
+   - libcurand=10.3.7.77=0
+   - libcurl=8.11.1=hc9e6f67_0
+   - libcusolver=11.6.1.9=0
+   - libcusparse=12.3.1.170=0
+   - libdb=6.2.32=h9c3ff4c_0
+   - libdeflate=1.17=h0b41bf4_0
+   - libedit=3.1.20191231=he28a2e2_2
+   - libev=4.33=hd590300_2
+   - libevent=2.1.10=h28343ad_4
+   - libexpat=2.6.4=h5888daf_0
+   - libffi=3.4.4=h6a678d5_1
+   - libflac=1.4.3=h59595ed_0
+   - libgcc=14.2.0=h77fa898_1
+   - libgcc-ng=14.2.0=h69a702a_1
+   - libgcrypt=1.11.0=ha770c72_2
+   - libgcrypt-devel=1.11.0=hb9d3cd8_2
+   - libgcrypt-lib=1.11.0=hb9d3cd8_2
+   - libgcrypt-tools=1.11.0=hb9d3cd8_2
+   - libgettextpo=0.22.5=he02047a_3
+   - libgettextpo-devel=0.22.5=he02047a_3
+   - libgfortran=14.2.0=h69a702a_1
+   - libgfortran-ng=14.2.0=h69a702a_1
+   - libgfortran5=14.2.0=hd5240d6_1
+   - libglib=2.78.4=h783c2da_0
+   - libgomp=14.2.0=h77fa898_1
+   - libgoogle-cloud=2.12.0=h840a212_1
+   - libgpg-error=1.51=hbd13f7d_1
+   - libgrpc=1.56.2=h3905398_1
+   - libhwloc=2.9.1=hd6dc26d_0
+   - libiconv=1.17=hd590300_2
+   - libjpeg-turbo=2.0.0=h9bf148f_0
+   - liblapack=3.9.0=8_h3b12eaf_netlib
+   - libllvm15=15.0.7=hadd5161_1
+   - libmamba=1.5.1=h744094f_0
+   - libmambapy=1.5.1=py311hf2555c7_0
+   - libnghttp2=1.57.0=h2d74bed_0
+   - libnpp=12.2.5.30=0
+   - libnsl=2.0.1=hd590300_0
+   - libnuma=2.0.18=h4ab18f5_2
+   - libnvfatbin=12.6.77=0
+   - libnvjitlink=12.4.127=0
+   - libnvjpeg=12.3.1.117=0
+   - libogg=1.3.5=h4ab18f5_0
+   - libopus=1.3.1=h7f98852_1
+   - libpng=1.6.39=h5eee18b_0
+   - libpq=15.3=hbcd7760_1
+   - libprotobuf=4.23.3=hd1fb520_1
+   - libsentencepiece=0.1.99=h28b9611_1
+   - libsndfile=1.2.2=hc60ed4a_1
+   - libsodium=1.0.18=h36c2ea0_1
+   - libsolv=0.7.30=he621ea3_1
+   - libsqlite=3.46.0=hde9e2c9_0
+   - libssh2=1.11.1=h251f7ec_0
+   - libstdcxx=14.2.0=hc0a3c3a_1
+   - libstdcxx-ng=14.2.0=h4852527_1
+   - libsystemd0=253=h8c4010b_1
+   - libthrift=0.18.1=h5e4af38_0
+   - libtiff=4.5.0=h6adf6a1_2
+   - libtool=2.4.7=he02047a_1
+   - libudev1=253=h0b41bf4_1
+   - libutf8proc=2.8.0=hf23e847_1
+   - libuuid=2.38.1=h0b41bf4_0
+   - libvorbis=1.3.7=h9c3ff4c_0
+   - libwebp=1.2.4=h1daa5a0_1
+   - libwebp-base=1.2.4=h166bdaf_0
+   - libxcb=1.13=h7f98852_1004
+   - libxkbcommon=1.5.0=h79f4944_1
+   - libxml2=2.10.3=hca2bb57_4
+   - libzlib=1.2.13=h4ab18f5_6
+   - libzopfli=1.0.3=h9c3ff4c_0
+   - lightning=2.5.0.post0=pyhd8ed1ab_0
+   - lightning-utilities=0.11.9=pyhd8ed1ab_1
+   - llvm-openmp=12.0.1=h4bd325d_1
+   - lz4-c=1.9.4=hcb278e6_0
+   - lzo=2.10=hd590300_1001
+   - mamba=1.5.1=py311h3072747_0
+   - markdown=3.6=pyhd8ed1ab_0
+   - markdown-it-py=3.0.0=pyhd8ed1ab_1
+   - markupsafe=3.0.2=py311h2dc5d0c_1
+   - marshmallow=3.25.1=pyhd8ed1ab_0
+   - matplotlib=3.9.1=py311h38be061_1
+   - matplotlib-base=3.9.1=py311h74b4f7c_2
+   - matplotlib-inline=0.1.7=pyhd8ed1ab_1
+   - mdurl=0.1.2=pyhd8ed1ab_1
+   - mistune=3.1.0=pyhd8ed1ab_0
+   - mkl=2023.1.0=h213fc3f_46344
+   - mpc=1.3.1=h24ddda3_1
+   - mpfr=4.2.1=h90cbb55_3
+   - mpg123=1.32.9=hc50e24c_0
+   - mpmath=1.3.0=pyhd8ed1ab_1
+   - msgpack-python=1.1.0=py311hd18a35c_0
+   - multidict=6.1.0=py311h2dc5d0c_2
+   - munkres=1.1.4=pyh9f0ad1d_0
+   - mypy_extensions=1.0.0=pyha770c72_1
+   - mysql-common=8.0.33=hf1915f5_6
+   - mysql-libs=8.0.33=hca2cd23_6
+   - nbclient=0.10.2=pyhd8ed1ab_0
+   - nbconvert-core=7.16.5=pyhd8ed1ab_1
+   - nbformat=5.10.4=pyhd8ed1ab_1
+   - ncurses=6.4=h6a678d5_0
+   - nest-asyncio=1.6.0=pyhd8ed1ab_1
+   - nettle=3.6=he412f7d_0
+   - networkx=3.4.2=pyh267e887_2
+   - ninja=1.12.1=h297d8ca_0
+   - nlohmann_json=3.11.3=he02047a_1
+   - nltk=3.9.1=pyhd8ed1ab_1
+   - notebook=7.3.2=pyhd8ed1ab_0
+   - notebook-shim=0.2.4=pyhd8ed1ab_1
+   - nspr=4.36=h5888daf_0
+   - nss=3.100=hca3bf56_0
+   - numpy=1.26.4=py311h64a7726_0
+   - nvidia-ml-py=12.560.30=pyhd8ed1ab_1
+   - nvitop=1.4.1=pyh707e725_1
+   - omegaconf=2.3.0=pyhd8ed1ab_0
+   - opacus=1.5.2=pyhd8ed1ab_1
+   - openai=1.59.7=pyhd8ed1ab_0
+   - openh264=2.1.1=h780b84a_0
+   - openjpeg=2.5.0=hfec8fc6_2
+   - openssl=3.1.7=hb9d3cd8_0
+   - opt-einsum=3.4.0=hd8ed1ab_1
+   - opt_einsum=3.4.0=pyhd8ed1ab_1
+   - orc=1.9.0=h385abfd_1
+   - orjson=3.10.14=py311h9e33e62_0
+   - overrides=7.7.0=pyhd8ed1ab_1
+   - packaging=24.2=pyhd8ed1ab_2
+   - pandas=2.2.3=py311h7db5c69_1
+   - pandocfilters=1.5.0=pyhd8ed1ab_0
+   - parso=0.8.4=pyhd8ed1ab_1
+   - patsy=1.0.1=pyhd8ed1ab_1
+   - pcre2=10.42=hebb0a14_1
+   - peft=0.14.0=pyhd8ed1ab_0
+   - pexpect=4.9.0=pyhd8ed1ab_1
+   - pickleshare=0.7.5=pyhd8ed1ab_1004
+   - pillow=9.4.0=py311h6a678d5_0
+   - pip=24.2=py311h06a4308_0
+   - pixman=0.44.2=h29eaf8c_0
+   - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_2
+   - platformdirs=4.3.6=pyhd8ed1ab_1
+   - pluggy=1.5.0=pyhd8ed1ab_1
+   - ply=3.11=pyhd8ed1ab_3
+   - portalocker=3.0.0=py311h38be061_0
+   - prometheus_client=0.21.1=pyhd8ed1ab_0
+   - prompt-toolkit=3.0.48=pyha770c72_1
+   - prompt_toolkit=3.0.48=hd8ed1ab_1
+   - propcache=0.2.1=py311h9ecbd09_0
+   - psutil=6.1.1=py311h9ecbd09_0
+   - pthread-stubs=0.4=hb9d3cd8_1002
+   - ptyprocess=0.7.0=pyhd8ed1ab_1
+   - pulseaudio=16.1=hcb278e6_3
+   - pulseaudio-client=16.1=h5195f5e_3
+   - pulseaudio-daemon=16.1=ha8d29e2_3
+   - pure_eval=0.2.3=pyhd8ed1ab_1
+   - py-cpuinfo=9.0.0=pyhd8ed1ab_1
+   - pyarrow=13.0.0=py311h39c9aba_0_cpu
+   - pyarrow-hotfix=0.6=pyhd8ed1ab_1
+   - pybind11-abi=4=hd8ed1ab_3
+   - pycosat=0.6.6=py311h9ecbd09_2
+   - pycparser=2.22=pyh29332c3_1
+   - pydantic=2.10.5=pyh3cfb1c2_0
+   - pydantic-core=2.27.2=py311h9e33e62_0
+   - pygments=2.19.1=pyhd8ed1ab_0
+   - pymoo=0.6.1.3=py311h7db5c69_0
+   - pynvml=12.0.0=pyhd8ed1ab_0
+   - pyopenssl=24.3.0=pyhd8ed1ab_0
+   - pyparsing=3.2.1=pyhd8ed1ab_0
+   - pypdf=3.17.4=pyhd8ed1ab_0
+   - pyqt=5.15.9=py311hf0fb5b6_5
+   - pyqt5-sip=12.12.2=py311hb755f60_5
+   - pysocks=1.7.1=pyha55dd90_7
+   - python=3.11.6=hab00c5b_0_cpython
+   - python-dateutil=2.9.0.post0=pyhff2d567_1
+   - python-dotenv=1.0.1=pyhd8ed1ab_1
+   - python-fastjsonschema=2.21.1=pyhd8ed1ab_0
+   - python-json-logger=2.0.7=pyhd8ed1ab_0
+   - python-slugify=8.0.4=pyhd8ed1ab_1
+   - python-tzdata=2024.2=pyhd8ed1ab_1
+   - python-xxhash=3.5.0=py311h9ecbd09_1
+   - python_abi=3.11=2_cp311
+   - pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
+   - pytorch-cuda=12.4=hc786d27_7
+   - pytorch-lightning=2.5.0.post0=pyh101cb37_0
+   - pytorch-mutex=1.0=cuda
+   - pytz=2024.1=pyhd8ed1ab_0
+   - pywavelets=1.8.0=py311h9f3472d_0
+   - pyyaml=6.0.2=py311h9ecbd09_1
+   - pyzmq=26.2.0=py311h7deb3e3_0
+   - qhull=2020.2=h434a139_5
+   - qt-main=5.15.8=h5d23da1_6
+   - rdma-core=28.9=h59595ed_1
+   - re2=2023.03.02=h8c504da_0
+   - readline=8.2=h5eee18b_0
+   - referencing=0.35.1=pyhd8ed1ab_1
+   - regex=2024.11.6=py311h9ecbd09_0
+   - reproc=14.2.5.post0=hb9d3cd8_0
+   - reproc-cpp=14.2.5.post0=h5888daf_0
+   - requests=2.32.3=pyhd8ed1ab_1
+   - requests-toolbelt=1.0.0=pyhd8ed1ab_1
+   - responses=0.18.0=pyhd8ed1ab_0
+   - rfc3339-validator=0.1.4=pyhd8ed1ab_1
+   - rfc3986-validator=0.1.1=pyh9f0ad1d_0
+   - rich=13.9.4=pyhd8ed1ab_1
+   - rpds-py=0.22.3=py311h9e33e62_0
+   - ruamel.yaml=0.17.40=py311h459d7ec_0
+   - ruamel.yaml.clib=0.2.8=py311h9ecbd09_1
+   - s2n=1.3.51=h06160fa_0
+   - sacrebleu=2.1.0=pyhd8ed1ab_0
+   - sacremoses=0.0.53=pyhd8ed1ab_0
+   - safetensors=0.5.2=py311h9e33e62_0
+   - scikit-image=0.25.0=py311h7db5c69_0
+   - scikit-learn=1.6.1=py311h57cc02b_0
+   - scipy=1.15.1=py311hc1ac118_0
+   - seaborn=0.13.2=hd8ed1ab_3
+   - seaborn-base=0.13.2=pyhd8ed1ab_3
+   - send2trash=1.8.3=pyh0d859eb_1
+   - sentence-transformers=2.7.0=pyhd8ed1ab_0
+   - sentencepiece=0.1.99=h38be061_1
+   - sentencepiece-python=0.1.99=py311hf03188e_1
+   - sentencepiece-spm=0.1.99=h28b9611_1
+   - setuptools=75.1.0=py311h06a4308_0
+   - shtab=1.7.1=pyhd8ed1ab_1
+   - simdjson=3.11.5=h84d6215_0
+   - sip=6.7.12=py311hb755f60_0
+   - six=1.17.0=pyhd8ed1ab_0
+   - snappy=1.1.10=hdb0a2a9_1
+   - sniffio=1.3.1=pyhd8ed1ab_1
+   - soupsieve=2.5=pyhd8ed1ab_1
+   - spdlog=1.14.1=h597fd29_0
+   - sqlalchemy=2.0.37=py311h9ecbd09_0
+   - sqlite=3.45.3=h5eee18b_0
+   - stack_data=0.6.3=pyhd8ed1ab_1
+   - statsmodels=0.14.4=py311h9f3472d_0
+   - tabulate=0.9.0=pyhd8ed1ab_2
+   - tbb=2021.9.0=hf52228f_0
+   - tenacity=8.5.0=pyhd8ed1ab_0
+   - tensorboard=2.18.0=pyhd8ed1ab_1
+   - tensorboard-data-server=0.7.0=py311h63ff55d_1
+   - termcolor=2.5.0=pyhd8ed1ab_1
+   - terminado=0.18.1=pyh0d859eb_0
+   - text-unidecode=1.3=pyhd8ed1ab_2
+   - threadpoolctl=3.5.0=pyhc1e730c_0
+   - tifffile=2023.8.12=pyhd8ed1ab_0
+   - tinycss2=1.4.0=pyhd8ed1ab_0
+   - tk=8.6.14=h39e8969_0
+   - tokenizers=0.13.3=py311h1b04a43_0
+   - toml=0.10.2=pyhd8ed1ab_1
+   - tomli=2.2.1=pyhd8ed1ab_1
+   - toolz=1.0.0=pyhd8ed1ab_1
+   - torchaudio=2.5.1=py311_cu124
+   - torchmetrics=1.6.1=pyhd8ed1ab_0
+   - torchtriton=3.1.0=py311
+   - torchvision=0.20.1=py311_cu124
+   - tornado=6.4.2=py311h9ecbd09_0
+   - tqdm=4.67.1=pyhd8ed1ab_1
+   - traitlets=5.14.3=pyhd8ed1ab_1
+   - transformers=4.33.3=pyhd8ed1ab_0
+   - transforms3d=0.4.2=pyhd8ed1ab_1
+   - trl=0.10.1=pyhd8ed1ab_0
+   - types-python-dateutil=2.9.0.20241206=pyhd8ed1ab_0
+   - typing=3.10.0.0=pyhd8ed1ab_2
+   - typing-extensions=4.12.2=hd8ed1ab_1
+   - typing_extensions=4.12.2=pyha770c72_1
+   - typing_inspect=0.9.0=pyhd8ed1ab_1
+   - typing_utils=0.1.0=pyhd8ed1ab_1
+   - tyro=0.9.1=pyhff2d567_0
+   - tzdata=2024b=h04d1e81_0
+   - ucx=1.14.1=h195a15c_5
+   - unicodedata2=16.0.0=py311h9ecbd09_0
+   - uri-template=1.3.0=pyhd8ed1ab_1
+   - urllib3=2.3.0=pyhd8ed1ab_0
+   - wcwidth=0.2.13=pyhd8ed1ab_1
+   - webcolors=24.11.1=pyhd8ed1ab_0
+   - webencodings=0.5.1=pyhd8ed1ab_3
+   - websocket-client=1.8.0=pyhd8ed1ab_1
+   - werkzeug=3.1.3=pyhd8ed1ab_1
+   - wheel=0.44.0=py311h06a4308_0
+   - widgetsnbextension=4.0.13=pyhd8ed1ab_1
+   - wrapt=1.17.1=py311h9ecbd09_0
+   - xcb-util=0.4.0=h516909a_0
+   - xcb-util-image=0.4.0=h166bdaf_0
+   - xcb-util-keysyms=0.4.0=h516909a_0
+   - xcb-util-renderutil=0.3.9=h166bdaf_0
+   - xcb-util-wm=0.4.1=h516909a_0
+   - xformers=0.0.28.post3=py311_cu12.1.0_pyt2.5.1
+   - xkeyboard-config=2.38=h0b41bf4_0
+   - xorg-kbproto=1.0.7=hb9d3cd8_1003
+   - xorg-libice=1.1.2=hb9d3cd8_0
+   - xorg-libsm=1.2.5=he73a12e_0
+   - xorg-libx11=1.8.4=h0b41bf4_0
+   - xorg-libxau=1.0.12=hb9d3cd8_0
+   - xorg-libxdmcp=1.1.5=hb9d3cd8_0
+   - xorg-libxext=1.3.4=h0b41bf4_2
+   - xorg-libxrender=0.9.10=h7f98852_1003
+   - xorg-renderproto=0.11.1=hb9d3cd8_1003
+   - xorg-xextproto=7.3.0=hb9d3cd8_1004
+   - xorg-xproto=7.0.31=hb9d3cd8_1008
+   - xxhash=0.8.2=hd590300_0
+   - xz=5.4.6=h5eee18b_1
+   - yacs=0.1.8=pyhd8ed1ab_1
+   - yaml=0.2.5=h7f98852_2
+   - yaml-cpp=0.7.0=h59595ed_3
+   - yarl=1.18.3=py311h9ecbd09_0
+   - zeromq=4.3.5=h59595ed_1
+   - zfp=1.0.1=h5888daf_2
+   - zipp=3.21.0=pyhd8ed1ab_1
+   - zlib=1.2.13=h4ab18f5_6
+   - zlib-ng=2.0.7=h0b41bf4_0
+   - zstandard=0.23.0=py311hbc35293_1
+   - zstd=1.5.6=hc292b87_0
+   - pip:
+     - bitsandbytes==0.45.0
+     - cryptography==42.0.8
+     - cupy-cuda12x==13.3.0
+     - docker-pycreds==0.4.0
+     - faiss-cpu==1.9.0.post1
+     - fastrlock==0.8.3
+     - flash-attn==2.6.3
+     - flwr==1.14.0
+     - flwr-datasets==0.5.0
+     - fsspec==2024.12.0
+     - gitdb==4.0.12
+     - gitpython==3.1.44
+     - grpcio==1.64.3
+     - iterators==0.0.2
+     - multiprocess==0.70.16
+     - pathspec==0.12.1
+     - protobuf==4.25.5
+     - prv-accountant==0.2.0
+     - pycryptodome==3.21.0
+     - ray==2.10.0
+     - rouge-score==0.1.2
+     - sentry-sdk==2.19.2
+     - setproctitle==1.3.4
+     - shellingham==1.5.4
+     - smmap==5.0.2
+     - sympy==1.13.1
+     - thop==0.1.1-2209072238
+     - tomli-w==1.1.0
+     - typer==0.12.5
+     - wandb==0.19.3
template_FL/src/ex.env.example ADDED
@@ -0,0 +1,3 @@
+ WANDB_API_KEY = ""
+ WANDB_NAME = "FL@CSS25"
+ HF_TOKEN = ""
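
The example file above lists the environment variables the project expects: a Weights & Biases API key, a W&B run name, and a Hugging Face token. A minimal sketch of how they might be consumed with python-dotenv (pinned in requirements.txt) follows; copying the file to `.env` and the failure behavior are assumptions for illustration, not part of this commit:

```python
# Minimal sketch: load the variables defined in ex.env.example after
# copying it to .env. Assumes python-dotenv is installed; the variable
# names match ex.env.example, everything else here is illustrative.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

wandb_api_key = os.getenv("WANDB_API_KEY", "")
wandb_name = os.getenv("WANDB_NAME", "FL@CSS25")
hf_token = os.getenv("HF_TOKEN", "")

if not wandb_api_key or not hf_token:
    raise RuntimeError("Set WANDB_API_KEY and HF_TOKEN before running.")
```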
template_FL/src/fedllm/Untitled.ipynb ADDED
@@ -0,0 +1,861 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5f1cdd6d-0f6c-447c-8b11-665d0201215e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'flwr_datasets'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mflwr_datasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpartitioner\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m IidPartitioner\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mflwr_datasets\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m FederatedDataset\n\u001b[1;32m 5\u001b[0m partitioner \u001b[38;5;241m=\u001b[39m IidPartitioner(num_partitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m)\n",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'flwr_datasets'"
+ ]
+ }
+ ],
+ "source": [
+ "from flwr_datasets.partitioner import IidPartitioner\n",
+ "from flwr_datasets import FederatedDataset\n",
+ "\n",
+ "\n",
+ "partitioner = IidPartitioner(num_partitions=10)\n",
+ "FDS = FederatedDataset(\n",
+ " dataset=\"vicgalle/alpaca-gpt4\",\n",
+ " partitioners={\"train\": partitioner},\n",
+ ")\n",
+ "client_trainset = FDS.load_partition(1, \"train\")\n",
+ "client_trainset = client_trainset.rename_column(\"output\", \"response\")\n",
+ "client_trainset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "fecf675d-1279-4fbd-a4e3-15d245582df4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import datasets\n",
+ "from datasets import load_dataset, DatasetDict\n",
+ "import pandas as pd\n",
+ "from functools import partial\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "def get_dataset(dataset_name, local_data_dir=None):\n",
+ "\n",
+ " if dataset_name in [\"gsm8k\"]:\n",
+ " dataset_name = local_data_dir + dataset_name if local_data_dir is not None else dataset_name\n",
+ " dataset = load_dataset(dataset_name, name=\"main\")\n",
+ " elif dataset_name in [\"lighteval/MATH\"]:\n",
+ " dataset_name = local_data_dir + dataset_name if local_data_dir is not None else dataset_name\n",
+ " dataset = load_dataset(dataset_name, name=\"all\")\n",
+ " else:\n",
+ " dataset_name = local_data_dir + dataset_name if local_data_dir is not None else dataset_name\n",
+ " dataset = load_dataset(dataset_name)\n",
+ "\n",
+ " return dataset\n",
+ "\n",
+ "# Function to split a dataset dictionary into two 50/50 parts\n",
+ "def split_dataset_50_50(dataset_dict):\n",
+ " split_datasets = {\n",
+ " 'ds_1': {},\n",
+ " 'ds_2': {}\n",
+ " }\n",
+ " for split in ['train', 'valid', 'test']:\n",
+ " if split in dataset_dict:\n",
+ " dataset_split_1, dataset_split_2 = train_test_split(\n",
+ " dataset_dict[split], test_size=0.5, shuffle=True, random_state=42\n",
+ " )\n",
+ " print(f\">> ===== After split, Dataset1 {split} has {len(dataset_split_1)} examples. =====\")\n",
+ " print(f\">> ===== After split, Dataset2 {split} has {len(dataset_split_2)} examples. =====\")\n",
+ " split_datasets['ds_1'][split] = dataset_split_1\n",
+ " split_datasets['ds_2'][split] = dataset_split_2\n",
+ " return DatasetDict(split_datasets['ds_1']), DatasetDict(split_datasets['ds_2'])\n",
+ "\n",
+ "\n",
+ "def process_sft_dataset(dataset_name, dataset, dataset_sample):\n",
+ " if dataset_name in [\"lucasmccabe-lmi/CodeAlpaca-20k\", \"yahma/alpaca-cleaned\", \"FinGPT/fingpt-sentiment-train\"]:\n",
+ " dataset = dataset.map(alpaca_format, remove_columns=['input', 'output'], desc=f\"Preprocessing {dataset_name} for unified format.\")\n",
+ " elif dataset_name in [\"WizardLM/WizardLM_evol_instruct_70k\"]:\n",
+ " dataset = dataset.rename_column(\"output\", \"response\")\n",
+ " elif dataset_name in [\"tatsu-lab/alpaca\", \"vicgalle/alpaca-gpt4\", \"gbharti/finance-alpaca\"]:\n",
+ " dataset = dataset.map(alpaca_format, remove_columns=['input', 'output', 'text'], desc=f\"Preprocessing {dataset_name} for unified format.\")\n",
+ " elif dataset_name in [\"TIGER-Lab/MathInstruct\"]:\n",
+ " df = pd.DataFrame(dataset)\n",
+ " df = df.drop_duplicates(subset=['instruction'])\n",
+ " dataset = datasets.Dataset.from_pandas(df)\n",
+ " dataset = dataset.rename_column(\"output\", \"response\")\n",
+ " dataset = dataset.remove_columns(['source'])\n",
+ " elif dataset_name in [\"lighteval/MATH\"]:\n",
+ " dataset = dataset.rename_column(\"solution\", \"response\")\n",
+ " dataset = dataset.rename_column(\"problem\", \"instruction\")\n",
+ " dataset = dataset.remove_columns(['level', 'type'])\n",
+ " elif dataset_name in ['gsm8k']:\n",
+ " dataset = dataset.rename_column(\"question\", \"instruction\")\n",
+ " dataset = dataset.rename_column(\"answer\", \"response\")\n",
+ " elif dataset_name in ['medalpaca/medical_meadow_medical_flashcards']: # TODO: 'lavita/ChatDoctor-HealthCareMagic-100k'. not sure whether to discard the instruction.\n",
+ " dataset = dataset.remove_columns(['instruction'])\n",
+ " dataset = dataset.rename_column(\"input\", \"instruction\")\n",
+ " dataset = dataset.rename_column(\"output\", \"response\")\n",
+ " else:\n",
+ " raise NotImplementedError(f\"Dataset {dataset_name} is not supported.\")\n",
+ " dataset = dataset.shuffle(seed=2023)\n",
+ " if dataset_sample:\n",
+ " num_sample = min(len(dataset), dataset_sample)\n",
+ " dataset = dataset.select(range(num_sample))\n",
+ " print(f\">> ===== After processing, Dataset {dataset_name} has {len(dataset)} examples. =====\")\n",
+ " print(f\">> ===== Splitting two parts datasets =====\")\n",
+ " \n",
+ " if len(dataset['train']) > 10000 and len(dataset['test']) >= 2000:\n",
+ " dataset = split_dataset_50_50(dataset)\n",
+ " return dataset\n",
+ "\n",
+ "def alpaca_format(example):\n",
+ " if example['input'] == \"\":\n",
+ " example[\"instruction\"] = example[\"instruction\"]\n",
+ " else:\n",
+ " example[\"instruction\"] = example[\"instruction\"] + \" \" + example['input']\n",
+ " example[\"response\"] = example['output']\n",
+ " return example\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "73d16824-ec97-4e5f-87bc-c432253b93c5",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import datasets\n",
+ "import pandas as pd\n",
+ "from datasets import Dataset, DatasetDict, load_dataset\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from functools import partial\n",
+ "\n",
+ "\n",
+ "class DatasetAbstract:\n",
+ " def __init__(self, dataset_name: list[str], category: str):\n",
+ " self.dataset_name = dataset_name\n",
+ " self.metadata = {\n",
+ " 'domain': category\n",
+ " }\n",
+ " \n",
+ " def _processing_data(self):\n",
+ " pass\n",
+ " \n",
+ " @classmethod\n",
+ " def get_dataset(cls, dataset_name, local_data_dir=None):\n",
+ " if dataset_name in [\"gsm8k\"]:\n",
+ " dataset_name = local_data_dir + dataset_name if local_data_dir is not None else dataset_name\n",
+ " dataset = load_dataset(dataset_name, name=\"main\")\n",
+ " else:\n",
+ " dataset_name = local_data_dir + dataset_name if local_data_dir is not None else dataset_name\n",
+ " dataset = load_dataset(dataset_name)\n",
+ " \n",
+ " return dataset\n",
+ " \n",
+ " def get_split_dataset(self, dataset):\n",
+ " print(f\">> ===== After processing, Dataset has {len(dataset)} examples. =====\")\n",
+ " if len(dataset) > 10000:\n",
+ " ds_part1, ds_part2 = train_test_split(\n",
+ " dataset, test_size=0.5, shuffle=True, random_state=42\n",
+ " )\n",
+ " print(f\">> ===== After split, Dataset1 has {len(ds_part1)} examples and Dataset2 has {len(ds_part2)} examples. =====\")\n",
+ " list_dataset = []\n",
+ " for subset in [ds_part1, ds_part2]:\n",
+ " train, test = train_test_split(\n",
+ " subset, test_size=0.2, shuffle=True, random_state=42\n",
+ " )\n",
+ " ds = DatasetDict({\n",
+ " \"train\": Dataset.from_pandas(train).remove_columns(['__index_level_0__']),\n",
+ " \"test\": Dataset.from_pandas(test).remove_columns(['__index_level_0__'])\n",
+ " })\n",
+ " list_dataset.append(ds)\n",
+ " return list_dataset\n",
+ " \n",
+ " else:\n",
+ " train, test = train_test_split(\n",
+ " dataset , test_size=0.2, shuffle=True, random_state=42\n",
+ " )\n",
+ " ds = DatasetDict(\n",
+ " {\n",
+ " \"train\": Dataset.from_pandas(train).remove_columns(['__index_level_0__']),\n",
+ " \"test\": Dataset.from_pandas(test).remove_columns(['__index_level_0__'])\n",
+ " }\n",
+ " )\n",
+ " return [ds]\n",
+ "\n",
+ " \n",
+ "class GeneralDataset(DatasetAbstract):\n",
+ " \n",
+ " def __init__(self):\n",
+ " list_dataset = [\"tatsu-lab/alpaca\", \"vicgalle/alpaca-gpt4\"]\n",
+ " super().__init__(list_dataset, 'general')\n",
+ " self._processing_data()\n",
+ " \n",
+ " def _processing_data(self):\n",
+ " datasets = []\n",
+ " for dataset_name in self.dataset_name:\n",
+ " datasets.append(\n",
+ " pd.DataFrame(super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train'])\n",
+ " )\n",
+ " dataset = pd.concat(datasets, ignore_index=True)\n",
+ " self.list_dataset = self.get_split_dataset(dataset)\n",
+ " \n",
+ " \n",
+ "\n",
+ "class FinanceDataset(DatasetAbstract):\n",
+ " \n",
+ " def __init__(self):\n",
+ " list_dataset = [\"gbharti/finance-alpaca\", \"FinGPT/fingpt-sentiment-train\"]\n",
+ " super().__init__(list_dataset, 'finance')\n",
+ " \n",
+ " self._processing_data()\n",
+ " \n",
+ " def _processing_data(self):\n",
+ " datasets = []\n",
+ " for dataset_name in self.dataset_name:\n",
+ " ds = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train']\n",
+ " if dataset_name == 'gbharti/finance-alpaca':\n",
+ " ds = ds.remove_columns(['text'])\n",
+ " df = pd.DataFrame(ds)\n",
+ " datasets.append(df)\n",
+ " dataset = pd.concat(datasets, ignore_index=True)\n",
+ " self.list_dataset = self.get_split_dataset(dataset)\n",
+ " \n",
+ "\n",
+ "class MathDataset(DatasetAbstract):\n",
+ " \n",
+ " def __init__(self):\n",
+ " list_dataset = [\"TIGER-Lab/MathInstruct\", \"xDAN2099/lighteval-MATH\", \"gsm8k\"]\n",
+ " super().__init__(list_dataset, 'math')\n",
+ " self._processing_data()\n",
+ " \n",
+ " \n",
+ " def get_split_dataset(self, dataset):\n",
+ " dataset_train, dataset_test = dataset[0], dataset[1]\n",
+ " print(f\">> ===== After processing, Dataset has {len(dataset_train)} examples. =====\")\n",
+ " if len(dataset_train) > 10000:\n",
+ " ds_train_part1, ds_train_part2 = train_test_split(\n",
+ " dataset_train, test_size=0.5, shuffle=True, random_state=42\n",
+ " )\n",
+ " ds_test_part1, ds_test_part2 = train_test_split(\n",
+ " dataset_test, test_size=0.5, shuffle=True, random_state=42\n",
+ " )\n",
+ " print(f\">> ===== After split, Dataset1 has {len(ds_train_part1)} examples and Dataset2 has {len(ds_train_part2)} examples. =====\")\n",
+ " list_dataset = []\n",
+ " for i in range(2):\n",
+ " ds = DatasetDict({\n",
+ " \"train\": Dataset.from_pandas(eval(f'ds_train_part{i+1}')).remove_columns(['__index_level_0__']), \n",
+ " \"test\": Dataset.from_pandas(eval(f'ds_test_part{i+1}')).remove_columns(['__index_level_0__'])\n",
+ " })\n",
+ " list_dataset.append(ds)\n",
+ " return list_dataset\n",
+ " \n",
+ " else:\n",
+ " ds = DatasetDict(\n",
+ " {\n",
+ " \"train\": Dataset.from_pandas(dataset_train).remove_columns(['__index_level_0__']),\n",
+ " \"test\": Dataset.from_pandas(dataset_test).remove_columns(['__index_level_0__'])\n",
+ " }\n",
+ " )\n",
+ " return [ds]\n",
+ " \n",
+ " def _processing_data(self):\n",
+ " datasets_train, datasets_test = [], []\n",
+ " for dataset_name in self.dataset_name:\n",
+ " ds_tmp = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)\n",
+ " if dataset_name == 'TIGER-Lab/MathInstruct':\n",
+ " df = pd.DataFrame(ds_tmp['train'])\n",
+ " df = df.drop_duplicates(subset=['instruction'])\n",
+ " df = df.drop(['source'], axis=1)\n",
+ " df_train, df_test = train_test_split(df, test_size=0.3, shuffle=True, random_state=42)\n",
+ " \n",
+ " elif dataset_name == \"xDAN2099/lighteval-MATH\":\n",
+ " ds_tmp = ds_tmp.remove_columns(['level', 'type'])\n",
+ " ds_tmp = ds_tmp.rename_column(\"solution\", \"output\")\n",
+ " ds_tmp = ds_tmp.rename_column(\"problem\", \"instruction\")\n",
+ " df_train, df_test = pd.DataFrame(ds_tmp['train']), pd.DataFrame(ds_tmp['test'])\n",
+ " \n",
+ " elif dataset_name == 'gsm8k':\n",
+ " ds_tmp = ds_tmp.rename_column(\"answer\", \"output\")\n",
+ " ds_tmp = ds_tmp.rename_column(\"question\", \"instruction\")\n",
+ " df_train, df_test = pd.DataFrame(ds_tmp['train']), pd.DataFrame(ds_tmp['test'])\n",
+ " \n",
+ " df_train['input'] = [''] * len(df_train)\n",
+ " df_test['input'] = [''] * len(df_test)\n",
+ " datasets_train.append(df_train)\n",
+ " datasets_test.append(df_test)\n",
+ " \n",
+ " dataset_train = pd.concat(datasets_train, ignore_index=True)\n",
+ " dataset_test = pd.concat(datasets_test, ignore_index=True)\n",
+ " dataset = [dataset_train, dataset_test]\n",
+ " self.list_dataset = self.get_split_dataset(dataset)\n",
+ " \n",
+ "\n",
+ "class MedicalDataset(DatasetAbstract):\n",
+ " \n",
+ " def __init__(self):\n",
+ " list_dataset = [\"medalpaca/medical_meadow_medical_flashcards\"]\n",
+ " super().__init__(list_dataset, 'medical')\n",
+ " self._processing_data()\n",
+ " \n",
+ " def _processing_data(self):\n",
+ " datasets = []\n",
+ " for dataset_name in self.dataset_name:\n",
+ " ds = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train']\n",
+ " if dataset_name == 'medalpaca/medical_meadow_medical_flashcards':\n",
+ " ds = ds.remove_columns(['instruction'])\n",
+ " ds = ds.rename_column(\"input\", \"instruction\")\n",
+ " \n",
+ " df = pd.DataFrame(ds)\n",
+ " df['input'] = [''] * len(df)\n",
+ " datasets.append(df)\n",
+ " dataset = pd.concat(datasets, ignore_index=True)\n",
+ " self.list_dataset = self.get_split_dataset(dataset)\n",
+ " \n",
+ "class CodeDataset(DatasetAbstract):\n",
+ " \n",
+ " def __init__(self):\n",
+ " list_dataset = [\"lucasmccabe-lmi/CodeAlpaca-20k\", \"WizardLMTeam/WizardLM_evol_instruct_70k\"]\n",
+ " super().__init__(list_dataset, 'code')\n",
+ " self._processing_data()\n",
+ " \n",
+ " def _processing_data(self):\n",
+ " datasets = []\n",
+ " for dataset_name in self.dataset_name:\n",
+ " ds = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train']\n",
+ " df = pd.DataFrame(ds)\n",
+ " if dataset_name == 'WizardLMTeam/WizardLM_evol_instruct_70k':\n",
+ " df['input'] = [''] * len(df)\n",
+ " datasets.append(df)\n",
+ " dataset = pd.concat(datasets, ignore_index=True)\n",
+ " self.list_dataset = self.get_split_dataset(dataset)\n",
+ " \n",
+ "client_id_dataset = {\n",
+ " '1': GeneralDataset().list_dataset[0],\n",
+ " '2': GeneralDataset().list_dataset[1],\n",
+ " '3': FinanceDataset().list_dataset[0],\n",
+ " '4': FinanceDataset().list_dataset[1],\n",
+ " '5': MathDataset().list_dataset[0],\n",
+ " '6': MathDataset().list_dataset[1],\n",
+ " '7': MedicalDataset().list_dataset[0],\n",
+ " '8': MedicalDataset().list_dataset[1],\n",
+ " '9': CodeDataset().list_dataset[0],\n",
+ " '10': CodeDataset().list_dataset[1],\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "85a86706-fd9b-4618-b6e0-b6b2743d1246",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a5ca5da771024ac5acffe25a7e625d6e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "README.md: 0%| | 0.00/709 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "33deb82850224a00912678f33da2f5c7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Cleaned_date.json: 0%| | 0.00/42.9M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2161bc54748c4eaf990d85c8dfb1d616",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating train split: 0%| | 0/68912 [00:00<?, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f0abe4e86e9b40e7a64dbd4ba80a1b3d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "README.md: 0%| | 0.00/529 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4b662fd05288467b90064ce383f5ecea",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "(…)-00000-of-00001-dabab110260ac909.parquet: 0%| | 0.00/6.42M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5ee6a13f60124db4af266207872fc6fb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating train split: 0%| | 0/76772 [00:00<?, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ">> ===== After processing, Dataset has 145684 examples. =====\n",
+ ">> ===== After split, Dataset1 has 72842 examples and Dataset2 has 72842 examples. =====\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[DatasetDict({\n",
+ " train: Dataset({\n",
+ " features: ['instruction', 'input', 'output'],\n",
+ " num_rows: 58273\n",
+ " })\n",
+ " test: Dataset({\n",
+ " features: ['instruction', 'input', 'output'],\n",
+ " num_rows: 14569\n",
+ " })\n",
+ " }),\n",
+ " DatasetDict({\n",
+ " train: Dataset({\n",
+ " features: ['instruction', 'input', 'output'],\n",
+ " num_rows: 58273\n",
+ " })\n",
+ " test: Dataset({\n",
+ " features: ['instruction', 'input', 'output'],\n",
+ " num_rows: 14569\n",
+ " })\n",
+ " })]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "FinanceDataset().list_dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "78c926d4-9462-4a8a-b50a-ef5ed1e5c25e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>letter</th>\n",
+ " <th>number</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>a</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>b</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " letter number\n",
+ "0 a 1\n",
+ "1 b 2"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "client_id_dataset = {\n",
+ " '1':\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "9b9babb9-01fe-4b01-be23-d1f334dd450c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>letter</th>\n",
+ " <th>number</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>a</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>b</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " letter number\n",
+ "0 a 1\n",
+ "1 b 2"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "df1 = pd.DataFrame([['a', 1], ['b', 2]],\n",
+ " columns=['letter', 'number'])\n",
+ "\n",
+ "df2 = pd.DataFrame([['c', 3], ['d', 4]],\n",
+ " columns=['letter', 'number'])\n",
+ "\n",
+ "df3 = pd.DataFrame([['e', 5], ['f', 6]],\n",
+ " columns=['letter', 'number'])\n",
+ "\n",
+ "pd.concat([df1], ignore_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "fa021e4a-5e8b-470f-bf6f-db7b9e1f9eb6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ">> ===== After processing, Dataset gsm8k has 2 examples. =====\n",
+ ">> ===== Splitting two parts datasets =====\n"
648
+ ]
649
+ },
650
+ {
651
+ "data": {
652
+ "text/plain": [
653
+ "DatasetDict({\n",
654
+ " train: Dataset({\n",
655
+ " features: ['instruction', 'output'],\n",
656
+ " num_rows: 7473\n",
657
+ " })\n",
658
+ " test: Dataset({\n",
659
+ " features: ['instruction', 'output'],\n",
660
+ " num_rows: 1319\n",
661
+ " })\n",
662
+ "})"
663
+ ]
664
+ },
665
+ "execution_count": 19,
666
+ "metadata": {},
667
+ "output_type": "execute_result"
668
+ }
669
+ ],
670
+ "source": [
671
+ "dataset = get_dataset(dataset_name=\"gsm8k\", local_data_dir=None)\n",
672
+ "datasets = process_sft_dataset(dataset_name=\"gsm8k\", dataset=dataset, dataset_sample=None)\n",
673
+ "datasets = datasets.rename_column(\"response\", \"output\")\n",
674
+ "datasets"
675
+ ]
676
+ },
677
+ {
678
+ "cell_type": "code",
679
+ "execution_count": 18,
680
+ "id": "c8e02304-5b90-4651-ad36-7feb440d7ea4",
681
+ "metadata": {
682
+ "tags": []
683
+ },
684
+ "outputs": [],
685
+ "source": [
686
+ "def clean_llm_text(text):\n",
687
+ " \"\"\"\n",
688
+ " Clean and normalize text from LLM outputs by removing noise and repetitions.\n",
689
+ " \n",
690
+ " Args:\n",
691
+ " text (str): Raw text from LLM prediction\n",
692
+ " \n",
693
+ " Returns:\n",
694
+ " str: Cleaned and normalized text\n",
695
+ " \"\"\"\n",
696
+ " import re\n",
697
+ " \n",
698
+ " # Remove repetitive patterns (like 'cor cor cor' or 'asesases')\n",
699
+ " def remove_repetitions(text):\n",
700
+ " # Split into words\n",
701
+ " words = text.split()\n",
702
+ " cleaned_words = []\n",
703
+ " prev_word = None\n",
704
+ " repetition_count = 0\n",
705
+ " \n",
706
+ " for word in words:\n",
707
+ " if word == prev_word:\n",
708
+ " repetition_count += 1\n",
709
+ " if repetition_count < 2: # Allow up to 2 repetitions for legitimate cases\n",
710
+ " cleaned_words.append(word)\n",
711
+ " else:\n",
712
+ " repetition_count = 0\n",
713
+ " cleaned_words.append(word)\n",
714
+ " prev_word = word\n",
715
+ " \n",
716
+ " return ' '.join(cleaned_words)\n",
717
+ " \n",
718
+ " def remove_repeats(text):\n",
719
+ " # Remove repeated words\n",
720
+ " pattern_words = r'\\b(\\w+)(?:\\s+\\1\\b)+'\n",
721
+ " text = re.sub(pattern_words, r'\\1', text)\n",
722
+ "\n",
723
+ " # Remove repeated character patterns (like 'asasas')\n",
724
+ " pattern_chars = r'(\\w+?)\\1+'\n",
725
+ " text = re.sub(pattern_chars, r'\\1', text)\n",
726
+ "\n",
727
+ " return text\n",
728
+ " \n",
729
+ " # Remove excessive punctuation\n",
730
+ " def normalize_punctuation(text):\n",
731
+ " # Replace multiple exclamation/question marks with single ones\n",
732
+ " text = re.sub(r'!+', '!', text)\n",
733
+ " text = re.sub(r'\\?+', '?', text)\n",
734
+ " # Remove multiple periods (except for ellipsis)\n",
735
+ " text = re.sub(r'\\.{4,}', '...', text)\n",
736
+ " text = text.replace('cor', '').replace('asesa', '')\n",
737
+ " return text\n",
738
+ " \n",
739
+ " # Main cleaning pipeline\n",
740
+ " cleaned_text = text.strip()\n",
741
+ " \n",
742
+ " # Remove common noise patterns\n",
743
+ " noise_patterns = [\n",
744
+ " r'\\n+', # Multiple newlines\n",
745
+ " r'\\s+', # Multiple spaces\n",
746
+ " r'\\\\n', # Literal \\n\n",
747
+ " r'\\\\t', # Literal \\t\n",
748
+ " ]\n",
749
+ " \n",
750
+ " for pattern in noise_patterns:\n",
751
+ " cleaned_text = re.sub(pattern, ' ', cleaned_text)\n",
752
+ " \n",
753
+ " # Apply cleaning functions\n",
754
+ " # cleaned_text = remove_repetitions(cleaned_text)\n",
755
+ " cleaned_text = remove_repeats(cleaned_text)\n",
756
+ " cleaned_text = normalize_punctuation(cleaned_text)\n",
757
+ " cleaned_text = ' '.join(cleaned_text.split()) # Normalize spacing\n",
758
+ " \n",
759
+ " return cleaned_text"
760
+ ]
761
+ },
762
+ {
763
+ "cell_type": "code",
764
+ "execution_count": 20,
765
+ "id": "9d95fbfe-d824-4247-8628-3bb1cffe9065",
766
+ "metadata": {
767
+ "tags": []
768
+ },
769
+ "outputs": [
770
+ {
771
+ "data": {
772
+ "text/plain": [
773
+ "'folowing into their based mamals animals, water animals. 1 Response: animals: , Elephant,Sea Animals: Dolphin, Dolphin'"
774
+ ]
775
+ },
776
+ "execution_count": 20,
777
+ "metadata": {},
778
+ "output_type": "execute_result"
779
+ }
780
+ ],
781
+ "source": [
782
+ "text_str = f\"\"\"at\\n\\nSea Animals: Whale, Fish cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor', \" cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor One example of a technology that uses artificial intelligence is a virtual personal assistant such as Amazon's Alexa, Apple's Siri, or Google Assistant. These devices use natural language processing and machine learning to understand and respond to user's voice commands, providing assistance in tasks such as setting reminders, playing music, or answering questions. cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor cor\"\"\"\n",
783
+ "text_str1 = f\"\"\"following into their based mammals animals, water animals.\\n\\n1 Response:\\n \\n animals: \\n, Elephant,Sea Animals: Dolphin, Dolphin\\n\\nasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesasesa\"\"\"\n",
784
+ "clean_llm_text(text_str1)"
785
+ ]
786
+ },
787
+ {
788
+ "cell_type": "code",
789
+ "execution_count": 10,
790
+ "id": "c2bd139b-55f9-4848-a3d3-0db55d601e67",
791
+ "metadata": {
792
+ "tags": []
793
+ },
794
+ "outputs": [
795
+ {
796
+ "data": {
797
+ "text/plain": [
798
+ "tensor([[1, 2, 3]])"
799
+ ]
800
+ },
801
+ "execution_count": 10,
802
+ "metadata": {},
803
+ "output_type": "execute_result"
804
+ }
805
+ ],
806
+ "source": [
807
+ "import torch\n",
808
+ "\n",
809
+ "x = torch.tensor([1, 2, 3])\n",
810
+ "torch.unsqueeze(x,dim=0)"
811
+ ]
812
+ },
813
+ {
814
+ "cell_type": "code",
815
+ "execution_count": 16,
816
+ "id": "2eb4071a-3d82-4d86-bb0e-a1403a2b1c1e",
817
+ "metadata": {
818
+ "tags": []
819
+ },
820
+ "outputs": [
821
+ {
822
+ "name": "stdout",
823
+ "output_type": "stream",
824
+ "text": [
825
+ "TF-IDF Matrix Shape: (136, 25)\n"
826
+ ]
827
+ }
828
+ ],
829
+ "source": []
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": null,
834
+ "id": "4f75512f-13e8-4373-8630-8b5269729c32",
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": []
838
+ }
839
+ ],
840
+ "metadata": {
841
+ "kernelspec": {
842
+ "display_name": "py11torch",
843
+ "language": "python",
844
+ "name": "py11torch"
845
+ },
846
+ "language_info": {
847
+ "codemirror_mode": {
848
+ "name": "ipython",
849
+ "version": 3
850
+ },
851
+ "file_extension": ".py",
852
+ "mimetype": "text/x-python",
853
+ "name": "python",
854
+ "nbconvert_exporter": "python",
855
+ "pygments_lexer": "ipython3",
856
+ "version": "3.11.8"
857
+ }
858
+ },
859
+ "nbformat": 4,
860
+ "nbformat_minor": 5
861
+ }
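The cell above defines `df2` and `df3` but concatenates only `df1`, which is why the displayed output has two rows. For reference, a minimal standalone sketch of the same `pd.concat` call extended to all three frames:

```python
import pandas as pd

df1 = pd.DataFrame([['a', 1], ['b', 2]], columns=['letter', 'number'])
df2 = pd.DataFrame([['c', 3], ['d', 4]], columns=['letter', 'number'])
df3 = pd.DataFrame([['e', 5], ['f', 6]], columns=['letter', 'number'])

# Stack row-wise; ignore_index rebuilds a clean 0..5 RangeIndex
combined = pd.concat([df1, df2, df3], ignore_index=True)
print(combined)
#   letter  number
# 0      a       1
# 1      b       2
# 2      c       3
# 3      d       4
# 4      e       5
# 5      f       6
```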
template_FL/src/fedllm/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """flowertune_llm."""
template_FL/src/fedllm/client_app.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """flowertune-llm: A Flower / FlowerTune app."""
2
+
3
+ import os
4
+ import warnings
5
+ from typing import Dict, Tuple
6
+
7
+ import torch
8
+ import wandb
9
+ import numpy as np
10
+ from flwr.client import ClientApp, NumPyClient
11
+ from flwr.common import Context
12
+ from flwr.common.config import unflatten_dict
13
+ from flwr.common.typing import NDArrays, Scalar
14
+ from omegaconf import DictConfig
15
+
16
+ from transformers import TrainingArguments, DataCollatorForSeq2Seq, Trainer, EarlyStoppingCallback, BertForSequenceClassification, GenerationConfig
17
+
18
+ from trl import SFTTrainer, SFTConfig
19
+ from deepspeed.profiling.flops_profiler import get_model_profile
20
+ from deepspeed.accelerator import get_accelerator
21
+
22
+ from .trainer import ManualTrainer
23
+
24
+ from .dataset import (
25
+ get_data_collator_and_prompt_formatting,
26
+ load_data,
27
+ load_data_homo,
28
+ load_data_hete,
29
+ replace_keys,
30
+ )
31
+ from .models import *
32
+
33
+ from .flwr_mods import get_wandb_mod
34
+ from .metrics import exact_match, f1, get_rouge_score
35
+ from .utils import clean_output_text
36
+ from .make_data import Prompter, generate_and_tokenize_prompt
37
+
38
+ # Avoid warnings
39
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
40
+ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
41
+ warnings.filterwarnings("ignore", category=UserWarning)
42
+
43
+
44
+ def input_constructor(batch_size, seq_len, tokenizer):
45
+ fake_seq = ""
46
+ for _ in range(seq_len - 2): # ignore the two special tokens [CLS] and [SEP]
47
+ fake_seq += tokenizer.pad_token
48
+ inputs = tokenizer([fake_seq] * batch_size,
49
+ padding=True,
50
+ truncation=True,
51
+ max_length=seq_len,
52
+ return_tensors="pt")
53
+ labels = torch.tensor([1] * batch_size)
54
+ inputs = dict(inputs)
55
+ # inputs.update({"labels": torch.unsqueeze(labels,dim=0)})
56
+
57
+ # To device
58
+ inputs = {k: v.to('cuda:0') for k, v in inputs.items()}
59
+ return inputs
60
+
61
+ def convert_to_float(value_str):
62
+ value, unit = value_str.split()
63
+ value = float(value)
64
+ if 'T' in unit:
65
+ return value * 1e12
66
+ elif 'G' in unit:
67
+ return value * 1e9
68
+ elif 'M' in unit:
69
+ return value * 1e6
70
+ elif 'K' in unit:
71
+ return value * 1e3
72
+ return value
73
+
74
+ # pylint: disable=too-many-arguments
75
+ # pylint: disable=too-many-instance-attributes
76
+ class FlowerClient(NumPyClient):
77
+ """Standard Flower client for CNN training."""
78
+
79
+ def __init__(
80
+ self,
81
+ model_cfg: DictConfig,
82
+ train_cfg: DictConfig,
83
+ mates_args: DictConfig,
84
+ trainset,
85
+ valset,
86
+ num_rounds,
87
+ ): # pylint: disable=too-many-arguments
88
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
89
+ self.train_cfg = train_cfg
90
+
91
+ self.training_arguments = TrainingArguments(**train_cfg.training_arguments)
92
+ # self.training_arguments = SFTConfig(**train_cfg.training_arguments, max_seq_length=train_cfg.seq_length)
93
+
94
+ self.num_rounds = num_rounds
95
+ self.trainset = trainset
96
+ self.valset = valset
97
+ self.mates_args = mates_args
98
+ self.holdoutset = None
99
+ self.refset = None
100
+ self.data_influence_model = None
101
+ self.data_influence_tokenizer = None
102
+
103
+ # instantiate model
104
+ self.model, self.tokenizer = get_model(model_cfg)
105
+
106
+ if self.mates_args.state:
107
+ self.data_influence_model, self.data_influence_tokenizer = get_data_influence_model(model_cfg)
108
+
109
+ # (
110
+ # self.data_collator,
111
+ # self.formatting_prompts_func
112
+ # ) = get_data_collator_and_propt_formatting(self.tokenizer)
113
+
114
+ self.data_collator = DataCollatorForSeq2Seq(
115
+ self.tokenizer,
116
+ pad_to_multiple_of=8,
117
+ return_tensors="pt",
118
+ padding=True,
119
+ )
120
+
121
+ self.train_on_inputs = self.train_cfg.train_on_inputs
122
+
123
+ self._make_dataset()
124
+
125
+ def compute_metrics(self, pred):
126
+ labels_ids = pred['label_ids']
127
+ pred_ids = pred['predictions']
128
+
129
+ # Replace -100 with pad token id in labels
130
+ labels_ids[labels_ids == -100] = self.tokenizer.pad_token_id
131
+
132
+ print(f"Shape of predictions: {np.shape(pred_ids)}")
133
+ print(f"Shape of labels: {np.shape(labels_ids)}")
134
+
135
+ # Decode predictions and labels
136
+ pred_str = self.tokenizer.batch_decode(
137
+ pred_ids, skip_special_tokens=True
138
+ )
139
+ label_str = self.tokenizer.batch_decode(
140
+ labels_ids, skip_special_tokens=True
141
+ )
142
+
143
+ # Remove any extra whitespace from the decoded strings
144
+ pred_str = [s.strip() for s in pred_str]
145
+ label_str = [s.strip() for s in label_str]
146
+
147
+ return {
148
+ **get_rouge_score(predictions=pred_str, targets=label_str),
149
+ **f1(predictions=pred_str, targets=label_str),
150
+ }
151
+
152
+ def _make_dataset(self):
153
+ prompter = Prompter(self.train_cfg.prompt_template_name, self.train_cfg.verbose)
154
+ tmp_dict = {
155
+ "prompter": prompter,
156
+ "seq_length": self.train_cfg.seq_length,
157
+ "train_on_inputs": self.train_on_inputs,
158
+ "tokenizer": self.tokenizer,
159
+ }
160
+
161
+ # Process trainset
162
+ self.trainset = (
163
+ self.trainset
164
+ .shuffle()
165
+ .map(
166
+ lambda x: generate_and_tokenize_prompt(x, **tmp_dict),
167
+ num_proc=8,
168
+ )
169
+ )
170
+
171
+ # Process valset
172
+ self.valset = (
173
+ self.valset
174
+ .shuffle()
175
+ .map(
176
+ lambda x: generate_and_tokenize_prompt(x, **tmp_dict),
177
+ num_proc=8,
178
+ )
179
+ )
180
+
181
+ # Create holdoutset and refset if state is True
182
+ if self.mates_args.state:
183
+ trainset_size = len(self.trainset)
184
+
185
+ # Calculate sizes for holdout and reference sets
186
+ holdout_size = int(trainset_size * self.mates_args.holdout_ratio)
187
+ ref_size = int(trainset_size * self.mates_args.reference_ratio)
188
+
189
+ # Shuffle the dataset itself; contiguous index slices below are then random
190
+ indices = list(range(trainset_size))
191
+ self.trainset = self.trainset.shuffle()
192
+
193
+ # Split the dataset
194
+ holdout_indices = indices[:holdout_size]
195
+ ref_indices = indices[holdout_size:holdout_size + ref_size]
196
+
197
+ # Create holdoutset and refset
198
+ self.holdoutset = self.trainset.select(holdout_indices)
199
+ self.refset = self.trainset.select(ref_indices)
200
+
201
+ print(f"Holdoutset size: {len(self.holdoutset)}, Refset size: {len(self.refset)}")
202
+
203
+
204
+ def fit(
205
+ self, parameters: NDArrays, config: Dict[str, Scalar]
206
+ ) -> Tuple[NDArrays, int, Dict]:
207
+ """Implement distributed fit function for a given client."""
208
+ if self.mates_args.state and int(config["current_round"]) != 1:
209
+ main_model_params, data_influence_model_params = split_models(parameters)
210
+ set_parameters(self.model, main_model_params)
211
+ set_parameters_bert(self.data_influence_model, data_influence_model_params)
212
+ else:
213
+ set_parameters(self.model, parameters)
214
+
215
+ new_lr = cosine_annealing(
216
+ int(config["current_round"]),
217
+ self.num_rounds,
218
+ self.train_cfg.learning_rate_max,
219
+ self.train_cfg.learning_rate_min,
220
+ )
221
+
222
+ self.training_arguments.learning_rate = new_lr
223
+ self.training_arguments.output_dir = config["save_path"]
224
+
225
+ # Initialize callback
226
+ early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5)
227
+
228
+ # Construct supervised trainer
229
+ # trainer = SFTTrainer(
230
+ # model=self.model,
231
+ # tokenizer=self.tokenizer,
232
+ # args=self.training_arguments,
233
+ # train_dataset=self.trainset,
234
+ # eval_dataset=self.valset,
235
+ # formatting_func=self.formatting_prompts_func,
236
+ # data_collator=self.data_collator,
237
+ # compute_metrics=self.compute_metrics,
238
+ # callbacks=[flops_callback, early_stopping_callback]
239
+ # )
240
+
241
+ # # Constuct baseline Trainer
242
+ # trainer = Trainer(
243
+ # model=self.model,
244
+ # train_dataset=self.trainset,
245
+ # eval_dataset=self.valset.select(range(10)),
246
+ # args=self.training_arguments,
247
+ # data_collator=self.data_collator,
248
+ # compute_metrics=self.compute_metrics,
249
+ # callbacks=[early_stopping_callback]
250
+ # )
251
+
252
+ trainer = ManualTrainer(
253
+ model=self.model,
254
+ tokenizer=self.tokenizer,
255
+ train_dataset=self.trainset,
256
+ val_dataset=self.valset.select(range(10)),
257
+ holdout_dataset=self.holdoutset,
258
+ reference_dataset=self.refset,
259
+ args=self.training_arguments,
260
+ data_collator=self.data_collator,
261
+ compute_metrics=self.compute_metrics,
262
+ mates_args=self.mates_args,
263
+ data_influence_model=self.data_influence_model,
264
+ data_influence_tokenizer=self.data_influence_tokenizer,
265
+ )
266
+
267
+ # Train the model
268
+ results = trainer.train()
269
+
270
+ if self.mates_args.state:
271
+ # After training
272
+ main_model_params = get_parameters(self.model)
273
+ data_influence_model_params = model_parameters_to_ndarrays(self.data_influence_model)
274
+ final_model_params = concatenate_models_with_marker(main_model_params, data_influence_model_params)
275
+ else:
276
+ final_model_params = get_parameters(self.model)
277
+
278
+ # Calculate FLOPs
279
+ with get_accelerator().device('cuda:0'):
280
+ batch_size = self.training_arguments.per_device_eval_batch_size
281
+ seq_len = self.train_cfg.seq_length
282
+ flops1, macs1, params1 = get_model_profile(
283
+ self.model,
284
+ kwargs=input_constructor(batch_size, seq_len, self.tokenizer),
285
+ print_profile=True,
286
+ detailed=False,
287
+ )
288
+ # Guard: the data influence model exists only when MATES is enabled
289
+ flops2, macs2, params2 = ("0 G", "0 G", "0 G")
290
+ if self.mates_args.state:
291
+ flops2, macs2, params2 = get_model_profile(
292
+ self.data_influence_model,
293
+ kwargs=input_constructor(batch_size, seq_len, self.data_influence_tokenizer), print_profile=True, detailed=False)
294
+ flops1_value, flops2_value = convert_to_float(flops1), convert_to_float(flops2)
295
+ macs1_value, macs2_value = convert_to_float(macs1), convert_to_float(macs2)
296
+ params1_value, params2_value = convert_to_float(params1), convert_to_float(params2)
297
+ wandb.log({"total_flops": flops1_value + flops2_value, "macs": macs1_value + macs2_value, "params": params1_value + params2_value}) # wa
298
+
299
+ return (
300
+ final_model_params,
301
+ len(self.trainset),
302
+ {"train_loss": results['training_loss'], "flops": flops1_value + flops2_value},
303
+ )
304
+
305
+
306
+ def client_fn(context: Context) -> FlowerClient:
307
+ """Create a Flower client representing a single organization."""
308
+ partition_id = context.node_config["partition-id"]
309
+ num_partitions = context.node_config["num-partitions"]
310
+ num_rounds = context.run_config["num-server-rounds"]
311
+ cfg = DictConfig(replace_keys(unflatten_dict(context.run_config)))
312
+
313
+ # Let's get the client partition
314
+ if cfg.dataset.type == 'homo':
315
+ client_set = load_data_homo(partition_id, num_partitions, cfg.dataset.name)
316
+ else:
317
+ client_set = load_data_hete(partition_id)
318
+
319
+ return FlowerClient(
320
+ cfg.model,
321
+ cfg.train,
322
+ cfg.mates,
323
+ client_set['train'],
324
+ client_set['test'],
325
+ num_rounds,
326
+ ).to_client()
327
+
328
+
329
+ # Flower ClientApp
330
+ app = ClientApp(
331
+ client_fn,
332
+ mods=[
333
+ get_wandb_mod("FL@CSS25"),
334
+ ],
335
+ )
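`convert_to_float` above turns the human-readable unit strings returned by DeepSpeed's profiler into raw floats before logging. A self-contained sketch of the same parsing, with made-up values standing in for real profiler output:

```python
def convert_to_float(value_str: str) -> float:
    # "3.5 TFLOPS" -> 3.5e12; falls through unchanged for unit-less values
    value, unit = value_str.split()
    value = float(value)
    for prefix, scale in (("T", 1e12), ("G", 1e9), ("M", 1e6), ("K", 1e3)):
        if prefix in unit:
            return value * scale
    return value

print(convert_to_float("3.5 TFLOPS"))   # 3500000000000.0
print(convert_to_float("540.0 GMACs"))  # 540000000000.0
print(convert_to_float("125.5 M"))      # 125500000.0
```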
template_FL/src/fedllm/data_domains.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ import pandas as pd
3
+ from datasets import Dataset, DatasetDict, load_dataset
4
+ from sklearn.model_selection import train_test_split
5
+ from functools import partial
6
+
7
+ global_test_set_hete = {}
8
+
9
+ class DatasetAbstract:
10
+ def __init__(self, dataset_name: list[str], category: str):
11
+ self.dataset_name = dataset_name
12
+ self.metadata = {
13
+ 'domain': category
14
+ }
15
+
16
+ def _processing_data(self):
17
+ pass
18
+
19
+ @classmethod
20
+ def get_dataset(cls, dataset_name, local_data_dir=None):
21
+ if dataset_name in ["gsm8k"]:
22
+ dataset_name = local_data_dir + dataset_name if local_data_dir is not None else dataset_name
23
+ dataset = load_dataset(dataset_name, name="main")
24
+ else:
25
+ dataset_name = local_data_dir + dataset_name if local_data_dir is not None else dataset_name
26
+ dataset = load_dataset(dataset_name)
27
+
28
+ return dataset
29
+
30
+ def get_split_dataset(self, dataset):
31
+ print(f">> ===== After processing, Dataset has {len(dataset)} examples. =====")
32
+ if len(dataset) > 10000:
33
+ ds_part1, ds_part2 = train_test_split(
34
+ dataset, test_size=0.5, shuffle=True, random_state=42
35
+ )
36
+ print(f">> ===== After split, Dataset1 has {len(ds_part1)} examples and Dataset2 has {len(ds_part2)} examples. =====")
37
+ list_dataset = []
38
+ list_global_set = []
39
+ for subset in [ds_part1, ds_part2]:
40
+ train, test = train_test_split(
41
+ subset, test_size=0.2, shuffle=True, random_state=42
42
+ )
43
+ test, global_test = train_test_split(
44
+ test, test_size=0.1, shuffle=True, random_state=42
45
+ )
46
+ ds = DatasetDict({
47
+ "train": Dataset.from_pandas(train).remove_columns(['__index_level_0__']),
48
+ "test": Dataset.from_pandas(test).remove_columns(['__index_level_0__'])
49
+ })
50
+ list_dataset.append(ds)
51
+ list_global_set.append(global_test)
52
+
53
+ list_global_set = pd.concat(list_global_set, ignore_index=True)
54
+ list_global_set = Dataset.from_pandas(list_global_set)
55
+ return list_dataset, list_global_set
56
+
57
+ else:
58
+ train, test = train_test_split(
59
+ dataset, test_size=0.2, shuffle=True, random_state=42
60
+ )
61
+ test, global_test = train_test_split(
62
+ test, test_size=0.1, shuffle=True, random_state=42
63
+ )
64
+ ds = DatasetDict(
65
+ {
66
+ "train": Dataset.from_pandas(train).remove_columns(['__index_level_0__']),
67
+ "test": Dataset.from_pandas(test).remove_columns(['__index_level_0__'])
68
+ }
69
+ )
70
+ global_set = Dataset.from_pandas(global_test).remove_columns(['__index_level_0__'])
71
+ return [ds], global_set
72
+
73
+
74
+ class GeneralDataset(DatasetAbstract):
75
+
76
+ def __init__(self):
77
+ list_dataset = ["tatsu-lab/alpaca", "vicgalle/alpaca-gpt4"]
78
+ super().__init__(list_dataset, 'general')
79
+ self._processing_data()
80
+
81
+ def _processing_data(self):
82
+ datasets = []
83
+ for dataset_name in self.dataset_name:
84
+ datasets.append(
85
+ pd.DataFrame(super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train'])
86
+ )
87
+ dataset = pd.concat(datasets, ignore_index=True)
88
+ self.list_dataset, global_test = self.get_split_dataset(dataset)
89
+ global global_test_set_hete
90
+ global_test_set_hete.update(
91
+ {self.metadata['domain']: global_test}
92
+ )
93
+
94
+
95
+ class FinanceDataset(DatasetAbstract):
96
+
97
+ def __init__(self):
98
+ list_dataset = ["gbharti/finance-alpaca", "FinGPT/fingpt-sentiment-train"]
99
+ super().__init__(list_dataset, 'finance')
100
+
101
+ self._processing_data()
102
+
103
+ def _processing_data(self):
104
+ datasets = []
105
+ for dataset_name in self.dataset_name:
106
+ ds = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train']
107
+ if dataset_name == 'gbharti/finance-alpaca':
108
+ ds = ds.remove_columns(['text'])
109
+ df = pd.DataFrame(ds)
110
+ datasets.append(df)
111
+ dataset = pd.concat(datasets, ignore_index=True)
112
+ self.list_dataset, global_test = self.get_split_dataset(dataset)
113
+ global global_test_set_hete
114
+ global_test_set_hete.update(
115
+ {self.metadata['domain']: global_test}
116
+ )
117
+
118
+
119
+ class MathDataset(DatasetAbstract):
120
+
121
+ def __init__(self):
122
+ list_dataset = ["TIGER-Lab/MathInstruct", "xDAN2099/lighteval-MATH", "gsm8k"]
123
+ super().__init__(list_dataset, 'math')
124
+ self._processing_data()
125
+
126
+
127
+ def get_split_dataset(self, dataset):
128
+ dataset_train, dataset_test = dataset[0], dataset[1]
129
+ dataset_test, global_test = train_test_split(
130
+ dataset_test, test_size=0.1, shuffle=True, random_state=42
131
+ )
132
+ global_test = Dataset.from_pandas(global_test)
133
+ print(f">> ===== After processing, Dataset has {len(dataset_train)} examples. =====")
134
+ if len(dataset_train) > 10000:
135
+ ds_train_part1, ds_train_part2 = train_test_split(
136
+ dataset_train, test_size=0.5, shuffle=True, random_state=42
137
+ )
138
+ ds_test_part1, ds_test_part2 = train_test_split(
139
+ dataset_test, test_size=0.5, shuffle=True, random_state=42
140
+ )
141
+ print(f">> ===== After split, Dataset1 has {len(ds_train_part1)} examples and Dataset2 has {len(ds_train_part2)} examples. =====")
142
+ list_dataset = []
143
+ for train_part, test_part in [(ds_train_part1, ds_test_part1), (ds_train_part2, ds_test_part2)]:
144
+ ds = DatasetDict({
145
+ "train": Dataset.from_pandas(train_part).remove_columns(['__index_level_0__']),
146
+ "test": Dataset.from_pandas(test_part).remove_columns(['__index_level_0__'])
147
+ })
148
+ list_dataset.append(ds)
149
+ return list_dataset, global_test
150
+
151
+ else:
152
+ ds = DatasetDict(
153
+ {
154
+ "train": Dataset.from_pandas(dataset_train).remove_columns(['__index_level_0__']),
155
+ "test": Dataset.from_pandas(dataset_test).remove_columns(['__index_level_0__'])
156
+ }
157
+ )
158
+ return [ds], global_test
159
+
160
+ def _processing_data(self):
161
+ datasets_train, datasets_test = [], []
162
+ for dataset_name in self.dataset_name:
163
+ ds_tmp = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)
164
+ if dataset_name == 'TIGER-Lab/MathInstruct':
165
+ df = pd.DataFrame(ds_tmp['train'])
166
+ df = df.drop_duplicates(subset=['instruction'])
167
+ df = df.drop(['source'], axis=1)
168
+ df_train, df_test = train_test_split(df, test_size=0.3, shuffle=True, random_state=42)
169
+
170
+ elif dataset_name == "xDAN2099/lighteval-MATH":
171
+ ds_tmp = ds_tmp.remove_columns(['level', 'type'])
172
+ ds_tmp = ds_tmp.rename_column("solution", "output")
173
+ ds_tmp = ds_tmp.rename_column("problem", "instruction")
174
+ df_train, df_test = pd.DataFrame(ds_tmp['train']), pd.DataFrame(ds_tmp['test'])
175
+
176
+ elif dataset_name == 'gsm8k':
177
+ ds_tmp = ds_tmp.rename_column("answer", "output")
178
+ ds_tmp = ds_tmp.rename_column("question", "instruction")
179
+ df_train, df_test = pd.DataFrame(ds_tmp['train']), pd.DataFrame(ds_tmp['test'])
180
+
181
+ df_train['input'] = [''] * len(df_train)
182
+ df_test['input'] = [''] * len(df_test)
183
+ datasets_train.append(df_train)
184
+ datasets_test.append(df_test)
185
+
186
+ dataset_train = pd.concat(datasets_train, ignore_index=True)
187
+ dataset_test = pd.concat(datasets_test, ignore_index=True)
188
+ dataset = [dataset_train, dataset_test]
189
+ self.list_dataset, global_test = self.get_split_dataset(dataset)
190
+ global global_test_set_hete
191
+ global_test_set_hete.update(
192
+ {self.metadata['domain']: global_test}
193
+ )
194
+
195
+
196
+ class MedicalDataset(DatasetAbstract):
197
+
198
+ def __init__(self):
199
+ list_dataset = ["medalpaca/medical_meadow_medical_flashcards"]
200
+ super().__init__(list_dataset, 'medical')
201
+ self._processing_data()
202
+
203
+ def _processing_data(self):
204
+ datasets = []
205
+ for dataset_name in self.dataset_name:
206
+ ds = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train']
207
+ if dataset_name == 'medalpaca/medical_meadow_medical_flashcards':
208
+ ds = ds.remove_columns(['instruction'])
209
+ ds = ds.rename_column("input", "instruction")
210
+
211
+ df = pd.DataFrame(ds)
212
+ df['input'] = [''] * len(df)
213
+ datasets.append(df)
214
+ dataset = pd.concat(datasets, ignore_index=True)
215
+ self.list_dataset, global_test = self.get_split_dataset(dataset)
216
+ global global_test_set_hete
217
+ global_test_set_hete.update(
218
+ {self.metadata['domain']: global_test}
219
+ )
220
+
221
+ class CodeDataset(DatasetAbstract):
222
+
223
+ def __init__(self):
224
+ list_dataset = ["lucasmccabe-lmi/CodeAlpaca-20k", "WizardLMTeam/WizardLM_evol_instruct_70k"]
225
+ super().__init__(list_dataset, 'code')
226
+ self._processing_data()
227
+
228
+ def _processing_data(self):
229
+ datasets = []
230
+ for dataset_name in self.dataset_name:
231
+ ds = super().get_dataset(dataset_name=dataset_name, local_data_dir=None)['train']
232
+ df = pd.DataFrame(ds)
233
+ if dataset_name == 'WizardLMTeam/WizardLM_evol_instruct_70k':
234
+ df['input'] = [''] * len(df)
235
+ datasets.append(df)
236
+ dataset = pd.concat(datasets, ignore_index=True)
237
+ self.list_dataset, global_test = self.get_split_dataset(dataset)
238
+ global global_test_set_hete
239
+ global_test_set_hete.update(
240
+ {self.metadata['domain']: global_test}
241
+ )
242
+
243
+ def release_ds():
244
+ data_domain = {
245
+ 'general': GeneralDataset().list_dataset,
246
+ 'finance': FinanceDataset().list_dataset,
247
+ 'math': MathDataset().list_dataset,
248
+ 'medical': MedicalDataset().list_dataset,
249
+ 'code': CodeDataset().list_dataset
250
+ }
251
+ tmp_dataset = {}
252
+ k = 0
253
+ for task in data_domain.keys():
254
+ tmp_dataset[str(k)] = data_domain[task][0]
255
+ tmp_dataset[str(k+1)] = data_domain[task][1]
256
+ k += 2
257
+
258
+ return tmp_dataset
259
+
260
+ # data_domain = {
261
+ # 'general': GeneralDataset().list_dataset,
262
+ # 'finance': FinanceDataset().list_dataset,
263
+ # 'math': MathDataset().list_dataset,
264
+ # 'medical': MedicalDataset().list_dataset,
265
+ # 'code': CodeDataset().list_dataset
266
+ # }
267
+
268
+ # client_id_dataset = {
269
+ # '0': data_domain['general'][0],
270
+ # '1': data_domain['general'][1],
271
+ # '2': data_domain['finance'][0],
272
+ # '3': data_domain['finance'][1],
273
+ # '4': data_domain['math'][0],
274
+ # '5': data_domain['math'][1],
275
+ # '6': data_domain['medical'][0],
276
+ # '7': data_domain['medical'][1],
277
+ # '8': data_domain['code'][0],
278
+ # '9': data_domain['code'][1],
279
+ # }
280
+
281
+ client_id_dataset = release_ds()
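`get_split_dataset` first carves each domain into an 80/20 train/test split, then reserves 10% of the test slice as the shared global test set (`global_test_set_hete`). A quick size sanity check, assuming a made-up 1,000-example domain:

```python
from sklearn.model_selection import train_test_split

rows = list(range(1000))  # hypothetical domain size
train, test = train_test_split(rows, test_size=0.2, shuffle=True, random_state=42)
test, global_test = train_test_split(test, test_size=0.1, shuffle=True, random_state=42)

print(len(train), len(test), len(global_test))  # 800 180 20
```

Domains larger than 10,000 examples are halved first, so each half feeds one of the two clients that `release_ds` assigns to that domain.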
template_FL/src/fedllm/dataset.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from trl import DataCollatorForCompletionOnlyLM
2
+
3
+ from flwr_datasets.partitioner import IidPartitioner
4
+ from flwr_datasets import FederatedDataset
5
+ from datasets import Dataset, DatasetDict
6
+ from sklearn.model_selection import train_test_split
7
+ import pandas as pd
8
+
9
+
10
+ FDS = None # Cache FederatedDataset
11
+ client_id_ds = None
12
+ global_test_set_homo = None
13
+
14
+ def split_train_test(dataset, test_size):
15
+ # Split the dataset into train and test sets
16
+ train_data, test_data = train_test_split(dataset.to_pandas(), test_size=test_size, shuffle=True, random_state=42)
17
+ test_data, global_test = train_test_split(test_data, test_size=0.1, shuffle=True, random_state=42)
18
+
19
+ # Convert to Dataset objects
20
+ train_dataset = Dataset.from_pandas(train_data)
21
+ test_dataset = Dataset.from_pandas(test_data)
22
+ global_test = Dataset.from_pandas(global_test)  # reserved for a shared global test set (not returned here)
23
+
24
+ # Combine into a DatasetDict
25
+ datasets_dict = DatasetDict({
26
+ 'train': train_dataset,
27
+ 'test': test_dataset
28
+ })
29
+ return datasets_dict
30
+
31
+
32
+
33
+ def formatting_prompts_func(example):
34
+ output_texts = []
35
+ # Constructing a standard Alpaca (https://github.com/tatsu-lab/stanford_alpaca#data-release) prompt
36
+ mssg = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
37
+ for i in range(len(example["instruction"])):
38
+ text = f"{mssg}\n### Instruction:\n{example['instruction'][i]}\n### Response: {example['response'][i]}"
39
+ output_texts.append(text)
40
+ return output_texts
41
+
42
+
43
+ def get_data_collator_and_prompt_formatting(tokenizer):
44
+ # From: https://huggingface.co/docs/trl/en/sft_trainer
45
+ response_template_with_context = "\n### Response:" # alpaca response tag
46
+ response_template_ids = tokenizer.encode(
47
+ response_template_with_context, add_special_tokens=False
48
+ )[2:]
49
+ data_collator = DataCollatorForCompletionOnlyLM(
50
+ response_template_ids, tokenizer=tokenizer
51
+ )
52
+
53
+ return data_collator, formatting_prompts_func
54
+
55
+
56
+ def load_data(partition_id: int, num_partitions: int, dataset_name: str):
57
+ """Load partition data."""
58
+ # Only initialize `FederatedDataset` once
59
+ global FDS
60
+ if FDS is None:
61
+ partitioner = IidPartitioner(num_partitions=num_partitions)
62
+ FDS = FederatedDataset(
63
+ dataset=dataset_name,
64
+ partitioners={"train": partitioner},
65
+ )
66
+ print(f"<---- Load client {partition_id} --->")
67
+ client_trainset = FDS.load_partition(partition_id, "train")
68
+ client_trainset = client_trainset.rename_column("output", "response")
69
+ print(client_trainset)
70
+ return client_trainset
71
+
72
+ def load_data_homo(partition_id: int, num_partitions: int, dataset_name: str):
73
+ """Load partition data."""
74
+ # Only initialize `FederatedDataset` once
75
+ global FDS
76
+ global global_test_set_homo
77
+ if FDS is None:
78
+ partitioner = IidPartitioner(num_partitions=num_partitions)
79
+ FDS = FederatedDataset(
80
+ dataset=dataset_name,
81
+ partitioners={"train": partitioner},
82
+ )
83
+ # list_ds = []
84
+ # for cid in range(0,num_partitions):
85
+ # tmp_set = FDS.load_partition(cid, "train")
86
+ # list_ds.append(
87
+ # pd.DataFrame(tmp_set)
88
+ # )
89
+ # list_ds = pd.concat(list_ds, ignore_index=True)
90
+ # _, global_test_set_homo = train_test_split(
91
+ # list_ds, test_size=0.1, shuffle=True, random_state=42
92
+ # )
93
+ # global_test_set_homo = Dataset.from_pandas(global_test_set_homo).remove_columns(['__index_level_0__'])
94
+
95
+ print(f"<---- Load client {partition_id} --->")
96
+ client_trainset = FDS.load_partition(partition_id, "train")
97
+ # client_trainset = client_trainset.rename_column("output", "response")
98
+ client_set = split_train_test(client_trainset, test_size=0.2)
99
+ return client_set
100
+
101
+
102
+ def load_data_hete(partition_id: int):
103
+ """Load partition data heterogeneous"""
104
+ global client_id_ds
105
+ if client_id_ds is None:
106
+ from .data_domains import client_id_dataset
107
+ client_id_ds = client_id_dataset
108
+ print(f"<---- Load client {partition_id} --->")
109
+ client_set = client_id_ds[str(partition_id)]
110
+ return client_set
111
+
112
+
113
+ def replace_keys(input_dict, match="-", target="_"):
114
+ """Recursively replace match string with target string in dictionary keys."""
115
+ new_dict = {}
116
+ for key, value in input_dict.items():
117
+ new_key = key.replace(match, target)
118
+ if isinstance(value, dict):
119
+ new_dict[new_key] = replace_keys(value, match, target)
120
+ else:
121
+ new_dict[new_key] = value
122
+ return new_dict
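`replace_keys` exists because Flower run-config keys use dashes while the `DictConfig` built from them is accessed with underscore attributes (see `client_fn` in client_app.py). A minimal sketch, assuming the package layout above for the import:

```python
from fedllm.dataset import replace_keys  # hypothetical import path

cfg = {"train": {"learning-rate-max": 5e-5,
                 "training-arguments": {"per-device-train-batch-size": 16}}}
print(replace_keys(cfg))
# {'train': {'learning_rate_max': 5e-05,
#            'training_arguments': {'per_device_train_batch_size': 16}}}
```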
template_FL/src/fedllm/flwr_mods.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flwr.common import Context, Message, MessageType, ConfigsRecord
2
+ from flwr.client.typing import ClientAppCallable
3
+ from typing import Callable, Optional
4
+ import wandb
5
+ import time
6
+ from .myfedavg import client_id_idx
7
+
8
+
9
+ # Define type alias for Mod
10
+ Mod = Callable[[Message, Context, ClientAppCallable], Message]
11
+
12
+ def get_wandb_mod(name: str) -> Mod:
13
+ # Keep track of active runs
14
+ active_run: Optional[wandb.Run] = None
15
+
16
+ def wandb_mod(msg: Message, context: Context, app: ClientAppCallable) -> Message:
17
+ nonlocal active_run
18
+ server_round = int(msg.metadata.group_id)
19
+ run_id = msg.metadata.run_id
20
+ group_name = f"Run ID: {run_id}"
21
+ node_id = str(msg.metadata.dst_node_id)
22
+ run_name = f"Client ID: {client_id_idx[node_id]}"
23
+
24
+ wandb.init(
25
+ project=name,
26
+ group=group_name,
27
+ name=run_name,
28
+ id=f"{run_id}_{client_id_idx[node_id]}",
29
+ resume="allow",
30
+ reinit=True,
31
+ # settings=wandb.Settings(start_method="thread")
32
+ )
33
+
34
+ start_time = time.time()
35
+ reply = app(msg, context)
36
+
37
+ if reply.metadata.message_type == MessageType.TRAIN and reply.has_content():
38
+
39
+ time_diff = time.time() - start_time
40
+ metrics = reply.content.configs_records
41
+ results_to_log = dict(metrics.get("fitres.metrics", ConfigsRecord()))
42
+ results_to_log["fit_time"] = time_diff
43
+
44
+ wandb.log(results_to_log, step=int(server_round), commit=True)
45
+
46
+ return reply
47
+
48
+ return wandb_mod
49
+
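`get_wandb_mod` follows Flower's mod contract: a callable taking `(Message, Context, app)` and returning the reply `Message`. A minimal sketch of the same contract with only the timing bookkeeping and none of the wandb setup:

```python
import time

from flwr.client.typing import ClientAppCallable
from flwr.common import Context, Message


def timing_mod(msg: Message, context: Context, app: ClientAppCallable) -> Message:
    # Forward the message to the wrapped ClientApp and time the round trip
    start = time.time()
    reply = app(msg, context)
    print(f"{msg.metadata.message_type} handled in {time.time() - start:.2f}s")
    return reply
```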
template_FL/src/fedllm/make_data.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import os.path as osp
4
+ from typing import Union
5
+
6
+ class Prompter(object):
7
+ __slots__ = ("template", "_verbose")
8
+
9
+ def __init__(self, template_name: str = "", verbose: bool = False):
10
+ self._verbose = verbose
11
+ if not template_name:
12
+ # Enforce the default here, so the constructor can be called with '' and will not break.
13
+ template_name = "alpaca"
14
+ file_name = osp.join(
15
+ os.getcwd(), "fedllm/templates", f"{template_name}.json"
16
+ )
17
+
18
+ if not osp.exists(file_name):
19
+ raise ValueError(f"Can't read {file_name}")
20
+ with open(file=file_name) as fp:
21
+ self.template = json.load(fp)
22
+ if self._verbose:
23
+ print(
24
+ f"Using prompt template {template_name}: {self.template['description']}"
25
+ )
26
+
27
+ def generate_prompt(
28
+ self,
29
+ instruction: str,
30
+ input: Union[None, str] = None,
31
+ label: Union[None, str] = None,
32
+ ) -> str:
33
+ # returns the full prompt from instruction and optional input
34
+ # if a label (=response, =output) is provided, it's also appended.
35
+ if input:
36
+ res = self.template["prompt_input"].format(
37
+ instruction=instruction,
38
+ input=input,
39
+ )
40
+ else:
41
+ res = self.template["prompt_no_input"].format(
42
+ instruction=instruction,
43
+ )
44
+ if label:
45
+ res = f"{res}{label}"
46
+ if self._verbose:
47
+ print(res)
48
+ return res
49
+
50
+ def get_response(self, output: str) -> str:
51
+ return output.split(self.template["response_split"])[1].strip()
52
+
53
+
54
+ def tokenize(tokenizer, prompt, cutoff_len=512, add_eos_token=True):
55
+ result = tokenizer(
56
+ prompt,
57
+ truncation=True,
58
+ max_length=cutoff_len,
59
+ padding=False,
60
+ return_tensors=None,
61
+ )
62
+ if (
63
+ result["input_ids"][-1] != tokenizer.eos_token_id
64
+ and len(result["input_ids"]) < cutoff_len
65
+ and add_eos_token
66
+ ):
67
+ result["input_ids"].append(tokenizer.eos_token_id)
68
+ result["attention_mask"].append(1)
69
+
70
+ result["labels"] = result["input_ids"].copy()
71
+
72
+ return result
73
+
74
+
75
+ def generate_and_tokenize_prompt(data_point, **kwargs):
76
+ full_prompt = kwargs["prompter"].generate_prompt(
77
+ data_point["instruction"],
78
+ data_point["input"],
79
+ data_point["output"],
80
+ )
81
+ tokenized_full_prompt = tokenize(
82
+ kwargs["tokenizer"],
83
+ full_prompt,
84
+ cutoff_len=kwargs["seq_length"],
85
+ add_eos_token=True,
86
+ )
87
+ if not kwargs["train_on_inputs"]:
88
+ user_prompt = kwargs["prompter"].generate_prompt(
89
+ data_point["instruction"], data_point["input"]
90
+ )
91
+ tokenized_user_prompt = kwargs["tokenizer"](
92
+ user_prompt, add_eos_token=False
93
+ )
94
+ user_prompt_len = len(tokenized_user_prompt["input_ids"])
95
+
96
+ tokenized_full_prompt["labels"] = [
97
+ -100
98
+ ] * user_prompt_len + tokenized_full_prompt["labels"][
99
+ user_prompt_len:
100
+ ] # could be sped up, probably
101
+ return tokenized_full_prompt
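When `train_on_inputs` is false, the labels for the prompt portion are overwritten with -100, the index that PyTorch's cross-entropy loss (and hence the HF `Trainer`) ignores, so the loss is computed on the response tokens only. A toy illustration with made-up token ids:

```python
input_ids = [101, 7, 8, 9, 10, 11, 102]  # prompt tokens then response tokens (made up)
labels = input_ids.copy()
user_prompt_len = 4                      # length of the tokenized prompt-only text

labels = [-100] * user_prompt_len + labels[user_prompt_len:]
print(labels)  # [-100, -100, -100, -100, 10, 11, 102] -> loss on the response only
```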
template_FL/src/fedllm/metrics.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import evaluate
3
+ from rouge_score import rouge_scorer
4
+ import numpy as np
5
+ import copy
6
+ from collections import OrderedDict, Counter
7
+ from .utils import clean_output_text
8
+
9
+ def get_answer(text):
10
+ # text = text.lower()
11
+ label = text.split('Response:')[-1].strip()
12
+ return label
13
+
14
+ def check_data_state(preds, targets):
15
+ assert len(preds) == len(targets)
16
+
17
+ def get_rouge_score(predictions, targets):
18
+ check_data_state(predictions, targets)
19
+ rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
20
+ scores = {
21
+ 'rouge1': 0.0,
22
+ 'rouge2': 0.0,
23
+ 'rougeL': 0.0,
24
+ 'rougeLsum': 0.0
25
+ }
26
+ for prediction, target in zip(predictions, targets):
27
+ prediction = get_answer(clean_output_text(prediction))
28
+ target = get_answer(clean_output_text(target))
29
+ rouge_output = rouge.score(prediction=prediction, target=target)
30
+ scores['rouge1'] += round(rouge_output["rouge1"].fmeasure, 4)
31
+ scores['rouge2'] += round(rouge_output["rouge2"].fmeasure, 4)
32
+ scores['rougeL'] += round(rouge_output["rougeL"].fmeasure, 4)
33
+ scores['rougeLsum'] += round(rouge_output["rougeLsum"].fmeasure, 4)
34
+
35
+
36
+ scores['rouge1'] /= len(predictions)
37
+ scores['rouge2'] /= len(predictions)
38
+ scores['rougeL'] /= len(predictions)
39
+ scores['rougeLsum'] /= len(predictions)
40
+ return scores
41
+
42
+ def exact_match(predictions, targets):
43
+ check_data_state(predictions, targets)
44
+ predictions = [get_answer(clean_output_text(prediction)) for prediction in predictions]
45
+ targets = [get_answer(clean_output_text(target)) for target in targets]
46
+
47
+ preds, targets = np.asarray(predictions, dtype="<U16"), np.asarray(targets, dtype="<U16")
48
+
49
+ # print(preds, targets)
50
+ return {"exact_match": np.sum(preds == targets) / preds.size}
51
+
52
+ def _f1_score(prediction, target):
53
+ prediction_tokens = prediction.split()
54
+ target_tokens = target.split()
55
+ common = Counter(prediction_tokens) & Counter(target_tokens)
56
+ num_same = sum(common.values())
57
+ if num_same == 0:
58
+ return 0
59
+ precision = 1.0 * num_same / len(prediction_tokens)
60
+ recall = 1.0 * num_same / len(target_tokens)
61
+ f1 = (2 * precision * recall) / (precision + recall)
62
+ return f1
63
+
64
+ def f1(predictions, targets):
65
+ check_data_state(predictions, targets)
66
+ f1_score = 0.0
67
+ for prediction, target in zip(predictions, targets):
68
+ prediction = get_answer(clean_output_text(prediction))
69
+ target = get_answer(clean_output_text(target))
70
+ f1_score += _f1_score(prediction=prediction, target=target)
71
+
72
+ f1_score /= len(predictions)
73
+ return {'f1': f1_score}
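`_f1_score` is the standard token-overlap F1 (as in SQuAD-style evaluation): count multiset-shared tokens, divide by prediction length for precision and target length for recall. A worked example with made-up strings:

```python
from collections import Counter

prediction = "the answer is 42"
target = "answer is 42"

common = Counter(prediction.split()) & Counter(target.split())
num_same = sum(common.values())  # 3 shared tokens
precision = num_same / 4         # 4 prediction tokens -> 0.75
recall = num_same / 3            # 3 target tokens    -> 1.0
f1 = 2 * precision * recall / (precision + recall)
print(round(f1, 4))              # 0.8571
```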
template_FL/src/fedllm/models.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from omegaconf import DictConfig
6
+ from collections import OrderedDict
7
+ from peft import (
8
+ LoraConfig,
9
+ get_peft_model,
10
+ get_peft_model_state_dict,
11
+ set_peft_model_state_dict,
12
+ )
13
+ from peft.utils import prepare_model_for_kbit_training
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainerCallback, BertForSequenceClassification
15
+
16
+ from flwr.common.typing import NDArrays
17
+ from transformers.trainer_callback import TrainerControl, TrainerState
18
+ from transformers.training_args import TrainingArguments
19
+ from thop import profile
20
+ import wandb
21
+ from typing import Dict, List
22
+ import copy
23
+ import time
24
+ import numpy as np
25
+
26
+
27
+ def cosine_annealing(
28
+ current_round: int,
29
+ total_round: int,
30
+ lrate_max: float = 0.001,
31
+ lrate_min: float = 0.0,
32
+ ) -> float:
33
+ """Implement cosine annealing learning rate schedule."""
34
+
35
+ cos_inner = math.pi * current_round / total_round
36
+ return lrate_min + 0.5 * (lrate_max - lrate_min) * (1 + math.cos(cos_inner))
37
+
38
+
39
+ def get_model(model_cfg: DictConfig):
40
+ """Load model with appropriate quantization config and other optimizations.
41
+
42
+ Please refer to this example for `peft + BitsAndBytes`:
43
+ https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py
44
+ """
45
+ use_cuda = torch.cuda.is_available()
46
+ device_map = torch.device("cuda:0" if use_cuda else "cpu")
47
+ if model_cfg.quantization == 4:
48
+ quantization_config = BitsAndBytesConfig(
49
+ load_in_4bit=True,
50
+ bnb_4bit_use_double_quant=True,
51
+ bnb_4bit_quant_type="nf4",
52
+ bnb_4bit_compute_dtype=torch.bfloat16,
53
+ )
54
+ elif model_cfg.quantization == 8:
55
+ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
56
+ else:
57
+ raise ValueError(
58
+ f"Use 4-bit or 8-bit quantization. You passed: {model_cfg.quantization}/"
59
+ )
60
+
61
+ model = AutoModelForCausalLM.from_pretrained(
62
+ model_cfg.name,
63
+ quantization_config=quantization_config,
64
+ # torch_dtype=torch.bfloat16,
65
+ attn_implementation=(
66
+ "flash_attention_2" if model_cfg.flash_attention else "eager"
67
+ ),
68
+ ).to(device_map)
69
+
70
+ if use_cuda:
71
+ model = prepare_model_for_kbit_training(
72
+ model, use_gradient_checkpointing=model_cfg.gradient_checkpointing
73
+ )
74
+
75
+
76
+ # Get tokenizer
77
+ tokenizer = AutoTokenizer.from_pretrained(
78
+ model_cfg.name, use_fast=True, padding_side="right"
79
+ )
80
+ tokenizer.pad_token = tokenizer.eos_token
81
+
82
+ peft_config = LoraConfig(
83
+ r=model_cfg.lora.lora_r,
84
+ lora_alpha=model_cfg.lora.lora_alpha,
85
+ lora_dropout=model_cfg.lora.lora_dropout,
86
+ target_modules=model_cfg.lora.lora_target_modules.split(", "),
87
+ bias="none",
88
+ task_type="CAUSAL_LM",
89
+ )
90
+
91
+ return get_peft_model(model, peft_config), tokenizer
92
+
93
+ def get_data_influence_model(model_cfg: DictConfig):
94
+ use_cuda = torch.cuda.is_available()
95
+ device_map = torch.device("cuda:0" if use_cuda else "cpu")
96
+
97
+ # Load model with num_labels=1
98
+ model = BertForSequenceClassification.from_pretrained(
99
+ "bert-base-uncased",
100
+ num_labels=1, # Set number of labels to 1 for regression or single-class tasks
101
+ ).to(device_map)
102
+
103
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
104
+
105
+ if use_cuda:
106
+ model = prepare_model_for_kbit_training(
107
+ model, use_gradient_checkpointing=model_cfg.gradient_checkpointing
108
+ )
109
+
110
+ return model, tokenizer
111
+
112
+
113
+ def set_parameters(model, parameters: NDArrays) -> None:
114
+ """Change the parameters of the model using the given ones."""
115
+ peft_state_dict_keys = get_peft_model_state_dict(model).keys()
116
+ params_dict = zip(peft_state_dict_keys, parameters)
117
+ state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
118
+ set_peft_model_state_dict(model, state_dict)
119
+
120
+
121
+ def get_parameters(model) -> NDArrays:
122
+ """Return the parameters of the current net."""
123
+ state_dict = get_peft_model_state_dict(model)
124
+ return [val.cpu().numpy() for _, val in state_dict.items()]
125
+
126
+ def model_parameters_to_ndarrays(model):
127
+ """
128
+ Convert the parameters of a HuggingFace model into a list of NDArrays.
129
+
130
+ Args:
131
+ model (torch.nn.Module): The HuggingFace model.
132
+
133
+ Returns:
134
+ NDArrays: A list of NumPy arrays representing the model's parameters.
135
+ """
136
+ ndarrays = []
137
+ for param_tensor in model.state_dict().values():
138
+ # Convert PyTorch tensor to NumPy array
139
+ ndarrays.append(param_tensor.cpu().numpy())
140
+ return ndarrays
141
+
142
+
143
+ def concatenate_models_with_marker(main_model_params: NDArrays,
144
+ data_influence_model_params: NDArrays,
145
+ marker_value: float = np.nan) -> NDArrays:
146
+ """
147
+ Concatenate two models' parameters with a unique marker.
148
+
149
+ Args:
150
+ main_model_params (NDArrays): Parameters of the main model as NDArrays.
151
+ data_influence_model_params (NDArrays): Parameters of the data influence model as NDArrays.
152
+ marker_value (float): A unique marker value to separate the two models.
153
+
154
+ Returns:
155
+ NDArrays: A single list of NDArrays with the unique marker separating the models.
156
+ """
157
+ marker = np.array([marker_value]) # Unique marker
158
+ concatenated_params = main_model_params + [marker] + data_influence_model_params
159
+ return concatenated_params
160
+
161
+
162
+ def split_models(concatenated_model: NDArrays) -> tuple[NDArrays, NDArrays]:
163
+ """Split the concatenated model back into main and data influence models."""
164
+ # Find the marker's index
165
+ marker_index = next(
166
+ (i for i, param in enumerate(concatenated_model) if np.isnan(param).all()),
167
+ -1,
168
+ )
169
+ if marker_index == -1:
170
+ raise ValueError("Marker not found in the concatenated model parameters.")
171
+
172
+ main_model = concatenated_model[:marker_index]
173
+ data_influence_model = concatenated_model[marker_index + 1 :]
174
+ return main_model, data_influence_model
175
+
176
+
177
+ def set_parameters_bert(model: BertForSequenceClassification, parameters: NDArrays) -> None:
178
+ """
179
+ Set the parameters of a BertForSequenceClassification model using the given ones.
180
+
181
+ Args:
182
+ model (BertForSequenceClassification): The model whose parameters need to be updated.
183
+ parameters (NDArrays): A list of NumPy arrays representing the parameters.
184
+ """
185
+ # Get the state_dict keys from the model
186
+ state_dict_keys = model.state_dict().keys()
187
+
188
+ # Ensure the number of parameters matches the model's state_dict
189
+ if len(parameters) != len(state_dict_keys):
190
+ raise ValueError(
191
+ f"Number of parameters ({len(parameters)}) does not match "
192
+ f"the number of state_dict keys ({len(state_dict_keys)})."
193
+ )
194
+
195
+ # Create an OrderedDict to update the model
196
+ params_dict = zip(state_dict_keys, parameters)
197
+ state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
198
+
199
+ # Load the updated state_dict into the model
200
+ model.load_state_dict(state_dict)
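`concatenate_models_with_marker` and `split_models` round-trip the two parameter lists through a single flat list by inserting an all-NaN array as a separator; this works as long as no genuine parameter tensor is entirely NaN. A self-contained sketch with made-up shapes:

```python
import numpy as np

main = [np.ones((2, 2)), np.zeros(3)]  # stand-ins for the main model's weights
aux = [np.full(4, 0.5)]                # stand-ins for the data influence model

packed = main + [np.array([np.nan])] + aux  # concatenate_models_with_marker
marker_index = next(i for i, p in enumerate(packed) if np.isnan(p).all())
main_back, aux_back = packed[:marker_index], packed[marker_index + 1:]

assert all((a == b).all() for a, b in zip(main, main_back))
assert all((a == b).all() for a, b in zip(aux, aux_back))
```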
template_FL/src/fedllm/myaggregation.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 Flower Labs GmbH. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ """Aggregation functions for strategy implementations."""
16
+ # mypy: disallow_untyped_calls=False
17
+
18
+ from functools import partial, reduce
19
+ from typing import Any, Callable, Union
20
+
21
+ import numpy as np
22
+
23
+ from flwr.common import FitRes, NDArray, NDArrays, parameters_to_ndarrays
24
+ from flwr.server.client_proxy import ClientProxy
25
+
26
+ from .models import concatenate_models_with_marker, split_models
27
+
28
+
29
+ def aggregate(results: list[tuple[NDArrays, int]]) -> NDArrays:
30
+ """Compute weighted average."""
31
+ # Calculate the total number of examples used during training
32
+ num_examples_total = sum(num_examples for (_, num_examples) in results)
33
+
34
+ # Create a list of weights, each multiplied by the related number of examples
35
+ weighted_weights = [
36
+ [layer * num_examples for layer in weights] for weights, num_examples in results
37
+ ]
38
+
39
+ # Compute average weights of each layer
40
+ weights_prime: NDArrays = [
41
+ reduce(np.add, layer_updates) / num_examples_total
42
+ for layer_updates in zip(*weighted_weights)
43
+ ]
44
+ return weights_prime
45
+
46
+
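`aggregate` above is plain FedAvg: each client's layers are scaled by its example count and the sums are divided by the total. A quick arithmetic check with two made-up clients:

```python
import numpy as np

# Two hypothetical clients holding 100 and 300 examples
results = [([np.array([1.0, 1.0])], 100),
           ([np.array([3.0, 3.0])], 300)]

num_examples_total = sum(n for _, n in results)                 # 400
weighted = [[layer * n for layer in w] for w, n in results]
avg = [sum(layers) / num_examples_total for layers in zip(*weighted)]
print(avg)  # [array([2.5, 2.5])] -> (1*100 + 3*300) / 400
```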
47
+ def aggregate_inplace(results: list[tuple[ClientProxy, FitRes]]) -> NDArrays:
48
+ """Compute in-place weighted average."""
49
+ # Count total examples
50
+ num_examples_total = sum(fit_res.num_examples for (_, fit_res) in results)
51
+
52
+ # Compute scaling factors for each result
53
+ scaling_factors = np.asarray(
54
+ [fit_res.num_examples / num_examples_total for _, fit_res in results]
55
+ )
56
+
57
+ def _try_inplace(
58
+ x: NDArray, y: Union[NDArray, np.float64], np_binary_op: np.ufunc
59
+ ) -> NDArray:
60
+ return ( # type: ignore[no-any-return]
61
+ np_binary_op(x, y, out=x)
62
+ if np.can_cast(y, x.dtype, casting="same_kind")
63
+ else np_binary_op(x, np.array(y, x.dtype), out=x)
64
+ )
65
+
66
+ # Let's do in-place aggregation
67
+ # Get first result, then add up each other
68
+ # print(f"Param : {results[0][1].parameters}. Type: {type(results[0][1].parameters)}")
69
+ params = [
70
+ _try_inplace(x, scaling_factors[0], np_binary_op=np.multiply)
71
+ for x in parameters_to_ndarrays(results[0][1].parameters)
72
+ ]
73
+
74
+ for i, (_, fit_res) in enumerate(results[1:], start=1):
75
+ res = (
76
+ _try_inplace(x, scaling_factors[i], np_binary_op=np.multiply)
77
+ for x in parameters_to_ndarrays(fit_res.parameters)
78
+ )
79
+ params = [
80
+ reduce(partial(_try_inplace, np_binary_op=np.add), layer_updates)
81
+ for layer_updates in zip(params, res)
82
+ ]
83
+
84
+ return params
85
+
86
+ def aggregate_inplace_mates(results: list[tuple[ClientProxy, FitRes]]) -> NDArrays:
87
+ """Aggregate main model and data influence model separately."""
88
+ num_examples_total = sum(fit_res.num_examples for _, fit_res in results)
89
+ scaling_factors = [
90
+ fit_res.num_examples / num_examples_total for _, fit_res in results
91
+ ]
92
+
93
+ aggregated_main_model = None
94
+ aggregated_data_influence_model = None
95
+
96
+ for i, (_, fit_res) in enumerate(results):
97
+ # Convert parameters to NDArrays and split into main and data influence models
98
+ concatenated_params = parameters_to_ndarrays(fit_res.parameters)
99
+ main_model, data_influence_model = split_models(concatenated_params)
100
+
101
+ # Scale the models by the scaling factor
102
+ scaled_main_model = [x * scaling_factors[i] for x in main_model]
103
+ scaled_data_influence_model = [x * scaling_factors[i] for x in data_influence_model]
104
+
105
+ # Aggregate in-place
106
+ if aggregated_main_model is None:
107
+ aggregated_main_model = scaled_main_model
108
+ aggregated_data_influence_model = scaled_data_influence_model
109
+ else:
110
+ aggregated_main_model = [
111
+ x + y for x, y in zip(aggregated_main_model, scaled_main_model)
112
+ ]
113
+ aggregated_data_influence_model = [
114
+ x + y for x, y in zip(aggregated_data_influence_model, scaled_data_influence_model)
115
+ ]
116
+
117
+ return concatenate_models_with_marker(aggregated_main_model, aggregated_data_influence_model)
118
+
119
+
+ def aggregate_median(results: list[tuple[NDArrays, int]]) -> NDArrays:
+     """Compute median."""
+     # Create a list of weights and ignore the number of examples
+     weights = [weights for weights, _ in results]
+
+     # Compute median weight of each layer
+     median_w: NDArrays = [
+         np.median(np.asarray(layer), axis=0) for layer in zip(*weights)
+     ]
+     return median_w
+
+
+ def aggregate_krum(
+     results: list[tuple[NDArrays, int]], num_malicious: int, to_keep: int
+ ) -> NDArrays:
+     """Choose one parameter vector according to the Krum function.
+
+     If `to_keep` is greater than 0, MultiKrum is applied.
+     """
+     # Create a list of weights and ignore the number of examples
+     weights = [weights for weights, _ in results]
+
+     # Compute distances between vectors
+     distance_matrix = _compute_distances(weights)
+
+     # For each client, take the n-f-2 closest parameter vectors
+     num_closest = max(1, len(weights) - num_malicious - 2)
+     closest_indices = []
+     for distance in distance_matrix:
+         closest_indices.append(
+             np.argsort(distance)[1 : num_closest + 1].tolist()  # noqa: E203
+         )
+
+     # Compute the score for each client: the sum of the distances to its
+     # n-f-2 closest parameter vectors
+     scores = [
+         np.sum(distance_matrix[i, closest_indices[i]])
+         for i in range(len(distance_matrix))
+     ]
+
+     if to_keep > 0:
+         # Choose to_keep clients and return their average (MultiKrum)
+         best_indices = np.argsort(scores)[::-1][len(scores) - to_keep :]  # noqa: E203
+         best_results = [results[i] for i in best_indices]
+         return aggregate(best_results)
+
+     # Return the model parameters that minimize the score (Krum)
+     return weights[np.argmin(scores)]
+
+
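+ # Intuition sketch (illustrative): with five honest-looking vectors and one
+ # outlier, the Krum score (sum of squared distances to the n - f - 2 nearest
+ # neighbours) is largest for the outlier, so it is never selected.
+ #
+ #     honest = [([np.array([0.0 + 0.01 * i])], 1) for i in range(5)]
+ #     attacker = ([np.array([100.0])], 1)
+ #     aggregate_krum(honest + [attacker], num_malicious=1, to_keep=0)
+ #     # -> one of the honest vectors, never array([100.])
+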
+ # pylint: disable=too-many-locals
+ def aggregate_bulyan(
+     results: list[tuple[NDArrays, int]],
+     num_malicious: int,
+     aggregation_rule: Callable,  # type: ignore
+     **aggregation_rule_kwargs: Any,
+ ) -> NDArrays:
+     """Perform Bulyan aggregation.
+
+     Parameters
+     ----------
+     results: list[tuple[NDArrays, int]]
+         Weights and number of samples for each of the clients.
+     num_malicious: int
+         The maximum number of malicious clients.
+     aggregation_rule: Callable
+         Byzantine-resilient aggregation rule used as the first step of Bulyan.
+     aggregation_rule_kwargs: Any
+         The arguments to the aggregation rule.
+
+     Returns
+     -------
+     aggregated_parameters: NDArrays
+         Aggregated parameters according to the Bulyan strategy.
+     """
+     byzantine_resilient_single_ret_model_aggregation = [aggregate_krum]
+     # also GeoMed (but not implemented yet)
+     byzantine_resilient_many_return_models_aggregation = []  # type: ignore
+     # Brute, Medoid (but not implemented yet)
+
+     num_clients = len(results)
+     if num_clients < 4 * num_malicious + 3:
+         raise ValueError(
+             "The Bulyan aggregation requires the number of clients to be greater "
+             "than or equal to 4 * num_malicious + 3. This is the assumption of "
+             "this method; it is needed to ensure that the method reduces the "
+             "attacker's leeway to the one proved in the paper."
+         )
+     selected_models_set: list[tuple[NDArrays, int]] = []
+
+     theta = len(results) - 2 * num_malicious
+     beta = theta - 2 * num_malicious
+
+     for _ in range(theta):
+         best_model = aggregation_rule(
+             results=results, num_malicious=num_malicious, **aggregation_rule_kwargs
+         )
+         list_of_weights = [weights for weights, num_samples in results]
+         # This group gives an exact result
+         if aggregation_rule in byzantine_resilient_single_ret_model_aggregation:
+             best_idx = _find_reference_weights(best_model, list_of_weights)
+         # This group requires finding the closest model to the returned one
+         # (weights distance-wise)
+         elif aggregation_rule in byzantine_resilient_many_return_models_aggregation:
+             # when different aggregation strategies are available,
+             # write a function to find the closest model
+             raise NotImplementedError(
+                 "aggregate_bulyan currently does not support aggregation rules that"
+                 " return many models as results. "
+                 "Such aggregation rules are currently not available in Flower."
+             )
+         else:
+             raise ValueError(
+                 "The given aggregation rule is not added as Byzantine resilient. "
+                 "Please choose from the Byzantine-resilient rules."
+             )
+
+         selected_models_set.append(results[best_idx])
+
+         # Remove the selected model from the candidate pool
+         results.pop(best_idx)
+
+     # Compute the median parameter vector across selected_models_set
+     median_vect = aggregate_median(selected_models_set)
+
+     # Average the beta parameters with the closest distance to the median
+     # (coordinate-wise)
+     parameters_aggregated = _aggregate_n_closest_weights(
+         median_vect, selected_models_set, beta_closest=beta
+     )
+     return parameters_aggregated
+
+
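+ # Usage sketch (illustrative): Bulyan first runs the selection rule theta times,
+ # then trims per coordinate. With n = 7 and f = 1 (satisfying n >= 4f + 3),
+ # theta = 5 models are selected and beta = 3 are averaged. Note that `results`
+ # is mutated by the selection loop.
+ #
+ #     clients = [([np.array([float(i)])], 1) for i in range(7)]
+ #     aggregate_bulyan(clients, num_malicious=1,
+ #                      aggregation_rule=aggregate_krum, to_keep=0)
+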
+ def weighted_loss_avg(results: list[tuple[int, float]]) -> float:
+     """Aggregate evaluation results obtained from multiple clients."""
+     num_total_evaluation_examples = sum(num_examples for (num_examples, _) in results)
+     weighted_losses = [num_examples * loss for num_examples, loss in results]
+     return sum(weighted_losses) / num_total_evaluation_examples
+
+
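+ # Quick check (illustrative): weighted_loss_avg([(10, 1.0), (30, 3.0)]) returns
+ # (10 * 1.0 + 30 * 3.0) / 40 = 2.5, i.e. clients with more evaluation examples
+ # pull the aggregate loss harder.
+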
+ def aggregate_qffl(
+     parameters: NDArrays, deltas: list[NDArrays], hs_fll: list[NDArrays]
+ ) -> NDArrays:
+     """Compute weighted average based on the q-FFL paper."""
+     denominator: float = np.sum(np.asarray(hs_fll))
+     scaled_deltas = []
+     for client_delta in deltas:
+         scaled_deltas.append([layer * 1.0 / denominator for layer in client_delta])
+     updates = []
+     for i in range(len(deltas[0])):
+         tmp = scaled_deltas[0][i]
+         for j in range(1, len(deltas)):
+             tmp += scaled_deltas[j][i]
+         updates.append(tmp)
+     new_parameters = [(u - v) * 1.0 for u, v in zip(parameters, updates)]
+     return new_parameters
+
+
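+ # In q-FFL terms (sketch): the new global model is
+ #     w_{t+1} = w_t - (sum_k Delta_k) / (sum_k h_k)
+ # so each client's update Delta_k is normalized by the summed curvature-style
+ # estimates h_k rather than by example counts.
+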
+ def _compute_distances(weights: list[NDArrays]) -> NDArray:
+     """Compute distances between vectors.
+
+     Input: weights - list of weight vectors
+     Output: distance_matrix - matrix of squared distances between the vectors
+     """
+     flat_w = np.array([np.concatenate(p, axis=None).ravel() for p in weights])
+     distance_matrix = np.zeros((len(weights), len(weights)))
+     for i, flat_w_i in enumerate(flat_w):
+         for j, flat_w_j in enumerate(flat_w):
+             delta = flat_w_i - flat_w_j
+             norm = np.linalg.norm(delta)
+             distance_matrix[i, j] = norm**2
+     return distance_matrix
+
+
+ def _trim_mean(array: NDArray, proportiontocut: float) -> NDArray:
+     """Compute trimmed mean along axis=0.
+
+     It is based on the scipy implementation:
+     https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.trim_mean.html
+     """
+     axis = 0
+     nobs = array.shape[axis]
+     lowercut = int(proportiontocut * nobs)
+     uppercut = nobs - lowercut
+     if lowercut > uppercut:
+         raise ValueError("Proportion too big.")
+
+     atmp = np.partition(array, (lowercut, uppercut - 1), axis)
+
+     slice_list = [slice(None)] * atmp.ndim
+     slice_list[axis] = slice(lowercut, uppercut)
+     result: NDArray = np.mean(atmp[tuple(slice_list)], axis=axis)
+     return result
+
+
+ def aggregate_trimmed_avg(
+     results: list[tuple[NDArrays, int]], proportiontocut: float
+ ) -> NDArrays:
+     """Compute trimmed average."""
+     # Create a list of weights and ignore the number of examples
+     weights = [weights for weights, _ in results]
+
+     trimmed_w: NDArrays = [
+         _trim_mean(np.asarray(layer), proportiontocut=proportiontocut)
+         for layer in zip(*weights)
+     ]
+
+     return trimmed_w
+
+
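+ # Quick check (illustrative): with five clients reporting a scalar layer of
+ # 1, 2, 3, 4 and 100, a 20% trim drops the lowest and highest value:
+ #
+ #     results = [([np.array([v])], 1) for v in (1.0, 2.0, 3.0, 4.0, 100.0)]
+ #     aggregate_trimmed_avg(results, proportiontocut=0.2)  # -> [array([3.])]
+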
+ def _check_weights_equality(weights1: NDArrays, weights2: NDArrays) -> bool:
+     """Check if two sets of weights are the same."""
+     if len(weights1) != len(weights2):
+         return False
+     return all(
+         np.array_equal(layer_weights1, layer_weights2)
+         for layer_weights1, layer_weights2 in zip(weights1, weights2)
+     )
+
+
+ def _find_reference_weights(
+     reference_weights: NDArrays, list_of_weights: list[NDArrays]
+ ) -> int:
+     """Find the reference weights by looping through `list_of_weights`.
+
+     Raises an error if the reference weights are not found.
+
+     Parameters
+     ----------
+     reference_weights: NDArrays
+         Weights that will be searched for.
+     list_of_weights: list[NDArrays]
+         List of weights that will be searched through.
+
+     Returns
+     -------
+     index: int
+         The index of `reference_weights` in `list_of_weights`.
+
+     Raises
+     ------
+     ValueError
+         If `reference_weights` is not found in `list_of_weights`.
+     """
+     for idx, weights in enumerate(list_of_weights):
+         if _check_weights_equality(reference_weights, weights):
+             return idx
+     raise ValueError("The reference weights were not found in list_of_weights.")
+
+
+ def _aggregate_n_closest_weights(
+     reference_weights: NDArrays, results: list[tuple[NDArrays, int]], beta_closest: int
+ ) -> NDArrays:
+     """Calculate the element-wise mean of the `beta_closest` closest values.
+
+     Note: each i-th coordinate of the result is the average of the beta_closest
+     i-th coordinates that are nearest to the reference weights.
+
+     Parameters
+     ----------
+     reference_weights: NDArrays
+         The weights from which the distances will be computed.
+     results: list[tuple[NDArrays, int]]
+         The weights from the models.
+     beta_closest: int
+         The number of closest-distance weights that will be averaged.
+
+     Returns
+     -------
+     aggregated_weights: NDArrays
+         Element-wise average of the beta weights with the closest distance to
+         the reference weights.
+     """
+     list_of_weights = [weights for weights, num_examples in results]
+     aggregated_weights = []
+
+     for layer_id, layer_weights in enumerate(reference_weights):
+         other_weights_layer_list = []
+         for other_w in list_of_weights:
+             other_weights_layer = other_w[layer_id]
+             other_weights_layer_list.append(other_weights_layer)
+         other_weights_layer_np = np.array(other_weights_layer_list)
+         diff_np = np.abs(layer_weights - other_weights_layer_np)
+         # Create indices of the smallest differences; we do not need the exact
+         # order, just the beta closest weights, therefore np.argpartition is
+         # used instead of np.argsort
+         indices = np.argpartition(diff_np, kth=beta_closest - 1, axis=0)
+         # Take the weights (coordinate-wise) corresponding to the beta
+         # closest distances
+         beta_closest_weights = np.take_along_axis(
+             other_weights_layer_np, indices=indices, axis=0
+         )[:beta_closest]
+         aggregated_weights.append(np.mean(beta_closest_weights, axis=0))
+     return aggregated_weights
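+
+
+ # Coordinate-wise sketch (illustrative): with reference value 3 and candidate
+ # values 1, 2.5, 3.5 and 50, the two closest coordinates are 2.5 and 3.5, so
+ # the aggregate is 3.0 and the outlier 50 never contributes.
+ #
+ #     ref = [np.array([3.0])]
+ #     cands = [([np.array([v])], 1) for v in (1.0, 2.5, 3.5, 50.0)]
+ #     _aggregate_n_closest_weights(ref, cands, beta_closest=2)  # -> [array([3.])]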
template_FL/src/fedllm/myfedavg.py ADDED
@@ -0,0 +1,295 @@
+ # Copyright 2020 Flower Labs GmbH. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ """Federated Averaging (FedAvg) [McMahan et al., 2016] strategy.
+
+ Paper: arxiv.org/abs/1602.05629
+ """
+
+
+ from logging import WARNING
+ from typing import Callable, Optional, Union
+
+ from flwr.common import (
+     EvaluateIns,
+     EvaluateRes,
+     FitIns,
+     FitRes,
+     MetricsAggregationFn,
+     NDArrays,
+     Parameters,
+     Scalar,
+     ndarrays_to_parameters,
+     parameters_to_ndarrays,
+ )
+ from flwr.common.logger import log
+ from flwr.server.client_manager import ClientManager
+ from flwr.server.client_proxy import ClientProxy
+ from flwr.server.strategy import Strategy
+
+ from .myaggregation import (
+     aggregate,
+     aggregate_inplace,
+     aggregate_inplace_mates,
+     weighted_loss_avg,
+ )
+
+ WARNING_MIN_AVAILABLE_CLIENTS_TOO_LOW = """
+ Setting `min_available_clients` lower than `min_fit_clients` or
+ `min_evaluate_clients` can cause the server to fail when there are too few clients
+ connected to the server. `min_available_clients` must be set to a value larger
+ than or equal to the values of `min_fit_clients` and `min_evaluate_clients`.
+ """
+
+ client_id_idx = {}
+
+
+ # pylint: disable=line-too-long
+ class FedAvg(Strategy):
+     """Federated Averaging strategy.
+
+     Implementation based on https://arxiv.org/abs/1602.05629
+
+     Parameters
+     ----------
+     fraction_fit : float, optional
+         Fraction of clients used during training. In case `min_fit_clients`
+         is larger than `fraction_fit * available_clients`, `min_fit_clients`
+         will still be sampled. Defaults to 1.0.
+     fraction_evaluate : float, optional
+         Fraction of clients used during validation. In case `min_evaluate_clients`
+         is larger than `fraction_evaluate * available_clients`,
+         `min_evaluate_clients` will still be sampled. Defaults to 1.0.
+     min_fit_clients : int, optional
+         Minimum number of clients used during training. Defaults to 2.
+     min_evaluate_clients : int, optional
+         Minimum number of clients used during validation. Defaults to 2.
+     min_available_clients : int, optional
+         Minimum number of total clients in the system. Defaults to 2.
+     evaluate_fn : Optional[Callable[[int, NDArrays, Dict[str, Scalar]], Optional[Tuple[float, Dict[str, Scalar]]]]]
+         Optional function used for validation. Defaults to None.
+     on_fit_config_fn : Callable[[int], Dict[str, Scalar]], optional
+         Function used to configure training. Defaults to None.
+     on_evaluate_config_fn : Callable[[int], Dict[str, Scalar]], optional
+         Function used to configure validation. Defaults to None.
+     accept_failures : bool, optional
+         Whether or not to accept rounds containing failures. Defaults to True.
+     initial_parameters : Parameters, optional
+         Initial global model parameters.
+     fit_metrics_aggregation_fn : Optional[MetricsAggregationFn]
+         Metrics aggregation function, optional.
+     evaluate_metrics_aggregation_fn : Optional[MetricsAggregationFn]
+         Metrics aggregation function, optional.
+     inplace : bool (default: True)
+         Enable (True) or disable (False) in-place aggregation of model updates.
+     use_mates : bool (default: False)
+         Aggregate the main model and the data influence model separately
+         (see `aggregate_inplace_mates`).
+     """
+
+     # pylint: disable=too-many-arguments,too-many-instance-attributes, line-too-long
+     def __init__(
+         self,
+         *,
+         fraction_fit: float = 1.0,
+         fraction_evaluate: float = 1.0,
+         min_fit_clients: int = 2,
+         min_evaluate_clients: int = 2,
+         min_available_clients: int = 2,
+         evaluate_fn: Optional[
+             Callable[
+                 [int, NDArrays, dict[str, Scalar]],
+                 Optional[tuple[float, dict[str, Scalar]]],
+             ]
+         ] = None,
+         on_fit_config_fn: Optional[Callable[[int], dict[str, Scalar]]] = None,
+         on_evaluate_config_fn: Optional[Callable[[int], dict[str, Scalar]]] = None,
+         accept_failures: bool = True,
+         initial_parameters: Optional[Parameters] = None,
+         fit_metrics_aggregation_fn: Optional[MetricsAggregationFn] = None,
+         evaluate_metrics_aggregation_fn: Optional[MetricsAggregationFn] = None,
+         inplace: bool = True,
+         use_mates: bool = False,
+     ) -> None:
+         super().__init__()
+
+         if (
+             min_fit_clients > min_available_clients
+             or min_evaluate_clients > min_available_clients
+         ):
+             log(WARNING, WARNING_MIN_AVAILABLE_CLIENTS_TOO_LOW)
+
+         self.fraction_fit = fraction_fit
+         self.fraction_evaluate = fraction_evaluate
+         self.min_fit_clients = min_fit_clients
+         self.min_evaluate_clients = min_evaluate_clients
+         self.min_available_clients = min_available_clients
+         self.evaluate_fn = evaluate_fn
+         self.on_fit_config_fn = on_fit_config_fn
+         self.on_evaluate_config_fn = on_evaluate_config_fn
+         self.accept_failures = accept_failures
+         self.initial_parameters = initial_parameters
+         self.fit_metrics_aggregation_fn = fit_metrics_aggregation_fn
+         self.evaluate_metrics_aggregation_fn = evaluate_metrics_aggregation_fn
+         self.inplace = inplace
+         self.use_mates = use_mates
+
+     def __repr__(self) -> str:
+         """Compute a string representation of the strategy."""
+         rep = f"FedAvg(accept_failures={self.accept_failures})"
+         return rep
+
+     def num_fit_clients(self, num_available_clients: int) -> tuple[int, int]:
+         """Return the sample size and the required number of available clients."""
+         num_clients = int(num_available_clients * self.fraction_fit)
+         return max(num_clients, self.min_fit_clients), self.min_available_clients
+
+     def num_evaluation_clients(self, num_available_clients: int) -> tuple[int, int]:
+         """Use a fraction of available clients for evaluation."""
+         num_clients = int(num_available_clients * self.fraction_evaluate)
+         return max(num_clients, self.min_evaluate_clients), self.min_available_clients
+
+     def initialize_parameters(
+         self, client_manager: ClientManager
+     ) -> Optional[Parameters]:
+         """Initialize global model parameters."""
+         initial_parameters = self.initial_parameters
+         self.initial_parameters = None  # Don't keep initial parameters in memory
+         return initial_parameters
+
+     def evaluate(
+         self, server_round: int, parameters: Parameters
+     ) -> Optional[tuple[float, dict[str, Scalar]]]:
+         """Evaluate model parameters using an evaluation function."""
+         if self.evaluate_fn is None:
+             # No evaluation function provided
+             return None
+         parameters_ndarrays = parameters_to_ndarrays(parameters)
+         eval_res = self.evaluate_fn(server_round, parameters_ndarrays, {})
+         if eval_res is None:
+             return None
+         loss, metrics = eval_res
+         return loss, metrics
+
+     def configure_fit(
+         self, server_round: int, parameters: Parameters, client_manager: ClientManager
+     ) -> list[tuple[ClientProxy, FitIns]]:
+         """Configure the next round of training."""
+         config = {}
+         if self.on_fit_config_fn is not None:
+             # Custom fit config function provided
+             config = self.on_fit_config_fn(server_round)
+         fit_ins = FitIns(parameters, config)
+
+         # Remember a stable index for each client id
+         if not client_id_idx:
+             for i, (client_id, _) in enumerate(client_manager.clients.items()):
+                 client_id_idx[client_id] = i
+
+         # Sample clients
+         sample_size, min_num_clients = self.num_fit_clients(
+             client_manager.num_available()
+         )
+         clients = client_manager.sample(
+             num_clients=sample_size, min_num_clients=min_num_clients
+         )
+
+         # Return client/config pairs
+         return [(client, fit_ins) for client in clients]
+
+     def configure_evaluate(
+         self, server_round: int, parameters: Parameters, client_manager: ClientManager
+     ) -> list[tuple[ClientProxy, EvaluateIns]]:
+         """Configure the next round of evaluation."""
+         # Do not configure federated evaluation if fraction eval is 0.
+         if self.fraction_evaluate == 0.0:
+             return []
+
+         # Parameters and config
+         config = {}
+         if self.on_evaluate_config_fn is not None:
+             # Custom evaluation config function provided
+             config = self.on_evaluate_config_fn(server_round)
+         evaluate_ins = EvaluateIns(parameters, config)
+
+         # Sample clients
+         sample_size, min_num_clients = self.num_evaluation_clients(
+             client_manager.num_available()
+         )
+         clients = client_manager.sample(
+             num_clients=sample_size, min_num_clients=min_num_clients
+         )
+
+         # Return client/config pairs
+         return [(client, evaluate_ins) for client in clients]
+
+     def aggregate_fit(
+         self,
+         server_round: int,
+         results: list[tuple[ClientProxy, FitRes]],
+         failures: list[Union[tuple[ClientProxy, FitRes], BaseException]],
+     ) -> tuple[Optional[Parameters], dict[str, Scalar]]:
+         """Aggregate fit results using weighted average."""
+         if not results:
+             return None, {}
+         # Do not aggregate if there are failures and failures are not accepted
+         if not self.accept_failures and failures:
+             return None, {}
+
+         if self.use_mates:
+             # MATES: aggregate the main model and the data influence model
+             # separately; checked first because `inplace` defaults to True
+             aggregated_ndarrays = aggregate_inplace_mates(results)
+         elif self.inplace:
+             # Does in-place weighted average of results
+             aggregated_ndarrays = aggregate_inplace(results)
+         else:
+             # Convert results
+             weights_results = [
+                 (parameters_to_ndarrays(fit_res.parameters), fit_res.num_examples)
+                 for _, fit_res in results
+             ]
+             aggregated_ndarrays = aggregate(weights_results)
+
+         parameters_aggregated = ndarrays_to_parameters(aggregated_ndarrays)
+
+         # Aggregate custom metrics if aggregation fn was provided
+         metrics_aggregated = {}
+         if self.fit_metrics_aggregation_fn:
+             fit_metrics = [(res.num_examples, res.metrics) for _, res in results]
+             metrics_aggregated = self.fit_metrics_aggregation_fn(fit_metrics)
+         elif server_round == 1:  # Only log this warning once
+             log(WARNING, "No fit_metrics_aggregation_fn provided")
+
+         return parameters_aggregated, metrics_aggregated
+
+     def aggregate_evaluate(
+         self,
+         server_round: int,
+         results: list[tuple[ClientProxy, EvaluateRes]],
+         failures: list[Union[tuple[ClientProxy, EvaluateRes], BaseException]],
+     ) -> tuple[Optional[float], dict[str, Scalar]]:
+         """Aggregate evaluation losses using weighted average."""
+         if not results:
+             return None, {}
+         # Do not aggregate if there are failures and failures are not accepted
+         if not self.accept_failures and failures:
+             return None, {}
+
+         # Aggregate loss
+         loss_aggregated = weighted_loss_avg(
+             [
+                 (evaluate_res.num_examples, evaluate_res.loss)
+                 for _, evaluate_res in results
+             ]
+         )
+
+         # Aggregate custom metrics if aggregation fn was provided
+         metrics_aggregated = {}
+         if self.evaluate_metrics_aggregation_fn:
+             eval_metrics = [(res.num_examples, res.metrics) for _, res in results]
+             metrics_aggregated = self.evaluate_metrics_aggregation_fn(eval_metrics)
+         elif server_round == 1:  # Only log this warning once
+             log(WARNING, "No evaluate_metrics_aggregation_fn provided")
+
+         return loss_aggregated, metrics_aggregated
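+
+
+ # Usage sketch (illustrative; argument values are placeholders): the MATES
+ # variant is enabled via `use_mates`, which routes aggregation through
+ # `aggregate_inplace_mates` above.
+ #
+ #     strategy = FedAvg(
+ #         fraction_fit=0.5,
+ #         min_available_clients=2,
+ #         use_mates=True,
+ #     )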
template_FL/src/fedllm/server_app.py ADDED
@@ -0,0 +1,309 @@
+ """flowertune-llm: A Flower / FlowerTune app."""
+
+ import os
+ import torch
+ import wandb
+ import numpy as np
+ from dotenv import load_dotenv
+ from datetime import datetime
+ from tqdm import tqdm
+
+ from transformers import DataCollatorForSeq2Seq, DataCollatorWithPadding, TrainingArguments, Trainer, GenerationConfig
+ from .trainer import ManualTrainer
+ from transformers.integrations import WandbCallback
+ from torch.utils.data import DataLoader
+ from flwr.common import Context, ndarrays_to_parameters
+ from flwr.common.config import unflatten_dict
+ from flwr.server import ServerApp, ServerAppComponents, ServerConfig
+ # from flwr.server.strategy import FedAvg
+ from omegaconf import DictConfig
+
+ from .models import *
+ from .dataset import replace_keys
+ from .myfedavg import FedAvg
+ from .data_domains import global_test_set_hete
+ from .make_data import Prompter, generate_and_tokenize_prompt
+ from .metrics import exact_match, f1, get_rouge_score
+
+ from datasets import load_dataset, Dataset
+ from sklearn.model_selection import train_test_split
+
+
+ load_dotenv(".env")
+
+ os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")
+ os.environ["WANDB_NAME"] = os.getenv("WANDB_NAME")
+ os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")
+ # os.environ["WANDB_LOG_MODEL"] = "checkpoint"
+
+
+ class LLMSampleCB(WandbCallback):
+     def __init__(self, trainer, test_dataset, task, num_samples=10, max_new_tokens=256, log_model="checkpoint"):
+         """A callback that logs sample generations as a wandb.Table during training."""
+         super().__init__()
+         # self._log_model = log_model
+         self.task = task
+         self.sample_dataset = test_dataset.shuffle().select(range(num_samples))
+         self.model, self.tokenizer = trainer.model, trainer.tokenizer
+         self.max_new_tokens = max_new_tokens
+         self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
+                                                            max_new_tokens=max_new_tokens)
+
+     def generate(self, prompt):
+         tokenized_prompt = self.tokenizer(
+             prompt,
+             # padding='max_length', max_length=self.max_new_tokens,
+             return_tensors='pt'
+         )
+         input_ids = tokenized_prompt['input_ids'].to('cuda:0')
+
+         with torch.inference_mode():
+             output = self.model.generate(input_ids, generation_config=self.gen_config)
+         # Strip the prompt tokens and decode only the generated continuation
+         return self.tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
+
+     def samples_table(self, examples):
+         """Create a wandb.Table to store the generations."""
+         records_table = wandb.Table(columns=["input", "prediction", "label", "task"] + list(self.gen_config.to_dict().keys()))
+         for example in tqdm(examples, leave=False):
+             instruction = example["instruction"]
+             inputt = example["input"]
+             output = example['output']
+             if inputt == '':
+                 prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: {instruction} ### Response: """
+             else:
+                 prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: {instruction} ### Input: {inputt} ### Response:"""
+
+             generation = self.generate(prompt=prompt)
+             records_table.add_data(prompt, generation, output, self.task, *list(self.gen_config.to_dict().values()))
+         return records_table
+
+     def on_evaluate(self, args, state, control, **kwargs):
+         """Log the wandb.Table after calling trainer.evaluate."""
+         super().on_evaluate(args, state, control, **kwargs)
+         records_table = self.samples_table(self.sample_dataset)
+         self._wandb.log({"sample_predictions": records_table})
+
+
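+ # Attaching the callback (illustrative; `trainer` and `test_ds` are assumed to
+ # exist at this point in a training script):
+ #
+ #     trainer.add_callback(LLMSampleCB(trainer, test_ds, task="general",
+ #                                      num_samples=5))
+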
+ def test_model(dataset, model, tokenizer, train_cfg, tmp_dict, sround, mates_args, task):
+
+     wandb.init(
+         project='FL@CSS25',
+         name=f'global_eval_round_{sround}',
+         id=f"round_{sround}",
+         resume="allow",
+         reinit=True,
+         # settings=wandb.Settings(start_method="thread")
+     )
+
+     def compute_metrics(pred):
+         labels_ids = pred['label_ids']
+         # Replace the ignored positions (-100) with a valid token id before decoding
+         labels_ids[labels_ids == -100] = 1829
+         pred_ids = pred['predictions']
+
+         # All unnecessary tokens are removed
+         pred_str = tokenizer.batch_decode(
+             pred_ids, skip_special_tokens=True
+         )
+         label_str = tokenizer.batch_decode(
+             labels_ids, skip_special_tokens=True
+         )
+         return {
+             **get_rouge_score(predictions=pred_str, targets=label_str),
+             **f1(predictions=pred_str, targets=label_str),
+         }
+
+     data_collator = DataCollatorForSeq2Seq(
+         tokenizer,
+         pad_to_multiple_of=8,
+         return_tensors="pt",
+         padding=True,
+     )
+
+     testset = (
+         dataset
+         .shuffle()
+         .map(
+             lambda x: generate_and_tokenize_prompt(x, **tmp_dict),
+             num_proc=8,
+         )
+     )
+
+     training_arguments = TrainingArguments(**train_cfg.training_arguments)
+     training_arguments.output_dir = './global_results'
+     training_arguments.logging_dir = './global_logs'
+     # training_arguments.run_name = f'global_eval_round_{sround}'
+
+     # # Construct baseline Trainer
+     # trainer = Trainer(
+     #     model=model,
+     #     eval_dataset=testset.select(range(10)),
+     #     args=training_arguments,
+     #     data_collator=data_collator,
+     #     compute_metrics=compute_metrics,
+     #     tokenizer=tokenizer
+     # )
+
+     mates_args.state = False
+
+     trainer = ManualTrainer(
+         model=model,
+         tokenizer=tokenizer,
+         train_dataset=None,
+         val_dataset=testset.select(range(10)),
+         holdout_dataset=None,
+         reference_dataset=None,
+         args=training_arguments,
+         data_collator=data_collator,
+         compute_metrics=compute_metrics,
+         mates_args=mates_args,
+         data_influence_model=None,
+         data_influence_tokenizer=None,
+     )
+
+     # Evaluate the global model on the test split
+     results = trainer.evaluate(wandb_sample=True)
+
+     # Extract the loss and metrics
+     eval_loss = results["eval_loss"]
+     eval_metrics = {
+         f'{task}_f1': results["f1"],
+         f'{task}_rouge1': results["rouge1"],
+         f'{task}_rouge2': results['rouge2'],
+         f'{task}_rougeL': results['rougeL'],
+         f'{task}_rougeLsum': results['rougeLsum'],
+     }
+
+     wandb.finish()
+
+     return eval_loss, eval_metrics
+
+
+ # Get the function that will be executed by the strategy's evaluate() method;
+ # here we use it to save global model checkpoints.
+ def get_evaluate_fn(train_cfg, model_cfg, dataset_cfg, save_every_round, total_round, total_nodes, save_path, mates_args):
+     """Return an evaluation function for saving the global model."""
+
+     def evaluate(server_round: int, parameters, config):
+         total_loss, result_metric = 0, {}
+         prompter = Prompter(train_cfg.prompt_template_name, train_cfg.verbose)
+         if server_round != 0 and (
+             server_round == total_round or server_round % save_every_round == 0
+         ):
+             # Init model: keep only the main-model half of the concatenated
+             # MATES parameters
+             main_model_params, _ = split_models(parameters)
+             model, tokenizer = get_model(model_cfg)
+             set_parameters(model, main_model_params)
+
+             tmp_dict = {
+                 "prompter": prompter,
+                 "seq_length": train_cfg.seq_length,
+                 "train_on_inputs": train_cfg.train_on_inputs,
+                 "tokenizer": tokenizer,
+             }
+             if dataset_cfg.type == 'homo':
+                 ds = load_dataset(dataset_cfg.name)
+                 _, test = train_test_split(
+                     ds, test_size=0.09, shuffle=True, random_state=42
+                 )
+                 global_test_set_homo = Dataset.from_pandas(test).remove_columns(['__index_level_0__'])
+
+                 loss, metrics = test_model(global_test_set_homo, model, tokenizer, train_cfg, tmp_dict, server_round, mates_args, 'homo')
+                 total_loss = loss
+                 result_metric = {'homo_f1': metrics['homo_f1']}
+             else:
+                 (
+                     list_loss, list_f1,
+                     list_rouge1, list_rouge2,
+                     list_rougeL, list_rougeLsum
+                 ) = [], {}, {}, {}, {}, {}
+
+                 for task in ['general', 'finance', 'math', 'medical', 'code']:
+                     ds = global_test_set_hete[task]
+                     loss, metrics = test_model(ds, model, tokenizer, train_cfg, tmp_dict, server_round, mates_args, task)
+                     list_loss.append(loss)
+
+                     list_f1[f'{task}_f1'] = metrics[f'{task}_f1']
+                     # list_rouge1[f'{task}_rouge1'] = metrics['rouge1']
+                     # list_rouge2[f'{task}_rouge2'] = metrics['rouge2']
+                     # list_rougeL[f'{task}_rougeL'] = metrics['rougeL']
+                     # list_rougeLsum[f'{task}_rougeLsum'] = metrics['rougeLsum']
+
+                 total_loss = sum(list_loss) / len(list_loss)
+                 avg_f1 = sum(list_f1.values()) / len(list_f1)
+                 result_metric = {**list_f1, 'avg_hete_f1': avg_f1}
+
+             model.save_pretrained(f"{save_path}/peft_{server_round}")
+
+         return total_loss, result_metric
+
+     return evaluate
+
+
+ def get_on_fit_config(save_path):
+     """Return a function that constructs the config that the client's fit()
+     method will receive."""
+
+     def fit_config_fn(server_round: int):
+         fit_config = {}
+         fit_config["current_round"] = server_round
+         fit_config["save_path"] = save_path
+         return fit_config
+
+     return fit_config_fn
+
+
+ def fit_weighted_average(metrics):
+     """Aggregate (federated) fit metrics."""
+     # Multiply the train loss of each client by the number of examples used
+     losses = [num_examples * m["train_loss"] for num_examples, m in metrics]
+     total_flops = [m["flops"] for num_examples, m in metrics]
+     examples = [num_examples for num_examples, _ in metrics]
+
+     # Aggregate and return custom metrics (weighted average)
+     return {"train_loss": round(sum(losses) / sum(examples), 3), "total_flops": f"{sum(total_flops)/1e12:.2f}T"}
+
+
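+ # Quick check (illustrative): fit_weighted_average([(10, {"train_loss": 1.0,
+ # "flops": 1e12}), (30, {"train_loss": 3.0, "flops": 3e12})]) returns
+ # {"train_loss": 2.5, "total_flops": "4.00T"}.
+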
+ def server_fn(context: Context):
+     """Construct components that set the ServerApp behaviour."""
+     # Create output directory given current timestamp
+     current_time = datetime.now()
+     folder_name = current_time.strftime("%Y-%m-%d_%H-%M-%S")
+     save_path = os.path.join(os.getcwd(), f"results/{folder_name}")
+     os.makedirs(save_path, exist_ok=True)
+
+     # Read from config
+     num_rounds = context.run_config["num-server-rounds"]
+     num_nodes = context.run_config['num-supernodes']
+     cfg = DictConfig(replace_keys(unflatten_dict(context.run_config)))
+
+     # Get initial model weights
+     init_model, tokenizer = get_model(cfg.model)
+     init_model_parameters = get_parameters(init_model)
+     init_model_parameters = ndarrays_to_parameters(init_model_parameters)
+
+     # Define strategy
+     strategy = FedAvg(
+         fraction_fit=cfg.train.strategy.fraction_fit,
+         fraction_evaluate=cfg.train.strategy.fraction_evaluate,
+         on_fit_config_fn=get_on_fit_config(save_path),
+         fit_metrics_aggregation_fn=fit_weighted_average,
+         initial_parameters=init_model_parameters,
+         evaluate_fn=get_evaluate_fn(
+             cfg.train, cfg.model, cfg.dataset, cfg.train.save_every_round, num_rounds, num_nodes, save_path, cfg.mates
+         ),
+         use_mates=cfg.mates.state,
+     )
+     config = ServerConfig(num_rounds=num_rounds)
+
+     return ServerAppComponents(strategy=strategy, config=config)
+
+
+ # Flower ServerApp
+ app = ServerApp(server_fn=server_fn)
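+
+ # The app is launched by the Flower runtime rather than run directly, e.g. with
+ # `flwr run .` from the project root (assuming the usual pyproject.toml app
+ # configuration).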
template_FL/src/fedllm/skipbert/__init__.py ADDED
File without changes
template_FL/src/fedllm/skipbert/modeling.py ADDED
@@ -0,0 +1,922 @@
+ """SkipBERT modeling."""
+
+ from __future__ import absolute_import, division, print_function, unicode_literals
+
+ import copy
+ import json
+ import math
+ import os
+ import sys
+ import time
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from typing import List, Optional, Tuple, Union
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+ import transformers
+ from transformers import BertPreTrainedModel, BertModel
+ from transformers.models.bert.modeling_bert import BertEmbeddings, BertEncoder, BertPooler, BertLayer
+ from transformers.models.bert.modeling_bert import BertPreTrainingHeads
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from . import plot
+ from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, add_code_sample_docstrings
+
+ import logging
+ logger = logging.getLogger(__name__)
+
+ logger.warning('Hacking BertSelfAttention! Now it returns attention scores rather than probabilities.')
+
+
+ class BertSelfAttention(transformers.models.bert.modeling_bert.BertSelfAttention):
+
+     def forward(
+         self,
+         hidden_states,
+         attention_mask=None,
+         head_mask=None,
+         encoder_hidden_states=None,
+         encoder_attention_mask=None,
+         past_key_value=None,
+         output_attentions=False,
+     ):
+
+         device = hidden_states.device
+         mixed_query_layer = self.query(hidden_states)
+
+         # Most of this code is copied from transformers v4.3.3
+
+         is_cross_attention = encoder_hidden_states is not None
+
+         if is_cross_attention and past_key_value is not None:
+             # reuse k, v, cross attentions
+             key_layer = past_key_value[0]
+             value_layer = past_key_value[1]
+             attention_mask = encoder_attention_mask
+         elif is_cross_attention:
+             key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+             value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+             attention_mask = encoder_attention_mask
+         elif past_key_value is not None:
+             key_layer = self.transpose_for_scores(self.key(hidden_states))
+             value_layer = self.transpose_for_scores(self.value(hidden_states))
+             key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+             value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+         else:
+             key_layer = self.transpose_for_scores(self.key(hidden_states))
+             value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+         query_layer = self.transpose_for_scores(mixed_query_layer)
+
+         if self.is_decoder:
+             past_key_value = (key_layer, value_layer)
+
+         # Take the dot product between "query" and "key" to get the raw attention scores.
+         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+         if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+             seq_length = hidden_states.size()[1]
+             position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+             position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+             distance = position_ids_l - position_ids_r
+             positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+             positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+             if self.position_embedding_type == "relative_key":
+                 relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                 attention_scores = attention_scores + relative_position_scores
+             elif self.position_embedding_type == "relative_key_query":
+                 relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                 relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                 attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+         if attention_mask is not None:
+             # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
+             attention_scores = attention_scores.to(device) + attention_mask.to(device)
+
+         # Normalize the attention scores to probabilities.
+         attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+         # This is actually dropping out entire tokens to attend to, which might
+         # seem a bit unusual, but is taken from the original Transformer paper.
+         attention_probs = self.dropout(attention_probs)
+
+         # Mask heads if we want to
+         if head_mask is not None:
+             attention_probs = attention_probs * head_mask
+             # attention_scores = attention_scores * head_mask
+
+         context_layer = torch.matmul(attention_probs, value_layer)
+
+         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+         context_layer = context_layer.view(*new_context_layer_shape)
+
+         # hacked: return attention_scores instead of attention_probs
+         outputs = (context_layer, attention_scores) if output_attentions else (context_layer,)
+
+         if self.is_decoder:
+             outputs = outputs + (past_key_value,)
+         return outputs
+
+
+ transformers.models.bert.modeling_bert.BertSelfAttention = BertSelfAttention
+
+
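+ # Note: the assignment above monkey-patches the module-level class inside
+ # `transformers`, so any BertModel constructed *after* this point builds its
+ # layers with the patched attention (in the transformers versions this file
+ # targets, BertAttention looks the class up by its module-level name). Models
+ # created before the patch keep the original class.
+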
+ class BertForPreTraining(BertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         fit_size = getattr(config, 'fit_size', 768)
+         self.bert = BertModel(config)
+         self.cls = BertPreTrainingHeads(config)
+         self.fit_denses = nn.ModuleList(
+             [nn.Linear(config.hidden_size, fit_size) for _ in range(config.num_hidden_layers + 1)]
+         )
+
+     def forward(self, input_ids, token_type_ids=None,
+                 attention_mask=None, masked_lm_labels=None,
+                 next_sentence_label=None, labels=None,
+                 output_attentions=True, output_hidden_states=True,):
+         outputs = self.bert(
+             input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
+             output_attentions=output_attentions, output_hidden_states=output_hidden_states)
+         sequence_output, att_output, pooled_output = outputs.hidden_states, outputs.attentions, outputs.pooler_output
+         # Project every hidden state to the fit width (e.g. a teacher's 768)
+         tmp = []
+         for s_id, sequence_layer in enumerate(sequence_output):
+             tmp.append(self.fit_denses[s_id](sequence_layer))
+         sequence_output = tmp
+
+         return att_output, sequence_output
+
+
+
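+ # Distillation sketch (illustrative): the projected hidden states returned
+ # above are typically matched layer-by-layer against a teacher with an MSE
+ # loss, in the TinyBERT style. Assuming `student` is a BertForPreTraining and
+ # `teacher_hidden` is a matching list of teacher hidden states:
+ #
+ #     student_atts, student_reps = student(input_ids, attention_mask=mask)
+ #     rep_loss = sum(
+ #         torch.nn.functional.mse_loss(s, t)
+ #         for s, t in zip(student_reps, teacher_hidden)
+ #     )
+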
+ # class BertForSequenceClassification(BertPreTrainedModel):
+ #     def __init__(self, config, do_fit=False, share_param=True):
+ #         super().__init__(config)
+ #         num_labels = config.num_labels
+ #         self.hidden_size = config.hidden_size
+ #         self.num_labels = num_labels
+ #         self.bert = BertModel(config)
+ #         self.dropout = nn.Dropout(
+ #             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob)
+ #         self.classifier = nn.Linear(config.hidden_size, num_labels)
+ #
+ #         self.do_fit, self.share_param = do_fit, share_param
+ #         if self.do_fit:
+ #             fit_size = getattr(config, 'fit_size', 768)
+ #             self.fit_size = fit_size
+ #             if self.share_param:
+ #                 self.fit_dense = nn.Linear(config.hidden_size, fit_size)
+ #             else:
+ #                 self.fit_denses = nn.ModuleList(
+ #                     [nn.Linear(config.hidden_size, fit_size) for _ in range(config.num_hidden_layers + 1)]
+ #                 )
+ #
+ #     def do_fit_dense(self, sequence_output):
+ #         tmp = []
+ #         if self.do_fit:
+ #             for s_id, sequence_layer in enumerate(sequence_output):
+ #                 if self.share_param:
+ #                     tmp.append(self.fit_dense(sequence_layer))
+ #                 else:
+ #                     tmp.append(self.fit_denses[s_id](sequence_layer))
+ #             sequence_output = tmp
+ #         return sequence_output
+ #
+ #     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
+ #         outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
+ #                             output_hidden_states=True, output_attentions=True)
+ #         sequence_output, att_output, pooled_output = outputs.hidden_states, outputs.attentions, outputs.pooler_output
+ #         logits = self.classifier(pooled_output)
+ #         sequence_output = self.do_fit_dense(sequence_output)
+ #         return logits, att_output, sequence_output
+
+
+ # SequenceClassification docstring
+ _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity"
+ _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'"
+ _SEQ_CLASS_EXPECTED_LOSS = 0.01
+ _CONFIG_FOR_DOC = "BertConfig"
+
+
+ BERT_START_DOCSTRING = r"""
+
+     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+     etc.)
+
+     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+     and behavior.
+
+     Parameters:
+         config ([`BertConfig`]): Model configuration class with all the parameters of the model.
+             Initializing with a config file does not load the weights associated with the model, only the
+             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+
+ BERT_INPUTS_DOCSTRING = r"""
+     Args:
+         input_ids (`torch.LongTensor` of shape `({0})`):
+             Indices of input sequence tokens in the vocabulary.
+
+             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+             [`PreTrainedTokenizer.__call__`] for details.
+
+             [What are input IDs?](../glossary#input-ids)
+         attention_mask (`torch.FloatTensor` of shape `({0})` or `(batch_size, sequence_length, target_length)`, *optional*):
+             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+
+             [What are attention masks?](../glossary#attention-mask)
+         token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+             1]`:
+
+             - 0 corresponds to a *sentence A* token,
+             - 1 corresponds to a *sentence B* token.
+
+             [What are token type IDs?](../glossary#token-type-ids)
+         position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+             config.max_position_embeddings - 1]`.
+
+             [What are position IDs?](../glossary#position-ids)
+         head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+             Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+             - 1 indicates the head is **not masked**,
+             - 0 indicates the head is **masked**.
+
+         inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+             model's internal embedding lookup matrix.
+         output_attentions (`bool`, *optional*):
+             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+             tensors for more detail.
+         output_hidden_states (`bool`, *optional*):
+             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+             more detail.
+         return_dict (`bool`, *optional*):
+             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+
+
+ @add_start_docstrings(
+     """
+     Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+     output) e.g. for GLUE tasks.
+     """,
+     BERT_START_DOCSTRING,
+ )
+ class BertForSequenceClassification(BertPreTrainedModel):
+     def __init__(self, config, do_fit=False, share_param=True):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.config = config
+         self.do_fit = do_fit
+         self.share_param = share_param
+
+         self.bert = BertModel(config)
+         classifier_dropout = (
+             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Add fit layers if enabled
+         if self.do_fit:
+             fit_size = getattr(config, 'fit_size', 768)
+             if self.share_param:
+                 self.fit_dense = nn.Linear(config.hidden_size, fit_size)
+             else:
+                 self.fit_denses = nn.ModuleList(
+                     [nn.Linear(config.hidden_size, fit_size) for _ in range(config.num_hidden_layers + 1)]
+                 )
+
+         self.post_init()
+
+     def do_fit_dense(self, hidden_states):
+         """Process hidden states through the fit layers if enabled."""
+         if not self.do_fit:
+             return hidden_states
+
+         processed_states = []
+         for layer_idx, state in enumerate(hidden_states):
+             if self.share_param:
+                 processed_states.append(self.fit_dense(state))
+             else:
+                 processed_states.append(self.fit_denses[layer_idx](state))
+         return processed_states
+
+     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
+         output_type=SequenceClassifierOutput,
+         config_class=_CONFIG_FOR_DOC,
+         expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
+         expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # Force output hidden states if fit layers are enabled
+         if self.do_fit:
+             output_hidden_states = True
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         pooled_output = outputs[1]
+         pooled_output = self.dropout(pooled_output)
+         logits = self.classifier(pooled_output)
+
+         # Process hidden states through fit layers
+         hidden_states = outputs.hidden_states
+         if self.do_fit and hidden_states is not None:
+             hidden_states = self.do_fit_dense(hidden_states)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 if self.num_labels == 1:
+                     loss = loss_fct(logits.squeeze(), labels.squeeze())
+                 else:
+                     loss = loss_fct(logits, labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             # Replace the original hidden states with the processed ones
+             if self.do_fit and hidden_states is not None:
+                 output = (logits,) + (hidden_states,) + outputs[3:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=hidden_states if output_hidden_states else None,
+             attentions=outputs.attentions,
+         )
+
+
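+ # Usage sketch (illustrative): `do_fit` adds projection layers for
+ # distillation; extra keyword arguments not consumed by the config are
+ # forwarded to __init__ by from_pretrained.
+ #
+ #     model = BertForSequenceClassification.from_pretrained(
+ #         "bert-base-uncased", num_labels=2, do_fit=True, share_param=True
+ #     )
+ #     out = model(input_ids=ids, attention_mask=mask, labels=y)
+ #     out.loss, out.logits, out.hidden_states  # projected states when do_fit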
417
+
418
+ class BertForSequenceClassificationPrediction(BertForSequenceClassification):
419
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
420
+
421
+ assert not self.training
422
+
423
+ _, pooled_output, sequence_output, att_output = self.bert(
424
+ input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
425
+ output_hidden_states=True, output_attentions=True)
426
+
427
+ logits = self.classifier(pooled_output)
428
+
429
+ loss = None
430
+ if labels is not None:
431
+ loss = torch.tensor(0.)
432
+
433
+ return SequenceClassifierOutput(
434
+ loss=loss,
435
+ logits=logits,
436
+ )
437
+
438
+
439
+ class ShallowSkipping(nn.Module):
440
+
441
+ def __init__(self, model):
442
+ super().__init__()
443
+ # self.model = model # do not register
444
+ self.config = model.config
445
+ self.shallow_config = model.shallow_config
446
+ # current only support trigram
447
+ self.ngram = 3
448
+
449
+ if self.shallow_config.hidden_size != self.config.hidden_size:
450
+ self.linear = nn.Linear(self.shallow_config.hidden_size, self.config.hidden_size)
451
+
452
+ self.plot = plot.Plot(self.config.max_num_entries, self.config.hidden_size)
453
+
454
+ def _build_tri_gram_ids(self, input_ids:torch.Tensor) -> torch.Tensor:
455
+ return torch.from_numpy(
456
+ self.plot.input_ids_to_tri_grams(input_ids.cpu().numpy())
457
+ ).to(input_ids.device)
458
+
459
+ def build_input_ngrams(self, input_ids:torch.Tensor, token_type_ids:torch.Tensor):
460
+
461
+ input_ngram_ids = self._build_tri_gram_ids(input_ids)
462
+
463
+ token_ngram_type_ids = None  # token type ids are not used at the n-gram level
464
+
465
+ attention_mask = (input_ngram_ids > 0).float()
466
+
467
+ if self.training:
468
+ _mask = torch.rand(attention_mask.shape).to(attention_mask.device)
469
+ _mask = (_mask > self.config.ngram_masking)
470
+ attention_mask *= _mask
471
+
472
+ attention_mask[:, self.ngram//2] = 1 # avoid masking all tokens in a tri-gram
473
+ return input_ngram_ids, token_ngram_type_ids, attention_mask
474
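+ # Editorial note: during training each tri-gram position is dropped
+ # independently with probability config.ngram_masking (a light dropout over
+ # contexts), and the center position is then restored so a token never
+ # loses its own embedding.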
+
475
+ @torch.jit.script
476
+ def merge_ngrams(input_ids, ngram_hidden_states, aux_embeddings):
477
+ batch_size, seq_length = input_ids.shape
478
+ lens = (input_ids!=0).sum(1)
479
+ hidden_state = torch.zeros([batch_size, seq_length, ngram_hidden_states.size(-1)], dtype=ngram_hidden_states.dtype, device=ngram_hidden_states.device)
480
+
481
+ # assert to be trigrams
482
+ flat_hidden_state = ngram_hidden_states[:, 1]
483
+ flat_hidden_state[:-1] = flat_hidden_state[:-1] + ngram_hidden_states[1:, 0]
484
+ flat_hidden_state[1:] = flat_hidden_state[1:] + ngram_hidden_states[:-1, 2]
485
+ k = 0
486
+ for i in range(batch_size):
487
+ hidden_state[i, :lens[i]] = flat_hidden_state[k: k+lens[i]]
488
+ k += 1 + lens[i] # 1 for skipping one padding tri-gram
489
+ hidden_state = hidden_state + aux_embeddings
490
+ return hidden_state
491
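+ # Editorial notes on merge_ngrams: it is compiled as a TorchScript free
+ # function (it takes no `self`), so self.merge_ngrams(ids, states, embs)
+ # passes exactly these three arguments. Worked example: for token i, the
+ # merged state is the center vector of its own tri-gram plus the left
+ # context vector of tri-gram i+1 and the right context vector of tri-gram
+ # i-1, i.e. h[i] = s[i, 1] + s[i+1, 0] + s[i-1, 2], before the auxiliary
+ # position/token-type embeddings are added.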
+
492
+ def forward_shallow_layers(
493
+ self,
494
+ input_ids,
495
+ token_type_ids,
496
+ attention_mask,
497
+ ngram_mask_position=None,
498
+ head_mask=None,
499
+ encoder_hidden_states=None,
500
+ encoder_attention_mask=None,
501
+ past_key_value=None,
502
+ output_attentions=True,
503
+ output_hidden_states=True,
504
+ model=None,
505
+ ):
506
+ device = model.device
507
+
508
+ input_ngram_ids, token_ngram_type_ids, attention_mask = self.build_input_ngrams(input_ids, token_type_ids)
509
+ ngram_attention_mask = attention_mask.clone()
510
+
511
+ if ngram_mask_position is not None:
512
+ input_ngram_ids[:, ngram_mask_position] = 0
513
+ ngram_attention_mask[:, ngram_mask_position] = 0
514
+
515
+ extended_attention_mask = model.get_extended_attention_mask(attention_mask, input_ngram_ids.shape, device)
516
+
517
+ ngram_index = (input_ngram_ids[:, self.ngram//2] > 0)  # rows whose center token is non-pad
518
+
519
+ embedding_output = model.embeddings(input_ids=input_ngram_ids, token_type_ids=token_ngram_type_ids)
520
+
521
+ hidden_states = embedding_output
522
+ attention_mask = extended_attention_mask
523
+
524
+ for i, layer_module in enumerate(
525
+ model.encoder.layer[:self.config.num_hidden_layers - self.config.num_full_hidden_layers]):
526
+ layer_head_mask = head_mask[i] if head_mask is not None else None
527
+
528
+ layer_outputs = layer_module(
529
+ hidden_states=hidden_states,
530
+ attention_mask=attention_mask,
531
+ head_mask=layer_head_mask,
532
+ encoder_hidden_states=encoder_hidden_states,
533
+ encoder_attention_mask=encoder_attention_mask,
534
+ past_key_value=past_key_value,
535
+ output_attentions=output_attentions,
536
+ )
537
+
538
+ hidden_states = layer_outputs[0]
539
+
540
+ if self.shallow_config.hidden_size != self.config.hidden_size:
541
+ hidden_states = self.linear(hidden_states)
542
+
543
+ # Set zero the padding ngrams: (..., [PAD], ...)
544
+ hidden_states = hidden_states * ngram_index[:, None, None]
545
+
546
+ hidden_states = hidden_states * model.attn(hidden_states).sigmoid() * ngram_attention_mask.unsqueeze(-1)
547
+
548
+ return input_ngram_ids, hidden_states
549
+
550
+ def forward(
551
+ self,
552
+ input_ids,
553
+ token_type_ids,
554
+ attention_mask,
555
+ head_mask=None,
556
+ encoder_hidden_states=None,
557
+ encoder_attention_mask=None,
558
+ past_key_value=None,
559
+ output_attentions=True,
560
+ output_hidden_states=True,
561
+ model=None,
562
+ ):
563
+
564
+ device = model.device
565
+
566
+ batch_size, seq_length = input_ids.shape
567
+ aux_embeddings = model.embeddings.position_embeddings2.weight[:seq_length].unsqueeze(0)
568
+ aux_embeddings = aux_embeddings + model.embeddings.token_type_embeddings2(token_type_ids)
569
+
570
+ if self.config.plot_mode == 'force_compute':
571
+ '''
572
+ compute only, ignore PLOT
573
+ '''
574
+ input_ngram_ids, hidden_states = self.forward_shallow_layers(
575
+ input_ids=input_ids,
576
+ token_type_ids=token_type_ids,
577
+ attention_mask=attention_mask,
578
+ head_mask=head_mask,
579
+ encoder_hidden_states=encoder_hidden_states,
580
+ encoder_attention_mask=encoder_attention_mask,
581
+ past_key_value=past_key_value,
582
+ output_attentions=output_attentions,
583
+ output_hidden_states=output_hidden_states,
584
+ ngram_mask_position=None,
585
+ model=model,
586
+ )
587
+
588
+ elif self.config.plot_mode == 'update_all':
589
+ '''
590
+ build PLOT
591
+ '''
592
+ # uni-grams
593
+ input_ngram_ids, hidden_states = self.forward_shallow_layers(
594
+ input_ids=input_ids,
595
+ token_type_ids=token_type_ids,
596
+ attention_mask=attention_mask,
597
+ head_mask=head_mask,
598
+ encoder_hidden_states=encoder_hidden_states,
599
+ encoder_attention_mask=encoder_attention_mask,
600
+ past_key_value=past_key_value,
601
+ output_attentions=output_attentions,
602
+ output_hidden_states=output_hidden_states,
603
+ ngram_mask_position=(0,2),
604
+ model=model,
605
+ )
606
+ self.plot.update_data(input_ngram_ids, hidden_states)
607
+
608
+ # bi-grams
609
+ input_ngram_ids, hidden_states = self.forward_shallow_layers(
610
+ input_ids=input_ids,
611
+ token_type_ids=token_type_ids,
612
+ attention_mask=attention_mask,
613
+ head_mask=head_mask,
614
+ encoder_hidden_states=encoder_hidden_states,
615
+ encoder_attention_mask=encoder_attention_mask,
616
+ past_key_value=past_key_value,
617
+ output_attentions=output_attentions,
618
+ output_hidden_states=output_hidden_states,
619
+ ngram_mask_position=0,
620
+ model=model,
621
+ )
622
+ self.plot.update_data(input_ngram_ids, hidden_states)
623
+
624
+ # tri-grams
625
+ input_ngram_ids, hidden_states = self.forward_shallow_layers(
626
+ input_ids=input_ids,
627
+ token_type_ids=token_type_ids,
628
+ attention_mask=attention_mask,
629
+ head_mask=head_mask,
630
+ encoder_hidden_states=encoder_hidden_states,
631
+ encoder_attention_mask=encoder_attention_mask,
632
+ past_key_value=past_key_value,
633
+ output_attentions=output_attentions,
634
+ output_hidden_states=output_hidden_states,
635
+ ngram_mask_position=None,
636
+ model=model,
637
+ )
638
+ self.plot.update_data(input_ngram_ids, hidden_states)
639
+
640
+ elif self.config.plot_mode == 'plot_passive':
641
+ '''
642
+ use plot if no oov
643
+ '''
644
+
645
+ if input_ids.is_cuda:
646
+ input_ids = input_ids.cpu()
647
+ if not self.plot.has_oov(input_ids):
648
+ hidden_states = self.plot.retrieve_data(input_ids)
649
+ hidden_states = hidden_states.to(device)
650
+ else:
651
+ input_ids = input_ids.to(device)
652
+ input_ngram_ids, hidden_states = self.forward_shallow_layers(
653
+ input_ids=input_ids,
654
+ token_type_ids=token_type_ids,
655
+ attention_mask=attention_mask,
656
+ head_mask=head_mask,
657
+ encoder_hidden_states=encoder_hidden_states,
658
+ encoder_attention_mask=encoder_attention_mask,
659
+ past_key_value=past_key_value,
660
+ output_attentions=output_attentions,
661
+ output_hidden_states=output_hidden_states,
662
+ ngram_mask_position=None,
663
+ model=model,
664
+ )
665
+ self.plot.update_data(input_ngram_ids, hidden_states)
666
+
667
+ elif self.config.plot_mode == 'plot_only':
668
+ '''
669
+ plot only
670
+ looking up order: trigram -> bigram -> unigram -> 0
671
+ '''
672
+ if input_ids.is_cuda:
673
+ logger.warning("'input_ids' should be placed on the CPU for PLOT lookup.")
674
+ input_ids = input_ids.cpu()
675
+ hidden_states = self.plot.retrieve_data(input_ids)
676
+ hidden_states = hidden_states.to(device)
677
+
678
+ hidden_states = F.dropout(hidden_states, self.config.hidden_dropout_prob, self.training)
679
+ hidden_states = self.merge_ngrams(input_ids, hidden_states, aux_embeddings)
680
+ hidden_states = model.norm(hidden_states)
681
+
682
+ return hidden_states
683
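+ # Editorial summary of the plot modes handled above: 'force_compute' always
+ # runs the shallow layers and ignores the PLOT; 'update_all' computes uni-,
+ # bi- and tri-gram representations and writes them all into the PLOT;
+ # 'plot_passive' serves lookups from the PLOT and falls back to computation
+ # (plus a PLOT update) when the batch contains an unseen n-gram; 'plot_only'
+ # serves lookups exclusively from the PLOT.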
+
684
+
685
+ class SkipBertEncoder(BertEncoder):
686
+ def __init__(self, shallow_config, config):
687
+ super(BertEncoder, self).__init__()
688
+ self.config = config
689
+ self.shallow_config = shallow_config
690
+ self.layer = nn.ModuleList(
691
+ [
692
+ BertLayer(shallow_config) for _ in range(config.num_hidden_layers - config.num_full_hidden_layers)
693
+ ] + [
694
+ BertLayer(config) for _ in range(config.num_full_hidden_layers)
695
+ ])
696
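+ # Editorial note: super(BertEncoder, self).__init__() deliberately skips
+ # BertEncoder.__init__ so its default layer list is never built. The first
+ # (num_hidden_layers - num_full_hidden_layers) layers use the shallow
+ # config and run per tri-gram (their outputs are cacheable in the PLOT);
+ # only the last num_full_hidden_layers layers attend over the full sequence.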
+
697
+ class SkipBertModel(BertModel):
698
+ def __init__(self, config, add_pooling_layer=True):
699
+ super().__init__(config)
700
+ self.config = config
701
+ self.shallow_config = copy.deepcopy(config)
702
+
703
+ self.shallow_config.hidden_size = getattr(config, 'shallow_hidden_size', 768)
704
+ self.shallow_config.intermediate_size = getattr(config, 'shallow_intermediate_size', 3072)
705
+
706
+ self.embeddings = BertEmbeddings(self.shallow_config)
707
+ self.encoder = SkipBertEncoder(self.shallow_config, config)
708
+
709
+ self.pooler = BertPooler(config) if add_pooling_layer else None
710
+
711
+ self.embeddings.position_embeddings2 = nn.Embedding(self.config.max_position_embeddings, self.config.hidden_size)
712
+ self.embeddings.token_type_embeddings2 = nn.Embedding(self.config.type_vocab_size, self.config.hidden_size)
713
+
714
+ self.norm = nn.LayerNorm(self.config.hidden_size)
715
+ self.attn = nn.Linear(self.config.hidden_size, 1)
716
+ self.shallow_skipping = ShallowSkipping(self)
717
+
718
+ self.init_weights()
719
+
720
+ def forward(
721
+ self,
722
+ input_ids=None,
723
+ attention_mask=None,
724
+ token_type_ids=None,
725
+ position_ids=None,
726
+ head_mask=None,
727
+ encoder_hidden_states=None,
728
+ encoder_attention_mask=None,
729
+ output_attentions=True,
730
+ output_hidden_states=True,
731
+ ):
732
+
733
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
734
+ output_hidden_states = (
735
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
736
+ )
737
+
738
+ input_shape = input_ids.size()
739
+ device = self.device
740
+
741
+ if attention_mask is None:
742
+ attention_mask = (input_ids != 0).float()
743
+ if token_type_ids is None:
744
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
745
+
746
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
747
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
748
+
749
+ hidden_states = self.shallow_skipping(
750
+ input_ids=input_ids,
751
+ token_type_ids=token_type_ids,
752
+ attention_mask=attention_mask,
753
+ head_mask=head_mask,
754
+ encoder_hidden_states=encoder_hidden_states,
755
+ encoder_attention_mask=encoder_attention_mask,
756
+ model=self,
757
+ )
758
+
759
+ # Global transformer layers
760
+ attention_mask = extended_attention_mask
761
+
762
+ all_hidden_states = ()
763
+ all_self_attentions = ()
764
+
765
+ for i, layer_module in enumerate(self.encoder.layer[-self.config.num_full_hidden_layers:]):
766
+
767
+ if output_hidden_states:
768
+ all_hidden_states = all_hidden_states + (hidden_states,)
769
+
770
+ layer_head_mask = head_mask[i + self.config.num_hidden_layers - self.config.num_full_hidden_layers] if head_mask is not None else None
771
+
772
+ layer_outputs = layer_module(
773
+ hidden_states=hidden_states,
774
+ attention_mask=attention_mask,
775
+ head_mask=layer_head_mask,
776
+ encoder_hidden_states=encoder_hidden_states,
777
+ encoder_attention_mask=encoder_attention_mask,
778
+ past_key_value=None,
779
+ output_attentions=output_attentions,
780
+ )
781
+
782
+ hidden_states = layer_outputs[0]
783
+
784
+ if output_attentions:
785
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
786
+
787
+ if output_hidden_states:
788
+ all_hidden_states = all_hidden_states + (hidden_states,)
789
+
790
+ sequence_output = hidden_states
791
+ pooled_output = self.pooler(sequence_output)
792
+
793
+ return (sequence_output, pooled_output, all_hidden_states, all_self_attentions)
794
+
795
+ def freeze_shallow_layers(self):
796
+ for p in self.embeddings.parameters():
797
+ p.requires_grad = False
798
+ for layer in self.encoder.layer[:self.config.num_hidden_layers - self.config.num_full_hidden_layers]:
799
+ for p in layer.parameters():
800
+ p.requires_grad = False
801
+ try:
802
+ for p in self.shallow_skipping.linear.parameters():
803
+ p.requires_grad = False
804
+ except AttributeError:  # no dimension-matching linear layer to freeze
805
+ pass
806
+ try:
807
+ for p in self.attn.parameters():
808
+ p.requires_grad = False
809
+ except AttributeError:  # no gating layer to freeze
810
+ pass
811
+
812
+ self.embeddings.dropout.p = 0.
813
+ for layer in self.encoder.layer[:self.config.num_hidden_layers - self.config.num_full_hidden_layers]:
814
+ for m in layer.modules():
815
+ if isinstance(m, torch.nn.Dropout):
816
+ m.p = 0.
817
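+ # Minimal usage sketch (editorial, not part of the original commit; the
+ # config fields below are assumptions based on how they are read above):
+ #
+ #   from transformers import BertConfig
+ #   config = BertConfig.from_pretrained('bert-base-uncased')
+ #   config.num_full_hidden_layers = 6
+ #   config.max_num_entries = 100000
+ #   config.plot_mode = 'force_compute'
+ #   config.ngram_masking = 0.1
+ #   model = SkipBertModel(config)
+ #   model.freeze_shallow_layers()  # only the global layers stay trainable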
+
818
+
819
+ class SkipBertForPreTraining(BertPreTrainedModel):
820
+ def __init__(self, config):
821
+ super().__init__(config)
822
+ self.fit_size = getattr(config, 'fit_size', 768)
823
+ self.bert = SkipBertModel(config)
824
+ self.cls = BertPreTrainingHeads(config)
825
+
826
+ if self.fit_size != config.hidden_size:
827
+ self.fit_denses = nn.ModuleList(
828
+ [nn.Linear(config.hidden_size, self.fit_size) for _ in range(config.num_hidden_layers + 1)]
829
+ )
830
+
831
+ def forward(self, input_ids, token_type_ids=None,
832
+ attention_mask=None, masked_lm_labels=None,
833
+ next_sentence_label=None, labels=None,
834
+ output_attentions=True, output_hidden_states=True,):
835
+ _, pooled_output, sequence_output, att_output = self.bert(
836
+ input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
837
+ output_attentions=output_attentions, output_hidden_states=output_hidden_states)
838
+
839
+ if self.fit_size != self.config.hidden_size:
840
+ tmp = []
841
+ for s_id, sequence_layer in enumerate(sequence_output):
842
+ tmp.append(self.fit_denses[s_id](sequence_layer))
843
+ sequence_output = tmp
844
+
845
+ return att_output, sequence_output
846
+
847
+
848
+ def freeze_shallow_layers(self):
849
+ self.bert.freeze_shallow_layers()
850
+
851
+
852
+ class SkipBertForSequenceClassification(BertPreTrainedModel):
853
+ def __init__(self, config, do_fit=False, share_param=True):
854
+ super().__init__(config)
855
+ num_labels = config.num_labels
856
+ self.hidden_size = config.hidden_size
857
+ self.num_labels = num_labels
858
+ self.bert = SkipBertModel(config)
859
+ self.dropout = nn.Dropout(
860
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob)
861
+ self.classifier = nn.Linear(config.hidden_size, num_labels)
862
+
863
+ self.do_fit, self.share_param = do_fit, share_param
864
+ if self.do_fit:
865
+ fit_size = getattr(config, 'fit_size', 768)
866
+ self.fit_size = fit_size
867
+ if self.share_param:
868
+ self.share_fit_dense = nn.Linear(config.hidden_size, fit_size)
869
+ else:
870
+ self.fit_denses = nn.ModuleList(
871
+ [nn.Linear(config.hidden_size, fit_size) for _ in range(config.num_hidden_layers + 1)]
872
+ )
873
+
874
+ def do_fit_dense(self, sequence_output):
875
+
876
+ tmp = []
877
+ if self.do_fit:
878
+ for s_id, sequence_layer in enumerate(sequence_output):
879
+ if self.share_param:
880
+ tmp.append(self.share_fit_dense(sequence_layer))
881
+ else:
882
+ tmp.append(self.fit_denses[s_id](sequence_layer))
883
+ sequence_output = tmp
884
+
885
+ return sequence_output
886
+
887
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
888
+
889
+ _, pooled_output, sequence_output, att_output = self.bert(
890
+ input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
891
+ output_hidden_states=True, output_attentions=True)
892
+
893
+ sequence_output = self.do_fit_dense(sequence_output)
894
+
895
+ pooled_output = self.dropout(pooled_output)
896
+ logits = self.classifier(pooled_output)
897
+
898
+ return logits, att_output, sequence_output
899
+
900
+ def freeze_shallow_layers(self):
901
+ self.bert.freeze_shallow_layers()
902
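+ # Fine-tuning sketch (editorial; assumes a config prepared as for
+ # SkipBertModel above):
+ #
+ #   model = SkipBertForSequenceClassification(config, do_fit=False)
+ #   model.freeze_shallow_layers()  # keep the PLOT-backed shallow stack fixed
+ #   logits, att_output, sequence_output = model(input_ids, token_type_ids, attention_mask)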
+
903
+
904
+ class SkipBertForSequenceClassificationPrediction(SkipBertForSequenceClassification):
905
+ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
906
+
907
+ assert not self.training
908
+
909
+ _, pooled_output, sequence_output, att_output = self.bert(
910
+ input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
911
+ output_hidden_states=True, output_attentions=True)
912
+
913
+ logits = self.classifier(pooled_output)
914
+
915
+ loss = None
916
+ if labels is not None:
917
+ loss = torch.tensor(0.)
918
+
919
+ return SequenceClassifierOutput(
920
+ loss=loss,
921
+ logits=logits,
922
+ )
template_FL/src/fedllm/skipbert/plot.py ADDED
@@ -0,0 +1,173 @@
1
+ import copy
2
+ import json
3
+ import time
4
+ import torch.nn.functional as F
5
+ import torch
6
+ import numpy as np
7
+ import sys, os
8
+ import numba
9
+
10
+ def _set_madvise(large_data, advise=1):
11
+ '''
12
+ 0: MADV_NORMAL
13
+ 1: MADV_RANDOM
14
+ 2: MADV_SEQUENTIAL
15
+ 3: MADV_WILLNEED
16
+ 4: MADV_DONTNEED
17
+ '''
18
+ import ctypes
19
+ madvise = ctypes.CDLL("libc.so.6").madvise
20
+ madvise.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
21
+ madvise.restype = ctypes.c_int
22
+ assert madvise(large_data.ctypes.data, large_data.size * large_data.dtype.itemsize, advise) == 0, "MADVISE FAILED" # 1 means MADV_RANDOM
23
+
24
+ def _read_or_create_memmap(path, return_tensor=True, *args, **kargs):
25
+ if os.path.exists(path):
26
+ a = np.memmap(path, mode='r+', *args, **kargs)
27
+ else:
28
+ a = np.memmap(path, mode='w+', *args, **kargs)
29
+ # first row is reserved for oovs
30
+ a[0] = 0
31
+ _set_madvise(a, advise=1)
32
+ if return_tensor:
33
+ a = torch.from_numpy(a) # zero-copy
34
+ return a
35
+
36
+ def _to_key(k):
37
+ return tuple(k.tolist())
38
+
39
+ @numba.njit()
40
+ def _input_ids_to_tri_grams(x: np.array):
41
+ bs, seq_len = x.shape
42
+ ret = np.zeros((bs*(seq_len+1), 3), dtype=np.int64)
43
+ i_ret = 0
44
+ for i_bs in range(bs):
45
+ for i_token in range(seq_len):
46
+ if x[i_bs, i_token] == 0:
47
+ break
48
+ if i_token == 0:
49
+ ret[i_ret][1] = x[i_bs, i_token]
50
+ ret[i_ret][2] = x[i_bs, i_token+1]
51
+ elif i_token == seq_len - 1:
52
+ ret[i_ret][0] = x[i_bs, i_token-1]
53
+ ret[i_ret][1] = x[i_bs, i_token]
54
+ else:
55
+ ret[i_ret] = x[i_bs, i_token-1:i_token+2]
56
+ i_ret += 1
57
+ i_ret += 1 # add a pad trigram between seqs
58
+ return ret[:i_ret]
59
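+ # Worked example (editorial): for x = [[7, 8, 9, 0]] the function returns
+ # [[0, 7, 8], [7, 8, 9], [8, 9, 0], [0, 0, 0]] -- one centered tri-gram per
+ # non-pad token, followed by one all-zero padding tri-gram separating sequences.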
+
60
+
61
+ @numba.njit()
62
+ def _input_ids_to_ngram_ids(d: dict, x: np.array):
63
+ '''
64
+ Map input_ids to n-gram ids.
+ First try to match (x0, x1, x2) -> id;
+ if not found, fall back to (0, x1, x2) -> id;
+ if still not found, fall back to (0, x1, 0) -> id.
68
+ '''
69
+ bs, seq_len = x.shape
70
+ ret = np.zeros(bs*(seq_len+1), dtype=np.int64)
71
+ i_ret = 0
72
+ for i_bs in range(bs):
73
+ for i_token in range(seq_len):
74
+ if x[i_bs, i_token] == 0:
75
+ break
76
+ if i_token == 0:
77
+ k = (0, x[i_bs, i_token], x[i_bs, i_token+1])
78
+ elif i_token == seq_len - 1:
79
+ k = (x[i_bs, i_token-1], x[i_bs, i_token], 0)
80
+ else:
81
+ k = (x[i_bs, i_token-1], x[i_bs, i_token], x[i_bs, i_token+1])
82
+ if k in d: # tri-gram
83
+ ret[i_ret] = d[k]
84
+ else:
85
+ k = (0, k[1], k[2])
86
+ if k in d: # bi-gram
87
+ ret[i_ret] = d[k]
88
+ else:
89
+ k = (0, k[1], 0)
90
+ if k in d: # uni-gram
91
+ ret[i_ret] = d[k]
92
+ i_ret += 1
93
+ i_ret += 1 # add a pad trigram between seqs
94
+ return ret[:i_ret]
95
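+ # Example of the fallback (editorial): with d = {(7, 8, 9): 3, (0, 8, 0): 1},
+ # the tri-gram (7, 8, 9) resolves to id 3; an unseen (5, 8, 6) falls back to
+ # (0, 8, 6) and then to the uni-gram key (0, 8, 0), giving id 1; a fully
+ # unknown token keeps id 0, the reserved OOV row of the memmap.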
+
96
+ @numba.njit()
97
+ def _has_oov(d: dict, x: np.array):
98
+ bs, seq_len = x.shape
99
+ for i_bs in range(bs):
100
+ for i_token in range(seq_len):
101
+ if x[i_bs, i_token] == 0:
102
+ break
103
+ if i_token == 0:
104
+ k = (0, x[i_bs, i_token], x[i_bs, i_token+1])
105
+ elif i_token == seq_len - 1:
106
+ k = (x[i_bs, i_token-1], x[i_bs, i_token], 0)
107
+ else:
108
+ k = (x[i_bs, i_token-1], x[i_bs, i_token], x[i_bs, i_token+1])
109
+ if k not in d:
110
+ return True
111
+ return False
112
+
113
+
114
+ class Plot:
115
+ def __init__(self, max_num_entries=100000, hidden_size=768):
116
+
117
+ self.max_num_entries = max_num_entries
118
+ self.hidden_size = hidden_size
119
+
120
+ self.trigram_to_id, self.id_to_trigram = self.build_hash_table('input_ids_tri_gram.memmap', max_num_entries)
121
+ self.orig_trigram_hidden_states = _read_or_create_memmap("plot_hidden_states_tri_gram.memmap", dtype='float16', shape=(max_num_entries, 3, hidden_size))
122
+
123
+ def build_hash_table(self, path, max_num_entries):
124
+ n_gram = 3
125
+ hash_table1 = numba.typed.Dict()
126
+ hash_table1[tuple([0]*n_gram)] = 0 # dummy entry
127
+ orig_ngram_ids_mmap = _read_or_create_memmap(
128
+ path, return_tensor=False, dtype='int32', shape=(max_num_entries, n_gram))
129
+
130
+ for i in range(1, self.max_num_entries):
131
+ _tmp = orig_ngram_ids_mmap[i]
132
+ # break when meet all 0 ngram
133
+ if (_tmp==0).all():
134
+ break
135
+ tmp_hash = _to_key(_tmp)
136
+ if tmp_hash not in hash_table1:
137
+ hash_table1[tmp_hash] = i
138
+
139
+ return hash_table1, orig_ngram_ids_mmap
140
+
141
+ def input_ids_to_tri_grams(self, input_ids):
142
+ return _input_ids_to_tri_grams(input_ids)
143
+
144
+ def update_data(self, ngram_input_ids, ngram_hidden_states):
145
+ ngram_input_ids = ngram_input_ids.cpu().numpy()
146
+ ngram_hidden_states = ngram_hidden_states.detach().half().cpu() # FP16
147
+ bs, _ = ngram_input_ids.shape
+ ngram_to_id, id_to_ngram, id_to_hidden_state = \
+ self.trigram_to_id, self.id_to_trigram, self.orig_trigram_hidden_states
+ # TODO: optimize the for-loop later
+ id_to_save = []
+ rows_to_save = []  # batch rows actually stored (entries over capacity are skipped)
+ for i in range(bs):
+ key = _to_key(ngram_input_ids[i])
+ ngram_id = ngram_to_id.get(key, len(ngram_to_id))
+ if ngram_id >= self.max_num_entries:
+ print('Exceeded max number of entries; skipping entry...')
+ continue
+ ngram_to_id[key] = ngram_id
+ id_to_ngram[ngram_id] = key
+ id_to_save.append(ngram_id)
+ rows_to_save.append(i)
+ id_to_hidden_state[id_to_save] = ngram_hidden_states[rows_to_save]
164
+
165
+ def retrieve_data(self, input_ids):
166
+ input_ids = input_ids.numpy()
167
+ id_to_get = _input_ids_to_ngram_ids(self.trigram_to_id, input_ids)
168
+ hidden_states = self.orig_trigram_hidden_states[id_to_get]
169
+ return hidden_states
170
+
171
+ def has_oov(self, input_ids):
172
+ return _has_oov(self.trigram_to_id, input_ids.numpy())
173
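+ # Minimal usage sketch (editorial; tensor values are illustrative and the
+ # memmap files are created in the working directory):
+ #
+ #   plot = Plot(max_num_entries=1000, hidden_size=768)
+ #   ids = torch.tensor([[101, 2023, 102, 0]])
+ #   if plot.has_oov(ids):
+ #       tri = torch.from_numpy(plot.input_ids_to_tri_grams(ids.numpy()))
+ #       states = torch.zeros(tri.shape[0], 3, 768)  # stand-in hidden states
+ #       plot.update_data(tri, states)
+ #   cached = plot.retrieve_data(ids)  # fp16, shape (num_trigrams, 3, hidden_size)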
+
template_FL/src/fedllm/templates/alpaca.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "description": "Template used by Alpaca-LoRA.",
3
+ "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
4
+ "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
5
+ "response_split": "### Response:"
6
+ }
template_FL/src/fedllm/trainer.py ADDED
@@ -0,0 +1,476 @@
1
+ from accelerate import Accelerator
2
+ from torch.utils.data import DataLoader
3
+ import torch
4
+ import copy
5
+ import numpy as np
6
+ from transformers import BertForSequenceClassification, GenerationConfig, AutoTokenizer
7
+ import inspect
8
+ import logging
9
+ import wandb
10
+ from tqdm import tqdm
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ class ManualLLMSampleCB:
15
+ def __init__(self, model, tokenizer, task, num_samples=10, max_new_tokens=256):
16
+ self.model = model
17
+ self.concat_model = None
18
+ self.tokenizer = tokenizer
19
+ self.task = task
20
+ self.num_samples = num_samples
21
+ self.max_new_tokens = max_new_tokens
22
+ self.gen_config = GenerationConfig.from_pretrained(
23
+ model.config.name_or_path, max_new_tokens=max_new_tokens
24
+ )
25
+
26
+ def generate(self, prompt):
27
+ # Tokenize the input prompt and include the attention mask
28
+ tokenized_prompt = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
29
+ input_ids = tokenized_prompt['input_ids']
30
+ attention_mask = tokenized_prompt['attention_mask'] # Extract attention mask
31
+
32
+ with torch.no_grad():
33
+ output = self.model.generate(
34
+ input_ids=input_ids,
35
+ attention_mask=attention_mask,
36
+ max_new_tokens=self.max_new_tokens,
37
+ generation_config=self.gen_config
38
+ )
39
+ return self.tokenizer.decode(output[0], skip_special_tokens=True)
40
+
41
+
42
+ def create_samples_table(self, dataset):
43
+ table = wandb.Table(columns=["input", "prediction", "label", "task"])
44
+ sampled_dataset = dataset.shuffle(seed=42).select(range(self.num_samples))
45
+
46
+ for example in tqdm(sampled_dataset, desc="Generating Samples"):
47
+ instruction = example.get("instruction", "")
48
+ input_text = example.get("input", "")
49
+ label = example.get("output", "")
50
+
51
+ if input_text:
52
+ prompt = f"Instruction: {instruction} Input: {input_text} Response:"
53
+ else:
54
+ prompt = f"Instruction: {instruction} Response:"
55
+
56
+ prediction = self.generate(prompt)
57
+ table.add_data(prompt, prediction, label, self.task)
58
+
59
+ return table
60
+
61
+ def log_samples_to_wandb(self, dataset):
62
+ samples_table = self.create_samples_table(dataset)
63
+ wandb.log({"sample_predictions": samples_table})
64
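+ # Usage sketch (editorial; model, tokenizer and val_dataset are assumed to
+ # exist and wandb to be initialised):
+ #
+ #   cb = ManualLLMSampleCB(model, tokenizer, task="alpaca", num_samples=5)
+ #   cb.log_samples_to_wandb(val_dataset)  # logs a wandb.Table of generations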
+
65
+
66
+ class ManualTrainer:
67
+ def __init__(
68
+ self, model, tokenizer, train_dataset, val_dataset, holdout_dataset, reference_dataset,
69
+ args, data_collator, compute_metrics, mates_args, data_influence_model, data_influence_tokenizer
70
+ ):
71
+ self.accelerator = Accelerator()
72
+ self.model = model
73
+ self.tokenizer = tokenizer
74
+ self.args = args
75
+ self.data_collator = data_collator
76
+ self.compute_metrics = compute_metrics
77
+ self.mates_args = mates_args
78
+ self.data_influence_model = data_influence_model
79
+ self.data_influence_tokenizer = data_influence_tokenizer
80
+
81
+ # Remove unused columns from datasets
82
+ if train_dataset:
83
+ self.train_dataset = self._remove_unused_columns(train_dataset, "training")
84
+ # Prepare data loaders
85
+ # Sequential loader used for influence scoring, so score order matches dataset order
+ self.full_train_loader = DataLoader(
+ self.train_dataset,
+ batch_size=self.args.per_device_train_batch_size,
+ shuffle=False,
+ collate_fn=self.data_collator,
+ drop_last=self.args.dataloader_drop_last
+ )
+ # Default shuffled training loader; MATES may later swap in a filtered one
+ self.train_loader = DataLoader(
+ self.train_dataset,
+ batch_size=self.args.per_device_train_batch_size,
+ shuffle=True,
+ collate_fn=self.data_collator,
+ drop_last=self.args.dataloader_drop_last
+ )
92
+ else:
93
+ self.train_loader = None
94
+ self.full_train_loader = None
95
+
96
+ if val_dataset:
97
+ self.val_dataset = self._remove_unused_columns(val_dataset, "validation")
98
+ self.val_loader = DataLoader(
99
+ self.val_dataset,
100
+ batch_size=self.args.per_device_eval_batch_size,
101
+ shuffle=False,
102
+ collate_fn=self.data_collator,
103
+ drop_last=self.args.dataloader_drop_last
104
+ )
105
+ else:
106
+ self.val_loader = None
107
+
108
+ if self.mates_args.state:
109
+ self.holdout_dataset = self._remove_unused_columns(holdout_dataset, "holdout")
110
+ self.reference_dataset = self._remove_unused_columns(reference_dataset, "reference")
111
+
112
+ self.holdout_loader = DataLoader(
113
+ self.holdout_dataset,
114
+ batch_size=self.mates_args.holdout_batch_size,
115
+ shuffle=True,
116
+ collate_fn=self.data_collator,
117
+ drop_last=self.args.dataloader_drop_last
118
+ )
119
+
120
+ self.reference_loader = DataLoader(
121
+ self.reference_dataset,
122
+ batch_size=self.mates_args.reference_batch_size,
123
+ shuffle=False,
124
+ collate_fn=self.data_collator,
125
+ drop_last=self.args.dataloader_drop_last
126
+ )
127
+
128
+ # Prepare optimizer
129
+ self.optimizer = torch.optim.AdamW(
130
+ self.model.parameters(),
131
+ lr=self.args.learning_rate
132
+ )
133
+
134
+ # Prepare model, optimizer, and data loaders for Accelerator
135
+ self.model, self.optimizer, self.full_train_loader, self.train_loader, self.val_loader = self.accelerator.prepare(
+ self.model, self.optimizer, self.full_train_loader, self.train_loader, self.val_loader
+ )
138
+
139
+ if self.mates_args.state:
140
+ # Prepare holdout and reference loaders for Accelerator
141
+ self.data_influence_model, self.holdout_loader, self.reference_loader = self.accelerator.prepare(
142
+ self.data_influence_model, self.holdout_loader, self.reference_loader
143
+ )
144
+
145
+ def _remove_unused_columns(self, dataset, description=None):
146
+ """
147
+ Removes columns from a dataset that are not used by the model's forward method.
148
+
149
+ Args:
150
+ dataset: A dataset object (e.g., from datasets.Dataset).
151
+ description: A string description of the dataset (e.g., "training" or "validation").
152
+ Returns:
153
+ The dataset with unused columns removed.
154
+ """
155
+ # Inspect the model forward signature
156
+ forward_signature = inspect.signature(self.model.forward)
157
+ signature_columns = list(forward_signature.parameters.keys())
158
+
159
+ # Add label columns to the signature columns
160
+ label_columns = ["labels", "label_ids"]
161
+ signature_columns += label_columns
162
+
163
+ # Determine unused columns
164
+ dataset_columns = set(dataset.column_names)
165
+ used_columns = set(signature_columns).intersection(dataset_columns)
166
+ ignored_columns = list(dataset_columns - used_columns)
167
+
168
+ if ignored_columns:
169
+ logger.info(
170
+ f"The following columns in the {description} set don't have a corresponding argument in "
171
+ f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}."
172
+ )
173
+
174
+ # Ensure at least one column matches the model's expected inputs
175
+ if not used_columns:
176
+ raise ValueError(
177
+ f"No columns in the {description} dataset match the model's forward method signature. "
178
+ f"The following columns have been ignored: {', '.join(ignored_columns)}."
179
+ )
180
+
181
+ return dataset.remove_columns(ignored_columns)
182
+
183
+ def train(self):
184
+ best_val_loss = float('inf')
185
+ early_stopping_counter = 0
186
+ early_stopping_patience = 5
187
+ training_loss = []
188
+
189
+ for epoch in range(self.args.num_train_epochs):
190
+ # Check if it's time to update the data influence model and state is True
191
+ if self.mates_args.state and epoch % self.mates_args.update_data_influence_model_step == 0:
192
+ print("Updating the data influence model and selecting high-quality data...")
193
+ logger.info("Updating the data influence model and selecting high-quality data...")
194
+ self.update_data_influence_model()
195
+
196
+ # Filter high-quality data using the data influence model
197
+ high_quality_indices = self.select_high_quality_data(
198
+ dataset_size=len(self.train_dataset),
199
+ selection_fraction=self.mates_args.selection_fraction,
200
+ )
201
+ self.train_loader = self.accelerator.prepare(
202
+ self.create_filtered_dataloader(high_quality_indices)
203
+ )
204
+
205
+ self.model.train()
206
+ epoch_loss = 0.0
207
+
208
+ for step, batch in enumerate(self.train_loader):
209
+ if step >= self.args.max_steps:
210
+ break
211
+
212
+ self.optimizer.zero_grad()
213
+
214
+ outputs = self.model(
215
+ input_ids=batch['input_ids'],
216
+ attention_mask=batch['attention_mask'],
217
+ labels=batch['labels']
218
+ )
219
+ loss = outputs.loss
220
+
221
+ self.accelerator.backward(loss)
222
+ self.optimizer.step()
223
+
224
+ epoch_loss += loss.item()
225
+
226
+ if (step + 1) % self.args.logging_steps == 0:
227
+ # print(f"Step {step + 1}: Train Loss = {epoch_loss / (step + 1):.4f}")
228
+ logger.info(f"Step {step + 1}: Train Loss = {epoch_loss / (step + 1):.4f}")
229
+
230
+ avg_epoch_loss = epoch_loss / len(self.train_loader)
231
+ training_loss.append(avg_epoch_loss)
232
+
233
+ val_results = self.evaluate()
234
+
235
+ # print(f"Epoch {epoch + 1}: Train Loss = {avg_epoch_loss:.4f}, Val Loss = {val_results['eval_loss']:.4f}")
236
+ logger.info(f"Epoch {epoch + 1}: Train Loss = {avg_epoch_loss:.4f}, Val Loss = {val_results['eval_loss']:.4f}")
237
+
238
+ # Early stopping logic
239
+ if val_results["eval_loss"] < best_val_loss:
240
+ best_val_loss = val_results["eval_loss"]
241
+ early_stopping_counter = 0
242
+ else:
243
+ early_stopping_counter += 1
244
+ if early_stopping_counter >= early_stopping_patience:
245
+ print("Early stopping triggered")
246
+ break
247
+
248
+ return {"training_loss": sum(training_loss) / len(training_loss), "best_val_loss": best_val_loss}
249
+
250
+ def select_high_quality_data(self, dataset_size, selection_fraction):
251
+ """
252
+ Use the data influence model to predict quality scores and select high-quality data indices.
253
+ """
254
+ print("Selecting high-quality data using the data influence model...")
255
+
256
+ # Predict influence scores for the entire dataset
257
+ influence_scores = []
258
+ self.data_influence_model.eval()
259
262
+ i = 0
263
+ with torch.no_grad():
264
+ for batch in self.full_train_loader: # Full dataset loader
265
+ text = self.tokenizer.batch_decode(
266
+ batch['input_ids'],
267
+ skip_special_tokens=True
268
+ )
269
+
270
+ # Tokenize the text using the BERT tokenizer
271
+ bert_inputs = self.data_influence_tokenizer(
272
+ text,
273
+ truncation=True,
274
+ padding='max_length',
275
+ max_length=256,
276
+ return_tensors='pt'
277
+ ).to(self.accelerator.device)
278
+
279
+ # Score the batch with the data influence model (inference only)
281
+ outputs = self.data_influence_model(
282
+ input_ids=bert_inputs['input_ids'],
283
+ attention_mask=bert_inputs['attention_mask'],
284
+ )
285
+
286
+ influence_scores.extend(outputs.logits.squeeze(-1).cpu().numpy())
287
+
288
+ i += 1
289
+
290
+ if i == 100:
+ break  # only the first 100 batches are scored, to bound scoring cost
292
+
293
+ # Apply Gumbel-Top-k selection over the influence scores
294
+ influence_scores = np.array(influence_scores)
295
+ print(">> Influence scores shape:", influence_scores.shape)
296
+
297
+ # Add Gumbel noise for diversity
298
+ rng = np.random.default_rng()
299
+ gumbel_noise = rng.gumbel(size=len(influence_scores))
300
+ influence_scores += gumbel_noise
301
+
302
+ # Select top indices based on influence scores
303
+ selection_size = int(len(influence_scores)*selection_fraction)
304
+ high_quality_indices = np.argpartition(-influence_scores, selection_size)[:selection_size]
305
+ print(f"Selected {len(high_quality_indices)} high-quality samples.")
306
+
307
+ return high_quality_indices
308
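+ # Editorial note on the selection rule: adding i.i.d. Gumbel(0, 1) noise to
+ # the scores and keeping the top-k is equivalent to sampling k examples
+ # without replacement with probabilities proportional to exp(score)
+ # (Gumbel-Top-k), trading a little selection quality for diversity across
+ # rounds instead of always picking the same highest-scored examples.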
+
309
+ def create_filtered_dataloader(self, indices):
310
+ """
311
+ Create a new dataloader with only the selected high-quality data.
312
+ """
313
+ print("Creating a filtered dataloader with selected high-quality data...")
314
+ subset_dataset = torch.utils.data.Subset(self.train_dataset, indices)
315
+ return torch.utils.data.DataLoader(
316
+ subset_dataset,
317
+ batch_size=self.args.per_device_train_batch_size,
318
+ shuffle=True,
319
+ collate_fn=self.data_collator, # Use the same collate function
320
+ drop_last=self.args.dataloader_drop_last
321
+ )
322
+
323
+
324
+ def update_data_influence_model(self):
325
+ # Train a copy of the model on holdout data and validate on reference data
326
+ copied_model = copy.deepcopy(self.model)
327
+ copied_model.train()
328
+ optimizer = self.accelerator.prepare(
329
+ torch.optim.Adam(copied_model.parameters(), lr=self.args.learning_rate)
330
+ )
331
+ holdout_reference_pairs = []
332
+
333
+ # print("Starting to collect holdout-reference pairs...")
334
+ logger.info("Starting to collect holdout-reference pairs...")
335
+ for step, holdout_batch in enumerate(self.holdout_loader):
336
+ # print(f"Processing holdout batch {step+1}/{len(self.holdout_loader)}...")
337
+ logger.info(f"Processing holdout batch {step+1}/{len(self.holdout_loader)}...")
338
+
339
+ optimizer.zero_grad()
340
+ outputs = copied_model(
341
+ input_ids=holdout_batch['input_ids'],
342
+ attention_mask=holdout_batch['attention_mask'],
343
+ labels=holdout_batch['labels']
344
+ )
345
+ holdout_loss = outputs.loss
346
+ decoded_texts = self.tokenizer.batch_decode(
347
+ holdout_batch['input_ids'],
348
+ skip_special_tokens=True
349
+ )
350
+
351
+ holdout_loss.backward()
352
+ optimizer.step()
353
+
354
+ print(f"Evaluating reference losses at step {step}...")
355
+ logger.info(f"Evaluating reference losses at step {step}...")
356
+
357
+ copied_model.eval()
358
+ reference_losses = []
359
+
360
+ with torch.no_grad():
361
+ for ref_batch in self.reference_loader:
362
+ outputs = copied_model(
363
+ input_ids=ref_batch['input_ids'],
364
+ attention_mask=ref_batch['attention_mask'],
365
+ labels=ref_batch['labels']
366
+ )
367
+ reference_losses.append(outputs.loss.item())
368
+
369
+ # Compute the mean of reference losses
370
+ score = sum(reference_losses) / len(reference_losses) if reference_losses else 0.0
371
+ holdout_reference_pairs.append((decoded_texts, score))
372
+ copied_model.train()  # switch back to train mode for the next holdout step
373
+
374
+ # Train the data influence model using the generated pairs
375
+ print("Starting to train the data influence model...")
376
+ logger.info("Starting to train the data influence model...")
377
+
378
+ self.data_influence_model.train()
379
+ influence_optimizer = torch.optim.AdamW(self.data_influence_model.parameters(), lr=self.args.learning_rate)
380
+
381
+ for step, (text, score) in enumerate(holdout_reference_pairs):
382
+ # Tokenize the text using the BERT tokenizer
383
+ bert_inputs = self.data_influence_tokenizer(
384
+ text,
385
+ truncation=True,
386
+ padding='max_length',
387
+ max_length=256,
388
+ return_tensors='pt'
389
+ ).to(self.accelerator.device)
390
+
391
+ # Convert the score to a float tensor target (labels need no gradients)
+ score_tensor = torch.tensor([score], device=self.accelerator.device, dtype=torch.float32)
393
+
394
+ # Train the data influence model
395
+ influence_optimizer.zero_grad()
396
+ outputs = self.data_influence_model(
397
+ input_ids=bert_inputs['input_ids'],
398
+ attention_mask=bert_inputs['attention_mask'],
399
+ labels=score_tensor
400
+ )
401
+ influence_loss = outputs.loss
402
+
403
+ influence_loss.backward()
404
+ influence_optimizer.step()
405
+
406
+ if step % 50 == 0:
407
+ print(f"[Influence Training] Step {step}: Loss = {influence_loss.item():.4f}")
408
+ logger.info(f"[Influence Training] Step {step}: Loss = {influence_loss.item():.4f}")
409
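+ # Editorial summary of the MATES-style update above: each holdout batch
+ # drives one gradient step on a throwaway copy of the main model; the mean
+ # reference loss after that step becomes the batch's influence score; the
+ # resulting (text, score) pairs then train the BERT-based data influence
+ # model as a regressor.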
+
410
+
411
+ # TODO: distillation for SkipBERT (not implemented in this commit)
412
+
413
+
414
+
415
+ def evaluate(self, wandb_sample=True):
416
+ self.model.eval()
417
+ val_loss = 0.0
418
+
419
+ all_preds = []
420
+ all_labels = []
421
+
422
+ with torch.no_grad():
423
+ for batch in self.val_loader:
424
+ outputs = self.model(
425
+ input_ids=batch['input_ids'],
426
+ attention_mask=batch['attention_mask'],
427
+ labels=batch['labels']
428
+ )
429
+ val_loss += outputs.loss.item()
430
+
431
+ logits = self.accelerator.gather(outputs.logits)
432
+ labels = self.accelerator.gather(batch['labels'])
433
+
434
+ logits = logits.cpu().numpy()
435
+ labels = labels.cpu().numpy()
436
+
437
+ predictions = np.argmax(logits, axis=-1)
438
+ attention_mask = batch['attention_mask'].cpu().numpy()
439
+
440
+ for pred, label, mask in zip(predictions, labels, attention_mask):
441
+ valid_pred = pred[mask.astype(bool)]
442
+ valid_label = label[mask.astype(bool)]
443
+
444
+ all_preds.append(valid_pred)
445
+ all_labels.append(valid_label)
446
+
447
+ max_len = max(len(seq) for seq in all_preds)
448
+ padded_preds = np.array([
449
+ np.pad(seq, (0, max_len - len(seq)), 'constant', constant_values=self.tokenizer.pad_token_id)
450
+ for seq in all_preds
451
+ ])
452
+
453
+ max_len = max(len(seq) for seq in all_labels)
454
+ padded_labels = np.array([
455
+ np.pad(seq, (0, max_len - len(seq)), 'constant', constant_values=-100)
456
+ for seq in all_labels
457
+ ])
458
+
459
+ metrics = self.compute_metrics({"predictions": padded_preds, "label_ids": padded_labels})
460
+
461
+ metrics.update({"eval_loss": val_loss / len(self.val_loader)})
462
+ print("Validation Metrics:", metrics)
463
+
464
+ if wandb_sample:
465
+ # Sample Logging
466
+ llm_sample_cb = ManualLLMSampleCB(
467
+ model=self.model,
468
+ tokenizer=self.tokenizer,
469
+ task="classification",
470
+ num_samples=5,
471
+ max_new_tokens=128
472
+ )
473
+ llm_sample_cb.log_samples_to_wandb(self.val_dataset)
474
+
475
+ return metrics
476
+
template_FL/src/fedllm/utils.py ADDED
@@ -0,0 +1,54 @@
1
+ def clean_output_text(text):
2
+ """
3
+ Clean and normalize text from LLM outputs by removing noise and repetitions.
4
+
5
+ Args:
6
+ text (str): Raw text from LLM prediction
7
+
8
+ Returns:
9
+ str: Cleaned and normalized text
10
+ """
11
+ import re
12
+
13
+ def remove_repeats(text):
14
+ # Remove repeated words
15
+ pattern_words = r'\b(\w+)(?:\s+\1\b)+'
16
+ text = re.sub(pattern_words, r'\1', text)
17
+
18
+ # Remove repeated character patterns (like 'asasas')
19
+ pattern_chars = r'(\w{2,}?)\1{2,}'  # collapse 2+ repeats of 2+ chars so normal double letters survive
20
+ text = re.sub(pattern_chars, r'\1', text)
21
+
22
+ return text
23
+
24
+ # Remove excessive punctuation
25
+ def normalize_punctuation(text):
26
+ # Replace multiple exclamation/question marks with single ones
27
+ text = re.sub(r'!+', '!', text)
28
+ text = re.sub(r'\?+', '?', text)
29
+ # Remove multiple periods (except for ellipsis)
30
+ text = re.sub(r'\.{4,}', '...', text)
31
+ text = re.sub(r'\b(?:cor|asesa)\b', '', text)  # drop known noise tokens only when they appear as whole words
32
+ return text
33
+
34
+ # Main cleaning pipeline
35
+ cleaned_text = text.strip()
36
+
37
+ # Remove common noise patterns
38
+ noise_patterns = [
39
+ r'\n+', # Multiple newlines
40
+ r'\s+', # Multiple spaces
41
+ r'\\n', # Literal \n
42
+ r'\\t', # Literal \t
43
+ ]
44
+
45
+ for pattern in noise_patterns:
46
+ cleaned_text = re.sub(pattern, ' ', cleaned_text)
47
+
48
+ # Apply cleaning functions
49
+ cleaned_text = remove_repeats(cleaned_text)
51
+ cleaned_text = normalize_punctuation(cleaned_text)
52
+ cleaned_text = ' '.join(cleaned_text.split()) # Normalize spacing
53
+
54
+ return cleaned_text.strip()
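+ # Example (editorial): clean_output_text("the the cat!!!  sat")
+ # returns "the cat! sat" after de-duplication and punctuation normalisation.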
template_FL/src/pyproject.toml ADDED
@@ -0,0 +1,180 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "flowertune-llm"
7
+ version = "1.0.0"
8
+ description = "FlowerTune LLM: Federated LLM Fine-tuning with Flower"
9
+ license = "Apache-2.0"
10
+ dependencies = [
11
+ "flwr[simulation]==1.12.0",
12
+ "flwr-datasets>=0.3.0",
13
+ "trl==0.8.1",
14
+ "bitsandbytes==0.43.0",
15
+ "scipy==1.13.0",
16
+ "peft==0.6.2",
17
+ "fschat[model_worker,webui]==0.2.35",
18
+ "transformers==4.41.1",
19
+ "sentencepiece==0.2.0",
20
+ "omegaconf==2.3.0",
21
+ "hf_transfer==0.1.8",
22
+ ]
23
+
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["."]
27
+
28
+ [tool.flwr.app]
29
+ publisher = "flwrlabs"
30
+
31
+ [tool.flwr.app.components]
32
+ serverapp = "fedllm.server_app:app"
33
+ clientapp = "fedllm.client_app:app"
34
+
35
+ [tool.flwr.app.config]
36
+ num-server-rounds = 2
37
+ num-supernodes = 10
38
+
39
+ # Define dataset
40
+ dataset.type = 'hete' # type = ['homo','hete']
41
+ dataset.name = "vicgalle/alpaca-gpt4"
42
+
43
+ # Define model settings
44
+ model.name = "Qwen/Qwen2.5-1.5B-Instruct"
45
+ model.quantization = 4
46
+ model.gradient-checkpointing = true
47
+ model.flash_attention = false
48
+
49
+ ### Use MATES ###
50
+ mates.state = true
51
+ mates.holdout-ratio = 0.001
52
+ mates.reference-ratio = 0.0005
53
+ mates.holdout-batch-size = 4
54
+ mates.reference-batch-size = 2
55
+ mates.update-data-influence-model-step = 20
56
+ mates.selection-fraction = 0.4
57
+ ### END ###
58
+
59
+ ### Use SkipBERT ###
60
+
61
+ # Model setting
62
+ skipbert.student-model = "bert-base-uncased"
63
+ skipbert.num_layers_student = 12
64
+ skipbert.num_full_hidden_layers_student = 6
65
+ skipbert.num_masked_layers_teacher = 0
66
+ skipbert.num_masked_last_layers_teacher = 0
67
+
68
+ # Training hyperparameters
69
+ skipbert.train_batch_size = 8
70
+ skipbert.gradient_accumulation_steps = 2
71
+ skipbert.eval_batch_size = 8
72
+ skipbert.eval_accumulation_steps = 2
73
+ skipbert.learning_rate = 2.0e-5
74
+ skipbert.num_train_epochs = 10
75
+ skipbert.eval_step = 10
76
+ skipbert.max_seq_length = 128
77
+ skipbert.weight_decay = 1.0e-4
78
+ skipbert.warmup_steps = 100 # 500
79
+ skipbert.do_train = true
80
+ skipbert.do_eval = true
81
+ skipbert.max_steps = -1
82
+ skipbert.evaluation_strategy = "epoch"
83
+ skipbert.save_strategy = "epoch"
84
+ skipbert.lr_scheduler_type = "cosine" # or 'warmup_linear'
85
+ skipbert.logging_dir = './skipbert_logs'
86
+ skipbert.output_dir = "./skipbert_results"
87
+ skipbert.report_to = 'wandb'
88
+
89
+ # Knowledge distillation parameters
90
+ skipbert.beta = 0.01
91
+ skipbert.T = 1.0
92
+ skipbert.alpha = 1.0
93
+ skipbert.reduce_T = 1.0
94
+ skipbert.epochs_no_cls = 5
95
+
96
+ # Training schedule and features
97
+ skipbert.freeze_lower_layers = true
98
+
99
+ # Feature usage flags
100
+ skipbert.use_logits = true
101
+ skipbert.use_att = true
102
+ skipbert.use_rep = true
103
+ skipbert.use_embedding = false
104
+
105
+ # Training modes
106
+ skipbert.do_predict = false
109
+ skipbert.do_fit = false
110
+ skipbert.fp16 = false
111
+ skipbert.no_pretrain = false
112
+ skipbert.use_init_weight = false
113
+ skipbert.share_param = true
114
+ skipbert.do_lower_case = true
115
+ skipbert.no_cuda = false
116
+
117
+ # N-gram settings
118
+ skipbert.n_gram_left = 1
119
+ skipbert.n_gram_right = 1
120
+
121
+ # Layer mappings
122
+ skipbert.att_layer_maps = [1, 3, 5, 7, 9, 11]
+ skipbert.hid_layer_maps = [6, 7, 8, 9, 10, 11, 12]
124
+
125
+ ### END ###
126
+
127
+
128
+
129
+
130
+ # Define LoRA settings
131
+ model.lora.lora-r = 8
132
+ model.lora.lora-alpha = 16
133
+ model.lora.lora-dropout = 0.05
134
+ model.lora.lora-target-modules = "q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj"
135
+
136
+ # Define training settings
137
+ train.save-every-round = 5
138
+ train.learning-rate-max = 5e-5
139
+ train.learning-rate-min = 1e-6
140
+ train.seq-length = 256
141
+ train.prompt_template_name = "alpaca"
142
+ train.train_on_inputs = true
143
+ train.verbose = false
144
+
145
+ # Define training arguments for the HF Trainer
146
+ train.training-arguments.output-dir = ""
147
+ train.training-arguments.learning-rate = 3e-4
148
+ train.training-arguments.per-device-train-batch-size = 4
149
+ train.training-arguments.gradient-accumulation-steps = 1
150
+ train.training-arguments.per-device-eval-batch-size = 2
151
+ train.training-arguments.eval-accumulation-steps = 1
152
+ train.training-arguments.logging-steps = 10
153
+ train.training-arguments.num-train-epochs = 1
154
+ train.training-arguments.max-steps = 10
155
+ train.training-arguments.save-steps = 1000
156
+ train.training-arguments.save-total-limit = 10
157
+ train.training-arguments.gradient-checkpointing = true
158
+ train.training-arguments.lr-scheduler-type = "cosine"
159
+ train.training-arguments.warmup-steps = 0
160
+ train.training-arguments.do-train = true
161
+ train.training-arguments.do-eval = true
162
+ train.training-arguments.dataloader-drop-last = false
163
+ train.training-arguments.eval-strategy = "epoch"
164
+ train.training-arguments.save-strategy = "epoch"
165
+ train.training-arguments.ddp-find-unused-parameters = false
166
+ train.training-arguments.group-by-length = true
167
+ train.training-arguments.load_best_model_at_end = true
168
+ train.training-arguments.report-to = "wandb"
169
+
170
+ # Define local training settings
171
+ train.strategy.fraction-fit = 0.2
172
+ train.strategy.fraction-evaluate = 0.0
173
+
174
+ [tool.flwr.federations]
175
+ default = "local-simulation"
176
+
177
+ [tool.flwr.federations.local-simulation]
178
+ options.num-supernodes = 10
179
+ options.backend.client-resources.num-cpus = 8
180
+ options.backend.client-resources.num-gpus = 1.0
template_FL/src/tesst.ipynb ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "013c8cfe-254e-4ee8-81e0-27478628c8e9",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import tomllib\n",
11
+ "\n",
12
+ "with open(\"config.toml\", \"rb\") as f:\n",
13
+ " config = tomllib.load(f)\n",
14
+ "\n",
15
+ "numbers = config[\"my_integers\"] # Returns Python list: [1, 2, 3, 42]"
16
+ ]
17
+ }
18
+ ],
19
+ "metadata": {
20
+ "kernelspec": {
21
+ "display_name": "Python 3 (ipykernel)",
22
+ "language": "python",
23
+ "name": "python3"
24
+ },
25
+ "language_info": {
26
+ "codemirror_mode": {
27
+ "name": "ipython",
28
+ "version": 3
29
+ },
30
+ "file_extension": ".py",
31
+ "mimetype": "text/x-python",
32
+ "name": "python",
33
+ "nbconvert_exporter": "python",
34
+ "pygments_lexer": "ipython3",
35
+ "version": "3.11.7"
36
+ }
37
+ },
38
+ "nbformat": 4,
39
+ "nbformat_minor": 5
40
+ }