naumov1 committed on
Commit
783fe70
1 Parent(s): 9fd8d56

Initial Commit

Browse files
Files changed (5) hide show
  1. README.md +45 -0
  2. config.json +39 -0
  3. generation_config.json +6 -0
  4. gitattributes +35 -0
  5. pytorch_model.bin +3 -0
README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-nd-4.0
3
+ ---
4
+ # Introduction
5
+ TQCompressedGPT-2 is an advanced neural network model, offering a novel method for model compression through improved tensor decompositions. It addresses the challenges of computational and storage demands in NLP tasks, introducing a permutation-based enhancement to Kronecker decomposition, significantly reducing model size while maintaining performance.\
6
+ TQCompressedGPT2 © 2024 by Terra Quantum AG is licensed under CC BY-NC-ND 4.0. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-nd/4.0/ \
7
+ Any entity that wishes to use this library for commercial purposes should contact [email protected] for more information.\
8
+ [![License: CC BY-NC-ND 4.0](https://img.shields.io/badge/License-CC%20BY--NC--ND%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-nd/4.0/)\
9
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/6476003bbed7adbb05f8441f/jEKdKKFoEzlAbbI4NnokH.png" width="500">
10
+ # Features
11
+ **Model Size Reduction:** Compresses the GPT-2 small model from 124 million to 81 million parameters.\
12
+ **Permutation-Based Enhancement:** Introduces a new permutation algorithm for matrix factorization, minimizing performance degradation.\
13
+ **Efficient Training Strategy:** Employs multi-step knowledge distillation with a fraction (3.1%) of the OpenWebText dataset.\
14
+ **Performance:** Outperforms DistilGPT-2 in comparative evaluations.\
15
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/6476003bbed7adbb05f8441f/x1krVBC2RTZNDR0dynbRp.png" width="500">
16
+
17
+ ## Permutation-Based Enhancement
18
+ In our work, we employ a permutation-based algorithm, which achieves a better decomposition approximation for weight matrices:\
19
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/6476003bbed7adbb05f8441f/bM6KwfKWYBJjeX_xGw83C.png" width="500">
20
+
21
+ # Methodology
22
+ For more details about the techniques of TQCompressedGPT-2, refer to our paper: **(ADD LINK)TQCompressor: Improving Tensor Decomposition in Neural Networks via Permutations**\
23
+ **TQCompressed Decomposition:** Focuses on optimal permutation of weight matrices followed by Kronecker decomposition.\
24
+ **Knowledge Distillation:** Uses an iterative compression method coupled with knowledge distillation, enhancing performance.\
25
+ **Application:** Demonstrated on the GPT-2 model, showing its versatility and applicability to various neural network architectures.
26
+
27
+ # Usage
28
+ The model and code are publicly available at:
29
+ - [GitHub Repository](https://github.com/terra-quantum-io/TQCompressedGPT2)
30
+ - [HuggingFace Repository](https://huggingface.co/tq-ai-research/TQCompressedGPT2)
31
+
32
+ # Citation
33
+ If you find TQCompressedGPT-2 useful in your research, please cite the following paper:
34
+ ```
35
+ @article{tqcompressedgpt2,
36
+ title={TQCompressor: Improving Tensor Decomposition in Neural Networks via Permutations},
37
+ author={Abronin, V. and Naumov, A. and Mazur, D. and Bystrov, D. and Tsarova, K. and Melnikov, Ar. and Oseledets, I. and Dolgov, S. and Brasher, R. and Perelshtein, M.},
38
+ journal={arXiv preprint arXiv:[insert_arxiv_id]},
39
+ year={2023}
40
+ }
41
+ ```
42
+
43
+ # Acknowledgments
44
+ - [Terra Quantum AG](https://terraquantum.swiss/), Kornhausstrasse 25, 9000 St. Gallen, Switzerland
45
+ - Project contributors and researchers.
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.31.0",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.31.0"
6
+ }
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65274bbf42607d7451261fc0a5a671bc28dc5f7ac86b77adf0e5e684ea4c75c
3
+ size 480171457