---
license: apache-2.0
tags:
- MDEL
---
# Model Name
Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts
# Model Description
This model was generated by averaging the weights of layers 9 and 10 of the following models (a minimal sketch of this kind of weight averaging follows the list):
- [Multi-Domain-Expert-Layers/expert-pubmed_central](https://huggingface.co/Multi-Domain-Expert-Layers/expert-pubmed_central)
- [Multi-Domain-Expert-Layers/expert-freelaw](https://huggingface.co/Multi-Domain-Expert-Layers/expert-freelaw)
- [Multi-Domain-Expert-Layers/expert-github](https://huggingface.co/Multi-Domain-Expert-Layers/expert-github)
- [Multi-Domain-Expert-Layers/expert-uspto](https://huggingface.co/Multi-Domain-Expert-Layers/expert-uspto)
- [Multi-Domain-Expert-Layers/expert-arxiv](https://huggingface.co/Multi-Domain-Expert-Layers/expert-arxiv)
- [theblackcat102/pythia-1b-deduped-sft](https://huggingface.co/theblackcat102/pythia-1b-deduped-sft)
- We also keep, for each of the above, a mixture that is primarily that model; these mixtures are stored as experts that can be loaded on demand.
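
For illustration, this is the basic operation used throughout this card: a linear interpolation of layer parameters, done in place. This is only a minimal sketch; the repo names are from this card, and the 0.6/0.4 weights and layer range mirror the recreation script further down.

```python
import torch
from transformers import AutoModelForCausalLM

# Two same-architecture GPT-NeoX checkpoints from this card.
expert = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-uspto").float()
base = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()

# Blend layers 9 and 10 of the expert toward the base chat model, in place.
with torch.no_grad():
    for layer_id in (9, 10):
        for p_exp, p_base in zip(
            expert.gpt_neox.layers[layer_id].parameters(),
            base.gpt_neox.layers[layer_id].parameters(),
        ):
            p_exp.data = p_exp.data * 0.6 + p_base.data * 0.4
```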
### NOTE: There is a mistake below: the code routes to a pubmed_abstracts expert, but the model we merged above was expert-pubmed_central.
```python
# Test the merged experts.
# TODO: add dynamic routing, test better expert mixtures.
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXLayer


class GPTNeoXExpertsForCausalLM(GPTNeoXForCausalLM):
    """Stores a separate expert copy of layers 9 and 10 for each domain."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Each expert holds its own copy of two transformer layers (9 and 10).
        self.orig_chat = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.uspto_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.github_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.pubmed_abstracts_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.freelaw_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.arxiv_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.merged_chat_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.curr_expert = "merged_chat_expert"

    def generate_with_expert(self, text, tokenizer, expert="merged_chat_expert", return_answer_only=False, do_self_contrastive=True, max_length=512, min_length=1, num_return_sequences=1, do_sample=True, device="cuda"):
        """Generates text after swapping the requested expert into layers 9 and 10."""
        tokenizer.pad_token = tokenizer.eos_token
        if expert != self.curr_expert:
            self.curr_expert = expert
            experts = {
                "orig_chat": self.orig_chat,
                "uspto_expert": self.uspto_expert,
                "github_expert": self.github_expert,
                "pubmed_abstracts_expert": self.pubmed_abstracts_expert,
                "arxiv_expert": self.arxiv_expert,
                "freelaw_expert": self.freelaw_expert,
                "merged_chat_expert": self.merged_chat_expert,
            }
            # Swap the expert's two layers into positions 9 and 10 of the main stack.
            for layer_id in range(2):
                self.gpt_neox.layers[layer_id + 9] = experts[expert][layer_id]
        if isinstance(text, str):
            text = [text]
        text = [p.strip() for p in text]
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = self.generate(
                **inputs,
                max_length=max_length,
                min_length=min_length,
                repetition_penalty=1.1,
                do_sample=do_sample,
                top_p=0.95,
                penalty_alpha=0.6 if do_self_contrastive else None,
                top_k=10,
                num_return_sequences=num_return_sequences,
            )
        ret = []
        for i in range(len(outputs)):  # could use batch_decode, unless we want to post-process each sequence
            out = tokenizer.decode(outputs[i], skip_special_tokens=True)
            if return_answer_only:
                out = out[len(text[i]):].lstrip(".? \n\t")
            ret.append(out)
        return ret


tokenizer = AutoTokenizer.from_pretrained("theblackcat102/pythia-1b-deduped-sft")
tokenizer.pad_token = tokenizer.eos_token
model1 = GPTNeoXExpertsForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
model1 = model1.half().cuda().eval()

for expert in ["orig_chat", "merged_chat_expert", "uspto_expert", "github_expert", "pubmed_abstracts_expert", "arxiv_expert", "freelaw_expert"]:
    print(f"## {expert}")
    print(model1.generate_with_expert("<human> Write a patent about an electric toothbrush\n<bot>", tokenizer, expert=expert)[0])
    print(f"## {expert} more")
    print(model1.generate_with_expert("Field of the Invention.\nAn electric toothbrush\n", tokenizer, expert=expert)[0])
```
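
Note that `generate_with_expert` swaps the selected expert's layer modules into `self.gpt_neox.layers` in place, so only one expert is active at a time; switching experts is a module reassignment rather than a weight copy. The TODO above mentions dynamic routing; a trivial, hypothetical starting point (the keyword table and `route_expert` helper are illustrative assumptions, not part of this repository) would be to pick the expert from the prompt before generating:

```python
# Hypothetical keyword router: pick an expert based on the prompt text.
KEYWORDS = {
    "patent": "uspto_expert",
    "code": "github_expert",
    "medical": "pubmed_abstracts_expert",
    "court": "freelaw_expert",
    "theorem": "arxiv_expert",
}

def route_expert(prompt, default="merged_chat_expert"):
    """Return the first expert whose keyword appears in the prompt."""
    lower = prompt.lower()
    for kw, expert in KEYWORDS.items():
        if kw in lower:
            return expert
    return default

# prompt = "<human> Write a patent about an electric toothbrush\n<bot>"
# model1.generate_with_expert(prompt, tokenizer, expert=route_expert(prompt))[0]
```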
### To recreate the experts, modify the script below. It can also be extended to do dynamic merging and/or to experiment with different weights for different layers; a sketch of per-layer weighting follows the script.
```python
def recreate_merged_expert():
    # Chat base model that will hold all of the experts.
    model1 = GPTNeoXExpertsForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()
    # Pre-merged pubmed/freelaw/github/arxiv checkpoint.
    model2 = AutoModelForCausalLM.from_pretrained("stillerman/MDEL-pubmed-feelaw-github-arxiv").float()
    # Individual domain experts, keyed by the attribute they will populate.
    domain_experts = {
        "uspto_expert": AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-uspto").float(),
        "github_expert": AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-github").float(),
        "pubmed_abstracts_expert": AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-pubmed_abstracts").float(),
        "freelaw_expert": AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-freelaw").float(),
        "arxiv_expert": AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-arxiv").float(),
    }
    # Unmodified copy of the chat model, kept around as the "orig_chat" expert.
    model = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()

    with torch.no_grad():
        # Preserve the original chat layers so they can be restored on demand.
        for layer_id in [9, 10]:
            model1.orig_chat[layer_id - 9] = model.gpt_neox.layers[layer_id]

        # Merged chat expert: 0.6 chat + 0.3 pre-merged experts + 0.1 uspto.
        # It is written back into the main stack, since it is the default expert.
        for layer_id in [9, 10]:
            for p1, p2, p3 in zip(
                model1.gpt_neox.layers[layer_id].parameters(),
                model2.gpt_neox.layers[layer_id].parameters(),
                domain_experts["uspto_expert"].gpt_neox.layers[layer_id].parameters(),
            ):
                p1.data = p1.data * 0.6 + p2.data * 0.3 + p3.data * 0.1
            model1.merged_chat_expert[layer_id - 9] = model1.gpt_neox.layers[layer_id]

        # Each routed expert is 0.6 domain expert + 0.4 original chat model.
        for name, expert_model in domain_experts.items():
            for layer_id in [9, 10]:
                for p1, p2 in zip(
                    expert_model.gpt_neox.layers[layer_id].parameters(),
                    model.gpt_neox.layers[layer_id].parameters(),
                ):
                    p1.data = p1.data * 0.6 + p2.data * 0.4
                getattr(model1, name)[layer_id - 9] = expert_model.gpt_neox.layers[layer_id]

    model1 = model1.half().eval()
    model1.save_pretrained("MDEL-theblackcat-chat-5-experts", torch_dtype=torch.float16)
    model1.push_to_hub("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
    return model1
```
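
As a starting point for the per-layer weighting experiments mentioned above, here is a minimal, hypothetical sketch. The `merge_with_schedule` helper and the example weight schedule are illustrative assumptions, not part of this repository; it simply generalizes the 0.6/0.4 blend above to one weight per layer.

```python
import torch

def merge_with_schedule(base_model, expert_model, layer_weights):
    """Blend expert layers into the base model with a per-layer weight.

    layer_weights maps layer index -> weight given to the expert,
    e.g. {9: 0.7, 10: 0.5}; the base model keeps (1 - weight).
    """
    with torch.no_grad():
        for layer_id, w in layer_weights.items():
            for p_base, p_exp in zip(
                base_model.gpt_neox.layers[layer_id].parameters(),
                expert_model.gpt_neox.layers[layer_id].parameters(),
            ):
                p_base.data = p_base.data * (1.0 - w) + p_exp.data * w
    return base_model

# Hypothetical usage: weight the expert more heavily in layer 9 than in layer 10.
# base = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()
# expert = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-arxiv").float()
# merge_with_schedule(base, expert, {9: 0.7, 10: 0.5})
```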